-
Notifications
You must be signed in to change notification settings - Fork 7.7k
Expand file tree
/
Copy path_ocr_service.py
More file actions
163 lines (135 loc) · 5.03 KB
/
_ocr_service.py
File metadata and controls
163 lines (135 loc) · 5.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
"""
OCR Service Layer for MarkItDown
Provides LLM Vision-based image text extraction.
"""
import base64
import os
from typing import Any, BinaryIO, Optional
from dataclasses import dataclass
from markitdown import StreamInfo
# Environment variable names for OCR configuration
OCR_API_KEY_ENV = "MARKITDOWN_OCR_API_KEY"
OCR_API_BASE_ENV = "MARKITDOWN_OCR_API_BASE"
OCR_MODEL_ENV = "MARKITDOWN_OCR_MODEL"
@dataclass
class OCRResult:
"""Result from OCR extraction."""
text: str
confidence: float | None = None
backend_used: str | None = None
error: str | None = None
class LLMVisionOCRService:
    """OCR service using LLM vision models (OpenAI-compatible).

    The service sends the image as a base64 data URI in a chat-completions
    request and returns the model's reply as extracted text. All failures in
    :meth:`extract_text` are reported via :class:`OCRResult.error` instead of
    raising, so OCR stays best-effort for callers.
    """

    def __init__(
        self,
        client: Any,
        model: str,
        default_prompt: str | None = None,
    ) -> None:
        """
        Initialize LLM Vision OCR service.

        Args:
            client: OpenAI-compatible client (must expose
                ``chat.completions.create``).
            model: Model name (e.g., 'gpt-4o', 'gemini-2.0-flash')
            default_prompt: Default prompt for OCR extraction; a built-in
                text-only extraction prompt is used when None.
        """
        self.client = client
        self.model = model
        self.default_prompt = default_prompt or (
            "Extract all text from this image. "
            "Return ONLY the extracted text, maintaining the original "
            "layout and order. Do not add any commentary or description."
        )

    @classmethod
    def from_env(cls, default_prompt: str | None = None) -> "LLMVisionOCRService":
        """
        Create LLMVisionOCRService from environment variables.

        Required environment variables:
            MARKITDOWN_OCR_API_KEY: API key for the OpenAI-compatible service
            MARKITDOWN_OCR_MODEL: Model name (e.g., 'gpt-4o', 'gemini-2.0-flash')

        Optional environment variables:
            MARKITDOWN_OCR_API_BASE: API base URL (defaults to OpenAI's API)

        Args:
            default_prompt: Default prompt for OCR extraction

        Returns:
            LLMVisionOCRService instance

        Raises:
            ValueError: If required environment variables are not set
            ImportError: If the ``openai`` package is not installed
        """
        api_key = os.environ.get(OCR_API_KEY_ENV)
        model = os.environ.get(OCR_MODEL_ENV)
        if not api_key:
            raise ValueError(
                f"Missing required environment variable: {OCR_API_KEY_ENV}"
            )
        if not model:
            raise ValueError(
                f"Missing required environment variable: {OCR_MODEL_ENV}"
            )
        # Import OpenAI lazily so this module can load without the package.
        try:
            from openai import OpenAI
        except ImportError as exc:
            # Chain the cause so the underlying import failure stays visible.
            raise ImportError(
                "OpenAI package is required for LLMVisionOCRService. "
                "Install it with: pip install openai"
            ) from exc
        api_base = os.environ.get(OCR_API_BASE_ENV)
        # base_url=None makes the client fall back to OpenAI's default API.
        client = OpenAI(api_key=api_key, base_url=api_base)
        return cls(client=client, model=model, default_prompt=default_prompt)

    @staticmethod
    def _guess_content_type(
        image_stream: BinaryIO,
        stream_info: StreamInfo | None,
    ) -> str:
        """Best-effort MIME type for the image; falls back to ``image/png``.

        Prefers an explicit mimetype from ``stream_info``; otherwise sniffs
        the stream with PIL when available.
        """
        if stream_info and stream_info.mimetype:
            return stream_info.mimetype
        try:
            from PIL import Image  # optional dependency

            image_stream.seek(0)
            img = Image.open(image_stream)
            fmt = img.format.lower() if img.format else "png"
            return f"image/{fmt}"
        except Exception:
            # PIL missing or the bytes are not a recognizable image;
            # assume PNG rather than failing the whole extraction.
            return "image/png"

    def extract_text(
        self,
        image_stream: BinaryIO,
        prompt: str | None = None,
        stream_info: StreamInfo | None = None,
        **kwargs: Any,
    ) -> OCRResult:
        """Extract text from ``image_stream`` using LLM vision.

        Args:
            image_stream: Seekable binary stream containing the image bytes.
            prompt: Optional override for the default extraction prompt.
            stream_info: Optional stream metadata; its mimetype is preferred
                when building the data URI.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            OCRResult with the extracted text, or with ``error`` set on any
            failure (this method never raises).
        """
        if self.client is None:
            return OCRResult(
                text="",
                backend_used="llm_vision",
                error="LLM client not configured",
            )
        try:
            content_type = self._guess_content_type(image_stream, stream_info)
            image_stream.seek(0)
            base64_image = base64.b64encode(image_stream.read()).decode("utf-8")
            data_uri = f"data:{content_type};base64,{base64_image}"
            actual_prompt = prompt or self.default_prompt
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": actual_prompt},
                            {
                                "type": "image_url",
                                "image_url": {"url": data_uri},
                            },
                        ],
                    }
                ],
            )
            text = response.choices[0].message.content
            return OCRResult(
                text=text.strip() if text else "",
                backend_used="llm_vision",
            )
        except Exception as e:
            # Best-effort contract: surface failures via OCRResult.error.
            return OCRResult(text="", backend_used="llm_vision", error=str(e))
        finally:
            # Rewind so callers can reuse the stream. Guarded so a closed or
            # unseekable stream cannot raise here and mask the real outcome.
            try:
                image_stream.seek(0)
            except (OSError, ValueError):
                pass