-
Notifications
You must be signed in to change notification settings - Fork 7.7k
Expand file tree
/
Copy path_ocr_service.py
More file actions
163 lines (135 loc) · 5.03 KB
/
_ocr_service.py
File metadata and controls
163 lines (135 loc) · 5.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
"""
OCR Service Layer for MarkItDown
Provides LLM Vision-based image text extraction.
"""
import base64
import os
from typing import Any, BinaryIO, Optional
from dataclasses import dataclass
from markitdown import StreamInfo
# Environment variable names for OCR configuration
OCR_API_KEY_ENV = "MARKITDOWN_OCR_API_KEY"
OCR_API_BASE_ENV = "MARKITDOWN_OCR_API_BASE"
OCR_MODEL_ENV = "MARKITDOWN_OCR_MODEL"
@dataclass
class OCRResult:
"""Result from OCR extraction."""
text: str
confidence: float | None = None
backend_used: str | None = None
error: str | None = None
class LLMVisionOCRService:
    """OCR service using LLM vision models (OpenAI-compatible).

    The service sends the image as a base64 data URI in a chat-completions
    request and returns the model's reply as extracted text. All failures in
    :meth:`extract_text` are reported via :class:`OCRResult.error` instead of
    raising, so OCR stays best-effort for callers.
    """

    def __init__(
        self,
        client: Any,
        model: str,
        default_prompt: str | None = None,
    ) -> None:
        """
        Initialize LLM Vision OCR service.

        Args:
            client: OpenAI-compatible client (must expose
                ``chat.completions.create``).
            model: Model name (e.g., 'gpt-4o', 'gemini-2.0-flash')
            default_prompt: Default prompt for OCR extraction; a built-in
                text-only extraction prompt is used when None.
        """
        self.client = client
        self.model = model
        self.default_prompt = default_prompt or (
            "Extract all text from this image. "
            "Return ONLY the extracted text, maintaining the original "
            "layout and order. Do not add any commentary or description."
        )

    @classmethod
    def from_env(cls, default_prompt: str | None = None) -> "LLMVisionOCRService":
        """
        Create LLMVisionOCRService from environment variables.

        Required environment variables:
            MARKITDOWN_OCR_API_KEY: API key for the OpenAI-compatible service
            MARKITDOWN_OCR_MODEL: Model name (e.g., 'gpt-4o', 'gemini-2.0-flash')

        Optional environment variables:
            MARKITDOWN_OCR_API_BASE: API base URL (defaults to OpenAI's API)

        Args:
            default_prompt: Default prompt for OCR extraction

        Returns:
            LLMVisionOCRService instance

        Raises:
            ValueError: If required environment variables are not set
            ImportError: If the ``openai`` package is not installed
        """
        api_key = os.environ.get(OCR_API_KEY_ENV)
        model = os.environ.get(OCR_MODEL_ENV)
        if not api_key:
            raise ValueError(
                f"Missing required environment variable: {OCR_API_KEY_ENV}"
            )
        if not model:
            raise ValueError(
                f"Missing required environment variable: {OCR_MODEL_ENV}"
            )
        # Import OpenAI lazily so this module can load without the package.
        try:
            from openai import OpenAI
        except ImportError as exc:
            # Chain the cause so the underlying import failure stays visible.
            raise ImportError(
                "OpenAI package is required for LLMVisionOCRService. "
                "Install it with: pip install openai"
            ) from exc
        api_base = os.environ.get(OCR_API_BASE_ENV)
        # base_url=None makes the client fall back to OpenAI's default API.
        client = OpenAI(api_key=api_key, base_url=api_base)
        return cls(client=client, model=model, default_prompt=default_prompt)

    @staticmethod
    def _guess_content_type(
        image_stream: BinaryIO,
        stream_info: StreamInfo | None,
    ) -> str:
        """Best-effort MIME type for the image; falls back to ``image/png``.

        Prefers an explicit mimetype from ``stream_info``; otherwise sniffs
        the stream with PIL when available.
        """
        if stream_info and stream_info.mimetype:
            return stream_info.mimetype
        try:
            from PIL import Image  # optional dependency

            image_stream.seek(0)
            img = Image.open(image_stream)
            fmt = img.format.lower() if img.format else "png"
            return f"image/{fmt}"
        except Exception:
            # PIL missing or the bytes are not a recognizable image;
            # assume PNG rather than failing the whole extraction.
            return "image/png"

    def extract_text(
        self,
        image_stream: BinaryIO,
        prompt: str | None = None,
        stream_info: StreamInfo | None = None,
        **kwargs: Any,
    ) -> OCRResult:
        """Extract text from ``image_stream`` using LLM vision.

        Args:
            image_stream: Seekable binary stream containing the image bytes.
            prompt: Optional override for the default extraction prompt.
            stream_info: Optional stream metadata; its mimetype is preferred
                when building the data URI.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            OCRResult with the extracted text, or with ``error`` set on any
            failure (this method never raises).
        """
        if self.client is None:
            return OCRResult(
                text="",
                backend_used="llm_vision",
                error="LLM client not configured",
            )
        try:
            content_type = self._guess_content_type(image_stream, stream_info)
            image_stream.seek(0)
            base64_image = base64.b64encode(image_stream.read()).decode("utf-8")
            data_uri = f"data:{content_type};base64,{base64_image}"
            actual_prompt = prompt or self.default_prompt
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": actual_prompt},
                            {
                                "type": "image_url",
                                "image_url": {"url": data_uri},
                            },
                        ],
                    }
                ],
            )
            text = response.choices[0].message.content
            return OCRResult(
                text=text.strip() if text else "",
                backend_used="llm_vision",
            )
        except Exception as e:
            # Best-effort contract: surface failures via OCRResult.error.
            return OCRResult(text="", backend_used="llm_vision", error=str(e))
        finally:
            # Rewind so callers can reuse the stream. Guarded so a closed or
            # unseekable stream cannot raise here and mask the real outcome.
            try:
                image_stream.seek(0)
            except (OSError, ValueError):
                pass