Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 54 additions & 1 deletion packages/markitdown-ocr/src/markitdown_ocr/_ocr_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,17 @@
"""

import base64
from typing import Any, BinaryIO
import os
from typing import Any, BinaryIO, Optional
from dataclasses import dataclass

from markitdown import StreamInfo

# Environment variable names for OCR configuration.
# These are the names looked up in os.environ by LLMVisionOCRService.from_env.
# Required: API key for the OpenAI-compatible service (from_env raises ValueError if unset/empty).
OCR_API_KEY_ENV = "MARKITDOWN_OCR_API_KEY"
# Optional: API base URL forwarded to the client as base_url.
OCR_API_BASE_ENV = "MARKITDOWN_OCR_API_BASE"
# Required: model name (from_env raises ValueError if unset/empty).
OCR_MODEL_ENV = "MARKITDOWN_OCR_MODEL"


@dataclass
class OCRResult:
Expand Down Expand Up @@ -45,6 +51,53 @@ def __init__(
"layout and order. Do not add any commentary or description."
)

@classmethod
def from_env(cls, default_prompt: str | None = None) -> "LLMVisionOCRService":
    """
    Create LLMVisionOCRService from environment variables.

    Required environment variables:
        MARKITDOWN_OCR_API_KEY: API key for the OpenAI-compatible service
        MARKITDOWN_OCR_MODEL: Model name (e.g., 'gpt-4o', 'gemini-2.0-flash')

    Optional environment variables:
        MARKITDOWN_OCR_API_BASE: API base URL. When unset or empty, no
            base URL is passed and the client falls back to its own default.

    Args:
        default_prompt: Default prompt for OCR extraction

    Returns:
        LLMVisionOCRService instance

    Raises:
        ValueError: If a required environment variable is unset or empty
        ImportError: If the ``openai`` package is not installed
    """
    api_key = os.environ.get(OCR_API_KEY_ENV)
    model = os.environ.get(OCR_MODEL_ENV)

    # Validate configuration before touching the optional dependency so that
    # a misconfigured environment is reported as such.
    if not api_key:
        raise ValueError(
            f"Missing required environment variable: {OCR_API_KEY_ENV}"
        )
    if not model:
        raise ValueError(
            f"Missing required environment variable: {OCR_MODEL_ENV}"
        )

    # Import OpenAI here to allow the module to load without it
    try:
        from openai import OpenAI
    except ImportError as exc:
        # Chain the original error so the underlying import failure
        # (e.g. a broken transitive dependency) stays visible in tracebacks.
        raise ImportError(
            "OpenAI package is required for LLMVisionOCRService. "
            "Install it with: pip install openai"
        ) from exc

    # An exported-but-empty variable must behave like an unset one: passing
    # "" as base_url would override the client's default endpoint with an
    # unusable URL, whereas None lets the client choose its default.
    api_base = os.environ.get(OCR_API_BASE_ENV) or None
    client = OpenAI(api_key=api_key, base_url=api_base)

    return cls(client=client, model=model, default_prompt=default_prompt)

def extract_text(
self,
image_stream: BinaryIO,
Expand Down
24 changes: 22 additions & 2 deletions packages/markitdown-ocr/src/markitdown_ocr/_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,16 @@
Registers OCR-enhanced converters with priority-based replacement strategy.
"""

import os
from typing import Any
from markitdown import MarkItDown

from ._ocr_service import LLMVisionOCRService
from ._ocr_service import (
LLMVisionOCRService,
OCR_API_KEY_ENV,
OCR_API_BASE_ENV,
OCR_MODEL_ENV,
)
from ._pdf_converter_with_ocr import PdfConverterWithOCR
from ._docx_converter_with_ocr import DocxConverterWithOCR
from ._pptx_converter_with_ocr import PptxConverterWithOCR
Expand All @@ -25,10 +31,15 @@ def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
converters (which have priority 0.0), effectively replacing them when
the plugin is enabled.

OCR service can be configured in three ways (in order of priority):
1. Explicit kwargs: llm_client, llm_model, llm_prompt
2. Environment variables: MARKITDOWN_OCR_API_KEY, MARKITDOWN_OCR_API_BASE, MARKITDOWN_OCR_MODEL
3. No OCR service (text-only extraction)

Args:
markitdown: MarkItDown instance to register converters with
**kwargs: Additional keyword arguments that may include:
- llm_client: OpenAI-compatible client for LLM-based OCR (required for OCR to work)
- llm_client: OpenAI-compatible client for LLM-based OCR
- llm_model: Model name (e.g., 'gpt-4o')
- llm_prompt: Custom prompt for text extraction
"""
Expand All @@ -39,12 +50,21 @@ def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
llm_prompt = kwargs.get("llm_prompt")

ocr_service: LLMVisionOCRService | None = None

# Priority 1: Use explicit kwargs if provided
if llm_client and llm_model:
ocr_service = LLMVisionOCRService(
client=llm_client,
model=llm_model,
default_prompt=llm_prompt,
)
# Priority 2: Try to create from environment variables
elif os.environ.get(OCR_API_KEY_ENV) and os.environ.get(OCR_MODEL_ENV):
try:
ocr_service = LLMVisionOCRService.from_env(default_prompt=llm_prompt)
except Exception:
# If environment config is incomplete or invalid, proceed without OCR
pass

# Register converters with priority -1.0 (before built-ins at 0.0)
# This effectively "replaces" the built-in converters when plugin is installed
Expand Down