diff --git a/packages/markitdown-ocr/README.md b/packages/markitdown-ocr/README.md index d0883db4a..78300c365 100644 --- a/packages/markitdown-ocr/README.md +++ b/packages/markitdown-ocr/README.md @@ -24,6 +24,28 @@ The plugin uses whatever OpenAI-compatible client you already have. Install one pip install openai ``` +### Optional: Malformed PDF fallback (PyMuPDF) + +`markitdown-ocr` includes a fallback for PDFs that `pdfplumber` cannot open (e.g. truncated or malformed files). This fallback uses [PyMuPDF](https://pymupdf.readthedocs.io/) (`fitz`), which is licensed under **AGPL-3.0**. PyMuPDF is **not installed by default** to avoid imposing AGPL requirements on users who do not need it. + +To enable the malformed-PDF fallback: + +```bash +pip install 'markitdown-ocr[pymupdf]' +``` + +Or install everything at once: + +```bash +pip install 'markitdown-ocr[all]' +``` + +> **License notice:** Including `PyMuPDF` (via the `[pymupdf]` or `[all]` extras) adds an AGPL-3.0 +> dependency to your project. If you distribute software that links PyMuPDF, or offer it to users over a network, you must comply with +> the AGPL — typically by making your application's source code available. See the +> [PyMuPDF license](https://github.com/pymupdf/PyMuPDF?tab=AGPL-3.0-1-ov-file) for details. +> If AGPL is incompatible with your project's license, install `markitdown-ocr` without this extra. + ## Usage ### Command Line @@ -188,7 +210,9 @@ Contributions are welcome! See the [MarkItDown repository](https://github.com/mi ## License -MIT — see [LICENSE](LICENSE). +`markitdown-ocr` itself is MIT licensed — see [LICENSE](LICENSE). + +**Dependency notice:** The optional `[pymupdf]` extra (also included in `[all]`) installs [PyMuPDF](https://github.com/pymupdf/PyMuPDF), which is **AGPL-3.0** licensed. Installing either extra is opt-in. If you install neither, `markitdown-ocr` operates entirely under MIT-compatible licenses. 
## Changelog diff --git a/packages/markitdown-ocr/pyproject.toml b/packages/markitdown-ocr/pyproject.toml index eda3cdda5..d7ec1f545 100644 --- a/packages/markitdown-ocr/pyproject.toml +++ b/packages/markitdown-ocr/pyproject.toml @@ -28,7 +28,6 @@ dependencies = [ "markitdown>=0.1.0", "pdfminer.six>=20251230", "pdfplumber>=0.11.9", - "PyMuPDF>=1.24.0", "mammoth~=1.11.0", "python-docx", "python-pptx", @@ -39,10 +38,20 @@ dependencies = [ # llm_client is passed in by the user (same as for markitdown image descriptions); # install openai or any OpenAI-compatible SDK separately. +# +# NOTE: PyMuPDF (fitz) is AGPL-3.0 licensed. Install the [pymupdf] (or [all]) extra only +# if you need fallback support for malformed PDFs and accept the AGPL terms. [project.optional-dependencies] llm = [ "openai>=1.0.0", ] +pymupdf = [ + "PyMuPDF>=1.24.0", +] +all = [ + "openai>=1.0.0", + "PyMuPDF>=1.24.0", +] [project.urls] Documentation = "https://github.com/microsoft/markitdown#readme"