fix: use context manager for pymupdf.open

KylinMountain · rejojer · commit 25b7590fe0d8 · 2026-04-08T21:09:46.000+08:00
diff --git a/openkb/images.py b/openkb/images.py
@@ -36,92 +36,88 @@ def extract_pdf_images(pdf_path: Path, doc_name: str, images_dir: Path) -> dict[
     page_images: dict[int, list[str]] = {}
     img_counter = 0
 
-    doc = pymupdf.open(str(pdf_path))
-    for page_idx in range(len(doc)):
-        page = doc[page_idx]
-        page_num = page_idx + 1
-
-        for block in page.get_text("dict")["blocks"]:
-            if block["type"] != 1:  # not an image block
-                continue
-
-            width = block.get("width", 0)
-            height = block.get("height", 0)
-            if width < _MIN_IMAGE_DIM or height < _MIN_IMAGE_DIM:
-                continue
-
-            image_bytes = block.get("image")
-            if not image_bytes:
-                continue
-
-            try:
-                pix = pymupdf.Pixmap(image_bytes)
-                if pix.n > 4:
-                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
-                img_counter += 1
-                filename = f"p{page_num}_img{img_counter}.png"
-                save_path = images_dir / filename
-                pix.save(str(save_path))
-                pix = None
-            except Exception:
-                logger.warning("Failed to save image block on page %d", page_num)
-                continue
-
-            rel_path = f"images/{doc_name}/{filename}"
-            page_images.setdefault(page_num, []).append(rel_path)
-
-    doc.close()
-    return page_images
-
-
-def convert_pdf_with_images(pdf_path: Path, doc_name: str, images_dir: Path) -> str:
-    """Convert a PDF to markdown with inline images using pymupdf dict-mode.
-
-    Iterates blocks in reading order per page. Text blocks become text,
-    image blocks are saved to disk and replaced with ``![image](path)``
-    inline — preserving the original position in the document.
+    with pymupdf.open(str(pdf_path)) as doc:
+        for page_idx in range(len(doc)):
+            page = doc[page_idx]
+            page_num = page_idx + 1
 
-    Returns the full markdown string.
-    """
-    images_dir.mkdir(parents=True, exist_ok=True)
-    parts: list[str] = []
-    img_counter = 0
+            for block in page.get_text("dict")["blocks"]:
+                if block["type"] != 1:  # not an image block
+                    continue
 
-    doc = pymupdf.open(str(pdf_path))
-    for page_idx in range(len(doc)):
-        page = doc[page_idx]
-        page_num = page_idx + 1
-        parts.append(f"\n\n<!-- Page {page_num} -->\n")
-
-        for block in page.get_text("dict")["blocks"]:
-            if block["type"] == 0:  # text block
-                lines = []
-                for line in block["lines"]:
-                    spans_text = "".join(span["text"] for span in line["spans"])
-                    lines.append(spans_text)
-                parts.append("\n".join(lines))
-
-            elif block["type"] == 1:  # image block
                 width = block.get("width", 0)
                 height = block.get("height", 0)
                 if width < _MIN_IMAGE_DIM or height < _MIN_IMAGE_DIM:
                     continue
+
                 image_bytes = block.get("image")
                 if not image_bytes:
                     continue
+
                 try:
                     pix = pymupdf.Pixmap(image_bytes)
                     if pix.n > 4:
                         pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                     img_counter += 1
                     filename = f"p{page_num}_img{img_counter}.png"
-                    (images_dir / filename).write_bytes(pix.tobytes("png"))
+                    save_path = images_dir / filename
+                    pix.save(str(save_path))
                     pix = None
-                    parts.append(f"\n![image](images/{doc_name}/{filename})\n")
                 except Exception:
                     logger.warning("Failed to save image block on page %d", page_num)
+                    continue
+
+                rel_path = f"images/{doc_name}/{filename}"
+                page_images.setdefault(page_num, []).append(rel_path)
+    return page_images
+
+
+def convert_pdf_with_images(pdf_path: Path, doc_name: str, images_dir: Path) -> str:
+    """Convert a PDF to markdown with inline images using pymupdf dict-mode.
+
+    Iterates blocks in reading order per page. Text blocks become text,
+    image blocks are saved to disk and replaced with ``![image](path)``
+    inline — preserving the original position in the document.
+
+    Returns the full markdown string.
+    """
+    images_dir.mkdir(parents=True, exist_ok=True)
+    parts: list[str] = []
+    img_counter = 0
 
-    doc.close()
+    with pymupdf.open(str(pdf_path)) as doc:
+        for page_idx in range(len(doc)):
+            page = doc[page_idx]
+            page_num = page_idx + 1
+            parts.append(f"\n\n<!-- Page {page_num} -->\n")
+
+            for block in page.get_text("dict")["blocks"]:
+                if block["type"] == 0:  # text block
+                    lines = []
+                    for line in block["lines"]:
+                        spans_text = "".join(span["text"] for span in line["spans"])
+                        lines.append(spans_text)
+                    parts.append("\n".join(lines))
+
+                elif block["type"] == 1:  # image block
+                    width = block.get("width", 0)
+                    height = block.get("height", 0)
+                    if width < _MIN_IMAGE_DIM or height < _MIN_IMAGE_DIM:
+                        continue
+                    image_bytes = block.get("image")
+                    if not image_bytes:
+                        continue
+                    try:
+                        pix = pymupdf.Pixmap(image_bytes)
+                        if pix.n > 4:
+                            pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
+                        img_counter += 1
+                        filename = f"p{page_num}_img{img_counter}.png"
+                        (images_dir / filename).write_bytes(pix.tobytes("png"))
+                        pix = None
+                        parts.append(f"\n![image](images/{doc_name}/{filename})\n")
+                    except Exception:
+                        logger.warning("Failed to save image block on page %d", page_num)
     return "\n".join(parts)