Skip to content

Commit 25b7590

Browse files
KylinMountain authored and rejojer committed
fix: use context manager for pymupdf.open
1 parent 4ddef69 commit 25b7590

1 file changed

Lines changed: 63 additions & 67 deletions

File tree

openkb/images.py

Lines changed: 63 additions & 67 deletions
Original file line number | Diff line number | Diff line change
@@ -36,92 +36,88 @@ def extract_pdf_images(pdf_path: Path, doc_name: str, images_dir: Path) -> dict[
3636
page_images: dict[int, list[str]] = {}
3737
img_counter = 0
3838

39-
doc = pymupdf.open(str(pdf_path))
40-
for page_idx in range(len(doc)):
41-
page = doc[page_idx]
42-
page_num = page_idx + 1
43-
44-
for block in page.get_text("dict")["blocks"]:
45-
if block["type"] != 1: # not an image block
46-
continue
47-
48-
width = block.get("width", 0)
49-
height = block.get("height", 0)
50-
if width < _MIN_IMAGE_DIM or height < _MIN_IMAGE_DIM:
51-
continue
52-
53-
image_bytes = block.get("image")
54-
if not image_bytes:
55-
continue
56-
57-
try:
58-
pix = pymupdf.Pixmap(image_bytes)
59-
if pix.n > 4:
60-
pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
61-
img_counter += 1
62-
filename = f"p{page_num}_img{img_counter}.png"
63-
save_path = images_dir / filename
64-
pix.save(str(save_path))
65-
pix = None
66-
except Exception:
67-
logger.warning("Failed to save image block on page %d", page_num)
68-
continue
69-
70-
rel_path = f"images/{doc_name}/{filename}"
71-
page_images.setdefault(page_num, []).append(rel_path)
72-
73-
doc.close()
74-
return page_images
75-
76-
77-
def convert_pdf_with_images(pdf_path: Path, doc_name: str, images_dir: Path) -> str:
78-
"""Convert a PDF to markdown with inline images using pymupdf dict-mode.
79-
80-
Iterates blocks in reading order per page. Text blocks become text,
81-
image blocks are saved to disk and replaced with ``![image](path)``
82-
inline — preserving the original position in the document.
39+
with pymupdf.open(str(pdf_path)) as doc:
40+
for page_idx in range(len(doc)):
41+
page = doc[page_idx]
42+
page_num = page_idx + 1
8343

84-
Returns the full markdown string.
85-
"""
86-
images_dir.mkdir(parents=True, exist_ok=True)
87-
parts: list[str] = []
88-
img_counter = 0
44+
for block in page.get_text("dict")["blocks"]:
45+
if block["type"] != 1: # not an image block
46+
continue
8947

90-
doc = pymupdf.open(str(pdf_path))
91-
for page_idx in range(len(doc)):
92-
page = doc[page_idx]
93-
page_num = page_idx + 1
94-
parts.append(f"\n\n<!-- Page {page_num} -->\n")
95-
96-
for block in page.get_text("dict")["blocks"]:
97-
if block["type"] == 0: # text block
98-
lines = []
99-
for line in block["lines"]:
100-
spans_text = "".join(span["text"] for span in line["spans"])
101-
lines.append(spans_text)
102-
parts.append("\n".join(lines))
103-
104-
elif block["type"] == 1: # image block
10548
width = block.get("width", 0)
10649
height = block.get("height", 0)
10750
if width < _MIN_IMAGE_DIM or height < _MIN_IMAGE_DIM:
10851
continue
52+
10953
image_bytes = block.get("image")
11054
if not image_bytes:
11155
continue
56+
11257
try:
11358
pix = pymupdf.Pixmap(image_bytes)
11459
if pix.n > 4:
11560
pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
11661
img_counter += 1
11762
filename = f"p{page_num}_img{img_counter}.png"
118-
(images_dir / filename).write_bytes(pix.tobytes("png"))
63+
save_path = images_dir / filename
64+
pix.save(str(save_path))
11965
pix = None
120-
parts.append(f"\n![image](images/{doc_name}/{filename})\n")
12166
except Exception:
12267
logger.warning("Failed to save image block on page %d", page_num)
68+
continue
69+
70+
rel_path = f"images/{doc_name}/{filename}"
71+
page_images.setdefault(page_num, []).append(rel_path)
72+
return page_images
73+
74+
75+
def convert_pdf_with_images(pdf_path: Path, doc_name: str, images_dir: Path) -> str:
76+
"""Convert a PDF to markdown with inline images using pymupdf dict-mode.
77+
78+
Iterates blocks in reading order per page. Text blocks become text,
79+
image blocks are saved to disk and replaced with ``![image](path)``
80+
inline — preserving the original position in the document.
81+
82+
Returns the full markdown string.
83+
"""
84+
images_dir.mkdir(parents=True, exist_ok=True)
85+
parts: list[str] = []
86+
img_counter = 0
12387

124-
doc.close()
88+
with pymupdf.open(str(pdf_path)) as doc:
89+
for page_idx in range(len(doc)):
90+
page = doc[page_idx]
91+
page_num = page_idx + 1
92+
parts.append(f"\n\n<!-- Page {page_num} -->\n")
93+
94+
for block in page.get_text("dict")["blocks"]:
95+
if block["type"] == 0: # text block
96+
lines = []
97+
for line in block["lines"]:
98+
spans_text = "".join(span["text"] for span in line["spans"])
99+
lines.append(spans_text)
100+
parts.append("\n".join(lines))
101+
102+
elif block["type"] == 1: # image block
103+
width = block.get("width", 0)
104+
height = block.get("height", 0)
105+
if width < _MIN_IMAGE_DIM or height < _MIN_IMAGE_DIM:
106+
continue
107+
image_bytes = block.get("image")
108+
if not image_bytes:
109+
continue
110+
try:
111+
pix = pymupdf.Pixmap(image_bytes)
112+
if pix.n > 4:
113+
pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
114+
img_counter += 1
115+
filename = f"p{page_num}_img{img_counter}.png"
116+
(images_dir / filename).write_bytes(pix.tobytes("png"))
117+
pix = None
118+
parts.append(f"\n![image](images/{doc_name}/{filename})\n")
119+
except Exception:
120+
logger.warning("Failed to save image block on page %d", page_num)
125121
return "\n".join(parts)
126122

127123

0 commit comments

Comments (0)