@@ -36,92 +36,88 @@ def extract_pdf_images(pdf_path: Path, doc_name: str, images_dir: Path) -> dict[
3636 page_images : dict [int , list [str ]] = {}
3737 img_counter = 0
3838
39- doc = pymupdf .open (str (pdf_path ))
40- for page_idx in range (len (doc )):
41- page = doc [page_idx ]
42- page_num = page_idx + 1
43-
44- for block in page .get_text ("dict" )["blocks" ]:
45- if block ["type" ] != 1 : # not an image block
46- continue
47-
48- width = block .get ("width" , 0 )
49- height = block .get ("height" , 0 )
50- if width < _MIN_IMAGE_DIM or height < _MIN_IMAGE_DIM :
51- continue
52-
53- image_bytes = block .get ("image" )
54- if not image_bytes :
55- continue
56-
57- try :
58- pix = pymupdf .Pixmap (image_bytes )
59- if pix .n > 4 :
60- pix = pymupdf .Pixmap (pymupdf .csRGB , pix )
61- img_counter += 1
62- filename = f"p{ page_num } _img{ img_counter } .png"
63- save_path = images_dir / filename
64- pix .save (str (save_path ))
65- pix = None
66- except Exception :
67- logger .warning ("Failed to save image block on page %d" , page_num )
68- continue
69-
70- rel_path = f"images/{ doc_name } /{ filename } "
71- page_images .setdefault (page_num , []).append (rel_path )
72-
73- doc .close ()
74- return page_images
75-
76-
77- def convert_pdf_with_images (pdf_path : Path , doc_name : str , images_dir : Path ) -> str :
78- """Convert a PDF to markdown with inline images using pymupdf dict-mode.
79-
80- Iterates blocks in reading order per page. Text blocks become text,
81- image blocks are saved to disk and replaced with ````
82- inline — preserving the original position in the document.
39+ with pymupdf .open (str (pdf_path )) as doc :
40+ for page_idx in range (len (doc )):
41+ page = doc [page_idx ]
42+ page_num = page_idx + 1
8343
84- Returns the full markdown string.
85- """
86- images_dir .mkdir (parents = True , exist_ok = True )
87- parts : list [str ] = []
88- img_counter = 0
44+ for block in page .get_text ("dict" )["blocks" ]:
45+ if block ["type" ] != 1 : # not an image block
46+ continue
8947
90- doc = pymupdf .open (str (pdf_path ))
91- for page_idx in range (len (doc )):
92- page = doc [page_idx ]
93- page_num = page_idx + 1
94- parts .append (f"\n \n <!-- Page { page_num } -->\n " )
95-
96- for block in page .get_text ("dict" )["blocks" ]:
97- if block ["type" ] == 0 : # text block
98- lines = []
99- for line in block ["lines" ]:
100- spans_text = "" .join (span ["text" ] for span in line ["spans" ])
101- lines .append (spans_text )
102- parts .append ("\n " .join (lines ))
103-
104- elif block ["type" ] == 1 : # image block
10548 width = block .get ("width" , 0 )
10649 height = block .get ("height" , 0 )
10750 if width < _MIN_IMAGE_DIM or height < _MIN_IMAGE_DIM :
10851 continue
52+
10953 image_bytes = block .get ("image" )
11054 if not image_bytes :
11155 continue
56+
11257 try :
11358 pix = pymupdf .Pixmap (image_bytes )
11459 if pix .n > 4 :
11560 pix = pymupdf .Pixmap (pymupdf .csRGB , pix )
11661 img_counter += 1
11762 filename = f"p{ page_num } _img{ img_counter } .png"
118- (images_dir / filename ).write_bytes (pix .tobytes ("png" ))
63+ save_path = images_dir / filename
64+ pix .save (str (save_path ))
11965 pix = None
120- parts .append (f"\n \n " )
12166 except Exception :
12267 logger .warning ("Failed to save image block on page %d" , page_num )
68+ continue
69+
70+ rel_path = f"images/{ doc_name } /{ filename } "
71+ page_images .setdefault (page_num , []).append (rel_path )
72+ return page_images
73+
74+
def convert_pdf_with_images(pdf_path: Path, doc_name: str, images_dir: Path) -> str:
    """Convert a PDF to markdown with inline images using pymupdf dict-mode.

    Each page contributes a ``<!-- Page N -->`` marker followed by its blocks
    in the order ``page.get_text("dict")`` returns them. Text blocks become
    plain text (one output line per PDF line). Image blocks are written to
    ``images_dir`` as PNG files and replaced, at the same position, by a
    markdown image reference pointing at ``images/<doc_name>/<filename>``.

    Args:
        pdf_path: Path of the PDF file to convert.
        doc_name: Document name used as the sub-directory in the relative
            image links written into the markdown.
        images_dir: Directory the extracted PNG files are written to; it is
            created (including parents) if it does not exist.

    Returns:
        The full markdown string.

    Image blocks smaller than ``_MIN_IMAGE_DIM`` in either dimension, or
    without image data, are skipped. A failure while decoding or saving a
    single image is logged and does not abort the conversion.
    """
    images_dir.mkdir(parents=True, exist_ok=True)
    parts: list[str] = []
    img_counter = 0

    # The context manager closes the document even if a page fails to parse.
    with pymupdf.open(str(pdf_path)) as doc:
        for page_num, page in enumerate(doc, start=1):
            parts.append(f"\n\n<!-- Page {page_num} -->\n")

            for block in page.get_text("dict")["blocks"]:
                block_type = block["type"]

                if block_type == 0:  # text block
                    parts.append(
                        "\n".join(
                            "".join(span["text"] for span in line["spans"])
                            for line in block["lines"]
                        )
                    )
                    continue

                if block_type != 1:  # neither text nor image
                    continue

                # Ignore tiny images (icons, rules, decorative artifacts).
                width = block.get("width", 0)
                height = block.get("height", 0)
                if width < _MIN_IMAGE_DIM or height < _MIN_IMAGE_DIM:
                    continue

                image_bytes = block.get("image")
                if not image_bytes:
                    continue

                try:
                    pix = pymupdf.Pixmap(image_bytes)
                    # PNG supports only gray/RGB (optionally with alpha).
                    # Subtract the alpha channel so plain CMYK (n == 4,
                    # alpha == 0) is converted too, while RGBA is left alone.
                    if pix.n - pix.alpha > 3:
                        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                    img_counter += 1
                    filename = f"p{page_num}_img{img_counter}.png"
                    (images_dir / filename).write_bytes(pix.tobytes("png"))
                    pix = None  # release the pixmap buffer promptly
                except Exception:
                    # Best effort: one bad image must not lose the document.
                    logger.warning(
                        "Failed to save image block on page %d",
                        page_num,
                        exc_info=True,
                    )
                    continue

                # Emit the reference only after the file was written, so the
                # markdown never links to an image that does not exist.
                parts.append(f"\n![{filename}](images/{doc_name}/{filename})\n")

    return "\n".join(parts)
126122
127123
0 commit comments