11import os
2+ from pathlib import Path
23from ps_helper .pdf .pdf_analyzer import PDFAnalyzer
34
4- LOCAL_PDF_PATH = "test_files/scansmpl.pdf"
# Directory containing this test module; every fixture path below is anchored
# to it instead of being written as a bare relative path.
TEST_DIR = Path(__file__).parent

# Scanned sample PDF used by the local-file test.
LOCAL_PDF_PATH = str(TEST_DIR.joinpath("test_files/scansmpl.pdf"))

# Small public PDF used by the remote-download test.
REMOTE_PDF_URL = (
    "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
)

# Electricity-bill fixtures, one per utility company.
PLUZ_PDF_PATH = str(TEST_DIR.joinpath("test_files/recibo_enel.pdf"))
LUZ_DEL_SUR_PDF_PATH = str(TEST_DIR.joinpath("test_files/recibo_luzdelsur.pdf"))
SEAL_PDF_PATH = str(TEST_DIR.joinpath("test_files/recibo_seal.pdf"))
16+
1017
1118def test_local_pdf ():
1219 print ("\n ===== Test: Local PDF =====" )
@@ -32,6 +39,103 @@ def test_remote_pdf():
3239 print (batch_result )
3340
3441
def test_pluz_receipt():
    """Verify text extraction for the Pluz Energía bill (recibo_enel.pdf).

    The fixture is a two-page, text-based PDF, so extraction must succeed
    without OCR being used even though OCR is enabled on the analyzer.
    """
    extraction = PDFAnalyzer(
        ocr_enabled=True, ocr_language="eng+spa"
    ).extract_text_from_pdf(PLUZ_PDF_PATH)

    # Document-level metadata reported by the analyzer.
    assert extraction["success"] is True
    assert extraction["error"] is None
    assert extraction["total_pages"] == 2
    assert extraction["pages_with_text"] > 0
    assert extraction["ocr_used"] is False  # embedded text, no OCR needed

    # Literal fragments that must appear somewhere in the extracted text.
    expected_fragments = (
        "0177339",  # supply number
        "S820-0005693589",  # receipt number
        "20269985900",  # Pluz Energía RUC
        "01065731",  # meter number
        "763",  # consumption in kWh
        "0.6119",  # price per kWh
        "466.88",  # energy charge
        "604.61",  # current month total
        "613.50",  # total due
        "BT5B",  # tariff
        "03/MAR/2026",  # due date
        "16/FEB/2026",  # issue date
    )
    body = extraction["text"]
    for fragment in expected_fragments:
        assert fragment in body
66+
67+
def test_luz_del_sur_receipt():
    """Verify text extraction for the Luz del Sur bill (recibo_luzdelsur.pdf).

    The fixture is a single-page, text-based PDF, so extraction must succeed
    without OCR being used even though OCR is enabled on the analyzer.
    """
    extraction = PDFAnalyzer(
        ocr_enabled=True, ocr_language="eng+spa"
    ).extract_text_from_pdf(LUZ_DEL_SUR_PDF_PATH)

    # Document-level metadata reported by the analyzer.
    assert extraction["success"] is True
    assert extraction["error"] is None
    assert extraction["total_pages"] == 1
    assert extraction["pages_with_text"] == 1
    assert extraction["ocr_used"] is False  # embedded text, no OCR needed

    # Literal fragments that must appear somewhere in the extracted text.
    expected_fragments = (
        "1536584",  # supply number
        "S106-639824",  # receipt number
        "20331898008",  # Luz del Sur RUC
        "3296148",  # meter number
        "MIRANDA CARDENAS CARLOS AUGUSTO",  # account holder
        "09133544",  # DNI (national ID)
        "541.30",  # consumption in kWh
        "0.5979",  # price per kWh
        "323.64",  # energy consumption charge
        "428.00",  # total due
        "BT5B",  # tariff
        "26-Feb-2026",  # due date
        "11-Feb-2026",  # issue date
    )
    body = extraction["text"]
    for fragment in expected_fragments:
        assert fragment in body
93+
94+
def test_seal_receipt():
    """Verify text extraction for the SEAL (Sociedad Eléctrica del Sur Oeste) bill.

    The fixture is a two-page, text-based PDF.

    Note: amounts use a comma as the decimal separator (e.g. 163,50 rather
    than 163.50). Page 2 is a near-blank payment stub, so pages_with_text
    may be 1; the check below only requires at least one page with text.
    """
    extraction = PDFAnalyzer(
        ocr_enabled=True, ocr_language="eng+spa"
    ).extract_text_from_pdf(SEAL_PDF_PATH)

    # Document-level metadata reported by the analyzer.
    assert extraction["success"] is True
    assert extraction["error"] is None
    assert extraction["total_pages"] == 2
    assert extraction["pages_with_text"] >= 1
    assert extraction["ocr_used"] is False  # embedded text, no OCR needed

    # Literal fragments that must appear somewhere in the extracted text.
    expected_fragments = (
        "109134",  # contract number
        "34818910",  # receipt number
        "SE0134",  # electrical system (RUC is in header image, not text)
        "MENDOZA CONDORI GENARA",  # account holder
        "AREQUIPA",  # province
        "177,00",  # consumption in kWh
        "0,6801",  # price per kWh
        "120,38",  # energy charge
        "163,50",  # total due
        "BT5B",  # tariff
        "02/02/2026",  # issue date
        "17/02/2026",  # due date
    )
    body = extraction["text"]
    for fragment in expected_fragments:
        assert fragment in body
123+
124+
def test_receipts_batch():
    """Process all three receipts in one extract_text_batch call.

    OCR is disabled here; each result must report success and contain the
    identifying number of the receipt at the same position in the input list.
    """
    batch = PDFAnalyzer(ocr_enabled=False).extract_text_batch(
        [PLUZ_PDF_PATH, LUZ_DEL_SUR_PDF_PATH, SEAL_PDF_PATH]
    )

    assert len(batch) == 3
    assert all(entry["success"] for entry in batch)

    # One identifying number per receipt, in the same order as the inputs:
    # Pluz supply number, Luz del Sur supply number, SEAL contract number.
    identifiers = ("0177339", "1536584", "109134")
    for entry, identifier in zip(batch, identifiers):
        assert identifier in entry["text"]
137+
138+
35139if __name__ == "__main__" :
36140 if not os .path .exists (LOCAL_PDF_PATH ):
37141 print (f"PDF not found. Invalid path { LOCAL_PDF_PATH } " )
0 commit comments