Skip to content

Commit 75d879d

Browse files
committed
✅ test: add tests and test files for providers: ENEL, Luz del Sur, SEAL
1 parent 87afa59 commit 75d879d

5 files changed

Lines changed: 110 additions & 1 deletion

File tree

pyproject.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,8 @@ where = ["src"]
3535

3636
[tool.pytest.ini_options]
3737
pythonpath = ["src"]
38+
39+
[dependency-groups]
40+
dev = [
41+
"pytest>=8.4.2",
42+
]

tests/pdf_analyzer_test.py

Lines changed: 105 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,19 @@
11
import os
2+
from pathlib import Path
23
from ps_helper.pdf.pdf_analyzer import PDFAnalyzer
34

4-
LOCAL_PDF_PATH = "test_files/scansmpl.pdf"
5+
TEST_DIR = Path(__file__).parent
6+
7+
LOCAL_PDF_PATH = str(TEST_DIR / "test_files/scansmpl.pdf")
58

69
REMOTE_PDF_URL = (
710
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
811
)
912

13+
PLUZ_PDF_PATH = str(TEST_DIR / "test_files/recibo_enel.pdf")
14+
LUZ_DEL_SUR_PDF_PATH = str(TEST_DIR / "test_files/recibo_luzdelsur.pdf")
15+
SEAL_PDF_PATH = str(TEST_DIR / "test_files/recibo_seal.pdf")
16+
1017

1118
def test_local_pdf():
1219
print("\n===== Test: Local PDF =====")
@@ -32,6 +39,103 @@ def test_remote_pdf():
3239
print(batch_result)
3340

3441

42+
def test_pluz_receipt():
43+
"""Pluz Energía bill (recibo_enel.pdf) — text-based, 2 pages."""
44+
analyzer = PDFAnalyzer(ocr_enabled=True, ocr_language="eng+spa")
45+
result = analyzer.extract_text_from_pdf(PLUZ_PDF_PATH)
46+
47+
assert result["success"] is True
48+
assert result["error"] is None
49+
assert result["total_pages"] == 2
50+
assert result["pages_with_text"] > 0
51+
assert result["ocr_used"] is False # embedded text, no OCR needed
52+
53+
text = result["text"]
54+
assert "0177339" in text # N° suministro
55+
assert "S820-0005693589" in text # N° recibo
56+
assert "20269985900" in text # RUC Pluz Energía
57+
assert "01065731" in text # N° medidor
58+
assert "763" in text # consumo kWh
59+
assert "0.6119" in text # precio kWh
60+
assert "466.88" in text # cargo por energía
61+
assert "604.61" in text # total mes actual
62+
assert "613.50" in text # total a pagar
63+
assert "BT5B" in text # tarifa
64+
assert "03/MAR/2026" in text # vencimiento
65+
assert "16/FEB/2026" in text # emisión
66+
67+
68+
def test_luz_del_sur_receipt():
69+
"""Luz del Sur bill (recibo_luzdelsur.pdf) — text-based, 1 page."""
70+
analyzer = PDFAnalyzer(ocr_enabled=True, ocr_language="eng+spa")
71+
result = analyzer.extract_text_from_pdf(LUZ_DEL_SUR_PDF_PATH)
72+
73+
assert result["success"] is True
74+
assert result["error"] is None
75+
assert result["total_pages"] == 1
76+
assert result["pages_with_text"] == 1
77+
assert result["ocr_used"] is False # embedded text, no OCR needed
78+
79+
text = result["text"]
80+
assert "1536584" in text # N° suministro
81+
assert "S106-639824" in text # N° recibo
82+
assert "20331898008" in text # RUC Luz del Sur
83+
assert "3296148" in text # N° medidor
84+
assert "MIRANDA CARDENAS CARLOS AUGUSTO" in text # titular
85+
assert "09133544" in text # DNI
86+
assert "541.30" in text # consumo kWh
87+
assert "0.5979" in text # precio kWh
88+
assert "323.64" in text # consumo de energía
89+
assert "428.00" in text # total a pagar
90+
assert "BT5B" in text # tarifa
91+
assert "26-Feb-2026" in text # vencimiento
92+
assert "11-Feb-2026" in text # emisión
93+
94+
95+
def test_seal_receipt():
96+
"""SEAL (Sociedad Eléctrica del Sur Oeste) bill — text-based, 2 pages.
97+
98+
Note: uses comma as decimal separator (e.g. 163,50 not 163.50).
99+
Page 2 is a near-blank payment stub so pages_with_text may be 1.
100+
"""
101+
analyzer = PDFAnalyzer(ocr_enabled=True, ocr_language="eng+spa")
102+
result = analyzer.extract_text_from_pdf(SEAL_PDF_PATH)
103+
104+
assert result["success"] is True
105+
assert result["error"] is None
106+
assert result["total_pages"] == 2
107+
assert result["pages_with_text"] >= 1
108+
assert result["ocr_used"] is False # embedded text, no OCR needed
109+
110+
text = result["text"]
111+
assert "109134" in text # N° contrato
112+
assert "34818910" in text # N° recibo
113+
assert "SE0134" in text # sistema eléctrico (RUC is in header image, not text)
114+
assert "MENDOZA CONDORI GENARA" in text # titular
115+
assert "AREQUIPA" in text # provincia
116+
assert "177,00" in text # consumo kWh
117+
assert "0,6801" in text # precio kWh
118+
assert "120,38" in text # cargo energía
119+
assert "163,50" in text # total a pagar
120+
assert "BT5B" in text # tarifa
121+
assert "02/02/2026" in text # emisión
122+
assert "17/02/2026" in text # vencimiento
123+
124+
125+
def test_receipts_batch():
126+
"""All three receipts processed together via extract_text_batch."""
127+
analyzer = PDFAnalyzer(ocr_enabled=False)
128+
results = analyzer.extract_text_batch([PLUZ_PDF_PATH, LUZ_DEL_SUR_PDF_PATH, SEAL_PDF_PATH])
129+
130+
assert len(results) == 3
131+
assert all(r["success"] for r in results)
132+
133+
pluz, lds, seal = results
134+
assert "0177339" in pluz["text"]
135+
assert "1536584" in lds["text"]
136+
assert "109134" in seal["text"]
137+
138+
35139
if __name__ == "__main__":
36140
if not os.path.exists(LOCAL_PDF_PATH):
37141
print(f"PDF not found. Invalid path {LOCAL_PDF_PATH}")

tests/test_files/recibo_enel.pdf

607 KB
Binary file not shown.

tests/test_files/recibo_luzdelsur.pdf

97.6 KB
Binary file not shown.

tests/test_files/recibo_seal.pdf

311 KB
Binary file not shown.

0 commit comments

Comments
 (0)