-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathscrape_pdf.ts
More file actions
35 lines (29 loc) · 1.11 KB
/
scrape_pdf.ts
File metadata and controls
35 lines (29 loc) · 1.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import { ScrapeGraphAI } from "scrapegraph-js";
// Client setup: with no arguments the API key is read from the SGAI_API_KEY
// environment variable; pass it explicitly with ScrapeGraphAI({ apiKey: "..." }).
const sgai = ScrapeGraphAI();

// Request the sample PDF, declaring its content type and asking for markdown output.
const res = await sgai.scrape({
  url: "https://pdfobject.com/pdf/sample.pdf",
  contentType: "application/pdf",
  formats: [{ type: "markdown" }],
});

if (res.status !== "success") {
  // Failure path: report whatever error the response carries.
  console.error("Failed:", res.error);
} else {
  // `data` is treated as possibly absent, so every read below goes through `?.`.
  const payload = res.data;
  const markdown = payload?.results.markdown;
  const ocrInfo = payload?.metadata.ocr;

  console.log("=== PDF Extraction ===\n");
  console.log("Content Type:", payload?.metadata.contentType);
  console.log("OCR Model:", ocrInfo?.model);
  console.log("Pages Processed:", ocrInfo?.pagesProcessed);

  // Per-page summary, printed only when the OCR metadata includes a page list.
  const pages = ocrInfo?.pages;
  if (pages) {
    for (const { index, dimensions, images, tables, hyperlinks } of pages) {
      // `index` is shown one-based.
      const pageNumber = index + 1;
      console.log(`\nPage ${pageNumber}:`);

      const { width, height, dpi } = dimensions;
      console.log(`  Dimensions: ${width}x${height} @ ${dpi}dpi`);
      console.log(`  Images: ${images.length}`);
      console.log(`  Tables: ${tables.length}`);
      console.log(`  Hyperlinks: ${hyperlinks.length}`);
    }
  }

  console.log("\n=== Extracted Markdown ===\n");
  // Markdown chunks are joined with a blank line between them; if no markdown
  // came back this logs `undefined`, exactly as the optional chain yields it.
  console.log(markdown?.data?.join("\n\n"));
}