Skip to content

Commit 375f3b2

Browse files
committed
improve getEncodingFromHTML
1 parent 74338fd commit 375f3b2

2 files changed

Lines changed: 16 additions & 11 deletions

File tree

fetchOrLoad.js

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,14 @@ export const escapeURL = (url) => {
1919
return s;
2020
};
2121

/**
 * Detect the text encoding declared inside an HTML/XML document.
 *
 * The bytes are decoded with the default TextDecoder (UTF-8) and searched for
 * an `encoding="..."` declaration first, then a `charset="..."` one. The first
 * plausible value (at most 20 characters) is returned exactly as written.
 *
 * @param {BufferSource} bin - Raw document bytes, as accepted by TextDecoder#decode.
 * @returns {string} The declared encoding label, or "utf-8" when none is found.
 */
export const getEncodingFromHTML = (bin) => {
  // Decode once; the text does not depend on which attribute name is searched.
  const s = new TextDecoder().decode(bin);
  for (const name of ["encoding", "charset"]) {
    // Stop at the closing quote (and at line breaks) so attributes that follow
    // on the same line are not swallowed into the captured value.
    const cs = s.match(new RegExp(`${name}="([^"\\r\\n]+)"`));
    // Overlong captures are treated as noise rather than a real encoding label.
    if (!cs || cs[1].length > 20) continue;
    return cs[1];
  }
  return "utf-8";
};
3331

3432
const fetchText = async (url) => {
@@ -38,7 +36,7 @@ const fetchText = async (url) => {
3836
},
3937
};
4038
const bin = await fetchBin(url, opt);
41-
const cset = getCharsetFromHTML(bin);
39+
const cset = getEncodingFromHTML(bin);
4240
//console.log(cset);
4341
//const text = SJIS.decodeAuto(bin);
4442
//const text = new TextDecoder("euc-jp").decode(bin);

test/fetchOrLoad.test.js

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import * as t from "https://deno.land/std/testing/asserts.ts";
2-
import { escapeURL, fetchOrLoad } from "../fetchOrLoad.js";
2+
import { escapeURL, fetchOrLoad, getEncodingFromHTML } from "../fetchOrLoad.js";
33

44
const make = (n, len = 200) => {
55
const ss = [];
@@ -30,3 +30,10 @@ Deno.test("as browser", async () => {
3030
const html = await fetchOrLoad(url);
3131
t.assert(html.indexOf("403 Forbidden") !== -1);
3232
});
33+
// Verifies that getEncodingFromHTML returns the declared encoding and that
// `encoding` takes precedence over `charset` regardless of attribute order.
Deno.test("getEncodingFromHTML", () => {
  const e = (s) => new TextEncoder().encode(s);
  // assertEquals compares actual against expected; t.assert(value, msg) would
  // only check truthiness and treat the second argument as a failure message.
  t.assertEquals(getEncodingFromHTML(e(`<html encoding="UTF-8">`)), "UTF-8");
  t.assertEquals(getEncodingFromHTML(e(`<html charset="UTF-8">`)), "UTF-8");
  t.assertEquals(getEncodingFromHTML(e(`<html encoding="UTF-a" charset="UTF-b">`)), "UTF-a");
  t.assertEquals(getEncodingFromHTML(e(`<html charset="UTF-b" encoding="UTF-a">`)), "UTF-a");
});

0 commit comments

Comments
 (0)