File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -19,16 +19,14 @@ export const escapeURL = (url) => {
1919 return s ;
2020} ;
2121
/**
 * Sniffs the character encoding declared inside an HTML/XML document.
 *
 * The bytes are decoded as UTF-8 (encoding labels are plain ASCII, so they
 * survive even when the document itself uses another encoding) and scanned
 * for an `encoding=...` declaration first, then a `charset=...` one.
 * `encoding` therefore wins when both are present, regardless of their order.
 *
 * The value may be double-quoted, single-quoted or unquoted, which also
 * covers `content="text/html; charset=Shift_JIS"`. Attribute names are
 * matched case-insensitively.
 *
 * @param {Uint8Array} bin - Raw document bytes.
 * @returns {string} The declared encoding label exactly as written in the
 *   document, or "utf-8" when no plausible declaration is found.
 */
export const getEncodingFromHTML = (bin) => {
  // Longer captures are treated as false positives rather than real labels.
  const MAX_LABEL_LENGTH = 20;
  // Decode once; the text does not change between attribute names.
  const text = new TextDecoder().decode(bin);
  for (const name of ["encoding", "charset"]) {
    // Stop at the first quote/whitespace/delimiter so the capture cannot run
    // on into later attributes on the same line (a greedy ".+" would).
    const found = text.match(new RegExp(`${name}=["']?([^"'\\s;<>]+)`, "i"));
    if (!found || found[1].length > MAX_LABEL_LENGTH) continue;
    return found[1];
  }
  return "utf-8";
};
3331
3432const fetchText = async ( url ) => {
@@ -38,7 +36,7 @@ const fetchText = async (url) => {
3836 } ,
3937 } ;
4038 const bin = await fetchBin ( url , opt ) ;
41- const cset = getCharsetFromHTML ( bin ) ;
39+ const cset = getEncodingFromHTML ( bin ) ;
4240 //console.log(cset);
4341 //const text = SJIS.decodeAuto(bin);
4442 //const text = new TextDecoder("euc-jp").decode(bin);
Original file line number Diff line number Diff line change 11import * as t from "https://deno.land/std/testing/asserts.ts" ;
2- import { escapeURL , fetchOrLoad } from "../fetchOrLoad.js" ;
2+ import { escapeURL , fetchOrLoad , getEncodingFromHTML } from "../fetchOrLoad.js" ;
33
44const make = ( n , len = 200 ) => {
55 const ss = [ ] ;
@@ -30,3 +30,10 @@ Deno.test("as browser", async () => {
3030 const html = await fetchOrLoad ( url ) ;
3131 t . assert ( html . indexOf ( "403 Forbidden" ) !== - 1 ) ;
3232} ) ;
// Checks which declaration getEncodingFromHTML picks up from a document.
Deno.test("getEncodingFromHTML", () => {
  const e = (s) => new TextEncoder().encode(s);
  // assertEquals compares the values; t.assert(value, msg) would only check
  // that `value` is truthy and treat the second argument as a message.
  t.assertEquals(getEncodingFromHTML(e(`<html encoding="UTF-8">`)), "UTF-8");
  t.assertEquals(getEncodingFromHTML(e(`<html charset="UTF-8">`)), "UTF-8");
  // With both attributes present, `encoding` is expected to win in either order.
  t.assertEquals(getEncodingFromHTML(e(`<html encoding="UTF-a" charset="UTF-b">`)), "UTF-a");
  t.assertEquals(getEncodingFromHTML(e(`<html charset="UTF-b" encoding="UTF-a">`)), "UTF-a");
});
You can’t perform that action at this time.
0 commit comments