Skip to content

Commit 04b98dc

Browse files
committed
Add an HTML parser and URL extractor
This will be used for adding discovered URLs to the crawl queue.
1 parent 1c832a4 commit 04b98dc

4 files changed

Lines changed: 116 additions & 3 deletions

File tree

composer.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@
2929
"ext-mbstring": "*",
3030
"ext-simplexml": "*",
3131
"lib-libxml": "*",
32-
"yahnis-elsts/plugin-update-checker": "^5.6"
32+
"yahnis-elsts/plugin-update-checker": "^5.6",
33+
"masterminds/html5": "^2.9"
3334
},
3435
"require-dev": {
3536
"thecodingmachine/phpstan-strict-rules": "*",

composer.lock

Lines changed: 68 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

flake.nix

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
let rel = baseNameOf path;
2323
in rel == "composer.json" || rel == "composer.lock";
2424
};
25-
vendorHash = "sha256-lN/24P9LbuRuU/bc3DNPR0kZGtIhZfWcNqeL5puXj2w=";
25+
vendorHash = "sha256-4DTiW/rykJGZAnWCRSqlAaBtDthXKkZshPNe3yYCfA4=";
2626
});
2727
wp2static = runCommand "wp2static" {} ''
2828
export PLUGIN_DIR="$TMPDIR/${name}"

src/ParseHTML.php

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
<?php
2+
3+
namespace WP2Static;
4+
5+
class ParseHTML {
    /**
     * Return URLs found in the descendants of a DOMNode, recursively.
     *
     * Walks the element children of $node in document order. For each
     * `a` / `link` element the `href` attribute is yielded, and for each
     * `img` / `script` / `source` element the `src` attribute is yielded.
     * Elements that do not carry the relevant attribute (or carry it with
     * an empty value), such as inline `<script>` blocks or named anchors,
     * are skipped instead of yielding an empty string.
     *
     * Only descendants are inspected; $node itself is never matched.
     * URLs are yielded exactly as written in the markup (no resolving,
     * trimming or de-duplication).
     *
     * @param \DOMNode $node Node whose descendants are searched.
     * @return \Iterator<string> Non-empty attribute values, in document order.
     */
    public static function parseURLsDOMNode( \DOMNode $node ) : \Iterator {
        foreach ( $node->childNodes as $child ) {
            // Text, comment and other non-element nodes carry no attributes
            // and are not descended into.
            if ( ! $child instanceof \DOMElement ) {
                continue;
            }

            switch ( strtolower( $child->tagName ) ) {
                case 'a':
                case 'link':
                    $attribute = 'href';
                    break;
                case 'img':
                case 'script':
                case 'source':
                    $attribute = 'src';
                    break;
                default:
                    $attribute = null;
            }

            if ( $attribute !== null ) {
                // getAttribute() returns '' when the attribute is missing,
                // so an empty value means there is no URL to report.
                $url = $child->getAttribute( $attribute );
                if ( $url !== '' ) {
                    yield $url;
                }
            }

            // Re-yield one value at a time rather than using `yield from`:
            // `yield from` preserves the inner generator's keys, which would
            // produce duplicate keys for callers using iterator_to_array().
            foreach ( self::parseURLsDOMNode( $child ) as $nested_url ) {
                yield $nested_url;
            }
        }
    }

    /**
     * Return an iterator of URLs parsed from the provided HTML.
     *
     * The markup is parsed with the Masterminds HTML5 parser and the
     * resulting document is handed to parseURLsDOMNode().
     *
     * @param string $html HTML markup to scan.
     * @return \Iterator<string> URLs found in the markup, in document order.
     */
    public static function parseURLsString( string $html ) : \Iterator {
        $html5 = new \Masterminds\HTML5();
        $dom = $html5->loadHTML( $html );
        return self::parseURLsDOMNode( $dom );
    }
}

0 commit comments

Comments
 (0)