Skip to content

Commit f69b252

Browse files
committed
Add discovery of linked URLs during crawling
This works but it doesn't loop around, so new URLs may not be crawled until the next job run.
1 parent 04b98dc commit f69b252

3 files changed

Lines changed: 112 additions & 0 deletions

File tree

src/Crawler.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,12 +107,14 @@ public static function wp2staticCrawl( string $crawler_slug ) : void {
107107
if ( 'wp2static' === $crawler_slug ) {
108108
$paths = CrawlQueue::getPathsIter();
109109
$crawler = new Crawler();
110+
$url_discovery = new URLDiscovery();
110111
$crawled = $crawler->crawlIter( $paths );
111112
$crawled = CrawlCache::remove404s( $crawled );
112113
$crawled = CrawlCache::writeFilesIter( $crawled );
113114
if ( $crawler->use_crawl_cache ) {
114115
$crawled = CrawlCache::addPathsIter( $crawled );
115116
}
117+
$crawled = $url_discovery->discoverURLs( $crawled );
116118
foreach ( $crawled as $crawled ) {}
117119
$crawler->crawlComplete();
118120
}

src/DirectDeployer.php

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ class DirectDeployer {
1111
private $crawler;
1212
private $deployer;
1313
private $processor;
14+
private $url_discovery;
1415
private $use_crawl_cache;
1516

1617
public function __construct() {
@@ -30,6 +31,7 @@ public function __construct() {
3031

3132
$this->deployer = new $deployer_class();
3233
$this->crawler = new Crawler();
34+
$this->url_discovery = new URLDiscovery();
3335
$this->processor = new PostProcessor();
3436

3537
$this->use_crawl_cache = CoreOptions::getValue( 'useCrawlCaching' );
@@ -57,6 +59,8 @@ public function deployPaths( \Iterator $paths ) : void {
5759
$crawled = CrawlCache::addPathsIter( $crawled );
5860
}
5961

62+
$crawled = $this->url_discovery->discoverURLs( $crawled );
63+
6064
$processed = $this->processor->processIter( $crawled );
6165
$this->deployer->uploadFilesIter( $processed );
6266
}

src/URLDiscovery.php

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
<?php
2+
3+
namespace WP2Static;
4+
5+
use WP2Static\FileFiltering;
6+
use WP2Static\ParseHTML;
7+
8+
/**
 * Finds links in crawled HTML that point at the deployment host and adds
 * them to the crawl queue table.
 *
 * discoverURLs() is a pass-through stage: every crawled item it receives is
 * yielded again unchanged, so it can be chained with the other iterator steps.
 */
class URLDiscovery {
    /** Host component parsed from $destination_url. */
    private string $destination_host;

    /**
     * The 'deploymentURL' core option after the
     * 'wp2static_set_destination_url' filter, without a trailing slash.
     */
    private string $destination_url;

    /**
     * Set to true once discoverURLs() has consumed its whole input.
     * NOTE(review): nothing in this class reads the flag yet.
     */
    private bool $discover_complete = false;

    /** Decides whether a path looks crawlable (see isURLLocal()). */
    private FileFiltering $file_filtering;

    public function __construct( ) {
        $this->destination_url = untrailingslashit(
            apply_filters(
                'wp2static_set_destination_url',
                CoreOptions::getValue( 'deploymentURL' )
            )
        );
        $this->destination_host = \Wa72\Url\Url::parse( $this->destination_url )->getHost();
        $this->file_filtering = new FileFiltering();
    }

    /**
     * Queue the local URLs linked from each crawled HTML item.
     *
     * This is a generator: no work happens until the result is iterated.
     * For an item whose 'content_type' starts with 'text/html', the URLs
     * returned by parseURLs() are de-duplicated and written to the crawl
     * queue table with a single INSERT IGNORE just before that item is
     * yielded. A failed insert is logged and does not interrupt iteration.
     *
     * @param \Iterator $iterator Crawled items (arrays).
     * @return \Iterator The same items, unchanged and in the same order.
     */
    public function discoverURLs( \Iterator $iterator ) : \Iterator {
        global $wpdb;

        $table_name = CrawlQueue::getTableName();

        foreach ( $iterator as $arr ) {
            $is_html = isset( $arr['content_type'] )
                && str_starts_with( $arr['content_type'], 'text/html' );

            if ( $is_html ) {
                // Keyed by URL so each one is inserted at most once per page.
                $urls = [];
                foreach ( $this->parseURLs( $arr ) as $url ) {
                    $urls[ $url ] = true;
                }

                if ( ! empty( $urls ) ) {
                    // One "(%s)" row placeholder per URL; values are bound
                    // through $wpdb->prepare() rather than interpolated.
                    $placeholders = array_fill( 0, count( $urls ), '(%s)' );
                    $sql = "INSERT IGNORE INTO $table_name (url) VALUES " . implode( ',', $placeholders );
                    $result = $wpdb->query( $wpdb->prepare( $sql, ...array_keys( $urls ) ) );
                    if ( false === $result ) {
                        WsLog::w( 'Error inserting into crawl queue: ' . $wpdb->last_error );
                    }
                }
            }

            yield $arr;
        }

        $this->discover_complete = true;
    }

    /**
     * Decide whether a URL should be treated as a crawlable page of this site.
     *
     * A URL with neither path nor host is rejected. A URL without a scheme
     * is first made absolute against $base_url. The result is local when the
     * scheme is http or https, the host check against the destination host
     * passes, the path is non-empty, and FileFiltering::pathLooksCrawlable()
     * accepts the path.
     *
     * @param \Wa72\Url\Url $base_url URL of the page the link was found on.
     * @param \Wa72\Url\Url $url      The link to test.
     * @return bool True when the URL is local and looks crawlable.
     */
    public function isURLLocal( \Wa72\Url\Url $base_url, \Wa72\Url\Url $url ) : bool {
        if ( ! $url->getPath() && ! $url->getHost() ) {
            // fragment-only URL
            return false;
        }

        $scheme = $url->getScheme();

        if ( ! $scheme ) {
            $url = $url->makeAbsolute( $base_url );
            $scheme = $url->getScheme();
        }

        $path = $url->getPath();

        return ( 'http' === $scheme || 'https' === $scheme )
            && $url->equalsHost( $this->destination_host )
            && 0 < strlen( $path )
            && $this->file_filtering->pathLooksCrawlable( $path );
    }

    /**
     * Yield the local URLs found in one crawled item.
     *
     * The HTML is taken from $arr['body'] when set, otherwise read from the
     * file named by $arr['filename']. Nothing is yielded when no content is
     * available. Fragment and query are removed from every link; links that
     * pass isURLLocal() are yielded with host and scheme removed as well.
     *
     * @param array $arr Crawled item; reads 'body', 'filename' and 'path'.
     * @return \Iterator Strings produced by Url::write() for each local link.
     */
    public function parseURLs( array $arr ) : \Iterator {
        $body = null;
        if ( isset( $arr['body'] ) ) {
            $body = $arr['body'];
        } elseif ( isset( $arr['filename'] ) ) {
            $body = file_get_contents( $arr['filename'] );
        }

        // Covers null, an empty body and a failed read (false).
        if ( ! $body ) {
            return;
        }

        // A missing 'path' resolves links against the bare destination URL
        // instead of raising an undefined-key warning.
        $page_url = \Wa72\Url\Url::parse( $this->destination_url . ( $arr['path'] ?? '' ) );

        foreach ( ParseHTML::parseURLsString( $body ) as $url ) {
            $discovered_url = \Wa72\Url\Url::parse( $url );
            $discovered_url->setFragment( '' );
            $discovered_url->setQuery( '' );

            // isURLLocal() inspects scheme and host, so those are cleared
            // only after the check has been made.
            if ( ! $this->isURLLocal( $page_url, $discovered_url ) ) {
                continue;
            }

            $discovered_url->setHost( '' );
            $discovered_url->setScheme( '' );
            yield $discovered_url->write();
        }
    }

}

0 commit comments

Comments
 (0)