Skip to content

Commit 7acaf2a

Browse files
committed
Add recursive discovery during crawl job
1 parent 79537b7 commit 7acaf2a

2 files changed

Lines changed: 27 additions & 6 deletions

File tree

src/CrawlQueue.php

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -168,16 +168,17 @@ public static function withPathsIter( \Iterator $paths ) : \Iterator {
168168
/**
169169
* Yields all paths in the table.
170170
*
171+
* @param string $detected_since Only yield paths whose detected_at is at or after this datetime; defaults to '0000-00-00 00:00:00'.
171172
*/
172-
public static function getPathsIter() : \Iterator {
173+
public static function getPathsIter( string $detected_since = '0000-00-00 00:00:00' ) : \Iterator {
173174
global $wpdb;
174175

175176
$table_name = self::getTableName();
176177
$batch_size = 1000;
177178
$last_id = 0;
178179
while ( true ) {
179-
$qs = "SELECT id, url AS path, filename FROM $table_name WHERE id > %d ORDER BY id ASC LIMIT %d";
180-
$q = $wpdb->prepare( $qs, $last_id, $batch_size );
180+
$qs = "SELECT id, url AS path, filename FROM $table_name WHERE id > %d AND detected_at >= %s ORDER BY id ASC LIMIT %d";
181+
$q = $wpdb->prepare( $qs, $last_id, $detected_since, $batch_size );
181182
$rows = $wpdb->get_results( $q, ARRAY_A );
182183

183184
foreach ( $rows as $row ) {

src/Crawler.php

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -104,18 +104,38 @@ public function __construct() {
104104
}
105105

106106
public static function wp2staticCrawl( string $crawler_slug ) : void {
107+
global $wpdb;
108+
107109
if ( 'wp2static' === $crawler_slug ) {
108-
$paths = CrawlQueue::getPathsIter();
109110
$crawler = new Crawler();
110111
$url_discovery = new URLDiscovery();
111-
$crawled = $crawler->crawlIter( $paths );
112+
113+
$detected = CrawlQueue::getPathsIter();
114+
$last_now = $wpdb->get_var( 'SELECT NOW()' );
115+
$crawled = $crawler->crawlIter( $detected );
112116
$crawled = CrawlCache::remove404s( $crawled );
113117
$crawled = CrawlCache::writeFilesIter( $crawled );
114118
if ( $crawler->use_crawl_cache ) {
115119
$crawled = CrawlCache::addPathsIter( $crawled );
116120
}
117121
$crawled = $url_discovery->discoverURLs( $crawled );
118-
foreach ( $crawled as $crawled ) {}
122+
foreach ( $crawled as $_ ) { }
123+
124+
$has_new = true;
125+
while ( $has_new ) {
126+
$detected = CrawlQueue::getPathsIter( $last_now );
127+
$last_now = $wpdb->get_var( 'SELECT NOW()' );
128+
$crawled = $crawler->crawlIter( $detected );
129+
$crawled = CrawlCache::writeFilesIter( $crawled );
130+
if ( $crawler->use_crawl_cache ) {
131+
$crawled = CrawlCache::addPathsIter( $crawled );
132+
}
133+
$crawled = $url_discovery->discoverURLs( $crawled );
134+
$has_new = false;
135+
foreach ( $crawled as $_ ) {
136+
$has_new = true;
137+
}
138+
}
119139
$crawler->crawlComplete();
120140
}
121141
}

0 commit comments

Comments
 (0)