|
| 1 | +<?php |
| 2 | + |
| 3 | +namespace WP2Static; |
| 4 | + |
| 5 | +use WP2Static\FileFiltering; |
| 6 | +use WP2Static\ParseHTML; |
| 7 | + |
/**
 * Discovers local URLs inside crawled HTML and adds them to the crawl queue.
 *
 * Items flowing through discoverURLs() are passed on unchanged; for HTML
 * items the body is scanned for links and every link that points at the
 * deployment host is inserted into the crawl queue table.
 */
class URLDiscovery {
    /**
     * Upper bound on the number of URLs written per INSERT statement, so a
     * page with a very large number of links cannot produce a single
     * oversized query.
     */
    private const INSERT_BATCH_SIZE = 500;

    /** Host component of the deployment URL; links are matched against it. */
    private string $destination_host;

    /** Deployment URL (filtered, without trailing slash). */
    private string $destination_url;

    /**
     * Set to true once discoverURLs() has consumed its whole input.
     * NOTE(review): never read inside this class.
     */
    private bool $discover_complete = false;

    /** Decides whether a path is worth crawling. */
    private FileFiltering $file_filtering;

    /**
     * NOTE(review): declared but never assigned or read inside this class.
     */
    private string $site_host;

    /**
     * Resolves the deployment URL (via the 'wp2static_set_destination_url'
     * filter) and derives the host that local links are compared against.
     */
    public function __construct( ) {
        $this->destination_url = untrailingslashit(
            apply_filters(
                'wp2static_set_destination_url',
                CoreOptions::getValue( 'deploymentURL' )
            )
        );
        $this->destination_host = \Wa72\Url\Url::parse( $this->destination_url )->getHost();
        $this->file_filtering = new FileFiltering();
    }

    /**
     * Pass every item of $iterator through unchanged, queueing the local
     * URLs found in items whose content type starts with 'text/html'.
     *
     * @param \Iterator $iterator Items as arrays; the keys read here are
     *                            'content_type', 'body', 'filename' and 'path'.
     * @return \Iterator Generator yielding the same items in the same order.
     */
    public function discoverURLs( \Iterator $iterator ) : \Iterator {
        foreach ( $iterator as $arr ) {
            if ( isset( $arr['content_type'] ) && str_starts_with( $arr['content_type'], 'text/html' ) ) {
                $this->enqueueURLs( $arr );
            }

            yield $arr;
        }

        $this->discover_complete = true;
    }

    /**
     * Decide whether $url points at a crawlable path on the deployment host.
     *
     * @param \Wa72\Url\Url $base_url URL of the page the link was found on;
     *                                used to resolve scheme-less links.
     * @param \Wa72\Url\Url $url      The discovered link.
     * @return bool True for http(s) URLs on the destination host with a
     *              non-empty path accepted by the file filter.
     */
    public function isURLLocal( \Wa72\Url\Url $base_url, \Wa72\Url\Url $url ) : bool {
        if ( ! $url->getPath() && ! $url->getHost() ) {
            // fragment-only URL
            return false;
        }

        $scheme = $url->getScheme();

        if ( ! $scheme ) {
            // No scheme: resolve against the containing page first.
            $url = $url->makeAbsolute( $base_url );
            $scheme = $url->getScheme();
        }

        if ( 'http' !== $scheme && 'https' !== $scheme ) {
            return false;
        }

        if ( ! $url->equalsHost( $this->destination_host ) ) {
            return false;
        }

        // Cast so a missing path is treated as empty rather than passed on as null.
        $path = (string) $url->getPath();

        if ( '' === $path ) {
            return false;
        }

        return (bool) $this->file_filtering->pathLooksCrawlable( $path );
    }

    /**
     * Yield the local URLs referenced by an HTML item.
     *
     * The HTML is taken from $arr['body'] when set, otherwise it is read
     * from the file named by $arr['filename']. Query and fragment are
     * cleared before the locality check; scheme and host are cleared before
     * the URL is written out.
     *
     * @param array $arr Crawled item; $arr['path'] is appended to the
     *                   deployment URL to form the page's base URL.
     * @return \Iterator Generator of URL strings (may contain duplicates).
     */
    public function parseURLs( array $arr ) : \Iterator {
        $body = null;
        if ( isset( $arr['body'] ) ) {
            $body = $arr['body'];
        } elseif ( isset( $arr['filename'] ) ) {
            $body = file_get_contents( $arr['filename'] );
        }

        if ( ! $body ) {
            // Nothing to parse (no body, empty body, or unreadable file).
            return;
        }

        // A missing 'path' falls back to the deployment root.
        $page_url = \Wa72\Url\Url::parse( $this->destination_url . ( $arr['path'] ?? '' ) );

        foreach ( ParseHTML::parseURLsString( $body ) as $url ) {
            $discovered_url = \Wa72\Url\Url::parse( $url );
            $discovered_url->setFragment( '' );
            $discovered_url->setQuery( '' );

            if ( ! $this->isURLLocal( $page_url, $discovered_url ) ) {
                continue;
            }

            $discovered_url->setHost( '' );
            $discovered_url->setScheme( '' );

            yield $discovered_url->write();
        }
    }

    /**
     * Insert the distinct local URLs found in one HTML item into the crawl
     * queue table, in batches of at most INSERT_BATCH_SIZE rows.
     *
     * Failures are logged and do not interrupt processing.
     *
     * @param array $arr Crawled HTML item, as accepted by parseURLs().
     */
    private function enqueueURLs( array $arr ) : void {
        global $wpdb;

        // Use array keys to de-duplicate URLs found on the page.
        $urls = [];
        foreach ( $this->parseURLs( $arr ) as $url ) {
            $urls[ $url ] = true;
        }

        if ( empty( $urls ) ) {
            return;
        }

        $table_name = CrawlQueue::getTableName();

        foreach ( array_chunk( array_keys( $urls ), self::INSERT_BATCH_SIZE ) as $batch ) {
            $placeholders = implode( ',', array_fill( 0, count( $batch ), '(%s)' ) );
            $sql = "INSERT IGNORE INTO $table_name (url) VALUES $placeholders";
            $result = $wpdb->query( $wpdb->prepare( $sql, ...$batch ) );

            if ( false === $result ) {
                WsLog::w( 'Error inserting into crawl queue: ' . $wpdb->last_error );
            }
        }
    }

}
0 commit comments