diff --git a/composer.json b/composer.json index 78c604a..f975e17 100644 --- a/composer.json +++ b/composer.json @@ -9,7 +9,8 @@ ], "require": { "phpunit/phpunit-selenium": "^4.1", - "facebook/webdriver": "^1.7" + "php-webdriver/webdriver": "^1.7", + "spatie/crawler": "^4.6" }, "autoload": { "psr-4": { diff --git a/src/CrawlObserver.php b/src/CrawlObserver.php new file mode 100644 index 0000000..92f2b8c --- /dev/null +++ b/src/CrawlObserver.php @@ -0,0 +1,101 @@ +getStatusCode(); + $type=$response->getHeader('Content-Type')[0]??null; + + // Retrieve both Redirect History headers + $fullRedirectReport = []; + if($response->getHeader('X-Guzzle-Redirect-History')){ + $redirectUriHistory = $response->getHeader('X-Guzzle-Redirect-History'); // retrieve Redirect URI history + $redirectCodeHistory = $response->getHeader('X-Guzzle-Redirect-Status-History'); // retrieve Redirect HTTP Status history + $fullRedirectReport=[$redirectUriHistory,$redirectCodeHistory]; + } + + $this->results[]=[ + 'link'=>(String)$url, + 'code'=>$code, + 'type'=>$type, + 'parent'=>(string)$foundOnUrl, + 'redirects'=>$fullRedirectReport, + ]; + } + + /** + * Called when the crawler had a problem crawling the given url. 
+     *
+     * Appends one entry to $this->results describing the failed URL. When the
+     * exception carries a response, its status code, Content-Type and redirect
+     * history are recorded; otherwise placeholder values are stored.
+     *
+     * @param \Psr\Http\Message\UriInterface $url
+     * @param \GuzzleHttp\Exception\RequestException $requestException
+     * @param \Psr\Http\Message\UriInterface|null $foundOnUrl
+     */
+    public function crawlFailed(
+        UriInterface $url,
+        RequestException $requestException,
+        ?UriInterface $foundOnUrl = null
+    ){
+        $response = $requestException->getResponse();
+
+        // Placeholders used when the exception has no response attached.
+        $code = '???';
+        $type = '';
+        if ($response) {
+            $code = $response->getStatusCode();
+            $type = $response->getHeader('Content-Type')[0] ?? null;
+        }
+
+        // Redirect report: [redirect URI history, redirect HTTP status history],
+        // or [] when the response has no redirect history header.
+        $redirects = [];
+        if ($response && $response->getHeader('X-Guzzle-Redirect-History')) {
+            $redirects = [
+                $response->getHeader('X-Guzzle-Redirect-History'),
+                $response->getHeader('X-Guzzle-Redirect-Status-History'),
+            ];
+        }
+
+        $this->results[] = [
+            'link' => (string) $url,
+            'code' => $code,
+            'type' => $type,
+            'parent' => (string) $foundOnUrl,
+            'redirects' => $redirects,
+        ];
+    }
+
+    /**
+     * Called when the crawl has ended.
+     */
+    public function finishedCrawling() {
+        //print_r($this->results);
+    }
+
+}
diff --git a/src/CrawlTest.php b/src/CrawlTest.php
new file mode 100644
index 0000000..dc769be
--- /dev/null
+++ b/src/CrawlTest.php
@@ -0,0 +1,55 @@
+            [
+                'track_redirects' => true,
+            ],
+        ])
+            ->setCrawlObserver($observer)
+            ->setCrawlProfile(new CrawlInternalUrls( $this-> getUrl() ))
+            //->addToCrawlQueue( CrawlUrl::create(new Uri('https://another_entry_point??')) )
+            ->startCrawling( $this-> getUrl() )
+        ;
+        $this->assertTrue(true);
+        return $observer->results;
+    }
+
+    /**
+     * Fails when any crawled link did not answer with HTTP 200.
+     *
+     * Each entry of $results is read for its 'code', 'link' and 'parent' keys;
+     * every entry whose code is not 200 adds one line to the failure message.
+     *
+     * @depends testCrawl
+     * @param array $results value returned by testCrawl()
+     */
+    public function testBrokenLinks($results){
+        $errors = '';
+        foreach ($results as $result) {
+            // Strict comparison: codes are either an integer status or the
+            // '???' placeholder stored when no response was available.
+            if ($result['code'] !== 200) {
+                $errors .= "{$result['code']} {$result['link']} (found on {$result['parent']})\n";
+            }
+        }
+
+        // Report broken links as an assertion failure instead of throwing a
+        // generic exception, so PHPUnit records a failed test rather than an
+        // errored one; an empty report passes with a real assertion.
+        $this->assertSame('', $errors, "\n".$errors);
+    }
+}