better tests for interlinked and redirect tracking
This commit is contained in:
@@ -37,7 +37,7 @@ class CrawlCommand extends Command
|
||||
protected function execute(InputInterface $input, OutputInterface $output)
|
||||
{
|
||||
$baseUrl = $input->getArgument('url');
|
||||
$crawler=new Crawler();
|
||||
$crawler=new Crawler([RequestOptions::CONNECT_TIMEOUT => 60, RequestOptions::TIMEOUT => 60]);
|
||||
$crawler->crawl($baseUrl);
|
||||
|
||||
foreach($crawler->getResults() as $url=>$result){
|
||||
|
@@ -51,7 +51,7 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
|
||||
$fullRedirectReport[$key] = ['location' => $value, 'code' => $redirectCodeHistory[$key]];
|
||||
}
|
||||
|
||||
// PART 2a/2: has this url been found already
|
||||
// PART 1a/2: has this url been found already
|
||||
$fos=$this->results[(string)$url]['foundOn']??[];
|
||||
|
||||
foreach($fullRedirectReport as $k=>$redirect){
|
||||
@@ -62,7 +62,8 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
|
||||
$response->getHeader('Content-Type')[0]??null,
|
||||
$k == 0 ? $fullRedirectReport : []
|
||||
);
|
||||
// PART 2b/2: this redirecting url has only just been crawled.
|
||||
// PART 1b/2: this redirecting url has only just been crawled.
|
||||
// so we've only just obtained its redirect history
|
||||
// but if it has already been found on other pages
|
||||
// this new redirect history needs to have those foundOnUrls added too
|
||||
if($k>0){
|
||||
@@ -98,9 +99,13 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
|
||||
UriInterface $url,
|
||||
?UriInterface $foundOnUrl = null
|
||||
){
|
||||
// PART 1/2: if there is an existing result with known redirects
|
||||
// then its redirects must apply to this page
|
||||
if(count($this->results[(string)$url]['redirects']??[])>0){
|
||||
// PART 2/2: if there is an existing result with known redirects
|
||||
// then its redirects must apply to this page
|
||||
// this is a rare case that the URL comes in here (depends on order of crawl)
|
||||
// rather than at PART 1/2.
|
||||
// remove this and the `CrawlerTest::testInterlinked` test will fail
|
||||
// specifically with interlinked4 not propagating down to foundOn /found
|
||||
foreach($this->results[(string)$url]['redirects'] as $redirect){
|
||||
$this->addResult($redirect['location'],(string)$foundOnUrl);
|
||||
}
|
||||
|
Reference in New Issue
Block a user