From 3ab51ccff97c699059fc60eb7781165b2c8b6733 Mon Sep 17 00:00:00 2001 From: James Date: Mon, 24 Feb 2020 09:07:58 +0000 Subject: [PATCH] better tests for interlinked and redirect tracking --- src/CrawlCommand.php | 2 +- src/CrawlObserver.php | 13 +++++++++---- tests/CrawlerTest.php | 17 ++++++++++++++++- tests/server/server.js | 9 ++++++--- 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/src/CrawlCommand.php b/src/CrawlCommand.php index b7879d8..0b87c2d 100644 --- a/src/CrawlCommand.php +++ b/src/CrawlCommand.php @@ -37,7 +37,7 @@ class CrawlCommand extends Command protected function execute(InputInterface $input, OutputInterface $output) { $baseUrl = $input->getArgument('url'); - $crawler=new Crawler(); + $crawler=new Crawler([RequestOptions::CONNECT_TIMEOUT => 60, RequestOptions::TIMEOUT => 60]); $crawler->crawl($baseUrl); foreach($crawler->getResults() as $url=>$result){ diff --git a/src/CrawlObserver.php b/src/CrawlObserver.php index 75f0c55..4eea726 100644 --- a/src/CrawlObserver.php +++ b/src/CrawlObserver.php @@ -51,7 +51,7 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver $fullRedirectReport[$key] = ['location' => $value, 'code' => $redirectCodeHistory[$key]]; } - // PART 2a/2: has this url been found already + // PART 1a/2: has this url been found already $fos=$this->results[(string)$url]['foundOn']??[]; foreach($fullRedirectReport as $k=>$redirect){ @@ -62,7 +62,8 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver $response->getHeader('Content-Type')[0]??null, $k == 0 ? $fullRedirectReport : [] ); - // PART 2b/2: this redirecting url has only just been crawled. + // PART 1b/2: this redirecting url has only just been crawled. 
+ // so we've only just obtained its redirect history // but if it has already been found on other pages // this new redirect history needs to have those foundOnUrls added too if($k>0){ @@ -98,9 +99,13 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver UriInterface $url, ?UriInterface $foundOnUrl = null ){ - // PART 1/2: if there is an existing result with known redirects - // then its redirects must apply to this page if(count($this->results[(string)$url]['redirects']??[])>0){ + // PART 2/2: if there is an existing result with known redirects + // then its redirects must apply to this page + // this is a rare case that the URL comes in here (depends on order of crawl) + // rather than at PART 1/2. + // remove this and the `CrawlerTest::testInterlinked` test will fail + // specifically with interlinked4 not propagating down to foundOn /found foreach($this->results[(string)$url]['redirects'] as $redirect){ $this->addResult($redirect['location'],(string)$foundOnUrl); } diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php index acf9260..b31b37d 100644 --- a/tests/CrawlerTest.php +++ b/tests/CrawlerTest.php @@ -76,23 +76,38 @@ class CrawlerTest extends TestCase{ public function testInterlinked(){ $crawler=new Crawler(); $crawler->crawl('http://localhost:8080/interlinked1'); - //$crawler->crawl('http://localhost:8080/page4'); // TODO!!! this ensures the order or results for the URL tracking test 3PARTS. 
+ $crawler->crawl('http://localhost:8080/interlinked4'); //this ensures the order of results for the URL tracking test PART2 $sitemap=$crawler->getResults(); $this->assertTreeContains($sitemap,[ 'http://localhost:8080/interlinked1' => ['code' => 200 , 'foundOn' => [ 'http://localhost:8080/interlinked1' => 1, 'http://localhost:8080/interlinked2' => 1, 'http://localhost:8080/interlinked3' => 1, + 'http://localhost:8080/interlinked4' => 1, ]], 'http://localhost:8080/interlinked2' => ['code' => 200 , 'foundOn' => [ 'http://localhost:8080/interlinked1' => 1, 'http://localhost:8080/interlinked2' => 1, 'http://localhost:8080/interlinked3' => 1, + 'http://localhost:8080/interlinked4' => 1, ]], 'http://localhost:8080/interlinked3' => ['code' => 200 , 'foundOn' => [ 'http://localhost:8080/interlinked1' => 1, 'http://localhost:8080/interlinked2' => 1, 'http://localhost:8080/interlinked3' => 1, + 'http://localhost:8080/interlinked4' => 1, + ]], + 'http://localhost:8080/found' => ['code' => 200 , 'foundOn' => [ + 'http://localhost:8080/interlinked1' => 1, + 'http://localhost:8080/interlinked2' => 1, + 'http://localhost:8080/interlinked3' => 1, + 'http://localhost:8080/interlinked4' => 1, + ]], + 'http://localhost:8080/redirectToFound' => ['code' => 302 , 'foundOn' => [ + 'http://localhost:8080/interlinked1' => 1, + 'http://localhost:8080/interlinked2' => 1, + 'http://localhost:8080/interlinked3' => 1, + 'http://localhost:8080/interlinked4' => 1, ]], ], print_r($sitemap,true)); } diff --git a/tests/server/server.js b/tests/server/server.js index d8155d5..ed9da20 100644 --- a/tests/server/server.js +++ b/tests/server/server.js @@ -84,13 +84,16 @@ app.get('/invalidStatusCode', function (request, response) { }); app.get('/interlinked1', function (request, response) { - response.end('123'); + response.end('123r'); }); app.get('/interlinked2', function (request, response) { - response.end('123'); + response.end('123r'); }); app.get('/interlinked3', function (request, response) { - 
response.end('123'); + response.end('123r'); +}); +app.get('/interlinked4', function (request, response) { + response.end('123r'); }); let server = app.listen(8080, function () {