diff --git a/src/CrawlObserver.php b/src/CrawlObserver.php index 9f27f3f..cb7831e 100644 --- a/src/CrawlObserver.php +++ b/src/CrawlObserver.php @@ -35,7 +35,6 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver ResponseInterface $response, ?UriInterface $foundOnUrl = null ){ - // https://github.com/guzzle/guzzle/blob/master/docs/faq.rst#how-can-i-track-redirected-requests if($response->getHeader('X-Guzzle-Redirect-History')){ // Retrieve both Redirect History headers @@ -52,13 +51,30 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver $fullRedirectReport[$key] = ['location' => $value, 'code' => $redirectCodeHistory[$key]]; } - foreach($fullRedirectReport as $rr){ + // PART 2a/2: has this url been found already + $fos=$this->results[(string)$url]['foundOn']??[]; + + foreach($fullRedirectReport as $k=>$redirect){ $this->addResult( - (String)$rr['location'], + (String)$redirect['location'], (string)$foundOnUrl, - $rr['code'], - $response->getHeader('Content-Type')[0]??null + $redirect['code'], + $response->getHeader('Content-Type')[0]??null, + $k == 0 ? $fullRedirectReport : [] ); + // PART 2b/2: this redirecting url has only just been crawled. 
+ // but if it has already been found on other pages + // this new redirect history needs to have those foundOnUrls added too + if($k>0){ + foreach($fos as $kk=>$vv){ + $this->addResult( + (String)$redirect['location'], + (string)$kk, + $redirect['code'], + $response->getHeader('Content-Type')[0]??null + ); + } + } } }else{ $this->addResult( @@ -72,6 +88,7 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver /** * Called when the crawler has found the url again + * NOTE: this may be called before crawled() or crawlFailed() has been called for this URL * * @param \Psr\Http\Message\UriInterface $url * @param \Psr\Http\Message\ResponseInterface $response @@ -81,10 +98,17 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver UriInterface $url, ?UriInterface $foundOnUrl = null ){ - $this->addResult((String)$url,(string)$foundOnUrl); + // PART 1/2: if there is an existing result with known redirects + // then every location in that redirect list is also recorded as found on $foundOnUrl + if(count($this->results[(string)$url]['redirects']??[])>0){ + foreach($this->results[(string)$url]['redirects'] as $redirect){ + $this->addResult($redirect['location'],(string)$foundOnUrl); + } + }else{ + $this->addResult((String)$url,(string)$foundOnUrl); + } } - /** * Called when the crawler had a problem crawling the given url. 
* @@ -104,20 +128,24 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver } } - public function addResult($url, $foundOn, $code='???', $type='???'){ + public function addResult($url, $foundOn, $code='', $type='', $redirects=[]){ /* Merges one sighting of $url into $this->results: bumps the foundOn counter for $foundOn, and fills code, type and redirects only while the stored value is still unset or falsy. */ if(!isset($this->results[$url])){ - $this->results[$url]=[ - 'code'=>$code, - 'type'=>$type, - 'foundOn'=>[$foundOn=>1], - ]; - return; + $this->results[$url]=[]; + } + if(!isset($this->results[$url]['code']) || !$this->results[$url]['code']){ + $this->results[$url]['code']=$code; + } + if(!isset($this->results[$url]['type']) || !$this->results[$url]['type']){ + $this->results[$url]['type']=$type; } if(isset($this->results[$url]['foundOn'][$foundOn])){ $this->results[$url]['foundOn'][$foundOn]++; }else{ $this->results[$url]['foundOn'][$foundOn]=1; } + if(!isset($this->results[$url]['redirects']) || !$this->results[$url]['redirects']){ + $this->results[$url]['redirects']=$redirects; + } } /** diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php index 1438755..06d61b4 100644 --- a/tests/CrawlerTest.php +++ b/tests/CrawlerTest.php @@ -9,7 +9,7 @@ class CrawlerTest extends TestCase{ $crawler=new Crawler('http://localhost:8080'); $crawler->crawl('http://localhost:8080'); $sitemap=$crawler->getResults(); - $this->assertSitemapContains($sitemap,[ + $this->assertTreeContains($sitemap,[ 'http://localhost:8080/' => ['code' => 200 ], 'http://localhost:8080/link1' => ['code' => 200 ], 'http://localhost:8080/link2' => ['code' => 200 ], @@ -19,11 +19,60 @@ class CrawlerTest extends TestCase{ ]); } + public function testCollectsAllFoundOnUrls(){ + $crawler=new Crawler('http://localhost:8080'); + $crawler->crawl('http://localhost:8080/page1'); + $crawler->crawl('http://localhost:8080/page4'); // this ensures the order of results for the URL tracking test (presumably the three PART steps in CrawlObserver). 
+ $sitemap=$crawler->getResults(); + // each URL below must report its status code and a foundOn count of 1 for every listed page + $this->assertTreeContains($sitemap,[ + 'http://localhost:8080/page1' => ['code' => 200 , 'foundOn' => [ + 'http://localhost:8080/page1' => 1, + 'http://localhost:8080/page2' => 1, + 'http://localhost:8080/page3' => 1, + ]], + 'http://localhost:8080/page2' => ['code' => 200 , 'foundOn' => [ + 'http://localhost:8080/page1' => 1, + 'http://localhost:8080/page2' => 1, + 'http://localhost:8080/page3' => 1, + ]], + 'http://localhost:8080/page3' => ['code' => 200 , 'foundOn' => [ + 'http://localhost:8080/page1' => 1, + 'http://localhost:8080/page2' => 1, + 'http://localhost:8080/page3' => 1, + ]], + 'http://localhost:8080/notFound1' => ['code' => 404 , 'foundOn' => [ + 'http://localhost:8080/page1' => 1, + 'http://localhost:8080/page2' => 1, + 'http://localhost:8080/page3' => 1, + ]], + 'http://localhost:8080/notFound2' => ['code' => 404 , 'foundOn' => [ + 'http://localhost:8080/page1' => 1, + 'http://localhost:8080/page2' => 1, + 'http://localhost:8080/page3' => 1, + 'http://localhost:8080/page4' => 1, + ]], + 'http://localhost:8080/redirectToNotFound' => ['code' => 302 , 'foundOn' => [ + 'http://localhost:8080/page1' => 1, + 'http://localhost:8080/page2' => 1, + 'http://localhost:8080/page3' => 1, + 'http://localhost:8080/page4' => 1, + ]], + 'http://localhost:8080/redirectToRedirectToNotFound' => ['code' => 302 , 'foundOn' => [ + 'http://localhost:8080/page1' => 1, + 'http://localhost:8080/page2' => 1, + 'http://localhost:8080/page3' => 1, + 'http://localhost:8080/page4' => 1, + ]], + + ]); + } + public function testCanFollowRedirectToFound(){ $crawler=new Crawler('http://localhost:8080'); $crawler->crawl('http://localhost:8080/redirectToFound'); $sitemap=$crawler->getResults(); - $this->assertSitemapContains($sitemap,[ + $this->assertTreeContains($sitemap,[ 'http://localhost:8080/redirectToFound' => ['code' => 302 ], 'http://localhost:8080/' => ['code' => 200 ], ]); @@ -33,9 +82,9 @@ class CrawlerTest extends 
TestCase{ $crawler=new Crawler('http://localhost:8080'); $crawler->crawl('http://localhost:8080/redirectToNotFound'); $sitemap=$crawler->getResults(); - $this->assertSitemapContains($sitemap,[ + $this->assertTreeContains($sitemap,[ 'http://localhost:8080/redirectToNotFound' => ['code' => 302 ], - 'http://localhost:8080/notExists' => ['code' => 404 ], + 'http://localhost:8080/notFound2' => ['code' => 404 ], ]); } @@ -43,10 +92,10 @@ class CrawlerTest extends TestCase{ $crawler=new Crawler('http://localhost:8080'); $crawler->crawl('http://localhost:8080/redirectToRedirectToNotFound'); $sitemap=$crawler->getResults(); - $this->assertSitemapContains($sitemap,[ + $this->assertTreeContains($sitemap,[ 'http://localhost:8080/redirectToRedirectToNotFound' => ['code' => 302 ], 'http://localhost:8080/redirectToNotFound' => ['code' => 302 ], - 'http://localhost:8080/notExists' => ['code' => 404 ], + 'http://localhost:8080/notFound2' => ['code' => 404 ], ]); } @@ -54,7 +103,7 @@ class CrawlerTest extends TestCase{ $crawler=new Crawler('http://localhost:8080'); $crawler->crawl('http://localhost:8080/twoRedirectsToSameLocation'); $sitemap=$crawler->getResults(); - $this->assertSitemapContains($sitemap,[ + $this->assertTreeContains($sitemap,[ 'http://localhost:8080/twoRedirectsToSameLocation' => ['code' => 200 ], 'http://localhost:8080/redirect1' => ['code' => 302 ], 'http://localhost:8080/redirect2' => ['code' => 302 ], @@ -66,8 +115,8 @@ class CrawlerTest extends TestCase{ $crawler=new Crawler('http://localhost:8080'); $crawler->crawl('http://localhost:8080/timeout'); $sitemap=$crawler->getResults(); - $this->assertSitemapContains($sitemap,[ - 'http://localhost:8080/timeout' => ['code' => '???' 
], + $this->assertTreeContains($sitemap,[ + 'http://localhost:8080/timeout' => ['code' => '' ], ]); } @@ -75,17 +124,18 @@ class CrawlerTest extends TestCase{ $crawler=new Crawler('http://localhost:8080'); $crawler->crawl('http://localhost:8080/internalServerError'); $sitemap=$crawler->getResults(); - $this->assertSitemapContains($sitemap,[ + $this->assertTreeContains($sitemap,[ 'http://localhost:8080/internalServerError' => ['code' => 500 ], ]); } - public function assertSitemapContains($sitemap, $contains){ - foreach($contains as $url=>$vals){ - $this->assertArrayHasKey($url, $sitemap, "$url not found in sitemap"); - foreach($vals as $k=>$v){ - $this->assertArrayHasKey($k, $sitemap[$url], "$url => $k not found in sitemap"); - $this->assertEquals($v, $sitemap[$url][$k], "$url => $k = $v not found in sitemap"); + public function assertTreeContains($haystack, $contains, $crumbs=''){ /* Recursively asserts that every key in $contains exists in $haystack and that every non-array value is equal; $crumbs is the key path so far, passed as the assertion failure message. */ + foreach($contains as $k=>$v){ + $this->assertArrayHasKey($k, $haystack, $crumbs); + if(is_array($v)){ + $this->assertTreeContains($haystack[$k], $v, $crumbs.' => '.$k); + }else{ + $this->assertEquals($v, $haystack[$k], $crumbs.' 
=> '.$k); } } } diff --git a/tests/server/server.js b/tests/server/server.js index f8320c4..7d32f87 100644 --- a/tests/server/server.js +++ b/tests/server/server.js @@ -23,7 +23,7 @@ app.get('/link4', function (request, response) { }); app.get('/redirectToNotFound', function (request, response) { - response.redirect('/notExists'); + response.redirect('/notFound2'); }); app.get('/redirectToFound', function (request, response) { @@ -54,6 +54,18 @@ app.get('/internalServerError', function (request, response) { response.status(500).end(); }); +app.get('/page1', function (request, response) { + response.end('Page1Page2Page3NotFoundredirectToRedirectToNotFound'); +}); +app.get('/page2', function (request, response) { + response.end('Page1Page2Page3NotFoundredirectToRedirectToNotFound'); +}); +app.get('/page3', function (request, response) { + response.end('Page1Page2Page3NotFoundredirectToRedirectToNotFound'); +}); +app.get('/page4', function (request, response) { + response.end('redirectToRedirectToNotFound'); +}); let server = app.listen(8080, function () { const host = 'localhost';