test and fix redirect tracking

This commit is contained in:
James 2020-02-22 10:06:18 +00:00
parent d81dc86381
commit 9a84ec204d
3 changed files with 121 additions and 31 deletions

View File

@ -35,7 +35,6 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
ResponseInterface $response, ResponseInterface $response,
?UriInterface $foundOnUrl = null ?UriInterface $foundOnUrl = null
){ ){
// https://github.com/guzzle/guzzle/blob/master/docs/faq.rst#how-can-i-track-redirected-requests // https://github.com/guzzle/guzzle/blob/master/docs/faq.rst#how-can-i-track-redirected-requests
if($response->getHeader('X-Guzzle-Redirect-History')){ if($response->getHeader('X-Guzzle-Redirect-History')){
// Retrieve both Redirect History headers // Retrieve both Redirect History headers
@ -52,14 +51,31 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
$fullRedirectReport[$key] = ['location' => $value, 'code' => $redirectCodeHistory[$key]]; $fullRedirectReport[$key] = ['location' => $value, 'code' => $redirectCodeHistory[$key]];
} }
foreach($fullRedirectReport as $rr){ // PART 2a/2: has this url been found already
$fos=$this->results[(string)$url]['foundOn']??[];
foreach($fullRedirectReport as $k=>$redirect){
$this->addResult( $this->addResult(
(String)$rr['location'], (String)$redirect['location'],
(string)$foundOnUrl, (string)$foundOnUrl,
$rr['code'], $redirect['code'],
$response->getHeader('Content-Type')[0]??null,
$k == 0 ? $fullRedirectReport : []
);
// PART 2b/2: this redirecting url has only just been crawled.
// but if it has already been found on other pages
// this new redirect history needs to have those foundOnUrls added too
if($k>0){
foreach($fos as $kk=>$vv){
$this->addResult(
(String)$redirect['location'],
(string)$kk,
$redirect['code'],
$response->getHeader('Content-Type')[0]??null $response->getHeader('Content-Type')[0]??null
); );
} }
}
}
}else{ }else{
$this->addResult( $this->addResult(
(String)$url, (String)$url,
@ -72,6 +88,7 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
/** /**
* Called when the crawler has found the url again * Called when the crawler has found the url again
* NOTE: this may be called before crawled() or crawlFailed() has been called for this URL
* *
* @param \Psr\Http\Message\UriInterface $url * @param \Psr\Http\Message\UriInterface $url
* @param \Psr\Http\Message\ResponseInterface $response * @param \Psr\Http\Message\ResponseInterface $response
@ -81,9 +98,16 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
UriInterface $url, UriInterface $url,
?UriInterface $foundOnUrl = null ?UriInterface $foundOnUrl = null
){ ){
// PART 1/2: if there is an existing result with known redirects
// then its redirects must apply to this page
if(count($this->results[(string)$url]['redirects']??[])>0){
foreach($this->results[(string)$url]['redirects'] as $redirect){
$this->addResult($redirect['location'],(string)$foundOnUrl);
}
}else{
$this->addResult((String)$url,(string)$foundOnUrl); $this->addResult((String)$url,(string)$foundOnUrl);
} }
}
/** /**
* Called when the crawler had a problem crawling the given url. * Called when the crawler had a problem crawling the given url.
@ -104,20 +128,24 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
} }
} }
public function addResult($url, $foundOn, $code='???', $type='???'){ public function addResult($url, $foundOn, $code='', $type='', $redirects=[]){
if(!isset($this->results[$url])){ if(!isset($this->results[$url])){
$this->results[$url]=[ $this->results[$url]=[];
'code'=>$code, }
'type'=>$type, if(!isset($this->results[$url]['code']) || !$this->results[$url]['code']){
'foundOn'=>[$foundOn=>1], $this->results[$url]['code']=$code;
]; }
return; if(!isset($this->results[$url]['type']) || !$this->results[$url]['type']){
$this->results[$url]['type']=$type;
} }
if(isset($this->results[$url]['foundOn'][$foundOn])){ if(isset($this->results[$url]['foundOn'][$foundOn])){
$this->results[$url]['foundOn'][$foundOn]++; $this->results[$url]['foundOn'][$foundOn]++;
}else{ }else{
$this->results[$url]['foundOn'][$foundOn]=1; $this->results[$url]['foundOn'][$foundOn]=1;
} }
if(!isset($this->results[$url]['redirects']) || !$this->results[$url]['redirects']){
$this->results[$url]['redirects']=$redirects;
}
} }
/** /**

View File

@ -9,7 +9,7 @@ class CrawlerTest extends TestCase{
$crawler=new Crawler('http://localhost:8080'); $crawler=new Crawler('http://localhost:8080');
$crawler->crawl('http://localhost:8080'); $crawler->crawl('http://localhost:8080');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertSitemapContains($sitemap,[ $this->assertTreeContains($sitemap,[
'http://localhost:8080/' => ['code' => 200 ], 'http://localhost:8080/' => ['code' => 200 ],
'http://localhost:8080/link1' => ['code' => 200 ], 'http://localhost:8080/link1' => ['code' => 200 ],
'http://localhost:8080/link2' => ['code' => 200 ], 'http://localhost:8080/link2' => ['code' => 200 ],
@ -19,11 +19,60 @@ class CrawlerTest extends TestCase{
]); ]);
} }
/**
 * Crawls /page1 and then /page4 on the local fixture server and asserts,
 * for each listed URL, the recorded status code and the per-page 'foundOn'
 * counters (source page URL => number of times the URL was found there).
 */
public function testCollectsAllFoundOnUrls(){
$crawler=new Crawler('http://localhost:8080');
$crawler->crawl('http://localhost:8080/page1');
$crawler->crawl('http://localhost:8080/page4'); // this ensures the order of results for the URL tracking test ("3PARTS" presumably refers to the PART 1/2, 2a/2 and 2b/2 comments in CrawlObserver - confirm).
$sitemap=$crawler->getResults();
// NOTE(review): print_r dumps the whole result set on every run; looks like leftover debug output.
print_r($sitemap);
$this->assertTreeContains($sitemap,[
// page1..page3 are each expected to be found once on page1, page2 and page3
'http://localhost:8080/page1' => ['code' => 200 , 'foundOn' => [
'http://localhost:8080/page1' => 1,
'http://localhost:8080/page2' => 1,
'http://localhost:8080/page3' => 1,
]],
'http://localhost:8080/page2' => ['code' => 200 , 'foundOn' => [
'http://localhost:8080/page1' => 1,
'http://localhost:8080/page2' => 1,
'http://localhost:8080/page3' => 1,
]],
'http://localhost:8080/page3' => ['code' => 200 , 'foundOn' => [
'http://localhost:8080/page1' => 1,
'http://localhost:8080/page2' => 1,
'http://localhost:8080/page3' => 1,
]],
'http://localhost:8080/notFound1' => ['code' => 404 , 'foundOn' => [
'http://localhost:8080/page1' => 1,
'http://localhost:8080/page2' => 1,
'http://localhost:8080/page3' => 1,
]],
// the redirect chain and its final target are expected to carry the foundOn
// entries of every page linking to the start of the chain, page4 included
'http://localhost:8080/notFound2' => ['code' => 404 , 'foundOn' => [
'http://localhost:8080/page1' => 1,
'http://localhost:8080/page2' => 1,
'http://localhost:8080/page3' => 1,
'http://localhost:8080/page4' => 1,
]],
'http://localhost:8080/redirectToNotFound' => ['code' => 302 , 'foundOn' => [
'http://localhost:8080/page1' => 1,
'http://localhost:8080/page2' => 1,
'http://localhost:8080/page3' => 1,
'http://localhost:8080/page4' => 1,
]],
'http://localhost:8080/redirectToRedirectToNotFound' => ['code' => 302 , 'foundOn' => [
'http://localhost:8080/page1' => 1,
'http://localhost:8080/page2' => 1,
'http://localhost:8080/page3' => 1,
'http://localhost:8080/page4' => 1,
]],
]);
}
public function testCanFollowRedirectToFound(){ public function testCanFollowRedirectToFound(){
$crawler=new Crawler('http://localhost:8080'); $crawler=new Crawler('http://localhost:8080');
$crawler->crawl('http://localhost:8080/redirectToFound'); $crawler->crawl('http://localhost:8080/redirectToFound');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertSitemapContains($sitemap,[ $this->assertTreeContains($sitemap,[
'http://localhost:8080/redirectToFound' => ['code' => 302 ], 'http://localhost:8080/redirectToFound' => ['code' => 302 ],
'http://localhost:8080/' => ['code' => 200 ], 'http://localhost:8080/' => ['code' => 200 ],
]); ]);
@ -33,9 +82,9 @@ class CrawlerTest extends TestCase{
$crawler=new Crawler('http://localhost:8080'); $crawler=new Crawler('http://localhost:8080');
$crawler->crawl('http://localhost:8080/redirectToNotFound'); $crawler->crawl('http://localhost:8080/redirectToNotFound');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertSitemapContains($sitemap,[ $this->assertTreeContains($sitemap,[
'http://localhost:8080/redirectToNotFound' => ['code' => 302 ], 'http://localhost:8080/redirectToNotFound' => ['code' => 302 ],
'http://localhost:8080/notExists' => ['code' => 404 ], 'http://localhost:8080/notFound2' => ['code' => 404 ],
]); ]);
} }
@ -43,10 +92,10 @@ class CrawlerTest extends TestCase{
$crawler=new Crawler('http://localhost:8080'); $crawler=new Crawler('http://localhost:8080');
$crawler->crawl('http://localhost:8080/redirectToRedirectToNotFound'); $crawler->crawl('http://localhost:8080/redirectToRedirectToNotFound');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertSitemapContains($sitemap,[ $this->assertTreeContains($sitemap,[
'http://localhost:8080/redirectToRedirectToNotFound' => ['code' => 302 ], 'http://localhost:8080/redirectToRedirectToNotFound' => ['code' => 302 ],
'http://localhost:8080/redirectToNotFound' => ['code' => 302 ], 'http://localhost:8080/redirectToNotFound' => ['code' => 302 ],
'http://localhost:8080/notExists' => ['code' => 404 ], 'http://localhost:8080/notFound2' => ['code' => 404 ],
]); ]);
} }
@ -54,7 +103,7 @@ class CrawlerTest extends TestCase{
$crawler=new Crawler('http://localhost:8080'); $crawler=new Crawler('http://localhost:8080');
$crawler->crawl('http://localhost:8080/twoRedirectsToSameLocation'); $crawler->crawl('http://localhost:8080/twoRedirectsToSameLocation');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertSitemapContains($sitemap,[ $this->assertTreeContains($sitemap,[
'http://localhost:8080/twoRedirectsToSameLocation' => ['code' => 200 ], 'http://localhost:8080/twoRedirectsToSameLocation' => ['code' => 200 ],
'http://localhost:8080/redirect1' => ['code' => 302 ], 'http://localhost:8080/redirect1' => ['code' => 302 ],
'http://localhost:8080/redirect2' => ['code' => 302 ], 'http://localhost:8080/redirect2' => ['code' => 302 ],
@ -66,8 +115,8 @@ class CrawlerTest extends TestCase{
$crawler=new Crawler('http://localhost:8080'); $crawler=new Crawler('http://localhost:8080');
$crawler->crawl('http://localhost:8080/timeout'); $crawler->crawl('http://localhost:8080/timeout');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertSitemapContains($sitemap,[ $this->assertTreeContains($sitemap,[
'http://localhost:8080/timeout' => ['code' => '???' ], 'http://localhost:8080/timeout' => ['code' => '' ],
]); ]);
} }
@ -75,17 +124,18 @@ class CrawlerTest extends TestCase{
$crawler=new Crawler('http://localhost:8080'); $crawler=new Crawler('http://localhost:8080');
$crawler->crawl('http://localhost:8080/internalServerError'); $crawler->crawl('http://localhost:8080/internalServerError');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertSitemapContains($sitemap,[ $this->assertTreeContains($sitemap,[
'http://localhost:8080/internalServerError' => ['code' => 500 ], 'http://localhost:8080/internalServerError' => ['code' => 500 ],
]); ]);
} }
public function assertSitemapContains($sitemap, $contains){ public function assertTreeContains($haystack, $contains, $crumbs=''){
foreach($contains as $url=>$vals){ foreach($contains as $k=>$v){
$this->assertArrayHasKey($url, $sitemap, "$url not found in sitemap"); $this->assertArrayHasKey($k, $haystack, $crumbs);
foreach($vals as $k=>$v){ if(is_array($v)){
$this->assertArrayHasKey($k, $sitemap[$url], "$url => $k not found in sitemap"); $this->assertTreeContains($haystack[$k], $v, $crumbs.' => '.$k);
$this->assertEquals($v, $sitemap[$url][$k], "$url => $k = $v not found in sitemap"); }else{
$this->assertEquals($v, $haystack[$k], $crumbs.' => '.$k);
} }
} }
} }

View File

@ -23,7 +23,7 @@ app.get('/link4', function (request, response) {
}); });
app.get('/redirectToNotFound', function (request, response) { app.get('/redirectToNotFound', function (request, response) {
response.redirect('/notExists'); response.redirect('/notFound2');
}); });
app.get('/redirectToFound', function (request, response) { app.get('/redirectToFound', function (request, response) {
@ -54,6 +54,18 @@ app.get('/internalServerError', function (request, response) {
response.status(500).end(); response.status(500).end();
}); });
// Fixture pages /page1, /page2 and /page3 serve identical markup: each links to
// /page1, /page2, /page3, /notFound1 and /redirectToRedirectToNotFound.
app.get('/page1', function (request, response) {
response.end('<a href="/page1">Page1</a><a href="/page2">Page2</a><a href="/page3">Page3</a><a href="/notFound1">NotFound</a><a href="/redirectToRedirectToNotFound">redirectToRedirectToNotFound</a>');
});
app.get('/page2', function (request, response) {
response.end('<a href="/page1">Page1</a><a href="/page2">Page2</a><a href="/page3">Page3</a><a href="/notFound1">NotFound</a><a href="/redirectToRedirectToNotFound">redirectToRedirectToNotFound</a>');
});
app.get('/page3', function (request, response) {
response.end('<a href="/page1">Page1</a><a href="/page2">Page2</a><a href="/page3">Page3</a><a href="/notFound1">NotFound</a><a href="/redirectToRedirectToNotFound">redirectToRedirectToNotFound</a>');
});
// /page4 links only to /redirectToRedirectToNotFound.
app.get('/page4', function (request, response) {
response.end('<a href="/redirectToRedirectToNotFound">redirectToRedirectToNotFound</a>');
});
let server = app.listen(8080, function () { let server = app.listen(8080, function () {
const host = 'localhost'; const host = 'localhost';