test and fix redirect tracking
This commit is contained in:
parent
d81dc86381
commit
9a84ec204d
@ -35,7 +35,6 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
|
|||||||
ResponseInterface $response,
|
ResponseInterface $response,
|
||||||
?UriInterface $foundOnUrl = null
|
?UriInterface $foundOnUrl = null
|
||||||
){
|
){
|
||||||
|
|
||||||
// https://github.com/guzzle/guzzle/blob/master/docs/faq.rst#how-can-i-track-redirected-requests
|
// https://github.com/guzzle/guzzle/blob/master/docs/faq.rst#how-can-i-track-redirected-requests
|
||||||
if($response->getHeader('X-Guzzle-Redirect-History')){
|
if($response->getHeader('X-Guzzle-Redirect-History')){
|
||||||
// Retrieve both Redirect History headers
|
// Retrieve both Redirect History headers
|
||||||
@ -52,13 +51,30 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
|
|||||||
$fullRedirectReport[$key] = ['location' => $value, 'code' => $redirectCodeHistory[$key]];
|
$fullRedirectReport[$key] = ['location' => $value, 'code' => $redirectCodeHistory[$key]];
|
||||||
}
|
}
|
||||||
|
|
||||||
foreach($fullRedirectReport as $rr){
|
// PART 2a/2: has this url been found already
|
||||||
|
$fos=$this->results[(string)$url]['foundOn']??[];
|
||||||
|
|
||||||
|
foreach($fullRedirectReport as $k=>$redirect){
|
||||||
$this->addResult(
|
$this->addResult(
|
||||||
(String)$rr['location'],
|
(String)$redirect['location'],
|
||||||
(string)$foundOnUrl,
|
(string)$foundOnUrl,
|
||||||
$rr['code'],
|
$redirect['code'],
|
||||||
$response->getHeader('Content-Type')[0]??null
|
$response->getHeader('Content-Type')[0]??null,
|
||||||
|
$k == 0 ? $fullRedirectReport : []
|
||||||
);
|
);
|
||||||
|
// PART 2b/2: this redirecting url has only just been crawled.
|
||||||
|
// but if it has already been found on other pages
|
||||||
|
// this new redirect history needs to have those foundOnUrls added too
|
||||||
|
if($k>0){
|
||||||
|
foreach($fos as $kk=>$vv){
|
||||||
|
$this->addResult(
|
||||||
|
(String)$redirect['location'],
|
||||||
|
(string)$kk,
|
||||||
|
$redirect['code'],
|
||||||
|
$response->getHeader('Content-Type')[0]??null
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}else{
|
}else{
|
||||||
$this->addResult(
|
$this->addResult(
|
||||||
@ -72,6 +88,7 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Called when the crawler has found the url again
|
* Called when the crawler has found the url again
|
||||||
|
* NOTE: That this may be called before crawled or crawlFailed is called for this URL
|
||||||
*
|
*
|
||||||
* @param \Psr\Http\Message\UriInterface $url
|
* @param \Psr\Http\Message\UriInterface $url
|
||||||
* @param \Psr\Http\Message\ResponseInterface $response
|
* @param \Psr\Http\Message\ResponseInterface $response
|
||||||
@ -81,10 +98,17 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
|
|||||||
UriInterface $url,
|
UriInterface $url,
|
||||||
?UriInterface $foundOnUrl = null
|
?UriInterface $foundOnUrl = null
|
||||||
){
|
){
|
||||||
$this->addResult((String)$url,(string)$foundOnUrl);
|
// PART 1/2: if there is an existing result with known redirects
|
||||||
|
// then its redirects must apply to this page
|
||||||
|
if(count($this->results[(string)$url]['redirects']??[])>0){
|
||||||
|
foreach($this->results[(string)$url]['redirects'] as $redirect){
|
||||||
|
$this->addResult($redirect['location'],(string)$foundOnUrl);
|
||||||
|
}
|
||||||
|
}else{
|
||||||
|
$this->addResult((String)$url,(string)$foundOnUrl);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Called when the crawler had a problem crawling the given url.
|
* Called when the crawler had a problem crawling the given url.
|
||||||
*
|
*
|
||||||
@ -104,20 +128,24 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public function addResult($url, $foundOn, $code='???', $type='???'){
|
public function addResult($url, $foundOn, $code='', $type='', $redirects=[]){
|
||||||
if(!isset($this->results[$url])){
|
if(!isset($this->results[$url])){
|
||||||
$this->results[$url]=[
|
$this->results[$url]=[];
|
||||||
'code'=>$code,
|
}
|
||||||
'type'=>$type,
|
if(!isset($this->results[$url]['code']) || !$this->results[$url]['code']){
|
||||||
'foundOn'=>[$foundOn=>1],
|
$this->results[$url]['code']=$code;
|
||||||
];
|
}
|
||||||
return;
|
if(!isset($this->results[$url]['type']) || !$this->results[$url]['type']){
|
||||||
|
$this->results[$url]['type']=$type;
|
||||||
}
|
}
|
||||||
if(isset($this->results[$url]['foundOn'][$foundOn])){
|
if(isset($this->results[$url]['foundOn'][$foundOn])){
|
||||||
$this->results[$url]['foundOn'][$foundOn]++;
|
$this->results[$url]['foundOn'][$foundOn]++;
|
||||||
}else{
|
}else{
|
||||||
$this->results[$url]['foundOn'][$foundOn]=1;
|
$this->results[$url]['foundOn'][$foundOn]=1;
|
||||||
}
|
}
|
||||||
|
if(!isset($this->results[$url]['redirects']) || !$this->results[$url]['redirects']){
|
||||||
|
$this->results[$url]['redirects']=$redirects;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -9,7 +9,7 @@ class CrawlerTest extends TestCase{
|
|||||||
$crawler=new Crawler('http://localhost:8080');
|
$crawler=new Crawler('http://localhost:8080');
|
||||||
$crawler->crawl('http://localhost:8080');
|
$crawler->crawl('http://localhost:8080');
|
||||||
$sitemap=$crawler->getResults();
|
$sitemap=$crawler->getResults();
|
||||||
$this->assertSitemapContains($sitemap,[
|
$this->assertTreeContains($sitemap,[
|
||||||
'http://localhost:8080/' => ['code' => 200 ],
|
'http://localhost:8080/' => ['code' => 200 ],
|
||||||
'http://localhost:8080/link1' => ['code' => 200 ],
|
'http://localhost:8080/link1' => ['code' => 200 ],
|
||||||
'http://localhost:8080/link2' => ['code' => 200 ],
|
'http://localhost:8080/link2' => ['code' => 200 ],
|
||||||
@ -19,11 +19,60 @@ class CrawlerTest extends TestCase{
|
|||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function testCollectsAllFoundOnUrls(){
|
||||||
|
$crawler=new Crawler('http://localhost:8080');
|
||||||
|
$crawler->crawl('http://localhost:8080/page1');
|
||||||
|
$crawler->crawl('http://localhost:8080/page4'); // this ensures the order or results for the URL tracking test 3PARTS.
|
||||||
|
$sitemap=$crawler->getResults();
|
||||||
|
print_r($sitemap);
|
||||||
|
$this->assertTreeContains($sitemap,[
|
||||||
|
'http://localhost:8080/page1' => ['code' => 200 , 'foundOn' => [
|
||||||
|
'http://localhost:8080/page1' => 1,
|
||||||
|
'http://localhost:8080/page2' => 1,
|
||||||
|
'http://localhost:8080/page3' => 1,
|
||||||
|
]],
|
||||||
|
'http://localhost:8080/page2' => ['code' => 200 , 'foundOn' => [
|
||||||
|
'http://localhost:8080/page1' => 1,
|
||||||
|
'http://localhost:8080/page2' => 1,
|
||||||
|
'http://localhost:8080/page3' => 1,
|
||||||
|
]],
|
||||||
|
'http://localhost:8080/page3' => ['code' => 200 , 'foundOn' => [
|
||||||
|
'http://localhost:8080/page1' => 1,
|
||||||
|
'http://localhost:8080/page2' => 1,
|
||||||
|
'http://localhost:8080/page3' => 1,
|
||||||
|
]],
|
||||||
|
'http://localhost:8080/notFound1' => ['code' => 404 , 'foundOn' => [
|
||||||
|
'http://localhost:8080/page1' => 1,
|
||||||
|
'http://localhost:8080/page2' => 1,
|
||||||
|
'http://localhost:8080/page3' => 1,
|
||||||
|
]],
|
||||||
|
'http://localhost:8080/notFound2' => ['code' => 404 , 'foundOn' => [
|
||||||
|
'http://localhost:8080/page1' => 1,
|
||||||
|
'http://localhost:8080/page2' => 1,
|
||||||
|
'http://localhost:8080/page3' => 1,
|
||||||
|
'http://localhost:8080/page4' => 1,
|
||||||
|
]],
|
||||||
|
'http://localhost:8080/redirectToNotFound' => ['code' => 302 , 'foundOn' => [
|
||||||
|
'http://localhost:8080/page1' => 1,
|
||||||
|
'http://localhost:8080/page2' => 1,
|
||||||
|
'http://localhost:8080/page3' => 1,
|
||||||
|
'http://localhost:8080/page4' => 1,
|
||||||
|
]],
|
||||||
|
'http://localhost:8080/redirectToRedirectToNotFound' => ['code' => 302 , 'foundOn' => [
|
||||||
|
'http://localhost:8080/page1' => 1,
|
||||||
|
'http://localhost:8080/page2' => 1,
|
||||||
|
'http://localhost:8080/page3' => 1,
|
||||||
|
'http://localhost:8080/page4' => 1,
|
||||||
|
]],
|
||||||
|
|
||||||
|
]);
|
||||||
|
}
|
||||||
|
|
||||||
public function testCanFollowRedirectToFound(){
|
public function testCanFollowRedirectToFound(){
|
||||||
$crawler=new Crawler('http://localhost:8080');
|
$crawler=new Crawler('http://localhost:8080');
|
||||||
$crawler->crawl('http://localhost:8080/redirectToFound');
|
$crawler->crawl('http://localhost:8080/redirectToFound');
|
||||||
$sitemap=$crawler->getResults();
|
$sitemap=$crawler->getResults();
|
||||||
$this->assertSitemapContains($sitemap,[
|
$this->assertTreeContains($sitemap,[
|
||||||
'http://localhost:8080/redirectToFound' => ['code' => 302 ],
|
'http://localhost:8080/redirectToFound' => ['code' => 302 ],
|
||||||
'http://localhost:8080/' => ['code' => 200 ],
|
'http://localhost:8080/' => ['code' => 200 ],
|
||||||
]);
|
]);
|
||||||
@ -33,9 +82,9 @@ class CrawlerTest extends TestCase{
|
|||||||
$crawler=new Crawler('http://localhost:8080');
|
$crawler=new Crawler('http://localhost:8080');
|
||||||
$crawler->crawl('http://localhost:8080/redirectToNotFound');
|
$crawler->crawl('http://localhost:8080/redirectToNotFound');
|
||||||
$sitemap=$crawler->getResults();
|
$sitemap=$crawler->getResults();
|
||||||
$this->assertSitemapContains($sitemap,[
|
$this->assertTreeContains($sitemap,[
|
||||||
'http://localhost:8080/redirectToNotFound' => ['code' => 302 ],
|
'http://localhost:8080/redirectToNotFound' => ['code' => 302 ],
|
||||||
'http://localhost:8080/notExists' => ['code' => 404 ],
|
'http://localhost:8080/notFound2' => ['code' => 404 ],
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -43,10 +92,10 @@ class CrawlerTest extends TestCase{
|
|||||||
$crawler=new Crawler('http://localhost:8080');
|
$crawler=new Crawler('http://localhost:8080');
|
||||||
$crawler->crawl('http://localhost:8080/redirectToRedirectToNotFound');
|
$crawler->crawl('http://localhost:8080/redirectToRedirectToNotFound');
|
||||||
$sitemap=$crawler->getResults();
|
$sitemap=$crawler->getResults();
|
||||||
$this->assertSitemapContains($sitemap,[
|
$this->assertTreeContains($sitemap,[
|
||||||
'http://localhost:8080/redirectToRedirectToNotFound' => ['code' => 302 ],
|
'http://localhost:8080/redirectToRedirectToNotFound' => ['code' => 302 ],
|
||||||
'http://localhost:8080/redirectToNotFound' => ['code' => 302 ],
|
'http://localhost:8080/redirectToNotFound' => ['code' => 302 ],
|
||||||
'http://localhost:8080/notExists' => ['code' => 404 ],
|
'http://localhost:8080/notFound2' => ['code' => 404 ],
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -54,7 +103,7 @@ class CrawlerTest extends TestCase{
|
|||||||
$crawler=new Crawler('http://localhost:8080');
|
$crawler=new Crawler('http://localhost:8080');
|
||||||
$crawler->crawl('http://localhost:8080/twoRedirectsToSameLocation');
|
$crawler->crawl('http://localhost:8080/twoRedirectsToSameLocation');
|
||||||
$sitemap=$crawler->getResults();
|
$sitemap=$crawler->getResults();
|
||||||
$this->assertSitemapContains($sitemap,[
|
$this->assertTreeContains($sitemap,[
|
||||||
'http://localhost:8080/twoRedirectsToSameLocation' => ['code' => 200 ],
|
'http://localhost:8080/twoRedirectsToSameLocation' => ['code' => 200 ],
|
||||||
'http://localhost:8080/redirect1' => ['code' => 302 ],
|
'http://localhost:8080/redirect1' => ['code' => 302 ],
|
||||||
'http://localhost:8080/redirect2' => ['code' => 302 ],
|
'http://localhost:8080/redirect2' => ['code' => 302 ],
|
||||||
@ -66,8 +115,8 @@ class CrawlerTest extends TestCase{
|
|||||||
$crawler=new Crawler('http://localhost:8080');
|
$crawler=new Crawler('http://localhost:8080');
|
||||||
$crawler->crawl('http://localhost:8080/timeout');
|
$crawler->crawl('http://localhost:8080/timeout');
|
||||||
$sitemap=$crawler->getResults();
|
$sitemap=$crawler->getResults();
|
||||||
$this->assertSitemapContains($sitemap,[
|
$this->assertTreeContains($sitemap,[
|
||||||
'http://localhost:8080/timeout' => ['code' => '???' ],
|
'http://localhost:8080/timeout' => ['code' => '' ],
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -75,17 +124,18 @@ class CrawlerTest extends TestCase{
|
|||||||
$crawler=new Crawler('http://localhost:8080');
|
$crawler=new Crawler('http://localhost:8080');
|
||||||
$crawler->crawl('http://localhost:8080/internalServerError');
|
$crawler->crawl('http://localhost:8080/internalServerError');
|
||||||
$sitemap=$crawler->getResults();
|
$sitemap=$crawler->getResults();
|
||||||
$this->assertSitemapContains($sitemap,[
|
$this->assertTreeContains($sitemap,[
|
||||||
'http://localhost:8080/internalServerError' => ['code' => 500 ],
|
'http://localhost:8080/internalServerError' => ['code' => 500 ],
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
public function assertSitemapContains($sitemap, $contains){
|
public function assertTreeContains($haystack, $contains, $crumbs=''){
|
||||||
foreach($contains as $url=>$vals){
|
foreach($contains as $k=>$v){
|
||||||
$this->assertArrayHasKey($url, $sitemap, "$url not found in sitemap");
|
$this->assertArrayHasKey($k, $haystack, $crumbs);
|
||||||
foreach($vals as $k=>$v){
|
if(is_array($v)){
|
||||||
$this->assertArrayHasKey($k, $sitemap[$url], "$url => $k not found in sitemap");
|
$this->assertTreeContains($haystack[$k], $v, $crumbs.' => '.$k);
|
||||||
$this->assertEquals($v, $sitemap[$url][$k], "$url => $k = $v not found in sitemap");
|
}else{
|
||||||
|
$this->assertEquals($v, $haystack[$k], $crumbs.' => '.$k);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -23,7 +23,7 @@ app.get('/link4', function (request, response) {
|
|||||||
});
|
});
|
||||||
|
|
||||||
app.get('/redirectToNotFound', function (request, response) {
|
app.get('/redirectToNotFound', function (request, response) {
|
||||||
response.redirect('/notExists');
|
response.redirect('/notFound2');
|
||||||
});
|
});
|
||||||
|
|
||||||
app.get('/redirectToFound', function (request, response) {
|
app.get('/redirectToFound', function (request, response) {
|
||||||
@ -54,6 +54,18 @@ app.get('/internalServerError', function (request, response) {
|
|||||||
response.status(500).end();
|
response.status(500).end();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
app.get('/page1', function (request, response) {
|
||||||
|
response.end('<a href="/page1">Page1</a><a href="/page2">Page2</a><a href="/page3">Page3</a><a href="/notFound1">NotFound</a><a href="/redirectToRedirectToNotFound">redirectToRedirectToNotFound</a>');
|
||||||
|
});
|
||||||
|
app.get('/page2', function (request, response) {
|
||||||
|
response.end('<a href="/page1">Page1</a><a href="/page2">Page2</a><a href="/page3">Page3</a><a href="/notFound1">NotFound</a><a href="/redirectToRedirectToNotFound">redirectToRedirectToNotFound</a>');
|
||||||
|
});
|
||||||
|
app.get('/page3', function (request, response) {
|
||||||
|
response.end('<a href="/page1">Page1</a><a href="/page2">Page2</a><a href="/page3">Page3</a><a href="/notFound1">NotFound</a><a href="/redirectToRedirectToNotFound">redirectToRedirectToNotFound</a>');
|
||||||
|
});
|
||||||
|
app.get('/page4', function (request, response) {
|
||||||
|
response.end('<a href="/redirectToRedirectToNotFound">redirectToRedirectToNotFound</a>');
|
||||||
|
});
|
||||||
|
|
||||||
let server = app.listen(8080, function () {
|
let server = app.listen(8080, function () {
|
||||||
const host = 'localhost';
|
const host = 'localhost';
|
||||||
|
Loading…
Reference in New Issue
Block a user