diff --git a/src/CrawlObserver.php b/src/CrawlObserver.php
index 9f27f3f..cb7831e 100644
--- a/src/CrawlObserver.php
+++ b/src/CrawlObserver.php
@@ -35,7 +35,6 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
ResponseInterface $response,
?UriInterface $foundOnUrl = null
){
-
// https://github.com/guzzle/guzzle/blob/master/docs/faq.rst#how-can-i-track-redirected-requests
if($response->getHeader('X-Guzzle-Redirect-History')){
// Retrieve both Redirect History headers
@@ -52,13 +51,30 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
$fullRedirectReport[$key] = ['location' => $value, 'code' => $redirectCodeHistory[$key]];
}
- foreach($fullRedirectReport as $rr){
+ // PART 2a/2: look up the pages this URL has already been found on (empty if none yet)
+ $fos=$this->results[(string)$url]['foundOn']??[];
+
+ foreach($fullRedirectReport as $k=>$redirect){
$this->addResult(
- (String)$rr['location'],
+ (String)$redirect['location'],
(string)$foundOnUrl,
- $rr['code'],
- $response->getHeader('Content-Type')[0]??null
+ $redirect['code'],
+ $response->getHeader('Content-Type')[0]??null,
+ $k == 0 ? $fullRedirectReport : []
);
+ // PART 2b/2: this redirecting URL has only just been crawled, but if it
+ // was already found on other pages, every later hop in its redirect
+ // history must also be credited with those foundOn URLs.
+ if($k>0){
+ foreach($fos as $kk=>$vv){
+ $this->addResult(
+ (String)$redirect['location'],
+ (string)$kk,
+ $redirect['code'],
+ $response->getHeader('Content-Type')[0]??null
+ );
+ }
+ }
}
}else{
$this->addResult(
@@ -72,6 +88,7 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
/**
* Called when the crawler has found the url again
+ * NOTE: this may be called before crawled() or crawlFailed() has been called for this URL.
*
* @param \Psr\Http\Message\UriInterface $url
* @param \Psr\Http\Message\ResponseInterface $response
@@ -81,10 +98,17 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
UriInterface $url,
?UriInterface $foundOnUrl = null
){
- $this->addResult((String)$url,(string)$foundOnUrl);
+ // PART 1/2: if there is an existing result with known redirects
+ // then its redirects must apply to this page
+ if(count($this->results[(string)$url]['redirects']??[])>0){
+ foreach($this->results[(string)$url]['redirects'] as $redirect){
+ $this->addResult($redirect['location'],(string)$foundOnUrl);
+ }
+ }else{
+ $this->addResult((String)$url,(string)$foundOnUrl);
+ }
}
-
/**
* Called when the crawler had a problem crawling the given url.
*
@@ -104,20 +128,24 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
}
}
- public function addResult($url, $foundOn, $code='???', $type='???'){
+ public function addResult($url, $foundOn, $code='', $type='', $redirects=[]){ // record one sighting of $url on page $foundOn in $this->results
if(!isset($this->results[$url])){
- $this->results[$url]=[
- 'code'=>$code,
- 'type'=>$type,
- 'foundOn'=>[$foundOn=>1],
- ];
- return;
+ $this->results[$url]=[]; // first sighting: start from an empty entry; the fields are filled in below
+ }
+ if(!isset($this->results[$url]['code']) || !$this->results[$url]['code']){ // only a missing or falsy code is filled in; a truthy one is kept
+ $this->results[$url]['code']=$code;
+ }
+ if(!isset($this->results[$url]['type']) || !$this->results[$url]['type']){ // same rule for the content type
+ $this->results[$url]['type']=$type;
}
if(isset($this->results[$url]['foundOn'][$foundOn])){
$this->results[$url]['foundOn'][$foundOn]++;
}else{
$this->results[$url]['foundOn'][$foundOn]=1;
}
+ if(!isset($this->results[$url]['redirects']) || !$this->results[$url]['redirects']){ // a non-empty redirect list already stored is never overwritten
+ $this->results[$url]['redirects']=$redirects;
+ }
}
/**
diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php
index 1438755..06d61b4 100644
--- a/tests/CrawlerTest.php
+++ b/tests/CrawlerTest.php
@@ -9,7 +9,7 @@ class CrawlerTest extends TestCase{
$crawler=new Crawler('http://localhost:8080');
$crawler->crawl('http://localhost:8080');
$sitemap=$crawler->getResults();
- $this->assertSitemapContains($sitemap,[
+ $this->assertTreeContains($sitemap,[
'http://localhost:8080/' => ['code' => 200 ],
'http://localhost:8080/link1' => ['code' => 200 ],
'http://localhost:8080/link2' => ['code' => 200 ],
@@ -19,11 +19,60 @@ class CrawlerTest extends TestCase{
]);
}
+    /** Each crawled URL must report every page it was found on, redirect targets included. */
+    public function testCollectsAllFoundOnUrls(){
+        $crawler=new Crawler('http://localhost:8080');
+        $crawler->crawl('http://localhost:8080/page1');
+        // Crawling page4 second ensures the order of results for the URL tracking test 3PARTS.
+        $crawler->crawl('http://localhost:8080/page4');
+        $sitemap=$crawler->getResults();
+        $this->assertTreeContains($sitemap,[
+            'http://localhost:8080/page1' => ['code' => 200 , 'foundOn' => [
+                'http://localhost:8080/page1' => 1,
+                'http://localhost:8080/page2' => 1,
+                'http://localhost:8080/page3' => 1,
+            ]],
+            'http://localhost:8080/page2' => ['code' => 200 , 'foundOn' => [
+                'http://localhost:8080/page1' => 1,
+                'http://localhost:8080/page2' => 1,
+                'http://localhost:8080/page3' => 1,
+            ]],
+            'http://localhost:8080/page3' => ['code' => 200 , 'foundOn' => [
+                'http://localhost:8080/page1' => 1,
+                'http://localhost:8080/page2' => 1,
+                'http://localhost:8080/page3' => 1,
+            ]],
+            'http://localhost:8080/notFound1' => ['code' => 404 , 'foundOn' => [
+                'http://localhost:8080/page1' => 1,
+                'http://localhost:8080/page2' => 1,
+                'http://localhost:8080/page3' => 1,
+            ]],
+            'http://localhost:8080/notFound2' => ['code' => 404 , 'foundOn' => [
+                'http://localhost:8080/page1' => 1,
+                'http://localhost:8080/page2' => 1,
+                'http://localhost:8080/page3' => 1,
+                'http://localhost:8080/page4' => 1,
+            ]],
+            'http://localhost:8080/redirectToNotFound' => ['code' => 302 , 'foundOn' => [
+                'http://localhost:8080/page1' => 1,
+                'http://localhost:8080/page2' => 1,
+                'http://localhost:8080/page3' => 1,
+                'http://localhost:8080/page4' => 1,
+            ]],
+            'http://localhost:8080/redirectToRedirectToNotFound' => ['code' => 302 , 'foundOn' => [
+                'http://localhost:8080/page1' => 1,
+                'http://localhost:8080/page2' => 1,
+                'http://localhost:8080/page3' => 1,
+                'http://localhost:8080/page4' => 1,
+            ]],
+        ]);
+    }
+
public function testCanFollowRedirectToFound(){
$crawler=new Crawler('http://localhost:8080');
$crawler->crawl('http://localhost:8080/redirectToFound');
$sitemap=$crawler->getResults();
- $this->assertSitemapContains($sitemap,[
+ $this->assertTreeContains($sitemap,[
'http://localhost:8080/redirectToFound' => ['code' => 302 ],
'http://localhost:8080/' => ['code' => 200 ],
]);
@@ -33,9 +82,9 @@ class CrawlerTest extends TestCase{
$crawler=new Crawler('http://localhost:8080');
$crawler->crawl('http://localhost:8080/redirectToNotFound');
$sitemap=$crawler->getResults();
- $this->assertSitemapContains($sitemap,[
+ $this->assertTreeContains($sitemap,[
'http://localhost:8080/redirectToNotFound' => ['code' => 302 ],
- 'http://localhost:8080/notExists' => ['code' => 404 ],
+ 'http://localhost:8080/notFound2' => ['code' => 404 ],
]);
}
@@ -43,10 +92,10 @@ class CrawlerTest extends TestCase{
$crawler=new Crawler('http://localhost:8080');
$crawler->crawl('http://localhost:8080/redirectToRedirectToNotFound');
$sitemap=$crawler->getResults();
- $this->assertSitemapContains($sitemap,[
+ $this->assertTreeContains($sitemap,[
'http://localhost:8080/redirectToRedirectToNotFound' => ['code' => 302 ],
'http://localhost:8080/redirectToNotFound' => ['code' => 302 ],
- 'http://localhost:8080/notExists' => ['code' => 404 ],
+ 'http://localhost:8080/notFound2' => ['code' => 404 ],
]);
}
@@ -54,7 +103,7 @@ class CrawlerTest extends TestCase{
$crawler=new Crawler('http://localhost:8080');
$crawler->crawl('http://localhost:8080/twoRedirectsToSameLocation');
$sitemap=$crawler->getResults();
- $this->assertSitemapContains($sitemap,[
+ $this->assertTreeContains($sitemap,[
'http://localhost:8080/twoRedirectsToSameLocation' => ['code' => 200 ],
'http://localhost:8080/redirect1' => ['code' => 302 ],
'http://localhost:8080/redirect2' => ['code' => 302 ],
@@ -66,8 +115,8 @@ class CrawlerTest extends TestCase{
$crawler=new Crawler('http://localhost:8080');
$crawler->crawl('http://localhost:8080/timeout');
$sitemap=$crawler->getResults();
- $this->assertSitemapContains($sitemap,[
- 'http://localhost:8080/timeout' => ['code' => '???' ],
+ $this->assertTreeContains($sitemap,[
+ 'http://localhost:8080/timeout' => ['code' => '' ],
]);
}
@@ -75,17 +124,18 @@ class CrawlerTest extends TestCase{
$crawler=new Crawler('http://localhost:8080');
$crawler->crawl('http://localhost:8080/internalServerError');
$sitemap=$crawler->getResults();
- $this->assertSitemapContains($sitemap,[
+ $this->assertTreeContains($sitemap,[
'http://localhost:8080/internalServerError' => ['code' => 500 ],
]);
}
- public function assertSitemapContains($sitemap, $contains){
- foreach($contains as $url=>$vals){
- $this->assertArrayHasKey($url, $sitemap, "$url not found in sitemap");
- foreach($vals as $k=>$v){
- $this->assertArrayHasKey($k, $sitemap[$url], "$url => $k not found in sitemap");
- $this->assertEquals($v, $sitemap[$url][$k], "$url => $k = $v not found in sitemap");
+ public function assertTreeContains($haystack, $contains, $crumbs=''){ // asserts every key/value in $contains is also present in $haystack, recursing into nested arrays
+ foreach($contains as $k=>$v){
+ $this->assertArrayHasKey($k, $haystack, $crumbs); // failure message is the path of keys walked so far
+ if(is_array($v)){
+ $this->assertTreeContains($haystack[$k], $v, $crumbs.' => '.$k); // nested expectation: descend, appending this key to the breadcrumb trail
+ }else{
+ $this->assertEquals($v, $haystack[$k], $crumbs.' => '.$k); // leaf expectation: compare the value with assertEquals
}
}
}
diff --git a/tests/server/server.js b/tests/server/server.js
index f8320c4..7d32f87 100644
--- a/tests/server/server.js
+++ b/tests/server/server.js
@@ -23,7 +23,7 @@ app.get('/link4', function (request, response) {
});
app.get('/redirectToNotFound', function (request, response) {
- response.redirect('/notExists');
+ response.redirect('/notFound2');
});
app.get('/redirectToFound', function (request, response) {
@@ -54,6 +54,18 @@ app.get('/internalServerError', function (request, response) {
response.status(500).end();
});
+// Crawl fixtures: page1, page2 and page3 all respond with the same body,
+// so they are registered in a loop, in the original order.
+['/page1', '/page2', '/page3'].forEach((path) => {
+    app.get(path, (request, response) => {
+        response.end('Page1Page2Page3NotFoundredirectToRedirectToNotFound');
+    });
+});
+
+// page4 responds with its own, shorter body.
+app.get('/page4', (request, response) => {
+    response.end('redirectToRedirectToNotFound');
+});
let server = app.listen(8080, function () {
const host = 'localhost';