Make the observer collate all "found on" URLs for each crawled URL

This commit is contained in:
James 2020-02-21 10:24:34 +00:00
parent 6a986a62c0
commit ac7b849036
4 changed files with 977 additions and 909 deletions

View File

@ -4,6 +4,9 @@
"type": "project", "type": "project",
"require": { "require": {
"spatie/crawler": "^4.6", "spatie/crawler": "^4.6",
"cweagans/composer-patches": "~1.0"
},
"require-dev": {
"phpunit/phpunit": "^8.5" "phpunit/phpunit": "^8.5"
}, },
"authors": [ "authors": [
@ -16,6 +19,12 @@
"psr-4": { "psr-4": {
"JHodges\\Sitemap\\": "src/" "JHodges\\Sitemap\\": "src/"
} }
},
"extra": {
"patches": {
"spatie/crawler": {
"add crawled again observer": "https://patch-diff.githubusercontent.com/raw/spatie/crawler/pull/280.patch"
}
}
} }
} }

1786
composer.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -21,7 +21,6 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
*/ */
public function willCrawl(UriInterface $url) public function willCrawl(UriInterface $url)
{ {
echo "Will:$url\n";
} }
/** /**
@ -38,9 +37,9 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
){ ){
// https://github.com/guzzle/guzzle/blob/master/docs/faq.rst#how-can-i-track-redirected-requests // https://github.com/guzzle/guzzle/blob/master/docs/faq.rst#how-can-i-track-redirected-requests
// Retrieve both Redirect History headers
$fullRedirectReport = [];
if($response->getHeader('X-Guzzle-Redirect-History')){ if($response->getHeader('X-Guzzle-Redirect-History')){
// Retrieve both Redirect History headers
$fullRedirectReport = [];
// Retrieve both Redirect History headers // Retrieve both Redirect History headers
$redirectUriHistory = $response->getHeader('X-Guzzle-Redirect-History'); // retrieve Redirect URI history $redirectUriHistory = $response->getHeader('X-Guzzle-Redirect-History'); // retrieve Redirect URI history
$redirectCodeHistory = $response->getHeader('X-Guzzle-Redirect-Status-History'); // retrieve Redirect HTTP Status history $redirectCodeHistory = $response->getHeader('X-Guzzle-Redirect-Status-History'); // retrieve Redirect HTTP Status history
@ -52,18 +51,38 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
foreach ($redirectUriHistory as $key => $value) { foreach ($redirectUriHistory as $key => $value) {
$fullRedirectReport[$key] = ['location' => $value, 'code' => $redirectCodeHistory[$key]]; $fullRedirectReport[$key] = ['location' => $value, 'code' => $redirectCodeHistory[$key]];
} }
}
foreach($fullRedirectReport as $rr){ foreach($fullRedirectReport as $rr){
$this->results[]=[ $this->results[(String)$rr['location']]=[
'location'=>(String)$rr['location'], 'code'=>$rr['code'],
'code'=>$rr['code'], 'type'=>$response->getHeader('Content-Type')[0]??null,
'foundOn'=>[(string)$foundOnUrl],
];
}
}else{
$this->results[(String)$url]=[
'code'=>$response->getStatusCode(),
'type'=>$response->getHeader('Content-Type')[0]??null, 'type'=>$response->getHeader('Content-Type')[0]??null,
'foundOn'=>(string)$foundOnUrl, 'foundOn'=>[(string)$foundOnUrl],
]; ];
} }
} }
/**
 * Called when the crawler has found the url again.
 *
 * Records the referring page in the url's 'foundOn' list so that every page
 * linking to the url is collated under a single result entry.
 *
 * @param \Psr\Http\Message\UriInterface $url
 * @param \Psr\Http\Message\UriInterface|null $foundOnUrl
 */
public function alreadyCrawled(
    UriInterface $url,
    ?UriInterface $foundOnUrl = null
){
    $key = (string)$url;

    // crawled() stores redirected responses under their redirect locations,
    // so the requested url may have no entry of its own. Seed a complete
    // record (using the same '???' placeholders as crawlFailed()) instead of
    // leaving one that lacks the 'code' and 'type' keys.
    if (!isset($this->results[$key])) {
        $this->results[$key] = [
            'code' => '???',
            'type' => '???',
            'foundOn' => [],
        ];
    }

    $this->results[$key]['foundOn'][] = (string)$foundOnUrl;
}
/** /**
* Called when the crawler had a problem crawling the given url. * Called when the crawler had a problem crawling the given url.
* *
@ -76,36 +95,21 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
RequestException $requestException, RequestException $requestException,
?UriInterface $foundOnUrl = null ?UriInterface $foundOnUrl = null
){ ){
if($response=$requestException->getResponse()){ if( $response=$requestException->getResponse() ){
$code=$response->getStatusCode(); $this->crawled($url,$response,$foundOnUrl);
$type=$response->getHeader('Content-Type')[0]??null;
}else{ }else{
$code='???'; $this->results[(String)$url]=[
$type=''; 'code'=>'???',
'type'=>'???',
'foundOn'=>[(string)$foundOnUrl],
];
} }
// Retrieve both Redirect History headers
$fullRedirectReport = [];
if($response && $response->getHeader('X-Guzzle-Redirect-History')){
$redirectUriHistory = $response->getHeader('X-Guzzle-Redirect-History'); // retrieve Redirect URI history
$redirectCodeHistory = $response->getHeader('X-Guzzle-Redirect-Status-History'); // retrieve Redirect HTTP Status history
$fullRedirectReport=[$redirectUriHistory,$redirectCodeHistory];
}
$this->results[]=[
'link'=>(String)$url,
'code'=>$code,
'type'=>$type,
'parent'=>(string)$foundOnUrl,
'redirects'=>$fullRedirectReport,
];
} }
/** /**
* Called when the crawl has ended. * Called when the crawl has ended.
*/ */
public function finishedCrawling() { public function finishedCrawling() {
//print_r($this->results);
} }
} }

View File

@ -13,21 +13,28 @@ use Spatie\Crawler\CrawlInternalUrls;
class Crawler{ class Crawler{
public function Crawl($url){ private $observer;
$observer=new CrawlObserver(); private $crawler;
SpatieCrawler::create([ public function __construct($baseUrl){
$this->observer = new CrawlObserver();
$this->crawler = SpatieCrawler::create([
RequestOptions::ALLOW_REDIRECTS => [ RequestOptions::ALLOW_REDIRECTS => [
'track_redirects' => true, 'track_redirects' => true,
] ]
]) ])
//->setMaximumDepth(1) //->setMaximumDepth(1)
->setCrawlObserver($observer) ->setCrawlProfile(new CrawlInternalUrls($baseUrl))
->setCrawlProfile(new CrawlInternalUrls($url)) ->setCrawlObserver($this->observer)
//->addToCrawlQueue( CrawlUrl::create(new Uri('https://hudevad.com/en/')) )
->startCrawling($url)
; ;
return $observer->results; }
/**
 * Starts a crawl at the given url using the crawler built in the constructor.
 *
 * Nothing is returned here; the observer attached in the constructor holds
 * the results.
 *
 * @param mixed $url Passed straight to startCrawling(); presumably a url
 *                   string or UriInterface — confirm against spatie/crawler.
 */
public function crawl($url){
$this->crawler->startCrawling($url);
}
/**
 * Returns the results property of the observer created in the constructor.
 *
 * @return mixed Whatever the observer currently holds in its results property.
 */
public function getResults(){
return $this->observer->results;
} }
} }