Make the crawl observer collate all found-on URLs for each crawled URL
This commit is contained in:
parent
6a986a62c0
commit
ac7b849036
@ -4,6 +4,9 @@
|
|||||||
"type": "project",
|
"type": "project",
|
||||||
"require": {
|
"require": {
|
||||||
"spatie/crawler": "^4.6",
|
"spatie/crawler": "^4.6",
|
||||||
|
"cweagans/composer-patches": "~1.0"
|
||||||
|
},
|
||||||
|
"require-dev": {
|
||||||
"phpunit/phpunit": "^8.5"
|
"phpunit/phpunit": "^8.5"
|
||||||
},
|
},
|
||||||
"authors": [
|
"authors": [
|
||||||
@ -16,6 +19,12 @@
|
|||||||
"psr-4": {
|
"psr-4": {
|
||||||
"JHodges\\Sitemap\\": "src/"
|
"JHodges\\Sitemap\\": "src/"
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
"extra": {
|
||||||
|
"patches": {
|
||||||
|
"spatie/crawler": {
|
||||||
|
"add crawled again observer": "https://patch-diff.githubusercontent.com/raw/spatie/crawler/pull/280.patch"
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
1786
composer.lock
generated
1786
composer.lock
generated
File diff suppressed because it is too large
Load Diff
@ -21,7 +21,6 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
|
|||||||
*/
|
*/
|
||||||
public function willCrawl(UriInterface $url)
|
public function willCrawl(UriInterface $url)
|
||||||
{
|
{
|
||||||
echo "Will:$url\n";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -38,9 +37,9 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
|
|||||||
){
|
){
|
||||||
|
|
||||||
// https://github.com/guzzle/guzzle/blob/master/docs/faq.rst#how-can-i-track-redirected-requests
|
// https://github.com/guzzle/guzzle/blob/master/docs/faq.rst#how-can-i-track-redirected-requests
|
||||||
// Retrieve both Redirect History headers
|
|
||||||
$fullRedirectReport = [];
|
|
||||||
if($response->getHeader('X-Guzzle-Redirect-History')){
|
if($response->getHeader('X-Guzzle-Redirect-History')){
|
||||||
|
// Retrieve both Redirect History headers
|
||||||
|
$fullRedirectReport = [];
|
||||||
// Retrieve both Redirect History headers
|
// Retrieve both Redirect History headers
|
||||||
$redirectUriHistory = $response->getHeader('X-Guzzle-Redirect-History'); // retrieve Redirect URI history
|
$redirectUriHistory = $response->getHeader('X-Guzzle-Redirect-History'); // retrieve Redirect URI history
|
||||||
$redirectCodeHistory = $response->getHeader('X-Guzzle-Redirect-Status-History'); // retrieve Redirect HTTP Status history
|
$redirectCodeHistory = $response->getHeader('X-Guzzle-Redirect-Status-History'); // retrieve Redirect HTTP Status history
|
||||||
@ -52,18 +51,38 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
|
|||||||
foreach ($redirectUriHistory as $key => $value) {
|
foreach ($redirectUriHistory as $key => $value) {
|
||||||
$fullRedirectReport[$key] = ['location' => $value, 'code' => $redirectCodeHistory[$key]];
|
$fullRedirectReport[$key] = ['location' => $value, 'code' => $redirectCodeHistory[$key]];
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
foreach($fullRedirectReport as $rr){
|
foreach($fullRedirectReport as $rr){
|
||||||
$this->results[]=[
|
$this->results[(String)$rr['location']]=[
|
||||||
'location'=>(String)$rr['location'],
|
'code'=>$rr['code'],
|
||||||
'code'=>$rr['code'],
|
'type'=>$response->getHeader('Content-Type')[0]??null,
|
||||||
|
'foundOn'=>[(string)$foundOnUrl],
|
||||||
|
];
|
||||||
|
}
|
||||||
|
}else{
|
||||||
|
$this->results[(String)$url]=[
|
||||||
|
'code'=>$response->getStatusCode(),
|
||||||
'type'=>$response->getHeader('Content-Type')[0]??null,
|
'type'=>$response->getHeader('Content-Type')[0]??null,
|
||||||
'foundOn'=>(string)$foundOnUrl,
|
'foundOn'=>[(string)$foundOnUrl],
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Called when the crawler encounters a URL that it has already crawled
|
||||||
|
*
|
||||||
|
* @param \Psr\Http\Message\UriInterface $url
|
||||||
|
* @param \Psr\Http\Message\ResponseInterface $response
|
||||||
|
* @param \Psr\Http\Message\UriInterface|null $foundOnUrl
|
||||||
|
*/
|
||||||
|
public function alreadyCrawled(
|
||||||
|
UriInterface $url,
|
||||||
|
?UriInterface $foundOnUrl = null
|
||||||
|
){
|
||||||
|
$this->results[(String)$url]['foundOn'][]=(string)$foundOnUrl;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Called when the crawler had a problem crawling the given url.
|
* Called when the crawler had a problem crawling the given url.
|
||||||
*
|
*
|
||||||
@ -76,36 +95,21 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
|
|||||||
RequestException $requestException,
|
RequestException $requestException,
|
||||||
?UriInterface $foundOnUrl = null
|
?UriInterface $foundOnUrl = null
|
||||||
){
|
){
|
||||||
if($response=$requestException->getResponse()){
|
if( $response=$requestException->getResponse() ){
|
||||||
$code=$response->getStatusCode();
|
$this->crawled($url,$response,$foundOnUrl);
|
||||||
$type=$response->getHeader('Content-Type')[0]??null;
|
|
||||||
}else{
|
}else{
|
||||||
$code='???';
|
$this->results[(String)$url]=[
|
||||||
$type='';
|
'code'=>'???',
|
||||||
|
'type'=>'???',
|
||||||
|
'foundOn'=>[(string)$foundOnUrl],
|
||||||
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Retrieve both Redirect History headers
|
|
||||||
$fullRedirectReport = [];
|
|
||||||
if($response && $response->getHeader('X-Guzzle-Redirect-History')){
|
|
||||||
$redirectUriHistory = $response->getHeader('X-Guzzle-Redirect-History'); // retrieve Redirect URI history
|
|
||||||
$redirectCodeHistory = $response->getHeader('X-Guzzle-Redirect-Status-History'); // retrieve Redirect HTTP Status history
|
|
||||||
$fullRedirectReport=[$redirectUriHistory,$redirectCodeHistory];
|
|
||||||
}
|
|
||||||
|
|
||||||
$this->results[]=[
|
|
||||||
'link'=>(String)$url,
|
|
||||||
'code'=>$code,
|
|
||||||
'type'=>$type,
|
|
||||||
'parent'=>(string)$foundOnUrl,
|
|
||||||
'redirects'=>$fullRedirectReport,
|
|
||||||
];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Called when the crawl has ended.
|
* Called when the crawl has ended.
|
||||||
*/
|
*/
|
||||||
public function finishedCrawling() {
|
public function finishedCrawling() {
|
||||||
//print_r($this->results);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -13,21 +13,28 @@ use Spatie\Crawler\CrawlInternalUrls;
|
|||||||
|
|
||||||
class Crawler{
|
class Crawler{
|
||||||
|
|
||||||
public function Crawl($url){
|
private $observer;
|
||||||
$observer=new CrawlObserver();
|
private $crawler;
|
||||||
|
|
||||||
SpatieCrawler::create([
|
public function __construct($baseUrl){
|
||||||
|
$this->observer = new CrawlObserver();
|
||||||
|
$this->crawler = SpatieCrawler::create([
|
||||||
RequestOptions::ALLOW_REDIRECTS => [
|
RequestOptions::ALLOW_REDIRECTS => [
|
||||||
'track_redirects' => true,
|
'track_redirects' => true,
|
||||||
]
|
]
|
||||||
])
|
])
|
||||||
//->setMaximumDepth(1)
|
//->setMaximumDepth(1)
|
||||||
->setCrawlObserver($observer)
|
->setCrawlProfile(new CrawlInternalUrls($baseUrl))
|
||||||
->setCrawlProfile(new CrawlInternalUrls($url))
|
->setCrawlObserver($this->observer)
|
||||||
//->addToCrawlQueue( CrawlUrl::create(new Uri('https://hudevad.com/en/')) )
|
|
||||||
->startCrawling($url)
|
|
||||||
;
|
;
|
||||||
return $observer->results;
|
}
|
||||||
|
|
||||||
|
public function crawl($url){
|
||||||
|
$this->crawler->startCrawling($url);
|
||||||
|
}
|
||||||
|
|
||||||
|
public function getResults(){
|
||||||
|
return $this->observer->results;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user