This commit is contained in:
James 2020-02-20 17:06:44 +00:00
commit 6a986a62c0
6 changed files with 2905 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/vendor/

21
composer.json Normal file
View File

@ -0,0 +1,21 @@
{
"name": "jhodges/sitemap",
"description": "generate full sitemap report",
"type": "project",
"require": {
"spatie/crawler": "^4.6",
"phpunit/phpunit": "^8.5"
},
"authors": [
{
"name": "James",
"email": "inbox.dev@jhodges.co.uk"
}
],
"autoload": {
"psr-4": {
"JHodges\\Sitemap\\": "src/"
}
}
}

2718
composer.lock generated Normal file

File diff suppressed because it is too large Load Diff

111
src/CrawlObserver.php Normal file
View File

@ -0,0 +1,111 @@
<?php
namespace JHodges\Sitemap;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\RequestOptions;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlInternalUrls;
/**
 * Crawl observer that accumulates one report row per crawled location in
 * the public $results property.
 */
class CrawlObserver extends \Spatie\Crawler\CrawlObserver
{
    /**
     * Rows collected during the crawl, in the order the callbacks fired.
     *
     * Rows appended by crawled() have the keys
     * 'location', 'code', 'type' and 'foundOn'.
     * Rows appended by crawlFailed() have the keys
     * 'link', 'code', 'type', 'parent' and 'redirects'.
     *
     * @var array[]
     */
    public $results = [];

    /**
     * Called when the crawler will crawl the url.
     *
     * Prints a progress line to standard output.
     *
     * @param \Psr\Http\Message\UriInterface $url
     */
    public function willCrawl(UriInterface $url)
    {
        echo "Will:$url\n";
    }

    /**
     * Called when the crawler has crawled the given url successfully.
     *
     * Appends one row for the requested URL plus one row for every redirect
     * hop reported by Guzzle's redirect tracking headers. A URL that was
     * answered without any redirect therefore yields exactly one row.
     *
     * Note that every row of a redirect chain carries the Content-Type of
     * the final response, because only that response is available here.
     *
     * @param \Psr\Http\Message\UriInterface $url
     * @param \Psr\Http\Message\ResponseInterface $response
     * @param \Psr\Http\Message\UriInterface|null $foundOnUrl
     */
    public function crawled(
        UriInterface $url,
        ResponseInterface $response,
        ?UriInterface $foundOnUrl = null
    ) {
        // https://github.com/guzzle/guzzle/blob/master/docs/faq.rst#how-can-i-track-redirected-requests
        // getHeader() returns an empty array when the header is absent, so
        // both histories start out empty for a response without redirects.
        $redirectUriHistory = $response->getHeader('X-Guzzle-Redirect-History');
        $redirectCodeHistory = $response->getHeader('X-Guzzle-Redirect-Status-History');

        // The URI history lists only redirect targets and the status history
        // only the redirecting responses: prepend the URL that was requested
        // and append the final status so both lists line up index by index.
        array_unshift($redirectUriHistory, (string)$url);
        array_push($redirectCodeHistory, $response->getStatusCode());

        $type = $response->getHeader('Content-Type')[0] ?? null;
        $foundOn = (string)$foundOnUrl;

        foreach ($redirectUriHistory as $key => $location) {
            $this->results[] = [
                'location' => (string)$location,
                // Fall back to null rather than raising a notice should the
                // two history headers ever differ in length.
                'code' => $redirectCodeHistory[$key] ?? null,
                'type' => $type,
                'foundOn' => $foundOn,
            ];
        }
    }

    /**
     * Called when the crawler had a problem crawling the given url.
     *
     * Appends a single row. When the exception carries no response the code
     * is recorded as '???' and the type as an empty string. 'redirects' is
     * either an empty array or a two element array holding the raw redirect
     * URI history and the raw redirect status history headers.
     *
     * @param \Psr\Http\Message\UriInterface $url
     * @param \GuzzleHttp\Exception\RequestException $requestException
     * @param \Psr\Http\Message\UriInterface|null $foundOnUrl
     */
    public function crawlFailed(
        UriInterface $url,
        RequestException $requestException,
        ?UriInterface $foundOnUrl = null
    ) {
        $response = $requestException->getResponse();

        if ($response) {
            $code = $response->getStatusCode();
            $type = $response->getHeader('Content-Type')[0] ?? null;
        } else {
            $code = '???';
            $type = '';
        }

        // Retrieve both Redirect History headers, if a response exists and
        // it went through at least one redirect.
        $fullRedirectReport = [];
        if ($response && $response->getHeader('X-Guzzle-Redirect-History')) {
            $fullRedirectReport = [
                $response->getHeader('X-Guzzle-Redirect-History'),
                $response->getHeader('X-Guzzle-Redirect-Status-History'),
            ];
        }

        $this->results[] = [
            'link' => (string)$url,
            'code' => $code,
            'type' => $type,
            'parent' => (string)$foundOnUrl,
            'redirects' => $fullRedirectReport,
        ];
    }

    /**
     * Called when the crawl has ended.
     *
     * Intentionally does nothing: the collected rows stay available through
     * the public $results property.
     */
    public function finishedCrawling()
    {
    }
}

33
src/Crawler.php Normal file
View File

@ -0,0 +1,33 @@
<?php
namespace JHodges\Sitemap;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\RequestOptions;
use GuzzleHttp\Psr7\Uri;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler as SpatieCrawler;
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\CrawlInternalUrls;
/**
 * Thin wrapper around the Spatie crawler that crawls a site's internal URLs
 * and returns the rows gathered by CrawlObserver.
 */
class Crawler
{
    /**
     * Crawl the internal URLs reachable from $url and return the report rows.
     *
     * Redirects are followed with Guzzle's redirect tracking enabled so the
     * observer can see each hop of a redirect chain.
     *
     * @param string $url Base URL; it is used both as the crawl start point
     *                    and as the argument of the CrawlInternalUrls profile.
     *
     * @return array The observer's $results rows.
     */
    public function crawl($url)
    {
        $observer = new CrawlObserver();

        SpatieCrawler::create([
            RequestOptions::ALLOW_REDIRECTS => [
                'track_redirects' => true,
            ],
            // Guzzle waits indefinitely unless told otherwise; bound both
            // phases so a single unresponsive host cannot stall the crawl.
            RequestOptions::CONNECT_TIMEOUT => 10,
            RequestOptions::TIMEOUT => 10,
        ])
            ->setCrawlObserver($observer)
            ->setCrawlProfile(new CrawlInternalUrls($url))
            ->startCrawling($url);

        return $observer->results;
    }
}

21
tests/CrawlerTest.php Normal file
View File

@ -0,0 +1,21 @@
<?php
use \PHPUnit\Framework\TestCase;
use JHodges\Sitemap\Crawler;
/**
 * Integration test for the Crawler class.
 *
 * testCrawl() crawls a real http:// URL, so it presumably needs network
 * access to yield any rows.
 */
class CrawlerTest extends TestCase
{
    /**
     * Crawl the site and check the basic shape of the returned report.
     *
     * @return array The report rows, handed on to dependent tests.
     */
    public function testCrawl()
    {
        $crawler = new Crawler();
        $sitemap = $crawler->crawl('http://jhodges.co.uk');

        // Dump the rows for manual inspection.
        print_r($sitemap);

        $this->assertIsArray($sitemap);
        foreach ($sitemap as $row) {
            // 'code' and 'type' are the keys shared by rows for successful
            // and for failed requests.
            $this->assertArrayHasKey('code', $row);
            $this->assertArrayHasKey('type', $row);
        }

        return $sitemap;
    }

    /**
     * Placeholder for broken-link checks on the crawled report.
     *
     * @depends testCrawl
     *
     * @param array $sitemap Report rows returned by testCrawl().
     */
    public function testBrokenLinks($sitemap)
    {
        // An empty test body is reported as risky yet still counts as a
        // pass; flag it explicitly until real assertions are written.
        $this->markTestIncomplete('Broken link assertions have not been written yet.');
    }
}