initial
This commit is contained in:
commit
6a986a62c0
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
/vendor/
|
21
composer.json
Normal file
21
composer.json
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
{
|
||||||
|
"name": "jhodges/sitemap",
|
||||||
|
"description": "generate full sitemap report",
|
||||||
|
"type": "project",
|
||||||
|
"require": {
|
||||||
|
"spatie/crawler": "^4.6",
|
||||||
|
"phpunit/phpunit": "^8.5"
|
||||||
|
},
|
||||||
|
"authors": [
|
||||||
|
{
|
||||||
|
"name": "James",
|
||||||
|
"email": "inbox.dev@jhodges.co.uk"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"autoload": {
|
||||||
|
"psr-4": {
|
||||||
|
"JHodges\\Sitemap\\": "src/"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
2718
composer.lock
generated
Normal file
2718
composer.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
111
src/CrawlObserver.php
Normal file
111
src/CrawlObserver.php
Normal file
@ -0,0 +1,111 @@
|
|||||||
|
<?php
|
||||||
|
namespace JHodges\Sitemap;
|
||||||
|
|
||||||
|
use GuzzleHttp\Exception\RequestException;
|
||||||
|
use GuzzleHttp\RequestOptions;
|
||||||
|
use Psr\Http\Message\ResponseInterface;
|
||||||
|
use Psr\Http\Message\UriInterface;
|
||||||
|
|
||||||
|
use Spatie\Crawler\Crawler;
|
||||||
|
use Spatie\Crawler\CrawlInternalUrls;
|
||||||
|
|
||||||
|
/**
 * Crawl observer that collects one result row per visited location.
 *
 * Rows are appended to {@see CrawlObserver::$results}. Every row carries the
 * keys `location`, `code`, `type` and `foundOn`; rows for failed requests
 * additionally keep the keys `link`, `parent` and `redirects`.
 */
class CrawlObserver extends \Spatie\Crawler\CrawlObserver
{
    /**
     * Result rows gathered by crawled() and crawlFailed(), in callback order.
     *
     * @var array
     */
    public $results = [];

    /**
     * Called when the crawler will crawl the url.
     *
     * Prints a progress line to standard output.
     *
     * @param \Psr\Http\Message\UriInterface $url
     */
    public function willCrawl(UriInterface $url)
    {
        echo "Will:$url\n";
    }

    /**
     * Called when the crawler has crawled the given url successfully.
     *
     * Appends one row for the requested URL plus one row per redirect hop
     * reported by Guzzle. A URL that was fetched without any redirect yields
     * exactly one row (previously such URLs were silently dropped because the
     * report was only built when a redirect history header was present).
     *
     * @param \Psr\Http\Message\UriInterface $url
     * @param \Psr\Http\Message\ResponseInterface $response
     * @param \Psr\Http\Message\UriInterface|null $foundOnUrl
     */
    public function crawled(
        UriInterface $url,
        ResponseInterface $response,
        ?UriInterface $foundOnUrl = null
    ) {
        // https://github.com/guzzle/guzzle/blob/master/docs/faq.rst#how-can-i-track-redirected-requests
        // Both headers are empty arrays when no redirect happened.
        $redirectUriHistory = $response->getHeader('X-Guzzle-Redirect-History');
        $redirectCodeHistory = $response->getHeader('X-Guzzle-Redirect-Status-History');

        // Add the initial URI requested to the beginning of the URI history.
        array_unshift($redirectUriHistory, (string) $url);
        // Add the final HTTP status code to the end of the status history.
        array_push($redirectCodeHistory, $response->getStatusCode());

        // The Content-Type of the final response is recorded on every row.
        $type = $response->getHeader('Content-Type')[0] ?? null;
        $foundOn = (string) $foundOnUrl;

        foreach ($redirectUriHistory as $key => $location) {
            $this->results[] = [
                'location' => (string) $location,
                // Guard against the two history headers differing in length.
                'code' => $redirectCodeHistory[$key] ?? null,
                'type' => $type,
                'foundOn' => $foundOn,
            ];
        }
    }

    /**
     * Called when the crawler had a problem crawling the given url.
     *
     * Appends a single row. When the exception carries no response the code
     * is recorded as '???' and the type as an empty string.
     *
     * @param \Psr\Http\Message\UriInterface $url
     * @param \GuzzleHttp\Exception\RequestException $requestException
     * @param \Psr\Http\Message\UriInterface|null $foundOnUrl
     */
    public function crawlFailed(
        UriInterface $url,
        RequestException $requestException,
        ?UriInterface $foundOnUrl = null
    ) {
        $response = $requestException->getResponse();

        if ($response) {
            $code = $response->getStatusCode();
            $type = $response->getHeader('Content-Type')[0] ?? null;
        } else {
            $code = '???';
            $type = '';
        }

        // Raw redirect history as [uriHistory, statusHistory]; empty when the
        // failed response carries no redirect history header.
        $fullRedirectReport = [];
        if ($response && $response->getHeader('X-Guzzle-Redirect-History')) {
            $fullRedirectReport = [
                $response->getHeader('X-Guzzle-Redirect-History'),
                $response->getHeader('X-Guzzle-Redirect-Status-History'),
            ];
        }

        $this->results[] = [
            'link' => (string) $url,
            'code' => $code,
            'type' => $type,
            'parent' => (string) $foundOnUrl,
            'redirects' => $fullRedirectReport,
            // Same keys crawled() uses, so consumers can read every row uniformly.
            'location' => (string) $url,
            'foundOn' => (string) $foundOnUrl,
        ];
    }

    /**
     * Called when the crawl has ended.
     *
     * Intentionally does nothing; callers read $results directly.
     */
    public function finishedCrawling()
    {
    }
}
|
33
src/Crawler.php
Normal file
33
src/Crawler.php
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
<?php
|
||||||
|
namespace JHodges\Sitemap;
|
||||||
|
|
||||||
|
use GuzzleHttp\Exception\RequestException;
|
||||||
|
use GuzzleHttp\RequestOptions;
|
||||||
|
use GuzzleHttp\Psr7\Uri;
|
||||||
|
use Psr\Http\Message\ResponseInterface;
|
||||||
|
use Psr\Http\Message\UriInterface;
|
||||||
|
|
||||||
|
use Spatie\Crawler\Crawler as SpatieCrawler;
|
||||||
|
use Spatie\Crawler\CrawlUrl;
|
||||||
|
use Spatie\Crawler\CrawlInternalUrls;
|
||||||
|
|
||||||
|
/**
 * Entry point that runs a spatie/crawler crawl and returns the rows
 * collected by this package's CrawlObserver.
 */
class Crawler
{
    /**
     * Crawl starting at the given URL and return the collected result rows.
     *
     * @param mixed $url Start URL; it is passed both to the CrawlInternalUrls
     *                   profile and to startCrawling().
     *
     * @return array The observer's $results array after startCrawling() returns.
     */
    public function Crawl($url)
    {
        $collector = new CrawlObserver();

        // Guzzle client options: keep following redirects and have them tracked.
        $clientOptions = [
            RequestOptions::ALLOW_REDIRECTS => [
                'track_redirects' => true,
            ],
        ];

        $spatie = SpatieCrawler::create($clientOptions)
            ->setCrawlObserver($collector)
            ->setCrawlProfile(new CrawlInternalUrls($url));

        $spatie->startCrawling($url);

        return $collector->results;
    }
}
|
21
tests/CrawlerTest.php
Normal file
21
tests/CrawlerTest.php
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
<?php
|
||||||
|
use \PHPUnit\Framework\TestCase;
|
||||||
|
|
||||||
|
use JHodges\Sitemap\Crawler;
|
||||||
|
|
||||||
|
/**
 * Integration tests for JHodges\Sitemap\Crawler.
 *
 * NOTE: testCrawl() crawls the URL http://jhodges.co.uk through the real
 * Crawler, so it presumably needs network access to run.
 */
class CrawlerTest extends TestCase
{
    /**
     * Crawls the site and checks the basic shape of the returned report.
     *
     * @return array The crawl report, handed on to dependent tests.
     */
    public function testCrawl()
    {
        $crawler = new Crawler();
        $sitemap = $crawler->crawl('http://jhodges.co.uk');

        // Dump the report for manual inspection.
        print_r($sitemap);

        // Without at least one assertion PHPUnit flags the test as risky.
        $this->assertIsArray($sitemap);
        foreach ($sitemap as $row) {
            // Both the success and the failure callbacks record a status code.
            $this->assertArrayHasKey('code', $row);
        }

        return $sitemap;
    }

    /**
     * Placeholder for broken-link checks on the crawl report.
     *
     * @depends testCrawl
     */
    public function testBrokenLinks($sitemap)
    {
        // Report the gap explicitly instead of passing silently with no assertions.
        $this->markTestIncomplete('Broken-link assertions have not been written yet.');
    }
}
|
Loading…
Reference in New Issue
Block a user