add link crawler

This commit is contained in:
James 2020-02-16 09:04:19 +00:00
parent abb4354362
commit 7d3fb69463
3 changed files with 158 additions and 1 deletions

View File

@ -9,7 +9,8 @@
], ],
"require": { "require": {
"phpunit/phpunit-selenium": "^4.1", "phpunit/phpunit-selenium": "^4.1",
"facebook/webdriver": "^1.7" "php-webdriver/webdriver": "^1.7",
"spatie/crawler": "^4.6"
}, },
"autoload": { "autoload": {
"psr-4": { "psr-4": {

101
src/CrawlObserver.php Normal file
View File

@ -0,0 +1,101 @@
<?php
namespace JHodges\PHPUnitBase;
use GuzzleHttp\Exception\RequestException;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\CrawlObserver as BaseCrawlObserver;
/**
 * Crawl observer that collects one result row per URL reported by the crawler.
 *
 * Both successful and failed requests are appended to {@see CrawlObserver::$results}
 * so that a caller can inspect the whole crawl after it has finished.
 */
class CrawlObserver extends BaseCrawlObserver
{
    /**
     * Rows recorded so far, in the order the crawler reported them.
     *
     * Each row is an array with the keys:
     *  - 'link'      string            the crawled URL
     *  - 'code'      int|string        HTTP status code, or '???' when a failed request had no response
     *  - 'type'      string|null       first Content-Type header value (null/'' when unavailable)
     *  - 'parent'    string            URL the link was found on ('' when none was given)
     *  - 'redirects' array             [] or [uriHistory, statusHistory] from Guzzle's redirect tracking headers
     *
     * @var array
     */
    public $results = [];

    /**
     * Called when the crawler will crawl the url.
     *
     * @param \Psr\Http\Message\UriInterface $url
     */
    public function willCrawl(UriInterface $url)
    {
        // Nothing to record before the request is made.
    }

    /**
     * Called when the crawler has crawled the given url successfully.
     *
     * @param \Psr\Http\Message\UriInterface $url
     * @param \Psr\Http\Message\ResponseInterface $response
     * @param \Psr\Http\Message\UriInterface|null $foundOnUrl
     */
    public function crawled(
        UriInterface $url,
        ResponseInterface $response,
        ?UriInterface $foundOnUrl = null
    ) {
        $this->record(
            $url,
            $response->getStatusCode(),
            $response->getHeader('Content-Type')[0] ?? null,
            $response,
            $foundOnUrl
        );
    }

    /**
     * Called when the crawler had a problem crawling the given url.
     *
     * When the exception carries no response, the row is recorded with the
     * placeholder code '???' and an empty content type.
     *
     * @param \Psr\Http\Message\UriInterface $url
     * @param \GuzzleHttp\Exception\RequestException $requestException
     * @param \Psr\Http\Message\UriInterface|null $foundOnUrl
     */
    public function crawlFailed(
        UriInterface $url,
        RequestException $requestException,
        ?UriInterface $foundOnUrl = null
    ) {
        $response = $requestException->getResponse();

        if ($response !== null) {
            $code = $response->getStatusCode();
            $type = $response->getHeader('Content-Type')[0] ?? null;
        } else {
            $code = '???';
            $type = '';
        }

        $this->record($url, $code, $type, $response, $foundOnUrl);
    }

    /**
     * Called when the crawl has ended.
     */
    public function finishedCrawling()
    {
        // Results are left in $this->results for the caller to read.
    }

    /**
     * Append one result row to $this->results.
     *
     * @param \Psr\Http\Message\UriInterface $url
     * @param int|string $code
     * @param string|null $type
     * @param \Psr\Http\Message\ResponseInterface|null $response used only to read the redirect history
     * @param \Psr\Http\Message\UriInterface|null $foundOnUrl
     */
    private function record(
        UriInterface $url,
        $code,
        $type,
        ?ResponseInterface $response,
        ?UriInterface $foundOnUrl
    ) {
        $this->results[] = [
            'link' => (string) $url,
            'code' => $code,
            'type' => $type,
            // Casting null yields '', so rows without a referrer still have a string here.
            'parent' => (string) $foundOnUrl,
            'redirects' => $this->redirectReport($response),
        ];
    }

    /**
     * Read both redirect history headers from a response.
     *
     * @param \Psr\Http\Message\ResponseInterface|null $response
     * @return array [] when there is no response or no URI history header,
     *               otherwise [uriHistory, statusHistory]
     */
    private function redirectReport(?ResponseInterface $response): array
    {
        if ($response === null) {
            return [];
        }

        $uriHistory = $response->getHeader('X-Guzzle-Redirect-History');
        if (!$uriHistory) {
            return [];
        }

        return [
            $uriHistory,
            $response->getHeader('X-Guzzle-Redirect-Status-History'),
        ];
    }
}

55
src/CrawlTest.php Normal file
View File

@ -0,0 +1,55 @@
<?php
namespace JHodges\PHPUnitBase;
use \PHPUnit\Framework\TestCase;
use GuzzleHttp\RequestOptions;
use GuzzleHttp\Psr7\Uri;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\CrawlInternalUrls;
/**
 * Base test case that crawls a site and reports every link that did not
 * answer with HTTP 200.
 *
 * Concrete test classes supply the start URL through getUrl().
 */
abstract class CrawlTest extends TestCase
{
    /**
     * @return string the source domain
     **/
    abstract protected function getUrl();

    /**
     * Crawl the site starting at getUrl(), restricted to internal URLs, and
     * hand the collected rows to dependent tests.
     *
     * Redirects are followed with 'track_redirects' enabled so the observer
     * can read the redirect history headers.
     *
     * @return array the rows collected by CrawlObserver::$results
     */
    public function testCrawl()
    {
        $observer = new CrawlObserver();

        // Further entry points could be queued before startCrawling() with
        // ->addToCrawlQueue(CrawlUrl::create(new Uri('https://...'))).
        Crawler::create([
            RequestOptions::ALLOW_REDIRECTS => [
                'track_redirects' => true,
            ],
        ])
            ->setCrawlObserver($observer)
            ->setCrawlProfile(new CrawlInternalUrls($this->getUrl()))
            ->startCrawling($this->getUrl());

        // NOTE(review): presumably present so this producer test counts an
        // assertion; the real checks happen in the dependent tests.
        $this->assertTrue(true);

        return $observer->results;
    }

    /**
     * Fail when any crawled link returned something other than HTTP 200.
     *
     * The failure message lists one line per offending link:
     * "<code> <link> (found on <parent>)".
     *
     * @depends testCrawl
     *
     * @param array $results rows returned by testCrawl()
     */
    public function testBrokenLinks($results)
    {
        $brokenLinks = [];
        foreach ($results as $result) {
            // 'code' is the int status code, or the string '???' when no response was received.
            if ($result['code'] !== 200) {
                $brokenLinks[] = "{$result['code']} {$result['link']} (found on {$result['parent']})\n";
            }
        }

        // Report through an assertion so PHPUnit counts broken links as a
        // test failure instead of an unexpected-exception error.
        $this->assertEmpty($brokenLinks, "\n" . implode('', $brokenLinks));
    }
}