add link crawler
This commit is contained in:
parent
abb4354362
commit
7d3fb69463
@ -9,7 +9,8 @@
|
||||
],
|
||||
"require": {
|
||||
"phpunit/phpunit-selenium": "^4.1",
|
||||
"facebook/webdriver": "^1.7"
|
||||
"php-webdriver/webdriver": "^1.7",
|
||||
"spatie/crawler": "^4.6"
|
||||
},
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
|
101
src/CrawlObserver.php
Normal file
101
src/CrawlObserver.php
Normal file
@ -0,0 +1,101 @@
|
||||
<?php
|
||||
namespace JHodges\PHPUnitBase;
|
||||
|
||||
use GuzzleHttp\Exception\RequestException;
|
||||
use Psr\Http\Message\ResponseInterface;
|
||||
use Psr\Http\Message\UriInterface;
|
||||
|
||||
use Spatie\Crawler\CrawlObserver as BaseCrawlObserver;
|
||||
|
||||
/**
 * Crawl observer that collects one result row per URL reported by the crawler.
 *
 * Both successful and failed crawls are appended to {@see CrawlObserver::$results}
 * so a caller can inspect them after the crawl has finished.
 */
class CrawlObserver extends BaseCrawlObserver
{
    /**
     * Collected crawl results, in the order the crawler reported them.
     *
     * Each entry is an array with the keys:
     *  - 'link'      string     the crawled URL
     *  - 'code'      int|string HTTP status code, or '???' when no response was received
     *  - 'type'      ?string    first Content-Type header value (null if the header is
     *                           absent, '' when no response was received)
     *  - 'parent'    string     URL the link was found on ('' when unknown)
     *  - 'redirects' array      [] or [uriHistory, statusHistory]
     *
     * @var array
     */
    public $results = [];

    /**
     * Called when the crawler will crawl the url. Intentionally does nothing.
     *
     * @param \Psr\Http\Message\UriInterface $url
     */
    public function willCrawl(UriInterface $url)
    {
    }

    /**
     * Called when the crawler has crawled the given url successfully.
     *
     * @param \Psr\Http\Message\UriInterface      $url
     * @param \Psr\Http\Message\ResponseInterface $response
     * @param \Psr\Http\Message\UriInterface|null $foundOnUrl
     */
    public function crawled(
        UriInterface $url,
        ResponseInterface $response,
        ?UriInterface $foundOnUrl = null
    ) {
        $this->recordResult($url, $response, $foundOnUrl);
    }

    /**
     * Called when the crawler had a problem crawling the given url.
     *
     * The exception may or may not carry a response; when it does not, the
     * result is recorded with the placeholder code '???' and an empty type.
     *
     * @param \Psr\Http\Message\UriInterface         $url
     * @param \GuzzleHttp\Exception\RequestException $requestException
     * @param \Psr\Http\Message\UriInterface|null    $foundOnUrl
     */
    public function crawlFailed(
        UriInterface $url,
        RequestException $requestException,
        ?UriInterface $foundOnUrl = null
    ) {
        $this->recordResult($url, $requestException->getResponse(), $foundOnUrl);
    }

    /**
     * Called when the crawl has ended. Intentionally does nothing; callers
     * read $results directly.
     */
    public function finishedCrawling()
    {
    }

    /**
     * Appends one row to $results for the given URL.
     *
     * @param \Psr\Http\Message\UriInterface           $url
     * @param \Psr\Http\Message\ResponseInterface|null $response   null when the request produced no response
     * @param \Psr\Http\Message\UriInterface|null      $foundOnUrl
     */
    private function recordResult(
        UriInterface $url,
        ?ResponseInterface $response,
        ?UriInterface $foundOnUrl
    ): void {
        if ($response === null) {
            // No response at all (e.g. the request never completed): keep the
            // placeholder values existing consumers of $results already see.
            $code = '???';
            $type = '';
        } else {
            $code = $response->getStatusCode();
            $type = $response->getHeader('Content-Type')[0] ?? null;
        }

        $this->results[] = [
            'link' => (string) $url,
            'code' => $code,
            'type' => $type,
            'parent' => (string) $foundOnUrl, // null casts to ''
            'redirects' => $this->redirectReport($response),
        ];
    }

    /**
     * Builds the redirect report from the redirect-history response headers.
     *
     * @param \Psr\Http\Message\ResponseInterface|null $response
     *
     * @return array [] when there is no response or no redirect history,
     *               otherwise [redirect URI history, redirect HTTP status history]
     */
    private function redirectReport(?ResponseInterface $response): array
    {
        if ($response === null) {
            return [];
        }

        $uriHistory = $response->getHeader('X-Guzzle-Redirect-History');
        if ($uriHistory === []) {
            return [];
        }

        return [
            $uriHistory,
            $response->getHeader('X-Guzzle-Redirect-Status-History'),
        ];
    }
}
|
55
src/CrawlTest.php
Normal file
55
src/CrawlTest.php
Normal file
@ -0,0 +1,55 @@
|
||||
<?php
|
||||
namespace JHodges\PHPUnitBase;
|
||||
|
||||
use \PHPUnit\Framework\TestCase;
|
||||
|
||||
use GuzzleHttp\RequestOptions;
|
||||
use GuzzleHttp\Psr7\Uri;
|
||||
use Psr\Http\Message\ResponseInterface;
|
||||
use Psr\Http\Message\UriInterface;
|
||||
|
||||
use Spatie\Crawler\Crawler;
|
||||
use Spatie\Crawler\CrawlUrl;
|
||||
use Spatie\Crawler\CrawlInternalUrls;
|
||||
|
||||
/**
 * Base test case that crawls a site and reports every link that did not
 * answer with HTTP 200.
 *
 * Concrete test classes extend this class and implement getUrl().
 */
abstract class CrawlTest extends TestCase
{
    /**
     * @return string the source domain (passed both to the crawl profile and
     *                as the crawl start URL)
     **/
    abstract protected function getUrl();

    /**
     * Crawls the site returned by getUrl() and hands the collected results to
     * the dependent tests.
     *
     * Redirect tracking is enabled so the observer can record the redirect
     * history of each response.
     *
     * @return array the rows collected by CrawlObserver::$results
     */
    public function testCrawl()
    {
        $observer = new CrawlObserver();
        $startUrl = $this->getUrl();

        // Further entry points could be queued before startCrawling(), e.g.
        // ->addToCrawlQueue(CrawlUrl::create(new Uri('https://another_entry_point??')))
        Crawler::create([
            RequestOptions::ALLOW_REDIRECTS => [
                'track_redirects' => true,
            ],
        ])
            ->setCrawlObserver($observer)
            ->setCrawlProfile(new CrawlInternalUrls($startUrl))
            ->startCrawling($startUrl);

        // The crawl itself has nothing to assert; this keeps PHPUnit from
        // flagging the test as risky so dependent tests still run.
        $this->assertTrue(true);

        return $observer->results;
    }

    /**
     * Fails with one line per crawled link whose status code is not 200.
     *
     * @depends testCrawl
     *
     * @param array $results rows as returned by testCrawl(); each row is read
     *                       for its 'code', 'link' and 'parent' keys
     */
    public function testBrokenLinks($results)
    {
        $errors = '';
        foreach ($results as $result) {
            // 'code' is an int status or the '???' placeholder recorded when
            // no response was received; anything but int 200 is reported.
            if ($result['code'] !== 200) {
                $errors .= "{$result['code']} {$result['link']} (found on {$result['parent']})\n";
            }
        }

        if ($errors !== '') {
            // Report as a test failure rather than throwing a generic
            // exception, which PHPUnit would count as an error.
            $this->fail("\n" . $errors);
        }

        $this->assertTrue(true);
    }
}
|
Loading…
Reference in New Issue
Block a user