better tests for interlinked and redirect tracking
This commit is contained in:
parent
5fc2b1fb83
commit
3ab51ccff9
@ -37,7 +37,7 @@ class CrawlCommand extends Command
|
|||||||
protected function execute(InputInterface $input, OutputInterface $output)
|
protected function execute(InputInterface $input, OutputInterface $output)
|
||||||
{
|
{
|
||||||
$baseUrl = $input->getArgument('url');
|
$baseUrl = $input->getArgument('url');
|
||||||
$crawler=new Crawler();
|
$crawler=new Crawler([RequestOptions::CONNECT_TIMEOUT => 60, RequestOptions::TIMEOUT => 60]);
|
||||||
$crawler->crawl($baseUrl);
|
$crawler->crawl($baseUrl);
|
||||||
|
|
||||||
foreach($crawler->getResults() as $url=>$result){
|
foreach($crawler->getResults() as $url=>$result){
|
||||||
|
@ -51,7 +51,7 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
|
|||||||
$fullRedirectReport[$key] = ['location' => $value, 'code' => $redirectCodeHistory[$key]];
|
$fullRedirectReport[$key] = ['location' => $value, 'code' => $redirectCodeHistory[$key]];
|
||||||
}
|
}
|
||||||
|
|
||||||
// PART 2a/2: has this url been found already
|
// PART 1a/2: has this url been found already
|
||||||
$fos=$this->results[(string)$url]['foundOn']??[];
|
$fos=$this->results[(string)$url]['foundOn']??[];
|
||||||
|
|
||||||
foreach($fullRedirectReport as $k=>$redirect){
|
foreach($fullRedirectReport as $k=>$redirect){
|
||||||
@ -62,7 +62,8 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
|
|||||||
$response->getHeader('Content-Type')[0]??null,
|
$response->getHeader('Content-Type')[0]??null,
|
||||||
$k == 0 ? $fullRedirectReport : []
|
$k == 0 ? $fullRedirectReport : []
|
||||||
);
|
);
|
||||||
// PART 2b/2: this redirecting url has only just been crawled.
|
// PART 1b/2: this redirecting url has only just been crawled.
|
||||||
|
// so we've only just obtained its redirect history
|
||||||
// but if it has already been found on other pages
|
// but if it has already been found on other pages
|
||||||
// this new redirect history needs to have those foundOnUrls added too
|
// this new redirect history needs to have those foundOnUrls added too
|
||||||
if($k>0){
|
if($k>0){
|
||||||
@ -98,9 +99,13 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
|
|||||||
UriInterface $url,
|
UriInterface $url,
|
||||||
?UriInterface $foundOnUrl = null
|
?UriInterface $foundOnUrl = null
|
||||||
){
|
){
|
||||||
// PART 1/2: if there is an existing result with known redirects
|
|
||||||
// then its redirects must apply to this page
|
|
||||||
if(count($this->results[(string)$url]['redirects']??[])>0){
|
if(count($this->results[(string)$url]['redirects']??[])>0){
|
||||||
|
// PART 2/2: if there is an existing result with known redirects
|
||||||
|
// then its redirects must apply to this page
|
||||||
|
// this is a rare case that the URL comes in here (depends on order of crawl)
|
||||||
|
// rather than at PART 1/2.
|
||||||
|
// remove this and the `CrawlerTest::testInterlinked` test will fail
|
||||||
|
// specifically with interlinked4 not propagating down to foundOn /found
|
||||||
foreach($this->results[(string)$url]['redirects'] as $redirect){
|
foreach($this->results[(string)$url]['redirects'] as $redirect){
|
||||||
$this->addResult($redirect['location'],(string)$foundOnUrl);
|
$this->addResult($redirect['location'],(string)$foundOnUrl);
|
||||||
}
|
}
|
||||||
|
@ -76,23 +76,38 @@ class CrawlerTest extends TestCase{
|
|||||||
public function testInterlinked(){
|
public function testInterlinked(){
|
||||||
$crawler=new Crawler();
|
$crawler=new Crawler();
|
||||||
$crawler->crawl('http://localhost:8080/interlinked1');
|
$crawler->crawl('http://localhost:8080/interlinked1');
|
||||||
//$crawler->crawl('http://localhost:8080/page4'); // TODO!!! this ensures the order or results for the URL tracking test 3PARTS.
|
$crawler->crawl('http://localhost:8080/interlinked4'); //this ensures the order or results for the URL tracking test PART2
|
||||||
$sitemap=$crawler->getResults();
|
$sitemap=$crawler->getResults();
|
||||||
$this->assertTreeContains($sitemap,[
|
$this->assertTreeContains($sitemap,[
|
||||||
'http://localhost:8080/interlinked1' => ['code' => 200 , 'foundOn' => [
|
'http://localhost:8080/interlinked1' => ['code' => 200 , 'foundOn' => [
|
||||||
'http://localhost:8080/interlinked1' => 1,
|
'http://localhost:8080/interlinked1' => 1,
|
||||||
'http://localhost:8080/interlinked2' => 1,
|
'http://localhost:8080/interlinked2' => 1,
|
||||||
'http://localhost:8080/interlinked3' => 1,
|
'http://localhost:8080/interlinked3' => 1,
|
||||||
|
'http://localhost:8080/interlinked4' => 1,
|
||||||
]],
|
]],
|
||||||
'http://localhost:8080/interlinked2' => ['code' => 200 , 'foundOn' => [
|
'http://localhost:8080/interlinked2' => ['code' => 200 , 'foundOn' => [
|
||||||
'http://localhost:8080/interlinked1' => 1,
|
'http://localhost:8080/interlinked1' => 1,
|
||||||
'http://localhost:8080/interlinked2' => 1,
|
'http://localhost:8080/interlinked2' => 1,
|
||||||
'http://localhost:8080/interlinked3' => 1,
|
'http://localhost:8080/interlinked3' => 1,
|
||||||
|
'http://localhost:8080/interlinked4' => 1,
|
||||||
]],
|
]],
|
||||||
'http://localhost:8080/interlinked3' => ['code' => 200 , 'foundOn' => [
|
'http://localhost:8080/interlinked3' => ['code' => 200 , 'foundOn' => [
|
||||||
'http://localhost:8080/interlinked1' => 1,
|
'http://localhost:8080/interlinked1' => 1,
|
||||||
'http://localhost:8080/interlinked2' => 1,
|
'http://localhost:8080/interlinked2' => 1,
|
||||||
'http://localhost:8080/interlinked3' => 1,
|
'http://localhost:8080/interlinked3' => 1,
|
||||||
|
'http://localhost:8080/interlinked4' => 1,
|
||||||
|
]],
|
||||||
|
'http://localhost:8080/found' => ['code' => 200 , 'foundOn' => [
|
||||||
|
'http://localhost:8080/interlinked1' => 1,
|
||||||
|
'http://localhost:8080/interlinked2' => 1,
|
||||||
|
'http://localhost:8080/interlinked3' => 1,
|
||||||
|
'http://localhost:8080/interlinked4' => 1,
|
||||||
|
]],
|
||||||
|
'http://localhost:8080/redirectToFound' => ['code' => 302 , 'foundOn' => [
|
||||||
|
'http://localhost:8080/interlinked1' => 1,
|
||||||
|
'http://localhost:8080/interlinked2' => 1,
|
||||||
|
'http://localhost:8080/interlinked3' => 1,
|
||||||
|
'http://localhost:8080/interlinked4' => 1,
|
||||||
]],
|
]],
|
||||||
], print_r($sitemap,true));
|
], print_r($sitemap,true));
|
||||||
}
|
}
|
||||||
|
@ -84,13 +84,16 @@ app.get('/invalidStatusCode', function (request, response) {
|
|||||||
});
|
});
|
||||||
|
|
||||||
app.get('/interlinked1', function (request, response) {
|
app.get('/interlinked1', function (request, response) {
|
||||||
response.end('<a href="/interlinked1">1</a><a href="/interlinked2">2</a><a href="/interlinked3">3</a>');
|
response.end('<a href="/interlinked1">1</a><a href="/interlinked2">2</a><a href="/interlinked3">3</a><a href="/redirectToFound">r</a>');
|
||||||
});
|
});
|
||||||
app.get('/interlinked2', function (request, response) {
|
app.get('/interlinked2', function (request, response) {
|
||||||
response.end('<a href="/interlinked1">1</a><a href="/interlinked2">2</a><a href="/interlinked3">3</a>');
|
response.end('<a href="/interlinked1">1</a><a href="/interlinked2">2</a><a href="/interlinked3">3</a><a href="/redirectToFound">r</a>');
|
||||||
});
|
});
|
||||||
app.get('/interlinked3', function (request, response) {
|
app.get('/interlinked3', function (request, response) {
|
||||||
response.end('<a href="/interlinked1">1</a><a href="/interlinked2">2</a><a href="/interlinked3">3</a>');
|
response.end('<a href="/interlinked1">1</a><a href="/interlinked2">2</a><a href="/interlinked3">3</a><a href="/redirectToFound">r</a>');
|
||||||
|
});
|
||||||
|
app.get('/interlinked4', function (request, response) {
|
||||||
|
response.end('<a href="/interlinked1">1</a><a href="/interlinked2">2</a><a href="/interlinked3">3</a><a href="/redirectToFound">r</a>');
|
||||||
});
|
});
|
||||||
|
|
||||||
let server = app.listen(8080, function () {
|
let server = app.listen(8080, function () {
|
||||||
|
Loading…
Reference in New Issue
Block a user