diff --git a/src/CrawlCommand.php b/src/CrawlCommand.php
index b7879d8..0b87c2d 100644
--- a/src/CrawlCommand.php
+++ b/src/CrawlCommand.php
@@ -37,7 +37,7 @@ class CrawlCommand extends Command
protected function execute(InputInterface $input, OutputInterface $output)
{
$baseUrl = $input->getArgument('url');
- $crawler=new Crawler();
+ $crawler=new Crawler([RequestOptions::CONNECT_TIMEOUT => 60, RequestOptions::TIMEOUT => 60]);
$crawler->crawl($baseUrl);
foreach($crawler->getResults() as $url=>$result){
diff --git a/src/CrawlObserver.php b/src/CrawlObserver.php
index 75f0c55..4eea726 100644
--- a/src/CrawlObserver.php
+++ b/src/CrawlObserver.php
@@ -51,7 +51,7 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
$fullRedirectReport[$key] = ['location' => $value, 'code' => $redirectCodeHistory[$key]];
}
- // PART 2a/2: has this url been found already
+ // PART 1a/2: has this url been found already
$fos=$this->results[(string)$url]['foundOn']??[];
foreach($fullRedirectReport as $k=>$redirect){
@@ -62,7 +62,8 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
$response->getHeader('Content-Type')[0]??null,
$k == 0 ? $fullRedirectReport : []
);
- // PART 2b/2: this redirecting url has only just been crawled.
+ // PART 1b/2: this redirecting url has only just been crawled.
+ // so we've only just obtained its redirect history
// but if it has already been found on other pages
// this new redirect history needs to have those foundOnUrls added too
if($k>0){
@@ -98,9 +99,13 @@ class CrawlObserver extends \Spatie\Crawler\CrawlObserver
UriInterface $url,
?UriInterface $foundOnUrl = null
){
- // PART 1/2: if there is an existing result with known redirects
- // then its redirects must apply to this page
if(count($this->results[(string)$url]['redirects']??[])>0){
+ // PART 2/2: if there is an existing result with known redirects
+ // then its redirects must apply to this page
+ // this is a rare case that the URL comes in here (depends on order of crawl)
+ // rather than at PART 1/2.
+ // remove this and the `CrawlerTest::testInterlinked` test will fail
+ // specifically with interlinked4 not propagating down to foundOn /found
foreach($this->results[(string)$url]['redirects'] as $redirect){
$this->addResult($redirect['location'],(string)$foundOnUrl);
}
diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php
index acf9260..b31b37d 100644
--- a/tests/CrawlerTest.php
+++ b/tests/CrawlerTest.php
@@ -76,23 +76,38 @@ class CrawlerTest extends TestCase{
public function testInterlinked(){
$crawler=new Crawler();
$crawler->crawl('http://localhost:8080/interlinked1');
- //$crawler->crawl('http://localhost:8080/page4'); // TODO!!! this ensures the order or results for the URL tracking test 3PARTS.
+ $crawler->crawl('http://localhost:8080/interlinked4'); //this ensures the order of results for the URL tracking test PART 2/2
$sitemap=$crawler->getResults();
$this->assertTreeContains($sitemap,[
'http://localhost:8080/interlinked1' => ['code' => 200 , 'foundOn' => [
'http://localhost:8080/interlinked1' => 1,
'http://localhost:8080/interlinked2' => 1,
'http://localhost:8080/interlinked3' => 1,
+ 'http://localhost:8080/interlinked4' => 1,
]],
'http://localhost:8080/interlinked2' => ['code' => 200 , 'foundOn' => [
'http://localhost:8080/interlinked1' => 1,
'http://localhost:8080/interlinked2' => 1,
'http://localhost:8080/interlinked3' => 1,
+ 'http://localhost:8080/interlinked4' => 1,
]],
'http://localhost:8080/interlinked3' => ['code' => 200 , 'foundOn' => [
'http://localhost:8080/interlinked1' => 1,
'http://localhost:8080/interlinked2' => 1,
'http://localhost:8080/interlinked3' => 1,
+ 'http://localhost:8080/interlinked4' => 1,
+ ]],
+ 'http://localhost:8080/found' => ['code' => 200 , 'foundOn' => [
+ 'http://localhost:8080/interlinked1' => 1,
+ 'http://localhost:8080/interlinked2' => 1,
+ 'http://localhost:8080/interlinked3' => 1,
+ 'http://localhost:8080/interlinked4' => 1,
+ ]],
+ 'http://localhost:8080/redirectToFound' => ['code' => 302 , 'foundOn' => [
+ 'http://localhost:8080/interlinked1' => 1,
+ 'http://localhost:8080/interlinked2' => 1,
+ 'http://localhost:8080/interlinked3' => 1,
+ 'http://localhost:8080/interlinked4' => 1,
]],
], print_r($sitemap,true));
}
diff --git a/tests/server/server.js b/tests/server/server.js
index d8155d5..ed9da20 100644
--- a/tests/server/server.js
+++ b/tests/server/server.js
@@ -84,13 +84,16 @@ app.get('/invalidStatusCode', function (request, response) {
});
app.get('/interlinked1', function (request, response) {
- response.end('123');
+ response.end('123r');
});
app.get('/interlinked2', function (request, response) {
- response.end('123');
+ response.end('123r');
});
app.get('/interlinked3', function (request, response) {
- response.end('123');
+ response.end('123r');
+});
+app.get('/interlinked4', function (request, response) {
+ response.end('123r');
});
let server = app.listen(8080, function () {