diff --git a/README.md b/README.md index 4cf4b6f..2c7f2f2 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,21 @@ This link checker / sitemap generator is built using spatie/crawler which uses guzzle to do the crawling. -This is very similar to spatie/http-status-check but intends to add some functionality. +This is very similar to spatie/http-status-check but due to the way guzzle handles redirects I wasn't happy with the results; known 404's, and even whole chunks of known URLs were missing from crawl results. By following this https://github.com/guzzle/guzzle/blob/master/docs/faq.rst#how-can-i-track-redirected-requests I was able to get a crawl result with the complete list of responses as I was expecting. -The nodejs test server is copied directly from spatie/http-status-check with some additional pages added. The examples below will br run against the test server in this repo. +The node.js test server is copied directly from spatie/http-status-check but refactored to offer a more diverse range of test cases that cover the redirect issues noted above. + +The examples below are run against the test server in this project. + +## EXCEPTION: Status code must be an integer value between 1xx and 5xx +I ran across this exception https://github.com/spatie/crawler/issues/271 +and the patch for GuzzleHttp/Guzzle fixed it https://github.com/guzzle/guzzle/pull/2591 +Without this patch `CrawlerTest::testInvalidStatusCode` will fail. ## Follow redirects By default `spatie/http-status-check` has `guzzle` set to not follow redirects. This results in the potential for parts of the site to be uncrawlable if they are behind a 301 or 302 redirect, and not linked internally anywhere else with a non-redirecting link. -Some webservers include a `` link on the 301/302 body and this will mitigate the problem (spaite indexes and follows it), however if the webserver does not do this, then the link wont be followed to its true destination, and the destination won't be crawled. 
+Some webservers include a `` link on the 301/302 body and this will mitigate the problem (spatie follows and indexes), however if the webserver does not do this, then the link won't be followed to its true destination, and the destination won't be indexed. This is most obvious with a redirect to a not found page: You'd expect to see a 404 here: @@ -37,10 +44,10 @@ Crawling summary Crawled 1 url(s) with statuscode 302 ``` -If I enable RequestOptions::ALLOW_REDIRECTS I get: +If I enable `RequestOptions::ALLOW_REDIRECTS` as suggested here https://github.com/spatie/crawler/issues/263 I get: ``` -bob@chodbox:~/Projects/http-status-check$ ./http-status-check scan http://localhost:8080/redirectToNotFound +$ ./http-status-check scan http://localhost:8080/redirectToNotFound Start scanning http://localhost:8080/redirectToNotFound @@ -58,46 +65,29 @@ $ ./http-status-check scan http://localhost:8080/redirectToFound Start scanning http://localhost:8080/redirectToFound -[2020-02-22 12:13:45] 200 OK - http://localhost:8080/redirectToFound -[2020-02-22 12:13:45] 200 OK - http://localhost:8080/link1 -[2020-02-22 12:13:45] 200 OK - http://localhost:8080/link2 -[2020-02-22 12:13:45] 200 OK - http://localhost:8080/link4 -[2020-02-22 12:13:45] 200 OK - http://localhost:8080/link3 -[2020-02-22 12:13:45] 200 OK - http://example.com/ -[2020-02-22 12:13:45] 404: Not Found - http://localhost:8080/notExists (found on http://localhost:8080/link3) +[2020-02-23 18:10:19] 200 OK - http://localhost:8080/redirectToFound Crawling summary ---------------- -Crawled 6 url(s) with statuscode 200 -Crawled 1 url(s) with statuscode 404 +Crawled 1 url(s) with statuscode 200 ``` -This looks better, we can see the 404 now, and the rest of the site being index, however on closer inspection you'll notice there are no 301 or 302's being shown. And the redirectToFound is actually showing as a 200. The homepage is missing. 
+This looks better, we can see the 404 and the 200 now, however on closer inspection you'll notice there are no 301 or 302's being shown. And the redirectToFound is actually showing as a 200 where the real response was 301; the 200 should be associated with the /found URL that is still missing from the results. For generating a sitemap of valid pages we'd need the destination URL, not the redirect URL. Enter my sitemap crawler: ``` -$ bin/crawler crawl http://localhost:8080/redirectToNotFound +$ bob@chodbox:~/Projects/sitemap$ bin/crawler crawl http://localhost:8080/redirectToNotFound 302 http://localhost:8080/redirectToNotFound -404 http://localhost:8080/notFound2 -``` +404 http://localhost:8080/notFound -and +$ bin/crawler crawl http://localhost:8080/redirectToFound -``` 302 http://localhost:8080/redirectToFound -200 http://localhost:8080/ -200 http://localhost:8080/link1 -200 http://localhost:8080/link2 -302 http://localhost:8080/link4 -200 http://example.com/ -200 http://localhost:8080/link3 -404 http://localhost:8080/notExists +200 http://localhost:8080/found ``` -This seems to be a feature of guzzle and not really a bug of spatie. https://github.com/guzzle/guzzle/blob/master/docs/faq.rst#how-can-i-track-redirected-requests - ## Collate `foundOnUrl`s. - * Requires a patch to spatie/crawler https://patch-diff.githubusercontent.com/raw/spatie/crawler/pull/280 (auto applied) + * Requires a patch to spatie/crawler https://patch-diff.githubusercontent.com/raw/spatie/crawler/pull/280 (auto applied) without this patch the `CrawlerTest::testInterlinked` test will fail. * TODO: Document the confusing PART1&2 behaviour. 
(collating the redirect urls) diff --git a/src/ConsoleApplication.php b/src/ConsoleApplication.php index ddb5fef..d68f2fe 100644 --- a/src/ConsoleApplication.php +++ b/src/ConsoleApplication.php @@ -10,7 +10,7 @@ class ConsoleApplication extends Application { error_reporting(-1); - parent::__construct('Sitempa', '0.0.1'); + parent::__construct('Sitempa', '0.1.0'); $this->add(new CrawlCommand()); } diff --git a/src/Crawler.php b/src/Crawler.php index 35b8e6e..1d46070 100644 --- a/src/Crawler.php +++ b/src/Crawler.php @@ -16,21 +16,15 @@ class Crawler{ private $observer; private $crawler; - public function __construct($baseUrl=null){ - $this->observer = new CrawlObserver(); - $this->crawler = SpatieCrawler::create([ + public function __construct($reqOps=[]){ + $this->crawler = SpatieCrawler::create(array_merge($reqOps, [ RequestOptions::ALLOW_REDIRECTS => [ 'track_redirects' => true, ], - RequestOptions::CONNECT_TIMEOUT => 3, - RequestOptions::TIMEOUT => 3, - ]) - //->setMaximumDepth(1) - ->setCrawlObserver($this->observer) - ; - if($baseUrl){ - $this->crawler->setCrawlProfile(new CrawlInternalUrls($baseUrl)); - } + ])); + + $this->observer = new CrawlObserver(); + $this->crawler->setCrawlObserver($this->observer); } public function crawl($url){ diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php index d3b7d78..acf9260 100644 --- a/tests/CrawlerTest.php +++ b/tests/CrawlerTest.php @@ -2,11 +2,12 @@ use \PHPUnit\Framework\TestCase; use JHodges\Sitemap\Crawler; +use GuzzleHttp\RequestOptions; class CrawlerTest extends TestCase{ public function testFullSite(){ - $crawler=new Crawler(); + $crawler=new Crawler([RequestOptions::CONNECT_TIMEOUT => 3, RequestOptions::TIMEOUT => 3]); $crawler->crawl('http://localhost:8080/'); $sitemap=$crawler->getResults(); $this->assertTreeContains($sitemap,[ @@ -140,7 +141,7 @@ class CrawlerTest extends TestCase{ } public function testTimeout(){ - $crawler=new Crawler(); + $crawler=new Crawler([RequestOptions::CONNECT_TIMEOUT => 3, 
RequestOptions::TIMEOUT => 3]); $crawler->crawl('http://localhost:8080/timeout'); $sitemap=$crawler->getResults(); $this->assertTreeContains($sitemap,[