Compare commits

..

No commits in common. "master" and "v1.0.0" have entirely different histories.

11 changed files with 327 additions and 719 deletions

View File

@ -1,35 +0,0 @@
---
# Drone CI pipeline: installs the PHP dependencies, waits until the Node test
# server accepts TCP connections, then runs the PHPUnit suite against it.
kind: pipeline
type: docker
name: testsuite

steps:
  - name: composer install
    image: chialab/php:7.3
    commands:
      - composer install
    volumes:
      # NOTE(review): no pipeline-level `volumes:` entry named composer-cache
      # is visible in this file - confirm the cache is actually mounted.
      - name: composer-cache
        path: /root/.composer/cache/

  - name: wait for test server
    image: alpine:3.12
    commands:
      - echo "Waiting for server to launch on testserver:8080..."
      # Bounded wait: fail the step after 300s instead of polling until the
      # build-level timeout if the service never comes up.
      - timeout 300 sh -c 'while ! nc -z testserver 8080; do sleep 0.1; done'
      - echo "Ready!"

  - name: run tests
    image: chialab/php:7.3
    commands:
      # URL tells the test suite which host to crawl instead of its default.
      - URL=http://testserver:8080 vendor/bin/phpunit tests --testdox --color=always --no-interaction

services:
  # Node fixture server the crawler tests run against, reachable from the
  # steps above under the hostname `testserver`.
  - name: testserver
    image: node:15.2
    detach: true
    commands:
      - cd tests/server/
      - npm install
      - node server.js

View File

@ -3,7 +3,7 @@ Similar project to [spatie/http-status-check](https://github.com/spatie/http-sta
## Install ## Install
```plain ```plain
composer config repositories.jhodges composer https://composer.jhodges.co.uk/ composer config repositories.jhodges composer https://git.jhodges.co.uk/composer
composer require jhodges/sitemap composer require jhodges/sitemap
``` ```
@ -37,8 +37,7 @@ Start the test server, will listen on localhost:8080
```plain ```plain
cd tests/server cd tests/server
npm install ./start_server.sh
node server.js
``` ```
Run the tests: Run the tests:

View File

@ -4,9 +4,8 @@
"type": "library", "type": "library",
"require": { "require": {
"php": "^7.1", "php": "^7.1",
"spatie/crawler": "4.6.6", "spatie/crawler": "^4.6",
"cweagans/composer-patches": "~1.0", "cweagans/composer-patches": "~1.0"
"guzzlehttp/guzzle": "6.5.2"
}, },
"require-dev": { "require-dev": {
"phpunit/phpunit": "^8.5" "phpunit/phpunit": "^8.5"
@ -28,7 +27,7 @@
"add crawled again observer": "https://patch-diff.githubusercontent.com/raw/spatie/crawler/pull/280.patch" "add crawled again observer": "https://patch-diff.githubusercontent.com/raw/spatie/crawler/pull/280.patch"
}, },
"guzzlehttp/guzzle": { "guzzlehttp/guzzle": {
"Status code must be an integer value between 1xx and 5xx": "https://github.com/guzzle/guzzle/commit/f81cd6cdff1213f90de8f012489017510e3d6ff4.patch" "Status code must be an integer value between 1xx and 5xx": "https://patch-diff.githubusercontent.com/raw/guzzle/guzzle/pull/2591.patch"
} }
} }
} }

699
composer.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,17 +0,0 @@
# Image that installs jhodges/sitemap through Composer and runs crawl.php,
# which reads its targets from the CRAWL_URL / CRAWL_CODE environment variables.
FROM markizano/devuan:beowulf-amd64

# Refresh the package index, install, and drop the apt caches in ONE layer:
# a separately cached `apt-get update` layer goes stale, and cleaning in a
# later layer does not shrink the image.
RUN apt-get update \
 && apt-get install -y php-cli git unzip php-xml \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

# Install Composer with the checksum-verifying helper, then discard the helper.
COPY get_composer.sh /root/
RUN bash /root/get_composer.sh && rm /root/get_composer.sh

# WORKDIR creates /app when it does not exist, so no separate mkdir is needed.
WORKDIR /app
RUN composer init && composer config repositories.jhodges composer https://composer.jhodges.co.uk && composer require jhodges/sitemap
COPY crawl.php /app/

# Exec form: php is started directly rather than through `sh -c`, so it
# receives stop signals itself.
ENTRYPOINT ["php", "crawl.php"]

View File

@ -1,45 +0,0 @@
<?php
/**
 * Crawls every URL listed in the CRAWL_URL environment variable and prints a
 * Markdown report: a table of status-code counts followed by a fenced list of
 * the crawled URLs and the pages each one was found on.
 *
 * Environment:
 *   CRAWL_URL   comma-separated start URLs (required)
 *   CRAWL_CODE  comma-separated status codes to list in the detail section
 *               (optional; when empty every URL is listed)
 *
 * Exits with status 1 when CRAWL_URL is missing or contains no usable entry.
 */
require_once(__DIR__.'/vendor/autoload.php');

use \JHodges\Sitemap\Crawler;

/**
 * Reads a comma-separated environment variable.
 *
 * @param string $name environment variable name
 * @return array trimmed, non-empty entries; empty when the variable is unset
 */
function envList(string $name): array{
	$raw=getenv($name);
	if(!$raw){
		return [];
	}
	return array_filter(array_map('trim',explode(',',$raw)));
}

$urls=envList('CRAWL_URL');
if(!$urls){
	// Report on stderr with a non-zero status so callers can detect the failure.
	fwrite(STDERR,"No env: CRAWL_URL\n");
	exit(1);
}
$codes=envList('CRAWL_CODE');

$crawler=new Crawler();
foreach($urls as $startUrl){
	$crawler->crawl($startUrl);
}

$summary=[];
$details='';
foreach($crawler->getResults() as $url=>$result){
	$code=$result['code'];
	// Initialise on first sight; a bare ++ on a missing key emits a notice
	// that would end up inside the generated report.
	$summary[$code]=($summary[$code]??0)+1;

	// Loose comparison is kept on purpose: the filter values are strings
	// taken from the environment and are compared against the result codes.
	if( count($codes)==0 || in_array($code,$codes) ){
		$details.="{$code} {$url}\n";
		foreach($result['foundOn'] as $referrer=>$count){
			$details.=" <- ($count) $referrer\n";
		}
	}
}

ksort($summary);
echo '|code|count|'."\n";
echo '|----|-----|'."\n";
foreach($summary as $code=>$count){
	echo "| $code | $count |\n";
}
if($details){
	echo "\n\n----\n\n```\n$details\n```\n";
}

View File

@ -1,17 +0,0 @@
#!/bin/sh
# Downloads the Composer installer, checks it against the published SHA-384
# signature, and installs Composer 1.x as /usr/local/bin/composer.
# Exit status: 1 on checksum mismatch, otherwise the installer's own status.

published_sig="$(wget -q -O - https://composer.github.io/installer.sig)"
php -r "copy('https://getcomposer.org/installer', 'composer-setup.php');"
downloaded_sig="$(php -r "echo hash_file('sha384', 'composer-setup.php');")"

if [ "$published_sig" = "$downloaded_sig" ]
then
# Signature matches: run the installer and hand its status back to the caller.
php composer-setup.php --1 --filename=composer --install-dir=/usr/local/bin
install_status=$?
rm composer-setup.php
exit $install_status
fi

# Signature mismatch: never execute the downloaded file.
>&2 echo 'ERROR: Invalid installer checksum'
rm composer-setup.php
exit 1

View File

@ -1,19 +0,0 @@
#!/usr/bin/php
<?php
/**
 * Command-line crawler: crawls the URL given as the first argument and prints
 * each result as "<code> <url>", followed by one indented line per page the
 * URL was found on, with its count.
 *
 * Usage: pass the start URL as the first argument.
 * Exits with status 1 when no URL is given.
 */
require_once(__DIR__.'/../vendor/autoload.php');

use \JHodges\Sitemap\Crawler;

$startUrl=$argv[1]??null;
if(!$startUrl){
	// Usage error: report on stderr and signal failure to the caller.
	fwrite(STDERR,"url?\n");
	exit(1);
}

$crawler=new Crawler();
// Crawl the value that was validated above rather than re-reading $argv.
$crawler->crawl($startUrl);

foreach($crawler->getResults() as $url=>$result){
	echo("{$result['code']} {$url}\n");
	// Separate variable so the outer loop's $url is not overwritten.
	foreach($result['foundOn'] as $referrer=>$count){
		echo(" -> ($count) $referrer\n");
	}
}

View File

@ -9,15 +9,14 @@ use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler as SpatieCrawler; use Spatie\Crawler\Crawler as SpatieCrawler;
use Spatie\Crawler\CrawlUrl; use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\CrawlAllUrls; use Spatie\Crawler\CrawlInternalUrls;
use Spatie\Crawler\CrawlProfile;
class Crawler{ class Crawler{
private $observer; private $observer;
private $crawler; private $crawler;
public function __construct(array $reqOps=[]){ public function __construct($reqOps=[]){
$this->crawler = SpatieCrawler::create(array_merge($reqOps, [ $this->crawler = SpatieCrawler::create(array_merge($reqOps, [
RequestOptions::ALLOW_REDIRECTS => [ RequestOptions::ALLOW_REDIRECTS => [
'track_redirects' => true, 'track_redirects' => true,
@ -26,24 +25,13 @@ class Crawler{
$this->observer = new CrawlObserver(); $this->observer = new CrawlObserver();
$this->crawler->setCrawlObserver($this->observer); $this->crawler->setCrawlObserver($this->observer);
$this->crawler->setCrawlProfile(new CrawlAllUrls());
} }
public function setUserAgent(String $agent) : self{ public function crawl($url){
$this->crawler->setUserAgent($agent);
return $this;
}
public function setCrawlProfile(CrawlProfile $p) : self{
$this->crawler->setCrawlProfile($p);
return $this;
}
public function crawl(String $url) : void{
$this->crawler->startCrawling($url); $this->crawler->startCrawling($url);
} }
public function getResults() : array{ public function getResults(){
return $this->observer->results; return $this->observer->results;
} }

View File

@ -6,197 +6,188 @@ use GuzzleHttp\RequestOptions;
class CrawlerTest extends TestCase{ class CrawlerTest extends TestCase{
private $url='http://localhost:8080';
public function __construct(){
parent::__construct();
if(getenv('URL')){
$this->url=getenv('URL');
}
}
public function testFullSite(){ public function testFullSite(){
$crawler=new Crawler([RequestOptions::CONNECT_TIMEOUT => 3, RequestOptions::TIMEOUT => 3]); $crawler=new Crawler([RequestOptions::CONNECT_TIMEOUT => 3, RequestOptions::TIMEOUT => 3]);
$crawler->crawl($this->url); $crawler->crawl('http://localhost:8080/');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertTreeContains($sitemap,[ $this->assertTreeContains($sitemap,[
'http://example.com/' => ['code' => 200], 'http://example.com/' => ['code' => 200],
$this->url.'/deeplink1' => ['code' => 200], 'http://localhost:8080/deeplink1' => ['code' => 200],
$this->url.'/deeplink2' => ['code' => 200], 'http://localhost:8080/deeplink2' => ['code' => 200],
$this->url.'/deeplink3' => ['code' => 200], 'http://localhost:8080/deeplink3' => ['code' => 200],
$this->url.'/externalLink' => ['code' => 200], 'http://localhost:8080/externalLink' => ['code' => 200],
$this->url.'/found' => ['code' => 200], 'http://localhost:8080/found' => ['code' => 200],
$this->url.'/interlinked1' => ['code' => 200], 'http://localhost:8080/interlinked1' => ['code' => 200],
$this->url.'/interlinked2' => ['code' => 200], 'http://localhost:8080/interlinked2' => ['code' => 200],
$this->url.'/interlinked3' => ['code' => 200], 'http://localhost:8080/interlinked3' => ['code' => 200],
$this->url.'/internalServerError' => ['code' => 500], 'http://localhost:8080/internalServerError' => ['code' => 500],
$this->url.'/invalidStatusCode' => ['code' => '---'], 'http://localhost:8080/invalidStatusCode' => ['code' => '---'],
$this->url.'/notFound' => ['code' => 404], 'http://localhost:8080/notFound' => ['code' => 404],
$this->url.'/redirect1' => ['code' => 302], 'http://localhost:8080/redirect1' => ['code' => 302],
$this->url.'/redirect2' => ['code' => 302], 'http://localhost:8080/redirect2' => ['code' => 302],
$this->url.'/redirectLoop' => ['code' => '---'], 'http://localhost:8080/redirectLoop' => ['code' => '---'],
$this->url.'/redirectToFound' => ['code' => 302 ], 'http://localhost:8080/redirectToFound' => ['code' => 302 ],
$this->url.'/redirectToNotFound' => ['code' => 302 ], 'http://localhost:8080/redirectToNotFound' => ['code' => 302 ],
$this->url.'/redirectToRedirectToNotFound' => ['code' => 302], 'http://localhost:8080/redirectToRedirectToNotFound' => ['code' => 302],
$this->url.'/timeout' => ['code' => '---'], 'http://localhost:8080/timeout' => ['code' => '---'],
$this->url.'/twoRedirectsToSameLocation' => ['code' => 200], 'http://localhost:8080/twoRedirectsToSameLocation' => ['code' => 200],
], print_r($sitemap,true)); ], print_r($sitemap,true));
} }
public function testFound(){ public function testFound(){
$crawler=new Crawler(); $crawler=new Crawler();
$crawler->crawl($this->url.'/found'); $crawler->crawl('http://localhost:8080/found');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertTreeContains($sitemap,[ $this->assertTreeContains($sitemap,[
$this->url.'/found' => ['code' => 200], 'http://localhost:8080/found' => ['code' => 200],
], print_r($sitemap,true)); ], print_r($sitemap,true));
} }
public function testNotFound(){ public function testNotFound(){
$crawler=new Crawler(); $crawler=new Crawler();
$crawler->crawl($this->url.'/notFound'); $crawler->crawl('http://localhost:8080/notFound');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertTreeContains($sitemap,[ $this->assertTreeContains($sitemap,[
$this->url.'/notFound' => ['code' => 404], 'http://localhost:8080/notFound' => ['code' => 404],
], print_r($sitemap,true)); ], print_r($sitemap,true));
} }
public function testExternalLink(){ public function testExternalLink(){
$crawler=new Crawler(); $crawler=new Crawler();
$crawler->crawl($this->url.'/externalLink'); $crawler->crawl('http://localhost:8080/externalLink');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertTreeContains($sitemap,[ $this->assertTreeContains($sitemap,[
$this->url.'/externalLink' => ['code' => 200], 'http://localhost:8080/externalLink' => ['code' => 200],
'http://example.com/' => ['code' => 200], 'http://example.com/' => ['code' => 200],
], print_r($sitemap,true)); ], print_r($sitemap,true));
} }
public function testDeeplink(){ public function testDeeplink(){
$crawler=new Crawler(); $crawler=new Crawler();
$crawler->crawl($this->url.'/deeplink1'); $crawler->crawl('http://localhost:8080/deeplink1');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertTreeContains($sitemap,[ $this->assertTreeContains($sitemap,[
$this->url.'/deeplink1' => ['code' => 200], 'http://localhost:8080/deeplink1' => ['code' => 200],
$this->url.'/deeplink2' => ['code' => 200], 'http://localhost:8080/deeplink2' => ['code' => 200],
$this->url.'/deeplink3' => ['code' => 200], 'http://localhost:8080/deeplink3' => ['code' => 200],
], print_r($sitemap,true)); ], print_r($sitemap,true));
} }
public function testInterlinked(){ public function testInterlinked(){
$crawler=new Crawler(); $crawler=new Crawler();
$crawler->crawl($this->url.'/interlinked1'); $crawler->crawl('http://localhost:8080/interlinked1');
$crawler->crawl($this->url.'/interlinked4'); //this ensures the order or results for the URL tracking test PART2 $crawler->crawl('http://localhost:8080/interlinked4'); //this ensures the order or results for the URL tracking test PART2
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertTreeContains($sitemap,[ $this->assertTreeContains($sitemap,[
$this->url.'/interlinked1' => ['code' => 200 , 'foundOn' => [ 'http://localhost:8080/interlinked1' => ['code' => 200 , 'foundOn' => [
$this->url.'/interlinked1' => 1, 'http://localhost:8080/interlinked1' => 1,
$this->url.'/interlinked2' => 1, 'http://localhost:8080/interlinked2' => 1,
$this->url.'/interlinked3' => 1, 'http://localhost:8080/interlinked3' => 1,
$this->url.'/interlinked4' => 1, 'http://localhost:8080/interlinked4' => 1,
]], ]],
$this->url.'/interlinked2' => ['code' => 200 , 'foundOn' => [ 'http://localhost:8080/interlinked2' => ['code' => 200 , 'foundOn' => [
$this->url.'/interlinked1' => 1, 'http://localhost:8080/interlinked1' => 1,
$this->url.'/interlinked2' => 1, 'http://localhost:8080/interlinked2' => 1,
$this->url.'/interlinked3' => 1, 'http://localhost:8080/interlinked3' => 1,
$this->url.'/interlinked4' => 1, 'http://localhost:8080/interlinked4' => 1,
]], ]],
$this->url.'/interlinked3' => ['code' => 200 , 'foundOn' => [ 'http://localhost:8080/interlinked3' => ['code' => 200 , 'foundOn' => [
$this->url.'/interlinked1' => 1, 'http://localhost:8080/interlinked1' => 1,
$this->url.'/interlinked2' => 1, 'http://localhost:8080/interlinked2' => 1,
$this->url.'/interlinked3' => 1, 'http://localhost:8080/interlinked3' => 1,
$this->url.'/interlinked4' => 1, 'http://localhost:8080/interlinked4' => 1,
]], ]],
$this->url.'/found' => ['code' => 200 , 'foundOn' => [ 'http://localhost:8080/found' => ['code' => 200 , 'foundOn' => [
$this->url.'/interlinked1' => 1, 'http://localhost:8080/interlinked1' => 1,
$this->url.'/interlinked2' => 1, 'http://localhost:8080/interlinked2' => 1,
$this->url.'/interlinked3' => 1, 'http://localhost:8080/interlinked3' => 1,
$this->url.'/interlinked4' => 1, 'http://localhost:8080/interlinked4' => 1,
]], ]],
$this->url.'/redirectToFound' => ['code' => 302 , 'foundOn' => [ 'http://localhost:8080/redirectToFound' => ['code' => 302 , 'foundOn' => [
$this->url.'/interlinked1' => 1, 'http://localhost:8080/interlinked1' => 1,
$this->url.'/interlinked2' => 1, 'http://localhost:8080/interlinked2' => 1,
$this->url.'/interlinked3' => 1, 'http://localhost:8080/interlinked3' => 1,
$this->url.'/interlinked4' => 1, 'http://localhost:8080/interlinked4' => 1,
]], ]],
], print_r($sitemap,true)); ], print_r($sitemap,true));
} }
public function testRedirectToFound(){ public function testRedirectToFound(){
$crawler=new Crawler(); $crawler=new Crawler();
$crawler->crawl($this->url.'/redirectToFound'); $crawler->crawl('http://localhost:8080/redirectToFound');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertTreeContains($sitemap,[ $this->assertTreeContains($sitemap,[
$this->url.'/redirectToFound' => ['code' => 302], 'http://localhost:8080/redirectToFound' => ['code' => 302],
$this->url.'/found' => ['code' => 200 ], 'http://localhost:8080/found' => ['code' => 200 ],
], print_r($sitemap,true)); ], print_r($sitemap,true));
} }
public function testRedirectToNotFound(){ public function testRedirectToNotFound(){
$crawler=new Crawler(); $crawler=new Crawler();
$crawler->crawl($this->url.'/redirectToNotFound'); $crawler->crawl('http://localhost:8080/redirectToNotFound');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertTreeContains($sitemap,[ $this->assertTreeContains($sitemap,[
$this->url.'/redirectToNotFound' => ['code' => 302], 'http://localhost:8080/redirectToNotFound' => ['code' => 302],
$this->url.'/notFound' => ['code' => 404 ], 'http://localhost:8080/notFound' => ['code' => 404 ],
], print_r($sitemap,true)); ], print_r($sitemap,true));
} }
public function testRedirectToRedirectToNotFound(){ public function testRedirectToRedirectToNotFound(){
$crawler=new Crawler(); $crawler=new Crawler();
$crawler->crawl($this->url.'/redirectToRedirectToNotFound'); $crawler->crawl('http://localhost:8080/redirectToRedirectToNotFound');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertTreeContains($sitemap,[ $this->assertTreeContains($sitemap,[
$this->url.'/redirectToRedirectToNotFound' => ['code' => 302], 'http://localhost:8080/redirectToRedirectToNotFound' => ['code' => 302],
$this->url.'/redirectToNotFound' => ['code' => 302], 'http://localhost:8080/redirectToNotFound' => ['code' => 302],
$this->url.'/notFound' => ['code' => 404], 'http://localhost:8080/notFound' => ['code' => 404],
], print_r($sitemap,true)); ], print_r($sitemap,true));
} }
public function testTwoRedirectsToSameLocation(){ public function testTwoRedirectsToSameLocation(){
$crawler=new Crawler(); $crawler=new Crawler();
$crawler->crawl($this->url.'/twoRedirectsToSameLocation'); $crawler->crawl('http://localhost:8080/twoRedirectsToSameLocation');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertTreeContains($sitemap,[ $this->assertTreeContains($sitemap,[
$this->url.'/twoRedirectsToSameLocation' => ['code' => 200], 'http://localhost:8080/twoRedirectsToSameLocation' => ['code' => 200],
$this->url.'/redirect1' => ['code' => 302], 'http://localhost:8080/redirect1' => ['code' => 302],
$this->url.'/redirect2' => ['code' => 302], 'http://localhost:8080/redirect2' => ['code' => 302],
$this->url.'/found' => ['code' => 200], 'http://localhost:8080/found' => ['code' => 200],
], print_r($sitemap,true)); ], print_r($sitemap,true));
} }
public function testTimeout(){ public function testTimeout(){
$crawler=new Crawler([RequestOptions::CONNECT_TIMEOUT => 3, RequestOptions::TIMEOUT => 3]); $crawler=new Crawler([RequestOptions::CONNECT_TIMEOUT => 3, RequestOptions::TIMEOUT => 3]);
$crawler->crawl($this->url.'/timeout'); $crawler->crawl('http://localhost:8080/timeout');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertTreeContains($sitemap,[ $this->assertTreeContains($sitemap,[
$this->url.'/timeout' => ['code' => '---'], 'http://localhost:8080/timeout' => ['code' => '---'],
], print_r($sitemap,true)); ], print_r($sitemap,true));
} }
public function testRedirectLoop(){ public function testRedirectLoop(){
$crawler=new Crawler(); $crawler=new Crawler();
$crawler->crawl($this->url.'/redirectLoop'); $crawler->crawl('http://localhost:8080/redirectLoop');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertTreeContains($sitemap,[ $this->assertTreeContains($sitemap,[
$this->url.'/redirectLoop' => ['code' => '---'], 'http://localhost:8080/redirectLoop' => ['code' => '---'],
], print_r($sitemap,true)); ], print_r($sitemap,true));
} }
public function testInternalServerError(){ public function testInternalServerError(){
$crawler=new Crawler(); $crawler=new Crawler();
$crawler->crawl($this->url.'/internalServerError'); $crawler->crawl('http://localhost:8080/internalServerError');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertTreeContains($sitemap,[ $this->assertTreeContains($sitemap,[
$this->url.'/internalServerError' => ['code' => 500], 'http://localhost:8080/internalServerError' => ['code' => 500],
], print_r($sitemap,true)); ], print_r($sitemap,true));
} }
public function testInvalidStatusCode(){ public function testInvalidStatusCode(){
$crawler=new Crawler(); $crawler=new Crawler();
$crawler->crawl($this->url.'/invalidStatusCode'); $crawler->crawl('http://localhost:8080/invalidStatusCode');
$sitemap=$crawler->getResults(); $sitemap=$crawler->getResults();
$this->assertTreeContains($sitemap,[ $this->assertTreeContains($sitemap,[
$this->url.'/invalidStatusCode' => ['code' => '---'], 'http://localhost:8080/invalidStatusCode' => ['code' => '---'],
], print_r($sitemap,true)); ], print_r($sitemap,true));
} }

13
tests/server/start_server.sh Executable file
View File

@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Installs the test server's npm dependencies and starts it.
# Without TRAVIS_JOB_ID the server runs in the foreground; with it set
# (Travis CI) the server is started in the background so the build continues.

# Work from the directory holding this script, so `npm install` and
# `node server.js` find their files whether the script is launched from the
# repository root or from tests/server.
cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1

npm install
# Quoted so an empty or whitespace-containing value cannot break the test.
if [ -z "${TRAVIS_JOB_ID:-}" ]; then
# not running under travis, stay in foreground until stopped
node server.js
else
# running under travis, daemonize
(node server.js &) || /bin/true
fi