Compare commits

...

17 Commits

Author SHA1 Message Date
James
ac9e614086 lock package version to fix dep breakage 2021-02-06 12:02:28 +00:00
James
bea673807c add method to setCrawlProfile, add type hinting 2021-02-06 11:57:25 +00:00
James
35aeeeca72 allow multiple urls in docker 2021-01-16 10:54:14 +00:00
James
5f71eeb6a5 improve docker output 2021-01-06 17:21:04 +00:00
James
2ece7c9e99 add docker support 2021-01-03 17:39:04 +00:00
f739042cfd fix composer url moved 2021-01-03 17:12:28 +00:00
James
432acb7475 add setUserAgent support
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/tag Build is passing
2020-11-16 12:58:28 +00:00
James
6b28eb168e revert fix redirectloop test
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/tag Build is passing
2020-11-15 18:34:28 +00:00
James
4e29ca154e downgrade
Some checks failed
continuous-integration/drone/push Build is failing
2020-11-15 18:33:05 +00:00
James
fbf6d31c5b update package version
Some checks failed
continuous-integration/drone/push Build is failing
2020-11-15 18:29:48 +00:00
James
a4037f2d95 fix package version
Some checks failed
continuous-integration/drone/push Build is failing
2020-11-15 18:28:20 +00:00
James
437d0fbf4e fix redirect loop test
Some checks failed
continuous-integration/drone/push Build is failing
2020-11-15 14:54:32 +00:00
James
418747027b try older spatie crawler
Some checks failed
continuous-integration/drone/push Build is failing
2020-11-15 14:49:47 +00:00
James
7b9b125f57 crawl externals too
Some checks failed
continuous-integration/drone/push Build is failing
2020-11-15 14:43:23 +00:00
James
e2fe2eedf7 fix
Some checks failed
continuous-integration/drone/push Build is failing
2020-11-15 14:33:08 +00:00
James
f46074dfba better phpunit output 2020-11-15 14:31:42 +00:00
James
44d07858b5 update
Some checks failed
continuous-integration/drone/push Build is failing
2020-11-15 14:27:14 +00:00
8 changed files with 579 additions and 238 deletions

View File

@ -5,7 +5,7 @@ name: testsuite
steps: steps:
- name: composer install - name: composer install
image: chialab/php image: chialab/php:7.3
commands: commands:
- composer install - composer install
volumes: volumes:
@ -13,21 +13,21 @@ steps:
path: /root/.composer/cache/ path: /root/.composer/cache/
- name: wait for test server - name: wait for test server
image: alpine image: alpine:3.12
commands: commands:
- echo "Waiting for server to launch on testserver:8080..." - echo "Waiting for server to launch on testserver:8080..."
- while ! nc -z testserver 8080; do sleep 0.1 ; done - while ! nc -z testserver 8080; do sleep 0.1 ; done
- echo "Ready!" - echo "Ready!"
- name: run tests - name: run tests
image: chialab/php image: chialab/php:7.3
commands: commands:
- URL=http://testserver:8080 vendor/bin/phpunit tests - URL=http://testserver:8080 vendor/bin/phpunit tests --testdox --color=always --no-interaction
services: services:
- name: testserver - name: testserver
image: node image: node:15.2
detach: true detach: true
commands: commands:
- cd tests/server/ - cd tests/server/

View File

@ -3,7 +3,7 @@ Similar project to [spatie/http-status-check](https://github.com/spatie/http-sta
## Install ## Install
```plain ```plain
composer config repositories.jhodges composer https://git.jhodges.co.uk/composer composer config repositories.jhodges composer https://composer.jhodges.co.uk/
composer require jhodges/sitemap composer require jhodges/sitemap
``` ```

View File

@ -4,8 +4,9 @@
"type": "library", "type": "library",
"require": { "require": {
"php": "^7.1", "php": "^7.1",
"spatie/crawler": "^4.6", "spatie/crawler": "4.6.6",
"cweagans/composer-patches": "~1.0" "cweagans/composer-patches": "~1.0",
"guzzlehttp/guzzle": "6.5.2"
}, },
"require-dev": { "require-dev": {
"phpunit/phpunit": "^8.5" "phpunit/phpunit": "^8.5"
@ -27,7 +28,7 @@
"add crawled again observer": "https://patch-diff.githubusercontent.com/raw/spatie/crawler/pull/280.patch" "add crawled again observer": "https://patch-diff.githubusercontent.com/raw/spatie/crawler/pull/280.patch"
}, },
"guzzlehttp/guzzle": { "guzzlehttp/guzzle": {
"Status code must be an integer value between 1xx and 5xx": "https://patch-diff.githubusercontent.com/raw/guzzle/guzzle/pull/2591.patch" "Status code must be an integer value between 1xx and 5xx": "https://github.com/guzzle/guzzle/commit/f81cd6cdff1213f90de8f012489017510e3d6ff4.patch"
} }
} }
} }

699
composer.lock generated

File diff suppressed because it is too large Load Diff

17
docker/Dockerfile Normal file
View File

@ -0,0 +1,17 @@
# Image that installs jhodges/sitemap through Composer and runs crawl.php
# when the container starts.
FROM markizano/devuan:beowulf-amd64

# Refresh the package index, install, and clean up in a single layer: a cached
# "apt-get update" layer can otherwise be paired with a later install, and a
# clean-up step in a separate layer does not shrink the layers before it.
RUN apt-get update \
 && apt-get install -y php-cli git unzip php-xml \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

# Install Composer via the checksum-verifying helper, then discard the helper.
# NOTE(review): the helper calls wget, which is not installed above —
# presumably the base image ships it; confirm.
COPY get_composer.sh /root/
RUN bash /root/get_composer.sh && rm /root/get_composer.sh

# WORKDIR creates the directory if it does not exist, so no mkdir is needed.
WORKDIR /app
RUN composer init && composer config repositories.jhodges composer https://composer.jhodges.co.uk && composer require jhodges/sitemap
COPY crawl.php /app/

# Exec form: php runs as PID 1 and receives signals (e.g. from "docker stop")
# directly instead of being wrapped in "/bin/sh -c".
ENTRYPOINT ["php", "crawl.php"]

45
docker/crawl.php Normal file
View File

@ -0,0 +1,45 @@
<?php

declare(strict_types=1);

/**
 * Docker entry point: crawls every URL listed in the CRAWL_URL environment
 * variable and prints a Markdown report (summary table plus details) to stdout.
 *
 * Environment:
 *   CRAWL_URL   required, comma-separated list of start URLs.
 *   CRAWL_CODE  optional, comma-separated list of result codes; when given,
 *               only results with one of these codes are listed in the
 *               detail section. The summary table always covers every result.
 *
 * Exits with status 1 when no start URL is supplied.
 */

require_once(__DIR__.'/vendor/autoload.php');

use \JHodges\Sitemap\Crawler;

/**
 * Read a comma-separated environment variable as a list of trimmed,
 * non-empty strings.
 *
 * @param string $name Name of the environment variable.
 *
 * @return string[] The entries in order; empty when the variable is unset
 *                  or contains only separators/whitespace.
 */
function envList(string $name): array
{
    $raw = getenv($name);
    if ($raw === false) {
        return [];
    }

    $items = array_map('trim', explode(',', $raw));

    // Drop only empty entries; a bare array_filter() would also discard "0".
    return array_values(array_filter($items, static function (string $item): bool {
        return $item !== '';
    }));
}

$urls = envList('CRAWL_URL');
if (count($urls) === 0) {
    // Report on stderr and exit non-zero so the container's caller can detect
    // the failure (die() with a string exits with status 0).
    fwrite(STDERR, "No env: CRAWL_URL\n");
    exit(1);
}

$codes = envList('CRAWL_CODE');

$crawler = new Crawler();
foreach ($urls as $startUrl) {
    $crawler->crawl($startUrl);
}

$summary = [];
$details = '';
foreach ($crawler->getResults() as $url => $result) {
    $code = $result['code'];

    // Initialise the counter on first sight to avoid an undefined-index notice.
    $summary[$code] = ($summary[$code] ?? 0) + 1;

    // Filter values come from the environment and are therefore strings, so
    // compare as strings; a loose comparison would let a numeric 0 code match
    // any non-numeric filter value on PHP 7.
    // NOTE(review): assumes 'code' is a scalar — confirm against CrawlObserver.
    if (count($codes) === 0 || in_array((string) $code, $codes, true)) {
        $details .= "{$code} {$url}\n";

        // Separate variable names so the outer loop's $url is not shadowed.
        foreach ($result['foundOn'] as $referrer => $count) {
            $details .= " <- ($count) $referrer\n";
        }
    }
}

ksort($summary);

echo '|code|count|'."\n";
echo '|----|-----|'."\n";
foreach ($summary as $code => $count) {
    echo "| $code | $count |\n";
}

if ($details !== '') {
    echo "\n\n----\n\n```\n$details\n```\n";
}

17
docker/get_composer.sh Normal file
View File

@ -0,0 +1,17 @@
#!/bin/sh
# Download the Composer installer, verify it against the published SHA-384
# signature, and install Composer as /usr/local/bin/composer.
# Exits non-zero if the signature cannot be fetched, the checksum does not
# match, or the installer itself fails.

EXPECTED_CHECKSUM="$(wget -q -O - https://composer.github.io/installer.sig)"

# Without this guard a failed signature download AND a failed installer
# download would both yield empty strings and compare as equal below.
if [ -z "$EXPECTED_CHECKSUM" ]
then
    >&2 echo 'ERROR: Could not fetch installer checksum'
    exit 1
fi

php -r "copy('https://getcomposer.org/installer', 'composer-setup.php');"
ACTUAL_CHECKSUM="$(php -r "echo hash_file('sha384', 'composer-setup.php');")"

if [ "$EXPECTED_CHECKSUM" != "$ACTUAL_CHECKSUM" ]
then
    >&2 echo 'ERROR: Invalid installer checksum'
    # -f: the file may not exist if the download itself failed.
    rm -f composer-setup.php
    exit 1
fi

# NOTE(review): --1 presumably pins the installer to the Composer 1.x line — confirm against the installer docs.
php composer-setup.php --1 --filename=composer --install-dir=/usr/local/bin
RESULT=$?
rm -f composer-setup.php
exit $RESULT

View File

@ -9,14 +9,15 @@ use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler as SpatieCrawler; use Spatie\Crawler\Crawler as SpatieCrawler;
use Spatie\Crawler\CrawlUrl; use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\CrawlInternalUrls; use Spatie\Crawler\CrawlAllUrls;
use Spatie\Crawler\CrawlProfile;
class Crawler{ class Crawler{
private $observer; private $observer;
private $crawler; private $crawler;
public function __construct($reqOps=[]){ public function __construct(array $reqOps=[]){
$this->crawler = SpatieCrawler::create(array_merge($reqOps, [ $this->crawler = SpatieCrawler::create(array_merge($reqOps, [
RequestOptions::ALLOW_REDIRECTS => [ RequestOptions::ALLOW_REDIRECTS => [
'track_redirects' => true, 'track_redirects' => true,
@ -25,13 +26,24 @@ class Crawler{
$this->observer = new CrawlObserver(); $this->observer = new CrawlObserver();
$this->crawler->setCrawlObserver($this->observer); $this->crawler->setCrawlObserver($this->observer);
$this->crawler->setCrawlProfile(new CrawlAllUrls());
} }
public function crawl($url){ public function setUserAgent(String $agent) : self{
$this->crawler->setUserAgent($agent);
return $this;
}
public function setCrawlProfile(CrawlProfile $p) : self{
$this->crawler->setCrawlProfile($p);
return $this;
}
public function crawl(String $url) : void{
$this->crawler->startCrawling($url); $this->crawler->startCrawling($url);
} }
public function getResults(){ public function getResults() : array{
return $this->observer->results; return $this->observer->results;
} }