From 89940811c8bbb55c69950a61cff8f2dbeb1cbb9a Mon Sep 17 00:00:00 2001 From: Steve Worley Date: Thu, 22 Apr 2021 08:40:32 +1000 Subject: [PATCH 1/2] Add a new group plugin for crawler. --- src/Command/CrawlCommand.php | 4 ++ src/Crawler/Group/ElementFilter.php | 89 +++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 src/Crawler/Group/ElementFilter.php diff --git a/src/Command/CrawlCommand.php b/src/Command/CrawlCommand.php index e034a8f1..11ffc899 100644 --- a/src/Command/CrawlCommand.php +++ b/src/Command/CrawlCommand.php @@ -145,6 +145,10 @@ protected function execute(InputInterface $input, OutputInterface $output) $crawler->setDelayBetweenRequests($delay); } + if (!empty($this->config['options']['ignore_robotstxt'])) { + $crawler->ignoreRobots(); + } + $io->success('Starting crawl!'); $crawler->startCrawling($baseUrl); diff --git a/src/Crawler/Group/ElementFilter.php b/src/Crawler/Group/ElementFilter.php new file mode 100644 index 00000000..9683e3a5 --- /dev/null +++ b/src/Crawler/Group/ElementFilter.php @@ -0,0 +1,89 @@ +filter_type = NULL; + } + + /** + * {@inheritdoc} + */ + public function getId() : string + { + $id = parent::getId(); + + if ($this->filter_type) { + $id .= "-{$this->filter_type}"; + } + + return $id; + } + + /** + * {@inheritdoc} + */ + public function match($url, ResponseInterface $response) : bool + { + $dom = new Crawler($response->getBody()->__toString(), $url); + $filter_attr = $this->getOption('filter_attr') ?: 'class'; + $pattern = $this->getOption('pattern'); + + if (empty($this->getOption('selector')) || empty($pattern)) { + return FALSE; + } + + try { + $element = $dom->evaluate($this->getOption('selector')); + } catch (\Exception $error) { + $element = []; + } + + if (!is_callable([$element, 'count']) || $element->count() === 0) { + try { + $element = $dom->filter($this->getOption('selector')); + } catch (\Exception $error) { + return FALSE; + } + } + + if ($element->count() === 0) { + return FALSE; + } + + $types = $element->each(function(Crawler $node) use ($filter_attr, $pattern) { + preg_match($pattern, $node->attr($filter_attr), $matches); + return reset($matches); + }); + + $this->filter_type = reset($types); + + return TRUE; + } + + +} From 7c16be045786e82af22f3661bc49084d33a4b144 Mon Sep 17 00:00:00 2001 From: Steve Worley Date: Thu, 22 Apr 2021 08:48:38 +1000 Subject: [PATCH 2/2] lint. --- src/Crawler/Group/ElementFilter.php | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/Crawler/Group/ElementFilter.php b/src/Crawler/Group/ElementFilter.php index 9683e3a5..0317d21a 100644 --- a/src/Crawler/Group/ElementFilter.php +++ b/src/Crawler/Group/ElementFilter.php @@ -19,8 +19,14 @@ class ElementFilter extends GroupBase { + /** + * The filtered type - used to separate output by id. + * + * @var string + */ protected $filter_type; + /** * {@inheritdoc} */ @@ -28,7 +34,9 @@ public function __construct(array $config=[]) { parent::__construct($config); $this->filter_type = NULL; - } + + }//end __construct() + /** * {@inheritdoc} @@ -42,7 +50,9 @@ public function getId() : string } return $id; - } + + }//end getId() + /** * {@inheritdoc} @@ -75,15 +85,18 @@ public function match($url, ResponseInterface $response) : bool return FALSE; } - $types = $element->each(function(Crawler $node) use ($filter_attr, $pattern) { - preg_match($pattern, $node->attr($filter_attr), $matches); - return reset($matches); - }); + $types = $element->each( + function(Crawler $node) use ($filter_attr, $pattern) { + preg_match($pattern, $node->attr($filter_attr), $matches); + return reset($matches); + } + ); $this->filter_type = reset($types); return TRUE; - } + + }//end match() -} +}//end class