diff --git a/src/Command/CrawlCommand.php b/src/Command/CrawlCommand.php
index e034a8f1..11ffc899 100644
--- a/src/Command/CrawlCommand.php
+++ b/src/Command/CrawlCommand.php
@@ -145,6 +145,10 @@ protected function execute(InputInterface $input, OutputInterface $output)
             $crawler->setDelayBetweenRequests($delay);
         }
 
+        if (!empty($this->config['options']['ignore_robotstxt'])) {
+            $crawler->ignoreRobots();
+        }
+
         $io->success('Starting crawl!');
 
         $crawler->startCrawling($baseUrl);
diff --git a/src/Crawler/Group/ElementFilter.php b/src/Crawler/Group/ElementFilter.php
new file mode 100644
index 00000000..0317d21a
--- /dev/null
+++ b/src/Crawler/Group/ElementFilter.php
@@ -0,0 +1,141 @@
+filter_type = NULL;
+
+    }//end __construct()
+
+
+    /**
+     * {@inheritdoc}
+     *
+     * Appends "-<filter_type>" to the parent ID when the most recent match()
+     * call captured a non-empty filter type.
+     */
+    public function getId() : string
+    {
+        $id = parent::getId();
+
+        if ($this->filter_type) {
+            $id .= "-{$this->filter_type}";
+        }
+
+        return $id;
+
+    }//end getId()
+
+
+    /**
+     * {@inheritdoc}
+     *
+     * Looks up the configured `selector` option in the response body, first
+     * as an XPath expression and then as a CSS selector. When at least one
+     * element is found, the `pattern` option (a PCRE pattern) is run against
+     * the `filter_attr` attribute (default `class`) of each element, and the
+     * first non-empty match is remembered as the filter type used by getId().
+     *
+     * A response with matching elements is accepted even when the pattern
+     * matches none of them; the filter type is then left unset.
+     *
+     * @param mixed             $url      URL of the response, passed to the
+     *                                    DOM crawler as the document URI.
+     * @param ResponseInterface $response Response whose body is inspected.
+     *
+     * @return bool FALSE when `selector` or `pattern` is not configured or no
+     *              element is found, TRUE otherwise.
+     */
+    public function match($url, ResponseInterface $response) : bool
+    {
+        // Drop the type captured for an earlier response so that getId()
+        // never reports a stale suffix.
+        $this->filter_type = NULL;
+
+        $selector = $this->getOption('selector');
+        $pattern = $this->getOption('pattern');
+
+        if (empty($selector) || empty($pattern)) {
+            return FALSE;
+        }
+
+        $filter_attr = $this->getOption('filter_attr') ?: 'class';
+        $dom = new Crawler($response->getBody()->__toString(), $url);
+
+        try {
+            $element = $dom->evaluate($selector);
+        } catch (\Exception $error) {
+            $element = NULL;
+        }
+
+        // evaluate() does not always return a Crawler (and may have thrown
+        // above), so anything but a populated Crawler falls back to treating
+        // the selector as CSS.
+        if (!($element instanceof Crawler) || $element->count() === 0) {
+            try {
+                $element = $dom->filter($selector);
+            } catch (\Exception $error) {
+                return FALSE;
+            }
+        }
+
+        if ($element->count() === 0) {
+            return FALSE;
+        }
+
+        $types = $element->each(
+            function(Crawler $node) use ($filter_attr, $pattern) {
+                // attr() yields NULL for a missing attribute; preg_match()
+                // must be given a string subject.
+                $value = (string) $node->attr($filter_attr);
+
+                if (preg_match($pattern, $value, $matches) !== 1) {
+                    return NULL;
+                }
+
+                return $matches[0];
+            }
+        );
+
+        // Keep the first non-empty capture instead of blindly taking the
+        // first element's result, which is empty when only a later element
+        // carries a matching attribute.
+        $types = array_filter($types);
+        $this->filter_type = $types ? reset($types) : NULL;
+
+        return TRUE;
+
+    }//end match()
+
+
+}//end class