diff --git a/.gitignore b/.gitignore index 7111f6e..4b721d2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +Pipfile +Pipfile.lock *.pyc /build /*.egg-info diff --git a/scrapy_frontera/converters.py b/scrapy_frontera/converters.py index d1a2244..5afa9f9 100644 --- a/scrapy_frontera/converters.py +++ b/scrapy_frontera/converters.py @@ -39,6 +39,7 @@ def to_frontier(self, scrapy_request): statevars = self.spider.crawler.settings.getlist('FRONTERA_SCHEDULER_STATE_ATTRIBUTES', []) meta = { b'scrapy_callback': cb, + b'scrapy_cb_kwargs': scrapy_request.cb_kwargs, b'scrapy_errback': eb, b'scrapy_meta': scrapy_request.meta, b'scrapy_body': scrapy_request.body, diff --git a/scrapy_frontera/scheduler.py b/scrapy_frontera/scheduler.py index 22f0f19..74c56cb 100644 --- a/scrapy_frontera/scheduler.py +++ b/scrapy_frontera/scheduler.py @@ -22,8 +22,11 @@ def from_crawler(cls, crawler): return obj def next_request(self): - if not self.has_pending_requests(): - self._get_requests_from_backend() + try: + if not self.has_pending_requests(): + self._get_requests_from_backend() + except Exception as e: + LOG.warning(f"Exception while getting requests from frontier: {e!r}") return super(FronteraScheduler, self).next_request() def is_frontera_request(self, request): diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index a7092e7..fda5e4f 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -3,12 +3,15 @@ from twisted.trial.unittest import TestCase from twisted.internet import defer +from frontera.core.components import Queue as FronteraQueue from scrapy import Request, Spider from scrapy.http import Response from scrapy.settings import Settings from scrapy.utils.test import get_crawler from scrapy.crawler import CrawlerRunner +from scrapy_frontera.converters import FrontierRequest + TEST_SETTINGS = { 'SCHEDULER': 
'scrapy_frontera.scheduler.FronteraScheduler', @@ -83,6 +86,33 @@ def parse(self, response): yield Request('http://example2.com') +class TestSpider4(Spider): + name = 'test' + success = [] + + def start_requests(self): + yield Request('http://example.com') + + def parse(self, response): + if response.url == response.request.url: + self.success.append(response.url) + + +class TestSpider5(Spider): + name = 'test' + success = [] + frontera_settings = { + 'AUTO_START': False + } + + def start_requests(self): + yield Request('http://example.com') + + def parse(self, response): + if response.url == response.request.url: + self.success.append(response.url) + + class TestDownloadHandler: results = [] @@ -110,10 +140,10 @@ def tearDown(self): @defer.inlineCallbacks def test_start_requests(self): - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() - mocked_handler.return_value.set_results([Response(url='http://example.com'), - Response(url='http://example2.com')]) + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() + mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com'), + Response(url='http://example2.com')]) with patch('frontera.contrib.backends.memory.MemoryBaseBackend.links_extracted') as mocked_links_extracted: mocked_links_extracted.return_value = None @@ -126,11 +156,53 @@ def test_start_requests(self): self.assertTrue(crawler.spider.success2) mocked_links_extracted.assert_not_called() + @defer.inlineCallbacks + def test_next_requests(self): + """ + Test default logic: frontier requests are obtained/scheduled before start requests + """ + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() + 
mocked_handler.from_crawler.return_value.set_results([Response(url='http://example2.com'), + Response(url='http://example.com')]) + + with patch('frontera.contrib.backends.memory.MemoryDequeQueue.get_next_requests') as mocked_get_next_requests,\ + patch('frontera.contrib.backends.memory.MemoryDequeQueue.count') as mocked_count: + mocked_get_next_requests.side_effect = [[FrontierRequest(url='http://example2.com')]] + mocked_count.side_effect = [1] * 2 + settings = Settings() + settings.setdict(TEST_SETTINGS, priority='cmdline') + crawler = get_crawler(TestSpider4, settings) + + yield self.runner.crawl(crawler) + self.assertEqual(crawler.spider.success, ['http://example2.com', 'http://example.com']) + + @defer.inlineCallbacks + def test_next_requests_not_autostart(self): + """ + Test that frontier requests are still obtained/scheduled before start requests when AUTO_START is disabled + """ + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() + mocked_handler.from_crawler.return_value.set_results([Response(url='http://example2.com'), + Response(url='http://example.com')]) + + with patch('frontera.contrib.backends.memory.MemoryDequeQueue.get_next_requests') as mocked_get_next_requests,\ + patch('frontera.contrib.backends.memory.MemoryDequeQueue.count') as mocked_count: + mocked_get_next_requests.side_effect = [[FrontierRequest(url='http://example2.com')]] + mocked_count.side_effect = [1] * 2 + settings = Settings() + settings.setdict(TEST_SETTINGS, priority='cmdline') + crawler = get_crawler(TestSpider5, settings) + + yield self.runner.crawl(crawler) + self.assertEqual(crawler.spider.success, ['http://example2.com', 'http://example.com']) + @defer.inlineCallbacks def test_cf_store(self): - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() - 
mocked_handler.return_value.set_results([Response(url='http://example.com', body=b'cf_store')]) + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() + mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com', body=b'cf_store')]) with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule: mocked_schedule.return_value = None @@ -144,9 +216,9 @@ def test_cf_store(self): @defer.inlineCallbacks def test_callback_requests_to_frontier(self): - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() - mocked_handler.return_value.set_results([Response(url='http://example.com')]) + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() + mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com')]) with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule: mocked_schedule.return_value = None @@ -164,9 +236,9 @@ def test_callback_requests_to_frontier(self): @defer.inlineCallbacks def test_callback_requests_to_frontier_with_implicit_callback(self): - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() - mocked_handler.return_value.set_results([Response(url='http://example.com'), + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() + mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com'), Response(url='http://example2.com')]) with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule: @@ -184,11 
+256,11 @@ def test_callback_requests_to_frontier_with_implicit_callback(self): @defer.inlineCallbacks def test_callback_requests_slot_map(self): - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() resp1 = Response(url='http://example.com') resp2 = Response(url='http://example2.com') - mocked_handler.return_value.set_results([resp1, resp2]) + mocked_handler.from_crawler.return_value.set_results([resp1, resp2]) with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule: mocked_schedule.return_value = None @@ -209,11 +281,11 @@ def test_callback_requests_slot_map(self): @defer.inlineCallbacks def test_callback_requests_slot_map_with_num_slots(self): - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() resp1 = Response(url='http://example.com') resp2 = Response(url='http://example2.com') - mocked_handler.return_value.set_results([resp1, resp2]) + mocked_handler.from_crawler.return_value.set_results([resp1, resp2]) with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule: mocked_schedule.return_value = None @@ -235,9 +307,9 @@ def test_callback_requests_slot_map_with_num_slots(self): @defer.inlineCallbacks def test_start_requests_to_frontier(self): - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() - mocked_handler.return_value.set_results([Response(url='http://example.com'), + with 
patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() + mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com'), Response(url='http://example2.com')]) settings = Settings() @@ -253,8 +325,8 @@ def test_start_requests_to_frontier(self): @defer.inlineCallbacks def test_start_requests_to_frontier_ii(self): - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() with patch('frontera.contrib.backends.memory.MemoryBaseBackend.add_seeds') as mocked_add_seeds: mocked_add_seeds.return_value = None @@ -271,9 +343,9 @@ def test_start_requests_to_frontier_ii(self): @defer.inlineCallbacks def test_start_handle_errback(self): - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() - mocked_handler.return_value.set_results([Response(url='http://example.com'), + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() + mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com'), Response(url='http://example2.com', status=501), Response(url='http://example3.com')]) @@ -292,9 +364,9 @@ def test_start_handle_errback_with_cf_store(self): """ Test that we get the expected result with errback cf_store """ - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() - mocked_handler.return_value.set_results([Response(url='http://example.com'), + with 
patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() + mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com'), Response(url='http://example2.com', status=501, body=b'cf_store'), Response(url='http://example3.com')]) @@ -313,9 +385,9 @@ def test_start_handle_errback_with_cf_store_ii(self): """ Test that we scheduled cf_store request on backend queue """ - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() - mocked_handler.return_value.set_results([Response(url='http://example.com'), + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() + mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com'), Response(url='http://example2.com', status=501, body=b'cf_store'), Response(url='http://example3.com')])