From bad31afbdaa7acbcb88cbd9060ba6d998c129cb4 Mon Sep 17 00:00:00 2001 From: Martin Olveyra Date: Thu, 6 May 2021 16:44:07 -0300 Subject: [PATCH 1/3] fix tests in order to work with recent version of scrapy --- .gitignore | 2 ++ tests/__init__.py | 0 tests/test_scheduler.py | 66 ++++++++++++++++++++--------------------- 3 files changed, 35 insertions(+), 33 deletions(-) create mode 100644 tests/__init__.py diff --git a/.gitignore b/.gitignore index 7111f6e..4b721d2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +Pipfile +Pipfile.lock *.pyc /build /*.egg-info diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index a7092e7..d2de33d 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -110,10 +110,10 @@ def tearDown(self): @defer.inlineCallbacks def test_start_requests(self): - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() - mocked_handler.return_value.set_results([Response(url='http://example.com'), - Response(url='http://example2.com')]) + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() + mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com'), + Response(url='http://example2.com')]) with patch('frontera.contrib.backends.memory.MemoryBaseBackend.links_extracted') as mocked_links_extracted: mocked_links_extracted.return_value = None @@ -128,9 +128,9 @@ def test_start_requests(self): @defer.inlineCallbacks def test_cf_store(self): - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() - mocked_handler.return_value.set_results([Response(url='http://example.com', body=b'cf_store')]) + with 
patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() + mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com', body=b'cf_store')]) with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule: mocked_schedule.return_value = None @@ -144,9 +144,9 @@ def test_cf_store(self): @defer.inlineCallbacks def test_callback_requests_to_frontier(self): - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() - mocked_handler.return_value.set_results([Response(url='http://example.com')]) + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() + mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com')]) with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule: mocked_schedule.return_value = None @@ -164,9 +164,9 @@ def test_callback_requests_to_frontier(self): @defer.inlineCallbacks def test_callback_requests_to_frontier_with_implicit_callback(self): - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() - mocked_handler.return_value.set_results([Response(url='http://example.com'), + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() + mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com'), Response(url='http://example2.com')]) with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule: @@ -184,11 +184,11 @@ def test_callback_requests_to_frontier_with_implicit_callback(self): @defer.inlineCallbacks 
def test_callback_requests_slot_map(self): - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() resp1 = Response(url='http://example.com') resp2 = Response(url='http://example2.com') - mocked_handler.return_value.set_results([resp1, resp2]) + mocked_handler.from_crawler.return_value.set_results([resp1, resp2]) with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule: mocked_schedule.return_value = None @@ -209,11 +209,11 @@ def test_callback_requests_slot_map(self): @defer.inlineCallbacks def test_callback_requests_slot_map_with_num_slots(self): - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() resp1 = Response(url='http://example.com') resp2 = Response(url='http://example2.com') - mocked_handler.return_value.set_results([resp1, resp2]) + mocked_handler.from_crawler.return_value.set_results([resp1, resp2]) with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule: mocked_schedule.return_value = None @@ -235,9 +235,9 @@ def test_callback_requests_slot_map_with_num_slots(self): @defer.inlineCallbacks def test_start_requests_to_frontier(self): - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() - mocked_handler.return_value.set_results([Response(url='http://example.com'), + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + 
mocked_handler.from_crawler.return_value = TestDownloadHandler() + mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com'), Response(url='http://example2.com')]) settings = Settings() @@ -253,8 +253,8 @@ def test_start_requests_to_frontier(self): @defer.inlineCallbacks def test_start_requests_to_frontier_ii(self): - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() with patch('frontera.contrib.backends.memory.MemoryBaseBackend.add_seeds') as mocked_add_seeds: mocked_add_seeds.return_value = None @@ -271,9 +271,9 @@ def test_start_requests_to_frontier_ii(self): @defer.inlineCallbacks def test_start_handle_errback(self): - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() - mocked_handler.return_value.set_results([Response(url='http://example.com'), + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() + mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com'), Response(url='http://example2.com', status=501), Response(url='http://example3.com')]) @@ -292,9 +292,9 @@ def test_start_handle_errback_with_cf_store(self): """ Test that we get the expected result with errback cf_store """ - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() - mocked_handler.return_value.set_results([Response(url='http://example.com'), + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = 
TestDownloadHandler() + mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com'), Response(url='http://example2.com', status=501, body=b'cf_store'), Response(url='http://example3.com')]) @@ -313,9 +313,9 @@ def test_start_handle_errback_with_cf_store_ii(self): """ Test that we scheduled cf_store request on backend queue """ - with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler: - mocked_handler.return_value = TestDownloadHandler() - mocked_handler.return_value.set_results([Response(url='http://example.com'), + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() + mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com'), Response(url='http://example2.com', status=501, body=b'cf_store'), Response(url='http://example3.com')]) From 19080926fe6d967a5fb3187868d7cffd5093348d Mon Sep 17 00:00:00 2001 From: Martin Olveyra Date: Fri, 7 May 2021 11:16:44 -0300 Subject: [PATCH 2/3] added next_requests() test and check default ordering of scheduling --- scrapy_frontera/scheduler.py | 7 +++++-- tests/test_scheduler.py | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/scrapy_frontera/scheduler.py b/scrapy_frontera/scheduler.py index 22f0f19..74c56cb 100644 --- a/scrapy_frontera/scheduler.py +++ b/scrapy_frontera/scheduler.py @@ -22,8 +22,11 @@ def from_crawler(cls, crawler): return obj def next_request(self): - if not self.has_pending_requests(): - self._get_requests_from_backend() + try: + if not self.has_pending_requests(): + self._get_requests_from_backend() + except Exception as e: + LOG.warning(f"Exception while getting requests from frontier: {e!r}") return super(FronteraScheduler, self).next_request() def is_frontera_request(self, request): diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index 
d2de33d..7f7fcaa 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -3,12 +3,15 @@ from twisted.trial.unittest import TestCase from twisted.internet import defer +from frontera.core.components import Queue as FronteraQueue from scrapy import Request, Spider from scrapy.http import Response from scrapy.settings import Settings from scrapy.utils.test import get_crawler from scrapy.crawler import CrawlerRunner +from scrapy_frontera.converters import FrontierRequest + TEST_SETTINGS = { 'SCHEDULER': 'scrapy_frontera.scheduler.FronteraScheduler', @@ -83,6 +86,18 @@ def parse(self, response): yield Request('http://example2.com') +class TestSpider4(Spider): + name = 'test' + success = [] + + def start_requests(self): + yield Request('http://example.com') + + def parse(self, response): + if response.url == response.request.url: + self.success.append(response.url) + + class TestDownloadHandler: results = [] @@ -126,6 +141,27 @@ def test_start_requests(self): self.assertTrue(crawler.spider.success2) mocked_links_extracted.assert_not_called() + @defer.inlineCallbacks + def test_next_requests(self): + """ + Test default logic: frontier requests are obtained/scheduled before start requests + """ + with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() + mocked_handler.from_crawler.return_value.set_results([Response(url='http://example2.com'), + Response(url='http://example.com')]) + + with patch('frontera.contrib.backends.memory.MemoryDequeQueue.get_next_requests') as mocked_get_next_requests,\ + patch('frontera.contrib.backends.memory.MemoryDequeQueue.count') as mocked_count: + mocked_get_next_requests.side_effect = [[FrontierRequest(url='http://example2.com')]] + mocked_count.side_effect = [1] * 2 + settings = Settings() + settings.setdict(TEST_SETTINGS, priority='cmdline') + crawler = get_crawler(TestSpider4, settings) + + yield self.runner.crawl(crawler) + 
self.assertEqual(crawler.spider.success, ['http://example2.com', 'http://example.com']) + @defer.inlineCallbacks def test_cf_store(self): with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: From bce4e00da50cac1844a8a222c78842114d384205 Mon Sep 17 00:00:00 2001 From: Martin Olveyra Date: Tue, 8 Jun 2021 14:07:12 -0300 Subject: [PATCH 3/3] temp commit --- scrapy_frontera/converters.py | 1 + tests/test_scheduler.py | 36 +++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/scrapy_frontera/converters.py b/scrapy_frontera/converters.py index d1a2244..5afa9f9 100644 --- a/scrapy_frontera/converters.py +++ b/scrapy_frontera/converters.py @@ -39,6 +39,7 @@ def to_frontier(self, scrapy_request): statevars = self.spider.crawler.settings.getlist('FRONTERA_SCHEDULER_STATE_ATTRIBUTES', []) meta = { b'scrapy_callback': cb, + b'scrapy_cb_kwargs': scrapy_request.cb_kwargs, b'scrapy_errback': eb, b'scrapy_meta': scrapy_request.meta, b'scrapy_body': scrapy_request.body, diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index 7f7fcaa..fda5e4f 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -98,6 +98,21 @@ def parse(self, response): self.success.append(response.url) +class TestSpider5(Spider): + name = 'test' + success = [] + frontera_settings = { + 'AUTO_START': False + } + + def start_requests(self): + yield Request('http://example.com') + + def parse(self, response): + if response.url == response.request.url: + self.success.append(response.url) + + class TestDownloadHandler: results = [] @@ -162,6 +177,27 @@ def test_next_requests(self): yield self.runner.crawl(crawler) self.assertEqual(crawler.spider.success, ['http://example2.com', 'http://example.com']) + @defer.inlineCallbacks + def test_next_requests_not_autostart(self): + """ + Test with AUTO_START disabled: frontier requests are still obtained/scheduled before start requests + """ + with
patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: + mocked_handler.from_crawler.return_value = TestDownloadHandler() + mocked_handler.from_crawler.return_value.set_results([Response(url='http://example2.com'), + Response(url='http://example.com')]) + + with patch('frontera.contrib.backends.memory.MemoryDequeQueue.get_next_requests') as mocked_get_next_requests,\ + patch('frontera.contrib.backends.memory.MemoryDequeQueue.count') as mocked_count: + mocked_get_next_requests.side_effect = [[FrontierRequest(url='http://example2.com')]] + mocked_count.side_effect = [1] * 2 + settings = Settings() + settings.setdict(TEST_SETTINGS, priority='cmdline') + crawler = get_crawler(TestSpider5, settings) + + yield self.runner.crawl(crawler) + self.assertEqual(crawler.spider.success, ['http://example2.com', 'http://example.com']) + @defer.inlineCallbacks def test_cf_store(self): with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler: