Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Start requests order #7

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
Pipfile
Pipfile.lock
*.pyc
/build
/*.egg-info
Expand Down
1 change: 1 addition & 0 deletions scrapy_frontera/converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def to_frontier(self, scrapy_request):
statevars = self.spider.crawler.settings.getlist('FRONTERA_SCHEDULER_STATE_ATTRIBUTES', [])
meta = {
b'scrapy_callback': cb,
b'scrapy_cb_kwargs': scrapy_request.cb_kwargs,
b'scrapy_errback': eb,
b'scrapy_meta': scrapy_request.meta,
b'scrapy_body': scrapy_request.body,
Expand Down
7 changes: 5 additions & 2 deletions scrapy_frontera/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,11 @@ def from_crawler(cls, crawler):
return obj

def next_request(self):
    """Return the next request for the downloader, or ``None``.

    Before delegating to the base scheduler, refill the local queues from
    the frontier backend when nothing is pending locally.  Any backend
    failure is logged and swallowed so a transient frontier outage does
    not break the crawl loop.

    Note: this span previously interleaved the pre-change body with the
    try/except version; it is merged here into the post-change code.
    """
    try:
        if not self.has_pending_requests():
            self._get_requests_from_backend()
    except Exception as e:
        # Best-effort refill: requests already queued locally can still
        # be served even if the frontier is temporarily unreachable.
        LOG.warning(f"Exception while getting requests from frontier: {e!r}")
    return super(FronteraScheduler, self).next_request()

def is_frontera_request(self, request):
Expand Down
Empty file added tests/__init__.py
Empty file.
138 changes: 105 additions & 33 deletions tests/test_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@
from twisted.trial.unittest import TestCase
from twisted.internet import defer

from frontera.core.components import Queue as FronteraQueue
from scrapy import Request, Spider
from scrapy.http import Response
from scrapy.settings import Settings
from scrapy.utils.test import get_crawler
from scrapy.crawler import CrawlerRunner

from scrapy_frontera.converters import FrontierRequest


TEST_SETTINGS = {
'SCHEDULER': 'scrapy_frontera.scheduler.FronteraScheduler',
Expand Down Expand Up @@ -83,6 +86,33 @@ def parse(self, response):
yield Request('http://example2.com')


class TestSpider4(Spider):
    """Spider that records each response whose URL equals its request URL.

    Used to observe the order in which responses are processed; collected
    URLs are kept on a class-level list so they remain readable after the
    crawl finishes.
    """
    name = 'test'
    success = []  # class-level on purpose: inspected after the crawl

    def start_requests(self):
        yield Request('http://example.com')

    def parse(self, response):
        url = response.url
        if url == response.request.url:
            self.success.append(url)


class TestSpider5(Spider):
    """Same recording spider as TestSpider4, but with frontier AUTO_START
    disabled via per-spider frontera settings.
    """
    name = 'test'
    success = []  # class-level on purpose: inspected after the crawl
    # Frontier manager is not started automatically for this spider.
    frontera_settings = {'AUTO_START': False}

    def start_requests(self):
        yield Request('http://example.com')

    def parse(self, response):
        url = response.url
        if url == response.request.url:
            self.success.append(url)


class TestDownloadHandler:

results = []
Expand Down Expand Up @@ -110,10 +140,10 @@ def tearDown(self):

@defer.inlineCallbacks
def test_start_requests(self):
with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
mocked_handler.return_value = TestDownloadHandler()
mocked_handler.return_value.set_results([Response(url='http://example.com'),
Response(url='http://example2.com')])
with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler:
mocked_handler.from_crawler.return_value = TestDownloadHandler()
mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com'),
Response(url='http://example2.com')])

with patch('frontera.contrib.backends.memory.MemoryBaseBackend.links_extracted') as mocked_links_extracted:
mocked_links_extracted.return_value = None
Expand All @@ -126,11 +156,53 @@ def test_start_requests(self):
self.assertTrue(crawler.spider.success2)
mocked_links_extracted.assert_not_called()

@defer.inlineCallbacks
def test_next_requests(self):
    """
    Test default logic: frontier requests are obtained/scheduled before start requests
    """
    # Fake download handler: results are listed in the order downloads are
    # expected to happen — frontier URL (example2) first, start URL second.
    with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler:
        mocked_handler.from_crawler.return_value = TestDownloadHandler()
        mocked_handler.from_crawler.return_value.set_results([Response(url='http://example2.com'),
                                                             Response(url='http://example.com')])

        # Backend queue mocked to hand out exactly one frontier request;
        # count() is mocked to answer two calls (side_effect list of two).
        with patch('frontera.contrib.backends.memory.MemoryDequeQueue.get_next_requests') as mocked_get_next_requests,\
                patch('frontera.contrib.backends.memory.MemoryDequeQueue.count') as mocked_count:
            mocked_get_next_requests.side_effect = [[FrontierRequest(url='http://example2.com')]]
            mocked_count.side_effect = [1] * 2
            settings = Settings()
            settings.setdict(TEST_SETTINGS, priority='cmdline')
            crawler = get_crawler(TestSpider4, settings)

            yield self.runner.crawl(crawler)
            # The frontier request must have been processed before the
            # spider's start request.
            self.assertEqual(crawler.spider.success, ['http://example2.com', 'http://example.com'])

@defer.inlineCallbacks
def test_next_requests_not_autostart(self):
    """
    Same ordering check as test_next_requests, but with frontier AUTO_START
    disabled (TestSpider5): frontier requests are still obtained/scheduled
    before start requests.
    """
    # Fake download handler: results are listed in the order downloads are
    # expected to happen — frontier URL (example2) first, start URL second.
    with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler:
        mocked_handler.from_crawler.return_value = TestDownloadHandler()
        mocked_handler.from_crawler.return_value.set_results([Response(url='http://example2.com'),
                                                             Response(url='http://example.com')])

        # Backend queue mocked to hand out exactly one frontier request;
        # count() is mocked to answer two calls (side_effect list of two).
        with patch('frontera.contrib.backends.memory.MemoryDequeQueue.get_next_requests') as mocked_get_next_requests,\
                patch('frontera.contrib.backends.memory.MemoryDequeQueue.count') as mocked_count:
            mocked_get_next_requests.side_effect = [[FrontierRequest(url='http://example2.com')]]
            mocked_count.side_effect = [1] * 2
            settings = Settings()
            settings.setdict(TEST_SETTINGS, priority='cmdline')
            crawler = get_crawler(TestSpider5, settings)

            yield self.runner.crawl(crawler)
            # The frontier request must have been processed before the
            # spider's start request.
            self.assertEqual(crawler.spider.success, ['http://example2.com', 'http://example.com'])

@defer.inlineCallbacks
def test_cf_store(self):
with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
mocked_handler.return_value = TestDownloadHandler()
mocked_handler.return_value.set_results([Response(url='http://example.com', body=b'cf_store')])
with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler:
mocked_handler.from_crawler.return_value = TestDownloadHandler()
mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com', body=b'cf_store')])

with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
mocked_schedule.return_value = None
Expand All @@ -144,9 +216,9 @@ def test_cf_store(self):

@defer.inlineCallbacks
def test_callback_requests_to_frontier(self):
with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
mocked_handler.return_value = TestDownloadHandler()
mocked_handler.return_value.set_results([Response(url='http://example.com')])
with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler:
mocked_handler.from_crawler.return_value = TestDownloadHandler()
mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com')])

with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
mocked_schedule.return_value = None
Expand All @@ -164,9 +236,9 @@ def test_callback_requests_to_frontier(self):

@defer.inlineCallbacks
def test_callback_requests_to_frontier_with_implicit_callback(self):
with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
mocked_handler.return_value = TestDownloadHandler()
mocked_handler.return_value.set_results([Response(url='http://example.com'),
with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler:
mocked_handler.from_crawler.return_value = TestDownloadHandler()
mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com'),
Response(url='http://example2.com')])

with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
Expand All @@ -184,11 +256,11 @@ def test_callback_requests_to_frontier_with_implicit_callback(self):

@defer.inlineCallbacks
def test_callback_requests_slot_map(self):
with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
mocked_handler.return_value = TestDownloadHandler()
with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler:
mocked_handler.from_crawler.return_value = TestDownloadHandler()
resp1 = Response(url='http://example.com')
resp2 = Response(url='http://example2.com')
mocked_handler.return_value.set_results([resp1, resp2])
mocked_handler.from_crawler.return_value.set_results([resp1, resp2])

with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
mocked_schedule.return_value = None
Expand All @@ -209,11 +281,11 @@ def test_callback_requests_slot_map(self):

@defer.inlineCallbacks
def test_callback_requests_slot_map_with_num_slots(self):
with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
mocked_handler.return_value = TestDownloadHandler()
with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler:
mocked_handler.from_crawler.return_value = TestDownloadHandler()
resp1 = Response(url='http://example.com')
resp2 = Response(url='http://example2.com')
mocked_handler.return_value.set_results([resp1, resp2])
mocked_handler.from_crawler.return_value.set_results([resp1, resp2])

with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
mocked_schedule.return_value = None
Expand All @@ -235,9 +307,9 @@ def test_callback_requests_slot_map_with_num_slots(self):

@defer.inlineCallbacks
def test_start_requests_to_frontier(self):
with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
mocked_handler.return_value = TestDownloadHandler()
mocked_handler.return_value.set_results([Response(url='http://example.com'),
with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler:
mocked_handler.from_crawler.return_value = TestDownloadHandler()
mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com'),
Response(url='http://example2.com')])

settings = Settings()
Expand All @@ -253,8 +325,8 @@ def test_start_requests_to_frontier(self):

@defer.inlineCallbacks
def test_start_requests_to_frontier_ii(self):
with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
mocked_handler.return_value = TestDownloadHandler()
with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler:
mocked_handler.from_crawler.return_value = TestDownloadHandler()

with patch('frontera.contrib.backends.memory.MemoryBaseBackend.add_seeds') as mocked_add_seeds:
mocked_add_seeds.return_value = None
Expand All @@ -271,9 +343,9 @@ def test_start_requests_to_frontier_ii(self):

@defer.inlineCallbacks
def test_start_handle_errback(self):
with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
mocked_handler.return_value = TestDownloadHandler()
mocked_handler.return_value.set_results([Response(url='http://example.com'),
with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler:
mocked_handler.from_crawler.return_value = TestDownloadHandler()
mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com'),
Response(url='http://example2.com', status=501),
Response(url='http://example3.com')])

Expand All @@ -292,9 +364,9 @@ def test_start_handle_errback_with_cf_store(self):
"""
Test that we get the expected result with errback cf_store
"""
with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
mocked_handler.return_value = TestDownloadHandler()
mocked_handler.return_value.set_results([Response(url='http://example.com'),
with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler:
mocked_handler.from_crawler.return_value = TestDownloadHandler()
mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com'),
Response(url='http://example2.com', status=501, body=b'cf_store'),
Response(url='http://example3.com')])

Expand All @@ -313,9 +385,9 @@ def test_start_handle_errback_with_cf_store_ii(self):
"""
Test that we scheduled cf_store request on backend queue
"""
with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
mocked_handler.return_value = TestDownloadHandler()
mocked_handler.return_value.set_results([Response(url='http://example.com'),
with patch('scrapy.core.downloader.handlers.http.HTTPDownloadHandler') as mocked_handler:
mocked_handler.from_crawler.return_value = TestDownloadHandler()
mocked_handler.from_crawler.return_value.set_results([Response(url='http://example.com'),
Response(url='http://example2.com', status=501, body=b'cf_store'),
Response(url='http://example3.com')])

Expand Down