Merge pull request #80 from tharropoulos/wait-for-render
Enhance scraper reliability with improved page load detection
jasonbosco authored Feb 4, 2025
2 parents b0e5837 + 6254eb7 commit 27407d7
Showing 1 changed file with 14 additions and 10 deletions.
scraper/src/custom_downloader_middleware.py (14 additions, 10 deletions)
@@ -3,9 +3,11 @@
 """
 
 import time
+from urllib.parse import unquote_plus, urlparse
 
 from scrapy.http import HtmlResponse
-from urllib.parse import urlparse, unquote_plus
+from selenium.common.exceptions import TimeoutException
+from selenium.webdriver.support.ui import WebDriverWait
 
 
 class CustomDownloaderMiddleware:
@@ -27,15 +29,17 @@ def process_request(self, request, spider):
 
         self.driver.get(unquote_plus(
             request.url))  # Decode url otherwise firefox is not happy. Ex /#%21/ => /#!/
-        time.sleep(spider.js_wait)
-        body = self.driver.page_source.encode('utf-8')
-        url = self.driver.current_url
 
-        return HtmlResponse(
-            url=url,
-            body=body,
-            encoding='utf8'
-        )
+        try:
+            # Wait for DOM ready
+            WebDriverWait(self.driver, 10).until(
+                lambda d: d.execute_script("return document.readyState") == "complete"
+            )
+        except TimeoutException:
+            time.sleep(spider.js_wait)
+
+        body = self.driver.page_source.encode("utf-8")
+        return HtmlResponse(url=self.driver.current_url, body=body, encoding="utf8")
 
     def process_response(self, request, response, spider):
         # Since scrapy uses start_urls and stop_urls before creating the request
@@ -47,7 +51,7 @@ def process_response(self, request, response, spider):
         url_without_params = o.scheme + "://" + o.netloc + o.path
         response = response.replace(url=url_without_params)
 
-        if response.url == request.url + '#':
+        if response.url == request.url + "#":
             response = response.replace(url=request.url)
 
         return response
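
The heart of this change is the readiness wait added to process_request: instead of always sleeping for a fixed js_wait, the middleware polls document.readyState and only falls back to the fixed delay on timeout. A minimal standalone sketch of the same pattern, assuming a Firefox WebDriver and a hypothetical js_wait of a couple of seconds (the real middleware takes this value from the spider):

import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Firefox()  # assumption: the scraper drives Firefox
driver.get("https://example.com")  # placeholder URL

js_wait = 2  # hypothetical fallback delay (seconds), mirroring spider.js_wait

try:
    # Poll for up to 10 seconds until the browser reports the DOM fully loaded.
    WebDriverWait(driver, 10).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )
except TimeoutException:
    # Readiness was never reported: fall back to the old fixed delay.
    time.sleep(js_wait)

body = driver.page_source.encode("utf-8")
driver.quit()

On pages that load quickly, this returns as soon as readyState is "complete" rather than always paying the full js_wait, which is where the reliability and speed gains come from.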
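
For context, the URL cleanup in process_response (untouched here apart from quote style) drops query parameters by rebuilding the URL from its parsed components. The helper name and example URL below are illustrative only:

from urllib.parse import urlparse

def strip_query_params(url: str) -> str:
    # Rebuild the URL from scheme, host, and path, dropping any ?query string.
    o = urlparse(url)
    return o.scheme + "://" + o.netloc + o.path

print(strip_query_params("https://docs.example.com/guide?utm_source=x"))
# -> https://docs.example.com/guide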
