Merge pull request #80 from tharropoulos/wait-for-render
Enhance scraper reliability with improved page load detection
jasonbosco authored Feb 4, 2025
2 parents b0e5837 + 6254eb7 commit 27407d7
Showing 1 changed file with 14 additions and 10 deletions.
scraper/src/custom_downloader_middleware.py (14 additions, 10 deletions)
@@ -3,9 +3,11 @@
 """
 
 import time
+from urllib.parse import unquote_plus, urlparse
 
 from scrapy.http import HtmlResponse
-from urllib.parse import urlparse, unquote_plus
+from selenium.common.exceptions import TimeoutException
+from selenium.webdriver.support.ui import WebDriverWait
 
 
 class CustomDownloaderMiddleware:
@@ -27,15 +29,17 @@ def process_request(self, request, spider):
 
         self.driver.get(unquote_plus(
             request.url))  # Decode url otherwise firefox is not happy. Ex /#%21/ => /#!/
-        time.sleep(spider.js_wait)
-        body = self.driver.page_source.encode('utf-8')
-        url = self.driver.current_url
 
-        return HtmlResponse(
-            url=url,
-            body=body,
-            encoding='utf8'
-        )
+        try:
+            # Wait for DOM ready
+            WebDriverWait(self.driver, 10).until(
+                lambda d: d.execute_script("return document.readyState") == "complete"
+            )
+        except TimeoutException:
+            time.sleep(spider.js_wait)
+
+        body = self.driver.page_source.encode("utf-8")
+        return HtmlResponse(url=self.driver.current_url, body=body, encoding="utf8")
 
     def process_response(self, request, response, spider):
         # Since scrapy uses start_urls and stop_urls before creating the request
@@ -47,7 +51,7 @@ def process_response(self, request, response, spider):
         url_without_params = o.scheme + "://" + o.netloc + o.path
         response = response.replace(url=url_without_params)
 
-        if response.url == request.url + '#':
+        if response.url == request.url + "#":
             response = response.replace(url=request.url)
 
         return response
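
The heart of this change is the readiness wait added to process_request: instead of always sleeping for a fixed js_wait, the middleware polls document.readyState and only falls back to the fixed delay on timeout. A minimal standalone sketch of the same pattern, assuming a Firefox WebDriver and a hypothetical js_wait of a couple of seconds (the real middleware takes this value from the spider):

import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Firefox()  # assumption: the scraper drives Firefox
driver.get("https://example.com")  # placeholder URL

js_wait = 2  # hypothetical fallback delay (seconds), mirroring spider.js_wait

try:
    # Poll for up to 10 seconds until the browser reports the DOM fully loaded.
    WebDriverWait(driver, 10).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )
except TimeoutException:
    # Readiness was never reported: fall back to the old fixed delay.
    time.sleep(js_wait)

body = driver.page_source.encode("utf-8")
driver.quit()

On pages that load quickly, this returns as soon as readyState is "complete" rather than always paying the full js_wait, which is where the reliability and speed gains come from.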
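
For context, the URL cleanup in process_response (untouched here apart from quote style) drops query parameters by rebuilding the URL from its parsed components. The helper name and example URL below are illustrative only:

from urllib.parse import urlparse

def strip_query_params(url: str) -> str:
    # Rebuild the URL from scheme, host, and path, dropping any ?query string.
    o = urlparse(url)
    return o.scheme + "://" + o.netloc + o.path

print(strip_query_params("https://docs.example.com/guide?utm_source=x"))
# -> https://docs.example.com/guide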
