Merge branch 'flathunters:main' into main
veldhaenchen authored Jan 31, 2024
2 parents cb5b5ec + 6677f4a commit f06d428
Showing 5 changed files with 27 additions and 16 deletions.
6 changes: 3 additions & 3 deletions flathunter/abstract_crawler.py
@@ -73,7 +73,7 @@ def get_soup_from_url(
elif re.search("g-recaptcha", driver.page_source):
self.resolve_recaptcha(
driver, checkbox, afterlogin_string or "")
-            return BeautifulSoup(driver.page_source, 'html.parser')
+            return BeautifulSoup(driver.page_source, 'lxml')

resp = requests.get(url, headers=self.HEADERS, timeout=30)
if resp.status_code not in (200, 405):
@@ -83,7 +83,7 @@ def get_soup_from_url(
logger.error("Got response (%i): %s\n%s",
resp.status_code, resp.content, user_agent)

-        return BeautifulSoup(resp.content, 'html.parser')
+        return BeautifulSoup(resp.content, 'lxml')

def get_soup_with_proxy(self, url) -> BeautifulSoup:
"""Will try proxies until it's possible to crawl and return a soup"""
@@ -124,7 +124,7 @@ def get_soup_with_proxy(self, url) -> BeautifulSoup:
raise ProxyException(
"An error occurred while fetching proxies or content")

-        return BeautifulSoup(resp.content, 'html.parser')
+        return BeautifulSoup(resp.content, 'lxml')

def extract_data(self, soup):
"""Should be implemented in subclass"""
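The change above (repeated in wggesucht.py and in the test below) swaps BeautifulSoup's bundled html.parser for lxml, which is faster and more forgiving of broken markup but depends on the third-party lxml package. A minimal sketch of the swap, assuming lxml is installed:

    # assumes `pip install lxml`; without it BeautifulSoup raises FeatureNotFound
    from bs4 import BeautifulSoup

    html = "<p>Hello<p>World"  # sloppy markup without closing tags
    soup = BeautifulSoup(html, "lxml")  # was: BeautifulSoup(html, "html.parser")
    print([p.text for p in soup.find_all("p")])  # -> ['Hello', 'World']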
9 changes: 9 additions & 0 deletions flathunter/chrome_wrapper.py
@@ -69,6 +69,15 @@ def get_chrome_driver(driver_arguments):
setattr(chrome_options, "headless", True)
driver = uc.Chrome(version_main=chrome_version, options=chrome_options) # pylint: disable=no-member

+    driver.execute_cdp_cmd(
+        "Network.setUserAgentOverride",
+        {
+            "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                         "AppleWebKit/537.36 (KHTML, like Gecko) "
+                         "Chrome/120.0.0.0 Safari/537.36"
+        },
+    )

driver.execute_cdp_cmd('Network.setBlockedURLs',
{"urls": ["https://api.geetest.com/get.*"]})
driver.execute_cdp_cmd('Network.enable', {})
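The added block pins a desktop Chrome user agent through the Chrome DevTools Protocol before any page is loaded. The same command works on any Selenium-driven Chrome, not just the undetected_chromedriver instance used here; a sketch (UA string and target URL are illustrative, not from the source):

    from selenium import webdriver

    driver = webdriver.Chrome()
    driver.execute_cdp_cmd(
        "Network.setUserAgentOverride",
        {"userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/120.0.0.0 Safari/537.36"},
    )
    driver.get("https://httpbin.org/user-agent")  # page now reports the override
    driver.quit()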
11 changes: 5 additions & 6 deletions flathunter/crawler/immobilienscout.py
@@ -4,7 +4,7 @@
import re

from bs4 import BeautifulSoup, Tag
-from jsonpath_ng import parse
+from jsonpath_ng.ext import parse
from selenium.common.exceptions import JavascriptException
from selenium.webdriver import Chrome

@@ -35,7 +35,9 @@ class Immobilienscout(Crawler):
URL_PATTERN = STATIC_URL_PATTERN

JSON_PATH_PARSER_ENTRIES = parse("$..['resultlist.realEstate']")
-    JSON_PATH_PARSER_IMAGES = parse("$..galleryAttachments..['@href']")
+    JSON_PATH_PARSER_IMAGES = parse("$..galleryAttachments"
+                                    "..attachment[?'@xsi.type'=='common:Picture']"
+                                    "..['@href'].`sub(/(.*\\\\.jpe?g).*/, \\\\1)`")

RESULT_LIMIT = 50

@@ -145,10 +147,7 @@ def extract_entry_from_javascript(self, entry):
#
# After: https://pictures.immobilienscout24.de/listings/$$IMAGE_ID$$.jpg

-        images = [
-            image.value[:image.value.find(".jpg") + 4]
-            for image in self.JSON_PATH_PARSER_IMAGES.find(entry)
-        ]
+        images = [image.value for image in self.JSON_PATH_PARSER_IMAGES.find(entry)]

object_id: int = int(entry.get("@id", 0))
return {
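Two related changes here: the import moves to jsonpath_ng.ext because only the extended grammar accepts the [?...] filter and the backtick sub() transform, and the Python-side URL truncation (image.value[:image.value.find(".jpg") + 4]) is dropped because the sub() call now strips the resize suffix inside the JSONPath itself. A sketch against invented data shaped roughly like the IS24 payload:

    # only jsonpath_ng.ext parses the [?...] filter; plain jsonpath_ng.parse
    # would reject it. The data below is made up for illustration.
    from jsonpath_ng.ext import parse

    entry = {"galleryAttachments": {"attachment": [
        {"@xsi.type": "common:Picture", "@href": "x.jpg/ORIG/resize/118x85%3E"},
        {"@xsi.type": "common:VideoAttachment", "@href": "clip.mp4"},
    ]}}

    pictures = parse("$..attachment[?'@xsi.type'=='common:Picture']..['@href']")
    print([m.value for m in pictures.find(entry)])  # expected: the .jpg href only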
15 changes: 9 additions & 6 deletions flathunter/crawler/wggesucht.py
@@ -148,12 +148,16 @@ def parse_expose_element_to_details(row: Tag, crawler: str) -> Optional[Dict]:


def liste_attribute_filter(element: Union[Tag, str]) -> bool:
"""Return true for elements whose 'id' attribute starts with 'liste-'"""
"""Return true for elements whose 'id' attribute starts with 'liste-'
and are not contained in the 'premium_user_extra_list' container"""
if not isinstance(element, Tag):
return False
if "id" not in element.attrs:
if not element.attrs or "id" not in element.attrs:
return False
return element.attrs["id"].startswith('liste-')
if not element.parent or not element.parent.attrs or "class" not in element.parent.attrs:
return False
return element.attrs["id"].startswith('liste-') and \
'premium_user_extra_list' not in element.parent.attrs["class"]


class WgGesucht(Crawler):
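The tightened filter is meant to be passed to BeautifulSoup's find_all, which calls it once per element; listings nested in a premium_user_extra_list container are now skipped along with attribute-less nodes. A self-contained sketch with made-up markup:

    from bs4 import BeautifulSoup, Tag

    def liste_attribute_filter(element):  # copy of the updated filter above
        if not isinstance(element, Tag):
            return False
        if not element.attrs or "id" not in element.attrs:
            return False
        if not element.parent or not element.parent.attrs or "class" not in element.parent.attrs:
            return False
        return element.attrs["id"].startswith('liste-') and \
            'premium_user_extra_list' not in element.parent.attrs["class"]

    html = ('<div class="row"><div id="liste-1"></div></div>'
            '<div class="premium_user_extra_list"><div id="liste-2"></div></div>')
    soup = BeautifulSoup(html, "lxml")
    print([t["id"] for t in soup.find_all(liste_attribute_filter)])  # -> ['liste-1']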
@@ -175,7 +179,6 @@ def extract_data(self, soup: BeautifulSoup):
e for e in findings
if isinstance(e, Tag) and e.has_attr('class') and not 'display-none' in e['class']
]
-
for row in existing_findings:
details = parse_expose_element_to_details(row, self.get_name())
if details is None:
@@ -230,5 +233,5 @@ def get_soup_from_url(
elif re.search("g-recaptcha", driver.page_source):
self.resolve_recaptcha(
driver, checkbox, afterlogin_string or "")
-            return BeautifulSoup(driver.page_source, 'html.parser')
-        return BeautifulSoup(resp.content, 'html.parser')
+            return BeautifulSoup(driver.page_source, 'lxml')
+        return BeautifulSoup(resp.content, 'lxml')
2 changes: 1 addition & 1 deletion test/crawler/test_crawl_wggesucht.py
@@ -31,7 +31,7 @@ def test(self):

def test_filter_spotahome_ads(self):
with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures", "wg-gesucht-spotahome.html")) as fixture:
-            soup = BeautifulSoup(fixture, 'html.parser')
+            soup = BeautifulSoup(fixture, 'lxml')
entries = self.crawler.extract_data(soup)
assert len(entries) == 20
