Merge branch 'flathunters:main' into main
veldhaenchen authored Jan 31, 2024
2 parents cb5b5ec + 6677f4a commit f06d428
Showing 5 changed files with 27 additions and 16 deletions.
6 changes: 3 additions & 3 deletions flathunter/abstract_crawler.py
@@ -73,7 +73,7 @@ def get_soup_from_url(
elif re.search("g-recaptcha", driver.page_source):
self.resolve_recaptcha(
driver, checkbox, afterlogin_string or "")
-            return BeautifulSoup(driver.page_source, 'html.parser')
+            return BeautifulSoup(driver.page_source, 'lxml')

resp = requests.get(url, headers=self.HEADERS, timeout=30)
if resp.status_code not in (200, 405):
@@ -83,7 +83,7 @@ def get_soup_from_url(
logger.error("Got response (%i): %s\n%s",
resp.status_code, resp.content, user_agent)

-        return BeautifulSoup(resp.content, 'html.parser')
+        return BeautifulSoup(resp.content, 'lxml')

def get_soup_with_proxy(self, url) -> BeautifulSoup:
"""Will try proxies until it's possible to crawl and return a soup"""
@@ -124,7 +124,7 @@ def get_soup_with_proxy(self, url) -> BeautifulSoup:
raise ProxyException(
"An error occurred while fetching proxies or content")

-        return BeautifulSoup(resp.content, 'html.parser')
+        return BeautifulSoup(resp.content, 'lxml')

def extract_data(self, soup):
"""Should be implemented in subclass"""
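The change above (repeated in wggesucht.py and in the test below) swaps BeautifulSoup's bundled html.parser for lxml, which is faster and more forgiving of broken markup but depends on the third-party lxml package. A minimal sketch of the swap, assuming lxml is installed:

    # assumes `pip install lxml`; without it BeautifulSoup raises FeatureNotFound
    from bs4 import BeautifulSoup

    html = "<p>Hello<p>World"  # sloppy markup without closing tags
    soup = BeautifulSoup(html, "lxml")  # was: BeautifulSoup(html, "html.parser")
    print([p.text for p in soup.find_all("p")])  # -> ['Hello', 'World']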
9 changes: 9 additions & 0 deletions flathunter/chrome_wrapper.py
@@ -69,6 +69,15 @@ def get_chrome_driver(driver_arguments):
setattr(chrome_options, "headless", True)
driver = uc.Chrome(version_main=chrome_version, options=chrome_options) # pylint: disable=no-member

+    driver.execute_cdp_cmd(
+        "Network.setUserAgentOverride",
+        {
+            "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                         "AppleWebKit/537.36 (KHTML, like Gecko) "
+                         "Chrome/120.0.0.0 Safari/537.36"
+        },
+    )

driver.execute_cdp_cmd('Network.setBlockedURLs',
{"urls": ["https://api.geetest.com/get.*"]})
driver.execute_cdp_cmd('Network.enable', {})
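The added block pins a desktop Chrome user agent through the Chrome DevTools Protocol before any page is loaded. The same command works on any Selenium-driven Chrome, not just the undetected_chromedriver instance used here; a sketch (UA string and target URL are illustrative, not from the source):

    from selenium import webdriver

    driver = webdriver.Chrome()
    driver.execute_cdp_cmd(
        "Network.setUserAgentOverride",
        {"userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/120.0.0.0 Safari/537.36"},
    )
    driver.get("https://httpbin.org/user-agent")  # page now reports the override
    driver.quit()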
11 changes: 5 additions & 6 deletions flathunter/crawler/immobilienscout.py
@@ -4,7 +4,7 @@
import re

from bs4 import BeautifulSoup, Tag
-from jsonpath_ng import parse
+from jsonpath_ng.ext import parse
from selenium.common.exceptions import JavascriptException
from selenium.webdriver import Chrome

@@ -35,7 +35,9 @@ class Immobilienscout(Crawler):
URL_PATTERN = STATIC_URL_PATTERN

JSON_PATH_PARSER_ENTRIES = parse("$..['resultlist.realEstate']")
-    JSON_PATH_PARSER_IMAGES = parse("$..galleryAttachments..['@href']")
+    JSON_PATH_PARSER_IMAGES = parse("$..galleryAttachments"
+                                    "..attachment[?'@xsi.type'=='common:Picture']"
+                                    "..['@href'].`sub(/(.*\\\\.jpe?g).*/, \\\\1)`")

RESULT_LIMIT = 50

@@ -145,10 +147,7 @@ def extract_entry_from_javascript(self, entry):
#
# After: https://pictures.immobilienscout24.de/listings/$$IMAGE_ID$$.jpg

-        images = [
-            image.value[:image.value.find(".jpg") + 4]
-            for image in self.JSON_PATH_PARSER_IMAGES.find(entry)
-        ]
+        images = [image.value for image in self.JSON_PATH_PARSER_IMAGES.find(entry)]

object_id: int = int(entry.get("@id", 0))
return {
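Two related changes here: the import moves to jsonpath_ng.ext because only the extended grammar accepts the [?...] filter and the backtick sub() transform, and the Python-side URL truncation (image.value[:image.value.find(".jpg") + 4]) is dropped because the sub() call now strips the resize suffix inside the JSONPath itself. A sketch against invented data shaped roughly like the IS24 payload:

    # only jsonpath_ng.ext parses the [?...] filter; plain jsonpath_ng.parse
    # would reject it. The data below is made up for illustration.
    from jsonpath_ng.ext import parse

    entry = {"galleryAttachments": {"attachment": [
        {"@xsi.type": "common:Picture", "@href": "x.jpg/ORIG/resize/118x85%3E"},
        {"@xsi.type": "common:VideoAttachment", "@href": "clip.mp4"},
    ]}}

    pictures = parse("$..attachment[?'@xsi.type'=='common:Picture']..['@href']")
    print([m.value for m in pictures.find(entry)])  # expected: the .jpg href only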
15 changes: 9 additions & 6 deletions flathunter/crawler/wggesucht.py
@@ -148,12 +148,16 @@ def parse_expose_element_to_details(row: Tag, crawler: str) -> Optional[Dict]:


def liste_attribute_filter(element: Union[Tag, str]) -> bool:
"""Return true for elements whose 'id' attribute starts with 'liste-'"""
"""Return true for elements whose 'id' attribute starts with 'liste-'
and are not contained in the 'premium_user_extra_list' container"""
if not isinstance(element, Tag):
return False
if "id" not in element.attrs:
if not element.attrs or "id" not in element.attrs:
return False
return element.attrs["id"].startswith('liste-')
if not element.parent or not element.parent.attrs or "class" not in element.parent.attrs:
return False
return element.attrs["id"].startswith('liste-') and \
'premium_user_extra_list' not in element.parent.attrs["class"]


class WgGesucht(Crawler):
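The tightened filter is meant to be passed to BeautifulSoup's find_all, which calls it once per element; listings nested in a premium_user_extra_list container are now skipped along with attribute-less nodes. A self-contained sketch with made-up markup:

    from bs4 import BeautifulSoup, Tag

    def liste_attribute_filter(element):  # copy of the updated filter above
        if not isinstance(element, Tag):
            return False
        if not element.attrs or "id" not in element.attrs:
            return False
        if not element.parent or not element.parent.attrs or "class" not in element.parent.attrs:
            return False
        return element.attrs["id"].startswith('liste-') and \
            'premium_user_extra_list' not in element.parent.attrs["class"]

    html = ('<div class="row"><div id="liste-1"></div></div>'
            '<div class="premium_user_extra_list"><div id="liste-2"></div></div>')
    soup = BeautifulSoup(html, "lxml")
    print([t["id"] for t in soup.find_all(liste_attribute_filter)])  # -> ['liste-1']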
@@ -175,7 +179,6 @@ def extract_data(self, soup: BeautifulSoup):
e for e in findings
if isinstance(e, Tag) and e.has_attr('class') and not 'display-none' in e['class']
]
-
for row in existing_findings:
details = parse_expose_element_to_details(row, self.get_name())
if details is None:
@@ -230,5 +233,5 @@ def get_soup_from_url(
elif re.search("g-recaptcha", driver.page_source):
self.resolve_recaptcha(
driver, checkbox, afterlogin_string or "")
-            return BeautifulSoup(driver.page_source, 'html.parser')
-        return BeautifulSoup(resp.content, 'html.parser')
+            return BeautifulSoup(driver.page_source, 'lxml')
+        return BeautifulSoup(resp.content, 'lxml')
2 changes: 1 addition & 1 deletion test/crawler/test_crawl_wggesucht.py
@@ -31,7 +31,7 @@ def test(self):

def test_filter_spotahome_ads(self):
with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures", "wg-gesucht-spotahome.html")) as fixture:
-            soup = BeautifulSoup(fixture, 'html.parser')
+            soup = BeautifulSoup(fixture, 'lxml')
entries = self.crawler.extract_data(soup)
assert len(entries) == 20
