Skip to content

Commit

Permalink
Merge branch 'master' into add-support-to-shein
Browse files Browse the repository at this point in the history
  • Loading branch information
Crinibus committed Jan 12, 2024
2 parents 940b286 + 88fe8be commit 1b4e3ae
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 21 deletions.
2 changes: 1 addition & 1 deletion scraper/add_product.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def add_products(categories: list[str], urls: list[str]) -> None:
def add_product(category: str, url: str) -> None:
logger = logging.getLogger(__name__)

website_name = get_website_name(url)
website_name = get_website_name(url, keep_subdomain=False)

if website_name not in SUPPORTED_DOMAINS.keys():
raise WebsiteNotSupported(website_name)
Expand Down
51 changes: 41 additions & 10 deletions scraper/domains.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,16 +310,11 @@ def _get_product_name(self) -> str:

def _get_product_price(self) -> float:
if self.soup_url.split("/")[3] == "itm":
price = float(
self.request_data.find("div", class_="x-price-primary")
.text
.replace("US $", "")
)
price = float(self.request_data.find("div", class_="x-price-primary").text.replace("US $", ""))
else:
price = float(
self.request_data.find("div", class_="x-price-primary")
.text
.replace("DKK ", "")
.text.replace("DKK ", "")
.replace("$", "")
.replace(",", "")
)
Expand Down Expand Up @@ -535,19 +530,55 @@ def get_short_url(self) -> str:
return self.url


def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False) -> str:
class Shein(BaseWebsiteHandler):
    """Website handler for shein.com product pages.

    All product metadata is read from the page's embedded JSON-LD
    structured-data block instead of being scraped out of the HTML markup.
    """

    def _get_common_data(self) -> None:
        # The product's structured data lives in the *second*
        # application/ld+json script tag on the page.
        ld_json_tags = self.request_data.find_all("script", type="application/ld+json")
        self.script_json = json.loads(ld_json_tags[1].text)

    def _get_product_name(self) -> str:
        return self.script_json.get("name")

    def _get_product_price(self) -> float:
        offers = self.script_json.get("offers")
        return float(offers.get("price"))

    def _get_product_currency(self) -> str:
        offers = self.script_json.get("offers")
        return offers.get("priceCurrency")

    def _get_product_id(self) -> str:
        return self.script_json.get("sku")

    def get_short_url(self) -> str:
        # The stored URL is returned unchanged.
        return self.url


def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True) -> str:
    """Reduce *url* to a short website name, e.g. "https://www.amazon.com/" -> "amazon".

    Args:
        url: The URL to reduce, e.g. "https://www.sub.example.com/page".
        keep_tld: Keep the trailing TLD label (".com", ".dk", ...).
        keep_http: Keep the leading scheme ("https://" or "http://").
        keep_www: Keep a leading "www.".
        keep_subdomain: Keep subdomain labels (e.g. "sub" in "sub.example.com").

    Returns:
        The reduced website name, e.g. "example" or "https://example".
    """
    # Drop the scheme unless the caller wants it kept.
    stripped_url = url if keep_http else url.removeprefix("https://").removeprefix("http://")

    # Drop a leading "www." — when the scheme was kept, "www." is not a
    # string prefix, so remove the first occurrence instead.
    if not keep_www and keep_http:
        stripped_url = stripped_url.replace("www.", "", 1)
    elif not keep_www:
        stripped_url = stripped_url.removeprefix("www.")

    # Keep only the host part (plus the scheme when keep_http is true).
    domain = "/".join(stripped_url.split("/")[0:3]) if keep_http else stripped_url.split("/")[0]

    # Remove the TLD/DNS name (such as ".com") if keep_tld is false
    website_name_list = domain.split(".") if keep_tld else domain.split(".")[:-1]

    # Remove subdomain if keep_subdomain is false
    if not keep_subdomain and len(website_name_list) > 1:
        # Re-run on the bare domain to determine which leading labels are subdomains.
        subdomain_and_domain = get_website_name(domain, keep_subdomain=True)
        subdomains = subdomain_and_domain.split(".")[:-1]

        # remove subdomains
        website_name_list = [elem for elem in website_name_list if elem not in subdomains]

    website_name = ".".join(website_name_list)
    return website_name


def get_website_handler(url: str) -> BaseWebsiteHandler:
website_name = get_website_name(url).lower()
website_name = get_website_name(url, keep_subdomain=False).lower()

website_handler = SUPPORTED_DOMAINS.get(website_name, None)

Expand Down
45 changes: 35 additions & 10 deletions tests/test_domains.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,44 @@
from dataclasses import dataclass
import pytest

from scraper.domains import get_website_name


@dataclass
class UrlSetting:
    """Keyword-argument bundle for get_website_name, used by parametrized tests."""

    # Field defaults mirror the parameter defaults of scraper.domains.get_website_name.
    keep_tld: bool = False
    keep_http: bool = False
    keep_www: bool = False
    keep_subdomain: bool = True


# (url, settings, expected website name) cases for test_get_website_name.
test_websites = [
    ("https://www.amazon.com/", UrlSetting(), "amazon"),
    ("https://www.komplett.dk/", UrlSetting(), "komplett"),
    ("https://www.av-cables.dk/", UrlSetting(), "av-cables"),
    ("https://nowww.com/", UrlSetting(), "nowww"),
    ("https://no-ending-slash.com", UrlSetting(), "no-ending-slash"),
    ("https://www.test.testing.com/", UrlSetting(), "test.testing"),
    ("https://www.test.hello.com/hello/world", UrlSetting(), "test.hello"),
    ("https://sub.main.com", UrlSetting(keep_subdomain=False), "main"),
    ("https://www.sub.main.com", UrlSetting(keep_subdomain=False), "main"),
    ("https://main.com", UrlSetting(keep_subdomain=False), "main"),
    ("https://main.com", UrlSetting(keep_http=True, keep_subdomain=False), "https://main"),
    ("https://www.main.com", UrlSetting(keep_http=True, keep_subdomain=False), "https://main"),
    ("https://www.main.com/", UrlSetting(keep_http=True, keep_subdomain=False), "https://main"),
    ("https://www.sub.main.com/", UrlSetting(keep_http=True), "https://sub.main"),
    ("https://www.sub.main.com/", UrlSetting(keep_http=True, keep_www=True), "https://www.sub.main"),
    ("https://www.sub.main.com/", UrlSetting(keep_http=True, keep_www=True, keep_subdomain=False), "https://www.main"),
]


@pytest.mark.parametrize("url,setting,expected", test_websites)
def test_get_website_name(url: str, setting: UrlSetting, expected: str) -> None:
    """get_website_name honors every UrlSetting flag combination."""
    result = get_website_name(
        url,
        keep_tld=setting.keep_tld,
        keep_http=setting.keep_http,
        keep_www=setting.keep_www,
        keep_subdomain=setting.keep_subdomain,
    )
    assert result == expected

0 comments on commit 1b4e3ae

Please sign in to comment.