From 4118e0adaea659647c659c8e8eb11c5474eddc12 Mon Sep 17 00:00:00 2001 From: Crinibus <57172157+Crinibus@users.noreply.github.com> Date: Tue, 9 Jan 2024 23:35:54 +0100 Subject: [PATCH 1/7] Update function 'get_website_name' - add optional parameter 'keep_subdomain' Fix stripping of 'www.' for some URLs when keep_www is false --- scraper/domains.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/scraper/domains.py b/scraper/domains.py index 68fa14bf..93298acc 100644 --- a/scraper/domains.py +++ b/scraper/domains.py @@ -514,13 +514,23 @@ def get_short_url(self) -> str: return f"{website}/{id}" -def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False) -> str: +def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True) -> str: stripped_url = url if keep_http else url.removeprefix("https://").removeprefix("http://") - stripped_url = stripped_url if keep_www else stripped_url.replace("www.", "", 1) + + if not keep_www and keep_http: + stripped_url = stripped_url.replace("www.", "", 1) + elif not keep_www: + stripped_url = stripped_url.removeprefix("www.") + domain = "/".join(stripped_url.split("/")[0:3]) if keep_http else stripped_url.split("/")[0] # Remove the TLD/DNS name (such as ".com") if keep_tld is false website_name_list = domain.split(".") if keep_tld else domain.split(".")[:-1] + + # Remove subdomain if keep_subdomain is false + if not keep_subdomain and len(website_name_list) > 1: + website_name_list = website_name_list[1:] + website_name = ".".join(website_name_list) return website_name From effad9a6e72a1ef1aa33eda555837aa0f76b084e Mon Sep 17 00:00:00 2001 From: Crinibus <57172157+Crinibus@users.noreply.github.com> Date: Tue, 9 Jan 2024 23:38:08 +0100 Subject: [PATCH 2/7] Update test_domains.py - reflect changes to get_website_name and add more test cases --- tests/test_domains.py | 57 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 10 deletions(-) diff 
--git a/tests/test_domains.py b/tests/test_domains.py index 212fa0cf..eb0e9b8e 100644 --- a/tests/test_domains.py +++ b/tests/test_domains.py @@ -1,19 +1,56 @@ +from dataclasses import dataclass import pytest from scraper.domains import get_website_name + +@dataclass +class UrlSetting: + keep_tld: bool + keep_http: bool + keep_www: bool + keep_subdomain: bool + + test_websites = [ - ("https://www.amazon.com/", "amazon"), - ("https://www.komplett.dk/", "komplett"), - ("https://www.av-cables.dk/", "av-cables"), - ("https://nowww.com/", "nowww"), - ("https://no-ending-slash.com", "no-ending-slash"), - ("https://www.test.testing.com/", "test.testing"), - ("https://www.test.hello.com/hello/world", "test.hello"), + ("https://www.amazon.com/", UrlSetting(keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True), "amazon"), + ("https://www.komplett.dk/", UrlSetting(keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True), "komplett"), + ( + "https://www.av-cables.dk/", + UrlSetting(keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True), + "av-cables", + ), + ("https://nowww.com/", UrlSetting(keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True), "nowww"), + ( + "https://no-ending-slash.com", + UrlSetting(keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True), + "no-ending-slash", + ), + ( + "https://www.test.testing.com/", + UrlSetting(keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True), + "test.testing", + ), + ( + "https://www.test.hello.com/hello/world", + UrlSetting(keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True), + "test.hello", + ), + ("https://sub.main.com", UrlSetting(keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=False), "main"), + ("https://www.sub.main.com", UrlSetting(keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=False), "main"), + ("https://main.com", UrlSetting(keep_tld=False, keep_http=False, keep_www=False, 
keep_subdomain=False), "main"), + ("https://main.com", UrlSetting(keep_tld=False, keep_http=True, keep_www=False, keep_subdomain=False), "https://main"), + ("https://www.main.com", UrlSetting(keep_tld=False, keep_http=True, keep_www=False, keep_subdomain=False), "https://main"), ] -@pytest.mark.parametrize("url,expected", test_websites) -def test_get_website_name(url, expected) -> None: - result = get_website_name(url) +@pytest.mark.parametrize("url,setting,expected", test_websites) +def test_get_website_name(url: str, setting: UrlSetting, expected: str) -> None: + result = get_website_name( + url, + keep_tld=setting.keep_tld, + keep_http=setting.keep_http, + keep_www=setting.keep_www, + keep_subdomain=setting.keep_subdomain, + ) assert result == expected From 723db6ad8fd3ccdf83eca797f7b409c4c7c90751 Mon Sep 17 00:00:00 2001 From: Crinibus <57172157+Crinibus@users.noreply.github.com> Date: Tue, 9 Jan 2024 23:38:39 +0100 Subject: [PATCH 3/7] Ignore subdomain in function 'get_website_handler' --- scraper/domains.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scraper/domains.py b/scraper/domains.py index 93298acc..330518b5 100644 --- a/scraper/domains.py +++ b/scraper/domains.py @@ -536,7 +536,7 @@ def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False, def get_website_handler(url: str) -> BaseWebsiteHandler: - website_name = get_website_name(url).lower() + website_name = get_website_name(url, keep_subdomain=False).lower() website_handler = SUPPORTED_DOMAINS.get(website_name, None) From 3d1cf8fffec958f96d45482e0eff97d24ac5bf42 Mon Sep 17 00:00:00 2001 From: Crinibus <57172157+Crinibus@users.noreply.github.com> Date: Tue, 9 Jan 2024 23:39:01 +0100 Subject: [PATCH 4/7] Ignore subdomain in function 'add_product' --- scraper/add_product.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scraper/add_product.py b/scraper/add_product.py index 16a54c78..28041885 100644 --- a/scraper/add_product.py +++ 
b/scraper/add_product.py @@ -21,7 +21,7 @@ def add_products(categories: list[str], urls: list[str]) -> None: def add_product(category: str, url: str) -> None: logger = logging.getLogger(__name__) - website_name = get_website_name(url) + website_name = get_website_name(url, keep_subdomain=False) if website_name not in SUPPORTED_DOMAINS.keys(): raise WebsiteNotSupported(website_name) From 5a6c12948c552b6c1cc4c1c4d3f71bdc635d57ee Mon Sep 17 00:00:00 2001 From: Crinibus <57172157+Crinibus@users.noreply.github.com> Date: Fri, 12 Jan 2024 22:37:54 +0100 Subject: [PATCH 5/7] Set defaults in dataclass 'UrlSetting' --- tests/test_domains.py | 48 +++++++++++++++---------------------------- 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/tests/test_domains.py b/tests/test_domains.py index eb0e9b8e..c74265eb 100644 --- a/tests/test_domains.py +++ b/tests/test_domains.py @@ -6,41 +6,25 @@ @dataclass class UrlSetting: - keep_tld: bool - keep_http: bool - keep_www: bool - keep_subdomain: bool + keep_tld: bool = False + keep_http: bool = False + keep_www: bool = False + keep_subdomain: bool = True test_websites = [ - ("https://www.amazon.com/", UrlSetting(keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True), "amazon"), - ("https://www.komplett.dk/", UrlSetting(keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True), "komplett"), - ( - "https://www.av-cables.dk/", - UrlSetting(keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True), - "av-cables", - ), - ("https://nowww.com/", UrlSetting(keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True), "nowww"), - ( - "https://no-ending-slash.com", - UrlSetting(keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True), - "no-ending-slash", - ), - ( - "https://www.test.testing.com/", - UrlSetting(keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True), - "test.testing", - ), - ( - "https://www.test.hello.com/hello/world", - 
UrlSetting(keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=True), - "test.hello", - ), - ("https://sub.main.com", UrlSetting(keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=False), "main"), - ("https://www.sub.main.com", UrlSetting(keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=False), "main"), - ("https://main.com", UrlSetting(keep_tld=False, keep_http=False, keep_www=False, keep_subdomain=False), "main"), - ("https://main.com", UrlSetting(keep_tld=False, keep_http=True, keep_www=False, keep_subdomain=False), "https://main"), - ("https://www.main.com", UrlSetting(keep_tld=False, keep_http=True, keep_www=False, keep_subdomain=False), "https://main"), + ("https://www.amazon.com/", UrlSetting(), "amazon"), + ("https://www.komplett.dk/", UrlSetting(), "komplett"), + ("https://www.av-cables.dk/", UrlSetting(), "av-cables"), + ("https://nowww.com/", UrlSetting(), "nowww"), + ("https://no-ending-slash.com", UrlSetting(), "no-ending-slash"), + ("https://www.test.testing.com/", UrlSetting(), "test.testing"), + ("https://www.test.hello.com/hello/world", UrlSetting(), "test.hello"), + ("https://sub.main.com", UrlSetting(keep_subdomain=False), "main"), + ("https://www.sub.main.com", UrlSetting(keep_subdomain=False), "main"), + ("https://main.com", UrlSetting(keep_subdomain=False), "main"), + ("https://main.com", UrlSetting(keep_http=True, keep_subdomain=False), "https://main"), + ("https://www.main.com", UrlSetting(keep_http=True, keep_subdomain=False), "https://main"), ] From 72a486c5a8edfd107cf0b20f7983e7c3289ccd4e Mon Sep 17 00:00:00 2001 From: Crinibus <57172157+Crinibus@users.noreply.github.com> Date: Fri, 12 Jan 2024 22:38:14 +0100 Subject: [PATCH 6/7] Add tests to test_domains --- tests/test_domains.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_domains.py b/tests/test_domains.py index c74265eb..a2f1a1e6 100644 --- a/tests/test_domains.py +++ b/tests/test_domains.py @@ -25,6 +25,8 @@ class UrlSetting: 
("https://main.com", UrlSetting(keep_subdomain=False), "main"), ("https://main.com", UrlSetting(keep_http=True, keep_subdomain=False), "https://main"), ("https://www.main.com", UrlSetting(keep_http=True, keep_subdomain=False), "https://main"), + ("https://www.main.com/", UrlSetting(keep_http=True, keep_subdomain=False), "https://main"), + ("https://www.sub.main.com/", UrlSetting(keep_http=True), "https://sub.main"), ] From 08696711fbe1a2b05a2d4af84c74cbf8e814afb3 Mon Sep 17 00:00:00 2001 From: Crinibus <57172157+Crinibus@users.noreply.github.com> Date: Fri, 12 Jan 2024 23:34:24 +0100 Subject: [PATCH 7/7] Fix function 'get_website_name' Fix issue when keep_http=True, keep_www=True, keep_subdomain=False and url=https://www.sub.main.com/ --- scraper/domains.py | 7 ++++++- tests/test_domains.py | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/scraper/domains.py b/scraper/domains.py index 330518b5..2dec2f68 100644 --- a/scraper/domains.py +++ b/scraper/domains.py @@ -529,7 +529,12 @@ def get_website_name(url: str, keep_tld=False, keep_http=False, keep_www=False, # Remove subdomain if keep_subdomain is false if not keep_subdomain and len(website_name_list) > 1: - website_name_list = website_name_list[1:] + subdomain_and_domain = get_website_name(domain, keep_subdomain=True) + subdomains = subdomain_and_domain.split(".")[:-1] + + website_name_list_copy = website_name_list.copy() + # remove subdomains + website_name_list = [elem for elem in website_name_list_copy if elem not in subdomains] website_name = ".".join(website_name_list) return website_name diff --git a/tests/test_domains.py b/tests/test_domains.py index a2f1a1e6..eae87961 100644 --- a/tests/test_domains.py +++ b/tests/test_domains.py @@ -27,6 +27,8 @@ class UrlSetting: ("https://www.main.com", UrlSetting(keep_http=True, keep_subdomain=False), "https://main"), ("https://www.main.com/", UrlSetting(keep_http=True, keep_subdomain=False), "https://main"), ("https://www.sub.main.com/", 
UrlSetting(keep_http=True), "https://sub.main"), + ("https://www.sub.main.com/", UrlSetting(keep_http=True, keep_www=True), "https://www.sub.main"), + ("https://www.sub.main.com/", UrlSetting(keep_http=True, keep_www=True, keep_subdomain=False), "https://www.main"), ]