diff --git a/mcmetadata/__init__.py b/mcmetadata/__init__.py
index 86e2f6d..f4bdf47 100644
--- a/mcmetadata/__init__.py
+++ b/mcmetadata/__init__.py
@@ -119,6 +119,12 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: O
     language_duration = time.monotonic() - t1
     stats_accumulator['language'] += language_duration
 
+    # canonical url
+    if 'canonical_url' in overrides:
+        canonical_url = overrides['canonical_url']
+    else:
+        canonical_url = article.get('canonical_url')
+
     total_duration = time.monotonic() - t0
     stats_accumulator['total'] += total_duration
 
@@ -128,6 +134,7 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: O
         normalized_url=normalized_url,
         unique_url_hash=urls.unique_url_hash(final_url),
         canonical_domain=canonical_domain,
+        canonical_url=canonical_url,
         publication_date=pub_date,
         language=full_language[:2] if full_language else full_language,  # keep this as a two-letter code, like "en"
         full_language=full_language,  # could be a full region language code, like "en-AU"
diff --git a/mcmetadata/content.py b/mcmetadata/content.py
index 7ad5f53..fa3cdd3 100644
--- a/mcmetadata/content.py
+++ b/mcmetadata/content.py
@@ -29,7 +29,6 @@
                              safe_attrs_only=False)
 readability.readability.html_cleaner = everything_cleaner
 
-
 METHOD_NEWSPAPER_3k = 'newspaper3k'
 METHOD_GOOSE_3 = 'goose3'
 METHOD_BEAUTIFUL_SOUP_4 = 'beautifulsoup4'
@@ -109,6 +108,7 @@ def extract(self, url, html_text: str, include_metadata: bool = False):
             'url': url,
             'text': doc.text,
             'title': doc.title,
+            'canonical_url': doc.canonical_link,
             'potential_publish_date': doc.publish_date,
             'top_image_url': doc.top_image,
             'authors': doc.authors,
@@ -125,6 +125,7 @@ def extract(self, url, html_text: str, include_metadata: bool = False):
             'url': url,
             'text': g3_article.cleaned_text,
             'title': g3_article.title,
+            'canonical_url': g3_article.canonical_link,
             'potential_publish_date': g3_article.publish_date,
             'top_image_url': g3_article.top_image.src if g3_article.top_image else None,
             'authors': g3_article.authors,
@@ -142,6 +143,7 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):
             'url': url,
             'text': bp_doc.content,
             'title': bp_doc.title,
+            'canonical_url': None,
             'potential_publish_date': None,
             'top_image_url': None,
             'authors': None,
@@ -152,7 +154,7 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):
         pass
 
 
-# Trafilatura outputs images in teh raw text in Markdown format
+# Trafilatura outputs images in the raw text in Markdown format
 markdown_img_path_pattern = re.compile(r"!\[[^\]]*\]\((.*?)\)")
 
 
@@ -174,6 +176,7 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):
             'url': url,
             'text': text,
             'title': results['title'],
+            'canonical_url': results['url'],  # Warning: This will not work with Trafilatura v1.11.* and later
             'potential_publish_date': dateparser.parse(results['date']),
             'top_image_url': image_urls[0] if len(image_urls) > 0 else None,
             'authors': results['author'].split(',') if results['author'] else None,
@@ -190,6 +193,7 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):
             'url': url,
             'text': strip_tags(doc.summary()),  # remove any tags that readability leaves in place (links)
             'title': doc.title(),
+            'canonical_url': None,
             'potential_publish_date': None,
             'top_image_url': None,
             'authors': None,
@@ -201,7 +205,6 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):
 
 
 class RawHtmlExtractor(AbstractExtractor):
-
     REMOVE_LIST = {'[document]', 'noscript', 'header', 'html', 'meta', 'head', 'input', 'script', 'style'}
 
     def __init__(self):
@@ -214,10 +217,18 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):
         for t in text:
             if t.parent.name not in self.REMOVE_LIST:
                 output += '{} '.format(t)
+
+        can_url = None
+        if can_link := soup.find("link", rel="canonical"):
+            can_url = can_link.get("href")
+        elif can_link := soup.find('meta', attrs={'property': 'og:url'}):
+            can_url = can_link.get("content")
+
         self.content = {
             'url': url,
             'text': output,
             'title': None,
+            'canonical_url': can_url,
             'potential_publish_date': None,
             'top_image_url': None,
             'authors': None,
@@ -245,10 +256,17 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):
         circular = fromstring(tostring(parsed))
         content_string = tostring(cleaner.clean_html(circular))
 
+        can_url = None
+        if can_link := parsed.xpath("//link[@rel='canonical']/@href"):
+            can_url = can_link[0]
+        elif can_link := parsed.xpath("//meta[@property='og:url']/@content"):
+            can_url = can_link[0]
+
         self.content = {
             "url": url,
             'text': content_string,
             'title': None,
+            'canonical_url': can_url,
             'potential_publish_date': None,
             'top_image_url': None,
             'authors': None,
diff --git a/mcmetadata/test/test_extract.py b/mcmetadata/test/test_extract.py
index 20cc798..3a939b1 100644
--- a/mcmetadata/test/test_extract.py
+++ b/mcmetadata/test/test_extract.py
@@ -145,6 +145,7 @@ def test_overrides(self):
         overrides = dict(
             text_content="This is some text",
             article_title="This is a title",
+            canonical_url="https://www.example.com/",
             language="pt",
             publication_date=dt.date(2023, 1, 1)
         )
@@ -160,6 +161,7 @@ def test_overrides(self):
         assert results['article_title'] == overrides['article_title']
         assert results['language'] == overrides['language']
         assert results['publication_date'] == overrides['publication_date']
+        assert results['canonical_url'] == overrides['canonical_url']
 
     def test_default_title(self):
         # throws too short error if no default
@@ -181,6 +183,17 @@ def test_default_pub_date(self):
         results = extract("https://www.example.com", html_text=html, defaults=defaults)
         assert results['publication_date'] == defaults['publication_date']
 
+    def test_canonical_url(self):
+        canonical_url = 'https://www.example.com/sample-page'
+        # extract from page with <link rel="canonical"> as per: https://developers.google.com/search/docs/crawling-indexing/consolidate-duplicate-urls
+        html = f"<html><head><title>Sample Content</title><link rel='canonical' href='{canonical_url}'/></head><body><p>sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf</p><p>Copyright 2024</p></body></html>"
+        results = extract("https://www.example.com", html_text=html)
+        assert results['canonical_url'] == canonical_url
+        # extract from page with <meta property="og:url"> as per: https://developers.facebook.com/docs/sharing/webmasters/getting-started/versioned-link/
+        html = f"<html><head><title>Sample Content</title><meta property='og:url' content='{canonical_url}'/></head><body><p>sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf</p><p>Copyright 2024</p></body></html>"
+        results = extract("https://www.example.com", html_text=html)
+        assert results['canonical_url'] == canonical_url
+
 
 
 class TestStats(unittest.TestCase):
diff --git a/mcmetadata/urls.py b/mcmetadata/urls.py
index badcce2..8950b47 100644
--- a/mcmetadata/urls.py
+++ b/mcmetadata/urls.py
@@ -43,7 +43,7 @@ def _is_suffix_only_parsed_url(parsed_url) -> bool:
 def canonical_domain(raw_url: str) -> str:
     """
     Return a useful canonical domain name given a url. In general this is the logical unique part of the domain.
-    However, to support news-based media research, this takes into account a list of exceptinos where this isn't the
+    However, to support news-based media research, this takes into account a list of exceptions where this isn't the
     case (wordpress.com, substack.com, etc). This also handles Google AMP domains appropriately.
     Created by James O'Toole with input from Emily Ndulue, Linas Valiukas, Anissa Piere, and Fernando Bermejo.
     :param raw_url: the full URL to extract a unique domain from
@@ -295,7 +295,7 @@ def normalize_url(url: str) -> Optional[str]:
 def is_homepage_url(raw_url: str) -> bool:
     """Returns true if URL is a homepage-like URL (ie. not an article)."""
     url = raw_url.strip()  # remove whitespace
-    if is_shortened_url(url):  # if it is shortened than it should get a free pass becasue we have to resolve it later
+    if is_shortened_url(url):  # if it is shortened then it should get a free pass because we have to resolve it later
         return False
     uri = furl(url)
     for homepage_url_path_regex in HOMEPAGE_URL_PATH_REGEXES: