Merge pull request #92 from m453h/feature-canonical-url-values
Enable extraction of canonical link information
m453h authored Oct 11, 2024
2 parents 58594e0 + fca7b81 commit b69a588
Showing 4 changed files with 43 additions and 5 deletions.
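
For context, a minimal sketch of how a caller could read the new field once this change is installed. The URL and HTML below are illustrative placeholders, and the page body must be long enough to pass the extractor's minimum-content check:

from mcmetadata import extract

# Hypothetical article page that declares its canonical location in <head>.
html = (
    "<html><head><link rel='canonical' href='https://example.com/story'/></head>"
    "<body><h1>Sample</h1><p>... long enough article text ...</p></body></html>"
)

results = extract("https://example.com/story?utm_source=feed", html_text=html)
# Expected to be 'https://example.com/story' when one of the content extractors
# reports the hint; otherwise the field may be empty or None.
print(results["canonical_url"])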
7 changes: 7 additions & 0 deletions mcmetadata/__init__.py
@@ -119,6 +119,12 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: O
language_duration = time.monotonic() - t1
stats_accumulator['language'] += language_duration

# canonical url
if 'canonical_url' in overrides:
canonical_url = overrides['canonical_url']
else:
canonical_url = article.get('canonical_url')

total_duration = time.monotonic() - t0
stats_accumulator['total'] += total_duration

@@ -128,6 +134,7 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: O
normalized_url=normalized_url,
unique_url_hash=urls.unique_url_hash(final_url),
canonical_domain=canonical_domain,
canonical_url=canonical_url,
publication_date=pub_date,
language=full_language[:2] if full_language else full_language, # keep this as a two-letter code, like "en"
full_language=full_language, # could be a full region language code, like "en-AU"
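As the hunk above shows, an explicit value in overrides takes precedence over whatever the content extractor reports, and article.get('canonical_url') is used otherwise. A small sketch of that behaviour, with illustrative values and the same length caveat as above:

from mcmetadata import extract

html = (
    "<html><head><link rel='canonical' href='https://example.com/story'/></head>"
    "<body><p>... long enough article text ...</p></body></html>"
)

# The override wins even though the page itself declares a different canonical link.
results = extract("https://example.com/story", html_text=html,
                  overrides={"canonical_url": "https://www.example.com/"})
assert results["canonical_url"] == "https://www.example.com/"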
24 changes: 21 additions & 3 deletions mcmetadata/content.py
@@ -29,7 +29,6 @@
safe_attrs_only=False)
readability.readability.html_cleaner = everything_cleaner


METHOD_NEWSPAPER_3k = 'newspaper3k'
METHOD_GOOSE_3 = 'goose3'
METHOD_BEAUTIFUL_SOUP_4 = 'beautifulsoup4'
@@ -109,6 +108,7 @@ def extract(self, url, html_text: str, include_metadata: bool = False):
'url': url,
'text': doc.text,
'title': doc.title,
'canonical_url': doc.canonical_link,
'potential_publish_date': doc.publish_date,
'top_image_url': doc.top_image,
'authors': doc.authors,
@@ -125,6 +125,7 @@ def extract(self, url, html_text: str, include_metadata: bool = False):
'url': url,
'text': g3_article.cleaned_text,
'title': g3_article.title,
'canonical_url': g3_article.canonical_link,
'potential_publish_date': g3_article.publish_date,
'top_image_url': g3_article.top_image.src if g3_article.top_image else None,
'authors': g3_article.authors,
@@ -142,6 +143,7 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):
'url': url,
'text': bp_doc.content,
'title': bp_doc.title,
'canonical_url': None,
'potential_publish_date': None,
'top_image_url': None,
'authors': None,
@@ -152,7 +154,7 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):
pass


# Trafilatura outputs images in teh raw text in Markdown format
# Trafilatura outputs images in the raw text in Markdown format
markdown_img_path_pattern = re.compile(r"!\[[^\]]*\]\((.*?)\)")


@@ -174,6 +176,7 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):
'url': url,
'text': text,
'title': results['title'],
'canonical_url': results['url'], # Warning: This will not work with Trafilatura v1.11.* and later
'potential_publish_date': dateparser.parse(results['date']),
'top_image_url': image_urls[0] if len(image_urls) > 0 else None,
'authors': results['author'].split(',') if results['author'] else None,
@@ -190,6 +193,7 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):
'url': url,
'text': strip_tags(doc.summary()), # remove any tags that readability leaves in place (links)
'title': doc.title(),
'canonical_url': None,
'potential_publish_date': None,
'top_image_url': None,
'authors': None,
@@ -201,7 +205,6 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):


class RawHtmlExtractor(AbstractExtractor):

REMOVE_LIST = {'[document]', 'noscript', 'header', 'html', 'meta', 'head', 'input', 'script', 'style'}

def __init__(self):
@@ -214,10 +217,18 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):
for t in text:
if t.parent.name not in self.REMOVE_LIST:
output += '{} '.format(t)

can_url = None
if can_link := soup.find("link", rel="canonical"):
can_url = can_link.get("href")
elif can_link := soup.find('meta', attrs={'property': 'og:url'}):
can_url = can_link.get("content")

self.content = {
'url': url,
'text': output,
'title': None,
'canonical_url': can_url,
'potential_publish_date': None,
'top_image_url': None,
'authors': None,
@@ -245,10 +256,17 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):
circular = fromstring(tostring(parsed))
content_string = tostring(cleaner.clean_html(circular))

can_url = None
if can_link := parsed.xpath("//link[@rel='canonical']/@href"):
can_url = can_link[0]
elif can_link := parsed.xpath("//meta[@property='og:url']/@content"):
can_url = can_link[0]

self.content = {
"url": url,
'text': content_string,
'title': None,
'canonical_url': can_url,
'potential_publish_date': None,
'top_image_url': None,
'authors': None,
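Outside the extractor classes, the two-step lookup added to RawHtmlExtractor above (prefer the <link rel="canonical"> hint, then fall back to the Open Graph og:url meta tag) can be sketched as a standalone helper. The function name is illustrative and not part of the library:

from typing import Optional
from bs4 import BeautifulSoup

def find_canonical_url(html_text: str) -> Optional[str]:
    # Prefer the <link rel="canonical"> hint, then fall back to og:url.
    soup = BeautifulSoup(html_text, "html.parser")
    if link := soup.find("link", rel="canonical"):
        return link.get("href")
    if meta := soup.find("meta", attrs={"property": "og:url"}):
        return meta.get("content")
    return None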
13 changes: 13 additions & 0 deletions mcmetadata/test/test_extract.py
@@ -145,6 +145,7 @@ def test_overrides(self):
overrides = dict(
text_content="This is some text",
article_title="This is a title",
canonical_url="https://www.example.com/",
language="pt",
publication_date=dt.date(2023, 1, 1)
)
@@ -160,6 +161,7 @@ def test_overrides(self):
assert results['article_title'] == overrides['article_title']
assert results['language'] == overrides['language']
assert results['publication_date'] == overrides['publication_date']
assert results['canonical_url'] == overrides['canonical_url']

def test_default_title(self):
# throws too short error if no default
@@ -181,6 +183,17 @@ def test_default_pub_date(self):
results = extract("https://www.example.com", html_text=html, defaults=defaults)
assert results['publication_date'] == defaults['publication_date']

def test_canonical_url(self):
canonical_url = 'https://www.example.com/sample-page'
# extract from page with <Link> as per: https://developers.google.com/search/docs/crawling-indexing/consolidate-duplicate-urls
html = f"<html><head><link rel='canonical' href='{canonical_url}' /></head><body><h1>Sample Content</h1><p>sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf</p><p>Copyright 2024</p></body></html>"
results = extract("https://www.example.com", html_text=html)
assert results['canonical_url'] == canonical_url
# extract from page with <Meta> as per: https://developers.facebook.com/docs/sharing/webmasters/getting-started/versioned-link/
html = f"<html><head><meta property='og:url' content='{canonical_url}' /></head><body><h1>Sample Content</h1><p>sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf</p><p>Copyright 2024</p></body></html>"
results = extract("https://www.example.com", html_text=html)
assert results['canonical_url'] == canonical_url


class TestStats(unittest.TestCase):

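To exercise only the new test locally, one reasonable driver (assuming pytest is installed alongside the package) is:

import pytest

# Select just the new canonical-URL test from the suite.
raise SystemExit(pytest.main(["mcmetadata/test/test_extract.py", "-k", "test_canonical_url"]))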
4 changes: 2 additions & 2 deletions mcmetadata/urls.py
@@ -43,7 +43,7 @@ def _is_suffix_only_parsed_url(parsed_url) -> bool:
def canonical_domain(raw_url: str) -> str:
"""
Return a useful canonical domain name given a url. In general this is the logical unique part of the domain.
However, to support news-based media research, this takes into account a list of exceptinos where this isn't the
However, to support news-based media research, this takes into account a list of exceptions where this isn't the
case (wordpress.com, substack.com, etc). This also handles Google AMP domains appropriately.
Created by James O'Toole with input from Emily Ndulue, Linas Valiukas, Anissa Piere, and Fernando Bermejo.
:param raw_url: the full URL to extract a unique domain from
@@ -295,7 +295,7 @@ def normalize_url(url: str) -> Optional[str]:
def is_homepage_url(raw_url: str) -> bool:
"""Returns true if URL is a homepage-like URL (ie. not an article)."""
url = raw_url.strip() # remove whitespace
if is_shortened_url(url): # if it is shortened than it should get a free pass becasue we have to resolve it later
if is_shortened_url(url): # if it is shortened than it should get a free pass because we have to resolve it later
return False
uri = furl(url)
for homepage_url_path_regex in HOMEPAGE_URL_PATH_REGEXES:
