Merge pull request #92 from m453h/feature-canonical-url-values
Enable extraction of canonical link information
m453h authored Oct 11, 2024
2 parents 58594e0 + fca7b81 commit b69a588
Showing 4 changed files with 43 additions and 5 deletions.
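
For context, a minimal sketch of how a caller could read the new field once this change is installed. The URL and HTML below are illustrative placeholders, and the page body must be long enough to pass the extractor's minimum-content check:

from mcmetadata import extract

# Hypothetical article page that declares its canonical location in <head>.
html = (
    "<html><head><link rel='canonical' href='https://example.com/story'/></head>"
    "<body><h1>Sample</h1><p>... long enough article text ...</p></body></html>"
)

results = extract("https://example.com/story?utm_source=feed", html_text=html)
# Expected to be 'https://example.com/story' when one of the content extractors
# reports the hint; otherwise the field may be empty or None.
print(results["canonical_url"])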
7 changes: 7 additions & 0 deletions mcmetadata/__init__.py
@@ -119,6 +119,12 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: O
language_duration = time.monotonic() - t1
stats_accumulator['language'] += language_duration

# canonical url
if 'canonical_url' in overrides:
canonical_url = overrides['canonical_url']
else:
canonical_url = article.get('canonical_url')

total_duration = time.monotonic() - t0
stats_accumulator['total'] += total_duration

@@ -128,6 +134,7 @@ def extract(url: str, html_text: Optional[str] = None, include_other_metadata: O
normalized_url=normalized_url,
unique_url_hash=urls.unique_url_hash(final_url),
canonical_domain=canonical_domain,
canonical_url=canonical_url,
publication_date=pub_date,
language=full_language[:2] if full_language else full_language, # keep this as a two-letter code, like "en"
full_language=full_language, # could be a full region language code, like "en-AU"
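As the hunk above shows, an explicit value in overrides takes precedence over whatever the content extractor reports, and article.get('canonical_url') is used otherwise. A small sketch of that behaviour, with illustrative values and the same length caveat as above:

from mcmetadata import extract

html = (
    "<html><head><link rel='canonical' href='https://example.com/story'/></head>"
    "<body><p>... long enough article text ...</p></body></html>"
)

# The override wins even though the page itself declares a different canonical link.
results = extract("https://example.com/story", html_text=html,
                  overrides={"canonical_url": "https://www.example.com/"})
assert results["canonical_url"] == "https://www.example.com/"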
24 changes: 21 additions & 3 deletions mcmetadata/content.py
@@ -29,7 +29,6 @@
safe_attrs_only=False)
readability.readability.html_cleaner = everything_cleaner


METHOD_NEWSPAPER_3k = 'newspaper3k'
METHOD_GOOSE_3 = 'goose3'
METHOD_BEAUTIFUL_SOUP_4 = 'beautifulsoup4'
@@ -109,6 +108,7 @@ def extract(self, url, html_text: str, include_metadata: bool = False):
'url': url,
'text': doc.text,
'title': doc.title,
'canonical_url': doc.canonical_link,
'potential_publish_date': doc.publish_date,
'top_image_url': doc.top_image,
'authors': doc.authors,
@@ -125,6 +125,7 @@ def extract(self, url, html_text: str, include_metadata: bool = False):
'url': url,
'text': g3_article.cleaned_text,
'title': g3_article.title,
'canonical_url': g3_article.canonical_link,
'potential_publish_date': g3_article.publish_date,
'top_image_url': g3_article.top_image.src if g3_article.top_image else None,
'authors': g3_article.authors,
@@ -142,6 +143,7 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):
'url': url,
'text': bp_doc.content,
'title': bp_doc.title,
'canonical_url': None,
'potential_publish_date': None,
'top_image_url': None,
'authors': None,
@@ -152,7 +154,7 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):
pass


# Trafilatura outputs images in teh raw text in Markdown format
# Trafilatura outputs images in the raw text in Markdown format
markdown_img_path_pattern = re.compile(r"!\[[^\]]*\]\((.*?)\)")


@@ -174,6 +176,7 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):
'url': url,
'text': text,
'title': results['title'],
'canonical_url': results['url'], # Warning: This will not work with Trafilatura v1.11.* and later
'potential_publish_date': dateparser.parse(results['date']),
'top_image_url': image_urls[0] if len(image_urls) > 0 else None,
'authors': results['author'].split(',') if results['author'] else None,
@@ -190,6 +193,7 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):
'url': url,
'text': strip_tags(doc.summary()), # remove any tags that readability leaves in place (links)
'title': doc.title(),
'canonical_url': None,
'potential_publish_date': None,
'top_image_url': None,
'authors': None,
@@ -201,7 +205,6 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):


class RawHtmlExtractor(AbstractExtractor):

REMOVE_LIST = {'[document]', 'noscript', 'header', 'html', 'meta', 'head', 'input', 'script', 'style'}

def __init__(self):
@@ -214,10 +217,18 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):
for t in text:
if t.parent.name not in self.REMOVE_LIST:
output += '{} '.format(t)

can_url = None
if can_link := soup.find("link", rel="canonical"):
can_url = can_link.get("href")
elif can_link := soup.find('meta', attrs={'property': 'og:url'}):
can_url = can_link.get("content")

self.content = {
'url': url,
'text': output,
'title': None,
'canonical_url': can_url,
'potential_publish_date': None,
'top_image_url': None,
'authors': None,
@@ -245,10 +256,17 @@ def extract(self, url: str, html_text: str, include_metadata: bool = False):
circular = fromstring(tostring(parsed))
content_string = tostring(cleaner.clean_html(circular))

can_url = None
if can_link := parsed.xpath("//link[@rel='canonical']/@href"):
can_url = can_link[0]
elif can_link := parsed.xpath("//meta[@property='og:url']/@content"):
can_url = can_link[0]

self.content = {
"url": url,
'text': content_string,
'title': None,
'canonical_url': can_url,
'potential_publish_date': None,
'top_image_url': None,
'authors': None,
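Outside the extractor classes, the two-step lookup added to RawHtmlExtractor above (prefer the <link rel="canonical"> hint, then fall back to the Open Graph og:url meta tag) can be sketched as a standalone helper. The function name is illustrative and not part of the library:

from typing import Optional
from bs4 import BeautifulSoup

def find_canonical_url(html_text: str) -> Optional[str]:
    # Prefer the <link rel="canonical"> hint, then fall back to og:url.
    soup = BeautifulSoup(html_text, "html.parser")
    if link := soup.find("link", rel="canonical"):
        return link.get("href")
    if meta := soup.find("meta", attrs={"property": "og:url"}):
        return meta.get("content")
    return None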
13 changes: 13 additions & 0 deletions mcmetadata/test/test_extract.py
@@ -145,6 +145,7 @@ def test_overrides(self):
overrides = dict(
text_content="This is some text",
article_title="This is a title",
canonical_url="https://www.example.com/",
language="pt",
publication_date=dt.date(2023, 1, 1)
)
@@ -160,6 +161,7 @@ def test_overrides(self):
assert results['article_title'] == overrides['article_title']
assert results['language'] == overrides['language']
assert results['publication_date'] == overrides['publication_date']
assert results['canonical_url'] == overrides['canonical_url']

def test_default_title(self):
# throws too short error if no default
@@ -181,6 +183,17 @@ def test_default_pub_date(self):
results = extract("https://www.example.com", html_text=html, defaults=defaults)
assert results['publication_date'] == defaults['publication_date']

def test_canonical_url(self):
canonical_url = 'https://www.example.com/sample-page'
# extract from page with <Link> as per: https://developers.google.com/search/docs/crawling-indexing/consolidate-duplicate-urls
html = f"<html><head><link rel='canonical' href='{canonical_url}' /></head><body><h1>Sample Content</h1><p>sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf</p><p>Copyright 2024</p></body></html>"
results = extract("https://www.example.com", html_text=html)
assert results['canonical_url'] == canonical_url
# extract from page with <Meta> as per: https://developers.facebook.com/docs/sharing/webmasters/getting-started/versioned-link/
html = f"<html><head><meta property='og:url' content='{canonical_url}' /></head><body><h1>Sample Content</h1><p>sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf sdf asdf asfewaf lkjl;kjf ;iasjfoijfésadsf</p><p>Copyright 2024</p></body></html>"
results = extract("https://www.example.com", html_text=html)
assert results['canonical_url'] == canonical_url


class TestStats(unittest.TestCase):

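To exercise only the new test locally, one reasonable driver (assuming pytest is installed alongside the package) is:

import pytest

# Select just the new canonical-URL test from the suite.
raise SystemExit(pytest.main(["mcmetadata/test/test_extract.py", "-k", "test_canonical_url"]))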
4 changes: 2 additions & 2 deletions mcmetadata/urls.py
@@ -43,7 +43,7 @@ def _is_suffix_only_parsed_url(parsed_url) -> bool:
def canonical_domain(raw_url: str) -> str:
"""
Return a useful canonical domain name given a url. In general this is the logical unique part of the domain.
However, to support news-based media research, this takes into account a list of exceptinos where this isn't the
However, to support news-based media research, this takes into account a list of exceptions where this isn't the
case (wordpress.com, substack.com, etc). This also handles Google AMP domains appropriately.
Created by James O'Toole with input from Emily Ndulue, Linas Valiukas, Anissa Piere, and Fernando Bermejo.
:param raw_url: the full URL to extract a unique domain from
@@ -295,7 +295,7 @@ def normalize_url(url: str) -> Optional[str]:
def is_homepage_url(raw_url: str) -> bool:
"""Returns true if URL is a homepage-like URL (ie. not an article)."""
url = raw_url.strip() # remove whitespace
if is_shortened_url(url): # if it is shortened than it should get a free pass becasue we have to resolve it later
if is_shortened_url(url): # if it is shortened than it should get a free pass because we have to resolve it later
return False
uri = furl(url)
for homepage_url_path_regex in HOMEPAGE_URL_PATH_REGEXES:
