From 58d480da187f719f6cef07d8888c27c176c4a358 Mon Sep 17 00:00:00 2001
From: Rahul Bhargava
Date: Tue, 7 May 2024 11:18:09 -0400
Subject: [PATCH] favor_precision with Trafilatura and add first tests (all pass) #86

---
 mcmetadata/content.py           |  2 +-
 mcmetadata/test/test_content.py | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/mcmetadata/content.py b/mcmetadata/content.py
index 7ad5f53..e1333b7 100644
--- a/mcmetadata/content.py
+++ b/mcmetadata/content.py
@@ -160,7 +160,7 @@ class TrafilaturaExtractor(AbstractExtractor):
 
     def extract(self, url: str, html_text: str, include_metadata: bool = False):
         results = trafilatura.bare_extraction(html_text, only_with_metadata=include_metadata, url=url,
-                                              include_images=include_metadata)
+                                              include_images=include_metadata, favor_precision=True)
         image_urls = []
         if include_metadata:
             # pull out the images embedded in the markdown
diff --git a/mcmetadata/test/test_content.py b/mcmetadata/test/test_content.py
index 9fcc972..2326743 100644
--- a/mcmetadata/test/test_content.py
+++ b/mcmetadata/test/test_content.py
@@ -182,6 +182,21 @@ def test_too_short_content(self):
         except BadContentError:
             assert True
 
+    def test_no_related_links_1(self):
+        url = 'https://web.archive.org/web/20240507150742/https://www.ibtimes.co.uk/falling-inflation-shifts-focus-when-ecb-could-cut-rates-1722106'
+        results = self._fetch_and_validate(url, content.METHOD_TRAFILATURA)
+        closing_str = 'Copyright AFP 2023. All rights reserved.'
+        # trailing links that shouldn't be included
+        copyright_index = results['text'].index(closing_str)
+        trailing_content = results['text'][copyright_index + len(closing_str):]
+        assert len(trailing_content) < 20
+
+    def test_no_related_links_3(self):
+        url = 'https://web.archive.org/web/20240507151403/https://www.bfmtv.com/cote-d-azur/nice-25-personnes-expulsees-lors-d-operations-anti-squat-menees-dans-le-quartier-des-liserons_AN-202312150639.html'
+        results = self._fetch_and_validate(url, content.METHOD_TRAFILATURA)
+        most_read_header = "Les plus lus"  # visual sidebar content of most read articles
+        assert most_read_header not in results['text']
+
 
 if __name__ == "__main__":
     unittest.main()