From 58d480da187f719f6cef07d8888c27c176c4a358 Mon Sep 17 00:00:00 2001
From: Rahul Bhargava <rahulbot@gmail.com>
Date: Tue, 7 May 2024 11:18:09 -0400
Subject: [PATCH] favor_precision with Trafilatura and add first tests (all
 pass) #86

---
 mcmetadata/content.py           |  2 +-
 mcmetadata/test/test_content.py | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/mcmetadata/content.py b/mcmetadata/content.py
index 7ad5f53..e1333b7 100644
--- a/mcmetadata/content.py
+++ b/mcmetadata/content.py
@@ -160,7 +160,7 @@ class TrafilaturaExtractor(AbstractExtractor):
 
     def extract(self, url: str, html_text: str, include_metadata: bool = False):
         results = trafilatura.bare_extraction(html_text, only_with_metadata=include_metadata, url=url,
-                                              include_images=include_metadata)
+                                              include_images=include_metadata, favor_precision=True)
         image_urls = []
         if include_metadata:
             # pull out the images embedded in the markdown
diff --git a/mcmetadata/test/test_content.py b/mcmetadata/test/test_content.py
index 9fcc972..2326743 100644
--- a/mcmetadata/test/test_content.py
+++ b/mcmetadata/test/test_content.py
@@ -182,6 +182,21 @@ def test_too_short_content(self):
         except BadContentError:
             assert True
 
+    def test_no_related_links_1(self):
+        """Extracted text should stop (almost) right after the AFP copyright line: no trailing related links."""
+        url = 'https://web.archive.org/web/20240507150742/https://www.ibtimes.co.uk/falling-inflation-shifts-focus-when-ecb-could-cut-rates-1722106'
+        results = self._fetch_and_validate(url, content.METHOD_TRAFILATURA)
+        closing_str = 'Copyright AFP 2023. All rights reserved.'
+        copyright_index = results['text'].index(closing_str)  # raises ValueError if the notice is missing
+        trailing_content = results['text'][copyright_index + len(closing_str):]
+        assert len(trailing_content) < 20
+
+    def test_no_related_links_3(self):
+        """The "Les plus lus" (most read) sidebar heading must not appear in the extracted article text."""
+        archived_url = 'https://web.archive.org/web/20240507151403/https://www.bfmtv.com/cote-d-azur/nice-25-personnes-expulsees-lors-d-operations-anti-squat-menees-dans-le-quartier-des-liserons_AN-202312150639.html'
+        extracted = self._fetch_and_validate(archived_url, content.METHOD_TRAFILATURA)
+        assert "Les plus lus" not in extracted['text']
+
 
 if __name__ == "__main__":
     unittest.main()