 import json
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Callable, Dict, Generic, Optional, Type, TypeVar
+from typing import Any, Callable, Dict, Generic, List, Optional, Type, TypeVar

 from typing_extensions import Self

 from fundus import PublisherCollection
 from fundus.parser import BaseParser
 from fundus.publishers.base_objects import PublisherEnum
+from fundus.scraping.article import Article
+from fundus.scraping.html import HTML, HTMLSource
 from scripts.generate_tables import supported_publishers_markdown_path
 from tests.resources.parser.test_data import __module_path__ as test_resource_path

 _T = TypeVar("_T")


+def get_test_articles(publisher: PublisherEnum) -> List[Article]:
+    articles = []
+    html_mapping = load_html_test_file_mapping(publisher)
+    for html_test_file in html_mapping.values():
+        extraction = publisher.parser(html_test_file.crawl_date).parse(html_test_file.content)
+        html = HTML(
+            content=html_test_file.content,
+            crawl_date=html_test_file.crawl_date,
+            requested_url=html_test_file.url,
+            responded_url=html_test_file.url,
+            source=HTMLSource(publisher.publisher_name),
+        )
+        article = Article.from_extracted(extracted=extraction, html=html)
+        articles.append(article)
+    return articles
+
+
 @dataclass
 class JSONFile(Generic[_T]):
     """Generic file class representing a JSON file.
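A minimal sketch of how the new get_test_articles helper could be driven, e.g. from a smoke test that replays every stored test HTML file; the import path (tests.utility), the iteration over PublisherCollection, and the title assertion are illustrative assumptions, not part of this commit:

    from fundus import PublisherCollection

    from tests.utility import get_test_articles  # assumed location of the helper above

    # Illustrative only: rebuild Article objects from the stored test HTML of
    # every supported publisher and sanity-check that a title was extracted.
    for publisher in PublisherCollection:
        for article in get_test_articles(publisher):
            assert article.title is not None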
|