Skip to content

Commit 7ece2f0

Browse files
authored
Merge pull request #355 from flairNLP/add-utility-to-get-test-articles-for-publisher
Add utility to retrieve test articles
2 parents 7d0abfa + e79b2b8 commit 7ece2f0

File tree

1 file changed

+20
-1
lines changed

1 file changed

+20
-1
lines changed

tests/utility.py

+20-1
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,38 @@
33
import json
44
from dataclasses import dataclass
55
from pathlib import Path
6-
from typing import Any, Callable, Dict, Generic, Optional, Type, TypeVar
6+
from typing import Any, Callable, Dict, Generic, List, Optional, Type, TypeVar
77

88
from typing_extensions import Self
99

1010
from fundus import PublisherCollection
1111
from fundus.parser import BaseParser
1212
from fundus.publishers.base_objects import PublisherEnum
13+
from fundus.scraping.article import Article
14+
from fundus.scraping.html import HTML, HTMLSource
1315
from scripts.generate_tables import supported_publishers_markdown_path
1416
from tests.resources.parser.test_data import __module_path__ as test_resource_path
1517

1618
_T = TypeVar("_T")
1719

1820

21+
def get_test_articles(publisher: PublisherEnum) -> List[Article]:
    """Build one ``Article`` per HTML test file registered for ``publisher``.

    The test files are obtained from ``load_html_test_file_mapping(publisher)``.
    Each file's content is parsed with the parser returned by calling
    ``publisher.parser`` with that file's crawl date, and the extraction is
    combined with an ``HTML`` object built from the same test file.

    Args:
        publisher: The publisher whose HTML test files should be converted.

    Returns:
        The articles, in the iteration order of the test file mapping's values.
    """

    def _to_article(test_file) -> Article:
        # Parse first, then wrap the raw page; the stored URL serves as both
        # the requested and the responded URL.
        parser = publisher.parser(test_file.crawl_date)
        extracted = parser.parse(test_file.content)
        page = HTML(
            content=test_file.content,
            crawl_date=test_file.crawl_date,
            requested_url=test_file.url,
            responded_url=test_file.url,
            source=HTMLSource(publisher.publisher_name),
        )
        return Article.from_extracted(extracted=extracted, html=page)

    return [_to_article(test_file) for test_file in load_html_test_file_mapping(publisher).values()]
36+
37+
1938
@dataclass
2039
class JSONFile(Generic[_T]):
2140
"""Generic file class representing a JSON file.

0 commit comments

Comments
 (0)