 import json
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Callable, Dict, Generic, Optional, Type, TypeVar
+from typing import Any, Callable, Dict, Generic, List, Optional, Type, TypeVar

 from typing_extensions import Self

 from fundus import PublisherCollection
 from fundus.parser import BaseParser
 from fundus.publishers.base_objects import PublisherEnum
+from fundus.scraping.article import Article
+from fundus.scraping.html import HTML, HTMLSource
 from scripts.generate_tables import supported_publishers_markdown_path
 from tests.resources.parser.test_data import __module_path__ as test_resource_path

 _T = TypeVar("_T")


+def get_test_articles(publisher: PublisherEnum) -> List[Article]:
+    articles = []
+    html_mapping = load_html_test_file_mapping(publisher)
+    for html_test_file in html_mapping.values():
+        extraction = publisher.parser(html_test_file.crawl_date).parse(html_test_file.content)
+        html = HTML(
+            content=html_test_file.content,
+            crawl_date=html_test_file.crawl_date,
+            requested_url=html_test_file.url,
+            responded_url=html_test_file.url,
+            source=HTMLSource(publisher.publisher_name),
+        )
+        article = Article.from_extracted(extracted=extraction, html=html)
+        articles.append(article)
+    return articles
+
+
 @dataclass
 class JSONFile(Generic[_T]):
     """Generic file class representing a JSON file.
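A minimal sketch of how the new get_test_articles helper could be driven, e.g. from a smoke test that replays every stored test HTML file; the import path (tests.utility), the iteration over PublisherCollection, and the title assertion are illustrative assumptions, not part of this commit:

    from fundus import PublisherCollection

    from tests.utility import get_test_articles  # assumed location of the helper above

    # Illustrative only: rebuild Article objects from the stored test HTML of
    # every supported publisher and sanity-check that a title was extracted.
    for publisher in PublisherCollection:
        for article in get_test_articles(publisher):
            assert article.title is not None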
|