5
5
6
6
from tqdm import tqdm
7
7
8
- from fundus import Crawler , PublisherCollection
8
+ from fundus import BaseCrawler , Crawler , PublisherCollection
9
9
from fundus .logging import basic_logger
10
10
from fundus .publishers .base_objects import PublisherEnum
11
11
from fundus .scraping .article import Article
12
+ from fundus .scraping .html import FundusSource
13
+ from fundus .scraping .scraper import Scraper
12
14
from tests .test_parser import attributes_required_to_cover
13
15
from tests .utility import HTMLTestFile , get_test_case_json , load_html_test_file_mapping
14
16
15
17
16
def get_test_article(enum: PublisherEnum, url: Optional[str] = None) -> Optional[Article]:
    """Fetch a single complete article for *enum* to use as a test case.

    Args:
        enum: The publisher to crawl.
        url: Optional explicit article URL. When given, only that URL is
            scraped instead of searching the publisher's feeds/sitemaps.

    Returns:
        The first article whose required attributes all parsed, or ``None``
        if no such article could be obtained.
    """
    crawler: BaseCrawler
    if url is None:
        # No explicit URL: let the regular crawler discover an article.
        crawler = Crawler(enum)
    else:
        # Explicit URL: build a minimal source -> scraper -> crawler pipeline
        # around just that one URL.
        source = FundusSource([url], publisher=enum.publisher_name)
        scraper = Scraper(source, parser=enum.parser)
        crawler = BaseCrawler(scraper)
    # error_handling="suppress" swallows scraping errors; only_complete=True
    # keeps only articles where every required attribute parsed successfully.
    return next(crawler.crawl(max_articles=1, error_handling="suppress", only_complete=True), None)
19
27
20
28
@@ -31,9 +39,17 @@ def get_test_article(enum: PublisherEnum) -> Optional[Article]:
31
39
"attributes" ,
32
40
metavar = "A" ,
33
41
nargs = "*" ,
34
- help = f"the attributes which should be used to create test cases. default: { ', ' .join (attributes_required_to_cover )} " ,
42
+ help = f"the attributes which should be used to create test cases. "
43
+ f"default: { ', ' .join (attributes_required_to_cover )} " ,
35
44
)
36
45
# Restrict the run to specific publishers (names as listed in PublisherCollection).
parser.add_argument("-p", dest="publishers", metavar="P", nargs="+", help="only consider given publishers")
# Optional explicit article URLs; each URL is paired positionally with the
# publisher given at the same index of -p (validated after parsing).
parser.add_argument(
    "-u",
    dest="urls",
    metavar="U",
    nargs="+",
    help="use given URL instead of searching for an article. if set the urls will be mapped to the order of -p",
)
37
53
group = parser .add_mutually_exclusive_group ()
38
54
group .add_argument (
39
55
"-o" ,
@@ -50,19 +66,25 @@ def get_test_article(enum: PublisherEnum) -> Optional[Article]:
50
66
51
67
args = parser.parse_args()

# -u only makes sense together with -p, and the two lists are zipped
# positionally, so they must have equal length.
if args.urls is not None:
    if args.publishers is None:
        parser.error("-u requires -p. you can only specify URLs when also specifying publishers.")
    if len(args.urls) != len(args.publishers):
        parser.error("-u and -p do not have the same argument length")

# sort args.attributes for consistency; an empty selection falls back to
# the default set of attributes every parser must cover
args.attributes = list(sorted(args.attributes)) or attributes_required_to_cover

basic_logger.setLevel(WARN)

# NOTE: indexing PublisherCollection raises on an unknown publisher name,
# surfacing typos immediately instead of silently skipping them.
publishers: List[PublisherEnum] = (
    list(PublisherCollection) if args.publishers is None else [PublisherCollection[pub] for pub in args.publishers]
)

# Pad with None so the main loop can uniformly zip urls with publishers.
urls = args.urls if args.urls is not None else [None] * len(publishers)
85
+
64
86
with tqdm (total = len (publishers )) as bar :
65
- for publisher in publishers :
87
+ for url , publisher in zip ( urls , publishers ) :
66
88
bar .set_description (desc = publisher .name , refresh = True )
67
89
68
90
# load json
@@ -73,7 +95,7 @@ def get_test_article(enum: PublisherEnum) -> Optional[Article]:
73
95
html_mapping = load_html_test_file_mapping (publisher ) if not args .overwrite else {}
74
96
75
97
if args .overwrite or not html_mapping .get (publisher .parser .latest_version ):
76
- if not (article := get_test_article (publisher )):
98
+ if not (article := get_test_article (publisher , url )):
77
99
basic_logger .warning (f"Couldn't get article for { publisher .name } . Skipping" )
78
100
continue
79
101
html = HTMLTestFile (
0 commit comments