diff --git a/pyproject.toml b/pyproject.toml index 4e7196efa..75980ec5d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,3 +76,9 @@ target-version = ['py38'] [tool.isort] profile = "black" + +[tool.pytest.ini_options] +filterwarnings = [ + "error" +] + diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py index af9bf0426..d370e0a66 100644 --- a/src/fundus/parser/data.py +++ b/src/fundus/parser/data.py @@ -9,6 +9,7 @@ Iterable, Iterator, List, + Literal, Optional, Sequence, Tuple, @@ -22,7 +23,7 @@ import xmltodict from dict2xml import dict2xml from lxml.etree import XPath, tostring -from typing_extensions import Self, TypeAlias +from typing_extensions import Self, TypeAlias, deprecated from fundus.utils.serialization import replace_keys_in_nested_dict @@ -81,6 +82,7 @@ def add_ld(self, ld: Dict[str, Any], name: Optional[str] = None) -> None: self.__dict__[self.__UNKNOWN_TYPE__] = [] self.__dict__[self.__UNKNOWN_TYPE__].append(ld) + @deprecated("Use xpath_search() instead") def get_value_by_key_path(self, key_path: List[str], default: Any = None) -> Optional[Any]: """ Works like get() except this one assumes a path is given as list of keys (str). @@ -113,7 +115,15 @@ def to_unicode_characters(text: str) -> str: self.__xml = lxml.etree.fromstring(xml) return self.__xml - def xpath_search(self, query: XPath) -> List[Any]: + @overload + def xpath_search(self, query: Union[XPath, str], scalar: Literal[False] = False) -> List[Any]: + ... + + @overload + def xpath_search(self, query: Union[XPath, str], scalar: Literal[True] = True) -> Optional[Any]: + ... + + def xpath_search(self, query: Union[XPath, str], scalar: bool = False): """Search through LD using XPath expressions Internally, the content of the LinkedDataMapping is converted to XML and then @@ -142,12 +152,17 @@ def xpath_search(self, query: XPath) -> List[Any]: >> [value1] Args: - query: A XPath expression + query: A XPath expression either as string or XPath object. + scalar: If True, return an optional "scalar" value and raise a ValueError if there are more + than one result to return; if False, return a list of results. Defaults to False. Returns: - An ordered list of search results + An ordered list of search results or an optional "scalar" result """ + if isinstance(query, str): + query = XPath(query) + pattern = re.compile("|".join(map(re.escape, self.__xml_transformation_table__.values()))) def node2string(n: lxml.etree._Element) -> str: @@ -174,7 +189,17 @@ def to_original_characters(text: str) -> str: xml = f"" + node2string(node) + f"" results.update(replace_keys_in_nested_dict(xmltodict.parse(xml), to_original_characters)) - return list(results.values()) + values = list(results.values()) + + if scalar: + if not values: + return None + elif len(values) == 1: + return values.pop() + else: + raise ValueError(f"Got multiple values when expecting a single scalar value") + else: + return values def bf_search(self, key: str, depth: Optional[int] = None, default: Optional[_T] = None) -> Union[Any, _T]: """ diff --git a/src/fundus/publishers/de/freiepresse.py b/src/fundus/publishers/de/freiepresse.py index 61742036d..6bd091901 100644 --- a/src/fundus/publishers/de/freiepresse.py +++ b/src/fundus/publishers/de/freiepresse.py @@ -33,7 +33,7 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def authors(self) -> List[str]: - return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"])) + return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author")) @attribute def title(self) -> Optional[str]: diff --git a/src/fundus/publishers/de/krautreporter.py b/src/fundus/publishers/de/krautreporter.py index 038fa3848..8587cb4eb 100644 --- a/src/fundus/publishers/de/krautreporter.py +++ b/src/fundus/publishers/de/krautreporter.py @@ -43,8 +43,7 @@ def authors(self) -> List[str]: @attribute def publishing_date(self) -> Optional[datetime]: - key_path = ["NewsArticle", "datePublished"] - date_string = self.precomputed.ld.get_value_by_key_path(key_path) + date_string = self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True) return utility.generic_date_parsing(date_string) @attribute diff --git a/src/fundus/publishers/no/__init__.py b/src/fundus/publishers/no/__init__.py index 4caecec56..db25f5ecd 100644 --- a/src/fundus/publishers/no/__init__.py +++ b/src/fundus/publishers/no/__init__.py @@ -16,7 +16,7 @@ class NO(metaclass=PublisherGroup): sources=[ Sitemap( "https://www.vg.no/sitemap.xml", - sitemap_filter=inverse(regex_filter("vg\.no\/sitemaps/\d{4}\-\d{2}-articles.xml")), + sitemap_filter=inverse(regex_filter(r"vg\.no\/sitemaps/\d{4}\-\d{2}-articles.xml")), reverse=True, ), NewsMap("https://www.vg.no/sitemap/files/articles-48hrs.xml"), diff --git a/src/fundus/publishers/shared/euronews.py b/src/fundus/publishers/shared/euronews.py index b4657a192..94de56114 100644 --- a/src/fundus/publishers/shared/euronews.py +++ b/src/fundus/publishers/shared/euronews.py @@ -28,8 +28,7 @@ def body(self) -> ArticleBody: @attribute def authors(self) -> List[str]: - key_path = ["NewsArticle", "author", "name"] - author_string = self.precomputed.ld.get_value_by_key_path(key_path) + author_string = self.precomputed.ld.xpath_search("NewsArticle/author/name") return utility.generic_author_parsing(author_string) @attribute diff --git a/src/fundus/publishers/us/ap_news.py b/src/fundus/publishers/us/ap_news.py index 7ff24b252..fcd5db9cf 100644 --- a/src/fundus/publishers/us/ap_news.py +++ b/src/fundus/publishers/us/ap_news.py @@ -42,17 +42,17 @@ def authors(self) -> List[str]: author_string = re.sub(r"^By ", "", author_string) except IndexError: # Fallback to the generic author parsing from the linked data. - return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"])) + return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author")) return generic_author_parsing(author_string) @attribute def publishing_date(self) -> Optional[datetime.datetime]: - return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"])) + return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True)) @attribute def title(self) -> Optional[str]: - return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"]) + return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True) @attribute def topics(self) -> List[str]: diff --git a/src/fundus/publishers/us/cnbc.py b/src/fundus/publishers/us/cnbc.py index ad5aa59ce..03ecb5b0c 100644 --- a/src/fundus/publishers/us/cnbc.py +++ b/src/fundus/publishers/us/cnbc.py @@ -30,16 +30,15 @@ def body(self) -> ArticleBody: @attribute def authors(self) -> List[str]: - return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"])) + return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author")) @attribute def publishing_date(self) -> Optional[datetime.datetime]: - return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"])) + return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True)) @attribute def title(self) -> Optional[str]: - title: Optional[str] = self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"]) - return title + return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True) @attribute def topics(self) -> List[str]: diff --git a/src/fundus/publishers/us/occupy_democrats.py b/src/fundus/publishers/us/occupy_democrats.py index b04b5a63e..2179d0423 100644 --- a/src/fundus/publishers/us/occupy_democrats.py +++ b/src/fundus/publishers/us/occupy_democrats.py @@ -41,7 +41,7 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: - return generic_topic_parsing(self.precomputed.ld.get_value_by_key_path(["Article", "keywords"])) + return generic_topic_parsing(self.precomputed.ld.xpath_search("Article/keywords", scalar=True)) @attribute(validate=False) def description(self) -> Optional[str]: diff --git a/src/fundus/publishers/us/reuters.py b/src/fundus/publishers/us/reuters.py index 541efa379..dbe7dcfed 100644 --- a/src/fundus/publishers/us/reuters.py +++ b/src/fundus/publishers/us/reuters.py @@ -38,11 +38,11 @@ def authors(self) -> List[str]: @attribute def publishing_date(self) -> Optional[datetime]: - return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"])) + return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True)) @attribute def title(self) -> Optional[str]: - return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"]) + return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True) @attribute def topics(self) -> List[str]: diff --git a/src/fundus/publishers/us/the_gateway_pundit.py b/src/fundus/publishers/us/the_gateway_pundit.py index 0d86ce6f0..90817f7c8 100644 --- a/src/fundus/publishers/us/the_gateway_pundit.py +++ b/src/fundus/publishers/us/the_gateway_pundit.py @@ -29,7 +29,7 @@ def body(self) -> ArticleBody: @attribute def authors(self) -> List[str]: - return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["Article", "author"])) + return generic_author_parsing(self.precomputed.ld.xpath_search("Article/author")) @attribute def publishing_date(self) -> Optional[datetime]: diff --git a/src/fundus/publishers/us/the_intercept.py b/src/fundus/publishers/us/the_intercept.py index a4419ab98..518148c57 100644 --- a/src/fundus/publishers/us/the_intercept.py +++ b/src/fundus/publishers/us/the_intercept.py @@ -38,25 +38,22 @@ def body(self) -> ArticleBody: @attribute def authors(self) -> List[str]: - return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"])) + return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author")) @attribute def publishing_date(self) -> Optional[datetime]: - return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"])) + return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True)) @attribute def title(self) -> Optional[str]: - return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"]) + return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True) @attribute def topics(self) -> List[str]: # The Intercept specifies the article's topics, including other metadata, # inside the "keywords" linked data indicated by a "Subject: " prefix. # Example keywords: ["Day: Saturday", ..., "Subject: World", ...] - keywords: Optional[List[str]] = self.precomputed.ld.get_value_by_key_path(["NewsArticle", "keywords"]) - if keywords is None: - return [] - + keywords: List[str] = self.precomputed.ld.xpath_search("NewsArticle/keywords") return [keyword[9:] for keyword in keywords if keyword.startswith("Subject: ")] class V1_1(V1): diff --git a/src/fundus/publishers/us/the_new_yorker.py b/src/fundus/publishers/us/the_new_yorker.py index 43d2ebffc..49800f48c 100644 --- a/src/fundus/publishers/us/the_new_yorker.py +++ b/src/fundus/publishers/us/the_new_yorker.py @@ -32,23 +32,23 @@ def description(self) -> Optional[str]: @attribute(validate=False) def alternative_description(self) -> Optional[str]: - return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "description"]) + return self.precomputed.ld.xpath_search("NewsArticle/description", scalar=True) @attribute def authors(self) -> List[str]: - return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"])) + return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author")) @attribute def publishing_date(self) -> Optional[datetime]: - return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"])) + return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True)) @attribute def title(self) -> Optional[str]: - return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"]) + return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True) @attribute(validate=False) def alternative_title(self) -> Optional[str]: - return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "alternativeHeadline"]) + return self.precomputed.ld.xpath_search("NewsArticle/alternativeHeadline", scalar=True) @attribute def topics(self) -> List[str]: @@ -61,4 +61,4 @@ def topics(self) -> List[str]: @attribute(validate=False) def section(self) -> Optional[str]: - return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "articleSection"]) + return self.precomputed.ld.xpath_search("NewsArticle/articleSection", scalar=True)