From 930fc3df50d5616ca0debdec6b3be8fd275186a2 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 3 Oct 2024 18:05:44 +0200 Subject: [PATCH 1/8] deprecate `get_value_by_key_path` --- src/fundus/parser/data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py index af9bf042..8ba99173 100644 --- a/src/fundus/parser/data.py +++ b/src/fundus/parser/data.py @@ -22,7 +22,7 @@ import xmltodict from dict2xml import dict2xml from lxml.etree import XPath, tostring -from typing_extensions import Self, TypeAlias +from typing_extensions import Self, TypeAlias, deprecated from fundus.utils.serialization import replace_keys_in_nested_dict @@ -81,6 +81,7 @@ def add_ld(self, ld: Dict[str, Any], name: Optional[str] = None) -> None: self.__dict__[self.__UNKNOWN_TYPE__] = [] self.__dict__[self.__UNKNOWN_TYPE__].append(ld) + @deprecated("Use xpath_search() instead") def get_value_by_key_path(self, key_path: List[str], default: Any = None) -> Optional[Any]: """ Works like get() except this one assumes a path is given as list of keys (str). From 0446ba2c0b19621a5b559808063b2a2b68b36eeb Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 3 Oct 2024 18:06:51 +0200 Subject: [PATCH 2/8] throw pytest error with warnings --- pyproject.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 4e7196ef..75980ec5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,3 +76,9 @@ target-version = ['py38'] [tool.isort] profile = "black" + +[tool.pytest.ini_options] +filterwarnings = [ + "error" +] + From 495fce700693481ffbde2f974e30b6aa588d509b Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 3 Oct 2024 18:07:56 +0200 Subject: [PATCH 3/8] allow str as query and add `scalar` parameter to `xpath_search` --- src/fundus/parser/data.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py index 8ba99173..8bb92726 100644 --- a/src/fundus/parser/data.py +++ b/src/fundus/parser/data.py @@ -9,6 +9,7 @@ Iterable, Iterator, List, + Literal, Optional, Sequence, Tuple, @@ -114,7 +115,15 @@ def to_unicode_characters(text: str) -> str: self.__xml = lxml.etree.fromstring(xml) return self.__xml - def xpath_search(self, query: XPath) -> List[Any]: + @overload + def xpath_search(self, query: Union[XPath, str], scalar: Literal[False] = False) -> List[Any]: + ... + + @overload + def xpath_search(self, query: Union[XPath, str], scalar: Literal[True] = True) -> Any: + ... + + def xpath_search(self, query: Union[XPath, str], scalar: bool = False) -> Union[Any, List[Any]]: """Search through LD using XPath expressions Internally, the content of the LinkedDataMapping is converted to XML and then @@ -149,6 +158,9 @@ def xpath_search(self, query: XPath) -> List[Any]: An ordered list of search results """ + if isinstance(query, str): + query = XPath(query) + pattern = re.compile("|".join(map(re.escape, self.__xml_transformation_table__.values()))) def node2string(n: lxml.etree._Element) -> str: @@ -175,7 +187,15 @@ def to_original_characters(text: str) -> str: xml = f"" + node2string(node) + f"" results.update(replace_keys_in_nested_dict(xmltodict.parse(xml), to_original_characters)) - return list(results.values()) + values = list(results.values()) + + if scalar: + if len(values) != 1: + raise ValueError(f"Got multiple values when expecting a single scalar value") + else: + return values.pop() + else: + return values def bf_search(self, key: str, depth: Optional[int] = None, default: Optional[_T] = None) -> Union[Any, _T]: """ From bc039d4aa3dd61f88fda0381a1b83bad155262d5 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 3 Oct 2024 18:08:51 +0200 Subject: [PATCH 4/8] replace occasions of `get_value_by_key_path` with `xpath_search` --- src/fundus/publishers/de/freiepresse.py | 2 +- src/fundus/publishers/de/krautreporter.py | 3 +-- src/fundus/publishers/shared/euronews.py | 3 +-- src/fundus/publishers/us/ap_news.py | 6 +++--- src/fundus/publishers/us/cnbc.py | 7 +++---- src/fundus/publishers/us/occupy_democrats.py | 2 +- src/fundus/publishers/us/reuters.py | 4 ++-- src/fundus/publishers/us/the_gateway_pundit.py | 2 +- src/fundus/publishers/us/the_intercept.py | 8 ++++---- src/fundus/publishers/us/the_new_yorker.py | 12 ++++++------ 10 files changed, 23 insertions(+), 26 deletions(-) diff --git a/src/fundus/publishers/de/freiepresse.py b/src/fundus/publishers/de/freiepresse.py index 61742036..6bd09190 100644 --- a/src/fundus/publishers/de/freiepresse.py +++ b/src/fundus/publishers/de/freiepresse.py @@ -33,7 +33,7 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def authors(self) -> List[str]: - return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"])) + return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author")) @attribute def title(self) -> Optional[str]: diff --git a/src/fundus/publishers/de/krautreporter.py b/src/fundus/publishers/de/krautreporter.py index 038fa384..8587cb4e 100644 --- a/src/fundus/publishers/de/krautreporter.py +++ b/src/fundus/publishers/de/krautreporter.py @@ -43,8 +43,7 @@ def authors(self) -> List[str]: @attribute def publishing_date(self) -> Optional[datetime]: - key_path = ["NewsArticle", "datePublished"] - date_string = self.precomputed.ld.get_value_by_key_path(key_path) + date_string = self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True) return utility.generic_date_parsing(date_string) @attribute diff --git a/src/fundus/publishers/shared/euronews.py b/src/fundus/publishers/shared/euronews.py index b4657a19..ddf64ec3 100644 --- a/src/fundus/publishers/shared/euronews.py +++ b/src/fundus/publishers/shared/euronews.py @@ -28,8 +28,7 @@ def body(self) -> ArticleBody: @attribute def authors(self) -> List[str]: - key_path = ["NewsArticle", "author", "name"] - author_string = self.precomputed.ld.get_value_by_key_path(key_path) + author_string = self.precomputed.ld.xpath_search("NewsArticle/author/name", scalar=True) return utility.generic_author_parsing(author_string) @attribute diff --git a/src/fundus/publishers/us/ap_news.py b/src/fundus/publishers/us/ap_news.py index 7ff24b25..fcd5db9c 100644 --- a/src/fundus/publishers/us/ap_news.py +++ b/src/fundus/publishers/us/ap_news.py @@ -42,17 +42,17 @@ def authors(self) -> List[str]: author_string = re.sub(r"^By ", "", author_string) except IndexError: # Fallback to the generic author parsing from the linked data. - return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"])) + return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author")) return generic_author_parsing(author_string) @attribute def publishing_date(self) -> Optional[datetime.datetime]: - return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"])) + return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True)) @attribute def title(self) -> Optional[str]: - return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"]) + return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True) @attribute def topics(self) -> List[str]: diff --git a/src/fundus/publishers/us/cnbc.py b/src/fundus/publishers/us/cnbc.py index ad5aa59c..03ecb5b0 100644 --- a/src/fundus/publishers/us/cnbc.py +++ b/src/fundus/publishers/us/cnbc.py @@ -30,16 +30,15 @@ def body(self) -> ArticleBody: @attribute def authors(self) -> List[str]: - return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"])) + return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author")) @attribute def publishing_date(self) -> Optional[datetime.datetime]: - return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"])) + return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True)) @attribute def title(self) -> Optional[str]: - title: Optional[str] = self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"]) - return title + return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True) @attribute def topics(self) -> List[str]: diff --git a/src/fundus/publishers/us/occupy_democrats.py b/src/fundus/publishers/us/occupy_democrats.py index b04b5a63..2179d042 100644 --- a/src/fundus/publishers/us/occupy_democrats.py +++ b/src/fundus/publishers/us/occupy_democrats.py @@ -41,7 +41,7 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: - return generic_topic_parsing(self.precomputed.ld.get_value_by_key_path(["Article", "keywords"])) + return generic_topic_parsing(self.precomputed.ld.xpath_search("Article/keywords", scalar=True)) @attribute(validate=False) def description(self) -> Optional[str]: diff --git a/src/fundus/publishers/us/reuters.py b/src/fundus/publishers/us/reuters.py index 541efa37..dbe7dcfe 100644 --- a/src/fundus/publishers/us/reuters.py +++ b/src/fundus/publishers/us/reuters.py @@ -38,11 +38,11 @@ def authors(self) -> List[str]: @attribute def publishing_date(self) -> Optional[datetime]: - return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"])) + return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True)) @attribute def title(self) -> Optional[str]: - return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"]) + return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True) @attribute def topics(self) -> List[str]: diff --git a/src/fundus/publishers/us/the_gateway_pundit.py b/src/fundus/publishers/us/the_gateway_pundit.py index 0d86ce6f..90817f7c 100644 --- a/src/fundus/publishers/us/the_gateway_pundit.py +++ b/src/fundus/publishers/us/the_gateway_pundit.py @@ -29,7 +29,7 @@ def body(self) -> ArticleBody: @attribute def authors(self) -> List[str]: - return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["Article", "author"])) + return generic_author_parsing(self.precomputed.ld.xpath_search("Article/author")) @attribute def publishing_date(self) -> Optional[datetime]: diff --git a/src/fundus/publishers/us/the_intercept.py b/src/fundus/publishers/us/the_intercept.py index a4419ab9..0783deef 100644 --- a/src/fundus/publishers/us/the_intercept.py +++ b/src/fundus/publishers/us/the_intercept.py @@ -38,22 +38,22 @@ def body(self) -> ArticleBody: @attribute def authors(self) -> List[str]: - return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"])) + return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author")) @attribute def publishing_date(self) -> Optional[datetime]: - return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"])) + return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True)) @attribute def title(self) -> Optional[str]: - return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"]) + return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True) @attribute def topics(self) -> List[str]: # The Intercept specifies the article's topics, including other metadata, # inside the "keywords" linked data indicated by a "Subject: " prefix. # Example keywords: ["Day: Saturday", ..., "Subject: World", ...] - keywords: Optional[List[str]] = self.precomputed.ld.get_value_by_key_path(["NewsArticle", "keywords"]) + keywords: List[str] = self.precomputed.ld.xpath_search("NewsArticle/keywords") if keywords is None: return [] diff --git a/src/fundus/publishers/us/the_new_yorker.py b/src/fundus/publishers/us/the_new_yorker.py index 43d2ebff..49800f48 100644 --- a/src/fundus/publishers/us/the_new_yorker.py +++ b/src/fundus/publishers/us/the_new_yorker.py @@ -32,23 +32,23 @@ def description(self) -> Optional[str]: @attribute(validate=False) def alternative_description(self) -> Optional[str]: - return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "description"]) + return self.precomputed.ld.xpath_search("NewsArticle/description", scalar=True) @attribute def authors(self) -> List[str]: - return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"])) + return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author")) @attribute def publishing_date(self) -> Optional[datetime]: - return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"])) + return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True)) @attribute def title(self) -> Optional[str]: - return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"]) + return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True) @attribute(validate=False) def alternative_title(self) -> Optional[str]: - return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "alternativeHeadline"]) + return self.precomputed.ld.xpath_search("NewsArticle/alternativeHeadline", scalar=True) @attribute def topics(self) -> List[str]: @@ -61,4 +61,4 @@ def topics(self) -> List[str]: @attribute(validate=False) def section(self) -> Optional[str]: - return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "articleSection"]) + return self.precomputed.ld.xpath_search("NewsArticle/articleSection", scalar=True) From f8c21834ad2a5e94063ebc90d20ac363f2d71c00 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 3 Oct 2024 18:23:54 +0200 Subject: [PATCH 5/8] fix typing --- src/fundus/parser/data.py | 10 ++++++---- src/fundus/publishers/us/the_intercept.py | 3 --- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py index 8bb92726..65e24e6c 100644 --- a/src/fundus/parser/data.py +++ b/src/fundus/parser/data.py @@ -120,7 +120,7 @@ def xpath_search(self, query: Union[XPath, str], scalar: Literal[False] = False) ... @overload - def xpath_search(self, query: Union[XPath, str], scalar: Literal[True] = True) -> Any: + def xpath_search(self, query: Union[XPath, str], scalar: Literal[True] = True) -> Optional[Any]: ... def xpath_search(self, query: Union[XPath, str], scalar: bool = False) -> Union[Any, List[Any]]: @@ -190,10 +190,12 @@ def to_original_characters(text: str) -> str: values = list(results.values()) if scalar: - if len(values) != 1: - raise ValueError(f"Got multiple values when expecting a single scalar value") - else: + if not values: + return None + elif len(values) == 1: return values.pop() + else: + raise ValueError(f"Got multiple values when expecting a single scalar value") else: return values diff --git a/src/fundus/publishers/us/the_intercept.py b/src/fundus/publishers/us/the_intercept.py index 0783deef..518148c5 100644 --- a/src/fundus/publishers/us/the_intercept.py +++ b/src/fundus/publishers/us/the_intercept.py @@ -54,9 +54,6 @@ def topics(self) -> List[str]: # inside the "keywords" linked data indicated by a "Subject: " prefix. # Example keywords: ["Day: Saturday", ..., "Subject: World", ...] keywords: List[str] = self.precomputed.ld.xpath_search("NewsArticle/keywords") - if keywords is None: - return [] - return [keyword[9:] for keyword in keywords if keyword.startswith("Subject: ")] class V1_1(V1): From cf93360593de1baa674ee2ff4285382467fd06ca Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 3 Oct 2024 18:32:25 +0200 Subject: [PATCH 6/8] fix escape sequence for `VerdensGang` sitemap filter --- src/fundus/publishers/no/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fundus/publishers/no/__init__.py b/src/fundus/publishers/no/__init__.py index 4caecec5..db25f5ec 100644 --- a/src/fundus/publishers/no/__init__.py +++ b/src/fundus/publishers/no/__init__.py @@ -16,7 +16,7 @@ class NO(metaclass=PublisherGroup): sources=[ Sitemap( "https://www.vg.no/sitemap.xml", - sitemap_filter=inverse(regex_filter("vg\.no\/sitemaps/\d{4}\-\d{2}-articles.xml")), + sitemap_filter=inverse(regex_filter(r"vg\.no\/sitemaps/\d{4}\-\d{2}-articles.xml")), reverse=True, ), NewsMap("https://www.vg.no/sitemap/files/articles-48hrs.xml"), From a5318628b44f833bbcf273e55f1d42212ff2320f Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Tue, 8 Oct 2024 20:38:36 +0200 Subject: [PATCH 7/8] fix documentation and type hint for `xpath_search` --- src/fundus/parser/data.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py index 65e24e6c..d370e0a6 100644 --- a/src/fundus/parser/data.py +++ b/src/fundus/parser/data.py @@ -123,7 +123,7 @@ def xpath_search(self, query: Union[XPath, str], scalar: Literal[False] = False) def xpath_search(self, query: Union[XPath, str], scalar: Literal[True] = True) -> Optional[Any]: ... - def xpath_search(self, query: Union[XPath, str], scalar: bool = False) -> Union[Any, List[Any]]: + def xpath_search(self, query: Union[XPath, str], scalar: bool = False): """Search through LD using XPath expressions Internally, the content of the LinkedDataMapping is converted to XML and then @@ -152,10 +152,12 @@ def xpath_search(self, query: Union[XPath, str], scalar: bool = False) -> Union[ >> [value1] Args: - query: A XPath expression + query: A XPath expression either as string or XPath object. + scalar: If True, return an optional "scalar" value and raise a ValueError if there are more + than one result to return; if False, return a list of results. Defaults to False. Returns: - An ordered list of search results + An ordered list of search results or an optional "scalar" result """ if isinstance(query, str): From da7fe932f644561dba22c7c5cc6c60e3f542d49a Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Tue, 8 Oct 2024 20:38:50 +0200 Subject: [PATCH 8/8] remove `scalar=True` --- src/fundus/publishers/shared/euronews.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fundus/publishers/shared/euronews.py b/src/fundus/publishers/shared/euronews.py index ddf64ec3..94de5611 100644 --- a/src/fundus/publishers/shared/euronews.py +++ b/src/fundus/publishers/shared/euronews.py @@ -28,7 +28,7 @@ def body(self) -> ArticleBody: @attribute def authors(self) -> List[str]: - author_string = self.precomputed.ld.xpath_search("NewsArticle/author/name", scalar=True) + author_string = self.precomputed.ld.xpath_search("NewsArticle/author/name") return utility.generic_author_parsing(author_string) @attribute