From 930fc3df50d5616ca0debdec6b3be8fd275186a2 Mon Sep 17 00:00:00 2001
From: Max Dallabetta <max.dallabetta@googlemail.com>
Date: Thu, 3 Oct 2024 18:05:44 +0200
Subject: [PATCH 1/8] deprecate `get_value_by_key_path`

---
 src/fundus/parser/data.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py
index af9bf042..8ba99173 100644
--- a/src/fundus/parser/data.py
+++ b/src/fundus/parser/data.py
@@ -22,7 +22,7 @@
 import xmltodict
 from dict2xml import dict2xml
 from lxml.etree import XPath, tostring
-from typing_extensions import Self, TypeAlias
+from typing_extensions import Self, TypeAlias, deprecated
 
 from fundus.utils.serialization import replace_keys_in_nested_dict
 
@@ -81,6 +81,7 @@ def add_ld(self, ld: Dict[str, Any], name: Optional[str] = None) -> None:
                 self.__dict__[self.__UNKNOWN_TYPE__] = []
             self.__dict__[self.__UNKNOWN_TYPE__].append(ld)
 
+    @deprecated("Use xpath_search() instead")
     def get_value_by_key_path(self, key_path: List[str], default: Any = None) -> Optional[Any]:
         """
         Works like get() except this one assumes a path is given as list of keys (str).

From 0446ba2c0b19621a5b559808063b2a2b68b36eeb Mon Sep 17 00:00:00 2001
From: Max Dallabetta <max.dallabetta@googlemail.com>
Date: Thu, 3 Oct 2024 18:06:51 +0200
Subject: [PATCH 2/8] make pytest treat warnings as errors

---
 pyproject.toml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 4e7196ef..75980ec5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -76,3 +76,9 @@ target-version = ['py38']
 
 [tool.isort]
 profile = "black"
+
+[tool.pytest.ini_options]
+filterwarnings = [
+    "error"
+]
+

From 495fce700693481ffbde2f974e30b6aa588d509b Mon Sep 17 00:00:00 2001
From: Max Dallabetta <max.dallabetta@googlemail.com>
Date: Thu, 3 Oct 2024 18:07:56 +0200
Subject: [PATCH 3/8] allow str as query and add `scalar` parameter to
 `xpath_search`

---
 src/fundus/parser/data.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py
index 8ba99173..8bb92726 100644
--- a/src/fundus/parser/data.py
+++ b/src/fundus/parser/data.py
@@ -9,6 +9,7 @@
     Iterable,
     Iterator,
     List,
+    Literal,
     Optional,
     Sequence,
     Tuple,
@@ -114,7 +115,15 @@ def to_unicode_characters(text: str) -> str:
             self.__xml = lxml.etree.fromstring(xml)
         return self.__xml
 
-    def xpath_search(self, query: XPath) -> List[Any]:
+    @overload
+    def xpath_search(self, query: Union[XPath, str], scalar: Literal[False] = False) -> List[Any]:
+        ...
+
+    @overload
+    def xpath_search(self, query: Union[XPath, str], scalar: Literal[True] = True) -> Any:
+        ...
+
+    def xpath_search(self, query: Union[XPath, str], scalar: bool = False) -> Union[Any, List[Any]]:
         """Search through LD using XPath expressions
 
         Internally, the content of the LinkedDataMapping is converted to XML and then
@@ -149,6 +158,9 @@ def xpath_search(self, query: XPath) -> List[Any]:
             An ordered list of search results
         """
 
+        if isinstance(query, str):
+            query = XPath(query)
+
         pattern = re.compile("|".join(map(re.escape, self.__xml_transformation_table__.values())))
 
         def node2string(n: lxml.etree._Element) -> str:
@@ -175,7 +187,15 @@ def to_original_characters(text: str) -> str:
             xml = f"<result{i}>" + node2string(node) + f"</result{i}>"
             results.update(replace_keys_in_nested_dict(xmltodict.parse(xml), to_original_characters))
 
-        return list(results.values())
+        values = list(results.values())
+
+        if scalar:
+            if len(values) != 1:
+                raise ValueError(f"Got multiple values when expecting a single scalar value")
+            else:
+                return values.pop()
+        else:
+            return values
 
     def bf_search(self, key: str, depth: Optional[int] = None, default: Optional[_T] = None) -> Union[Any, _T]:
         """

From bc039d4aa3dd61f88fda0381a1b83bad155262d5 Mon Sep 17 00:00:00 2001
From: Max Dallabetta <max.dallabetta@googlemail.com>
Date: Thu, 3 Oct 2024 18:08:51 +0200
Subject: [PATCH 4/8] replace occurrences of `get_value_by_key_path` with
 `xpath_search`

---
 src/fundus/publishers/de/freiepresse.py        |  2 +-
 src/fundus/publishers/de/krautreporter.py      |  3 +--
 src/fundus/publishers/shared/euronews.py       |  3 +--
 src/fundus/publishers/us/ap_news.py            |  6 +++---
 src/fundus/publishers/us/cnbc.py               |  7 +++----
 src/fundus/publishers/us/occupy_democrats.py   |  2 +-
 src/fundus/publishers/us/reuters.py            |  4 ++--
 src/fundus/publishers/us/the_gateway_pundit.py |  2 +-
 src/fundus/publishers/us/the_intercept.py      |  8 ++++----
 src/fundus/publishers/us/the_new_yorker.py     | 12 ++++++------
 10 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/src/fundus/publishers/de/freiepresse.py b/src/fundus/publishers/de/freiepresse.py
index 61742036..6bd09190 100644
--- a/src/fundus/publishers/de/freiepresse.py
+++ b/src/fundus/publishers/de/freiepresse.py
@@ -33,7 +33,7 @@ def publishing_date(self) -> Optional[datetime.datetime]:
 
         @attribute
         def authors(self) -> List[str]:
-            return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"]))
+            return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author"))
 
         @attribute
         def title(self) -> Optional[str]:
diff --git a/src/fundus/publishers/de/krautreporter.py b/src/fundus/publishers/de/krautreporter.py
index 038fa384..8587cb4e 100644
--- a/src/fundus/publishers/de/krautreporter.py
+++ b/src/fundus/publishers/de/krautreporter.py
@@ -43,8 +43,7 @@ def authors(self) -> List[str]:
 
         @attribute
         def publishing_date(self) -> Optional[datetime]:
-            key_path = ["NewsArticle", "datePublished"]
-            date_string = self.precomputed.ld.get_value_by_key_path(key_path)
+            date_string = self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True)
             return utility.generic_date_parsing(date_string)
 
         @attribute
diff --git a/src/fundus/publishers/shared/euronews.py b/src/fundus/publishers/shared/euronews.py
index b4657a19..ddf64ec3 100644
--- a/src/fundus/publishers/shared/euronews.py
+++ b/src/fundus/publishers/shared/euronews.py
@@ -28,8 +28,7 @@ def body(self) -> ArticleBody:
 
         @attribute
         def authors(self) -> List[str]:
-            key_path = ["NewsArticle", "author", "name"]
-            author_string = self.precomputed.ld.get_value_by_key_path(key_path)
+            author_string = self.precomputed.ld.xpath_search("NewsArticle/author/name", scalar=True)
             return utility.generic_author_parsing(author_string)
 
         @attribute
diff --git a/src/fundus/publishers/us/ap_news.py b/src/fundus/publishers/us/ap_news.py
index 7ff24b25..fcd5db9c 100644
--- a/src/fundus/publishers/us/ap_news.py
+++ b/src/fundus/publishers/us/ap_news.py
@@ -42,17 +42,17 @@ def authors(self) -> List[str]:
                 author_string = re.sub(r"^By ", "", author_string)
             except IndexError:
                 # Fallback to the generic author parsing from the linked data.
-                return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"]))
+                return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author"))
 
             return generic_author_parsing(author_string)
 
         @attribute
         def publishing_date(self) -> Optional[datetime.datetime]:
-            return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"]))
+            return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True))
 
         @attribute
         def title(self) -> Optional[str]:
-            return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"])
+            return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True)
 
         @attribute
         def topics(self) -> List[str]:
diff --git a/src/fundus/publishers/us/cnbc.py b/src/fundus/publishers/us/cnbc.py
index ad5aa59c..03ecb5b0 100644
--- a/src/fundus/publishers/us/cnbc.py
+++ b/src/fundus/publishers/us/cnbc.py
@@ -30,16 +30,15 @@ def body(self) -> ArticleBody:
 
         @attribute
         def authors(self) -> List[str]:
-            return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"]))
+            return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author"))
 
         @attribute
         def publishing_date(self) -> Optional[datetime.datetime]:
-            return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"]))
+            return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True))
 
         @attribute
         def title(self) -> Optional[str]:
-            title: Optional[str] = self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"])
-            return title
+            return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True)
 
         @attribute
         def topics(self) -> List[str]:
diff --git a/src/fundus/publishers/us/occupy_democrats.py b/src/fundus/publishers/us/occupy_democrats.py
index b04b5a63..2179d042 100644
--- a/src/fundus/publishers/us/occupy_democrats.py
+++ b/src/fundus/publishers/us/occupy_democrats.py
@@ -41,7 +41,7 @@ def title(self) -> Optional[str]:
 
         @attribute
         def topics(self) -> List[str]:
-            return generic_topic_parsing(self.precomputed.ld.get_value_by_key_path(["Article", "keywords"]))
+            return generic_topic_parsing(self.precomputed.ld.xpath_search("Article/keywords", scalar=True))
 
         @attribute(validate=False)
         def description(self) -> Optional[str]:
diff --git a/src/fundus/publishers/us/reuters.py b/src/fundus/publishers/us/reuters.py
index 541efa37..dbe7dcfe 100644
--- a/src/fundus/publishers/us/reuters.py
+++ b/src/fundus/publishers/us/reuters.py
@@ -38,11 +38,11 @@ def authors(self) -> List[str]:
 
         @attribute
         def publishing_date(self) -> Optional[datetime]:
-            return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"]))
+            return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True))
 
         @attribute
         def title(self) -> Optional[str]:
-            return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"])
+            return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True)
 
         @attribute
         def topics(self) -> List[str]:
diff --git a/src/fundus/publishers/us/the_gateway_pundit.py b/src/fundus/publishers/us/the_gateway_pundit.py
index 0d86ce6f..90817f7c 100644
--- a/src/fundus/publishers/us/the_gateway_pundit.py
+++ b/src/fundus/publishers/us/the_gateway_pundit.py
@@ -29,7 +29,7 @@ def body(self) -> ArticleBody:
 
         @attribute
         def authors(self) -> List[str]:
-            return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["Article", "author"]))
+            return generic_author_parsing(self.precomputed.ld.xpath_search("Article/author"))
 
         @attribute
         def publishing_date(self) -> Optional[datetime]:
diff --git a/src/fundus/publishers/us/the_intercept.py b/src/fundus/publishers/us/the_intercept.py
index a4419ab9..0783deef 100644
--- a/src/fundus/publishers/us/the_intercept.py
+++ b/src/fundus/publishers/us/the_intercept.py
@@ -38,22 +38,22 @@ def body(self) -> ArticleBody:
 
         @attribute
         def authors(self) -> List[str]:
-            return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"]))
+            return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author"))
 
         @attribute
         def publishing_date(self) -> Optional[datetime]:
-            return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"]))
+            return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True))
 
         @attribute
         def title(self) -> Optional[str]:
-            return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"])
+            return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True)
 
         @attribute
         def topics(self) -> List[str]:
             # The Intercept specifies the article's topics, including other metadata,
             # inside the "keywords" linked data indicated by a "Subject: " prefix.
             # Example keywords: ["Day: Saturday", ..., "Subject: World", ...]
-            keywords: Optional[List[str]] = self.precomputed.ld.get_value_by_key_path(["NewsArticle", "keywords"])
+            keywords: List[str] = self.precomputed.ld.xpath_search("NewsArticle/keywords")
             if keywords is None:
                 return []
 
diff --git a/src/fundus/publishers/us/the_new_yorker.py b/src/fundus/publishers/us/the_new_yorker.py
index 43d2ebff..49800f48 100644
--- a/src/fundus/publishers/us/the_new_yorker.py
+++ b/src/fundus/publishers/us/the_new_yorker.py
@@ -32,23 +32,23 @@ def description(self) -> Optional[str]:
 
         @attribute(validate=False)
         def alternative_description(self) -> Optional[str]:
-            return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "description"])
+            return self.precomputed.ld.xpath_search("NewsArticle/description", scalar=True)
 
         @attribute
         def authors(self) -> List[str]:
-            return generic_author_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "author"]))
+            return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author"))
 
         @attribute
         def publishing_date(self) -> Optional[datetime]:
-            return generic_date_parsing(self.precomputed.ld.get_value_by_key_path(["NewsArticle", "datePublished"]))
+            return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True))
 
         @attribute
         def title(self) -> Optional[str]:
-            return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "headline"])
+            return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True)
 
         @attribute(validate=False)
         def alternative_title(self) -> Optional[str]:
-            return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "alternativeHeadline"])
+            return self.precomputed.ld.xpath_search("NewsArticle/alternativeHeadline", scalar=True)
 
         @attribute
         def topics(self) -> List[str]:
@@ -61,4 +61,4 @@ def topics(self) -> List[str]:
 
         @attribute(validate=False)
         def section(self) -> Optional[str]:
-            return self.precomputed.ld.get_value_by_key_path(["NewsArticle", "articleSection"])
+            return self.precomputed.ld.xpath_search("NewsArticle/articleSection", scalar=True)

From f8c21834ad2a5e94063ebc90d20ac363f2d71c00 Mon Sep 17 00:00:00 2001
From: Max Dallabetta <max.dallabetta@googlemail.com>
Date: Thu, 3 Oct 2024 18:23:54 +0200
Subject: [PATCH 5/8] fix typing

---
 src/fundus/parser/data.py                 | 10 ++++++----
 src/fundus/publishers/us/the_intercept.py |  3 ---
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py
index 8bb92726..65e24e6c 100644
--- a/src/fundus/parser/data.py
+++ b/src/fundus/parser/data.py
@@ -120,7 +120,7 @@ def xpath_search(self, query: Union[XPath, str], scalar: Literal[False] = False)
         ...
 
     @overload
-    def xpath_search(self, query: Union[XPath, str], scalar: Literal[True] = True) -> Any:
+    def xpath_search(self, query: Union[XPath, str], scalar: Literal[True] = True) -> Optional[Any]:
         ...
 
     def xpath_search(self, query: Union[XPath, str], scalar: bool = False) -> Union[Any, List[Any]]:
@@ -190,10 +190,12 @@ def to_original_characters(text: str) -> str:
         values = list(results.values())
 
         if scalar:
-            if len(values) != 1:
-                raise ValueError(f"Got multiple values when expecting a single scalar value")
-            else:
+            if not values:
+                return None
+            elif len(values) == 1:
                 return values.pop()
+            else:
+                raise ValueError(f"Got multiple values when expecting a single scalar value")
         else:
             return values
 
diff --git a/src/fundus/publishers/us/the_intercept.py b/src/fundus/publishers/us/the_intercept.py
index 0783deef..518148c5 100644
--- a/src/fundus/publishers/us/the_intercept.py
+++ b/src/fundus/publishers/us/the_intercept.py
@@ -54,9 +54,6 @@ def topics(self) -> List[str]:
             # inside the "keywords" linked data indicated by a "Subject: " prefix.
             # Example keywords: ["Day: Saturday", ..., "Subject: World", ...]
             keywords: List[str] = self.precomputed.ld.xpath_search("NewsArticle/keywords")
-            if keywords is None:
-                return []
-
             return [keyword[9:] for keyword in keywords if keyword.startswith("Subject: ")]
 
     class V1_1(V1):

From cf93360593de1baa674ee2ff4285382467fd06ca Mon Sep 17 00:00:00 2001
From: Max Dallabetta <max.dallabetta@googlemail.com>
Date: Thu, 3 Oct 2024 18:32:25 +0200
Subject: [PATCH 6/8] fix escape sequence for `VerdensGang` sitemap filter

---
 src/fundus/publishers/no/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fundus/publishers/no/__init__.py b/src/fundus/publishers/no/__init__.py
index 4caecec5..db25f5ec 100644
--- a/src/fundus/publishers/no/__init__.py
+++ b/src/fundus/publishers/no/__init__.py
@@ -16,7 +16,7 @@ class NO(metaclass=PublisherGroup):
         sources=[
             Sitemap(
                 "https://www.vg.no/sitemap.xml",
-                sitemap_filter=inverse(regex_filter("vg\.no\/sitemaps/\d{4}\-\d{2}-articles.xml")),
+                sitemap_filter=inverse(regex_filter(r"vg\.no\/sitemaps/\d{4}\-\d{2}-articles.xml")),
                 reverse=True,
             ),
             NewsMap("https://www.vg.no/sitemap/files/articles-48hrs.xml"),

From a5318628b44f833bbcf273e55f1d42212ff2320f Mon Sep 17 00:00:00 2001
From: Max Dallabetta <max.dallabetta@googlemail.com>
Date: Tue, 8 Oct 2024 20:38:36 +0200
Subject: [PATCH 7/8] fix documentation and type hint for `xpath_search`

---
 src/fundus/parser/data.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py
index 65e24e6c..d370e0a6 100644
--- a/src/fundus/parser/data.py
+++ b/src/fundus/parser/data.py
@@ -123,7 +123,7 @@ def xpath_search(self, query: Union[XPath, str], scalar: Literal[False] = False)
     def xpath_search(self, query: Union[XPath, str], scalar: Literal[True] = True) -> Optional[Any]:
         ...
 
-    def xpath_search(self, query: Union[XPath, str], scalar: bool = False) -> Union[Any, List[Any]]:
+    def xpath_search(self, query: Union[XPath, str], scalar: bool = False):
         """Search through LD using XPath expressions
 
         Internally, the content of the LinkedDataMapping is converted to XML and then
@@ -152,10 +152,12 @@ def xpath_search(self, query: Union[XPath, str], scalar: bool = False) -> Union[
         >> [value1]
 
         Args:
-            query: A XPath expression
+            query: An XPath expression, either as a string or as an XPath object.
+            scalar: If True, return an optional "scalar" value and raise a ValueError if there is more
+                than one result to return; if False, return a list of results. Defaults to False.
 
         Returns:
-            An ordered list of search results
+            An ordered list of search results or an optional "scalar" result
         """
 
         if isinstance(query, str):

From da7fe932f644561dba22c7c5cc6c60e3f542d49a Mon Sep 17 00:00:00 2001
From: Max Dallabetta <max.dallabetta@googlemail.com>
Date: Tue, 8 Oct 2024 20:38:50 +0200
Subject: [PATCH 8/8] remove `scalar=True`

---
 src/fundus/publishers/shared/euronews.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fundus/publishers/shared/euronews.py b/src/fundus/publishers/shared/euronews.py
index ddf64ec3..94de5611 100644
--- a/src/fundus/publishers/shared/euronews.py
+++ b/src/fundus/publishers/shared/euronews.py
@@ -28,7 +28,7 @@ def body(self) -> ArticleBody:
 
         @attribute
         def authors(self) -> List[str]:
-            author_string = self.precomputed.ld.xpath_search("NewsArticle/author/name", scalar=True)
+            author_string = self.precomputed.ld.xpath_search("NewsArticle/author/name")
             return utility.generic_author_parsing(author_string)
 
         @attribute