flairNLP · MaxDall · Oct 3, 2024 · Oct 3, 2024 · Oct 3, 2024 · addie9800
diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
@@ -7,7 +7,6 @@
 from datetime import datetime
 from functools import total_ordering
 from typing import (
-    Any,
     Callable,
     ClassVar,
     Dict,
@@ -18,6 +17,7 @@
     Pattern,
     Type,
     Union,
+    cast,
 )
 
 import lxml.html
@@ -33,6 +33,7 @@
     LinkedDataMapping,
     TextSequence,
 )
+from fundus.utils.serialization import JSONVal
 
 logger = create_logger(__name__)
 
@@ -154,25 +155,34 @@ def extract_nodes(selector: XPath, node_type: Type[Node]) -> List[Node]:
 
 _ld_node_selector = XPath("//script[@type='application/ld+json']")
 _json_pattern = re.compile(r"(?P<json>{[\s\S]*}|\[\s*{[\s\S]*}\s*](?!\s*}))")
+_json_undefined = re.compile(r'(?P<key>"[^"]*?"):\s*undefined')
 
 
 def sanitize_json(text: str) -> Optional[str]:
     # capture only content enclosed as follows: {...} or [{...}]
     match = re.search(_json_pattern, text)
-    if match is not None and (sanitized := match.group("json")):
-        return sanitized
-    return None
-
-
-def extract_json_from_dom(root: lxml.html.HtmlElement, selector: XPath) -> Iterable[Dict[str, Any]]:
-    json_nodes = selector(root)
-    jsons = []
-    for node in json_nodes:
-        json_content = sanitize_json(node.text_content()) or ""
-        try:
-            jsons.append(json.loads(json_content))
-        except json.JSONDecodeError as error:
-            logger.debug(f"Encountered {error!r} during JSON parsing")
+    if match is None or not (sanitized := match.group("json")):
+        return None
+
+    # replace JavaScript `undefined` values, which are not valid JSON, with null
+    sanitized = re.sub(_json_undefined, r"\g<key>:null", sanitized)
+
+    return sanitized
+
+
+def parse_json(text: str) -> Optional[Dict[str, JSONVal]]:
+    """Return the JSON decoded from the sanitized text, or None if sanitizing or decoding fails."""
+    if not (json_content := sanitize_json(text)):
+        return None
+    try:
+        return cast(Dict[str, JSONVal], json.loads(json_content))
+    except json.JSONDecodeError as error:
+        logger.debug(f"Encountered {error!r} during JSON parsing")
+        return None
+
+
+def extract_json_from_dom(root: lxml.html.HtmlElement, selector: XPath) -> Iterable[Dict[str, JSONVal]]:
+    jsons = [parsed_json for node in selector(root) if (parsed_json := parse_json(node.text_content())) is not None]
     return more_itertools.collapse(jsons, base_type=dict)
 
 

diff --git a/src/fundus/publishers/au/west_australian.py b/src/fundus/publishers/au/west_australian.py
@@ -1,5 +1,4 @@
 import datetime
-import json
 import re
 from typing import List, Optional
 
@@ -11,7 +10,7 @@
     generic_author_parsing,
     generic_date_parsing,
     generic_topic_parsing,
-    sanitize_json,
+    parse_json,
 )
 
 
@@ -21,23 +20,12 @@ class V1(BaseParser):
             "string(//script[re:test(text(), 'window.PAGE_DATA')])",
             namespaces={"re": "http://exslt.org/regular-expressions"},
         )
-        _json_undefined_pattern = re.compile(r'":\s*undefined')
 
         @function(priority=1)
         def _parse_page_content(self):
-            page_data_content = self._page_data_selector(self.precomputed.doc)
-
-            if not page_data_content or not (sanitized := sanitize_json(page_data_content)):
-                return
-
-            json_string = re.sub(self._json_undefined_pattern, r'": null', sanitized)
-
-            try:
-                json_content = json.loads(json_string)
-            except json.JSONDecodeError:
-                return
-
-            self.precomputed.ld.add_ld(json_content, "windows.PAGE_DATA")
+            if not (parsed_json := parse_json(self._page_data_selector(self.precomputed.doc))):
+                raise ValueError("Couldn't parse page data")
+            self.precomputed.ld.add_ld(parsed_json, "windows.PAGE_DATA")
 
         @attribute
         def body(self) -> ArticleBody:

diff --git a/src/fundus/publishers/na/the_namibian.py b/src/fundus/publishers/na/the_namibian.py
@@ -5,14 +5,11 @@
 import lxml.html
 from lxml.etree import XPath
 
-from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute, function
-from fundus.parser.base_parser import Precomputed
+from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
 from fundus.parser.utility import (
     extract_article_body_with_selector,
     generic_author_parsing,
     generic_date_parsing,
-    get_ld_content,
-    get_meta_content,
 )