|
| 1 | +import datetime |
| 2 | +import re |
| 3 | +from typing import List, Optional |
| 4 | + |
| 5 | +from lxml.cssselect import CSSSelector |
| 6 | +from lxml.etree import XPath |
| 7 | + |
| 8 | +from fundus.parser import ( |
| 9 | + ArticleBody, |
| 10 | + BaseParser, |
| 11 | + Image, |
| 12 | + ParserProxy, |
| 13 | + attribute, |
| 14 | + function, |
| 15 | +) |
| 16 | +from fundus.parser.utility import ( |
| 17 | + extract_article_body_with_selector, |
| 18 | + generic_author_parsing, |
| 19 | + generic_date_parsing, |
| 20 | + generic_topic_parsing, |
| 21 | + image_extraction, |
| 22 | + transform_breaks_to_paragraphs, |
| 23 | +) |
| 24 | + |
| 25 | + |
class NikkanGeadaiParser(ParserProxy):
    """Parser proxy bundling the versioned parsers for this publisher."""

    class V1(BaseParser):
        """First parser version: body, title, authors, date, topics and images."""

        # Paragraphs are the non-empty ``p.br-wrap`` children of the single
        # ``p.full-text`` node; those children carry the class that
        # ``_transform_br_element`` passes to ``transform_breaks_to_paragraphs``.
        _paragraph_selector = XPath(
            "//div[@class='article-wrap'] //p[@class='full-text'] /p[@class='br-wrap' and text()]"
        )

        # Container holding the raw article text.
        _full_text_selector = CSSSelector("div.article-wrap p.full-text")

        # Text nodes of the keyword list entries.
        _topic_selector = XPath("//main //div[contains(@class, 'm-keyword-list')] /ul /li //text()")

        # Upper boundary handed to ``image_extraction``; built once at class
        # creation instead of on every ``images`` call.
        _image_boundary_selector = CSSSelector("div.article-wrap")

        # Image credit pattern, based on https://regex101.com/r/uY6o2z/1
        # The copyright marker must be matched literally: a bare ``(C)`` in a
        # regex is a capture group that matches any capital "C". Both the ASCII
        # and the full-width forms are accepted (U+FF08 "(", U+FF23 "C",
        # U+FF09 ")"); they are written as escapes so the pattern survives
        # Unicode normalisation of this file.
        _image_author_pattern = re.compile(r"[(\uFF08][C\uFF23][)\uFF09](?P<credits>.*?)\s*$")

        @function(priority=0)
        def _transform_br_element(self):
            """Rewrite the line breaks of the full-text node as ``br-wrap`` elements.

            Does nothing when the document has no full-text node.
            NOTE(review): presumably ``priority=0`` makes this run before the
            attributes are evaluated -- confirm against the fundus parser docs.

            Raises:
                ValueError: if more than one full-text node is found.
            """
            nodes = self._full_text_selector(self.precomputed.doc)
            if not nodes:
                return
            if len(nodes) != 1:
                raise ValueError("Expected exactly one node")
            transform_breaks_to_paragraphs(nodes[0], __class__="br-wrap")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Return the article body built from the ``br-wrap`` paragraphs."""
            return extract_article_body_with_selector(
                self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``headline`` entry of the linked data."""
            return self.precomputed.ld.bf_search("headline")

        @attribute
        def authors(self) -> List[str]:
            """Return the authors parsed from the ``author`` entry of the linked data."""
            return generic_author_parsing(self.precomputed.ld.bf_search("author"))

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Return the date parsed from the ``datePublished`` entry of the linked data."""
            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))

        @attribute
        def topics(self) -> List[str]:
            """Return the topics from the keyword list, or ``[]`` when there is none."""
            if topics := self._topic_selector(self.precomputed.doc):
                return generic_topic_parsing(topics)
            return []

        @attribute
        def images(self) -> List[Image]:
            """Return the images found below the ``div.article-wrap`` boundary."""
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                upper_boundary_selector=self._image_boundary_selector,
                author_selector=self._image_author_pattern,
            )
0 commit comments