|
2 | 2 | from datetime import datetime
|
3 | 3 | from typing import List, Optional, Pattern
|
4 | 4 |
|
| 5 | +import lxml.html |
5 | 6 | from lxml.etree import XPath
|
6 | 7 |
|
7 |
| -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute |
| 8 | +from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute, function |
| 9 | +from fundus.parser.base_parser import Precomputed |
8 | 10 | from fundus.parser.utility import (
|
9 | 11 | extract_article_body_with_selector,
|
10 | 12 | generic_author_parsing,
|
11 | 13 | generic_date_parsing,
|
| 14 | + get_ld_content, |
| 15 | + get_meta_content, |
12 | 16 | )
|
13 | 17 |
|
14 | 18 |
|
@@ -44,5 +48,15 @@ def authors(self) -> List[str]:
|
44 | 48 |
|
class V1_1(V1):
    """Parser version whose article text is separated by ``<br>`` tags.

    Differs from ``V1`` only in the two selectors below and in ``body``,
    which rewrites line-break tags into paragraph starts before extraction.
    """

    VALID_UNTIL = datetime.today().date()

    # Both selectors look at <p> children of the 'entry-content' <div> and
    # keep only those that have a direct text node or a <strong> child.
    # position() is evaluated among all <p> children of that <div>: the first
    # one is the summary, every later one is a body paragraph.
    _paragraph_selector = XPath("//div[contains(@class, 'entry-content')]/p[(text() or strong) and position()>1]")
    _summary_selector = XPath("//div[contains(@class, 'entry-content')]/p[(text() or strong) and position()=1]")

    # One or more consecutive line-break tags. Accepts the equivalent
    # spellings <br>, <br/>, <br /> and upper-case variants, not just the
    # literal lowercase "<br>".
    _line_break_pattern = re.compile(r"(?:<br\s*/?>)+", re.IGNORECASE)

    @attribute
    def body(self) -> ArticleBody:
        """Extract the article body after turning line breaks into paragraphs.

        Every run of line-break tags in ``self.precomputed.html`` is replaced
        by a single opening ``<p>`` and the result is parsed again with
        ``lxml.html``, so the selectors run on the rewritten markup rather
        than on the original document.

        Returns:
            The result of ``extract_article_body_with_selector`` for the
            re-parsed document, using this class's summary and paragraph
            selectors.
        """
        # NOTE(review): relies on the HTML parser ending the currently open
        # <p> when it meets the inserted <p>, so each break-separated chunk
        # becomes its own paragraph element - confirm against a sample page.
        paragraphed_html = self._line_break_pattern.sub("<p>", self.precomputed.html)
        document = lxml.html.document_fromstring(paragraphed_html)
        return extract_article_body_with_selector(
            document,
            paragraph_selector=self._paragraph_selector,
            summary_selector=self._summary_selector,
        )
0 commit comments