Skip to content

Commit 395a0eb

Browse files
committed
Resolve Merge Conflict
2 parents 69c9984 + e31fa75 commit 395a0eb

File tree

6 files changed

+45
-5
lines changed

6 files changed

+45
-5
lines changed

docs/attribute_guidelines.md

+8
Original file line numberDiff line numberDiff line change
@@ -58,4 +58,12 @@ Those attributes will be validated with unit tests when used.
5858
<td><code>List[str]</code></td>
5959
<td><code>generic_topic_parsing</code></td>
6060
</tr>
61+
<tr>
62+
<td>free_access</td>
63+
<td>A boolean which is set to False if the article is restricted to users with a subscription. This usually indicates
64+
that the article cannot be crawled completely.
65+
<i>This attribute is implemented by default</i></td>
66+
<td><code>bool</code></td>
67+
<td><code></code></td>
68+
</tr>
6169
</table>

docs/how_to_add_a_publisher.md

+18
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,24 @@ Instead, we recommend referring to [this](https://devhints.io/xpath) documentati
469469
Make sure to examine other parsers and consult the [attribute guidelines](attribute_guidelines.md) for specifics on attribute implementation.
470470
We strongly encourage utilizing these utility functions, especially when parsing the `ArticleBody`.
471471

472+
### Checking the free_access attribute
473+
474+
In case your new publisher does not have a subscription model, you can go ahead and skip this step. If it does,
475+
please verify that there is a tag `isAccessibleForFree` within the `<script type="application/ld+json">` blocks in the
476+
source code of premium articles that is set to either `false` or `False`. It doesn't matter if the tag is missing in the
477+
freely accessible articles. If premium articles carry this tag, you can continue with the next step. If not, please override the
478+
existing function by adding the following snippet to your parser:
479+
480+
```python
481+
@attribute
482+
def free_access(self) -> bool:
483+
# Your personalized logic goes here
484+
pass
485+
```
486+
487+
Usually you can identify a premium article by an indicator within the URL or by using XPath or CSSSelector and selecting
488+
the element asking you to purchase a subscription to view the article.
489+
472490
### Finishing the Parser
473491

474492
Bringing all the above together, the Los Angeles Times now looks like this.

src/fundus/parser/base_parser.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ def _search_members(cls, obj_type: type) -> List[Tuple[str, Any]]:
172172
@classmethod
173173
def attributes(cls) -> AttributeCollection:
174174
attrs: List[Attribute] = [
175-
func for _, func in cls._search_members(Attribute) if func.__name__ not in ["__ld", "__meta"]
175+
func for _, func in cls._search_members(Attribute) if func.__name__ not in ["__ld", "__meta", "free_access"]
176176
]
177177
return AttributeCollection(*attrs)
178178

@@ -233,6 +233,15 @@ def __meta(self) -> Dict[str, Any]:
233233
def __ld(self) -> Optional[LinkedDataMapping]:
234234
return self.precomputed.ld
235235

236+
@attribute
237+
def free_access(self) -> bool:
238+
if (isAccessibleForFree := self.precomputed.ld.bf_search("isAccessibleForFree")) is None:
239+
return True
240+
elif not isAccessibleForFree or isAccessibleForFree == "false" or isAccessibleForFree == "False":
241+
return False
242+
else:
243+
return True
244+
236245

237246
class _ParserCache:
238247
def __init__(self, factory: Type[BaseParser]):

src/fundus/publishers/de/bild.py

+8
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import datetime
2+
import re
23
from typing import List, Optional
34

45
from lxml.etree import XPath
@@ -42,3 +43,10 @@ def title(self) -> Optional[str]:
4243
@attribute
4344
def topics(self) -> List[str]:
4445
return generic_topic_parsing(self.precomputed.meta.get("keywords"))
46+
47+
@attribute
48+
def free_access(self) -> bool:
49+
if (url := self.precomputed.meta.get("og:url")) is not None:
50+
return re.search(r"/bild-plus/", url) is None
51+
else:
52+
return True

src/fundus/publishers/de/braunschweiger_zeitung.py

-4
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,3 @@ def authors(self) -> List[str]:
5656
@attribute
5757
def publishing_date(self) -> Optional[datetime.datetime]:
5858
return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))
59-
60-
@attribute(validate=False)
61-
def free_access(self) -> bool:
62-
return self.precomputed.ld.bf_search("isAccessibleForFree") == "True"

src/fundus/scraping/scraper.py

+1
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ async def scrape(
3434
supported_attributes = set(
3535
more_itertools.flatten(collection.names for collection in self.parser.attribute_mapping.values())
3636
)
37+
supported_attributes.add("free_access")
3738
if missing_attributes := extraction_filter.required_attributes - supported_attributes:
3839
if len(missing_attributes) == 1:
3940
basic_logger.warning(

0 commit comments

Comments
 (0)