diff --git a/docs/attribute_guidelines.md b/docs/attribute_guidelines.md
index 01b906cbc..7359cf625 100644
--- a/docs/attribute_guidelines.md
+++ b/docs/attribute_guidelines.md
@@ -58,4 +58,12 @@ Those attributes will be validated with unit tests when used.
List[str] |
generic_topic_parsing |
+
+ free_access |
+ A boolean which is set to `False` if the article is restricted to users with a subscription. This usually indicates
+ that the article cannot be crawled completely.
+ This attribute is implemented by default. |
+ bool |
+
|
+
diff --git a/docs/how_to_add_a_publisher.md b/docs/how_to_add_a_publisher.md
index 0c9cf14cb..ddece3b3d 100644
--- a/docs/how_to_add_a_publisher.md
+++ b/docs/how_to_add_a_publisher.md
@@ -16,6 +16,7 @@
* [Working with `lxml`](#working-with-lxml)
* [CSS-Select](#css-select)
* [XPath](#xpath)
+ * [Checking the free_access attribute](#checking-the-free_access-attribute)
* [Finishing the Parser](#finishing-the-parser)
* [6. Generate unit tests](#6-generate-unit-tests)
* [7. Opening a Pull Request](#7-opening-a-pull-request)
@@ -469,6 +470,23 @@ Instead, we recommend referring to [this](https://devhints.io/xpath) documentati
Make sure to examine other parsers and consult the [attribute guidelines](attribute_guidelines.md) for specifics on attribute implementation.
We strongly encourage utilizing these utility functions, especially when parsing the `ArticleBody`.
+### Checking the free_access attribute
+
+In case your new publisher does not have a subscription model, you can go ahead and skip this step.
+If it does, please verify that the `ld+json` elements in the HTML source of premium articles contain an `isAccessibleForFree` tag (refer to the section [Extracting attributes from Precomputed](#extracting-attributes-from-precomputed) for details) and that this tag is set to `false` or `False` for premium articles (and to `true` or `True` for freely accessible ones).
+It doesn't matter if the tag is missing in the freely accessible articles.
+If the tag is present and set as described, you can continue with the next step. If not, please override the default implementation by adding the following snippet to your parser:
+
+```python
+@attribute
+def free_access(self) -> bool:
+ # Your personalized logic goes here
+ ...
+```
+
+Usually you can identify a premium article by an indicator within the URL or by using XPath or CSSSelector and selecting
+the element asking the reader to purchase a subscription to view the article.
+
### Finishing the Parser
Bringing all the above together, the Los Angeles Times now looks like this.
diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md
index 78bcdfda8..8c6893897 100644
--- a/docs/supported_publishers.md
+++ b/docs/supported_publishers.md
@@ -91,9 +91,7 @@
|
-
- free_access
- |
+ |
diff --git a/src/fundus/parser/base_parser.py b/src/fundus/parser/base_parser.py
index 066991c83..3bc2eb99e 100644
--- a/src/fundus/parser/base_parser.py
+++ b/src/fundus/parser/base_parser.py
@@ -233,6 +233,15 @@ def __meta(self) -> Dict[str, Any]:
def __ld(self) -> Optional[LinkedDataMapping]:
return self.precomputed.ld
+ @attribute
+ def free_access(self) -> bool:
+ if (isAccessibleForFree := self.precomputed.ld.bf_search("isAccessibleForFree")) is None:
+ return True
+ elif not isAccessibleForFree or isAccessibleForFree == "false" or isAccessibleForFree == "False":
+ return False
+ else:
+ return True
+
class _ParserCache:
def __init__(self, factory: Type[BaseParser]):
diff --git a/src/fundus/publishers/de/bild.py b/src/fundus/publishers/de/bild.py
index 1fa1753b1..ad452c758 100644
--- a/src/fundus/publishers/de/bild.py
+++ b/src/fundus/publishers/de/bild.py
@@ -1,4 +1,5 @@
import datetime
+import re
from typing import List, Optional
from lxml.etree import XPath
@@ -42,3 +43,10 @@ def title(self) -> Optional[str]:
@attribute
def topics(self) -> List[str]:
return generic_topic_parsing(self.precomputed.meta.get("keywords"))
+
+ @attribute
+ def free_access(self) -> bool:
+ if (url := self.precomputed.meta.get("og:url")) is not None:
+ return re.search(r"/bild-plus/", url) is None
+ else:
+ return True
diff --git a/src/fundus/publishers/de/braunschweiger_zeitung.py b/src/fundus/publishers/de/braunschweiger_zeitung.py
index a80cc6aeb..00cf837ff 100644
--- a/src/fundus/publishers/de/braunschweiger_zeitung.py
+++ b/src/fundus/publishers/de/braunschweiger_zeitung.py
@@ -56,7 +56,3 @@ def authors(self) -> List[str]:
@attribute
def publishing_date(self) -> Optional[datetime.datetime]:
return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))
-
- @attribute(validate=False)
- def free_access(self) -> bool:
- return self.precomputed.ld.bf_search("isAccessibleForFree") == "True"
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 6b519b10a..c1c77c93d 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -47,10 +47,10 @@ def test_functions_iter(self, parser_with_function_test, parser_with_static_meth
assert parser_with_function_test.functions().names == ["test"]
def test_attributes_iter(self, parser_with_attr_title, parser_with_static_method):
- assert len(BaseParser.attributes()) == 0
- assert len(parser_with_static_method.attributes()) == 0
- assert len(parser_with_attr_title.attributes()) == 1
- assert parser_with_attr_title.attributes().names == ["title"]
+ assert len(BaseParser.attributes()) == 1
+ assert len(parser_with_static_method.attributes()) == 1
+ assert len(parser_with_attr_title.attributes()) == 2
+ assert parser_with_attr_title.attributes().names == ["free_access", "title"]
def test_supported_unsupported(self):
class ParserWithValidatedAndUnvalidated(BaseParser):
@@ -63,12 +63,12 @@ def unvalidated(self) -> str:
return "unsupported"
parser = ParserWithValidatedAndUnvalidated()
- assert len(parser.attributes()) == 2
+ assert len(parser.attributes()) == 3
assert (validated := parser.attributes().validated)
assert isinstance(validated, AttributeCollection)
assert (funcs := list(validated)) != [parser.validated]
- assert funcs[0].__func__ == parser.validated.__func__
+ assert funcs[1].__func__ == parser.validated.__func__
assert (unvalidated := parser.attributes().unvalidated)
assert isinstance(validated, AttributeCollection)
|