diff --git a/docs/attribute_guidelines.md b/docs/attribute_guidelines.md index 01b906cbc..7359cf625 100644 --- a/docs/attribute_guidelines.md +++ b/docs/attribute_guidelines.md @@ -58,4 +58,12 @@ Those attributes will be validated with unit tests when used. List[str] generic_topic_parsing + + free_access + A boolean which is set to False if the article is restricted to users with a subscription. This usually indicates + that the article cannot be crawled completely. + This attribute is implemented by default + bool + + diff --git a/docs/how_to_add_a_publisher.md b/docs/how_to_add_a_publisher.md index 0c9cf14cb..ddece3b3d 100644 --- a/docs/how_to_add_a_publisher.md +++ b/docs/how_to_add_a_publisher.md @@ -16,6 +16,7 @@ * [Working with `lxml`](#working-with-lxml) * [CSS-Select](#css-select) * [XPath](#xpath) + * [Checking the free_access attribute](#checking-the-free_access-attribute) * [Finishing the Parser](#finishing-the-parser) * [6. Generate unit tests](#6-generate-unit-tests) * [7. Opening a Pull Request](#7-opening-a-pull-request) @@ -469,6 +470,23 @@ Instead, we recommend referring to [this](https://devhints.io/xpath) documentati Make sure to examine other parsers and consult the [attribute guidelines](attribute_guidelines.md) for specifics on attribute implementation. We strongly encourage utilizing these utility functions, especially when parsing the `ArticleBody`. +### Checking the free_access attribute + +In case your new publisher does not have a subscription model, you can go ahead and skip this step. +If it does, please verify that there is a tag `isAccessibleForFree` within the HTML's `ld+json` elements (refer to the section [Extracting attributes from Precomputed](#extracting-attributes-from-precomputed) for details) in the source code of premium articles and that it is set to `false` or `False` there (`true`/`True` for freely accessible articles). +It doesn't matter if the tag is missing in the freely accessible articles.
+If this is the case, you can continue with the next step. If not, please override the existing function by adding the following snippet to your parser: + +```python +@attribute +def free_access(self) -> bool: + # Your personalized logic goes here + ... +``` + +Usually you can identify a premium article by an indicator within the URL or by using XPath or CSSSelector and selecting +the element asking the reader to purchase a subscription to view the article. + ### Finishing the Parser Bringing all the above together, the Los Angeles Times now looks like this. diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md index 78bcdfda8..8c6893897 100644 --- a/docs/supported_publishers.md +++ b/docs/supported_publishers.md @@ -91,9 +91,7 @@   - - free_access - +   diff --git a/src/fundus/parser/base_parser.py b/src/fundus/parser/base_parser.py index 066991c83..3bc2eb99e 100644 --- a/src/fundus/parser/base_parser.py +++ b/src/fundus/parser/base_parser.py @@ -233,6 +233,15 @@ def __meta(self) -> Dict[str, Any]: def __ld(self) -> Optional[LinkedDataMapping]: return self.precomputed.ld + @attribute + def free_access(self) -> bool: + if (isAccessibleForFree := self.precomputed.ld.bf_search("isAccessibleForFree")) is None: + return True + elif not isAccessibleForFree or isAccessibleForFree == "false" or isAccessibleForFree == "False": + return False + else: + return True + class _ParserCache: def __init__(self, factory: Type[BaseParser]): diff --git a/src/fundus/publishers/de/bild.py b/src/fundus/publishers/de/bild.py index 1fa1753b1..ad452c758 100644 --- a/src/fundus/publishers/de/bild.py +++ b/src/fundus/publishers/de/bild.py @@ -1,4 +1,5 @@ import datetime +import re from typing import List, Optional from lxml.etree import XPath @@ -42,3 +43,10 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("keywords")) + + @attribute + def free_access(self) -> bool: + if (url := 
self.precomputed.meta.get("og:url")) is not None: + return re.search(r"/bild-plus/", url) is None + else: + return True diff --git a/src/fundus/publishers/de/braunschweiger_zeitung.py b/src/fundus/publishers/de/braunschweiger_zeitung.py index a80cc6aeb..00cf837ff 100644 --- a/src/fundus/publishers/de/braunschweiger_zeitung.py +++ b/src/fundus/publishers/de/braunschweiger_zeitung.py @@ -56,7 +56,3 @@ def authors(self) -> List[str]: @attribute def publishing_date(self) -> Optional[datetime.datetime]: return generic_date_parsing(self.precomputed.ld.bf_search("datePublished")) - - @attribute(validate=False) - def free_access(self) -> bool: - return self.precomputed.ld.bf_search("isAccessibleForFree") == "True" diff --git a/tests/test_parser.py b/tests/test_parser.py index 6b519b10a..c1c77c93d 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -47,10 +47,10 @@ def test_functions_iter(self, parser_with_function_test, parser_with_static_meth assert parser_with_function_test.functions().names == ["test"] def test_attributes_iter(self, parser_with_attr_title, parser_with_static_method): - assert len(BaseParser.attributes()) == 0 - assert len(parser_with_static_method.attributes()) == 0 - assert len(parser_with_attr_title.attributes()) == 1 - assert parser_with_attr_title.attributes().names == ["title"] + assert len(BaseParser.attributes()) == 1 + assert len(parser_with_static_method.attributes()) == 1 + assert len(parser_with_attr_title.attributes()) == 2 + assert parser_with_attr_title.attributes().names == ["free_access", "title"] def test_supported_unsupported(self): class ParserWithValidatedAndUnvalidated(BaseParser): @@ -63,12 +63,12 @@ def unvalidated(self) -> str: return "unsupported" parser = ParserWithValidatedAndUnvalidated() - assert len(parser.attributes()) == 2 + assert len(parser.attributes()) == 3 assert (validated := parser.attributes().validated) assert isinstance(validated, AttributeCollection) assert (funcs := list(validated)) != 
[parser.validated] - assert funcs[0].__func__ == parser.validated.__func__ + assert funcs[1].__func__ == parser.validated.__func__ assert (unvalidated := parser.attributes().unvalidated) assert isinstance(validated, AttributeCollection)