Resolve Merge Conflict

addie9800 · addie9800 · commit 395a0ebd0cc8 · 2024-02-20T22:12:57.000+01:00
diff --git a/docs/attribute_guidelines.md b/docs/attribute_guidelines.md
@@ -58,4 +58,12 @@ Those attributes will be validated with unit tests when used.
         <td><code>List[str]</code></td>
         <td><code>generic_topic_parsing</code></td>
     </tr>
+    <tr>
+        <td>free_access</td>
+        <td>A boolean which is set to False if the article is restricted to users with a subscription, and True otherwise.
+        A value of False usually indicates that the article cannot be crawled completely.
+        <i>This attribute is implemented by default</i></td>
+        <td><code>bool</code></td>
+        <td><code></code></td>
+    </tr>
 </table>
diff --git a/docs/how_to_add_a_publisher.md b/docs/how_to_add_a_publisher.md
@@ -469,6 +469,24 @@ Instead, we recommend referring to [this](https://devhints.io/xpath) documentati
 Make sure to examine other parsers and consult the [attribute guidelines](attribute_guidelines.md) for specifics on attribute implementation. 
 We strongly encourage utilizing these utility functions, especially when parsing the `ArticleBody`.
 
+### Checking the free_access attribute
+
+In case your new publisher does not have a subscription model, you can go ahead and skip this step. If it does,
+please verify that there is a tag `isAccessibleForFree` within the `<script type="application/ld+json">` blocks in the
+source code of premium articles that is set to either `false` or `False`. It doesn't matter if the tag is missing in the
+freely accessible articles. If this is the case, you can continue with the next step. If not, please override the
+default implementation by adding the following snippet to your parser:
+
+```python
+@attribute
+def free_access(self) -> bool:
+    # Your personalized logic goes here
+    pass
+```
+
+Usually you can identify a premium article by an indicator within the URL or by using XPath or CSSSelector and selecting
+the element asking the user to purchase a subscription to view the article.
+
 ### Finishing the Parser
 
 Bringing all the above together, the Los Angeles Times now looks like this.
diff --git a/src/fundus/parser/base_parser.py b/src/fundus/parser/base_parser.py
@@ -172,7 +172,7 @@ def _search_members(cls, obj_type: type) -> List[Tuple[str, Any]]:
     @classmethod
     def attributes(cls) -> AttributeCollection:
         attrs: List[Attribute] = [
-            func for _, func in cls._search_members(Attribute) if func.__name__ not in ["__ld", "__meta"]
+            func for _, func in cls._search_members(Attribute) if func.__name__ not in ["__ld", "__meta", "free_access"]
         ]
         return AttributeCollection(*attrs)
 
@@ -233,6 +233,15 @@ def __meta(self) -> Dict[str, Any]:
     def __ld(self) -> Optional[LinkedDataMapping]:
         return self.precomputed.ld
 
+    @attribute
+    def free_access(self) -> bool:
+        if (isAccessibleForFree := self.precomputed.ld.bf_search("isAccessibleForFree")) is None:
+            return True
+        elif not isAccessibleForFree or isAccessibleForFree == "false" or isAccessibleForFree == "False":
+            return False
+        else:
+            return True
+
 
 class _ParserCache:
     def __init__(self, factory: Type[BaseParser]):
diff --git a/src/fundus/publishers/de/bild.py b/src/fundus/publishers/de/bild.py
@@ -1,4 +1,5 @@
 import datetime
+import re
 from typing import List, Optional
 
 from lxml.etree import XPath
@@ -42,3 +43,10 @@ def title(self) -> Optional[str]:
         @attribute
         def topics(self) -> List[str]:
             return generic_topic_parsing(self.precomputed.meta.get("keywords"))
+
+        @attribute
+        def free_access(self) -> bool:
+            if (url := self.precomputed.meta.get("og:url")) is not None:
+                return re.search(r"/bild-plus/", url) is None
+            else:
+                return True
diff --git a/src/fundus/publishers/de/braunschweiger_zeitung.py b/src/fundus/publishers/de/braunschweiger_zeitung.py
@@ -56,7 +56,3 @@ def authors(self) -> List[str]:
         @attribute
         def publishing_date(self) -> Optional[datetime.datetime]:
             return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))
-
-        @attribute(validate=False)
-        def free_access(self) -> bool:
-            return self.precomputed.ld.bf_search("isAccessibleForFree") == "True"
diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py
@@ -34,6 +34,7 @@ async def scrape(
             supported_attributes = set(
                 more_itertools.flatten(collection.names for collection in self.parser.attribute_mapping.values())
             )
+            supported_attributes.add("free_access")
             if missing_attributes := extraction_filter.required_attributes - supported_attributes:
                 if len(missing_attributes) == 1:
                     basic_logger.warning(

Original file line number	Diff line number	Diff line change
`@@ -34,6 +34,7 @@ async def scrape(`
`34`	`34`	`supported_attributes = set(`
`35`	`35`	`more_itertools.flatten(collection.names for collection in self.parser.attribute_mapping.values())`
`36`	`36`	`)`
	`37`	`+ supported_attributes.add("free_access")`
`37`	`38`	`if missing_attributes := extraction_filter.required_attributes - supported_attributes:`
`38`	`39`	`if len(missing_attributes) == 1:`
`39`	`40`	`basic_logger.warning(`