docs: add sphinx docs, type hinting
pjsier committed Jul 13, 2020
1 parent 0ff0439 commit 741d7c0
Showing 35 changed files with 691 additions and 84 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -26,7 +26,7 @@ jobs:
- run: python setup.py develop

- name: Check imports with isort
run: isort --check-only --diff || exit 1
run: isort . --check --diff || exit 1

- name: Check formatting with black
run: black ./city_scrapers_core ./tests --check
1 change: 1 addition & 0 deletions .gitignore
@@ -12,6 +12,7 @@ env/
build/
develop-eggs/
dist/
docs/_build/
docs/_site/
downloads/
eggs/
4 changes: 4 additions & 0 deletions README.md
@@ -2,8 +2,12 @@

[![Build status](https://github.com/City-Bureau/city-scrapers-core/workflows/CI/badge.svg)](https://github.com/City-Bureau/city-scrapers-core/actions)

[![Documentation Status](https://readthedocs.org/projects/city-scrapers-core/badge/?version=latest)](https://city-scrapers-core.readthedocs.io/en/latest/?badge=latest)

Core functionality for creating public meetings web scrapers for the [City Scrapers](https://cityscrapers.org/) project.

See the [documentation](https://city-scrapers-core.readthedocs.io/) for more details.

## Installation

2 changes: 1 addition & 1 deletion city_scrapers_core/commands/genspider.py
@@ -1,5 +1,5 @@
import logging
import json
import logging
import shutil
import string
from datetime import datetime
1 change: 0 additions & 1 deletion city_scrapers_core/commands/validate.py
@@ -6,7 +6,6 @@

from ..pipelines import ValidationPipeline


logger = logging.getLogger(__name__)


7 changes: 6 additions & 1 deletion city_scrapers_core/decorators.py
@@ -2,7 +2,12 @@


def ignore_processed(func):
"""Method decorator to ignore processed items passed to pipeline by middleware"""
"""Method decorator to ignore processed items passed to pipeline by middleware.
This should be used on the ``process_item`` method of any additional custom
pipelines used to handle :class:`Meeting` objects to make sure that ``dict`` items
passed by :class:`DiffPipeline` don't cause issues.
"""

@wraps(func)
def wrapper(*args, **kwargs):
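For context, a minimal sketch of the usage the new docstring describes, in a hypothetical downstream scraper project. The pipeline class name and the `setdefault` call are invented for illustration; only `ignore_processed` and the `process_item` hook come from the library.

```python
from city_scrapers_core.decorators import ignore_processed


class CustomMeetingPipeline:
    """Hypothetical custom pipeline in a scraper project."""

    @ignore_processed
    def process_item(self, item, spider):
        # Per the docstring above, dict items re-emitted by DiffPipeline are
        # expected to bypass this body, so only freshly scraped Meeting objects
        # reach the code below.
        item.setdefault("description", "")
        return item
```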
9 changes: 8 additions & 1 deletion city_scrapers_core/extensions/__init__.py
@@ -1,2 +1,9 @@
from .azure_storage import AzureBlobFeedStorage # noqa
from .status import AzureBlobStatusExtension, S3StatusExtension # noqa
from .status import AzureBlobStatusExtension, S3StatusExtension, StatusExtension # noqa

__all__ = [
"AzureBlobFeedStorage",
"StatusExtension",
"AzureBlobStatusExtension",
"S3StatusExtension",
]
12 changes: 10 additions & 2 deletions city_scrapers_core/extensions/azure_storage.py
@@ -2,7 +2,15 @@


class AzureBlobFeedStorage(BlockingFeedStorage):
def __init__(self, uri):
"""
Subclass of :class:`scrapy.extensions.feedexport.BlockingFeedStorage` for writing
scraper results to Azure Blob Storage.
:param uri: Azure Blob Storage URL including an account name, credentials,
container, and filename
"""

def __init__(self, uri: str):
from azure.storage.blob import ContainerClient

container = uri.split("@")[1].split("/")[0]
@@ -14,7 +22,7 @@ def __init__(self, uri):
self.container = container
self.filename = filename
self.container_client = ContainerClient(
"{}.blob.core.windows.net".format(self.account_name),
f"{self.account_name}.blob.core.windows.net",
self.container,
credential=self.account_key,
)
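A hedged sketch of how a scraper project might point Scrapy's feed export at this storage backend. The `azure` scheme name, the account values, and the exact URI layout are assumptions inferred from the parsing shown above (credentials before the `@`, container and filename after it); this diff does not spell them out.

```python
# settings.py (sketch; scheme name, account values, and URI layout are assumed)
FEED_STORAGES = {
    "azure": "city_scrapers_core.extensions.AzureBlobFeedStorage",
}

# __init__ splits the URI on "@" and "/" to recover the container and filename,
# so the feed URI is expected to look roughly like this:
FEED_URI = "azure://someaccount:somekey@some-container/%(name)s.json"
```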
72 changes: 55 additions & 17 deletions city_scrapers_core/extensions/status.py
@@ -1,7 +1,8 @@
from datetime import datetime

import pytz
from scrapy import signals
from scrapy import Spider, signals
from scrapy.crawler import Crawler

RUNNING = "running"
FAILING = "failing"
@@ -31,59 +32,88 @@


class StatusExtension:
"""
Scrapy extension for maintaining an SVG badge for each scraper's status.
TODO: Track how many items are scraped on each run.
"""
"""Scrapy extension for maintaining an SVG badge for each scraper's status."""

def __init__(self, crawler):
def __init__(self, crawler: Crawler):
self.crawler = crawler
self.has_error = False
# TODO: Track how many items are scraped on each run.

@classmethod
def from_crawler(cls, crawler):
def from_crawler(cls, crawler: Crawler):
"""Generate an extension from a crawler
:param crawler: Current scrapy crawler
"""
ext = cls(crawler)
crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
crawler.signals.connect(ext.spider_error, signal=signals.spider_error)
return ext

def spider_closed(self):
"""Updates the status SVG with a running status unless the spider has
encountered an error in which case it exits
"""
if self.has_error:
return
svg = self.create_status_svg(self.crawler.spider, RUNNING)
self.update_status_svg(self.crawler.spider, svg)

def spider_error(self):
"""Sets the `has_error` flag on the first spider error and immediately updates the
SVG with a "failing" status
"""
self.has_error = True
svg = self.create_status_svg(self.crawler.spider, FAILING)
self.update_status_svg(self.crawler.spider, svg)

def create_status_svg(self, spider, status):
def create_status_svg(self, spider: Spider, status: str) -> str:
"""Format a template status SVG string based on a spider and status information
:param spider: Spider to determine the status for
:param status: String indicating scraper status, one of "running", "failing"
:return: An SVG string formatted for a given spider and status
"""

tz = pytz.timezone(spider.timezone)
return STATUS_ICON.format(
color=STATUS_COLOR_MAP[status],
status=status,
date=tz.localize(datetime.now()).strftime("%Y-%m-%d"),
)

def update_status_svg(self, spider, svg):
def update_status_svg(self, spider: Spider, svg: str):
"""Method for updating the status button SVG for a storage provider. Must be
implemented on subclasses.
:param spider: Spider with the status being tracked
:param svg: Templated SVG string
:raises NotImplementedError: Raises if not implemented on subclass
"""
raise NotImplementedError


class AzureBlobStatusExtension(StatusExtension):
def update_status_svg(self, spider, svg):
"""
Implements :class:`StatusExtension` for Azure Blob Storage
"""

def update_status_svg(self, spider: Spider, svg: str):
"""Implements writing templated status SVG to Azure Blob Storage
:param spider: Spider with the status being tracked
:param svg: Templated SVG string
"""

from azure.storage.blob import ContainerClient, ContentSettings

container_client = ContainerClient(
"{}.blob.core.windows.net".format(
self.crawler.settings.get("AZURE_ACCOUNT_NAME")
),
f"{self.crawler.settings.get('AZURE_ACCOUNT_NAME')}.blob.core.windows.net",
self.crawler.settings.get("CITY_SCRAPERS_STATUS_CONTAINER"),
credential=self.crawler.settings.get("AZURE_ACCOUNT_KEY"),
)
container_client.upload_blob(
"{}.svg".format(spider.name),
f"{spider.name}.svg",
svg,
content_settings=ContentSettings(
content_type="image/svg+xml", cache_control="no-cache"
@@ -93,7 +123,15 @@ def update_status_svg(self, spider, svg):


class S3StatusExtension(StatusExtension):
def update_status_svg(self, spider, svg):
"""Implements :class:`StatusExtension` for AWS S3"""

def update_status_svg(self, spider: Spider, svg: str):
"""Implements writing templated status SVG to AWS S3
:param spider: Spider with the status being tracked
:param svg: Templated SVG string
"""

import boto3

s3_client = boto3.client(
@@ -106,5 +144,5 @@ def update_status_svg(self, spider, svg):
Bucket=self.crawler.settings.get("CITY_SCRAPERS_STATUS_BUCKET"),
CacheControl="no-cache",
ContentType="image/svg+xml",
Key="{}.svg".format(spider.name),
Key=f"{spider.name}.svg",
)
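To illustrate the subclassing contract that the `update_status_svg` docstring describes, here is a hypothetical extension that writes the badge to the local filesystem instead of a cloud bucket. The class name, settings key, and output directory are invented for the example; the method signature and the use of the templated SVG string come from the code above.

```python
from pathlib import Path

from scrapy import Spider

from city_scrapers_core.extensions import StatusExtension


class LocalStatusExtension(StatusExtension):
    """Hypothetical subclass writing the status badge to disk instead of cloud storage."""

    def update_status_svg(self, spider: Spider, svg: str):
        # svg is the string returned by create_status_svg, already templated
        # with the status color, label, and localized date.
        out_dir = Path(self.crawler.settings.get("CITY_SCRAPERS_STATUS_DIR", "badges"))
        out_dir.mkdir(parents=True, exist_ok=True)
        (out_dir / f"{spider.name}.svg").write_text(svg)
```

Like the built-in S3 and Azure variants, such a subclass would presumably be enabled through Scrapy's `EXTENSIONS` setting in a scraper project.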
2 changes: 2 additions & 0 deletions city_scrapers_core/items.py
@@ -4,6 +4,8 @@


class Meeting(scrapy.Item):
"""Main scrapy Item subclass used for handing meetings."""

id = scrapy.Field()
title = scrapy.Field()
description = scrapy.Field()
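A small sketch of constructing the item, for reference. Only fields visible in this diff (`title`, `description`) are used, since the rest of the field list is collapsed in this view, and the values are made up.

```python
from city_scrapers_core.items import Meeting

# scrapy.Item subclasses accept keyword arguments for declared fields and
# raise KeyError for anything the class does not declare.
meeting = Meeting(
    title="Board of Commissioners",
    description="Hypothetical example meeting",
)
```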
10 changes: 10 additions & 0 deletions city_scrapers_core/pipelines/__init__.py
@@ -3,3 +3,13 @@
from .meeting import MeetingPipeline # noqa
from .ocd import OpenCivicDataPipeline # noqa
from .validation import ValidationPipeline # noqa

__all__ = [
"DefaultValuesPipeline",
"DiffPipeline",
"AzureDiffPipeline",
"S3DiffPipeline",
"MeetingPipeline",
"OpenCivicDataPipeline",
"ValidationPipeline",
]
17 changes: 13 additions & 4 deletions city_scrapers_core/pipelines/default.py
@@ -1,12 +1,21 @@
from city_scrapers_core.constants import NOT_CLASSIFIED, TENTATIVE
from city_scrapers_core.decorators import ignore_processed
from scrapy import Item, Spider

from ..constants import NOT_CLASSIFIED, TENTATIVE
from ..decorators import ignore_processed


class DefaultValuesPipeline:
"""Sets default values for Meeting items"""
"""Pipeline for setting default values on scraped Item objects"""

@ignore_processed
def process_item(self, item, spider):
def process_item(self, item: Item, spider: Spider) -> Item:
"""Pipeline hook for setting multiple default values for scraped Item objects
:param item: An individual Item that's been scraped
:param spider: Spider passed to the pipeline
:return: Item with defaults set
"""

item.setdefault("description", "")
item.setdefault("all_day", False)
item.setdefault("location", {})
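To make the `setdefault` behavior concrete, a rough usage sketch. A bare `Spider` stands in for the real scraper, and only the defaults visible in this hunk are asserted, since the remainder of the method is collapsed in this view; this assumes the collapsed part needs no more spider state than a bare `Spider` provides.

```python
from scrapy import Spider

from city_scrapers_core.items import Meeting
from city_scrapers_core.pipelines import DefaultValuesPipeline

pipeline = DefaultValuesPipeline()
meeting = Meeting(title="Zoning Board")

# The framework would normally pass the running spider to process_item.
meeting = pipeline.process_item(meeting, Spider(name="example"))

assert meeting["description"] == ""
assert meeting["all_day"] is False
assert meeting["location"] == {}
```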