docs: add sphinx docs, type hinting
pjsier committed Jul 13, 2020
1 parent 0ff0439 commit 741d7c0
Showing 35 changed files with 691 additions and 84 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -26,7 +26,7 @@ jobs:
- run: python setup.py develop

- name: Check imports with isort
run: isort --check-only --diff || exit 1
run: isort . --check --diff || exit 1

- name: Check formatting with black
run: black ./city_scrapers_core ./tests --check
1 change: 1 addition & 0 deletions .gitignore
@@ -12,6 +12,7 @@ env/
build/
develop-eggs/
dist/
docs/_build/
docs/_site/
downloads/
eggs/
4 changes: 4 additions & 0 deletions README.md
@@ -2,8 +2,12 @@

[![Build status](https://github.com/City-Bureau/city-scrapers-core/workflows/CI/badge.svg)](https://github.com/City-Bureau/city-scrapers-core/actions)

[![Documentation Status](https://readthedocs.org/projects/city-scrapers-core/badge/?version=latest)](https://city-scrapers-core.readthedocs.io/en/latest/?badge=latest)

Core functionality for creating public meetings web scrapers for the [City Scrapers](https://cityscrapers.org/) project.

See the [documentation](https://city-scrapers-core.readthedocs.io/) for more details.

## Installation

2 changes: 1 addition & 1 deletion city_scrapers_core/commands/genspider.py
@@ -1,5 +1,5 @@
import logging
import json
import logging
import shutil
import string
from datetime import datetime
1 change: 0 additions & 1 deletion city_scrapers_core/commands/validate.py
@@ -6,7 +6,6 @@

from ..pipelines import ValidationPipeline


logger = logging.getLogger(__name__)


7 changes: 6 additions & 1 deletion city_scrapers_core/decorators.py
@@ -2,7 +2,12 @@


def ignore_processed(func):
"""Method decorator to ignore processed items passed to pipeline by middleware"""
"""Method decorator to ignore processed items passed to pipeline by middleware.
This should be used on the ``process_item`` method of any additional custom
pipelines used to handle :class:`Meeting` objects to make sure that ``dict`` items
passed by :class:`DiffPipeline` don't cause issues.
"""

@wraps(func)
def wrapper(*args, **kwargs):
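For context, a minimal sketch of the usage the new docstring describes, in a hypothetical downstream scraper project. The pipeline class name and the `setdefault` call are invented for illustration; only `ignore_processed` and the `process_item` hook come from the library.

```python
from city_scrapers_core.decorators import ignore_processed


class CustomMeetingPipeline:
    """Hypothetical custom pipeline in a scraper project."""

    @ignore_processed
    def process_item(self, item, spider):
        # Per the docstring above, dict items re-emitted by DiffPipeline are
        # expected to bypass this body, so only freshly scraped Meeting objects
        # reach the code below.
        item.setdefault("description", "")
        return item
```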
9 changes: 8 additions & 1 deletion city_scrapers_core/extensions/__init__.py
@@ -1,2 +1,9 @@
from .azure_storage import AzureBlobFeedStorage # noqa
from .status import AzureBlobStatusExtension, S3StatusExtension # noqa
from .status import AzureBlobStatusExtension, S3StatusExtension, StatusExtension # noqa

__all__ = [
"AzureBlobFeedStorage",
"StatusExtension",
"AzureBlobStatusExtension",
"S3StatusExtension",
]
12 changes: 10 additions & 2 deletions city_scrapers_core/extensions/azure_storage.py
@@ -2,7 +2,15 @@


class AzureBlobFeedStorage(BlockingFeedStorage):
def __init__(self, uri):
"""
Subclass of :class:`scrapy.extensions.feedexport.BlockingFeedStorage` for writing
scraper results to Azure Blob Storage.
:param uri: Azure Blob Storage URL including an account name, credentials,
container, and filename
"""

def __init__(self, uri: str):
from azure.storage.blob import ContainerClient

container = uri.split("@")[1].split("/")[0]
@@ -14,7 +22,7 @@ def __init__(self, uri):
self.container = container
self.filename = filename
self.container_client = ContainerClient(
"{}.blob.core.windows.net".format(self.account_name),
f"{self.account_name}.blob.core.windows.net",
self.container,
credential=self.account_key,
)
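A hedged sketch of how a scraper project might point Scrapy's feed export at this storage backend. The `azure` scheme name, the account values, and the exact URI layout are assumptions inferred from the parsing shown above (credentials before the `@`, container and filename after it); this diff does not spell them out.

```python
# settings.py (sketch; scheme name, account values, and URI layout are assumed)
FEED_STORAGES = {
    "azure": "city_scrapers_core.extensions.AzureBlobFeedStorage",
}

# __init__ splits the URI on "@" and "/" to recover the container and filename,
# so the feed URI is expected to look roughly like this:
FEED_URI = "azure://someaccount:somekey@some-container/%(name)s.json"
```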
72 changes: 55 additions & 17 deletions city_scrapers_core/extensions/status.py
@@ -1,7 +1,8 @@
from datetime import datetime

import pytz
from scrapy import signals
from scrapy import Spider, signals
from scrapy.crawler import Crawler

RUNNING = "running"
FAILING = "failing"
@@ -31,59 +32,88 @@


class StatusExtension:
"""
Scrapy extension for maintaining an SVG badge for each scraper's status.
TODO: Track how many items are scraped on each run.
"""
"""Scrapy extension for maintaining an SVG badge for each scraper's status."""

def __init__(self, crawler):
def __init__(self, crawler: Crawler):
self.crawler = crawler
self.has_error = False
# TODO: Track how many items are scraped on each run.

@classmethod
def from_crawler(cls, crawler):
def from_crawler(cls, crawler: Crawler):
"""Generate an extension from a crawler
:param crawler: Current scrapy crawler
"""
ext = cls(crawler)
crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
crawler.signals.connect(ext.spider_error, signal=signals.spider_error)
return ext

def spider_closed(self):
"""Updates the status SVG with a running status unless the spider has
encountered an error in which case it exits
"""
if self.has_error:
return
svg = self.create_status_svg(self.crawler.spider, RUNNING)
self.update_status_svg(self.crawler.spider, svg)

def spider_error(self):
"""Sets the `has_error` flag on the first spider error and immediately updates the
SVG with a "failing" status
"""
self.has_error = True
svg = self.create_status_svg(self.crawler.spider, FAILING)
self.update_status_svg(self.crawler.spider, svg)

def create_status_svg(self, spider, status):
def create_status_svg(self, spider: Spider, status: str) -> str:
"""Format a template status SVG string based on a spider and status information
:param spider: Spider to determine the status for
:param status: String indicating scraper status, one of "running", "failing"
:return: An SVG string formatted for a given spider and status
"""

tz = pytz.timezone(spider.timezone)
return STATUS_ICON.format(
color=STATUS_COLOR_MAP[status],
status=status,
date=tz.localize(datetime.now()).strftime("%Y-%m-%d"),
)

def update_status_svg(self, spider, svg):
def update_status_svg(self, spider: Spider, svg: str):
"""Method for updating the status button SVG for a storage provider. Must be
implemented on subclasses.
:param spider: Spider with the status being tracked
:param svg: Templated SVG string
:raises NotImplementedError: Raises if not implemented on subclass
"""
raise NotImplementedError


class AzureBlobStatusExtension(StatusExtension):
def update_status_svg(self, spider, svg):
"""
Implements :class:`StatusExtension` for Azure Blob Storage
"""

def update_status_svg(self, spider: Spider, svg: str):
"""Implements writing templated status SVG to Azure Blob Storage
:param spider: Spider with the status being tracked
:param svg: Templated SVG string
"""

from azure.storage.blob import ContainerClient, ContentSettings

container_client = ContainerClient(
"{}.blob.core.windows.net".format(
self.crawler.settings.get("AZURE_ACCOUNT_NAME")
),
f"{self.crawler.settings.get('AZURE_ACCOUNT_NAME')}.blob.core.windows.net",
self.crawler.settings.get("CITY_SCRAPERS_STATUS_CONTAINER"),
credential=self.crawler.settings.get("AZURE_ACCOUNT_KEY"),
)
container_client.upload_blob(
"{}.svg".format(spider.name),
f"{spider.name}.svg",
svg,
content_settings=ContentSettings(
content_type="image/svg+xml", cache_control="no-cache"
@@ -93,7 +123,15 @@ def update_status_svg(self, spider, svg):


class S3StatusExtension(StatusExtension):
def update_status_svg(self, spider, svg):
"""Implements :class:`StatusExtension` for AWS S3"""

def update_status_svg(self, spider: Spider, svg: str):
"""Implements writing templated status SVG to AWS S3
:param spider: Spider with the status being tracked
:param svg: Templated SVG string
"""

import boto3

s3_client = boto3.client(
@@ -106,5 +144,5 @@ def update_status_svg(self, spider, svg):
Bucket=self.crawler.settings.get("CITY_SCRAPERS_STATUS_BUCKET"),
CacheControl="no-cache",
ContentType="image/svg+xml",
Key="{}.svg".format(spider.name),
Key=f"{spider.name}.svg",
)
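To illustrate the subclassing contract that the `update_status_svg` docstring describes, here is a hypothetical extension that writes the badge to the local filesystem instead of a cloud bucket. The class name, settings key, and output directory are invented for the example; the method signature and the use of the templated SVG string come from the code above.

```python
from pathlib import Path

from scrapy import Spider

from city_scrapers_core.extensions import StatusExtension


class LocalStatusExtension(StatusExtension):
    """Hypothetical subclass writing the status badge to disk instead of cloud storage."""

    def update_status_svg(self, spider: Spider, svg: str):
        # svg is the string returned by create_status_svg, already templated
        # with the status color, label, and localized date.
        out_dir = Path(self.crawler.settings.get("CITY_SCRAPERS_STATUS_DIR", "badges"))
        out_dir.mkdir(parents=True, exist_ok=True)
        (out_dir / f"{spider.name}.svg").write_text(svg)
```

Like the built-in S3 and Azure variants, such a subclass would presumably be enabled through Scrapy's `EXTENSIONS` setting in a scraper project.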
2 changes: 2 additions & 0 deletions city_scrapers_core/items.py
@@ -4,6 +4,8 @@


class Meeting(scrapy.Item):
"""Main scrapy Item subclass used for handing meetings."""

id = scrapy.Field()
title = scrapy.Field()
description = scrapy.Field()
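A small sketch of constructing the item, for reference. Only fields visible in this diff (`title`, `description`) are used, since the rest of the field list is collapsed in this view, and the values are made up.

```python
from city_scrapers_core.items import Meeting

# scrapy.Item subclasses accept keyword arguments for declared fields and
# raise KeyError for anything the class does not declare.
meeting = Meeting(
    title="Board of Commissioners",
    description="Hypothetical example meeting",
)
```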
10 changes: 10 additions & 0 deletions city_scrapers_core/pipelines/__init__.py
@@ -3,3 +3,13 @@
from .meeting import MeetingPipeline # noqa
from .ocd import OpenCivicDataPipeline # noqa
from .validation import ValidationPipeline # noqa

__all__ = [
"DefaultValuesPipeline",
"DiffPipeline",
"AzureDiffPipeline",
"S3DiffPipeline",
"MeetingPipeline",
"OpenCivicDataPipeline",
"ValidationPipeline",
]
17 changes: 13 additions & 4 deletions city_scrapers_core/pipelines/default.py
@@ -1,12 +1,21 @@
from city_scrapers_core.constants import NOT_CLASSIFIED, TENTATIVE
from city_scrapers_core.decorators import ignore_processed
from scrapy import Item, Spider

from ..constants import NOT_CLASSIFIED, TENTATIVE
from ..decorators import ignore_processed


class DefaultValuesPipeline:
"""Sets default values for Meeting items"""
"""Pipeline for setting default values on scraped Item objects"""

@ignore_processed
def process_item(self, item, spider):
def process_item(self, item: Item, spider: Spider) -> Item:
"""Pipeline hook for setting multiple default values for scraped Item objects
:param item: An individual Item that's been scraped
:param spider: Spider passed to the pipeline
:return: Item with defaults set
"""

item.setdefault("description", "")
item.setdefault("all_day", False)
item.setdefault("location", {})
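To make the `setdefault` behavior concrete, a rough usage sketch. A bare `Spider` stands in for the real scraper, and only the defaults visible in this hunk are asserted, since the remainder of the method is collapsed in this view; this assumes the collapsed part needs no more spider state than a bare `Spider` provides.

```python
from scrapy import Spider

from city_scrapers_core.items import Meeting
from city_scrapers_core.pipelines import DefaultValuesPipeline

pipeline = DefaultValuesPipeline()
meeting = Meeting(title="Zoning Board")

# The framework would normally pass the running spider to process_item.
meeting = pipeline.process_item(meeting, Spider(name="example"))

assert meeting["description"] == ""
assert meeting["all_day"] is False
assert meeting["location"] == {}
```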