refactor: drop py 3.5 support, f strings, logs
Drops Python 3.5 support, replaces several calls to .format() with f-string literals, and replaces print() calls with logging calls.
pjsier committed Jul 10, 2020
1 parent a2f0fed commit 8e1f739
Showing 7 changed files with 44 additions and 43 deletions.
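The pattern is the same across all seven files and can be summed up in a short, hypothetical sketch (the names and values below are illustrative, not taken from the diff): a module-level logger replaces print(), and f-strings, which require Python 3.6+, replace .format().

import logging

logger = logging.getLogger(__name__)

account_name = "citymeetings"  # hypothetical storage account name
container = "feeds"  # hypothetical container name

# Old style removed by this commit (Python 3.5 compatible)
endpoint = "{}.blob.core.windows.net".format(account_name)
print("Combining feeds from {}".format(container))

# New style introduced by this commit (requires Python 3.6+)
endpoint = f"{account_name}.blob.core.windows.net"
logger.info(f"Combining feeds from {container}")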
11 changes: 4 additions & 7 deletions city_scrapers_core/commands/combinefeeds.py
@@ -101,9 +101,7 @@ def combine_azure(self):
account_name, account_key = feed_uri[8::].split("@")[0].split(":")
container = feed_uri.split("@")[1].split("/")[0]
container_client = ContainerClient(
"{}.blob.core.windows.net".format(account_name),
container,
credential=account_key,
f"{account_name}.blob.core.windows.net", container, credential=account_key,
)

max_days_previous = 3
@@ -134,9 +132,8 @@ def combine_azure(self):
spider_blob_name = blob_name.split("/")[-1]
spider_blob = container_client.get_blob_client(spider_blob_name)
spider_blob.start_copy_from_url(
"https://{}.blob.core.windows.net/{}/{}".format(
account_name, quote(container), blob_name
)
f"https://{account_name}.blob.core.windows.net"
f"/{quote(container)}/{blob_name}"
)
meetings = sorted(meetings, key=itemgetter(self.start_key))
yesterday_iso = (datetime.now() - timedelta(days=1)).isoformat()[:19]
@@ -164,7 +161,7 @@ def get_spider_paths(self, path_list):
"""Get a list of the most recent scraper results for each spider"""
spider_paths = []
for spider in self.crawler_process.spider_loader.list():
all_spider_paths = [p for p in path_list if "{}.".format(spider) in p]
all_spider_paths = [p for p in path_list if f"{spider}." in p]
if len(all_spider_paths) > 0:
spider_paths.append(sorted(all_spider_paths)[-1])
return spider_paths
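For context, the f"{spider}." membership test above keeps the behavior of the old .format() call: it matches each spider's own output files and then keeps only the newest one. A minimal sketch of that selection, using hypothetical spider names and date-prefixed paths:

spiders = ["chi_city_council", "chi_parks"]  # hypothetical spider names
path_list = [
    "2020/07/09/chi_city_council.json",
    "2020/07/10/chi_city_council.json",
    "2020/07/10/chi_parks.json",
]  # hypothetical scraper output paths

spider_paths = []
for spider in spiders:
    # f"{spider}." matches "<name>.json" without also matching longer names like "chi_parks_east"
    all_spider_paths = [p for p in path_list if f"{spider}." in p]
    if len(all_spider_paths) > 0:
        # Paths sort lexicographically by date prefix, so the last one is the most recent run
        spider_paths.append(sorted(all_spider_paths)[-1])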
33 changes: 17 additions & 16 deletions city_scrapers_core/commands/genspider.py
@@ -1,3 +1,4 @@
import logging
import json
import shutil
import string
@@ -14,6 +15,8 @@

USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36" # noqa

logger = logging.getLogger(__name__)


class Command(ScrapyCommand):
requires_project = False
@@ -34,13 +37,13 @@ def run(self, args, opts):
test_template = "test.tmpl"
if "legistar.com" in domain:
proto = "https" if start_url.startswith("https") else "http"
start_url = "{}://{}".format(proto, domain)
start_url = f"{proto}://{domain}"
spider_template = "spider_legistar.tmpl"
test_template = "test_legistar.tmpl"
fixture_file = self._gen_legistar_fixtures(name, start_url)
else:
fixture_file = self._gen_fixtures(name, start_url)
classname = "{}Spider".format(string.capwords(name, sep="_").replace("_", ""))
classname = f"{string.capwords(name, sep='_').replace('_', '')}Spider"
self._genspider(name, agency, classname, domain, start_url, spider_template)
self._gen_tests(name, classname, start_url, fixture_file, test_template)

@@ -51,14 +54,12 @@ def _genspider(self, name, agency, classname, domain, start_url, template_file):
"agency": agency,
"domain": domain,
"start_url": start_url,
"classname": "{}Spider".format(
string.capwords(name, sep="_").replace("_", "")
),
"classname": f"{string.capwords(name, sep='_').replace('_', '')}Spider",
}
spider_file = "{}.py".format(join(self.spiders_dir, name))
spider_file = f"{join(self.spiders_dir, name)}.py"
shutil.copyfile(join(self.templates_dir, template_file), spider_file)
render_templatefile(spider_file, **template_dict)
print("Created file: {}".format(spider_file))
logger.info(f"Created file: {spider_file}")

def _gen_tests(self, name, classname, start_url, fixture_file, template_file):
"""Creates tests from test template file"""
@@ -70,34 +71,34 @@ def _gen_tests(self, name, classname, start_url, fixture_file, template_file):
}
if "legistar" not in name:
template_dict["start_url"] = start_url
test_file = join(self.tests_dir, "test_{}.py".format(name))
test_file = join(self.tests_dir, f"test_{name}.py")
shutil.copyfile(join(self.templates_dir, template_file), test_file)
render_templatefile(test_file, **template_dict)
print("Created file: {}".format(test_file))
logger.info(f"Created file: {test_file}")

def _gen_fixtures(self, name, start_url):
"""Creates fixtures from HTML response at the start URL"""
res = requests.get(start_url, headers={"user-agent": USER_AGENT})
content = res.text.strip()
fixture_file = join(self.fixtures_dir, "{}.html".format(name))
fixture_file = join(self.fixtures_dir, f"{name}.html")
with open(fixture_file, "w", encoding="utf-8") as f:
f.write(content)
print("Created file: {}".format(fixture_file))
return "{}.html".format(name)
logger.info(f"Created file: {fixture_file}")
return f"{name}.html"

def _gen_legistar_fixtures(self, name, start_url):
"""Creates fixtures from a Legistar response"""
events = []
les = LegistarEventsScraper()
les.BASE_URL = start_url
les.EVENTSPAGE = "{}/Calendar.aspx".format(start_url)
les.EVENTSPAGE = f"{start_url}/Calendar.aspx"
for event, _ in les.events(since=datetime.today().year):
events.append((dict(event), None))
fixture_file = join(self.fixtures_dir, "{}.json".format(name))
fixture_file = join(self.fixtures_dir, f"{name}.json")
with open(fixture_file, "w", encoding="utf-8") as f:
json.dump(events, f)
print("Created file: {}".format(fixture_file))
return "{}.json".format(name)
logger.info(f"Created file: {fixture_file}")
return f"{name}.json"

@property
def spiders_dir(self):
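As an aside, the classname f-string used twice above builds a CamelCase spider class name from the snake_case spider name; a quick sketch with a hypothetical name:

import string

name = "chi_city_council"  # hypothetical spider name passed to genspider
classname = f"{string.capwords(name, sep='_').replace('_', '')}Spider"
# string.capwords gives "Chi_City_Council", so classname == "ChiCityCouncilSpider"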
8 changes: 6 additions & 2 deletions city_scrapers_core/commands/validate.py
@@ -1,3 +1,4 @@
import logging
import os
from importlib import import_module

@@ -6,6 +7,9 @@
from ..pipelines import ValidationPipeline


logger = logging.getLogger(__name__)


class Command(ScrapyCommand):
requires_project = True

@@ -29,7 +33,7 @@ def run(self, args, opts):
spider_list = self.crawler_process.spider_loader.list()
spiders = [spider for spider in args if spider in spider_list]
if len(spiders) == 0 and not opts.all:
print("No spiders provided, exiting...")
logger.info("No spiders provided, exiting...")
return
elif opts.all:
spiders = spider_list
@@ -44,7 +48,7 @@ def _add_validation_pipeline(self):
# Exit if pipeline already included
if any(pipeline_name in pipeline for pipeline in pipelines.keys()):
return
fullname = "{}.{}".format(ValidationPipeline.__module__, pipeline_name)
fullname = f"{ValidationPipeline.__module__}.{pipeline_name}"
priority = 1
if len(pipelines.keys()) > 0:
priority = max(pipelines.values()) + 1
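The fullname f-string above rebuilds the dotted path Scrapy expects in ITEM_PIPELINES from the pipeline class itself. A rough sketch of the idea, with a hypothetical existing pipelines setting and a stand-in class (in the real project the module is the one ValidationPipeline is imported from):

class ValidationPipeline:  # stand-in class; __module__ here is "__main__"
    pass

pipeline_name = ValidationPipeline.__name__
pipelines = {"myproject.pipelines.MeetingPipeline": 200}  # hypothetical ITEM_PIPELINES setting

if not any(pipeline_name in pipeline for pipeline in pipelines.keys()):
    fullname = f"{ValidationPipeline.__module__}.{pipeline_name}"
    # Append after every existing pipeline so validation runs last
    priority = max(pipelines.values()) + 1 if len(pipelines) > 0 else 1
    pipelines[fullname] = priority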
8 changes: 3 additions & 5 deletions city_scrapers_core/pipelines/diff.py
@@ -99,7 +99,7 @@ def __init__(self, crawler, output_format):
self.spider = crawler.spider
self.container = feed_uri.split("@")[1].split("/")[0]
self.container_client = ContainerClient(
"{}.blob.core.windows.net".format(account_name),
f"{account_name}.blob.core.windows.net",
self.container,
credential=account_key,
)
@@ -119,9 +119,7 @@ def load_previous_results(self):
).strftime(self.feed_prefix)
)
spider_blobs = [
blob
for blob in matching_blobs
if "{}.".format(self.spider.name) in blob.name
blob for blob in matching_blobs if f"{self.spider.name}." in blob.name
]
if len(spider_blobs) > 0:
break
@@ -170,7 +168,7 @@ def load_previous_results(self):
spider_objects = [
obj
for obj in match_objects.get("Contents", [])
if "{}.".format(self.spider.name) in obj["Key"]
if f"{self.spider.name}." in obj["Key"]
]
if len(spider_objects) > 0:
break
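Both this pipeline and combinefeeds.py build the Azure endpoint host with the same f-string. A small sketch of how the azure:// feed URI is pulled apart, using made-up credentials:

feed_uri = "azure://myaccount:secretkey@meetings/%Y/%m/%d/%H%M%S.json"  # hypothetical FEED_URI setting
account_name, account_key = feed_uri[8:].split("@")[0].split(":")
container = feed_uri.split("@")[1].split("/")[0]
account_url = f"{account_name}.blob.core.windows.net"
# account_name == "myaccount", container == "meetings",
# account_url == "myaccount.blob.core.windows.net"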
17 changes: 9 additions & 8 deletions city_scrapers_core/pipelines/validation.py
@@ -1,8 +1,12 @@
import logging
from collections import defaultdict

from jsonschema.validators import Draft7Validator


logger = logging.getLogger(__name__)


class ValidationPipeline:
"""
Check against schema if present, prints % valid for each property.
@@ -43,17 +47,14 @@ def process_item(self, item, spider):
def validation_report(self, spider):
"""Prints a validation report to stdout and raises an error if validation fails"""
props = list(self.error_count.keys())
print(
"\n{line}Validation summary for: {spider}{line}".format(
line="-" * 12, spider=spider.name
)
)
print("Validating {} items\n".format(self.item_count))
line_str = "-" * 12
logger.info(f"\n{line_str}\nValidation summary for: {spider.name}\n{line_str}")
logger.info(f"Validating {self.item_count} items\n")
valid_list = []
for prop in props:
valid = (self.item_count - self.error_count[prop]) / self.item_count
valid_list.append(valid)
print("{}: {:.0%}".format(prop, valid))
logger.info("{}: {:.0%}".format(prop, valid))
try:
assert all([val >= 0.9 for val in valid_list])
except AssertionError:
@@ -66,7 +67,7 @@ def validation_report(self, spider):
if self.enforce_validation:
raise ValueError(message)
else:
print(message)
logger.info(message)

def _get_props_from_errors(self, errors):
error_props = []
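A self-contained sketch of the percent-valid summary the hunks above now send through logging instead of print(), with made-up item and error counts:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("validation_demo")  # hypothetical logger name

item_count = 20  # made-up number of scraped items
error_count = {"title": 1, "start": 3}  # made-up per-property error counts

valid_list = []
for prop, errors in error_count.items():
    valid = (item_count - errors) / item_count
    valid_list.append(valid)
    logger.info(f"{prop}: {valid:.0%}")  # e.g. "title: 95%", "start: 85%"

if not all(val >= 0.9 for val in valid_list):
    # The real pipeline raises ValueError here when validation is enforced
    logger.info("Less than 90% of items were valid for at least one property")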
8 changes: 4 additions & 4 deletions city_scrapers_core/spiders/legistar.py
@@ -18,7 +18,7 @@ def parse(self, response):
def _call_legistar(self, since=None):
les = LegistarEventsScraper()
les.BASE_URL = self.base_url
les.EVENTSPAGE = "{}/Calendar.aspx".format(self.base_url)
les.EVENTSPAGE = f"{self.base_url}/Calendar.aspx"
if not since:
since = datetime.today().year
return les.events(since=since)
@@ -29,7 +29,7 @@ def legistar_start(self, item):
if start_date and start_time:
try:
return datetime.strptime(
"{} {}".format(start_date, start_time), "%m/%d/%Y %I:%M %p"
f"{start_date} {start_time}", "%m/%d/%Y %I:%M %p"
)
except ValueError:
return datetime.strptime(start_date, "%m/%d/%Y")
@@ -42,7 +42,7 @@ def legistar_links(self, item):
return links

def legistar_source(self, item):
default_url = "{}/Calendar.aspx".format(self.base_url)
default_url = f"{self.base_url}/Calendar.aspx"
if isinstance(item.get("Name"), dict):
return item["Name"].get("url", default_url)
if isinstance(item.get("Meeting Details"), dict):
@@ -52,4 +52,4 @@ def legistar_source(self, item):
@property
def base_url(self):
parsed_url = urlparse(self.start_urls[0])
return "{}://{}".format(parsed_url.scheme, parsed_url.netloc)
return f"{parsed_url.scheme}://{parsed_url.netloc}"
2 changes: 1 addition & 1 deletion setup.py
@@ -21,7 +21,7 @@
install_requires=["jsonschema>=3.0.0a5", "pytz", "requests", "scrapy"],
tests_requires=["flake8", "pytest", "isort"],
extras_require={"aws": ["boto3"], "azure": ["azure-storage-blob>=12"]},
python_requires=">=3.5,<4.0",
python_requires=">=3.6,<4.0",
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
