From 8e1f7392efd5e4dd7956c6614673e72e231d61b9 Mon Sep 17 00:00:00 2001
From: pjsier
Date: Fri, 10 Jul 2020 07:10:04 -0500
Subject: [PATCH] refactor: drop py 3.5 support, f strings, logs

Drops Python 3.5 support, replaces several calls to .format() with f
string literals, replaces print() calls with logs
---
 city_scrapers_core/commands/combinefeeds.py | 11 +++----
 city_scrapers_core/commands/genspider.py    | 33 +++++++++++----------
 city_scrapers_core/commands/validate.py     |  8 +++--
 city_scrapers_core/pipelines/diff.py        |  8 ++---
 city_scrapers_core/pipelines/validation.py  | 17 ++++++-----
 city_scrapers_core/spiders/legistar.py      |  8 ++---
 setup.py                                    |  2 +-
 7 files changed, 44 insertions(+), 43 deletions(-)

diff --git a/city_scrapers_core/commands/combinefeeds.py b/city_scrapers_core/commands/combinefeeds.py
index b0db18b..17e7c96 100644
--- a/city_scrapers_core/commands/combinefeeds.py
+++ b/city_scrapers_core/commands/combinefeeds.py
@@ -101,9 +101,7 @@ def combine_azure(self):
         account_name, account_key = feed_uri[8::].split("@")[0].split(":")
         container = feed_uri.split("@")[1].split("/")[0]
         container_client = ContainerClient(
-            "{}.blob.core.windows.net".format(account_name),
-            container,
-            credential=account_key,
+            f"{account_name}.blob.core.windows.net", container, credential=account_key,
         )
 
         max_days_previous = 3
@@ -134,9 +132,8 @@ def combine_azure(self):
             spider_blob_name = blob_name.split("/")[-1]
             spider_blob = container_client.get_blob_client(spider_blob_name)
             spider_blob.start_copy_from_url(
-                "https://{}.blob.core.windows.net/{}/{}".format(
-                    account_name, quote(container), blob_name
-                )
+                f"https://{account_name}.blob.core.windows.net"
+                f"/{quote(container)}/{blob_name}"
             )
         meetings = sorted(meetings, key=itemgetter(self.start_key))
         yesterday_iso = (datetime.now() - timedelta(days=1)).isoformat()[:19]
@@ -164,7 +161,7 @@ def get_spider_paths(self, path_list):
         """Get a list of the most recent scraper results for each spider"""
         spider_paths = []
         for spider in self.crawler_process.spider_loader.list():
-            all_spider_paths = [p for p in path_list if "{}.".format(spider) in p]
+            all_spider_paths = [p for p in path_list if f"{spider}." in p]
             if len(all_spider_paths) > 0:
                 spider_paths.append(sorted(all_spider_paths)[-1])
         return spider_paths

diff --git a/city_scrapers_core/commands/genspider.py b/city_scrapers_core/commands/genspider.py
index 2fbf794..d0348ad 100644
--- a/city_scrapers_core/commands/genspider.py
+++ b/city_scrapers_core/commands/genspider.py
@@ -1,3 +1,4 @@
+import logging
 import json
 import shutil
 import string
@@ -14,6 +15,8 @@
 
 USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"  # noqa
 
+logger = logging.getLogger(__name__)
+
 
 class Command(ScrapyCommand):
     requires_project = False
@@ -34,13 +37,13 @@ def run(self, args, opts):
         test_template = "test.tmpl"
         if "legistar.com" in domain:
             proto = "https" if start_url.startswith("https") else "http"
-            start_url = "{}://{}".format(proto, domain)
+            start_url = f"{proto}://{domain}"
             spider_template = "spider_legistar.tmpl"
             test_template = "test_legistar.tmpl"
             fixture_file = self._gen_legistar_fixtures(name, start_url)
         else:
             fixture_file = self._gen_fixtures(name, start_url)
-        classname = "{}Spider".format(string.capwords(name, sep="_").replace("_", ""))
+        classname = f"{string.capwords(name, sep='_').replace('_', '')}Spider"
         self._genspider(name, agency, classname, domain, start_url, spider_template)
         self._gen_tests(name, classname, start_url, fixture_file, test_template)
@@ -51,14 +54,12 @@ def _genspider(self, name, agency, classname, domain, start_url, template_file):
             "agency": agency,
             "domain": domain,
             "start_url": start_url,
-            "classname": "{}Spider".format(
-                string.capwords(name, sep="_").replace("_", "")
-            ),
+            "classname": f"{string.capwords(name, sep='_').replace('_', '')}Spider",
         }
-        spider_file = "{}.py".format(join(self.spiders_dir, name))
+        spider_file = f"{join(self.spiders_dir, name)}.py"
         shutil.copyfile(join(self.templates_dir, template_file), spider_file)
         render_templatefile(spider_file, **template_dict)
-        print("Created file: {}".format(spider_file))
+        logger.info(f"Created file: {spider_file}")
 
     def _gen_tests(self, name, classname, start_url, fixture_file, template_file):
         """Creates tests from test template file"""
@@ -70,34 +71,34 @@ def _gen_tests(self, name, classname, start_url, fixture_file, template_file):
         }
         if "legistar" not in name:
             template_dict["start_url"] = start_url
-        test_file = join(self.tests_dir, "test_{}.py".format(name))
+        test_file = join(self.tests_dir, f"test_{name}.py")
         shutil.copyfile(join(self.templates_dir, template_file), test_file)
         render_templatefile(test_file, **template_dict)
-        print("Created file: {}".format(test_file))
+        logger.info(f"Created file: {test_file}")
 
     def _gen_fixtures(self, name, start_url):
         """Creates fixures from HTML response at the start URL"""
         res = requests.get(start_url, headers={"user-agent": USER_AGENT})
         content = res.text.strip()
-        fixture_file = join(self.fixtures_dir, "{}.html".format(name))
+        fixture_file = join(self.fixtures_dir, f"{name}.html")
         with open(fixture_file, "w", encoding="utf-8") as f:
             f.write(content)
-        print("Created file: {}".format(fixture_file))
-        return "{}.html".format(name)
+        logger.info(f"Created file: {fixture_file}")
+        return f"{name}.html"
 
     def _gen_legistar_fixtures(self, name, start_url):
         """Creates fixtures from a Legistar response"""
         events = []
         les = LegistarEventsScraper()
         les.BASE_URL = start_url
-        les.EVENTSPAGE = "{}/Calendar.aspx".format(start_url)
+        les.EVENTSPAGE = f"{start_url}/Calendar.aspx"
         for event, _ in les.events(since=datetime.today().year):
             events.append((dict(event), None))
-        fixture_file = join(self.fixtures_dir, "{}.json".format(name))
+        fixture_file = join(self.fixtures_dir, f"{name}.json")
         with open(fixture_file, "w", encoding="utf-8") as f:
             json.dump(events, f)
-        print("Created file: {}".format(fixture_file))
-        return "{}.json".format(name)
+        logger.info(f"Created file: {fixture_file}")
+        return f"{name}.json"
 
     @property
     def spiders_dir(self):

diff --git a/city_scrapers_core/commands/validate.py b/city_scrapers_core/commands/validate.py
index 8c70233..877c200 100644
--- a/city_scrapers_core/commands/validate.py
+++ b/city_scrapers_core/commands/validate.py
@@ -1,3 +1,4 @@
+import logging
 import os
 from importlib import import_module
 
@@ -6,6 +7,9 @@
 
 from ..pipelines import ValidationPipeline
 
+logger = logging.getLogger(__name__)
+
+
 class Command(ScrapyCommand):
     requires_project = True
 
@@ -29,7 +33,7 @@ def run(self, args, opts):
         spider_list = self.crawler_process.spider_loader.list()
         spiders = [spider for spider in args if spider in spider_list]
         if len(spiders) == 0 and not opts.all:
-            print("No spiders provided, exiting...")
+            logger.info("No spiders provided, exiting...")
             return
         elif opts.all:
             spiders = spider_list
@@ -44,7 +48,7 @@ def _add_validation_pipeline(self):
         # Exit if pipeline already included
         if any(pipeline_name in pipeline for pipeline in pipelines.keys()):
             return
-        fullname = "{}.{}".format(ValidationPipeline.__module__, pipeline_name)
+        fullname = f"{ValidationPipeline.__module__}.{pipeline_name}"
         priority = 1
         if len(pipelines.keys()) > 0:
             priority = max(pipelines.values()) + 1

diff --git a/city_scrapers_core/pipelines/diff.py b/city_scrapers_core/pipelines/diff.py
index cd95568..fca9b9e 100644
--- a/city_scrapers_core/pipelines/diff.py
+++ b/city_scrapers_core/pipelines/diff.py
@@ -99,7 +99,7 @@ def __init__(self, crawler, output_format):
         self.spider = crawler.spider
         self.container = feed_uri.split("@")[1].split("/")[0]
         self.container_client = ContainerClient(
-            "{}.blob.core.windows.net".format(account_name),
+            f"{account_name}.blob.core.windows.net",
             self.container,
             credential=account_key,
         )
@@ -119,9 +119,7 @@ def load_previous_results(self):
                 ).strftime(self.feed_prefix)
             )
             spider_blobs = [
-                blob
-                for blob in matching_blobs
-                if "{}.".format(self.spider.name) in blob.name
+                blob for blob in matching_blobs if f"{self.spider.name}." in blob.name
             ]
             if len(spider_blobs) > 0:
                 break
@@ -170,7 +168,7 @@ def load_previous_results(self):
             spider_objects = [
                 obj
                 for obj in match_objects.get("Contents", [])
-                if "{}.".format(self.spider.name) in obj["Key"]
+                if f"{self.spider.name}." in obj["Key"]
             ]
             if len(spider_objects) > 0:
                 break

diff --git a/city_scrapers_core/pipelines/validation.py b/city_scrapers_core/pipelines/validation.py
index dc69f28..476e701 100644
--- a/city_scrapers_core/pipelines/validation.py
+++ b/city_scrapers_core/pipelines/validation.py
@@ -1,8 +1,12 @@
+import logging
 from collections import defaultdict
 
 from jsonschema.validators import Draft7Validator
 
+logger = logging.getLogger(__name__)
+
+
 class ValidationPipeline:
     """
     Check against schema if present, prints % valid for each property.
@@ -43,17 +47,14 @@ def process_item(self, item, spider):
     def validation_report(self, spider):
         """Prints a validation report to stdout and raise an error if fails"""
         props = list(self.error_count.keys())
-        print(
-            "\n{line}Validation summary for: {spider}{line}".format(
-                line="-" * 12, spider=spider.name
-            )
-        )
-        print("Validating {} items\n".format(self.item_count))
+        line_str = "-" * 12
+        logger.info(f"\n{line_str}\nValidation summary for: {spider.name}\n{line_str}")
+        logger.info(f"Validating {self.item_count} items\n")
         valid_list = []
         for prop in props:
             valid = (self.item_count - self.error_count[prop]) / self.item_count
             valid_list.append(valid)
-            print("{}: {:.0%}".format(prop, valid))
+            logger.info("{}: {:.0%}".format(prop, valid))
         try:
             assert all([val >= 0.9 for val in valid_list])
         except AssertionError:
@@ -66,7 +67,7 @@ def validation_report(self, spider):
         if self.enforce_validation:
             raise ValueError(message)
         else:
-            print(message)
+            logger.info(message)
 
     def _get_props_from_errors(self, errors):
         error_props = []

diff --git a/city_scrapers_core/spiders/legistar.py b/city_scrapers_core/spiders/legistar.py
index 27bc0a6..aa7ed46 100644
--- a/city_scrapers_core/spiders/legistar.py
+++ b/city_scrapers_core/spiders/legistar.py
@@ -18,7 +18,7 @@ def parse(self, response):
     def _call_legistar(self, since=None):
         les = LegistarEventsScraper()
         les.BASE_URL = self.base_url
-        les.EVENTSPAGE = "{}/Calendar.aspx".format(self.base_url)
+        les.EVENTSPAGE = f"{self.base_url}/Calendar.aspx"
         if not since:
             since = datetime.today().year
         return les.events(since=since)
@@ -29,7 +29,7 @@ def legistar_start(self, item):
         if start_date and start_time:
             try:
                 return datetime.strptime(
-                    "{} {}".format(start_date, start_time), "%m/%d/%Y %I:%M %p"
+                    f"{start_date} {start_time}", "%m/%d/%Y %I:%M %p"
                 )
             except ValueError:
                 return datetime.strptime(start_date, "%m/%d/%Y")
@@ -42,7 +42,7 @@ def legistar_links(self, item):
         return links
 
     def legistar_source(self, item):
-        default_url = "{}/Calendar.aspx".format(self.base_url)
+        default_url = f"{self.base_url}/Calendar.aspx"
         if isinstance(item.get("Name"), dict):
             return item["Name"].get("url", default_url)
         if isinstance(item.get("Meeting Details"), dict):
@@ -52,4 +52,4 @@
     @property
     def base_url(self):
         parsed_url = urlparse(self.start_urls[0])
-        return "{}://{}".format(parsed_url.scheme, parsed_url.netloc)
+        return f"{parsed_url.scheme}://{parsed_url.netloc}"

diff --git a/setup.py b/setup.py
index de0ce89..0761c8a 100644
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,7 @@
     install_requires=["jsonschema>=3.0.0a5", "pytz", "requests", "scrapy"],
     tests_requires=["flake8", "pytest", "isort"],
     extras_require={"aws": ["boto3"], "azure": ["azure-storage-blob>=12"]},
-    python_requires=">=3.5,<4.0",
+    python_requires=">=3.6,<4.0",
     classifiers=[
         "Programming Language :: Python :: 3",
         "License :: OSI Approved :: MIT License",
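
For reference, the pattern these hunks converge on is a module-level logger plus f-string interpolation in place of print() and str.format(). A minimal standalone sketch of that pattern (the module and its create_file helper below are illustrative only, not files in this repository):

import logging

# Module-level logger, mirroring the loggers added to genspider.py, validate.py,
# and validation.py; output goes through whatever logging configuration the
# hosting process (e.g. Scrapy) has set up instead of straight to stdout.
logger = logging.getLogger(__name__)


def create_file(path):
    # f-string interpolation replaces "Created file: {}".format(path),
    # and logger.info() replaces print().
    logger.info(f"Created file: {path}")


if __name__ == "__main__":
    # A standalone script needs a handler; basicConfig() is enough for a demo.
    logging.basicConfig(level=logging.INFO)
    create_file("example.html")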