refactor: Simplify command line for scraping datasets #186

Merged · 4 commits · Jan 17, 2025

Changes from 1 commit
12 changes: 7 additions & 5 deletions app/Makefile
@@ -250,7 +250,7 @@ endif


 scrape-ca-public-charge:
-	$(PY_RUN_CMD) scrape-ca-public-charge
+	$(PY_RUN_CMD) scrapy-runner ca_public_charge

 ingest-ca-public-charge: check-ingest-arguments
 	$(PY_RUN_CMD) ingest-ca-public-charge "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)
@@ -274,16 +274,18 @@ scrape-la-county-policy:
 	cd src/ingestion/la_policy/scrape; uv run --no-project scrape_la_policy_nav_bar.py

 	# Now that we have the expanded nav bar, scrape all the links in the nav bar
 	# Either should work:
 	# DEBUG_SCRAPINGS=true uv run --no-project scrape_la_policy.py &> out.log
-	$(PY_RUN_CMD) scrape-la-policy 2>&1 | tee out.log
+	$(PY_RUN_CMD) scrapy-runner la_policy 2>&1 | tee out.log

 ingest-la-county-policy: check-ingest-arguments
 	$(PY_RUN_CMD) ingest-la-policy "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)


 scrape-irs-web:
-	$(PY_RUN_CMD) scrape-irs-web
+	$(PY_RUN_CMD) scrapy-runner irs

 ingest-irs-web: check-ingest-arguments
 	$(PY_RUN_CMD) ingest-irs-web "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)


+scrape-ca-ftb:
+	$(PY_RUN_CMD) scrapy-runner ca_ftb
4 changes: 2 additions & 2 deletions app/pyproject.toml
@@ -69,15 +69,15 @@ db-migrate = "src.db.migrations.run:up"
 db-migrate-down = "src.db.migrations.run:down"
 db-migrate-down-all = "src.db.migrations.run:downall"
 ingest-ca-public-charge = "src.ingest_ca_public_charge:main"
-scrape-ca-public-charge = "src.ingestion.scrape_ca_public_charge:main"
 ingest-edd-web = "src.ingest_edd_web:main"
-scrape-edd-web = "src.ingestion.scrape_edd_web:main"
 ingest-imagine-la = "src.ingestion.imagine_la.ingest:main"
-scrape-la-policy = "src.ingestion.scrape_la_policy:main"
 ingest-la-policy = "src.ingest_la_county_policy:main"
-scrape-irs-web = "src.ingestion.scrape_irs_web:main"
 ingest-irs-web = "src.ingest_irs_web:main"

+scrapy-runner = "src.ingestion.scrapy_runner:main"

 [tool.black]
 line-length = 100

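With the per-dataset scrape-* script entries gone, the single scrapy-runner entry point takes the dataset as a command-line argument. A rough Python equivalent of what the console script does when invoked as `scrapy-runner ca_ftb` (illustrative only; in the repo it is invoked through $(PY_RUN_CMD) in the Makefile):

import sys

from src.ingestion import scrapy_runner

# Simulate the command line `scrapy-runner ca_ftb`; main() reads sys.argv[1:]
sys.argv = ["scrapy-runner", "ca_ftb"]
scrapy_runner.main()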
16 changes: 0 additions & 16 deletions app/src/ingestion/scrape_ca_public_charge.py

This file was deleted.

49 changes: 0 additions & 49 deletions app/src/ingestion/scrape_edd_web.py

This file was deleted.

16 changes: 0 additions & 16 deletions app/src/ingestion/scrape_irs_web.py

This file was deleted.

31 changes: 0 additions & 31 deletions app/src/ingestion/scrape_la_policy.py

This file was deleted.

60 changes: 60 additions & 0 deletions app/src/ingestion/scrapy_dst/spiders/ca_ftb_spider.py
@@ -0,0 +1,60 @@
import re
from typing import Iterator, Optional

import html2text
import scrapy
from scrapy.http import HtmlResponse


class CaFranchiseTaxBoardSpider(scrapy.Spider):
    # This name is used on the command line: scrapy crawl ca_ftb_spider
    name = "ca_ftb_spider"
    allowed_domains = ["www.ftb.ca.gov"]
    start_urls = ["https://www.ftb.ca.gov/file/personal/credits/index.html"]

    # This is used to substitute the base URL in the cache storage
    common_url_prefix = "https://www.ftb.ca.gov/file/"

    def parse(self, response: HtmlResponse) -> Iterator[scrapy.Request | dict[str, str]]:
        self.logger.info("Parsing %s", response.url)
        extractions = {"url": response.url}

        nav_links = response.css("nav.local-nav a")
        for link in nav_links:
            if "class" in link.attrib and link.attrib["class"] == "uplevel":
                # Skip the uplevel/back link that goes to the parent page
                continue

            assert link.attrib["href"]
            self.logger.info("Found nav link: %s", link.attrib["href"])
            # response.follow() only builds the Request; it must be yielded
            # so that Scrapy actually schedules the crawl of the child page
            yield response.follow(link, callback=self.parse_childpage)

        yield extractions

    def parse_childpage(self, response):
        self.logger.info("Parsing %s", response.url)
        extractions = {"url": response.url}
        return extractions

GitHub Actions / Lint warning on line 35: Function is missing a type annotation [no-untyped-def]


def to_markdown(html: str, base_url: Optional[str] = None) -> str:
    h2t = html2text.HTML2Text()

    # Refer to https://github.com/Alir3z4/html2text/blob/master/docs/usage.md
    # and html2text.config for options
    h2t.body_width = 0  # 0 for no wrapping
    h2t.wrap_links = False

    if base_url:
        h2t.baseurl = base_url

    # Exclude the <sup> and <sub> tags
    h2t.include_sup_sub = False

    markdown = h2t.handle(html)

    # Consolidate newlines
    markdown = re.sub(r"\n\n+", "\n\n", markdown)
    return markdown.strip()
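For a sense of what to_markdown produces, a quick hypothetical example (not part of the PR; the exact output depends on the html2text version):

html = '<h1>Tax credits</h1><p>See <a href="/file/personal/credits/">all credits</a>.</p>'
print(to_markdown(html, base_url="https://www.ftb.ca.gov"))
# Prints roughly:
# # Tax credits
#
# See [all credits](https://www.ftb.ca.gov/file/personal/credits/).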
4 changes: 1 addition & 3 deletions app/src/ingestion/scrapy_dst/spiders/irs_spider.py
@@ -6,8 +6,6 @@
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders.crawl import CrawlSpider, Rule

-AccordionSections = dict[str, list[str]]
-

 class IrsSpider(CrawlSpider):
     # This name is used on the command line: scrapy crawl irs_web_spider
@@ -43,7 +41,7 @@ class IrsSpider(CrawlSpider):
         ),
     )

-    def parse_page(self, response: HtmlResponse) -> dict[str, str | AccordionSections]:
+    def parse_page(self, response: HtmlResponse) -> dict[str, str]:
         self.logger.info("Parsing %s", response.url)
         extractions = {"url": response.url}
25 changes: 25 additions & 0 deletions app/src/ingestion/scrapy_runner.py
@@ -1,6 +1,8 @@
 import argparse
 import json
 import logging
+import os
+import sys
 from pprint import pprint

 from scrapy.crawler import CrawlerProcess
@@ -55,3 +57,26 @@ def run(spider_name: str, output_json_filename: str, debug: bool = False) -> None:
     run_spider(spider_name, output_json_filename)
     if debug:
         postprocess_json(output_json_filename)
+
+
+DATASETS = {
+    "edd": {},
+    "la_policy": {},
+    "irs": {
+        "spider": "irs_web_spider",
+    },
+    "ca_public_charge": {},
+    "ca_ftb": {},
+}
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("dataset")
+    parser.add_argument("--debug", action="store_true")
+
+    args = parser.parse_args(sys.argv[1:])
+    ds = DATASETS[args.dataset]
+    spider_id = ds.get("spider", f"{args.dataset}_spider")
+    json_output = ds.get("output", f"{spider_id.removesuffix('spider')}scrapings.json")
+    run(spider_id, json_output, debug=args.debug)

Contributor comment on lines +74 to +76 (the argparse setup): "Help text generation can be useful in the future as the complexity of the CLI app increases."
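To make the defaulting in main() concrete, here is how the two ds.get(...) fallbacks resolve (an illustrative sketch mirroring the logic above; resolve is a hypothetical helper, not code from the PR):

def resolve(dataset: str) -> tuple[str, str]:
    # Mirrors main(): default the spider to "<dataset>_spider" and the output
    # file to the spider name with its "spider" suffix replaced by "scrapings.json"
    ds = DATASETS[dataset]
    spider_id = ds.get("spider", f"{dataset}_spider")
    json_output = ds.get("output", f"{spider_id.removesuffix('spider')}scrapings.json")
    return spider_id, json_output


assert resolve("ca_ftb") == ("ca_ftb_spider", "ca_ftb_scrapings.json")
assert resolve("irs") == ("irs_web_spider", "irs_web_scrapings.json")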