refactor: Simplify command line for scraping datasets (#186)
yoomlam authored Jan 17, 2025
1 parent 1181eef commit 151ceb4
Showing 9 changed files with 110 additions and 123 deletions.
14 changes: 8 additions & 6 deletions app/Makefile
@@ -250,13 +250,13 @@ endif


scrape-ca-public-charge:
-$(PY_RUN_CMD) scrape-ca-public-charge
+$(PY_RUN_CMD) scrapy-runner ca_public_charge

ingest-ca-public-charge: check-ingest-arguments
$(PY_RUN_CMD) ingest-ca-public-charge "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)

scrape-edd-web:
-$(PY_RUN_CMD) scrape-edd-web
+$(PY_RUN_CMD) scrapy-runner edd

ingest-edd-web: check-ingest-arguments
$(PY_RUN_CMD) ingest-edd-web "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)
@@ -274,16 +274,18 @@ scrape-la-county-policy:
cd src/ingestion/la_policy/scrape; uv run --no-project scrape_la_policy_nav_bar.py

# Now that we have the expanded nav bar, scrape all the links in the nav bar
# Either should work:
# DEBUG_SCRAPINGS=true uv run --no-project scrape_la_policy.py &> out.log
-$(PY_RUN_CMD) scrape-la-policy 2>&1 | tee out.log
+$(PY_RUN_CMD) scrapy-runner la_policy 2>&1 | tee out.log

ingest-la-county-policy: check-ingest-arguments
$(PY_RUN_CMD) ingest-la-policy "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)


scrape-irs-web:
-$(PY_RUN_CMD) scrape-irs-web
+$(PY_RUN_CMD) scrapy-runner irs

ingest-irs-web: check-ingest-arguments
$(PY_RUN_CMD) ingest-irs-web "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)


+scrape-ca-ftb:
+$(PY_RUN_CMD) scrapy-runner ca_ftb
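
Note: every scrape-* target now delegates to the single scrapy-runner entry point with a dataset name instead of calling a per-dataset script. Assuming PY_RUN_CMD wraps the project's usual Python runner, usage is e.g.:

    make scrape-edd-web    # expands to: $(PY_RUN_CMD) scrapy-runner edd
    make scrape-ca-ftb     # expands to: $(PY_RUN_CMD) scrapy-runner ca_ftb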
4 changes: 2 additions & 2 deletions app/pyproject.toml
@@ -69,15 +69,15 @@ db-migrate = "src.db.migrations.run:up"
db-migrate-down = "src.db.migrations.run:down"
db-migrate-down-all = "src.db.migrations.run:downall"
ingest-ca-public-charge = "src.ingest_ca_public_charge:main"
scrape-ca-public-charge = "src.ingestion.scrape_ca_public_charge:main"
ingest-edd-web = "src.ingest_edd_web:main"
scrape-edd-web = "src.ingestion.scrape_edd_web:main"
ingest-imagine-la = "src.ingestion.imagine_la.ingest:main"
scrape-la-policy = "src.ingestion.scrape_la_policy:main"
ingest-la-policy = "src.ingest_la_county_policy:main"
scrape-irs-web = "src.ingestion.scrape_irs_web:main"
ingest-irs-web = "src.ingest_irs_web:main"

scrapy-runner = "src.ingestion.scrapy_runner:main"

[tool.black]
line-length = 100

16 changes: 0 additions & 16 deletions app/src/ingestion/scrape_ca_public_charge.py

This file was deleted.

49 changes: 0 additions & 49 deletions app/src/ingestion/scrape_edd_web.py

This file was deleted.

16 changes: 0 additions & 16 deletions app/src/ingestion/scrape_irs_web.py

This file was deleted.

31 changes: 0 additions & 31 deletions app/src/ingestion/scrape_la_policy.py

This file was deleted.

74 changes: 74 additions & 0 deletions app/src/ingestion/scrapy_dst/spiders/ca_ftb_spider.py
@@ -0,0 +1,74 @@
import re
from typing import Iterator, Optional

import html2text
import scrapy
from scrapy.http import HtmlResponse


class CaFranchiseTaxBoardSpider(scrapy.Spider):
# This name is used on the command line: scrapy crawl ca_ftb_spider
name = "ca_ftb_spider"
allowed_domains = ["www.ftb.ca.gov"]
start_urls = ["https://www.ftb.ca.gov/file/personal/credits/index.html"]

# This is used to substitute the base URL in the cache storage
common_url_prefix = "https://www.ftb.ca.gov/file/"

def parse(self, response: HtmlResponse) -> Iterator[scrapy.Request | dict[str, str]]:
self.logger.info("Parsing %s", response.url)

nav_links = response.css("nav.local-nav a")
for link in nav_links:
if "class" in link.attrib and link.attrib["class"] == "uplevel":
# Skip the uplevel/back link that goes to the parent page
continue

assert link.attrib["href"]
self.logger.info("Found nav link: %s", link)
yield response.follow(link, callback=self.parse_childpage)

yield self.parse_childpage(response)

def parse_childpage(self, response: HtmlResponse) -> dict[str, str]:
self.logger.info("Parsing %s", response.url)

if (h1_count := len(response.css("h1").getall())) > 1:
self.logger.warning("Found %i h1 elements for %r", h1_count, response.url)
raise ValueError("Multiple h1 elements found")

title = to_markdown(response.css("h1").get().strip()).removeprefix("# ")
assert title

body = response.css("div#body-content")
# Drop the navigation sidebar so that we only get the main content
body.css("aside").drop()

markdown = to_markdown(body.get(), response.url)
extractions = {
"url": response.url,
"markdown": markdown,
}
return extractions


def to_markdown(html: str, base_url: Optional[str] = None) -> str:
h2t = html2text.HTML2Text()

# Refer to https://github.com/Alir3z4/html2text/blob/master/docs/usage.md and html2text.config
# for options:
# 0 for no wrapping
h2t.body_width = 0
h2t.wrap_links = False

if base_url:
h2t.baseurl = base_url

# Exclude the <sup> and <sub> tags
h2t.include_sup_sub = False

markdown = h2t.handle(html)

# Consolidate newlines
markdown = re.sub(r"\n\n+", "\n\n", markdown)
return markdown.strip()
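
As a rough sketch of what to_markdown returns (hypothetical input; html2text resolves relative links against the configured baseurl):

    >>> to_markdown('<h1>Credits</h1><p>See <a href="overview.html">overview</a></p>', "https://www.ftb.ca.gov/file/")
    '# Credits\n\nSee [overview](https://www.ftb.ca.gov/file/overview.html)'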
4 changes: 1 addition & 3 deletions app/src/ingestion/scrapy_dst/spiders/irs_spider.py
@@ -6,8 +6,6 @@
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders.crawl import CrawlSpider, Rule

-AccordionSections = dict[str, list[str]]


class IrsSpider(CrawlSpider):
# This name is used on the command line: scrapy crawl irs_web_spider
@@ -43,7 +41,7 @@ class IrsSpider(CrawlSpider):
),
)

-def parse_page(self, response: HtmlResponse) -> dict[str, str | AccordionSections]:
+def parse_page(self, response: HtmlResponse) -> dict[str, str]:
self.logger.info("Parsing %s", response.url)
extractions = {"url": response.url}

25 changes: 25 additions & 0 deletions app/src/ingestion/scrapy_runner.py
@@ -1,6 +1,8 @@
import argparse
import json
import logging
+import os
+import sys
from pprint import pprint

from scrapy.crawler import CrawlerProcess
@@ -55,3 +57,26 @@ def run(spider_name: str, output_json_filename: str, debug: bool = False) -> None:
run_spider(spider_name, output_json_filename)
if debug:
postprocess_json(output_json_filename)


+DATASETS = {
+    "edd": {},
+    "la_policy": {},
+    "irs": {
+        "spider": "irs_web_spider",
+    },
+    "ca_public_charge": {},
+    "ca_ftb": {},
+}
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("dataset")
+    parser.add_argument("--debug", action="store_true")
+
+    args = parser.parse_args(sys.argv[1:])
+    ds = DATASETS[args.dataset]
+    spider_id = ds.get("spider", f"{args.dataset}_spider")
+    json_output = ds.get("output", f"{spider_id.removesuffix('spider')}scrapings.json")
+    run(spider_id, json_output, debug=args.debug)
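
Tracing the defaults above: "edd" has no overrides, so spider_id falls back to "edd_spider" and the output file to "edd_scrapings.json"; "irs" overrides only the spider name, giving "irs_web_spider" and "irs_web_scrapings.json". The simplified command line is then, e.g.:

    scrapy-runner irs --debug    # crawls irs_web_spider, writes irs_web_scrapings.json, then post-processes it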
