refactor: Simplify command line for scraping datasets #186

Merged · 4 commits · Jan 17, 2025
14 changes: 8 additions & 6 deletions app/Makefile
@@ -250,13 +250,13 @@ endif


scrape-ca-public-charge:
$(PY_RUN_CMD) scrape-ca-public-charge
$(PY_RUN_CMD) scrapy-runner ca_public_charge

ingest-ca-public-charge: check-ingest-arguments
$(PY_RUN_CMD) ingest-ca-public-charge "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)

scrape-edd-web:
$(PY_RUN_CMD) scrape-edd-web
$(PY_RUN_CMD) scrapy-runner edd

ingest-edd-web: check-ingest-arguments
$(PY_RUN_CMD) ingest-edd-web "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)
@@ -274,16 +274,18 @@ scrape-la-county-policy:
cd src/ingestion/la_policy/scrape; uv run --no-project scrape_la_policy_nav_bar.py

# Now that we have the expanded nav bar, scrape all the links in the nav bar
# Either should work:
# DEBUG_SCRAPINGS=true uv run --no-project scrape_la_policy.py &> out.log
$(PY_RUN_CMD) scrape-la-policy 2>&1 | tee out.log
$(PY_RUN_CMD) scrapy-runner la_policy 2>&1 | tee out.log

ingest-la-county-policy: check-ingest-arguments
$(PY_RUN_CMD) ingest-la-policy "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)


scrape-irs-web:
$(PY_RUN_CMD) scrape-irs-web
$(PY_RUN_CMD) scrapy-runner irs

ingest-irs-web: check-ingest-arguments
$(PY_RUN_CMD) ingest-irs-web "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)


scrape-ca-ftb:
$(PY_RUN_CMD) scrapy-runner ca_ftb
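
For orientation, every scrape target above now delegates to the same `scrapy-runner` console script (registered in `pyproject.toml` below and implemented in `app/src/ingestion/scrapy_runner.py`). A minimal sketch of what a target such as `scrape-ca-ftb` ends up invoking, assuming `src.ingestion.scrapy_runner` is importable from the app environment:

```python
# Rough equivalent of `$(PY_RUN_CMD) scrapy-runner ca_ftb` (sketch, not part of the PR).
# The spider id and output filename follow the defaults applied in scrapy_runner.main().
from src.ingestion import scrapy_runner

scrapy_runner.run("ca_ftb_spider", "ca_ftb_scrapings.json")
```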
4 changes: 2 additions & 2 deletions app/pyproject.toml
@@ -69,15 +69,15 @@ db-migrate = "src.db.migrations.run:up"
db-migrate-down = "src.db.migrations.run:down"
db-migrate-down-all = "src.db.migrations.run:downall"
ingest-ca-public-charge = "src.ingest_ca_public_charge:main"
scrape-ca-public-charge = "src.ingestion.scrape_ca_public_charge:main"
ingest-edd-web = "src.ingest_edd_web:main"
scrape-edd-web = "src.ingestion.scrape_edd_web:main"
ingest-imagine-la = "src.ingestion.imagine_la.ingest:main"
scrape-la-policy = "src.ingestion.scrape_la_policy:main"
ingest-la-policy = "src.ingest_la_county_policy:main"
scrape-irs-web = "src.ingestion.scrape_irs_web:main"
ingest-irs-web = "src.ingest_irs_web:main"

scrapy-runner = "src.ingestion.scrapy_runner:main"

[tool.black]
line-length = 100

16 changes: 0 additions & 16 deletions app/src/ingestion/scrape_ca_public_charge.py

This file was deleted.

49 changes: 0 additions & 49 deletions app/src/ingestion/scrape_edd_web.py

This file was deleted.

16 changes: 0 additions & 16 deletions app/src/ingestion/scrape_irs_web.py

This file was deleted.

31 changes: 0 additions & 31 deletions app/src/ingestion/scrape_la_policy.py

This file was deleted.

74 changes: 74 additions & 0 deletions app/src/ingestion/scrapy_dst/spiders/ca_ftb_spider.py
@@ -0,0 +1,74 @@
import re
from typing import Iterator, Optional

import html2text
import scrapy
from scrapy.http import HtmlResponse


class CaFranchiseTaxBoardSpider(scrapy.Spider):
    # This name is used on the command line: scrapy crawl ca_ftb_spider
    name = "ca_ftb_spider"
    allowed_domains = ["www.ftb.ca.gov"]
    start_urls = ["https://www.ftb.ca.gov/file/personal/credits/index.html"]

    # This is used to substitute the base URL in the cache storage
    common_url_prefix = "https://www.ftb.ca.gov/file/"

    def parse(self, response: HtmlResponse) -> Iterator[scrapy.Request | dict[str, str]]:
        self.logger.info("Parsing %s", response.url)

        nav_links = response.css("nav.local-nav a")
        for link in nav_links:
            if "class" in link.attrib and link.attrib["class"] == "uplevel":
                # Skip the uplevel/back link that goes to the parent page
                continue

            assert link.attrib["href"]
            self.logger.info("Found nav link: %s", link)
            yield response.follow(link, callback=self.parse_childpage)

        yield self.parse_childpage(response)

    def parse_childpage(self, response: HtmlResponse) -> dict[str, str]:
        self.logger.info("Parsing %s", response.url)

        if (h1_count := len(response.css("h1").getall())) > 1:
            self.logger.warning("Found %i h1 elements for %r", h1_count, response.url)
            raise ValueError("Multiple h1 elements found")

        title = to_markdown(response.css("h1").get().strip()).removeprefix("# ")
        assert title

        body = response.css("div#body-content")
        # Drop the navigation sidebar so that we only get the main content
        body.css("aside").drop()

        markdown = to_markdown(body.get(), response.url)
        extractions = {
            "url": response.url,
            "markdown": markdown,
        }
        return extractions


def to_markdown(html: str, base_url: Optional[str] = None) -> str:
    h2t = html2text.HTML2Text()

    # Refer to https://github.com/Alir3z4/html2text/blob/master/docs/usage.md and html2text.config
    # for options:
    # 0 for no wrapping
    h2t.body_width = 0
    h2t.wrap_links = False

    if base_url:
        h2t.baseurl = base_url

    # Exclude the <sup> and <sub> tags
    h2t.include_sup_sub = False

    markdown = h2t.handle(html)

    # Consolidate newlines
    markdown = re.sub(r"\n\n+", "\n\n", markdown)
    return markdown.strip()
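
As a quick illustration (not part of the diff) of what `to_markdown` produces, here is a self-contained sketch using the same html2text settings on a made-up HTML fragment; the fragment and the expected output are illustrative only:

```python
# Illustrative sketch: the same html2text configuration as to_markdown() above,
# applied to a hypothetical HTML fragment.
import re

import html2text

h2t = html2text.HTML2Text()
h2t.body_width = 0  # 0 disables hard line wrapping
h2t.wrap_links = False

html = '<h1>Credits</h1><p>See the <a href="index.html">credits index</a>.</p>'
markdown = re.sub(r"\n\n+", "\n\n", h2t.handle(html)).strip()
print(markdown)
# Roughly:
# # Credits
#
# See the [credits index](index.html).
```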
4 changes: 1 addition & 3 deletions app/src/ingestion/scrapy_dst/spiders/irs_spider.py
@@ -6,8 +6,6 @@
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders.crawl import CrawlSpider, Rule

AccordionSections = dict[str, list[str]]


class IrsSpider(CrawlSpider):
    # This name is used on the command line: scrapy crawl irs_web_spider
@@ -43,7 +41,7 @@ class IrsSpider(CrawlSpider):
        ),
    )

    def parse_page(self, response: HtmlResponse) -> dict[str, str | AccordionSections]:
    def parse_page(self, response: HtmlResponse) -> dict[str, str]:
        self.logger.info("Parsing %s", response.url)
        extractions = {"url": response.url}

25 changes: 25 additions & 0 deletions app/src/ingestion/scrapy_runner.py
@@ -1,6 +1,8 @@
import argparse
import json
import logging
import os
import sys
from pprint import pprint

from scrapy.crawler import CrawlerProcess
@@ -55,3 +57,26 @@ def run(spider_name: str, output_json_filename: str, debug: bool = False) -> None:
    run_spider(spider_name, output_json_filename)
    if debug:
        postprocess_json(output_json_filename)


DATASETS = {
    "edd": {},
    "la_policy": {},
    "irs": {
        "spider": "irs_web_spider",
    },
    "ca_public_charge": {},
    "ca_ftb": {},
}


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset")
    parser.add_argument("--debug", action="store_true")

A contributor commented on lines +74 to +76: "Help text generation can be useful in the future, as the complexity of the CLI app increases."

    args = parser.parse_args(sys.argv[1:])
    ds = DATASETS[args.dataset]
    spider_id = ds.get("spider", f"{args.dataset}_spider")
    json_output = ds.get("output", f"{spider_id.removesuffix('spider')}scrapings.json")
    run(spider_id, json_output, debug=args.debug)
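
To make the dispatch concrete, and to sketch the help text suggested in the review comment above, here is a standalone example; it assumes the `DATASETS` shape shown in the diff, and the help strings are hypothetical, not part of this PR:

```python
# Standalone sketch of how main() resolves a dataset name to a spider id and
# output filename, plus hypothetical argparse help text per the review comment.
import argparse

DATASETS = {
    "edd": {},
    "irs": {"spider": "irs_web_spider"},
}


def resolve(dataset: str) -> tuple[str, str]:
    ds = DATASETS[dataset]
    spider_id = ds.get("spider", f"{dataset}_spider")
    json_output = ds.get("output", f"{spider_id.removesuffix('spider')}scrapings.json")
    return spider_id, json_output


print(resolve("edd"))  # ('edd_spider', 'edd_scrapings.json')
print(resolve("irs"))  # ('irs_web_spider', 'irs_web_scrapings.json')

# Hypothetical help text (the PR itself does not add any):
parser = argparse.ArgumentParser(description="Scrape a named dataset with its Scrapy spider")
parser.add_argument("dataset", choices=sorted(DATASETS), help="dataset to scrape, e.g. 'edd'")
parser.add_argument("--debug", action="store_true", help="post-process and pretty-print the scraped JSON")
```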