refactor: Simplify command line for scraping datasets (#186)

navapbc · Jan 17, 2025 · 151ceb4 · 151ceb4
1 parent 1181eef
commit 151ceb4
Show file tree

Hide file tree

Showing 9 changed files with 110 additions and 123 deletions.
diff --git a/app/Makefile b/app/Makefile
@@ -250,13 +250,13 @@ endif
 
 
 scrape-ca-public-charge:
-	$(PY_RUN_CMD) scrape-ca-public-charge
+	$(PY_RUN_CMD) scrapy-runner ca_public_charge
 
 ingest-ca-public-charge: check-ingest-arguments
 	$(PY_RUN_CMD) ingest-ca-public-charge "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)
 
 scrape-edd-web:
-	$(PY_RUN_CMD) scrape-edd-web
+	$(PY_RUN_CMD) scrapy-runner edd
 
 ingest-edd-web: check-ingest-arguments
 	$(PY_RUN_CMD) ingest-edd-web "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)
@@ -274,16 +274,18 @@ scrape-la-county-policy:
 	cd src/ingestion/la_policy/scrape; uv run --no-project scrape_la_policy_nav_bar.py
 
 	# Now that we have the expanded nav bar, scrape all the links in the nav bar
-	# Either should work:
-	# DEBUG_SCRAPINGS=true uv run --no-project scrape_la_policy.py &> out.log
-	$(PY_RUN_CMD) scrape-la-policy 2>&1 | tee out.log
+	$(PY_RUN_CMD) scrapy-runner la_policy 2>&1 | tee out.log
 
 ingest-la-county-policy: check-ingest-arguments
 	$(PY_RUN_CMD) ingest-la-policy "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)
 
 
 scrape-irs-web:
-	$(PY_RUN_CMD) scrape-irs-web
+	$(PY_RUN_CMD) scrapy-runner irs
 
 ingest-irs-web: check-ingest-arguments
 	$(PY_RUN_CMD) ingest-irs-web "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)
+
+
+scrape-ca-ftb:
+	$(PY_RUN_CMD) scrapy-runner ca_ftb
diff --git a/app/pyproject.toml b/app/pyproject.toml
@@ -69,15 +69,15 @@ db-migrate = "src.db.migrations.run:up"
 db-migrate-down = "src.db.migrations.run:down"
 db-migrate-down-all = "src.db.migrations.run:downall"
 ingest-ca-public-charge = "src.ingest_ca_public_charge:main"
-scrape-ca-public-charge = "src.ingestion.scrape_ca_public_charge:main"
 ingest-edd-web = "src.ingest_edd_web:main"
 scrape-edd-web = "src.ingestion.scrape_edd_web:main"
 ingest-imagine-la = "src.ingestion.imagine_la.ingest:main"
 scrape-la-policy = "src.ingestion.scrape_la_policy:main"
 ingest-la-policy = "src.ingest_la_county_policy:main"
-scrape-irs-web = "src.ingestion.scrape_irs_web:main"
 ingest-irs-web = "src.ingest_irs_web:main"
 
+scrapy-runner = "src.ingestion.scrapy_runner:main"
+
 [tool.black]
 line-length = 100
 

diff --git a/app/src/ingestion/scrape_ca_public_charge.py b/app/src/ingestion/scrape_ca_public_charge.py
diff --git a/app/src/ingestion/scrape_edd_web.py b/app/src/ingestion/scrape_edd_web.py
diff --git a/app/src/ingestion/scrape_irs_web.py b/app/src/ingestion/scrape_irs_web.py
diff --git a/app/src/ingestion/scrape_la_policy.py b/app/src/ingestion/scrape_la_policy.py
diff --git a/app/src/ingestion/scrapy_dst/spiders/ca_ftb_spider.py b/app/src/ingestion/scrapy_dst/spiders/ca_ftb_spider.py
@@ -0,0 +1,74 @@
+import re
+from typing import Iterator, Optional
+
+import html2text
+import scrapy
+from scrapy.http import HtmlResponse
+
+
+class CaFranchiseTaxBoardSpider(scrapy.Spider):
+    # This name is used on the commandline: scrapy crawl edd_spider
+    name = "ca_ftb_spider"
+    allowed_domains = ["www.ftb.ca.gov"]
+    start_urls = ["https://www.ftb.ca.gov/file/personal/credits/index.html"]
+
+    # This is used to substitute the base URL in the cache storage
+    common_url_prefix = "https://www.ftb.ca.gov/file/"
+
+    def parse(self, response: HtmlResponse) -> Iterator[scrapy.Request | dict[str, str]]:
+        self.logger.info("Parsing %s", response.url)
+
+        nav_links = response.css("nav.local-nav a")
+        for link in nav_links:
+            if "class" in link.attrib and link.attrib["class"] == "uplevel":
+                # Skip the uplevel/back link that goes to the parent page
+                continue
+
+            assert link.attrib["href"]
+            self.logger.info("Found nav link: %s", link)
+            yield response.follow(link, callback=self.parse_childpage)
+
+        yield self.parse_childpage(response)
+
+    def parse_childpage(self, response: HtmlResponse) -> dict[str, str]:
+        self.logger.info("Parsing %s", response.url)
+
+        if (h1_count := len(response.css("h1").getall())) > 1:
+            self.logger.warning("Found %i h1 elements for %r", h1_count, response.url)
+            raise ValueError("Multiple h1 elements found")
+
+        title = to_markdown(response.css("h1").get().strip()).removeprefix("# ")
+        assert title
+
+        body = response.css("div#body-content")
+        # Drop the navigation sidebar so that we only get the main content
+        body.css("aside").drop()
+
+        markdown = to_markdown(body.get(), response.url)
+        extractions = {
+            "url": response.url,
+            "markdown": markdown,
+        }
+        return extractions
+
+
+def to_markdown(html: str, base_url: Optional[str] = None) -> str:
+    h2t = html2text.HTML2Text()
+
+    # Refer to https://github.com/Alir3z4/html2text/blob/master/docs/usage.md and html2text.config
+    # for options:
+    # 0 for no wrapping
+    h2t.body_width = 0
+    h2t.wrap_links = False
+
+    if base_url:
+        h2t.baseurl = base_url
+
+    # Exclude the <sup> and <sub> tags
+    h2t.include_sup_sub = False
+
+    markdown = h2t.handle(html)
+
+    # Consolidate newlines
+    markdown = re.sub(r"\n\n+", "\n\n", markdown)
+    return markdown.strip()
diff --git a/app/src/ingestion/scrapy_dst/spiders/irs_spider.py b/app/src/ingestion/scrapy_dst/spiders/irs_spider.py
@@ -6,8 +6,6 @@
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders.crawl import CrawlSpider, Rule
 
-AccordionSections = dict[str, list[str]]
-
 
 class IrsSpider(CrawlSpider):
     # This name is used on the commandline: scrapy crawl irs_web_spider
@@ -43,7 +41,7 @@ class IrsSpider(CrawlSpider):
         ),
     )
 
-    def parse_page(self, response: HtmlResponse) -> dict[str, str | AccordionSections]:
+    def parse_page(self, response: HtmlResponse) -> dict[str, str]:
         self.logger.info("Parsing %s", response.url)
         extractions = {"url": response.url}
 

diff --git a/app/src/ingestion/scrapy_runner.py b/app/src/ingestion/scrapy_runner.py
@@ -1,6 +1,8 @@
+import argparse
 import json
 import logging
 import os
+import sys
 from pprint import pprint
 
 from scrapy.crawler import CrawlerProcess
@@ -55,3 +57,26 @@ def run(spider_name: str, output_json_filename: str, debug: bool = False) -> Non
     run_spider(spider_name, output_json_filename)
     if debug:
         postprocess_json(output_json_filename)
+
+
+DATASETS = {
+    "edd": {},
+    "la_policy": {},
+    "irs": {
+        "spider": "irs_web_spider",
+    },
+    "ca_public_charge": {},
+    "ca_ftb": {},
+}
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("dataset")
+    parser.add_argument("--debug", action="store_true")
+
+    args = parser.parse_args(sys.argv[1:])
+    ds = DATASETS[args.dataset]
+    spider_id = ds.get("spider", f"{args.dataset}_spider")
+    json_output = ds.get("output", f"{spider_id.removesuffix("spider")}scrapings.json")
+    run(spider_id, json_output, debug=args.debug)