diff --git a/app/Makefile b/app/Makefile
index 9ccbda19..8704d5cd 100644
--- a/app/Makefile
+++ b/app/Makefile
@@ -250,13 +250,13 @@ endif

 scrape-ca-public-charge:
-	$(PY_RUN_CMD) scrape-ca-public-charge
+	$(PY_RUN_CMD) scrapy-runner ca_public_charge

 ingest-ca-public-charge: check-ingest-arguments
 	$(PY_RUN_CMD) ingest-ca-public-charge "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)

 scrape-edd-web:
-	$(PY_RUN_CMD) scrape-edd-web
+	$(PY_RUN_CMD) scrapy-runner edd

 ingest-edd-web: check-ingest-arguments
 	$(PY_RUN_CMD) ingest-edd-web "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)

@@ -274,16 +274,18 @@ scrape-la-county-policy:
 	cd src/ingestion/la_policy/scrape; uv run --no-project scrape_la_policy_nav_bar.py

 	# Now that we have the expanded nav bar, scrape all the links in the nav bar
-	# Either should work:
-	# DEBUG_SCRAPINGS=true uv run --no-project scrape_la_policy.py &> out.log
-	$(PY_RUN_CMD) scrape-la-policy 2>&1 | tee out.log
+	$(PY_RUN_CMD) scrapy-runner la_policy 2>&1 | tee out.log

 ingest-la-county-policy: check-ingest-arguments
 	$(PY_RUN_CMD) ingest-la-policy "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)

 scrape-irs-web:
-	$(PY_RUN_CMD) scrape-irs-web
+	$(PY_RUN_CMD) scrapy-runner irs

 ingest-irs-web: check-ingest-arguments
 	$(PY_RUN_CMD) ingest-irs-web "$(DATASET_ID)" "$(BENEFIT_PROGRAM)" "$(BENEFIT_REGION)" "$(FILEPATH)" $(INGEST_ARGS)

+
+
+scrape-ca-ftb:
+	$(PY_RUN_CMD) scrapy-runner ca_ftb
diff --git a/app/pyproject.toml b/app/pyproject.toml
index 55d151c3..e9223337 100644
--- a/app/pyproject.toml
+++ b/app/pyproject.toml
@@ -69,15 +69,15 @@
 db-migrate = "src.db.migrations.run:up"
 db-migrate-down = "src.db.migrations.run:down"
 db-migrate-down-all = "src.db.migrations.run:downall"
 ingest-ca-public-charge = "src.ingest_ca_public_charge:main"
-scrape-ca-public-charge = "src.ingestion.scrape_ca_public_charge:main"
 ingest-edd-web = "src.ingest_edd_web:main"
 scrape-edd-web = "src.ingestion.scrape_edd_web:main"
 ingest-imagine-la = "src.ingestion.imagine_la.ingest:main"
 scrape-la-policy = "src.ingestion.scrape_la_policy:main"
 ingest-la-policy = "src.ingest_la_county_policy:main"
-scrape-irs-web = "src.ingestion.scrape_irs_web:main"
 ingest-irs-web = "src.ingest_irs_web:main"
+scrapy-runner = "src.ingestion.scrapy_runner:main"
+

 [tool.black]
 line-length = 100
diff --git a/app/src/ingestion/scrape_ca_public_charge.py b/app/src/ingestion/scrape_ca_public_charge.py
deleted file mode 100644
index 9bd8918d..00000000
--- a/app/src/ingestion/scrape_ca_public_charge.py
+++ /dev/null
@@ -1,16 +0,0 @@
-SPIDER_NAME = "ca_public_charge_spider"
-OUTPUT_JSON = "ca_public_charge_scrapings.json"
-
-
-def main() -> None:
-    import os
-
-    from .scrapy_runner import run
-
-    run(SPIDER_NAME, OUTPUT_JSON, debug=bool(os.environ.get("DEBUG_SCRAPINGS", False)))
-
-
-if __name__ == "__main__":
-    from scrapy_runner import run
-
-    run(SPIDER_NAME, OUTPUT_JSON, debug=True)
diff --git a/app/src/ingestion/scrape_edd_web.py b/app/src/ingestion/scrape_edd_web.py
deleted file mode 100644
index 13799757..00000000
--- a/app/src/ingestion/scrape_edd_web.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import json
-
-
-def save_user_friendly_markdown(filename: str) -> None:
-    with open(filename, "r", encoding="utf-8") as raw_json:
-        data = json.load(raw_json)
-    with open(f"{filename}.md", "w", encoding="utf-8") as md_file:
-        for item in data:
-            item_md = ["\n\n=============================="]
-            item_md.append(f"{item['title']}, {item['url']}")
-            if "main_content" in item:
-                item_md.append("\n------- @MAIN_CONTENT:\n")
-                item_md.append(item["main_content"])
-            if "main_primary" in item:
-                item_md.append("\n------- @MAIN_PRIMARY:\n")
-                item_md.append(item["main_primary"])
-            if "nonaccordion" in item:
-                item_md.append("\n------- @NONACCORDION:")
-                item_md.append(item["nonaccordion"])
-            if "accordions" in item:
-                item_md.append("\n------- @ACCORDIONS:")
-                for heading, paras in item["accordions"].items():
-                    item_md.append(f"\n---- ## {heading}:\n")
-                    for para in paras:
-                        item_md.append(para)
-            md_file.write("\n".join(item_md))
-    print("User-friendly markdown of JSON saved to %s.md", filename)
-
-
-OUTPUT_JSON = "edd_scrapings.json"
-SPIDER_NAME = "edd_spider"
-
-
-def main() -> None:
-    import os
-
-    from .scrapy_runner import run
-
-    debug = bool(os.environ.get("DEBUG_SCRAPINGS", False))
-    run(SPIDER_NAME, OUTPUT_JSON, debug)
-
-    if debug:
-        save_user_friendly_markdown(OUTPUT_JSON)
-
-
-if __name__ == "__main__":
-    from scrapy_runner import run
-
-    run(SPIDER_NAME, OUTPUT_JSON, debug=True)
diff --git a/app/src/ingestion/scrape_irs_web.py b/app/src/ingestion/scrape_irs_web.py
deleted file mode 100644
index e51ccebd..00000000
--- a/app/src/ingestion/scrape_irs_web.py
+++ /dev/null
@@ -1,16 +0,0 @@
-SPIDER_NAME = "irs_web_spider"
-OUTPUT_JSON = "irs_web_scrapings.json"
-
-
-def main() -> None:
-    import os
-
-    from .scrapy_runner import run
-
-    run(SPIDER_NAME, OUTPUT_JSON, debug=bool(os.environ.get("DEBUG_SCRAPINGS", False)))
-
-
-if __name__ == "__main__":
-    from scrapy_runner import run
-
-    run(SPIDER_NAME, OUTPUT_JSON, debug=True)
diff --git a/app/src/ingestion/scrape_la_policy.py b/app/src/ingestion/scrape_la_policy.py
deleted file mode 100644
index 4cd6f808..00000000
--- a/app/src/ingestion/scrape_la_policy.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# /// script
-# dependencies = [
-#   "install-playwright",
-#   "playwright",
-#   "scrapy",
-#   "markdownify",
-#   "nltk",
-#   "langchain_text_splitters",
-#   "html2text",
-#   "mistletoe",
-#   "nutree",
-# ]
-# ///
-# (This comment enables `uv run` to automatically create a virtual environment)
-
-SPIDER_NAME = "la_policy_spider"
-OUTPUT_JSON = "la_policy_scrapings.json"
-
-
-def main() -> None:
-    import os
-
-    from .scrapy_runner import run
-
-    run(SPIDER_NAME, OUTPUT_JSON, debug=bool(os.environ.get("DEBUG_SCRAPINGS", False)))
-
-
-if __name__ == "__main__":
-    from scrapy_runner import run
-
-    run(SPIDER_NAME, OUTPUT_JSON, debug=True)
diff --git a/app/src/ingestion/scrapy_dst/spiders/ca_ftb_spider.py b/app/src/ingestion/scrapy_dst/spiders/ca_ftb_spider.py
new file mode 100644
index 00000000..27ec6bf7
--- /dev/null
+++ b/app/src/ingestion/scrapy_dst/spiders/ca_ftb_spider.py
@@ -0,0 +1,74 @@
+import re
+from typing import Iterator, Optional
+
+import html2text
+import scrapy
+from scrapy.http import HtmlResponse
+
+
+class CaFranchiseTaxBoardSpider(scrapy.Spider):
+    # This name is used on the commandline: scrapy crawl ca_ftb_spider
+    name = "ca_ftb_spider"
+    allowed_domains = ["www.ftb.ca.gov"]
+    start_urls = ["https://www.ftb.ca.gov/file/personal/credits/index.html"]
+
+    # This is used to substitute the base URL in the cache storage
+    common_url_prefix = "https://www.ftb.ca.gov/file/"
+
+    def parse(self, response: HtmlResponse) -> Iterator[scrapy.Request | dict[str, str]]:
+        self.logger.info("Parsing %s", response.url)
+
+        nav_links = response.css("nav.local-nav a")
+        for link in nav_links:
+            if "class" in link.attrib and link.attrib["class"] == "uplevel":
+                # Skip the uplevel/back link that goes to the parent page
+                continue
+
+            assert link.attrib["href"]
+            self.logger.info("Found nav link: %s", link)
+            yield response.follow(link, callback=self.parse_childpage)
+
+        yield self.parse_childpage(response)
+
+    def parse_childpage(self, response: HtmlResponse) -> dict[str, str]:
+        self.logger.info("Parsing %s", response.url)
+
+        if (h1_count := len(response.css("h1").getall())) > 1:
+            self.logger.warning("Found %i h1 elements for %r", h1_count, response.url)
+            raise ValueError("Multiple h1 elements found")
+
+        title = to_markdown(response.css("h1").get().strip()).removeprefix("# ")
+        assert title
+
+        body = response.css("div#body-content")
+        # Drop the navigation sidebar so that we only get the main content
+        body.css("aside").drop()
+
+        markdown = to_markdown(body.get(), response.url)
+        extractions = {
+            "url": response.url,
+            "markdown": markdown,
+        }
+        return extractions
+
+
+def to_markdown(html: str, base_url: Optional[str] = None) -> str:
+    h2t = html2text.HTML2Text()
+
+    # Refer to https://github.com/Alir3z4/html2text/blob/master/docs/usage.md and html2text.config
+    # for options:
+    # 0 for no wrapping
+    h2t.body_width = 0
+    h2t.wrap_links = False
+
+    if base_url:
+        h2t.baseurl = base_url
+
+    # Exclude the <sup> and <sub> tags
+    h2t.include_sup_sub = False
+
+    markdown = h2t.handle(html)
+
+    # Consolidate newlines
+    markdown = re.sub(r"\n\n+", "\n\n", markdown)
+    return markdown.strip()
diff --git a/app/src/ingestion/scrapy_dst/spiders/irs_spider.py b/app/src/ingestion/scrapy_dst/spiders/irs_spider.py
index 6fa08bdc..28e89d0b 100644
--- a/app/src/ingestion/scrapy_dst/spiders/irs_spider.py
+++ b/app/src/ingestion/scrapy_dst/spiders/irs_spider.py
@@ -6,8 +6,6 @@
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders.crawl import CrawlSpider, Rule

-AccordionSections = dict[str, list[str]]
-

 class IrsSpider(CrawlSpider):
     # This name is used on the commandline: scrapy crawl edd_spider
@@ -43,7 +41,7 @@ class IrsSpider(CrawlSpider):
         ),
     )

-    def parse_page(self, response: HtmlResponse) -> dict[str, str | AccordionSections]:
+    def parse_page(self, response: HtmlResponse) -> dict[str, str]:
         self.logger.info("Parsing %s", response.url)

         extractions = {"url": response.url}
diff --git a/app/src/ingestion/scrapy_runner.py b/app/src/ingestion/scrapy_runner.py
index 419faa6c..7b297a29 100644
--- a/app/src/ingestion/scrapy_runner.py
+++ b/app/src/ingestion/scrapy_runner.py
@@ -1,6 +1,8 @@
+import argparse
 import json
 import logging
 import os
+import sys
 from pprint import pprint

 from scrapy.crawler import CrawlerProcess
@@ -55,3 +57,26 @@ def run(spider_name: str, output_json_filename: str, debug: bool = False) -> Non
     run_spider(spider_name, output_json_filename)
     if debug:
         postprocess_json(output_json_filename)
+
+
+DATASETS = {
+    "edd": {},
+    "la_policy": {},
+    "irs": {
+        "spider": "irs_web_spider",
+    },
+    "ca_public_charge": {},
+    "ca_ftb": {},
+}
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("dataset")
+    parser.add_argument("--debug", action="store_true")
+
+    args = parser.parse_args(sys.argv[1:])
+    ds = DATASETS[args.dataset]
+    spider_id = ds.get("spider", f"{args.dataset}_spider")
+    json_output = ds.get("output", f"{spider_id.removesuffix('spider')}scrapings.json")
+    run(spider_id, json_output, debug=args.debug)
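
As a quick illustration (a sketch, not part of the patch): with the consolidated runner above, a dataset name is resolved to a spider and an output file using the DATASETS defaults; the `resolve` helper below is hypothetical and only mirrors the logic in `scrapy_runner.main()`.

    # Sketch: mirrors the spider/output resolution in scrapy_runner.main()
    def resolve(dataset: str) -> tuple[str, str]:
        ds = DATASETS[dataset]
        spider_id = ds.get("spider", f"{dataset}_spider")
        json_output = ds.get("output", f"{spider_id.removesuffix('spider')}scrapings.json")
        return spider_id, json_output

    # e.g. resolve("irs")    == ("irs_web_spider", "irs_web_scrapings.json")
    #      resolve("ca_ftb") == ("ca_ftb_spider", "ca_ftb_scrapings.json")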