Skip to content

Commit

Permalink
delint ca_ftb_spider.py
Browse files: browse the repository at this point in the history
  • Loading branch information
yoomlam committed Jan 17, 2025
1 parent 38a6d05 commit 54b66c5
Showing 1 changed file with 12 additions and 5 deletions.
17 changes: 12 additions & 5 deletions app/src/ingestion/scrapy_dst/spiders/ca_ftb_spider.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -17,7 +17,6 @@ class CaFranchiseTaxBoardSpider(scrapy.Spider):

def parse(self, response: HtmlResponse) -> dict[str, str]:
self.logger.info("Parsing %s", response.url)
extractions = {"url": response.url}

nav_links = response.css("nav.local-nav a")
for link in nav_links:
Expand All @@ -26,13 +25,21 @@ def parse(self, response: HtmlResponse) -> dict[str, str]:
continue

assert link.attrib["href"]
self.logger.info("Found nav link: %s", link.attrib["href"])
response.follow(link, callback=self.parse_childpage)
self.logger.info("Found nav link: %s", link)
yield response.follow(link, callback=self.parse_childpage)

return extractions
body = response.css("div#body-content")
# Drop the navigation sidebar so that we only get the main content
body.css("aside").drop()

markdown = to_markdown(body.get(), response.url)
extractions = {
"url": response.url,
"markdown": markdown,
}
yield extractions

def parse_childpage(self, response):
def parse_childpage(self, response) -> dict[str, str]:
self.logger.info("Parsing %s", response.url)
extractions = {"url": response.url}
return extractions
Expand Down

0 comments on commit 54b66c5

Please sign in to comment.