fix: Update Imagine LA Content Hub scraper and ingester (#198)
yoomlam authored Jan 28, 2025
1 parent 8097e72 commit 2506041
Showing 4 changed files with 21 additions and 7 deletions.
14 changes: 10 additions & 4 deletions app/src/ingestion/imagine_la/ingest.py
@@ -21,7 +21,6 @@
save_json,
tokenize,
)
- from src.util.string_utils import remove_links

logger = logging.getLogger(__name__)

@@ -38,6 +37,7 @@ def _parse_html(
md_base_dir: str, common_base_url: str, file_path: str, doc_attribs: dict[str, str]
) -> tuple[Document, Sequence[Chunk], Sequence[str]]:

logger.info("Reading %r", file_path)
with open(file_path, "r") as file:
file_contents = file.read()
soup = BeautifulSoup(file_contents, "html.parser")
@@ -75,10 +75,12 @@ def _parse_html(
tree = create_markdown_tree(content, doc_name=document.name, doc_source=document.source)
tree_chunks = chunk_tree(tree, ImagineLaChunkingConfig())
chunks = [
- Chunk(content=chunk.markdown, document=document, headings=chunk.headings)
+ Chunk(
+     content=chunk.markdown, document=document, headings=chunk.headings, tokens=chunk.length
+ )
for chunk in tree_chunks
]
- chunk_texts_to_encode = [remove_links(chunk.markdown) for chunk in tree_chunks]
+ chunk_texts_to_encode = [t_chunk.embedding_str for t_chunk in tree_chunks]

chunks_file_path = f"{file_path}.chunks.json"
logger.info(" Saving chunks to %r", chunks_file_path)
@@ -119,17 +121,21 @@ def _ingest_content_hub(
logger.info("Skip saving to DB")
else:
for document, chunks, chunk_texts_to_encode in all_chunks:
logger.info("Adding embeddings for %r", document.source)
add_embeddings(chunks, chunk_texts_to_encode)
db_session.add(document)
db_session.add_all(chunks)


def main() -> None:
+ # Print INFO messages since this is often run from the terminal during local development
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

default_config = IngestConfig(
"Imagine LA",
"mixed",
"California",
"https://socialbenefitsnavigator25.web.app/contenthub/",
"imagine_la_md",
"imagine_la",
)
process_and_ingest_sys_args(sys.argv, logger, _ingest_content_hub, default_config)
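Taken together, the ingest.py changes do two things: each stored Chunk now carries the token count that tree chunking already computed (chunk.length), and the text sent to the embedding model is the chunk's prepared embedding_str rather than remove_links(chunk.markdown). A minimal sketch of the resulting pattern, with a hypothetical TreeChunk standing in for the real chunk_tree() output:

```python
from dataclasses import dataclass


@dataclass
class TreeChunk:
    """Hypothetical stand-in for the objects chunk_tree() yields."""

    markdown: str        # chunk text with links intact, kept for storage
    headings: list[str]  # heading path leading to this chunk
    length: int          # token count computed once during chunking
    embedding_str: str   # normalized text prepared for the embedding model


def to_chunks_and_texts(tree_chunks: list[TreeChunk]) -> tuple[list[dict], list[str]]:
    # Store the full markdown and carry the precomputed token count along;
    # embed the cleaned embedding_str rather than re-stripping links here.
    chunks = [
        {"content": c.markdown, "headings": c.headings, "tokens": c.length}
        for c in tree_chunks
    ]
    texts_to_encode = [c.embedding_str for c in tree_chunks]
    return chunks, texts_to_encode
```

Recording the count at chunking time lets add_embeddings (changed below in ingest_utils.py) verify the stored value against the embedding tokenizer instead of recomputing it from scratch.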
1 change: 1 addition & 0 deletions app/src/ingestion/imagine_la/scrape/.gitignore
@@ -0,0 +1 @@
+ /pages*/
6 changes: 6 additions & 0 deletions app/src/ingestion/imagine_la/scrape/scrape_content_hub.py
@@ -48,6 +48,8 @@
quit()
password_field.fill(password)
password_field.press("Enter")
+ page.wait_for_load_state("networkidle")
+ print("Logged in")

# Wait for the page to load by ensuring an element (e.g., an <h2> tag) is present
page.wait_for_selector("h2", timeout=10_000)
@@ -63,16 +65,20 @@
accordions = page.locator(".chakra-accordion__button")
for accordion_index in range(accordions.count()):
accordions.nth(accordion_index).click()
+ page.wait_for_load_state("networkidle")

with page.expect_navigation() as navigation:
learn_more_buttons.nth(index).click()
+ page.wait_for_load_state("networkidle")

page.wait_for_selector("h2", timeout=10_000)
page_path = page.url.removeprefix(root_url_prefix)
print(f"Scraped page: {page_path}")

content_hub_pages[page_path] = page.content()

page.go_back()
+ page.wait_for_load_state("networkidle")

# Write the files to the `pages` directory
os.makedirs("pages", exist_ok=True)
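The scraper additions are all wait-state hardening for a JavaScript-rendered site: after logging in, after expanding accordions, after following a learn-more link, and after go_back(), the script now blocks on Playwright's wait_for_load_state("networkidle") (no network activity for roughly 500 ms) before touching the page again. A minimal sketch of the click/wait/capture/return loop; the URL and selector below are placeholders, not the Content Hub's real ones:

```python
from playwright.sync_api import sync_playwright

# Sketch of the scrape loop: click through to each page, wait for the SPA to
# settle, capture the HTML, then navigate back and settle again.
with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto("https://example.com/contenthub/")  # placeholder URL
    page.wait_for_load_state("networkidle")

    learn_more_buttons = page.locator("a.learn-more")  # placeholder selector
    scraped: dict[str, str] = {}
    for index in range(learn_more_buttons.count()):
        with page.expect_navigation():
            learn_more_buttons.nth(index).click()
        page.wait_for_load_state("networkidle")   # let async content finish loading
        page.wait_for_selector("h2", timeout=10_000)
        scraped[page.url] = page.content()
        page.go_back()
        page.wait_for_load_state("networkidle")   # settle before the next click
    browser.close()
```

"networkidle" is a blunt settle signal, but it pairs well with the existing wait_for_selector("h2") check as a second guard against capturing half-rendered pages.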
7 changes: 4 additions & 3 deletions app/src/util/ingest_utils.py
@@ -102,7 +102,7 @@ def process_and_ingest_sys_args(
args.benefit_program,
args.benefit_region,
default_config.common_base_url,
- default_config.md_base_dir,
+ default_config.scraper_dataset,
)

start_ingestion(
@@ -175,10 +175,11 @@ def add_embeddings(

for chunk, embedding, text in zip(chunks, embeddings, to_encode, strict=True):
chunk.mpnet_embedding = embedding
+ token_len = len(tokenize(text))
if not chunk.tokens:
- chunk.tokens = len(tokenize(text))
+ chunk.tokens = token_len
else:
- assert chunk.tokens == len(tokenize(text))
+ assert chunk.tokens == token_len, f"Token count mismatch: {chunk.tokens} != {token_len}"
assert (
chunk.tokens <= embedding_model.max_seq_length
), f"Text too long for embedding model: {chunk.tokens} tokens: {len(chunk.content)} chars: {chunk.content[:80]}...{chunk.content[-50:]}"