fix: Update Imagine LA Content Hub scraper and ingester (#198)
yoomlam authored Jan 28, 2025
1 parent 8097e72 commit 2506041
Showing 4 changed files with 21 additions and 7 deletions.
14 changes: 10 additions & 4 deletions app/src/ingestion/imagine_la/ingest.py
@@ -21,7 +21,6 @@
save_json,
tokenize,
)
- from src.util.string_utils import remove_links

logger = logging.getLogger(__name__)

@@ -38,6 +37,7 @@ def _parse_html(
md_base_dir: str, common_base_url: str, file_path: str, doc_attribs: dict[str, str]
) -> tuple[Document, Sequence[Chunk], Sequence[str]]:

logger.info("Reading %r", file_path)
with open(file_path, "r") as file:
file_contents = file.read()
soup = BeautifulSoup(file_contents, "html.parser")
@@ -75,10 +75,12 @@ def _parse_html(
tree = create_markdown_tree(content, doc_name=document.name, doc_source=document.source)
tree_chunks = chunk_tree(tree, ImagineLaChunkingConfig())
chunks = [
- Chunk(content=chunk.markdown, document=document, headings=chunk.headings)
+ Chunk(
+     content=chunk.markdown, document=document, headings=chunk.headings, tokens=chunk.length
+ )
for chunk in tree_chunks
]
- chunk_texts_to_encode = [remove_links(chunk.markdown) for chunk in tree_chunks]
+ chunk_texts_to_encode = [t_chunk.embedding_str for t_chunk in tree_chunks]

chunks_file_path = f"{file_path}.chunks.json"
logger.info(" Saving chunks to %r", chunks_file_path)
@@ -119,17 +121,21 @@ def _ingest_content_hub(
logger.info("Skip saving to DB")
else:
for document, chunks, chunk_texts_to_encode in all_chunks:
logger.info("Adding embeddings for %r", document.source)
add_embeddings(chunks, chunk_texts_to_encode)
db_session.add(document)
db_session.add_all(chunks)


def main() -> None:
+ # Print INFO messages since this is often run from the terminal during local development
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

default_config = IngestConfig(
"Imagine LA",
"mixed",
"California",
"https://socialbenefitsnavigator25.web.app/contenthub/",
"imagine_la_md",
"imagine_la",
)
process_and_ingest_sys_args(sys.argv, logger, _ingest_content_hub, default_config)
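Taken together, the ingest.py changes do two things: each stored Chunk now carries the token count that tree chunking already computed (chunk.length), and the text sent to the embedding model is the chunk's prepared embedding_str rather than remove_links(chunk.markdown). A minimal sketch of the resulting pattern, with a hypothetical TreeChunk standing in for the real chunk_tree() output:

```python
from dataclasses import dataclass


@dataclass
class TreeChunk:
    """Hypothetical stand-in for the objects chunk_tree() yields."""

    markdown: str        # chunk text with links intact, kept for storage
    headings: list[str]  # heading path leading to this chunk
    length: int          # token count computed once during chunking
    embedding_str: str   # normalized text prepared for the embedding model


def to_chunks_and_texts(tree_chunks: list[TreeChunk]) -> tuple[list[dict], list[str]]:
    # Store the full markdown and carry the precomputed token count along;
    # embed the cleaned embedding_str rather than re-stripping links here.
    chunks = [
        {"content": c.markdown, "headings": c.headings, "tokens": c.length}
        for c in tree_chunks
    ]
    texts_to_encode = [c.embedding_str for c in tree_chunks]
    return chunks, texts_to_encode
```

Recording the count at chunking time lets add_embeddings (changed below in ingest_utils.py) verify the stored value against the embedding tokenizer instead of recomputing it from scratch.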
1 change: 1 addition & 0 deletions app/src/ingestion/imagine_la/scrape/.gitignore
@@ -0,0 +1 @@
+ /pages*/
6 changes: 6 additions & 0 deletions app/src/ingestion/imagine_la/scrape/scrape_content_hub.py
@@ -48,6 +48,8 @@
quit()
password_field.fill(password)
password_field.press("Enter")
+ page.wait_for_load_state("networkidle")
+ print("Logged in")

# Wait for the page to load by ensuring an element (e.g., an <h2> tag) is present
page.wait_for_selector("h2", timeout=10_000)
@@ -63,16 +65,20 @@
accordions = page.locator(".chakra-accordion__button")
for accordion_index in range(accordions.count()):
accordions.nth(accordion_index).click()
+ page.wait_for_load_state("networkidle")

with page.expect_navigation() as navigation:
learn_more_buttons.nth(index).click()
+ page.wait_for_load_state("networkidle")

page.wait_for_selector("h2", timeout=10_000)
page_path = page.url.removeprefix(root_url_prefix)
print(f"Scraped page: {page_path}")

content_hub_pages[page_path] = page.content()

page.go_back()
+ page.wait_for_load_state("networkidle")

# Write the files to the `pages` directory
os.makedirs("pages", exist_ok=True)
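The scraper additions are all wait-state hardening for a JavaScript-rendered site: after logging in, after expanding accordions, after following a learn-more link, and after go_back(), the script now blocks on Playwright's wait_for_load_state("networkidle") (no network activity for roughly 500 ms) before touching the page again. A minimal sketch of the click/wait/capture/return loop; the URL and selector below are placeholders, not the Content Hub's real ones:

```python
from playwright.sync_api import sync_playwright

# Sketch of the scrape loop: click through to each page, wait for the SPA to
# settle, capture the HTML, then navigate back and settle again.
with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto("https://example.com/contenthub/")  # placeholder URL
    page.wait_for_load_state("networkidle")

    learn_more_buttons = page.locator("a.learn-more")  # placeholder selector
    scraped: dict[str, str] = {}
    for index in range(learn_more_buttons.count()):
        with page.expect_navigation():
            learn_more_buttons.nth(index).click()
        page.wait_for_load_state("networkidle")   # let async content finish loading
        page.wait_for_selector("h2", timeout=10_000)
        scraped[page.url] = page.content()
        page.go_back()
        page.wait_for_load_state("networkidle")   # settle before the next click
    browser.close()
```

"networkidle" is a blunt settle signal, but it pairs well with the existing wait_for_selector("h2") check as a second guard against capturing half-rendered pages.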
7 changes: 4 additions & 3 deletions app/src/util/ingest_utils.py
@@ -102,7 +102,7 @@ def process_and_ingest_sys_args(
args.benefit_program,
args.benefit_region,
default_config.common_base_url,
- default_config.md_base_dir,
+ default_config.scraper_dataset,
)

start_ingestion(
@@ -175,10 +175,11 @@ def add_embeddings(

for chunk, embedding, text in zip(chunks, embeddings, to_encode, strict=True):
chunk.mpnet_embedding = embedding
+ token_len = len(tokenize(text))
if not chunk.tokens:
- chunk.tokens = len(tokenize(text))
+ chunk.tokens = token_len
else:
- assert chunk.tokens == len(tokenize(text))
+ assert chunk.tokens == token_len, f"Token count mismatch: {chunk.tokens} != {token_len}"
assert (
chunk.tokens <= embedding_model.max_seq_length
), f"Text too long for embedding model: {chunk.tokens} tokens: {len(chunk.content)} chars: {chunk.content[:80]}...{chunk.content[-50:]}"