Skip to content

Commit

Permalink
fix: revise headers in public charge content (#183)
Browse files Browse the repository at this point in the history
Co-authored-by: Yoom Lam <[email protected]>
  • Loading branch information
ccheng26 and yoomlam authored Jan 17, 2025
1 parent 66f5d13 commit 1181eef
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 22 deletions.
8 changes: 6 additions & 2 deletions app/src/ingestion/markdown_chunking.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,11 +316,15 @@ def chunk_tree(input_tree: Tree, config: ChunkingConfig) -> list[ProtoChunk]:
unchunked_ids = input_data_ids - chunked_data_ids
assert not unchunked_ids, f"Expected {unchunked_ids} to be chunked"

# Identify which chunk each node is in
# Identify which chunk each node is in, ignoring the Document root node
data_id_to_chunk_id = {id: pc.id for pc in config.chunks for id in pc.data_ids}
logger.debug(
"Node-to-chunk mapping: %s",
{id: data_id_to_chunk_id[id] for id in [n.data_id for n in input_tree.iterator()]},
{
n.data_id: data_id_to_chunk_id[n.data_id]
for n in input_tree.iterator()
if n.data_id != doc_node.data_id
},
)
return config.chunks

Expand Down
12 changes: 4 additions & 8 deletions app/src/ingestion/scrapy_dst/spiders/ca_public_charge_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,8 @@ class CaPublicChargeSpider(CrawlSpider):

def parse_page(self, response: HtmlResponse) -> dict[str, str | AccordionSections]:
extractions = {"url": response.url}
if len(response.css("h4::text").getall()) == 1:
title = response.css("h4.title::text").get()
extractions["title"] = title.strip()
else:
titles = ";".join(response.css("h4.title::text").getall())
extractions["title"] = titles
title = response.css("title::text").get().removesuffix("| Keep Your Benefits")
extractions["title"] = title.strip()
base_url = response.url

# remove icon text
Expand Down Expand Up @@ -86,7 +82,7 @@ def to_markdown(self, base_url: str, html: str) -> str:
return markdown.strip()

def parse_main_primary(self, base_url: str, main_primary: SelectorList) -> dict[str, str]:
markdown = self.to_markdown(base_url, main_primary.get())
markdown = self.to_markdown(base_url, main_primary.get()).replace("\r", "").strip()
return {"main_primary": markdown}

def parse_main_content(self, base_url: str, main_content: SelectorList) -> dict[str, str]:
Expand All @@ -102,4 +98,4 @@ def parse_main_content(self, base_url: str, main_content: SelectorList) -> dict[
for middle_detail in middler_details:
markdown += "\n" + self.to_markdown(base_url, middle_detail)

return {"main_content": markdown}
return {"main_content": markdown.replace("\r", "").strip()}
75 changes: 63 additions & 12 deletions app/tests/src/test_ingest_ca_public_charge.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,74 @@
from tempfile import TemporaryDirectory

import pytest
from sqlalchemy import delete
from sqlalchemy import delete, select

from src.app_config import app_config as app_config_for_test
from src.db.models.document import Document
from src.ingest_ca_public_charge import _ingest_ca_public_charge

from .test_ingest_edd_web import check_database_contents, sample_cards # noqa: F401


@pytest.fixture
def sample_markdown(sample_cards): # noqa: F811
items = json.loads(sample_cards)
for item in items:
item["h4"] = item["title"]
item["markdown"] = item.get("main_content", item.get("main_primary"))
return json.dumps(items)
def sample_markdown():  # noqa: F811
    """Return a JSON string describing four scraped keepyourbenefits.org pages.

    Every entry has ``url``, ``title`` and ``main_primary`` keys; all entries
    except the last one ("News Updates") also have a ``main_content`` key.
    """
    home_page = {
        "url": "https://keepyourbenefits.org/en/ca/",
        "title": "CALIFORNIA - Public Charge, Benefits and Immigration",
        "main_primary": "#### The federal government’s final public charge rules are now in effect. Get the facts about public charge & immigration.\n\n Rules about public benefit programs and immigrants are confusing. But benefits can help your family stay healthy and thrive. \n\n Click [Use the Guide](https://keepyourbenefits.org/en/ca/use-the-guide) to see if public benefits could affect different immigration options.\n\n[Use the Guide](https://keepyourbenefits.org/en/ca/use-the-guide)",
        "main_content": "### Who is affected by the Public Charge Rule?\n\n* It does not apply to:\n- U.S. Citizens or people applying for citizenship. \n - Lawful Permanent residents (Green Card holders) unless the Green Card holder leaves the U.S. for more than 6 months. A Public Charge assessment can apply when they try to return. \n - People applying for Green Card renewal or DACA renewal.",
    }
    explained_page = {
        "url": "https://keepyourbenefits.org/en/ca/public-charge",
        "title": "CALIFORNIA - Public Charge Explained",
        "main_primary": '#### Public Charge Explained\n\n "Public Charge" makes people afraid to use public benefits. But the Public Charge Rule does not affect every immigration application. And most immigrants who face a Public Charge test do not receive the benefits that count. \n\n This article will explain: \n\n* What is Public Charge\n* What benefits are included in the rule\n* Who is affected by the rule and who is not',
        "main_content": "### Public Benefits are part of the Public Charge Test\n\nOnly these public benefits* obtained for the immigrant are considered in the Public Charge Test. \n\n* • Cash benefits for income maintenance \n - SSI (Supplemental Security Income) \n - CalWorks/TANF (Temporary Assistance for Needy Families) \n - CAPI (Cash Assistance Programs for Immigrants) \n - GA (General Assistance/Relief)",
    }
    resources_page = {
        "url": "https://keepyourbenefits.org/en/ca/resources",
        "title": "CALIFORNIA - FAQ and Resources: What is Public Charge",
        "main_primary": "#### FAQs and Resources\n\nWhat is Public Charge and what benefits are included? \n\nFind answers and information sources to these and other public charge questions\n\n[Understanding Public Charge](https://keepyourbenefits.org/en/ca/resources/public-charge)",
        "main_content": "#### FAQs and Resources\n\nWhat is Public Charge and what benefits are included? \n\nFind answers and information sources to these and other public charge questions\n\n[Understanding Public Charge](https://keepyourbenefits.org/en/ca/resources/public-charge)\n#### Frequently Asked Questions\n\n\n Learn more before making decisions about public benefits for you and your family. Here are a few Frequently Asked Questions to get you started:\n\n**Q. What are public benefits?**\n\n\n A. Public benefits are government benefits like food, cash, housing, and medical assistance for people with low or no income. ",
    }
    # The updates page intentionally has no "main_content" entry.
    updates_page = {
        "url": "https://keepyourbenefits.org/en/ca/updates",
        "title": "CALIFORNIA - News Updates (English)",
        "main_primary": '##### Nov 12, 2024: Important Update for Immigrant Families\n\n\nDespite the recent election, no immigration or public benefits rules have changed or are likely to change before January 20, 2025. We are closely monitoring any policy changes and will keep this page current with the latest information. Please check back here for updates and reliable guidance.\n\n \n\n##### Nov 12, 2024: Deferred Action for Childhood Arrivals (DACA) Health Coverage Update\n\nStarting November 1, 2024, people with DACA status can sign up for health and dental plans through Covered California. Those who qualify may get help paying for their plan.\n\nDACA recipients have a special enrollment period from November 1 to December 31, 2024. To sign up, select "gained lawful presence" on the application. If you enroll in November, your coverage could start as soon as December 1, 2024.\n\nThis special enrollment period overlaps with Covered California’s open enrollment.',
    }
    pages = [home_page, explained_page, resources_page, updates_page]
    return json.dumps(pages)


def check_database_contents(db_session, caplog):
    """Assert that the four sample public-charge pages were ingested as expected.

    Checks every document's name/source pair, then spot-checks chunk content and
    headings for the "FAQ and Resources" and "News Updates" documents.

    Args:
        db_session: database session used to query ``Document`` rows.
        caplog: accepted but not used by this function.

    NOTE(review): this definition shadows the ``check_database_contents`` name
    imported from ``test_ingest_edd_web`` near the top of the file; confirm
    whether that import is still wanted.
    """
    documents = db_session.execute(select(Document).order_by(Document.name)).scalars().all()
    assert len(documents) == 4

    # Look documents up by name rather than by position. The position produced
    # by ORDER BY name depends on the database collation: with byte-wise ordering
    # "CALIFORNIA - Public Charge Explained" sorts before
    # "CALIFORNIA - Public Charge, Benefits and Immigration" (space < comma),
    # while a locale-aware collation may order them the other way round.
    docs_by_name = {doc.name: doc for doc in documents}
    expected_sources = {
        "CALIFORNIA - FAQ and Resources: What is Public Charge": "https://keepyourbenefits.org/en/ca/resources",
        "CALIFORNIA - News Updates (English)": "https://keepyourbenefits.org/en/ca/updates",
        "CALIFORNIA - Public Charge Explained": "https://keepyourbenefits.org/en/ca/public-charge",
        "CALIFORNIA - Public Charge, Benefits and Immigration": "https://keepyourbenefits.org/en/ca/",
    }
    assert {name: doc.source for name, doc in docs_by_name.items()} == expected_sources

    faq_doc = docs_by_name["CALIFORNIA - FAQ and Resources: What is Public Charge"]
    assert len(faq_doc.chunks) == 2

    assert faq_doc.chunks[0].content.startswith("#### Frequently Asked Questions")
    assert faq_doc.chunks[0].headings == ["CALIFORNIA - FAQ and Resources: What is Public Charge"]
    assert faq_doc.chunks[1].content.startswith(
        "#### FAQs and Resources\n\nWhat is Public Charge and what benefits are included"
    )
    assert faq_doc.chunks[1].headings == ["CALIFORNIA - FAQ and Resources: What is Public Charge"]

    # The "News Updates" page has only a main_primary section in the sample data.
    news_doc = docs_by_name["CALIFORNIA - News Updates (English)"]
    assert len(news_doc.chunks) == 3
    assert news_doc.chunks[0].content.startswith(
        "##### Nov 12, 2024: Important Update for Immigrant Families"
    )
    assert news_doc.chunks[0].headings == ["CALIFORNIA - News Updates (English)"]


@pytest.fixture
Expand All @@ -29,15 +81,14 @@ def ca_public_charge_local_file(tmp_path, sample_markdown):


doc_attribs = {
"dataset": "keepyourbenefits.org",
"dataset": "Keep Your Benefits",
"program": "mixed",
"region": "California",
}


def test_ingestion(caplog, app_config, db_session, ca_public_charge_local_file):
# Force a short max_seq_length to test chunking
app_config_for_test.sentence_transformer.max_seq_length = 47
app_config_for_test.sentence_transformer.max_seq_length = 75

db_session.execute(delete(Document))

Expand Down

0 comments on commit 1181eef

Please sign in to comment.