From 31e1907db567753aa9394f260d6d2350d11bda5e Mon Sep 17 00:00:00 2001 From: Edward Kim <109497216+edknv@users.noreply.github.com> Date: Fri, 28 Feb 2025 11:08:08 -0800 Subject: [PATCH] filter out null content before embedding (#498) --- src/nv_ingest/stages/embeddings/text_embeddings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nv_ingest/stages/embeddings/text_embeddings.py b/src/nv_ingest/stages/embeddings/text_embeddings.py index b4db7f64..b00a1a94 100644 --- a/src/nv_ingest/stages/embeddings/text_embeddings.py +++ b/src/nv_ingest/stages/embeddings/text_embeddings.py @@ -303,7 +303,7 @@ def _generate_text_embeddings_df( # Extract content from metadata and filter out rows with empty content. extracted_content = df.loc[content_mask, "metadata"].apply(content_getter) - non_empty_mask = extracted_content.str.strip() != "" + non_empty_mask = extracted_content.notna() & (extracted_content.str.strip() != "") final_mask = content_mask & non_empty_mask if not final_mask.any(): continue