Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle lack of support for docx/pptx/xlsx for media description #2260

Merged
merged 7 commits into from
Jan 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 25 additions & 9 deletions app/backend/prepdocslib/pdfparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
)
from azure.core.credentials import AzureKeyCredential
from azure.core.credentials_async import AsyncTokenCredential
from azure.core.exceptions import HttpResponseError
from PIL import Image
from pypdf import PdfReader

Expand Down Expand Up @@ -68,6 +69,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
async with DocumentIntelligenceClient(
endpoint=self.endpoint, credential=self.credential
) as document_intelligence_client:
file_analyzed = False
if self.use_content_understanding:
if self.content_understanding_endpoint is None:
raise ValueError("Content Understanding is enabled but no endpoint was provided")
Expand All @@ -77,15 +79,29 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
)
cu_describer = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.credential)
content_bytes = content.read()
poller = await document_intelligence_client.begin_analyze_document(
model_id="prebuilt-layout",
analyze_request=AnalyzeDocumentRequest(bytes_source=content_bytes),
output=["figures"],
features=["ocrHighResolution"],
output_content_format="markdown",
)
doc_for_pymupdf = pymupdf.open(stream=io.BytesIO(content_bytes))
else:
try:
poller = await document_intelligence_client.begin_analyze_document(
model_id="prebuilt-layout",
analyze_request=AnalyzeDocumentRequest(bytes_source=content_bytes),
output=["figures"],
features=["ocrHighResolution"],
output_content_format="markdown",
)
doc_for_pymupdf = pymupdf.open(stream=io.BytesIO(content_bytes))
file_analyzed = True
except HttpResponseError as e:
content.seek(0)
if e.error and e.error.code == "InvalidArgument":
logger.error(
"This document type does not support media description. Proceeding with standard analysis."
)
else:
logger.error(
"Unexpected error analyzing document for media description: %s. Proceeding with standard analysis.",
e,
)

if file_analyzed is False:
poller = await document_intelligence_client.begin_analyze_document(
model_id=self.model_id, analyze_request=content, content_type="application/octet-stream"
)
Expand Down
4 changes: 3 additions & 1 deletion docs/deploy_features.md
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,6 @@ By default, if your documents contain image-like figures, the data ingestion pro
so users will not be able to ask questions about them.

You can optionally enable the description of media content using Azure Content Understanding. When enabled, the data ingestion process will send figures to Azure Content Understanding and replace each figure with its description in the indexed document.
To learn more about this process and compare it to the gpt-4 vision integration, see [this guide](./data_ingestion.md#media-description).

To enable media description with Azure Content Understanding, run:

Expand All @@ -175,6 +174,9 @@ If you have already run `azd up`, you will need to run `azd provision` to create
If you have already indexed your documents and want to re-index them with the media descriptions,
first [remove the existing documents](./data_ingestion.md#removing-documents) and then [re-ingest the data](./data_ingestion.md#indexing-additional-documents).

⚠️ This feature does not yet support the DOCX, PPTX, or XLSX formats. Figures inside documents of those formats will be ignored.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning looks good!

Convert those documents to PDF or image formats first to enable media description.

## Enabling client-side chat history

This feature allows users to view the chat history of their conversation, stored in the browser using [IndexedDB](https://developer.mozilla.org/docs/Web/API/IndexedDB_API). That means the chat history will be available only on the device where the chat was initiated. To enable browser-stored chat history, run:
Expand Down
64 changes: 63 additions & 1 deletion tests/test_pdfparser.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import io
import json
import logging
import math
import pathlib
from unittest.mock import AsyncMock, MagicMock
from unittest.mock import AsyncMock, MagicMock, Mock

import pymupdf
import pytest
Expand All @@ -17,6 +18,7 @@
DocumentTable,
DocumentTableCell,
)
from azure.core.exceptions import HttpResponseError
from PIL import Image, ImageChops

from prepdocslib.mediadescriber import ContentUnderstandingDescriber
Expand Down Expand Up @@ -308,3 +310,63 @@ async def mock_describe_image(self, image_bytes):
pages[0].text
== "# Simple Figure\n\nThis text is before the figure and NOT part of it.\n\n\n<figure><figcaption>Figure 1<br>Pie chart</figcaption></figure>\n\n\nThis is text after the figure that's not part of it."
)


@pytest.mark.asyncio
async def test_parse_unsupportedformat(monkeypatch, caplog):
    """Check that an unsupported format falls back to standard analysis.

    The media-description request (the one sent with
    ``features=["ocrHighResolution"]``) is made to fail with an
    ``InvalidArgument`` HTTP error. The parser is expected to log an error,
    issue a plain analysis request with the original stream, and still yield
    the analyzed page.
    """
    mock_poller = MagicMock()
    # Keyword arguments of every begin_analyze_document call, in call order.
    analyze_calls = []

    def make_invalid_argument_error():
        # HttpResponseError builds its ``error`` attribute from the OData-style
        # JSON body of the response, so the error code the parser checks has to
        # be supplied through the response body rather than set on the instance.
        body = {"error": {"code": "InvalidArgument", "message": "A fake error"}}
        response = Mock(status_code=500, headers={"content-type": "application/json"})
        response.content_type = "application/json"
        response.text = lambda encoding=None: json.dumps(body)
        return HttpResponseError(response=response)

    async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
        analyze_calls.append({"model_id": model_id, "analyze_request": analyze_request, **kwargs})
        # Only the media-description request asks for high-resolution OCR.
        if kwargs.get("features") == ["ocrHighResolution"]:
            raise make_invalid_argument_error()
        return mock_poller

    async def mock_poller_result():
        return AnalyzeResult(
            content="Page content",
            pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=12)])],
            tables=[],
            figures=[],
        )

    monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document)
    monkeypatch.setattr(mock_poller, "result", mock_poller_result)

    parser = DocumentAnalysisParser(
        endpoint="https://example.com",
        credential=MockAzureCredential(),
        use_content_understanding=True,
        content_understanding_endpoint="https://example.com",
    )
    content = io.BytesIO(b"pdf content bytes")
    content.name = "test.docx"
    with caplog.at_level(logging.ERROR):
        pages = [page async for page in parser.parse(content)]
        assert "This document type does not support media description." in caplog.text

    # The failed media-description request must be followed by exactly one
    # standard request that is handed the original stream.
    assert [call.get("features") for call in analyze_calls] == [["ocrHighResolution"], None]
    assert analyze_calls[-1]["analyze_request"] is content

    assert len(pages) == 1
    assert pages[0].page_num == 0
    assert pages[0].offset == 0
    assert pages[0].text == "Page content"
Loading