Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: added extraction of byte-images in excel #804

Merged
Merged 6 commits on Jan 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 50 additions & 37 deletions docling/backend/msexcel_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

from typing import Any, List

from PIL import Image as PILImage
from pydantic import BaseModel


Expand Down Expand Up @@ -325,49 +326,61 @@ def _find_images_in_sheet(
self, doc: DoclingDocument, sheet: Worksheet
) -> DoclingDocument:

# FIXME: mypy does not agree with _images ...
"""
# Iterate over images in the sheet
for idx, image in enumerate(sheet._images): # Access embedded images
# Iterate over byte images in the sheet
for idx, image in enumerate(sheet._images): # type: ignore

image_bytes = BytesIO(image.ref.blob)
pil_image = Image.open(image_bytes)
try:
pil_image = PILImage.open(image.ref)

doc.add_picture(
parent=self.parents[0],
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
)
"""
doc.add_picture(
parent=self.parents[0],
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
)
except:
_log.error("could not extract the image from excel sheets")

# FIXME: mypy does not agree with _charts ...
"""
for idx, chart in enumerate(sheet._charts): # Access embedded charts
chart_path = f"chart_{idx + 1}.png"
_log.info(
f"Chart found, but dynamic rendering is required for: {chart_path}"
)
for idx, chart in enumerate(sheet._charts): # type: ignore
try:
chart_path = f"chart_{idx + 1}.png"
_log.info(
f"Chart found, but dynamic rendering is required for: {chart_path}"
)

_log.info(f"Chart {idx + 1}:")

# Chart type
_log.info(f"Type: {type(chart).__name__}")

# Title
if chart.title:
_log.info(f"Title: {chart.title}")
else:
_log.info("No title")

# Data series
for series in chart.series:
_log.info(" => series ...")
_log.info(f"Data Series: {series.title}")
_log.info(f"Values: {series.values}")
_log.info(f"Categories: {series.categories}")
_log.info(f"Chart {idx + 1}:")

# Position
# _log.info(f"Anchor Cell: {chart.anchor}")
# Chart type
# _log.info(f"Type: {type(chart).__name__}")
print(f"Type: {type(chart).__name__}")

# Extract series data
for series_idx, series in enumerate(chart.series):
#_log.info(f"Series {series_idx + 1}:")
print(f"Series {series_idx + 1} type: {type(series).__name__}")
#print(f"x-values: {series.xVal}")
#print(f"y-values: {series.yVal}")

print(f"xval type: {type(series.xVal).__name__}")

xvals = []
for _ in series.xVal.numLit.pt:
print(f"xval type: {type(_).__name__}")
if hasattr(_, 'v'):
xvals.append(_.v)

print(f"x-values: {xvals}")

yvals = []
for _ in series.yVal:
if hasattr(_, 'v'):
yvals.append(_.v)

print(f"y-values: {yvals}")

except Exception as exc:
print(exc)
continue
"""

return doc
4 changes: 2 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ onnxruntime = [
{ version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
]
pillow = "^10.0.0"

[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"}
Expand Down
3 changes: 2 additions & 1 deletion tests/data/groundtruth/docling_v2/test-01.xlsx.itxt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ item-0 at level 0: unspecified: group _root_
item-6 at level 2: table with [5x3]
item-7 at level 1: section: group sheet: Sheet3
item-8 at level 2: table with [7x3]
item-9 at level 2: table with [7x3]
item-9 at level 2: table with [7x3]
item-10 at level 2: picture
30 changes: 28 additions & 2 deletions tests/data/groundtruth/docling_v2/test-01.xlsx.json

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion tests/data/groundtruth/docling_v2/test-01.xlsx.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,6 @@
| 3 | 4 | 5 |
| 3 | 6 | 7 |
| 8 | 9 | 9 |
| 10 | 9 | 9 |
| 10 | 9 | 9 |

<!-- image -->
Binary file modified tests/data/xlsx/test-01.xlsx
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/test_backend_msexcel.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def test_e2e_xlsx_conversions():
converter = get_converter()

for xlsx_path in xlsx_paths:
# print(f"converting {xlsx_path}")
print(f"converting {xlsx_path}")

gt_path = (
xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name
Expand Down
Loading