Merge branch 'main' into chore/bump-python-req

Unstructured-IO · Jan 21, 2025 · 0779f2b · 0779f2b
2 parents 160bfb1 + 4d0c20a
commit 0779f2b
Show file tree

Hide file tree

Showing 17 changed files with 278 additions and 1,858 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,29 @@
+## 0.8.4
+
+* feat: add `text_as_html` and `table_as_cells` to `LayoutElements` class as new attributes
+* feat: replace the single-valued `source` attribute from `TextRegions` and `LayoutElements` with an array attribute `sources`
+
+## 0.8.3
+
+* fix: removed `layoutelement.from_lp_textblock()` and related tests as it's not used
+* fix: update requirements to drop `layoutparser` lib
+* fix: update `README.md` to remove layoutparser model zoo support note
+
+## 0.8.2
+
+* fix: fix bug where passing an empty list into `TextRegions.from_list` triggers `IndexError`
+* fix: fix bug when concatenating a list of `LayoutElements` the class id mapping is not properly
+  updated
+
+## 0.8.1
+
+* fix: fix list index out of range error caused by calling LayoutElements.from_list() with empty list
+
+## 0.8.0
+
+* fix: fix missing source after cleaning layout elements
+* **BREAKING** Remove chipper model
+
 ## 0.7.41
 
 * fix: fix incorrect type casting with higher versions of `numpy` when substracting a `float` from an `int` array

diff --git a/README.md b/README.md
@@ -72,10 +72,6 @@ model = get_model("yolox")
 layout = DocumentLayout.from_file("sample-docs/layout-parser-paper.pdf", detection_model=model)
 ```
 
-### Using models from the layoutparser model zoo
-
-The `UnstructuredDetectronModel` class in `unstructured_inference.modelts.detectron2` uses the `faster_rcnn_R_50_FPN_3x` model pretrained on DocLayNet, but by using different construction parameters, any model in the `layoutparser` [model zoo](https://layout-parser.readthedocs.io/en/latest/notes/modelzoo.html) can be used. `UnstructuredDetectronModel` is a light wrapper around the `layoutparser` `Detectron2LayoutModel` object, and accepts the same arguments. See [layoutparser documentation](https://layout-parser.readthedocs.io/en/latest/api_doc/models.html#layoutparser.models.Detectron2LayoutModel) for details.
-
 ### Using your own model
 
 Any detection model can be used for in the `unstructured_inference` pipeline by wrapping the model in the `UnstructuredObjectDetectionModel` class. To integrate with the `DocumentLayout` class, a subclass of `UnstructuredObjectDetectionModel` must have a `predict` method that accepts a `PIL.Image.Image` and returns a list of `LayoutElement`s, and an `initialize` method, which loads the model and prepares it for inference.

diff --git a/requirements/base.in b/requirements/base.in
@@ -1,5 +1,4 @@
 -c constraints.in
-layoutparser
 python-multipart
 huggingface-hub
 numpy<2
@@ -12,3 +11,6 @@ timm
 # NOTE(alan): Pinned because this is when the most recent module we import appeared
 transformers>=4.25.1
 rapidfuzz
+pandas
+scipy
+pdfplumber
diff --git a/requirements/base.txt b/requirements/base.txt
@@ -4,58 +4,54 @@
 #
 #    pip-compile requirements/base.in
 #
-certifi==2024.8.30
+certifi==2024.12.14
     # via requests
 cffi==1.17.1
     # via cryptography
-charset-normalizer==3.3.2
+charset-normalizer==3.4.1
     # via
     #   pdfminer-six
     #   requests
 coloredlogs==15.0.1
     # via onnxruntime
 contourpy==1.3.0
     # via matplotlib
-cryptography==43.0.1
+cryptography==44.0.0
     # via pdfminer-six
 cycler==0.12.1
     # via matplotlib
-filelock==3.16.0
+filelock==3.16.1
     # via
     #   huggingface-hub
     #   torch
     #   transformers
-flatbuffers==24.3.25
+flatbuffers==24.12.23
     # via onnxruntime
-fonttools==4.53.1
+fonttools==4.55.3
     # via matplotlib
-fsspec==2024.9.0
+fsspec==2024.12.0
     # via
     #   huggingface-hub
     #   torch
-huggingface-hub==0.24.7
+huggingface-hub==0.27.1
     # via
     #   -r requirements/base.in
     #   timm
     #   tokenizers
     #   transformers
 humanfriendly==10.0
     # via coloredlogs
-idna==3.8
+idna==3.10
     # via requests
-importlib-resources==6.4.5
+importlib-resources==6.5.2
     # via matplotlib
-iopath==0.1.10
-    # via layoutparser
-jinja2==3.1.4
+jinja2==3.1.5
     # via torch
 kiwisolver==1.4.7
     # via matplotlib
-layoutparser==0.3.4
-    # via -r requirements/base.in
-markupsafe==2.1.5
+markupsafe==3.0.2
     # via jinja2
-matplotlib==3.9.2
+matplotlib==3.9.4
     # via -r requirements/base.in
 mpmath==1.3.0
     # via sympy
@@ -65,7 +61,6 @@ numpy==1.26.4
     # via
     #   -r requirements/base.in
     #   contourpy
-    #   layoutparser
     #   matplotlib
     #   onnx
     #   onnxruntime
@@ -74,107 +69,96 @@ numpy==1.26.4
     #   scipy
     #   torchvision
     #   transformers
-onnx==1.16.2
+onnx==1.17.0
     # via -r requirements/base.in
 onnxruntime==1.19.2
     # via -r requirements/base.in
-opencv-python==4.10.0.84
-    # via
-    #   -r requirements/base.in
-    #   layoutparser
-packaging==24.1
+opencv-python==4.11.0.86
+    # via -r requirements/base.in
+packaging==24.2
     # via
     #   huggingface-hub
     #   matplotlib
     #   onnxruntime
     #   transformers
-pandas==2.2.2
-    # via layoutparser
-pdf2image==1.17.0
-    # via layoutparser
+pandas==2.2.3
+    # via -r requirements/base.in
 pdfminer-six==20231228
     # via pdfplumber
-pdfplumber==0.11.4
-    # via layoutparser
-pillow==10.4.0
+pdfplumber==0.11.5
+    # via -r requirements/base.in
+pillow==11.1.0
     # via
-    #   layoutparser
     #   matplotlib
-    #   pdf2image
     #   pdfplumber
     #   torchvision
-portalocker==2.10.1
-    # via iopath
-protobuf==5.28.1
+protobuf==5.29.3
     # via
     #   onnx
     #   onnxruntime
 pycparser==2.22
     # via cffi
-pyparsing==3.1.4
+pyparsing==3.2.1
     # via matplotlib
-pypdfium2==4.30.0
+pypdfium2==4.30.1
     # via pdfplumber
 python-dateutil==2.9.0.post0
     # via
     #   matplotlib
     #   pandas
-python-multipart==0.0.9
+python-multipart==0.0.20
     # via -r requirements/base.in
 pytz==2024.2
     # via pandas
 pyyaml==6.0.2
     # via
     #   huggingface-hub
-    #   layoutparser
     #   timm
     #   transformers
-rapidfuzz==3.9.7
+rapidfuzz==3.11.0
     # via -r requirements/base.in
-regex==2024.9.11
+regex==2024.11.6
     # via transformers
 requests==2.32.3
     # via
     #   huggingface-hub
     #   transformers
-safetensors==0.4.5
+safetensors==0.5.2
     # via
     #   timm
     #   transformers
 scipy==1.13.1
-    # via layoutparser
-six==1.16.0
+    # via -r requirements/base.in
+six==1.17.0
     # via python-dateutil
-sympy==1.13.2
+sympy==1.13.1
     # via
     #   onnxruntime
     #   torch
-timm==1.0.9
+timm==1.0.13
     # via -r requirements/base.in
-tokenizers==0.19.1
+tokenizers==0.21.0
     # via transformers
-torch==2.4.1
+torch==2.5.1
     # via
     #   -r requirements/base.in
     #   timm
     #   torchvision
-torchvision==0.19.1
+torchvision==0.20.1
     # via timm
-tqdm==4.66.5
+tqdm==4.67.1
     # via
     #   huggingface-hub
-    #   iopath
     #   transformers
-transformers==4.44.2
+transformers==4.48.0
     # via -r requirements/base.in
 typing-extensions==4.12.2
     # via
     #   huggingface-hub
-    #   iopath
     #   torch
-tzdata==2024.1
+tzdata==2024.2
     # via pandas
-urllib3==2.2.3
+urllib3==2.3.0
     # via requests
-zipp==3.20.2
+zipp==3.21.0
     # via importlib-resources