Skip to content

Commit

Permalink
Fix grouping textlines when bounding box of parent container is wrong (
Browse files Browse the repository at this point in the history
…#386)

* Default value for --all-texts should be false, because using the flag enables it

* Fix edge case: when no neighbors are found a line should form its own text box

* Added test for grouping text lines where one line is outside the parent bounding box

* Added CHANGELOG.md line
  • Loading branch information
pietermarsman authored Mar 14, 2020
1 parent 7e91d4e commit 1d773dc
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 6 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [Unreleased]

Nothing
### Fixed
- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))

## [20200124] - 2020-01-24

Expand Down
6 changes: 2 additions & 4 deletions pdfminer/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -597,16 +597,14 @@ def group_objects(self, laparams, objs):
yield line
return

# group_textlines: group neighboring lines to textboxes.
def group_textlines(self, laparams, lines):
"""Group neighboring lines to textboxes"""
plane = Plane(self.bbox)
plane.extend(lines)
boxes = {}
for line in lines:
neighbors = line.find_neighbors(plane, laparams.line_margin)
if line not in neighbors:
continue
members = []
members = [line]
for obj1 in neighbors:
members.append(obj1)
if obj1 in boxes:
Expand Down
23 changes: 23 additions & 0 deletions tests/test_layout.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import unittest

from pdfminer.layout import LTLayoutContainer, LAParams, LTTextLineHorizontal


class TestGroupTextLines(unittest.TestCase):
    """Tests for LTLayoutContainer.group_textlines()."""

    def test_parent_with_wrong_bbox_returns_non_empty_neighbour_list(self):
        """
        Two text lines that do not overlap must each be returned in their
        own text box by LTLayoutContainer.group_textlines(), even though
        the second line's bounding box (0, 50, 50, 55) is not contained in
        the parent container's bounding box (0, 0, 50, 50).
        """
        params = LAParams()
        container = LTLayoutContainer((0, 0, 50, 50))

        # First line lies within the container's bounding box.
        inner_line = LTTextLineHorizontal(params.word_margin)
        inner_line.set_bbox((0, 0, 50, 5))

        # Second line starts at the container's top edge and extends past it.
        outer_line = LTTextLineHorizontal(params.word_margin)
        outer_line.set_bbox((0, 50, 50, 55))

        grouped = list(
            container.group_textlines(params, [inner_line, outer_line])
        )

        self.assertEqual(len(grouped), 2)
2 changes: 1 addition & 1 deletion tools/pdf2txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def maketheparser():
"should be within the range of -1.0 (only horizontal position "
"matters) to +1.0 (only vertical position matters).")
la_params.add_argument(
"--all-texts", "-A", default=True, action="store_true",
"--all-texts", "-A", default=False, action="store_true",
help="If layout analysis should be performed on text in figures.")

output_params = parser.add_argument_group(
Expand Down

0 comments on commit 1d773dc

Please sign in to comment.