Fix regression in page layout that sometimes returned text lines out of order (#659)

* add a test
* fix the bug
* rewrap long lines
* update CHANGELOG
* re-merge CHANGELOG

Co-authored-by: Pieter Marsman <[email protected]>
pdfminer · Jan 26, 2022 · 95dee8d · 95dee8d
1 parent 9a644aa
commit 95dee8d
Show file tree

Hide file tree

Showing 4 changed files with 16 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 - Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
+- Regression (since 20191107) in `LTLayoutContainer.group_textboxes` that returned some text lines out of order ([#659](https://github.com/pdfminer/pdfminer.six/pull/659))
 - Add handling of JPXDecode filter to enable extraction of images for some pdfs ([#645](https://github.com/pdfminer/pdfminer.six/pull/645))
 - Fix extraction of jbig2 files, which was producing invalid files ([#652](https://github.com/pdfminer/pdfminer.six/pull/653))
 - Crash in `pdf2txt.py --boxes-flow=disabled` ([#682](https://github.com/pdfminer/pdfminer.six/pull/682))

diff --git a/pdfminer/layout.py b/pdfminer/layout.py
@@ -889,7 +889,7 @@ def isany(obj1: ElementT, obj2: ElementT) -> Set[ElementT]:
             (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
             # Skip objects that are already merged
             if (id1 not in done) and (id2 not in done):
-                if skip_isany and isany(obj1, obj2):
+                if not skip_isany and isany(obj1, obj2):
                     heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
                     continue
                 if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \

diff --git a/samples/simple5.pdf b/samples/simple5.pdf
diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py
@@ -31,6 +31,10 @@ def run_with_file(sample_path):
     "simple3.pdf": "Hello\n\nHello\nあ\nい\nう\nえ\nお\nあ\nい\nう\nえ\nお\n"
                    "World\n\nWorld\n\n\f",
     "simple4.pdf": "Text1\nText2\nText3\n\n\f",
+    "simple5.pdf": "Heading\n\n"
+                   "Link to heading that is working with vim-pandoc.\n\n"
+                   "Link to heading “that is” not working with vim-pandoc.\n\n"
+                   "Subheading\n\nSome “more text”\n\n1\n\n\f",
     "zen_of_python_corrupted.pdf": "Mai 30, 18 13:27\n\nzen_of_python.txt",
     "contrib/issue_566_test_1.pdf": "ISSUE Date：2019-4-25 Buyer：黎荣",
     "contrib/issue_566_test_2.pdf": "甲方：中国饮料有限公司（盖章）",
@@ -64,6 +68,11 @@ def test_simple4_with_string(self):
         s = run_with_string(test_file)
         self.assertEqual(s, test_strings[test_file])
 
+    def test_simple5_with_string(self):
+        test_file = "simple5.pdf"
+        s = run_with_string(test_file)
+        self.assertEqual(s, test_strings[test_file])
+
     def test_simple1_with_file(self):
         test_file = "simple1.pdf"
         s = run_with_file(test_file)
@@ -84,6 +93,11 @@ def test_simple4_with_file(self):
         s = run_with_file(test_file)
         self.assertEqual(s, test_strings[test_file])
 
+    def test_simple5_with_file(self):
+        test_file = "simple5.pdf"
+        s = run_with_file(test_file)
+        self.assertEqual(s, test_strings[test_file])
+
     def test_zlib_corrupted(self):
         test_file = "zen_of_python_corrupted.pdf"
         s = run_with_file(test_file)