diff --git a/.github/scripts/docs.sh b/.github/scripts/docs.sh
new file mode 100755
index 00000000..6c3a6393
--- /dev/null
+++ b/.github/scripts/docs.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Build the Sphinx HTML docs with warnings as errors; exit 0 on success, 1 on failure. PROJECT_DIR must be set by the caller.
+echo '#######################################################################'
+echo '# Building docs #'
+echo '#######################################################################'
+
+export SPHINXOPTS="-W" # Treat warnings as errors
+
+make --directory "$PROJECT_DIR/docs" html
+# The path above is quoted so a PROJECT_DIR containing spaces or glob characters stays a single argument.
+DOCS_STATUS=$? # Read immediately after make, before any other command overwrites $?
+if [[ ("$DOCS_STATUS" == 0) ]]; then
+ echo '#######################################################################'
+ echo '# Build succeded #'
+ echo '#######################################################################'
+ exit 0
+else
+ echo ''
+ echo '#######################################################################'
+ echo '# Build failed ! #'
+ echo '#######################################################################'
+ exit 1
+fi
diff --git a/.github/scripts/lint.sh b/.github/scripts/lint.sh
index 2182cb0b..2445472e 100755
--- a/.github/scripts/lint.sh
+++ b/.github/scripts/lint.sh
@@ -50,19 +50,19 @@ fi
echo ''
echo '#######################################################################'
-echo '# Running PyType #'
+echo '# Running PyType #'
echo '#######################################################################'
pytype $PROJECT_DIR --disable=pyi-error,import-error
PYTYPE_STATUS=$?
if [[ ("$PYTYPE_STATUS" == 0) ]]; then
echo '#######################################################################'
- echo '# PyType succeded #'
+ echo '# PyType succeded #'
echo '#######################################################################'
else
echo ''
echo '#######################################################################'
- echo '# PyType failed ! #'
+ echo '# PyType failed ! #'
echo '#######################################################################'
LINTERS_FAILED=1
fi
diff --git a/.github/workflows/continuous-integration.yml b/.github/workflows/continuous-integration.yml
index 97266b04..0e94bd27 100644
--- a/.github/workflows/continuous-integration.yml
+++ b/.github/workflows/continuous-integration.yml
@@ -10,11 +10,12 @@ jobs:
with:
username: "${{ secrets.DOCKER_USERNAME }}"
password: "${{ secrets.DOCKER_PASSWORD }}"
- image_name: py-pdf-parser/py-pdf-parser-test
+ image_name: jstockwin/py-pdf-parser-test
image_tag: test
dockerfile: dockerfiles/Dockerfile_tests
- push_image_and_stages: false
- name: Run linting
- run: docker run --rm py-pdf-parser/py-pdf-parser-test:test .github/scripts/lint.sh
+ run: docker run --rm jstockwin/py-pdf-parser-test:test .github/scripts/lint.sh
- name: Run test
- run: docker run --rm py-pdf-parser/py-pdf-parser-test:test .github/scripts/test.sh
+ run: docker run --rm jstockwin/py-pdf-parser-test:test .github/scripts/test.sh
+ - name: Check docs build correctly
+ run: docker run --rm jstockwin/py-pdf-parser-test:test .github/scripts/docs.sh
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 61d12be8..ba5ba5eb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,8 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Added
-- Documentation is now hosted [here](https://py-pdf-parser.readthedocs.io/en/latest/).
+- Documentation is now hosted [here](https://py-pdf-parser.readthedocs.io/en/latest/). ([#71](https://github.com/jstockwin/py-pdf-parser/pull/71))
+- Added new examples to the documentation. ([#74](https://github.com/jstockwin/py-pdf-parser/pull/74))
- Font filtering now caches the elements by font. ([#73](https://github.com/jstockwin/py-pdf-parser/pull/73)) (updated in [#78](https://github.com/jstockwin/py-pdf-parser/pull/78))
+- Font filtering now caches the elements by font. ([#73](https://github.com/jstockwin/py-pdf-parser/pull/73))
### Changed
- This product is now complete enough for the needs of Optimor Ltd, however `jstockwin` is going to continue development as a personal project. The repository has been moved from `optimor/py-pdf-parser` to `jstockwin/py-pdf-parser`.
diff --git a/docker-compose.yml b/docker-compose.yml
index 2d772008..f893b570 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -33,9 +33,22 @@ tests:
- .:/py-pdf-parser
command: bash -c "nosetests ."
+# Run docs to re-build the docs once.
docs:
build: .
dockerfile: dockerfiles/Dockerfile_tests
volumes:
- .:/py-pdf-parser
command: make --directory docs html
+ environment:
+ - SPHINXOPTS="-W"
+
+# Use "up" to host the docs on port 8000, watching for changes.
+docs-autobuild:
+ build: .
+ dockerfile: dockerfiles/Dockerfile_tests
+ volumes:
+ - .:/py-pdf-parser
+ ports:
+ - "8000:8000"
+ command: make --directory docs livehtml
diff --git a/docs/Makefile b/docs/Makefile
index d0c3cbf1..be546683 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -14,6 +14,9 @@ help:
.PHONY: help Makefile
+# Serve the HTML docs with sphinx-autobuild, listening on all interfaces (0.0.0.0).
+# Declared phony because it names a command, not a file this rule produces.
+.PHONY: livehtml
+livehtml:
+	sphinx-autobuild --host 0.0.0.0 -b html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
diff --git a/docs/source/CHANGELOG.md b/docs/source/CHANGELOG.md
new file mode 120000
index 00000000..699cc9e7
--- /dev/null
+++ b/docs/source/CHANGELOG.md
@@ -0,0 +1 @@
+../../CHANGELOG.md
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 9719b43b..467cd81f 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -32,6 +32,7 @@
"sphinx.ext.autodoc",
"sphinx.ext.napoleon",
"sphinx_rtd_theme",
+ "recommonmark",
]
# Add any paths that contain templates here, relative to this directory.
@@ -55,7 +56,7 @@
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ["_static"]
+html_static_path = []
# -- Extension configuration -------------------------------------------------
diff --git a/docs/source/example_files/order_summary.pdf b/docs/source/example_files/order_summary.pdf
new file mode 100644
index 00000000..89c94bed
Binary files /dev/null and b/docs/source/example_files/order_summary.pdf differ
diff --git a/docs/source/example_files/simple_memo.pdf b/docs/source/example_files/simple_memo.pdf
new file mode 100644
index 00000000..4a0a28e8
Binary files /dev/null and b/docs/source/example_files/simple_memo.pdf differ
diff --git a/docs/source/example_files/tables.pdf b/docs/source/example_files/tables.pdf
new file mode 100644
index 00000000..f020aee7
Binary files /dev/null and b/docs/source/example_files/tables.pdf differ
diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst
new file mode 100644
index 00000000..873761a8
--- /dev/null
+++ b/docs/source/examples/index.rst
@@ -0,0 +1,15 @@
+Examples
+========
+
+Below you can find links to the following examples:
+
+- The :ref:`simple-memo` example shows the very basics of using py-pdf-parser. You will see how to load a pdf document, start filtering the elements, and extract text from certain elements in the document.
+- The :ref:`order-summary` example explains how to use font mappings, sections, and how to extract simple tables.
+- The :ref:`more-tables` example explains tables in more detail, showing how to extract more complex tables.
+
+.. toctree::
+
+ simple_memo
+ order_summary
+ more_tables
+
diff --git a/docs/source/examples/more_tables.rst b/docs/source/examples/more_tables.rst
new file mode 100644
index 00000000..451daf01
--- /dev/null
+++ b/docs/source/examples/more_tables.rst
@@ -0,0 +1,270 @@
+.. _more-tables:
+
+More Tables
+-----------
+
+In this example, we will learn how to extract different types of table, and the difference between a simple table and more complicated ones.
+
+You can :download:`download the example here </example_files/tables.pdf>`.
+
+Please read the :ref:`order-summary` example first, as this covers some other functionality of the table extraction methods.
+
+Load the file
+.............
+
+The following code (click "show code" below to see it) loads the file, and assigns the elements for each table to a variable. If this does not make sense, you should go back and look at some of the previous examples.
+
+.. raw:: html
+
+   <details>
+   <summary><a>Show code</a></summary>
+
+.. code-block:: python
+
+ from py_pdf_parser.loaders import load_file
+
+ FONT_MAPPING = {
+ "BAAAAA+LiberationSerif-Bold,12.0": "header",
+ "CAAAAA+LiberationSerif,12.0": "table_element",
+ }
+ document = load_file("tables.pdf", font_mapping=FONT_MAPPING)
+
+ headers = document.elements.filter_by_font("header")
+
+ # Extract reference elements
+ simple_table_header = headers.filter_by_text_equal(
+ "Simple Table"
+ ).extract_single_element()
+
+ simple_table_with_gaps_header = headers.filter_by_text_equal(
+ "Simple Table with gaps"
+ ).extract_single_element()
+
+ simple_table_with_gaps_in_first_row_col_header = headers.filter_by_text_equal(
+ "Simple Table with gaps in first row/col"
+ ).extract_single_element()
+
+ non_simple_table_header = headers.filter_by_text_equal(
+ "Non Simple Table"
+ ).extract_single_element()
+
+ non_simple_table_with_merged_cols_header = headers.filter_by_text_equal(
+ "Non Simple Table with Merged Columns"
+ ).extract_single_element()
+
+ non_simple_table_with_merged_rows_header = headers.filter_by_text_equal(
+ "Non Simple Table with Merged Rows and Columns"
+ ).extract_single_element()
+
+ over_the_page_header = headers.filter_by_text_equal(
+ "Over the page"
+ ).extract_single_element()
+
+ # Extract table elements
+ simple_table_elements = document.elements.between(
+ simple_table_header, simple_table_with_gaps_header
+ )
+ simple_table_with_gaps_elements = document.elements.between(
+ simple_table_with_gaps_header, simple_table_with_gaps_in_first_row_col_header
+ )
+
+ simple_table_with_gaps_in_first_row_col_elements = document.elements.between(
+ simple_table_with_gaps_in_first_row_col_header, non_simple_table_header
+ )
+
+ non_simple_table_elements = document.elements.between(
+ non_simple_table_header, non_simple_table_with_merged_cols_header
+ )
+
+ non_simple_table_with_merged_cols_elements = document.elements.between(
+ non_simple_table_with_merged_cols_header, non_simple_table_with_merged_rows_header
+ )
+
+ non_simple_table_with_merged_rows_and_cols_elements = document.elements.between(
+ non_simple_table_with_merged_rows_header, over_the_page_header
+ )
+
+ over_the_page_elements = document.elements.after(over_the_page_header)
+
+.. raw:: html
+
+   </details>
+
+Overview
+........
+
+The tables in the example pdf are split into "Simple Tables" and "Non Simple Tables". For the simple tables, we will be able to use :meth:`~py_pdf_parser.tables.extract_simple_table`, otherwise we must use :meth:`~py_pdf_parser.tables.extract_table`. The former is much more efficient, and should be used when possible.
+
+In general, tables can become more complicated by having missing cells, or merged cells which go across multiple columns or multiple rows. In both cases, you will have to pass additional parameters to stop exceptions being raised when this is the case. This is to make the extraction more robust, and protect against unexpected outcomes.
+
+To use :meth:`~py_pdf_parser.tables.extract_simple_table` we must have at least one column and one row which have no missing cells, and we must have no merged cells at all. We will need to know which row/column has no missing cells, as these must be passed as the reference row and column.
+
+To understand why: for each column element in the reference row and each row element in the reference column, :meth:`~py_pdf_parser.tables.extract_simple_table` will scan across from the row element (to get the row) and up/down from the column element (to get the column), and see if there is an element there. If there is, it is added to the table. Therefore, if there are gaps in the reference row/column, other elements may get missed. There is a check for this, so an exception will be raised if this is the case.
+
+This means :meth:`~py_pdf_parser.tables.extract_simple_table` takes time proportional to ``len(cols) + len(rows)``. Conversely, :meth:`~py_pdf_parser.tables.extract_table` is at least ``len(cols) * len(rows)``, and if there are merged cells it will be even worse. (Note in reality the complexity is not quite this simple, but it should give you an idea of the difference.)
+
+Below, we will work through increasingly complex examples to explain the functionality, and the steps involved.
+
+Simple Table
+............
+
+This table is as simple as they come - there are no blank or merged cells. This means we can simply use :meth:`~py_pdf_parser.tables.extract_simple_table` as we have seen previously.
+
+.. code-block:: python
+
+ from py_pdf_parser import tables
+ table = tables.extract_simple_table(simple_table_elements, as_text=True)
+
+::
+
+ >>> table
+ [['Heading 1', 'Heading 2', 'Heading 3', 'Heading 4'], ['A', '1', 'A', '1'], ['B', '2', 'B', '2'], ['C', '3', 'C', '3']]
+
+Simple Table with gaps
+......................
+
+This table has gaps, however there are no gaps in the first row or column. These are the default reference row and column, and so :meth:`~py_pdf_parser.tables.extract_simple_table` will still work as expected. Blank cells will be empty strings if ``as_text=True``, and otherwise they will be ``None``. However, if we try the same code as above:
+
+.. code-block:: python
+
+ table = tables.extract_simple_table(
+ simple_table_with_gaps_elements, as_text=True
+ )
+
+this will raise an exception:
+
+::
+
+ py_pdf_parser.exceptions.TableExtractionError: Element not found, there appears to be a gap in the table. If this is expected, pass allow_gaps=True.
+
+This is to allow py-pdf-parser to be more robust in the case that you're expecting your table to have no empty cells. As the error message says, since this is expected behaviour we can simply pass ``allow_gaps=True``.
+
+.. code-block:: python
+
+ table = tables.extract_simple_table(
+ simple_table_with_gaps_elements, as_text=True, allow_gaps=True
+ )
+
+::
+
+ >>> table
+ [['Heading 1', 'Heading 2', 'Heading 3', 'Heading 4'], ['A', '1', '', '1'], ['B', '', '', ''], ['C', '', 'C', '3']]
+
+Simple Table with gaps in first row/col
+.......................................
+
+This table is similar to the above example, but now we have gaps in the first row and the first column (if either of these were true then the above wouldn't work). If we try the above code, a useful exception is raised:
+
+.. code-block:: python
+
+ table = tables.extract_simple_table(
+ simple_table_with_gaps_in_first_row_col_elements, as_text=True, allow_gaps=True
+ )
+
+::
+
+ py_pdf_parser.exceptions.TableExtractionError: Number of elements in table (9) does not match number of elements passed (12). Perhaps try extract_table instead of extract_simple_table, or change you reference element.
+
+The error message suggests either passing another reference element, or using the more complicated :meth:`~py_pdf_parser.tables.extract_table` method. In this case, as we still have a row and a column which have no missing cells, we can just pass a new reference element.
+
+As such, we can use the second column and the last row as our references, as neither of these have missing cells. The reference row and column are specified by simply passing the unique element in both the reference row and the reference column (called the reference element). In this case, it's the first number "3" in the table. Here we will be lazy and simply use the fact that this is the 10th element in the table, but you should probably do something smarter.
+
+.. code-block:: python
+
+ reference_element = simple_table_with_gaps_in_first_row_col_elements[9]
+ table = tables.extract_simple_table(
+ simple_table_with_gaps_in_first_row_col_elements,
+ as_text=True,
+ allow_gaps=True,
+ reference_element=reference_element,
+ )
+
+::
+
+ >>> table
+ [['Heading 1', 'Heading 2', '', 'Heading 4'], ['', '1', 'A', ''], ['B', '2', '', '2'], ['C', '3', 'C', '3']]
+
+Non Simple Table
+................
+
+The next table does not have any row with no empty cells, and as such we must use :meth:`~py_pdf_parser.tables.extract_table`. There is no ``allow_gaps`` parameter for this method, since if you don't want to allow gaps you should be using :meth:`~py_pdf_parser.tables.extract_simple_table` instead.
+
+Whilst the below may seem easier than working out the reference element in the above example, please note that it will be computationally slower.
+
+.. code-block:: python
+
+ table = tables.extract_table(non_simple_table_elements, as_text=True)
+
+::
+
+ >>> table
+ [['', 'Heading 2', 'Heading 3', 'Heading 4'], ['A', '1', '', '1'], ['B', '', 'B', '2'], ['C', '3', 'C', '']]
+
+
+Non Simple Table with Merged Columns
+....................................
+
+This table has text which goes across multiple columns. If we naively run this as above:
+
+.. code-block:: python
+
+ table = tables.extract_table(non_simple_table_with_merged_cols_elements, as_text=True)
+
+then we get an exception:
+
+::
+
+ py_pdf_parser.exceptions.TableExtractionError: An element is in multiple columns. If this is expected, you can try passing fix_element_in_multiple_cols=True
+
+Just like ``allow_gaps``, this is so we can be more robust in the case that this is not expected. The error helpfully suggests to try passing ``fix_element_in_multiple_cols=True``.
+
+.. code-block:: python
+
+ table = tables.extract_table(
+ non_simple_table_with_merged_cols_elements,
+ as_text=True,
+ fix_element_in_multiple_cols=True,
+ )
+
+::
+
+ >>> table
+ [['Heading 1', 'Heading 2', 'Heading 3', 'Heading 4'], ['A', '1', 'A', '1'], ['This text spans across multiple columns', '', 'B', '2'], ['C', '3', 'C', '3']]
+
+Note that the merged cell has been pushed into the left-most column. Likewise, if we had a cell that was merged across multiple rows, we could pass ``fix_element_in_multiple_rows=True``, and it would be pushed into the top row.
+
+Non Simple Table with Merged Rows and Columns
+.............................................
+
+In this case we have both merged rows and merged columns. We can pass both ``fix_element_in_multiple_rows=True`` and ``fix_element_in_multiple_cols=True``. The merged cell will be pushed into the left-most column and the top row.
+
+.. code-block:: python
+
+ table = tables.extract_table(
+ non_simple_table_with_merged_rows_and_cols_elements,
+ as_text=True,
+ fix_element_in_multiple_rows=True,
+ fix_element_in_multiple_cols=True,
+ )
+
+::
+
+ >>> table
+ [['Heading 1', 'Heading 2', 'Heading 3', 'Heading 4'], ['This text spans across multiple rows and \nmultiple columns.', '', 'A', '1'], ['', '', 'B', '2'], ['C', '3', 'C', '3']]
+
+
+Over the page
+.............
+
+The final table goes over the page break. This is not a problem, simply pass the elements within the table and the result should be correct.
+
+If you had e.g. a footer that broke the table in two, simply ensure these elements are not included in the element list you pass to :meth:`~py_pdf_parser.tables.extract_table`, and again it should still work.
+
+.. code-block:: python
+
+ table = tables.extract_simple_table(over_the_page_elements, as_text=True)
+
+::
+
+ >>> table
+ [['Heading 1', 'Heading 2', 'Heading 3', 'Heading 4'], ['A', '1', 'A', '1'], ['B', '2', 'B', '2'], ['C', '3', 'C', '3']]
diff --git a/docs/source/examples/order_summary.rst b/docs/source/examples/order_summary.rst
new file mode 100644
index 00000000..4bda4f13
--- /dev/null
+++ b/docs/source/examples/order_summary.rst
@@ -0,0 +1,229 @@
+.. _order-summary:
+
+Order Summary
+-------------
+
+In this example we will extract some tabular data from an order summary pdf.
+
+You can :download:`download the example here </example_files/order_summary.pdf>`.
+
+This is a fairly simple PDF, and as such it would be fairly easy to identify the tables and extract the data from them, however we will use this example to introduce font mappings and sections, which will come in useful for larger PDFs.
+
+Step 1 - Load the file
+......................
+
+We can :func:`load <py_pdf_parser.loaders.load_file>` the file as follows, and take a quick look using the :func:`visualise tool <py_pdf_parser.visualise.main.visualise>` to check it looks good.
+
+.. code-block:: python
+
+ from py_pdf_parser.loaders import load_file
+ from py_pdf_parser.visualise import visualise
+
+ document = load_file("order_summary.pdf")
+ visualise(document)
+
+This should show the following. We should check that py-pdf-parser has detected each element correctly, which in this case it has.
+
+.. image:: /screenshots/order_summary_example/initial.png
+ :height: 300px
+
+Step 2 - Use a font mapping
+...........................
+
+Each :class:`~py_pdf_parser.components.PDFElement` has a :attr:`~py_pdf_parser.components.PDFElement.font` property, which is the name of the font in the PDF document (including the font size). You can use fonts to help filter elements.
+
+Fonts often have long, not very useful names. However, additional keyword arguments passed to :func:`~py_pdf_parser.loaders.load_file` will be used to initialise the :class:`~py_pdf_parser.components.PDFDocument`. One of these is the font mapping, which allows you to map the fonts in your PDF to more useful names.
+
+The visualise tool allows you to inspect fonts. If you hover over an element, a summary will be shown in text at the bottom of the window. For example, in the image below we hover over the first cell in the table, and can see that the font is ``EAAAAA+FreeMono,12.0``.
+
+.. image:: /screenshots/order_summary_example/showing_font_1.png
+ :height: 300px
+
+We can easily ask to see all of the available fonts by running
+
+::
+
+ >>> set(element.font for element in document.elements)
+ {'EAAAAA+FreeMono,12.0', 'BAAAAA+LiberationSerif-Bold,16.0', 'CAAAAA+LiberationSerif,12.0', 'DAAAAA+FreeMonoBold,12.0', 'BAAAAA+LiberationSerif-Bold,12.0'}
+
+Using this and the visualise tool, we can now choose better names for each of the fonts, and then load the document again, but this time providing a font mapping.
+
+.. code-block:: python
+
+ FONT_MAPPING = {
+ "BAAAAA+LiberationSerif-Bold,16.0": "title",
+ "BAAAAA+LiberationSerif-Bold,12.0": "sub_title",
+ "CAAAAA+LiberationSerif,12.0": "text",
+ "DAAAAA+FreeMonoBold,12.0": "table_header",
+ "EAAAAA+FreeMono,12.0": "table_text",
+ }
+ document = load_file("order_summary.pdf", font_mapping=FONT_MAPPING)
+
+Using the visualise tool again, we can now see that our element's font has changed to ``table_text``, which is a much more useful name for us.
+
+.. image:: /screenshots/order_summary_example/showing_font_2.png
+ :height: 300px
+
+Step 3 - Add sections
+.....................
+
+Another thing we can do to make our job easier is to add :class:`Sections <py_pdf_parser.sectioning.Section>` to our document. A :class:`~py_pdf_parser.sectioning.Sectioning` class is made available on :attr:`document.sectioning <py_pdf_parser.components.PDFDocument.sectioning>`, which in particular allows us to call :meth:`~py_pdf_parser.sectioning.Sectioning.create_section`.
+
+A section has a name, and contains all elements between the start element and the end element. You can add multiple sections with the same name, but each section will have both a ``name`` and a ``unique_name`` (which is just the name with an additional ``_n`` on the end, where ``n`` is the number of sections with that name).
+
+As with the :class:`~py_pdf_parser.components.PDFDocument`, a :class:`~py_pdf_parser.sectioning.Section` has an :attr:`~py_pdf_parser.sectioning.Section.elements` property which returns an :class:`~py_pdf_parser.filtering.ElementList`, allowing you to filter the elements.
+
+.. important:: Never instantiate a :class:`~py_pdf_parser.sectioning.Section` yourself. You should always use :meth:`~py_pdf_parser.sectioning.Sectioning.create_section`.
+
+Calling :meth:`~py_pdf_parser.sectioning.Sectioning.create_section` will return the :class:`~py_pdf_parser.sectioning.Section`, but the :class:`~py_pdf_parser.sectioning.Sectioning` class also has :meth:`~py_pdf_parser.sectioning.Sectioning.get_section` and :meth:`~py_pdf_parser.sectioning.Sectioning.get_sections_with_name` methods.
+
+Going back to our example, we will create sections for the order summary table, and for the totals table. Our order summary table will start with the "Order Summary:" sub title and end at the "Totals:" sub title. Note that there are two elements on the page with text equal to "Order Summary:", however they have different fonts and so we can still extract exactly the one we want.
+
+
+.. image:: /screenshots/order_summary_example/zoomed.png
+ :height: 300px
+
+By default, :meth:`~py_pdf_parser.sectioning.Sectioning.create_section` will include the last element in the section, but this can be disabled by passing ``include_last_element=False``.
+
+The totals section will run from the "Totals:" sub title, until the end of the document. An :class:`~py_pdf_parser.filtering.ElementList` (e.g. ``document.elements``) acts like a set of elements, but it does also define an order, and as such we can access the last element in the :class:`~py_pdf_parser.filtering.ElementList` by simply doing ``document.elements[-1]``.
+
+.. code-block:: python
+
+ order_summary_sub_title_element = (
+ document.elements.filter_by_font("sub_title")
+ .filter_by_text_equal("Order Summary:")
+ .extract_single_element()
+ )
+
+ totals_sub_title_element = (
+ document.elements.filter_by_font("sub_title")
+ .filter_by_text_equal("Totals:")
+ .extract_single_element()
+ )
+
+ final_element = document.elements[-1]
+
+ order_summary_section = document.sectioning.create_section(
+ name="order_summary",
+ start_element=order_summary_sub_title_element,
+ end_element=totals_sub_title_element,
+ include_last_element=False,
+ )
+
+Again, the visualise tool is helpful to check everything worked as expected, as it will draw a border around all of our sections:
+
+.. image:: /screenshots/order_summary_example/sections.png
+ :height: 300px
+
+Step 4 - Extract tables
+.......................
+
+Now we have mapped our fonts and added some sections, we'd like to extract the table. In this case, we are able to use :meth:`~py_pdf_parser.tables.extract_simple_table`. We need to pass this the elements which form our table, however currently our sections also include the sub titles, "Order Summary:" and "Totals:". We need to exclude these from the elements we pass to :meth:`~py_pdf_parser.tables.extract_simple_table`. We have a reference to the sub title elements, so we could simply use :meth:`~py_pdf_parser.filtering.ElementList.remove_element`. However, since the tables seem to have their own fonts, it may be more robust to use :meth:`~py_pdf_parser.filtering.ElementList.filter_by_fonts`.
+
+We will also pass ``as_text=True``, since we are interested in the text, not the :class:`PDFElements <py_pdf_parser.components.PDFElement>` themselves.
+
+.. code-block:: python
+
+ order_summary_table = tables.extract_simple_table(
+ order_summary_section.elements.filter_by_fonts("table_header", "table_text"),
+ as_text=True,
+ )
+
+ totals_table = tables.extract_simple_table(
+ totals_section.elements.filter_by_fonts("table_header", "table_text"), as_text=True
+ )
+
+This gives:
+
+::
+
+ >>> order_summary_table
+ [['Item', 'Unit Cost', 'Quantity', 'Cost'], ['Challenger 100g\nWhole Hops', '£3.29', '1', '£3.29'], ['Maris Otter \nPale Ale Malt \n(Crushed)', '£1.50/1000g', '4000g', '£6.00'], ['WLP037 \nYorkshire Ale \nYeast', '£7.08', '1', '£7.08'], ['Bottle Caps', '£1 per 100', '500', '£5']]
+
+ >>> totals_table
+ [['Subtotal:', '£26.28'], ['Shipping', '£6'], ['VAT 20%', '£6.45'], ['Total:', '£38.73']]
+
+As one final step, since the order summary table has a header row, we can make use of :meth:`~py_pdf_parser.tables.add_header_to_table`, which will change the list of lists to a list of dicts, mapping the header to the values in each row:
+
+.. code-block:: python
+
+ order_summary_with_header = tables.add_header_to_table(order_summary_table)
+
+::
+
+ >>> order_summary_with_header
+ [{'Item': 'Challenger 100g\nWhole Hops', 'Unit Cost': '£3.29', 'Quantity': '1', 'Cost': '£3.29'}, {'Item': 'Maris Otter \nPale Ale Malt \n(Crushed)', 'Unit Cost': '£1.50/1000g', 'Quantity': '4000g', 'Cost': '£6.00'}, {'Item': 'WLP037 \nYorkshire Ale \nYeast', 'Unit Cost': '£7.08', 'Quantity': '1', 'Cost': '£7.08'}, {'Item': 'Bottle Caps', 'Unit Cost': '£1 per 100', 'Quantity': '500', 'Cost': '£5'}]
+
+
+Full Code
+.........
+
+.. code-block:: python
+
+ from py_pdf_parser.loaders import load_file
+ from py_pdf_parser import tables
+
+ # from py_pdf_parser.visualise import visualise
+
+
+ # Step 1 - Load the file
+ document = load_file("order_summary.pdf")
+
+ # visualise(document)
+
+ # Step 2 - Use a font mapping
+
+ # Show all fonts:
+ # set(element.font for element in document.elements)
+
+ FONT_MAPPING = {
+ "BAAAAA+LiberationSerif-Bold,16.0": "title",
+ "BAAAAA+LiberationSerif-Bold,12.0": "sub_title",
+ "CAAAAA+LiberationSerif,12.0": "text",
+ "DAAAAA+FreeMonoBold,12.0": "table_header",
+ "EAAAAA+FreeMono,12.0": "table_text",
+ }
+ document = load_file("order_summary.pdf", font_mapping=FONT_MAPPING)
+
+ # visualise(document)
+
+ # Step 3 - Add sections
+ order_summary_sub_title_element = (
+ document.elements.filter_by_font("sub_title")
+ .filter_by_text_equal("Order Summary:")
+ .extract_single_element()
+ )
+
+ totals_sub_title_element = (
+ document.elements.filter_by_font("sub_title")
+ .filter_by_text_equal("Totals:")
+ .extract_single_element()
+ )
+
+ final_element = document.elements[-1]
+
+ order_summary_section = document.sectioning.create_section(
+ name="order_summary",
+ start_element=order_summary_sub_title_element,
+ end_element=totals_sub_title_element,
+ include_last_element=False,
+ )
+
+ totals_section = document.sectioning.create_section(
+ name="totals", start_element=totals_sub_title_element, end_element=final_element
+ )
+
+ # visualise(document)
+
+ # Step 4 - Extract tables
+
+ order_summary_table = tables.extract_simple_table(
+ order_summary_section.elements.filter_by_fonts("table_header", "table_text"),
+ as_text=True,
+ )
+
+ totals_table = tables.extract_simple_table(
+ totals_section.elements.filter_by_fonts("table_header", "table_text"), as_text=True
+ )
+
+ order_summary_with_header = tables.add_header_to_table(order_summary_table)
diff --git a/docs/source/examples/simple_memo.rst b/docs/source/examples/simple_memo.rst
new file mode 100644
index 00000000..25622721
--- /dev/null
+++ b/docs/source/examples/simple_memo.rst
@@ -0,0 +1,167 @@
+.. _simple-memo:
+
+Simple Memo
+-----------
+
+Our first example will be extracting information from a simple memo.
+
+You can :download:`download the example memo here </example_files/simple_memo.pdf>`.
+
+We will assume that your company always issues these memos in a consistent format, i.e. with the "TO", "FROM", "DATE", and "SUBJECT" fields followed by the main content of the memo. We would like to write some code such that we can extract the information from each memo.
+
+Step 1 - Load the file
+......................
+
+First, we should load the file into a :class:`~py_pdf_parser.components.PDFDocument`, using :func:`~py_pdf_parser.loaders.load_file`:
+
+.. code-block:: python
+
+ from py_pdf_parser.loaders import load_file
+
+ document = load_file("simple_memo.pdf")
+
+To check the PDF loaded as expected, we can use the :func:`~py_pdf_parser.visualise.main.visualise` tool by running
+
+.. code-block:: python
+
+ from py_pdf_parser.visualise import visualise
+
+ visualise(document)
+
+This will open a matplotlib window which should look something like the following image:
+
+.. image:: /screenshots/simple_memo_example/visualise.png
+ :height: 300px
+
+Py-pdf-parser has extracted each element from the PDF as a :class:`~py_pdf_parser.components.PDFElement`, and is showing a blue box around each element. This is what we are looking for. Always check the visualise tool, since sometimes you will need to adjust the layout parameters so that the tool correctly identifies your elements. We will get on to this in later examples.
+
+Step 2 - Extract reference elements
+...................................
+
+Certain elements should be present in every memo. We will use these as reference elements to identify the elements which contain the information we are interested in. We already have our ``document``, which is a :class:`~py_pdf_parser.components.PDFDocument`. We can do :meth:`document.elements <py_pdf_parser.components.PDFDocument.elements>` to get a list (an :class:`~py_pdf_parser.filtering.ElementList`) of all the :class:`~py_pdf_parser.components.PDFElement` in the document, and also to allow us to filter the elements.
+
+The simplest way to extract the elements we are interested in is by text. There are many other options available to us, and a full list can be found on the :ref:`filtering reference page <filtering-reference>`.
+
+We will extract the "TO:", "FROM:", "DATE:" and "SUBJECT:" elements as reference elements, i.e. the elements on the left of the below image. We will then search to the right of each of them in turn, to extract the values for each field.
+
+.. image:: /screenshots/simple_memo_example/top.png
+ :height: 200px
+
+To extract the element which says "TO:", we can simply run :meth:`document.elements.filter_by_text_equal("TO:") <py_pdf_parser.filtering.ElementList.filter_by_text_equal>`. This returns a new :class:`~py_pdf_parser.filtering.ElementList` which contains all the elements in the document with text equal to "TO:". In this case, there should only be one element in the list. We could just use ``[0]`` on the element list to access the element in question, however, there is a convenience function, :func:`~py_pdf_parser.filtering.ElementList.extract_single_element` on the :class:`~py_pdf_parser.filtering.ElementList` class to handle this case. This essentially checks if the list has a single element and returns the element for you, otherwise it raises an exception. Use of this is encouraged to make your code more robust and to make any errors more explicit.
+
+.. code-block:: python
+
+ to_element = document.elements.filter_by_text_equal("TO:").extract_single_element()
+ from_element = document.elements.filter_by_text_equal("FROM:").extract_single_element()
+ date_element = document.elements.filter_by_text_equal("DATE:").extract_single_element()
+ subject_element = document.elements.filter_by_text_equal(
+ "SUBJECT:"
+ ).extract_single_element()
+
+Each of the above elements will be a :class:`~py_pdf_parser.components.PDFElement`.
+
+Step 3 - Extract the data
+.........................
+
+In the above section we have extracted our reference elements. We can now use these to do some more filtering to extract the data we want. In particular, we can use :func:`~py_pdf_parser.filtering.ElementList.to_the_right_of`, which will extract elements directly to the right of a given element. It effectively draws a dotted line from the top and bottom of your element out to the right hand side of the page, and any elements which are partially within the box created by the dotted line will be returned. To extract the text from a :class:`~py_pdf_parser.components.PDFElement`, we must also call :func:`.text() <py_pdf_parser.components.PDFElement.text>`.
+
+.. code-block:: python
+
+ to_text = document.elements.to_the_right_of(to_element).extract_single_element().text()
+ from_text = (
+ document.elements.to_the_right_of(from_element).extract_single_element().text()
+ )
+ date_text = (
+ document.elements.to_the_right_of(date_element).extract_single_element().text()
+ )
+ subject_text_element = document.elements.to_the_right_of(
+ subject_element
+ ).extract_single_element()
+ subject_text = subject_text_element.text()
+
+Note we keep a reference to the subject text element. This is because we will use it later.
+
+We have now extracted the data from the top of the memo, for example ``to_text`` will be ``"All Developers"``. The code does not rely on who the memo is to, and so it should still work for a memo with different values.
+
+The last thing we need to do is extract the content of the memo. In our example there is only one paragraph, and so only one element, but if there were multiple paragraphs there could be multiple elements. There are a few ways to do this. It is probably the case that all the content elements are below the "SUBJECT:" element, however if the text started too far to the right this may not be the case. Instead, we can just use :func:`~py_pdf_parser.filtering.ElementList.after` to filter for elements strictly after the ``subject_element``. Note that this also picks up the ``subject_text_element``, so the subject appears as the first line of the content; pass ``subject_text_element`` to ``after`` instead if you want to exclude it:
+
+.. code-block:: python
+
+ content_elements = document.elements.after(subject_element)
+ content_text = "\n".join(element.text() for element in content_elements)
+
+That is now everything extracted from the memo. We can wrap our output into any data structure we fancy, for example json:
+
+.. code-block:: python
+
+ output = {
+ "to": to_text,
+ "from": from_text,
+ "date": date_text,
+ "subject": subject_text,
+ "content": content_text,
+ }
+
+Full Code
+.........
+
+Here is the full script constructed above:
+
+.. code-block:: python
+
+ from py_pdf_parser.loaders import load_file
+
+ # Step 1 - Load the document
+ document = load_file("simple_memo.pdf")
+
+ # We could visualise it here to check it looks correct:
+ # from py_pdf_parser.visualise import visualise
+ # visualise(document)
+
+ # Step 2 - Extract reference elements:
+ to_element = document.elements.filter_by_text_equal("TO:").extract_single_element()
+ from_element = document.elements.filter_by_text_equal("FROM:").extract_single_element()
+ date_element = document.elements.filter_by_text_equal("DATE:").extract_single_element()
+ subject_element = document.elements.filter_by_text_equal(
+ "SUBJECT:"
+ ).extract_single_element()
+
+ # Step 3 - Extract the data
+ to_text = document.elements.to_the_right_of(to_element).extract_single_element().text()
+ from_text = (
+ document.elements.to_the_right_of(from_element).extract_single_element().text()
+ )
+ date_text = (
+ document.elements.to_the_right_of(date_element).extract_single_element().text()
+ )
+ subject_text_element = document.elements.to_the_right_of(
+ subject_element
+ ).extract_single_element()
+ subject_text = subject_text_element.text()
+
+ content_elements = document.elements.after(subject_element)
+ content_text = "\n".join(element.text() for element in content_elements)
+
+ output = {
+ "to": to_text,
+ "from": from_text,
+ "date": date_text,
+ "subject": subject_text,
+ "content": content_text,
+ }
+
+This gives:
+::
+
+ >>> from pprint import pprint
+ >>> pprint(output)
+
+ {'content': 'A new PDF Parsing tool\n'
+ 'There is a new PDF parsing tool available, called py-pdf-parser - '
+ 'you should all check it out!\n'
+ 'I think it could really help you extract that data we need from '
+ 'those PDFs.',
+ 'date': '1st January 2020',
+ 'from': 'John Smith',
+ 'subject': 'A new PDF Parsing tool',
+ 'to': 'All Developers'}
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 9c7fa6e5..73140af2 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -6,4 +6,6 @@ Welcome to PDF Parser's documentation!
:caption: Contents:
overview
+ examples/index
reference/index
+ CHANGELOG.md
diff --git a/docs/source/overview.rst b/docs/source/overview.rst
index ff780d3a..83dc9997 100644
--- a/docs/source/overview.rst
+++ b/docs/source/overview.rst
@@ -4,65 +4,45 @@ Overview
Introduction
------------
-This PDF Parser is a tool built on top of PDF Miner to help extracting information from
-PDFs in Python. The main idea was to create a tool that could be driven by code to
-interact with the elements on the PDF and slowly classify them by creating sections
-and adding tags to them. It also comes with a helpful visualisation tool which enables
-you to examine the current status of your elements.
+This PDF Parser is a tool built on top of PDF Miner to help extracting information from PDFs in Python. The main idea was to create a tool that could be driven by code to interact with the elements on the PDF and slowly classify them by creating sections and adding tags to them. It also comes with a helpful visualisation tool which enables you to examine the current status of your elements.
-This page gives a brief overview of the PDF Parser, but there is also a full
-:doc:`reference/index` of all the functionality.
+This page gives a brief overview of the PDF Parser, but there is also a full :doc:`reference/index` of all the functionality. You may get a more in-depth overview by looking at the :doc:`examples/index`.
Setup
-----
-At the moment you will need to install it from github. You will also need to manually
-install matplotlib and PyQt5 using apt. We are working on this.
+At the moment you will need to install it from GitHub, using ``pip install git+https://github.com/jstockwin/py-pdf-parser.git@master#egg=py-pdf-parser``. We hope to publish to PyPI soon.
+
+When Should I Use Py PDF Parser?
+--------------------------------
+
+Py PDF Parser is for extracting specific, structured data from a PDF. The code you write should then work for any other PDF which has the same format.
+
+If you're simply trying to extract all of the text from a PDF, other tools (e.g. https://textract.readthedocs.io/en/stable/python_package.html) may be more appropriate.
+
+If you're trying to extract specific tables from a certain PDF, other tools (e.g. https://camelot-py.readthedocs.io/en/master/) may be more appropriate.
Loading A PDF
-------------
-To load a PDF, use the ``load_file`` function from the :doc:`reference/loaders`. You
-will need to use ``load_file`` with a file path to be able to use the visualisation
-tool with your PDF as the background. If you don't have this, you can instead use the
-``load`` function, but when you use the visualisation tool there will be no background.
+To load a PDF, use the :func:`~py_pdf_parser.loaders.load_file` function from the :doc:`reference/loaders`. You will need to use :func:`~py_pdf_parser.loaders.load_file` with a file path to be able to use the visualisation tool with your PDF as the background. If you don't have this, you can instead use the :func:`~py_pdf_parser.loaders.load` function, but when you use the visualisation tool there will be no background.
-We order the elements in a pdf, left-to-right, top-to-bottom. At the moment, this is
-not configurable. Each ``PDFElement`` within the ``PDFDocument`` are aware of their
-position, both on the page and within the document, and also have properties allowing
-you to access their font and text. For more information about ``PDFDocument`` and
-``PDFElement``, see :doc:`reference/components`.
+We order the elements in a pdf, left-to-right, top-to-bottom. At the moment, this is not configurable. Each :class:`~py_pdf_parser.components.PDFElement` within the :class:`~py_pdf_parser.components.PDFDocument` is aware of its position, both on the page and within the document, and also has properties allowing you to access its font and text. For more information about :class:`~py_pdf_parser.components.PDFDocument` and :class:`~py_pdf_parser.components.PDFElement`, see :doc:`reference/components`.
-Pay particular attention to the ``la_params`` argument. These will need to be
-fine-tuned for your PDF. We suggest immediately visualising your PDF using the
-visualisation tool to see how the elements have been grouped. If multiple elements
-have been counted as one, or vice versa, you should be able to fix this by tweaking
-the ``la_params``.
+Pay particular attention to the ``la_params`` argument. These will need to be fine-tuned for your PDF. We suggest immediately visualising your PDF using the visualisation tool to see how the elements have been grouped. If multiple elements have been counted as one, or vice versa, you should be able to fix this by tweaking the ``la_params``.
Filtering
---------
-Once you have loaded your PDF, say into a variable ``document``, you can start
-interacting with the elements. You can access all the elements by calling
-``document.elements``. You may now want to filter your elements, for example you could
-do ``document.elements.filter_by_text_equal("foo")`` to filter for all elements which
-say "foo". To view all available filters, have a look at the :doc:`reference/filtering`
-reference.
+Once you have loaded your PDF, say into a variable ``document``, you can start interacting with the elements. You can access all the elements by calling ``document.elements``. You may now want to filter your elements, for example you could do ``document.elements.filter_by_text_equal("foo")`` to filter for all elements which say "foo". To view all available filters, have a look at the :doc:`reference/filtering` reference.
-The ``document.elements`` object, and any filtered subset thereof, will be an
-``ElementList``. These act like sets of elements, and so you can union (``|``),
-intersect (``&``), difference (``-``) and symmetric difference (``^``) different
-filtered sets of elements.
+The ``document.elements`` object, and any filtered subset thereof, will be an :class:`~py_pdf_parser.filtering.ElementList`. These act like sets of elements, and so you can union (``|``), intersect (``&``), difference (``-``) and symmetric difference (``^``) different filtered sets of elements.
-You can also chain filters, which will do the same as intersecting multiple filters, for
-example ``document.elements.filter_by_text_equal("foo").filter_by_tag("bar")`` is the
-same as ``document.elements.filter_by_text_equal("foo") &
-document.elements.filter_by_tag("bar")``.
+You can also chain filters, which will do the same as intersecting multiple filters, for example ``document.elements.filter_by_text_equal("foo").filter_by_tag("bar")`` is the same as ``document.elements.filter_by_text_equal("foo") & document.elements.filter_by_tag("bar")``.
-If you believe you have filtered down to a single element, and would like to examine
-that element, you can call ``extract_single_element`` on your ``ElementList``. This will
-return said element, or raise an exception if there is not a single element in your
-list.
+If you believe you have filtered down to a single element, and would like to examine that element, you can call :meth:`~py_pdf_parser.filtering.ElementList.extract_single_element`. This will return said element, or raise an exception if there is not a single element in your list.
+
+You can see an example of filtering in the :ref:`simple-memo` example.
Classifying Elements
--------------------
@@ -73,47 +53,31 @@ There are three ways to classify elements:
- create sections
- mark certain elements as ignored
-To add a tag, you can simply call ``add_tag`` on an element. You can filter by tags.
+To add a tag, you can simply call :meth:`~py_pdf_parser.components.PDFElement.add_tag` on an :class:`~py_pdf_parser.components.PDFElement`, or :meth:`~py_pdf_parser.filtering.ElementList.add_tag_to_elements` on an :class:`~py_pdf_parser.filtering.ElementList`. You can filter by tags.
-To create a section, you can call ``document.sectioning.create_section``. See
-:doc:`reference/sectioning` for more information. When you create a section you simply
-specify a name for the section, and the start and end element for the section. Any
-elements between the start and end element will be included in your section. You can
-add multiple sections with the same name, and internally they will be given unique
-names. You can filter by either the non-unique ``section_name``, or by the unique
-sections. Elements can be in multiple sections.
+To create a section, you can call :meth:`~py_pdf_parser.sectioning.Sectioning.create_section`. See :doc:`reference/sectioning` for more information and the :ref:`order-summary` example for an example. When you create a section you simply specify a name for the section, and the start and end element for the section. Any elements between the start and end element will be included in your section. You can add multiple sections with the same name, and internally they will be given unique names. You can filter by either the non-unique ``section_name``, or by the unique sections. Elements can be in multiple sections.
-To mark an element as ignored, simply set the ``ignore`` property to ``True``. You can
-then remove all ignored elements by doing ``document.elements.exclude_ignored()``.
+To mark an element as ignored, simply set the ``ignore`` property to ``True``. Ignored elements will not be included in any :class:`~py_pdf_parser.filtering.ElementList`, however existing lists which you have assigned to variables will not be re-calculated and so may still include the ignored elements.
-To process a whole pdf, we suggest that you mark any elements you're not interested in
-as ignored, group any elements which are together into sections, and then add tags to
-important elements. You can then loop through filtered sets of elements to extract the
-information you would like.
+To process a whole pdf, we suggest that you mark any elements you're not interested in as ignored, group any elements which are together into sections, and then add tags to important elements. You can then loop through filtered sets of elements to extract the information you would like.
Visualisation Tool
------------------
-The PDF Parser comes with a visualisation tool. See the :doc:`reference/visualise`
-documentation. When you visualise your ``PDFDocument``, you'll be able to see each
-page of the document in turn, with every ``PDFElement`` highlighted. You can hover
-over the elements to see their sections, tags and whether they are ignored or not. This
-is very helpful for debugging any problems.
+The PDF Parser comes with a visualisation tool. See the :doc:`reference/visualise` documentation. When you visualise your :class:`~py_pdf_parser.components.PDFDocument`, you'll be able to see each page of the document in turn, with every :class:`~py_pdf_parser.components.PDFElement` highlighted. You can hover over the elements to see their sections, tags and whether they are ignored or not. This is very helpful for debugging any problems.
+
+You can use the arrow key icons to change page, and can press home to return to page 1. You can also use the scroll wheel on your mouse to zoom in and out.
-You can use the arrow key icons to change page, and can press home to return to page 1.
-You can also use the scroll wheel on your mouse to zoom in and out.
+You can see an example of the visualisation in the :ref:`simple-memo` and :ref:`order-summary` examples.
Font Mappings
-------------
-You can filter elements by font. The font will be taken from the PDF itself, however
-often they have long and confusing names. You can specify a ``font_mapping`` when
-you load the document to map these to more memorable names. See the
-:doc:`reference/components` reference for the ``PDFDocument`` arguments for more
-information.
+You can filter elements by font. The font will be taken from the PDF itself, however often they have long and confusing names. You can specify a ``font_mapping`` when you load the document to map these to more memorable names. See the :doc:`reference/components` reference for the :class:`~py_pdf_parser.components.PDFDocument` arguments for more information.
+
+You can see an example of font mapping in the :ref:`order-summary` example.
Tables
------
-We have many functions to help extract tables. All of these use the positioning of the
-elements on the page to do this. See :doc:`reference/tables`.
+We have many functions to help extract tables. All of these use the positioning of the elements on the page to do this. See the :doc:`reference/tables` reference, and the :ref:`order-summary` and :ref:`more-tables` examples.
diff --git a/docs/source/reference/filtering.rst b/docs/source/reference/filtering.rst
index a2385380..1ebf3447 100644
--- a/docs/source/reference/filtering.rst
+++ b/docs/source/reference/filtering.rst
@@ -1,3 +1,5 @@
+.. _filtering-reference:
+
Filtering
---------
diff --git a/docs/source/screenshots/order_summary_example/initial.png b/docs/source/screenshots/order_summary_example/initial.png
new file mode 100644
index 00000000..868cc941
Binary files /dev/null and b/docs/source/screenshots/order_summary_example/initial.png differ
diff --git a/docs/source/screenshots/order_summary_example/sections.png b/docs/source/screenshots/order_summary_example/sections.png
new file mode 100644
index 00000000..f19c2537
Binary files /dev/null and b/docs/source/screenshots/order_summary_example/sections.png differ
diff --git a/docs/source/screenshots/order_summary_example/showing_font_1.png b/docs/source/screenshots/order_summary_example/showing_font_1.png
new file mode 100644
index 00000000..41bab9f9
Binary files /dev/null and b/docs/source/screenshots/order_summary_example/showing_font_1.png differ
diff --git a/docs/source/screenshots/order_summary_example/showing_font_2.png b/docs/source/screenshots/order_summary_example/showing_font_2.png
new file mode 100644
index 00000000..b975d3c3
Binary files /dev/null and b/docs/source/screenshots/order_summary_example/showing_font_2.png differ
diff --git a/docs/source/screenshots/order_summary_example/zoomed.png b/docs/source/screenshots/order_summary_example/zoomed.png
new file mode 100644
index 00000000..9db7d1b7
Binary files /dev/null and b/docs/source/screenshots/order_summary_example/zoomed.png differ
diff --git a/docs/source/screenshots/simple_memo_example/top.png b/docs/source/screenshots/simple_memo_example/top.png
new file mode 100644
index 00000000..86d1339c
Binary files /dev/null and b/docs/source/screenshots/simple_memo_example/top.png differ
diff --git a/docs/source/screenshots/simple_memo_example/visualise.png b/docs/source/screenshots/simple_memo_example/visualise.png
new file mode 100644
index 00000000..8ee5549f
Binary files /dev/null and b/docs/source/screenshots/simple_memo_example/visualise.png differ
diff --git a/py_pdf_parser/tables.py b/py_pdf_parser/tables.py
index 00bd0eef..cbbeaf5a 100644
--- a/py_pdf_parser/tables.py
+++ b/py_pdf_parser/tables.py
@@ -199,9 +199,15 @@ def extract_table(
if fix_element_in_multiple_cols:
_fix_cols(cols, elements)
if sum([len(row) for row in rows]) != len(set(chain.from_iterable(rows))):
- raise TableExtractionError("An element is in multiple rows")
+ raise TableExtractionError(
+ "An element is in multiple rows. If this is expected, you can try passing "
+ "fix_element_in_multiple_rows=True"
+ )
if sum([len(col) for col in cols]) != len(set(chain.from_iterable(cols))):
- raise TableExtractionError("An element is in multiple columns")
+ raise TableExtractionError(
+ "An element is in multiple columns. If this is expected, you can try "
+ "passing fix_element_in_multiple_cols=True"
+ )
sorted_rows = sorted(
rows,
diff --git a/py_pdf_parser/tests/base.py b/py_pdf_parser/tests/base.py
index b7e4ed5c..12a7e5ce 100644
--- a/py_pdf_parser/tests/base.py
+++ b/py_pdf_parser/tests/base.py
@@ -1,5 +1,7 @@
from typing import List, Optional, Union, TYPE_CHECKING
+import logging
+
from unittest import TestCase
if TYPE_CHECKING:
@@ -9,6 +11,10 @@
from py_pdf_parser.filtering import ElementList
+# Turn off debug spam from pdfminer
+logging.getLogger("pdfminer").setLevel(logging.WARNING)
+
+
class BaseTestCase(TestCase):
# Helper functions
def assert_original_element_in(
diff --git a/py_pdf_parser/tests/test_doc_examples/__init__.py b/py_pdf_parser/tests/test_doc_examples/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/py_pdf_parser/tests/test_doc_examples/test_order_summary.py b/py_pdf_parser/tests/test_doc_examples/test_order_summary.py
new file mode 100644
index 00000000..c24d52da
--- /dev/null
+++ b/py_pdf_parser/tests/test_doc_examples/test_order_summary.py
@@ -0,0 +1,128 @@
+import os
+
+from py_pdf_parser.tests.base import BaseTestCase
+
+from py_pdf_parser import tables
+from py_pdf_parser.loaders import load_file
+
+
+class TestSimpleMemo(BaseTestCase):  # NOTE(review): name copied; TestOrderSummary?
+ def test_output_is_correct(self):
+ # The code below should match that in the documentation example "order_summary"
+ # Steps 1 and 2 - Load the document using a font mapping
+ file_path = os.path.join(
+ os.path.dirname(__file__),
+ "../../../docs/source/example_files/order_summary.pdf",
+ )
+ FONT_MAPPING = {
+ "BAAAAA+LiberationSerif-Bold,16.0": "title",
+ "BAAAAA+LiberationSerif-Bold,12.0": "sub_title",
+ "CAAAAA+LiberationSerif,12.0": "text",
+ "DAAAAA+FreeMonoBold,12.0": "table_header",
+ "EAAAAA+FreeMono,12.0": "table_text",
+ }
+ document = load_file(file_path, font_mapping=FONT_MAPPING)
+
+ # visualise(document)
+
+ # Step 3 - Add sections
+ order_summary_sub_title_element = (
+ document.elements.filter_by_font("sub_title")
+ .filter_by_text_equal("Order Summary:")
+ .extract_single_element()
+ )
+
+ totals_sub_title_element = (
+ document.elements.filter_by_font("sub_title")
+ .filter_by_text_equal("Totals:")
+ .extract_single_element()
+ )
+
+ final_element = document.elements[-1]
+
+ order_summary_section = document.sectioning.create_section(
+ name="order_summary",
+ start_element=order_summary_sub_title_element,
+ end_element=totals_sub_title_element,
+ include_last_element=False,
+ )
+
+ totals_section = document.sectioning.create_section(
+ name="totals",
+ start_element=totals_sub_title_element,
+ end_element=final_element,
+ )
+
+ # visualise(document)
+
+ # Step 4 - Extract tables
+
+ order_summary_table = tables.extract_simple_table(
+ order_summary_section.elements.filter_by_fonts(
+ "table_header", "table_text"
+ ),
+ as_text=True,
+ )
+
+ totals_table = tables.extract_simple_table(
+ totals_section.elements.filter_by_fonts("table_header", "table_text"),
+ as_text=True,
+ )
+
+ order_summary_with_header = tables.add_header_to_table(order_summary_table)
+
+ self.assertListEqual(
+ order_summary_table,
+ [
+ ["Item", "Unit Cost", "Quantity", "Cost"],
+ ["Challenger 100g\nWhole Hops", "£3.29", "1", "£3.29"],
+ [
+ "Maris Otter \nPale Ale Malt \n(Crushed)",
+ "£1.50/1000g",
+ "4000g",
+ "£6.00",
+ ],
+ ["WLP037 \nYorkshire Ale \nYeast", "£7.08", "1", "£7.08"],
+ ["Bottle Caps", "£1 per 100", "500", "£5"],
+ ],
+ )
+
+ self.assertListEqual(
+ totals_table,
+ [
+ ["Subtotal:", "£26.28"],
+ ["Shipping", "£6"],
+ ["VAT 20%", "£6.45"],
+ ["Total:", "£38.73"],
+ ],
+ )
+
+ self.assertListEqual(
+ order_summary_with_header,
+ [
+ {
+ "Item": "Challenger 100g\nWhole Hops",
+ "Unit Cost": "£3.29",
+ "Quantity": "1",
+ "Cost": "£3.29",
+ },
+ {
+ "Item": "Maris Otter \nPale Ale Malt \n(Crushed)",
+ "Unit Cost": "£1.50/1000g",
+ "Quantity": "4000g",
+ "Cost": "£6.00",
+ },
+ {
+ "Item": "WLP037 \nYorkshire Ale \nYeast",
+ "Unit Cost": "£7.08",
+ "Quantity": "1",
+ "Cost": "£7.08",
+ },
+ {
+ "Item": "Bottle Caps",
+ "Unit Cost": "£1 per 100",
+ "Quantity": "500",
+ "Cost": "£5",
+ },
+ ],
+ )
diff --git a/py_pdf_parser/tests/test_doc_examples/test_simple_memo.py b/py_pdf_parser/tests/test_doc_examples/test_simple_memo.py
new file mode 100644
index 00000000..cd57c003
--- /dev/null
+++ b/py_pdf_parser/tests/test_doc_examples/test_simple_memo.py
@@ -0,0 +1,82 @@
+import os
+
+from py_pdf_parser.tests.base import BaseTestCase
+from py_pdf_parser.loaders import load_file
+
+
+class TestSimpleMemo(BaseTestCase):
+ def test_output_is_correct(self):
+ # The code below should match that in the documentation example "simple_memo"
+ # Step 1 - Load the document
+ file_path = os.path.join(
+ os.path.dirname(__file__),
+ "../../../docs/source/example_files/simple_memo.pdf",
+ )
+ document = load_file(file_path)
+
+ # We could visualise it here to check it looks correct:
+ # from py_pdf_parser.visualise import visualise
+ # visualise(document)
+
+ # Step 2 - Extract reference elements:
+ to_element = document.elements.filter_by_text_equal(
+ "TO:"
+ ).extract_single_element()
+ from_element = document.elements.filter_by_text_equal(
+ "FROM:"
+ ).extract_single_element()
+ date_element = document.elements.filter_by_text_equal(
+ "DATE:"
+ ).extract_single_element()
+ subject_element = document.elements.filter_by_text_equal(
+ "SUBJECT:"
+ ).extract_single_element()
+
+ # Step 3 - Extract the data
+ to_text = (
+ document.elements.to_the_right_of(to_element)
+ .extract_single_element()
+ .text()
+ )
+ from_text = (
+ document.elements.to_the_right_of(from_element)
+ .extract_single_element()
+ .text()
+ )
+ date_text = (
+ document.elements.to_the_right_of(date_element)
+ .extract_single_element()
+ .text()
+ )
+ subject_text_element = document.elements.to_the_right_of(
+ subject_element
+ ).extract_single_element()
+ subject_text = subject_text_element.text()
+
+ content_elements = document.elements.after(subject_element)
+ content_text = "\n".join(element.text() for element in content_elements)
+
+ output = {
+ "to": to_text,
+ "from": from_text,
+ "date": date_text,
+ "subject": subject_text,
+ "content": content_text,
+ }
+
+ self.assertDictEqual(
+ output,
+ {
+ "content": (
+ "A new PDF Parsing tool\n"  # subject text also follows "SUBJECT:"
+ "There is a new PDF parsing tool available, called py-pdf-parser - "
+ "you should all check it out!\n"
+ "I think it could really help you extract that data we need from "
+ "those PDFs."
+ ),
+ "date": "1st January 2020",
+ "from": "John Smith",
+ "subject": "A new PDF Parsing tool",
+ "to": "All Developers",
+ },
+ )
diff --git a/py_pdf_parser/tests/test_doc_examples/test_tables.py b/py_pdf_parser/tests/test_doc_examples/test_tables.py
new file mode 100644
index 00000000..edfee12b
--- /dev/null
+++ b/py_pdf_parser/tests/test_doc_examples/test_tables.py
@@ -0,0 +1,202 @@
+import os
+
+from py_pdf_parser.tests.base import BaseTestCase
+
+from py_pdf_parser import tables
+from py_pdf_parser.exceptions import TableExtractionError
+from py_pdf_parser.loaders import load_file
+
+
+class TestSimpleMemo(BaseTestCase):  # NOTE(review): covers tables.pdf, not a memo
+ def test_output_is_correct(self):
+ file_path = os.path.join(
+ os.path.dirname(__file__), "../../../docs/source/example_files/tables.pdf"
+ )
+
+ # Step 1 - Load the file, mapping the PDF's font keys to readable names.
+ FONT_MAPPING = {
+ "BAAAAA+LiberationSerif-Bold,12.0": "header",
+ "CAAAAA+LiberationSerif,12.0": "table_element",
+ }
+ document = load_file(file_path, font_mapping=FONT_MAPPING)
+
+ headers = document.elements.filter_by_font("header")
+
+ # Extract reference elements: the "header"-font element that titles each table.
+ simple_table_header = headers.filter_by_text_equal(
+ "Simple Table"
+ ).extract_single_element()
+
+ simple_table_with_gaps_header = headers.filter_by_text_equal(
+ "Simple Table with gaps"
+ ).extract_single_element()
+
+ simple_table_with_gaps_in_first_row_col_header = headers.filter_by_text_equal(
+ "Simple Table with gaps in first row/col"
+ ).extract_single_element()
+
+ non_simple_table_header = headers.filter_by_text_equal(
+ "Non Simple Table"
+ ).extract_single_element()
+
+ non_simple_table_with_merged_cols_header = headers.filter_by_text_equal(
+ "Non Simple Table with Merged Columns"
+ ).extract_single_element()
+
+ non_simple_table_with_merged_rows_header = headers.filter_by_text_equal(
+ "Non Simple Table with Merged Rows and Columns"
+ ).extract_single_element()
+
+ over_the_page_header = headers.filter_by_text_equal(
+ "Over the page"
+ ).extract_single_element()
+
+ # Extract table elements: between consecutive headings, or after the last.
+ simple_table_elements = document.elements.between(
+ simple_table_header, simple_table_with_gaps_header
+ )
+ simple_table_with_gaps_elements = document.elements.between(
+ simple_table_with_gaps_header,
+ simple_table_with_gaps_in_first_row_col_header,
+ )
+
+ simple_table_with_gaps_in_first_row_col_elements = document.elements.between(
+ simple_table_with_gaps_in_first_row_col_header, non_simple_table_header
+ )
+
+ non_simple_table_elements = document.elements.between(
+ non_simple_table_header, non_simple_table_with_merged_cols_header
+ )
+
+ non_simple_table_with_merged_cols_elements = document.elements.between(
+ non_simple_table_with_merged_cols_header,
+ non_simple_table_with_merged_rows_header,
+ )
+
+ non_simple_table_with_merged_rows_and_cols_elements = document.elements.between(
+ non_simple_table_with_merged_rows_header, over_the_page_header
+ )
+
+ over_the_page_elements = document.elements.after(over_the_page_header)
+
+ # Simple Table: every cell is filled, so the default extraction works.
+ table = tables.extract_simple_table(simple_table_elements, as_text=True)
+ self.assertListEqual(
+ table,
+ [
+ ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
+ ["A", "1", "A", "1"],
+ ["B", "2", "B", "2"],
+ ["C", "3", "C", "3"],
+ ],
+ )
+
+ # Simple Table with gaps: extraction raises unless allow_gaps=True, in which
+ # case the missing cells come back as empty strings.
+ with self.assertRaises(TableExtractionError):
+ tables.extract_simple_table(simple_table_with_gaps_elements, as_text=True)
+
+ table = tables.extract_simple_table(
+ simple_table_with_gaps_elements, as_text=True, allow_gaps=True
+ )
+ self.assertListEqual(
+ table,
+ [
+ ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
+ ["A", "1", "", "1"],
+ ["B", "", "", ""],
+ ["C", "", "C", "3"],
+ ],
+ )
+
+ # Simple Table with gaps in first row/col: allow_gaps alone still raises.
+ with self.assertRaises(TableExtractionError):
+ tables.extract_simple_table(
+ simple_table_with_gaps_in_first_row_col_elements,
+ as_text=True,
+ allow_gaps=True,
+ )
+ # Index 9: presumably the last row's first "3" (gap-free row/col) - confirm.
+ reference_element = simple_table_with_gaps_in_first_row_col_elements[9]
+ table = tables.extract_simple_table(
+ simple_table_with_gaps_in_first_row_col_elements,
+ as_text=True,
+ allow_gaps=True,
+ reference_element=reference_element,
+ )
+ self.assertListEqual(
+ table,
+ [
+ ["Heading 1", "Heading 2", "", "Heading 4"],
+ ["", "1", "A", ""],
+ ["B", "2", "", "2"],
+ ["C", "3", "C", "3"],
+ ],
+ )
+
+ # Non Simple Table: extract_table copes with gaps, even in the top-left cell.
+ table = tables.extract_table(non_simple_table_elements, as_text=True)
+ self.assertListEqual(
+ table,
+ [
+ ["", "Heading 2", "Heading 3", "Heading 4"],
+ ["A", "1", "", "1"],
+ ["B", "", "B", "2"],
+ ["C", "3", "C", ""],
+ ],
+ )
+
+ # Non Simple Table with Merged Columns: needs fix_element_in_multiple_cols.
+ with self.assertRaises(TableExtractionError):
+ tables.extract_table(
+ non_simple_table_with_merged_cols_elements, as_text=True
+ )
+
+ table = tables.extract_table(
+ non_simple_table_with_merged_cols_elements,
+ as_text=True,
+ fix_element_in_multiple_cols=True,
+ )
+ self.assertListEqual(
+ table,
+ [
+ ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
+ ["A", "1", "A", "1"],
+ ["This text spans across multiple columns", "", "B", "2"],
+ ["C", "3", "C", "3"],
+ ],
+ )
+
+ # Non Simple Table with Merged Rows and Columns: both fix_* flags are passed.
+ table = tables.extract_table(
+ non_simple_table_with_merged_rows_and_cols_elements,
+ as_text=True,
+ fix_element_in_multiple_rows=True,
+ fix_element_in_multiple_cols=True,
+ )
+ self.assertListEqual(
+ table,
+ [
+ ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
+ [
+ "This text spans across multiple rows and \nmultiple columns.",
+ "",
+ "A",
+ "1",
+ ],
+ ["", "", "B", "2"],
+ ["C", "3", "C", "3"],
+ ],
+ )
+
+ # Over the page: the last table, taken from everything after its heading.
+ table = tables.extract_simple_table(over_the_page_elements, as_text=True)
+ self.assertListEqual(
+ table,
+ [
+ ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
+ ["A", "1", "A", "1"],
+ ["B", "2", "B", "2"],
+ ["C", "3", "C", "3"],
+ ],
+ )
diff --git a/setup.py b/setup.py
index d9b20181..7a3d7123 100644
--- a/setup.py
+++ b/setup.py
@@ -42,6 +42,8 @@
"pycodestyle==2.5.0",
"pyqt5==5.14.1",
"pytype==2020.1.8",
+ "recommonmark==0.6.0",
+ "sphinx-autobuild==0.7.1",
"sphinx-rtd-theme==0.4.3",
"Sphinx==2.3.1",
# This is a sub-dependency but is pinned because the next version doesn't