Skip to content

Commit

Permalink
Further work on table referencing and basic code refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
Krande committed Sep 15, 2021
1 parent 688f058 commit e246e25
Show file tree
Hide file tree
Showing 7 changed files with 116 additions and 37 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from setuptools import setup

setup(version="0.0.2")
setup(version="0.0.3")
12 changes: 7 additions & 5 deletions src/paradoc/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class Table:
df: pd.DataFrame
caption: str
format: TableFormat = TableFormat()
add_link: bool = False

def to_markdown(self, include_name_in_cell=False, flags=None):
df = self.df.copy()
Expand All @@ -36,7 +37,9 @@ def to_markdown(self, include_name_in_cell=False, flags=None):
tbl_str = df.to_markdown(index=False, tablefmt="grid")
if flags is not None and TableFlags.NO_CAPTION in flags:
return tbl_str
tbl_str += f"\nTable: {self.caption} {{#tbl:{self.name}}}"
tbl_str += f"\n\nTable: {self.caption}"
if self.add_link:
tbl_str += f"{{#tbl:{self.name}}}"
return tbl_str


Expand Down Expand Up @@ -70,10 +73,9 @@ def to_latex(self, print_latex=False, print_formula=False, flags=None):


@dataclass
class Formatting:
is_appendix: bool
paragraph_style_map: dict
table_format: TableFormat
class DocXFormat:
pg_font: str = "Arial"
pg_size: int = 11


@dataclass
Expand Down
11 changes: 9 additions & 2 deletions src/paradoc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,14 @@
import pandas as pd
import pypandoc

from .common import Equation, ExportFormats, MarkDownFile, Table, TableFormat
from .common import (
DocXFormat,
Equation,
ExportFormats,
MarkDownFile,
Table,
TableFormat,
)
from .utils import get_list_of_files, variable_sub


Expand Down Expand Up @@ -61,9 +68,9 @@ def __init__(
self.variables = dict()
self.tables: Dict[str, Table] = dict()
self.equations: Dict[str, Equation] = dict()
self.doc_format = DocXFormat()

# Style info: https://python-docx.readthedocs.io/en/latest/user/styles-using.html
self.table_format = TableFormat()
self.paragraph_style_map = kwargs.get("paragraph_style_map", OneDoc.default_paragraph_map)
self.appendix_heading_map = kwargs.get("appendix_heading_map", OneDoc.default_app_map)

Expand Down
47 changes: 41 additions & 6 deletions src/paradoc/io/word/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@
from docx.shared import Pt
from docx.table import Table as DocxTable
from docx.text.paragraph import Paragraph
from docx.text.run import Run

from paradoc.common import Table

from .references import insert_caption_into_runs
from .references import add_seq_reference


@dataclass
Expand All @@ -25,7 +26,6 @@ def is_complete(self):
return all([x is not None for x in docx_attr])

def format_table(self, is_appendix):

tbl = self.docx_table
tbl_format = self.table_ref.format

Expand All @@ -45,15 +45,50 @@ def format_table(self, is_appendix):
font.bold = True
else:
font.bold = False
tbl.autofit = True

# Format table Caption
caption = self.docx_caption
caption.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
insert_caption_into_runs(caption, "Table", is_appendix)

rebuild_caption(caption, self.table_ref.caption, is_appendix)

for run in caption.runs:
run.font.name = tbl_format.font_style

# Fix formatting after Table
follower_pg = self.docx_following_pg
follower_pg.runs[0].text = "\n" + follower_pg.runs[0].text
follower_pg.paragraph_format.space_before = None

self.docx_following_pg.paragraph_format.space_before = Pt(12)
# follower_pg = self.docx_following_pg
#
# i = par_index(follower_pg)
#
# follower_pg.runs[0].text = "\n" + follower_pg.runs[0].text
# follower_pg.paragraph_format.space_before = None


def rebuild_caption(caption: Paragraph, caption_str, is_appendix):
    """Rebuild a table caption paragraph as ``Table <heading>-<seq>: <caption_str>``.

    The existing content of ``caption`` is discarded and replaced by a
    sequence of runs: the literal text "Table " combined with a ``STYLEREF``
    field, a "-" separator, a ``SEQ Table`` field and finally the caption text.

    :param caption: The docx caption paragraph to rewrite in place.
    :param caption_str: Caption text placed after the ": " separator.
    :param is_appendix: When exactly ``True`` the STYLEREF field points at the
        "Appendix" style, otherwise at "Heading 1".
    """
    # Paragraph.clear() removes all content of the paragraph. The former
    # `caption.runs.clear()` only emptied a temporary list (``runs`` is rebuilt
    # on every access) and therefore had no effect on the document.
    caption.clear()

    # Empty run used purely as an anchor: every new run element below is
    # inserted immediately before it, so insertion order equals reading order.
    run = caption.add_run()

    heading_ref = "Appendix" if is_appendix is True else '"Heading 1"'

    # "Table " followed by the number of the governing heading
    # NOTE(review): relies on add_seq_reference appending its field code to
    # the run element passed in -- confirm against its implementation.
    seq1 = caption._element._new_r()
    seq1.text = "Table "
    add_seq_reference(seq1, f"STYLEREF \\s {heading_ref} \\n", run._parent)
    run._element.addprevious(seq1)

    # Separator between heading number and running table number
    stroke = caption._element._new_r()
    new_run = Run(stroke, run._parent)
    new_run.text = "-"
    run._element.addprevious(stroke)

    # Running table number
    seq2 = caption._element._new_r()
    add_seq_reference(seq2, "SEQ Table \\* ARABIC \\s 1", run._parent)
    run._element.addprevious(seq2)

    # The caption text itself
    fin = caption._element._new_r()
    fin_run = Run(fin, run._parent)
    fin_run.text = ": " + caption_str
    run._element.addprevious(fin)
47 changes: 42 additions & 5 deletions src/paradoc/io/word/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import re
from typing import List, Union

import numpy as np
from docx import Document
from docx.table import Table as DocxTable
from docx.text.paragraph import Paragraph
Expand Down Expand Up @@ -32,6 +33,7 @@ def convert_to_docx(self, output_name, dest_file):
composer_app = add_to_composer(MY_DOCX_TMPL_BLANK, one_doc.md_files_app)

for tbl in self.identify_tables(composer_main.doc):

tbl.format_table(is_appendix=False)

for tbl in self.identify_tables(composer_app.doc):
Expand Down Expand Up @@ -71,12 +73,12 @@ def identify_tables(self, doc: Document):
continue

if block.style.name == "Table Caption":
if "using solid elements" in block.text:
print("sd")
current_table.docx_caption = block

if type(block) == Paragraph and prev_table is True and len(block.runs) > 0:
block.runs[0].text = "\n" + block.runs[0].text
if type(block) == Paragraph and prev_table is True:
prev_table = False
block.paragraph_format.space_before = None
current_table.docx_following_pg = block

if current_table.is_complete():
Expand All @@ -90,19 +92,42 @@ def identify_tables(self, doc: Document):

return tables

def get_related_table(self, current_table: DocXTableRef, frac=1e-4) -> Union[Table, None]:
    """Find the source table that corresponds to a table found in the docx document.

    The caption text is tried first. If no registered table has a matching
    caption, the numeric contents read from the docx table are compared with
    the first row of each registered data frame.

    :param current_table: Reference holding the docx table and its caption paragraph.
    :param frac: Relative tolerance for the numeric comparison; the summed
        absolute difference must not exceed ``frac`` times the summed absolute
        values of the data frame row.
    :return: The matching table, or ``None`` when nothing matches.
    :raises ValueError: If the caption contains "Table" but cannot be parsed.
    """
    one = self.one_doc
    candidates = list(one.tables.values())
    if not candidates:
        return None

    # Search using Caption string
    caption = current_table.docx_caption
    if "Table" in caption.text:
        m = re.search(r"Table\s*[0-9]{0,9}:(.*)", caption.text)
        if m is None:
            raise ValueError(f'Unable to parse table caption "{caption.text}"')
        caption_text = str(m.group(1).strip())
    else:
        caption_text = str(caption.text)
    caption_text = caption_text.replace("”", '"')

    for tbl in candidates:
        if tbl.caption == caption_text:
            return tbl

    # If no match using caption string, then use contents of table
    content = get_first_row_from_table(current_table.docx_table)
    if not content:
        return None
    try:
        content_numeric = np.array(content, dtype=float)
    except ValueError:
        # Non-numeric cell content cannot be compared numerically
        return None

    for tbl in candidates:
        if tbl.df.empty:
            continue
        row_1 = tbl.df.iloc[0].values
        if len(row_1) != len(content_numeric):
            continue
        try:
            row_numeric = np.asarray(row_1, dtype=float)
        except (TypeError, ValueError):
            continue
        # Use absolute values so that positive and negative deviations cannot
        # cancel each other (e.g. a permuted row would otherwise sum to zero).
        tot = np.abs(row_numeric).sum()
        diff = np.abs(row_numeric - content_numeric).sum()
        if diff <= tot * frac:
            return tbl
    return None


Expand All @@ -116,3 +141,15 @@ def add_to_composer(source_doc, md_files: List[MarkDownFile]) -> Composer:
composer_doc.append(doc_in)
logging.info(f"Added {md.new_file}")
return composer_doc


def get_first_row_from_table(docx_table: DocxTable, num_row=1):
    """Return the stripped cell texts of the first data row(s) of a docx table.

    Row 0 is treated as the header and always skipped. The texts of all
    paragraphs in all cells of the following ``num_row`` rows are returned as
    one flat list, in reading order.

    :param docx_table: Table object exposing ``rows`` -> ``cells`` -> ``paragraphs``.
    :param num_row: Number of data rows to read after the header. ``None``
        reads every remaining row.
    :return: Flat list of stripped paragraph texts.
    """
    content = []
    for i, row in enumerate(docx_table.rows):
        if i == 0:
            # Header row
            continue
        if num_row is not None and i > num_row:
            # Requested number of data rows has been collected
            break
        for cell in row.cells:
            content.extend(paragraph.text.strip() for paragraph in cell.paragraphs)
    return content
16 changes: 16 additions & 0 deletions src/paradoc/io/word/references.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,19 @@ def add_seq_reference(run_in, seq, parent):
fldChar.set(qn("w:fldCharType"), "end")
r.append(fldChar)
return new_run


def add_table_reference(paragraph, seq=" SEQ Table \\* ARABIC \\s 1"):
    """Append a new run holding a field code to ``paragraph``.

    The run receives three child elements in order: a ``w:fldChar`` of type
    "begin", a ``w:instrText`` carrying ``seq`` and a ``w:fldChar`` of type
    "end".

    :param paragraph: Paragraph the run is added to.
    :param seq: Field instruction text written into the ``w:instrText`` element.
    :return: The newly added run.
    """
    field_run = paragraph.add_run()
    run_xml = field_run._r

    field_begin = OxmlElement("w:fldChar")
    field_begin.set(qn("w:fldCharType"), "begin")

    instruction = OxmlElement("w:instrText")
    instruction.text = seq

    field_end = OxmlElement("w:fldChar")
    field_end.set(qn("w:fldCharType"), "end")

    # Child order matters: begin marker, instruction, end marker
    for node in (field_begin, instruction, field_end):
        run_xml.append(node)

    return field_run
18 changes: 0 additions & 18 deletions src/paradoc/io/word/tables.py

This file was deleted.

0 comments on commit e246e25

Please sign in to comment.