Skip to content

Commit

Permalink
Don't wrap tables by default
Browse files Browse the repository at this point in the history
Adds a --wrap-tables flag to wrap tables if the user wants.
  • Loading branch information
gpanders committed Apr 19, 2021
1 parent 4592133 commit ec057f8
Show file tree
Hide file tree
Showing 10 changed files with 62 additions and 2 deletions.
1 change: 1 addition & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ The AUTHORS/Contributors are (and/or have been):
* Jonathan Vanasco <[email protected]>
* Jon Dufresne <[email protected]>
* Mike Borsetti
* Gregory Anders <[email protected]>

Maintainer:

Expand Down
1 change: 1 addition & 0 deletions ChangeLog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ UNRELEASED
* Add support for Python 3.9.
* Fix extra line breaks inside html link text (between '[' and ']')
* Fix #344: indent ``<ul>`` inside ``<ol>`` three spaces instead of two to comply with CommonMark, GFM, etc.
* Don't wrap tables by default and add a ``--wrap-tables`` config option

2020.1.16
=========
Expand Down
2 changes: 2 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ simple indications of their function.
- MARK_CODE to wrap 'pre' blocks with [code]...[/code] tags
- WRAP_LINKS to decide if links have to be wrapped during text wrapping (implies INLINE_LINKS = False)
- WRAP_LIST_ITEMS to decide if list items have to be wrapped during text wrapping
- WRAP_TABLES to decide if tables have to be wrapped during text wrapping
- DECODE_ERRORS to handle decoding errors. 'strict', 'ignore', 'replace' are the acceptable values.
- DEFAULT_IMAGE_ALT takes a string as value and is used whenever an image tag is missing an `alt` value. The default for this is an empty string '' to avoid backward breakage
- OPEN_QUOTE is the character used to open a quote when replacing the `<q>` tag. It defaults to `"`.
Expand Down Expand Up @@ -143,6 +144,7 @@ Command line options
| `--mark-code` | Mark code with [code]...[/code] blocks
| `--no-wrap-links` | Do not wrap links during text wrapping. Implies `--reference-links`
| `--wrap-list-items` | Wrap list items during text wrapping.
| `--wrap-tables` | Wrap tables during text wrapping.
| `--decode-errors`=`HANDLER` | What to do in case an error is encountered. `ignore`, `strict`, `replace` etc.
| `--pad-tables` | Use padding to make tables look good.
| `--default-image-alt`=`Image_Here` | Inserts the given `alt` text whenever images are missing `alt` values.
Expand Down
5 changes: 4 additions & 1 deletion html2text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def __init__(
self.mark_code = config.MARK_CODE
self.wrap_list_items = config.WRAP_LIST_ITEMS # covered in cli
self.wrap_links = config.WRAP_LINKS # covered in cli
self.wrap_tables = config.WRAP_TABLES
self.pad_tables = config.PAD_TABLES # covered in cli
self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli
self.tag_callback = None
Expand Down Expand Up @@ -922,7 +923,9 @@ def optwrap(self, text: str) -> str:
self.inline_links = False
for para in text.split("\n"):
if len(para) > 0:
if not skipwrap(para, self.wrap_links, self.wrap_list_items):
if not skipwrap(
para, self.wrap_links, self.wrap_list_items, self.wrap_tables
):
indent = ""
if para.startswith(" " + self.ul_item_mark):
# list item continuation: add a double indent to the
Expand Down
8 changes: 8 additions & 0 deletions html2text/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,13 @@ class bcolors:
default=config.WRAP_LIST_ITEMS,
help="wrap list items during conversion",
)
p.add_argument(
"--wrap-tables",
dest="wrap_tables",
action="store_true",
default=config.WRAP_TABLES,
help="wrap tables",
)
p.add_argument(
"--ignore-emphasis",
dest="ignore_emphasis",
Expand Down Expand Up @@ -298,6 +305,7 @@ class bcolors:
h.mark_code = args.mark_code
h.wrap_links = args.wrap_links
h.wrap_list_items = args.wrap_list_items
h.wrap_tables = args.wrap_tables
h.pad_tables = args.pad_tables
h.default_image_alt = args.default_image_alt
h.open_quote = args.open_quote
Expand Down
6 changes: 6 additions & 0 deletions html2text/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
# Wrap list items.
WRAP_LIST_ITEMS = False

# Wrap tables.
WRAP_TABLES = False

# Number of pixels Google indents nested lists
GOOGLE_LIST_INDENT = 36

Expand Down Expand Up @@ -63,6 +66,9 @@
# to find links in the text
RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")

# to find table separators
RE_TABLE = re.compile(r" \| ")

RE_MD_DOT_MATCHER = re.compile(
r"""
^ # start of line
Expand Down
8 changes: 7 additions & 1 deletion html2text/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,9 @@ def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
return 0


def skipwrap(para: str, wrap_links: bool, wrap_list_items: bool) -> bool:
def skipwrap(
para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool
) -> bool:
# If it appears to contain a link
# don't wrap
if not wrap_links and config.RE_LINK.search(para):
Expand All @@ -181,6 +183,10 @@ def skipwrap(para: str, wrap_links: bool, wrap_list_items: bool) -> bool:
if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**":
return not wrap_list_items

# If the text contains a pipe surrounded by spaces (" | "), it is likely a
# table row; don't wrap unless table wrapping is enabled.
if not wrap_tables and config.RE_TABLE.search(para):
return True

# If the text begins with a single -, *, or +, followed by a space,
# or an integer, followed by a ., followed by a space (in either
# case optionally preceded by whitespace), it's a list; don't wrap.
Expand Down
5 changes: 5 additions & 0 deletions test/test_html2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,11 @@ def generate_testdata():
cmdline_args.append("--wrap-list-items")
func_args = skip

if base_fn.startswith("wrap_tables"):
module_args["wrap_tables"] = True
cmdline_args.append("--wrap-tables")
func_args = skip

if base_fn == "inplace_baseurl_substitution.html":
module_args["baseurl"] = "http://brettterpstra.com"
module_args["body_width"] = 0
Expand Down
12 changes: 12 additions & 0 deletions test/wrap_tables.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<!DOCTYPE html> <html>
<head lang="en"> <meta charset="UTF-8"> <title></title> </head>
<body> <h1>This is a test document</h1> With some text, <code>code</code>, <b>bolds</b> and <i>italics</i>. <h2>This is second header</h2> <p style="display: none">Displaynone text</p>
<table>
<tr> <th>Header 1</th> <th>Header 2</th> <th>Header 3</th> </tr>
<tr> <td>Content 1</td> <td>2</td> <td><img src="http://lorempixel.com/200/200" alt="200"/> Image!</td> </tr>
<tr> <td>Content 1 longer</td> <td>Content 2</td> <td>Here is some really long text that will wrap to the next line. Because it's so long.</td> </tr>
<tr> <td>Content </td> <td>Content 2</td> <td>blah</td> </tr>
<tr> <td>t </td> <td>Content 2</td> <td>blah blah blah</td> </tr>
</table>

</body> </html>
16 changes: 16 additions & 0 deletions test/wrap_tables.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# This is a test document

With some text, `code`, **bolds** and _italics_.

## This is second header

Displaynone text

Header 1 | Header 2 | Header 3
---|---|---
Content 1 | 2 | ![200](http://lorempixel.com/200/200) Image!
Content 1 longer | Content 2 | Here is some really long text that will wrap to
the next line. Because it's so long.
Content | Content 2 | blah
t | Content 2 | blah blah blah

0 comments on commit ec057f8

Please sign in to comment.