diff --git a/AUTHORS.rst b/AUTHORS.rst index d4889ab..d3a8293 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -32,6 +32,7 @@ The AUTHORS/Contributors are (and/or have been): * Jonathan Vanasco * Jon Dufresne * Mike Borsetti +* Gregory Anders Maintainer: diff --git a/ChangeLog.rst b/ChangeLog.rst index 42b8c96..935199d 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -6,6 +6,7 @@ UNRELEASED * Add support for Python 3.9. * Fix extra line breaks inside html link text (between '[' and ']') * Fix #344: indent ``<ul>`` inside ``<ol>`` three spaces instead of two to comply with CommonMark, GFM, etc. +* Don't wrap tables by default and add a ``--wrap-tables`` config option 2020.1.16 ========= diff --git a/docs/usage.md b/docs/usage.md index a1758d3..2a5b78c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -95,6 +95,7 @@ simple indications of their function. - MARK_CODE to wrap 'pre' blocks with [code]...[/code] tags - WRAP_LINKS to decide if links have to be wrapped during text wrapping (implies INLINE_LINKS = False) - WRAP_LIST_ITEMS to decide if list items have to be wrapped during text wrapping + - WRAP_TABLES to decide if tables have to be wrapped during text wrapping - DECODE_ERRORS to handle decoding errors. 'strict', 'ignore', 'replace' are the acceptable values. - DEFAULT_IMAGE_ALT takes a string as value and is used whenever an image tag is missing an `alt` value. The default for this is an empty string '' to avoid backward breakage - OPEN_QUOTE is the character used to open a quote when replacing the `<q>` tag. It defaults to `"`. @@ -143,6 +144,7 @@ Command line options | `--mark-code` | Mark code with [code]...[/code] blocks | `--no-wrap-links` | Do not wrap links during text wrapping. Implies `--reference-links` | `--wrap-list-items` | Wrap list items during text wrapping. +| `--wrap-tables` | Wrap tables during text wrapping. | `--decode-errors`=`HANDLER` | What to do in case an error is encountered. `ignore`, `strict`, `replace` etc. | `--pad-tables` | Use padding to make tables look good. | `--default-image-alt`=`Image_Here` | Inserts the given `alt` text whenever images are missing `alt` values. 
diff --git a/html2text/__init__.py b/html2text/__init__.py index 770e271..5fac050 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -78,6 +78,7 @@ def __init__( self.mark_code = config.MARK_CODE self.wrap_list_items = config.WRAP_LIST_ITEMS # covered in cli self.wrap_links = config.WRAP_LINKS # covered in cli + self.wrap_tables = config.WRAP_TABLES self.pad_tables = config.PAD_TABLES # covered in cli self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli self.tag_callback = None @@ -922,7 +923,9 @@ def optwrap(self, text: str) -> str: self.inline_links = False for para in text.split("\n"): if len(para) > 0: - if not skipwrap(para, self.wrap_links, self.wrap_list_items): + if not skipwrap( + para, self.wrap_links, self.wrap_list_items, self.wrap_tables + ): indent = "" if para.startswith(" " + self.ul_item_mark): # list item continuation: add a double indent to the diff --git a/html2text/cli.py b/html2text/cli.py index 30a362e..38140c8 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -45,6 +45,13 @@ class bcolors: default=config.WRAP_LIST_ITEMS, help="wrap list items during conversion", ) + p.add_argument( + "--wrap-tables", + dest="wrap_tables", + action="store_true", + default=config.WRAP_TABLES, + help="wrap tables", + ) p.add_argument( "--ignore-emphasis", dest="ignore_emphasis", @@ -298,6 +305,7 @@ class bcolors: h.mark_code = args.mark_code h.wrap_links = args.wrap_links h.wrap_list_items = args.wrap_list_items + h.wrap_tables = args.wrap_tables h.pad_tables = args.pad_tables h.default_image_alt = args.default_image_alt h.open_quote = args.open_quote diff --git a/html2text/config.py b/html2text/config.py index 2bb38b6..468a995 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -31,6 +31,9 @@ # Wrap list items. 
WRAP_LIST_ITEMS = False +# Wrap tables +WRAP_TABLES = False + # Number of pixels Google indents nested lists GOOGLE_LIST_INDENT = 36 @@ -63,6 +66,9 @@ # to find links in the text RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)") +# to find table separators +RE_TABLE = re.compile(r" \| ") + RE_MD_DOT_MATCHER = re.compile( r""" ^ # start of line diff --git a/html2text/utils.py b/html2text/utils.py index 2051d23..366748b 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -159,7 +159,9 @@ def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int: return 0 -def skipwrap(para: str, wrap_links: bool, wrap_list_items: bool) -> bool: +def skipwrap( + para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool +) -> bool: # If it appears to contain a link # don't wrap if not wrap_links and config.RE_LINK.search(para): @@ -181,6 +183,10 @@ def skipwrap(para: str, wrap_links: bool, wrap_list_items: bool) -> bool: if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**": return not wrap_list_items + # If text contains a pipe character it is likely a table + if not wrap_tables and config.RE_TABLE.search(para): + return True + # If the text begins with a single -, *, or +, followed by a space, # or an integer, followed by a ., followed by a space (in either # case optionally proceeded by whitespace), it's a list; don't wrap. 
diff --git a/test/test_html2text.py b/test/test_html2text.py index f724fc8..8a64180 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -122,6 +122,11 @@ def generate_testdata(): cmdline_args.append("--wrap-list-items") func_args = skip + if base_fn.startswith("wrap_tables"): + module_args["wrap_tables"] = True + cmdline_args.append("--wrap-tables") + func_args = skip + if base_fn == "inplace_baseurl_substitution.html": module_args["baseurl"] = "http://brettterpstra.com" module_args["body_width"] = 0 diff --git a/test/wrap_tables.html b/test/wrap_tables.html new file mode 100644 index 0000000..d778800 --- /dev/null +++ b/test/wrap_tables.html @@ -0,0 +1,12 @@ + + +

      This is a test document

      With some text, code, bolds and italics.

      This is second header

      Displaynone text

      + + + + + + +
      Header 1 Header 2 Header 3
      Content 1 2 200 Image!
      Content 1 longer Content 2 Here is some really long text that will wrap to the next line. Because it's so long.
      Content Content 2 blah
      t Content 2 blah blah blah
      + + diff --git a/test/wrap_tables.md b/test/wrap_tables.md new file mode 100644 index 0000000..37decf2 --- /dev/null +++ b/test/wrap_tables.md @@ -0,0 +1,16 @@ +# This is a test document + +With some text, `code`, **bolds** and _italics_. + +## This is second header + +Displaynone text + +Header 1 | Header 2 | Header 3 +---|---|--- +Content 1 | 2 | ![200](http://lorempixel.com/200/200) Image! +Content 1 longer | Content 2 | Here is some really long text that will wrap to +the next line. Because it's so long. +Content | Content 2 | blah +t | Content 2 | blah blah blah +