Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make pretty_print and trim better handle whitespace in xml (closes #78) #79

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 23 additions & 7 deletions src/markup.mli
Original file line number Diff line number Diff line change
Expand Up @@ -665,10 +665,15 @@ val text : ([< signal ], 's) stream -> (char, 's) stream
[`Text ss] signal, the result stream has the bytes of the strings [ss], and
all other signals are ignored. *)

val trim : (signal, 's) stream -> (signal, 's) stream
(** Trims insignificant whitespace in an HTML signal stream. Whitespace around
flow ("block") content does not matter, but whitespace in phrasing
("inline") content does. So, if the input stream is
val trim :
?preserve_whitespace:(name -> (name * string) list -> bool) ->
(signal, 's) stream ->
(signal, 's) stream
(** Trims insignificant whitespace.

In an HTML signal stream, whitespace around flow ("block") content does not
matter, but whitespace in phrasing ("inline") content does. So, if the input
stream is

{[
<div>
Expand All @@ -684,7 +689,11 @@ val trim : (signal, 's) stream -> (signal, 's) stream
<div><p><em>foo</em> bar</p></div>
]}

Note that whitespace around the [</em>] tag was preserved. *)
Note that whitespace around the [</em>] tag was preserved.

In an XML stream, whitespace is assumed to be insignificant unless an
ancestor element has the xml:space=preserve attribute. You can specify
[?preserve_whitespace] to preserve more or less whitespace. *)

val normalize_text :
([> `Text of string list ] as 'a, 's) stream -> ('a, 's) stream
Expand All @@ -695,7 +704,10 @@ val normalize_text :
after parsing, or generating streams from scratch, and would like to clean
up the [`Text] signals. *)

val pretty_print : (signal, 's) stream -> (signal, 's) stream
val pretty_print :
?preserve_whitespace:(name -> (name * string) list -> bool)
-> (signal, 's) stream
-> (signal, 's) stream
(** Adjusts the whitespace in the [`Text] signals in the given stream so that
the output appears nicely-indented when the stream is converted to bytes and
written.
Expand All @@ -720,7 +732,11 @@ val pretty_print : (signal, 's) stream -> (signal, 's) stream

Note that no whitespace was inserted around [<em>] and [</em>], because
doing so would create a word break that wasn't present in the original
stream. *)
stream.

In an XML stream, whitespace is assumed to be insignificant unless an
ancestor element has the xml:space=preserve attribute. You can specify
[?preserve_whitespace] to control the whitespace changes. *)

val html5 : ([< signal ], 's) stream -> (signal, 's) stream
(** Converts a signal stream into an HTML5 signal stream by stripping any
Expand Down
21 changes: 12 additions & 9 deletions src/utility.ml
Original file line number Diff line number Diff line change
Expand Up @@ -214,9 +214,12 @@ let normalize_text s =

make match_other

let is_phrasing_element (namespace, element_name) =
let default_preserve_whitespace (namespace, element_name) attrs =
if namespace <> html_ns then
false
List.exists
(fun ((ns, key), value) ->
ns = xml_ns && key = "space" && value = "preserve")
attrs
else
match element_name with
| "a" | "abbr" | "b" | "bdi" | "bdo" | "br" | "button" | "cite" | "code"
Expand All @@ -235,17 +238,17 @@ let rec trim_string_list trim = function
| "" -> trim_string_list trim more
| s -> s::more

let trim signals =
let trim ?(preserve_whitespace = default_preserve_whitespace) signals =
let signals = normalize_text signals in

let signals_and_flow =
Kstream.transform begin fun phrasing_nesting_level signal _throw k ->
match signal with
| `Start_element (name, _) ->
| `Start_element (name, attrs) ->
if phrasing_nesting_level > 0 then
k ([signal, false], Some (phrasing_nesting_level + 1))
else
if is_phrasing_element name then
if preserve_whitespace name attrs then
k ([signal, false], Some 1)
else
k ([signal, true], Some 0)
Expand Down Expand Up @@ -295,8 +298,8 @@ let trim signals =

let tab_width = 1

let pretty_print signals =
let signals = trim signals in
let pretty_print ?(preserve_whitespace = default_preserve_whitespace) signals =
let signals = trim ~preserve_whitespace signals in

let indent n =
let n = if n < 0 then 0 else n in
Expand All @@ -308,7 +311,7 @@ let pretty_print signals =
and flow indentation throw e k =
next signals throw e begin fun signal ->
match signal with
| `Start_element (name, _) when not @@ is_phrasing_element name ->
| `Start_element (name, attrs) when not @@ preserve_whitespace name attrs ->
(* If the next signal is `End_element, don't insert a line break. This
is mainly for collapsing inherently empty tags like <meta> and
<br>. *)
Expand Down Expand Up @@ -352,7 +355,7 @@ let pretty_print signals =
and phrasing indentation phrasing_nesting_level throw e k =
next signals throw e begin fun signal ->
match signal with
| `Start_element (name, _) when is_phrasing_element name ->
| `Start_element (name, attrs) when preserve_whitespace name attrs ->
list
[signal]
(phrasing indentation (phrasing_nesting_level + 1)) throw e k
Expand Down
45 changes: 45 additions & 0 deletions test/test_utility.ml
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,51 @@ let tests = [
`End_element;
`Text ["\n"]]);

("utility.pretty_print.xml" >:: fun _ ->
let parse_print ?preserve_whitespace () =
Markup.string (String.trim {|
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:r><w:rPr></w:rPr><w:t xml:space="preserve">a </w:t></w:r><w:r><w:rPr></w:rPr><w:t>b c</w:t></w:r>
</w:document>
|})
|> Markup.parse_xml
|> Markup.signals
|> pretty_print ?preserve_whitespace
|> Markup.write_xml
|> Markup.to_string
in
assert_equal ~printer:Fun.id (parse_print ()) (String.trim {|
<?xml version="1.0" encoding="UTF-8" standalone="yes"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:r>
<w:rPr/>
<w:t xml:space="preserve">a </w:t>
</w:r>
<w:r>
<w:rPr/>
<w:t>
b c
</w:t>
</w:r>
</w:document>
|} ^ "\n");
let preserve_whitespace name _ =
name = ("http://schemas.openxmlformats.org/wordprocessingml/2006/main", "t")
in
assert_equal ~printer:Fun.id (parse_print ~preserve_whitespace ()) (String.trim {|
<?xml version="1.0" encoding="UTF-8" standalone="yes"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:r>
<w:rPr/>
<w:t xml:space="preserve">a </w:t>
</w:r>
<w:r>
<w:rPr/>
<w:t>b c</w:t>
</w:r>
</w:document>
|} ^ "\n");
);

("utility.html5" >:: fun _ ->
[doctype;
doctype;
Expand Down
Loading