-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutil_html.ml
262 lines (219 loc) · 6.17 KB
/
util_html.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
open Printf
let encode s =
let buf = Buffer.create (2 * String.length s) in
for i = 0 to String.length s - 1 do
match s.[i] with
'<' -> Buffer.add_string buf "<"
| '>' -> Buffer.add_string buf ">"
| '&' -> Buffer.add_string buf "&"
| '\"' -> Buffer.add_string buf """
| '\'' -> Buffer.add_string buf "'"
| c -> Buffer.add_char buf c
done;
Buffer.contents buf
let attr k v =
Printf.sprintf " %s='%s'"
(encode k)
(encode v)
(*
attributes are raw
content is html (text is html-escaped)
*)
let elt elt_name ?(attrs = []) content =
let attrs =
String.concat "" (BatList.map (fun (k, v) -> attr k v) attrs)
in
Printf.sprintf "<%s%s>%s</%s>"
elt_name attrs content elt_name
let paragraph_sep = Pcre.regexp "(?:\r*\n){2,}"
let line_sep = Pcre.regexp "\r*\n"
let split regexp s =
Pcre.split ~rex:regexp s
let split_lines s =
split line_sep s
let parse_paragraphs s =
let paragraphs = split paragraph_sep s in
BatList.map split_lines paragraphs
type with_quotes =
| Paragraphs of string list list
| Quote of with_quotes list
(* group contiguous elements that have the same (boolean) property *)
let map_contiguous cond f g l =
let rec aux acc acc2 prop l =
let func = if prop then f else g in
match l with
| [] ->
let acc =
if acc2 = [] then acc
else func (List.rev acc2) :: acc
in
List.rev acc
| x :: tl ->
if cond x = prop then
aux acc (x :: acc2) prop tl
else
let acc =
if acc2 = [] then acc
else func (List.rev acc2) :: acc
in
aux acc [x] (not prop) tl
in
aux [] [] true l
let test_map_contiguous () =
let is_even x = x mod 2 = 0 in
let even_group l = `Even l in
let odd_group l = `Odd l in
let test_one input expected_output =
let output = map_contiguous is_even even_group odd_group input in
expected_output = output
in
List.for_all (fun (input, expected_output) ->
test_one input expected_output
)
[
[], [];
[1], [`Odd [1]];
[1;3], [`Odd [1;3]];
[0], [`Even [0]];
[1;3;0;2;7], [`Odd [1;3]; `Even [0;2]; `Odd [7]];
]
let has_quote_prefix line =
line <> "" && line.[0] = '>'
let has_space_prefix line =
line = "" || line.[0] = ' '
let remove_first_character line =
if line = "" then ""
else String.sub line 1 (String.length line - 1)
let rec remove_leading_spaces_evenly lines =
if List.for_all has_space_prefix lines then
let l = BatList.map remove_first_character lines in
if l = lines then
l
else
remove_leading_spaces_evenly l
else
lines
let handle_regular_lines regular_lines =
let s = String.concat "\n" regular_lines in
let paragraphs = parse_paragraphs s in
Paragraphs paragraphs
let rec parse_quotes lines =
map_contiguous
has_quote_prefix
(fun quoted_lines ->
let lines = BatList.map remove_first_character quoted_lines in
let lines = remove_leading_spaces_evenly lines in
if lines <> quoted_lines then
Quote (parse_quotes lines)
else
handle_regular_lines lines
)
handle_regular_lines
lines
let url_regexp = Pcre.regexp
"\\b(([\\w-]+://?|www[.])[^\\s()<>]+(?:\\([\\w\\d]+\\)|([^[:punct:]\\s]|/)))"
let tag_links line =
Pcre.replace
~rex:url_regexp
~itempl:(Pcre.subst "<a href=\"$&\">$&</a>")
line
let encode_paragraph
?(p_elt = "p")
?p_class
?p_style
l =
let paragraph = BatList.map (fun s -> tag_links (encode s)) l in
sprintf "<%s%s%s>%s</%s>\n"
p_elt
(match p_class with None -> "" | Some s -> sprintf " class='%s'" s)
(match p_style with None -> "" | Some s -> sprintf " style='%s'" s)
(String.concat "<br>\n" paragraph)
p_elt
let encode_paragraphs ?p_elt ?p_class ?p_style ll =
String.concat "\n\n" (
BatList.map
(encode_paragraph ?p_elt ?p_class ?p_style)
ll
)
let encode_text ?p_elt ?p_class ?p_style l =
let rec print_one buf = function
| Quote l ->
bprintf buf
"<blockquote type='cite'>\n%a\n</blockquote>\n"
print_list l
| Paragraphs l ->
Buffer.add_string buf (encode_paragraphs ?p_elt ?p_class ?p_style l)
and print_list buf l =
List.iter (print_one buf) l
in
let buf = Buffer.create 1000 in
print_list buf l;
Buffer.contents buf
(* Turn plain text into HTML:
Replace blank lines by paragraph separators,
replace single line breaks by <br>.
TODO: replace URLs by links
*)
let of_text ?p_elt ?p_class ?p_style s =
encode_text ?p_elt ?p_class ?p_style (parse_quotes (split_lines s))
let test_from_text () =
let s = "\
Dear Santa,
I hope you're doing well
and I want a lot of presents.
Thanks!
> What would you like this year?
>
> -- Santa Claus
>
> >Hey, are you
> >the real Santa?
> The following is indented but it won't be rendered as such:
> ***
"
in
let expected = "\
<p>Dear Santa,</p>
<p>I hope you're doing well<br>
and I want a lot of presents.</p>
<p>Thanks!</p>
<blockquote type='cite'>
<p>What would you like this year?</p>
<p>-- Santa Claus</p>
<blockquote type='cite'>
<p>Hey, are you<br>
the real Santa?</p>
</blockquote>
</blockquote>
<blockquote type='cite'>
<p>The following is indented but it won't be rendered as such:<br>
***</p>
</blockquote>
"
in
of_text s = expected
let test_blank_quote () =
ignore (of_text ">"); (* test against infinite loop *)
true
let html_start =
Pcre.regexp "\\A[ \r\n\t]*<(?:html\
|HTML\
|!doctype\
|!DOCTYPE\
|!--)[^a-zA-Z]"
let looks_like_html s =
Pcre.pmatch ~rex:html_start s
let test_looks_like_html () =
assert (not (looks_like_html ""));
assert (not (looks_like_html "&"));
assert (looks_like_html "<html>");
assert (looks_like_html "\n <HTML >");
assert (looks_like_html "<!doctype>");
assert (looks_like_html "<!-- blah -->");
true
let tests = [
"map contiguous", test_map_contiguous;
"from text", test_from_text;
"blank quote", test_blank_quote;
"looks like html", test_looks_like_html;
]