-
-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathmarkup.mli
969 lines (743 loc) · 34.6 KB
/
markup.mli
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
(* This file is part of Markup.ml, released under the MIT license. See
LICENSE.md for details, or visit https://github.com/aantron/markup.ml. *)
(** Error-recovering streaming HTML and XML parsers and writers.
Markup.ml is an HTML and XML parsing and serialization library. It:
- Is error-recovering, so you can get a best-effort parse of malformed
input.
- Reports all errors before recovery, so you can get strict parsing
instead.
- Conforms closely to the XML grammar and HTML parser from the respective
specifications.
- Accepts document fragments, but can be told to accept only full documents.
- Detects character encodings automatically.
- Supports both simple synchronous (this module) and non-blocking usage
({!Markup_lwt}).
- Is streaming and lazy. Partial input is processed as soon as received, but
only as needed.
- Does one pass over the input and emits a stream of SAX-style parsing
signals. A helper ({!tree}) allows that to be easily converted into
DOM-style trees.
The usage is straightforward. For example:
{[
open Markup
(* Correct and pretty-print HTML. *)
channel stdin
|> parse_html |> signals |> pretty_print
|> write_html |> to_channel stdout
(* Show up to 10 XML well-formedness errors to the user. Stop after
the 10th, without reading more input. *)
let report =
let count = ref 0 in
fun location error ->
error |> Error.to_string ~location |> prerr_endline;
count := !count + 1;
if !count >= 10 then raise_notrace Exit
string "some xml" |> parse_xml ~report |> signals |> drain
(* Load HTML into a custom document tree data type. *)
type html = Text of string | Element of string * html list
file "some_file"
|> fst
|> parse_html
|> signals
|> tree
~text:(fun ss -> Text (String.concat "" ss))
~element:(fun (_, name) _ children -> Element (name, children))
]}
The interface is centered around four functions. In pseudocode:
{[
val parse_html : char stream -> signal stream
val write_html : signal stream -> char stream
val parse_xml : char stream -> signal stream
val write_xml : signal stream -> char stream
]}
Most of the remaining functions create streams from, or write streams to,
strings, files, and channels, or manipulate streams, such as {!next} and the
combinators {!map} and {!fold}.
Apart from this module, Markup.ml provides two other top-level modules:
{!modules:Markup_lwt Markup_lwt_unix}
Most of the interface of {!Markup_lwt} is specified in signature
{!ASYNCHRONOUS}, which will be shared with a [Markup_async] module, should
it be implemented.
Markup.ml is developed on {{:https://github.com/aantron/markup.ml} GitHub}
and distributed under the
{{:https://github.com/aantron/markup.ml/blob/master/LICENSE.md} MIT
license}. This documentation is for version 1.0.0 of the library.
Documentation for older versions can be found on the
{{: https://github.com/aantron/markup.ml/releases} releases page}. *)
(** {2 Streams} *)
type async
type sync
(** Phantom types for use with [('a, 's) stream] in place of ['s]. See
explanation below. *)
type ('a, 's) stream
(** Streams of elements of type ['a].
In simple usage, when using only this module [Markup], the additional type
parameter ['s] is always [sync], and there is no need to consider it
further.
However, if you are using {!Markup_lwt}, you may create some [async]
streams. The difference between the two is that {!next} on a [sync] stream
retrieves an element before {!next} "returns," while {!next} on an [async]
stream might not retrieve an element until later. As a result, it is not
safe to pass an [async] stream where a [sync] stream is required. The
phantom types are used to make the type checker catch such errors at compile
time. *)
(** {2 Errors}
The parsers recover from errors automatically. If that is sufficient, you
can ignore this section. However, if you want stricter behavior, or need to
debug parser output, use optional argument [?report] of the parsers, and
look in module {!Error}. *)
type location = int * int
(** Line and column for parsing errors. Both numbers are one-based. *)
(** Error type and [to_string] function. *)
module Error :
sig
type t =
[ `Decoding_error of string * string
| `Bad_token of string * string * string
| `Unexpected_eoi of string
| `Bad_document of string
| `Unmatched_start_tag of string
| `Unmatched_end_tag of string
| `Bad_namespace of string
| `Misnested_tag of string * string * (string * string) list
| `Bad_content of string ]
(** Errors reported by the parsers. A few of these are also used by the
writers.
- [`Decoding_error (bytes, encoding)] is reported by the decoders in
module {! Encoding}. For example, if the UTF-8 decoder encounters a bare
[0xA0] byte, it will report [`Decoding_error ("\xA0", "utf-8")].
- [`Bad_token (token, where, s)] is reported when there is a "local"
problem with the syntax of the input stream, such as an invalid
character or a duplicate attribute. For example, if the XML parser
detects a [&] that is not part of an entity reference while reading an
attribute, it will report
[`Bad_token ("&", "attribute", "replace with '&'")]
- [`Unexpected_eoi where] is reported by the parsers when the input ends
before an item, such as a tag, element, or comment, is closed. [where]
describes the kind of item that wasn't closed.
- [`Bad_document s] is reported by the parsers when there is a problem
with the top-level structure of the document. For example, if you are
parsing an input stream as XML with [~context:`Document], and the parser
finds an element after the root element, it will report
[`Bad_document "not allowed after root element"].
- [`Unmatched_start_tag name] and [`Unmatched_end_tag name] are reported
when tags aren't properly balanced. Note that not all unbalanced tags
are parse errors in HTML.
- [`Bad_namespace s] is reported by parsers when the prefix [s] can't be
resolved to a namespace, and by the writers when the namespace [s] can't
be resolved to a prefix (or the default namespace).
- [`Misnested_tag (what, where, attributes)] is reported by the HTML
parser when a tag appears where it is not allowed. For example, if the
input has a [<body class="">] tag inside a [<p>] tag, the parser will
report [`Misnested_tag ("body", "p", [("class", "")])].
- [`Bad_content where] is reported by the HTML parser if an element has
content it is not allowed to have. For example, if there is stray text
at the top level of a [<table>] element, the parser will report
[`Bad_content "table"].
*)
val to_string : ?location:location -> t -> string
(** Converts an error to human-readable form. If [~location] is specified,
location information is prepended to the output. *)
end
(** {2 Encodings}
The parsers detect encodings automatically. If you need to specify an
encoding, use optional argument [?encoding] of the parsers, and look in
module {!Encoding}. *)
(** Common Internet encodings such as UTF-8 and UTF-16; also includes some less
popular encodings that are sometimes used for XML. *)
module Encoding :
sig
type t
(** Decoders. These are notionally maps from byte streams to Unicode scalar
value streams, i.e. pseudocode type [char stream -> int stream]. *)
val decode :
?report:(location -> Error.t -> unit) -> t ->
(char, 's) stream -> (int, 's) stream
(** Applies a decoder to a byte stream. Illegal input byte sequences result in
calls to the error handler [~report] with error kind [`Decoding_error].
The illegal bytes are then skipped, and zero or more U+FFFD replacement
characters are emitted. The default handler ignores errors.
The locations provided to the error handler by the built-in decoders below
in this module are fully accurate only if the input byte stream uses LF
characters as line breaks. *)
val utf_8 : t
val utf_16be : t
val utf_16le : t
val utf_16 : t
val iso_8859_1 : t
val us_ascii : t
val windows_1251 : t
val windows_1252 : t
val ucs_4be : t
val ucs_4le : t
val ucs_4be_transposed : t
val ucs_4le_transposed : t
val ebcdic : t
(** Code page 37. *)
end
(** {2 Signals} *)
type name = string * string
(** Expanded name: a namespace URI followed by a local name. *)
type xml_declaration =
{version : string;
encoding : string option;
standalone : bool option}
(** Representation of an XML declaration, i.e.
[<?xml version="1.0" encoding="utf-8"?>]. *)
type doctype =
{doctype_name : string option;
public_identifier : string option;
system_identifier : string option;
raw_text : string option;
force_quirks : bool}
(** Representation of a document type declaration. The HTML parser fills in all
fields besides [raw_text]. The XML parser reads declarations roughly, and
fills only the [raw_text] field with the text found in the declaration. *)
type signal =
[ `Start_element of name * (name * string) list
| `End_element
| `Text of string list
| `Doctype of doctype
| `Xml of xml_declaration
| `PI of string * string
| `Comment of string ]
(** Parsing signals. The parsers emit them according to the following grammar:
{[
doc ::= `Xml? misc* `Doctype? misc* element misc*
misc ::= `PI | `Comment
element ::= `Start_element content* `End_element
content ::= `Text | element | `PI | `Comment
]}
As a result, emitted [`Start_element] and [`End_element] signals are always
balanced, and, if there is an XML declaration, it is the first signal.
If parsing with [~context:`Document], the signal sequence will match the
[doc] production until the first error. If parsing with
[~context:`Fragment], it will match [content*]. If [~context] is not
specified, the parser will pick one of the two by examining the input.
As an example, if the XML parser is parsing
{[
<?xml version="1.0"?><root>text<nested>more text</nested></root>
]}
it will emit the signal sequence
{[
`Xml {version = "1.0"; encoding = None; standalone = None}
`Start_element (("", "root"), [])
`Text ["text"]
`Start_element (("", "nested"), [])
`Text ["more text"]
`End_element
`End_element
]}
The [`Text] signal carries a [string list] instead of a single [string]
because on 32-bit platforms, OCaml strings cannot be larger than 16MB. In
case the parsers encounter a very long sequence of text, one whose length
exceeds about [Sys.max_string_length / 2], they will emit a [`Text] signal
with several strings. *)
val signal_to_string : [< signal ] -> string
(** Provides a human-readable representation of signals for debugging. *)
(** {2 Parsers} *)
type 's parser
(** An ['s parser] is a thin wrapper around a [(signal, 's) stream] that
supports access to additional information that is not carried directly in
the stream, such as source locations. *)
val signals : 's parser -> (signal, 's) stream
(** Converts a parser to its underlying signal stream. *)
val location : _ parser -> location
(** Evaluates to the location of the last signal emitted on the parser's signal
stream. If no signals have yet been emitted, evaluates to [(1, 1)]. *)
(** {2 XML} *)
val parse_xml :
?report:(location -> Error.t -> unit) ->
?encoding:Encoding.t ->
?namespace:(string -> string option) ->
?entity:(string -> string option) ->
?context:[< `Document | `Fragment ] ->
(char, 's) stream -> 's parser
(** Creates a parser that converts an XML byte stream to a signal stream.
For simple usage, [string "foo" |> parse_xml |> signals].
If [~report] is provided, [report] is called for every error encountered.
You may raise an exception in [report], and it will propagate to the code
reading the signal stream.
If [~encoding] is {e not} specified, the parser detects the input encoding
automatically. Otherwise, the given encoding is used.
[~namespace] is called when the parser is unable to resolve a namespace
prefix. If it evaluates to [Some s], the parser maps the prefix to [s].
Otherwise, the parser reports [`Bad_namespace].
[~entity] is called when the parser is unable to resolve an entity
reference. If it evaluates to [Some s], the parser inserts [s] into the
text or attribute being parsed without any further parsing of [s]. [s] is
assumed to be encoded in UTF-8. If [entity] evaluates to [None] instead,
the parser reports [`Bad_token]. See {!xhtml_entity} if you are parsing
XHTML.
The meaning of [~context] is described at {! signal}, above. *)
val write_xml :
?report:((signal * int) -> Error.t -> unit) ->
?prefix:(string -> string option) ->
([< signal ], 's) stream -> (char, 's) stream
(** Converts an XML signal stream to a byte stream.
If [~report] is provided, it is called for every error encountered. The
first argument is a pair of the signal causing the error and its index in
the signal stream. You may raise an exception in [report], and it will
propagate to the code reading the byte stream.
[~prefix] is called when the writer is unable to find a prefix in scope
for a namespace URI. If it evaluates to [Some s], the writer uses [s] for
the URI. Otherwise, the writer reports [`Bad_namespace]. *)
(** {2 HTML} *)
val parse_html :
?report:(location -> Error.t -> unit) ->
?encoding:Encoding.t ->
?context:[< `Document | `Fragment of string ] ->
(char, 's) stream -> 's parser
(** Similar to {!parse_xml}, but parses HTML with embedded SVG and MathML, never
emits signals [`Xml] or [`PI], and [~context] has a different type on tag
[`Fragment].
For HTML fragments, you should specify the enclosing element, e.g.
[`Fragment "body"]. This is because, when parsing HTML, error recovery and
the interpretation of text depend on the current element. For example, the
text
{[
foo</bar>
]}
parses differently in [title] elements than in [p] elements. In the former,
it is parsed as [foo</bar>], while in the latter, it is [foo] followed by a
parse error due to unmatched tag [</bar>]. To get these behaviors, set
[~context] to [`Fragment "title"] and [`Fragment "p"], respectively.
If you use [`Fragment "svg"], the fragment is assumed to be SVG markup.
Likewise, [`Fragment "math"] causes the parser to parse MathML markup.
If [~context] is omitted, the parser guesses it from the input stream. For
example, if the first signal would be [`Doctype], the context is set to
[`Document], but if the first signal would be [`Start_element "td"], the
context is set to [`Fragment "tr"]. If the first signal would be
[`Start_element "g"], the context is set to [`Fragment "svg"]. *)
val write_html :
?escape_attribute:(string -> string) ->
?escape_text:(string -> string) ->
([< signal ], 's) stream -> (char, 's) stream
(** Similar to {!write_xml}, but emits HTML5 instead of XML.
If [~escape_attribute] and/or [~escape_text] are provided,
they are used instead of default escaping functions. *)
(** {2 Input sources} *)
val string : string -> (char, sync) stream
(** Evaluates to a stream that retrieves successive bytes from the given
string. *)
val buffer : Buffer.t -> (char, sync) stream
(** Evaluates to a stream that retrieves successive bytes from the given buffer.
Be careful of changing the buffer while it is being iterated by the
stream. *)
val channel : in_channel -> (char, sync) stream
(** Evaluates to a stream that retrieves bytes from the given channel. If the
channel cannot be read, the next read of the stream results in raising
[Sys_error].
Note that this input source is synchronous because [Pervasives.in_channel]
reads are blocking. For non-blocking channels, see {!Markup_lwt_unix}. *)
val file : string -> (char, sync) stream * (unit -> unit)
(** [file path] opens the file at [path], then evaluates to a pair [s, close],
where reading from stream [s] retrieves successive bytes from the file, and
calling [close ()] closes the file.
The file is closed automatically if [s] is read to completion, or if reading
[s] raises an exception. It is not necessary to call [close ()] in these
cases.
If the file cannot be opened, raises [Sys_error] immediately. If the file
cannot be read, reading the stream raises [Sys_error]. *)
val fn : (unit -> char option) -> (char, sync) stream
(** [fn f] is a stream that retrives bytes by calling [f ()]. If the call
results in [Some c], the stream emits [c]. If the call results in [None],
the stream is considered to have ended.
This is actually an alias for {!stream}, restricted to type [char]. *)
(** {2 Output destinations} *)
val to_string : (char, sync) stream -> string
(** Eagerly retrieves bytes from the given stream and assembles a string. *)
val to_buffer : (char, sync) stream -> Buffer.t
(** Eagerly retrieves bytes from the given stream and places them into a
buffer. *)
val to_channel : out_channel -> (char, sync) stream -> unit
(** Eagerly retrieves bytes from the given stream and writes them to the given
channel. If writing fails, raises [Sys_error]. *)
val to_file : string -> (char, sync) stream -> unit
(** Eagerly retrieves bytes from the given stream and writes them to the given
file. If writing fails, or the file cannot be opened, raises [Sys_error].
Note that the file is truncated (cleared) before writing. If you wish to
append to file, open it with the appropriate flags and use [to_channel] on
the resulting channel. *)
(** {2 Stream operations} *)
val stream : (unit -> 'a option) -> ('a, sync) stream
(** [stream f] creates a stream that repeatedly calls [f ()]. Each time [f ()]
evaluates to [Some v], the next item in the stream is [v]. The first time
[f ()] evaluates to [None], the stream ends. *)
val next : ('a, sync) stream -> 'a option
(** Retrieves the next item in the stream, if any, and removes it from the
stream. *)
val peek : ('a, sync) stream -> 'a option
(** Retrieves the next item in the stream, if any, but does not remove the item
from the stream. *)
val transform :
('a -> 'b -> 'c list * 'a option) -> 'a -> ('b, 's) stream -> ('c, 's) stream
(** [transform f init s] lazily creates a stream by repeatedly applying
[f acc v], where [acc] is an accumulator whose initial value is [init], and
[v] is consecutive values of [s]. Each time, [f acc v] evaluates to a pair
[(vs, maybe_acc')]. The values [vs] are added to the result stream. If
[maybe_acc'] is [Some acc'], the accumulator is set to [acc']. Otherwise, if
[maybe_acc'] is [None], the result stream ends. *)
val fold : ('a -> 'b -> 'a) -> 'a -> ('b, sync) stream -> 'a
(** [fold f init s] eagerly folds over the items [v], [v'], [v''], ... of [s],
i.e. evaluates [f (f (f init v) v') v'']... *)
val map : ('a -> 'b) -> ('a, 's) stream -> ('b, 's) stream
(** [map f s] lazily applies [f] to each item of [s], and produces the resulting
stream. *)
val filter : ('a -> bool) -> ('a, 's) stream -> ('a, 's) stream
(** [filter f s] is [s] without the items for which [f] evaluates to [false].
[filter] is lazy. *)
val filter_map : ('a -> 'b option) -> ('a, 's) stream -> ('b, 's) stream
(** [filter_map f s] lazily applies [f] to each item [v] of [s]. If [f v]
evaluates to [Some v'], the result stream has [v']. If [f v] evaluates to
[None], no item corresponding to [v] appears in the result stream. *)
val iter : ('a -> unit) -> ('a, sync) stream -> unit
(** [iter f s] eagerly applies [f] to each item of [s], i.e. evaluates
[f v; f v'; f v'']... *)
val drain : ('a, sync) stream -> unit
(** [drain s] eagerly consumes [s]. This is useful for observing side effects,
such as parsing errors, when you don't care about the parsing signals
themselves. It is equivalent to [iter ignore s]. *)
val of_list : 'a list -> ('a, sync) stream
(** Produces a (lazy) stream from the given list. *)
val to_list : ('a, sync) stream -> 'a list
(** Eagerly converts the given stream to a list. *)
(** {2 Utility} *)
val content : ([< signal ], 's) stream -> (signal, 's) stream
(** Filters out all signals besides [`Start_element], [`End_element], and
[`Text]. *)
val tree :
?text:(string list -> 'a) ->
?element:(name -> (name * string) list -> 'a list -> 'a) ->
?comment:(string -> 'a) ->
?pi:(string -> string -> 'a) ->
?xml:(xml_declaration -> 'a) ->
?doctype:(doctype -> 'a) ->
([< signal ], sync) stream -> 'a option
(** This function's type signature may look intimidating, but it is actually
easy to use. It is best introduced by example:
{[
type my_dom = Text of string | Element of name * my_dom list
"<p>HTML5 is <em>easy</em> to parse"
|> string
|> parse_html
|> signals
|> tree
~text:(fun ss -> Text (String.concat "" ss))
~element:(fun (_ns, name) _attrs children -> Element (name, children))
]}
results in the structure
{[
Element ("p" [
Text "HTML5 is ";
Element ("em", [Text "easy"]);
Text " to parse"])
]}
Formally, [tree] assembles a tree data structure of type ['a] from a signal
stream. The stream is parsed according to the following grammar:
{[
stream ::= node*
node ::= element | `Text | `Comment | `PI | `Xml | `Doctype
element ::= `Start_element node* `End_element
]}
Each time [tree] matches a production of [node], it calls the corresponding
function to convert the node into your tree type ['a]. For example, when
[tree] matches [`Text ss], it calls [~text ss], if [~text] is supplied.
Similarly, when [tree] matches [element], it calls
[~element name attributes children], if [~element] is supplied.
[tree] returns [None] when its input signal stream is empty. In terms of the
original input bytes, this can correspond to either an empty input, or a
non-empty input which the parser's error recovery completely discarded,
producing no signals.
See {!trees} if the input stream might have multiple top-level trees. This
function [tree] only retrieves the first one. *)
val trees :
?text:(string list -> 'a) ->
?element:(name -> (name * string) list -> 'a list -> 'a) ->
?comment:(string -> 'a) ->
?pi:(string -> string -> 'a) ->
?xml:(xml_declaration -> 'a) ->
?doctype:(doctype -> 'a) ->
([< signal ], 's) stream -> ('a, 's) stream
(** Like {!tree}, but converts all top-level trees, not only the first one. The
trees are emitted on the resulting stream, in the sequence that they appear
in the input. *)
type 'a node =
[ `Element of name * (name * string) list * 'a list
| `Text of string
| `Doctype of doctype
| `Xml of xml_declaration
| `PI of string * string
| `Comment of string ]
(** See {!from_tree} below. *)
val from_tree : ('a -> 'a node) -> 'a -> (signal, sync) stream
(** Deconstructs tree data structures of type ['a] into signal streams. The
function argument is applied to each data structure node. For example,
{[
type my_dom = Text of string | Element of string * my_dom list
let dom =
Element ("p", [
Text "HTML5 is ";
Element ("em", [Text "easy"]);
Text " to parse"])
dom |> from_tree (function
| Text s -> `Text s
| Element (name, children) -> `Element (("", name), [], children))
]}
results in the signal stream
{[
`Start_element (("", "p"), [])
`Text ["HTML5 is "]
`Start_element (("", "em"), [])
`Text ["easy"]
`End_element
`Text " to parse"
`End_element
]} *)
val elements :
(name -> (name * string) list -> bool) ->
([< signal ] as 'a, 's) stream ->
(('a, 's) stream, 's) stream
(** [elements f s] scans the signal stream [s] for
[`Start_element (name, attributes)] signals that satisfy
[f name attributes]. Each such matching signal is the beginning of a
substream that ends with the corresponding [`End_element] signal. The result
of [elements f s] is the stream of these substreams.
Matches don't nest. If there is a matching element contained in another
matching element, only the top one results in a substream.
Code using [elements] does not have to read each substream to completion, or
at all. However, once the using code has tried to get the next substream, it
should not try to read a previous one. *)
val text : ([< signal ], 's) stream -> (char, 's) stream
(** Extracts all the text in a signal stream by discarding all markup. For each
[`Text ss] signal, the result stream has the bytes of the strings [ss], and
all other signals are ignored. *)
val trim : (signal, 's) stream -> (signal, 's) stream
(** Trims insignificant whitespace in an HTML signal stream. Whitespace around
flow ("block") content does not matter, but whitespace in phrasing
("inline") content does. So, if the input stream is
{[
<div>
<p>
<em>foo</em> bar
</p>
</div>
]}
passing it through [Markup.trim] will result in
{[
<div><p><em>foo</em> bar</p></div>
]}
Note that whitespace around the [</em>] tag was preserved. *)
val normalize_text :
([> `Text of string list ] as 'a, 's) stream -> ('a, 's) stream
(** Concatenates adjacent [`Text] signals, then eliminates all empty strings,
then all [`Text []] signals. Signals besides [`Text] are unaffected. Note
that signal streams emitted by the parsers already have normalized text.
This function is useful when you are inserting text into a signal stream
after parsing, or generating streams from scratch, and would like to clean
up the [`Text] signals. *)
val pretty_print : (signal, 's) stream -> (signal, 's) stream
(** Adjusts the whitespace in the [`Text] signals in the given stream so that
the output appears nicely-indented when the stream is converted to bytes and
written.
This function is aware of the significance of whitespace in HTML, so it
avoids changing the whitespace in phrasing ("inline") content. For example,
pretty printing
{[
<div><p><em>foo</em>bar</p></div>
]}
results in
{[
<div>
<p>
<em>foo</em>bar
</p>
</div>
]}
Note that no whitespace was inserted around [<em>] and [</em>], because
doing so would create a word break that wasn't present in the original
stream. *)
val html5 : ([< signal ], 's) stream -> (signal, 's) stream
(** Converts a signal stream into an HTML5 signal stream by stripping any
document type declarations, XML declarations, and processing instructions,
and prefixing the HTML5 doctype declaration. This is useful when converting
between XHTML and HTML. *)
val xhtml :
?dtd:[< `Strict_1_0 | `Transitional_1_0 | `Frameset_1_0 | `Strict_1_1 ] ->
([< signal ], 's) stream -> (signal, 's) stream
(** Similar to {!html5}, but does not strip processing instructions, and
prefixes an XHTML document type declaration and an XML declaration. The
[~dtd] argument specifies which DTD to refer to in the doctype declaration.
The default is [`Strict_1_1]. *)
val xhtml_entity : string -> string option
(** Translates XHTML entities. This function is for use with the [~entity]
argument of {!parse_xml} when parsing XHTML. *)
val strings_to_bytes : (string, 's) stream -> (char, 's) stream
(** [strings_to_bytes s] is the stream of all the bytes of all strings in
[s]. *)
val compare_locations : location -> location -> int
(** Orders locations according to their appearance in an input stream, i.e.
first by line, and then, for locations on the same line, by column. *)
(** {2 Namespaces} *)
(** Common namespace URIs. *)
module Ns :
sig
val html : string
(** [http://www.w3.org/1999/xhtml]. Use for HTML and XHTML. *)
val svg : string
(** [http://www.w3.org/2000/svg]. *)
val mathml : string
(** [http://www.w3.org/1998/Math/MathML]. *)
val xml : string
(** [http://www.w3.org/XML/1998/namespace]. *)
val xmlns : string
(** [http://www.w3.org/2000/xmlns/]. *)
val xlink : string
(** [http://www.w3.org/1999/xlink]. *)
end
(** {2 Asynchronous interface} *)
(**/**)
module type IO =
sig
type 'a t
val return : 'a -> 'a t
val of_cps : ((exn -> unit) -> ('a -> unit) -> unit) -> 'a t
val to_cps : (unit -> 'a t) -> ((exn -> unit) -> ('a -> unit) -> unit)
end
(**/**)
(** Markup.ml interface for monadic I/O libraries such as Lwt and Async.
This signature is implemented by {!Markup_lwt}, with a few additions.
Each function here corresponds directly to the function in the basic module
{!Markup} that has the same name. So, see {!Markup} for details.
The only difference is that functions here, all of which are higher-order
functions, take a function as argument that returns an ['a io] promise,
rather than returning an already-computed value. *)
module type ASYNCHRONOUS =
sig
(** {2 Promises} *)
type 'a io
(** Promise type. Replaced by ['a Lwt.t] in {!Markup_lwt}. *)
(** {2 Encodings} *)
(** Asynchronous counterpart to {!Markup.Encoding}. *)
module Encoding :
sig
(**/**)
type t = Encoding.t
(**/**)
val decode :
?report:(location -> Error.t -> unit io) -> Encoding.t ->
(char, _) stream -> (int, async) stream
end
(** {2 XML} *)
val parse_xml :
?report:(location -> Error.t -> unit io) ->
?encoding:Encoding.t ->
?namespace:(string -> string option) ->
?entity:(string -> string option) ->
?context:[< `Document | `Fragment ] ->
(char, _) stream -> async parser
val write_xml :
?report:((signal * int) -> Error.t -> unit io) ->
?prefix:(string -> string option) ->
([< signal ], _) stream -> (char, async) stream
(** {2 HTML} *)
val parse_html :
?report:(location -> Error.t -> unit io) ->
?encoding:Encoding.t ->
?context:[< `Document | `Fragment of string ] ->
(char, _) stream -> async parser
val write_html :
?escape_attribute:(string -> string) ->
?escape_text:(string -> string) ->
([< signal ], _) stream -> (char, async) stream
(** {2 I/O} *)
val fn : (unit -> char option io) -> (char, async) stream
val to_string : (char, _) stream -> string io
val to_buffer : (char, _) stream -> Buffer.t io
(** {2 Stream manipulation} *)
val stream : (unit -> 'a option io) -> ('a, async) stream
val next : ('a, _) stream -> 'a option io
val peek : ('a, _) stream -> 'a option io
val transform :
('a -> 'b -> ('c list * 'a option) io) -> 'a -> ('b, _) stream ->
('c, async) stream
val fold : ('a -> 'b -> 'a io) -> 'a -> ('b, _) stream -> 'a io
val map : ('a -> 'b io) -> ('a, _) stream -> ('b, async) stream
val filter : ('a -> bool io) -> ('a, _) stream -> ('a, async) stream
val filter_map : ('a -> 'b option io) -> ('a, _) stream -> ('b, async) stream
val iter : ('a -> unit io) -> ('a, _) stream -> unit io
val drain : ('a, _) stream -> unit io
val to_list : ('a, _) stream -> 'a list io
val load : ('a, _) stream -> ('a, sync) stream io
(** [load s] converts a general stream [s] to a synchronous stream by
buffering it. *)
(** {2 Utility} *)
val tree :
?text:(string list -> 'a) ->
?element:(name -> (name * string) list -> 'a list -> 'a) ->
?comment:(string -> 'a) ->
?pi:(string -> string -> 'a) ->
?xml:(xml_declaration -> 'a) ->
?doctype:(doctype -> 'a) ->
([< signal ], _) stream -> 'a option io
end
(**/**)
module Asynchronous (IO : IO) : ASYNCHRONOUS with type 'a io := 'a IO.t
val kstream : ('a, _) stream -> 'a Kstream.t
val of_kstream : 'a Kstream.t -> ('a, _) stream
val preprocess_input_stream :
(int, 's) stream -> (location * int, 's) stream * (unit -> location)
(**/**)
(** {2 Conformance status}
The HTML parser seeks to implement
{{:https://www.w3.org/TR/html5/syntax.html} section 8 of the HTML5
specification}. That section describes a parser, part of a full-blown user
agent, that is building up a DOM representation of an HTML document.
Markup.ml is neither inherently part of a user agent, nor does it build up a
DOM representation. With respect to section 8 of HTML5, Markup.ml is
concerned with only the syntax. When that section requires that the user
agent perform an action, Markup.ml emits enough information for a
hypothetical user agent based on it to be able to decide to perform this
action. Likewise, Markup.ml seeks to emit enough information for a
hypothetical user agent to build up a conforming DOM.
The XML parser seeks to be a non-validating implementation of the
{{:https://www.w3.org/TR/xml/} XML} and {{:https://www.w3.org/TR/xml-names/}
Namespaces in XML} specifications.
This rest of this section lists known deviations from HTML5, XML, and
Namespaces in XML. Some of these deviations are meant to be corrected in
future versions of Markup.ml, while others will probably remain. The latter
satisfy some or all of the following properties:
- They require non-local adjustment, especially of past nodes. For example,
adjusting the start signal of the root node mid-way through the signal
stream is difficult for a one-pass parser.
- They are minor. Users implementing less than a conforming browser
typically don't care about them. They typically have to do with obscure
error recovery. There are no deviations affecting the parsing of
well-formed input.
- They can easily be corrected by code written over Markup.ml that builds up
a DOM or maintains other auxiliary data structures during parsing.
{3 To be corrected:}
- XML: There is no attribute value normalization.
- HTML: {e foster parenting} is not implemented, because it requires
non-local adjustments.
- HTML: Quirks mode is not honored. This affects the interaction between
automatic closing of [p] elements and opening of [table] elements.
- HTML: The parser has non-standard recovery from unmatched closing [form]
tags in {{: https://github.com/aantron/markup.ml/commit/0bf4f1b} some
situations}.
- HTML: The parser ignores interactions between [form] and [template].
- HTML: The form translation for [isindex] is completely ignored. [isindex]
is handled as an unknown element.
{3 To remain:}
- HTML: Except when detecting encodings, the parser does not try to read
[<meta>] tags for encoding declarations. The user of Markup.ml should read
these, if necessary. They are part of the emitted signal stream.
- HTML: [noscript] elements are always parsed, as are [script] elements. For
conforming behavior, if the user of Markup.ml "supports scripts," the user
should serialize the content of [noscript] to a [`Text] signal using
[write_html].
- HTML: Elements such as [title] that belong in [head], but are found
between [head] and [body], are not moved into [head].
- HTML: [<html>] tags found in the body do not have their attributes added
to the [`Start_element "html"] signal emitted at the beginning of the
document. *)