-
Notifications
You must be signed in to change notification settings - Fork 129
/
Copy pathkill_xml.js
52 lines (47 loc) · 1.98 KB
/
kill_xml.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
//okay, i know you're not supposed to regex html, but...
//https://en.wikipedia.org/wiki/Help:HTML_in_wikitext
//these are tags whose whole element (opening tag, contents, closing tag) we throw away.
//a tag nested inside the *same* tag will still mess this up, but that's not usual.
const ignore = [
  'table',
  'code',
  'score',
  'data',
  'categorytree',
  'charinsert',
  'hiero',
  'imagemap',
  'inputbox',
  'references',
  'source',
  'syntaxhighlight',
  'timeline',
  'maplink',
]
const tagNames = ignore.join('|')
//opening tag: one of the ignored names, then up to ~200 chars of attributes.
// - `(?![\w-])` keeps look-alike names (<codex>, <data-foo>) from counting as <code>/<data>
// - the attributes may not end in '/', so a self-closing tag (<references />) never opens a match;
//   it has no closing partner, and would otherwise swallow real text up to some later closing tag
const openTag = `< ?(${tagNames})(?![\\w-]) ?(?:[^>]{0,199}[^/>])?>`
//closing tag: must name the same tag that opened the match (back-reference to capture group 1)
const closeTag = `< ?/ ?\\1 ?>`
const anyChar = '\\s\\S' //including newline
//lazy body, so each element ends at its first matching closing tag; `*?` also catches empty elements
const noThanks = new RegExp(`${openTag}[${anyChar}]*?${closeTag}`, 'gi')
/**
 * Remove or convert html/xml-style markup found inside wikitext.
 *
 * The rules run strictly in the order listed; each one sees the output of the
 * previous rule, so re-ordering them changes the result.
 *
 * @param {string} wiki - wikitext to clean
 * @returns {string} the cleaned wikitext, trimmed of leading/trailing whitespace
 */
const kill_xml = function (wiki) {
  //ordered [pattern, replacement] pairs
  const rules = [
    //whole elements we trash completely, contents included.
    //(<ref> tags are parsed in Section class) - luckily, refs can't be recursive.
    [noThanks, ' '],
    //attribute-carrying opening tags we can also drop, like <div class="x"> or <span style="a:b"/>
    [/ ?< ?(span|div|table|data) [a-zA-Z0-9=%.\-#:;'" ]{2,100}\/? ?> ?/g, ' '],
    //ref tags are only dropped here when they are self-closing, like <ref name="asd"/>
    [/ ?< ?(ref) [a-zA-Z0-9=" ]{2,100}\/ ?> ?/g, ' '],
    //html formatting that has a wikitext equivalent
    [/<i>(.*?)<\/i>/g, `''$1''`],
    [/<b>(.*?)<\/b>/g, `'''$1'''`],
    //these are better expressed as templates
    [/<sub>(.*?)<\/sub>/g, `{{sub|$1}}`],
    [/<sup>(.*?)<\/sup>/g, `{{sup|$1}}`],
    [/<blockquote>(.*?)<\/blockquote>/g, `{{blockquote|text=$1}}`],
    //bare formatting tags: drop the tag itself, keep whatever is inside - <sub>, </sub>
    //(this also catches <br> and <br/>, which become a space)
    [/ ?<[ /]?(p|sub|sup|span|nowiki|div|table|br|tr|td|th|pre|pre2|hr|u)[ /]?> ?/g, ' '],
    //<abbr>, </abbr>
    [/ ?<[ /]?(abbr|bdi|bdo|cite|del|dfn|em|ins|kbd|mark|q|s|small)[ /]?> ?/g, ' '],
    //<h2>, </h2>
    [/ ?<[ /]?h[0-9][ /]?> ?/g, ' '],
    //what is left of the line-breaks - the spaced form <br /> - becomes a newline
    [/ ?< ?br ?\/> ?/g, '\n'],
  ]
  let text = wiki
  for (const [pattern, replacement] of rules) {
    text = text.replace(pattern, replacement)
  }
  return text.trim()
}
export default kill_xml