-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert_html.py
116 lines (95 loc) · 2.71 KB
/
convert_html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
'''
Convert standard spelling to American Spelling
usage:
python convert_html.py input.html output.html
where input.html is the original in standard spelling,
and output.html is in reform spelling.
Create input.html by saving a page from your web
browser. View output.html by opening that file
from disk in your web browser.
In addition, this script must have access to the
file DIAMBG, a dictionary of American (reform)
spelling, in the run directory.
Mark Petersen
August 2016
'''
import numpy as np
import string as str
import sys
################################################
#
# Read and parse dictionary
#
################################################

# Name of the American (reform) spelling dictionary file; it is looked
# up relative to the directory the script is run from.
DICTIONARY_PATH = 'DIAMBG'


def load_dictionary(path=DICTIONARY_PATH):
    """Read the reform-spelling dictionary file and return it as a dict.

    The file is read as a whitespace-separated stream of tokens that
    alternate between a standard-spelling word and its reform version:
    token 0 maps to token 1, token 2 maps to token 3, and so on.

    Args:
        path: Location of the dictionary file. Defaults to 'DIAMBG'.

    Returns:
        A dict relating each standard word to its reform version. If a
        standard word occurs more than once, the last entry wins. A
        trailing token without a partner is ignored.
    """
    with open(path, 'r') as dictionary_file:
        tokens = dictionary_file.read().split()
    # Even positions are standard words, odd positions are reform words.
    # zip() stops at the shorter slice, which drops an unpaired last token.
    return dict(zip(tokens[0::2], tokens[1::2]))


################################################
#
# Translate text
#
################################################

def match_case(original, replacement):
    """Return `replacement` with the capitalization style of `original`.

    A title-case original (e.g. "Colour") yields a title-case result, an
    all-upper-case original yields an upper-case result, and anything
    else (lower case or mixed case) yields `replacement` unchanged.
    """
    if original.istitle():
        return replacement.title()
    if original.isupper():
        return replacement.upper()
    return replacement


def translate_html(text, standard_to_reform):
    """Translate the words of an html string to reform spelling.

    The text is scanned left to right and split into three kinds of
    pieces:
      * html declarations, from '<' up to and including the next '>',
        which are copied verbatim;
      * words, i.e. maximal runs of alphabetic characters, which are
        looked up in lower case in `standard_to_reform` and replaced
        when found, keeping the capitalization of the original;
      * any other single character, which is copied verbatim.

    NOTE: only '<'...'>' spans are protected. Text between tags (for
    example the body of a script or style element) and the letters of
    character entities such as "&nbsp;" are treated as ordinary words.

    Args:
        text: The html document as a string.
        standard_to_reform: dict from lower-case standard word to
            reform word.

    Returns:
        The translated document as a string.
    """
    pieces = []
    length = len(text)
    pos = 0
    while pos < length:
        start = pos
        char = text[pos]
        if char == '<':
            # Take the declaration verbatim. If the closing '>' is
            # missing, take the rest of the text instead of running
            # past the end of the string.
            close = text.find('>', pos)
            pos = length if close == -1 else close + 1
            pieces.append(text[start:pos])
        elif char.isalpha():
            # Advance to the end of the word; the bounds check handles
            # a word that is the very last thing in the text.
            while pos < length and text[pos].isalpha():
                pos += 1
            original = text[start:pos]
            reform = standard_to_reform.get(original.lower())
            if reform is None:
                # Word is not in the dictionary: take it verbatim.
                pieces.append(original)
            else:
                pieces.append(match_case(original, reform))
        else:
            # Not a letter and not a declaration: take it directly.
            pieces.append(char)
            pos += 1
    return ''.join(pieces)


################################################
#
# Read input, translate, and save as new file
#
################################################

def main(argv=None):
    """Run the converter: `convert_html.py input.html output.html`.

    Args:
        argv: Argument list in the layout of sys.argv (program name,
            input path, output path). Defaults to sys.argv.

    Exits with an error message and a non-zero status when fewer than
    two file names are given.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        # A string argument is printed to stderr and sets exit status 1.
        sys.exit("input and output files required")

    standard_to_reform = load_dictionary()

    with open(argv[1], 'r') as input_file:
        text = input_file.read()

    translated = translate_html(text, standard_to_reform)

    with open(argv[2], 'w') as output_file:
        output_file.write(translated)


if __name__ == "__main__":
    main()