-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathdoc.go
221 lines (199 loc) · 4.28 KB
/
doc.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
package doc2txt
import (
"bytes"
"encoding/binary"
"errors"
"io"
"github.com/mattetti/filebuffer"
"github.com/richardlehane/mscfb"
)
// Sentinel errors shared by the .doc parsing steps. Callers in this file
// pass them through wrapError before returning them to the user.
var (
	// Problems with the WordDocument stream.
	errDocEmpty = errors.New("WordDocument not found")
	errDocShort = errors.New("wordDoc block too short")

	// Problems with the table stream or with the table/fib arguments.
	errTable           = errors.New("cannot find table stream")
	errInvalidArgument = errors.New("invalid table and/or fib")
)
// allReader bundles every access pattern this package needs from an
// in-memory copy of the input: sequential reads with seeking, random-access
// reads at an offset, and an explicit Close.
type allReader interface {
	io.ReadSeeker
	io.ReaderAt
	io.Closer
}
// processingError prefixes an underlying failure with the package's
// "Error processing file: " text while keeping the original error reachable
// through Unwrap, so callers can still match it with errors.Is / errors.As.
type processingError struct {
	cause error
}

// Error returns the prefixed message.
func (p *processingError) Error() string {
	return "Error processing file: " + p.cause.Error()
}

// Unwrap exposes the wrapped error to the errors package helpers.
func (p *processingError) Unwrap() error {
	return p.cause
}

// wrapError decorates e with the "Error processing file: " prefix.
//
// The returned error's message is exactly the prefix followed by e.Error(),
// and the original error stays in the chain. A nil e yields nil instead of
// panicking on e.Error().
func wrapError(e error) error {
	if e == nil {
		return nil
	}
	return &processingError{cause: e}
}
// ParseDoc reads a Microsoft Word .doc binary file from r and returns a
// reader (actually a *bytes.Buffer) that yields the plain text found in it.
//
// If r does not implement io.ReaderAt, its whole content is first copied
// into an in-memory buffer that provides random access; that buffer is
// closed when ParseDoc returns.
//
// Failures while opening the compound file, reading the FIB, selecting the
// table stream, or reading the CLX are returned through wrapError. An error
// from getText is returned as-is.
func ParseDoc(r io.Reader) (io.Reader, error) {
	ra, ok := r.(io.ReaderAt)
	if !ok {
		// Use a distinct name and assign to the outer ra: declaring ra with
		// := here would shadow it and leave the outer ra nil for mscfb.New.
		mem, _, err := toMemoryBuffer(r)
		if err != nil {
			return nil, wrapError(err)
		}
		defer mem.Close()
		ra = mem
	}

	d, err := mscfb.New(ra)
	if err != nil {
		return nil, wrapError(err)
	}

	wordDoc, table0, table1 := getWordDocAndTables(d)
	fib, err := getFib(wordDoc)
	if err != nil {
		return nil, wrapError(err)
	}

	table := getActiveTable(table0, table1, fib)
	if table == nil {
		return nil, wrapError(errTable)
	}

	clx, err := getClx(table, fib)
	if err != nil {
		return nil, wrapError(err)
	}

	return getText(wordDoc, clx)
}
// toMemoryBuffer drains r completely and hands the bytes to filebuffer.New,
// returning the resulting buffer together with the number of bytes read.
// If reading from r fails, it returns (nil, 0, err).
func toMemoryBuffer(r io.Reader) (allReader, int64, error) {
	content := new(bytes.Buffer)
	n, readErr := content.ReadFrom(r)
	if readErr != nil {
		return nil, 0, readErr
	}
	return filebuffer.New(content.Bytes()), n, nil
}
// errPieceTable reports a piece table whose character positions are missing
// or not in ascending order.
var errPieceTable = errors.New("invalid piece table")

// getText walks the piece table in clx, reads each piece's bytes from the
// wordDoc stream, and appends the translated text to a buffer that is
// returned as an io.Reader.
//
// For piece i the character count is aCP[i+1] - aCP[i]. A compressed piece
// holds one byte per character starting at fc/2; an uncompressed piece holds
// two bytes per character starting at fc.
//
// It returns errPieceTable when aCP has no entry for i+1 or the positions
// decrease, and otherwise any read error from wordDoc.
func getText(wordDoc *mscfb.File, clx *clx) (io.Reader, error) {
	var buf bytes.Buffer
	plc := clx.pcdt.PlcPcd
	for i := range plc.aPcd {
		// Guard the aCP[i+1] lookup and the allocation below so a malformed
		// file yields an error instead of a panic.
		if i+1 >= len(plc.aCP) {
			return nil, errPieceTable
		}
		count := plc.aCP[i+1] - plc.aCP[i]
		if count < 0 {
			return nil, errPieceTable
		}
		if count == 0 {
			continue // empty piece: nothing to read or translate
		}

		pcd := plc.aPcd[i]
		// The fc/2 halving for compressed text shows fc is the raw field
		// value, so 16-bit text starts at fc itself, not at fc/2.
		offset, size := pcd.fc.fc, 2*count
		if pcd.fc.fCompressed {
			offset, size = pcd.fc.fc/2, count
		}

		b := make([]byte, size)
		n, err := wordDoc.ReadAt(b, int64(offset)) // read all the characters
		// io.ReaderAt may report io.EOF together with a complete read when
		// the data ends exactly at the end of the stream; that is success.
		if err != nil && !(err == io.EOF && n == len(b)) {
			return nil, err
		}
		translateText(b, &buf, pcd.fc.fCompressed)
	}
	return &buf, nil
}
// translateText appends the visible text contained in b to buf.
//
// When fCompressed is true, b holds one byte per character and each kept
// byte is written through replaceCompressed. Otherwise b holds 16-bit
// little-endian code units; each kept character is written to buf as UTF-8,
// with surrogate pairs combined into a single rune. A trailing odd byte in
// uncompressed input is ignored.
//
// In both modes the same characters are filtered:
//   - 0x13 starts a field; everything up to the next 0x14 or 0x15 is dropped
//     (section 2.8.25), as are the three marker characters themselves;
//   - 0x07 (table column separator) becomes a single space;
//   - other characters below 32, except tab, LF and CR, are dropped.
func translateText(b []byte, buf *bytes.Buffer, fCompressed bool) {
	step := 2
	if fCompressed {
		step = 1
	}

	inFieldCode := false
	for i := 0; i+step <= len(b); i += step {
		c := rune(b[i])
		if !fCompressed {
			c |= rune(b[i+1]) << 8
		}

		// Handle special field characters (section 2.8.25) and control codes.
		switch {
		case c == 0x13:
			inFieldCode = true
			continue
		case c == 0x14 || c == 0x15:
			inFieldCode = false
			continue
		case inFieldCode:
			continue
		case c == 7: // table column separator
			buf.WriteByte(' ')
			continue
		case c < 32 && c != 9 && c != 10 && c != 13: // skip non-printable ASCII characters
			continue
		}

		if fCompressed { // compressed, so replace compressed characters
			buf.Write(replaceCompressed(b[i]))
			continue
		}

		// Combine a high surrogate with the low surrogate that follows it.
		if c >= 0xD800 && c < 0xDC00 && i+2*step <= len(b) {
			low := rune(b[i+2]) | rune(b[i+3])<<8
			if low >= 0xDC00 && low < 0xE000 {
				c = 0x10000 + (c-0xD800)<<10 + (low - 0xDC00)
				i += step // the low surrogate has been consumed
			}
		}
		buf.WriteRune(c)
	}
}
// compressedToUnicode lists the single-byte values of compressed text that
// stand for a different 16-bit code point.
var compressedToUnicode = map[byte]uint16{
	0x82: 0x201A,
	0x83: 0x0192,
	0x84: 0x201E,
	0x85: 0x2026,
	0x86: 0x2020,
	0x87: 0x2021,
	0x88: 0x02C6,
	0x89: 0x2030,
	0x8A: 0x0160,
	0x8B: 0x2039,
	0x8C: 0x0152,
	0x91: 0x2018,
	0x92: 0x2019,
	0x93: 0x201C,
	0x94: 0x201D,
	0x95: 0x2022,
	0x96: 0x2013,
	0x97: 0x2014,
	0x98: 0x02DC,
	0x99: 0x2122,
	0x9A: 0x0161,
	0x9B: 0x203A,
	0x9C: 0x0153,
	0x9F: 0x0178,
}

// replaceCompressed returns the bytes to emit for one character of
// compressed text.
//
// A byte listed in compressedToUnicode is replaced by its 16-bit code point
// written as two bytes in little-endian order; every other byte is returned
// unchanged as a one-element slice.
//
// NOTE(review): the two-byte form is UTF-16LE rather than UTF-8, so output
// that mixes it with single bytes is not one consistent encoding — confirm
// what consumers of the text expect.
func replaceCompressed(char byte) []byte {
	code, mapped := compressedToUnicode[char]
	if !mapped {
		return []byte{char}
	}
	encoded := make([]byte, 2)
	binary.LittleEndian.PutUint16(encoded, code)
	return encoded
}
// getWordDocAndTables scans every entry in r.File and returns, in order, the
// streams named "WordDocument", "0Table" and "1Table". A stream that is not
// present is returned as nil; if a name occurs more than once the last
// occurrence wins.
func getWordDocAndTables(r *mscfb.Reader) (*mscfb.File, *mscfb.File, *mscfb.File) {
	var doc, tbl0, tbl1 *mscfb.File
	for _, entry := range r.File {
		if entry.Name == "WordDocument" {
			doc = entry
		} else if entry.Name == "0Table" {
			tbl0 = entry
		} else if entry.Name == "1Table" {
			tbl1 = entry
		}
	}
	return doc, tbl0, tbl1
}
// getActiveTable picks the table stream selected by the FIB: table0 when
// f.base.fWhichTblStm is 0, table1 for any other value. The chosen stream is
// returned as given, so the result is nil if that stream was not found.
func getActiveTable(table0 *mscfb.File, table1 *mscfb.File, f *fib) *mscfb.File {
	if f.base.fWhichTblStm != 0 {
		return table1
	}
	return table0
}