-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess_edict.sh
46 lines (30 loc) · 1.35 KB
/
preprocess_edict.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/bin/bash
#
# Preprocessing steps for the EDICT corpus files under CORPUS_DIR.
#
# Usage: preprocess_edict.sh [STEP...]
#   extract   sentence extraction, MeCab tokenisation, stop-word removal
#   bpe       apply the shared BPE codes to the tokenised files
#
# Steps run in the order given. With no STEP only the timestamp is printed;
# every step is opt-in, so finished steps do not have to be disabled by
# wrapping them in a quoted string (the shell still expands ${...} and would
# execute any $(...) inside such a string).
#
# The helper scripts (sentExt_edict.py, han2zen.py, split.py,
# delete_stopword.py, ./bpe/apply_bpe.py) are addressed relative to the
# current directory, so run this script from the directory containing them.

# Abort on the first failing command, on unset variables, and when any stage
# of a pipeline fails, so a broken stage cannot silently produce empty output.
set -euo pipefail

readonly CORPUS_DIR=/home/takebayashi/src/corpus/EDICT
readonly PREP_DIR=/home/takebayashi/src/Preprocess

# NOTE(review): the four settings below are not referenced anywhere in this
# script; they are kept unchanged in case they are still wanted.
readonly ORIG_CORPUS="${CORPUS_DIR}/IWSLT17.TED"
readonly SLAN=ja
readonly TLAN=en
readonly YEAR=2010

# Number of BPE merge operations; also part of the code file name.
readonly N_OP=32000
readonly CODE_FILE="${CORPUS_DIR}/shared_bpe${N_OP}.code"

#######################################
# Extract sentence pairs from edict.txt, tokenise the Japanese side and
# remove stop words.
# Globals:   CORPUS_DIR, PREP_DIR (read)
# Arguments: none
# Outputs:   edict.ext.{ja,en} (paths handed to split.py), edict.wakati.ja,
#            and edict.tok.{ja,en} (paths handed to delete_stopword.py),
#            all under CORPUS_DIR
# Returns:   non-zero if any stage fails
#######################################
extract_sentences() {
  # The three Python filters read stdin; split.py receives the two output
  # paths (presumably the ja and en sides - confirm against split.py).
  python sentExt_edict.py "${PREP_DIR}/stopword.en" \
    < "${CORPUS_DIR}/edict.txt" \
    | python3 han2zen.py \
    | python split.py "${CORPUS_DIR}/edict.ext.ja" "${CORPUS_DIR}/edict.ext.en"

  # -Owakati makes MeCab emit space-separated tokens.
  mecab -Owakati \
    < "${CORPUS_DIR}/edict.ext.ja" \
    > "${CORPUS_DIR}/edict.wakati.ja"

  python delete_stopword.py "${PREP_DIR}/stopword.ja" \
    "${CORPUS_DIR}/edict.wakati.ja" "${CORPUS_DIR}/edict.ext.en" \
    "${CORPUS_DIR}/edict.tok.ja" "${CORPUS_DIR}/edict.tok.en"
}

#######################################
# Apply the BPE codes in CODE_FILE to both tokenised files.
# Globals:   CORPUS_DIR, CODE_FILE (read)
# Arguments: none
# Outputs:   ${CORPUS_DIR}/edict.bpe.ja and ${CORPUS_DIR}/edict.bpe.en
# Returns:   non-zero if apply_bpe.py fails
#######################################
apply_bpe() {
  # Learning the codes and dumping the vocabularies were already commented
  # out in the original script; kept here for reference.
  #cat ${CORPUS_DIR}/edict.tok.ja ${CORPUS_DIR}/edict.tok.en | python ./bpe/learn_bpe.py -s ${N_OP} -o ${CODE_FILE}
  #python3 ./bpe/apply_bpe.py -c ${CODE_FILE} < ${CORPUS_DIR}/edict.tok.ja | python3 ./bpe/get_vocab.py > ${CORPUS_DIR}/edict.tok.vocab.ja
  #python3 ./bpe/apply_bpe.py -c ${CODE_FILE} < ${CORPUS_DIR}/edict.tok.en | python3 ./bpe/get_vocab.py > ${CORPUS_DIR}/edict.tok.vocab.en
  python3 ./bpe/apply_bpe.py -c "${CODE_FILE}" \
    < "${CORPUS_DIR}/edict.tok.ja" \
    > "${CORPUS_DIR}/edict.bpe.ja"
  python3 ./bpe/apply_bpe.py -c "${CODE_FILE}" \
    < "${CORPUS_DIR}/edict.tok.en" \
    > "${CORPUS_DIR}/edict.bpe.en"
}

#######################################
# Print the timestamp, then run the requested steps in order.
# Arguments: zero or more step names (extract, bpe)
# Outputs:   the current date on stdout; a usage message on stderr for an
#            unknown step name
# Returns:   0 on success, 2 on an unknown step name
#######################################
main() {
  local step

  date

  for step in "$@"; do
    case "${step}" in
      extract)
        extract_sentences
        ;;
      bpe)
        apply_bpe
        ;;
      *)
        printf 'unknown step: %s\nusage: %s [extract] [bpe]\n' \
          "${step}" "${0##*/}" >&2
        return 2
        ;;
    esac
  done
}

main "$@"

# TODO: create j2e dictionary (never wired up in the original):
# | python create_j2e-dict_edict.py