-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild_parallel.sh
executable file
·110 lines (83 loc) · 2.76 KB
/
build_parallel.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/bin/bash
#
# Take every available source text, then:
#  - build & tokenise the parallel Danish / Bornholmsk corpora and split them
#    into training, validation and test sets
#  - build clean and also noisy tokenised Bornholmsk data
# Danish side of the parallel corpus: the full set and its three splits.
DA_ALL=parallel.da.all
DA_TRAIN=parallel.da.train
DA_VAL=parallel.da.val
DA_TEST=parallel.da.test
# Bornholmsk side of the parallel corpus, same layout.
DB_ALL=parallel.da-bornholm.all
DB_TRAIN=parallel.da-bornholm.train
DB_VAL=parallel.da-bornholm.val
DB_TEST=parallel.da-bornholm.test
# Monolingual Bornholmsk text, clean and noisy variants.
DB_CLEAN=da-bornholm.clean.txt
DB_NOISY=da-bornholm.noisy.txt
# Every later step appends to these four files, so start each one empty.
for out in "$DA_ALL" "$DB_ALL" "$DB_CLEAN" "$DB_NOISY"; do
  : > "$out"
done
# kuhre
# Align every Kuhre chapter and append three variants of it to the parallel
# corpora: plain, normalised and lower-cased normalised Danish, each paired
# with the plain Bornholmsk text (lower-cased for the last variant).
echo "Converting Kuhre"
# Stop if resources/ is missing: otherwise the `cd ..` below would leave the
# script one directory too high and every ../ path would point elsewhere.
cd resources || { echo "cannot cd to resources" >&2; exit 1; }
# Expand the chapter names with globs instead of parsing `ls` output. The two
# patterns are sorted together so the chapters keep the order `ls` gave them,
# which keeps the line order of the corpora (and so the later split) stable.
mapfile -t chapters < <(printf '%s\n' kuhre.ch? kuhre.ch?? | sort)
for chapter in "${chapters[@]}"; do
  # An unmatched pattern stays as literal text; skip it.
  [[ -e "$chapter" ]] || continue
  # The .plain/.norm files read below are presumably written by this script.
  ./align_kuhre.py "$chapter" > /dev/null
  cat "$chapter.plain.da" >> "../$DA_ALL"
  cat "$chapter.plain.da-bo" >> "../$DB_ALL"
  cat "$chapter.norm.da" >> "../$DA_ALL"
  cat "$chapter.plain.da-bo" >> "../$DB_ALL"
  cat "$chapter.norm.lc.da" >> "../$DA_ALL"
  # NOTE(review): this range does not cover Æ/Ø/Å - confirm that
  # .norm.lc.da is lower-cased the same way.
  tr 'A-Z' 'a-z' < "$chapter.plain.da-bo" >> "../$DB_ALL"
done
cd ..
# citations
# Align the dictionary citations and add both sides to the parallel corpora.
echo "Converting citations"
# Stop if resources/ is missing so the `cd ..` below cannot move the script
# out of its working directory.
cd resources || { echo "cannot cd to resources" >&2; exit 1; }
./align_citations.py bornholmsk_ordbog_citations.txt
# Append with >>: a plain > truncates the corpora and throws away the Kuhre
# pairs that were collected into these files just before this step.
cat citations.da >> "../$DA_ALL"
cat citations.da-bo >> "../$DB_ALL"
cd ..
# Run pairs_to_files.py on one file of sentence pairs, then append the two
# halves it is expected to leave next to the input (<file>.a and <file>.b).
#   $1 - file of sentence pairs
#   $2 - suffix (a or b) of the half appended to the Bornholmsk corpus
#   $3 - suffix (a or b) of the half appended to the Danish corpus
append_pairs() {
  local pairs_file=$1 bo_half=$2 da_half=$3
  ./pairs_to_files.py "$pairs_file"
  cat "$pairs_file.$bo_half" >> "$DB_ALL"
  cat "$pairs_file.$da_half" >> "$DA_ALL"
}

# gubbana
echo "Converting miscellanous parallel text"
append_pairs bo_da_sent_gubbana.txt a b
# misc
append_pairs bo_da_sent.txt a b
# This file has the opposite column order: Danish in .a, Bornholmsk in .b.
append_pairs da_bo_sent_bornholmskersnak.txt b a
# build bornholmsk text
# Collect monolingual Bornholmsk text into two files: DB_CLEAN gets the
# citations, the Kuhre chapters and the tokenised Otto Lund files; DB_NOISY
# is a copy of DB_CLEAN plus the Bornholmsk halves of the sentence-pair files
# and the tokenised bornholmsk-untok.txt.
echo "Building monolingual corpus:"
echo " -- citations and kuhre"
cat resources/citations.da-bo resources/kuhre*.plain.da-bo >> "$DB_CLEAN"
echo " -- otto j lund"
# Loop over the glob instead of piping `ls` into xargs: xargs splits names on
# whitespace and quotes, and GNU xargs runs the tokenizer once with no
# argument when nothing matches.
for lund_file in resources/ottolund*; do
  # An unmatched pattern stays as literal text; skip it.
  [[ -e "$lund_file" ]] || continue
  ./nltktok.py "$lund_file" >> "$DB_CLEAN"
done
echo " -- untokenized text"
./nltktok.py bornholmsk-untok.txt > bornholmsk.txt
echo " -- noisy text"
# The noisy corpus starts as a copy of the clean one and is then extended.
cp "$DB_CLEAN" "$DB_NOISY"
cat da_bo_sent_bornholmskersnak.txt.b bo_da_sent.txt.a bo_da_sent_gubbana.txt.a bornholmsk.txt >> "$DB_NOISY"
# shuffle and split parallel texts
#######################################
# Cut one shuffled corpus into test, validation and training files.
# Arguments:
#   $1 - shuffled input file
#   $2 - test output       (lines 1-500)
#   $3 - validation output (lines 501-1000)
#   $4 - training output   (line 1001 onwards)
#######################################
split_parallel() {
  local shuffled=$1 test_out=$2 val_out=$3 train_out=$4
  head -n 500 "$shuffled" > "$test_out"
  # Select by absolute position so a corpus shorter than 1000 lines cannot
  # put test lines into the validation set.
  tail -n +501 "$shuffled" | head -n 500 > "$val_out"
  # `tail -n +K` starts AT line K; +1000 would repeat the last validation
  # line in the training set, so training starts at line 1001.
  tail -n +1001 "$shuffled" > "$train_out"
}

echo "Shuffle and split parallel text"
# Both shuffles read the same rand_seed file so that the two sides get the
# same permutation; this presumably only holds when both files have the same
# number of lines, so a mismatch is reported.
db_lines=$(wc -l < "$DB_ALL")
da_lines=$(wc -l < "$DA_ALL")
if [[ "$db_lines" != "$da_lines" ]]; then
  echo "warning: $DB_ALL has $db_lines lines but $DA_ALL has $da_lines; the split will not be parallel" >&2
fi
shuf --random-source=rand_seed "$DB_ALL" > "$DB_ALL.shuf"
shuf --random-source=rand_seed "$DA_ALL" > "$DA_ALL.shuf"
split_parallel "$DB_ALL.shuf" "$DB_TEST" "$DB_VAL" "$DB_TRAIN"
split_parallel "$DA_ALL.shuf" "$DA_TEST" "$DA_VAL" "$DA_TRAIN"
# add names to training set
# Each name list is appended unchanged to both training files, so the same
# lines end up on the Bornholmsk and the Danish side.
for name_list in men women geo; do
  cat "resources/$name_list" >> "$DB_TRAIN"
  cat "resources/$name_list" >> "$DA_TRAIN"
done
# package up the parallel resources
# The pattern parallel.d* covers every parallel.da.* and
# parallel.da-bornholm.* file in this directory.
archive="parallel.da.da-bornholm.zip"
zip "$archive" parallel.d*
# Next steps (not done by this script):
#  - get a glove encoding on DB_NOISY
#  - use DB_CLEAN for normalisation