This repository has been archived by the owner on Oct 31, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 556
/
Copy pathget_evaluation.sh
129 lines (112 loc) · 4.24 KB
/
get_evaluation.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
# Remote locations of the evaluation resources fetched by this script.
en_analogy='https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip'
dl_path='https://dl.fbaipublicfiles.com/arrival'
semeval_2017='http://alt.qcri.org/semeval2017/task2/data/uploads'
europarl='http://www.statmt.org/europarl/v7'

# Word-similarity files to download, keyed by language code. Each value is a
# space-separated list of file names fetched from $dl_path/<lang>/.
declare -A wordsim_lg=(
  ["en"]="EN_MC-30.txt EN_MTurk-287.txt EN_RG-65.txt EN_VERB-143.txt EN_WS-353-REL.txt EN_YP-130.txt EN_MEN-TR-3k.txt EN_MTurk-771.txt EN_RW-STANFORD.txt EN_SIMLEX-999.txt EN_WS-353-ALL.txt EN_WS-353-SIM.txt"
  ["es"]="ES_MC-30.txt ES_RG-65.txt ES_WS-353.txt"
  ["de"]="DE_GUR350.txt DE_GUR65.txt DE_SIMLEX-999.txt DE_WS-353.txt DE_ZG222.txt"
  ["fr"]="FR_RG-65.txt"
  ["it"]="IT_SIMLEX-999.txt IT_WS-353.txt"
)

# Output roots. -p keeps a re-run from failing with "File exists".
mkdir -p monolingual crosslingual
## English word analogy task
# Only questions-words.txt is needed from the archive, so it is streamed out
# with `unzip -p` and the archive is deleted afterwards.
mkdir -p monolingual/en/
# -f: fail on an HTTP error instead of saving the error page as the archive.
if curl -fLo source-archive.zip "$en_analogy"; then
  unzip -p source-archive.zip word2vec/trunk/questions-words.txt > monolingual/en/questions-words.txt
else
  echo "Failed to download $en_analogy; skipping the English analogy task" >&2
fi
rm -f source-archive.zip
## Downloading en-{} or {}-en dictionaries
# For every language in lgs, fetch the dictionary in both directions
# (en-xx and xx-en), each as three files: .txt, .0-5000.txt and .5000-6500.txt.
lgs="af ar bg bn bs ca cs da de el en es et fa fi fr he hi hr hu id it ja ko lt lv mk ms nl no pl pt ro ru sk sl sq sv ta th tl tr uk vi zh"
mkdir -p crosslingual/dictionaries/
# ${lgs} is left unquoted on purpose: it is a space-separated list.
for lg in ${lgs}; do
  for suffix in .txt .0-5000.txt .5000-6500.txt; do
    fname="en-$lg$suffix"
    # -f: fail on an HTTP error instead of saving the error page as a dictionary.
    curl -fLo "crosslingual/dictionaries/$fname" "$dl_path/dictionaries/$fname"
    # lgs contains "en"; for it both directions name the same en-en file,
    # which was just fetched, so the second download is skipped.
    if [[ "$lg" != en ]]; then
      fname="$lg-en$suffix"
      curl -fLo "crosslingual/dictionaries/$fname" "$dl_path/dictionaries/$fname"
    fi
  done
done
## Download European dictionaries
# Every ordered pair of distinct languages among de/es/fr/it/pt, each as three
# files: .txt, .0-5000.txt and .5000-6500.txt.
for src_lg in de es fr it pt; do
  for tgt_lg in de es fr it pt; do
    # A language is never paired with itself.
    if [[ "$src_lg" == "$tgt_lg" ]]; then
      continue
    fi
    for suffix in .txt .0-5000.txt .5000-6500.txt; do
      fname="$src_lg-$tgt_lg$suffix"
      # -f: fail on an HTTP error instead of saving the error page as a dictionary.
      curl -fLo "crosslingual/dictionaries/$fname" "$dl_path/dictionaries/$fname"
    done
  done
done
## Download Dinu et al. dictionaries
# Two fixed en-it files, stored next to the other dictionaries.
for fname in OPUS_en_it_europarl_train_5K.txt OPUS_en_it_europarl_test.txt; do
  echo "$fname"
  # -f: fail on an HTTP error instead of saving the error page as a dictionary.
  curl -fLo "crosslingual/dictionaries/$fname" "$dl_path/dictionaries/$fname"
done
## Monolingual wordsim tasks
# Download every file listed in wordsim_lg into monolingual/<lang>/.
for lang in "${!wordsim_lg[@]}"; do
  echo "$lang"
  # -p: monolingual/en already exists (created for the analogy task), and a
  # re-run finds all of these directories in place.
  mkdir -p "monolingual/$lang"
  # Unquoted on purpose: the value is a space-separated list of file names.
  for wsim in ${wordsim_lg[$lang]}; do
    echo "$wsim"
    # -f: fail on an HTTP error instead of saving the error page as data.
    curl -fLo "monolingual/$lang/$wsim" "$dl_path/$lang/$wsim"
  done
done
## SemEval 2017 monolingual and cross-lingual wordsim tasks
# Fetch and unpack the task archive, then paste each test data file side by
# side with its gold key file into one output file per language (pair).
# The tasks are skipped when the archive cannot be fetched or unpacked, so no
# empty result files are produced from missing inputs.
# curl -f: fail on an HTTP error instead of saving the error page as the archive.
# unzip -o: overwrite leftovers of an interrupted run instead of prompting.
if curl -fLo semeval2017-task2.zip "$semeval_2017/semeval2017-task2.zip" \
  && unzip -o semeval2017-task2.zip; then
  # 1) Task1: monolingual
  fdir='SemEval17-Task2/test/subtask1-monolingual'
  for lang in en es de fa it; do
    mkdir -p "monolingual/$lang"
    # Output files are prefixed with the upper-cased language code.
    uplang=${lang^^}
    paste "$fdir/data/$lang.test.data.txt" "$fdir/keys/$lang.test.gold.txt" > "monolingual/$lang/${uplang}_SEMEVAL17.txt"
  done
  # 2) Task2: cross-lingual
  mkdir -p crosslingual/wordsim
  fdir='SemEval17-Task2/test/subtask2-crosslingual'
  for lg_pair in de-es de-fa de-it en-de en-es en-fa en-it es-fa es-it it-fa; do
    echo "$lg_pair"
    paste "$fdir/data/$lg_pair.test.data.txt" "$fdir/keys/$lg_pair.test.gold.txt" > "crosslingual/wordsim/$lg_pair-SEMEVAL17.txt"
  done
else
  echo 'Could not download or unpack semeval2017-task2.zip; skipping the SemEval 2017 tasks' >&2
fi
# Remove the archive and the extracted tree; -f keeps this quiet when the
# download failed and there is nothing to delete.
rm -f semeval2017-task2.zip
rm -rf SemEval17-Task2/
## Europarl for sentence retrieval
# For each language pair: download the archive, tokenize both sides with the
# Moses tokenizer, shuffle the sentence pairs and write the two sides to
# crosslingual/europarl/.
# TODO: set to true to activate download of Europarl (slow)
if false; then
  mkdir -p crosslingual/europarl
  # Tokenize EUROPARL with MOSES
  echo 'Cloning Moses github repository (for tokenization scripts)...'
  git clone https://github.com/moses-smt/mosesdecoder.git
  SCRIPTS=mosesdecoder/scripts
  TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
  for lg_pair in it-en; do # es-en etc
    # -f: fail on an HTTP error instead of saving the error page as the
    # archive; a pair whose archive cannot be fetched is skipped.
    if ! curl -fLo "$lg_pair.tgz" "$europarl/$lg_pair.tgz"; then
      echo "Failed to download $europarl/$lg_pair.tgz; skipping $lg_pair" >&2
      rm -f "$lg_pair.tgz"
      continue
    fi
    tar -xvf "$lg_pair.tgz"
    rm "$lg_pair.tgz"
    # Split "xx-yy" into its two language codes.
    IFS=- read -r -a lgs <<< "$lg_pair"
    for lg in "${lgs[0]}" "${lgs[1]}"; do
      "$TOKENIZER" -threads 8 -l "$lg" -no-escape < "europarl-v7.$lg_pair.$lg" > "euro.$lg.txt"
      rm "europarl-v7.$lg_pair.$lg"
    done
    # The two sides are pasted into one tab-separated file before shuffling,
    # so each line keeps its pair; the columns are split apart again below.
    paste "euro.${lgs[0]}.txt" "euro.${lgs[1]}.txt" | shuf > euro.paste.txt
    rm "euro.${lgs[0]}.txt" "euro.${lgs[1]}.txt"
    cut -f1 euro.paste.txt > "crosslingual/europarl/europarl-v7.$lg_pair.${lgs[0]}"
    cut -f2 euro.paste.txt > "crosslingual/europarl/europarl-v7.$lg_pair.${lgs[1]}"
    rm euro.paste.txt
  done
  rm -rf mosesdecoder
fi