-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmake_corpus.sh
80 lines (69 loc) · 2.95 KB
/
make_corpus.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/bin/sh
# Assuming the following files are already downloaded:
# ar-download/
# AraCorpus.tar.gz
# ShamelaLibrary348.7z
# watan-2004.7z
# arwiki-latest-pages-articles.xml.bz2
# Tashkeela-arabic-diacritized-text-utf8-0.3.zip
# pos_arabic.txt and neg_arabic.txt
# UNv1.0.6way.ar.txt
# Directory that holds the downloaded archives. Every intermediate file and
# per-corpus output is written to its "out" sub-directory.
DL_LOC="ar-download"
# -p keeps a re-run from failing when "out" already exists. Abort when the
# working directory cannot be created or entered: every later step uses
# relative paths (including "rm -fr"), so continuing elsewhere is unsafe.
mkdir -p "$DL_LOC/out" || exit 1
cd "$DL_LOC/out" || exit 1
# preproccp1256 [FILE...]
# Normalise Windows-1256 (CP1256) encoded text and write it to stdout.
# Input:  the named FILEs, or stdin when no argument is given.
# Steps:
#   1. Turn the Arabic comma (\241), Arabic semicolon (\272), '.', ',', ':'
#      and the letter 't' into spaces.
#      NOTE(review): 't' is kept from the previous version; it may have been
#      meant as a tab ("\t") - confirm before changing.
#   2. Delete every byte that is not a space, a line feed or one of the kept
#      bytes in the \300-\355 range. Line feeds are kept so that step 4 can
#      work line by line and words on adjacent lines are not glued together.
#   3. Collapse runs of spaces into one space.
#   4. Drop lines of 30 bytes or fewer. LC_ALL=C makes '.' match single bytes
#      and, unlike LANG, cannot be overridden by the caller's environment.
#   5. Map alef with madda/hamza (\302 \303 \305) to bare alef (\307).
# Only POSIX tr/sed syntax is used (octal escapes, "[c*]" padding, "\{0,30\}"),
# so the function also works when the script runs under a plain /bin/sh.
preproccp1256 () {
  cat -- "$@" \
    | tr '\241\272.,:t' '[ *]' \
    | tr -d '\000-\011\013-\037\041-\277\356-\377\327\334\340\342\347-\353' \
    | LC_ALL=C sed 's/  */ /g' \
    | LC_ALL=C sed '/^.\{0,30\}$/d' \
    | tr '\305\302\303' '[\307*]'
}
#preprocutf8 () { sed "s/[$(echo -ne '\u060C\u061B\.,:')]/ /g" $1 | sed "s/[^$(echo -ne '\u0621-\u064A ')\r]//g" | sed "s/ \+/ /g" | sed "/^.\{,30\}$/d" | sed "s/[$(echo -ne '\u0622\u0623\u0625')]/$(echo -ne '\u0627')/g"; }
# Parse AraCorpus
# Unpack the tarball, send every Collection-* file through preproccp1256
# into aracorpus.txt, then delete the unpacked tree.
tar -xzf ../AraCorpus.tar.gz
for part in AraCorpus/Data/Collection-*; do
  cat "$part"
done | preproccp1256 > aracorpus.txt
rm -rf AraCorpus
# Parse Tashkeela
# Unpack the archive, concatenate every regular file below texts.txt/,
# convert the UTF-8 stream to Windows-1256 (-c drops characters that have no
# mapping), normalise it with preproccp1256, then remove the extracted tree.
# The files are streamed straight from find: nothing is renamed and no "mv"
# script is generated from file names, so names containing spaces or shell
# metacharacters are handled safely and rename/gawk/bash are not required.
unzip ../Tashkeela-arabic-diacritized-text-utf8-0.3.zip
find Tashkeela-arabic-diacritized-text-utf8-0.3/texts.txt -type f -exec cat {} + \
  | iconv -c -f utf8 -t Windows-1256 \
  | preproccp1256 > tashkeela.txt
rm -fr Tashkeela-arabic-diacritized-text-utf8-0.3
# Parse ShamelaLibrary
7z x ../ShamelaLibrary348.7z
# mdbtools settings: treat the text stored in the .mdb files as CP1256 and
# emit CP1256, which is the encoding preproccp1256 works on.
export MDB_JET3_CHARSET=CP1256
export MDBICONV=CP1256
# Read paths and table names line by line so names containing spaces are not
# split. "mdb-tables -1" prints one table per line; "grep -x" keeps only
# tables whose whole name is "b" followed by digits instead of picking such
# fragments out of unrelated names. Each table is still normalised on its
# own. The loop output is written with a single ">" so a re-run replaces
# shamela.txt rather than appending to it.
find shamela/Books/ -name '*.mdb' | while IFS= read -r book; do
  mdb-tables -1 "$book" | grep -x 'b[0-9][0-9]*' | while IFS= read -r table; do
    # stdin is redirected so the command cannot consume the table list.
    mdb-export -H "$book" "$table" < /dev/null | preproccp1256
  done
done > shamela.txt
rm -fr shamela
# Parse Wikipedia
## For a faster parser, use: https://dizzylogic.com/wiki-parser [64-bit Windows App]
# NOTE(review): this downloads and runs whatever is currently on the upstream
# master branch; consider pinning a known-good revision.
# -O makes a re-run overwrite the script instead of saving WikiExtractor.py.1
# and silently running the stale copy.
wget -O WikiExtractor.py https://raw.githubusercontent.com/attardi/wikiextractor/master/WikiExtractor.py
# Decompress straight into the working directory: the archive is left in
# place, nothing is written to the download directory, and an already
# existing decompressed file there cannot make bzip2 refuse to run.
bzip2 -dc ../arwiki-latest-pages-articles.xml.bz2 > arwiki.xml
python WikiExtractor.py --processes 8 -b 50M -q arwiki.xml
# Concatenate the extractor output found under text/, convert it from UTF-8
# to Windows-1256 (-c drops unconvertible characters) and normalise it.
find text -type f -exec cat {} + \
  | iconv -c -f utf8 -t Windows-1256 \
  | preproccp1256 > wiki.txt
rm -fr text WikiExtractor.py arwiki.xml
# Parse Watan
# Extract into the scratch directory "watan" with 7z's -o switch instead of
# cd-ing into it: a failed mkdir/cd can no longer leave the script in the
# wrong directory for the remaining steps, and "find -exec" copes with file
# names that contain whitespace or quotes.
7z x -owatan ../watan-2004.7z
find watan -type f -exec cat {} + | preproccp1256 > watan.txt
rm -fr watan
# Parse UNv1
#tar xzf ../UNv1.0.ar-en.tar.gz
# Convert UNv1.0.6way.ar.txt from UTF-8 to Windows-1256 (-c skips characters
# that cannot be converted) and run the result through preproccp1256.
un_src=../UNv1.0.6way.ar.txt
iconv -c -f utf8 -t Windows-1256 "$un_src" | preproccp1256 > un.txt
#rm -fr ar-en
#Parse Arabic Tweets
# Merge every ../*_arabic.txt file into one UTF-8 scratch file, convert it to
# Windows-1256, normalise it, and remove the scratch file afterwards.
tweets_raw=tweets.unproc.utf8
cat ../*_arabic.txt > "$tweets_raw"
iconv -c -f utf8 -t Windows-1256 "$tweets_raw" | preproccp1256 > tweets.txt
rm -f "$tweets_raw"
# Final concat: join every per-corpus .txt file in this directory and turn
# each carriage return / line feed into a space, writing the result next to
# the download directory. Then return to the starting directory and report.
cat *.txt | tr '\r\n' '  ' > ../../arabic_corpus
cd ../..
printf '%s\n' "Done!" "You may delete $DL_LOC directory."