-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcomputer_segment_setup.txt
119 lines (103 loc) · 3.57 KB
/
computer_segment_setup.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Install dhSegment, VGG Image Annotator and other segmentation related programs.
# dhSegment, Res-Net based segmentator. Does classification in postprocessing.
git clone https://github.com/dhlab-epfl/dhSegment.git
cd dhSegment
vi setup.py
# change all == to >=
sudo pip3 install numpy==1.15.4
sudo pip3 install .
cd pretrained_models/
sudo python3 download_resnet_pretrained_model.py
cd ../demo
wget https://github.com/dhlab-epfl/dhSegment/releases/download/v0.2/model.zip
unzip model.zip
cd ..
sudo python3 demo.py
cd ..
sudo apt-get update
sudo apt-get install libreoffice
sudo apt-get install libmagickwand-dev imagemagick python-wand
pip install minecart --user
pip uninstall pdfminer.six
pip install pdfminer
rm -rf ~/tigrinya_ocr/scripts/*.pyc ~/tigrinya_ocr/scripts/__pycache__
vi ~/.local/lib/python3.6/site-packages/minecart/miner.py +330
pdfparser --> pdfdocument
convert -version
python -mwand.version
ldconfig -p | grep -i wand
export MAGICK_HOME='/usr/lib/x86_64-linux-gnu'
export WAND_MAGICK_LIBRARY_SUFFIX='-6.Q16'
# Loosen limits and enable ghostwriter scripts
sudo vi /etc/ImageMagick-6/policy.xml
- limits
<policy domain="resource" name="memory" value="2GiB"/>
<policy domain="resource" name="map" value="4GiB"/>
<policy domain="resource" name="width" value="128KP"/>
<policy domain="resource" name="height" value="128KP"/>
<policy domain="resource" name="area" value="1.0737GB"/>
<policy domain="resource" name="disk" value="16GiB"/>
- for all coder policies
<policy domain="coder" rights="read|write" pattern="PDF" />
# Assemble training data
mkdir train
mkdir eval
mkdir data
# download newspapers
python3 ../tigrinya_ocr/scripts/crawler.py
python3 ../tigrinya_ocr/scripts/crawler.py data/ 50
# download saved files from google drive
cd data
gdrive download --recursive <folder_id>
# generate corpus pdf (python2.7)
mkdir generated
cd ..
python ~/tigrinya_ocr/scripts/create_pdf.py ~/kraken/corpus/full_corpus.txt data/generated/corpus.pdf /dev/null
# Extract text boxes from pdfs
python ~/tigrinya_ocr/scripts/detect_boxes.py ~/tigrinya_ocr/raw_data/Tigrigna-Grammar-i-vs-e.pdf ~/segment/generated
# convert
mkdir images
mkdir logs
# remove massive files
du -ah ~/segment/data | sort -rh
python3 ../tigrinya_ocr/scripts/seg_process_pdfs.py data/ images/ &> logs/convert.log
# serve images
sudo apt-get install samba
sudo vi /etc/samba/smb.conf
sudo service smbd restart
sudo smbpasswd -a babraham
sudo chmod +rw -R /home/babraham/segment/images
# on MacOS, click "Go > Connect to Server"
smb://ip-address/segmentimagesvia
sudo apt-get install nginx
sudo chmod 755 -R /home/babraham/segment/images
sudo vi /etc/nginx/conf.d/segment.conf
server {
listen 8001; # a customed port
# download
autoindex on; # enable directory listing output
autoindex_exact_size off; # output file sizes rounded to kilobytes, megabytes, and gigabytes
autoindex_localtime on; # output local times in the directory
location / {
root /home/babraham/segment/images;
}
}
sudo nginx -s reload
sudo service nginx restart
# annotate images using VIA
mkdir labels
# transfer 1/10th of the data to eval
cd train
sh ../../tigrinya_ocr/scripts/mv_every_x.sh ../eval_1/ 10
cd ..
mkdir masks
python3 ../../tigrinya_ocr/scripts/convert_via.py
# configure train_config.json
# 6GB RAM at least needed for gpu training
https://github.com/dhlab-epfl/dhSegment/blob/master/demo/demo_config.json
# classes.txt
# Create a class color for each region type + background
python3 train.py with ../../tigrinya_ocr/segment/train_config.json
cd demo
tensorboard --logdir .
cd ..