forked from kandorm/CLINE
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.sh
45 lines (37 loc) · 1.06 KB
/
preprocess.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/env bash
#
# preprocess.sh - prepare working directories and launch src/dataloader.py.
#
# Usage: ./preprocess.sh   (takes no arguments)
#
# All paths below are relative, so the script must be started from the
# directory that contains src/, config/ and data/.
#
# Steps:
#   1. Ensure the disk, cache and tokenize directories exist.
#   2. Run src/dataloader.py with the corpus, config, tokenizer and
#      preprocessing options defined below.

# Abort on the first failing command, on use of an unset variable, and on a
# failure in any pipeline stage, so python3 is never started when directory
# setup failed.
set -euo pipefail

# --- Directory layout -------------------------------------------------------
readonly OUTPUT_DIR=.
readonly CONFIG_DIR=./config
readonly DATA_DIR="${OUTPUT_DIR}/data"
readonly CACHE_DIR="${OUTPUT_DIR}/cache"
readonly TOKENIZE_DIR="${DATA_DIR}/tokenize"
readonly CORPUS_DIR="${DATA_DIR}/corpus"
readonly DISK_DIR="${DATA_DIR}/disk"

# --- Corpus / model selection -----------------------------------------------
readonly CORPUS_NAME=enwiki_bookcorpus
readonly MODEL_NAME=roberta

# --- Derived paths ----------------------------------------------------------
readonly CORPUS_PATH="${CORPUS_DIR}/${CORPUS_NAME}"
readonly CONFIG_PATH="${CONFIG_DIR}/en/${MODEL_NAME}-tiny-config.json"
readonly TOKENIZE_PATH="${TOKENIZE_DIR}/${CORPUS_NAME}"
readonly DATA_CACHE_PATH="${CACHE_DIR}/${CORPUS_NAME}-train.arrow"

# --- Preprocessing parameters (forwarded verbatim to the dataloader) --------
readonly PREPROCESS_BATCH_SIZE=500
readonly BLOCK_SIZE=256
readonly PREPROCESS_NUM_PROCESS=4

# Create the working directories. -p also creates missing parents (e.g.
# ./data on a fresh checkout) and is a no-op for directories that already
# exist, so no separate existence test is needed.
mkdir -p -- "${DISK_DIR}" "${CACHE_DIR}" "${TOKENIZE_DIR}"

# Build the argument list as an array so every value is passed as exactly one
# word, regardless of spaces or glob characters in a path.
dataloader_args=(
  --train_data_file "${CORPUS_PATH}"
  --cache_dir "${CACHE_DIR}"
  --config_name "${CONFIG_PATH}"
  --tokenizer_name "${TOKENIZE_PATH}"
  --block_size "${BLOCK_SIZE}"
  --lang en
  --preprocess_batch_size "${PREPROCESS_BATCH_SIZE}"
  --preprocess_cache_file "${DATA_CACHE_PATH}"
  --preprocess_num_process "${PREPROCESS_NUM_PROCESS}"
  --preprocess_output_file "${DISK_DIR}/${CORPUS_NAME}-tiny-disk"
)

# The script's exit status is the exit status of the dataloader.
python3 src/dataloader.py "${dataloader_args[@]}"