#!/bin/bash
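# Optional hardening: stop on errors and unset variables so a failed step
# does not silently cascade into later steps.
set -euo pipefail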
# RAW_TEXT is the input file for Phrase Mining, where each line is a single document.
RAW_TEXT='data/input.txt'
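# Each line of RAW_TEXT is one document, e.g. (illustrative clinical notes only):
#   patient admitted with acute myocardial infarction ...
#   discharge summary notes a history of type 2 diabetes mellitus ...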
# LABELED_FILE is the name and location of the labeled data file used by our model.
LABELED_FILE='data/training_set.txt'
# MIN_FREQ is the minimum raw frequency threshold for frequent phrase mining; n-grams occurring fewer times are dropped from the candidate set of phrases.
MIN_FREQ=10
# MAX_LENGTH is the maximum length (in tokens) of a candidate phrase for frequent phrase mining.
MAX_LENGTH=6
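# Example: with MIN_FREQ=10 and MAX_LENGTH=6, an n-gram such as
# "congestive heart failure" (a hypothetical candidate) must span at most
# 6 tokens and occur at least 10 times in RAW_TEXT to enter the candidate set.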
# Set to 1 if you want to use a GPU to run our model.
CUDA=1
# PRETRAINED_WEIGHTS is the name and location of the pretrained weights for the BERT model.
PRETRAINED_WEIGHTS='model/pretrained_bert_tf/biobert_pretrain_output_disch_100000'
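# (The default appears to be a clinical BioBERT checkpoint; presumably any
# BERT-compatible weights directory can be substituted here.)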
# OUTPUT_MODEL is the name and location of the BERT model used for phrase mining.
OUTPUT_MODEL='model/clinical_bert_best_model.bin'
# OUTPUT_CONFIG is the name and location of the BERT config used for phrase mining.
OUTPUT_CONFIG='model/clinical_bert_best_model_config.bin'
# OUTPUT_VOCAB is the name and location of the BERT vocab used for phrase mining.
OUTPUT_VOCAB='model/clinical_bert_best_model_vocab.bin'
# BERT_DIMENSIONS is the hidden dimension size of the BERT model.
BERT_DIMENSIONS=768
# LR is the learning rate of the fine-tuning model.
LR=2e-5
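# (2e-5 is a common default learning rate for BERT fine-tuning.)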
# EPOCHS is the number of epochs for which to fine-tune the pretrained BERT model.
EPOCHS=4
# QUALITY_OUTPUT is the output of our model, where each line is a quality phrase.
QUALITY_OUTPUT='output/quality_phrases.txt'
# Set to 1 if you want to extract frequent n-grams.
FREQUENT_NGRAM=0
# Set to 1 if you want to fine-tune a pretrained model.
TRAIN=1

# Frequent Phrase Generation
if [ ${FREQUENT_NGRAM} -eq 1 ]; then
    python src/frequent_ngram_gen.py \
        --input_file "$RAW_TEXT" \
        --length "$MAX_LENGTH" \
        --frequency "$MIN_FREQ"
fi

# Model training / inference: collect the arguments shared by every invocation
# of src/main.py, then append the --cuda and --train flags according to the
# CUDA and TRAIN switches above (the labeled training file is only needed when training).
MAIN_ARGS=(
    --input_file "$RAW_TEXT"
    --output_file "$QUALITY_OUTPUT"
    --num_epochs "$EPOCHS"
    --pretrained_weights "$PRETRAINED_WEIGHTS"
    --output_model "$OUTPUT_MODEL"
    --output_config "$OUTPUT_CONFIG"
    --output_vocab "$OUTPUT_VOCAB"
)
if [ ${CUDA} -eq 1 ]; then
    MAIN_ARGS+=(--cuda)
fi
if [ ${TRAIN} -eq 1 ]; then
    MAIN_ARGS+=(--training_file "$LABELED_FILE" --train)
fi
python src/main.py "${MAIN_ARGS[@]}"
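
# Usage: run this script from the repository root, e.g.:
#   bash phrase_mining.sh
# Then spot-check the mined phrases (one quality phrase per line):
#   head output/quality_phrases.txt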