-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMakefile
executable file
·170 lines (132 loc) · 11.5 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# Recipes below use bash-only syntax (brace expansion, $'\t', [ a == b ]).
SHELL := /bin/bash

############################################################################################
# SYSTEM EVALUATION
############################################################################################

# Directory holding the scorer scripts and its tagset.
SCORER_DIR ?= CLEF-HIPE-2020-scorer

# System responses: as submitted, plus the two normalized variants derived from them.
SYS_DIR ?= systems
SUB_DIR ?= $(SYS_DIR)/submitted
SUB_TIMENORM_DIR ?= $(SYS_DIR)/time-normalized
SUB_HISTONORM_DIR ?= $(SYS_DIR)/histo-normalized

# Evaluation outputs: per-system results, scorer logs and rankings.
EVAL_DIR ?= evaluation-results
RES_DIR ?= $(EVAL_DIR)/system-evaluations
EVAL_LOGS_DIR ?= $(EVAL_DIR)/system-evaluation-logs
RANK_DIR ?= $(EVAL_DIR)/system-rankings

# Mapping used for the historical NEL normalization. The file has to be copied
# by hand from the spreadsheet below (remove the spaces from its file name):
# https://docs.google.com/spreadsheets/d/1s2BpIeiqsJIjHLsIecshrHG9UFkMBW1Nfr-h68AB2pY/edit#gid=1606475364
FILE_NEL_MAPPING ?= $(SCORER_DIR)/historico-fuzzy-QID-mapping-ID-mapping.tsv

# Gold-standard test files per language.
# RELEASE_DIR and VERSION are not set in this part of the file; they are
# presumably supplied by the caller (command line, environment or an including
# makefile) -- TODO confirm.
DE-GOLD-FILE ?= $(RELEASE_DIR)/$(VERSION)/de/HIPE-data-$(VERSION)-test-de.tsv
FR-GOLD-FILE ?= $(RELEASE_DIR)/$(VERSION)/fr/HIPE-data-$(VERSION)-test-fr.tsv
EN-GOLD-FILE ?= $(RELEASE_DIR)/$(VERSION)/en/HIPE-data-$(VERSION)-test-en.tsv

# Buckets handed to the scorer via --noise-level.
EVAL_NOISE_LEVEL ?= 0.0-0.0,0.001-0.1,0.1-0.3,0.3-1.1

# Buckets handed to the scorer via --time-period, one list per language.
EVAL_TIME_PERIOD_FR ?= 1790-1810,1810-1830,1830-1850,1850-1870,1870-1890,1890-1910,1910-1930,1930-1950,1950-1970,1970-1990,1990-2010,2010-2030
EVAL_TIME_PERIOD_DE ?= 1790-1810,1810-1830,1830-1850,1850-1870,1870-1890,1890-1910,1910-1930,1930-1950,1950-1970
EVAL_TIME_PERIOD_EN ?= 1790-1900,1900-1970
# Create every directory the evaluation pipeline reads from or writes to.
# Declared phony so that a stray file called "prepare-eval" cannot mask it.
.PHONY: prepare-eval
prepare-eval:
	mkdir -p $(SUB_DIR) $(SUB_TIMENORM_DIR) $(SUB_HISTONORM_DIR) \
		$(RES_DIR) $(EVAL_LOGS_DIR) $(RANK_DIR)
# Submitted system responses are named TEAMNAME_TASKBUNDLEID_LANG_RUNNUMBER.tsv.
# Only the files of the bundle given by BUNDLE are selected; with BUNDLE unset
# the glob degenerates to *bundle*.tsv and matches every bundle.
submission-files := $(wildcard $(SUB_DIR)/*bundle$(BUNDLE)*.tsv)

# Normalized versions of the system responses.
# versions: time, historical+time
# patsubst rewrites only the leading directory; a plain subst would also
# rewrite the directory string if it happened to occur again inside a file name.
submission-timenorm-files := $(patsubst $(SUB_DIR)/%,$(SUB_TIMENORM_DIR)/%,$(submission-files))
submission-histonorm-files := $(patsubst $(SUB_DIR)/%,$(SUB_HISTONORM_DIR)/%,$(submission-files))

# Gold-standard test files and their historically normalized counterparts.
# version: historical
# "??" matches exactly the two-letter language code, so the generated
# *_histonorm.tsv files are never picked up as gold files themselves.
gold-files := $(wildcard $(RELEASE_DIR)/$(VERSION)/*/*test-??.tsv)
gold-histonorm-files := $(gold-files:.tsv=_histonorm.tsv)

# Result files expected in $(RES_DIR): one per time-normalized response, and
# one with a _relaxed suffix per histo-normalized response.
result-timenorm-files := $(patsubst $(SUB_TIMENORM_DIR)/%,$(RES_DIR)/%,$(submission-timenorm-files))
result-histonorm-files := $(patsubst $(SUB_HISTONORM_DIR)/%.tsv,$(RES_DIR)/%_relaxed.tsv,$(submission-histonorm-files))
# Normalize and score all submissions of the bundle selected with BUNDLE=<n>.
# Aggregate target without a recipe; it names no file, hence phony.
.PHONY: eval-system-bundle
eval-system-bundle: $(submission-timenorm-files) $(result-timenorm-files) $(submission-histonorm-files) $(result-histonorm-files)
# Complete evaluation run: clean previous outputs, train/predict the baselines,
# score all bundles, build the rankings and finally the plots.
#
# Each phase is its own sub-make so the phases run in exactly this order even
# when make is invoked with -j; a flat prerequisite list only implies an order
# for a serial build. The historically normalized gold files are not built
# here: eval-clean removes them, and the relaxed evaluation rule lists them as
# prerequisites, so they are rebuilt when eval-system needs them.
# rankings-summary is not part of this run; invoke it separately if needed.
.PHONY: eval-full
eval-full:
	$(MAKE) prepare-eval
	$(MAKE) eval-clean
	$(MAKE) baseline-nerc
	$(MAKE) eval-system
	$(MAKE) ranking-de ranking-fr ranking-en ranking-fine-de ranking-fine-fr
	$(MAKE) plots-paper
# Evaluate the submissions of all five bundles, one sub-make per bundle.
# A separate make invocation is required for each bundle because the list of
# submission files and the bundle-specific recipe branches are fixed from
# BUNDLE when the makefile is parsed.
.PHONY: eval-system
eval-system: prepare-eval
	$(MAKE) eval-system-bundle BUNDLE=1
	$(MAKE) eval-system-bundle BUNDLE=2
	$(MAKE) eval-system-bundle BUNDLE=3
	$(MAKE) eval-system-bundle BUNDLE=4
	$(MAKE) eval-system-bundle BUNDLE=5
# Normalize NEL for historical entities in the gold standard.
# Builds X_histonorm.tsv next to X.tsv using the mapping in $(FILE_NEL_MAPPING).
%_histonorm.tsv: %.tsv
	python $(SCORER_DIR)/normalize_linking.py -i $< -o $@ --norm-histo -m $(FILE_NEL_MAPPING)
# Normalize NEL for time mentions in a system response.
# The output directory is created on demand so the rule also works when
# prepare-eval has not been run beforehand.
$(SUB_TIMENORM_DIR)/%.tsv: $(SUB_DIR)/%.tsv
	@mkdir -p $(@D)
	python $(SCORER_DIR)/normalize_linking.py -i $< -o $@ --norm-time
# Normalize NEL for historical entities and time mentions in a system response,
# using the mapping in $(FILE_NEL_MAPPING).
# The output directory is created on demand so the rule also works when
# prepare-eval has not been run beforehand.
$(SUB_HISTONORM_DIR)/%.tsv: $(SUB_DIR)/%.tsv
	@mkdir -p $(@D)
	python $(SCORER_DIR)/normalize_linking.py -i $< -o $@ --norm-time --norm-histo -m $(FILE_NEL_MAPPING)
# Train baseline systems and predict bundle 4 for German, French and English.
.PHONY: baseline-nerc
baseline-nerc: $(SUB_DIR)/baseline_bundle4_de_1.tsv $(SUB_DIR)/baseline_bundle4_fr_1.tsv $(SUB_DIR)/baseline_bundle4_en_1.tsv

# Generic baseline rule; the stem $* is the language code and selects the
# train and test files of that language.
# NOTE(review): the prerequisites are the German and French gold files for
# every language, and the training file is not listed at all -- confirm that
# this is the intended rebuild trigger.
$(SUB_DIR)/baseline_bundle4_%_1.tsv: $(DE-GOLD-FILE) $(FR-GOLD-FILE)
	@mkdir -p $(@D)
	python $(SCORER_DIR)/baseline.py --train $(RELEASE_DIR)/$(VERSION)/$*/HIPE-data-$(VERSION)-train-$*.tsv \
		--dev $(RELEASE_DIR)/$(VERSION)/$*/HIPE-data-$(VERSION)-test-$*.tsv \
		--pred $@ --cols NE-COARSE-LIT,NE-COARSE-METO
# English baseline: trained on the development set instead of a training set.
# This explicit rule takes precedence over the generic baseline pattern rule.
$(SUB_DIR)/baseline_bundle4_en_1.tsv: $(EN-GOLD-FILE)
	@mkdir -p $(@D)
	python $(SCORER_DIR)/baseline.py --train $(RELEASE_DIR)/$(VERSION)/en/HIPE-data-$(VERSION)-dev-en.tsv \
		--dev $(RELEASE_DIR)/$(VERSION)/en/HIPE-data-$(VERSION)-test-en.tsv \
		--pred $@ --cols NE-COARSE-LIT,NE-COARSE-METO
# Standard evaluation of one time-normalized system response.
#
# Tasks scored per bundle (the branch is chosen from BUNDLE when the makefile
# is parsed, so BUNDLE must be given on the make command line):
#   1: nerc_coarse, nerc_fine, nel
#   2: nerc_coarse, nel
#   3: nerc_coarse, nerc_fine
#   4: nerc_coarse
#   5: nel
# For any other value of BUNDLE no scorer command is run.
#
# Runtime hack: LANG_ABBR and PERIOD are assigned with $(eval) while the recipe
# is expanded, before the scorer lines that use them. LANG_ABBR is the
# two-letter code between underscores in the prerequisite path that is followed
# by "_" and a digit 0-3 (grep -P is a GNU grep extension). PERIOD is the list
# of time buckets for that language; anything other than en/de gets the French
# list. The shell comparison quotes $(LANG_ABBR) so that an empty value is a
# plain mismatch rather than a "[" syntax error.

# $(call clef-eval,TASK): scorer command for one task. Recursively expanded on
# purpose, so $<, $@, LANG_ABBR and PERIOD are resolved inside the recipe.
clef-eval = python $(SCORER_DIR)/clef_evaluation.py --ref $(RELEASE_DIR)/$(VERSION)/$(LANG_ABBR)/HIPE-data-$(VERSION)-test-$(LANG_ABBR).tsv --pred $< --task $(1) --outdir $(RES_DIR) --log $(EVAL_LOGS_DIR)/$(patsubst %.tsv,%.$(1).log,$(@F)) --tagset $(SCORER_DIR)/tagset.txt --noise-level $(EVAL_NOISE_LEVEL) --time-period $(PERIOD)

$(RES_DIR)/%.tsv: $(SUB_TIMENORM_DIR)/%.tsv $(gold-files)
	@mkdir -p $(RES_DIR) $(EVAL_LOGS_DIR)
	@$(eval LANG_ABBR=$(shell echo $< | grep -Po '(?<=_)[a-z]{2}(?=_[0-3])'))
	@$(eval PERIOD=$(shell if [ "en" == "$(LANG_ABBR)" ]; then echo $(EVAL_TIME_PERIOD_EN); elif [ "de" == "$(LANG_ABBR)" ]; then echo $(EVAL_TIME_PERIOD_DE); else echo $(EVAL_TIME_PERIOD_FR); fi))
ifeq ($(BUNDLE),1)
	$(call clef-eval,nerc_coarse)
	$(call clef-eval,nerc_fine)
	$(call clef-eval,nel)
else ifeq ($(BUNDLE),2)
	$(call clef-eval,nerc_coarse)
	$(call clef-eval,nel)
else ifeq ($(BUNDLE),3)
	$(call clef-eval,nerc_coarse)
	$(call clef-eval,nerc_fine)
else ifeq ($(BUNDLE),4)
	$(call clef-eval,nerc_coarse)
else ifeq ($(BUNDLE),5)
	$(call clef-eval,nel)
endif
# NEL evaluation on historically normalized system and gold datasets.
#
# Only bundles 1, 2 and 5 are scored here; for any other BUNDLE value the
# recipe runs no scorer command. The check is made when the makefile is
# parsed, so BUNDLE must be given on the make command line.
#
# Runtime hack: LANG_ABBR and PERIOD are assigned with $(eval) while the recipe
# is expanded, before the scorer line that uses them. LANG_ABBR is the
# two-letter code between underscores in the prerequisite path that is followed
# by "_" and a digit 0-3 (grep -P is a GNU grep extension). PERIOD is the list
# of time buckets for that language; anything other than en/de gets the French
# list. The shell comparison quotes $(LANG_ABBR) so that an empty value is a
# plain mismatch rather than a "[" syntax error.

# Scorer command for the relaxed NEL setting. Recursively expanded on purpose,
# so $<, $@, LANG_ABBR and PERIOD are resolved inside the recipe.
clef-eval-relaxed = python $(SCORER_DIR)/clef_evaluation.py --ref $(RELEASE_DIR)/$(VERSION)/$(LANG_ABBR)/HIPE-data-$(VERSION)-test-$(LANG_ABBR)_histonorm.tsv --pred $< --task nel --outdir $(RES_DIR) --log $(EVAL_LOGS_DIR)/$(@F:.tsv=.nel.log) --suffix relaxed --n_best 1,3,5 --tagset $(SCORER_DIR)/tagset.txt --noise-level $(EVAL_NOISE_LEVEL) --time-period $(PERIOD)

$(RES_DIR)/%_relaxed.tsv: $(SUB_HISTONORM_DIR)/%.tsv $(gold-histonorm-files)
	@mkdir -p $(RES_DIR) $(EVAL_LOGS_DIR)
	@$(eval LANG_ABBR=$(shell echo $< | grep -Po '(?<=_)[a-z]{2}(?=_[0-3])'))
	@$(eval PERIOD=$(shell if [ "en" == "$(LANG_ABBR)" ]; then echo $(EVAL_TIME_PERIOD_EN); elif [ "de" == "$(LANG_ABBR)" ]; then echo $(EVAL_TIME_PERIOD_DE); else echo $(EVAL_TIME_PERIOD_FR); fi))
ifneq ($(filter 1 2 5,$(BUNDLE)),)
	$(clef-eval-relaxed)
endif
# Produce rankings per language and task. The pattern stem $* is the language
# code (e.g. de, fr, en). Rows are sorted on column 2, then on column 6 in
# reverse order (the F1-score of micro fuzzy across labels, per the original
# note), and the header line of a result file is put on top.
# Bundle 5 goes into separate tables as those systems have gold annotations.
# English has no fine annotation, unlike German and French.
#
# The scratch header file is named after the target ($@.header.tmp), so that
# several rankings can be built at the same time (make -j) without overwriting
# each other's header.
#
# ranking-fine-% is defined before ranking-% on purpose: GNU Make older than
# 3.82 uses the first matching pattern rule, and "ranking-fine-de" also matches
# ranking-% (with stem "fine-de"). Newer versions prefer the shortest stem and
# pick ranking-fine-% either way.
#
# NOTE(review): result-files, result-norm-files, gold-norm-files and
# normalized-files are not defined in this part of the file; unless they are
# set elsewhere they expand to nothing and add no prerequisites -- confirm.

# Fine-grained and nested NERC rankings.
ranking-fine-%: $(result-files) $(result-norm-files) $(gold-norm-files) $(normalized-files)
	@mkdir -p $(RANK_DIR)
	head -q -n1 $(RES_DIR)/*_$*_*.tsv | head -1 > $@.header.tmp
	grep -Phs '(NE-FINE|NE-NESTED).*micro-fuzzy.*ALL' $(RES_DIR)/*_$*_*.tsv | sort -t$$'\t' -k2,2 -k6,6r | (cat $@.header.tmp && cat) > $(RANK_DIR)/ranking-$*-fine-micro-fuzzy-all.tsv
	grep -Phs '(NE-FINE|NE-NESTED).*micro-strict.*ALL' $(RES_DIR)/*_$*_*.tsv | sort -t$$'\t' -k2,2 -k6,6r | (cat $@.header.tmp && cat) > $(RANK_DIR)/ranking-$*-fine-micro-strict-all.tsv
	rm -f $@.header.tmp

# Coarse NERC rankings plus NEL rankings (strict and relaxed; bundles 1-4 and
# bundle 5 in separate tables).
ranking-%: $(result-files) $(result-norm-files) $(gold-norm-files) $(normalized-files)
	@mkdir -p $(RANK_DIR)
	head -q -n1 $(RES_DIR)/*_$*_*.tsv | head -1 > $@.header.tmp
	grep -hs 'NE-COARSE.*micro-fuzzy.*ALL' $(RES_DIR)/*_$*_*.tsv | sort -t$$'\t' -k2,2 -k6,6r | (cat $@.header.tmp && cat) > $(RANK_DIR)/ranking-$*-coarse-micro-fuzzy-all.tsv
	grep -hs 'NE-COARSE.*micro-strict.*ALL' $(RES_DIR)/*_$*_*.tsv | sort -t$$'\t' -k2,2 -k6,6r | (cat $@.header.tmp && cat) > $(RANK_DIR)/ranking-$*-coarse-micro-strict-all.tsv
	grep -hs 'NEL.*micro-fuzzy' $(RES_DIR)/*_bundle{1..4}_$*_*.tsv | grep -v 'relaxed' | sort -t$$'\t' -k2,2 -k6,6r | (cat $@.header.tmp && cat) > $(RANK_DIR)/ranking-$*-nel-micro-fuzzy.tsv
	grep -hs 'NEL.*micro-fuzzy' $(RES_DIR)/*_bundle5_$*_*.tsv | grep -v 'relaxed' | sort -t$$'\t' -k2,2 -k6,6r | (cat $@.header.tmp && cat) > $(RANK_DIR)/ranking-$*-nel-only-micro-fuzzy.tsv
	grep -hs 'NEL.*micro-fuzzy' $(RES_DIR)/*_bundle{1..4}_$*_*_relaxed.tsv | sort -t$$'\t' -k2,2 -k6,6r | (cat $@.header.tmp && cat) > $(RANK_DIR)/ranking-$*-nel-micro-fuzzy-relaxed.tsv
	grep -hs 'NEL.*micro-fuzzy' $(RES_DIR)/*_bundle5_$*_*_relaxed.tsv | sort -t$$'\t' -k2,2 -k6,6r | (cat $@.header.tmp && cat) > $(RANK_DIR)/ranking-$*-nel-only-micro-fuzzy-relaxed.tsv
	rm -f $@.header.tmp
# Produce the plots of the system performance on noisy and diachronic data
# that are also shown in the paper. Reads the rankings in $(RANK_DIR) and
# writes to $(EVAL_DIR)/robustness; the log goes to eval_robustness.log in the
# current directory.
.PHONY: plots-paper
plots-paper: ranking-de ranking-fr ranking-en
	python lib/eval_robustness.py --input-dir $(RANK_DIR) --output-dir $(EVAL_DIR)/robustness --log-file eval_robustness.log
# Remove everything the evaluation generates: rankings, normalized system
# responses, result tables, scorer logs and the normalized gold files.
# The submitted system responses themselves are left alone.
.PHONY: eval-clean
eval-clean:
	@# Refuse to run if a directory variable is empty: "rm -f $(DIR)/*" would
	@# otherwise turn into "rm -f /*".
	@: $(foreach v,RANK_DIR SUB_TIMENORM_DIR SUB_HISTONORM_DIR RES_DIR EVAL_LOGS_DIR,$(if $(strip $($(v))),,$(error $(v) is empty; refusing to clean)))
	rm -f $(RANK_DIR)/*
	rm -f $(SUB_TIMENORM_DIR)/*
	rm -f $(SUB_HISTONORM_DIR)/*
	rm -f $(RES_DIR)/*tsv
	rm -f $(EVAL_LOGS_DIR)/*
	rm -f $(gold-histonorm-files)
# Summarize the rankings: the formatter reads from and writes to $(RANK_DIR).
# rankings-summary is an alias for generate-rankings-summary; neither names a
# file, so both are phony.
.PHONY: rankings-summary generate-rankings-summary
rankings-summary: generate-rankings-summary

generate-rankings-summary:
	python lib/format_rankings_summary.py --input-dir=$(RANK_DIR) --output-dir=$(RANK_DIR)