-
Notifications
You must be signed in to change notification settings - Fork 91
/
Copy pathvalidate_indicators.sh
executable file
·128 lines (115 loc) · 4.51 KB
/
validate_indicators.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/bin/bash
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script:
# 1) Creates a virtualenv at `venv_test`
# 2) Installs python package requirements.
# 3) Runs the indicator calculation logic through PySpark.
# 4) Verifies the generated output.
set -e # Make sure that we exit after the very first error.

# Set up Spark and requirements.
# Use 'command -v' (POSIX) rather than 'which' to probe PATH.
if ! command -v virtualenv >/dev/null 2>&1; then
  echo "ERROR: 'virtualenv' not found; make sure it is installed and is in PATH." >&2
  exit 1
fi
virtualenv -p python3 venv_test
source ./venv_test/bin/activate

# Making sure the pip3 is from the virtualenv.
# '|| true' keeps a non-matching grep from aborting the script under 'set -e';
# assignment is separate from 'declare' so the exit status is not masked.
has_pip3=$(command -v pip3 | grep 'venv_test' || true)
readonly has_pip3
if [[ -z "${has_pip3}" ]]; then
  echo "ERROR: could not find pip3 in 'venv_test/' ${has_pip3}!" >&2
  exit 1
fi
pip3 install -r requirements.txt

# Run unit-tests first:
python -m unittest query_lib_spark_test.SparkPatientQueryTest \
  query_lib_compatibility_tests.PatientQueryTestSpark \
  query_lib_compatibility_tests.PatientQueryTestBigQuery

# Run indicator calculation logic, writing to a unique temp CSV (GNU mktemp;
# '--tmpdir' places the file in $TMPDIR or /tmp).
TEMP_OUT=$(mktemp indicators_output_XXXXXX.csv --tmpdir)
readonly TEMP_OUT
echo "Output indicators file is: ${TEMP_OUT}"
# Setting the reporting period to a year because the synthetic data is sparse.
spark-submit indicators.py --src_dir=./test_files/parquet_big_db_r4 \
  --last_date=2010-01-01 --num_days=365 --output_csv="${TEMP_OUT}"
##########################################
# Assertion function that tests aggregates generated
# by comparing them to what is expected.
# Arguments:
#   $1 a message identifying the indicator being validated
#   $2 the expected aggregates as a string containing comma separated values
#   $3 the column index in the CSV for the indicator value
# Globals:
#   ${TEMP_OUT} (read) path of the CSV produced by indicators.py
#   ${FAILED} is set to "yes" if the actual value is different from the expected
##########################################
function validate() {
  local indicator_label=$1
  local expected=$2
  local col_index=$3
  local actual
  # Pull the requested column out of the four aggregate rows; %.3g matches the
  # precision used when writing the expected values above.
  actual=$(awk -v col_index="$col_index" -F, '
    BEGIN {value_true = 0; value_false = 0; value_none = 0; value_male_25 = 0;}
    /False,ALL-AGES_ALL-GENDERS/ {value_false=$col_index}
    /True,ALL-AGES_ALL-GENDERS/ {value_true=$col_index}
    /None,ALL-AGES_ALL-GENDERS/ {value_none=$col_index}
    /True,25-49_male/ {value_male_25=$col_index}
    END {printf("%.3g,%.3g,%.3g,%.3g", value_true, value_false, value_none, value_male_25); }' \
    "${TEMP_OUT}")
  echo "${indicator_label} : ${actual}"
  if [[ "${actual}" != "${expected}" ]]; then
    # Diagnostics go to stderr.
    echo "ERROR: ${indicator_label}" \
      "expected to be ${expected} GOT ${actual}" >&2
    FAILED="yes"
  fi
}
FAILED=""

# Expected aggregates per indicator, encoded as
# "<CSV column>|<expected True,False,None,male_25 values>|<label>".
# TODO validate the TX_NEW numbers manually by querying the DB.
checks=(
  '3|8,752,0,2|Suppressed, non-suppressed, none, male_25 numbers are'
  '4|0.0105,0.989,0,0.00263|Suppressed, non-suppressed, none, male_25 ratios are'
  '6|95,439,0,24|TX_NEW, non-TX_NEW, none, male_25 numbers are'
  '7|0.178,0.822,0,0.0449|TX_NEW, non-TX_NEW, none, male_25 ratios are'
  '9|86,448,0,29|TB_STAT, non-TB_STAT, none, male_25 numbers are'
  '10|0.161,0.839,0,0.0543|TB_STAT, non-TB_STAT, none, male_25 ratios are'
  '12|116,418,0,29|TX_CURR, non-TX_CURR, none, male_25 numbers are'
  '13|0.217,0.783,0,0.0543|TX_CURR, non-TX_CURR, none, male_25 ratios are'
  '15|43,491,0,13|TB_ART, non-TB_ART, none, male_25 numbers are'
  '16|0.0805,0.919,0,0.0243|TB_ART, non-TB_ART, none, male_25 ratios are'
  '18|57,6.35e+03,0,8|TB_PREV, non-TB_PREV, none, male_25 numbers are'
  '19|0.0089,0.991,0,0.00125|TB_PREV, non-TB_PREV, none, male_25 ratios are'
  '21|0,6.4e+03,0,0|TX_TB, non-TX_TB, none, male_25 numbers are'
  '22|0,1,0,0|TX_TB, non-TX_TB, none, male_25 ratios are'
)

# Run every check; validate() sets ${FAILED} on any mismatch.
for check in "${checks[@]}"; do
  IFS='|' read -r col expected label <<< "${check}"
  validate "${label}" "${expected}" "${col}"
done

if [[ -n "${FAILED}" ]]; then
  echo "FAILED!"
else
  echo "SUCCESS!"
fi
deactivate