-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest-suggest.sh
executable file
·81 lines (74 loc) · 3.06 KB
/
test-suggest.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/bin/sh
# Tunables. Each may be preset in the environment; an unset or empty value
# falls back to the default assigned here.
freqs=${freqs:-/t/freqs} # corpus file whose first N lines form each test vocabulary
md=${md:-2}              # max distance handed to 'suggest update -d' {1..4}
qd=${qd:-2}              # query distance handed to 'suggest compare --dmax' {1..4}
d=${d:-1}                # number of deletes for 'suggest makeTypos -d' {1..4}
b=${b:-6}                # batch size for 'suggest makeTypos -s' {1..6}
qm=${qm:-5}              # match count for 'suggest compare -m' {1..10}
ht=${ht:-/.T}            # hugetlbfs mount point used when HUGETLB is set
# With no size arguments, print the help text and stop. The here-doc
# delimiter is unquoted, so $0 expands to the script name while the
# backslash-escaped \$names are printed literally. The bracketed settings on
# the usage line are the script's built-in defaults.
if [ $# -lt 1 ]; then cat <<- EOF
Usage:
[ freqs=/t/freqs md=2 qd=2 d=1 b=6 qm=5 ht=/.T ] $0 N1 [N2..]
This is a benchmarking harness for 'suggest' that takes as primary arguments
the size of the 'head's of a corpus file \$freqs "word<spc>freq<newline>".
(I.e. vocabulary size subsets, weighted by most frequent in smaller subsets.)
This data is used to 'suggest makeTypos' sampled from the actual frequency
distribution and then 'suggest compare' the SymSpell and linear scan algos,
reporting mean +- stddev(mean) milliseconds per word to get suggestions.
Additional environment variable controls are at the top of the script.
This script also supports timing the impact of Huge TLB Virtual Memory pages
using a Linux hugetlbfs mounted on \$ht with sufficient room for the data
files generated by \$freqs and \$md. I've measured 2-4x query speed-up @d<4
(larger than the SymSpell-vs-scan boost for md=4,qd=4 and 20000 word dicts).
To use HUGETLB=1, add '/etc/sysctl.conf:vm.nr_hugepages = 1024', and also
add '/etc/fstab:nodev \$ht hugetlbfs defaults,size=2048m 0 0', mkdir \$ht &
either reboot or 'echo 3 > /proc/sys/vm/drop_caches; sysctl -a; mount -a'.
Since hugetlbfs rounds file sizes to 2MiB, we build in /tmp (an ordinary
tmpfs for me, but any FS will do) and 'suggest cpHuge' data files to \$ht
(GNU 'cp' fails for hugetlbfs) and use 'suggest -r' to let 'suggest' know
true file sizes. \$HUGESAVE=1 also skips a final \$ht purge.
EOF
exit
fi
set -e
# Remove stale results only for the sizes this run will rebuild. A broad
# glob such as /tmp/[0-9][0-9]* would also delete unrelated /tmp entries
# whose names merely begin with two digits, and would skip single-digit
# sizes. Each argument is checked to be all digits before it is used in an
# 'rm -rf' path.
for z; do
  case "$z" in
    '' | *[!0-9]*)
      printf '%s: size argument must be a non-negative integer: %s\n' "$0" "$z" >&2
      exit 2
      ;;
  esac
  rm -rf -- "/tmp/$z"
done
# meanPmSdev: summarize a column of numbers read from stdin.
# Input:   one number per line (only the first field of each line is used).
# Output:  "<mean> +- <stddev of the mean>" on stdout, where the second
#          figure is sqrt(population variance / n); nothing for empty input.
# Everything is accumulated inside one awk process so the running sums keep
# full floating-point precision (printing them between processes would
# round them to awk's default 6 significant digits). Sums are taken relative
# to the first sample to avoid cancellation when the spread is small
# compared with the magnitude, and a tiny negative variance caused by
# rounding is clamped to 0 so the result is never NaN.
meanPmSdev() {
  awk '
    NR == 1 { first = $1 }
    {
      delta = $1 - first
      sum += delta
      ssq += delta * delta
      n++
    }
    END {
      if (n == 0) exit
      shifted = sum / n
      var = (ssq / n - shifted * shifted) / n
      if (var < 0) var = 0
      print first + shifted, "+-", sqrt(var)
    }'
}
# Benchmark each vocabulary size given on the command line. For a size z the
# work happens in /tmp/z: take the first z lines of $freqs, build the
# 'suggest' data files and a typo set, run 'suggest compare', then report
# mean +- stddev(mean) for columns 2 (scan) and 4 (query) of its output.
for z; do
  dir="/tmp/$z"
  mkdir -p "$dir/typos"
  # The subshell keeps the directory change local to this iteration.
  (cd "$dir"
  head -n "$z" "$freqs" > freqs
  # Index build and typo generation only read 'freqs', so run them in
  # parallel. Each PID is waited on individually because a bare 'wait'
  # returns 0 regardless of how the jobs ended, which would let a failed
  # build go unnoticed.
  suggest update -pp -d "$md" -i freqs &
  updatePid=$!
  suggest makeTypos -d "$d" -s "$b" -p freqs -o typos/ -n2001 &
  typosPid=$!
  wait "$updatePid" || exit
  wait "$typosPid" || exit
  if [ "x$HUGETLB" = x ]; then
    # Ordinary run: compare against the data files in the current directory.
    suggest compare -pp --dmax="$qd" -m "$qm" -d typos > both
  else
    # Huge-page run: start from a clean $ht, copy the data files there, and
    # pass '-r p' so 'suggest' can get true sizes from the local copies.
    rm -f "$ht/p.corp" "$ht/p.keys" "$ht/p.meta" "$ht/p.sugg" "$ht/p.tabl"
    for f in p.corp p.keys p.meta p.sugg p.tabl; do
      suggest cpHuge "$f" "$ht/$f" #GNU cp fails for hugetlbfs!
    done
    suggest compare -p "$ht/p" -r p --dmax="$qd" -m "$qm" -d typos > both
    # Leave the files on $ht only when HUGESAVE is set.
    if [ "x$HUGESAVE" = x ]; then
      rm -f "$ht/p.corp" "$ht/p.keys" "$ht/p.meta" "$ht/p.sugg" "$ht/p.tabl"
    fi
  fi
  awk '{print $2}' < both > Via-scan
  awk '{print $4}' < both > Via-qry
  printf "sz %d scan: %s\n" "$z" "$(meanPmSdev < Via-scan)"
  printf "sz %d qry: %s\n" "$z" "$(meanPmSdev < Via-qry)"
  # Optional plot of the raw timings, kept disabled:
  # gnuplot <<-EOF
  # set term png small size 1080,1080
  # set output "cdfs.png"
  # set style data points
  # set title 'Vocab Size $z'
  # set xlab 'timeRank'
  # set ylab 'ms'
  # plot 'Via-scan' u 0:1, 'Via-qry' u 0:1
  #EOF
  )
done