-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathixFull5.nim
52 lines (46 loc) · 2.43 KB
/
ixFull5.nim
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Here we rely on imports, wikipa, and do top-N-by-relevance results.
when not declared(File): import std/syncio
import tables, strutils, sugar, ./reader, ./terms, ./qutil
type # Main code: Here to end of `find`
Doc = object # Abstraction of a "document"
name: string # document metadata
freq: CountTable[string] # histo of terms; Should put into uint32
Index = object # Inverted Index
docs: seq[Doc] # uint32-keyed dyn.array
inv: Table[string, HashSet[uint32]] # map term -> docs; ~50-50 inv,docs spc
proc names*(ix: auto, docs: auto, prefix=""): string =
join(collect(for d in docs: prefix & ix.docs[d].name), "\n")
let noDocs = initHashSet[uint32](0) # convenience empty set
proc initIndex*(f: File, nDigit=3): Index =
for (name, text) in f.lenPrefixedPairs(nDigit):
result.docs.setLen result.docs.len + 1
result.docs[^1].name = name
result.docs[^1].freq = initCountTable[string](1) # Abstracts can be tiny
for term in text.terms: result.docs[^1].freq.inc term
for term, cnt in result.docs[^1].freq:
result.inv.mgetOrPut(term, noDocs).incl uint32(result.docs.len - 1)
proc find*(ix: Index, query: string, any=false, top=0): seq[uint32] =
var results: seq[HashSet[uint32]]
var qterms: seq[string]
var idf: seq[float]
for term in query.terms:
qterms.add term
let docs = ix.inv.getOrDefault(term, noDocs)
idf.add if docs.len > 0: idf(ix.docs.len, docs.len) else: 0.0
results.add docs
let res = if any: union(results) else: intersect(results)
if top == 0: return collect(for d in res: d) # no ranking => Done!
var hq: HeapQueue[(float32, uint32)]
for d in res:
var s = 0.0'f32 # Relevance score
for i, t in qterms: s += ix.docs[d].freq.getOrDefault(t).float*idf[i]
hq.pushOrReplace (s, d), top
for e in hq.popAscending: result.add e[1]
result.reverse # Want results in descending order
when isMainModule:
timeIt("ingest:"): (var ix = stdin.initIndex) # ./data.sh | ./wikipa | ./this
echo ix.inv.len," unique terms in ",ix.docs.len," documents"
timeIt(" q1:"): echo names(ix, ix.find("London Beer"))
timeIt(" q2:"): echo ix.find("London Beer", any=true).len
timeIt(" q3:"): echo names(ix, ix.find("London Beer", top=9))
timeIt(" q4:"): echo names(ix, ix.find("London Beer", any=true, top=9))