ixAllInOneEasy1.nim
# The essence of the problem/solution all in a single file.
when not declared(File): import std/syncio
import sets, tables, strutils, streams, parsexml, sugar, times
iterator words(text: string): string = # Tokenizer; could go UTF8, limit..
var word = "" #..length of pure numeric, etc.
for c in text:
if c in {'A'..'Z'}: word.add char(ord(c) + 32)
elif c in {'a'..'z'}: word.add c # Add '0'..'9' maybe with len limit?
elif word.len > 0: yield word; word.setLen 0
if word.len > 0: yield word
iterator terms(text: string): string = # yield indexed words in `text`
for word in text.words: yield word
iterator xml(f: File, name: string, elements: HashSet[string], dataKinds =
{xmlCharData,xmlWhitespace,xmlCData,xmlEntity}): (string, string) =
var x: XmlParser # yield (elName, innerText) for elements
open(x, f.newFileStream, name)
x.next # get to first XML element
var t = ""
while x.kind != xmlEof:
if x.kind == xmlElementStart and (let e = x.elementName; e in elements):
x.next; t.setLen 0 # skip ElementStart; trunc yielded
while not (x.kind == xmlElementEnd and x.elementName == e):
if x.kind in dataKinds: t.add x.charData
x.next
yield (e, t)
else: x.next
x.close
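# Fold a seq of per-term posting sets into one: intersection = AND, union = OR.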
proc intersect(results: var seq[HashSet[uint32]]): HashSet[uint32] =
for i in 1..<results.len: results[0] = results[0].intersection(results[i])
results[0]
proc union(results: var seq[HashSet[uint32]]): HashSet[uint32] =
for i in 1..<results.len: results[0] = results[0].union(results[i])
results[0]
type Index = object # Inverted Index
names: seq[string] # [doc id] = name (uri, etc.)
inv: Table[string, HashSet[uint32]] # map term -> docs
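# Render the names of docs in `docs`, one per line, each prefixed with `prefix`.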
proc names*(ix: auto, docs: auto, prefix=""): string =
join(collect(for d in docs: prefix & ix.names[d]), "\n")
let noDocs = initHashSet[uint32](0) # convenience empty set
proc initIndex*(f: File, name=""): Index = # CORE LOGIC: this & `find`
  var doc, title: string; const pfx = "https://en.wikipedia.org/wiki/"
  for (e, t) in xml(f, name, ["title", "url", "abstract"].toHashSet):
    case e
    of "title": title = if t.startsWith("Wikipedia: "): t[11..^1] else: t
    of "url": doc = if t.startsWith(pfx): t[pfx.len..^1] else: t
    of "abstract":
      var freq = initCountTable[string](1) # Abstracts can be tiny
      for term in terms(title & " " & t): freq.inc term
      for term, cnt in freq:               # should skip if no terms
        result.inv.mgetOrPut(term, noDocs).incl uint32(result.names.len)
      result.names.add doc; title.setLen 0; doc.setLen 0
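# Look up each query term's posting set & AND (default) or OR (any=true) them.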
proc find*(ix: Index, query: string, any=false): HashSet[uint32] =
  var results: seq[HashSet[uint32]]
  for term in query.terms: results.add ix.inv.getOrDefault(term, noDocs)
  if results.len == 0: return noDocs   # query had no usable terms => no matches
  result = if any: union(results) else: intersect(results)
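# Run `body`, then echo `tag` and the elapsed wall-clock seconds.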
template timeIt(tag: string, body: untyped) =
let t0=epochTime(); body; echo tag," ",formatFloat(epochTime()-t0,ffDecimal,6)
timeIt("ingest:"): (var ix = stdin.initIndex) # ./data.sh | ./this
echo ix.inv.len," unique terms in ",ix.names.len," documents"
timeIt(" q1:"): echo names(ix, ix.find("London Beer"))
timeIt(" q2:"): echo ix.find("London Beer", any=true).len