Skip to content

Commit

Permalink
search gitter messages
Browse files Browse the repository at this point in the history
  • Loading branch information
kapilt committed Dec 21, 2021
1 parent 517bd79 commit 4ccd599
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 1 deletion.
44 changes: 43 additions & 1 deletion hubhud/cli.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
import click
import logging
import time

import click
from sqlalchemy.orm import Session

from .github import sync as github_sync
from .gitter import sync as gitter_sync

try:
from .search import index as search_index
from .search import search as query_index
except ImportError:
search_index = None
query_index = None

from .schema import get_db

log = logging.getLogger("hubhud")
Expand All @@ -18,6 +27,39 @@ def cli():
)


# Command group registered on the top-level ``cli``; the ``query`` and
# ``index`` subcommands below attach to it via ``@search.command()``.
@cli.group()
def search():
    """Search for information"""


@search.command()
@click.option("-i", "--index", type=click.Path(), required=True)
@click.option("-q", "--query", required=True)
def query(index, query):
    """Query a message search index and print the matching messages."""
    # The search backend is imported optionally at module load; when that
    # import fails query_index is None, and calling it would surface as an
    # opaque "NoneType is not callable" traceback. Fail with a clear message.
    if query_index is None:
        raise click.ClickException(
            "search is unavailable: the optional search module could not be imported"
        )

    # perf_counter is monotonic, so the elapsed time cannot go negative or
    # jump if the wall clock is adjusted while the query runs.
    started = time.perf_counter()
    results = query_index(index, query)
    log.info("queried messages in %0.2f", time.perf_counter() - started)

    for r in results:
        # Stored field values come back as lists; show the first value of each.
        doc = r["doc"]
        print(
            f"score: {r['score']}"
            f" sent: {doc['sent'][0]}"
            f" author: {doc['author'][0]}"
            f" body:\n{doc['body'][0]}\n"
        )


@search.command()
@click.option("-f", "--db", envvar="HUD_DB", required=True)
@click.option("-i", "--index", type=click.Path(), required=True)
def index(db, index):
    """Index the messages stored in the database for search."""
    # The search backend is imported optionally at module load; when that
    # import fails search_index is None. Check before opening the database so
    # a missing dependency is reported clearly instead of as a TypeError.
    if search_index is None:
        raise click.ClickException(
            "search is unavailable: the optional search module could not be imported"
        )

    log.info("indexing messages for search")
    engine = get_db(db)
    # The session is only needed while the indexer reads messages.
    with Session(engine) as s:
        count = search_index(s, index)
    log.info("finished - indexed %d messages", count)


@cli.group()
def sync():
"""Sync information sources to local database"""
Expand Down
50 changes: 50 additions & 0 deletions hubhud/search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import tantivy
import sqlalchemy as rdb

from . import gitter


def get_schema():
    """Build the tantivy schema shared by indexing and searching.

    The schema has one stored date field (``sent``) followed by three stored
    text fields (``author``, ``id``, ``body``), declared in that order.
    """
    builder = tantivy.SchemaBuilder()
    builder.add_date_field("sent", stored=True)
    for text_field in ("author", "id", "body"):
        builder.add_text_field(text_field, stored=True)
    return builder.build()


def search(path, query_phrase, max_results=10):
    """Run *query_phrase* against the index at *path*.

    The phrase is parsed against the ``body`` and ``author`` fields and at
    most *max_results* hits are requested.

    Returns:
        A list of dicts, one per hit, with keys ``score``, ``addr`` and
        ``doc`` (the document loaded from the searcher for that address).
    """
    idx = tantivy.Index(get_schema(), path, reuse=True)
    searcher = idx.searcher()

    parsed = idx.parse_query(query_phrase, ["body", "author"])
    hits = searcher.search(parsed, max_results).hits

    return [
        {"score": score, "addr": addr, "doc": searcher.doc(addr)}
        for score, addr in hits
    ]


def index(session, path):
    """Add every ``gitter.Message`` row to the search index at *path*.

    Args:
        session: SQLAlchemy session used to select the messages.
        path: index location handed to ``tantivy.Index`` (opened with
            ``reuse=True``).

    Returns:
        The number of messages added to the index.
    """
    # NOTE(review): with reuse=True a second run presumably adds the same
    # messages again rather than replacing them — confirm against tantivy.
    idx = tantivy.Index(get_schema(), path, reuse=True)
    writer = idx.writer()

    statement = rdb.select(gitter.Message).order_by(rdb.desc(gitter.Message.sent))

    count = 0
    # Iterate the result directly instead of calling .all(), which builds the
    # full list of rows before the first document is indexed. scalars()
    # yields the Message entity of each row (the row's first column).
    for message in session.execute(statement).scalars():
        writer.add_document(
            tantivy.Document(
                id=[message.id],
                sent=[message.sent],
                author=[message.author],
                body=[message.text],
            )
        )
        count += 1

    # One commit after all documents have been added.
    writer.commit()
    return count

0 comments on commit 4ccd599

Please sign in to comment.