Fix Pyserini compatibility issues (#71)

castorini · Sep 8, 2020 · 96a7e8d · 96a7e8d
1 parent ae2dfc5
commit 96a7e8d
Show file tree

Hide file tree

Showing 4 changed files with 9 additions and 8 deletions.
diff --git a/docs/experiments-CovidQA.md b/docs/experiments-CovidQA.md
@@ -142,9 +142,10 @@ mrr             0.37988285486956513
 mrr@10          0.3671336788683727
 ```
 
-It takes about 17 minutes to re-rank this subset on CovidQA using a P100.
+It takes about 17 minutes to re-rank this subset on CovidQA using a P100. Note again that you might need to modify the batch size to best fit the GPU at hand (`--batch-size={BATCH_SIZE}`).
 
 If you were able to replicate these results, please submit a PR adding to the replication log!
 
 
-## Replication Log
+## Replication Log
+
diff --git a/pygaggle/data/relevance.py b/pygaggle/data/relevance.py
@@ -64,7 +64,7 @@ def unfold(entries):
 
 class MsMarcoPassageLoader:
     def __init__(self, index_path: str):
-        self.searcher = pysearch.SimpleSearcher(index_path)
+        self.searcher = SimpleSearcher(index_path)
 
     def load_passage(self, id: str) -> MsMarcoPassage:
         try:

diff --git a/pygaggle/data/segmentation.py b/pygaggle/data/segmentation.py
@@ -38,7 +38,7 @@ def segment(self, documents: List[Text], seg_size: int, stride: int) -> SegmentG
             sentences = [sent.string.strip() for sent in doc.sents]
             for i in range(0, len(sentences), stride):
                 segment_text = ' '.join(sentences[i:i + seg_size])
-                segmented_doc.append(Text(segment_text, dict(docid=document.raw["docid"])))
+                segmented_doc.append(Text(segment_text, dict(docid=document.metadata["docid"])))
                 if i + seg_size >= len(sentences):
                     end_idx += i/stride + 1
                     doc_end_indexes.append(int(end_idx))

diff --git a/pygaggle/rerank/bm25.py b/pygaggle/rerank/bm25.py
@@ -3,8 +3,8 @@
 from typing import List
 import math
 
-from pyserini.analysis.pyanalysis import get_lucene_analyzer, Analyzer
-from pyserini.index.pyutils import IndexReaderUtils
+from pyserini.analysis import get_lucene_analyzer, Analyzer
+from pyserini.index import IndexReader
 import numpy as np
 
 from .base import Reranker, Query, Text
@@ -24,7 +24,7 @@ def __init__(self,
         self.analyzer = Analyzer(get_lucene_analyzer())
         if index_path:
             self.use_corpus_estimator = True
-            self.index_utils = IndexReaderUtils(index_path)
+            self.index_utils = IndexReader(index_path)
 
     def rerank(self, query: Query, texts: List[Text]) -> List[Text]:
         query_words = self.analyzer.analyze(query.text)
@@ -45,7 +45,7 @@ def rerank(self, query: Query, texts: List[Text]) -> List[Text]:
             if self.use_corpus_estimator:
                 idfs = {w:
                         self.index_utils.compute_bm25_term_weight(
-                                text.raw['docid'], w) for w in tf}
+                                text.metadata['docid'], w) for w in tf}
             score = sum(idfs[w] * tf[w] * (self.k1 + 1) /
                         (tf[w] + self.k1 * (1 - self.b + self.b *
                                             (d_len / mean_len))) for w in tf)