From 997f3668c78b81b898fc5d72bda2034c07885536 Mon Sep 17 00:00:00 2001
From: Xueguang Ma 马雪光
Date: Tue, 5 Oct 2021 14:40:18 -0400
Subject: [PATCH] remove duplicate code (#802)

remove duplicate code for dindex
---
 docs/usage-dense-indexes.md             |  34 +++---
 integrations/dense/test_create_index.py |  90 ---------------
 pyserini/dindex/__init__.py             |  21 ----
 pyserini/dindex/__main__.py             | 104 -----------------
 pyserini/dindex/_base.py                | 142 ------------------------
 pyserini/dindex/merge_indexes.py        |  46 --------
 scripts/msmarco-doc/rerank_with_maxp.py |   9 +-
 7 files changed, 25 insertions(+), 421 deletions(-)
 delete mode 100644 integrations/dense/test_create_index.py
 delete mode 100644 pyserini/dindex/__init__.py
 delete mode 100644 pyserini/dindex/__main__.py
 delete mode 100644 pyserini/dindex/_base.py
 delete mode 100644 pyserini/dindex/merge_indexes.py

diff --git a/docs/usage-dense-indexes.md b/docs/usage-dense-indexes.md
index 657faf029..1d63be04f 100644
--- a/docs/usage-dense-indexes.md
+++ b/docs/usage-dense-indexes.md
@@ -7,7 +7,7 @@ Pyserini creates dense indexes for collections in JSONL format:
 ```json
 {
   "id": "doc1",
-  "contents": "this is the contents."
+  "contents": "title\nthis is the contents."
 }
 ```

@@ -19,12 +19,13 @@ Then, you can invoke the indexer:
 Here we provide an example of indexing a collection with the DPR passage encoder:

 ```bash
-python -m pyserini.dindex --corpus integrations/resources/sample_collection_jsonl \
-                          --encoder facebook/dpr-ctx_encoder-multiset-base \
-                          --index indexes/dindex-sample-dpr-multi \
-                          --batch 64 \
-                          --device cuda:0 \
-                          --title-delimiter '\n'
+python -m pyserini.encode input --corpus integrations/resources/sample_collection_jsonl \
+                                --fields title text \ # fields in collection contents
+                          output --embeddings indexes/dindex-sample-dpr-multi \
+                                 --to-faiss \
+                          encoder --encoder facebook/dpr-ctx_encoder-multiset-base \
+                                  --fields title text \ # fields to encode
+                                  --batch 32
 ```

 Once this is done, you can use `SimpleDenseSearcher` to search the index:

@@ -43,17 +44,18 @@ for i in range(0, 10):
 If you want to speed up passage embedding generation, you can create the index in shards; e.g., the command below creates a sub-index for the first 1/4 of the collection.

 ```bash
-python -m pyserini.dindex --corpus integrations/resources/sample_collection_jsonl \
-                          --encoder facebook/dpr-ctx_encoder-multiset-base \
-                          --index indexes/dindex-sample-dpr-multi-0 \
-                          --batch 64 \
-                          --device cuda:0 \
-                          --title-delimiter '\n' \
-                          --shard-id 0 \
-                          --shard-num 4
+python -m pyserini.encode input --corpus integrations/resources/sample_collection_jsonl \
+                                --fields title text \ # fields in collection contents
+                                --shard-id 0 \
+                                --shard-num 4 \
+                          output --embeddings indexes/dindex-sample-dpr-multi-0 \
+                                 --to-faiss \
+                          encoder --encoder facebook/dpr-ctx_encoder-multiset-base \
+                                  --fields title text \ # fields to encode
+                                  --batch 32
 ```

 You can run 4 processes on 4 GPUs to speed up encoding by roughly 4 times.
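 As a rough sketch (not part of this patch), the four shard jobs could be launched in parallel along the lines below, assuming four GPUs are available; the flags are the ones used in the examples above, and the encoder's `--device` option (used elsewhere in this patch) is assumed to accept `cuda:N` to pin each process to its own GPU:

 ```bash
 # Illustrative only: one pyserini.encode process per shard, each pinned to a
 # different GPU; wait for all four to finish before merging the sub-indexes.
 for SHARD in 0 1 2 3; do
   python -m pyserini.encode \
     input   --corpus integrations/resources/sample_collection_jsonl \
             --fields title text \
             --shard-id ${SHARD} \
             --shard-num 4 \
     output  --embeddings indexes/dindex-sample-dpr-multi-${SHARD} \
             --to-faiss \
     encoder --encoder facebook/dpr-ctx_encoder-multiset-base \
             --fields title text \
             --device cuda:${SHARD} \
             --batch 32 &
 done
 wait
 ```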
 Once they are done, you can create the full index by merging the sub-indexes:

 ```bash
-python -m pyserini.dindex.merge_indexes --prefix indexes/dindex-sample-dpr-multi- --shard-num 4
+python -m pyserini.index.merge_faiss_indexes --prefix indexes/dindex-sample-dpr-multi- --shard-num 4
 ```

diff --git a/integrations/dense/test_create_index.py b/integrations/dense/test_create_index.py
deleted file mode 100644
index fcfe77768..000000000
--- a/integrations/dense/test_create_index.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# -# Pyserini: Reproducible IR research with sparse and dense representations -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Integration tests for create dense index """ - -import os -import shutil -import unittest -from pyserini.dsearch import SimpleDenseSearcher - - -class TestSearchIntegration(unittest.TestCase): - def setUp(self): - curdir = os.getcwd() - if curdir.endswith('dense'): - self.pyserini_root = '../..' - else: - self.pyserini_root = '.' - self.temp_folders = [] - - def test_create_index(self): - index_dir = f'{self.pyserini_root}/temp_index' - self.temp_folders.append(index_dir) - cmd1 = f"python -m pyserini.dindex --corpus {self.pyserini_root}/integrations/resources/sample_collection_dense \ - --encoder facebook/dpr-ctx_encoder-multiset-base \ - --index {index_dir} \ - --batch 64 \ - --device cpu \ - --title-delimiter '\n'" - status = os.system(cmd1) - self.assertEqual(status, 0) - searcher = SimpleDenseSearcher( - index_dir, - 'facebook/dpr-question_encoder-multiset-base' - ) - self.assertEqual(searcher.num_docs, 18) - - def test_create_index_shard(self): - index_dir = f'{self.pyserini_root}/temp_index-0' - self.temp_folders.append(index_dir) - cmd1 = f"python -m pyserini.dindex --corpus {self.pyserini_root}/integrations/resources/sample_collection_dense \ - --encoder facebook/dpr-ctx_encoder-multiset-base \ - --index {index_dir} \ - --batch 64 \ - --device cpu \ - --title-delimiter '\n' \ - --shard-id 0 \ - --shard-num 2 " - - index_dir = f'{self.pyserini_root}/temp_index-1' - self.temp_folders.append(index_dir) - cmd2 = f"python -m pyserini.dindex --corpus {self.pyserini_root}/integrations/resources/sample_collection_dense \ - --encoder facebook/dpr-ctx_encoder-multiset-base \ - --index {index_dir} \ - --batch 64 \ - --device cpu \ - --title-delimiter '\n' \ - --shard-id 1 \ - --shard-num 2 " - status = os.system(cmd1) - self.assertEqual(status, 0) - status = os.system(cmd2) - self.assertEqual(status, 0) - cmd3 = f"python -m pyserini.dindex.merge_indexes --prefix {self.pyserini_root}/temp_index- --shard-num 2" - status = os.system(cmd3) - self.assertEqual(status, 0) - index_dir = f'{self.pyserini_root}/temp_index-full' - self.temp_folders.append(index_dir) - searcher = SimpleDenseSearcher( - index_dir, - 'facebook/dpr-question_encoder-multiset-base' - ) - self.assertEqual(searcher.num_docs, 18) - - def tearDown(self): - for f in self.temp_folders: - shutil.rmtree(f)
diff --git a/pyserini/dindex/__init__.py b/pyserini/dindex/__init__.py
deleted file mode 100644
index 471c18d7f..000000000 --- a/pyserini/dindex/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# -# Pyserini: Reproducible IR research with sparse and dense representations -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from ._base import DocumentEncoder, DprDocumentEncoder, AnceDocumentEncoder, \ - TctColBertDocumentEncoder, AutoDocumentEncoder - -__all__ = ['DocumentEncoder', 'DprDocumentEncoder', 'AnceDocumentEncoder', 'TctColBertDocumentEncoder', - 'AutoDocumentEncoder'] \ No newline at end of file diff --git a/pyserini/dindex/__main__.py b/pyserini/dindex/__main__.py deleted file mode 100644 index 293a07ab5..000000000 --- a/pyserini/dindex/__main__.py +++ /dev/null @@ -1,104 +0,0 @@ -# -# Pyserini: Reproducible IR research with sparse and dense representations -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import argparse -import json -import os -import faiss -import numpy as np -from tqdm import tqdm -from pyserini.dindex import DprDocumentEncoder, TctColBertDocumentEncoder, AnceDocumentEncoder, AutoDocumentEncoder - - -def init_encoder(encoder, device): - encoder = encoder.lower() - if 'dpr' in encoder: - return DprDocumentEncoder(encoder, device=device) - elif 'tct_colbert' in encoder: - return TctColBertDocumentEncoder(encoder, device=device) - elif 'ance' in encoder: - return AnceDocumentEncoder(encoder, device=device) - elif 'sentence-transformers' in encoder: - return AutoDocumentEncoder(encoder, device=device, pooling='mean', l2_norm=True) - else: - return AutoDocumentEncoder(encoder, device=device) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--encoder', type=str, help='encoder name or path', required=True) - parser.add_argument('--dimension', type=int, help='dimension of passage embeddings', required=False, default=768) - parser.add_argument('--corpus', type=str, - help='directory that contains corpus files to be encoded, in jsonl format.', required=True) - parser.add_argument('--index', type=str, help='directory to store brute force index of corpus', required=True) - parser.add_argument('--batch', type=int, help='batch size', default=64) - parser.add_argument('--shard-id', type=int, help='shard-id 0-based', default=0) - parser.add_argument('--shard-num', type=int, help='number of shards', default=1) - parser.add_argument('--device', type=str, help='device cpu or cuda [cuda:0, cuda:1...]', default='cuda:0') - parser.add_argument('--title-delimiter', type=str, default=None) - args = parser.parse_args() - - model = init_encoder(args.encoder, device=args.device) - - index = faiss.IndexFlatIP(args.dimension) - - if not os.path.exists(args.index): - os.mkdir(args.index) - - ids = [] - texts = [] - titles = [] - for file in sorted(os.listdir(args.corpus)): - file = os.path.join(args.corpus, file) - if file.endswith('json') or file.endswith('jsonl'): - print(f'Loading {file}') - with open(file, 'r') as corpus: - for idx, line in enumerate(tqdm(corpus.readlines())): - info = json.loads(line) - docid = info['id'] - ids.append(docid) - text = info['contents'].strip() - if args.title_delimiter is None: - texts.append(text.lower()) - else: - if args.title_delimiter == '\\n': - args.title_delimiter = '\n' - elif args.title_delimiter == '\\t': - args.title_delimiter = '\t' - title, text = text.lower().split(args.title_delimiter) - titles.append(title) - texts.append(text) - - total_len = len(texts) - shard_size = int(total_len / args.shard_num) - start_idx = args.shard_id * shard_size - end_idx = min(start_idx + shard_size, total_len) - if args.shard_id == args.shard_num - 1: - end_idx = total_len - - with open(os.path.join(args.index, 'docid'), 'w') as id_file: - for idx in tqdm(range(start_idx, end_idx)): - id_file.write(f'{ids[idx]}\n') - - for idx in tqdm(range(start_idx, end_idx, args.batch)): - text_batch = texts[idx: min(idx + args.batch, end_idx)] - if len(titles) != 0: - title_batch = titles[idx: min(idx + args.batch, end_idx)] - embeddings = model.encode(text_batch, title_batch) - else: - embeddings = model.encode(text_batch) - index.add(np.array(embeddings)) - faiss.write_index(index, os.path.join(args.index, 'index')) diff --git a/pyserini/dindex/_base.py b/pyserini/dindex/_base.py deleted file mode 100644 index a73cf51d3..000000000 --- a/pyserini/dindex/_base.py +++ /dev/null @@ -1,142 +0,0 @@ -# -# Pyserini: Reproducible IR 
research with sparse and dense representations -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import faiss -import torch -from transformers import AutoModel, AutoTokenizer, BertModel, BertTokenizer, DPRContextEncoder, \ - DPRContextEncoderTokenizer, RobertaTokenizer - -from pyserini.dsearch import AnceEncoder - - -class DocumentEncoder: - def encode(self, texts, titles=None): - pass - - -class TctColBertDocumentEncoder(DocumentEncoder): - def __init__(self, model_name, tokenizer_name=None, device='cuda:0'): - self.device = device - self.model = BertModel.from_pretrained(model_name) - self.model.to(self.device) - self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name or model_name) - - def encode(self, texts, titles=None): - texts = ['[CLS] [D] ' + text for text in texts] - max_length = 154 # hardcode for now - inputs = self.tokenizer( - texts, - max_length=max_length, - padding="longest", - truncation=True, - add_special_tokens=False, - return_tensors='pt' - ) - inputs.to(self.device) - outputs = self.model(**inputs) - embeddings = mean_pooling(outputs["last_hidden_state"][:, 4:, :], inputs['attention_mask'][:, 4:]) - return embeddings.detach().cpu().numpy() - - -class DprDocumentEncoder(DocumentEncoder): - def __init__(self, model_name, tokenizer_name=None, device='cuda:0'): - self.device = device - self.model = DPRContextEncoder.from_pretrained(model_name) - self.model.to(self.device) - self.tokenizer = DPRContextEncoderTokenizer.from_pretrained(tokenizer_name or model_name) - - def encode(self, texts, titles=None): - max_length = 256 # hardcode for now - if titles: - inputs = self.tokenizer( - titles, - text_pair=texts, - max_length=max_length, - padding='longest', - truncation=True, - add_special_tokens=True, - return_tensors='pt' - ) - else: - inputs = self.tokenizer( - texts, - max_length=max_length, - padding='longest', - truncation=True, - add_special_tokens=True, - return_tensors='pt' - ) - inputs.to(self.device) - return self.model(inputs["input_ids"]).pooler_output.detach().cpu().numpy() - - -class AnceDocumentEncoder(DocumentEncoder): - def __init__(self, model_name, tokenizer_name=None, device='cuda:0'): - self.device = device - self.model = AnceEncoder.from_pretrained(model_name) - self.model.to(self.device) - self.tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name or model_name) - - def encode(self, texts, titles=None): - max_length = 512 # hardcode for now - inputs = self.tokenizer( - texts, - max_length=max_length, - padding='longest', - truncation=True, - add_special_tokens=True, - return_tensors='pt' - ) - inputs.to(self.device) - return self.model(inputs["input_ids"]).detach().cpu().numpy() - - -class AutoDocumentEncoder(DocumentEncoder): - def __init__(self, model_name, tokenizer_name=None, device='cuda:0', pooling='cls', l2_norm=False): - self.device = device - self.model = AutoModel.from_pretrained(model_name) - self.model.to(self.device) - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name) - self.has_model = True - 
self.pooling = pooling - self.l2_norm = l2_norm - - def encode(self, texts, titles=None): - inputs = self.tokenizer( - texts, - max_length=512, - padding='longest', - truncation=True, - add_special_tokens=True, - return_tensors='pt' - ) - inputs.to(self.device) - outputs = self.model(**inputs) - if self.pooling == "mean": - embeddings = mean_pooling(outputs[0], inputs['attention_mask']).detach().cpu().numpy() - else: - embeddings = outputs[0][:, 0, :].detach().cpu().numpy() - if self.l2_norm: - faiss.normalize_L2(embeddings) - return embeddings - - -def mean_pooling(last_hidden_state, attention_mask): - token_embeddings = last_hidden_state - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) - sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) - return sum_embeddings / sum_mask diff --git a/pyserini/dindex/merge_indexes.py b/pyserini/dindex/merge_indexes.py deleted file mode 100644 index 5662aae9f..000000000 --- a/pyserini/dindex/merge_indexes.py +++ /dev/null @@ -1,46 +0,0 @@ -# -# Pyserini: Reproducible IR research with sparse and dense representations -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import argparse - -import faiss -import os - - -parser = argparse.ArgumentParser() -parser.add_argument('--dimension', type=int, help='dimension of passage embeddings', required=False, default=768) -parser.add_argument('--prefix', type=str, help='directory to store brute force index of corpus', required=True) -parser.add_argument('--shard-num', type=int, help='number of shards', default=1) -args = parser.parse_args() - -new_index = faiss.IndexFlatIP(args.dimension) -docid_files = [] -for i in range(args.shard_num): - index = faiss.read_index(os.path.join(args.prefix + str(i), 'index')) - docid_files.append(os.path.join(args.prefix + str(i), 'docid')) - vectors = index.reconstruct_n(0, index.ntotal) - new_index.add(vectors) - -if not os.path.exists(args.prefix + 'full'): - os.mkdir(args.prefix + 'full') - -faiss.write_index(new_index, os.path.join(args.prefix + 'full', 'index')) - -with open(os.path.join(args.prefix + 'full', 'docid'), 'w') as wfd: - for f in docid_files: - with open(f, 'r') as f1: - for line in f1: - wfd.write(line) diff --git a/scripts/msmarco-doc/rerank_with_maxp.py b/scripts/msmarco-doc/rerank_with_maxp.py index 93d8b46c8..d81de5603 100644 --- a/scripts/msmarco-doc/rerank_with_maxp.py +++ b/scripts/msmarco-doc/rerank_with_maxp.py @@ -134,8 +134,13 @@ def rerank(cache, qid, query, docs, reranker): f'-generator DefaultLuceneDocumentGenerator -threads 1 ' + f'-input {collection_dir} -index {index_path}') elif reranker == 'ance': - os.system(f'python -m pyserini.dindex --corpus {collection_dir} ' + - f'--encoder castorini/ance-msmarco-doc-maxp --index {index_path} --batch 64 --device cpu') + os.system(f'python -m pyserini.encode input --corpus {collection_dir} \ + --fields text \ + output --embeddings {index_path} \ + --to-faiss \ + encoder --encoder 
castorini/ance-msmarco-doc-maxp \ + --fields text \ + --batch 64 --device cpu ') output = [] # Choose which reranker to use: