From 997f3668c78b81b898fc5d72bda2034c07885536 Mon Sep 17 00:00:00 2001
From: Xueguang Ma 马雪光
Date: Tue, 5 Oct 2021 14:40:18 -0400
Subject: [PATCH] remove duplicate code (#802)

remove duplicate code for dindex
---
 docs/usage-dense-indexes.md             |  34 +++---
 integrations/dense/test_create_index.py |  90 ---------------
 pyserini/dindex/__init__.py             |  21 ----
 pyserini/dindex/__main__.py             | 104 -----------------
 pyserini/dindex/_base.py                | 142 ------------------------
 pyserini/dindex/merge_indexes.py        |  46 --------
 scripts/msmarco-doc/rerank_with_maxp.py |   9 +-
 7 files changed, 25 insertions(+), 421 deletions(-)
 delete mode 100644 integrations/dense/test_create_index.py
 delete mode 100644 pyserini/dindex/__init__.py
 delete mode 100644 pyserini/dindex/__main__.py
 delete mode 100644 pyserini/dindex/_base.py
 delete mode 100644 pyserini/dindex/merge_indexes.py

diff --git a/docs/usage-dense-indexes.md b/docs/usage-dense-indexes.md
index 657faf029..1d63be04f 100644
--- a/docs/usage-dense-indexes.md
+++ b/docs/usage-dense-indexes.md
@@ -7,7 +7,7 @@ Pyserini creates dense indexes for collections in JSONL format:
 ```json
 {
   "id": "doc1",
-  "contents": "this is the contents."
+  "contents": "title\nthis is the contents."
 }
 ```

@@ -19,12 +19,13 @@ Then, you can invoke the indexer:
 Here we provide an example of indexing a collection with the DPR passage encoder:

 ```bash
-python -m pyserini.dindex --corpus integrations/resources/sample_collection_jsonl \
-                          --encoder facebook/dpr-ctx_encoder-multiset-base \
-                          --index indexes/dindex-sample-dpr-multi \
-                          --batch 64 \
-                          --device cuda:0 \
-                          --title-delimiter '\n'
+python -m pyserini.encode input --corpus integrations/resources/sample_collection_jsonl \
+                                --fields title text \ # fields in collection contents
+                          output --embeddings indexes/dindex-sample-dpr-multi \
+                                 --to-faiss \
+                          encoder --encoder facebook/dpr-ctx_encoder-multiset-base \
+                                  --fields title text \ # fields to encode
+                                  --batch 32
 ```

 Once this is done, you can use `SimpleDenseSearcher` to search the index:

@@ -43,17 +44,18 @@ for i in range(0, 10):
 If you want to speed up passage embedding generation, you can create the index in shards; e.g., the command below creates a sub-index for the first 1/4 of the collection.

 ```bash
-python -m pyserini.dindex --corpus integrations/resources/sample_collection_jsonl \
-                          --encoder facebook/dpr-ctx_encoder-multiset-base \
-                          --index indexes/dindex-sample-dpr-multi-0 \
-                          --batch 64 \
-                          --device cuda:0 \
-                          --title-delimiter '\n' \
-                          --shard-id 0 \
-                          --shard-num 4
+python -m pyserini.encode input --corpus integrations/resources/sample_collection_jsonl \
+                                --fields title text \ # fields in collection contents
+                                --shard-id 0 \
+                                --shard-num 4 \
+                          output --embeddings indexes/dindex-sample-dpr-multi-0 \
+                                 --to-faiss \
+                          encoder --encoder facebook/dpr-ctx_encoder-multiset-base \
+                                  --fields title text \ # fields to encode
+                                  --batch 32
 ```

 You can run 4 processes on 4 GPUs to speed up encoding by roughly 4 times.
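 As a rough sketch (not part of this patch), the four shard jobs could be launched in parallel along the lines below, assuming four GPUs are available; the flags are the ones used in the examples above, and the encoder's `--device` option (used elsewhere in this patch) is assumed to accept `cuda:N` to pin each process to its own GPU:

 ```bash
 # Illustrative only: one pyserini.encode process per shard, each pinned to a
 # different GPU; wait for all four to finish before merging the sub-indexes.
 for SHARD in 0 1 2 3; do
   python -m pyserini.encode \
     input   --corpus integrations/resources/sample_collection_jsonl \
             --fields title text \
             --shard-id ${SHARD} \
             --shard-num 4 \
     output  --embeddings indexes/dindex-sample-dpr-multi-${SHARD} \
             --to-faiss \
     encoder --encoder facebook/dpr-ctx_encoder-multiset-base \
             --fields title text \
             --device cuda:${SHARD} \
             --batch 32 &
 done
 wait
 ```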
 Once they are done, you can create the full index by merging the sub-indexes:

 ```bash
-python -m pyserini.dindex.merge_indexes --prefix indexes/dindex-sample-dpr-multi- --shard-num 4
+python -m pyserini.index.merge_faiss_indexes --prefix indexes/dindex-sample-dpr-multi- --shard-num 4
 ```

diff --git a/integrations/dense/test_create_index.py b/integrations/dense/test_create_index.py
deleted file mode 100644
index fcfe77768..000000000
--- a/integrations/dense/test_create_index.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# -# Pyserini: Reproducible IR research with sparse and dense representations -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Integration tests for create dense index """ - -import os -import shutil -import unittest -from pyserini.dsearch import SimpleDenseSearcher - - -class TestSearchIntegration(unittest.TestCase): - def setUp(self): - curdir = os.getcwd() - if curdir.endswith('dense'): - self.pyserini_root = '../..' - else: - self.pyserini_root = '.' - self.temp_folders = [] - - def test_create_index(self): - index_dir = f'{self.pyserini_root}/temp_index' - self.temp_folders.append(index_dir) - cmd1 = f"python -m pyserini.dindex --corpus {self.pyserini_root}/integrations/resources/sample_collection_dense \ - --encoder facebook/dpr-ctx_encoder-multiset-base \ - --index {index_dir} \ - --batch 64 \ - --device cpu \ - --title-delimiter '\n'" - status = os.system(cmd1) - self.assertEqual(status, 0) - searcher = SimpleDenseSearcher( - index_dir, - 'facebook/dpr-question_encoder-multiset-base' - ) - self.assertEqual(searcher.num_docs, 18) - - def test_create_index_shard(self): - index_dir = f'{self.pyserini_root}/temp_index-0' - self.temp_folders.append(index_dir) - cmd1 = f"python -m pyserini.dindex --corpus {self.pyserini_root}/integrations/resources/sample_collection_dense \ - --encoder facebook/dpr-ctx_encoder-multiset-base \ - --index {index_dir} \ - --batch 64 \ - --device cpu \ - --title-delimiter '\n' \ - --shard-id 0 \ - --shard-num 2 " - - index_dir = f'{self.pyserini_root}/temp_index-1' - self.temp_folders.append(index_dir) - cmd2 = f"python -m pyserini.dindex --corpus {self.pyserini_root}/integrations/resources/sample_collection_dense \ - --encoder facebook/dpr-ctx_encoder-multiset-base \ - --index {index_dir} \ - --batch 64 \ - --device cpu \ - --title-delimiter '\n' \ - --shard-id 1 \ - --shard-num 2 " - status = os.system(cmd1) - self.assertEqual(status, 0) - status = os.system(cmd2) - self.assertEqual(status, 0) - cmd3 = f"python -m pyserini.dindex.merge_indexes --prefix {self.pyserini_root}/temp_index- --shard-num 2" - status = os.system(cmd3) - self.assertEqual(status, 0) - index_dir = f'{self.pyserini_root}/temp_index-full' - self.temp_folders.append(index_dir) - searcher = SimpleDenseSearcher( - index_dir, - 'facebook/dpr-question_encoder-multiset-base' - ) - self.assertEqual(searcher.num_docs, 18) - - def tearDown(self): - for f in self.temp_folders: - shutil.rmtree(f)
diff --git a/pyserini/dindex/__init__.py b/pyserini/dindex/__init__.py
deleted file mode 100644
index 471c18d7f..000000000 --- a/pyserini/dindex/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# -# Pyserini: Reproducible IR research with sparse and dense representations -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from ._base import DocumentEncoder, DprDocumentEncoder, AnceDocumentEncoder, \ - TctColBertDocumentEncoder, AutoDocumentEncoder - -__all__ = ['DocumentEncoder', 'DprDocumentEncoder', 'AnceDocumentEncoder', 'TctColBertDocumentEncoder', - 'AutoDocumentEncoder'] \ No newline at end of file diff --git a/pyserini/dindex/__main__.py b/pyserini/dindex/__main__.py deleted file mode 100644 index 293a07ab5..000000000 --- a/pyserini/dindex/__main__.py +++ /dev/null @@ -1,104 +0,0 @@ -# -# Pyserini: Reproducible IR research with sparse and dense representations -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import argparse -import json -import os -import faiss -import numpy as np -from tqdm import tqdm -from pyserini.dindex import DprDocumentEncoder, TctColBertDocumentEncoder, AnceDocumentEncoder, AutoDocumentEncoder - - -def init_encoder(encoder, device): - encoder = encoder.lower() - if 'dpr' in encoder: - return DprDocumentEncoder(encoder, device=device) - elif 'tct_colbert' in encoder: - return TctColBertDocumentEncoder(encoder, device=device) - elif 'ance' in encoder: - return AnceDocumentEncoder(encoder, device=device) - elif 'sentence-transformers' in encoder: - return AutoDocumentEncoder(encoder, device=device, pooling='mean', l2_norm=True) - else: - return AutoDocumentEncoder(encoder, device=device) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--encoder', type=str, help='encoder name or path', required=True) - parser.add_argument('--dimension', type=int, help='dimension of passage embeddings', required=False, default=768) - parser.add_argument('--corpus', type=str, - help='directory that contains corpus files to be encoded, in jsonl format.', required=True) - parser.add_argument('--index', type=str, help='directory to store brute force index of corpus', required=True) - parser.add_argument('--batch', type=int, help='batch size', default=64) - parser.add_argument('--shard-id', type=int, help='shard-id 0-based', default=0) - parser.add_argument('--shard-num', type=int, help='number of shards', default=1) - parser.add_argument('--device', type=str, help='device cpu or cuda [cuda:0, cuda:1...]', default='cuda:0') - parser.add_argument('--title-delimiter', type=str, default=None) - args = parser.parse_args() - - model = init_encoder(args.encoder, device=args.device) - - index = faiss.IndexFlatIP(args.dimension) - - if not os.path.exists(args.index): - os.mkdir(args.index) - - ids = [] - texts = [] - titles = [] - for file in sorted(os.listdir(args.corpus)): - file = os.path.join(args.corpus, file) - if file.endswith('json') or file.endswith('jsonl'): - print(f'Loading {file}') - with open(file, 'r') as corpus: - for idx, line in enumerate(tqdm(corpus.readlines())): - info = json.loads(line) - docid = info['id'] - ids.append(docid) - text = info['contents'].strip() - if args.title_delimiter is None: - texts.append(text.lower()) - else: - if args.title_delimiter == '\\n': - args.title_delimiter = '\n' - elif args.title_delimiter == '\\t': - args.title_delimiter = '\t' - title, text = text.lower().split(args.title_delimiter) - titles.append(title) - texts.append(text) - - total_len = len(texts) - shard_size = int(total_len / args.shard_num) - start_idx = args.shard_id * shard_size - end_idx = min(start_idx + shard_size, total_len) - if args.shard_id == args.shard_num - 1: - end_idx = total_len - - with open(os.path.join(args.index, 'docid'), 'w') as id_file: - for idx in tqdm(range(start_idx, end_idx)): - id_file.write(f'{ids[idx]}\n') - - for idx in tqdm(range(start_idx, end_idx, args.batch)): - text_batch = texts[idx: min(idx + args.batch, end_idx)] - if len(titles) != 0: - title_batch = titles[idx: min(idx + args.batch, end_idx)] - embeddings = model.encode(text_batch, title_batch) - else: - embeddings = model.encode(text_batch) - index.add(np.array(embeddings)) - faiss.write_index(index, os.path.join(args.index, 'index')) diff --git a/pyserini/dindex/_base.py b/pyserini/dindex/_base.py deleted file mode 100644 index a73cf51d3..000000000 --- a/pyserini/dindex/_base.py +++ /dev/null @@ -1,142 +0,0 @@ -# -# Pyserini: Reproducible IR 
research with sparse and dense representations -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import faiss -import torch -from transformers import AutoModel, AutoTokenizer, BertModel, BertTokenizer, DPRContextEncoder, \ - DPRContextEncoderTokenizer, RobertaTokenizer - -from pyserini.dsearch import AnceEncoder - - -class DocumentEncoder: - def encode(self, texts, titles=None): - pass - - -class TctColBertDocumentEncoder(DocumentEncoder): - def __init__(self, model_name, tokenizer_name=None, device='cuda:0'): - self.device = device - self.model = BertModel.from_pretrained(model_name) - self.model.to(self.device) - self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name or model_name) - - def encode(self, texts, titles=None): - texts = ['[CLS] [D] ' + text for text in texts] - max_length = 154 # hardcode for now - inputs = self.tokenizer( - texts, - max_length=max_length, - padding="longest", - truncation=True, - add_special_tokens=False, - return_tensors='pt' - ) - inputs.to(self.device) - outputs = self.model(**inputs) - embeddings = mean_pooling(outputs["last_hidden_state"][:, 4:, :], inputs['attention_mask'][:, 4:]) - return embeddings.detach().cpu().numpy() - - -class DprDocumentEncoder(DocumentEncoder): - def __init__(self, model_name, tokenizer_name=None, device='cuda:0'): - self.device = device - self.model = DPRContextEncoder.from_pretrained(model_name) - self.model.to(self.device) - self.tokenizer = DPRContextEncoderTokenizer.from_pretrained(tokenizer_name or model_name) - - def encode(self, texts, titles=None): - max_length = 256 # hardcode for now - if titles: - inputs = self.tokenizer( - titles, - text_pair=texts, - max_length=max_length, - padding='longest', - truncation=True, - add_special_tokens=True, - return_tensors='pt' - ) - else: - inputs = self.tokenizer( - texts, - max_length=max_length, - padding='longest', - truncation=True, - add_special_tokens=True, - return_tensors='pt' - ) - inputs.to(self.device) - return self.model(inputs["input_ids"]).pooler_output.detach().cpu().numpy() - - -class AnceDocumentEncoder(DocumentEncoder): - def __init__(self, model_name, tokenizer_name=None, device='cuda:0'): - self.device = device - self.model = AnceEncoder.from_pretrained(model_name) - self.model.to(self.device) - self.tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name or model_name) - - def encode(self, texts, titles=None): - max_length = 512 # hardcode for now - inputs = self.tokenizer( - texts, - max_length=max_length, - padding='longest', - truncation=True, - add_special_tokens=True, - return_tensors='pt' - ) - inputs.to(self.device) - return self.model(inputs["input_ids"]).detach().cpu().numpy() - - -class AutoDocumentEncoder(DocumentEncoder): - def __init__(self, model_name, tokenizer_name=None, device='cuda:0', pooling='cls', l2_norm=False): - self.device = device - self.model = AutoModel.from_pretrained(model_name) - self.model.to(self.device) - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name) - self.has_model = True - 
self.pooling = pooling - self.l2_norm = l2_norm - - def encode(self, texts, titles=None): - inputs = self.tokenizer( - texts, - max_length=512, - padding='longest', - truncation=True, - add_special_tokens=True, - return_tensors='pt' - ) - inputs.to(self.device) - outputs = self.model(**inputs) - if self.pooling == "mean": - embeddings = mean_pooling(outputs[0], inputs['attention_mask']).detach().cpu().numpy() - else: - embeddings = outputs[0][:, 0, :].detach().cpu().numpy() - if self.l2_norm: - faiss.normalize_L2(embeddings) - return embeddings - - -def mean_pooling(last_hidden_state, attention_mask): - token_embeddings = last_hidden_state - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) - sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) - return sum_embeddings / sum_mask diff --git a/pyserini/dindex/merge_indexes.py b/pyserini/dindex/merge_indexes.py deleted file mode 100644 index 5662aae9f..000000000 --- a/pyserini/dindex/merge_indexes.py +++ /dev/null @@ -1,46 +0,0 @@ -# -# Pyserini: Reproducible IR research with sparse and dense representations -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import argparse - -import faiss -import os - - -parser = argparse.ArgumentParser() -parser.add_argument('--dimension', type=int, help='dimension of passage embeddings', required=False, default=768) -parser.add_argument('--prefix', type=str, help='directory to store brute force index of corpus', required=True) -parser.add_argument('--shard-num', type=int, help='number of shards', default=1) -args = parser.parse_args() - -new_index = faiss.IndexFlatIP(args.dimension) -docid_files = [] -for i in range(args.shard_num): - index = faiss.read_index(os.path.join(args.prefix + str(i), 'index')) - docid_files.append(os.path.join(args.prefix + str(i), 'docid')) - vectors = index.reconstruct_n(0, index.ntotal) - new_index.add(vectors) - -if not os.path.exists(args.prefix + 'full'): - os.mkdir(args.prefix + 'full') - -faiss.write_index(new_index, os.path.join(args.prefix + 'full', 'index')) - -with open(os.path.join(args.prefix + 'full', 'docid'), 'w') as wfd: - for f in docid_files: - with open(f, 'r') as f1: - for line in f1: - wfd.write(line) diff --git a/scripts/msmarco-doc/rerank_with_maxp.py b/scripts/msmarco-doc/rerank_with_maxp.py index 93d8b46c8..d81de5603 100644 --- a/scripts/msmarco-doc/rerank_with_maxp.py +++ b/scripts/msmarco-doc/rerank_with_maxp.py @@ -134,8 +134,13 @@ def rerank(cache, qid, query, docs, reranker): f'-generator DefaultLuceneDocumentGenerator -threads 1 ' + f'-input {collection_dir} -index {index_path}') elif reranker == 'ance': - os.system(f'python -m pyserini.dindex --corpus {collection_dir} ' + - f'--encoder castorini/ance-msmarco-doc-maxp --index {index_path} --batch 64 --device cpu') + os.system(f'python -m pyserini.encode input --corpus {collection_dir} \ + --fields text \ + output --embeddings {index_path} \ + --to-faiss \ + encoder --encoder 
castorini/ance-msmarco-doc-maxp \ + --fields text \ + --batch 64 --device cpu ') output = [] # Choose which reranker to use: