Index dir option, change evaluate_passage_ranking to be consistent with CovidQA, update index to latest #33

Merged · 9 commits · May 26, 2020
2 changes: 0 additions & 2 deletions README.md
@@ -35,8 +35,6 @@ Currently, this repo contains implementations of the rerankers for [CovidQA](htt

## Running rerankers on CovidQA

By default, the script uses `data/lucene-index-covid-paragraph` for the index path.
If this is undesirable, set the environment variable `CORD19_INDEX_PATH` to the path of the index.
For a full list of mostly self-explanatory environment variables, see [this file](https://github.com/castorini/pygaggle/blob/master/pygaggle/settings.py#L7).

BM25 uses the CPU. If you don't have a GPU for the transformer models, pass `--device cpu` (PyTorch device string format) to the script.
8 changes: 4 additions & 4 deletions docs/experiments-msmarco-passage.md
@@ -78,9 +78,9 @@ First, let's evaluate using monoBERT!
python -um pygaggle.run.evaluate_passage_ranker --split dev \
--method seq_class_transformer \
--model-name-or-path castorini/monobert_large_msmarco \
--data-dir data/msmarco_ans_small/ \
--dataset data/msmarco_ans_small/ \
--index-dir indexes/index-msmarco-passage-20191117-0ed488 \
--dataset msmarco \
--task msmarco \
--output-file runs/run.monobert.ans_small.dev.tsv
```

@@ -118,9 +118,9 @@ We use the monoT5-base variant as it is the easiest to run without access to lar
python -um pygaggle.run.evaluate_passage_ranker --split dev \
--method t5 \
--model-name-or-path castorini/monot5-base-msmarco \
--data-dir data/msmarco_ans_small \
--dataset data/msmarco_ans_small \
--model-type t5-base \
--dataset msmarco \
--task msmarco \
--index-dir indexes/index-msmarco-passage-20191117-0ed488 \
--batch-size 32 \
--output-file runs/run.monot5.ans_small.dev.tsv
24 changes: 10 additions & 14 deletions pygaggle/run/evaluate_kaggle_highlighter.py
@@ -37,6 +37,7 @@

class KaggleEvaluationOptions(BaseModel):
dataset: Path
index_dir: Path
method: str
batch_size: int
device: str
@@ -78,8 +79,7 @@ def construct_t5(options: KaggleEvaluationOptions) -> Reranker:
device = torch.device(options.device)
model = loader.load().to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(
options.model_name,
do_lower_case=options.do_lower_case)
options.model_name, do_lower_case=options.do_lower_case)
tokenizer = T5BatchTokenizer(tokenizer, options.batch_size)
return T5Reranker(model, tokenizer)

@@ -93,8 +93,7 @@ def construct_transformer(options: KaggleEvaluationOptions) -> Reranker:
from_tf=True).to(device).eval()
tokenizer = SimpleBatchTokenizer(
AutoTokenizer.from_pretrained(
options.tokenizer_name,
do_lower_case=options.do_lower_case),
options.tokenizer_name, do_lower_case=options.do_lower_case),
options.batch_size)
provider = CosineSimilarityMatrixProvider()
return UnsupervisedTransformerReranker(model, tokenizer, provider)
@@ -123,8 +122,7 @@ def construct_seq_class_transformer(options:
device = torch.device(options.device)
model = model.to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(
options.tokenizer_name,
do_lower_case=options.do_lower_case)
options.tokenizer_name, do_lower_case=options.do_lower_case)
return SequenceClassificationTransformerReranker(model, tokenizer)


@@ -143,20 +141,18 @@ def construct_qa_transformer(options: KaggleEvaluationOptions) -> Reranker:
device = torch.device(options.device)
model = fixed_model.to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(
options.tokenizer_name,
do_lower_case=options.do_lower_case)
options.tokenizer_name, do_lower_case=options.do_lower_case)
return QuestionAnsweringTransformerReranker(model, tokenizer)


def construct_bm25(_: KaggleEvaluationOptions) -> Reranker:
return Bm25Reranker(index_path=SETTINGS.cord19_index_path)
def construct_bm25(options: KaggleEvaluationOptions) -> Reranker:
return Bm25Reranker(index_path=str(options.index_dir))


def main():
apb = ArgumentParserBuilder()
apb.add_opts(opt('--dataset',
type=Path,
default='data/kaggle-lit-review-0.2.json'),
apb.add_opts(opt('--dataset', type=Path, required=True),
opt('--index-dir', type=Path, required=True),
opt('--method',
required=True,
type=str,
@@ -175,7 +171,7 @@ def main():
args = apb.parser.parse_args()
options = KaggleEvaluationOptions(**vars(args))
ds = LitReviewDataset.from_file(str(options.dataset))
examples = ds.to_senticized_dataset(SETTINGS.cord19_index_path,
examples = ds.to_senticized_dataset(str(options.index_dir),
split=options.split)
construct_map = dict(transformer=construct_transformer,
bm25=construct_bm25,
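With `--dataset` and `--index-dir` now required, a CovidQA evaluation spells out both inputs on the command line. A minimal invocation sketch (the method choice and both paths are placeholders drawn from defaults used elsewhere in this PR, not a verified command):

```bash
# Hypothetical run: BM25 reranking of CovidQA against an explicit index directory.
python -um pygaggle.run.evaluate_kaggle_highlighter \
    --method bm25 \
    --dataset data/kaggle-lit-review-0.2.json \
    --index-dir indexes/lucene-index-cord19-paragraph-2020-05-12
```

Transformer-based methods take the same two flags; per the README, add `--device cpu` when no GPU is available.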
27 changes: 13 additions & 14 deletions pygaggle/run/evaluate_passage_ranker.py
@@ -21,7 +21,6 @@
from pygaggle.rerank.random import RandomReranker
from pygaggle.rerank.similarity import CosineSimilarityMatrixProvider
from pygaggle.model import (SimpleBatchTokenizer,
CachedT5ModelLoader,
T5BatchTokenizer,
RerankerEvaluator,
metric_names,
@@ -36,8 +35,8 @@


class PassageRankingEvaluationOptions(BaseModel):
dataset: str
data_dir: Path
task: str
dataset: Path
index_dir: Path
method: str
model_name_or_path: str
@@ -50,17 +49,17 @@ class PassageRankingEvaluationOptions(BaseModel):
model_type: Optional[str]
tokenizer_name: Optional[str]

@validator('dataset')
def dataset_exists(cls, v: str):
@validator('task')
def task_exists(cls, v: str):
assert v in ['msmarco', 'treccar']

@validator('data_dir')
def datadir_exists(cls, v: str):
@validator('dataset')
def dataset_exists(cls, v: Path):
assert v.exists(), 'data directory must exist'
return v

@validator('index_dir')
def index_dir_exists(cls, v: str):
def index_dir_exists(cls, v: Path):
assert v.exists(), 'index directory must exist'
return v

@@ -81,7 +80,7 @@ def tokenizer_sane(cls, v: str, values, **kwargs):
def construct_t5(options: PassageRankingEvaluationOptions) -> Reranker:
device = torch.device(options.device)
model = T5ForConditionalGeneration.from_pretrained(options.model_name_or_path,
from_tf=options.from_tf).to(device).eval()
from_tf=options.from_tf).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(options.model_type)
tokenizer = T5BatchTokenizer(tokenizer, options.batch_size)
return T5Reranker(model, tokenizer)
@@ -91,7 +90,7 @@ def construct_transformer(options:
PassageRankingEvaluationOptions) -> Reranker:
device = torch.device(options.device)
model = AutoModel.from_pretrained(options.model_name_or_path,
from_tf=options.from_tf).to(device).eval()
from_tf=options.from_tf).to(device).eval()
tokenizer = SimpleBatchTokenizer(AutoTokenizer.from_pretrained(
options.tokenizer_name),
options.batch_size)
@@ -121,15 +120,15 @@ def construct_seq_class_transformer(options: PassageRankingEvaluationOptions


def construct_bm25(options: PassageRankingEvaluationOptions) -> Reranker:
return Bm25Reranker(index_path=options.index_dir)
return Bm25Reranker(index_path=str(options.index_dir))


def main():
apb = ArgumentParserBuilder()
apb.add_opts(opt('--dataset',
apb.add_opts(opt('--task',
type=str,
default='msmarco'),
opt('--data-dir', type=Path, required=True),
opt('--dataset', type=Path, required=True),
opt('--index-dir', type=Path, required=True),
opt('--method',
required=True,
@@ -155,7 +154,7 @@
opt('--tokenizer-name', type=str))
args = apb.parser.parse_args()
options = PassageRankingEvaluationOptions(**vars(args))
ds = MsMarcoDataset.from_folder(str(options.data_dir), split=options.split,
ds = MsMarcoDataset.from_folder(str(options.dataset), split=options.split,
is_duo=options.is_duo)
examples = ds.to_relevance_examples(str(options.index_dir),
is_duo=options.is_duo)
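For readers unfamiliar with the pydantic idiom used above, here is a stripped-down sketch (illustrative only, not code from this PR) of how the renamed options validate: `--task` must name a supported task, while `--dataset` and `--index-dir` must point at existing paths.

```python
from pathlib import Path

from pydantic import BaseModel, validator


class Options(BaseModel):
    task: str
    dataset: Path
    index_dir: Path

    @validator('task')
    def task_known(cls, v: str):
        # Only the two supported evaluation tasks are accepted.
        assert v in ['msmarco', 'treccar']
        return v

    @validator('dataset', 'index_dir')
    def path_exists(cls, v: Path):
        # The data directory and the Anserini index must already exist on disk.
        assert v.exists(), f'{v} must exist'
        return v


# Raises pydantic.ValidationError for an unknown task or a missing path;
# the paths below are the placeholders used in the MS MARCO docs above.
# Options(task='msmarco',
#         dataset=Path('data/msmarco_ans_small'),
#         index_dir=Path('indexes/index-msmarco-passage-20191117-0ed488'))
```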
1 change: 0 additions & 1 deletion pygaggle/settings.py
@@ -16,7 +16,6 @@ class MsMarcoSettings(Settings):


class Cord19Settings(Settings):
cord19_index_path: str = 'data/lucene-index-covid-paragraph'
# T5 model settings
t5_model_dir: str = 'gs://neuralresearcher_data/covid/data/model_exp304'
t5_model_type: str = 't5-base'
11 changes: 6 additions & 5 deletions scripts/update-index.sh
@@ -2,13 +2,14 @@

echo "Updating Anserini index..."

INDEX_NAME=lucene-index-covid-paragraph-2020-04-10
INDEX_URL=https://www.dropbox.com/s/ivk87journyajw3/lucene-index-covid-paragraph-2020-04-10.tar.gz
INDEX_NAME=${1:-lucene-index-cord19-paragraph-2020-05-12}
INDEX_URL=${2:-https://www.dropbox.com/s/s3bylw97cf0t2wq/lucene-index-cord19-paragraph-2020-05-12.tar.gz}

wget ${INDEX_URL}
tar xvfz ${INDEX_NAME}.tar.gz && rm ${INDEX_NAME}.tar.gz

rm -rf data/lucene-index-covid-paragraph
mv ${INDEX_NAME} data/lucene-index-covid-paragraph
INDEX_PATH=indexes/${INDEX_NAME}
rm -rf ${INDEX_PATH}
mv ${INDEX_NAME} ${INDEX_PATH}

echo "Successfully updated Anserini index at data/${INDEX_NAME}"
echo "Successfully updated Anserini index at indexes/${INDEX_NAME}"