From 47fa0c69a4fff14a4bf4d1d026939ff0a3ae243f Mon Sep 17 00:00:00 2001 From: ronakice Date: Mon, 25 May 2020 22:35:10 +0530 Subject: [PATCH 1/9] move index dir to options and update instructions for the new index --- README.md | 2 +- pygaggle/run/evaluate_kaggle_highlighter.py | 9 +++++---- pygaggle/settings.py | 1 - scripts/update-index.sh | 13 +++++++------ 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 7420ac78..ea515c7e 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Currently, this repo contains implementations of the rerankers for [CovidQA](htt ## Running rerankers on CovidQA -By default, the script uses `data/lucene-index-covid-paragraph` for the index path. +By default, the script uses `indexes/lucene-index-covid-paragraph` for the index path. If this is undesirable, set the environment variable `CORD19_INDEX_PATH` to the path of the index. For a full list of mostly self-explanatory environment variables, see [this file](https://github.com/castorini/pygaggle/blob/master/pygaggle/settings.py#L7). 
diff --git a/pygaggle/run/evaluate_kaggle_highlighter.py b/pygaggle/run/evaluate_kaggle_highlighter.py index 90a8bc0d..bda1d5a5 100644 --- a/pygaggle/run/evaluate_kaggle_highlighter.py +++ b/pygaggle/run/evaluate_kaggle_highlighter.py @@ -37,6 +37,7 @@ class KaggleEvaluationOptions(BaseModel): dataset: Path + index_dir: Path method: str batch_size: int device: str @@ -149,14 +150,14 @@ def construct_qa_transformer(options: KaggleEvaluationOptions) -> Reranker: def construct_bm25(_: KaggleEvaluationOptions) -> Reranker: - return Bm25Reranker(index_path=SETTINGS.cord19_index_path) + return Bm25Reranker(index_path=options.index_dir) def main(): apb = ArgumentParserBuilder() apb.add_opts(opt('--dataset', - type=Path, - default='data/kaggle-lit-review-0.2.json'), + type=Path), + opt('--index-dir', type=Path, required=True), opt('--method', required=True, type=str, @@ -175,7 +176,7 @@ def main(): args = apb.parser.parse_args() options = KaggleEvaluationOptions(**vars(args)) ds = LitReviewDataset.from_file(str(options.dataset)) - examples = ds.to_senticized_dataset(SETTINGS.cord19_index_path, + examples = ds.to_senticized_dataset(options.index_dir, split=options.split) construct_map = dict(transformer=construct_transformer, bm25=construct_bm25, diff --git a/pygaggle/settings.py b/pygaggle/settings.py index 65aeb3da..bbbe57f2 100644 --- a/pygaggle/settings.py +++ b/pygaggle/settings.py @@ -16,7 +16,6 @@ class MsMarcoSettings(Settings): class Cord19Settings(Settings): - cord19_index_path: str = 'data/lucene-index-covid-paragraph' # T5 model settings t5_model_dir: str = 'gs://neuralresearcher_data/covid/data/model_exp304' t5_model_type: str = 't5-base' diff --git a/scripts/update-index.sh b/scripts/update-index.sh index 70a46562..6fa44982 100644 --- a/scripts/update-index.sh +++ b/scripts/update-index.sh @@ -2,13 +2,14 @@ echo "Updating Anserini index..." 
-INDEX_NAME=lucene-index-covid-paragraph-2020-04-10 -INDEX_URL=https://www.dropbox.com/s/ivk87journyajw3/lucene-index-covid-paragraph-2020-04-10.tar.gz +export INDEX_NAME=${1:-lucene-index-cord19-paragraph-2020-05-12} +export INDEX_URL=${2:-https://www.dropbox.com/s/s3bylw97cf0t2wq/lucene-index-cord19-paragraph-2020-05-12.tar.gz} -wget ${INDEX_URL} -tar xvfz ${INDEX_NAME}.tar.gz && rm ${INDEX_NAME}.tar.gz +wget $INDEX_URL +tar xvfz $INDEX_NAME.tar.gz && rm $INDEX_NAME.tar.gz rm -rf data/lucene-index-covid-paragraph -mv ${INDEX_NAME} data/lucene-index-covid-paragraph +export INDEX_PATH=indexes/$INDEX_NAME +mv $INDEX_NAME $INDEX_PATH -echo "Successfully updated Anserini index at data/${INDEX_NAME}" +echo "Successfully updated Anserini index at data/$INDEX_NAME" From 092630f3fb11a1a0af56ace82d44db0d055d3a85 Mon Sep 17 00:00:00 2001 From: ronakice Date: Mon, 25 May 2020 22:39:12 +0530 Subject: [PATCH 2/9] required field --- pygaggle/run/evaluate_kaggle_highlighter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pygaggle/run/evaluate_kaggle_highlighter.py b/pygaggle/run/evaluate_kaggle_highlighter.py index bda1d5a5..e10789dc 100644 --- a/pygaggle/run/evaluate_kaggle_highlighter.py +++ b/pygaggle/run/evaluate_kaggle_highlighter.py @@ -155,8 +155,7 @@ def construct_bm25(_: KaggleEvaluationOptions) -> Reranker: def main(): apb = ArgumentParserBuilder() - apb.add_opts(opt('--dataset', - type=Path), + apb.add_opts(opt('--dataset', type=Path, required=True), opt('--index-dir', type=Path, required=True), opt('--method', required=True, From e3a17e2942bc9d6f9f87cae8e183e849b21b5509 Mon Sep 17 00:00:00 2001 From: ronakice Date: Mon, 25 May 2020 22:45:34 +0530 Subject: [PATCH 3/9] change data -> indexes --- scripts/update-index.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/update-index.sh b/scripts/update-index.sh index 6fa44982..e77df794 100644 --- a/scripts/update-index.sh +++ b/scripts/update-index.sh @@ -12,4 +12,4 @@ rm 
-rf data/lucene-index-covid-paragraph export INDEX_PATH=indexes/$INDEX_NAME mv $INDEX_NAME $INDEX_PATH -echo "Successfully updated Anserini index at data/$INDEX_NAME" +echo "Successfully updated Anserini index at indexes/$INDEX_NAME" From 847c6aca70d64239a81b12b23a53bd753946ac3e Mon Sep 17 00:00:00 2001 From: ronakice Date: Mon, 25 May 2020 23:03:46 +0530 Subject: [PATCH 4/9] fix bugs --- README.md | 2 -- pygaggle/run/evaluate_kaggle_highlighter.py | 4 ++-- pygaggle/run/evaluate_passage_ranker.py | 5 ++--- scripts/update-index.sh | 16 ++++++++-------- 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index ea515c7e..d8c0071c 100644 --- a/README.md +++ b/README.md @@ -35,8 +35,6 @@ Currently, this repo contains implementations of the rerankers for [CovidQA](htt ## Running rerankers on CovidQA -By default, the script uses `indexes/lucene-index-covid-paragraph` for the index path. -If this is undesirable, set the environment variable `CORD19_INDEX_PATH` to the path of the index. For a full list of mostly self-explanatory environment variables, see [this file](https://github.com/castorini/pygaggle/blob/master/pygaggle/settings.py#L7). BM25 uses the CPU. If you don't have a GPU for the transformer models, pass `--device cpu` (PyTorch device string format) to the script. 
diff --git a/pygaggle/run/evaluate_kaggle_highlighter.py b/pygaggle/run/evaluate_kaggle_highlighter.py index e10789dc..a1881ff3 100644 --- a/pygaggle/run/evaluate_kaggle_highlighter.py +++ b/pygaggle/run/evaluate_kaggle_highlighter.py @@ -149,7 +149,7 @@ def construct_qa_transformer(options: KaggleEvaluationOptions) -> Reranker: return QuestionAnsweringTransformerReranker(model, tokenizer) -def construct_bm25(_: KaggleEvaluationOptions) -> Reranker: +def construct_bm25(options: KaggleEvaluationOptions) -> Reranker: return Bm25Reranker(index_path=options.index_dir) @@ -175,7 +175,7 @@ def main(): args = apb.parser.parse_args() options = KaggleEvaluationOptions(**vars(args)) ds = LitReviewDataset.from_file(str(options.dataset)) - examples = ds.to_senticized_dataset(options.index_dir, + examples = ds.to_senticized_dataset(str(options.index_dir), split=options.split) construct_map = dict(transformer=construct_transformer, bm25=construct_bm25, diff --git a/pygaggle/run/evaluate_passage_ranker.py b/pygaggle/run/evaluate_passage_ranker.py index 6224ae1e..61d60719 100644 --- a/pygaggle/run/evaluate_passage_ranker.py +++ b/pygaggle/run/evaluate_passage_ranker.py @@ -21,7 +21,6 @@ from pygaggle.rerank.random import RandomReranker from pygaggle.rerank.similarity import CosineSimilarityMatrixProvider from pygaggle.model import (SimpleBatchTokenizer, - CachedT5ModelLoader, T5BatchTokenizer, RerankerEvaluator, metric_names, @@ -81,7 +80,7 @@ def tokenizer_sane(cls, v: str, values, **kwargs): def construct_t5(options: PassageRankingEvaluationOptions) -> Reranker: device = torch.device(options.device) model = T5ForConditionalGeneration.from_pretrained(options.model_name_or_path, - from_tf=options.from_tf).to(device).eval() + from_tf=options.from_tf).to(device).eval() tokenizer = AutoTokenizer.from_pretrained(options.model_type) tokenizer = T5BatchTokenizer(tokenizer, options.batch_size) return T5Reranker(model, tokenizer) @@ -91,7 +90,7 @@ def construct_transformer(options: 
PassageRankingEvaluationOptions) -> Reranker: device = torch.device(options.device) model = AutoModel.from_pretrained(options.model_name_or_path, - from_tf=options.from_tf).to(device).eval() + from_tf=options.from_tf).to(device).eval() tokenizer = SimpleBatchTokenizer(AutoTokenizer.from_pretrained( options.tokenizer_name), options.batch_size) diff --git a/scripts/update-index.sh b/scripts/update-index.sh index e77df794..c88ca3d6 100644 --- a/scripts/update-index.sh +++ b/scripts/update-index.sh @@ -2,14 +2,14 @@ echo "Updating Anserini index..." -export INDEX_NAME=${1:-lucene-index-cord19-paragraph-2020-05-12} -export INDEX_URL=${2:-https://www.dropbox.com/s/s3bylw97cf0t2wq/lucene-index-cord19-paragraph-2020-05-12.tar.gz} +INDEX_NAME=${1:-lucene-index-cord19-paragraph-2020-05-12} +INDEX_URL=${2:-https://www.dropbox.com/s/s3bylw97cf0t2wq/lucene-index-cord19-paragraph-2020-05-12.tar.gz} -wget $INDEX_URL -tar xvfz $INDEX_NAME.tar.gz && rm $INDEX_NAME.tar.gz +wget ${INDEX_URL} +tar xvfz ${INDEX_NAME}.tar.gz && rm ${INDEX_NAME}.tar.gz -rm -rf data/lucene-index-covid-paragraph -export INDEX_PATH=indexes/$INDEX_NAME -mv $INDEX_NAME $INDEX_PATH +INDEX_PATH=indexes/${INDEX_NAME} +rm -rf ${INDEX_PATH} +mv ${INDEX_NAME} ${INDEX_PATH} -echo "Successfully updated Anserini index at indexes/$INDEX_NAME" +echo "Successfully updated Anserini index at indexes/${INDEX_NAME}" From d08ea67677411d0a7c180c5e9e7b1f3329832c51 Mon Sep 17 00:00:00 2001 From: ronakice Date: Mon, 25 May 2020 23:13:31 +0530 Subject: [PATCH 5/9] convert to string before passing --- pygaggle/run/evaluate_kaggle_highlighter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygaggle/run/evaluate_kaggle_highlighter.py b/pygaggle/run/evaluate_kaggle_highlighter.py index a1881ff3..490c512f 100644 --- a/pygaggle/run/evaluate_kaggle_highlighter.py +++ b/pygaggle/run/evaluate_kaggle_highlighter.py @@ -150,7 +150,7 @@ def construct_qa_transformer(options: KaggleEvaluationOptions) -> Reranker: def 
construct_bm25(options: KaggleEvaluationOptions) -> Reranker: - return Bm25Reranker(index_path=options.index_dir) + return Bm25Reranker(index_path=str(options.index_dir)) def main(): From 0825de124367ff1c73f0d86430f871dd6eddea1a Mon Sep 17 00:00:00 2001 From: ronakice Date: Mon, 25 May 2020 23:19:14 +0530 Subject: [PATCH 6/9] Consistent variable names across covidqa/marco --- docs/experiments-msmarco-passage.md | 8 ++++---- pygaggle/run/evaluate_passage_ranker.py | 22 +++++++++++----------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/experiments-msmarco-passage.md b/docs/experiments-msmarco-passage.md index e67964dd..a796f744 100644 --- a/docs/experiments-msmarco-passage.md +++ b/docs/experiments-msmarco-passage.md @@ -78,9 +78,9 @@ First, lets evaluate using monoBERT! python -um pygaggle.run.evaluate_passage_ranker --split dev \ --method seq_class_transformer \ --model-name-or-path castorini/monobert_large_msmarco \ - --data-dir data/msmarco_ans_small/ \ + --dataset data/msmarco_ans_small/ \ --index-dir indexes/index-msmarco-passage-20191117-0ed488 \ - --dataset msmarco \ + --task msmarco \ --output-file runs/run.monobert.ans_small.dev.tsv ``` @@ -118,9 +118,9 @@ We use the monoT5-base variant as it is the easiest to run without access to lar python -um pygaggle.run.evaluate_passage_ranker --split dev \ --method t5 \ --model-name-or-path castorini/monot5-base-msmarco \ - --data-dir data/msmarco_ans_small \ + --dataset data/msmarco_ans_small \ --model-type t5-base \ - --dataset msmarco \ + --task msmarco \ --index-dir indexes/index-msmarco-passage-20191117-0ed488 \ --batch-size 32 \ --output-file runs/run.monot5.ans_small.dev.tsv diff --git a/pygaggle/run/evaluate_passage_ranker.py b/pygaggle/run/evaluate_passage_ranker.py index 61d60719..7a5ccddb 100644 --- a/pygaggle/run/evaluate_passage_ranker.py +++ b/pygaggle/run/evaluate_passage_ranker.py @@ -35,8 +35,8 @@ class PassageRankingEvaluationOptions(BaseModel): - dataset: str - data_dir: 
Path + task: str + dataset: Path index_dir: Path method: str model_name_or_path: str @@ -49,17 +49,17 @@ class PassageRankingEvaluationOptions(BaseModel): model_type: Optional[str] tokenizer_name: Optional[str] - @validator('dataset') - def dataset_exists(cls, v: str): + @validator('task') + def task_exists(cls, v: str): assert v in ['msmarco', 'treccar'] - @validator('data_dir') - def datadir_exists(cls, v: str): + @validator('dataset') + def dataset_exists(cls, v: Path): assert v.exists(), 'data directory must exist' return v @validator('index_dir') - def index_dir_exists(cls, v: str): + def index_dir_exists(cls, v: Path): assert v.exists(), 'index directory must exist' return v @@ -120,15 +120,15 @@ def construct_seq_class_transformer(options: PassageRankingEvaluationOptions def construct_bm25(options: PassageRankingEvaluationOptions) -> Reranker: - return Bm25Reranker(index_path=options.index_dir) + return Bm25Reranker(index_path=str(options.index_dir)) def main(): apb = ArgumentParserBuilder() - apb.add_opts(opt('--dataset', + apb.add_opts(opt('--task', type=str, default='msmarco'), - opt('--data-dir', type=Path, required=True), + opt('--dataset', type=Path, required=True), opt('--index-dir', type=Path, required=True), opt('--method', required=True, @@ -154,7 +154,7 @@ def main(): opt('--tokenizer-name', type=str)) args = apb.parser.parse_args() options = PassageRankingEvaluationOptions(**vars(args)) - ds = MsMarcoDataset.from_folder(str(options.data_dir), split=options.split, + ds = MsMarcoDataset.from_folder(str(options.dataset), split=options.split, is_duo=options.is_duo) examples = ds.to_relevance_examples(str(options.index_dir), is_duo=options.is_duo) From 02517703c49bdf6bb095184d197797e5021aa239 Mon Sep 17 00:00:00 2001 From: ronakice Date: Tue, 26 May 2020 00:47:25 +0530 Subject: [PATCH 7/9] change to bert-base-uncased, remove do-lower-case (deprecated in transformers) --- pygaggle/run/evaluate_kaggle_highlighter.py | 14 ++++----------
scripts/evaluate-highlighters.sh | 2 +- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/pygaggle/run/evaluate_kaggle_highlighter.py b/pygaggle/run/evaluate_kaggle_highlighter.py index 490c512f..d4a81c15 100644 --- a/pygaggle/run/evaluate_kaggle_highlighter.py +++ b/pygaggle/run/evaluate_kaggle_highlighter.py @@ -42,7 +42,6 @@ class KaggleEvaluationOptions(BaseModel): batch_size: int device: str split: str - do_lower_case: bool metrics: List[str] model_name: Optional[str] tokenizer_name: Optional[str] @@ -79,8 +78,7 @@ def construct_t5(options: KaggleEvaluationOptions) -> Reranker: device = torch.device(options.device) model = loader.load().to(device).eval() tokenizer = AutoTokenizer.from_pretrained( - options.model_name, - do_lower_case=options.do_lower_case) + options.model_name) tokenizer = T5BatchTokenizer(tokenizer, options.batch_size) return T5Reranker(model, tokenizer) @@ -94,8 +92,7 @@ def construct_transformer(options: KaggleEvaluationOptions) -> Reranker: from_tf=True).to(device).eval() tokenizer = SimpleBatchTokenizer( AutoTokenizer.from_pretrained( - options.tokenizer_name, - do_lower_case=options.do_lower_case), + options.tokenizer_name), options.batch_size) provider = CosineSimilarityMatrixProvider() return UnsupervisedTransformerReranker(model, tokenizer, provider) @@ -124,8 +121,7 @@ def construct_seq_class_transformer(options: device = torch.device(options.device) model = model.to(device).eval() tokenizer = AutoTokenizer.from_pretrained( - options.tokenizer_name, - do_lower_case=options.do_lower_case) + options.tokenizer_name) return SequenceClassificationTransformerReranker(model, tokenizer) @@ -144,8 +140,7 @@ def construct_qa_transformer(options: KaggleEvaluationOptions) -> Reranker: device = torch.device(options.device) model = fixed_model.to(device).eval() tokenizer = AutoTokenizer.from_pretrained( - options.tokenizer_name, - do_lower_case=options.do_lower_case) + options.tokenizer_name) return 
QuestionAnsweringTransformerReranker(model, tokenizer) @@ -166,7 +161,6 @@ def main(): opt('--batch-size', '-bsz', type=int, default=96), opt('--device', type=str, default='cuda:0'), opt('--tokenizer-name', type=str), - opt('--do-lower-case', action='store_true'), opt('--metrics', type=str, nargs='+', diff --git a/scripts/evaluate-highlighters.sh b/scripts/evaluate-highlighters.sh index e7344374..dde229b2 100644 --- a/scripts/evaluate-highlighters.sh +++ b/scripts/evaluate-highlighters.sh @@ -3,7 +3,7 @@ for split in kq nq; do python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method random > results/random-$split.log; python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method bm25 > results/bm25-$split.log; python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method t5 > results/t5-$split.log; - python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method transformer --model-name bert-base-cased > results/bbc-unsup-$split.log; + python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method transformer --model-name bert-base-uncased > results/bbc-unsup-$split.log; python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method transformer --model-name biobert > results/biobert-unsup-$split.log; python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method transformer --model-name allenai/scibert_scivocab_cased > results/scibert-unsup-$split.log; python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method seq_class_transformer --model-name ~/models/biobert-msmarco > results/biobert-marco-$split.log; From 18da4d149f5cbc2662147fbe26044f0b05d5889c Mon Sep 17 00:00:00 2001 From: ronakice Date: Tue, 26 May 2020 01:01:16 +0530 Subject: [PATCH 8/9] revert to bert-base cased --- scripts/evaluate-highlighters.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/evaluate-highlighters.sh b/scripts/evaluate-highlighters.sh index 
dde229b2..e7344374 100644 --- a/scripts/evaluate-highlighters.sh +++ b/scripts/evaluate-highlighters.sh @@ -3,7 +3,7 @@ for split in kq nq; do python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method random > results/random-$split.log; python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method bm25 > results/bm25-$split.log; python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method t5 > results/t5-$split.log; - python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method transformer --model-name bert-base-uncased > results/bbc-unsup-$split.log; + python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method transformer --model-name bert-base-cased > results/bbc-unsup-$split.log; python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method transformer --model-name biobert > results/biobert-unsup-$split.log; python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method transformer --model-name allenai/scibert_scivocab_cased > results/scibert-unsup-$split.log; python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method seq_class_transformer --model-name ~/models/biobert-msmarco > results/biobert-marco-$split.log; From 81d0c93bb20ceea82938bd4b5628985d625394ab Mon Sep 17 00:00:00 2001 From: ronakice Date: Tue, 26 May 2020 19:35:57 +0530 Subject: [PATCH 9/9] revert to lower case --- pygaggle/run/evaluate_kaggle_highlighter.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pygaggle/run/evaluate_kaggle_highlighter.py b/pygaggle/run/evaluate_kaggle_highlighter.py index d4a81c15..4f2a1029 100644 --- a/pygaggle/run/evaluate_kaggle_highlighter.py +++ b/pygaggle/run/evaluate_kaggle_highlighter.py @@ -42,6 +42,7 @@ class KaggleEvaluationOptions(BaseModel): batch_size: int device: str split: str + do_lower_case: bool metrics: List[str] model_name: Optional[str] tokenizer_name: Optional[str] @@ -78,7 +79,7 @@ def 
construct_t5(options: KaggleEvaluationOptions) -> Reranker: device = torch.device(options.device) model = loader.load().to(device).eval() tokenizer = AutoTokenizer.from_pretrained( - options.model_name) + options.model_name, do_lower_case=options.do_lower_case) tokenizer = T5BatchTokenizer(tokenizer, options.batch_size) return T5Reranker(model, tokenizer) @@ -92,7 +93,7 @@ def construct_transformer(options: KaggleEvaluationOptions) -> Reranker: from_tf=True).to(device).eval() tokenizer = SimpleBatchTokenizer( AutoTokenizer.from_pretrained( - options.tokenizer_name), + options.tokenizer_name, do_lower_case=options.do_lower_case), options.batch_size) provider = CosineSimilarityMatrixProvider() return UnsupervisedTransformerReranker(model, tokenizer, provider) @@ -121,7 +122,7 @@ def construct_seq_class_transformer(options: device = torch.device(options.device) model = model.to(device).eval() tokenizer = AutoTokenizer.from_pretrained( - options.tokenizer_name) + options.tokenizer_name, do_lower_case=options.do_lower_case) return SequenceClassificationTransformerReranker(model, tokenizer) @@ -140,7 +141,7 @@ def construct_qa_transformer(options: KaggleEvaluationOptions) -> Reranker: device = torch.device(options.device) model = fixed_model.to(device).eval() tokenizer = AutoTokenizer.from_pretrained( - options.tokenizer_name) + options.tokenizer_name, do_lower_case=options.do_lower_case) return QuestionAnsweringTransformerReranker(model, tokenizer) @@ -161,6 +162,7 @@ def main(): opt('--batch-size', '-bsz', type=int, default=96), opt('--device', type=str, default='cuda:0'), opt('--tokenizer-name', type=str), + opt('--do-lower-case', action='store_true'), opt('--metrics', type=str, nargs='+',