diff --git a/.gitignore b/.gitignore index 73c802e066..b97ccc8d19 100644 --- a/.gitignore +++ b/.gitignore @@ -59,3 +59,6 @@ src/main/java/io/anserini/scratch/ # these are just concatenation of TREC-COVID round 1 and round2 qrels, so no need to check in. src/main/resources/topics-and-qrels/qrels.covid-round12.txt src/main/resources/topics-and-qrels/qrels.covid-round2-cumulative.txt + +# TREC 2022 NeuCLIR qrels haven't been officially released yet. +src/main/resources/topics-and-qrels/qrels.neuclir22-*.txt \ No newline at end of file diff --git a/README.md b/README.md index ffcdcd5e37..7383b65ace 100644 --- a/README.md +++ b/README.md @@ -203,6 +203,8 @@ See individual pages for details! + Regressions for HC4 (v1.0) baselines on HC4 corpora: [Persian](docs/regressions-hc4-v1.0-fa.md), [Russian](docs/regressions-hc4-v1.0-ru.md), [Chinese](docs/regressions-hc4-v1.0-zh.md) + Regressions for HC4 (v1.0) baselines on original NeuCLIR22 corpora: [Persian](docs/regressions-hc4-neuclir22-fa.md), [Russian](docs/regressions-hc4-neuclir22-ru.md), [Chinese](docs/regressions-hc4-neuclir22-zh.md) + Regressions for HC4 (v1.0) baselines on translated NeuCLIR22 corpora: [Persian](docs/regressions-hc4-neuclir22-fa-en.md), [Russian](docs/regressions-hc4-neuclir22-ru-en.md), [Chinese](docs/regressions-hc4-neuclir22-zh-en.md) ++ Regressions for TREC 2022 NeuCLIR Track (query translation): [Persian](docs/regressions-neuclir22-fa-qt.md), [Russian](docs/regressions-neuclir22-ru-qt.md), [Chinese](docs/regressions-neuclir22-zh-qt.md) ++ Regressions for TREC 2022 NeuCLIR Track (document translation): [Persian](docs/regressions-neuclir22-fa-dt.md), [Russian](docs/regressions-neuclir22-ru-dt.md), [Chinese](docs/regressions-neuclir22-zh-dt.md) + Regressions for DPR Wikipedia QA baselines: [100-word splits](docs/regressions-wikipedia-dpr-100w-bm25.md) diff --git a/docs/regressions-neuclir22-fa-dt.md b/docs/regressions-neuclir22-fa-dt.md new file mode 100644 index 0000000000..52ec7a77ce --- /dev/null +++ b/docs/regressions-neuclir22-fa-dt.md @@ -0,0 +1,172 @@ +# Anserini Regressions: NeuCLIR22 — Persian (Document Translation) + +This page documents BM25 regression experiments for the [TREC 2022 NeuCLIR Track](https://neuclir.github.io/), Persian, using document translation (i.e., corpus translation provided by the organizers). + +The exact configurations for these regressions are stored in [this YAML file](../src/main/resources/regression/neuclir22-fa-dt.yaml). +Note that this page is automatically generated from [this template](../src/main/resources/docgen/templates/neuclir22-fa-dt.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +``` +python src/main/python/run_regression.py --index --verify --search --regression neuclir22-fa-dt +``` + +## Corpus Download + +The NeuCLIR 2022 corpus can be downloaded following the instructions [here](https://neuclir.github.io/).
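+
+Once the corpus has been unpacked into `collections/` (as described in the next step), a quick sanity check can save a failed indexing run later. This is an editorial sketch rather than part of the official regression; the file names inside the corpus depend on how it is packaged, so adjust the paths to whatever you actually unpacked:
+
+```bash
+# List the unpacked files and get a rough sense of corpus size before indexing.
+find collections/neuclir22-fa-en -type f | head
+du -sh collections/neuclir22-fa-en
+```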
+ +With the corpus downloaded, unpack into `collections/` and run the following command to perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression neuclir22-fa-dt \ + --corpus-path collections/neuclir22-fa-en +``` + +## Indexing + +Typical indexing command: + +``` +target/appassembler/bin/IndexCollection \ + -collection NeuClirCollection \ + -input /path/to/neuclir22-fa-en \ + -index indexes/lucene-index.neuclir22-fa-en \ + -generator DefaultLuceneDocumentGenerator \ + -threads 8 -storePositions -storeDocvectors -storeRaw \ + >& logs/log.neuclir22-fa-en & +``` + +For additional details, see explanation of [common indexing options](common-indexing-options.md). + +## Retrieval + +After indexing has completed, you should be able to perform retrieval as follows: + +``` +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-fa-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-fa-en.bm25-default.topics.neuclir22.en.title.txt \ + -bm25 & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-fa-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.desc.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-fa-en.bm25-default.topics.neuclir22.en.desc.txt \ + -bm25 & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-fa-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.desc.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-fa-en.bm25-default.topics.neuclir22.en.desc.title.txt \ + -bm25 & + +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-fa-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-fa-en.bm25-default+rm3.topics.neuclir22.en.title.txt \ + -bm25 -rm3 & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-fa-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.desc.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-fa-en.bm25-default+rm3.topics.neuclir22.en.desc.txt \ + -bm25 -rm3 & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-fa-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.desc.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-fa-en.bm25-default+rm3.topics.neuclir22.en.desc.title.txt \ + -bm25 -rm3 & + +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-fa-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-fa-en.bm25-default+rocchio.topics.neuclir22.en.title.txt \ + -bm25 -rocchio & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-fa-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.desc.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-fa-en.bm25-default+rocchio.topics.neuclir22.en.desc.txt \ + -bm25 -rocchio & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-fa-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.desc.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-fa-en.bm25-default+rocchio.topics.neuclir22.en.desc.title.txt \ + -bm25 -rocchio & +``` + +Evaluation can be performed using 
`trec_eval`: + +``` +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default.topics.neuclir22.en.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default.topics.neuclir22.en.desc.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default.topics.neuclir22.en.desc.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default.topics.neuclir22.en.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default.topics.neuclir22.en.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default.topics.neuclir22.en.desc.title.txt + +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rm3.topics.neuclir22.en.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rm3.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rm3.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rm3.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rm3.topics.neuclir22.en.desc.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rm3.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rm3.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map 
src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rm3.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rm3.topics.neuclir22.en.desc.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rm3.topics.neuclir22.en.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rm3.topics.neuclir22.en.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rm3.topics.neuclir22.en.desc.title.txt + +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rocchio.topics.neuclir22.en.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rocchio.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rocchio.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rocchio.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rocchio.topics.neuclir22.en.desc.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rocchio.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rocchio.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rocchio.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rocchio.topics.neuclir22.en.desc.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rocchio.topics.neuclir22.en.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rocchio.topics.neuclir22.en.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa-en.bm25-default+rocchio.topics.neuclir22.en.desc.title.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **MAP** | **BM25 (default)**| **+RM3** | **+Rocchio**| +|:-------------------------------------------------------------------------------------------------------------|-----------|-----------|-----------| +| [NeuCLIR 2022 (Persian): title](https://neuclir.github.io/) | 0.2617 | 0.2980 
| 0.3028 | +| [NeuCLIR 2022 (Persian): desc](https://neuclir.github.io/) | 0.1690 | 0.2435 | 0.2444 | +| [NeuCLIR 2022 (Persian): desc+title](https://neuclir.github.io/) | 0.2566 | 0.2969 | 0.3000 | +| **nDCG@20** | **BM25 (default)**| **+RM3** | **+Rocchio**| +| [NeuCLIR 2022 (Persian): title](https://neuclir.github.io/) | 0.4420 | 0.4320 | 0.4358 | +| [NeuCLIR 2022 (Persian): desc](https://neuclir.github.io/) | 0.3351 | 0.3754 | 0.3770 | +| [NeuCLIR 2022 (Persian): desc+title](https://neuclir.github.io/) | 0.4376 | 0.4333 | 0.4437 | +| **J@20** | **BM25 (default)**| **+RM3** | **+Rocchio**| +| [NeuCLIR 2022 (Persian): title](https://neuclir.github.io/) | 0.3680 | 0.3974 | 0.3961 | +| [NeuCLIR 2022 (Persian): desc](https://neuclir.github.io/) | 0.3048 | 0.3614 | 0.3561 | +| [NeuCLIR 2022 (Persian): desc+title](https://neuclir.github.io/) | 0.3706 | 0.3939 | 0.3930 | +| **Recall@1000** | **BM25 (default)**| **+RM3** | **+Rocchio**| +| [NeuCLIR 2022 (Persian): title](https://neuclir.github.io/) | 0.6817 | 0.7495 | 0.7631 | +| [NeuCLIR 2022 (Persian): desc](https://neuclir.github.io/) | 0.5793 | 0.7234 | 0.7149 | +| [NeuCLIR 2022 (Persian): desc+title](https://neuclir.github.io/) | 0.6841 | 0.7795 | 0.7815 | + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../src/main/resources/docgen/templates/neuclir22-fa-dt.template) and run `bin/build.sh` to rebuild the documentation. + diff --git a/docs/regressions-neuclir22-fa-qt.md b/docs/regressions-neuclir22-fa-qt.md new file mode 100644 index 0000000000..f9018b3182 --- /dev/null +++ b/docs/regressions-neuclir22-fa-qt.md @@ -0,0 +1,172 @@ +# Anserini Regressions: NeuCLIR22 — Persian (Query Translation) + +This page documents BM25 regression experiments for the [TREC 2022 NeuCLIR Track](https://neuclir.github.io/), Persian, using query translation (i.e., human translations provided by the organizers). + +The exact configurations for these regressions are stored in [this YAML file](../src/main/resources/regression/neuclir22-fa-qt.yaml). +Note that this page is automatically generated from [this template](../src/main/resources/docgen/templates/neuclir22-fa-qt.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +``` +python src/main/python/run_regression.py --index --verify --search --regression neuclir22-fa-qt +``` + +## Corpus Download + +The NeuCLIR 2022 corpus can be downloaded following the instructions [here](https://neuclir.github.io/). + +With the corpus downloaded, unpack into `collections/` and run the following command to perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression neuclir22-fa-qt \ + --corpus-path collections/neuclir22-fa +``` + +## Indexing + +Typical indexing command: + +``` +target/appassembler/bin/IndexCollection \ + -collection NeuClirCollection \ + -input /path/to/neuclir22-fa \ + -index indexes/lucene-index.neuclir22-fa \ + -generator DefaultLuceneDocumentGenerator \ + -threads 8 -storePositions -storeDocvectors -storeRaw -language fa \ + >& logs/log.neuclir22-fa & +``` + +For additional details, see explanation of [common indexing options](common-indexing-options.md). 
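+
+Because the indexing job above runs in the background and redirects all of its output to `logs/log.neuclir22-fa`, failures are easy to miss. A minimal way to confirm it completed (an editorial sketch; the exact log messages depend on the Anserini version):
+
+```bash
+# The end of the log should show IndexCollection finishing normally rather than a stack trace.
+tail -n 20 logs/log.neuclir22-fa
+# Confirm the index directory was actually written.
+du -sh indexes/lucene-index.neuclir22-fa
+```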
+ +## Retrieval + +After indexing has completed, you should be able to perform retrieval as follows: + +``` +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-fa \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.fa.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-fa.bm25-default.topics.neuclir22.fa.title.txt \ + -bm25 -language fa & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-fa \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.fa.desc.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-fa.bm25-default.topics.neuclir22.fa.desc.txt \ + -bm25 -language fa & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-fa \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.fa.desc.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-fa.bm25-default.topics.neuclir22.fa.desc.title.txt \ + -bm25 -language fa & + +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-fa \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.fa.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-fa.bm25-default+rm3.topics.neuclir22.fa.title.txt \ + -bm25 -rm3 -language fa & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-fa \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.fa.desc.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-fa.bm25-default+rm3.topics.neuclir22.fa.desc.txt \ + -bm25 -rm3 -language fa & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-fa \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.fa.desc.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-fa.bm25-default+rm3.topics.neuclir22.fa.desc.title.txt \ + -bm25 -rm3 -language fa & + +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-fa \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.fa.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-fa.bm25-default+rocchio.topics.neuclir22.fa.title.txt \ + -bm25 -rocchio -language fa & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-fa \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.fa.desc.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-fa.bm25-default+rocchio.topics.neuclir22.fa.desc.txt \ + -bm25 -rocchio -language fa & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-fa \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.fa.desc.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-fa.bm25-default+rocchio.topics.neuclir22.fa.desc.title.txt \ + -bm25 -rocchio -language fa & +``` + +Evaluation can be performed using `trec_eval`: + +``` +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default.topics.neuclir22.fa.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default.topics.neuclir22.fa.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default.topics.neuclir22.fa.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt 
runs/run.neuclir22-fa.bm25-default.topics.neuclir22.fa.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default.topics.neuclir22.fa.desc.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default.topics.neuclir22.fa.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default.topics.neuclir22.fa.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default.topics.neuclir22.fa.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default.topics.neuclir22.fa.desc.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default.topics.neuclir22.fa.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default.topics.neuclir22.fa.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default.topics.neuclir22.fa.desc.title.txt + +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rm3.topics.neuclir22.fa.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rm3.topics.neuclir22.fa.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rm3.topics.neuclir22.fa.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rm3.topics.neuclir22.fa.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rm3.topics.neuclir22.fa.desc.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rm3.topics.neuclir22.fa.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rm3.topics.neuclir22.fa.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rm3.topics.neuclir22.fa.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rm3.topics.neuclir22.fa.desc.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rm3.topics.neuclir22.fa.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rm3.topics.neuclir22.fa.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map 
src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rm3.topics.neuclir22.fa.desc.title.txt + +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rocchio.topics.neuclir22.fa.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rocchio.topics.neuclir22.fa.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rocchio.topics.neuclir22.fa.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rocchio.topics.neuclir22.fa.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rocchio.topics.neuclir22.fa.desc.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rocchio.topics.neuclir22.fa.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rocchio.topics.neuclir22.fa.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rocchio.topics.neuclir22.fa.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rocchio.topics.neuclir22.fa.desc.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rocchio.topics.neuclir22.fa.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rocchio.topics.neuclir22.fa.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-fa.txt runs/run.neuclir22-fa.bm25-default+rocchio.topics.neuclir22.fa.desc.title.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **MAP** | **BM25 (default)**| **+RM3** | **+Rocchio**| +|:-------------------------------------------------------------------------------------------------------------|-----------|-----------|-----------| +| [NeuCLIR 2022 (Persian): title](https://neuclir.github.io/) | 0.2554 | 0.1956 | 0.2922 | +| [NeuCLIR 2022 (Persian): desc](https://neuclir.github.io/) | 0.2245 | 0.1366 | 0.2734 | +| [NeuCLIR 2022 (Persian): desc+title](https://neuclir.github.io/) | 0.2730 | 0.1760 | 0.3074 | +| **nDCG@20** | **BM25 (default)**| **+RM3** | **+Rocchio**| +| [NeuCLIR 2022 (Persian): title](https://neuclir.github.io/) | 0.4273 | 0.3492 | 0.4327 | +| [NeuCLIR 2022 (Persian): desc](https://neuclir.github.io/) | 0.3546 | 0.2679 | 0.3813 | +| [NeuCLIR 2022 (Persian): desc+title](https://neuclir.github.io/) | 0.4301 | 0.3274 | 0.4379 | +| **J@20** | **BM25 (default)**| **+RM3** | **+Rocchio**| +| [NeuCLIR 2022 (Persian): title](https://neuclir.github.io/) | 0.3759 | 0.3250 | 0.3882 | +| [NeuCLIR 2022 (Persian): desc](https://neuclir.github.io/) | 0.3592 | 0.2684 | 0.3781 | +| 
[NeuCLIR 2022 (Persian): desc+title](https://neuclir.github.io/) | 0.4035 | 0.3110 | 0.4035 | +| **Recall@1000** | **BM25 (default)**| **+RM3** | **+Rocchio**| +| [NeuCLIR 2022 (Persian): title](https://neuclir.github.io/) | 0.6899 | 0.6434 | 0.7633 | +| [NeuCLIR 2022 (Persian): desc](https://neuclir.github.io/) | 0.6814 | 0.5069 | 0.7506 | +| [NeuCLIR 2022 (Persian): desc+title](https://neuclir.github.io/) | 0.7290 | 0.6033 | 0.7998 | + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../src/main/resources/docgen/templates/neuclir22-fa-qt.template) and run `bin/build.sh` to rebuild the documentation. + diff --git a/docs/regressions-neuclir22-ru-dt.md b/docs/regressions-neuclir22-ru-dt.md new file mode 100644 index 0000000000..d57f6ad80c --- /dev/null +++ b/docs/regressions-neuclir22-ru-dt.md @@ -0,0 +1,172 @@ +# Anserini Regressions: NeuCLIR22 — Russian (Document Translation) + +This page documents BM25 regression experiments for the [TREC 2022 NeuCLIR Track](https://neuclir.github.io/), Russian, using document translation (i.e., corpus translation provided by the organizers). + +The exact configurations for these regressions are stored in [this YAML file](../src/main/resources/regression/neuclir22-ru-dt.yaml). +Note that this page is automatically generated from [this template](../src/main/resources/docgen/templates/neuclir22-ru-dt.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +``` +python src/main/python/run_regression.py --index --verify --search --regression neuclir22-ru-dt +``` + +## Corpus Download + +The NeuCLIR 2022 corpus can be downloaded following the instructions [here](https://neuclir.github.io/). + +With the corpus downloaded, unpack into `collections/` and run the following command to perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression neuclir22-ru-dt \ + --corpus-path collections/neuclir22-ru-en +``` + +## Indexing + +Typical indexing command: + +``` +target/appassembler/bin/IndexCollection \ + -collection NeuClirCollection \ + -input /path/to/neuclir22-ru-en \ + -index indexes/lucene-index.neuclir22-ru-en \ + -generator DefaultLuceneDocumentGenerator \ + -threads 8 -storePositions -storeDocvectors -storeRaw \ + >& logs/log.neuclir22-ru-en & +``` + +For additional details, see explanation of [common indexing options](common-indexing-options.md). 
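+
+The three document-translation regressions (Persian, Russian, Chinese) use identical indexing options and differ only in the corpus and index names, so the indexes can also be built in one go. A sketch, assuming all three translated corpora have been unpacked under `collections/` as described above:
+
+```bash
+# Build the fa-en, ru-en, and zh-en indexes with the same options as above.
+for lang in fa ru zh; do
+  target/appassembler/bin/IndexCollection \
+    -collection NeuClirCollection \
+    -input collections/neuclir22-${lang}-en \
+    -index indexes/lucene-index.neuclir22-${lang}-en \
+    -generator DefaultLuceneDocumentGenerator \
+    -threads 8 -storePositions -storeDocvectors -storeRaw \
+    >& logs/log.neuclir22-${lang}-en &
+done
+```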
+ +## Retrieval + +After indexing has completed, you should be able to perform retrieval as follows: + +``` +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-ru-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-ru-en.bm25-default.topics.neuclir22.en.title.txt \ + -bm25 & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-ru-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.desc.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-ru-en.bm25-default.topics.neuclir22.en.desc.txt \ + -bm25 & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-ru-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.desc.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-ru-en.bm25-default.topics.neuclir22.en.desc.title.txt \ + -bm25 & + +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-ru-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-ru-en.bm25-default+rm3.topics.neuclir22.en.title.txt \ + -bm25 -rm3 & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-ru-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.desc.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-ru-en.bm25-default+rm3.topics.neuclir22.en.desc.txt \ + -bm25 -rm3 & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-ru-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.desc.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-ru-en.bm25-default+rm3.topics.neuclir22.en.desc.title.txt \ + -bm25 -rm3 & + +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-ru-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-ru-en.bm25-default+rocchio.topics.neuclir22.en.title.txt \ + -bm25 -rocchio & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-ru-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.desc.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-ru-en.bm25-default+rocchio.topics.neuclir22.en.desc.txt \ + -bm25 -rocchio & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-ru-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.desc.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-ru-en.bm25-default+rocchio.topics.neuclir22.en.desc.title.txt \ + -bm25 -rocchio & +``` + +Evaluation can be performed using `trec_eval`: + +``` +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default.topics.neuclir22.en.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default.topics.neuclir22.en.title.txt 
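+# (Editorial note) Each run is scored four ways against the same qrels: nDCG@20, judged@20 (the
+# fraction of the top 20 hits that have relevance judgments, computed through Pyserini's trec_eval
+# wrapper), Recall@1000, and MAP. The same four-command pattern repeats below for the desc and
+# desc+title runs and for the +RM3 and +Rocchio variants.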
+tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default.topics.neuclir22.en.desc.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default.topics.neuclir22.en.desc.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default.topics.neuclir22.en.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default.topics.neuclir22.en.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default.topics.neuclir22.en.desc.title.txt + +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rm3.topics.neuclir22.en.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rm3.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rm3.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rm3.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rm3.topics.neuclir22.en.desc.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rm3.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rm3.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rm3.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rm3.topics.neuclir22.en.desc.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rm3.topics.neuclir22.en.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rm3.topics.neuclir22.en.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map 
src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rm3.topics.neuclir22.en.desc.title.txt + +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rocchio.topics.neuclir22.en.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rocchio.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rocchio.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rocchio.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rocchio.topics.neuclir22.en.desc.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rocchio.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rocchio.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rocchio.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rocchio.topics.neuclir22.en.desc.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rocchio.topics.neuclir22.en.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rocchio.topics.neuclir22.en.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru-en.bm25-default+rocchio.topics.neuclir22.en.desc.title.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **MAP** | **BM25 (default)**| **+RM3** | **+Rocchio**| +|:-------------------------------------------------------------------------------------------------------------|-----------|-----------|-----------| +| [NeuCLIR 2022 (Russian): title](https://neuclir.github.io/) | 0.2492 | 0.3005 | 0.2979 | +| [NeuCLIR 2022 (Russian): desc](https://neuclir.github.io/) | 0.1489 | 0.2408 | 0.2291 | +| [NeuCLIR 2022 (Russian): desc+title](https://neuclir.github.io/) | 0.2307 | 0.2982 | 0.2919 | +| **nDCG@20** | **BM25 (default)**| **+RM3** | **+Rocchio**| +| [NeuCLIR 2022 (Russian): title](https://neuclir.github.io/) | 0.4123 | 0.4312 | 0.4309 | +| [NeuCLIR 2022 (Russian): desc](https://neuclir.github.io/) | 0.2755 | 0.3524 | 0.3511 | +| [NeuCLIR 2022 (Russian): desc+title](https://neuclir.github.io/) | 0.3789 | 0.4092 | 0.4166 | +| **J@20** | **BM25 (default)**| **+RM3** | **+Rocchio**| +| [NeuCLIR 2022 (Russian): title](https://neuclir.github.io/) | 0.3702 | 0.3882 | 0.3868 | +| [NeuCLIR 2022 (Russian): 
desc](https://neuclir.github.io/) | 0.2772 | 0.3417 | 0.3382 | +| [NeuCLIR 2022 (Russian): desc+title](https://neuclir.github.io/) | 0.3618 | 0.3737 | 0.3811 | +| **Recall@1000** | **BM25 (default)**| **+RM3** | **+Rocchio**| +| [NeuCLIR 2022 (Russian): title](https://neuclir.github.io/) | 0.6942 | 0.7233 | 0.7542 | +| [NeuCLIR 2022 (Russian): desc](https://neuclir.github.io/) | 0.5448 | 0.6601 | 0.6622 | +| [NeuCLIR 2022 (Russian): desc+title](https://neuclir.github.io/) | 0.6921 | 0.7252 | 0.7455 | + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../src/main/resources/docgen/templates/neuclir22-ru-dt.template) and run `bin/build.sh` to rebuild the documentation. + diff --git a/docs/regressions-neuclir22-ru-qt.md b/docs/regressions-neuclir22-ru-qt.md new file mode 100644 index 0000000000..614cdaafc1 --- /dev/null +++ b/docs/regressions-neuclir22-ru-qt.md @@ -0,0 +1,172 @@ +# Anserini Regressions: NeuCLIR22 — Russian (Query Translation) + +This page documents BM25 regression experiments for the [TREC 2022 NeuCLIR Track](https://neuclir.github.io/), Russian, using query translation (i.e., human translations provided by the organizers). + +The exact configurations for these regressions are stored in [this YAML file](../src/main/resources/regression/neuclir22-ru-qt.yaml). +Note that this page is automatically generated from [this template](../src/main/resources/docgen/templates/neuclir22-ru-qt.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +``` +python src/main/python/run_regression.py --index --verify --search --regression neuclir22-ru-qt +``` + +## Corpus Download + +The NeuCLIR 2022 corpus can be downloaded following the instructions [here](https://neuclir.github.io/). + +With the corpus downloaded, unpack into `collections/` and run the following command to perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression neuclir22-ru-qt \ + --corpus-path collections/neuclir22-ru +``` + +## Indexing + +Typical indexing command: + +``` +target/appassembler/bin/IndexCollection \ + -collection NeuClirCollection \ + -input /path/to/neuclir22-ru \ + -index indexes/lucene-index.neuclir22-ru \ + -generator DefaultLuceneDocumentGenerator \ + -threads 8 -storePositions -storeDocvectors -storeRaw -language ru \ + >& logs/log.neuclir22-ru & +``` + +For additional details, see explanation of [common indexing options](common-indexing-options.md). 
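+
+The retrieval commands below use the human-translated Russian topics, which ship as plain TSV files read by the `TsvInt` topic reader (roughly, a numeric topic id and the query text per line). A quick peek is a cheap sanity check (editorial sketch):
+
+```bash
+# Inspect the first few translated title queries and count topics per file.
+head -3 src/main/resources/topics-and-qrels/topics.neuclir22.ru.title.tsv
+wc -l src/main/resources/topics-and-qrels/topics.neuclir22.ru.*.tsv
+```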
+ +## Retrieval + +After indexing has completed, you should be able to perform retrieval as follows: + +``` +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-ru \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.ru.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-ru.bm25-default.topics.neuclir22.ru.title.txt \ + -bm25 -language ru & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-ru \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.ru.desc.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-ru.bm25-default.topics.neuclir22.ru.desc.txt \ + -bm25 -language ru & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-ru \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.ru.desc.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-ru.bm25-default.topics.neuclir22.ru.desc.title.txt \ + -bm25 -language ru & + +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-ru \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.ru.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-ru.bm25-default+rm3.topics.neuclir22.ru.title.txt \ + -bm25 -rm3 -language ru & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-ru \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.ru.desc.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-ru.bm25-default+rm3.topics.neuclir22.ru.desc.txt \ + -bm25 -rm3 -language ru & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-ru \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.ru.desc.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-ru.bm25-default+rm3.topics.neuclir22.ru.desc.title.txt \ + -bm25 -rm3 -language ru & + +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-ru \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.ru.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-ru.bm25-default+rocchio.topics.neuclir22.ru.title.txt \ + -bm25 -rocchio -language ru & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-ru \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.ru.desc.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-ru.bm25-default+rocchio.topics.neuclir22.ru.desc.txt \ + -bm25 -rocchio -language ru & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-ru \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.ru.desc.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-ru.bm25-default+rocchio.topics.neuclir22.ru.desc.title.txt \ + -bm25 -rocchio -language ru & +``` + +Evaluation can be performed using `trec_eval`: + +``` +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default.topics.neuclir22.ru.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default.topics.neuclir22.ru.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default.topics.neuclir22.ru.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt 
runs/run.neuclir22-ru.bm25-default.topics.neuclir22.ru.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default.topics.neuclir22.ru.desc.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default.topics.neuclir22.ru.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default.topics.neuclir22.ru.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default.topics.neuclir22.ru.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default.topics.neuclir22.ru.desc.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default.topics.neuclir22.ru.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default.topics.neuclir22.ru.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default.topics.neuclir22.ru.desc.title.txt + +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rm3.topics.neuclir22.ru.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rm3.topics.neuclir22.ru.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rm3.topics.neuclir22.ru.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rm3.topics.neuclir22.ru.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rm3.topics.neuclir22.ru.desc.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rm3.topics.neuclir22.ru.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rm3.topics.neuclir22.ru.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rm3.topics.neuclir22.ru.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rm3.topics.neuclir22.ru.desc.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rm3.topics.neuclir22.ru.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rm3.topics.neuclir22.ru.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map 
src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rm3.topics.neuclir22.ru.desc.title.txt + +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rocchio.topics.neuclir22.ru.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rocchio.topics.neuclir22.ru.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rocchio.topics.neuclir22.ru.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rocchio.topics.neuclir22.ru.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rocchio.topics.neuclir22.ru.desc.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rocchio.topics.neuclir22.ru.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rocchio.topics.neuclir22.ru.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rocchio.topics.neuclir22.ru.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rocchio.topics.neuclir22.ru.desc.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rocchio.topics.neuclir22.ru.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rocchio.topics.neuclir22.ru.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-ru.txt runs/run.neuclir22-ru.bm25-default+rocchio.topics.neuclir22.ru.desc.title.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **MAP** | **BM25 (default)**| **+RM3** | **+Rocchio**| +|:-------------------------------------------------------------------------------------------------------------|-----------|-----------|-----------| +| [NeuCLIR 2022 (Russian): title](https://neuclir.github.io/) | 0.2726 | 0.2255 | 0.3118 | +| [NeuCLIR 2022 (Russian): desc](https://neuclir.github.io/) | 0.2176 | 0.1362 | 0.3012 | +| [NeuCLIR 2022 (Russian): desc+title](https://neuclir.github.io/) | 0.2800 | 0.1856 | 0.3316 | +| **nDCG@20** | **BM25 (default)**| **+RM3** | **+Rocchio**| +| [NeuCLIR 2022 (Russian): title](https://neuclir.github.io/) | 0.4210 | 0.3735 | 0.4381 | +| [NeuCLIR 2022 (Russian): desc](https://neuclir.github.io/) | 0.3808 | 0.2677 | 0.4224 | +| [NeuCLIR 2022 (Russian): desc+title](https://neuclir.github.io/) | 0.4296 | 0.3335 | 0.4573 | +| **J@20** | **BM25 (default)**| **+RM3** | **+Rocchio**| +| [NeuCLIR 2022 (Russian): title](https://neuclir.github.io/) | 0.3732 | 0.3246 | 0.3873 | +| [NeuCLIR 2022 (Russian): desc](https://neuclir.github.io/) | 0.3522 | 0.2592 | 0.3803 | +| 
[NeuCLIR 2022 (Russian): desc+title](https://neuclir.github.io/) | 0.3943 | 0.2917 | 0.3947 | +| **Recall@1000** | **BM25 (default)**| **+RM3** | **+Rocchio**| +| [NeuCLIR 2022 (Russian): title](https://neuclir.github.io/) | 0.6686 | 0.5962 | 0.7044 | +| [NeuCLIR 2022 (Russian): desc](https://neuclir.github.io/) | 0.6077 | 0.4546 | 0.7047 | +| [NeuCLIR 2022 (Russian): desc+title](https://neuclir.github.io/) | 0.7055 | 0.5365 | 0.7453 | + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../src/main/resources/docgen/templates/neuclir22-ru-qt.template) and run `bin/build.sh` to rebuild the documentation. + diff --git a/docs/regressions-neuclir22-zh-dt.md b/docs/regressions-neuclir22-zh-dt.md new file mode 100644 index 0000000000..6556827f97 --- /dev/null +++ b/docs/regressions-neuclir22-zh-dt.md @@ -0,0 +1,172 @@ +# Anserini Regressions: NeuCLIR22 — Chinese (Document Translation) + +This page documents BM25 regression experiments for the [TREC 2022 NeuCLIR Track](https://neuclir.github.io/), Chinese, using document translation (i.e., corpus translation provided by the organizers). + +The exact configurations for these regressions are stored in [this YAML file](../src/main/resources/regression/neuclir22-zh-dt.yaml). +Note that this page is automatically generated from [this template](../src/main/resources/docgen/templates/neuclir22-zh-dt.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +``` +python src/main/python/run_regression.py --index --verify --search --regression neuclir22-zh-dt +``` + +## Corpus Download + +The NeuCLIR 2022 corpus can be downloaded following the instructions [here](https://neuclir.github.io/). + +With the corpus downloaded, unpack into `collections/` and run the following command to perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression neuclir22-zh-dt \ + --corpus-path collections/neuclir22-zh-en +``` + +## Indexing + +Typical indexing command: + +``` +target/appassembler/bin/IndexCollection \ + -collection NeuClirCollection \ + -input /path/to/neuclir22-zh-en \ + -index indexes/lucene-index.neuclir22-zh-en \ + -generator DefaultLuceneDocumentGenerator \ + -threads 8 -storePositions -storeDocvectors -storeRaw \ + >& logs/log.neuclir22-zh-en & +``` + +For additional details, see explanation of [common indexing options](common-indexing-options.md). 
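+
+Note that every `SearchCollection` invocation below is launched in the background with a trailing `&`. Before running the evaluation commands, make sure they have all finished; one way to do so from the same shell (editorial sketch):
+
+```bash
+# Block until all background retrieval jobs in this shell have exited,
+# then confirm the expected nine run files (3 query types x 3 ranking models) exist.
+wait
+ls -l runs/run.neuclir22-zh-en.*
+```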
+ +## Retrieval + +After indexing has completed, you should be able to perform retrieval as follows: + +``` +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-zh-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-zh-en.bm25-default.topics.neuclir22.en.title.txt \ + -bm25 & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-zh-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.desc.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-zh-en.bm25-default.topics.neuclir22.en.desc.txt \ + -bm25 & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-zh-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.desc.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-zh-en.bm25-default.topics.neuclir22.en.desc.title.txt \ + -bm25 & + +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-zh-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-zh-en.bm25-default+rm3.topics.neuclir22.en.title.txt \ + -bm25 -rm3 & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-zh-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.desc.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-zh-en.bm25-default+rm3.topics.neuclir22.en.desc.txt \ + -bm25 -rm3 & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-zh-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.desc.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-zh-en.bm25-default+rm3.topics.neuclir22.en.desc.title.txt \ + -bm25 -rm3 & + +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-zh-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-zh-en.bm25-default+rocchio.topics.neuclir22.en.title.txt \ + -bm25 -rocchio & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-zh-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.desc.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-zh-en.bm25-default+rocchio.topics.neuclir22.en.desc.txt \ + -bm25 -rocchio & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-zh-en \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.en.desc.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-zh-en.bm25-default+rocchio.topics.neuclir22.en.desc.title.txt \ + -bm25 -rocchio & +``` + +Evaluation can be performed using `trec_eval`: + +``` +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default.topics.neuclir22.en.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default.topics.neuclir22.en.title.txt 
+tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default.topics.neuclir22.en.desc.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default.topics.neuclir22.en.desc.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default.topics.neuclir22.en.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default.topics.neuclir22.en.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default.topics.neuclir22.en.desc.title.txt + +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rm3.topics.neuclir22.en.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rm3.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rm3.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rm3.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rm3.topics.neuclir22.en.desc.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rm3.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rm3.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rm3.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rm3.topics.neuclir22.en.desc.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rm3.topics.neuclir22.en.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rm3.topics.neuclir22.en.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map 
src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rm3.topics.neuclir22.en.desc.title.txt + +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rocchio.topics.neuclir22.en.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rocchio.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rocchio.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rocchio.topics.neuclir22.en.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rocchio.topics.neuclir22.en.desc.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rocchio.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rocchio.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rocchio.topics.neuclir22.en.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rocchio.topics.neuclir22.en.desc.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rocchio.topics.neuclir22.en.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rocchio.topics.neuclir22.en.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh-en.bm25-default+rocchio.topics.neuclir22.en.desc.title.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **MAP** | **BM25 (default)**| **+RM3** | **+Rocchio**| +|:-------------------------------------------------------------------------------------------------------------|-----------|-----------|-----------| +| [NeuCLIR 2022 (Chinese): title](https://neuclir.github.io/) | 0.3472 | 0.3801 | 0.3818 | +| [NeuCLIR 2022 (Chinese): desc](https://neuclir.github.io/) | 0.2499 | 0.3052 | 0.3021 | +| [NeuCLIR 2022 (Chinese): desc+title](https://neuclir.github.io/) | 0.3480 | 0.3610 | 0.3702 | +| **nDCG@20** | **BM25 (default)**| **+RM3** | **+Rocchio**| +| [NeuCLIR 2022 (Chinese): title](https://neuclir.github.io/) | 0.4774 | 0.4828 | 0.4879 | +| [NeuCLIR 2022 (Chinese): desc](https://neuclir.github.io/) | 0.3665 | 0.3974 | 0.3997 | +| [NeuCLIR 2022 (Chinese): desc+title](https://neuclir.github.io/) | 0.4725 | 0.4588 | 0.4743 | +| **J@20** | **BM25 (default)**| **+RM3** | **+Rocchio**| +| [NeuCLIR 2022 (Chinese): title](https://neuclir.github.io/) | 0.3908 | 0.4211 | 0.4197 | +| [NeuCLIR 2022 (Chinese): 
desc](https://neuclir.github.io/) | 0.3412 | 0.3978 | 0.3974 | +| [NeuCLIR 2022 (Chinese): desc+title](https://neuclir.github.io/) | 0.3899 | 0.4154 | 0.4184 | +| **Recall@1000** | **BM25 (default)**| **+RM3** | **+Rocchio**| +| [NeuCLIR 2022 (Chinese): title](https://neuclir.github.io/) | 0.7423 | 0.8143 | 0.8158 | +| [NeuCLIR 2022 (Chinese): desc](https://neuclir.github.io/) | 0.6509 | 0.7556 | 0.7477 | +| [NeuCLIR 2022 (Chinese): desc+title](https://neuclir.github.io/) | 0.7607 | 0.8113 | 0.8149 | + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../src/main/resources/docgen/templates/neuclir22-zh-dt.template) and run `bin/build.sh` to rebuild the documentation. + diff --git a/docs/regressions-neuclir22-zh-qt.md b/docs/regressions-neuclir22-zh-qt.md new file mode 100644 index 0000000000..457810bfc1 --- /dev/null +++ b/docs/regressions-neuclir22-zh-qt.md @@ -0,0 +1,172 @@ +# Anserini Regressions: NeuCLIR22 — Chinese (Query Translation) + +This page documents BM25 regression experiments for the [TREC 2022 NeuCLIR Track](https://neuclir.github.io/), Chinese, using query translation (i.e., human translations provided by the organizers). + +The exact configurations for these regressions are stored in [this YAML file](../src/main/resources/regression/neuclir22-zh-qt.yaml). +Note that this page is automatically generated from [this template](../src/main/resources/docgen/templates/neuclir22-zh-qt.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +``` +python src/main/python/run_regression.py --index --verify --search --regression neuclir22-zh-qt +``` + +## Corpus Download + +The NeuCLIR 2022 corpus can be downloaded following the instructions [here](https://neuclir.github.io/). + +With the corpus downloaded, unpack into `collections/` and run the following command to perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression neuclir22-zh-qt \ + --corpus-path collections/neuclir22-zh +``` + +## Indexing + +Typical indexing command: + +``` +target/appassembler/bin/IndexCollection \ + -collection NeuClirCollection \ + -input /path/to/neuclir22-zh \ + -index indexes/lucene-index.neuclir22-zh \ + -generator DefaultLuceneDocumentGenerator \ + -threads 8 -storePositions -storeDocvectors -storeRaw -language zh \ + >& logs/log.neuclir22-zh & +``` + +For additional details, see explanation of [common indexing options](common-indexing-options.md). 
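For the query-translation regressions, the only indexing differences are the corpus directories (no `-en` suffix, since the documents stay in the original language) and the `-language` flag, which selects the corresponding analyzer. A hedged sketch of the three parallel indexing runs, again assuming the corpora sit under `collections/multilingual/` as in the YAML configs:

```bash
# Sketch only: builds the three query-translation (native-language) indexes.
for lang in fa ru zh; do
  target/appassembler/bin/IndexCollection \
    -collection NeuClirCollection \
    -input collections/multilingual/neuclir22-${lang} \
    -index indexes/lucene-index.neuclir22-${lang} \
    -generator DefaultLuceneDocumentGenerator \
    -threads 8 -storePositions -storeDocvectors -storeRaw -language ${lang} \
    >& logs/log.neuclir22-${lang} &
done
```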
+ +## Retrieval + +After indexing has completed, you should be able to perform retrieval as follows: + +``` +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-zh \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.zh.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-zh.bm25-default.topics.neuclir22.zh.title.txt \ + -bm25 -language zh & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-zh \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.zh.desc.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-zh.bm25-default.topics.neuclir22.zh.desc.txt \ + -bm25 -language zh & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-zh \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.zh.desc.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-zh.bm25-default.topics.neuclir22.zh.desc.title.txt \ + -bm25 -language zh & + +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-zh \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.zh.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-zh.bm25-default+rm3.topics.neuclir22.zh.title.txt \ + -bm25 -rm3 -language zh & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-zh \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.zh.desc.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-zh.bm25-default+rm3.topics.neuclir22.zh.desc.txt \ + -bm25 -rm3 -language zh & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-zh \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.zh.desc.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-zh.bm25-default+rm3.topics.neuclir22.zh.desc.title.txt \ + -bm25 -rm3 -language zh & + +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-zh \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.zh.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-zh.bm25-default+rocchio.topics.neuclir22.zh.title.txt \ + -bm25 -rocchio -language zh & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-zh \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.zh.desc.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-zh.bm25-default+rocchio.topics.neuclir22.zh.desc.txt \ + -bm25 -rocchio -language zh & +target/appassembler/bin/SearchCollection \ + -index indexes/lucene-index.neuclir22-zh \ + -topics src/main/resources/topics-and-qrels/topics.neuclir22.zh.desc.title.tsv \ + -topicreader TsvInt \ + -output runs/run.neuclir22-zh.bm25-default+rocchio.topics.neuclir22.zh.desc.title.txt \ + -bm25 -rocchio -language zh & +``` + +Evaluation can be performed using `trec_eval`: + +``` +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default.topics.neuclir22.zh.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default.topics.neuclir22.zh.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default.topics.neuclir22.zh.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt 
runs/run.neuclir22-zh.bm25-default.topics.neuclir22.zh.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default.topics.neuclir22.zh.desc.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default.topics.neuclir22.zh.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default.topics.neuclir22.zh.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default.topics.neuclir22.zh.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default.topics.neuclir22.zh.desc.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default.topics.neuclir22.zh.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default.topics.neuclir22.zh.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default.topics.neuclir22.zh.desc.title.txt + +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rm3.topics.neuclir22.zh.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rm3.topics.neuclir22.zh.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rm3.topics.neuclir22.zh.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rm3.topics.neuclir22.zh.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rm3.topics.neuclir22.zh.desc.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rm3.topics.neuclir22.zh.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rm3.topics.neuclir22.zh.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rm3.topics.neuclir22.zh.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rm3.topics.neuclir22.zh.desc.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rm3.topics.neuclir22.zh.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rm3.topics.neuclir22.zh.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map 
src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rm3.topics.neuclir22.zh.desc.title.txt + +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rocchio.topics.neuclir22.zh.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rocchio.topics.neuclir22.zh.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rocchio.topics.neuclir22.zh.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rocchio.topics.neuclir22.zh.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rocchio.topics.neuclir22.zh.desc.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rocchio.topics.neuclir22.zh.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rocchio.topics.neuclir22.zh.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rocchio.topics.neuclir22.zh.desc.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rocchio.topics.neuclir22.zh.desc.title.txt +python -m pyserini.eval.trec_eval -c -m judged.20 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rocchio.topics.neuclir22.zh.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rocchio.topics.neuclir22.zh.desc.title.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt runs/run.neuclir22-zh.bm25-default+rocchio.topics.neuclir22.zh.desc.title.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **MAP** | **BM25 (default)**| **+RM3** | **+Rocchio**| +|:-------------------------------------------------------------------------------------------------------------|-----------|-----------|-----------| +| [NeuCLIR 2022 (Chinese): title](https://neuclir.github.io/) | 0.1796 | 0.1758 | 0.2090 | +| [NeuCLIR 2022 (Chinese): desc](https://neuclir.github.io/) | 0.1515 | 0.1216 | 0.1763 | +| [NeuCLIR 2022 (Chinese): desc+title](https://neuclir.github.io/) | 0.1952 | 0.1707 | 0.2311 | +| **nDCG@20** | **BM25 (default)**| **+RM3** | **+Rocchio**| +| [NeuCLIR 2022 (Chinese): title](https://neuclir.github.io/) | 0.3203 | 0.2718 | 0.3279 | +| [NeuCLIR 2022 (Chinese): desc](https://neuclir.github.io/) | 0.2803 | 0.2112 | 0.2769 | +| [NeuCLIR 2022 (Chinese): desc+title](https://neuclir.github.io/) | 0.3343 | 0.2671 | 0.3501 | +| **J@20** | **BM25 (default)**| **+RM3** | **+Rocchio**| +| [NeuCLIR 2022 (Chinese): title](https://neuclir.github.io/) | 0.3895 | 0.3132 | 0.4004 | +| [NeuCLIR 2022 (Chinese): desc](https://neuclir.github.io/) | 0.3588 | 0.2461 | 0.3689 | +| 
[NeuCLIR 2022 (Chinese): desc+title](https://neuclir.github.io/) | 0.4298 | 0.2925 | 0.4298 | +| **Recall@1000** | **BM25 (default)**| **+RM3** | **+Rocchio**| +| [NeuCLIR 2022 (Chinese): title](https://neuclir.github.io/) | 0.4527 | 0.4334 | 0.5121 | +| [NeuCLIR 2022 (Chinese): desc](https://neuclir.github.io/) | 0.4377 | 0.3755 | 0.4989 | +| [NeuCLIR 2022 (Chinese): desc+title](https://neuclir.github.io/) | 0.4743 | 0.4174 | 0.5201 | + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../src/main/resources/docgen/templates/neuclir22-zh-qt.template) and run `bin/build.sh` to rebuild the documentation. + diff --git a/src/main/java/io/anserini/search/topicreader/Topics.java b/src/main/java/io/anserini/search/topicreader/Topics.java index feaf5816ae..9778d6ec3b 100755 --- a/src/main/java/io/anserini/search/topicreader/Topics.java +++ b/src/main/java/io/anserini/search/topicreader/Topics.java @@ -308,6 +308,20 @@ public enum Topics { HC4_V1_0_ZH_EN_TEST_DESC(TsvIntTopicReader.class, "topics-and-qrels/topics.hc4-v1.0-zh.en.test.desc.tsv"), HC4_V1_0_ZH_EN_TEST_DESC_TITLE(TsvIntTopicReader.class, "topics-and-qrels/topics.hc4-v1.0-zh.en.test.desc.title.tsv"), + // TREC NeuCLIR 2022 Topics + NEUCLIR22_EN_TITLE(TsvIntTopicReader.class, "topics-and-qrels/topics.neuclir22.en.title.tsv"), + NEUCLIR22_EN_DESC(TsvIntTopicReader.class, "topics-and-qrels/topics.neuclir22.en.desc.tsv"), + NEUCLIR22_EN_DESC_TITLE(TsvIntTopicReader.class, "topics-and-qrels/topics.neuclir22.en.desc.title.tsv"), + NEUCLIR22_FA_TITLE(TsvIntTopicReader.class, "topics-and-qrels/topics.neuclir22.fa.title.tsv"), + NEUCLIR22_FA_DESC(TsvIntTopicReader.class, "topics-and-qrels/topics.neuclir22.fa.desc.tsv"), + NEUCLIR22_FA_DESC_TITLE(TsvIntTopicReader.class, "topics-and-qrels/topics.neuclir22.fa.desc.title.tsv"), + NEUCLIR22_RU_TITLE(TsvIntTopicReader.class, "topics-and-qrels/topics.neuclir22.ru.title.tsv"), + NEUCLIR22_RU_DESC(TsvIntTopicReader.class, "topics-and-qrels/topics.neuclir22.ru.desc.tsv"), + NEUCLIR22_RU_DESC_TITLE(TsvIntTopicReader.class, "topics-and-qrels/topics.neuclir22.ru.desc.title.tsv"), + NEUCLIR22_ZH_TITLE(TsvIntTopicReader.class, "topics-and-qrels/topics.neuclir22.zh.title.tsv"), + NEUCLIR22_ZH_DESC(TsvIntTopicReader.class, "topics-and-qrels/topics.neuclir22.zh.desc.tsv"), + NEUCLIR22_ZH_DESC_TITLE(TsvIntTopicReader.class, "topics-and-qrels/topics.neuclir22.zh.desc.title.tsv"), + // MIRACL (v1.0.0): original queries MIRACL_V10_AR_DEV(TsvIntTopicReader.class, "topics-and-qrels/topics.miracl-v1.0-ar-dev.tsv"), MIRACL_V10_BN_DEV(TsvIntTopicReader.class, "topics-and-qrels/topics.miracl-v1.0-bn-dev.tsv"), diff --git a/src/main/resources/docgen/templates/neuclir22-fa-dt.template b/src/main/resources/docgen/templates/neuclir22-fa-dt.template new file mode 100644 index 0000000000..75e9198d7a --- /dev/null +++ b/src/main/resources/docgen/templates/neuclir22-fa-dt.template @@ -0,0 +1,58 @@ +# Anserini Regressions: NeuCLIR22 — Persian (Document Translation) + +This page documents BM25 regression experiments for the [TREC 2022 NeuCLIR Track](https://neuclir.github.io/), Persian, using document translation (i.e., corpus translation provided by the organizers). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead. 
+ +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +``` +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +## Corpus Download + +The NeuCLIR 2022 corpus can be downloaded following the instructions [here](https://neuclir.github.io/). + +With the corpus downloaded, unpack into `collections/` and run the following command to perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Typical indexing command: + +``` +${index_cmds} +``` + +For additional details, see explanation of [common indexing options](common-indexing-options.md). + +## Retrieval + +After indexing has completed, you should be able to perform retrieval as follows: + +``` +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +``` +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. + diff --git a/src/main/resources/docgen/templates/neuclir22-fa-qt.template b/src/main/resources/docgen/templates/neuclir22-fa-qt.template new file mode 100644 index 0000000000..e1d11b7af2 --- /dev/null +++ b/src/main/resources/docgen/templates/neuclir22-fa-qt.template @@ -0,0 +1,58 @@ +# Anserini Regressions: NeuCLIR22 — Persian (Query Translation) + +This page documents BM25 regression experiments for the [TREC 2022 NeuCLIR Track](https://neuclir.github.io/), Persian, using query translation (i.e., human translations provided by the organizers). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +``` +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +## Corpus Download + +The NeuCLIR 2022 corpus can be downloaded following the instructions [here](https://neuclir.github.io/). + +With the corpus downloaded, unpack into `collections/` and run the following command to perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Typical indexing command: + +``` +${index_cmds} +``` + +For additional details, see explanation of [common indexing options](common-indexing-options.md). + +## Retrieval + +After indexing has completed, you should be able to perform retrieval as follows: + +``` +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +``` +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
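Each of the six templates added here is filled in by Anserini's docgen step: `${yaml}`, `${template}`, `${test_name}`, `${corpus}`, `${index_cmds}`, `${ranking_cmds}`, `${eval_cmds}`, and `${effectiveness}` are replaced with values derived from the matching regression YAML, and the committed pages are regenerated with `bin/build.sh`, as noted in each Reproduction Log section. Purely as an illustration of the substitution idea (the real expansion is performed by the docgen tooling, not by this command), a manual stand-in for two of the scalar placeholders might look like:

```bash
# Hypothetical illustration only: the actual generated page is produced by
# Anserini's docgen pipeline, not by sed.
sed -e 's|${test_name}|neuclir22-fa-qt|g' \
    -e 's|${corpus}|neuclir22-fa|g' \
    src/main/resources/docgen/templates/neuclir22-fa-qt.template > /tmp/neuclir22-fa-qt.preview.md
```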
+ diff --git a/src/main/resources/docgen/templates/neuclir22-ru-dt.template b/src/main/resources/docgen/templates/neuclir22-ru-dt.template new file mode 100644 index 0000000000..6aa152ae1b --- /dev/null +++ b/src/main/resources/docgen/templates/neuclir22-ru-dt.template @@ -0,0 +1,58 @@ +# Anserini Regressions: NeuCLIR22 — Russian (Document Translation) + +This page documents BM25 regression experiments for the [TREC 2022 NeuCLIR Track](https://neuclir.github.io/), Russian, using document translation (i.e., corpus translation provided by the organizers). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +``` +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +## Corpus Download + +The NeuCLIR 2022 corpus can be downloaded following the instructions [here](https://neuclir.github.io/). + +With the corpus downloaded, unpack into `collections/` and run the following command to perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Typical indexing command: + +``` +${index_cmds} +``` + +For additional details, see explanation of [common indexing options](common-indexing-options.md). + +## Retrieval + +After indexing has completed, you should be able to perform retrieval as follows: + +``` +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +``` +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. + diff --git a/src/main/resources/docgen/templates/neuclir22-ru-qt.template b/src/main/resources/docgen/templates/neuclir22-ru-qt.template new file mode 100644 index 0000000000..9b1a9139d1 --- /dev/null +++ b/src/main/resources/docgen/templates/neuclir22-ru-qt.template @@ -0,0 +1,58 @@ +# Anserini Regressions: NeuCLIR22 — Russian (Query Translation) + +This page documents BM25 regression experiments for the [TREC 2022 NeuCLIR Track](https://neuclir.github.io/), Russian, using query translation (i.e., human translations provided by the organizers). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +``` +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +## Corpus Download + +The NeuCLIR 2022 corpus can be downloaded following the instructions [here](https://neuclir.github.io/). 
+ +With the corpus downloaded, unpack into `collections/` and run the following command to perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Typical indexing command: + +``` +${index_cmds} +``` + +For additional details, see explanation of [common indexing options](common-indexing-options.md). + +## Retrieval + +After indexing has completed, you should be able to perform retrieval as follows: + +``` +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +``` +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. + diff --git a/src/main/resources/docgen/templates/neuclir22-zh-dt.template b/src/main/resources/docgen/templates/neuclir22-zh-dt.template new file mode 100644 index 0000000000..a6f76fcd06 --- /dev/null +++ b/src/main/resources/docgen/templates/neuclir22-zh-dt.template @@ -0,0 +1,58 @@ +# Anserini Regressions: NeuCLIR22 — Chinese (Document Translation) + +This page documents BM25 regression experiments for the [TREC 2022 NeuCLIR Track](https://neuclir.github.io/), Chinese, using document translation (i.e., corpus translation provided by the organizers). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +``` +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +## Corpus Download + +The NeuCLIR 2022 corpus can be downloaded following the instructions [here](https://neuclir.github.io/). + +With the corpus downloaded, unpack into `collections/` and run the following command to perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Typical indexing command: + +``` +${index_cmds} +``` + +For additional details, see explanation of [common indexing options](common-indexing-options.md). + +## Retrieval + +After indexing has completed, you should be able to perform retrieval as follows: + +``` +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +``` +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
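The `${eval_cmds}` block expands, in each generated page, into four metric invocations per run file (MAP, nDCG@20, J@20, Recall@1000), which is why the evaluation listings above are long. A compact, hedged equivalent for the Chinese document-translation runs, assuming the nine run files from that page's Retrieval section are already present under `runs/`:

```bash
# Sketch only: loops over the nine zh-dt run files and reports the four
# metrics used by these regressions.
qrels=src/main/resources/topics-and-qrels/qrels.neuclir22-zh.txt
for run in runs/run.neuclir22-zh-en.bm25-default*.topics.neuclir22.en.*.txt; do
  echo "== ${run}"
  tools/eval/trec_eval.9.0.4/trec_eval -c -m map ${qrels} ${run}
  tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.20 ${qrels} ${run}
  python -m pyserini.eval.trec_eval -c -m judged.20 ${qrels} ${run}
  tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 ${qrels} ${run}
done
```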
+ diff --git a/src/main/resources/docgen/templates/neuclir22-zh-qt.template b/src/main/resources/docgen/templates/neuclir22-zh-qt.template new file mode 100644 index 0000000000..8ce661dc78 --- /dev/null +++ b/src/main/resources/docgen/templates/neuclir22-zh-qt.template @@ -0,0 +1,58 @@ +# Anserini Regressions: NeuCLIR22 — Chinese (Query Translation) + +This page documents BM25 regression experiments for the [TREC 2022 NeuCLIR Track](https://neuclir.github.io/), Chinese, using query translation (i.e., human translations provided by the organizers). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +``` +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +## Corpus Download + +The NeuCLIR 2022 corpus can be downloaded following the instructions [here](https://neuclir.github.io/). + +With the corpus downloaded, unpack into `collections/` and run the following command to perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Typical indexing command: + +``` +${index_cmds} +``` + +For additional details, see explanation of [common indexing options](common-indexing-options.md). + +## Retrieval + +After indexing has completed, you should be able to perform retrieval as follows: + +``` +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +``` +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
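The YAML files that follow bind each regression name to its corpus, index options, topics, qrels, and expected scores; `run_regression.py` consumes them end to end. As a convenience sketch (each regression builds a full index, so in practice they are usually launched individually), the six new NeuCLIR22 regressions could be exercised as follows:

```bash
# Sketch only: runs all six NeuCLIR22 regressions added in this change,
# using the same invocation shown in each generated page.
for reg in neuclir22-fa-qt neuclir22-fa-dt \
           neuclir22-ru-qt neuclir22-ru-dt \
           neuclir22-zh-qt neuclir22-zh-dt; do
  python src/main/python/run_regression.py --index --verify --search --regression ${reg}
done
```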
+ diff --git a/src/main/resources/regression/neuclir22-fa-dt.yaml b/src/main/resources/regression/neuclir22-fa-dt.yaml new file mode 100644 index 0000000000..76c6fc9b3b --- /dev/null +++ b/src/main/resources/regression/neuclir22-fa-dt.yaml @@ -0,0 +1,121 @@ +--- +corpus: neuclir22-fa-en +corpus_path: collections/multilingual/neuclir22-fa-en/ + +index_path: indexes/lucene-index.neuclir22-fa-en +collection_class: NeuClirCollection +generator_class: DefaultLuceneDocumentGenerator +index_threads: 8 +index_options: -storePositions -storeDocvectors -storeRaw +index_stats: + documents: 2232016 + documents (non-empty): 2232013 + +metrics: + - metric: MAP + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: true + - metric: nDCG@20 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m ndcg_cut.20 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: J@20 + command: python -m pyserini.eval.trec_eval + params: -c -m judged.20 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: Recall@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topic_root: src/main/resources/topics-and-qrels/ +qrels_root: src/main/resources/topics-and-qrels/ +topics: + - name: "[NeuCLIR 2022 (Persian): title](https://neuclir.github.io/)" + id: title + path: topics.neuclir22.en.title.tsv + qrel: qrels.neuclir22-fa.txt + - name: "[NeuCLIR 2022 (Persian): desc](https://neuclir.github.io/)" + id: desc + path: topics.neuclir22.en.desc.tsv + qrel: qrels.neuclir22-fa.txt + - name: "[NeuCLIR 2022 (Persian): desc+title](https://neuclir.github.io/)" + id: desc_title + path: topics.neuclir22.en.desc.title.tsv + qrel: qrels.neuclir22-fa.txt + +models: + - name: bm25-default + display: BM25 (default) + params: -bm25 + results: + MAP: + - 0.2617 + - 0.1690 + - 0.2566 + nDCG@20: + - 0.4420 + - 0.3351 + - 0.4376 + J@20: + - 0.3680 + - 0.3048 + - 0.3706 + Recall@1000: + - 0.6817 + - 0.5793 + - 0.6841 + - name: bm25-default+rm3 + display: +RM3 + params: -bm25 -rm3 + results: + MAP: + - 0.2980 + - 0.2435 + - 0.2969 + nDCG@20: + - 0.4320 + - 0.3754 + - 0.4333 + J@20: + - 0.3974 + - 0.3614 + - 0.3939 + Recall@1000: + - 0.7495 + - 0.7234 + - 0.7795 + - name: bm25-default+rocchio + display: +Rocchio + params: -bm25 -rocchio + results: + MAP: + - 0.3028 + - 0.2444 + - 0.3000 + nDCG@20: + - 0.4358 + - 0.3770 + - 0.4437 + J@20: + - 0.3961 + - 0.3561 + - 0.3930 + Recall@1000: + - 0.7631 + - 0.7149 + - 0.7815 diff --git a/src/main/resources/regression/neuclir22-fa-qt.yaml b/src/main/resources/regression/neuclir22-fa-qt.yaml new file mode 100644 index 0000000000..fb8f9c6f24 --- /dev/null +++ b/src/main/resources/regression/neuclir22-fa-qt.yaml @@ -0,0 +1,121 @@ +--- +corpus: neuclir22-fa +corpus_path: collections/multilingual/neuclir22-fa/ + +index_path: indexes/lucene-index.neuclir22-fa +collection_class: NeuClirCollection +generator_class: DefaultLuceneDocumentGenerator +index_threads: 8 +index_options: -storePositions -storeDocvectors -storeRaw -language fa +index_stats: + documents: 2232016 + documents (non-empty): 2232016 + +metrics: + - metric: MAP + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: true + - metric: nDCG@20 + command: 
tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m ndcg_cut.20 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: J@20 + command: python -m pyserini.eval.trec_eval + params: -c -m judged.20 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: Recall@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topic_root: src/main/resources/topics-and-qrels/ +qrels_root: src/main/resources/topics-and-qrels/ +topics: + - name: "[NeuCLIR 2022 (Persian): title](https://neuclir.github.io/)" + id: title + path: topics.neuclir22.fa.title.tsv + qrel: qrels.neuclir22-fa.txt + - name: "[NeuCLIR 2022 (Persian): desc](https://neuclir.github.io/)" + id: desc + path: topics.neuclir22.fa.desc.tsv + qrel: qrels.neuclir22-fa.txt + - name: "[NeuCLIR 2022 (Persian): desc+title](https://neuclir.github.io/)" + id: desc_title + path: topics.neuclir22.fa.desc.title.tsv + qrel: qrels.neuclir22-fa.txt + +models: + - name: bm25-default + display: BM25 (default) + params: -bm25 -language fa + results: + MAP: + - 0.2554 + - 0.2245 + - 0.2730 + nDCG@20: + - 0.4273 + - 0.3546 + - 0.4301 + J@20: + - 0.3759 + - 0.3592 + - 0.4035 + Recall@1000: + - 0.6899 + - 0.6814 + - 0.7290 + - name: bm25-default+rm3 + display: +RM3 + params: -bm25 -rm3 -language fa + results: + MAP: + - 0.1956 + - 0.1366 + - 0.1760 + nDCG@20: + - 0.3492 + - 0.2679 + - 0.3274 + J@20: + - 0.3250 + - 0.2684 + - 0.3110 + Recall@1000: + - 0.6434 + - 0.5069 + - 0.6033 + - name: bm25-default+rocchio + display: +Rocchio + params: -bm25 -rocchio -language fa + results: + MAP: + - 0.2922 + - 0.2734 + - 0.3074 + nDCG@20: + - 0.4327 + - 0.3813 + - 0.4379 + J@20: + - 0.3882 + - 0.3781 + - 0.4035 + Recall@1000: + - 0.7633 + - 0.7506 + - 0.7998 diff --git a/src/main/resources/regression/neuclir22-ru-dt.yaml b/src/main/resources/regression/neuclir22-ru-dt.yaml new file mode 100644 index 0000000000..4bae6fc9bb --- /dev/null +++ b/src/main/resources/regression/neuclir22-ru-dt.yaml @@ -0,0 +1,121 @@ +--- +corpus: neuclir22-ru-en +corpus_path: collections/multilingual/neuclir22-ru-en/ + +index_path: indexes/lucene-index.neuclir22-ru-en +collection_class: NeuClirCollection +generator_class: DefaultLuceneDocumentGenerator +index_threads: 8 +index_options: -storePositions -storeDocvectors -storeRaw +index_stats: + documents: 4627541 + documents (non-empty): 4625762 + +metrics: + - metric: MAP + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: true + - metric: nDCG@20 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m ndcg_cut.20 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: J@20 + command: python -m pyserini.eval.trec_eval + params: -c -m judged.20 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: Recall@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topic_root: src/main/resources/topics-and-qrels/ +qrels_root: src/main/resources/topics-and-qrels/ +topics: + - name: "[NeuCLIR 2022 (Russian): title](https://neuclir.github.io/)" + id: title + path: topics.neuclir22.en.title.tsv + qrel: qrels.neuclir22-ru.txt + - name: "[NeuCLIR 2022 (Russian): 
desc](https://neuclir.github.io/)" + id: desc + path: topics.neuclir22.en.desc.tsv + qrel: qrels.neuclir22-ru.txt + - name: "[NeuCLIR 2022 (Russian): desc+title](https://neuclir.github.io/)" + id: desc_title + path: topics.neuclir22.en.desc.title.tsv + qrel: qrels.neuclir22-ru.txt + +models: + - name: bm25-default + display: BM25 (default) + params: -bm25 + results: + MAP: + - 0.2492 + - 0.1489 + - 0.2307 + nDCG@20: + - 0.4123 + - 0.2755 + - 0.3789 + J@20: + - 0.3702 + - 0.2772 + - 0.3618 + Recall@1000: + - 0.6942 + - 0.5448 + - 0.6921 + - name: bm25-default+rm3 + display: +RM3 + params: -bm25 -rm3 + results: + MAP: + - 0.3005 + - 0.2408 + - 0.2982 + nDCG@20: + - 0.4312 + - 0.3524 + - 0.4092 + J@20: + - 0.3882 + - 0.3417 + - 0.3737 + Recall@1000: + - 0.7233 + - 0.6601 + - 0.7252 + - name: bm25-default+rocchio + display: +Rocchio + params: -bm25 -rocchio + results: + MAP: + - 0.2979 + - 0.2291 + - 0.2919 + nDCG@20: + - 0.4309 + - 0.3511 + - 0.4166 + J@20: + - 0.3868 + - 0.3382 + - 0.3811 + Recall@1000: + - 0.7542 + - 0.6622 + - 0.7455 diff --git a/src/main/resources/regression/neuclir22-ru-qt.yaml b/src/main/resources/regression/neuclir22-ru-qt.yaml new file mode 100644 index 0000000000..7694876aa7 --- /dev/null +++ b/src/main/resources/regression/neuclir22-ru-qt.yaml @@ -0,0 +1,121 @@ +--- +corpus: neuclir22-ru +corpus_path: collections/multilingual/neuclir22-ru/ + +index_path: indexes/lucene-index.neuclir22-ru +collection_class: NeuClirCollection +generator_class: DefaultLuceneDocumentGenerator +index_threads: 8 +index_options: -storePositions -storeDocvectors -storeRaw -language ru +index_stats: + documents: 4627541 + documents (non-empty): 4627532 + +metrics: + - metric: MAP + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: true + - metric: nDCG@20 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m ndcg_cut.20 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: J@20 + command: python -m pyserini.eval.trec_eval + params: -c -m judged.20 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: Recall@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topic_root: src/main/resources/topics-and-qrels/ +qrels_root: src/main/resources/topics-and-qrels/ +topics: + - name: "[NeuCLIR 2022 (Russian): title](https://neuclir.github.io/)" + id: title + path: topics.neuclir22.ru.title.tsv + qrel: qrels.neuclir22-ru.txt + - name: "[NeuCLIR 2022 (Russian): desc](https://neuclir.github.io/)" + id: desc + path: topics.neuclir22.ru.desc.tsv + qrel: qrels.neuclir22-ru.txt + - name: "[NeuCLIR 2022 (Russian): desc+title](https://neuclir.github.io/)" + id: desc_title + path: topics.neuclir22.ru.desc.title.tsv + qrel: qrels.neuclir22-ru.txt + +models: + - name: bm25-default + display: BM25 (default) + params: -bm25 -language ru + results: + MAP: + - 0.2726 + - 0.2176 + - 0.2800 + nDCG@20: + - 0.4210 + - 0.3808 + - 0.4296 + J@20: + - 0.3732 + - 0.3522 + - 0.3943 + Recall@1000: + - 0.6686 + - 0.6077 + - 0.7055 + - name: bm25-default+rm3 + display: +RM3 + params: -bm25 -rm3 -language ru + results: + MAP: + - 0.2255 + - 0.1362 + - 0.1856 + nDCG@20: + - 0.3735 + - 0.2677 + - 0.3335 + J@20: + - 0.3246 + - 0.2592 + - 0.2917 + Recall@1000: + - 0.5962 + - 0.4546 + - 0.5365 + - name: bm25-default+rocchio + display: 
+Rocchio + params: -bm25 -rocchio -language ru + results: + MAP: + - 0.3118 + - 0.3012 + - 0.3316 + nDCG@20: + - 0.4381 + - 0.4224 + - 0.4573 + J@20: + - 0.3873 + - 0.3803 + - 0.3947 + Recall@1000: + - 0.7044 + - 0.7047 + - 0.7453 diff --git a/src/main/resources/regression/neuclir22-zh-dt.yaml b/src/main/resources/regression/neuclir22-zh-dt.yaml new file mode 100644 index 0000000000..bf463336d7 --- /dev/null +++ b/src/main/resources/regression/neuclir22-zh-dt.yaml @@ -0,0 +1,121 @@ +--- +corpus: neuclir22-zh-en +corpus_path: collections/multilingual/neuclir22-zh-en/ + +index_path: indexes/lucene-index.neuclir22-zh-en +collection_class: NeuClirCollection +generator_class: DefaultLuceneDocumentGenerator +index_threads: 8 +index_options: -storePositions -storeDocvectors -storeRaw +index_stats: + documents: 3179206 + documents (non-empty): 3179203 + +metrics: + - metric: MAP + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: true + - metric: nDCG@20 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m ndcg_cut.20 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: J@20 + command: python -m pyserini.eval.trec_eval + params: -c -m judged.20 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: Recall@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topic_root: src/main/resources/topics-and-qrels/ +qrels_root: src/main/resources/topics-and-qrels/ +topics: + - name: "[NeuCLIR 2022 (Chinese): title](https://neuclir.github.io/)" + id: title + path: topics.neuclir22.en.title.tsv + qrel: qrels.neuclir22-zh.txt + - name: "[NeuCLIR 2022 (Chinese): desc](https://neuclir.github.io/)" + id: desc + path: topics.neuclir22.en.desc.tsv + qrel: qrels.neuclir22-zh.txt + - name: "[NeuCLIR 2022 (Chinese): desc+title](https://neuclir.github.io/)" + id: desc_title + path: topics.neuclir22.en.desc.title.tsv + qrel: qrels.neuclir22-zh.txt + +models: + - name: bm25-default + display: BM25 (default) + params: -bm25 + results: + MAP: + - 0.3472 + - 0.2499 + - 0.3480 + nDCG@20: + - 0.4774 + - 0.3665 + - 0.4725 + J@20: + - 0.3908 + - 0.3412 + - 0.3899 + Recall@1000: + - 0.7423 + - 0.6509 + - 0.7607 + - name: bm25-default+rm3 + display: +RM3 + params: -bm25 -rm3 + results: + MAP: + - 0.3801 + - 0.3052 + - 0.3610 + nDCG@20: + - 0.4828 + - 0.3974 + - 0.4588 + J@20: + - 0.4211 + - 0.3978 + - 0.4154 + Recall@1000: + - 0.8143 + - 0.7556 + - 0.8113 + - name: bm25-default+rocchio + display: +Rocchio + params: -bm25 -rocchio + results: + MAP: + - 0.3818 + - 0.3021 + - 0.3702 + nDCG@20: + - 0.4879 + - 0.3997 + - 0.4743 + J@20: + - 0.4197 + - 0.3974 + - 0.4184 + Recall@1000: + - 0.8158 + - 0.7477 + - 0.8149 diff --git a/src/main/resources/regression/neuclir22-zh-qt.yaml b/src/main/resources/regression/neuclir22-zh-qt.yaml new file mode 100644 index 0000000000..350ea1e1b4 --- /dev/null +++ b/src/main/resources/regression/neuclir22-zh-qt.yaml @@ -0,0 +1,121 @@ +--- +corpus: neuclir22-zh +corpus_path: collections/multilingual/neuclir22-zh/ + +index_path: indexes/lucene-index.neuclir22-zh +collection_class: NeuClirCollection +generator_class: DefaultLuceneDocumentGenerator +index_threads: 8 +index_options: -storePositions -storeDocvectors -storeRaw -language zh +index_stats: + documents: 3179206 + documents (non-empty): 3179206 + 
+metrics: + - metric: MAP + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: true + - metric: nDCG@20 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m ndcg_cut.20 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: J@20 + command: python -m pyserini.eval.trec_eval + params: -c -m judged.20 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: Recall@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topic_root: src/main/resources/topics-and-qrels/ +qrels_root: src/main/resources/topics-and-qrels/ +topics: + - name: "[NeuCLIR 2022 (Chinese): title](https://neuclir.github.io/)" + id: title + path: topics.neuclir22.zh.title.tsv + qrel: qrels.neuclir22-zh.txt + - name: "[NeuCLIR 2022 (Chinese): desc](https://neuclir.github.io/)" + id: desc + path: topics.neuclir22.zh.desc.tsv + qrel: qrels.neuclir22-zh.txt + - name: "[NeuCLIR 2022 (Chinese): desc+title](https://neuclir.github.io/)" + id: desc_title + path: topics.neuclir22.zh.desc.title.tsv + qrel: qrels.neuclir22-zh.txt + +models: + - name: bm25-default + display: BM25 (default) + params: -bm25 -language zh + results: + MAP: + - 0.1796 + - 0.1515 + - 0.1952 + nDCG@20: + - 0.3203 + - 0.2803 + - 0.3343 + J@20: + - 0.3895 + - 0.3588 + - 0.4298 + Recall@1000: + - 0.4527 + - 0.4377 + - 0.4743 + - name: bm25-default+rm3 + display: +RM3 + params: -bm25 -rm3 -language zh + results: + MAP: + - 0.1758 + - 0.1216 + - 0.1707 + nDCG@20: + - 0.2718 + - 0.2112 + - 0.2671 + J@20: + - 0.3132 + - 0.2461 + - 0.2925 + Recall@1000: + - 0.4334 + - 0.3755 + - 0.4174 + - name: bm25-default+rocchio + display: +Rocchio + params: -bm25 -rocchio -language zh + results: + MAP: + - 0.2090 + - 0.1763 + - 0.2311 + nDCG@20: + - 0.3279 + - 0.2769 + - 0.3501 + J@20: + - 0.4004 + - 0.3689 + - 0.4298 + Recall@1000: + - 0.5121 + - 0.4989 + - 0.5201 diff --git a/src/test/java/io/anserini/search/topicreader/TopicReaderTest.java b/src/test/java/io/anserini/search/topicreader/TopicReaderTest.java index 329296d4fb..382769ec66 100755 --- a/src/test/java/io/anserini/search/topicreader/TopicReaderTest.java +++ b/src/test/java/io/anserini/search/topicreader/TopicReaderTest.java @@ -37,7 +37,7 @@ public void testIterateThroughAllEnums() { String[] pathParts = topic.path.split("/"); assertEquals(topic.readerClass, TopicReader.getTopicReaderClassByFile(pathParts[1])); } - assertEquals(293, cnt); + assertEquals(305, cnt); } @Test @@ -1676,6 +1676,25 @@ public void testHC4Topics() { assertEquals(50, TopicReader.getTopics(Topics.HC4_V1_0_ZH_EN_TEST_DESC_TITLE).keySet().size()); } + @Test + public void testNeuCLIR22Topics() { + assertEquals(114, TopicReader.getTopics(Topics.NEUCLIR22_EN_TITLE).keySet().size()); + assertEquals(114, TopicReader.getTopics(Topics.NEUCLIR22_EN_DESC).keySet().size()); + assertEquals(114, TopicReader.getTopics(Topics.NEUCLIR22_EN_DESC_TITLE).keySet().size()); + + assertEquals(114, TopicReader.getTopics(Topics.NEUCLIR22_FA_TITLE).keySet().size()); + assertEquals(114, TopicReader.getTopics(Topics.NEUCLIR22_FA_DESC).keySet().size()); + assertEquals(114, TopicReader.getTopics(Topics.NEUCLIR22_FA_DESC_TITLE).keySet().size()); + + assertEquals(114, TopicReader.getTopics(Topics.NEUCLIR22_RU_TITLE).keySet().size()); + assertEquals(114, 
TopicReader.getTopics(Topics.NEUCLIR22_RU_DESC).keySet().size()); + assertEquals(114, TopicReader.getTopics(Topics.NEUCLIR22_RU_DESC_TITLE).keySet().size()); + + assertEquals(114, TopicReader.getTopics(Topics.NEUCLIR22_ZH_TITLE).keySet().size()); + assertEquals(114, TopicReader.getTopics(Topics.NEUCLIR22_ZH_DESC).keySet().size()); + assertEquals(114, TopicReader.getTopics(Topics.NEUCLIR22_ZH_DESC_TITLE).keySet().size()); + } + @Test public void testMIRACLTopics() { assertEquals(2896, TopicReader.getTopics(Topics.MIRACL_V10_AR_DEV).keySet().size());
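The enum count in `testIterateThroughAllEnums` moves from 293 to 305, matching the twelve new NeuCLIR22 entries (four languages times three query fields), and `testNeuCLIR22Topics` asserts 114 topics per file. Since these are `TsvInt` topics, a quick shell-side sanity check is possible before running the Java tests; this is only a convenience and assumes one tab-separated record per line with no header or blank lines:

```bash
# Expect 114 lines in each of the twelve NeuCLIR22 topic files registered above.
wc -l src/main/resources/topics-and-qrels/topics.neuclir22.*.tsv
```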