diff --git a/pyserini/prebuilt_index_info.py b/pyserini/prebuilt_index_info.py index 039067ad97..e7474c9ad3 100644 --- a/pyserini/prebuilt_index_info.py +++ b/pyserini/prebuilt_index_info.py @@ -30,6 +30,7 @@ "robust04": { "description": "TREC Disks 4 & 5 (minus Congressional Records), used in the TREC 2004 Robust Track", "filename": "index-robust04-20191213.tar.gz", + "readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/index-robust04-20191213-readme.txt", "urls": [ "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-robust04-20191213.tar.gz", "https://vault.cs.uwaterloo.ca/s/eqFacNeSGc4pLLH/download" @@ -43,6 +44,7 @@ "msmarco-passage": { "description": "MS MARCO passage corpus", "filename": "index-msmarco-passage-20201117-f87c94.tar.gz", + "readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/index-msmarco-passage-20201117-f87c94-readme.txt", "urls": [ "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-passage-20201117-f87c94.tar.gz", "https://vault.cs.uwaterloo.ca/s/QQsZMFG8MpF4P8M/download" @@ -57,6 +59,7 @@ "msmarco-passage-slim": { "description": "MS MARCO passage corpus (slim version, no documents)", "filename": "index-msmarco-passage-slim-20201202-ab6e28.tar.gz", + "readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/index-msmarco-passage-slim-20201202-ab6e28-readme.txt", "urls": [ "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-passage-slim-20201202-ab6e28.tar.gz", "https://vault.cs.uwaterloo.ca/s/Kx6K9NJFmwnaAP8/download" @@ -71,6 +74,7 @@ "msmarco-passage-expanded": { "description": "MS MARCO passage corpus (+ docTTTTTquery expansion)", "filename": "index-msmarco-passage-expanded-20201121-e127fb.tar.gz", + "readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/index-msmarco-passage-expanded-20201121-e127fb-readme.txt", 
"urls": [ "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-passage-expanded-20201121-e127fb.tar.gz", "https://vault.cs.uwaterloo.ca/s/pm7cisJtRxiAMHd/download" @@ -85,6 +89,7 @@ "msmarco-passage-ltr": { "description": "MS MARCO passage corpus (4 extra preprocessed fields) used for LTR pipeline", "filename": "index-msmarco-passage-ltr-20210519-e25e33f.tar.gz", + "readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/index-msmarco-passage-ltr-20210519-e25e33f-readme.txt", "urls": [ "https://vault.cs.uwaterloo.ca/s/8qFCaCtwabRfYQD/download" # too big for UWaterloo GitLab ], @@ -98,6 +103,7 @@ "msmarco-doc": { "description": "MS MARCO document corpus", "filename": "index-msmarco-doc-20201117-f87c94.tar.gz", + "readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/index-msmarco-doc-20201117-f87c94-readme.txt", "urls": [ "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-doc-20201117-f87c94.tar.gz", "https://vault.cs.uwaterloo.ca/s/5NC7A2wAL7opJKH/download" @@ -112,6 +118,7 @@ "msmarco-doc-slim": { "description": "MS MARCO document corpus (slim version, no documents)", "filename": "index-msmarco-doc-slim-20201202-ab6e28.tar.gz", + "readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/index-msmarco-doc-slim-20201202-ab6e28-readme.txt", "urls": [ "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-doc-slim-20201202-ab6e28.tar.gz", "https://vault.cs.uwaterloo.ca/s/BMZ6oYBoEPgTFqs/download" @@ -126,6 +133,7 @@ "msmarco-doc-per-passage": { "description": "MS MARCO document corpus, segmented into passages", "filename": "index-msmarco-doc-per-passage-20201204-f50dcc.tar.gz", + "readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/index-msmarco-doc-per-passage-20201204-f50dcc-readme.txt", "urls": [ 
"https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-doc-per-passage-20201204-f50dcc.tar.gz", "https://vault.cs.uwaterloo.ca/s/q6sAxE6q57q2TBo/download" @@ -140,6 +148,7 @@ "msmarco-doc-per-passage-slim": { "description": "MS MARCO document corpus, segmented into passages (slim version, no documents)", "filename": "index-msmarco-doc-per-passage-slim-20201204-f50dcc.tar.gz", + "readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/index-msmarco-doc-per-passage-slim-20201204-f50dcc-readme.txt", "urls": [ "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-doc-per-passage-slim-20201204-f50dcc.tar.gz", "https://vault.cs.uwaterloo.ca/s/mKTjbTKMwWF9kY3/download" @@ -154,6 +163,7 @@ "msmarco-doc-expanded-per-doc": { "description": "MS MARCO document corpus, with per-doc docTTTTTquery expansion", "filename": "index-msmarco-doc-expanded-per-doc-20201126-1b4d0a.tar.gz", + "readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/index-msmarco-doc-expanded-per-doc-20201126-1b4d0a-readme.txt", "urls": [ "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-doc-expanded-per-doc-20201126-1b4d0a.tar.gz", "https://vault.cs.uwaterloo.ca/s/3BQz6ZAXAxtfne8/download" @@ -168,6 +178,7 @@ "msmarco-doc-expanded-per-passage": { "description": "MS MARCO document corpus, with per-passage docTTTTTquery expansion", "filename": "index-msmarco-doc-expanded-per-passage-20201126-1b4d0a.tar.gz", + "readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/index-msmarco-doc-expanded-per-passage-20201126-1b4d0a-readme.txt", "urls": [ "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-doc-expanded-per-passage-20201126-1b4d0a.tar.gz", "https://vault.cs.uwaterloo.ca/s/eZLbPWcnB7LzKnQ/download" @@ -434,6 +445,7 @@ "wikipedia-dpr": { "description": "Wikipedia (DPR 100 word splits) Anserini index", 
"filename": "index-wikipedia-dpr-20210120-d1b9e6.tar.gz", + "readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/index-wikipedia-dpr-20210120-d1b9e6-readme.txt", "urls": [ "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-wikipedia-dpr-20210120-d1b9e6.tar.gz", "https://vault.cs.uwaterloo.ca/s/t6tDJmpoxPw9tH8/download" @@ -448,6 +460,7 @@ "wikipedia-dpr-slim": { "description": "Wikipedia (DPR 100 word splits) Anserini index, without raw texts stored", "filename": "index-wikipedia-dpr-slim-20210120-d1b9e6.tar.gz", + "readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/index-wikipedia-dpr-slim-20210120-d1b9e6-readme.txt", "urls": [ "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-wikipedia-dpr-slim-20210120-d1b9e6.tar.gz", "https://vault.cs.uwaterloo.ca/s/Gk2sfTyJCyaTrYH/download" @@ -462,6 +475,7 @@ "wikipedia-kilt-doc": { "description": "Wikipedia snapshot used as KILT's knowledge source. 
Indexed by documents.", "filename": "index-wikipedia-kilt-doc-20210421-f29307.tar.gz", + "readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/index-wikipedia-kilt-doc-20210421-f29307-readme.txt", "urls": [ "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-wikipedia-kilt-doc-20210421-f29307.tar.gz", "https://vault.cs.uwaterloo.ca/s/RqtLg3CZT38k32c/download" diff --git a/pyserini/resources/index-metadata/index-msmarco-doc-20201117-f87c94-readme.txt b/pyserini/resources/index-metadata/index-msmarco-doc-20201117-f87c94-readme.txt new file mode 100644 index 0000000000..cd7fe03745 --- /dev/null +++ b/pyserini/resources/index-metadata/index-msmarco-doc-20201117-f87c94-readme.txt @@ -0,0 +1,15 @@ +This index was generated on 2020/11/17 at commit f87c945fd1c1e4174468194c72e3c05688dc45dd Mon Nov 16 16:17:20 2020 -0500 +with the following command: + +sh target/appassembler/bin/IndexCollection -collection CleanTrecCollection \ + -generator DefaultLuceneDocumentGenerator -input collections/msmarco-doc \ + -index index-msmarco-doc-20201117-f87c94 -threads 1 -storeRaw -optimize + +Note that to reduce index size: + ++ positions are not indexed (so no phrase queries) ++ document vectors are not stored (so no query expansion) + +However, the raw documents are stored, so they can be fetched and fed to further downstream reranking components. 
+ +index-msmarco-doc-20201117-f87c94.tar.gz MD5 checksum = ac747860e7a37aed37cc30ed3990f273 diff --git a/pyserini/resources/index-metadata/index-msmarco-doc-expanded-per-doc-20201126-1b4d0a-readme.txt b/pyserini/resources/index-metadata/index-msmarco-doc-expanded-per-doc-20201126-1b4d0a-readme.txt new file mode 100644 index 0000000000..db57732f8a --- /dev/null +++ b/pyserini/resources/index-metadata/index-msmarco-doc-expanded-per-doc-20201126-1b4d0a-readme.txt @@ -0,0 +1,14 @@ +This index was generated on 2020/11/26 at + ++ docTTTTTquery commit d2704c025c2bf6db652b4b27f49c4e59714ba898 (2020/11/24). ++ anserini commit 1b4d0a29879a867ca5d1f003f924acc3279455ba (2020/11/25). + +with the following command: + +sh anserini/target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 1 \ + -input msmarco-doc-expanded -index index-msmarco-doc-expanded-per-doc-20201126-1b4d0a -optimize + +Note that this index does not store any "extras" (positions, document vectors, raw documents, etc.). + +index-msmarco-doc-expanded-per-doc-20201126-1b4d0a.tar.gz MD5 checksum = f7056191842ab77a01829cff68004782 diff --git a/pyserini/resources/index-metadata/index-msmarco-doc-expanded-per-passage-20201126-1b4d0a-readme.txt b/pyserini/resources/index-metadata/index-msmarco-doc-expanded-per-passage-20201126-1b4d0a-readme.txt new file mode 100644 index 0000000000..29362ba570 --- /dev/null +++ b/pyserini/resources/index-metadata/index-msmarco-doc-expanded-per-passage-20201126-1b4d0a-readme.txt @@ -0,0 +1,14 @@ +This index was generated on 2020/11/26 at + ++ docTTTTTquery commit d2704c025c2bf6db652b4b27f49c4e59714ba898 (2020/11/24). ++ anserini commit 1b4d0a29879a867ca5d1f003f924acc3279455ba (2020/11/25). 
+ +with the following command: + +sh anserini/target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 1 \ + -input msmarco-doc-expanded-passage -index index-msmarco-doc-expanded-per-passage-20201126-1b4d0a -optimize + +Note that this index does not store any "extras" (positions, document vectors, raw documents, etc.). + +index-msmarco-doc-expanded-per-passage-20201126-1b4d0a.tar.gz MD5 checksum = 54ea30c64515edf3c3741291b785be53 diff --git a/pyserini/resources/index-metadata/index-msmarco-doc-per-passage-20201204-f50dcc-readme.txt b/pyserini/resources/index-metadata/index-msmarco-doc-per-passage-20201204-f50dcc-readme.txt new file mode 100644 index 0000000000..6f250a5de3 --- /dev/null +++ b/pyserini/resources/index-metadata/index-msmarco-doc-per-passage-20201204-f50dcc-readme.txt @@ -0,0 +1,19 @@ +This index was generated on 2020/12/04 at + ++ docTTTTTquery commit 5be1af130b4657ea117781f761c4e5d15c77cb42 (2020/12/01). ++ anserini commit f50dcceb6cd0ec3403c1e77066aa51bb3275d24e (2020/12/04). + +with the following command: + +sh anserini/target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 1 \ + -input msmarco-doc-passage -index index-msmarco-doc-per-passage-20201204-f50dcc -storeRaw -optimize + +Note that to reduce index size: + ++ positions are not indexed (so no phrase queries) ++ document vectors are not stored (so no query expansion) + +However, the raw documents are stored, so they can be fetched and fed to further downstream reranking components. 
+ +index-msmarco-doc-per-passage-20201204-f50dcc.tar.gz MD5 checksum = 797367406a7542b649cefa6b41cf4c33 diff --git a/pyserini/resources/index-metadata/index-msmarco-doc-per-passage-slim-20201204-f50dcc-readme.txt b/pyserini/resources/index-metadata/index-msmarco-doc-per-passage-slim-20201204-f50dcc-readme.txt new file mode 100644 index 0000000000..565915c8b7 --- /dev/null +++ b/pyserini/resources/index-metadata/index-msmarco-doc-per-passage-slim-20201204-f50dcc-readme.txt @@ -0,0 +1,14 @@ +This index was generated on 2020/12/04 at + ++ docTTTTTquery commit 5be1af130b4657ea117781f761c4e5d15c77cb42 (2020/12/01). ++ anserini commit f50dcceb6cd0ec3403c1e77066aa51bb3275d24e (2020/12/04). + +with the following command: + +sh anserini/target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 1 \ + -input msmarco-doc-passage -index index-msmarco-doc-per-passage-slim-20201204-f50dcc -optimize + +This minimal index does not store any "extras" (positions, document vectors, raw documents, etc.). + +index-msmarco-doc-per-passage-slim-20201204-f50dcc.tar.gz MD5 checksum = 77c2409943a8c9faffabf57cb6adca69 diff --git a/pyserini/resources/index-metadata/index-msmarco-doc-slim-20201202-ab6e28-readme.txt b/pyserini/resources/index-metadata/index-msmarco-doc-slim-20201202-ab6e28-readme.txt new file mode 100644 index 0000000000..7e79f60ca7 --- /dev/null +++ b/pyserini/resources/index-metadata/index-msmarco-doc-slim-20201202-ab6e28-readme.txt @@ -0,0 +1,10 @@ +This index was generated on 2020/12/02 at commit ab6e280b06a7a6476d001a5eb2319c191010c0e1 (2020/12/01) +with the following command: + +sh target/appassembler/bin/IndexCollection -collection CleanTrecCollection \ + -generator DefaultLuceneDocumentGenerator -input collections/msmarco-doc \ + -index index-msmarco-doc-slim-20201202-ab6e28 -threads 1 -optimize + +This minimal index does not store any "extras" (positions, document vectors, raw documents, etc.). 
+ +index-msmarco-doc-slim-20201202-ab6e28.tar.gz MD5 checksum = c56e752f7992bf6149761097641d515a diff --git a/pyserini/resources/index-metadata/index-msmarco-passage-20201117-f87c94-readme.txt b/pyserini/resources/index-metadata/index-msmarco-passage-20201117-f87c94-readme.txt new file mode 100644 index 0000000000..a3a08f586a --- /dev/null +++ b/pyserini/resources/index-metadata/index-msmarco-passage-20201117-f87c94-readme.txt @@ -0,0 +1,15 @@ +This index was generated on 2020/11/17 at commit f87c945fd1c1e4174468194c72e3c05688dc45dd Mon Nov 16 16:17:20 2020 -0500 +with the following command: + +sh target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -input collections/msmarco-passage/collection_jsonl \ + -index index-msmarco-passage-20201117-f87c94 -threads 9 -storeRaw -optimize + +Note that to reduce index size: + ++ positions are not indexed (so no phrase queries) ++ document vectors are not stored (so no query expansion) + +However, the raw passages are stored, so they can be fetched and fed to further downstream reranking components. 
+ +index-msmarco-passage-20201117-f87c94.tar.gz MD5 checksum = 1efad4f1ae6a77e235042eff4be1612d diff --git a/pyserini/resources/index-metadata/index-msmarco-passage-expanded-20201121-e127fb-readme.txt b/pyserini/resources/index-metadata/index-msmarco-passage-expanded-20201121-e127fb-readme.txt new file mode 100644 index 0000000000..23fa654428 --- /dev/null +++ b/pyserini/resources/index-metadata/index-msmarco-passage-expanded-20201121-e127fb-readme.txt @@ -0,0 +1,14 @@ +This index was generated on 2020/11/21 at + ++ docTTTTTquery commit 701ea0a72beeb8db46aa409352a72ba52cd2c36b Tue Nov 17 07:13:27 2020 -0500 ++ anserini commit e127fbea6f5515d60eb7c325cd866657dbf13cc6 Sat Nov 21 07:58:03 2020 -0500 + +with the following command: + +sh anserini/target/appassembler/bin/IndexCollection \ + -collection JsonCollection -generator DefaultLuceneDocumentGenerator \ + -input msmarco-passage-expanded -index index-msmarco-passage-expanded-20201121-e127fb -threads 9 -optimize + +Note that this index does not store any "extras" (positions, document vectors, raw documents, etc.). 
+ +index-msmarco-passage-expanded-20201121-e127fb.tar.gz MD5 checksum = e5762e9e065b6fe5000f9c18da778565 diff --git a/pyserini/resources/index-metadata/index-msmarco-passage-ltr-20210519-e25e33f-readme.txt b/pyserini/resources/index-metadata/index-msmarco-passage-ltr-20210519-e25e33f-readme.txt new file mode 100644 index 0000000000..4a5e758a89 --- /dev/null +++ b/pyserini/resources/index-metadata/index-msmarco-passage-ltr-20210519-e25e33f-readme.txt @@ -0,0 +1,11 @@ +This index was generated on 2021/05/19 at commit e25e33f4a06e9c1ab4d795908cae4474fa019643 2021-05-17 21:48:48 -0400 +with the following command: + +sh target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -input collections/msmarco-ltr-passage/ltr_collection_jsonl \ + -index index-msmarco-passage-ltr-20210519-e25e33f -threads 9 -storeRaw -optimize -storePositions -storeDocvectors -pretokenized + +Note that the -pretokenized option is used to preserve the preprocessed tokenization. +This is built with spacy 3.0.6. + +index-msmarco-passage-ltr-20210519-e25e33f.tar.gz MD5 checksum = a5de642c268ac1ed5892c069bdc29ae3 diff --git a/pyserini/resources/index-metadata/index-msmarco-passage-slim-20201202-ab6e28-readme.txt b/pyserini/resources/index-metadata/index-msmarco-passage-slim-20201202-ab6e28-readme.txt new file mode 100644 index 0000000000..010eaab227 --- /dev/null +++ b/pyserini/resources/index-metadata/index-msmarco-passage-slim-20201202-ab6e28-readme.txt @@ -0,0 +1,10 @@ +This index was generated on 2020/12/02 at commit ab6e280b06a7a6476d001a5eb2319c191010c0e1 (2020/12/01) +with the following command: + +sh target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -input collections/msmarco-passage/collection_jsonl \ + -index index-msmarco-passage-slim-20201202-ab6e28 -threads 9 -optimize + +This minimal index does not store any "extras" (positions, document vectors, raw documents, etc.). 
+ +index-msmarco-passage-slim-20201202-ab6e28.tar.gz MD5 checksum = 5e11da4cebd2e8dda2e73c589ffb0b4c diff --git a/pyserini/resources/index-metadata/index-robust04-20191213-readme.txt b/pyserini/resources/index-metadata/index-robust04-20191213-readme.txt new file mode 100644 index 0000000000..bc45b21c72 --- /dev/null +++ b/pyserini/resources/index-metadata/index-robust04-20191213-readme.txt @@ -0,0 +1,7 @@ +This index was generated on 12/13/2019 with Anserini v0.7.0, with the following command: + +sh target/appassembler/bin/IndexCollection -collection TrecCollection \ + -input /tuna1/collections/newswire/disk45/ -index index-robust04-20191213 \ + -generator JsoupGenerator -threads 16 -storePositions -storeDocvectors -storeRawDocs -optimize + +index-robust04-20191213.tar.gz MD5 checksum = 15f3d001489c97849a010b0a4734d018 diff --git a/pyserini/resources/index-metadata/index-wikipedia-dpr-20210120-d1b9e6-readme.txt b/pyserini/resources/index-metadata/index-wikipedia-dpr-20210120-d1b9e6-readme.txt new file mode 100644 index 0000000000..e449ad1048 --- /dev/null +++ b/pyserini/resources/index-metadata/index-wikipedia-dpr-20210120-d1b9e6-readme.txt @@ -0,0 +1,18 @@ +This index was generated on 2021/01/20 at + ++ anserini commit d1b9e67928aa60fa557113ace5d209b0c58e994c (2021/01/19). + +with the following command: + +sh anserini/target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 22 \ + -input wikipedia-dpr-jsonl -index index-wikipedia-dpr-20210120-d1b9e6 -storeRaw -optimize + +Note that to reduce index size: + ++ positions are not indexed (so no phrase queries) ++ document vectors are not stored (so no query expansion) + +However, the raw documents are stored, so they can be fetched and fed to further downstream reranking components. 
+ +index-wikipedia-dpr-20210120-d1b9e6.tar.gz MD5 checksum = c28f3a56b2dfcef25bf3bf755c264d04 diff --git a/pyserini/resources/index-metadata/index-wikipedia-dpr-slim-20210120-d1b9e6-readme.txt b/pyserini/resources/index-metadata/index-wikipedia-dpr-slim-20210120-d1b9e6-readme.txt new file mode 100644 index 0000000000..9ff6af6e28 --- /dev/null +++ b/pyserini/resources/index-metadata/index-wikipedia-dpr-slim-20210120-d1b9e6-readme.txt @@ -0,0 +1,13 @@ +This index was generated on 2021/01/20 at + ++ anserini commit d1b9e67928aa60fa557113ace5d209b0c58e994c (2021/01/19). + +with the following command: + +sh anserini/target/appassembler/bin/IndexCollection -collection JsonCollection \ + -generator DefaultLuceneDocumentGenerator -threads 22 \ + -input wikipedia-dpr-jsonl -index index-wikipedia-dpr-slim-20210120-d1b9e6 -optimize + +This minimal index does not store any "extras" (positions, document vectors, raw documents, etc.). + +index-wikipedia-dpr-slim-20210120-d1b9e6.tar.gz MD5 checksum = 7d40604a824b5df37a1ae9d25ea38071 diff --git a/pyserini/resources/index-metadata/index-wikipedia-kilt-doc-20210421-f29307-readme.txt b/pyserini/resources/index-metadata/index-wikipedia-kilt-doc-20210421-f29307-readme.txt new file mode 100644 index 0000000000..8449100a55 --- /dev/null +++ b/pyserini/resources/index-metadata/index-wikipedia-kilt-doc-20210421-f29307-readme.txt @@ -0,0 +1,18 @@ +This index was generated on 2021/04/22 at + ++ anserini commit f29307a9fb162ec7bef4919a164929a673d2304e (2021/04/21). 
+ +with the following command: + +python -m pyserini.index -collection JsonCollection -generator DefaultLuceneDocumentGenerator \ + -threads 40 -input collections/wikipedia-kilt-doc \ + -index indexes/index-wikipedia-kilt-doc-20210421-f29307 -storeRaw -optimize + +Note that to reduce index size: + ++ positions are not indexed (so no phrase queries) ++ document vectors are not stored (so no query expansion) + +However, the raw documents are stored, so they can be fetched and fed to further downstream reranking components. + +index-wikipedia-kilt-doc-20210421-f29307.tar.gz MD5 checksum = b8ec8feb654f7aaa86f9901dc6c804a8