From 3e0acc90445350fb240af7d1cef5938084e2abdf Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Sun, 7 Jul 2024 18:11:16 +0100 Subject: [PATCH 1/4] artifact --- requirements.txt | 3 +- setup.py | 10 ++++- src/pyterrier_pisa/__init__.py | 44 ++++++++++++--------- src/pyterrier_pisa/indexers.py | 15 ++++--- src/pyterrier_pisa/pisa_metadata_adapter.py | 13 ++++++ 5 files changed, 57 insertions(+), 28 deletions(-) create mode 100644 src/pyterrier_pisa/pisa_metadata_adapter.py diff --git a/requirements.txt b/requirements.txt index 1ce90eb..52f772a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ python_terrier>=0.9.1 -numpy>=1.21.0 +pyterrier-alpha>=0.2.0 +numpy>=1.21.0, <2.0.0 pyciff==0.1.1 diff --git a/setup.py b/setup.py index 1ddc3d5..f4b8d3d 100644 --- a/setup.py +++ b/setup.py @@ -36,10 +36,16 @@ def run(self): packages=find_packages(where='src'), package_dir={'': 'src'}, include_package_data=True, - install_requires=['python-terrier>=0.8.0', 'numpy>=1.21.0'], - python_requires=">=3.7", + install_requires=list(open('requirements.txt')), + python_requires=">=3.8", entry_points={ 'console_scripts': ['pyterrier_pisa=pyterrier_pisa.cli:main'], + 'pyterrier.artifact': [ + 'sparse_index.pisa = pyterrier_pisa:PisaIndex', + ], + 'pyterrier.artifact.metadata_adapter': [ + 'sparse_index.pisa = pyterrier_pisa.pisa_metadata_adapter:pisa_artifact_metadata_adapter', + ], }, cmdclass={'bdist_wheel': bdist_wheel} ) diff --git a/src/pyterrier_pisa/__init__.py b/src/pyterrier_pisa/__init__.py index 34e62c5..4daa03b 100644 --- a/src/pyterrier_pisa/__init__.py +++ b/src/pyterrier_pisa/__init__.py @@ -10,6 +10,7 @@ from enum import Enum from collections import Counter import pyterrier as pt +import pyterrier_alpha as pta from pyterrier.datasets import Dataset import functools import ir_datasets @@ -92,7 +93,7 @@ def log_level(on=True): _pisathon.log_level(1 if on else 0) -class PisaIndex(pt.Indexer): +class PisaIndex(pta.Artifact, pt.Indexer): def __init__(self, path: str, text_field: str = None, @@ -102,14 +103,17 @@ def __init__(self, stops: Optional[Union[PisaStopwords, List[str]]] = None, threads: int = 8, overwrite=False): - self.path = path - ppath = Path(path) + super().__init__(path) if stemmer is not None: stemmer = PisaStemmer(stemmer) if index_encoding is not None: index_encoding = PisaIndexEncoding(index_encoding) if stops is not None and not isinstance(stops, list): stops = PisaStopwords(stops) - if (ppath/'pt_pisa_config.json').exists(): - with (ppath/'pt_pisa_config.json').open('rt') as fin: - config = json.load(fin) + if (_old_metadata := (self.path/'pt_pisa_config.json').exists()) or (self.path/'pt_meta.json').exists(): + if _old_metadata: + with (self.path/'pt_pisa_config.json').open('rt') as fin: + config = json.load(fin) + else: + with (self.path/'pt_meta.json').open('rt') as fin: + config = json.load(fin) if stemmer is None: stemmer = PisaStemmer(config['stemmer']) if stemmer.value != config['stemmer']: @@ -129,7 +133,7 @@ def transform(self, *args, **kwargs): raise RuntimeError(f'You cannot use {self} itself as a transformer. Did you mean to call a ranking function like .bm25()?') def built(self): - return (Path(self.path)/'pt_pisa_config.json').exists() + return (self.path/'pt_meta.json').exists() or (self.path/'pt_pisa_config.json').exists() def index(self, it): it = more_itertools.peekable(it) @@ -168,17 +172,17 @@ def quantized(self, num_results=1000, verbose=False, threads=None, query_algorit def num_terms(self): assert self.built() - return _pisathon.num_terms(self.path) + return _pisathon.num_terms(str(self.path)) def num_docs(self): assert self.built() - return _pisathon.num_docs(self.path) + return _pisathon.num_docs(str(self.path)) def __len__(self): return self.num_docs() def __repr__(self): - return f'PisaIndex({repr(self.path)})' + return f'PisaIndex({repr(str(self.path))})' @staticmethod def from_dataset(dataset: Union[str, Dataset], variant: str = 'pisa_porter2', version: str = 'latest', **kwargs): @@ -210,8 +214,11 @@ def from_ciff(ciff_file: str, index_path, overwrite: bool = False, stemmer = PIS else: # If it wasn't created, create one from the documents file _pisathon.build_binlex(str(ppath/'fwd.documents'), str(ppath/'fwd.doclex')) - with open(ppath/'pt_pisa_config.json', 'wt') as fout: + with open(ppath/'pt_meta.json', 'wt') as fout: json.dump({ + 'type': 'sparse_index', + 'format': 'pisa', + 'package_hint': 'pyterrier-pisa', 'stemmer': stemmer.value, }, fout) return PisaIndex(index_path, stemmer=stemmer) @@ -219,16 +226,15 @@ def from_ciff(ciff_file: str, index_path, overwrite: bool = False, stemmer = PIS def to_ciff(self, ciff_file: str, description: str = 'from pyterrier_pisa'): assert self.built() import pyciff - pyciff.pisa_to_ciff(str(Path(self.path)/'inv'), str(Path(self.path)/'fwd.terms'), str(Path(self.path)/'fwd.documents'), ciff_file, description) + pyciff.pisa_to_ciff(str(self.path/'inv'), str(self.path/'fwd.terms'), str(self.path/'fwd.documents'), ciff_file, description) def get_corpus_iter(self, field='toks', verbose=True): assert self.built() - ppath = Path(self.path) - assert (ppath/'fwd').exists(), "get_corpus_iter requires a fwd index" - m = np.memmap(ppath/'fwd', mode='r', dtype=np.uint32) - lexicon = [l.strip() for l in (ppath/'fwd.terms').open('rt')] + assert (self.path/'fwd').exists(), "get_corpus_iter requires a fwd index" + m = np.memmap(self.path/'fwd', mode='r', dtype=np.uint32) + lexicon = [l.strip() for l in (self.path/'fwd.terms').open('rt')] idx = 2 - it = iter((ppath/'fwd.documents').open('rt')) + it = iter((self.path/'fwd.documents').open('rt')) if verbose: it = _logger.pbar(it, total=int(m[1]), desc=f'iterating documents in {self}', unit='doc') for did in it: @@ -269,7 +275,7 @@ def __init__(self, index: Union[PisaIndex, str], scorer: Union[PisaScorer, str], else: self.query_weighted = query_weighted self.toks_scale = toks_scale - _pisathon.prepare_index(self.index.path, encoding=self.index.index_encoding.value, scorer_name=self.scorer.value, **retr_args) + _pisathon.prepare_index(str(self.index.path), encoding=self.index.index_encoding.value, scorer_name=self.scorer.value, **retr_args) def transform(self, queries): assert 'qid' in queries.columns @@ -295,7 +301,7 @@ def transform(self, queries): result_ranks = np.ascontiguousarray(np.empty(shape, dtype=np.int32)) result_scores = np.ascontiguousarray(np.empty(shape, dtype=np.float32)) size = _pisathon.retrieve( - self.index.path, + str(self.index.path), self.index.index_encoding.value, self.query_algorithm.value, self.scorer.value, diff --git a/src/pyterrier_pisa/indexers.py b/src/pyterrier_pisa/indexers.py index 5421f36..967cd82 100644 --- a/src/pyterrier_pisa/indexers.py +++ b/src/pyterrier_pisa/indexers.py @@ -24,7 +24,7 @@ class PisaIndexingMode(Enum): class PisaIndexer(pt.Indexer): def __init__(self, path, text_field='text', mode=PisaIndexingMode.create, stemmer='porter2', threads=1, batch_size=100_000): - self.path = path + self.path = Path(path) self.text_field = text_field self.mode = PisaIndexingMode(mode) self.stemmer = pyterrier_pisa.PisaStemmer(stemmer) @@ -32,23 +32,26 @@ def __init__(self, path, text_field='text', mode=PisaIndexingMode.create, stemme self.batch_size = batch_size def index(self, it): - path = Path(self.path) + path = self.path if pyterrier_pisa.PisaIndex.built(self): if PisaIndexingMode(self.mode) == PisaIndexingMode.overwrite: warn(f'Removing {str(path)}') shutil.rmtree(path) else: - raise RuntimeError(f'A PISA index already exists at {self.path}. If you want to overwrite it, set mode="overwrite"') + raise RuntimeError(f'A PISA index already exists at {path}. If you want to overwrite it, set mode="overwrite"') if not path.exists(): path.mkdir(parents=True, exist_ok=True) self._index(it) - with open(path/'pt_pisa_config.json', 'wt') as fout: + with open(path/'pt_meta.json', 'wt') as fout: json.dump({ + 'type': 'sparse_index', + 'format': 'pisa', + 'package_hint': 'pyterrier-pisa', 'stemmer': self.stemmer.value, }, fout) - return pyterrier_pisa.PisaIndex(self.path, batch_size=self.batch_size, stemmer=self.stemmer, text_field=self.text_field, threads=self.threads) + return pyterrier_pisa.PisaIndex(path, batch_size=self.batch_size, stemmer=self.stemmer, text_field=self.text_field, threads=self.threads) def _index(self, it): with tempfile.TemporaryDirectory() as d: @@ -76,7 +79,7 @@ def __init__(self, path, text_field='toks', mode=PisaIndexingMode.create, thread def _index(self, it): lexicon = {} docid = 0 - path = Path(self.path) + path = self.path with (path/'fwd.documents').open('wt') as f_docs, (path/'fwd.terms').open('wt') as f_lex: for bidx, batch in enumerate(more_itertools.chunked(it, self.batch_size)): _logger.info(f'inverting batch {bidx}: documents [{docid},{docid+len(batch)})') diff --git a/src/pyterrier_pisa/pisa_metadata_adapter.py b/src/pyterrier_pisa/pisa_metadata_adapter.py new file mode 100644 index 0000000..f1186ed --- /dev/null +++ b/src/pyterrier_pisa/pisa_metadata_adapter.py @@ -0,0 +1,13 @@ +from typing import List + +def pisa_artifact_metadata_adapter(path: str, dir_listing: List[str]): + """ + Guess whether this path is a pisa index. + pyterrier_pisa used to use pt_pisa_config.json instead of pt_meta.json. Use this file to assume they are pisa indexes. + """ + if 'pt_pisa_config.json' in dir_listing: + return { + 'type': 'sparse_index', + 'format': 'pisa', + 'package_hint': 'pyterrier-pisa', + } From 40fbf59cb325b3ac688565d5536e5aa02a6fc7e9 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Sun, 7 Jul 2024 18:18:04 +0100 Subject: [PATCH 2/4] drop 3.7 from gh actions --- .github/workflows/build.yml | 4 ++-- .github/workflows/ci.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c6e088c..d3a2f10 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -35,8 +35,8 @@ jobs: CIBW_BEFORE_BUILD: "python -m pip install numpy setuptools scikit-build ninja cmake -v" CIBW_BUILD_VERBOSITY: '1' CIBW_ARCHS_LINUX: "x86_64" - # Only supports cpython 3.7 - 3.10 on manylinux_x86_64 - CIBW_BUILD: 'cp37-manylinux_x86_64 cp38-manylinux_x86_64 cp39-manylinux_x86_64 cp310-manylinux_x86_64' + # Only supports cpython 3.8 - 3.10 on manylinux_x86_64 + CIBW_BUILD: 'cp38-manylinux_x86_64 cp39-manylinux_x86_64 cp310-manylinux_x86_64' CIBW_REPAIR_WHEEL_COMMAND: 'auditwheel repair -w {dest_dir} {wheel} ; python patcher.py {dest_dir}' CIBW_ENVIRONMENT: 'PT_PISA_VERSION_SUFFIX="${{github.event.inputs.version_suffix}}" PT_PISA_MANYLINUX="True"' - name: upload Test diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5889383..f5a4155 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,7 +7,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ['3.7', '3.8', '3.9', '3.10'] + python-version: ['3.8', '3.9', '3.10'] name: [ubuntu-gcc-9] java: [13] architecture: ['x64'] From cdaed01e59493eb893b9f16e6c4e053607d58707 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Mon, 8 Jul 2024 08:02:09 +0100 Subject: [PATCH 3/4] remove java dependency for stopword list --- src/pyterrier_pisa/__init__.py | 27 ++++++++++----------------- src/pyterrier_pisa/stopwords.py | 4 ++++ 2 files changed, 14 insertions(+), 17 deletions(-) create mode 100644 src/pyterrier_pisa/stopwords.py diff --git a/src/pyterrier_pisa/__init__.py b/src/pyterrier_pisa/__init__.py index 4daa03b..b1f8a09 100644 --- a/src/pyterrier_pisa/__init__.py +++ b/src/pyterrier_pisa/__init__.py @@ -12,10 +12,10 @@ import pyterrier as pt import pyterrier_alpha as pta from pyterrier.datasets import Dataset -import functools import ir_datasets from . import _pisathon from .indexers import PisaIndexer, PisaToksIndexer, PisaIndexingMode +from .stopwords import _STOPWORDS __version__ = '0.1.0' @@ -334,22 +334,15 @@ def from_dataset(dataset: Union[str, Dataset], variant: str = None, version: str def _stops_fname(self, d): if self.stops == PisaStopwords.none: return '' - else: - fifo = os.path.join(d, 'stops') - stops = self.stops - if stops == PisaStopwords.terrier: - stops = _terrier_stops() - with open(fifo, 'wt') as fout: - for stop in stops: - fout.write(f'{stop}\n') - return fifo - - -@functools.lru_cache() -def _terrier_stops(): - Stopwords = pt.autoclass('org.terrier.terms.Stopwords') - stops = list(Stopwords(None).stopWords) - return stops + + fifo = os.path.join(d, 'stops') + stops = self.stops + if stops == PisaStopwords.terrier: + stops = _STOPWORDS['terrier'] + with open(fifo, 'wt') as fout: + for stop in stops: + fout.write(f'{stop}\n') + return fifo class DictTokeniser(pt.Transformer): diff --git a/src/pyterrier_pisa/stopwords.py b/src/pyterrier_pisa/stopwords.py new file mode 100644 index 0000000..6c62cb3 --- /dev/null +++ b/src/pyterrier_pisa/stopwords.py @@ -0,0 +1,4 @@ +_STOPWORDS = { + # From org.terrier.terms.Stopwords + 'terrier': 'a abaft abafter abaftest about abouter aboutest above abover abovest accordingly aer aest afore after afterer afterest afterward afterwards again against aid ain albeit all aller allest alls allyou almost along alongside already also although always amid amidst among amongst an and andor anear anent another any anybody anyhow anyone anything anywhere apart aparter apartest appear appeared appearing appears appropriate appropriated appropriater appropriates appropriatest appropriating are ares around as ases aside asides aslant astraddle astraddler astraddlest astride astrider astridest at athwart atop atween aught aughts available availabler availablest awfully b be became because become becomes becoming becominger becomingest becomings been before beforehand beforehander beforehandest behind behinds below beneath beside besides better bettered bettering betters between betwixt beyond bist both but buts by by-and-by byandby c cannot canst cant canted cantest canting cants cer certain certainer certainest cest chez circa co come-on come-ons comeon comeons concerning concerninger concerningest consequently considering could couldst cum d dday ddays describe described describes describing despite despited despites despiting did different differenter differentest do doe does doing doings done doner dones donest dos dost doth downs downward downwarder downwardest downwards during e each eg eight either else elsewhere enough ere et etc even evened evenest evens evenser evensest ever every everybody everyone everything everywhere ex except excepted excepting excepts exes f fact facts failing failings few fewer fewest figupon figuponed figuponing figupons five followthrough for forby forbye fore forer fores forever former formerer formerest formerly formers fornenst forwhy four fourscore frae from fs further furthered furtherer furtherest furthering furthermore furthers g get gets getting go gone good got gotta gotten h had hadst hae hardly has hast hath have haves having he hence her hereafter hereafters hereby herein hereupon hers herself him himself his hither hitherer hitherest hoo hoos how how-do-you-do howbeit howdoyoudo however huh humph i idem idemer idemest ie if ifs immediate immediately immediater immediatest in inasmuch inc indeed indicate indicated indicates indicating info information insofar instead into inward inwarder inwardest inwards is it its itself j k l latter latterer latterest latterly latters layabout layabouts less lest lot lots lotted lotting m main make many mauger maugre mayest me meanwhile meanwhiles midst midsts might mights more moreover most mostly much mucher muchest must musth musths musts my myself n natheless nathless neath neaths necessarier necessariest necessary neither nethe nethermost never nevertheless nigh nigher nighest nine no no-one nobodies nobody noes none noone nor nos not nothing nothings notwithstanding nowhere nowheres o of off offest offs often oftener oftenest oh on one oneself onest ons onto or orer orest other others otherwise otherwiser otherwisest ought oughts our ours ourself ourselves out outed outest outs outside outwith over overall overaller overallest overalls overs own owned owning owns owt p particular particularer particularest particularly particulars per perhaps plaintiff please pleased pleases plenties plenty pro probably provide provided provides providing q qua que quite r rath rathe rather rathest re really regarding relate related relatively res respecting respectively s said saider saidest same samer sames samest sans sanserif sanserifs sanses saved sayid sayyid seem seemed seeminger seemingest seemings seems send sent senza serious seriouser seriousest seven several severaler severalest shall shalled shalling shalls she should shoulded shoulding shoulds since sine sines sith six so sobeit soer soest some somebody somehow someone something sometime sometimer sometimes sometimest somewhat somewhere stop stopped such summat sup supped supping sups syn syne t ten than that the thee their theirs them themselves then thence thener thenest there thereafter thereby therefore therein therer therest thereupon these they thine thing things this thises thorough thorougher thoroughest thoroughly those thou though thous thouses three thro through througher throughest throughout thru thruer thruest thus thy thyself till tilled tilling tills to together too toward towarder towardest towards two u umpteen under underneath unless unlike unliker unlikest until unto up upon uponed uponing upons upped upping ups us use used usedest username usually v various variouser variousest verier veriest versus very via vis-a-vis vis-a-viser vis-a-visest viz vs w was wast we were wert what whatever whateverer whateverest whatsoever whatsoeverer whatsoeverest wheen when whenas whence whencesoever whenever whensoever where whereafter whereas whereby wherefrom wherein whereinto whereof whereon wheresoever whereto whereupon wherever wherewith wherewithal whether which whichever whichsoever while whiles whilst whither whithersoever whoever whomever whose whoso whosoever why with withal within without would woulded woulding woulds x y ye yet yon yond yonder you your yours yourself yourselves z zillion'.split(), +} From 9b2b2a110c6a0197dbcfc18141754f9afb1baf81 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Mon, 8 Jul 2024 08:04:40 +0100 Subject: [PATCH 4/4] use tqdm directly --- requirements.txt | 1 + src/pyterrier_pisa/__init__.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 52f772a..c73765d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ python_terrier>=0.9.1 pyterrier-alpha>=0.2.0 numpy>=1.21.0, <2.0.0 pyciff==0.1.1 +tqdm diff --git a/src/pyterrier_pisa/__init__.py b/src/pyterrier_pisa/__init__.py index b1f8a09..6064810 100644 --- a/src/pyterrier_pisa/__init__.py +++ b/src/pyterrier_pisa/__init__.py @@ -9,6 +9,7 @@ from typing import Optional, Union, List from enum import Enum from collections import Counter +from tqdm import tqdm import pyterrier as pt import pyterrier_alpha as pta from pyterrier.datasets import Dataset @@ -293,7 +294,7 @@ def transform(self, queries): inp.extend(enumerate(queries['query'])) if self.verbose: - inp = pt.tqdm(inp, unit='query', desc=f'PISA {self.scorer.value}') + inp = tqdm(inp, unit='query', desc=f'PISA {self.scorer.value}') with tempfile.TemporaryDirectory() as d: shape = (len(queries) * self.num_results,) result_qidxs = np.ascontiguousarray(np.empty(shape, dtype=np.int32))