Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remove java dependency for stopword list #22

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ jobs:
CIBW_BEFORE_BUILD: "python -m pip install numpy setuptools scikit-build ninja cmake -v"
CIBW_BUILD_VERBOSITY: '1'
CIBW_ARCHS_LINUX: "x86_64"
# Only supports cpython 3.7 - 3.10 on manylinux_x86_64
CIBW_BUILD: 'cp37-manylinux_x86_64 cp38-manylinux_x86_64 cp39-manylinux_x86_64 cp310-manylinux_x86_64'
# Only supports cpython 3.8 - 3.10 on manylinux_x86_64
CIBW_BUILD: 'cp38-manylinux_x86_64 cp39-manylinux_x86_64 cp310-manylinux_x86_64'
CIBW_REPAIR_WHEEL_COMMAND: 'auditwheel repair -w {dest_dir} {wheel} ; python patcher.py {dest_dir}'
CIBW_ENVIRONMENT: 'PT_PISA_VERSION_SUFFIX="${{github.event.inputs.version_suffix}}" PT_PISA_MANYLINUX="True"'
- name: upload Test
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: ['3.7', '3.8', '3.9', '3.10']
python-version: ['3.8', '3.9', '3.10']
name: [ubuntu-gcc-9]
java: [13]
architecture: ['x64']
Expand Down
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
python_terrier>=0.9.1
numpy>=1.21.0
pyterrier-alpha>=0.2.0
numpy>=1.21.0, <2.0.0
pyciff==0.1.1
tqdm
10 changes: 8 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,16 @@ def run(self):
packages=find_packages(where='src'),
package_dir={'': 'src'},
include_package_data=True,
install_requires=['python-terrier>=0.8.0', 'numpy>=1.21.0'],
python_requires=">=3.7",
install_requires=list(open('requirements.txt')),
python_requires=">=3.8",
entry_points={
'console_scripts': ['pyterrier_pisa=pyterrier_pisa.cli:main'],
'pyterrier.artifact': [
'sparse_index.pisa = pyterrier_pisa:PisaIndex',
],
'pyterrier.artifact.metadata_adapter': [
'sparse_index.pisa = pyterrier_pisa.pisa_metadata_adapter:pisa_artifact_metadata_adapter',
],
},
cmdclass={'bdist_wheel': bdist_wheel}
)
74 changes: 37 additions & 37 deletions src/pyterrier_pisa/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@
from typing import Optional, Union, List
from enum import Enum
from collections import Counter
from tqdm import tqdm
import pyterrier as pt
import pyterrier_alpha as pta
from pyterrier.datasets import Dataset
import functools
import ir_datasets
from . import _pisathon
from .indexers import PisaIndexer, PisaToksIndexer, PisaIndexingMode
from .stopwords import _STOPWORDS

__version__ = '0.1.0'

Expand Down Expand Up @@ -92,7 +94,7 @@ def log_level(on=True):
_pisathon.log_level(1 if on else 0)


class PisaIndex(pt.Indexer):
class PisaIndex(pta.Artifact, pt.Indexer):
def __init__(self,
path: str,
text_field: str = None,
Expand All @@ -102,14 +104,17 @@ def __init__(self,
stops: Optional[Union[PisaStopwords, List[str]]] = None,
threads: int = 8,
overwrite=False):
self.path = path
ppath = Path(path)
super().__init__(path)
if stemmer is not None: stemmer = PisaStemmer(stemmer)
if index_encoding is not None: index_encoding = PisaIndexEncoding(index_encoding)
if stops is not None and not isinstance(stops, list): stops = PisaStopwords(stops)
if (ppath/'pt_pisa_config.json').exists():
with (ppath/'pt_pisa_config.json').open('rt') as fin:
config = json.load(fin)
if (_old_metadata := (self.path/'pt_pisa_config.json').exists()) or (self.path/'pt_meta.json').exists():
if _old_metadata:
with (self.path/'pt_pisa_config.json').open('rt') as fin:
config = json.load(fin)
else:
with (self.path/'pt_meta.json').open('rt') as fin:
config = json.load(fin)
if stemmer is None:
stemmer = PisaStemmer(config['stemmer'])
if stemmer.value != config['stemmer']:
Expand All @@ -129,7 +134,7 @@ def transform(self, *args, **kwargs):
raise RuntimeError(f'You cannot use {self} itself as a transformer. Did you mean to call a ranking function like .bm25()?')

def built(self):
return (Path(self.path)/'pt_pisa_config.json').exists()
return (self.path/'pt_meta.json').exists() or (self.path/'pt_pisa_config.json').exists()

def index(self, it):
it = more_itertools.peekable(it)
Expand Down Expand Up @@ -168,17 +173,17 @@ def quantized(self, num_results=1000, verbose=False, threads=None, query_algorit

def num_terms(self):
assert self.built()
return _pisathon.num_terms(self.path)
return _pisathon.num_terms(str(self.path))

def num_docs(self):
assert self.built()
return _pisathon.num_docs(self.path)
return _pisathon.num_docs(str(self.path))

def __len__(self):
return self.num_docs()

def __repr__(self):
return f'PisaIndex({repr(self.path)})'
return f'PisaIndex({repr(str(self.path))})'

@staticmethod
def from_dataset(dataset: Union[str, Dataset], variant: str = 'pisa_porter2', version: str = 'latest', **kwargs):
Expand Down Expand Up @@ -210,25 +215,27 @@ def from_ciff(ciff_file: str, index_path, overwrite: bool = False, stemmer = PIS
else:
# If it wasn't created, create one from the documents file
_pisathon.build_binlex(str(ppath/'fwd.documents'), str(ppath/'fwd.doclex'))
with open(ppath/'pt_pisa_config.json', 'wt') as fout:
with open(ppath/'pt_meta.json', 'wt') as fout:
json.dump({
'type': 'sparse_index',
'format': 'pisa',
'package_hint': 'pyterrier-pisa',
'stemmer': stemmer.value,
}, fout)
return PisaIndex(index_path, stemmer=stemmer)

def to_ciff(self, ciff_file: str, description: str = 'from pyterrier_pisa'):
assert self.built()
import pyciff
pyciff.pisa_to_ciff(str(Path(self.path)/'inv'), str(Path(self.path)/'fwd.terms'), str(Path(self.path)/'fwd.documents'), ciff_file, description)
pyciff.pisa_to_ciff(str(self.path/'inv'), str(self.path/'fwd.terms'), str(self.path/'fwd.documents'), ciff_file, description)

def get_corpus_iter(self, field='toks', verbose=True):
assert self.built()
ppath = Path(self.path)
assert (ppath/'fwd').exists(), "get_corpus_iter requires a fwd index"
m = np.memmap(ppath/'fwd', mode='r', dtype=np.uint32)
lexicon = [l.strip() for l in (ppath/'fwd.terms').open('rt')]
assert (self.path/'fwd').exists(), "get_corpus_iter requires a fwd index"
m = np.memmap(self.path/'fwd', mode='r', dtype=np.uint32)
lexicon = [l.strip() for l in (self.path/'fwd.terms').open('rt')]
idx = 2
it = iter((ppath/'fwd.documents').open('rt'))
it = iter((self.path/'fwd.documents').open('rt'))
if verbose:
it = _logger.pbar(it, total=int(m[1]), desc=f'iterating documents in {self}', unit='doc')
for did in it:
Expand Down Expand Up @@ -269,7 +276,7 @@ def __init__(self, index: Union[PisaIndex, str], scorer: Union[PisaScorer, str],
else:
self.query_weighted = query_weighted
self.toks_scale = toks_scale
_pisathon.prepare_index(self.index.path, encoding=self.index.index_encoding.value, scorer_name=self.scorer.value, **retr_args)
_pisathon.prepare_index(str(self.index.path), encoding=self.index.index_encoding.value, scorer_name=self.scorer.value, **retr_args)

def transform(self, queries):
assert 'qid' in queries.columns
Expand All @@ -287,15 +294,15 @@ def transform(self, queries):
inp.extend(enumerate(queries['query']))

if self.verbose:
inp = pt.tqdm(inp, unit='query', desc=f'PISA {self.scorer.value}')
inp = tqdm(inp, unit='query', desc=f'PISA {self.scorer.value}')
with tempfile.TemporaryDirectory() as d:
shape = (len(queries) * self.num_results,)
result_qidxs = np.ascontiguousarray(np.empty(shape, dtype=np.int32))
result_docnos = np.ascontiguousarray(np.empty(shape, dtype=object))
result_ranks = np.ascontiguousarray(np.empty(shape, dtype=np.int32))
result_scores = np.ascontiguousarray(np.empty(shape, dtype=np.float32))
size = _pisathon.retrieve(
self.index.path,
str(self.index.path),
self.index.index_encoding.value,
self.query_algorithm.value,
self.scorer.value,
Expand Down Expand Up @@ -328,22 +335,15 @@ def from_dataset(dataset: Union[str, Dataset], variant: str = None, version: str
def _stops_fname(self, d):
if self.stops == PisaStopwords.none:
return ''
else:
fifo = os.path.join(d, 'stops')
stops = self.stops
if stops == PisaStopwords.terrier:
stops = _terrier_stops()
with open(fifo, 'wt') as fout:
for stop in stops:
fout.write(f'{stop}\n')
return fifo


@functools.lru_cache()
def _terrier_stops():
Stopwords = pt.autoclass('org.terrier.terms.Stopwords')
stops = list(Stopwords(None).stopWords)
return stops

fifo = os.path.join(d, 'stops')
stops = self.stops
if stops == PisaStopwords.terrier:
stops = _STOPWORDS['terrier']
with open(fifo, 'wt') as fout:
for stop in stops:
fout.write(f'{stop}\n')
return fifo


class DictTokeniser(pt.Transformer):
Expand Down
15 changes: 9 additions & 6 deletions src/pyterrier_pisa/indexers.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,31 +24,34 @@ class PisaIndexingMode(Enum):

class PisaIndexer(pt.Indexer):
def __init__(self, path, text_field='text', mode=PisaIndexingMode.create, stemmer='porter2', threads=1, batch_size=100_000):
self.path = path
self.path = Path(path)
self.text_field = text_field
self.mode = PisaIndexingMode(mode)
self.stemmer = pyterrier_pisa.PisaStemmer(stemmer)
self.threads = threads
self.batch_size = batch_size

def index(self, it):
path = Path(self.path)
path = self.path
if pyterrier_pisa.PisaIndex.built(self):
if PisaIndexingMode(self.mode) == PisaIndexingMode.overwrite:
warn(f'Removing {str(path)}')
shutil.rmtree(path)
else:
raise RuntimeError(f'A PISA index already exists at {self.path}. If you want to overwrite it, set mode="overwrite"')
raise RuntimeError(f'A PISA index already exists at {path}. If you want to overwrite it, set mode="overwrite"')
if not path.exists():
path.mkdir(parents=True, exist_ok=True)

self._index(it)

with open(path/'pt_pisa_config.json', 'wt') as fout:
with open(path/'pt_meta.json', 'wt') as fout:
json.dump({
'type': 'sparse_index',
'format': 'pisa',
'package_hint': 'pyterrier-pisa',
'stemmer': self.stemmer.value,
}, fout)
return pyterrier_pisa.PisaIndex(self.path, batch_size=self.batch_size, stemmer=self.stemmer, text_field=self.text_field, threads=self.threads)
return pyterrier_pisa.PisaIndex(path, batch_size=self.batch_size, stemmer=self.stemmer, text_field=self.text_field, threads=self.threads)

def _index(self, it):
with tempfile.TemporaryDirectory() as d:
Expand Down Expand Up @@ -76,7 +79,7 @@ def __init__(self, path, text_field='toks', mode=PisaIndexingMode.create, thread
def _index(self, it):
lexicon = {}
docid = 0
path = Path(self.path)
path = self.path
with (path/'fwd.documents').open('wt') as f_docs, (path/'fwd.terms').open('wt') as f_lex:
for bidx, batch in enumerate(more_itertools.chunked(it, self.batch_size)):
_logger.info(f'inverting batch {bidx}: documents [{docid},{docid+len(batch)})')
Expand Down
13 changes: 13 additions & 0 deletions src/pyterrier_pisa/pisa_metadata_adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from typing import Dict, List, Optional


def pisa_artifact_metadata_adapter(path: str, dir_listing: List[str]) -> Optional[Dict[str, str]]:
    """Guess whether a directory is a legacy PISA index and describe it.

    pyterrier_pisa used to write ``pt_pisa_config.json`` instead of
    ``pt_meta.json``. A directory listing that contains the old file name is
    therefore assumed to be a PISA index, and artifact metadata is synthesised
    for it.

    Args:
        path: Location of the candidate artifact. It is accepted as part of
            the adapter signature but is not inspected here.
        dir_listing: Names of the entries found directly inside ``path``.

    Returns:
        A metadata dict identifying the directory as a PISA sparse index, or
        ``None`` when the legacy config file is not in ``dir_listing``.
    """
    # Only the legacy marker file is checked; anything else is left for other
    # detection logic to handle.
    if 'pt_pisa_config.json' not in dir_listing:
        return None
    return {
        'type': 'sparse_index',
        'format': 'pisa',
        'package_hint': 'pyterrier-pisa',
    }
4 changes: 4 additions & 0 deletions src/pyterrier_pisa/stopwords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
_STOPWORDS = {
# From org.terrier.terms.Stopwords
'terrier': 'a abaft abafter abaftest about abouter aboutest above abover abovest accordingly aer aest afore after afterer afterest afterward afterwards again against aid ain albeit all aller allest alls allyou almost along alongside already also although always amid amidst among amongst an and andor anear anent another any anybody anyhow anyone anything anywhere apart aparter apartest appear appeared appearing appears appropriate appropriated appropriater appropriates appropriatest appropriating are ares around as ases aside asides aslant astraddle astraddler astraddlest astride astrider astridest at athwart atop atween aught aughts available availabler availablest awfully b be became because become becomes becoming becominger becomingest becomings been before beforehand beforehander beforehandest behind behinds below beneath beside besides better bettered bettering betters between betwixt beyond bist both but buts by by-and-by byandby c cannot canst cant canted cantest canting cants cer certain certainer certainest cest chez circa co come-on come-ons comeon comeons concerning concerninger concerningest consequently considering could couldst cum d dday ddays describe described describes describing despite despited despites despiting did different differenter differentest do doe does doing doings done doner dones donest dos dost doth downs downward downwarder downwardest downwards during e each eg eight either else elsewhere enough ere et etc even evened evenest evens evenser evensest ever every everybody everyone everything everywhere ex except excepted excepting excepts exes f fact facts failing failings few fewer fewest figupon figuponed figuponing figupons five followthrough for forby forbye fore forer fores forever former formerer formerest formerly formers fornenst forwhy four fourscore frae from fs further furthered furtherer furtherest furthering furthermore furthers g get gets getting go gone good got gotta gotten h had hadst hae hardly has hast hath have 
haves having he hence her hereafter hereafters hereby herein hereupon hers herself him himself his hither hitherer hitherest hoo hoos how how-do-you-do howbeit howdoyoudo however huh humph i idem idemer idemest ie if ifs immediate immediately immediater immediatest in inasmuch inc indeed indicate indicated indicates indicating info information insofar instead into inward inwarder inwardest inwards is it its itself j k l latter latterer latterest latterly latters layabout layabouts less lest lot lots lotted lotting m main make many mauger maugre mayest me meanwhile meanwhiles midst midsts might mights more moreover most mostly much mucher muchest must musth musths musts my myself n natheless nathless neath neaths necessarier necessariest necessary neither nethe nethermost never nevertheless nigh nigher nighest nine no no-one nobodies nobody noes none noone nor nos not nothing nothings notwithstanding nowhere nowheres o of off offest offs often oftener oftenest oh on one oneself onest ons onto or orer orest other others otherwise otherwiser otherwisest ought oughts our ours ourself ourselves out outed outest outs outside outwith over overall overaller overallest overalls overs own owned owning owns owt p particular particularer particularest particularly particulars per perhaps plaintiff please pleased pleases plenties plenty pro probably provide provided provides providing q qua que quite r rath rathe rather rathest re really regarding relate related relatively res respecting respectively s said saider saidest same samer sames samest sans sanserif sanserifs sanses saved sayid sayyid seem seemed seeminger seemingest seemings seems send sent senza serious seriouser seriousest seven several severaler severalest shall shalled shalling shalls she should shoulded shoulding shoulds since sine sines sith six so sobeit soer soest some somebody somehow someone something sometime sometimer sometimes sometimest somewhat somewhere stop stopped such summat sup supped supping sups 
syn syne t ten than that the thee their theirs them themselves then thence thener thenest there thereafter thereby therefore therein therer therest thereupon these they thine thing things this thises thorough thorougher thoroughest thoroughly those thou though thous thouses three thro through througher throughest throughout thru thruer thruest thus thy thyself till tilled tilling tills to together too toward towarder towardest towards two u umpteen under underneath unless unlike unliker unlikest until unto up upon uponed uponing upons upped upping ups us use used usedest username usually v various variouser variousest verier veriest versus very via vis-a-vis vis-a-viser vis-a-visest viz vs w was wast we were wert what whatever whateverer whateverest whatsoever whatsoeverer whatsoeverest wheen when whenas whence whencesoever whenever whensoever where whereafter whereas whereby wherefrom wherein whereinto whereof whereon wheresoever whereto whereupon wherever wherewith wherewithal whether which whichever whichsoever while whiles whilst whither whithersoever whoever whomever whose whoso whosoever why with withal within without would woulded woulding woulds x y ye yet yon yond yonder you your yours yourself yourselves z zillion'.split(),
}
Loading