From 18a8f857fd12be9c384e6c2dcfda021a13411f14 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Thu, 5 Dec 2024 09:59:19 +0000 Subject: [PATCH] upgraded github actions (#32) * upgraded github actions * fix ruff errors * fix ruff errors --- .github/workflows/{build.yml => deploy.yml} | 4 +-- .github/workflows/style.yml | 35 +++++++++++++++++++++ .github/workflows/{ci.yml => test.yml} | 11 ++++--- requirements-dev.txt | 6 ++++ requirements-test.txt | 1 - src/pyterrier_pisa/__init__.py | 21 ++++++++----- src/pyterrier_pisa/indexers.py | 18 +++++------ 7 files changed, 72 insertions(+), 24 deletions(-) rename .github/workflows/{build.yml => deploy.yml} (97%) create mode 100644 .github/workflows/style.yml rename .github/workflows/{ci.yml => test.yml} (87%) create mode 100644 requirements-dev.txt delete mode 100644 requirements-test.txt diff --git a/.github/workflows/build.yml b/.github/workflows/deploy.yml similarity index 97% rename from .github/workflows/build.yml rename to .github/workflows/deploy.yml index dba249a..ed7a139 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/deploy.yml @@ -1,11 +1,11 @@ -name: Upload Python Package +name: deploy on: release: types: [created] jobs: - deploy-bdist: + pypi: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml new file mode 100644 index 0000000..aa94892 --- /dev/null +++ b/.github/workflows/style.yml @@ -0,0 +1,35 @@ +name: style + +on: + push: {branches: [main]} # pushes to main + pull_request: {} # all PRs + +jobs: + ruff: + strategy: + matrix: + python-version: ['3.10'] + os: ['ubuntu-latest'] + + runs-on: ${{ matrix.os }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Cache Dependencies + uses: actions/cache@v4 + with: + path: ${{ env.pythonLocation }} + key: ${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements.txt', 'requirements-dev.txt') }} + + - name: Install Dependencies + run: | + pip install --upgrade -r requirements-dev.txt -r requirements.txt + + - name: Ruff + run: 'ruff check --output-format=github src/pyterrier_pisa' diff --git a/.github/workflows/ci.yml b/.github/workflows/test.yml similarity index 87% rename from .github/workflows/ci.yml rename to .github/workflows/test.yml index a8a6a58..445a586 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/test.yml @@ -1,9 +1,12 @@ -name: Test Python package +name: test -on: [push, pull_request] +on: + push: {branches: [main]} # pushes to main + pull_request: {} # all PRs + schedule: [cron: '0 12 * * 3'] # every Wednesday at noon jobs: - build: + pytest: runs-on: ${{ matrix.os }} strategy: matrix: @@ -45,7 +48,7 @@ jobs: python setup.py bdist_wheel python patcher.py dist/ pip install dist/*.whl - pip install -r requirements-test.txt + pip install -r requirements-dev.txt - uses: actions/upload-artifact@v4 with: diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..4bb0f3f --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,6 @@ +nltk +pytest +pytest-subtests +pytest-cov +pytest-json-report +ruff diff --git a/requirements-test.txt b/requirements-test.txt deleted file mode 100644 index 6fa2de4..0000000 --- a/requirements-test.txt +++ /dev/null @@ -1 +0,0 @@ -nltk \ No newline at end of file diff --git a/src/pyterrier_pisa/__init__.py b/src/pyterrier_pisa/__init__.py index fac268e..f73ba77 100644 --- a/src/pyterrier_pisa/__init__.py +++ b/src/pyterrier_pisa/__init__.py @@ -1,7 +1,6 @@ from typing import List import numpy as np import json -import sys from pathlib import Path import tempfile import os @@ -121,9 +120,12 @@ def __init__(self, overwrite: If True, the index will be overwritten if it already exists. Defaults to False. """ super().__init__(path) - if stemmer is not None: stemmer = PisaStemmer(stemmer) - if index_encoding is not None: index_encoding = PisaIndexEncoding(index_encoding) - if stops is not None and not isinstance(stops, list): stops = PisaStopwords(stops) + if stemmer is not None: + stemmer = PisaStemmer(stemmer) + if index_encoding is not None: + index_encoding = PisaIndexEncoding(index_encoding) + if stops is not None and not isinstance(stops, list): + stops = PisaStopwords(stops) if (_old_metadata := (self.path/'pt_pisa_config.json').exists()) or (self.path/'pt_meta.json').exists(): if _old_metadata: with (self.path/'pt_pisa_config.json').open('rt') as fin: @@ -135,9 +137,12 @@ def __init__(self, stemmer = PisaStemmer(config['stemmer']) if stemmer.value != config['stemmer']: warn(f'requested stemmer={stemmer.value}, but index was constructed with {config["stemmer"]}') - if stemmer is None: stemmer = PISA_INDEX_DEFAULTS['stemmer'] - if index_encoding is None: index_encoding = PISA_INDEX_DEFAULTS['index_encoding'] - if stops is None: stops = PISA_INDEX_DEFAULTS['stops'] + if stemmer is None: + stemmer = PISA_INDEX_DEFAULTS['stemmer'] + if index_encoding is None: + index_encoding = PISA_INDEX_DEFAULTS['index_encoding'] + if stops is None: + stops = PISA_INDEX_DEFAULTS['stops'] self.text_field = text_field self.stemmer = stemmer self.index_encoding = index_encoding @@ -329,7 +334,7 @@ def get_corpus_iter(self, field='toks', verbose=True): assert self.built() assert (self.path/'fwd').exists(), "get_corpus_iter requires a fwd index" m = np.memmap(self.path/'fwd', mode='r', dtype=np.uint32) - lexicon = [l.strip() for l in (self.path/'fwd.terms').open('rt')] + lexicon = [term.strip() for term in (self.path/'fwd.terms').open('rt')] idx = 2 it = iter((self.path/'fwd.documents').open('rt')) if verbose: diff --git a/src/pyterrier_pisa/indexers.py b/src/pyterrier_pisa/indexers.py index 967cd82..5c1defe 100644 --- a/src/pyterrier_pisa/indexers.py +++ b/src/pyterrier_pisa/indexers.py @@ -87,26 +87,26 @@ def _index(self, it): inv_score = defaultdict(list) lens = [] for doc in batch: - l = 0 + doclen = 0 f_docs.write(doc['docno']+'\n') for term, score in doc[self.text_field].items(): score = int(score * self.scale) if score <= 0: continue - l += score + doclen += score if term not in lexicon: lexicon[term] = len(lexicon) f_lex.write(term+'\n') inv_did[lexicon[term]].append(docid) inv_score[lexicon[term]].append(int(score)) - lens.append(l) + lens.append(doclen) docid += 1 with (path/f'inv.batch.{bidx}.docs').open('wb') as f_did, (path/f'inv.batch.{bidx}.freqs').open('wb') as f_score, (path/f'inv.batch.{bidx}.sizes').open('wb') as f_len: f_did.write(np.array([1, len(batch)], dtype=np.uint32).tobytes()) for i in range(len(lexicon)): - l = len(inv_did[i]) - f_did.write(np.array([l] + inv_did[i], dtype=np.uint32).tobytes()) - f_score.write(np.array([l] + inv_score[i], dtype=np.uint32).tobytes()) + doclen = len(inv_did[i]) + f_did.write(np.array([doclen] + inv_did[i], dtype=np.uint32).tobytes()) + f_score.write(np.array([doclen] + inv_score[i], dtype=np.uint32).tobytes()) f_len.write(np.array([len(lens)] + lens, dtype=np.uint32).tobytes()) _pisathon.merge_inv(str(path/'inv'), bidx+1, len(lexicon)) for i in range(bidx+1): @@ -128,9 +128,9 @@ def _index(self, it): for term in _logger.pbar(sorted(lexicon), desc='re-mapping term ids'): f_lex.write(f'{term}\n') i = lexicon[term] - start, l = offsets_lens[i] - f_docs.write(in_docs[start:start+l]) - f_freqs.write(in_freqs[start:start+l]) + start, doclen = offsets_lens[i] + f_docs.write(in_docs[start:start+doclen]) + f_freqs.write(in_freqs[start:start+doclen]) del in_docs # close mmap del in_freqs # close mmap (path/'inv.docs.tmp').unlink()