From 18a8f857fd12be9c384e6c2dcfda021a13411f14 Mon Sep 17 00:00:00 2001
From: Sean MacAvaney <sean.macavaney@gmail.com>
Date: Thu, 5 Dec 2024 09:59:19 +0000
Subject: [PATCH] upgraded github actions (#32)

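* upgraded GitHub Actions workflows (renamed build.yml -> deploy.yml and
  ci.yml -> test.yml, added a ruff style-check workflow, replaced
  requirements-test.txt with requirements-dev.txt)

* fixed ruff errors (removed the unused `sys` import, split one-line `if`
  statements, renamed the ambiguous variable `l`)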
---
 .github/workflows/{build.yml => deploy.yml} |  4 +--
 .github/workflows/style.yml                 | 35 +++++++++++++++++++++
 .github/workflows/{ci.yml => test.yml}      | 11 ++++---
 requirements-dev.txt                        |  6 ++++
 requirements-test.txt                       |  1 -
 src/pyterrier_pisa/__init__.py              | 21 ++++++++-----
 src/pyterrier_pisa/indexers.py              | 18 +++++------
 7 files changed, 72 insertions(+), 24 deletions(-)
 rename .github/workflows/{build.yml => deploy.yml} (97%)
 create mode 100644 .github/workflows/style.yml
 rename .github/workflows/{ci.yml => test.yml} (87%)
 create mode 100644 requirements-dev.txt
 delete mode 100644 requirements-test.txt

diff --git a/.github/workflows/build.yml b/.github/workflows/deploy.yml
similarity index 97%
rename from .github/workflows/build.yml
rename to .github/workflows/deploy.yml
index dba249a..ed7a139 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/deploy.yml
@@ -1,11 +1,11 @@
-name: Upload Python Package
+name: deploy
 
 on:
   release:
     types: [created]
 
 jobs:
-  deploy-bdist:
+  pypi:
     runs-on: ubuntu-20.04
     steps:
     - uses: actions/checkout@v2
diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml
new file mode 100644
index 0000000..aa94892
--- /dev/null
+++ b/.github/workflows/style.yml
@@ -0,0 +1,35 @@
+name: style
+
+on:
+  push: {branches: [main]} # pushes to main
+  pull_request: {} # all PRs
+
+jobs:
+  ruff:
+    strategy:
+      matrix:
+        python-version: ['3.10']
+        os: ['ubuntu-latest']
+
+    runs-on: ${{ matrix.os }}
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+
+    - name: Install Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Cache Dependencies
+      uses: actions/cache@v4
+      with:
+        path: ${{ env.pythonLocation }}
+        key: ${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements.txt', 'requirements-dev.txt') }}
+
+    - name: Install Dependencies
+      run: |
+        pip install --upgrade -r requirements-dev.txt -r requirements.txt
+
+    - name: Ruff
+      run: 'ruff check --output-format=github src/pyterrier_pisa'
diff --git a/.github/workflows/ci.yml b/.github/workflows/test.yml
similarity index 87%
rename from .github/workflows/ci.yml
rename to .github/workflows/test.yml
index a8a6a58..445a586 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/test.yml
@@ -1,9 +1,12 @@
-name: Test Python package
+name: test
 
-on: [push, pull_request]
+on:
+  push: {branches: [main]} # pushes to main
+  pull_request: {} # all PRs
+  schedule: [cron: '0 12 * * 3'] # every Wednesday at 12:00 UTC
 
 jobs:
-  build:
+  pytest:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
@@ -45,7 +48,7 @@ jobs:
         python setup.py bdist_wheel
         python patcher.py dist/
         pip install dist/*.whl
-        pip install -r requirements-test.txt
+        pip install -r requirements-dev.txt
 
     - uses: actions/upload-artifact@v4
       with:
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..4bb0f3f
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,6 @@
+nltk
+pytest
+pytest-subtests
+pytest-cov
+pytest-json-report
+ruff
diff --git a/requirements-test.txt b/requirements-test.txt
deleted file mode 100644
index 6fa2de4..0000000
--- a/requirements-test.txt
+++ /dev/null
@@ -1 +0,0 @@
-nltk
\ No newline at end of file
diff --git a/src/pyterrier_pisa/__init__.py b/src/pyterrier_pisa/__init__.py
index fac268e..f73ba77 100644
--- a/src/pyterrier_pisa/__init__.py
+++ b/src/pyterrier_pisa/__init__.py
@@ -1,7 +1,6 @@
 from typing import List
 import numpy as np
 import json
-import sys
 from pathlib import Path
 import tempfile
 import os
@@ -121,9 +120,12 @@ def __init__(self,
       overwrite: If True, the index will be overwritten if it already exists. Defaults to False.
     """
     super().__init__(path)
-    if stemmer is not None: stemmer = PisaStemmer(stemmer)
-    if index_encoding is not None: index_encoding = PisaIndexEncoding(index_encoding)
-    if stops is not None and not isinstance(stops, list): stops = PisaStopwords(stops)
+    if stemmer is not None:
+      stemmer = PisaStemmer(stemmer)
+    if index_encoding is not None:
+      index_encoding = PisaIndexEncoding(index_encoding)
+    if stops is not None and not isinstance(stops, list):
+      stops = PisaStopwords(stops)
     if (_old_metadata := (self.path/'pt_pisa_config.json').exists()) or (self.path/'pt_meta.json').exists():
       if _old_metadata:
         with (self.path/'pt_pisa_config.json').open('rt') as fin:
@@ -135,9 +137,12 @@ def __init__(self,
         stemmer = PisaStemmer(config['stemmer'])
       if stemmer.value != config['stemmer']:
         warn(f'requested stemmer={stemmer.value}, but index was constructed with {config["stemmer"]}')
-    if stemmer is None: stemmer = PISA_INDEX_DEFAULTS['stemmer']
-    if index_encoding is None: index_encoding = PISA_INDEX_DEFAULTS['index_encoding']
-    if stops is None: stops = PISA_INDEX_DEFAULTS['stops']
+    if stemmer is None:
+      stemmer = PISA_INDEX_DEFAULTS['stemmer']
+    if index_encoding is None:
+      index_encoding = PISA_INDEX_DEFAULTS['index_encoding']
+    if stops is None:
+      stops = PISA_INDEX_DEFAULTS['stops']
     self.text_field = text_field
     self.stemmer = stemmer
     self.index_encoding = index_encoding
@@ -329,7 +334,7 @@ def get_corpus_iter(self, field='toks', verbose=True):
     assert self.built()
     assert (self.path/'fwd').exists(), "get_corpus_iter requires a fwd index"
     m = np.memmap(self.path/'fwd', mode='r', dtype=np.uint32)
-    lexicon = [l.strip() for l in (self.path/'fwd.terms').open('rt')]
+    lexicon = [term.strip() for term in (self.path/'fwd.terms').open('rt')]
     idx = 2
     it = iter((self.path/'fwd.documents').open('rt'))
     if verbose:
diff --git a/src/pyterrier_pisa/indexers.py b/src/pyterrier_pisa/indexers.py
index 967cd82..5c1defe 100644
--- a/src/pyterrier_pisa/indexers.py
+++ b/src/pyterrier_pisa/indexers.py
@@ -87,26 +87,26 @@ def _index(self, it):
         inv_score = defaultdict(list)
         lens = []
         for doc in batch:
-          l = 0
+          doclen = 0
           f_docs.write(doc['docno']+'\n')
           for term, score in doc[self.text_field].items():
             score = int(score * self.scale)
             if score <= 0:
               continue
-            l += score
+            doclen += score
             if term not in lexicon:
               lexicon[term] = len(lexicon)
               f_lex.write(term+'\n')
             inv_did[lexicon[term]].append(docid)
             inv_score[lexicon[term]].append(int(score))
-          lens.append(l)
+          lens.append(doclen)
           docid += 1
         with (path/f'inv.batch.{bidx}.docs').open('wb') as f_did, (path/f'inv.batch.{bidx}.freqs').open('wb') as f_score, (path/f'inv.batch.{bidx}.sizes').open('wb') as f_len:
           f_did.write(np.array([1, len(batch)], dtype=np.uint32).tobytes())
           for i in range(len(lexicon)):
-            l = len(inv_did[i])
-            f_did.write(np.array([l] + inv_did[i], dtype=np.uint32).tobytes())
-            f_score.write(np.array([l] + inv_score[i], dtype=np.uint32).tobytes())
+            doclen = len(inv_did[i])
+            f_did.write(np.array([doclen] + inv_did[i], dtype=np.uint32).tobytes())
+            f_score.write(np.array([doclen] + inv_score[i], dtype=np.uint32).tobytes())
           f_len.write(np.array([len(lens)] + lens, dtype=np.uint32).tobytes())
     _pisathon.merge_inv(str(path/'inv'), bidx+1, len(lexicon))
     for i in range(bidx+1):
@@ -128,9 +128,9 @@ def _index(self, it):
       for term in _logger.pbar(sorted(lexicon), desc='re-mapping term ids'):
         f_lex.write(f'{term}\n')
         i = lexicon[term]
-        start, l = offsets_lens[i]
-        f_docs.write(in_docs[start:start+l])
-        f_freqs.write(in_freqs[start:start+l])
+        start, doclen = offsets_lens[i]
+        f_docs.write(in_docs[start:start+doclen])
+        f_freqs.write(in_freqs[start:start+doclen])
     del in_docs # close mmap
     del in_freqs # close mmap
     (path/'inv.docs.tmp').unlink()