diff --git a/.circleci/config.yml b/.circleci/config.yml
index 59aebf33..5cadd5dd 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,46 +1,62 @@
-version: 2
+version: "2.1"
-workflows:
- version: 2
- test:
- jobs:
- - python_3.8
- - python_3.9
- - python_3.10
-
-template: &template
- docker:
- - image: ubuntu:bionic
- steps:
- - checkout
- - run:
- name: Set up miniconda
- command: |
- apt update
- apt install -y wget
- cd $HOME
- wget "https://repo.anaconda.com/miniconda/Miniconda3-4.7.10-Linux-x86_64.sh" -O miniconda.sh
- printf '%s' "8a324adcc9eaf1c09e22a992bb6234d91a94146840ee6b11c114ecadafc68121 miniconda.sh" | sha256sum -c
- bash miniconda.sh -b -p $HOME/miniconda
- - run:
- name: Set up environment, dependencies and run tests
- command: |
- export PATH="$HOME/miniconda/bin:$PATH"
- conda create -n sequentia python=$PYTHON_VERSION -y
- source activate sequentia
- pip install ".[dev]"
- pytest lib/test --disable-pytest-warnings
+orbs:
+ python: circleci/python@2.1.1
+ coveralls: coveralls/coveralls@2.2.1
jobs:
- python_3.8:
- <<: *template
- environment:
- PYTHON_VERSION: "3.8"
- python_3.9:
- <<: *template
- environment:
- PYTHON_VERSION: "3.9"
- python_3.10:
- <<: *template
- environment:
- PYTHON_VERSION: "3.10"
+ linting:
+ executor:
+ name: python/default
+ tag: "3.11"
+ steps:
+ - checkout
+ - python/install-packages:
+ pkg-manager: poetry
+ args: --only base
+ - run:
+ name: Linting
+ command: |
+ poetry run tox -e lint
+ tests:
+ parameters:
+ version:
+ type: string
+ executor:
+ name: python/default
+      tag: << parameters.version >>
+ steps:
+ - checkout
+ - python/install-packages:
+ pkg-manager: poetry
+ args: --only base
+ # - run:
+ # name: Docstring tests
+ # command: |
+ # poetry run tox -e tests.doctest
+ - run:
+ name: Unit tests
+ command: |
+ poetry run -- tox -e tests -- --cov
+ - coveralls/upload:
+          flag_name: << parameters.version >>
+ parallel: true
+ coverage:
+ executor:
+ name: python/default
+ steps:
+ - coveralls/upload:
+          carryforward: "3.11,3.12"
+ parallel_finished: true
+
+workflows:
+ checks:
+ jobs:
+ - linting
+ - tests:
+ matrix:
+ parameters:
+ version: ["3.11", "3.12"]
+ - coverage:
+ requires:
+ - tests
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
new file mode 100644
index 00000000..f507f561
--- /dev/null
+++ b/.github/CODEOWNERS
@@ -0,0 +1 @@
+* @eonu
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/bug.yml b/.github/ISSUE_TEMPLATE/bug.yml
new file mode 100644
index 00000000..e74e8350
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug.yml
@@ -0,0 +1,68 @@
+name: Report unexpected behaviour
+description: If you came across something unexpected, let us know here!
+labels: [bug, pending]
+
+body:
+ - type: checkboxes
+ id: exists
+ attributes:
+ label: Has this already been reported?
+      description: If you haven't already, please look through the existing issues to see if this bug has already been reported.
+ options:
+ - label: This is a new bug!
+ required: true
+
+ - type: textarea
+ id: expected-behaviour
+ attributes:
+ label: Expected behaviour
+ description: |
+ Please describe the behaviour that you expected to see.
+
+ If appropriate, provide any links to official Sequentia documentation that indicate this is the behaviour that is expected.
+ validations:
+ required: true
+
+ - type: textarea
+ id: observed-behaviour
+ attributes:
+ label: Observed behaviour
+ description: |
+ Please describe the unexpected behaviour that you observed.
+
+ Make sure to provide as much information as possible, so that we can investigate as thoroughly as we can.
+ validations:
+ required: true
+
+ - type: textarea
+ id: example
+ attributes:
+ label: Code to reproduce
+ description: >
+ Please provide a snippet of code that shows how to reproduce the bug,
+ making sure that it is [minimal and reproducible](https://stackoverflow.com/help/minimal-reproducible-example).
+
+ placeholder: |
+ import sequentia
+
+ ...
+ render: Python
+
+ - type: textarea
+ id: version
+ attributes:
+ label: Version details
+ description: |
+ To help us get to the root of the problem as fast as possible, please run the following command to display version information about:
+
+ - Python
+ - Sequentia
+ - Operating system
+
+ ```bash
+ python -c "import sequentia; print(sequentia.version.version_info())"
+ ```
+
+ render: Text
+ validations:
+ required: true
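
As an illustrative sketch, the command above prints combined version details in one call. The exact fields and formatting of `sequentia.version.version_info()` are an assumption here, not confirmed by this changeset:

```python
# Hypothetical illustration; the printed fields below are an assumption.
import sequentia

print(sequentia.version.version_info())
# sequentia: 2.0.0a1
# python: 3.11.3
# platform: Linux-x86_64
```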
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 00000000..c18f3334
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,5 @@
+blank_issues_enabled: true
+contact_links:
+ - name: Got a question?
+ url: "https://github.com/eonu/sequentia/discussions/new?category=questions"
+ about: Start a discussion on GitHub discussions where Sequentia developers and users can respond.
diff --git a/.github/ISSUE_TEMPLATE/feature.yml b/.github/ISSUE_TEMPLATE/feature.yml
new file mode 100644
index 00000000..93a3cdb4
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature.yml
@@ -0,0 +1,21 @@
+name: Request a new feature or improvement
+description: If you have a suggestion for something that might improve Sequentia, let us know here!
+labels: [feature, pending]
+
+body:
+ - type: checkboxes
+ id: exists
+ attributes:
+ label: Does this suggestion already exist?
+ description: If you haven't already, please look through the documentation and other existing issues to see if this feature is already implemented.
+ options:
+ - label: This is a new feature!
+ required: true
+
+ - type: textarea
+ id: feature-description
+ attributes:
+ label: Feature description
+ description: Please describe the new feature or improvement that you would like.
+ validations:
+ required: true
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 00000000..5697917b
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,13 @@
+## Description
+
+
+
+
+
+
+## Checklist
+
+- [ ] I have added new tests (if necessary).
+- [ ] I have ensured that tests and coverage are passing on CI.
+- [ ] I have updated any relevant documentation (if necessary).
+- [ ] I have used a descriptive pull request title.
diff --git a/.github/workflows/create-github-release.yml b/.github/workflows/create-github-release.yml
new file mode 100644
index 00000000..2987102b
--- /dev/null
+++ b/.github/workflows/create-github-release.yml
@@ -0,0 +1,25 @@
+name: Create GitHub release
+
+on:
+ push:
+ branches:
+ - master
+
+jobs:
+ release:
+ runs-on: ubuntu-latest
+ permissions:
+ contents: write
+ steps:
+ - name: Check out repository code
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+ - name: "Get previous tag"
+ id: latest-tag
+ uses: "WyriHaximus/github-action-get-previous-tag@v1"
+ - uses: ncipollo/release-action@v1
+ with:
+ tag: ${{ steps.latest-tag.outputs.tag }}
+ generateReleaseNotes: true
+ token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/create-pypi-release.yml b/.github/workflows/create-pypi-release.yml
new file mode 100644
index 00000000..f2209060
--- /dev/null
+++ b/.github/workflows/create-pypi-release.yml
@@ -0,0 +1,19 @@
+name: Create PyPI release
+
+on:
+ push:
+ branches:
+ - master
+
+jobs:
+ release:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Check out repository code
+ uses: actions/checkout@v4
+ - name: Build and publish to PyPI
+ uses: JRubics/poetry-publish@v1.17
+ with:
+ python_version: "3.11.3"
+ poetry_version: "==1.7.1"
+ pypi_token: ${{ secrets.PYPI_TOKEN }}
diff --git a/.github/workflows/create-release-pr.yml b/.github/workflows/create-release-pr.yml
new file mode 100644
index 00000000..e8f76aa0
--- /dev/null
+++ b/.github/workflows/create-release-pr.yml
@@ -0,0 +1,60 @@
+name: Create release PR
+
+on:
+ workflow_dispatch:
+ inputs:
+ version:
+ description: Version
+ required: true
+
+jobs:
+ create-pull-request:
+ permissions: write-all
+ runs-on: ubuntu-latest
+ steps:
+ - name: Check out repository code
+ uses: actions/checkout@v4
+ with:
+ ref: dev
+ fetch-depth: 0
+ - name: Install Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: 3.11.3
+ - name: Install Poetry
+ uses: abatilo/actions-poetry@v2
+ with:
+ poetry-version: 1.7.1
+ - name: Install base dependencies
+ run: poetry install --sync --only base
+ - name: Bump version
+ run: |
+ poetry run -q invoke release.build -- -v ${{ github.event.inputs.version }}
+ - name: Update changelog
+ uses: orhun/git-cliff-action@v2
+ id: cliff-changelog
+ with:
+ config: cliff.toml
+ args: --tag ${{ github.event.inputs.version }}
+ env:
+ OUTPUT: CHANGELOG.md
+ - name: Get changelog entry
+ uses: orhun/git-cliff-action@v2
+ id: cliff-entry
+ with:
+ config: cliff.toml
+ args: --unreleased --strip header
+ env:
+ OUTPUT: ENTRY.md
+ - name: Create pull request
+ uses: peter-evans/create-pull-request@v5.0.2
+ with:
+ token: ${{ secrets.GITHUB_TOKEN }}
+ commit-message: "release: v${{ github.event.inputs.version }}"
+ title: "release: v${{ github.event.inputs.version }}"
+ body: "${{ steps.cliff-entry.outputs.content }}"
+ branch: release/${{ github.event.inputs.version }}
+ - uses: rickstaa/action-create-tag@v1
+ with:
+ tag: v${{ github.event.inputs.version }}
+ github_token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/semantic-pull-request.yml b/.github/workflows/semantic-pull-request.yml
new file mode 100644
index 00000000..187552e0
--- /dev/null
+++ b/.github/workflows/semantic-pull-request.yml
@@ -0,0 +1,43 @@
+name: Enforce semantic PR title
+
+on:
+ pull_request_target:
+ types:
+ - opened
+ - edited
+ - synchronize
+
+permissions:
+ pull-requests: read
+
+jobs:
+ main:
+ name: validate
+ runs-on: ubuntu-latest
+ steps:
+ - uses: amannn/action-semantic-pull-request@v5
+ with:
+ subjectPattern: ^(?![A-Z]).+$
+ subjectPatternError: |
+ The subject "{subject}" found in the pull request title "{title}"
+ didn't match the configured pattern. Please ensure that the subject
+ doesn't start with an uppercase character.
+ types: |
+ build
+ chore
+ ci
+ docs
+ feat
+ fix
+ perf
+ refactor
+ release
+ revert
+ style
+ tests
+ scopes: |
+ deps
+ git
+ pkg
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
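
The `subjectPattern` above is a plain regular expression, so its effect can be sanity-checked locally. A minimal sketch (the PR titles are hypothetical):

```python
import re

# The workflow's subjectPattern: the subject must not start with an
# uppercase character.
subject_pattern = re.compile(r"^(?![A-Z]).+$")

assert subject_pattern.match("add median filter transform")      # accepted
assert not subject_pattern.match("Add median filter transform")  # rejected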
diff --git a/.gitignore b/.gitignore
index f2282c09..5025358b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,7 +8,7 @@ ehthumbs.db
Thumbs.db
# Byte-compiled / optimized / DLL files
-__pycache__/
+__pycache__
*.py[cod]
*$py.class
@@ -73,6 +73,7 @@ docs/_build
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
Pipfile.lock
+poetry.lock
# Environments
.env
@@ -87,3 +88,9 @@ venv.bak/
.spyderproject
.spyproject
.vscode
+
+# Ruff
+.ruff_cache
+
+# Changelog entry
+ENTRY.md
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..82305145
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,28 @@
+repos:
+ # license notice
+ - repo: local
+ hooks:
+ - id: notice
+ name: notice
+ entry: poetry run python notice.py
+ language: system
+ types: [python]
+ always_run: true
+ pass_filenames: false
+ # ruff check (w/autofix)
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.1.3 # should match version in pyproject.toml
+ hooks:
+ - id: ruff
+ args: [--fix, --exit-non-zero-on-fix]
+ # ruff format
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.1.3 # should match version in pyproject.toml
+ hooks:
+ - id: ruff-format
+ # # pydoclint - docstring formatting
+ # - repo: https://github.com/jsh9/pydoclint
+ # rev: 0.3.8
+ # hooks:
+ # - id: pydoclint
+ # args: [--config=pyproject.toml]
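
The local `notice` hook above runs a repository script, `notice.py`, which is not shown in this changeset. A minimal sketch of what such a hook might do, assuming it maintains the license header that appears on Python files elsewhere in this diff (the package directory name is also an assumption):

```python
# A sketch only; the real notice.py may differ.
from pathlib import Path

NOTICE = """\
# Copyright (c) 2019-2025 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
"""


def add_notice(path: Path) -> None:
    """Prepend the license notice to a Python file if it is missing."""
    text = path.read_text()
    if not text.startswith(NOTICE):
        path.write_text(NOTICE + "\n" + text)


if __name__ == "__main__":
    # Assumed package directory.
    for file in Path("sequentia").rglob("*.py"):
        add_notice(file)
```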
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 00000000..de9dea11
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,16 @@
+version: 2
+
+build:
+ os: ubuntu-22.04
+ tools:
+ python: "3.11"
+ jobs:
+ post_create_environment:
+ - pip install poetry
+ - poetry config virtualenvs.create false
+ post_install:
+ - VIRTUAL_ENV=$READTHEDOCS_VIRTUALENV_PATH poetry install --only base,main,docs
+
+sphinx:
+ configuration: docs/source/conf.py
+ fail_on_warning: false
diff --git a/.readthedocs.yml b/.readthedocs.yml
deleted file mode 100644
index 991ad00a..00000000
--- a/.readthedocs.yml
+++ /dev/null
@@ -1,9 +0,0 @@
-version: 2
-python:
- version: "3.8"
- install:
- - method: pip
- path: .
- extra_requirements:
- - dev
- system_packages: true
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index 9206db16..fe583f6c 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -7,7 +7,7 @@ Our open source community strives to:
- **Be friendly and patient.**
- **Be welcoming**: We strive to be a community that welcomes and supports people of all backgrounds and identities. This includes, but is not limited to members of any race, ethnicity, culture, national origin, colour, immigration status, social and economic class, educational level, sex, sexual orientation, gender identity and expression, age, size, family status, political belief, religion, and mental and physical ability.
- **Be considerate**: Your work will be used by other people, and you in turn will depend on the work of others. Any decision you take will affect users and colleagues, and you should take those consequences into account when making decisions. Remember that we're a world-wide community, so you might not be communicating in someone else's primary language.
-- **Be respectful**: Not all of us will agree all the time, but disagreement is no excuse for poor behavior and poor manners. We might all experience some frustration now and then, but we cannot allow that frustration to turn into a personal attack. It’s important to remember that a community where people feel uncomfortable or threatened is not a productive one.
+- **Be respectful**: Not all of us will agree all the time, but disagreement is no excuse for poor behavior and poor manners. We might all experience some frustration now and then, but we cannot allow that frustration to turn into a personal attack. It’s important to remember that a community where people feel uncomfortable or threatened is not a productive one.
- **Be careful in the words that you choose**: we are a community of professionals, and we conduct ourselves professionally. Be kind to others. Do not insult or put down other participants. Harassment and other exclusionary behavior aren't acceptable. This includes, but is not limited to:
- Violent threats or language directed against another person.
- Discriminatory jokes and language.
@@ -34,7 +34,7 @@ If you experience or witness unacceptable behavior—or have any other concerns
- Your contact information.
- Names (real, nicknames, or pseudonyms) of any individuals involved. If there are additional witnesses, please
-include them as well. Your account of what occurred, and if you believe the incident is ongoing. If there is a publicly available record (e.g. a mailing list archive or a public IRC logger), please include a link.
+ include them as well. Your account of what occurred, and if you believe the incident is ongoing. If there is a publicly available record (e.g. a mailing list archive or a public IRC logger), please include a link.
- Any additional information that may be helpful.
After filing a report, a representative will contact you personally. If the person who is harassing you is part of the response team, they will recuse themselves from handling your incident. A representative will then review the incident, follow up with any additional questions, and make a decision as to how to respond. We will respect confidentiality requests for the purpose of protecting victims of abuse.
@@ -46,3 +46,10 @@ Anyone asked to stop unacceptable behavior is expected to comply immediately. If
This code of conduct is based on the [Open Code of Conduct](https://github.com/todogroup/opencodeofconduct) from the [TODOGroup](http://todogroup.org).
We are thankful for their work and all the communities who have paved the way with code of conducts.
+
+---
+
+
+ Sequentia © 2019-2025, Edwin Onuonga - Released under the MIT license.
+ Authored and maintained by Edwin Onuonga.
+
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index af8d66a5..1430004f 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,28 +1,35 @@
# Contributing
-As Sequentia is an open source library, any contributions from the community are greatly appreciated. This document details the guidelines for making contributions to Sequentia.
+As Sequentia is an open source library, any contributions from the community are greatly appreciated.
+This document details the guidelines for making contributions to Sequentia.
## Reporting issues
Prior to reporting an issue, please ensure:
-- [ ] You have used the search utility provided on GitHub issues to look for similar issues.
-- [ ] You have checked the documentation (for the version of Sequentia you are using).
-- [ ] You are using the latest version of Sequentia (if possible).
+- [x] You have used the search utility provided on GitHub issues to look for similar issues.
+- [x] You have checked the documentation (for the version of Sequentia you are using).
+- [x] You are using the latest stable version of Sequentia (if possible).
## Making changes to Sequentia
-- **Add tests**: Your pull request won't be accepted if it doesn't have any tests.
+- **Add tests**:
+  Your pull request won't be accepted if it is missing any necessary tests.
-- **Document any change in behaviour**: Make sure the README and all other relevant documentation is kept up-to-date.
+- **Document any change in behaviour**:
+ Make sure all relevant documentation is kept up-to-date.
-- **Create topic branches**: Will not pull from your master branch!
+- **Create topic branches**:
+  We will not pull from your master branch!
-- **One pull request per feature**: If you wish to add more than one new feature, please make multiple pull requests.
+- **One pull request per feature**:
+ If you wish to add more than one new feature, make multiple pull requests.
-- **Meaningful commit messages**: Make sure each individual commit in your pull request has a meaningful message.
+- **Meaningful commit messages**:
+ Each commit in your pull request should have a meaningful message.
-- **De-clutter commit history**: If you had to make multiple intermediate commits while developing, please squash them before making your pull request.
+- **De-clutter commit history**:
+ If you had to make multiple intermediate commits while developing, please squash them before making your pull request.
Alternatively, add a note to the PR asking for your changes to be squashed and merged when they are ready.
### Making pull requests
@@ -31,17 +38,73 @@ Please make new branches based on the current `dev` branch, and merge your PR ba
### Installing dependencies
-If you intend to help contribute to Sequentia, you will need some additional dependencies for running tests, notebooks and generating documentation.
+To install all dependencies and pre-commit hooks for development, ensure you have [Poetry](https://python-poetry.org/) (1.6.1+) installed and run:
-You can specify the `dev` extra when installing Sequentia to do this.
+```console
+make
+```
+
+### Running tests
+
+This repository relies on the use of [Tox](https://tox.wiki/en/4.11.3/) for running tests in virtual environments.
+
+- Run **ALL tests** in a virtual environment:
+ ```console
+ # a.k.a. poetry run invoke tests.install tests.unit
+ poetry run tox -e tests
+ ```
+
+### Linting and formatting
+
+This repository relies on the use of:
+
+- [Ruff](https://github.com/astral-sh/ruff) for linting and formatting Python source code,
+- [Tox](https://tox.wiki/en/4.11.3/) for running linting and formatting in a virtual environment.
+
+To lint the source code using Ruff and Pydoclint with Tox:
+
+```console
+# a.k.a. poetry run invoke lint.install lint.check
+poetry run tox -e lint
+```
+
+To format the source code and attempt to auto-fix any linting issues using Ruff with Tox:
```console
-# from inside the sequentia repository folder
-pip install -e .[dev]
+# a.k.a. poetry run invoke lint.install lint.format
+poetry run tox -e format
```
-> **Note**: On some shells you may have to escape, e.g. `pip install -e ".[dev]"`
+Pre-commit hooks will prevent you from making a commit if linting fails or your code is not formatted correctly.
+
+### Documentation
+
+Package documentation is automatically produced from docstrings using [Sphinx](https://www.sphinx-doc.org/en/master/).
+The package also uses [Tox](https://tox.wiki/en/4.11.3/) for building documentation inside a virtual environment.
+
+To build the package documentation and automatically serve it via a local HTTP server while watching for source code changes, run:
+
+```console
+# a.k.a. poetry run invoke docs.install docs.build
+poetry run tox -e docs
+```
+
+This will start a server running on `localhost:8000` by default.
+
+To only build the static documentation HTML files without serving them or watching for changes, run:
+
+```console
+# a.k.a. poetry run invoke docs.install docs.build --no-watch
+poetry run tox -e docs -- --no-watch
+```
## License
-By contributing, you agree that your contributions will be licensed under the same [MIT License](/LICENSE) that covers this repository.
+By contributing, you agree that your contributions will be licensed under the repository's [MIT License](/LICENSE).
+
+---
+
+
+ Sequentia © 2019-2025, Edwin Onuonga - Released under the MIT license.
+ Authored and maintained by Edwin Onuonga.
+
diff --git a/LICENSE b/LICENSE
index 16449d11..c5d87015 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
MIT License
-Copyright (c) 2019-2023 Edwin Onuonga (eonu)
+Copyright (c) 2019-2025 Edwin Onuonga (eonu)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..e83318f3
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,19 @@
+# This Makefile is based on https://github.com/pydantic/pydantic/blob/main/Makefile.
+
+.DEFAULT_GOAL := dev
+
+# check that poetry is installed
+.PHONY: .check-poetry
+.check-poetry:
+	@poetry -V || (echo 'Please install Poetry: https://python-poetry.org/'; exit 1)
+
+# install invoke and tox
+.PHONY: base
+base: .check-poetry
+ poetry install --sync --only base
+
+# install development dependencies
+.PHONY: dev
+dev: .check-poetry
+ poetry install --sync --only base
+ poetry run invoke install
diff --git a/README.md b/README.md
index fcedee38..f90a15ce 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
- HMM and DTW-based sequence machine learning algorithms in Python following an sklearn-like interface.
+ Scikit-Learn compatible HMM and DTW based sequence machine learning algorithms in Python.
@@ -20,6 +20,9 @@
+
+
+
@@ -50,10 +53,15 @@ Some examples of how Sequentia can be used on sequence data include:
- predicting motion intent for gesture control from sEMG signals,
- classifying hand-written characters according to their pen-tip trajectories.
+### Why Sequentia?
+
+- **Simplicity and interpretability**: Sequentia offers a limited set of machine learning algorithms, chosen specifically to be more interpretable and easier to configure than more complex alternatives such as recurrent neural networks and transformers, while maintaining a high level of effectiveness.
+- **Familiar and user-friendly**: To fit more seamlessly into the workflow of data science practitioners, Sequentia follows the ubiquitous Scikit-Learn API, providing a familiar model development process for many, as well as enabling wider access to the rapidly growing Scikit-Learn ecosystem.
+
## Build Status
-| `master` | `dev` |
-| -------- | ------|
+| `master` | `dev` |
+| -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [![CircleCI Build (Master)](https://img.shields.io/circleci/build/github/eonu/sequentia/master?logo=circleci&style=flat-square)](https://app.circleci.com/pipelines/github/eonu/sequentia?branch=master) | [![CircleCI Build (Development)](https://img.shields.io/circleci/build/github/eonu/sequentia/dev?logo=circleci&style=flat-square)](https://app.circleci.com/pipelines/github/eonu/sequentia?branch=dev) |
## Features
@@ -84,36 +92,19 @@ Parameter estimation with the Baum-Welch algorithm and prediction with the forwa
### Scikit-Learn compatibility
-Sequentia aims to follow the Scikit-Learn interface for estimators and transformations,
-as well as to be largely compatible with three core Scikit-Learn modules to improve the ease of model development:
-[`preprocessing`](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing), [`model_selection`](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection) and [`pipeline`](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.pipeline).
-
-While there are many other modules, maintaining full compatibility with Scikit-Learn is challenging and many of its features are inapplicable to sequential data, therefore we only focus on the relevant core modules.
+**Sequentia (≥2.0) is fully compatible with the Scikit-Learn API (≥1.4), enabling rapid development and prototyping of sequential models.**
-Despite some deviation from the Scikit-Learn interface in order to accommodate sequences, the following features are currently compatible with Sequentia.
-
-- [x] [`preprocessing`](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing)
- - [x] [`FunctionTransformer`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html#sklearn.preprocessing.FunctionTransformer) — via an adapted class definition
- - [x] Function-based transformations (stateless)
- - [x] Class-based transformations (stateful)
-- [ ] [`pipeline`](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.pipeline)
- - [x] [`Pipeline`](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline) — via an adapted class definition
- - [ ] [`FeatureUnion`](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.FeatureUnion.html#sklearn.pipeline.FeatureUnion)
-- [ ] [`model_selection`](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection)
+In most cases, the only necessary change is to add a `lengths` keyword argument to provide sequence length information, e.g. `fit(X, y, lengths=lengths)` instead of `fit(X, y)`.
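
For instance, a minimal sketch with `KNNClassifier`, mirroring the quickstart below (the data here is arbitrary):

```python
import numpy as np
from sequentia.models import KNNClassifier

# Three concatenated sequences of lengths 3, 5 and 2 (10 frames, 2 features).
X, lengths = np.random.randn(10, 2), np.array([3, 5, 2])
y = np.array([0, 1, 1])

clf = KNNClassifier(k=1)
clf.fit(X, y, lengths=lengths)            # instead of clf.fit(X, y)
y_pred = clf.predict(X, lengths=lengths)  # instead of clf.predict(X)
```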
## Installation
-You can install Sequentia using `pip`.
-
-### Stable
-
-The latest stable version of Sequentia can be installed with the following command.
+The latest stable version of Sequentia can be installed with the following command:
```console
pip install sequentia
```
-#### C library compilation
+### C library compilation
For optimal performance when using any of the k-NN based models, it is important that `dtaidistance` C libraries are compiled correctly.
@@ -126,16 +117,6 @@ from dtaidistance import dtw
dtw.try_import_c()
```
-### Pre-release
-
-Pre-release versions include new features which are in active development and may change unpredictably.
-
-The latest pre-release version can be installed with the following command.
-
-```console
-pip install --pre sequentia
-```
-
### Development
Please see the [contribution guidelines](/CONTRIBUTING.md) to see installation instructions for contributing to Sequentia.
@@ -155,10 +136,10 @@ import numpy as np
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
+from sklearn.pipeline import Pipeline
from sequentia.models import KNNClassifier
-from sequentia.pipeline import Pipeline
-from sequentia.preprocessing import IndependentFunctionTransformer, mean_filter
+from sequentia.preprocessing import IndependentFunctionTransformer, median_filter
# Create input data
# - Sequentia expects sequences to be concatenated into a single array
@@ -179,7 +160,7 @@ X = np.array([
[1.21, 5.8 ],
# Sequence 3 - Length 2
[1.7 , 6.22],
- [2.01, 5.49]
+ [2.01, 5.49],
])
# Sequence lengths
@@ -189,23 +170,23 @@ lengths = np.array([3, 5, 2])
y = np.array([0, 1, 1])
# Create a transformation pipeline that feeds into a KNNClassifier
-# 1. Individually denoise each sequence by applying a mean filter for each feature
+# 1. Individually denoise each sequence by applying a median filter for each feature
# 2. Individually standardize each sequence by subtracting the mean and dividing by the s.d. for each feature
# 3. Reduce the dimensionality of the data to a single feature by using PCA
# 4. Pass the resulting transformed data into a KNNClassifier
pipeline = Pipeline([
- ('denoise', IndependentFunctionTransformer(mean_filter)),
+ ('denoise', IndependentFunctionTransformer(median_filter)),
('scale', IndependentFunctionTransformer(scale)),
('pca', PCA(n_components=1)),
('knn', KNNClassifier(k=1))
])
# Fit the pipeline to the data - lengths must be provided
-pipeline.fit(X, y, lengths)
+pipeline.fit(X, y, lengths=lengths)
# Predict classes for the sequences and calculate accuracy - lengths must be provided
-y_pred = pipeline.predict(X, lengths)
-acc = pipeline.score(X, y, lengths)
+y_pred = pipeline.predict(X, lengths=lengths)
+acc = pipeline.score(X, y, lengths=lengths)
```
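
Continuing the snippet above, a fitted pipeline accepts new data in the same concatenated format; a brief sketch (the sequence values are arbitrary):

```python
# Predict the class of a single new sequence of length 2.
X_new = np.array([[1.0, 6.0], [1.2, 5.7]])
y_new = pipeline.predict(X_new, lengths=np.array([2]))
```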
## Acknowledgments
@@ -282,11 +263,11 @@ All contributions to this repository are greatly appreciated. Contribution guide
Sequentia is released under the [MIT](https://opensource.org/licenses/MIT) license.
Certain parts of the source code are heavily adapted from [Scikit-Learn](scikit-learn.org/).
-Such files contain copy of [their license](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING).
+Such files contain a copy of [their license](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING).
---
- Sequentia © 2019-2023, Edwin Onuonga - Released under the MIT license.
+ Sequentia © 2019-2025, Edwin Onuonga - Released under the MIT license.
Authored and maintained by Edwin Onuonga.
-
\ No newline at end of file
+
diff --git a/CHANGELOG.md b/cliff.toml
similarity index 87%
rename from CHANGELOG.md
rename to cliff.toml
index 95f11041..fb980ce5 100644
--- a/CHANGELOG.md
+++ b/cliff.toml
@@ -1,5 +1,23 @@
+# git-cliff ~ default configuration file
+# https://git-cliff.org/docs/configuration
+#
+# Lines starting with "#" are comments.
+# Configuration options are organized into tables and keys.
+# See documentation for more information on available options.
+
+[changelog]
+# changelog header
+header = """
# Changelog
+All notable changes to this project will be documented in this file.
+
+---
+
+
+<details>
+<summary>
+  Click to see pre-2.0 changelog entries
+</summary>
+
## [1.1.1](https://github.com/eonu/sequentia/releases/tag/v1.1.1)
@@ -376,4 +394,81 @@
#### Major changes
-Nothing, initial release!
\ No newline at end of file
+Nothing, initial release!
+
+</details>
+
+"""
+# template for the changelog body
+# https://keats.github.io/tera/docs/#introduction
+body = """
+{% if version %}\
+ ## [{{ version | trim_start_matches(pat="v") }}] - {{ timestamp | date(format="%Y-%m-%d") }}
+{% else %}\
+ ## [unreleased]
+{% endif %}\
+{% for group, commits in commits | group_by(attribute="group") %}
+ ### {{ group | upper_first }}
+ {% for commit in commits %}
+{% if commit.message is matching("^.*\\(#\\d+\\)$") %}\
+ - {% if commit.breaking %}[**breaking**] {% endif %}{{ commit.message }}\
+{% endif %}\
+ {% endfor %}
+{% endfor %}\n
+"""
+# remove the leading and trailing whitespace from the template
+trim = true
+# changelog footer
+footer = """
+
+"""
+# postprocessors
+postprocessors = [
+ { pattern = '## \[([0-9]+\.[0-9]+\.[0-9]+[a-z0-9]*)\]', replace = "## [v${1}](https://github.com/eonu/sequentia/releases/tag/v${1})" },
+ { pattern = '\(#([0-9]+)\)', replace = "([#${1}](https://github.com/eonu/sequentia/issues/${1}))" },
+ { pattern = '\n\n\n', replace = "\n\n" },
+]
+
+[git]
+# parse the commits based on https://www.conventionalcommits.org
+conventional_commits = true
+# filter out the commits that are not conventional
+filter_unconventional = true
+# process each line of a commit as an individual commit
+split_commits = false
+# regex for preprocessing the commit messages
+commit_preprocessors = [
+ # { pattern = '\((\w+\s)?#([0-9]+)\)', replace = "([#${2}](/issues/${2}))"}, # replace issue numbers
+]
+
+# regex for parsing and grouping commits
+commit_parsers = [
+ { message = "^build", group = "Build System" },
+ # { message = "^chore|ci", group = "Miscellaneous Tasks" },
+ { message = "^doc", group = "Documentation" },
+ { message = "^feat", group = "Features" },
+ { message = "^fix", group = "Bug Fixes" },
+ { message = "^perf", group = "Performance" },
+ { message = "^refactor", group = "Refactor" },
+ # { message = "^release", group = "Release" },
+ { message = "^revert", group = "Reversions" },
+ { message = "^style", group = "Styling" },
+ { message = "^test", group = "Testing" },
+]
+# protect breaking changes from being skipped due to matching a skipping commit_parser
+protect_breaking_commits = false
+# filter out the commits that are not matched by commit parsers
+filter_commits = true
+# regex for matching git tags
+tag_pattern = "v[0-9].*"
+
+# regex for skipping tags
+skip_tags = "v0.1.0-beta.1"
+# regex for ignoring tags
+ignore_tags = ""
+# sort the tags topologically
+topo_order = false
+# sort the commits inside sections by oldest/newest order
+sort_commits = "oldest"
+# limit the number of commits included in the changelog.
+# limit_commits = 42
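
As a rough Python sketch of how this configuration behaves (the commit message is hypothetical): `commit_parsers` assigns each conventional commit to a changelog group, and the body template keeps only messages ending in a `(#123)`-style PR reference:

```python
import re

# Mirrors a subset of the commit_parsers table above.
parsers = [
    (r"^feat", "Features"),
    (r"^fix", "Bug Fixes"),
    (r"^doc", "Documentation"),
]

message = "feat(models): add median filter transform (#123)"

# Only commits ending in a PR reference survive the body template's filter.
if re.search(r"\(#\d+\)$", message):
    group = next(g for p, g in parsers if re.match(p, message))
    print(group)  # Features
```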
diff --git a/docs/README.md b/docs/README.md
deleted file mode 100644
index 275f93d6..00000000
--- a/docs/README.md
+++ /dev/null
@@ -1,25 +0,0 @@
-# Documentation
-
-This directory contains the Sphinx documentation for Sequentia.
-
-## Viewing the documentation
-
-You can view the documentation for different versions of Sequentia on [Read The Docs](https://sequentia.readthedocs.io/en/latest).
-
-## Building the documentation
-
-To build the documentation, you'll need to make sure you have the required dependencies installed.
-
-Once you've cloned the repository, you can do this by running the following command in the repository root.
-
-```console
-pip install .[dev]
-```
-
-Once the dependencies are installed, you can build the documentation with the following command (from the `docs` directory).
-
-```console
-sphinx-autobuild . _build/html --watch ../lib
-```
-
-This will pick up any changes made to the `docs` and `lib` directories and restart the Sphinx server.
diff --git a/docs/index.rst b/docs/index.rst
deleted file mode 100644
index 66341bae..00000000
--- a/docs/index.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-.. Sequentia documentation master file, created by
- sphinx-quickstart on Sat Dec 28 19:22:34 2019.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
-
-.. image:: /_static/images/logo.png
- :alt: Sequentia
- :width: 75px
- :target: https://github.com/eonu/sequentia
- :align: center
-
-Sequentia
-=========
-
-Sequentia is a Python package that provides various classification and regression algorithms for sequential data, including methods based on hidden Markov models and dynamic time warping.
-
-Features
---------
-
-.. toctree::
- :titlesonly:
-
- sections/models/index
- sections/preprocessing/index
- sections/datasets/index
-
-Documentation Search and Index
-------------------------------
-
-* :ref:`search`
-* :ref:`genindex`
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 00000000..747ffb7b
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.https://www.sphinx-doc.org/
+ exit /b 1
+)
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/sections/datasets/index.rst b/docs/sections/datasets/index.rst
deleted file mode 100644
index e61f8f14..00000000
--- a/docs/sections/datasets/index.rst
+++ /dev/null
@@ -1,56 +0,0 @@
-Datasets
-========
-
-.. toctree::
- :titlesonly:
-
- digits
- gene_families
-
-----
-
-Sequentia provides a selection of sample sequential datasets for quick experimentation.
-
-Each dataset follows the interface described below.
-
-API reference
--------------
-
-Class
-^^^^^
-
-.. autosummary::
-
- ~sequentia.utils.SequentialDataset
-
-Methods
-^^^^^^^
-
-.. autosummary::
-
- ~sequentia.utils.SequentialDataset.__init__
- ~sequentia.utils.SequentialDataset.copy
- ~sequentia.utils.SequentialDataset.iter_by_class
- ~sequentia.utils.SequentialDataset.load
- ~sequentia.utils.SequentialDataset.save
- ~sequentia.utils.SequentialDataset.split
-
-Properties
-^^^^^^^^^^
-
-.. autosummary::
-
- ~sequentia.utils.SequentialDataset.X
- ~sequentia.utils.SequentialDataset.X_lengths
- ~sequentia.utils.SequentialDataset.X_y
- ~sequentia.utils.SequentialDataset.X_y_lengths
- ~sequentia.utils.SequentialDataset.classes
- ~sequentia.utils.SequentialDataset.idxs
- ~sequentia.utils.SequentialDataset.lengths
- ~sequentia.utils.SequentialDataset.y
-
-|
-
-.. autoclass:: sequentia.utils.SequentialDataset
- :members:
-
diff --git a/docs/sections/preprocessing/pipeline.rst b/docs/sections/preprocessing/pipeline.rst
deleted file mode 100644
index 6b9120bc..00000000
--- a/docs/sections/preprocessing/pipeline.rst
+++ /dev/null
@@ -1,41 +0,0 @@
-Pipeline
-========
-
-Before fitting and using a model, it is common to apply a sequence of preprocessing steps to data.
-
-Pipelines can be used to wrap preprocessing transformations as well as a model into a single estimator,
-making it more convenient to reapply the transformations and make predictions on new data.
-
-The :class:`.Pipeline` class implements this feature and is based on :class:`sklearn.pipeline.Pipeline`.
-
-API reference
--------------
-
-Class
-^^^^^
-
-.. autosummary::
-
- ~sequentia.pipeline.Pipeline
-
-Methods
-^^^^^^^
-
-.. autosummary::
-
- ~sequentia.pipeline.Pipeline.__init__
- ~sequentia.pipeline.Pipeline.fit
- ~sequentia.pipeline.Pipeline.fit_predict
- ~sequentia.pipeline.Pipeline.fit_transform
- ~sequentia.pipeline.Pipeline.inverse_transform
- ~sequentia.pipeline.Pipeline.predict
- ~sequentia.pipeline.Pipeline.predict_proba
- ~sequentia.pipeline.Pipeline.score
- ~sequentia.pipeline.Pipeline.transform
-
-|
-
-.. autoclass:: sequentia.pipeline.Pipeline
- :members:
- :inherited-members:
- :exclude-members: decision_function, get_feature_names_out, get_params, set_params, set_output, predict_log_proba, score_samples, feature_names_in_
diff --git a/docs/source/__init__.py b/docs/source/__init__.py
new file mode 100644
index 00000000..1c085491
--- /dev/null
+++ b/docs/source/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Sphinx documentation for Sequentia."""
diff --git a/docs/_static/css/toc.css b/docs/source/_static/css/toc.css
similarity index 100%
rename from docs/_static/css/toc.css
rename to docs/source/_static/css/toc.css
diff --git a/docs/_static/images/classifier.png b/docs/source/_static/images/classifier.png
similarity index 100%
rename from docs/_static/images/classifier.png
rename to docs/source/_static/images/classifier.png
diff --git a/docs/_static/images/covariance_types.png b/docs/source/_static/images/covariance_types.png
similarity index 100%
rename from docs/_static/images/covariance_types.png
rename to docs/source/_static/images/covariance_types.png
diff --git a/docs/_static/images/hmm.png b/docs/source/_static/images/hmm.png
similarity index 100%
rename from docs/_static/images/hmm.png
rename to docs/source/_static/images/hmm.png
diff --git a/docs/_static/images/logo.png b/docs/source/_static/images/logo.png
similarity index 100%
rename from docs/_static/images/logo.png
rename to docs/source/_static/images/logo.png
diff --git a/docs/_static/images/topologies.svg b/docs/source/_static/images/topologies.svg
similarity index 100%
rename from docs/_static/images/topologies.svg
rename to docs/source/_static/images/topologies.svg
diff --git a/docs/conf.py b/docs/source/conf.py
similarity index 54%
rename from docs/conf.py
rename to docs/source/conf.py
index 55c959c0..00895b3d 100644
--- a/docs/conf.py
+++ b/docs/source/conf.py
@@ -1,30 +1,29 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
+"""Sphinx configuration file for Sequentia documentation."""
+
# -- Path setup --------------------------------------------------------------
-import sys, os
+import os
+import sys
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-try:
- # Sequentia is installed
- import sequentia
-except ImportError:
- # Sequentia is run from its source checkout
- sys.path.insert(0, os.path.abspath('../lib'))
- import sequentia
+sys.path.insert(0, os.path.abspath("../.."))
# -- Project information -----------------------------------------------------
-project = sequentia.__name__
-author = sequentia.__author__
-copyright = sequentia.__copyright__
-release = sequentia.__version__
+project = "sequentia"
+copyright = "2019-2025, Sequentia Developers" # noqa: A001
+author = "Edwin Onuonga (eonu)"
+release = "2.0.0a1"
# -- General configuration ---------------------------------------------------
@@ -32,46 +31,51 @@
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
- 'sphinx.ext.autodoc',
- 'sphinx.ext.autosummary',
- 'sphinx.ext.mathjax',
- 'sphinx.ext.viewcode',
- 'sphinx.ext.intersphinx',
- 'numpydoc',
- 'm2r2',
+ "sphinx.ext.autodoc",
+ "sphinx.ext.napoleon",
+ "sphinx.ext.autosummary",
+ "sphinx.ext.mathjax",
+ # 'sphinx.ext.viewcode',
+ "sphinx.ext.intersphinx",
+ # "numpydoc",
+ "enum_tools.autoenum",
]
intersphinx_mapping = {
- 'numpy': ('https://numpy.org/doc/stable/', None),
- 'sklearn': ('https://scikit-learn.org/stable/', None),
+ "python": ("https://docs.python.org/3", None),
+ "numpy": ("https://numpy.org/doc/stable/", None),
+ "sklearn": ("https://scikit-learn.org/stable/", None),
+ "hmmlearn": ("https://hmmlearn.readthedocs.io/en/stable/", None),
}
+napoleon_numpy_docstring = True
+napoleon_use_admonition_for_examples = True
autodoc_members = True
-autodoc_member_order = 'groupwise' # bysource, groupwise, alphabetical
+autodoc_member_order = "groupwise" # bysource, groupwise, alphabetical
autosummary_generate = True
numpydoc_show_class_members = False
# Set master document
-master_doc = 'index'
+master_doc = "index"
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
-source_suffix = ['.rst']
+source_suffix = [".rst"]
# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ['_build']
+exclude_patterns = ["_build"]
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
-html_theme = 'pyramid' # sphinx_rtd_theme
+html_theme = "pyramid" # sphinx_rtd_theme
autodoc_typehints = "description"
autodoc_class_signature = "separated"
@@ -79,8 +83,9 @@
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
+
# Custom stylesheets
-def setup(app):
- app.add_css_file('css/toc.css')
+def setup(app) -> None: # noqa: ANN001, D103
+ app.add_css_file("css/toc.css")
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 00000000..787fc110
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,52 @@
+.. Sequentia documentation master file, created by
+ sphinx-quickstart on Sat Dec 28 19:22:34 2019.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+.. raw:: html
+
+
+
+ Sequentia
+
+
+
+ Scikit-Learn compatible HMM and DTW based sequence machine learning algorithms in Python.
+
+
+
+
+
+
+Features
+--------
+
+.. toctree::
+ :titlesonly:
+
+ sections/models/index
+ sections/preprocessing/index
+ sections/datasets/index
+ sections/configuration
+
+Documentation Search and Index
+------------------------------
+
+* :ref:`search`
+* :ref:`genindex`
diff --git a/docs/source/sections/configuration.rst b/docs/source/sections/configuration.rst
new file mode 100644
index 00000000..62d1e9c3
--- /dev/null
+++ b/docs/source/sections/configuration.rst
@@ -0,0 +1,20 @@
+Configuration
+=============
+
+The following are configuration options for various Sequentia classes and functions.
+
+API Reference
+-------------
+
+.. autosummary::
+
+ ~sequentia.enums.CovarianceMode
+ ~sequentia.enums.PriorMode
+ ~sequentia.enums.TopologyMode
+ ~sequentia.enums.TransitionMode
+
+|
+
+.. automodule:: sequentia.enums
+ :members:
+ :exclude-members: __new__
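
As a hypothetical usage sketch, where the member name `TopologyMode.LEFT_RIGHT` and the `topology` keyword argument are assumptions based on the enum names documented above, not confirmed by this changeset:

```python
from sequentia.enums import TopologyMode
from sequentia.models import GaussianMixtureHMM

# Configure the HMM topology with an enum rather than a raw string (assumed API).
model = GaussianMixtureHMM(n_states=3, topology=TopologyMode.LEFT_RIGHT)
```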
diff --git a/docs/sections/datasets/digits.rst b/docs/source/sections/datasets/digits.rst
similarity index 100%
rename from docs/sections/datasets/digits.rst
rename to docs/source/sections/datasets/digits.rst
diff --git a/docs/sections/datasets/gene_families.rst b/docs/source/sections/datasets/gene_families.rst
similarity index 100%
rename from docs/sections/datasets/gene_families.rst
rename to docs/source/sections/datasets/gene_families.rst
diff --git a/docs/source/sections/datasets/index.rst b/docs/source/sections/datasets/index.rst
new file mode 100644
index 00000000..29cf5cde
--- /dev/null
+++ b/docs/source/sections/datasets/index.rst
@@ -0,0 +1,56 @@
+Datasets
+========
+
+.. toctree::
+ :titlesonly:
+
+ digits
+ gene_families
+
+----
+
+Sequentia provides a selection of sample sequential datasets for quick experimentation.
+
+Each dataset follows the interface described below.
+
+API reference
+-------------
+
+Class
+^^^^^
+
+.. autosummary::
+
+ ~sequentia.datasets.base.SequentialDataset
+
+Methods
+^^^^^^^
+
+.. autosummary::
+
+ ~sequentia.datasets.base.SequentialDataset.__init__
+ ~sequentia.datasets.base.SequentialDataset.copy
+ ~sequentia.datasets.base.SequentialDataset.iter_by_class
+ ~sequentia.datasets.base.SequentialDataset.load
+ ~sequentia.datasets.base.SequentialDataset.save
+ ~sequentia.datasets.base.SequentialDataset.split
+
+Properties
+^^^^^^^^^^
+
+.. autosummary::
+
+ ~sequentia.datasets.base.SequentialDataset.X
+ ~sequentia.datasets.base.SequentialDataset.X_lengths
+ ~sequentia.datasets.base.SequentialDataset.X_y
+ ~sequentia.datasets.base.SequentialDataset.X_y_lengths
+ ~sequentia.datasets.base.SequentialDataset.classes
+ ~sequentia.datasets.base.SequentialDataset.idxs
+ ~sequentia.datasets.base.SequentialDataset.lengths
+ ~sequentia.datasets.base.SequentialDataset.y
+
+|
+
+.. autoclass:: sequentia.datasets.base.SequentialDataset
+ :members:
+
diff --git a/docs/sections/models/hmm/classifier.rst b/docs/source/sections/models/hmm/classifier.rst
similarity index 86%
rename from docs/sections/models/hmm/classifier.rst
rename to docs/source/sections/models/hmm/classifier.rst
index 1345df22..a94a0874 100644
--- a/docs/sections/models/hmm/classifier.rst
+++ b/docs/source/sections/models/hmm/classifier.rst
@@ -54,9 +54,12 @@ Methods
~sequentia.models.hmm.classifier.HMMClassifier.add_models
~sequentia.models.hmm.classifier.HMMClassifier.fit
~sequentia.models.hmm.classifier.HMMClassifier.fit_predict
+ ~sequentia.models.hmm.classifier.HMMClassifier.load
~sequentia.models.hmm.classifier.HMMClassifier.predict
+ ~sequentia.models.hmm.classifier.HMMClassifier.predict_log_proba
~sequentia.models.hmm.classifier.HMMClassifier.predict_proba
~sequentia.models.hmm.classifier.HMMClassifier.predict_scores
+ ~sequentia.models.hmm.classifier.HMMClassifier.save
~sequentia.models.hmm.classifier.HMMClassifier.score
|
@@ -64,4 +67,4 @@ Methods
.. autoclass:: sequentia.models.hmm.classifier.HMMClassifier
:members:
:inherited-members:
- :exclude-members: get_params, set_params
+ :exclude-members: get_params, set_params, get_metadata_routing, set_fit_request, set_predict_log_proba_request, set_predict_proba_request, set_predict_request, set_score_request
diff --git a/docs/sections/models/hmm/index.rst b/docs/source/sections/models/hmm/index.rst
similarity index 99%
rename from docs/sections/models/hmm/index.rst
rename to docs/source/sections/models/hmm/index.rst
index 0c3517ff..13ebc481 100644
--- a/docs/sections/models/hmm/index.rst
+++ b/docs/source/sections/models/hmm/index.rst
@@ -1,3 +1,5 @@
+.. _hmms:
+
Hidden Markov Models
====================
diff --git a/docs/sections/models/hmm/variants/categorical.rst b/docs/source/sections/models/hmm/variants/categorical.rst
similarity index 89%
rename from docs/sections/models/hmm/variants/categorical.rst
rename to docs/source/sections/models/hmm/variants/categorical.rst
index d7f7984c..e746af83 100644
--- a/docs/sections/models/hmm/variants/categorical.rst
+++ b/docs/source/sections/models/hmm/variants/categorical.rst
@@ -55,16 +55,16 @@ Methods
~sequentia.models.hmm.variants.CategoricalHMM.bic
~sequentia.models.hmm.variants.CategoricalHMM.fit
~sequentia.models.hmm.variants.CategoricalHMM.freeze
- ~sequentia.models.hmm.variants.CategoricalHMM.n_params
~sequentia.models.hmm.variants.CategoricalHMM.score
- ~sequentia.models.hmm.variants.CategoricalHMM.set_start_probs
- ~sequentia.models.hmm.variants.CategoricalHMM.set_state_emissions
- ~sequentia.models.hmm.variants.CategoricalHMM.set_transitions
+ ~sequentia.models.hmm.variants.CategoricalHMM.set_state_emission_probs
+ ~sequentia.models.hmm.variants.CategoricalHMM.set_state_start_probs
+ ~sequentia.models.hmm.variants.CategoricalHMM.set_state_transition_probs
~sequentia.models.hmm.variants.CategoricalHMM.unfreeze
+ ~sequentia.models.hmm.variants.CategoricalHMM.n_params
|
.. autoclass:: sequentia.models.hmm.variants.CategoricalHMM
:members:
:inherited-members:
- :exclude-members: get_params, set_params
+ :exclude-members: get_params, set_params, get_metadata_routing, set_fit_request, set_score_request
diff --git a/docs/sections/models/hmm/variants/gaussian_mixture.rst b/docs/source/sections/models/hmm/variants/gaussian_mixture.rst
similarity index 93%
rename from docs/sections/models/hmm/variants/gaussian_mixture.rst
rename to docs/source/sections/models/hmm/variants/gaussian_mixture.rst
index 193c600c..bc322e6b 100644
--- a/docs/sections/models/hmm/variants/gaussian_mixture.rst
+++ b/docs/source/sections/models/hmm/variants/gaussian_mixture.rst
@@ -64,18 +64,18 @@ Methods
~sequentia.models.hmm.variants.GaussianMixtureHMM.bic
~sequentia.models.hmm.variants.GaussianMixtureHMM.fit
~sequentia.models.hmm.variants.GaussianMixtureHMM.freeze
- ~sequentia.models.hmm.variants.GaussianMixtureHMM.n_params
~sequentia.models.hmm.variants.GaussianMixtureHMM.score
- ~sequentia.models.hmm.variants.GaussianMixtureHMM.set_start_probs
- ~sequentia.models.hmm.variants.GaussianMixtureHMM.set_state_covariances
+ ~sequentia.models.hmm.variants.GaussianMixtureHMM.set_state_covars
~sequentia.models.hmm.variants.GaussianMixtureHMM.set_state_means
+ ~sequentia.models.hmm.variants.GaussianMixtureHMM.set_state_start_probs
+ ~sequentia.models.hmm.variants.GaussianMixtureHMM.set_state_transition_probs
~sequentia.models.hmm.variants.GaussianMixtureHMM.set_state_weights
- ~sequentia.models.hmm.variants.GaussianMixtureHMM.set_transitions
~sequentia.models.hmm.variants.GaussianMixtureHMM.unfreeze
+ ~sequentia.models.hmm.variants.GaussianMixtureHMM.n_params
|
.. autoclass:: sequentia.models.hmm.variants.GaussianMixtureHMM
:members:
:inherited-members:
- :exclude-members: get_params, set_params
+ :exclude-members: get_params, set_params, get_metadata_routing, set_fit_request, set_score_request
diff --git a/docs/sections/models/hmm/variants/index.rst b/docs/source/sections/models/hmm/variants/index.rst
similarity index 100%
rename from docs/sections/models/hmm/variants/index.rst
rename to docs/source/sections/models/hmm/variants/index.rst
diff --git a/docs/sections/models/index.rst b/docs/source/sections/models/index.rst
similarity index 100%
rename from docs/sections/models/index.rst
rename to docs/source/sections/models/index.rst
diff --git a/docs/sections/models/knn/classifier.rst b/docs/source/sections/models/knn/classifier.rst
similarity index 91%
rename from docs/sections/models/knn/classifier.rst
rename to docs/source/sections/models/knn/classifier.rst
index 6d06f63f..906fa3b7 100644
--- a/docs/sections/models/knn/classifier.rst
+++ b/docs/source/sections/models/knn/classifier.rst
@@ -39,10 +39,8 @@ Methods
~sequentia.models.knn.classifier.KNNClassifier.fit
~sequentia.models.knn.classifier.KNNClassifier.fit_predict
~sequentia.models.knn.classifier.KNNClassifier.load
- ~sequentia.models.knn.classifier.KNNClassifier.plot_dtw_histogram
- ~sequentia.models.knn.classifier.KNNClassifier.plot_warping_path_1d
- ~sequentia.models.knn.classifier.KNNClassifier.plot_weight_histogram
~sequentia.models.knn.classifier.KNNClassifier.predict
+ ~sequentia.models.knn.classifier.KNNClassifier.predict_log_proba
~sequentia.models.knn.classifier.KNNClassifier.predict_proba
~sequentia.models.knn.classifier.KNNClassifier.predict_scores
~sequentia.models.knn.classifier.KNNClassifier.query_neighbors
@@ -54,7 +52,7 @@ Methods
.. autoclass:: sequentia.models.knn.classifier.KNNClassifier
:members:
:inherited-members:
- :exclude-members: get_params, set_params
+ :exclude-members: get_params, set_params, get_metadata_routing, set_fit_request, set_predict_request, set_predict_log_proba_request, set_predict_proba_request, set_score_request
.. rubric:: References
diff --git a/docs/sections/models/knn/index.rst b/docs/source/sections/models/knn/index.rst
similarity index 100%
rename from docs/sections/models/knn/index.rst
rename to docs/source/sections/models/knn/index.rst
diff --git a/docs/sections/models/knn/regressor.rst b/docs/source/sections/models/knn/regressor.rst
similarity index 90%
rename from docs/sections/models/knn/regressor.rst
rename to docs/source/sections/models/knn/regressor.rst
index 9ed5b58e..2e1926fa 100644
--- a/docs/sections/models/knn/regressor.rst
+++ b/docs/source/sections/models/knn/regressor.rst
@@ -43,9 +43,6 @@ Methods
~sequentia.models.knn.regressor.KNNRegressor.fit
~sequentia.models.knn.regressor.KNNRegressor.fit_predict
~sequentia.models.knn.regressor.KNNRegressor.load
- ~sequentia.models.knn.regressor.KNNRegressor.plot_dtw_histogram
- ~sequentia.models.knn.regressor.KNNRegressor.plot_warping_path_1d
- ~sequentia.models.knn.regressor.KNNRegressor.plot_weight_histogram
~sequentia.models.knn.regressor.KNNRegressor.predict
~sequentia.models.knn.regressor.KNNRegressor.query_neighbors
~sequentia.models.knn.regressor.KNNRegressor.save
@@ -56,7 +53,7 @@ Methods
.. autoclass:: sequentia.models.knn.regressor.KNNRegressor
:members:
:inherited-members:
- :exclude-members: get_params, set_params
+ :exclude-members: get_params, set_params, get_metadata_routing, set_fit_request, set_predict_request, set_score_request
.. rubric:: References
diff --git a/docs/sections/preprocessing/index.rst b/docs/source/sections/preprocessing/index.rst
similarity index 52%
rename from docs/sections/preprocessing/index.rst
rename to docs/source/sections/preprocessing/index.rst
index 28d2b34f..1403bda1 100644
--- a/docs/sections/preprocessing/index.rst
+++ b/docs/source/sections/preprocessing/index.rst
@@ -4,17 +4,14 @@ Preprocessing
.. toctree::
:titlesonly:
- pipeline
transforms/index
----
-Sequentia provides an adapted version of the :mod:`sklearn.preprocessing` interface,
-modified for sequential data support but also continuing to support most of the Scikit-Learn transformations out-of-the-box.
+Although :mod:`sklearn.preprocessing` is compatible with Sequentia,
+we also provide a lightweight preprocessing interface with additional features.
Transformations can be applied to all of the input sequences collectively — treated as a single array,
or on an individual basis by using the :class:`.IndependentFunctionTransformer`.
-Transformation steps can be combined together with an estimator in a :class:`.Pipeline` which follows the Scikit-Learn interface.
-
Additional transformations specific to sequences are also provided, such as :ref:`filters ` for signal data.
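
Since these transformations are applied per sequence, a minimal usage sketch may help. Assumptions: :class:`.IndependentFunctionTransformer` mirrors :class:`sklearn:sklearn.preprocessing.FunctionTransformer` but applies ``func`` to each sequence separately, and ``transform`` accepts the sequence lengths alongside ``X`` (the exact call signature is an assumption here). ::

    import numpy as np
    from sequentia.preprocessing.transforms import IndependentFunctionTransformer

    # Two concatenated sequences (lengths 100 and 50), three features each
    X = np.random.randn(150, 3)
    lengths = np.array([100, 50])

    # Standardize each sequence using its own statistics,
    # so normalization never mixes data across sequence boundaries
    standardize = IndependentFunctionTransformer(
        lambda x: (x - x.mean(axis=0)) / x.std(axis=0)
    )
    Xt = standardize.transform(X, lengths)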
diff --git a/docs/sections/preprocessing/transforms/filters.rst b/docs/source/sections/preprocessing/transforms/filters.rst
similarity index 100%
rename from docs/sections/preprocessing/transforms/filters.rst
rename to docs/source/sections/preprocessing/transforms/filters.rst
diff --git a/docs/sections/preprocessing/transforms/function_transformer.rst b/docs/source/sections/preprocessing/transforms/function_transformer.rst
similarity index 52%
rename from docs/sections/preprocessing/transforms/function_transformer.rst
rename to docs/source/sections/preprocessing/transforms/function_transformer.rst
index a6af2ac8..0fe8954d 100644
--- a/docs/sections/preprocessing/transforms/function_transformer.rst
+++ b/docs/source/sections/preprocessing/transforms/function_transformer.rst
@@ -4,14 +4,7 @@ Function Transformer
When preprocessing sequential data, it is often preferable to apply certain transformations
on each sequence independently rather than applying a single transformation to all of the data collectively.
-For example in speech recognition, suppose we have a dataset of MFCC features extracted from audio recordings of different speakers.
-If we are not interested in speaker-focused tasks such as speaker recognition, and instead only want to classify recordings,
-we need to be able to compare recordings to each other — especially if using algorithms such as :class:`.KNNClassifier` which rely on distance comparisons.
-
-In this case, we might want to standardize the MFCCs for each recording individually, (i.e. centering and scaling by separate feature means and standard deviations for each recording) so that they are represented as deviations from zero,
-which is a form that is better suited for comparison as it reduces speaker-specific nuances in the data due to differences in scale or location.
-
-Another example would be signal filters, which should be applied to each sequence independently.
+For example, we might want to apply signal filters to each sequence independently.
:class:`.IndependentFunctionTransformer` allows for such transformations to be defined for arbitrary functions.
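
To see why the distinction matters, the following plain-NumPy sketch (no Sequentia API involved) standardizes two toy sequences collectively and then independently: ::

    import numpy as np

    # Two univariate sequences with very different scales
    x1 = np.array([[1.0], [2.0], [3.0]])
    x2 = np.array([[100.0], [200.0], [300.0]])
    X = np.vstack([x1, x2])  # concatenated, lengths = [3, 3]

    # Collective: one mean/std for all data - x1 is squashed near -1
    collective = (X - X.mean(axis=0)) / X.std(axis=0)

    # Independent: each sequence centered and scaled on its own statistics
    independent = np.vstack([
        (x - x.mean(axis=0)) / x.std(axis=0) for x in (x1, x2)
    ])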
@@ -41,4 +34,4 @@ Methods
.. autoclass:: sequentia.preprocessing.transforms.IndependentFunctionTransformer
:members:
:inherited-members:
- :exclude-members: get_params, set_params, set_output
+ :exclude-members: get_params, set_params, get_feature_names_out, get_metadata_routing, set_fit_request, set_inverse_transform_request, set_output, set_transform_request
diff --git a/docs/sections/preprocessing/transforms/index.rst b/docs/source/sections/preprocessing/transforms/index.rst
similarity index 100%
rename from docs/sections/preprocessing/transforms/index.rst
rename to docs/source/sections/preprocessing/transforms/index.rst
diff --git a/lib/sequentia/__init__.py b/lib/sequentia/__init__.py
deleted file mode 100644
index f1d9e1e1..00000000
--- a/lib/sequentia/__init__.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from .datasets import *
-from .models import *
-from .preprocessing import *
-from .pipeline import *
-from .utils import *
-
-__name__ = "sequentia"
-__version__ = "1.1.1"
-__author__ = "Edwin Onuonga"
-__email__ = "ed@eonu.net"
-__copyright__ = f"2019-2023, {__author__}"
diff --git a/lib/sequentia/datasets/__init__.py b/lib/sequentia/datasets/__init__.py
deleted file mode 100644
index bcf450e5..00000000
--- a/lib/sequentia/datasets/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .digits import load_digits
-from .gene_families import load_gene_families
diff --git a/lib/sequentia/datasets/digits.py b/lib/sequentia/datasets/digits.py
deleted file mode 100644
index 942aac56..00000000
--- a/lib/sequentia/datasets/digits.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from pkg_resources import resource_filename
-from typing import Iterable
-from operator import itemgetter
-
-import numpy as np
-from pydantic import conint, validator
-
-from sequentia.utils.data import SequentialDataset
-from sequentia.utils.validation import _Validator
-from sequentia.utils.decorators import _validate_params
-
-class _DigitsValidator(_Validator):
- digits: Iterable[conint(ge=0, le=9)] = list(range(10))
-
- @validator('digits')
- def check_digits(cls, value):
- value = list(value)
- if len(set(value)) < len(value):
- raise ValueError('Expected digits to be unique')
- return value
-
-@_validate_params(using=_DigitsValidator)
-def load_digits(
- *,
- digits: Iterable[int] = list(range(10)),
-) -> SequentialDataset:
- """Loads MFCC features of spoken digit audio samples from the Free Spoken Digit Dataset.
-
- The `Free Spoken Digit Dataset (FSDD) `_
- consists of 3000 recordings of the spoken digits 0-9.
-
- This version consists of 13 MFCC features of 50 recordings for each digit by 6 individual speakers.
-
- :param digits: Subset of digits to include in the dataset.
- :return: A dataset object representing the loaded digits.
- """
- # Load the dataset from compressed numpy file
- data = np.load(resource_filename('sequentia', 'datasets/data/digits.npz'))
-
- # Fetch arrays from loaded file
- X, y, lengths = itemgetter('X', 'y', 'lengths')(data)
-
- # Select and create a Dataset only with sequences having the specified labels
- idx = np.argwhere(np.isin(y, digits)).flatten()
- ranges = SequentialDataset._get_idxs(lengths)[idx]
- return SequentialDataset(
- np.vstack([x for x in SequentialDataset._iter_X(X, ranges)]),
- y[idx],
- lengths[idx]
- )
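For context on how this loader was used, a minimal sketch follows; ``X_y_lengths`` and the lengths invariant are taken from the docstrings elsewhere in this changeset. ::

    from sequentia.datasets import load_digits

    # Load MFCC sequences for a subset of the spoken digits
    data = load_digits(digits=[0, 1, 2])

    # The dataset exposes the concatenated arrays directly
    X, y, lengths = data.X_y_lengths
    assert len(X) == sum(lengths)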
diff --git a/lib/sequentia/datasets/gene_families.py b/lib/sequentia/datasets/gene_families.py
deleted file mode 100644
index 64491de8..00000000
--- a/lib/sequentia/datasets/gene_families.py
+++ /dev/null
@@ -1,73 +0,0 @@
-from pkg_resources import resource_filename
-from typing import Iterable, Tuple
-from operator import itemgetter
-
-import numpy as np
-from pydantic import conint, validator
-from sklearn.preprocessing import LabelEncoder
-
-from sequentia.utils.data import SequentialDataset
-from sequentia.utils.validation import _Validator
-from sequentia.utils.decorators import _validate_params
-
-
-class _GeneFamiliesValidator(_Validator):
- families: Iterable[conint(ge=0, le=6)] = list(range(7))
-
- @validator('families')
- def check_families(cls, value):
- value = list(value)
- if len(set(value)) < len(value):
- raise ValueError('Expected gene families to be unique')
- return value
-
-@_validate_params(using=_GeneFamiliesValidator)
-def load_gene_families(
- *,
- families: Iterable[int] = list(range(7))
-) -> Tuple[SequentialDataset, LabelEncoder]:
- """Loads human DNA sequences grouped by gene family.
-
- The `Human DNA Sequences `_ dataset
- consists of 4380 DNA sequences belonging to 7 gene families.
-
- This dataset has imbalanced classes, and uses an :class:`sklearn:sklearn.preprocessing.LabelEncoder` to
- encode the original symbols (``A``, ``T``, ``C``, ``G``, ``N``) that form the DNA sequences, into integers.
-
- The gene families have the following class labels:
-
- - G protein coupled receptors: ``0``
- - Tyrosine kinase: ``1``
- - Tyrosine phosphatase: ``2``
- - Synthetase: ``3``
- - Synthase: ``4``
- - Ion channel: ``5``
- - Transcription: ``6``
-
- :param families: Subset of gene families to include in the dataset.
-
- :return:
-
- - A dataset object representing the loaded genetic data.
- - Label encoder used to encode the observation symbols into integers.
- """
- # Load the dataset from compressed numpy file
- data = np.load(resource_filename('sequentia', 'datasets/data/gene_families.npz'))
-
- # Fetch arrays from loaded file
- X, y, lengths = itemgetter('X', 'y', 'lengths')(data)
-
- # Encode the observation symbols into integers
- enc = LabelEncoder()
- X = np.expand_dims(enc.fit_transform(X.flatten()), axis=-1)
-
- # Select and create a Dataset only with sequences having the specified labels
- idx = np.argwhere(np.isin(y, families)).flatten()
- ranges = SequentialDataset._get_idxs(lengths)[idx]
- data = SequentialDataset(
- np.vstack([x for x in SequentialDataset._iter_X(X, ranges)]),
- y[idx],
- lengths[idx]
- )
-
- return data, enc
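A minimal usage sketch for this loader; ``inverse_transform`` is standard :class:`sklearn:sklearn.preprocessing.LabelEncoder` behaviour. ::

    from sequentia.datasets import load_gene_families

    # Load DNA sequences for two of the seven gene families
    data, enc = load_gene_families(families=[1, 4])

    # Map the integer-encoded observations back to DNA symbols
    X, y, lengths = data.X_y_lengths
    symbols = enc.inverse_transform(X.flatten())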
diff --git a/lib/sequentia/models/__init__.py b/lib/sequentia/models/__init__.py
deleted file mode 100644
index 20d6ed2d..00000000
--- a/lib/sequentia/models/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .hmm import *
-from .knn import *
-from .base import *
diff --git a/lib/sequentia/models/base.py b/lib/sequentia/models/base.py
deleted file mode 100644
index 2e5aeb59..00000000
--- a/lib/sequentia/models/base.py
+++ /dev/null
@@ -1,104 +0,0 @@
-from __future__ import annotations
-
-from typing import Optional, Any
-
-from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
-from sklearn.metrics import accuracy_score, r2_score
-
-from sequentia.utils.validation import Array
-from sequentia.utils.decorators import _requires_fit
-
-
-class _Classifier(BaseEstimator, ClassifierMixin):
- def fit(
- self,
- X: Array,
- y: Array[int],
- lengths: Optional[Array[int]] = None
- ) -> _Classifier:
- raise NotImplementedError
-
-
- def predict(
- self,
- X: Array,
- lengths: Optional[Array[int]] = None
- ) -> Array[int]:
- raise NotImplementedError
-
-
- def fit_predict(
- self,
- X: Array,
- y: Array[int],
- lengths: Optional[Array[int]] = None
- ) -> Array[int]:
- return self.fit(X, y, lengths).predict(X, lengths)
-
-
- def predict_proba(
- self,
- X: Array,
- lengths: Optional[Array[int]] = None
- ) -> Array[float]:
- raise NotImplementedError
-
-
- def predict_scores(
- self,
- X: Array,
- lengths: Optional[Array[int]] = None
- ) -> Array[float]:
- raise NotImplementedError
-
-
- @_requires_fit
- def score(
- self,
- X: Array,
- y: Array[int],
- lengths: Optional[Array[int]] = None,
- normalize: bool = True,
- sample_weight: Optional[Any] = None
- ) -> float:
- y_pred = self.predict(X, lengths)
- return accuracy_score(y, y_pred, normalize=normalize, sample_weight=sample_weight)
-
-
-class _Regressor(BaseEstimator, RegressorMixin):
- def fit(
- self,
- X: Array[float],
- y: Array[float],
- lengths: Optional[Array[int]] = None
- ) -> _Regressor:
- raise NotImplementedError
-
-
- def predict(
- self,
- X: Array[float],
- lengths: Optional[Array[int]] = None
- ) -> Array[float]:
- raise NotImplementedError
-
-
- def fit_predict(
- self,
- X: Array[float],
- y: Array[float],
- lengths: Optional[Array[int]] = None
- ) -> Array[float]:
- return self.fit(X, y, lengths).predict(X, lengths)
-
-
- @_requires_fit
- def score(
- self,
- X: Array[float],
- y: Array[float],
- lengths: Optional[Array[int]] = None,
- sample_weight: Optional[Any] = None
- ) -> float:
- y_pred = self.predict(X, lengths)
- return r2_score(y, y_pred, sample_weight=sample_weight)
diff --git a/lib/sequentia/models/hmm/__init__.py b/lib/sequentia/models/hmm/__init__.py
deleted file mode 100644
index 6217960a..00000000
--- a/lib/sequentia/models/hmm/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .classifier import *
-from .variants import *
diff --git a/lib/sequentia/models/hmm/classifier.py b/lib/sequentia/models/hmm/classifier.py
deleted file mode 100644
index 59730829..00000000
--- a/lib/sequentia/models/hmm/classifier.py
+++ /dev/null
@@ -1,532 +0,0 @@
-from __future__ import annotations
-
-import joblib
-import pathlib
-from types import SimpleNamespace
-from typing import Optional, Union, Dict, Literal, IO
-from joblib import Parallel, delayed
-
-import numpy as np
-from pydantic import NegativeInt, PositiveInt, confloat, validator, root_validator
-from sklearn.utils.validation import NotFittedError
-
-from sequentia.models.base import _Classifier
-from sequentia.models.hmm.variants.base import _HMM
-from sequentia.utils.data import SequentialDataset
-from sequentia.utils.multiprocessing import _effective_n_jobs
-from sequentia.utils.decorators import _validate_params, _override_params, _requires_fit
-from sequentia.utils.validation import (
- _check_classes,
- _check_is_fitted,
- Array,
- _Validator,
-)
-
-__all__ = ['HMMClassifier']
-
-_defaults = SimpleNamespace(
- prior=None,
- classes=None,
- n_jobs=1,
-)
-
-
-class _HMMClassifierValidator(_Validator):
- prior: Optional[Union[Literal["frequency"], Dict[int, confloat(ge=0, le=1)]]] = _defaults.prior
- classes: Optional[Array[int]] = _defaults.classes
- n_jobs: Union[NegativeInt, PositiveInt] = _defaults.n_jobs
-
-
- @validator('prior')
- def check_prior(cls, value):
- if isinstance(value, dict):
- if not np.isclose(sum(value.values()), 1):
- raise ValueError('Prior distribution must sum to one')
- return value
-
-
- @root_validator
- def check_prior_keys_with_classes(cls, values):
- if 'prior' in values and 'classes' in values:
- prior, classes = values['prior'], values['classes']
- if isinstance(prior, dict) and classes is not None:
- if set(prior.keys()) != set(classes):
- raise ValueError(
- 'Provided classes are not consistent with the provided prior distribution - '
- 'ensure that every label in `classes` is present in `prior`'
- )
- return values
-
-
-class HMMClassifier(_Classifier):
- """A classifier consisting of HMMs, each trained independently to recognize sequences of a single class.
-
- The predicted class for a given observation sequence is the class represented by the HMM
- which produces the maximum posterior probability for the observation sequence.
-
- Examples
- --------
- Using a :class:`.HMMClassifier` (with :class:`.GaussianMixtureHMM` models) to classify spoken digits. ::
-
- import numpy as np
- from sequentia.datasets import load_digits
- from sequentia.models.hmm import GaussianMixtureHMM, HMMClassifier
-
- # Seed for reproducible pseudo-randomness
- random_state = np.random.RandomState(1)
-
- # Fetch MFCCs of spoken digits
- data = load_digits()
- train_data, test_data = data.split(test_size=0.2, random_state=random_state)
-
- # Create a HMMClassifier using a class frequency prior
- clf = HMMClassifier(prior='frequency')
-
- # Add an untrained HMM for each class
- for label in data.classes:
- model = GaussianMixtureHMM(random_state=random_state)
- clf.add_model(model, label)
-
- # Fit the HMMs by providing training observation sequences for all classes
- X_train, y_train, lengths_train = train_data.X_y_lengths
- clf.fit(X_train, y_train, lengths_train)
-
- # Predict classes for the test observation sequences
- X_test, lengths_test = test_data.X_lengths
- y_pred = clf.predict(X_test, lengths_test)
-
- As done in the above example, we can provide unfitted HMMs using :func:`add_model` or :func:`add_models`,
- then provide training observation sequences for all classes to :func:`fit`, which will automatically train each HMM on the appropriate subset of data.
-
- Alternatively, we may provide pre-fitted HMMs and call :func:`fit` with no arguments. ::
-
- # Create a HMMClassifier using a class frequency prior
- clf = HMMClassifier(prior='frequency')
-
- # Manually fit each HMM on its own subset of data
- for X_train, lengths_train, label in train_data.iter_by_class():
- model = GaussianMixtureHMM(random_state=random_state)
- model.fit(X_train, lengths_train)
- clf.add_model(model, label)
-
- # Fit the classifier
- clf.fit()
- """
-
- _defaults = _defaults
-
-
- @_validate_params(using=_HMMClassifierValidator)
- def __init__(
- self,
- *,
- prior: Optional[Union[Literal["frequency"], dict]] = _defaults.prior,
- classes: Optional[Array[int]] = _defaults.classes,
- n_jobs: Union[NegativeInt, PositiveInt] = _defaults.n_jobs,
- ) -> HMMClassifier:
- """Initializes a :class:`.HMMClassifier`.
-
- :param prior: Type of prior probability to assign to each HMM.
-
- - If ``None``, a uniform prior will be used, making each HMM equally likely.
- - If ``"frequency"``, the prior probability of each HMM is equal to the fraction of total observation sequences that the HMM was fitted with.
- - If a ``dict``, custom prior probabilities can be assigned to each HMM.
- The keys should be the label of the class represented by the HMM, and the value should be the prior probability for the HMM.
-
- :param classes: Set of possible class labels.
-
- - If not provided, these will be determined from the training data labels.
- - If provided, output from methods such as :func:`predict_proba` and :func:`predict_scores`
- will follow the ordering of the class labels provided here.
-
- :param n_jobs: Maximum number of concurrently running workers.
-
- - If 1, no parallelism is used at all (useful for debugging).
- - If -1, all CPUs are used.
- - If < -1, ``(n_cpus + 1 + n_jobs)`` are used — e.g. ``n_jobs=-2`` uses all but one.
- """
- #: Type of prior probability to assign to each HMM.
- self.prior = prior
- #: Set of possible class labels.
- self.classes = classes
- #: Maximum number of concurrently running workers.
- self.n_jobs = n_jobs
- #: HMMs constituting the :class:`.HMMClassifier`.
- self.models = {}
-
-
- def add_model(
- self,
- model: _HMM,
- label: int
- ) -> HMMClassifier:
- """Adds a single HMM to the classifier.
-
- :param model: HMM to add to the classifier.
- :param label: Class represented by the HMM.
-
- :note: All models added to the classifier must be of the same type — either :class:`.GaussianMixtureHMM` or :class:`.CategoricalHMM`.
-
- :return: The classifier.
- """
- if not isinstance(model, _HMM):
- raise TypeError('Expected `model` argument to be a type of HMM')
- if len(self.models) > 0:
- if type(model) != type(list(self.models.values())[-1]):
- raise TypeError(
- f'Model of type {type(model).__name__} must be the same as the models already provided '
- f'to this {type(self).__name__} instance'
- )
- self.models[int(label)] = model
- return self
-
-
- def add_models(
- self,
- models: Dict[int, _HMM]
- ) -> HMMClassifier:
- """Adds HMMs to the classifier.
-
- :param models: HMMs to add to the classifier. The key for each HMM should be the label of the class represented by the HMM.
-
- :note: All models added to the classifier must be of the same type — either :class:`.GaussianMixtureHMM` or :class:`.CategoricalHMM`.
-
- :return: The classifier.
- """
- if not all(isinstance(model, _HMM) for model in models.values()):
- raise TypeError('Expected all provided `models` to be a type of HMM')
- for label, model in models.items():
- self.add_model(model, label)
- return self
-
-
- def fit(
- self,
- X: Optional[Array] = None,
- y: Optional[Array[int]] = None,
- lengths: Optional[Array[int]] = None
- ) -> HMMClassifier:
- """Fits the HMMs to the sequence(s) in ``X``.
-
- - If fitted models were provided with :func:`add_model` or :func:`add_models`, no arguments should be passed to :func:`fit`.
- - If unfitted models were provided with :func:`add_model` or :func:`add_models`, training data ``X``, ``y`` and ``lengths`` must be provided to :func:`fit`.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D array if :class:`.CategoricalHMM` is being used, or either a 1D or 2D array if :class:`.GaussianMixtureHMM` is being used.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param y: Classes corresponding to sequence(s) provided in ``X``.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :return: The fitted classifier.
- """
- if X is None or y is None:
- if len(self.models) == 0:
- raise RuntimeError(
- f'Fitted models must be provided to this {type(self).__name__} instance if no training data is provided - '
- 'use add_model() to add fitted models to the classifier object'
- )
-
- for label, model in self.models.items():
- if not _check_is_fitted(model, return_=True):
- raise NotFittedError(
- f'The model corresponding to label {label} must be pre-fitted if '
- f'no training data is provided to this {type(self).__name__} instance'
- )
-
- if self.classes is not None:
- # Same logic as _check_classes()
- classes_np = np.array(self.classes).flatten()
- if not np.issubdtype(classes_np.dtype, np.integer):
- raise TypeError('Expected classes to be integers')
- _, idx = np.unique(classes_np, return_index=True)
- self.classes_ = classes_np[np.sort(idx)]
- else:
- # Fetch classes from provided models
- self.classes_ = np.array(list(self.models.keys()))
- else:
- self.classes_ = _check_classes(Array[int].validate_type(y), self.classes)
-
- # Check that each label has a HMM (and vice versa)
- if set(self.models.keys()) != set(self.classes_):
- raise ValueError(
- 'Classes in the dataset are not consistent with the added models - '
- 'ensure that every added model corresponds to a class in the dataset'
- )
-
- if X is not None and y is not None:
- # Iterate through the dataset by class and fit the corresponding model
- data = self._sequence_classifier_validator(X=X, y=y, lengths=lengths)
- dataset = SequentialDataset(data.X, data.y, data.lengths, self.classes_)
- for X_c, lengths_c, c in dataset.iter_by_class():
- self.models[c].fit(X_c, lengths_c)
-
- # Set class priors
- if self.prior is None:
- self.prior_ = {c:1/len(self.classes_) for c, _ in self.models.items()}
- elif isinstance(self.prior, str):
- if self.prior == "frequency":
- total_seqs = sum(model.n_seqs_ for model in self.models.values())
- self.prior_ = {c:model.n_seqs_/total_seqs for c, model in self.models.items()}
- elif isinstance(self.prior, dict):
- if set(self.prior.keys()) != set(self.classes_):
- raise ValueError(
- 'Classes in the dataset are not consistent with the classes in `prior` - '
- 'ensure that every provided class prior corresponds to a class in the dataset'
- )
- self.prior_ = self.prior
-
- return self
-
-
- @_requires_fit
- def predict(
- self,
- X: Array,
- lengths: Optional[Array[int]] = None
- ) -> Array[int]:
- """Predicts classes for the sequence(s) in ``X``.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D array if :class:`.CategoricalHMM` is being used, or either a 1D or 2D array if :class:`.GaussianMixtureHMM` is being used.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :note: This method requires a trained classifier — see :func:`fit`.
-
- :return: Class predictions.
- """
- scores = self.predict_scores(X, lengths)
- max_score_idxs = scores.argmax(axis=1)
- return self.classes_[max_score_idxs]
-
-
- def fit_predict(
- self,
- X: Array,
- y: Array[int],
- lengths: Optional[Array[int]] = None
- ) -> Array[int]:
- """Fits the classifier to the sequence(s) in ``X`` and predicts classes for ``X``.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D array if :class:`.CategoricalHMM` is being used, or either a 1D or 2D array if :class:`.GaussianMixtureHMM` is being used.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param y: Classes corresponding to sequence(s) provided in ``X``.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :return: Class predictions.
- """
- return super().fit_predict(X, y, lengths)
-
-
- @_requires_fit
- def predict_proba(
- self,
- X: Array,
- lengths: Optional[Array[int]] = None
- ) -> Array[confloat(ge=0, le=1)]:
- """Predicts class probabilities for the sequence(s) in ``X``.
-
- Probabilities are calculated as the posterior probability of each HMM generating the sequence.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D array if :class:`.CategoricalHMM` is being used, or either a 1D or 2D array if :class:`.GaussianMixtureHMM` is being used.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :note: This method requires a trained classifier — see :func:`fit`.
-
- :return: Class membership probabilities.
- """
- proba = self.predict_scores(X, lengths)
- proba -= proba.max(axis=1, keepdims=True)
- proba = np.exp(proba)
- proba /= proba.sum(axis=1, keepdims=True)
- return proba
-
-
- @_requires_fit
- def predict_scores(
- self,
- X: Array,
- lengths: Optional[Array[int]] = None
- ) -> Array[float]:
- """Predicts class scores for the sequence(s) in ``X``.
-
- Scores are calculated as the log posterior probability of each HMM generating the sequence.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D array if :class:`.CategoricalHMM` is being used, or either a 1D or 2D array if :class:`.GaussianMixtureHMM` is being used.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :note: This method requires a trained classifier — see :func:`fit`.
-
- :return: Class scores.
- """
- data = self._base_sequence_validator(X=X, lengths=lengths)
- n_jobs = _effective_n_jobs(self.n_jobs, data.lengths)
- chunk_idxs = np.array_split(SequentialDataset._get_idxs(data.lengths), n_jobs)
- return np.concatenate(
- Parallel(n_jobs=n_jobs, max_nbytes=None)(
- delayed(self._compute_scores_chunk)(idxs, data.X)
- for idxs in chunk_idxs
- )
- )
-
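
The chunking in ``predict_scores`` splits per-sequence index ranges across workers. A toy illustration, assuming ``SequentialDataset._get_idxs`` yields start/end pairs (as its use with ``_iter_X`` suggests): ::

    import numpy as np

    lengths = np.array([3, 2, 4, 1])

    # Start/end indices of each sequence in the concatenated array
    ends = lengths.cumsum()
    idxs = np.stack([ends - lengths, ends], axis=1)
    # [[0, 3], [3, 5], [5, 9], [9, 10]]

    # Split the ranges into roughly equal chunks, one per worker
    chunks = np.array_split(idxs, 2)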
-
- @_requires_fit
- def score(
- self,
- X: Array,
- y: Array[int],
- lengths: Optional[Array[int]] = None,
- normalize: bool = True,
- sample_weight: Optional[Array] = None,
- ) -> float:
- """Calculates accuracy for the sequence(s) in ``X``.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D array if :class:`.CategoricalHMM` is being used, or either a 1D or 2D array if :class:`.GaussianMixtureHMM` is being used.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param y: Classes corresponding to the observation sequence(s) in ``X``.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :param normalize: See :func:`sklearn:sklearn.metrics.accuracy_score`.
-
- :param sample_weight: See :func:`sklearn:sklearn.metrics.accuracy_score`.
-
- :note: This method requires a trained classifier — see :func:`fit`.
-
- :return: Classification accuracy.
- """
- return super().score(X, y, lengths, normalize, sample_weight)
-
-
- @_validate_params(using=_HMMClassifierValidator)
- @_override_params(_HMMClassifierValidator.fields(), temporary=False)
- def set_params(self, **kwargs) -> HMMClassifier:
- return self
-
-
- def _compute_scores_chunk(self, idxs, X):
- scores = np.zeros((len(idxs), len(self.classes_)))
- for i, x in enumerate(SequentialDataset._iter_X(X, idxs)):
- scores[i] = self._compute_log_posterior(x)
- return scores
-
-
- def _compute_log_posterior(self, x):
- log_posterior = np.full(len(self.classes_), -np.inf)
- for i, k in enumerate(self.classes_):
- model = self.models[k]
- log_prior = np.log(self.prior_[k])
- log_likelihood = model._score(x)
- log_posterior[i] = log_prior + log_likelihood
- return log_posterior
-
-
- def _base_sequence_validator(self, **kwargs):
- model = next(iter(self.models.values()))
- return model._base_sequence_validator(**kwargs)
-
-
- def _sequence_classifier_validator(self, **kwargs):
- model = next(iter(self.models.values()))
- return model._sequence_classifier_validator(**kwargs)
-
-
- @_requires_fit
- def save(self, path: Union[str, pathlib.Path, IO]):
- """Serializes and saves a fitted HMM classifier.
-
- :param path: Location to save the serialized classifier.
-
- :note: This method requires a trained classifier — see :func:`fit`.
-
- See Also
- --------
- load:
- Loads and deserializes a fitted HMM classifier.
- """
- # Fetch main parameters and fitted values
- state = {
- 'params': self.get_params(),
- 'models': self.models,
- 'fitted': {k:v for k, v in self.__dict__.items() if k.endswith('_')}
- }
-
- # Serialize model
- joblib.dump(state, path)
-
-
- @classmethod
- def load(cls, path: Union[str, pathlib.Path, IO]) -> HMMClassifier:
- """Loads and deserializes a fitted HMM classifier.
-
- :param path: Location to load the serialized classifier from.
-
- :return: Fitted HMM classifier.
-
- See Also
- --------
- save:
- Serializes and saves a fitted HMM classifier.
- """
- state = joblib.load(path)
-
- # Set main parameters
- model = cls(**state['params'])
- model.models = state['models']
-
- # Set fitted values
- for k, v in state['fitted'].items():
- setattr(model, k, v)
-
- # Return deserialized model
- return model
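The prediction rule above boils down to a log prior plus a log-likelihood per class, followed by a softmax. A worked sketch with made-up numbers, mirroring ``_compute_log_posterior`` and the normalization in ``predict_proba``: ::

    import numpy as np

    # Hypothetical per-class log-likelihoods for one sequence
    log_likelihoods = np.array([-1052.3, -1049.8, -1063.1])
    prior = np.array([0.5, 0.3, 0.2])

    # Log posterior, up to an additive constant
    scores = np.log(prior) + log_likelihoods

    # Numerically stable softmax, as in predict_proba
    proba = np.exp(scores - scores.max())
    proba /= proba.sum()  # approx. [0.120, 0.880, 0.000]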
diff --git a/lib/sequentia/models/hmm/topologies.py b/lib/sequentia/models/hmm/topologies.py
deleted file mode 100644
index 22e68ce2..00000000
--- a/lib/sequentia/models/hmm/topologies.py
+++ /dev/null
@@ -1,263 +0,0 @@
-import warnings
-
-import numpy as np
-
-class _Topology:
- """Represents a topology for a HMM, imposing restrictions on the transition matrix and initial state distribution.
-
- Parameters
- ----------
- n_states: int
- Number of states in the HMM.
-
- random_state: numpy.random.RandomState
- A random state object for reproducible randomness.
- """
-
- def __init__(self, n_states: int, random_state: np.random.RandomState):
- self.n_states = n_states
- self.random_state = random_state
-
- def uniform_start_probs(self) -> np.ndarray:
- """Sets the initial state distribution as a discrete uniform distribution.
-
- Returns
- -------
- initial: :class:`numpy:numpy.ndarray` (float)
- The initial state distribution of shape `(n_states,)`.
- """
- return np.ones(self.n_states) / self.n_states
-
- def random_start_probs(self) -> np.ndarray:
- """Sets the initial state distribution by randomly sampling probabilities generated by a Dirichlet distribution.
-
- Returns
- -------
- initial: :class:`numpy:numpy.ndarray` (float)
- The initial state distribution of shape `(n_states,)`.
- """
- return self.random_state.dirichlet(np.ones(self.n_states), size=1).flatten()
-
- def uniform_transitions(self) -> np.ndarray:
- """Sets the transition matrix as uniform (equal probability of transitioning
- to all other possible states from each state) corresponding to the topology.
-
- Returns
- -------
- transitions: :class:`numpy:numpy.ndarray` (float)
- The uniform transition matrix of shape `(n_states, n_states)`.
- """
- raise NotImplementedError
-
- def random_transitions(self) -> np.ndarray:
- """Sets the transition matrix as random (random probability of transitioning
- to all other possible states from each state) by sampling probabilities
- from a Dirichlet distribution - according to the topology.
-
- Returns
- -------
- transitions: :class:`numpy:numpy.ndarray` (float)
- The random transition matrix of shape `(n_states, n_states)`.
- """
- raise NotImplementedError
-
- def check_start_probs(self, initial: np.ndarray) -> np.ndarray:
- """Validates an initial state distribution according to the topology's restrictions.
-
- Parameters
- ----------
- initial: numpy.ndarray (float)
- The initial state distribution to validate.
- """
- if not isinstance(initial, np.ndarray):
- raise TypeError('Initial state distribution must be a numpy.ndarray')
- if not initial.shape == (self.n_states,):
- raise ValueError('Initial state distribution must be of shape (n_states,)')
- if not np.isclose(initial.sum(), 1):
- raise ValueError('Initial state distribution must sum to one')
- return initial
-
- def check_transitions(self, transitions: np.ndarray) -> np.ndarray:
- """Validates a transition matrix according to the topology's restrictions.
-
- Parameters
- ----------
- transitions: numpy.ndarray (float)
- The transition matrix to validate.
- """
- if not isinstance(transitions, np.ndarray):
- raise TypeError('Transition matrix must be a numpy.ndarray')
- if not transitions.shape == (self.n_states, self.n_states):
- raise ValueError('Transition matrix must be of shape (n_states, n_states)')
- if not np.allclose(transitions.sum(axis=1), np.ones(self.n_states)):
- raise ValueError('Transition probabilities out of each state must sum to one')
- return transitions
-
-class _ErgodicTopology(_Topology):
- """Represents the topology for an ergodic HMM, imposing non-zero probabilities in the transition matrix.
-
- Parameters
- ----------
- n_states: int
- Number of states in the HMM.
-
- random_state: numpy.random.RandomState
- A random state object for reproducible randomness.
- """
-
- name = "ergodic"
-
- def uniform_transitions(self) -> np.ndarray:
- """Sets the transition matrix as uniform (equal probability of transitioning
- to all other possible states from each state) corresponding to the topology.
-
- Returns
- -------
- transitions: :class:`numpy:numpy.ndarray` (float)
- The uniform transition matrix of shape `(n_states, n_states)`.
- """
- return np.ones((self.n_states, self.n_states)) / self.n_states
-
- def random_transitions(self) -> np.ndarray:
- """Sets the transition matrix as random (random probability of transitioning
- to all other possible states from each state) by sampling probabilities
- from a Dirichlet distribution - according to the topology.
-
- Returns
- -------
- transitions: :class:`numpy:numpy.ndarray` (float)
- The random transition matrix of shape `(n_states, n_states)`.
- """
- return self.random_state.dirichlet(np.ones(self.n_states), size=self.n_states)
-
- def check_transitions(self, transitions: np.ndarray) -> np.ndarray:
- """Validates a transition matrix according to the topology's restrictions.
-
- Parameters
- ----------
- transitions: numpy.ndarray (float)
- The transition matrix to validate.
- """
- super().check_transitions(transitions)
- if not np.all(transitions > 0):
- warnings.warn('Zero probabilities in ergodic transition matrix - these transition probabilities will not be learned')
- return transitions
-
-class _LeftRightTopology(_Topology):
- """Represents the topology for a left-right HMM, imposing an upper-triangular transition matrix.
-
- Parameters
- ----------
- n_states: int
- Number of states in the HMM.
-
- random_state: numpy.random.RandomState
- A random state object for reproducible randomness.
- """
-
- name = "left-right"
-
- def uniform_transitions(self) -> np.ndarray:
- """Sets the transition matrix as uniform (equal probability of transitioning
- to all other possible states from each state) corresponding to the topology.
-
- Returns
- -------
- transitions: :class:`numpy:numpy.ndarray` (float)
- The uniform transition matrix of shape `(n_states, n_states)`.
- """
- upper_ones = np.triu(np.ones((self.n_states, self.n_states)))
- upper_divisors = np.triu(np.tile(np.arange(self.n_states, 0, -1), (self.n_states, 1)).T)
- lower_ones = np.tril(np.ones(self.n_states), k=-1)
- return upper_ones / (upper_divisors + lower_ones)
-
- def random_transitions(self) -> np.ndarray:
- """Sets the transition matrix as random (random probability of transitioning
- to all other possible states from each state) by sampling probabilities
- from a Dirichlet distribution, according to the topology.
-
- Returns
- -------
- transitions: :class:`numpy:numpy.ndarray` (float)
- The random transition matrix of shape `(n_states, n_states)`.
- """
- transitions = np.zeros((self.n_states, self.n_states))
- for i, row in enumerate(transitions):
- row[i:] = self.random_state.dirichlet(np.ones(self.n_states - i))
- return transitions
-
- def check_transitions(self, transitions: np.ndarray) -> np.ndarray:
- """Validates a transition matrix according to the topology's restrictions.
-
- Parameters
- ----------
- transitions: numpy.ndarray (float)
- The transition matrix to validate.
- """
- super().check_transitions(transitions)
- if not np.allclose(transitions, np.triu(transitions)):
- raise ValueError('Left-right transition matrix must be upper-triangular')
- return transitions
-
-class _LinearTopology(_LeftRightTopology):
- """Represents the topology for a linear HMM.
-
- Parameters
- ----------
- n_states: int
- Number of states in the HMM.
-
- random_state: numpy.random.RandomState
- A random state object for reproducible randomness.
- """
-
- name = "linear"
-
- def uniform_transitions(self) -> np.ndarray:
- """Sets the transition matrix as uniform (equal probability of transitioning
- to all other possible states from each state) corresponding to the topology.
-
- Returns
- -------
- transitions: :class:`numpy:numpy.ndarray` (float)
- The uniform transition matrix of shape `(n_states, n_states)`.
- """
- transitions = np.zeros((self.n_states, self.n_states))
- for i, row in enumerate(transitions):
- size = min(2, self.n_states - i)
- row[i:(i + size)] = np.ones(size) / size
- return transitions
-
- def random_transitions(self) -> np.ndarray:
- """Sets the transition matrix as random (random probability of transitioning
- to all other possible states from each state) by sampling probabilities
- from a Dirichlet distribution, according to the topology.
-
- Returns
- -------
- transitions: :class:`numpy:numpy.ndarray` (float)
- The random transition matrix of shape `(n_states, n_states)`.
- """
- transitions = np.zeros((self.n_states, self.n_states))
- for i, row in enumerate(transitions):
- size = min(2, self.n_states - i)
- row[i:(i + size)] = self.random_state.dirichlet(np.ones(size))
- return transitions
-
- def check_transitions(self, transitions: np.ndarray) -> np.ndarray:
- """Validates a transition matrix according to the topology's restrictions.
-
- Parameters
- ----------
- transitions: numpy.ndarray (float)
- The transition matrix to validate.
- """
- super().check_transitions(transitions)
- if not np.allclose(transitions, np.diag(np.diag(transitions)) + np.diag(np.diag(transitions, k=1), k=1)):
- raise ValueError('Linear transition matrix must only consist of a diagonal and upper diagonal')
- return transitions
-
-_topologies = {
- topology.name:topology
- for topology in (_ErgodicTopology, _LeftRightTopology, _LinearTopology)
-}
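To make the three topologies concrete, here are the uniform transition matrices they produce for ``n_states = 3``, computed directly from the definitions above: ::

    import numpy as np

    n = 3

    # Ergodic: every transition permitted
    ergodic = np.ones((n, n)) / n

    # Left-right: upper-triangular; row i spreads mass over states i..n-1
    left_right = np.triu(np.ones((n, n)))
    left_right /= left_right.sum(axis=1, keepdims=True)
    # [[1/3, 1/3, 1/3],
    #  [0,   1/2, 1/2],
    #  [0,   0,   1  ]]

    # Linear: only self-transitions and steps to the next state
    linear = np.zeros((n, n))
    for i in range(n):
        size = min(2, n - i)
        linear[i, i:i + size] = 1 / size
    # [[1/2, 1/2, 0  ],
    #  [0,   1/2, 1/2],
    #  [0,   0,   1  ]]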
diff --git a/lib/sequentia/models/hmm/variants/__init__.py b/lib/sequentia/models/hmm/variants/__init__.py
deleted file mode 100644
index a21ec7cb..00000000
--- a/lib/sequentia/models/hmm/variants/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .gaussian_mixture import *
-from .categorical import *
diff --git a/lib/sequentia/models/hmm/variants/base.py b/lib/sequentia/models/hmm/variants/base.py
deleted file mode 100644
index 194c48a3..00000000
--- a/lib/sequentia/models/hmm/variants/base.py
+++ /dev/null
@@ -1,291 +0,0 @@
-from __future__ import annotations
-
-import re
-import warnings
-from copy import deepcopy
-from types import SimpleNamespace
-from typing import Optional, Union, Dict, Any, Literal
-from pydantic import NonNegativeInt, PositiveInt, validator
-
-import numpy as np
-from sklearn.base import BaseEstimator
-from sklearn.utils import check_random_state
-
-from sequentia.utils.decorators import _requires_fit
-from sequentia.utils.validation import Array, _Validator
-from sequentia.models.hmm.topologies import _ErgodicTopology
-
-_defaults = SimpleNamespace(
- n_states=5,
- topology="left-right",
- random_state=None,
- hmmlearn_kwargs=dict(
- init_params="st",
- params="st",
- ),
-)
-
-class _HMM(BaseEstimator):
- _base_sequence_validator = None
- _single_sequence_validator = None
- _sequence_classifier_validator = None
- _defaults = _defaults
- _unsettable_hmmlearn_kwargs = ["random_state", "init_params", "params"]
-
- def __init__(
- self,
- n_states: PositiveInt,
- topology: Optional[str],
- random_state: Optional[Union[NonNegativeInt, np.random.RandomState]],
- hmmlearn_kwargs: Dict[str, Any]
- ) -> _HMM:
- if type(self) == _HMM:
- raise NotImplementedError(
- f'Abstract class {type(self).__name__} cannot be instantiated - '
- 'use the subclassing HMMs defined in the sequentia.models.hmm module'
- )
-
- #: Number of states in the Markov chain.
- self.n_states = n_states
- #: Transition topology of the Markov chain — see :ref:`topologies`.
- self.topology = topology
- #: Seed or :class:`numpy:numpy.random.RandomState` object for reproducible pseudo-randomness.
- self.random_state = random_state
- #: Additional key-word arguments provided to the `hmmlearn `__ HMM constructor.
- self.hmmlearn_kwargs = deepcopy(hmmlearn_kwargs)
- #: Underlying HMM object from `hmmlearn `__ — only set after :func:`fit`.
- self.model = None
-
- self._skip_init_params = set()
- self._skip_params = set()
-
- def fit(
- self,
- X: Array,
- lengths: Optional[Array[int]] = None
- ) -> _HMM:
- raise NotImplementedError
-
- @_requires_fit
- def score(
- self,
- x: Array,
- ) -> float:
- data = self._single_sequence_validator(sequence=x)
- return self._score(data.sequence)
-
- @_requires_fit
- def n_params(self) -> NonNegativeInt:
- """Retrieves the number of trainable parameters.
-
- :note: This method requires a trained model — see :func:`fit`.
-
- :return: Number of trainable parameters.
- """
- n_params = 0
- if 's' not in self._skip_params:
- n_params += self.model.startprob_.size
- if 't' not in self._skip_params:
- n_params += self.model.transmat_.size
- return n_params
-
- @_requires_fit
- def bic(
- self,
- X: Array,
- lengths: Optional[Array[int]] = None
- ) -> float:
- data = self._base_sequence_validator(X=X, lengths=lengths)
- max_log_likelihood = self.model.score(data.X, data.lengths)
- n_params = self.n_params()
- n_seqs = len(data.lengths)
- return n_params * np.log(n_seqs) - 2 * max_log_likelihood
-
- @_requires_fit
- def aic(
- self,
- X: Array,
- lengths: Optional[Array[int]] = None
- ) -> float:
- data = self._base_sequence_validator(X=X, lengths=lengths)
- max_log_likelihood = self.model.score(data.X, data.lengths)
- n_params = self.n_params()
- return 2 * n_params - 2 * max_log_likelihood
-
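For reference, ``bic`` and ``aic`` implement the standard definitions, with ``k`` the number of trainable parameters, ``n`` the number of sequences, and the maximum log-likelihood returned directly by ``model.score``: ::

    \mathrm{BIC} = k \ln n - 2 \ln \hat{L}, \qquad
    \mathrm{AIC} = 2k - 2 \ln \hat{L}

Since ``model.score`` already returns a log-likelihood, it enters these formulas directly rather than through a second ``np.log``.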
- def set_start_probs(
- self,
- values: Union[Array, Literal["uniform", "random"]] = 'random'
- ):
- """Sets the initial state probabilities.
-
- If this method is **not** called, initial state probabilities are initialized depending on the value of ``topology`` provided to :func:`__init__`.
-
- - If ``topology`` was set to ``'ergodic'``, ``'left-right'`` or ``'linear'``, then random probabilities will be assigned according to the topology by calling :func:`set_start_probs` with ``values='random'``.
- - If ``topology`` was set to ``None``, then initial state probabilities will be initialized by `hmmlearn `__.
-
- :param values: Probabilities or probability type to assign as initial state probabilities.
-
- - If an ``Array``, should be a vector of starting probabilities for each state.
- - If ``'uniform'``, there is an equal probability of starting in any state.
- - If ``'random'``, the vector of initial state probabilities is sampled
- from a Dirichlet distribution with unit concentration parameters.
-
- :note: If used, this method should normally be called before :func:`fit`.
- """
- error = ValueError("Invalid start probabilities - expected: 'uniform', 'random' or an array of probabilities")
- if isinstance(values, str):
- if values in ('uniform', 'random'):
- self._startprob = values
- self._skip_init_params |= set('s')
- else:
- raise error
- else:
- try:
- self._startprob = np.array(values)
- self._skip_init_params |= set('s')
- except Exception as e:
- raise error from e
-
- def set_transitions(
- self,
- values: Union[Array, Literal["uniform", "random"]] = 'random'
- ):
- """Sets the transition probability matrix.
-
- If this method is **not** called, transition probabilities are initialized depending on the value of ``topology`` provided to :func:`__init__`:
-
- - If ``topology`` was set to ``'ergodic'``, ``'left-right'`` or ``'linear'``, then random probabilities will be assigned according to the topology by calling :func:`set_transitions` with ``values='random'``.
- - If ``topology`` was set to ``None``, then transition probabilities will be initialized by `hmmlearn `__.
-
- :param values: Probabilities or probability type to assign as state transition probabilities.
-
- - If an ``Array``, should be a matrix of probabilities where each row must sum to one
- and represents the probabilities of transitioning out of a state.
- - If ``'uniform'``, for each state there is an equal probability of transitioning to any state permitted by the topology.
- - If ``'random'``, the vector of transition probabilities for each row is sampled from a
- Dirichlet distribution with unit concentration parameters, according to the shape of the topology.
-
- :note: If used, this method should normally be called before :func:`fit`.
- """
- error = ValueError("Invalid transition matrix - expected: 'uniform', 'random' or an array of probabilities")
- if isinstance(values, str):
- if values in ('uniform', 'random'):
- self._transmat = values
- self._skip_init_params |= set('t')
- else:
- raise error
- else:
- try:
- self._transmat = np.array(values)
- self._skip_init_params |= set('t')
- except Exception as e:
- raise error from e
-
- def freeze(
- self,
- params: str,
- ):
- self._skip_params |= set(self._modify_params(params))
-
- def unfreeze(
- self,
- params: str,
- ):
- self._skip_params -= set(self._modify_params(params))
-
- def _modify_params(self, params):
- defaults = deepcopy(self._defaults.hmmlearn_kwargs["params"])
- error_msg = f"Expected a string consisting of any combination of {defaults}"
- if isinstance(params, str):
- if bool(re.compile(fr'[^{defaults}]').search(params)):
- raise ValueError(error_msg)
- else:
- raise TypeError(error_msg)
- return params
-
- def _check_init_params(self):
- topology = self.topology_ or _ErgodicTopology(self.n_states, check_random_state(self.random_state))
-
- if 's' in self._skip_init_params:
- if isinstance(self._startprob, str):
- if self._startprob == 'uniform':
- self._startprob = topology.uniform_start_probs()
- elif self._startprob == 'random':
- self._startprob = topology.random_start_probs()
- elif isinstance(self._startprob, np.ndarray):
- self._startprob = topology.check_start_probs(self._startprob)
- else:
- if self.topology_ is not None:
- self.set_start_probs(topology.random_start_probs())
-
- if 't' in self._skip_init_params:
- if isinstance(self._transmat, str):
- if self._transmat == 'uniform':
- self._transmat = topology.uniform_transitions()
- elif self._transmat == 'random':
- self._transmat = topology.random_transitions()
- elif isinstance(self._transmat, np.ndarray):
- self._transmat = topology.check_transitions(self._transmat)
- else:
- if self.topology_ is not None:
- self.set_transitions(topology.random_transitions())
-
- def _score(self, x: Array) -> float:
- return self.model.score(x)
-
-class _HMMValidator(_Validator):
- n_states: PositiveInt = _defaults.n_states
- topology: Optional[Literal["ergodic", "left-right", "linear"]] = _defaults.topology
- random_state: Optional[Union[NonNegativeInt, np.random.RandomState]] = _defaults.random_state
- hmmlearn_kwargs: Dict[str, Any] = deepcopy(_defaults.hmmlearn_kwargs)
-
- _class = _HMM
-
- @validator('random_state')
- def check_random_state(cls, value):
- return check_random_state(value)
-
- @validator('hmmlearn_kwargs')
- def check_hmmlearn_kwargs(cls, value):
- params = deepcopy(value)
-
- defaults = deepcopy(cls._class._defaults.hmmlearn_kwargs["params"])
- setter_methods = [f"{func}()" for func in dir(cls._class) if func.startswith("set") and func != "set_params"]
-
- for param in value.keys():
- if param in cls._class._unsettable_hmmlearn_kwargs:
- if param == 'init_params':
- if set(params[param]) != set(defaults):
- params[param] = defaults
- warnings.warn(
- f"The `init_params` hmmlearn argument cannot be overridden manually - defaulting to all parameters '{defaults}'. "
- f'Use the following methods to initialize model parameters: {", ".join(setter_methods)}.'
- )
- elif param == 'params':
- if set(params[param]) != set(defaults):
- params[param] = defaults
- warnings.warn(
- f"The `params` hmmlearn argument cannot be overridden manually - defaulting to all parameters '{defaults}'. "
- 'Use the freeze() and unfreeze() methods to specify the learnable model parameters.'
- )
- else:
- del params[param]
- warnings.warn(
- f'The `{param}` hmmlearn argument cannot be overridden manually - use the {cls._class.__name__} constructor to specify this argument.'
- )
-
- if 'init_params' not in params:
- params['init_params'] = defaults
- warnings.warn(
- f"No initializable parameters set in hmmlearn `init_params` argument - defaulting to '{defaults}'. "
- f'If you intend to manually initialize all parameters, use the following methods: {", ".join(setter_methods)}.'
- )
-
- if 'params' not in params:
- params['params'] = defaults
- warnings.warn(
- f"No learnable parameters set in hmmlearn `params` argument - defaulting to '{defaults}'. "
- 'If you intend to make no parameters learnable, use the freeze() method.'
- )
-
- return params
\ No newline at end of file
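A minimal sketch of the initialization workflow these methods support, using :class:`.CategoricalHMM` (defined below); the specific values are illustrative only. ::

    import numpy as np
    from sequentia.models.hmm import CategoricalHMM

    model = CategoricalHMM(n_states=3, topology='linear', random_state=0)

    # Manually initialize start and transition probabilities...
    model.set_start_probs(np.array([1.0, 0.0, 0.0]))
    model.set_transitions('uniform')

    # ...and prevent them from being updated during Baum-Welch
    model.freeze('st')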
diff --git a/lib/sequentia/models/hmm/variants/categorical.py b/lib/sequentia/models/hmm/variants/categorical.py
deleted file mode 100644
index e8145bd2..00000000
--- a/lib/sequentia/models/hmm/variants/categorical.py
+++ /dev/null
@@ -1,269 +0,0 @@
-from __future__ import annotations
-
-from copy import deepcopy
-from types import SimpleNamespace
-from typing import Optional, Union, Dict, Any, Literal
-from pydantic import NonNegativeInt, PositiveInt
-
-import numpy as np
-import hmmlearn.hmm
-from sklearn.utils import check_random_state
-
-from sequentia.models.hmm.topologies import _topologies
-from sequentia.models.hmm.variants.base import _HMM, _HMMValidator
-from sequentia.utils.decorators import _validate_params, _requires_fit
-from sequentia.utils.validation import (
- Array,
- _BaseUnivariateCategoricalSequenceValidator,
- _SingleUnivariateCategoricalSequenceValidator,
- _UnivariateCategoricalSequenceClassifierValidator,
-)
-
-__all__ = ['CategoricalHMM']
-
-_defaults = SimpleNamespace(
- **{
- **_HMM._defaults.__dict__,
- "hmmlearn_kwargs": dict(
- init_params="ste",
- params="ste",
- )
- }
-)
-
-class CategoricalHMM(_HMM):
- """A hidden Markov model with univariate categorical emissions.
-
- Examples
- --------
- Using a :class:`.CategoricalHMM` to learn how to recognize DNA sequences from the synthetase gene family.
-
- See :func:`.load_gene_families` for more information on the sample dataset used in this example.
-
- ::
-
- import numpy as np
- from sequentia.datasets import load_gene_families
- from sequentia.models.hmm import CategoricalHMM
-
- # Seed for reproducible pseudo-randomness
- random_state = np.random.RandomState(1)
-
- # Fetch DNA sequences for the synthetase gene family (no. 4)
- data, enc = load_gene_families(families=[4])
- train_data, test_data = data.split(test_size=0.2, random_state=random_state)
-
- # Create and train a CategoricalHMM to recognize the synthetase DNA sequences
- model = CategoricalHMM(random_state=random_state)
- X_train, lengths_train = train_data.X_lengths
- model.fit(X_train, lengths_train)
-
- # Calculate the log-likelihood of the first test sample being generated by this model
- x, y = test_data[0]
- model.score(x)
- """
-
- _base_sequence_validator = _BaseUnivariateCategoricalSequenceValidator
- _single_sequence_validator = _SingleUnivariateCategoricalSequenceValidator
- _sequence_classifier_validator = _UnivariateCategoricalSequenceClassifierValidator
- _defaults = _defaults
-
- def __init__(
- self,
- *,
- n_states: PositiveInt = _defaults.n_states,
- topology: Optional[Literal["ergodic", "left-right", "linear"]] = _defaults.topology,
- random_state: Optional[Union[NonNegativeInt, np.random.RandomState]] = _defaults.random_state,
- hmmlearn_kwargs: Dict[str, Any] = deepcopy(_defaults.hmmlearn_kwargs)
- ) -> CategoricalHMM:
- """Initializes the :class:`.CategoricalHMM`.
-
- :param n_states: Number of states in the Markov chain.
- :param topology: Transition topology of the Markov chain — see :ref:`topologies`.
-
- - If ``None``, behaves the same as ``'ergodic'`` but with `hmmlearn `__ initialization.
-
- :param random_state: Seed or :class:`numpy:numpy.random.RandomState` object for reproducible pseudo-randomness.
- :param hmmlearn_kwargs: Additional key-word arguments provided to the `hmmlearn `__ HMM constructor.
- """
- super().__init__(n_states, topology, random_state, hmmlearn_kwargs)
-
- def fit(
- self,
- X: Array[int],
- lengths: Optional[Array[int]] = None
- ) -> CategoricalHMM:
- """Fits the HMM to the sequences in ``X``, using the Baum—Welch algorithm.
-
- :param X: Univariate observation sequence(s).
-
- - Should be a single 1D array.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :return: The fitted HMM.
- """
- data = self._base_sequence_validator(X=X, lengths=lengths)
- self.random_state_ = check_random_state(self.random_state)
- if self.topology is None:
- self.topology_ = None
- else:
- self.topology_ = _topologies[self.topology](self.n_states, self.random_state_)
- self._check_init_params()
-
- kwargs = deepcopy(self.hmmlearn_kwargs)
- kwargs['init_params'] = ''.join(set(kwargs['init_params']) - self._skip_init_params)
- kwargs['params'] = ''.join(set(kwargs['params']) - self._skip_params)
-
- self.model = hmmlearn.hmm.CategoricalHMM(
- n_components=self.n_states,
- random_state=self.random_state_,
- **kwargs
- )
-
- for attr in ('startprob', 'transmat', 'emissionprob'):
- if hasattr(self, f'_{attr}'):
- setattr(self.model, f'{attr}_', getattr(self, f'_{attr}'))
-
- self.model.fit(data.X, lengths=data.lengths)
- self.n_seqs_ = len(data.lengths)
-
- return self
-
- @_requires_fit
- def score(
- self,
- x: Array[int],
- ) -> float:
- """Calculates the log-likelihood of the HMM generating a single observation sequence.
-
- :param x: Univariate observation sequence.
-
- - Should be a single 1D array.
-
- :note: This method requires a trained model — see :func:`fit`.
-
- :return: The log-likelihood.
- """
- return super().score(x)
-
- @_requires_fit
- def n_params(self) -> NonNegativeInt:
- n_params = super().n_params()
- if 'e' not in self._skip_params:
- n_params += self.model.emissionprob_.size
- return n_params
-
- @_requires_fit
- def bic(
- self,
- X: Array[int],
- lengths: Optional[Array[int]] = None
- ) -> float:
- """The Bayesian information criterion of the model, evaluated with the maximum likelihood of ``X``.
-
- :param X: Univariate observation sequence(s).
-
- - Should be a single 1D array.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :note: This method requires a trained model — see :func:`fit`.
-
- :return: The Bayesian information criterion.
- """
- return super().bic(X, lengths)
-
- @_requires_fit
- def aic(
- self,
- X: Array[int],
- lengths: Optional[Array[int]] = None
- ) -> float:
- """The Akaike information criterion of the model, evaluated with the maximum likelihood of ``X``.
-
- :param X: Univariate observation sequence(s).
-
- - Should be a single 1D array.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :note: This method requires a trained model — see :func:`fit`.
-
- :return: The Akaike information criterion.
- """
- return super().aic(X, lengths)
-
- def set_state_emissions(self, values: Array[float]):
- """Sets the state emission distribution of the HMM's emission model.
-
-        If this method is **not** called, emission probabilities will be initialized by `hmmlearn <https://hmmlearn.readthedocs.io/en/latest/>`__.
-
- :param values: Array of emission probabilities.
-
- :note: If used, this method should normally be called before :func:`fit`.
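-
-        For example, a hypothetical model with two states and three symbols could use: ::
-
-            import numpy as np
-
-            # Hypothetical emission matrix of shape (n_states, n_symbols)
-            model.set_state_emissions(
-                np.array([
-                    [0.2, 0.3, 0.5],
-                    [0.4, 0.4, 0.2],
-                ])
-            )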
- """
- self._emissionprob = Array[float].validate_type(values)
- self._skip_init_params |= set('e')
-
- def freeze(
- self,
- params: str = deepcopy(_defaults.hmmlearn_kwargs["params"]),
- ):
- """Freezes the trainable parameters of the HMM, preventing them from being updated during the Baum—Welch algorithm.
-
- :param params: A string specifying which parameters to freeze. Can contain a combination of:
-
-            - ``'s'`` for initial state probabilities,
-            - ``'t'`` for transition probabilities,
-            - ``'e'`` for emission probabilities.
-
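-            For example, ``model.freeze('st')`` would freeze only the initial state and transition
-            probabilities, leaving the emission probabilities trainable.
-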
- :note: If used, this method should normally be called before :func:`fit`.
-
- See Also
- --------
- unfreeze:
- Unfreezes the trainable parameters of the HMM, allowing them to be updated during the Baum—Welch algorithm.
- """
- super().freeze(params)
-
- def unfreeze(
- self,
- params: str = deepcopy(_defaults.hmmlearn_kwargs["params"]),
- ):
- """Unfreezes the trainable parameters of the HMM, allowing them to be updated during the Baum—Welch algorithm.
-
- :param params: A string specifying which parameters to unfreeze. Can contain a combination of:
-
-            - ``'s'`` for initial state probabilities,
-            - ``'t'`` for transition probabilities,
-            - ``'e'`` for emission probabilities.
-
- See Also
- --------
- freeze:
- Freezes the trainable parameters of the HMM, preventing them from being updated during the Baum—Welch algorithm.
- """
- super().unfreeze(params)
-
-class _CategoricalHMMValidator(_HMMValidator):
- hmmlearn_kwargs: Dict[str, Any] = deepcopy(_defaults.hmmlearn_kwargs)
-
- _class = CategoricalHMM
-
-CategoricalHMM.__init__ = _validate_params(using=_CategoricalHMMValidator)(CategoricalHMM.__init__)
diff --git a/lib/sequentia/models/hmm/variants/gaussian_mixture.py b/lib/sequentia/models/hmm/variants/gaussian_mixture.py
deleted file mode 100644
index d60b6e0d..00000000
--- a/lib/sequentia/models/hmm/variants/gaussian_mixture.py
+++ /dev/null
@@ -1,321 +0,0 @@
-from __future__ import annotations
-
-from copy import deepcopy
-from types import SimpleNamespace
-from typing import Optional, Union, Dict, Any, Literal
-from pydantic import NonNegativeInt, PositiveInt
-
-import numpy as np
-import hmmlearn.hmm
-from sklearn.utils import check_random_state
-
-from sequentia.models.hmm.topologies import _topologies
-from sequentia.models.hmm.variants.base import _HMM, _HMMValidator
-from sequentia.utils.decorators import _validate_params, _requires_fit
-from sequentia.utils.validation import (
- Array,
- _BaseMultivariateFloatSequenceValidator,
- _SingleMultivariateFloatSequenceValidator,
- _MultivariateFloatSequenceClassifierValidator,
-)
-
-__all__ = ['GaussianMixtureHMM']
-
-_defaults = SimpleNamespace(
- **{
- **_HMM._defaults.__dict__,
- "n_components": 3,
- "covariance_type": "spherical",
- "hmmlearn_kwargs": dict(
- init_params="stmcw",
- params="stmcw",
- )
- }
-)
-
-
-class GaussianMixtureHMM(_HMM):
- """A hidden Markov model with multivariate Gaussian mixture emissions.
-
- Examples
- --------
- Using a :class:`.GaussianMixtureHMM` to learn how to recognize spoken samples of the digit 3.
-
- See :func:`.load_digits` for more information on the sample dataset used in this example.
-
- ::
-
- import numpy as np
- from sequentia.datasets import load_digits
- from sequentia.models.hmm import GaussianMixtureHMM
-
- # Seed for reproducible pseudo-randomness
- random_state = np.random.RandomState(1)
-
- # Fetch MFCCs of spoken samples for the digit 3
- data = load_digits(digits=[3])
- train_data, test_data = data.split(test_size=0.2, random_state=random_state)
-
- # Create and train a GaussianMixtureHMM to recognize the digit 3
- model = GaussianMixtureHMM(random_state=random_state)
- X_train, lengths_train = train_data.X_lengths
- model.fit(X_train, lengths_train)
-
- # Calculate the log-likelihood of the first test sample being generated by this model
- x, y = test_data[0]
- model.score(x)
- """
-
- _base_sequence_validator = _BaseMultivariateFloatSequenceValidator
- _single_sequence_validator = _SingleMultivariateFloatSequenceValidator
- _sequence_classifier_validator = _MultivariateFloatSequenceClassifierValidator
- _defaults = _defaults
- _unsettable_hmmlearn_kwargs = _HMM._unsettable_hmmlearn_kwargs + ["n_components", "n_mix", "covariance_type"]
-
- def __init__(
- self,
- *,
- n_states: PositiveInt = _defaults.n_states,
- n_components: PositiveInt = _defaults.n_components,
- covariance_type: Literal["spherical", "diag", "full", "tied"] = _defaults.covariance_type,
- topology: Optional[Literal["ergodic", "left-right", "linear"]] = _defaults.topology,
- random_state: Optional[Union[NonNegativeInt, np.random.RandomState]] = _defaults.random_state,
- hmmlearn_kwargs: Dict[str, Any] = deepcopy(_defaults.hmmlearn_kwargs)
- ) -> GaussianMixtureHMM:
- """Initializes the :class:`.GaussianMixtureHMM`.
-
- :param n_states: Number of states in the Markov chain.
- :param n_components: Number of Gaussian components in the mixture emission distribution for each state.
-        :param covariance_type: Type of covariance matrix in the mixture emission distribution for each state — see :ref:`covariance_types`.
- :param topology: Transition topology of the Markov chain — see :ref:`topologies`.
-
-            - If ``None``, behaves the same as ``'ergodic'`` but with `hmmlearn <https://hmmlearn.readthedocs.io/en/latest/>`__ initialization.
-
- :param random_state: Seed or :class:`numpy:numpy.random.RandomState` object for reproducible pseudo-randomness.
-        :param hmmlearn_kwargs: Additional keyword arguments provided to the `hmmlearn <https://hmmlearn.readthedocs.io/en/latest/>`__ HMM constructor.
- """
- super().__init__(n_states, topology, random_state, hmmlearn_kwargs)
- #: Number of Gaussian components in the emission model mixture distribution for each state.
- self.n_components = n_components
- #: Type of covariance matrix in the emission model mixture distribution for each state.
- self.covariance_type = covariance_type
-
- def fit(
- self,
- X: Array[float],
- lengths: Optional[Array[int]] = None
- ) -> GaussianMixtureHMM:
- """Fits the HMM to the sequences in ``X``, using the Baum—Welch algorithm.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :return: The fitted HMM.
- """
- data = self._base_sequence_validator(X=X, lengths=lengths)
- self.random_state_ = check_random_state(self.random_state)
- if self.topology is None:
- self.topology_ = None
- else:
- self.topology_ = _topologies[self.topology](self.n_states, self.random_state_)
- self._check_init_params()
-
- kwargs = deepcopy(self.hmmlearn_kwargs)
- kwargs['init_params'] = ''.join(set(kwargs['init_params']) - self._skip_init_params)
- kwargs['params'] = ''.join(set(kwargs['params']) - self._skip_params)
-
- self.model = hmmlearn.hmm.GMMHMM(
- n_components=self.n_states,
- n_mix=self.n_components,
- covariance_type=self.covariance_type,
- random_state=self.random_state_,
- **kwargs
- )
-
- for attr in ('startprob', 'transmat', 'means', 'covars', 'weights'):
- if hasattr(self, f'_{attr}'):
- setattr(self.model, f'{attr}_', getattr(self, f'_{attr}'))
-
- self.model.fit(data.X, lengths=data.lengths)
- self.n_seqs_ = len(data.lengths)
-
- return self
-
- @_requires_fit
- def score(
- self,
- x: Array[float],
- ) -> float:
- """Calculates the log-likelihood of the HMM generating a single observation sequence.
-
- :param x: Univariate or multivariate observation sequence.
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
-
- :note: This method requires a trained model — see :func:`fit`.
-
- :return: The log-likelihood.
- """
- return super().score(x)
-
- @_requires_fit
-    def n_params(self) -> NonNegativeInt:
-        """Returns the number of trainable parameters in the model."""
-        n_params = super().n_params()
- if 'm' not in self._skip_params:
- n_params += self.model.means_.size
- if 'c' not in self._skip_params:
- n_params += self.model.covars_.size
- if 'w' not in self._skip_params:
- n_params += self.model.weights_.size
- return n_params
-
- @_requires_fit
- def bic(
- self,
- X: Array[float],
- lengths: Optional[Array[int]] = None
- ) -> float:
- """The Bayesian information criterion of the model, evaluated with the maximum likelihood of ``X``.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :note: This method requires a trained model — see :func:`fit`.
-
- :return: The Bayesian information criterion.
- """
- return super().bic(X, lengths)
-
- @_requires_fit
- def aic(
- self,
- X: Array[float],
- lengths: Optional[Array[int]] = None
- ) -> float:
- """The Akaike information criterion of the model, evaluated with the maximum likelihood of ``X``.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :note: This method requires a trained model — see :func:`fit`.
-
- :return: The Akaike information criterion.
- """
- return super().aic(X, lengths)
-
- def set_state_means(self, values: Array[float]):
- """Sets the mean vectors of the state emission distributions.
-
-        If this method is **not** called, mean vectors will be initialized by `hmmlearn <https://hmmlearn.readthedocs.io/en/latest/>`__.
-
- :param values: Array of emission distribution mean values.
-
- :note: If used, this method should normally be called before :func:`fit`.
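-
-        For example, assuming a mean array of shape ``(n_states, n_components, n_features)``: ::
-
-            import numpy as np
-
-            # Hypothetical means for a model with 2 states, 3 components and 2 features
-            model.set_state_means(np.zeros((2, 3, 2)))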
- """
- self._means = Array[float].validate_type(values)
- self._skip_init_params |= set('m')
-
- def set_state_covariances(self, values: Array[float]):
- """Sets the covariance matrices of the state emission distributions.
-
-        If this method is **not** called, covariance matrices will be initialized by `hmmlearn <https://hmmlearn.readthedocs.io/en/latest/>`__.
-
- :param values: Array of emission distribution covariance values.
-
- :note: If used, this method should normally be called before :func:`fit`.
- """
- self._covars = Array[float].validate_type(values)
- self._skip_init_params |= set('c')
-
- def set_state_weights(self, values: Array[float]):
- """Sets the component mixture weights of the state emission distributions.
-
-        If this method is **not** called, component mixture weights will be initialized by `hmmlearn <https://hmmlearn.readthedocs.io/en/latest/>`__.
-
- :param values: Array of emission distribution component mixture weights.
-
- :note: If used, this method should normally be called before :func:`fit`.
- """
- self._weights = Array[float].validate_type(values)
- self._skip_init_params |= set('w')
-
- def freeze(
- self,
- params: str = deepcopy(_defaults.hmmlearn_kwargs["params"]),
- ):
- """Freezes the trainable parameters of the HMM, preventing them from being updated during the Baum—Welch algorithm.
-
- :param params: A string specifying which parameters to freeze. Can contain a combination of:
-
- - ``'s'`` for initial state probabilities,
- - ``'t'`` for transition probabilities,
- - ``'m'`` for emission distribution means,
- - ``'c'`` for emission distribution covariances,
- - ``'w'`` for emission distribution mixture weights.
-
- :note: If used, this method should normally be called before :func:`fit`.
-
- See Also
- --------
- unfreeze:
- Unfreezes the trainable parameters of the HMM, allowing them to be updated during the Baum—Welch algorithm.
- """
- super().freeze(params)
-
- def unfreeze(
- self,
- params: str = deepcopy(_defaults.hmmlearn_kwargs["params"]),
- ):
- """Unfreezes the trainable parameters of the HMM, allowing them to be updated during the Baum—Welch algorithm.
-
- :param params: A string specifying which parameters to unfreeze. Can contain a combination of:
-
- - ``'s'`` for initial state probabilities,
- - ``'t'`` for transition probabilities,
- - ``'m'`` for emission distribution means,
- - ``'c'`` for emission distribution covariances,
- - ``'w'`` for emission distribution mixture weights.
-
- See Also
- --------
- freeze:
-            Freezes the trainable parameters of the HMM, preventing them from being updated during the Baum—Welch algorithm.
- """
- super().unfreeze(params)
-
-class _GaussianMixtureHMMValidator(_HMMValidator):
- n_components: PositiveInt = _defaults.n_components
- covariance_type: Literal["spherical", "diag", "full", "tied"] = _defaults.covariance_type
- hmmlearn_kwargs: Dict[str, Any] = deepcopy(_defaults.hmmlearn_kwargs)
-
- _class = GaussianMixtureHMM
-
-GaussianMixtureHMM.__init__ = _validate_params(using=_GaussianMixtureHMMValidator)(GaussianMixtureHMM.__init__)
diff --git a/lib/sequentia/models/knn/__init__.py b/lib/sequentia/models/knn/__init__.py
deleted file mode 100644
index 308d8e4c..00000000
--- a/lib/sequentia/models/knn/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .classifier import *
-from .regressor import *
diff --git a/lib/sequentia/models/knn/base.py b/lib/sequentia/models/knn/base.py
deleted file mode 100644
index 79400e4a..00000000
--- a/lib/sequentia/models/knn/base.py
+++ /dev/null
@@ -1,459 +0,0 @@
-from __future__ import annotations
-
-import types
-import joblib
-import marshal
-import warnings
-import pathlib
-from types import SimpleNamespace
-from typing import Optional, Union, Callable, Tuple, List, Any, IO
-from joblib import Parallel, delayed
-
-import numpy as np
-from pydantic import NegativeInt, NonNegativeInt, PositiveInt, confloat, validator
-from dtaidistance import dtw, dtw_ndim
-from sklearn.utils import check_random_state
-
-from sequentia.utils.data import SequentialDataset
-from sequentia.utils.multiprocessing import _effective_n_jobs
-from sequentia.utils.decorators import (
- _validate_params,
- _override_params,
- _requires_fit,
- _check_plotting_dependencies,
-)
-from sequentia.utils.validation import (
- Array,
- _Validator,
- _BaseMultivariateFloatSequenceValidator,
- _SingleUnivariateFloatSequenceValidator,
- _SingleMultivariateFloatSequenceValidator,
-)
-
-dtw_cc = None
-try:
- from dtaidistance import dtw_cc
-except ImportError:
- pass
-
-_defaults = SimpleNamespace(
- k=1,
- weighting=None,
- window=1,
- independent=False,
- use_c=False,
- n_jobs=1,
- random_state=None,
-)
-
-
-class _KNNValidator(_Validator):
- k: PositiveInt = _defaults.k
- weighting: Optional[Callable] = _defaults.weighting
- window: confloat(ge=0, le=1) = _defaults.window
- independent: bool = _defaults.independent
- use_c: bool = _defaults.use_c
- n_jobs: Union[NegativeInt, PositiveInt] = _defaults.n_jobs
- random_state: Optional[Union[NonNegativeInt, np.random.RandomState]] = _defaults.random_state
-
-
- @validator('use_c')
- def check_use_c(cls, value):
- use_c = value
- if use_c and dtw_cc is None:
- warnings.warn('DTAIDistance C library not available - using Python implementation', ImportWarning)
- use_c = False
- return use_c
-
-
- @validator('random_state')
- def check_random_state(cls, value):
- return check_random_state(value)
-
-
-class _KNNMixin:
- _defaults = _defaults
-
-
- @_requires_fit
- @_override_params(['k', 'window', 'independent'])
- @_validate_params(using=_KNNValidator)
- def query_neighbors(
- self,
- X: Array[float],
- lengths: Optional[Array[int]] = None,
- sort: bool = True,
- **kwargs,
- ) -> Tuple[
- Array[int],
- Array[float],
- Array
- ]:
- """Queries the k-nearest training observation sequences to each sequence in ``X``.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :param sort: Whether to sort the neighbors in order of nearest to furthest.
-
- :param \*\*kwargs: Model parameters to temporarily override (*for experimentation purposes*).
-
- - ``k``: See :func:`__init__`.
- - ``window``: See :func:`__init__`.
- - ``independent``: See :func:`__init__`.
-
- :return: K-nearest neighbors for each sequence in ``X``.
-
- - Indices of the k-nearest training sequences.
- - DTW distances of the k-nearest training sequences.
- - Corresponding outputs of the k-nearest training sequences.
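-
-        For example (a minimal sketch, assuming a fitted estimator ``clf``): ::
-
-            idxs, distances, outputs = clf.query_neighbors(X, lengths, k=3)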
- """
- distances = self.compute_distance_matrix(X, lengths)
- partition_by = range(self.k) if sort else self.k
- k_idxs = np.argpartition(distances, partition_by, axis=1)[:, :self.k]
- k_distances = np.take_along_axis(distances, k_idxs, axis=1)
- k_outputs = self.y_[k_idxs]
- return k_idxs, k_distances, k_outputs
-
-
- @_requires_fit
- @_override_params(['window', 'independent'])
- @_validate_params(using=_KNNValidator)
- def compute_distance_matrix(
- self,
- X: Array[float],
- lengths: Optional[Array[int]] = None,
- **kwargs
- ) -> Array[float]:
- """Calculates a matrix of DTW distances between the sequences in ``X`` and the training sequences.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :param \*\*kwargs: Model parameters to temporarily override (*for experimentation purposes*).
-
- - ``window``: See :func:`__init__`.
- - ``independent``: See :func:`__init__`.
-
- :note: This method requires a trained classifier — see :func:`fit`.
-
- :return: DTW distance matrix.
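-
-        Each row of the returned matrix corresponds to a sequence in ``X``, and each column to a training sequence.
-        For example (assuming a fitted estimator ``clf``): ::
-
-            distances = clf.compute_distance_matrix(X, lengths)
-            # distances.shape == (n_sequences, n_train_sequences)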
- """
- data = _BaseMultivariateFloatSequenceValidator(X=X, lengths=lengths)
-
- n_jobs = _effective_n_jobs(self.n_jobs, data.lengths)
- dtw_ = self._dtw()
-
- row_chunk_idxs = np.array_split(SequentialDataset._get_idxs(data.lengths), n_jobs)
- col_chunk_idxs = np.array_split(self.idxs_, n_jobs)
-
- return np.vstack(
- Parallel(n_jobs=n_jobs, max_nbytes=None)(
- delayed(self._distance_matrix_row_chunk)(
- row_idxs, col_chunk_idxs, data.X, n_jobs, dtw_
- ) for row_idxs in row_chunk_idxs
- )
- )
-
-
- @_override_params(['window', 'independent'])
- @_validate_params(using=_KNNValidator)
- def dtw(self, A: Array[float], B: Array[float], **kwargs) -> float:
- """Calculates the DTW distance between two univariate or multivariate sequences.
-
- :param A: The first sequence.
-
- :param B: The second sequence.
-
- :param \*\*kwargs: Model parameters to temporarily override (*for experimentation purposes*).
-
- - ``window``: See :func:`__init__`.
- - ``independent``: See :func:`__init__`.
-
- :return: DTW distance.
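-
-        For example, comparing two short univariate sequences (a minimal sketch, with purely illustrative values): ::
-
-            import numpy as np
-
-            distance = model.dtw(np.array([1., 2., 3.]), np.array([1., 3.]))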
- """
- A = _SingleMultivariateFloatSequenceValidator(sequence=A).sequence
- B = _SingleMultivariateFloatSequenceValidator(sequence=B).sequence
- return self._dtw(A, B)
-
-
- @_check_plotting_dependencies
- @_override_params(['window'])
- @_validate_params(using=_KNNValidator)
- def plot_warping_path_1d(
- self,
- a: Array[float],
- b: Array[float],
- **kwargs
- ) -> 'matplotlib.axes.Axes':
- """Calculates the DTW matrix between two sequences and plots the optimal warping path.
-
- :param a: The first sequence.
-
- :param b: The second sequence.
-
- :note: Only supports univariate sequences.
-
- :param \*\*kwargs: Model parameters to temporarily override (*for experimentation purposes*).
-
- - ``window``: See :func:`__init__`.
-
- :return: Plot axes.
- """
- from dtaidistance import dtw_visualisation
-
- a = _SingleUnivariateFloatSequenceValidator(sequence=a).sequence
- b = _SingleUnivariateFloatSequenceValidator(sequence=b).sequence
-
- window = self._window(a, b)
- _, paths = dtw.warping_paths(a, b, window=window)
- best_path = dtw.best_path(paths)
-
- return dtw_visualisation.plot_warpingpaths(a, b, paths, best_path)
-
-
- @_check_plotting_dependencies
- @_requires_fit
- @_override_params(['window', 'independent'])
- @_validate_params(using=_KNNValidator)
- def plot_dtw_histogram(
- self,
- X: Array[float],
- lengths: Optional[Array[int]] = None,
- ax: Optional['matplotlib.axes.Axes'] = None,
- **kwargs
- ) -> 'matplotlib.axes.Axes':
- """Calculates DTW distances between ``X`` and training sequences, and plots a distance histogram.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :param ax: Plot axes. If ``None``, new axes are created.
-
- :param \*\*kwargs: Model parameters to temporarily override (*for experimentation purposes*).
-
- - ``window``: See :func:`__init__`.
- - ``independent``: See :func:`__init__`.
-
- :note: This method requires a trained classifier — see :func:`fit`.
-
- :return: Plot axes.
- """
- import matplotlib.pyplot as plt
-
- distances = self.compute_distance_matrix(X, lengths)
-
- if ax is None:
- _, ax = plt.subplots()
- ax.hist(distances.flatten())
- return ax
-
-
- @_check_plotting_dependencies
- @_requires_fit
- @_override_params(['weighting', 'window', 'independent'])
- @_validate_params(using=_KNNValidator)
- def plot_weight_histogram(
- self,
- X: Array[float],
- lengths: Optional[Array[int]] = None,
- ax: Optional['matplotlib.axes.Axes'] = None,
- **kwargs
- ) -> 'matplotlib.axes.Axes':
- """Calculates DTW weights between ``X`` and training sequences, and plots a weight histogram.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :param ax: Plot axes. If ``None``, new axes are created.
-
- :param \*\*kwargs: Model parameters to temporarily override (*for experimentation purposes*).
-
- - ``weighting``: See :func:`__init__`.
- - ``window``: See :func:`__init__`.
- - ``independent``: See :func:`__init__`.
-
- :note: This method requires a trained classifier — see :func:`fit`.
-
- :return: Plot axes.
- """
- import matplotlib.pyplot as plt
-
- distances = self.compute_distance_matrix(X, lengths)
- weightings = self._weighting()(distances)
-
- if ax is None:
- _, ax = plt.subplots()
- ax.hist(weightings.flatten())
- return ax
-
-
- def _dtw1d(self, a: Array[float], b: Array[float], window: int) -> float:
- """Computes the DTW distance between two univariate sequences."""
- return dtw.distance(a, b, use_c=self.use_c, window=window)
-
-
- def _window(self, A: Array[float], B: Array[float]) -> int:
-        """Computes the size of the Sakoe—Chiba band global constraint, in frames, from the fractional ``window`` and the length of the shorter sequence."""
- return int(self.window * min(len(A), len(B)))
-
-
- def _dtwi(self, A: Array[float], B: Array[float]) -> float:
- """Computes the multivariate DTW distance as the sum of the pairwise per-feature DTW distances,
- allowing each feature to be warped independently."""
- window = self._window(A, B)
- return np.sum([self._dtw1d(A[:, i], B[:, i], window) for i in range(A.shape[1])])
-
-
- def _dtwd(self, A: Array[float], B: Array[float]) -> float:
- """Computes the multivariate DTW distance so that the warping of the features depends on each other,
- by modifying the local distance measure."""
- window = self._window(A, B)
- return dtw_ndim.distance(A, B, use_c=self.use_c, window=window)
-
-
- def _dtw(self) -> Callable:
-        """Returns the dependent or independent multivariate DTW distance function, according to ``independent``."""
- return self._dtwi if self.independent else self._dtwd
-
-
- def _weighting(self) -> Callable:
-        """Returns the distance weighting callable, or a uniform weighting of 1 if no weighting was specified."""
- return self.weighting if callable(self.weighting) else lambda x: np.ones_like(x)
-
-
- def _distance_matrix_row_chunk(
- self,
- row_idxs: Array[int],
- col_chunk_idxs: List[Array[int]],
- X: Array[float],
- n_jobs: int,
- dist: Callable
- ) -> Array[float]:
- """Calculates a distance sub-matrix for a subset of rows over all columns.
- """
- return np.hstack(
- Parallel(n_jobs=n_jobs, max_nbytes=None)(
- delayed(self._distance_matrix_row_col_chunk)(
- col_idxs, row_idxs, X, dist
- ) for col_idxs in col_chunk_idxs
- )
- )
-
-
- def _distance_matrix_row_col_chunk(
- self,
- col_idxs: Array[int],
- row_idxs: Array[int],
- X: Array[float],
- dist: Callable
- ) -> Array[float]:
- """Calculates a distance sub-matrix for a subset of rows and columns.
- """
- distances = np.zeros((len(row_idxs), len(col_idxs)))
- for i, x_row in enumerate(SequentialDataset._iter_X(X, row_idxs)):
- for j, x_col in enumerate(SequentialDataset._iter_X(self.X_, col_idxs)):
- distances[i, j] = dist(x_row, x_col)
- return distances
-
-
- @_requires_fit
- def save(self, path: Union[str, pathlib.Path, IO]):
- """Serializes and saves a fitted KNN estimator.
-
- :param path: Location to save the serialized estimator.
-
- :note: This method requires a trained classifier — see :func:`fit`.
-
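-        For example (with a hypothetical file path): ::
-
-            clf.save('knn.model')
-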
- See Also
- --------
- load:
- Loads and deserializes a fitted KNN estimator.
- """
- # Fetch main parameters and fitted values
- state = {
- 'params': self.get_params(),
- 'fitted': {k:v for k, v in self.__dict__.items() if k.endswith('_')}
- }
-
- # Serialize weighting function
- if self.weighting is None:
- state['params']['weighting'] = self.weighting
- else:
- state['params']['weighting'] = marshal.dumps(
- (self.weighting.__code__, self.weighting.__name__)
- )
-
- # Serialize model
- joblib.dump(state, path)
-
-
- @classmethod
- def load(cls, path: Union[str, pathlib.Path, IO]):
- """Loads and deserializes a fitted KNN estimator.
-
- :param path: Location to load the serialized estimator from.
-
- :return: Fitted KNN estimator.
-
- See Also
- --------
- save:
- Serializes and saves a fitted KNN estimator.
- """
- state = joblib.load(path)
-
- # Deserialize weighting function
- if state['params']['weighting'] is not None:
- weighting, name = marshal.loads(state['params']['weighting'])
- state['params']['weighting'] = types.FunctionType(weighting, globals(), name)
-
- # Set main parameters
- model = cls(**state['params'])
-
- # Set fitted values
- for k, v in state['fitted'].items():
- setattr(model, k, v)
-
- # Return deserialized model
- return model
diff --git a/lib/sequentia/models/knn/classifier.py b/lib/sequentia/models/knn/classifier.py
deleted file mode 100644
index 1adafc85..00000000
--- a/lib/sequentia/models/knn/classifier.py
+++ /dev/null
@@ -1,395 +0,0 @@
-from __future__ import annotations
-
-from types import SimpleNamespace
-from typing import Optional, Union, Callable
-from joblib import Parallel, delayed
-
-import numpy as np
-from pydantic import NegativeInt, NonNegativeInt, PositiveInt, confloat
-from numba import njit, prange
-from sklearn.utils import check_random_state
-
-from sequentia.models.knn.base import _KNNMixin, _KNNValidator
-from sequentia.models.base import _Classifier
-
-from sequentia.utils.decorators import _validate_params, _requires_fit, _override_params
-from sequentia.utils.data import SequentialDataset
-from sequentia.utils.multiprocessing import _effective_n_jobs
-from sequentia.utils.validation import (
- _check_classes,
- Array,
- _MultivariateFloatSequenceClassifierValidator
-)
-
-__all__ = ['KNNClassifier']
-
-_defaults = SimpleNamespace(
- **{
- **_KNNMixin._defaults.__dict__,
- "classes": None,
- }
-)
-
-
-class _KNNClassifierValidator(_KNNValidator):
- classes: Optional[Array[int]] = _defaults.classes
-
-
-class KNNClassifier(_KNNMixin, _Classifier):
- """A k-nearest neighbor classifier that uses DTW as a distance measure for sequence comparison.
-
- The classifier computes the score for each class as the total of the distance weightings of every sequence belonging to that class,
- within the DTW k-neighborhood of the sequence being classified.
-
- Examples
- --------
- Using a :class:`.KNNClassifier` to classify spoken digits. ::
-
- import numpy as np
- from sequentia.datasets import load_digits
- from sequentia.models.knn import KNNClassifier
-
- # Seed for reproducible pseudo-randomness
- random_state = np.random.RandomState(1)
-
- # Fetch MFCCs of spoken digits
- data = load_digits()
- train_data, test_data = data.split(test_size=0.2, random_state=random_state)
-
-        # Create a KNNClassifier
- clf = KNNClassifier()
-
- # Fit the classifier
- X_train, y_train, lengths_train = train_data.X_y_lengths
- clf.fit(X_train, y_train, lengths_train)
-
- # Predict classes for the test observation sequences
- X_test, lengths_test = test_data.X_lengths
- y_pred = clf.predict(X_test, lengths_test)
- """
-
- _defaults = _defaults
-
-
- @_validate_params(using=_KNNClassifierValidator)
- def __init__(
- self,
- *,
- k: PositiveInt = _defaults.k,
- weighting: Optional[Callable] = _defaults.weighting,
- window: confloat(ge=0, le=1) = _defaults.window,
- independent: bool = _defaults.independent,
-        classes: Optional[Array[int]] = _defaults.classes,
- use_c: bool = _defaults.use_c,
- n_jobs: Union[NegativeInt, PositiveInt] = _defaults.n_jobs,
- random_state: Optional[Union[NonNegativeInt, np.random.RandomState]] = _defaults.random_state
- ) -> KNNClassifier:
- """Initializes the :class:`.KNNClassifier`.
-
- :param k: Number of neighbors.
-
- :param weighting: A callable that specifies how distance weighting should be performed.
- The callable should accept a :class:`numpy:numpy.ndarray` of DTW distances, apply an element-wise weighting transformation
- to the matrix of DTW distances, then return an equally-sized :class:`numpy:numpy.ndarray` of weightings.
- If ``None``, then a uniform weighting of 1 will be applied to all distances.
-
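-            For example, a hypothetical exponential decay weighting could be given as: ::
-
-                import numpy as np
-
-                weighting = lambda distances: np.exp(-distances)
-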
-        :param window: The size of the Sakoe—Chiba band global constraint as a fraction of the length of the shortest of the two sequences being compared.
-
- - A larger window will give more freedom to the DTW alignment, allowing more deviation but leading to potentially slower computation.
- A window of 1 is equivalent to full DTW computation with no global constraint applied.
- - A smaller window will restrict the DTW alignment, and possibly speed up the DTW computation.
- A window of 0 is equivalent to Euclidean distance.
-
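-            For example, with ``window=0.2``, comparing two sequences of lengths 100 and 120 would
-            constrain the DTW alignment to a band of width ``int(0.2 * 100) = 20`` frames.
-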
- :param independent: Whether or not to allow features to be warped independently from each other. See [#dtw_multi]_ for an overview of independent and dependent dynamic time warping.
-
- :param classes: Set of possible class labels.
-
- - If not provided, these will be determined from the training data labels.
- - If provided, output from methods such as :func:`predict_proba` and :func:`predict_scores`
- will follow the ordering of the class labels provided here.
-
-        :param use_c: Whether or not to use fast pure C compiled functions from `dtaidistance <https://github.com/wannesm/dtaidistance>`__ to perform the DTW computations.
-
- :param n_jobs: Maximum number of concurrently running workers.
-
- - If 1, no parallelism is used at all (useful for debugging).
- - If -1, all CPUs are used.
- - If < -1, ``(n_cpus + 1 + n_jobs)`` are used — e.g. ``n_jobs=-2`` uses all but one.
-
- :param random_state: Seed or :class:`numpy:numpy.random.RandomState` object for reproducible pseudo-randomness.
- """
- #: Number of neighbors.
- self.k = k
- #: A callable that specifies how distance weighting should be performed.
- self.weighting = weighting
-        #: The size of the Sakoe—Chiba band global constraint as a fraction of the length of the shortest of the two sequences being compared.
- self.window = window
- #: Whether or not to allow features to be warped independently from each other.
- self.independent = independent
- #: Set of possible class labels.
- self.classes = classes
-        #: Whether or not to use fast pure C compiled functions from `dtaidistance <https://github.com/wannesm/dtaidistance>`__ to perform the DTW computations.
- self.use_c = use_c
- #: Maximum number of concurrently running workers.
- self.n_jobs = n_jobs
- #: Seed or :class:`numpy:numpy.random.RandomState` object for reproducible pseudo-randomness.
- self.random_state = random_state
-
-
- def fit(
- self,
- X: Array[float],
- y: Array[int],
- lengths: Optional[Array[int]] = None
- ) -> KNNClassifier:
- """Fits the classifier to the sequence(s) in ``X``.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param y: Classes corresponding to sequence(s) provided in ``X``.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :return: The fitted classifier.
- """
- data = _MultivariateFloatSequenceClassifierValidator(X=X, y=y, lengths=lengths)
- self.X_ = data.X
- self.y_ = data.y
- self.lengths_ = data.lengths
- self.idxs_ = SequentialDataset._get_idxs(data.lengths)
- self.random_state_ = check_random_state(self.random_state)
- self.classes_ = _check_classes(data.y, self.classes)
- return self
-
-
- @_requires_fit
- def predict(
- self,
- X: Array[float],
- lengths: Optional[Array[int]] = None
- ) -> Array[int]:
- """Predicts classes for the sequence(s) in ``X``.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :note: This method requires a trained classifier — see :func:`fit`.
-
- :return: Class predictions.
- """
- class_scores = self.predict_scores(X, lengths)
- return self._find_max_labels(class_scores)
-
-
- def fit_predict(
- self,
- X: Array[float],
- y: Array[int],
- lengths: Optional[Array[int]] = None
- ) -> Array[int]:
- """Fits the classifier to the sequence(s) in ``X`` and predicts classes for ``X``.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param y: Classes corresponding to sequence(s) provided in ``X``.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :return: Class predictions.
- """
- return super().fit_predict(X, y, lengths)
-
-
- @_requires_fit
- def predict_proba(
- self,
- X: Array[float],
- lengths: Optional[Array[int]] = None
- ) -> Array[float]:
- """Predicts class probabilities for the sequence(s) in ``X``.
-
- Probabilities are calculated as normalized class scores.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :note: This method requires a trained classifier — see :func:`fit`.
-
- :return: Class membership probabilities.
- """
- class_scores = self.predict_scores(X, lengths)
- return class_scores / class_scores.sum(axis=1, keepdims=True)
-
-
- @_requires_fit
- def predict_scores(
- self,
- X: Array[float],
- lengths: Optional[Array[int]] = None
- ) -> Array[float]:
- """Predicts class scores for the sequence(s) in ``X``.
-
-        The score for each class is the sum of the distance weightings of the training sequences in the k-neighborhood that belong to that class.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :note: This method requires a trained classifier — see :func:`fit`.
-
- :return: Class scores.
- """
- _, k_distances, k_labels = self.query_neighbors(X, lengths, sort=False)
- k_weightings = self._weighting()(k_distances)
- return self._compute_scores(k_labels, k_weightings)
-
-
- @_requires_fit
- def score(
- self,
-        X: Array[float],
- y: Array[int],
-        lengths: Optional[Array[int]] = None,
- normalize: bool = True,
- sample_weight: Optional[Array] = None,
- ) -> float:
- """Calculates accuracy for the sequence(s) in ``X``.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param y: Classes corresponding to the observation sequence(s) in ``X``.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :param normalize: See :func:`sklearn:sklearn.metrics.accuracy_score`.
-
- :param sample_weight: See :func:`sklearn:sklearn.metrics.accuracy_score`.
-
- :note: This method requires a trained classifier — see :func:`fit`.
-
- :return: Classification accuracy.
- """
- return super().score(X, y, lengths, normalize, sample_weight)
-
-
- @_validate_params(using=_KNNValidator)
- @_override_params(_KNNValidator.fields(), temporary=False)
- def set_params(self, **kwargs):
- return self
-
-
- def _compute_scores(
- self,
- labels: Array[int],
- weightings: Array[float]
- ) -> Array[float]:
- """Calculates the sum of the weightings for each label group.
- """
- scores = np.zeros((len(labels), len(self.classes_)))
- for i, k in enumerate(self.classes_):
- scores[:, i] = np.einsum('ij,ij->i', labels == k, weightings)
- return scores
-
-
- def _find_max_labels(
- self,
- scores: Array[float]
- ) -> Array[int]:
-        """Returns the class label with the highest score for each item, with ties broken randomly.
- """
- n_jobs = _effective_n_jobs(self.n_jobs, scores)
- score_chunks = np.array_split(scores, n_jobs)
- return np.concatenate(
- Parallel(n_jobs=n_jobs, max_nbytes=None)(
- delayed(self._find_max_labels_chunk)(score_chunk)
- for score_chunk in score_chunks
- )
- )
-
-
- def _find_max_labels_chunk(
- self,
- score_chunk: Array[float]
- ) -> Array[int]:
- """Returns the label with the highest score for each item in the chunk.
- """
- max_labels = np.zeros(len(score_chunk), dtype=int)
- for i, scores in enumerate(score_chunk):
- max_score_idxs = self._multi_argmax(scores)
- max_labels[i] = self.random_state_.choice(self.classes_[max_score_idxs], size=1)
- return max_labels
-
-
- @staticmethod
- @njit
- def _multi_argmax(
- arr: Array
- ) -> Array[int]:
- """Same as numpy.argmax but returns all occurrences of the maximum and only requires a single pass.
- From: https://stackoverflow.com/a/58652335
- """
- all_, max_ = [0], arr[0]
- for i in prange(1, len(arr)):
- if arr[i] > max_:
- all_, max_ = [i], arr[i]
- elif arr[i] == max_:
- all_.append(i)
- return np.array(all_)
diff --git a/lib/sequentia/models/knn/regressor.py b/lib/sequentia/models/knn/regressor.py
deleted file mode 100644
index 92bbed9a..00000000
--- a/lib/sequentia/models/knn/regressor.py
+++ /dev/null
@@ -1,209 +0,0 @@
-from __future__ import annotations
-
-from typing import Optional, Union, Callable, Any
-
-import numpy as np
-from pydantic import NegativeInt, NonNegativeInt, PositiveInt, confloat
-from sklearn.utils import check_random_state
-
-from sequentia.models.knn.base import _KNNMixin, _KNNValidator, _defaults
-from sequentia.models.base import _Regressor
-
-from sequentia.utils.decorators import _validate_params, _requires_fit, _override_params
-from sequentia.utils.data import SequentialDataset
-from sequentia.utils.validation import (
- Array,
- _MultivariateFloatSequenceRegressorValidator
-)
-
-__all__ = ['KNNRegressor']
-
-
-class KNNRegressor(_KNNMixin, _Regressor):
- """A k-nearest neighbor regressor that uses DTW as a distance measure for sequence comparison.
-
- The regressor computes the output as a distance weighted average of the outputs of the sequences within the DTW k-neighborhood of the sequence being predicted.
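-
-    Examples
-    --------
-    Using a :class:`.KNNRegressor` to predict real-valued outputs for sequences
-    (a minimal sketch using randomly generated data, with purely illustrative values). ::
-
-        import numpy as np
-        from sequentia.models.knn import KNNRegressor
-
-        # Hypothetical data: two concatenated sequences with three features, and one output per sequence
-        X = np.random.rand(15, 3)
-        lengths = np.array([10, 5])
-        y = np.array([3.2, -1.5])
-
-        # Fit the regressor and predict outputs for the same sequences
-        model = KNNRegressor(k=1)
-        model.fit(X, y, lengths)
-        y_pred = model.predict(X, lengths)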
- """
-
- @_validate_params(using=_KNNValidator)
- def __init__(
- self,
- *,
- k: PositiveInt = _defaults.k,
- weighting: Optional[Callable] = _defaults.weighting,
- window: confloat(ge=0, le=1) = _defaults.window,
- independent: bool = _defaults.independent,
- use_c: bool = _defaults.use_c,
- n_jobs: Union[NegativeInt, PositiveInt] = _defaults.n_jobs,
- random_state: Optional[Union[NonNegativeInt, np.random.RandomState]] = _defaults.random_state
- ) -> KNNRegressor:
- """Initializes the :class:`.KNNRegressor`.
-
- :param k: Number of neighbors.
-
- :param weighting: A callable that specifies how distance weighting should be performed.
- The callable should accept a :class:`numpy:numpy.ndarray` of DTW distances, apply an element-wise weighting transformation
- to the matrix of DTW distances, then return an equally-sized :class:`numpy:numpy.ndarray` of weightings.
- If ``None``, then a uniform weighting of 1 will be applied to all distances.
-
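-            For example, a hypothetical inverse-distance weighting could be given as: ::
-
-                weighting = lambda distances: 1 / (distances + 1e-8)
-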
-        :param window: The size of the Sakoe—Chiba band global constraint as a fraction of the length of the shortest of the two sequences being compared.
-
- - A larger window will give more freedom to the DTW alignment, allowing more deviation but leading to potentially slower computation.
- A window of 1 is equivalent to full DTW computation with no global constraint applied.
- - A smaller window will restrict the DTW alignment, and possibly speed up the DTW computation.
- A window of 0 is equivalent to Euclidean distance.
-
- :param independent: Whether or not to allow features to be warped independently from each other. See [#dtw_multi]_ for an overview of independent and dependent dynamic time warping.
-
-        :param use_c: Whether or not to use fast pure C compiled functions from `dtaidistance <https://github.com/wannesm/dtaidistance>`__ to perform the DTW computations.
-
- :param n_jobs: Maximum number of concurrently running workers.
-
- - If 1, no parallelism is used at all (useful for debugging).
- - If -1, all CPUs are used.
- - If < -1, ``(n_cpus + 1 + n_jobs)`` are used — e.g. ``n_jobs=-2`` uses all but one.
-
- :param random_state: Seed or :class:`numpy:numpy.random.RandomState` object for reproducible pseudo-randomness.
- """
- #: Number of neighbors.
- self.k = k
- #: A callable that specifies how distance weighting should be performed.
- self.weighting = weighting
-        #: The size of the Sakoe—Chiba band global constraint as a fraction of the length of the shortest of the two sequences being compared.
- self.window = window
- #: Whether or not to allow features to be warped independently from each other.
- self.independent = independent
-        #: Whether or not to use fast pure C compiled functions from `dtaidistance <https://github.com/wannesm/dtaidistance>`__ to perform the DTW computations.
- self.use_c = use_c
- #: Maximum number of concurrently running workers.
- self.n_jobs = n_jobs
- #: Seed or :class:`numpy:numpy.random.RandomState` object for reproducible pseudo-randomness.
- self.random_state = random_state
-
-
- def fit(
- self,
- X: Array[float],
- y: Array[float],
- lengths: Optional[Array[int]] = None
- ) -> KNNRegressor:
- """Fits the regressor to the sequence(s) in ``X``.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param y: Outputs corresponding to sequence(s) provided in ``X``.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :return: The fitted regressor.
- """
- data = _MultivariateFloatSequenceRegressorValidator(X=X, y=y, lengths=lengths)
- self.X_ = data.X
- self.y_ = data.y
- self.lengths_ = data.lengths
- self.idxs_ = SequentialDataset._get_idxs(data.lengths)
- self.random_state_ = check_random_state(self.random_state)
- return self
-
-
- @_requires_fit
- def predict(
- self,
- X: Array[float],
- lengths: Optional[Array[int]] = None
- ) -> Array[float]:
- """Predicts outputs for the sequence(s) in ``X``.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :note: This method requires a trained regressor — see :func:`fit`.
-
- :return: Output predictions.
- """
- _, k_distances, k_outputs = self.query_neighbors(X, lengths, sort=False)
- k_weightings = self._weighting()(k_distances)
- return (k_outputs * k_weightings).sum(axis=1) / k_weightings.sum(axis=1)
-
-
- def fit_predict(
- self,
- X: Array[float],
- y: Array[float],
- lengths: Optional[Array[int]] = None
- ) -> Array[float]:
- """Fits the regressor to the sequence(s) in ``X`` and predicts outputs for ``X``.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param y: Outputs corresponding to sequence(s) provided in ``X``.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :return: Output predictions.
- """
- return super().fit_predict(X, y, lengths)
-
-
- @_requires_fit
- def score(
- self,
- X: Array[float],
- y: Array[float],
- lengths: Optional[Array[int]] = None,
- sample_weight: Optional[Any] = None
- ) -> float:
- """Calculates the coefficient of determination (R\ :sup:`2`) for the sequence(s) in ``X``.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param y: Outputs corresponding to the observation sequence(s) in ``X``.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :param sample_weight: See :func:`sklearn:sklearn.metrics.r2_score`.
-
- :note: This method requires a trained regressor — see :func:`fit`.
-
- :return: Coefficient of determination.
- """
- return super().score(X, y, lengths, sample_weight)
-
-
- @_validate_params(using=_KNNValidator)
- @_override_params(['k', 'weighting', 'window', 'independent', 'use_c', 'n_jobs'], temporary=False)
- def set_params(self, **kwargs):
- return self
diff --git a/lib/sequentia/pipeline.py b/lib/sequentia/pipeline.py
deleted file mode 100644
index ca18d81d..00000000
--- a/lib/sequentia/pipeline.py
+++ /dev/null
@@ -1,594 +0,0 @@
-"""
-Pipeline is an adapted version of Pipeline from the sklearn.pipeline module,
-and largely relies on its source code.
-
-Below is the original license from Scikit-Learn, copied on 31st December 2022
-from https://github.com/scikit-learn/scikit-learn/blob/main/COPYING.
-
----
-
-BSD 3-Clause License
-
-Copyright (c) 2007-2022 The scikit-learn developers.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
-* Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
-* Neither the name of the copyright holder nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-"""
-
-from __future__ import annotations
-
-from typing import Optional, Any, Union, List, Tuple
-
-import sklearn.pipeline
-import sklearn.base
-from joblib import Memory
-
-from sklearn.pipeline import _final_estimator_has
-from sklearn.utils.metaestimators import available_if
-from sklearn.utils import _print_elapsed_time
-from sklearn.utils.validation import check_memory
-from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, clone
-
-from sequentia.preprocessing.base import Transform
-from sequentia.utils.validation import Array
-
-__all__ = ["Pipeline"]
-
-
-class Pipeline(sklearn.pipeline.Pipeline):
- """Pipeline of transforms with a final estimator.
-
- Sequentially apply a list of transforms and a final estimator.
- Intermediate steps of the pipeline must be 'transforms', that is, they
- must implement ``fit`` and ``transform`` methods.
- The final estimator only needs to implement ``fit``.
- The transformers in the pipeline can be cached using ``memory`` argument.
- The purpose of the pipeline is to assemble several steps that can be
- cross-validated together while setting different parameters. For this, it
- enables setting parameters of the various steps using their names and the
- parameter name separated by a ``__``. A step's estimator may be replaced
- entirely by setting the parameter with its name to another estimator,
- or a transformer removed by setting it to ``'passthrough'`` or ``None``.
-
- See Also
- --------
- :class:`sklearn.pipeline.Pipeline`:
- :class:`.Pipeline` is based on :class:`sklearn.pipeline.Pipeline`,
- but adapted to accept and work with sequences.
-
- Read more in the :ref:`User Guide `.
-
- Examples
- --------
- Creating a :class:`.Pipeline` consisting of two transforms and a :class:`.KNNClassifier`,
- and fitting it to sequences in the spoken digits dataset. ::
-
- from sequentia.models import KNNClassifier
- from sequentia.preprocessing import IndependentFunctionTransformer
- from sequentia.pipeline import Pipeline
- from sequentia.datasets import load_digits
-
- from sklearn.preprocessing import scale
- from sklearn.decomposition import PCA
-
- # Fetch MFCCs of spoken digits
- digits = load_digits()
- train, test = digits.split(test_size=0.2)
-
- # Create a pipeline with two transforms and a classifier
- pipeline = Pipeline([
- ('standardize', IndependentFunctionTransformer(scale)),
- ('pca', PCA(n_components=5)),
- ('clf', KNNClassifier(k=1))
- ])
-
-        # Fit the pipeline transforms and classifier to training data
-        pipeline.fit(train.X, train.y, train.lengths)
-
-        # Apply the transforms to the training sequences and make predictions
-        y_train_pred = pipeline.predict(train.X, train.lengths)
-
- # Calculate accuracy on test data
- acc = pipeline.score(test.X, test.y, test.lengths)
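-
-    Step parameters can also be tuned, or steps swapped out entirely, after
-    construction, using the ``step__parameter`` naming convention described
-    above via ``set_params`` (inherited from :class:`sklearn.pipeline.Pipeline`). ::
-
-        # Change the number of PCA components for the 'pca' step
-        pipeline.set_params(pca__n_components=10)
-
-        # Replace the final classifier entirely
-        pipeline.set_params(clf=KNNClassifier(k=3))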
- """
-
- def __init__(
- self,
- steps: List[Tuple[str, BaseEstimator]],
- *,
- memory: Optional[Union[str, Memory]] = None,
- verbose: bool = False
-    ) -> None:
- """Initializes the :class:`.Pipeline`.
-
- :param steps: Collection of transforms implementing ``fit``/``transform`` that are chained,
- with the last object being an estimator.
-
- :param memory: Used to cache the fitted transformers of the pipeline. By default,
- no caching is performed. If a string is given, it is the path to
- the caching directory. Enabling caching triggers a clone of
- the transformers before fitting. Therefore, the transformer
- instance given to the pipeline cannot be inspected
- directly. Use the attribute ``named_steps`` or ``steps`` to
- inspect estimators within the pipeline. Caching the
- transformers is advantageous when fitting is time consuming.
-
- :param verbose: If ``True``, the time elapsed while fitting each step will be printed as it
- is completed.
- """
- super().__init__(steps, memory=memory, verbose=verbose)
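-        # For example (sketch), caching can be enabled by passing a directory
-        # path or a joblib ``Memory`` object when constructing the pipeline:
-        #     Pipeline(steps, memory=Memory("/tmp/sequentia-cache", verbose=0))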
-
-
- def _can_transform(self):
- return self._final_estimator == "passthrough" or hasattr(
- self._final_estimator, "transform"
- )
-
-
- def _can_inverse_transform(self):
- return all(hasattr(t, "inverse_transform") for _, _, t in self._iter())
-
-
- def _fit(
- self,
- X: Array,
- lengths: Optional[Array] = None,
- **fit_params_steps
- ) -> Array:
- # shallow copy of steps - this should really be steps_
- self.steps = list(self.steps)
- self._validate_steps()
- # Setup the memory
- memory = check_memory(self.memory)
-
- fit_transform_one_cached = memory.cache(_fit_transform_one)
-
- for step_idx, name, transformer in self._iter(
- with_final=False, filter_passthrough=False
- ):
- if transformer is None or transformer == "passthrough":
- with _print_elapsed_time("Pipeline", self._log_message(step_idx)):
- continue
-
- if hasattr(memory, "location") and memory.location is None:
- # we do not clone when caching is disabled to
- # preserve backward compatibility
- cloned_transformer = transformer
- else:
- cloned_transformer = clone(transformer)
- # Fit or load from cache the current transformer
- X, fitted_transformer = fit_transform_one_cached(
- cloned_transformer,
- X,
- lengths,
- None,
- message_clsname="Pipeline",
- message=self._log_message(step_idx),
- **fit_params_steps[name],
- )
- # Replace the transformer of the step with the fitted
- # transformer. This is necessary when loading the transformer
- # from the cache.
- self.steps[step_idx] = (name, fitted_transformer)
- return X
-
-
- def fit(
- self,
- X: Array,
- y: Optional[Array] = None,
- lengths: Optional[Array] = None,
- **fit_params
- ) -> Pipeline:
- """Fit the model.
-
- Fit all the transformers one after the other and transform the
- data. Finally, fit the transformed data using the final estimator.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param y: Outputs corresponding to sequence(s) provided in ``X``.
- Only required if the final estimator is a supervised model.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :param fit_params: Parameters passed to the ``fit`` method of each step,
- where each parameter name is prefixed such that parameter ``p`` for step ``s`` has key ``s__p``.
-
- :return: The fitted pipeline.
- """
- fit_params_steps = self._check_fit_params(**fit_params)
- Xt = self._fit(X, lengths, **fit_params_steps)
-
- with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
- if self._final_estimator != "passthrough":
- fit_params_last_step = fit_params_steps[self.steps[-1][0]]
- if isinstance(self._final_estimator, ClassifierMixin):
- self._final_estimator.fit(Xt, y, lengths, **fit_params_last_step)
- else:
- self._final_estimator.fit(Xt, lengths, **fit_params_last_step)
- return self
-
-
- def fit_transform(
- self,
- X: Array,
- lengths: Optional[Array] = None,
- **fit_params
- ) -> Array:
- """Fit the model and transform with the final estimator.
-
-        Fits all the transformers one after the other and transforms the
-        data. Then uses `fit_transform` on the transformed data with the
-        final estimator.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :param fit_params: Parameters passed to the ``fit`` method of each step,
- where each parameter name is prefixed such that parameter ``p`` for step ``s`` has key ``s__p``.
-
- :return: The transformed data.
- """
- fit_params_steps = self._check_fit_params(**fit_params)
- Xt = self._fit(X, lengths, **fit_params_steps)
-
- last_step = self._final_estimator
- with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
- if last_step == "passthrough":
- return Xt
- fit_params_last_step = fit_params_steps[self.steps[-1][0]]
-
- if is_sequentia_transform := isinstance(last_step, Transform):
- fit_params_last_step["lengths"] = lengths
-
- if hasattr(last_step, "fit_transform"):
- return last_step.fit_transform(Xt, **fit_params_last_step)
- else:
- transform_params = {}
- if is_sequentia_transform:
- transform_params["lengths"] = lengths
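-                # fail fast if the final step has no ``transform`` method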
- getattr(last_step, "transform")
- return last_step.fit(Xt, **fit_params_last_step).transform(Xt, **transform_params)
-
-
- @available_if(_final_estimator_has("predict"))
- def predict(
- self,
- X: Array,
- lengths: Optional[Array] = None
- ) -> Array:
- """Transform the data, and apply `predict` with the final estimator.
-
-        Call `transform` of each transformer in the pipeline. The transformed
-        data are finally passed to the final estimator, which calls its
-        `predict` method. Only valid if the final estimator implements `predict`.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :return: Output predictions.
- """
- Xt = X
- for _, name, transform in self._iter(with_final=False):
- transform_params = {}
- if isinstance(transform, Transform):
- transform_params["lengths"] = lengths
- Xt = transform.transform(Xt, **transform_params)
- return self.steps[-1][1].predict(Xt, lengths)
-
-
- @available_if(_final_estimator_has("fit_predict"))
- def fit_predict(
- self,
- X: Array,
- y: Array,
- lengths: Optional[Array] = None,
- **fit_params
- ) -> Array:
- """Transform the data, and apply `fit_predict` with the final estimator.
-
-        Call `fit_transform` of each transformer in the pipeline. The
-        transformed data are finally passed to the final estimator, which
-        calls its `fit_predict` method. Only valid if the final estimator
-        implements `fit_predict`.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param y: Outputs corresponding to sequence(s) provided in ``X``.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :param fit_params: Parameters passed to the ``fit`` method of each step,
- where each parameter name is prefixed such that parameter ``p`` for step ``s`` has key ``s__p``.
-
- :return: Output predictions.
- """
- fit_params_steps = self._check_fit_params(**fit_params)
- Xt = self._fit(X, lengths, **fit_params_steps)
-
- fit_params_last_step = fit_params_steps[self.steps[-1][0]]
- with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
- y_pred = self.steps[-1][1].fit_predict(Xt, y, lengths, **fit_params_last_step)
- return y_pred
-
-
- @available_if(_final_estimator_has("predict_proba"))
- def predict_proba(
- self,
- X: Array,
- lengths: Optional[Array] = None
- ) -> Array:
- """Transform the data, and apply `predict_proba` with the final estimator.
-
-        Call `transform` of each transformer in the pipeline. The transformed
-        data are finally passed to the final estimator, which calls its
-        `predict_proba` method. Only valid if the final estimator implements
-        `predict_proba`.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :return: Output probabilities.
- """
- Xt = X
- for _, name, transform in self._iter(with_final=False):
- transform_params = {}
- if isinstance(transform, Transform):
- transform_params["lengths"] = lengths
- Xt = transform.transform(Xt, **transform_params)
- return self.steps[-1][1].predict_proba(Xt, lengths)
-
-
- @available_if(_can_transform)
- def transform(
- self,
- X: Array,
- lengths: Optional[Array] = None
- ) -> Array:
- """Transform the data, and apply `transform` with the final estimator.
-
-        Call `transform` of each transformer in the pipeline. The transformed
-        data are finally passed to the final estimator, which calls its
-        `transform` method. Only valid if the final estimator
-        implements `transform`.
-
-        This also works when the final estimator is ``'passthrough'`` or ``None``,
-        in which case all prior transformations are applied.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :return: The transformed data.
- """
- Xt = X
- for _, _, transform in self._iter():
- transform_params = {}
- if isinstance(transform, Transform):
- transform_params["lengths"] = lengths
- Xt = transform.transform(Xt, **transform_params)
- return Xt
-
-
- @available_if(_can_inverse_transform)
- def inverse_transform(
- self,
- X: Array,
- lengths: Optional[Array] = None
- ) -> Array:
- """Apply `inverse_transform` for each step in a reverse order.
-
- All estimators in the pipeline must support `inverse_transform`.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :return: The inverse transformed data.
- """
- reverse_iter = reversed(list(self._iter()))
- for _, _, transform in reverse_iter:
- transform_params = {}
- if isinstance(transform, Transform):
- transform_params["lengths"] = lengths
- X = transform.inverse_transform(X, **transform_params)
- return X
-
-
- @available_if(_final_estimator_has("score"))
- def score(
- self,
- X: Array,
- y: Optional[Array] = None,
- lengths: Optional[Array] = None,
- sample_weight: Optional[Any] = None
- ) -> float:
- """Transform the data, and apply `score` with the final estimator.
-
-        Call `transform` of each transformer in the pipeline. The transformed
-        data are finally passed to the final estimator, which calls its
-        `score` method. Only valid if the final estimator implements `score`.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param y: Outputs corresponding to sequence(s) provided in ``X``.
- Must be provided if the final estimator is a model, i.e. not a transform.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :param sample_weight: If not ``None``, this argument is passed as ``sample_weight``
- keyword argument to the ``score`` method of the final estimator.
-
- :return: Result of calling `score` on the final estimator.
- """
- Xt = X
- for _, name, transform in self._iter(with_final=False):
- transform_params = {}
- if isinstance(transform, Transform):
- transform_params["lengths"] = lengths
- Xt = transform.transform(Xt, **transform_params)
-
- score_params = {}
- if sample_weight is not None:
- score_params["sample_weight"] = sample_weight
-
- last_step = self.steps[-1][1]
- if isinstance(last_step, TransformerMixin):
- return last_step.score(Xt, lengths, **score_params)
- else:
- return last_step.score(Xt, y, lengths, **score_params)
-
-
-def _fit_transform_one(
- transformer,
- X,
- lengths,
- weight,
- message_clsname="",
- message=None,
- **fit_params
-):
- with _print_elapsed_time(message_clsname, message):
- if is_sequentia_transformer := isinstance(transformer, Transform):
- fit_params["lengths"] = lengths
- if hasattr(transformer, "fit_transform"):
- res = transformer.fit_transform(X, **fit_params)
- else:
- transform_params = {}
- if is_sequentia_transformer:
- transform_params["lengths"] = lengths
- res = transformer.fit(X, **fit_params).transform(X, **transform_params)
-
- if weight is None:
- return res, transformer
- return res * weight, transformer
-
-
-if __name__ == "__main__":
- import numpy as np
-
- from sequentia.models import KNNClassifier, HMMClassifier, GaussianMixtureHMM
- from sequentia.datasets import load_digits
- from sequentia.preprocessing import IndependentFunctionTransformer
- # try normal FunctionTransformer
-
- from sklearn.preprocessing import StandardScaler, scale
- from sklearn.decomposition import PCA
-
- random_state = np.random.RandomState(0)
-
- # digits = load_digits(digits=[0, 1])
- digits = load_digits()
- # subset, _ = digits.split(train_size=0.1, stratify=True, random_state=random_state)
- # train, test = subset.split(test_size=0.2)
- train, test = digits.split(test_size=0.2)
-
- pipeline = Pipeline([
- ('standardize', IndependentFunctionTransformer(scale)),
- ('pca', PCA(n_components=5, random_state=random_state)),
- ('clf', HMMClassifier(n_jobs=-1).add_models({
- k: GaussianMixtureHMM(random_state=random_state)
- for k in train.classes
- }))
- # ('clf', KNNClassifier(k=1, use_c=True, n_jobs=-1, random_state=random_state))
- ])
-
- # Xt = pipeline.fit_transform(*train.X_lengths)
-
- y_pred = pipeline.fit_predict(*train.X_y_lengths)
-
- breakpoint()
-
- # from scipy.signal import medfilt2d, convolve
- # ('median_filter', IndependentFunctionTransformer(medfilt2d, kw_args={"kernel_size": (5, 1)}))
diff --git a/lib/sequentia/preprocessing/__init__.py b/lib/sequentia/preprocessing/__init__.py
deleted file mode 100644
index 7986cdd6..00000000
--- a/lib/sequentia/preprocessing/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .transforms import *
diff --git a/lib/sequentia/preprocessing/base.py b/lib/sequentia/preprocessing/base.py
deleted file mode 100644
index e5ebe141..00000000
--- a/lib/sequentia/preprocessing/base.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from typing import Optional
-
-from sklearn.base import TransformerMixin, BaseEstimator
-
-from sequentia.utils.validation import Array
-
-__all__ = ['Transform']
-
-
-class Transform(TransformerMixin, BaseEstimator):
- def fit_transform(
- self,
- X: Array,
- lengths: Optional[Array] = None,
- ) -> Array:
- """Fits the transformer to the sequence(s) in ``X`` and returns a transformed version of ``X``.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :return: The transformed data.
- """
- return self.fit(X, lengths).transform(X, lengths)
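-
-# A minimal sketch of a custom lengths-aware transform built on this base
-# class. ``MeanCentre`` is a hypothetical example, not part of sequentia:
-#
-#     class MeanCentre(Transform):
-#         def fit(self, X, lengths=None):
-#             return self  # stateless: nothing to estimate
-#
-#         def transform(self, X, lengths=None):
-#             return X - X.mean(axis=0)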
diff --git a/lib/sequentia/utils/__init__.py b/lib/sequentia/utils/__init__.py
deleted file mode 100644
index c6cdafbc..00000000
--- a/lib/sequentia/utils/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from .decorators import *
-from .multiprocessing import *
-from .data import *
-from .validation import *
diff --git a/lib/sequentia/utils/data.py b/lib/sequentia/utils/data.py
deleted file mode 100644
index 7011eff8..00000000
--- a/lib/sequentia/utils/data.py
+++ /dev/null
@@ -1,300 +0,0 @@
-from __future__ import annotations
-
-import copy
-import warnings
-import pathlib
-from typing import Optional, Tuple, Union, Iterator, IO
-
-import numpy as np
-from pydantic import NonNegativeInt, confloat
-from sklearn.model_selection import train_test_split
-
-from sequentia.utils.validation import _check_classes, _BaseSequenceValidator, Array
-
-__all__ = ['SequentialDataset']
-
-
-class SequentialDataset:
- """Utility wrapper for a generic sequential dataset."""
-
- def __init__(
- self,
- X: Array,
- y: Optional[Array] = None,
- lengths: Optional[Array[int]] = None,
- classes: Optional[Array[int]] = None
-    ) -> None:
- """Initializes a :class:`.SequentialDataset`.
-
- :param X: Univariate or multivariate observation sequence(s).
-
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
-
- :param y: Outputs corresponding to sequence(s) provided in ``X``.
-
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
-
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
- - ``len(X)`` should be equal to ``sum(lengths)``.
-
- :param classes: Set of possible class labels (only if ``y`` was provided with categorical values).
-
- - If not provided, these will be determined from the training data labels.
- """
- data = _BaseSequenceValidator(X=X, lengths=lengths, y=y)
-
- self._X = data.X
- self._y = data.y
- self._lengths = data.lengths
-
- self._classes = None
- if self._y is not None and np.issubdtype(self._y.dtype, np.integer):
- self._classes = _check_classes(self._y, classes)
-
- self._idxs = self._get_idxs(self.lengths)
-
- self._X_y = (self._X, self._y)
- self._X_lengths = (self._X, self._lengths)
- self._X_y_lengths = (self._X, self._y, self._lengths)
-
-
- def split(
- self,
- test_size: Optional[Union[NonNegativeInt, confloat(ge=0, le=1)]] = None,
- train_size: Optional[Union[NonNegativeInt, confloat(ge=0, le=1)]] = None,
- random_state: Optional[Union[NonNegativeInt, np.random.RandomState]] = None,
- shuffle: bool = True,
- stratify: bool = False
- ) -> Tuple[SequentialDataset, SequentialDataset]:
- """Splits the dataset into two partitions (train/test).
-
- See :func:`sklearn:sklearn.model_selection.train_test_split`.
-
- :param test_size: Size of the test partition.
- :param train_size: Size of the train partition.
- :param random_state: Seed or :class:`numpy:numpy.random.RandomState` object for reproducible pseudo-randomness.
- :param shuffle: Whether or not to shuffle the data before splitting. If ``shuffle=False`` then ``stratify`` must be ``False``.
- :param stratify: Whether or not to stratify the partitions by class labels.
- :return: Dataset partitions.
- """
-        if stratify:
-            if self._y is None:
-                warnings.warn('Cannot stratify with no provided outputs')
-                stratify = None
-            elif self._classes is None:
-                warnings.warn('Cannot stratify on non-categorical outputs')
-                stratify = None
-            else:
-                stratify = self._y
-        else:
-            stratify = None
-
- idxs = np.arange(len(self._lengths))
- train_idxs, test_idxs = train_test_split(
- idxs,
- test_size=test_size,
- train_size=train_size,
- random_state=random_state,
- shuffle=shuffle,
- stratify=stratify
- )
-
- if self._y is None:
- X_train, y_train = self[train_idxs], None
- X_test, y_test = self[test_idxs], None
- else:
- X_train, y_train = self[train_idxs]
- X_test, y_test = self[test_idxs]
-
- lengths_train = self._lengths[train_idxs]
- lengths_test = self._lengths[test_idxs]
- classes = self._classes
-
- data_train = SequentialDataset(np.vstack(X_train), y=y_train, lengths=lengths_train, classes=classes)
- data_test = SequentialDataset(np.vstack(X_test), y=y_test, lengths=lengths_test, classes=classes)
-
- return data_train, data_test
-
-
- def iter_by_class(self) -> Iterator[Tuple[Array, Array, int]]:
- """Subsets the observation sequences by class.
-
-        :raises: ``AttributeError`` - If ``y`` was not provided to :func:`__init__`.
-        :raises: ``RuntimeError`` - If ``y`` is not categorical.
- :return: Generator iterating over classes, yielding:
-
- - ``X`` subset of sequences belonging to the class.
- - Lengths corresponding to the ``X`` subset.
- - Class used to subset ``X``.
- """
- if self._y is None:
- raise AttributeError('No `y` values were provided during initialization')
-
- if self._classes is None:
- raise RuntimeError('Cannot iterate by class on real-valued targets')
-
- for c in self._classes:
- ind = np.argwhere(self._y == c).flatten()
- X, _ = self[ind]
- lengths = self._lengths[ind]
- yield np.vstack(X), lengths, c
-
-
- @staticmethod
- def _get_idxs(lengths):
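-        # e.g. lengths=[2, 3] -> ends=[2, 5], starts=[0, 2] -> [[0, 2], [2, 5]]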
- ends = lengths.cumsum()
- starts = np.zeros_like(ends)
- starts[1:] = ends[:-1]
- return np.c_[starts, ends]
-
-
- @staticmethod
- def _iter_X(X, idxs):
- for start, end in idxs:
- yield X[start:end]
-
-
- def __len__(self):
- return len(self._lengths)
-
-
- def __getitem__(self, i):
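-        # e.g. with lengths=[2, 3] and y set: self[0] -> (X[0:2], y[0]) and
-        # self[[0, 1]] -> ([X[0:2], X[2:5]], y[[0, 1]])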
- idxs = np.atleast_2d(self._idxs[i])
- X = [x for x in self._iter_X(self._X, idxs)]
- X = X[0] if isinstance(i, int) and len(X) == 1 else X
- return X if self._y is None else (X, self._y[i])
-
-
- def __iter__(self):
- for i in range(len(self)):
- yield self[i]
-
-
- @property
- def X(self) -> Array:
- """Observation sequences."""
- return self._X
-
-
- @property
- def y(self) -> Array:
- """Outputs corresponding to ``X``.
-
- :raises: ``AttributeError`` - If ``y`` was not provided to :func:`__init__`.
- """
- if self._y is None:
- raise AttributeError('No `y` values were provided during initialization')
- return self._y
-
-
- @property
- def lengths(self) -> Array[int]:
- """Lengths corresponding to ``X``."""
- return self._lengths
-
-
- @property
- def classes(self) -> Optional[Array[int]]:
- """Set of unique classes in ``y``. If ``y`` is not categorical, then ``None``."""
- return self._classes
-
-
- @property
- def idxs(self) -> Array[int]:
- """Observation sequence start and end indices."""
- return self._idxs
-
-
- @property
- def X_y(self) -> Tuple[Array, Array]:
- """Observation sequences and corresponding outputs.
-
- :raises: ``AttributeError`` - If ``y`` was not provided to :func:`__init__`.
- """
- if self._y is None:
- raise AttributeError('No `y` values were provided during initialization')
- return self._X_y
-
-
- @property
- def X_lengths(self) -> Tuple[Array, Array[int]]:
- """Observation sequences and corresponding lengths."""
- return self._X_lengths
-
-
- @property
- def X_y_lengths(self) -> Tuple[Array, Array, Array[int]]:
- """Observation sequences and corresponding outputs and lengths.
-
- :raises: ``AttributeError`` - If ``y`` was not provided to :func:`__init__`.
- """
- if self._y is None:
- raise AttributeError('No `y` values were provided during initialization')
- return self._X_y_lengths
-
-
- def save(self, path: Union[str, pathlib.Path, IO], compress: bool = True):
- """Stores the dataset in ``.npz`` format.
-
- See :func:`numpy:numpy.savez` and :func:`numpy:numpy.savez_compressed`.
-
- :param path: Location to store the dataset.
- :param compress: Whether or not to compress the dataset.
-
- See Also
- --------
- load:
- Loads a stored dataset in ``.npz`` format.
- """
- arrs = {
- 'X': self._X,
- 'lengths': self._lengths
- }
-
- if self._y is not None:
- arrs['y'] = self._y
-
- if self._classes is not None:
- arrs['classes'] = self._classes
-
- save_fun = np.savez_compressed if compress else np.savez
- save_fun(path, **arrs)
-
-
- @classmethod
- def load(cls, path: Union[str, pathlib.Path, IO]) -> SequentialDataset:
- """Loads a stored dataset in ``.npz`` format.
-
- See :func:`numpy:numpy.load`.
-
-        :param path: Location to load the dataset from.
- :return: The loaded dataset.
-
- See Also
- --------
- save:
- Stores the dataset in ``.npz`` format.
- """
- return cls(**np.load(path))
-
-
- def copy(self) -> SequentialDataset:
- """Creates a copy of the dataset.
-
- :return: Dataset copy.
- """
- params = {
- "X": copy.deepcopy(self._X),
- "lengths": copy.deepcopy(self._lengths),
- }
-
- if self._y is not None:
- params["y"] = copy.deepcopy(self._y)
-
- if self._classes is not None:
- params["classes"] = copy.deepcopy(self._classes)
-
- return SequentialDataset(**params)
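-
-# Usage sketch, assuming two concatenated sequences of lengths 2 and 3:
-#
-#     import numpy as np
-#     X = np.arange(10, dtype=np.float64).reshape(5, 2)
-#     data = SequentialDataset(X, y=np.array([0, 1]), lengths=np.array([2, 3]))
-#     train, test = data.split(test_size=0.5)
-#     data.save("dataset.npz")
-#     data = SequentialDataset.load("dataset.npz")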
diff --git a/lib/sequentia/utils/decorators.py b/lib/sequentia/utils/decorators.py
deleted file mode 100644
index a66c0f9a..00000000
--- a/lib/sequentia/utils/decorators.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import functools
-import inspect
-
-from sequentia.utils.validation import _check_is_fitted
-
-def _validate_params(using):
- def decorator(function):
- @functools.wraps(function)
- def wrapper(self=None, *args, **kwargs):
- spec = inspect.getfullargspec(function)
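-            # validation only triggers when the wrapped function accepts
-            # ``**kwargs`` (literally named 'kwargs') or keyword-only arguments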
- if spec.varkw == 'kwargs' or len(spec.kwonlyargs) > 0:
- using.parse_obj(kwargs)
- if self is None:
- return function(*args, **kwargs)
- return function(self, *args, **kwargs)
- return wrapper
- return decorator
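-
-# Usage sketch (hypothetical validator name); keyword arguments passed to the
-# decorated method are parsed by the given pydantic model before the call:
-#
-#     @_validate_params(using=_SomeValidator)
-#     def fit(self, X, **kwargs): ...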
-
-def _requires_fit(function):
- @functools.wraps(function)
- def wrapper(self, *args, **kwargs):
- _check_is_fitted(self)
- return function(self, *args, **kwargs)
- return wrapper
-
-def _override_params(params, temporary=True):
- def decorator(function):
- @functools.wraps(function)
- def wrapper(self, *args, **kwargs):
- original_params = {}
-
- for param in params:
- if not hasattr(self, param):
- raise AttributeError(f"'{type(self).__name__}' object has no attribute '{param}'")
-
- for param in params:
- if param in kwargs:
- original_params[param] = getattr(self, param)
- setattr(self, param, kwargs[param])
-
- try:
- return function(self, *args, **kwargs)
- finally:
- if temporary:
- for param, value in original_params.items():
- setattr(self, param, value)
-
- return wrapper
- return decorator
-
-def _check_plotting_dependencies(function):
- @functools.wraps(function)
- def wrapper(self, *args, **kwargs):
- try:
- import matplotlib
- except ImportError as e:
- raise ImportError(f'The {function.__name__} function requires a working installation of matplotlib') from e
- return function(self, *args, **kwargs)
- return wrapper
diff --git a/lib/sequentia/utils/multiprocessing.py b/lib/sequentia/utils/multiprocessing.py
deleted file mode 100644
index 0a55932d..00000000
--- a/lib/sequentia/utils/multiprocessing.py
+++ /dev/null
@@ -1,10 +0,0 @@
-import joblib
-from typing import Optional
-
-from sequentia.utils.validation import Array
-
-def _effective_n_jobs(n_jobs: int, lengths: Optional[Array[int]] = None) -> int:
- n_jobs_ = 1
- if lengths is not None:
- n_jobs_ = min(joblib.effective_n_jobs(n_jobs), len(lengths))
- return n_jobs_
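-
-# e.g. on a 4-core machine, _effective_n_jobs(-1, lengths) with 3 sequences
-# gives 3: joblib.effective_n_jobs(-1) resolves to the CPU count (4), which
-# is then capped at the number of sequences.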
diff --git a/lib/sequentia/utils/validation.py b/lib/sequentia/utils/validation.py
deleted file mode 100644
index c466709d..00000000
--- a/lib/sequentia/utils/validation.py
+++ /dev/null
@@ -1,209 +0,0 @@
-from typing import Optional
-
-import numpy as np
-from pydantic import BaseModel, validator, root_validator
-from sklearn.utils.multiclass import unique_labels
-from sklearn.multiclass import check_classification_targets
-from sklearn.utils.validation import NotFittedError
-
-__all__ = ['Array']
-
-def _check_classes(y, classes=None):
- check_classification_targets(y)
- unique_y = unique_labels(y)
-
- classes_ = None
- if classes is None:
- classes_ = unique_y
- else:
- classes_np = np.array(classes).flatten()
- if not np.issubdtype(classes_np.dtype, np.integer):
-            raise TypeError('Expected classes to be integers')
-
- _, idx = np.unique(classes_np, return_index=True)
- classes_ = classes_np[np.sort(idx)]
- unseen_labels = set(unique_y) - set(classes_np)
- if len(unseen_labels) > 0:
- raise ValueError(f'Encountered label(s) in `y` not present in specified classes - {unseen_labels}')
-
- return classes_
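-
-# e.g. _check_classes([1, 0, 1]) -> array([0, 1]) (sorted unique labels),
-# while _check_classes([1, 0], classes=[2, 0, 1]) -> array([2, 0, 1]),
-# preserving the caller's ordering after de-duplication.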
-
-def _check_is_fitted(estimator, attributes=None, return_=False):
- fitted = False
- if attributes is None:
- fitted = any(attr.endswith('_') for attr in estimator.__dict__.keys() if '__' not in attr)
- else:
- fitted = all(hasattr(estimator, attr) for attr in attributes)
-
- if return_:
- return fitted
-
- if not fitted:
- raise NotFittedError(
- f'This {type(estimator).__name__} instance is not fitted yet. '
- "Call 'fit' with appropriate arguments before using this method."
- )
-
-class _Validator(BaseModel):
- class Config:
- arbitrary_types_allowed = True
-
- @classmethod
- def fields(cls):
- return list(cls.__fields__.keys())
-
-class _TypedArray(np.ndarray):
- @classmethod
- def __get_validators__(cls):
- yield cls.validate_type
-
- @classmethod
- def validate_type(cls, val):
- return np.array(val, dtype=cls.inner_type)
-
-class _ArrayMeta(type):
- def __getitem__(self, t):
- return type('Array', (_TypedArray,), {'inner_type': t})
-
-class Array(np.ndarray, metaclass=_ArrayMeta):
- pass
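-
-# e.g. ``Array[int]`` produces an ndarray type whose pydantic validator
-# coerces input to ``dtype=int``, while ``Array[None]`` lets NumPy infer
-# the dtype.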
-
-class _BaseSequenceValidator(_Validator):
- X: Array[None]
- lengths: Optional[Array[int]] = None
- y: Optional[Array[None]] = None
-
- @validator('X')
- def check_X(cls, X):
- X = np.atleast_1d(X)
-
- dim = X.ndim
- if dim == 1:
- X = X.reshape(-1, 1)
- elif dim > 2:
- raise ValueError('Expected `X` to have a maximum of two dimensions')
-
- if len(X) == 0:
- raise ValueError('Expected `X` to have at least one observation')
-
- return X
-
- @validator('lengths')
- def check_lengths(cls, value, values):
- X = values.get('X')
- if X is not None:
- len_X = len(X)
-
-            # Treat the whole input as one sequence if no lengths are given (and convert to a NumPy array)
- lengths = np.array(len_X if value is None else value).flatten()
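-            # e.g. len(X)=10 with lengths=None -> lengths=[10] (one sequence)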
- total_lengths = lengths.sum()
-
- if total_lengths != len_X:
- raise ValueError(
- 'Sum of provided `lengths` does not match the length of `X` '
- f'(sum(lengths)={total_lengths}, len(X)={len_X})'
- )
- else:
- raise ValueError('Unable to validate `lengths` as it depends on `X`')
- return lengths
-
- # Needs lengths
- @validator('y')
- def check_y(cls, value, values):
- y = value
- if y is not None:
- lengths = values.get('lengths')
- if lengths is not None:
- y = np.array(y).flatten()
-
- len_y = len(y)
- n_seqs = len(lengths)
-
- if len_y != n_seqs:
- raise ValueError(
- 'Expected `y` to have the same number of elements as `lengths` '
- f'(len(y)={len_y}, len(lengths)={n_seqs})'
- )
- else:
- raise ValueError('Unable to validate `y` as it depends on `lengths`')
- return y
-
-class _BaseSingleUnivariateSequenceValidator(_Validator):
- sequence: Array
-
- @validator('sequence')
- def check_sequence(cls, sequence):
- sequence = np.atleast_1d(sequence)
-
- dim = sequence.ndim
- if dim == 1:
- sequence = sequence.reshape(-1, 1)
- elif dim > 2:
- raise ValueError('Expected the sequence to have a maximum of two dimensions')
-
- if sequence.shape[1] > 1:
- raise ValueError('Expected the sequence to be univariate')
-
- if len(sequence) == 0:
- raise ValueError('Expected the sequence to have at least one observation')
-
- return sequence
-
-class _BaseSingleMultivariateSequenceValidator(_Validator):
- sequence: Array
-
- @validator('sequence')
- def check_sequence(cls, sequence):
- sequence = np.atleast_1d(sequence)
-
- dim = sequence.ndim
- if dim == 1:
- sequence = sequence.reshape(-1, 1)
- elif dim > 2:
- raise ValueError('Expected sequence to have a maximum of two dimensions')
-
- if len(sequence) == 0:
- raise ValueError('Expected sequence to have at least one observation')
-
- return sequence
-
-class _BaseUnivariateCategoricalSequenceValidator(_BaseSequenceValidator):
- X: Array[int]
- lengths: Optional[Array[int]] = None
- y: Optional[Array[None]] = None
-
-class _UnivariateCategoricalSequenceClassifierValidator(_BaseSequenceValidator):
- X: Array[int]
- lengths: Optional[Array[int]] = None
- y: Array[int]
-
- @root_validator
- def check_X_is_1d(cls, values):
- X = values['X']
- if X.shape[1] > 1:
- raise ValueError('Only univariate categorical sequences are currently supported')
- return values
-
-class _SingleUnivariateCategoricalSequenceValidator(_BaseSingleUnivariateSequenceValidator):
- sequence: Array[int]
-
-class _BaseMultivariateFloatSequenceValidator(_BaseSequenceValidator):
- X: Array[np.float64]
- lengths: Optional[Array[int]] = None
- y: Optional[Array[None]] = None
-
-class _MultivariateFloatSequenceClassifierValidator(_BaseSequenceValidator):
- X: Array[np.float64]
- lengths: Optional[Array[int]] = None
- y: Array[int]
-
-class _MultivariateFloatSequenceRegressorValidator(_BaseSequenceValidator):
- X: Array[np.float64]
- lengths: Optional[Array[int]] = None
- y: Array[np.float64]
-
-class _SingleUnivariateFloatSequenceValidator(_BaseSingleUnivariateSequenceValidator):
- sequence: Array[np.float64]
-
-class _SingleMultivariateFloatSequenceValidator(_BaseSingleMultivariateSequenceValidator):
- sequence: Array[np.float64]
diff --git a/lib/test/.gitkeep b/lib/test/.gitkeep
deleted file mode 100644
index e69de29b..00000000
diff --git a/lib/test/__init__.py b/lib/test/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/lib/test/lib/__init__.py b/lib/test/lib/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/lib/test/lib/datasets/__init__.py b/lib/test/lib/datasets/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/lib/test/lib/datasets/test_digits.py b/lib/test/lib/datasets/test_digits.py
deleted file mode 100644
index a7fea148..00000000
--- a/lib/test/lib/datasets/test_digits.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import pytest
-
-from sequentia.datasets import load_digits
-
-from ...support.assertions import assert_equal
-
-
-@pytest.mark.parametrize('digits', [list(range(10)), [2, 5]])
-def test_digits(digits):
- data = load_digits(digits=digits)
-
- assert len(data) == 300 * len(digits)
- assert_equal(data.classes, digits)
- assert set(data.y) == set(digits)
-
- for _, lengths, c in data.iter_by_class():
- assert len(lengths) == 300
diff --git a/lib/test/lib/datasets/test_gene_families.py b/lib/test/lib/datasets/test_gene_families.py
deleted file mode 100644
index 6c0dd4d3..00000000
--- a/lib/test/lib/datasets/test_gene_families.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import pytest
-
-from sequentia.datasets import load_gene_families
-
-from ...support.assertions import assert_equal
-
-counts = {
- 0: 531,
- 1: 534,
- 2: 349,
- 3: 672,
- 4: 711,
- 5: 240,
- 6: 1343
-}
-
-
-@pytest.mark.parametrize('families', [list(range(7)), [2, 5]])
-def test_gene_families(families):
- data, enc = load_gene_families(families=families)
-
- assert set(enc.classes_) == {'A', 'C', 'G', 'N', 'T'}
-
- assert_equal(data.classes, families)
- assert set(data.y) == set(families)
-
- for family in families:
- assert (data.y == family).sum() == counts[family]
diff --git a/lib/test/lib/models/__init__.py b/lib/test/lib/models/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/lib/test/lib/models/hmm/__init__.py b/lib/test/lib/models/hmm/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/lib/test/lib/models/hmm/test_classifier.py b/lib/test/lib/models/hmm/test_classifier.py
deleted file mode 100644
index 85cfeae3..00000000
--- a/lib/test/lib/models/hmm/test_classifier.py
+++ /dev/null
@@ -1,151 +0,0 @@
-import os
-import pytest
-from copy import deepcopy
-from tempfile import TemporaryDirectory
-from unittest.mock import Mock
-
-import numpy as np
-
-from sequentia.datasets import load_digits, load_gene_families
-from sequentia.models.hmm import GaussianMixtureHMM, CategoricalHMM, HMMClassifier
-from sequentia.utils.validation import _check_is_fitted
-
-from .variants.test_gaussian_mixture import assert_fit as assert_gaussian_mixture_fit
-from .variants.test_categorical import assert_fit as assert_categorical_fit
-from ....support.assertions import assert_equal
-
-n_classes = 7
-
-
-@pytest.fixture(scope='module')
-def random_state(request):
- return np.random.RandomState(1)
-
-
-@pytest.fixture(scope='module')
-def dataset(request):
- if request.param == 'digits':
- return load_digits(digits=range(n_classes))
- elif request.param == 'gene_families':
- data, _ = load_gene_families()
- return data
-
-
-@pytest.fixture(scope='module')
-def model(random_state, request):
- if request.param == 'gaussian_mixture':
- return GaussianMixtureHMM(topology='left-right', n_states=2, n_components=1, random_state=random_state)
- elif request.param == 'categorical':
- return CategoricalHMM(topology='left-right', n_states=2, random_state=random_state)
-
-
-class MockData:
- def __init__(self, length):
- self.length = length
-
- def __len__(self):
- return self.length
-
- @property
- def lengths(self):
- return MockData(self.length)
-
-
-def assert_fit(clf):
- assert hasattr(clf, 'prior_')
- assert hasattr(clf, 'classes_')
- assert _check_is_fitted(clf, return_=True)
-
- for hmm in clf.models.values():
- data = MockData(hmm.n_seqs_)
- if isinstance(hmm, GaussianMixtureHMM):
- assert_gaussian_mixture_fit(hmm, data)
- elif isinstance(hmm, CategoricalHMM):
- assert_categorical_fit(hmm, data)
-
-
-@pytest.mark.parametrize(
- 'model, dataset', [
- ('gaussian_mixture', 'digits'),
- ('categorical', 'gene_families')
- ],
- indirect=True
-)
-@pytest.mark.parametrize(
- 'prior', [
- None,
- 'frequency',
- {i: (i + 1) / (n_classes * (n_classes + 1) / 2) for i in range(n_classes)}
- ]
-)
-@pytest.mark.parametrize('prefit', [True, False])
-def test_classifier_e2e(request, model, dataset, prior, prefit, random_state):
- clf = HMMClassifier(prior=prior)
- clf.add_models({i: deepcopy(model) for i in range(n_classes)})
-
- assert clf.prior == prior
- assert len(clf.models) == n_classes
- assert set(clf.models) == set(range(n_classes))
- assert all(isinstance(hmm, type(model)) for hmm in clf.models.values())
-
- subset, _ = dataset.split(test_size=0.6, random_state=random_state, stratify=True)
- train, test = subset.split(test_size=0.2, random_state=random_state, stratify=True)
-
- if prefit:
- for X, lengths, c in train.iter_by_class():
- clf.models[c].fit(X, lengths)
- assert_fit(clf.fit())
- else:
- assert_fit(clf.fit(*train.X_y_lengths))
-
- scores_pred = clf.predict_scores(*test.X_lengths)
- assert scores_pred.shape == (len(test), n_classes)
-
- proba_pred = clf.predict_proba(*test.X_lengths)
- assert proba_pred.shape == (len(test), n_classes)
- assert_equal(proba_pred.sum(axis=1), 1)
- assert ((proba_pred >= 0) & (proba_pred <= 1)).all()
-
- y_pred = clf.predict(*test.X_lengths)
- assert y_pred.shape == (len(test),)
- assert set(y_pred).issubset(set(range(n_classes)))
-
- acc = clf.score(*test.X_y_lengths)
- assert 0 <= acc <= 1
-
- # check serialization/deserialization
- with TemporaryDirectory() as temp_dir:
- model_path = f"{temp_dir}/{request.node.originalname}.model"
- # check that save works
- clf.save(model_path)
- assert os.path.isfile(model_path)
- # check that load works
- clf = HMMClassifier.load(model_path)
- # check that loaded model is fitted
- assert_fit(clf)
- y_pred_load = clf.predict(*test.X_lengths)
- # check that predictions are the same as before serialization
- assert_equal(y_pred, y_pred_load)
-
-
-@pytest.mark.parametrize('classes', [[0, 1, 2], [2, 0, 1]])
-def test_classifier_compute_log_posterior(classes):
- clf = HMMClassifier()
- clf.classes_ = np.array(classes)
- clf.prior_ = {i: np.exp(i) for i in clf.classes_}
- clf.models = {i: Mock(_score=Mock(side_effect=lambda x: 0)) for i in clf.classes_}
- assert_equal(clf._compute_log_posterior(None), clf.classes_)
-
-
-def test_classifier_compute_scores_chunk():
- clf = HMMClassifier()
- clf.classes_ = np.arange(3)
- clf.prior_ = {i: np.exp(i) for i in clf.classes_}
- clf.models = {i: Mock(_score=Mock(side_effect=len)) for i in clf.classes_}
- X = np.expand_dims(np.arange(10), axis=-1)
- idxs = np.array([[0, 0], [1, 2], [3, 5], [6, 9]]) # lengths = 0, 1, 2, 3
- assert_equal(
- clf._compute_scores_chunk(idxs, X),
- np.tile(np.expand_dims(clf.classes_, axis=-1), len(idxs)).T
- + np.expand_dims(np.arange(len(idxs)), axis=-1)
- )
diff --git a/lib/test/lib/models/hmm/test_topologies.py b/lib/test/lib/models/hmm/test_topologies.py
deleted file mode 100644
index 82aebdfd..00000000
--- a/lib/test/lib/models/hmm/test_topologies.py
+++ /dev/null
@@ -1,351 +0,0 @@
-import warnings
-
-import numpy as np
-import pytest
-from sequentia.models.hmm.topologies import _Topology, _LeftRightTopology, _ErgodicTopology, _LinearTopology
-from ....support.assertions import assert_equal, assert_all_equal, assert_distribution
-
-# Set seed for reproducible randomness
-seed = 0
-np.random.seed(seed)
-rng = np.random.RandomState(seed)
-
-# ========= #
-# _Topology #
-# ========= #
-
-# ------------------------------- #
-# _Topology.uniform_start_probs() #
-# ------------------------------- #
-
-def test_uniform_start_probs_min():
- """Generate a uniform initial state distribution with the minimum number of states"""
- topology = _Topology(n_states=1, random_state=rng)
- start_probs = topology.uniform_start_probs()
- assert_distribution(start_probs)
- assert_equal(start_probs, np.array([
- 1.
- ]))
-
-def test_uniform_start_probs_small():
- """Generate a uniform initial state distribution with a few states"""
- topology = _Topology(n_states=2, random_state=rng)
- start_probs = topology.uniform_start_probs()
- assert_distribution(start_probs)
- assert_equal(start_probs, np.array([
- 0.5, 0.5
- ]))
-
-def test_uniform_start_probs_many():
- """Generate a uniform initial state distribution with many states"""
- topology = _Topology(n_states=5, random_state=rng)
- start_probs = topology.uniform_start_probs()
- assert_distribution(start_probs)
- assert_equal(start_probs, np.array([
- 0.2, 0.2, 0.2, 0.2, 0.2
- ]))
-
-# ------------------------------ #
-# _Topology.random_start_probs() #
-# ------------------------------ #
-
-def test_random_start_probs_min():
- """Generate a random initial state distribution with minimal states"""
- topology = _Topology(n_states=1, random_state=rng)
- start_probs = topology.random_start_probs()
- assert_distribution(start_probs)
- assert_equal(start_probs, np.array([
- 1.
- ]))
-
-def test_random_start_probs_small():
- """Generate a random initial state distribution with few states"""
- topology = _Topology(n_states=2, random_state=rng)
- start_probs = topology.random_start_probs()
- assert_distribution(start_probs)
- assert_equal(start_probs, np.array([
- 0.57633871, 0.42366129
- ]))
-
-def test_random_start_probs_many():
- """Generate a random initial state distribution with many states"""
- topology = _Topology(n_states=5, random_state=rng)
- start_probs = topology.random_start_probs()
- assert_distribution(start_probs)
- assert_equal(start_probs, np.array([
- 0.15210286, 0.10647349, 0.20059295, 0.11120171, 0.42962898
- ]))
-
-# ================== #
-# _LeftRightTopology #
-# ================== #
-
-# ---------------------------------------- #
-# _LeftRightTopology.uniform_transitions() #
-# ---------------------------------------- #
-
-def test_left_right_uniform_transitions_min():
- """Generate a uniform left-right transition matrix with minimal states"""
- topology = _LeftRightTopology(n_states=1, random_state=rng)
- transitions = topology.uniform_transitions()
- assert_distribution(transitions)
- assert_equal(transitions, np.array([
- [1.]
- ]))
-
-def test_left_right_uniform_transitions_small():
- """Generate a uniform left-right transition matrix with few states"""
- topology = _LeftRightTopology(n_states=2, random_state=rng)
- transitions = topology.uniform_transitions()
- assert_distribution(transitions)
- assert_equal(transitions, np.array([
- [0.5, 0.5],
- [0. , 1. ]
- ]))
-
-def test_left_right_uniform_transitions_many():
- """Generate a uniform left-right transition matrix with many states"""
- topology = _LeftRightTopology(n_states=5, random_state=rng)
- transitions = topology.uniform_transitions()
- assert_distribution(transitions)
- assert_equal(transitions, np.array([
- [0.2, 0.2 , 0.2 , 0.2 , 0.2 ],
- [0. , 0.25, 0.25 , 0.25 , 0.25 ],
- [0. , 0. , 0.33333333, 0.33333333, 0.33333333],
- [0. , 0. , 0. , 0.5 , 0.5 ] ,
- [0. , 0. , 0. , 0. , 1. ]
- ]))
-
-# --------------------------------------- #
-# _LeftRightTopology.random_transitions() #
-# --------------------------------------- #
-
-def test_left_right_random_transitions_min():
- """Generate a random left-right transition matrix with minimal states"""
- topology = _LeftRightTopology(n_states=1, random_state=rng)
- transitions = topology.random_transitions()
- assert_distribution(transitions)
- assert_equal(transitions, np.array([
- [1.]
- ]))
-
-def test_left_right_random_transitions_small():
- """Generate a random left-right transition matrix with few states"""
- topology = _LeftRightTopology(n_states=2, random_state=rng)
- transitions = topology.random_transitions()
- assert_distribution(transitions)
- assert_equal(transitions, np.array([
- [0.23561633, 0.76438367],
- [0. , 1. ]
- ]))
-
-def test_left_right_random_transitions_many():
- """Generate a random left-right transition matrix with many states"""
- topology = _LeftRightTopology(n_states=5, random_state=rng)
- transitions = topology.random_transitions()
- assert_distribution(transitions)
- assert_equal(transitions, np.array([
- [0.23169814, 0.71716356, 0.02033845, 0.02516204, 0.00563782],
- [0. , 0.19474072, 0.16405008, 0.22228532, 0.41892388],
- [0. , 0. , 0.42912755, 0.16545797, 0.40541448],
- [0. , 0. , 0. , 0.109713 , 0.890287 ],
- [0. , 0. , 0. , 0. , 1. ]
- ]))
-
-# -------------------------------------- #
-# _LeftRightTopology.check_transitions() #
-# -------------------------------------- #
-
-def test_left_right_check_transitions_invalid():
- """Validate an invalid left-right transition matrix"""
- topology = _LeftRightTopology(n_states=5, random_state=rng)
- transitions = _ErgodicTopology(n_states=5, random_state=rng).random_transitions()
- with pytest.raises(ValueError) as e:
- topology.check_transitions(transitions)
- assert str(e.value) == 'Left-right transition matrix must be upper-triangular'
-
-def test_left_right_check_transitions_valid():
- """Validate a valid left-right transition matrix"""
- topology = _LeftRightTopology(n_states=5, random_state=rng)
- transitions = topology.random_transitions()
- topology.check_transitions(transitions)
-
-# ================ #
-# _ErgodicTopology #
-# ================ #
-
-# -------------------------------------- #
-# _ErgodicTopology.uniform_transitions() #
-# -------------------------------------- #
-
-def test_ergodic_uniform_transitions_min():
- """Generate a uniform ergodic transition matrix with minimal states"""
- topology = _ErgodicTopology(n_states=1, random_state=rng)
- transitions = topology.uniform_transitions()
- assert_distribution(transitions)
- assert_equal(transitions, np.array([
- [1.]
- ]))
-
-def test_ergodic_uniform_transitions_small():
- """Generate a uniform ergodic transition matrix with few states"""
- topology = _ErgodicTopology(n_states=2, random_state=rng)
- transitions = topology.uniform_transitions()
- assert_distribution(transitions)
- assert_equal(transitions, np.array([
- [0.5, 0.5],
- [0.5, 0.5]
- ]))
-
-def test_ergodic_uniform_transitions_many():
- """Generate a uniform ergodic transition matrix with many states"""
- topology = _ErgodicTopology(n_states=5, random_state=rng)
- transitions = topology.uniform_transitions()
- assert_distribution(transitions)
- assert_equal(transitions, np.array([
- [0.2, 0.2, 0.2, 0.2, 0.2],
- [0.2, 0.2, 0.2, 0.2, 0.2],
- [0.2, 0.2, 0.2, 0.2, 0.2],
- [0.2, 0.2, 0.2, 0.2, 0.2],
- [0.2, 0.2, 0.2, 0.2, 0.2]
- ]))
-
-# ------------------------------------- #
-# _ErgodicTopology.random_transitions() #
-# ------------------------------------- #
-
-def test_ergodic_random_transitions_min():
- """Generate a random ergodic transition matrix with minimal states"""
- topology = _ErgodicTopology(n_states=1, random_state=rng)
- transitions = topology.random_transitions()
- assert_distribution(transitions)
- assert_equal(transitions, np.array([
- [1.]
- ]))
-
-def test_ergodic_random_transitions_small():
- """Generate a random ergodic transition matrix with few states"""
- topology = _ErgodicTopology(n_states=2, random_state=rng)
- transitions = topology.random_transitions()
- assert_distribution(transitions)
- assert_equal(transitions, np.array([
- [0.9474011 , 0.0525989 ],
- [0.85567599, 0.14432401]
- ]))
-
-def test_ergodic_random_transitions_many():
- """Generate a random ergodic transition matrix with many states"""
- topology = _ErgodicTopology(n_states=5, random_state=rng)
- transitions = topology.random_transitions()
- assert_distribution(transitions)
- assert_equal(transitions, np.array([
- [0.58715548, 0.14491542, 0.20980762, 0.00623944, 0.05188205],
- [0.0840705 , 0.23055049, 0.08297536, 0.25124688, 0.35115677],
- [0.02117615, 0.37664662, 0.26705912, 0.09851123, 0.23660688],
- [0.01938041, 0.16853843, 0.52046123, 0.07535256, 0.21626737],
- [0.04996846, 0.44545843, 0.12079423, 0.07154241, 0.31223646]
- ]))
-
-# ------------------------------------ #
-# _ErgodicTopology.check_transitions() #
-# ------------------------------------ #
-
-def test_ergodic_check_transitions_invalid():
- """Validate an invalid ergodic transition matrix"""
- topology = _ErgodicTopology(n_states=5, random_state=rng)
- transitions = _LeftRightTopology(n_states=5, random_state=rng).random_transitions()
- with pytest.warns(UserWarning):
- topology.check_transitions(transitions)
-
-def test_ergodic_check_transitions_valid():
- """Validate a valid ergodic transition matrix"""
- topology = _ErgodicTopology(n_states=5, random_state=rng)
- transitions = topology.random_transitions()
- topology.check_transitions(transitions)
-
-# =============== #
-# _LinearTopology #
-# =============== #
-
-# ------------------------------------- #
-# _LinearTopology.uniform_transitions() #
-# ------------------------------------- #
-
-def test_linear_uniform_transitions_min():
- """Generate a uniform linear transition matrix with minimal states"""
- topology = _LinearTopology(n_states=1, random_state=rng)
- transitions = topology.uniform_transitions()
- assert_distribution(transitions)
- assert_equal(transitions, np.array([
- [1.]
- ]))
-
-def test_linear_uniform_transitions_small():
- """Generate a uniform linear transition matrix with few states"""
- topology = _LinearTopology(n_states=2, random_state=rng)
- transitions = topology.uniform_transitions()
- assert_distribution(transitions)
- assert_equal(transitions, np.array([
- [0.5, 0.5],
- [0. , 1. ]
- ]))
-
-def test_linear_uniform_transitions_many():
- """Generate a uniform linear transition matrix with many states"""
- topology = _LinearTopology(n_states=5, random_state=rng)
- transitions = topology.uniform_transitions()
- assert_distribution(transitions)
- assert_equal(transitions, np.array([
- [0.5, 0.5, 0. , 0. , 0. ],
- [0. , 0.5, 0.5, 0. , 0. ],
- [0. , 0. , 0.5, 0.5, 0. ],
- [0. , 0. , 0. , 0.5, 0.5],
- [0. , 0. , 0. , 0. , 1. ]
- ]))
-
-# ------------------------------------ #
-# _LinearTopology.random_transitions() #
-# ------------------------------------ #
-
-def test_linear_random_transitions_min():
- """Generate a random linear transition matrix with minimal states"""
- topology = _LinearTopology(n_states=1, random_state=rng)
- transitions = topology.random_transitions()
- assert_distribution(transitions)
- assert_equal(transitions, np.array([
- [1.]
- ]))
-
-def test_linear_random_transitions_small():
- """Generate a random linear transition matrix with few states"""
- topology = _LinearTopology(n_states=2, random_state=rng)
- transitions = topology.random_transitions()
- assert_distribution(transitions)
- assert_equal(transitions, np.array([
- [0.65157396, 0.34842604],
- [0. , 1. ]
- ]))
-
-def test_linear_random_transitions_many():
- """Generate a random linear transition matrix with many states"""
- topology = _LinearTopology(n_states=5, random_state=rng)
- transitions = topology.random_transitions()
- assert_distribution(transitions)
- assert_equal(transitions, np.array([
- [0.44455421, 0.55544579, 0. , 0. , 0. ],
- [0. , 0.57553614, 0.42446386, 0. , 0. ],
- [0. , 0. , 0.92014965, 0.07985035, 0. ],
- [0. , 0. , 0. , 0.66790982, 0.33209018],
- [0. , 0. , 0. , 0. , 1. ]
- ]))
-
-# ----------------------------------- #
-# _LinearTopology.check_transitions() #
-# ----------------------------------- #
-
-def test_linear_check_transitions_invalid():
- """Validate an invalid linear transition matrix"""
- topology = _LinearTopology(n_states=5, random_state=rng)
- transitions = _ErgodicTopology(n_states=5, random_state=rng).random_transitions()
- with pytest.raises(ValueError) as e:
- topology.check_transitions(transitions)
- assert str(e.value) == 'Left-right transition matrix must be upper-triangular'
-
-def test_linear_check_transitions_valid():
- """Validate a valid linear transition matrix"""
- topology = _LinearTopology(n_states=5, random_state=rng)
- transitions = topology.random_transitions()
- topology.check_transitions(transitions)
diff --git a/lib/test/lib/models/hmm/variants/__init__.py b/lib/test/lib/models/hmm/variants/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/lib/test/lib/models/hmm/variants/test_categorical.py b/lib/test/lib/models/hmm/variants/test_categorical.py
deleted file mode 100644
index b58a6805..00000000
--- a/lib/test/lib/models/hmm/variants/test_categorical.py
+++ /dev/null
@@ -1,146 +0,0 @@
-import pytest
-
-import hmmlearn
-import numpy as np
-
-from sequentia.models import CategoricalHMM
-from sequentia.models.hmm.topologies import _topologies
-from sequentia.datasets import load_gene_families
-
-from .....support.assertions import assert_equal, assert_not_equal
-from .....support.itertools import combinations
-
-
-@pytest.fixture(scope='module')
-def random_state():
- return np.random.RandomState(0)
-
-
-@pytest.fixture(scope='module')
-def data(random_state):
- data_, _ = load_gene_families(families=[0])
- _, subset = data_.split(test_size=0.2, random_state=random_state, stratify=True)
- return subset
-
-
-@pytest.fixture(scope='module')
-def topology(request):
- return _topologies[request.param]
-
-
-def assert_fit(hmm, data):
- assert hmm.n_seqs_ == len(data.lengths)
- assert (hmm.topology_ is not None) == (hmm.topology is not None)
- assert isinstance(hmm.model, hmmlearn.hmm.CategoricalHMM)
- assert len(hmm.model.monitor_.history) > 0
-
-
-def test_categorical_fit_n_states(data, random_state):
- hmm = CategoricalHMM(n_states=7, random_state=random_state)
-
- assert_fit(hmm.fit(*data.X_lengths), data)
-
- assert hmm.n_states == 7
-
- assert hmm.model.startprob_.shape == (hmm.n_states,)
- assert hmm.model.transmat_.shape == (hmm.n_states, hmm.n_states)
-
-
-def test_categorical_fit_no_topology(data, random_state):
- hmm = CategoricalHMM(topology=None, random_state=random_state)
-
- assert_fit(hmm.fit(*data.X_lengths), data)
-
- assert hmm.topology is None
- assert hmm.topology_ is None
-
- assert set(hmm.model.init_params) == set('ste')
- assert set(hmm.model.params) == set('ste')
-
- assert not hasattr(hmm, '_startprob')
- assert not hasattr(hmm, '_transmat')
-
-
-@pytest.mark.parametrize('topology', ['ergodic', 'left-right', 'linear'], indirect=True)
-@pytest.mark.parametrize('start_probs_type', ['uniform', 'random', None]) # None = custom
-def test_categorical_fit_set_start_probs(data, random_state, topology, start_probs_type):
- hmm = CategoricalHMM(topology=topology.name, random_state=random_state)
- hmm.set_start_probs(start_probs_type or topology(hmm.n_states, random_state).random_start_probs())
-
- assert hmm.topology == topology.name
- if start_probs_type is not None:
- assert hmm._startprob == start_probs_type
-
- assert_fit(hmm.fit(*data.X_lengths), data)
-
- assert isinstance(hmm.topology_, topology)
-
- assert set(hmm.model.init_params) == set('e')
- assert set(hmm.model.params) == set('ste')
-
- hmm.topology_.check_start_probs(hmm._startprob) # initial state dist. before fit
- hmm.topology_.check_start_probs(hmm.model.startprob_) # initial state dist. after fit
-
- if start_probs_type == 'uniform':
- init_startprob = hmm.topology_.uniform_start_probs()
- assert_equal(hmm._startprob, init_startprob) # initial state probabilities should be uniform
-
- assert_not_equal(hmm._startprob, hmm.model.startprob_) # should update probabilities
- assert_equal(hmm._startprob == 0, hmm.model.startprob_ == 0) # but locations of zeros (if any) shouldn't change
-
-
-@pytest.mark.parametrize('topology', ['ergodic', 'left-right', 'linear'], indirect=True)
-@pytest.mark.parametrize('transition_type', ['uniform', 'random', None]) # None = custom
-def test_categorical_fit_set_transitions(data, random_state, topology, transition_type):
- hmm = CategoricalHMM(topology=topology.name, random_state=random_state)
- hmm.set_transitions(transition_type or topology(hmm.n_states, random_state).random_transitions())
-
- assert hmm.topology == topology.name
- if transition_type is not None:
- assert hmm._transmat == transition_type
-
- assert_fit(hmm.fit(*data.X_lengths), data)
-
- assert isinstance(hmm.topology_, topology)
-
- assert set(hmm.model.init_params) == set('e')
- assert set(hmm.model.params) == set('ste')
-
- hmm.topology_.check_transitions(hmm._transmat) # transition matrix before fit
- hmm.topology_.check_transitions(hmm.model.transmat_) # transition matrix after fit
-
- if transition_type == 'uniform':
- init_transmat = hmm.topology_.uniform_transitions()
- assert_equal(hmm._transmat, init_transmat) # transition probabilities should be uniform
-
- assert_not_equal(hmm._transmat, hmm.model.transmat_) # should update probabilities
- assert_equal(hmm._transmat == 0, hmm.model.transmat_ == 0) # but locations of zeros (if any) shouldn't change
-
-
-@pytest.mark.parametrize('freeze_params', combinations('ste'))
-def test_categorical_fit_freeze_unfreeze(data, random_state, freeze_params):
- hmm = CategoricalHMM(topology='linear', n_states=2, random_state=random_state)
- hmm.freeze(freeze_params)
-
- assert_fit(hmm.fit(*data.X_lengths), data)
-
- assert set(hmm.model.params) == set('ste') - set(freeze_params)
-
- hmm.topology_.check_start_probs(hmm._startprob) # initial state dist. before fit
- hmm.topology_.check_start_probs(hmm.model.startprob_) # initial state dist. after fit
- assertion = assert_equal if 's' in freeze_params else assert_not_equal
- assertion(hmm._startprob, hmm.model.startprob_)
-
- hmm.topology_.check_transitions(hmm._transmat) # transition matrix before fit
- hmm.topology_.check_transitions(hmm.model.transmat_) # transition matrix after fit
- assertion = assert_equal if 't' in freeze_params else assert_not_equal
- assertion(hmm._transmat, hmm.model.transmat_)
-
- hmm.unfreeze(freeze_params)
-
- assert_fit(hmm.fit(*data.X_lengths), data)
-
- assert set(hmm.model.params) == set('ste')
-
- assert_not_equal(hmm._startprob, hmm.model.startprob_)
- assert_not_equal(hmm._transmat, hmm.model.transmat_)
diff --git a/lib/test/lib/models/hmm/variants/test_gaussian_mixture.py b/lib/test/lib/models/hmm/variants/test_gaussian_mixture.py
deleted file mode 100644
index 629724d4..00000000
--- a/lib/test/lib/models/hmm/variants/test_gaussian_mixture.py
+++ /dev/null
@@ -1,191 +0,0 @@
-import pytest
-
-import hmmlearn
-import numpy as np
-
-from sequentia.models import GaussianMixtureHMM
-from sequentia.models.hmm.topologies import _topologies
-from sequentia.datasets import load_digits
-from sequentia.utils.validation import _check_is_fitted
-
-from .....support.assertions import assert_equal, assert_not_equal
-from .....support.itertools import combinations
-
-
-@pytest.fixture(scope='module')
-def random_state():
- return np.random.RandomState(0)
-
-
-@pytest.fixture(scope='module')
-def data(random_state):
- data_ = load_digits(digits=[0])
- _, subset = data_.split(test_size=0.2, random_state=random_state, stratify=True)
- return subset
-
-
-@pytest.fixture(scope='module')
-def topology(request):
- return _topologies[request.param]
-
-
-def assert_fit(hmm, data):
- assert hmm.n_seqs_ == len(data.lengths)
- assert (hmm.topology_ is not None) == (hmm.topology is not None)
- assert isinstance(hmm.model, hmmlearn.hmm.GMMHMM)
- assert len(hmm.model.monitor_.history) > 0
- assert _check_is_fitted(hmm, return_=True)
-
-
-def test_gaussian_mixture_fit_n_states(data, random_state):
- hmm = GaussianMixtureHMM(n_states=7, random_state=random_state)
-
- assert_fit(hmm.fit(*data.X_lengths), data)
-
- assert hmm.n_states == 7
-
- assert hmm.model.startprob_.shape == (hmm.n_states,)
- assert hmm.model.transmat_.shape == (hmm.n_states, hmm.n_states)
-
-
-def test_gaussian_mixture_fit_n_components(data, random_state):
- hmm = GaussianMixtureHMM(n_components=2, random_state=random_state)
-
- assert_fit(hmm.fit(*data.X_lengths), data)
-
- assert hmm.n_components == 2
-
- assert hmm.model.startprob_.shape == (hmm.n_states,)
- assert hmm.model.transmat_.shape == (hmm.n_states, hmm.n_states)
-
- n_features = data.X.shape[1]
-
- assert hmm.model.means_.shape == (hmm.n_states, hmm.n_components, n_features)
- assert hmm.model.covars_.shape == (hmm.n_states, hmm.n_components)
- assert hmm.model.weights_.shape == (hmm.n_states, hmm.n_components)
-
-
-@pytest.mark.parametrize('covariance_type', ["spherical", "diag", "full", "tied"])
-def test_gaussian_mixture_fit_covariance_type(data, random_state, covariance_type):
- hmm = GaussianMixtureHMM(covariance_type=covariance_type, random_state=random_state)
-
- assert_fit(hmm.fit(*data.X_lengths), data)
-
- assert hmm.covariance_type == covariance_type
-
- assert hmm.model.startprob_.shape == (hmm.n_states,)
- assert hmm.model.transmat_.shape == (hmm.n_states, hmm.n_states)
-
- n_features = data.X.shape[1]
-
- assert hmm.model.means_.shape == (hmm.n_states, hmm.n_components, n_features)
- assert hmm.model.weights_.shape == (hmm.n_states, hmm.n_components)
-
- if covariance_type == "spherical":
- assert hmm.model.covars_.shape == (hmm.n_states, hmm.n_components)
- elif covariance_type == "diag":
- assert hmm.model.covars_.shape == (hmm.n_states, hmm.n_components, n_features)
- elif covariance_type == "full":
- assert hmm.model.covars_.shape == (hmm.n_states, hmm.n_components, n_features, n_features)
- elif covariance_type == "tied":
- assert hmm.model.covars_.shape == (hmm.n_states, n_features, n_features)
-
-
-def test_gaussian_mixture_fit_no_topology(data, random_state):
- hmm = GaussianMixtureHMM(topology=None, random_state=random_state)
-
- assert_fit(hmm.fit(*data.X_lengths), data)
-
- assert hmm.topology is None
- assert hmm.topology_ is None
-
- assert set(hmm.model.init_params) == set('stmcw')
- assert set(hmm.model.params) == set('stmcw')
-
- assert not hasattr(hmm, '_startprob')
- assert not hasattr(hmm, '_transmat')
-
-
-@pytest.mark.parametrize('topology', ['ergodic', 'left-right', 'linear'], indirect=True)
-@pytest.mark.parametrize('start_probs_type', ['uniform', 'random', None]) # None = custom
-def test_gaussian_mixture_fit_set_start_probs(data, random_state, topology, start_probs_type):
- hmm = GaussianMixtureHMM(topology=topology.name, random_state=random_state)
- hmm.set_start_probs(start_probs_type or topology(hmm.n_states, random_state).random_start_probs())
-
- assert hmm.topology == topology.name
- if start_probs_type is not None:
- assert hmm._startprob == start_probs_type
-
- assert_fit(hmm.fit(*data.X_lengths), data)
-
- assert isinstance(hmm.topology_, topology)
-
- assert set(hmm.model.init_params) == set('mcw')
- assert set(hmm.model.params) == set('stmcw')
-
- hmm.topology_.check_start_probs(hmm._startprob) # initial state dist. before fit
- hmm.topology_.check_start_probs(hmm.model.startprob_) # initial state dist. after fit
-
- if start_probs_type == 'uniform':
- init_startprob = hmm.topology_.uniform_start_probs()
- assert_equal(hmm._startprob, init_startprob) # initial state probabilities should be uniform
-
- assert_not_equal(hmm._startprob, hmm.model.startprob_) # should update probabilities
- # assert_equal(hmm._startprob == 0, hmm.model.startprob_ == 0) # but locations of zeros (if any) shouldn't change
-
-
-@pytest.mark.parametrize('topology', ['ergodic', 'left-right', 'linear'], indirect=True)
-@pytest.mark.parametrize('transition_type', ['uniform', 'random', None]) # None = custom
-def test_gaussian_mixture_fit_set_transitions(data, random_state, topology, transition_type):
- hmm = GaussianMixtureHMM(topology=topology.name, random_state=random_state)
- hmm.set_transitions(transition_type or topology(hmm.n_states, random_state).random_transitions())
-
- assert hmm.topology == topology.name
- if transition_type is not None:
- assert hmm._transmat == transition_type
-
- assert_fit(hmm.fit(*data.X_lengths), data)
-
- assert isinstance(hmm.topology_, topology)
-
- assert set(hmm.model.init_params) == set('mcw')
- assert set(hmm.model.params) == set('stmcw')
-
- hmm.topology_.check_transitions(hmm._transmat) # transition matrix before fit
- hmm.topology_.check_transitions(hmm.model.transmat_) # transition matrix after fit
-
- if transition_type == 'uniform':
- init_transmat = hmm.topology_.uniform_transitions()
- assert_equal(hmm._transmat, init_transmat) # transition probabilities should be uniform
-
- assert_not_equal(hmm._transmat, hmm.model.transmat_) # should update probabilities
- # assert_equal(hmm._transmat == 0, hmm.model.transmat_ == 0) # but locations of zeros (if any) shouldn't change
-
-
-@pytest.mark.parametrize('freeze_params', combinations('stmcw'))
-def test_gaussian_mixture_fit_freeze_unfreeze(data, random_state, freeze_params):
- hmm = GaussianMixtureHMM(topology='linear', n_components=2, n_states=2, random_state=random_state)
- hmm.freeze(freeze_params)
-
- assert_fit(hmm.fit(*data.X_lengths), data)
-
- assert set(hmm.model.params) == set('stmcw') - set(freeze_params)
-
- hmm.topology_.check_start_probs(hmm._startprob) # initial state dist. before fit
- hmm.topology_.check_start_probs(hmm.model.startprob_) # initial state dist. after fit
- assertion = assert_equal if 's' in freeze_params else assert_not_equal
- assertion(hmm._startprob, hmm.model.startprob_)
-
- hmm.topology_.check_transitions(hmm._transmat) # transition matrix before fit
- hmm.topology_.check_transitions(hmm.model.transmat_) # transition matrix after fit
- assertion = assert_equal if 't' in freeze_params else assert_not_equal
- assertion(hmm._transmat, hmm.model.transmat_)
-
- hmm.unfreeze(freeze_params)
-
- assert_fit(hmm.fit(*data.X_lengths), data)
-
- assert set(hmm.model.params) == set('stmcw')
-
- assert_not_equal(hmm._startprob, hmm.model.startprob_)
- assert_not_equal(hmm._transmat, hmm.model.transmat_)
diff --git a/lib/test/lib/models/knn/__init__.py b/lib/test/lib/models/knn/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/lib/test/lib/models/knn/test_classifier.py b/lib/test/lib/models/knn/test_classifier.py
deleted file mode 100644
index 75c7a9f1..00000000
--- a/lib/test/lib/models/knn/test_classifier.py
+++ /dev/null
@@ -1,242 +0,0 @@
-import os
-import math
-import pytest
-from copy import deepcopy
-from tempfile import TemporaryDirectory
-from unittest.mock import Mock
-
-import numpy as np
-
-from sequentia.datasets import load_digits
-from sequentia.models.knn import KNNClassifier
-from sequentia.utils.validation import _check_is_fitted
-
-from ....support.assertions import assert_equal
-
-n_classes = 3
-
-
-@pytest.fixture(scope='module')
-def random_state(request):
- return np.random.RandomState(1)
-
-
-@pytest.fixture(scope='module')
-def dataset():
- return load_digits(digits=range(n_classes))
-
-
-def assert_fit(clf, data):
- assert hasattr(clf, 'X_')
- assert hasattr(clf, 'y_')
- assert hasattr(clf, 'lengths_')
- assert hasattr(clf, 'idxs_')
- assert _check_is_fitted(clf, return_=True)
- assert_equal(clf.X_, data.X)
- assert_equal(clf.y_, data.y)
- assert_equal(clf.lengths_, data.lengths)
-
-
-@pytest.mark.parametrize('k', [1, 2, 5])
-@pytest.mark.parametrize('weighting', [None, lambda x: np.exp(-x)])
-def test_classifier_e2e(request, k, weighting, dataset, random_state):
- clf = KNNClassifier(k=k, weighting=weighting, random_state=random_state)
-
- assert clf.k == k
- assert clf.weighting == weighting
-
- data = dataset.copy()
- data._X = data._X[:, :1] # only use one feature
- subset, _ = data.split(test_size=0.98, random_state=random_state, stratify=True)
- train, test = subset.split(test_size=0.2, random_state=random_state, stratify=True)
-
- assert_fit(clf.fit(*train.X_y_lengths), train)
- params = clf.get_params()
-
- scores_pred = clf.predict_scores(*test.X_lengths)
- assert scores_pred.shape == (len(test), n_classes)
-
- proba_pred = clf.predict_proba(*test.X_lengths)
- assert proba_pred.shape == (len(test), n_classes)
- assert_equal(proba_pred.sum(axis=1), 1)
- assert ((proba_pred >= 0) & (proba_pred <= 1)).all()
-
- y_pred = clf.predict(*test.X_lengths)
- assert np.issubdtype(y_pred.dtype, np.integer)
- assert y_pred.shape == (len(test),)
- assert set(y_pred).issubset(set(range(n_classes)))
-
- acc = clf.score(*test.X_y_lengths)
- assert 0 <= acc <= 1
-
- # check serialization/deserialization
- with TemporaryDirectory() as temp_dir:
- model_path = f"{temp_dir}/{request.node.originalname}.model"
- # check that save works
- clf.save(model_path)
- assert os.path.isfile(model_path)
- # check that load works
- clf = KNNClassifier.load(model_path)
- assert (set(clf.get_params().keys()) - set(['weighting'])) == (set(params.keys()) - set(['weighting']))
- # sanity check that custom weighting functions are the same
- if weighting:
- x = random_state.rand(100)
- assert_equal(weighting(x), clf.weighting(x))
- # check that loaded model is fitted and can make predictions
- assert_fit(clf, train)
- y_pred_load = clf.predict(*test.X_lengths)
- if k == 1:
- # predictions should be same as before
- assert_equal(y_pred, y_pred_load)
-
-
-def test_classifier_predict_train(dataset, random_state):
- """Should be able to perfectly predict training data with k=1"""
- clf = KNNClassifier(k=1, random_state=random_state)
-
- data = dataset.copy()
- data._X = data._X[:, :1] # only use one feature
- train, _ = data.split(train_size=0.05, random_state=random_state, stratify=True)
-
- assert_fit(clf.fit(*train.X_y_lengths), train)
- assert math.isclose(clf.score(*train.X_y_lengths), 1.)
-
-
-@pytest.mark.parametrize('classes', [[0, 1, 2], [2, 0, 1]])
-def test_classifier_compute_scores(classes, random_state):
- clf = KNNClassifier(k=5)
- clf.random_state_ = random_state
- clf.classes_ = np.array(classes)
-
- labels = np.array([
- [0, 2, 1, 2, 2],
- [0, 1, 1, 2, 0],
- [1, 0, 0, 1, 2],
- [0, 0, 0, 1, 1]
- ])
- weightings = np.ones_like(labels)
-
- scores = clf._compute_scores(labels, weightings)
- if np.allclose(classes, [0, 1, 2]):
- assert_equal(scores, [
- [1, 1, 3],
- [2, 2, 1],
- [2, 2, 1],
- [3, 2, 0]
- ])
- elif np.allclose(classes, [2, 0, 1]):
- assert_equal(scores, [
- [3, 1, 1],
- [1, 2, 2],
- [1, 2, 2],
- [0, 3, 2]
- ])
-
-
-@pytest.mark.parametrize('classes', [[0, 1, 2], [2, 0, 1]])
-def test_classifier_find_max_labels_chunk(classes, random_state):
- clf = KNNClassifier()
- clf.random_state_ = random_state
- clf.classes_ = np.array(classes)
-
- score_chunk = np.array([
- [10, 20, 20],
- [10, 30, 20],
- [10, 10, 10],
- [10, 10, 20]
- ])
-
- max_labels = clf._find_max_labels_chunk(score_chunk)
- if np.allclose(classes, [0, 1, 2]):
- assert max_labels[0] in (1, 2)
- assert max_labels[1] == 1
- assert max_labels[2] in (0, 1, 2)
- assert max_labels[3] == 2
- elif np.allclose(classes, [2, 0, 1]):
- assert max_labels[0] in (0, 1)
- assert max_labels[1] == 0
- assert max_labels[2] in (0, 1, 2)
- assert max_labels[3] == 1
-
-
-@pytest.mark.parametrize('tie', [True, False])
-def test_classifier_multi_argmax(tie):
- if tie:
- arr = np.array([3, 2, 4, 1, 3, 4, 4, 0, 2, 4])
- assert_equal(
- KNNClassifier._multi_argmax(arr),
- np.array([2, 5, 6, 9])
- )
- else:
- arr = np.array([3, 2, 1, 1, 3, 4, 1, 0, 2, 0])
- assert_equal(
- KNNClassifier._multi_argmax(arr),
- np.array([5])
- )
-
-
-@pytest.mark.parametrize('k', [1, 2, 5])
-@pytest.mark.parametrize('sort', [True, False])
-def test_classifier_query_neighbors(k, sort, dataset, random_state):
- clf = KNNClassifier(k=k, random_state=random_state)
-
- data = dataset.copy()
- data._X = data._X[:, :1] # only use one feature
- subset, _ = data.split(test_size=0.98, random_state=random_state, stratify=True)
- train, test = subset.split(test_size=0.2, random_state=random_state, stratify=True)
-
- assert_fit(clf.fit(*train.X_y_lengths), train)
-
- k_idxs, k_distances, k_labels = clf.query_neighbors(*test.X_lengths, sort=sort)
-
- # check that indices are between 0 and len(train)
- assert np.issubdtype(k_idxs.dtype, np.integer)
- assert k_idxs.shape == (len(test), clf.k)
- assert set(k_idxs.flatten()).issubset(set(np.arange(len(train))))
-
- # check that distances are sorted if sort=True
- assert np.issubdtype(k_distances.dtype, np.floating)
- assert k_distances.shape == (len(test), clf.k)
- if sort and k > 1:
- assert (k_distances[:, 1:] >= k_distances[:, :-1]).all()
-
- # check that labels are a subset of training labels + check that labels match indices
- assert np.issubdtype(k_labels.dtype, np.integer)
- assert k_labels.shape == (len(test), clf.k)
- assert set(k_labels.flatten()).issubset(set(train.y))
- assert_equal(train.y[k_idxs], k_labels)
-
-
-def test_classifier_compute_distance_matrix(dataset, random_state):
- clf = KNNClassifier()
-
- data = dataset.copy()
- data._X = data._X[:, :1] # only use one feature
- subset, _ = data.split(test_size=0.98, random_state=random_state, stratify=True)
- train, test = subset.split(test_size=0.2, random_state=random_state, stratify=True)
-
- assert_fit(clf.fit(*train.X_y_lengths), train)
-
- distances = clf.compute_distance_matrix(*test.X_lengths)
- assert distances.shape == (len(test), len(train))
-
-
-def test_classifier_distance_matrix_row_col_chunk():
- clf = KNNClassifier()
-
- clf.X_ = np.expand_dims(np.arange(7), axis=-1)
- col_idxs = np.array([[0, 1], [1, 3], [4, 7]]) # lengths = 1, 2, 3
-
- X = np.expand_dims(np.arange(14), axis=-1)
- row_idxs = np.array([[0, 2], [2, 5], [5, 9], [9, 14]]) # lengths = 2, 3, 4, 5
-
- distances = clf._distance_matrix_row_col_chunk(col_idxs, row_idxs, X, lambda x1, x2: len(x1) - len(x2))
- assert_equal(
- distances,
- np.array([
- [1, 0, -1],
- [2, 1, 0],
- [3, 2, 1],
- [4, 3, 2]
- ])
- )
diff --git a/lib/test/lib/models/knn/test_regressor.py b/lib/test/lib/models/knn/test_regressor.py
deleted file mode 100644
index 1fa7770c..00000000
--- a/lib/test/lib/models/knn/test_regressor.py
+++ /dev/null
@@ -1,152 +0,0 @@
-import os
-import math
-import pytest
-from copy import deepcopy
-from tempfile import TemporaryDirectory
-from unittest.mock import Mock
-
-import numpy as np
-
-from sequentia.datasets import load_digits
-from sequentia.models.knn import KNNRegressor
-from sequentia.utils.validation import _check_is_fitted
-
-from ....support.assertions import assert_equal
-
-n_classes = 3
-
-
-@pytest.fixture(scope='module')
-def random_state(request):
- return np.random.RandomState(1)
-
-
-@pytest.fixture(scope='module')
-def dataset():
- return load_digits(digits=range(n_classes))
-
-
-def assert_fit(clf, data):
- assert hasattr(clf, 'X_')
- assert hasattr(clf, 'y_')
- assert hasattr(clf, 'lengths_')
- assert hasattr(clf, 'idxs_')
- assert _check_is_fitted(clf, return_=True)
- assert_equal(clf.X_, data.X)
- assert_equal(clf.y_, data.y)
- assert_equal(clf.lengths_, data.lengths)
-
-
-@pytest.mark.parametrize('k', [1, 2, 5])
-@pytest.mark.parametrize('weighting', [None, lambda x: np.exp(-x)])
-def test_regressor_e2e(request, k, weighting, dataset, random_state):
- clf = KNNRegressor(k=k, weighting=weighting, random_state=random_state)
-
- assert clf.k == k
- assert clf.weighting == weighting
-
- data = dataset.copy()
- data._X = data._X[:, :1] # only use one feature
- subset, _ = data.split(test_size=0.98, random_state=random_state, stratify=True)
- train, test = subset.split(test_size=0.2, random_state=random_state, stratify=True)
-
- assert_fit(clf.fit(*train.X_y_lengths), train)
- params = clf.get_params()
-
- y_pred = clf.predict(*test.X_lengths)
- assert np.issubdtype(y_pred.dtype, np.floating)
- assert y_pred.shape == (len(test),)
-
- clf.score(*test.X_y_lengths) # smoke test: regression scores are unbounded, so no range check
-
- # check serialization/deserialization
- with TemporaryDirectory() as temp_dir:
- model_path = f"{temp_dir}/{request.node.originalname}.model"
- # check that save works
- clf.save(model_path)
- assert os.path.isfile(model_path)
- # check that load works
- clf = KNNRegressor.load(model_path)
- assert (set(clf.get_params().keys()) - set(['weighting'])) == (set(params.keys()) - set(['weighting']))
- # sanity check that custom weighting functions are the same
- if weighting:
- x = random_state.rand(100)
- assert_equal(weighting(x), clf.weighting(x))
- # check that loaded model is fitted and can make predictions
- assert_fit(clf, train)
- y_pred_load = clf.predict(*test.X_lengths)
- if k == 1:
- # predictions should be same as before
- assert_equal(y_pred, y_pred_load)
-
-
-def test_regressor_predict_train(dataset, random_state):
- """Should be able to perfectly predict training data with k=1"""
- clf = KNNRegressor(k=1, random_state=random_state)
-
- data = dataset.copy()
- data._X = data._X[:, :1] # only use one feature
- train, _ = data.split(train_size=0.05, random_state=random_state, stratify=True)
-
- assert_fit(clf.fit(*train.X_y_lengths), train)
- assert math.isclose(clf.score(*train.X_y_lengths), 1.)
-
-
-def test_regressor_weighting(random_state):
- clf = KNNRegressor(k=3, weighting=lambda x: np.where(x > 10, 0.5, 1))
- clf.random_state_ = random_state
-
- clf.query_neighbors = Mock(
- return_value=(
- None,
- np.array([
- [1.5, 2, 1],
- [2.5, 1, 0.5],
- ]),
- np.array([
- [10.2, 11.5, 10.4],
- [8.0, 6.5, 5.5]
- ])
- )
- )
-
- assert_equal(
- clf.predict(None, None),
- np.array([
- (10.2 * 0.5 + 11.5 * 0.5 + 10.4 * 0.5) / (0.5 * clf.k),
- (8.0 * 1 + 6.5 * 1 + 5.5 * 1) / (1 * clf.k)
- ])
- )
-
-
-@pytest.mark.parametrize('k', [1, 2, 5])
-@pytest.mark.parametrize('sort', [True, False])
-def test_regressor_query_neighbors(k, sort, dataset, random_state):
- clf = KNNRegressor(k=k, random_state=random_state)
-
- data = dataset.copy()
- data._X = data._X[:, :1] # only use one feature
-
- subset, _ = data.split(test_size=0.98, random_state=random_state, stratify=True)
- train, test = subset.split(test_size=0.2, random_state=random_state, stratify=True)
-
- clf.fit(*train.X_y_lengths)
-
- k_idxs, k_distances, k_outputs = clf.query_neighbors(*test.X_lengths, sort=sort)
-
- # check that indices are between 0 and len(train)
- assert np.issubdtype(k_idxs.dtype, np.integer)
- assert k_idxs.shape == (len(test), clf.k)
- assert set(k_idxs.flatten()).issubset(set(np.arange(len(train))))
-
- # check that distances are sorted if sort=True
- assert np.issubdtype(k_distances.dtype, np.floating)
- assert k_distances.shape == (len(test), clf.k)
- if sort and k > 1:
- assert (k_distances[:, 1:] >= k_distances[:, :-1]).all()
-
- # check that outputs are a subset of training outputs + check that outputs match indices
- assert np.issubdtype(k_outputs.dtype, np.floating)
- assert k_outputs.shape == (len(test), clf.k)
- assert set(k_outputs.flatten()).issubset(set(train.y))
- assert_equal(train.y[k_idxs], k_outputs)
diff --git a/lib/test/lib/preprocessing/__init__.py b/lib/test/lib/preprocessing/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/lib/test/lib/preprocessing/test_transforms.py b/lib/test/lib/preprocessing/test_transforms.py
deleted file mode 100644
index 2e34b269..00000000
--- a/lib/test/lib/preprocessing/test_transforms.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import pytest
-
-import numpy as np
-
-from sklearn.preprocessing import minmax_scale
-
-from sequentia.datasets import load_digits
-from sequentia.preprocessing import transforms
-
-from sequentia.utils import SequentialDataset
-
-from ...support.assertions import assert_equal
-
-
-@pytest.fixture(scope='module')
-def random_state(request):
- return np.random.RandomState(1)
-
-
-@pytest.fixture(scope='module')
-def data(random_state):
- data_ = load_digits(digits=[0])
- _, subset = data_.split(test_size=0.2, random_state=random_state, stratify=True)
- return subset
-
-
-def check_filter(x, xt, func, k):
- """Only works for odd k"""
- assert len(x) == len(xt)
- assert_equal(xt[k // 2], func(x[:k], axis=0))
-
-
-def test_function_transformer(data):
- # create the transform
- transform = transforms.IndependentFunctionTransformer(minmax_scale)
- # check that fit works - should do nothing
- transform.fit(*data.X_lengths)
- # check that fit_transform works - shouldn't do anything on fit, but should transform
- X_fit_transform = transform.fit_transform(*data.X_lengths)
- # check that transform works
- X_transform = transform.transform(*data.X_lengths)
- # check that fit_transform and transform produce the same transformed data
- assert_equal(X_fit_transform, X_transform)
- # check that features of each sequence are independently scaled to [0, 1]
- for xt in SequentialDataset._iter_X(X_transform, data.idxs):
- assert_equal(xt.min(axis=0), np.zeros(xt.shape[1]))
- assert_equal(xt.max(axis=0), np.ones(xt.shape[1]))
-
-
-@pytest.mark.parametrize("avg", ["mean", "median"])
-@pytest.mark.parametrize("k", [3, 5])
-def test_filters(data, random_state, avg, k):
- filter_ = getattr(transforms, f"{avg}_filter")
- check_filter_ = lambda x, xt: check_filter(x, xt, getattr(np, avg), k)
-
- # check that filters are correctly applied for a single sequence
- n_features = 2
- x = random_state.rand(10 * n_features).reshape(-1, n_features)
- xt = filter_(x, k)
- check_filter_(x, xt)
-
- # create a transform using the filter, passing k
- transform = transforms.IndependentFunctionTransformer(filter_, kw_args={"k": k})
- Xt = transform.transform(data.X, data.lengths)
-
- # check that filters are correctly applied for multiple sequences
- idxs = SequentialDataset._get_idxs(data.lengths)
- for x, xt in zip(*map(lambda X: SequentialDataset._iter_X(X, idxs), (data.X, Xt))):
- check_filter_(x, xt)
diff --git a/lib/test/lib/test_pipeline.py b/lib/test/lib/test_pipeline.py
deleted file mode 100644
index ebc90f65..00000000
--- a/lib/test/lib/test_pipeline.py
+++ /dev/null
@@ -1,212 +0,0 @@
-import pytest
-
-import numpy as np
-from pydantic import ValidationError
-
-from sklearn.preprocessing import scale
-from sklearn.decomposition import PCA
-from sklearn.utils.validation import check_is_fitted
-from sklearn.exceptions import NotFittedError
-from sklearn.utils._param_validation import InvalidParameterError
-
-from sequentia.datasets import load_digits
-from sequentia.preprocessing import IndependentFunctionTransformer, mean_filter
-from sequentia.pipeline import Pipeline
-from sequentia.models import KNNClassifier
-
-from sequentia.utils import SequentialDataset
-
-from ..support.assertions import assert_equal, assert_not_equal
-
-
-@pytest.fixture(scope='module')
-def random_state(request):
- return np.random.RandomState(0)
-
-
-@pytest.fixture(scope='module')
-def data(random_state):
- data_ = load_digits(digits=[0])
- _, subset = data_.split(test_size=0.05, random_state=random_state, stratify=True)
- return subset
-
-
-def test_pipeline_with_transforms(data):
- # create pipeline with a stateless and stateful transform
- pipeline = Pipeline([
- ("scale", IndependentFunctionTransformer(
- scale,
- inverse_func=lambda x: x,
- check_inverse=False
- )),
- ("pca", PCA(n_components=1)),
- ])
-
- # check that transforming without fitting doesn't work
- with pytest.raises(NotFittedError):
- pipeline.transform(*data.X_lengths)
-
- # check that fitting without y works
- check_is_fitted(pipeline.fit(*data.X_lengths))
- for estimator in pipeline.named_steps.values():
- check_is_fitted(estimator)
-
- # check that fitting with y works
- check_is_fitted(pipeline.fit(*data.X_y_lengths))
- for estimator in pipeline.named_steps.values():
- check_is_fitted(estimator)
-
- # check that transforming after fit works
- Xt = pipeline.transform(*data.X_lengths)
- assert_not_equal(data.X, Xt)
- assert Xt.shape == (len(data.X), 1)
-
- # check that fit_transform works
- Xt = pipeline.fit_transform(*data.X_lengths)
- assert_not_equal(data.X, Xt)
- assert Xt.shape == (len(data.X), 1)
-
- # check that inverse_transform works
- Xi = pipeline.inverse_transform(Xt, data.lengths)
- assert_not_equal(Xt, Xi)
-
- # check that prediction functions relying on X and lengths don't work
- for func in ('predict', 'predict_proba'):
- with pytest.raises(AttributeError):
- getattr(pipeline, func)(*data.X_lengths)
-
- # check that fit_predict doesn't work
- with pytest.raises(AttributeError):
- pipeline.fit_predict(*data.X_y_lengths)
-
- # check that score works if the final transform implements it, with y
- pipeline.score(*data.X_y_lengths)
-
- # check that score works if the final transform implements it, without y
- pipeline.score(data.X, lengths=data.lengths)
-
-
-def test_pipeline_with_estimator(data):
- pipeline = Pipeline([
- ("knn", KNNClassifier(k=1)),
- ])
-
- # check that transforming doesn't work
- with pytest.raises(AttributeError):
- pipeline.transform(*data.X_lengths)
-
- # check that fitting without y doesn't work
- with pytest.raises(ValidationError):
- pipeline.fit(data.X, lengths=data.lengths)
-
- # check that fitting with y works
- check_is_fitted(pipeline.fit(*data.X_y_lengths))
- for estimator in pipeline.named_steps.values():
- check_is_fitted(estimator)
-
- # check that transforming doesn't work
- with pytest.raises(AttributeError):
- pipeline.transform(*data.X_lengths)
-
- # check that fit_transform doesn't work
- with pytest.raises(AttributeError):
- pipeline.fit_transform(*data.X_lengths)
-
- # check that inverse_transform doesn't work
- with pytest.raises(AttributeError):
- pipeline.inverse_transform(*data.X_lengths)
-
- # check that predict works
- y_pred = pipeline.predict(*data.X_lengths)
- assert y_pred.shape == data.y.shape
- assert set(y_pred) == set(data.classes)
-
- # check that predict_proba works
- proba_pred = pipeline.predict_proba(*data.X_lengths)
- assert proba_pred.shape == (len(data), len(data.classes))
-
- # check that fit_predict works
- y_pred = pipeline.fit_predict(*data.X_y_lengths)
- # check that all steps are fitted
- check_is_fitted(pipeline.fit(*data.X_y_lengths))
- for estimator in pipeline.named_steps.values():
- check_is_fitted(estimator)
- # check that predictions are valid
- assert y_pred.shape == data.y.shape
- assert set(y_pred) == set(data.classes)
-
- # check that score with y works
- pipeline.score(*data.X_y_lengths)
-
- # check that score without y doesn't work
- with pytest.raises(InvalidParameterError):
- pipeline.score(data.X, lengths=data.lengths)
-
-
-def test_pipeline_with_transforms_and_estimator(data):
- pipeline = Pipeline([
- ("scale", IndependentFunctionTransformer(
- scale,
- inverse_func=lambda x: x,
- check_inverse=False
- )),
- ("pca", PCA(n_components=1)),
- ("knn", KNNClassifier(k=1)),
- ])
-
- # check that transforming doesn't work
- with pytest.raises(AttributeError):
- pipeline.transform(*data.X_lengths)
-
- # check that fitting without y doesn't work
- with pytest.raises(ValidationError):
- pipeline.fit(data.X, lengths=data.lengths)
-
- # check that fitting with y works
- check_is_fitted(pipeline.fit(*data.X_y_lengths))
- for estimator in pipeline.named_steps.values():
- check_is_fitted(estimator)
- # check that X values were transformed
- assert_not_equal(data.X, pipeline[-1].X_)
- assert pipeline[-1].X_.shape == (len(data.X), 1)
-
- # check that transforming doesn't work
- with pytest.raises(AttributeError):
- pipeline.transform(*data.X_lengths)
-
- # check that fit_transform doesn't work
- with pytest.raises(AttributeError):
- pipeline.fit_transform(*data.X_lengths)
-
- # check that inverse_transform doesn't work
- with pytest.raises(AttributeError):
- pipeline.inverse_transform(*data.X_lengths)
-
- # check that predict works
- y_pred = pipeline.predict(*data.X_lengths)
- assert y_pred.shape == data.y.shape
- assert set(y_pred) == set(data.classes)
-
- # check that predict_proba works
- proba_pred = pipeline.predict_proba(*data.X_lengths)
- assert proba_pred.shape == (len(data), len(data.classes))
-
- # check that fit_predict works
- y_pred = pipeline.fit_predict(*data.X_y_lengths)
- # check that all steps are fitted
- check_is_fitted(pipeline.fit(*data.X_y_lengths))
- for estimator in pipeline.named_steps.values():
- check_is_fitted(estimator)
- # check that predictions are valid
- assert y_pred.shape == data.y.shape
- assert set(y_pred) == set(data.classes)
- # check that X values were transformed
- assert_not_equal(data.X, pipeline[-1].X_)
- assert pipeline[-1].X_.shape == (len(data.X), 1)
-
- # check that score with y works
- pipeline.score(*data.X_y_lengths)
-
- # check that score without y doesn't work
- with pytest.raises(InvalidParameterError):
- pipeline.score(data.X, lengths=data.lengths)
diff --git a/lib/test/lib/utils/__init__.py b/lib/test/lib/utils/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/lib/test/lib/utils/test_data.py b/lib/test/lib/utils/test_data.py
deleted file mode 100644
index e9899f63..00000000
--- a/lib/test/lib/utils/test_data.py
+++ /dev/null
@@ -1,188 +0,0 @@
-import os
-import pytest
-from tempfile import TemporaryDirectory
-
-import numpy as np
-
-from sequentia.utils import SequentialDataset
-
-from ...support.assertions import assert_equal, assert_all_equal
-
-
-@pytest.mark.parametrize('y_type', [int, float, None])
-@pytest.mark.parametrize('use_lengths', [True, False])
-def test_data(request, y_type, use_lengths):
- X = np.arange(10)
-
- if y_type == int:
- y = [10, 15, 10]
- elif y_type == float:
- y = [10.1, 15.2, 20.3]
- elif y_type is None:
- y = None
-
- if use_lengths:
- lengths = [2, 3, 5]
- else:
- lengths = None
- if y_type:
- y = y[0]
-
- data = SequentialDataset(X, y, lengths)
-
- # X
- assert_equal(data.X, np.atleast_2d(X).T)
-
- # y, classes
- if y_type == int:
- assert np.issubdtype(data.y.dtype, np.integer)
- assert_equal(data.y, np.atleast_1d(y))
- assert_equal(data.classes, [10, 15] if lengths else [10])
- elif y_type == float:
- assert np.issubdtype(data.y.dtype, np.floating)
- assert_equal(data.y, np.atleast_1d(y))
- assert data.classes is None
- elif y_type is None:
- for prop in ('y', 'X_y', 'X_y_lengths'):
- with pytest.raises(AttributeError):
- getattr(data, prop)
- assert data.classes is None
-
- # idxs
- if lengths:
- assert_equal(
- data.idxs, [
- [0, 2],
- [2, 5],
- [5, 10]
- ]
- )
- else:
- assert_equal(data.idxs, [[0, 10]])
-
- # _iter_X
- assert_equal(data.X, np.vstack([x for x in data._iter_X(data.X, data.idxs)]))
-
- # __getitem__
- if y_type:
- if lengths:
- # [0]
- x, y_ = data[0]
- assert_equal(x, np.atleast_2d([0, 1]).T)
- assert y_ == y[0]
- # [:1]
- xs, ys = data[:1]
- assert_all_equal(xs, [np.atleast_2d([0, 1]).T])
- assert_equal(ys, y[:1])
- # [1:3]
- xs, ys = data[1:3]
- assert_all_equal(xs, [
- np.atleast_2d([2, 3, 4]).T,
- np.atleast_2d([5, 6, 7, 8, 9]).T
- ])
- assert_equal(ys, y[1:3])
- # [-1]
- x, y_ = data[-1]
- assert_equal(x, np.atleast_2d([5, 6, 7, 8, 9]).T)
- assert y_ == y[-1]
- # [-2:]
- xs, ys = data[-2:]
- assert_all_equal(xs, [
- np.atleast_2d([2, 3, 4]).T,
- np.atleast_2d([5, 6, 7, 8, 9]).T
- ])
- assert_equal(ys, y[-2:])
- else:
- # [0]
- x, y_ = data[0]
- assert_equal(x, np.atleast_2d(X).T)
- assert y_ == y
- # [:1]
- xs, ys = data[:1]
- assert_all_equal(xs, [np.atleast_2d(X).T])
- assert_equal(ys, [y])
- else:
- if lengths:
- # [0]
- x = data[0]
- assert_equal(x, np.atleast_2d([0, 1]).T)
- # [:1]
- xs = data[:1]
- assert_all_equal(xs, [np.atleast_2d([0, 1]).T])
- # [1:3]
- xs = data[1:3]
- assert_all_equal(xs, [
- np.atleast_2d([2, 3, 4]).T,
- np.atleast_2d([5, 6, 7, 8, 9]).T
- ])
- # [-1]
- x = data[-1]
- assert_equal(x, np.atleast_2d([5, 6, 7, 8, 9]).T)
- # [-2:]
- xs = data[-2:]
- assert_all_equal(xs, [
- np.atleast_2d([2, 3, 4]).T,
- np.atleast_2d([5, 6, 7, 8, 9]).T
- ])
- else:
- # [0]
- x = data[0]
- assert_equal(x, np.atleast_2d(X).T)
- # [:1]
- xs = data[:1]
- assert_all_equal(xs, [np.atleast_2d(X).T])
-
- # split
- if y and lengths:
- train, test = data.split(test_size=1/3, shuffle=False)
- assert len(train) == 2
- assert len(test) == 1
- assert_equal(train.lengths, data.lengths[:len(train)])
- assert_equal(test.lengths, data.lengths[-len(test):])
- assert_equal(train.X, data.X[:train.lengths.sum()])
- assert_equal(test.X, data.X[-test.lengths.sum():])
- assert_equal(train.y, data.y[:len(train)])
- assert_equal(test.y, data.y[-len(test):])
-
- # iter_by_class
- if y_type == int and lengths:
- for X_, lengths_, c in data.iter_by_class():
- if c == 10:
- assert_equal(lengths_, [2, 5])
- assert_equal(X_, np.vstack([data.X[:2], data.X[-5:]]))
- elif c == 15:
- assert_equal(lengths_, [3])
- assert_equal(X_, data.X[2:5])
-
- # check serialization/deserialization
- with TemporaryDirectory() as temp_dir:
- data_path = f"{temp_dir}/{request.node.originalname}.npz"
- # check that save works
- data.save(data_path)
- assert os.path.isfile(data_path)
- # check that load works
- data_load = SequentialDataset.load(data_path)
- # check that stored values are the same
- assert_equal(data._X, data_load._X)
- assert_equal(data._lengths, data_load._lengths)
- if y:
- assert_equal(data._y, data_load._y)
- else:
- assert data._y is None and data_load._y is None
- if data._classes is not None:
- assert_equal(data._classes, data_load._classes)
- else:
- assert data._classes is None and data_load._classes is None
-
- # copy - check that stored values are the same
- data_copy = data.copy()
- assert_equal(data._X, data_copy._X)
- assert_equal(data._lengths, data_copy._lengths)
- if y:
- assert_equal(data._y, data_copy._y)
- else:
- assert data._y is None and data_copy._y is None
- if data._classes is not None:
- assert_equal(data._classes, data_copy._classes)
- else:
- assert data._classes is None and data_copy._classes is None
diff --git a/lib/test/lib/utils/test_decorators.py b/lib/test/lib/utils/test_decorators.py
deleted file mode 100644
index 30a69122..00000000
--- a/lib/test/lib/utils/test_decorators.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import pytest
-from typing import List
-from dataclasses import dataclass
-from operator import itemgetter
-
-from pydantic import BaseModel, ValidationError
-from sklearn.exceptions import NotFittedError
-
-from sequentia.models import KNNClassifier
-from sequentia.utils.decorators import _validate_params, _requires_fit, _override_params
-
-def test_validate_params():
- class Validator(BaseModel):
- param: List[int]
-
- @_validate_params(using=Validator)
- def to_validate(*, param):
- pass
-
- with pytest.raises(ValidationError):
- to_validate(param=None)
-
-
-def test_requires_fit():
- class Model:
- def fit(self):
- self.fitted_ = True
-
- @_requires_fit
- def predict(self):
- pass
-
- model = Model()
-
- with pytest.raises(NotFittedError):
- model.predict()
-
- model.fit()
- model.predict()
-
-
-@pytest.mark.parametrize('temporary', [True, False])
-@pytest.mark.parametrize('error', [True, False])
-def test_override_params(temporary, error):
- @dataclass
- class Model:
- b: int = 1
- c: int = 2
-
- @_override_params(['b', 'c'], temporary=temporary)
- def evaluate(self, a, **kwargs):
- for param in ('b', 'c'):
- if param in kwargs:
- assert getattr(self, param) == kwargs[param]
- if error:
- raise ValueError()
-
- model = Model()
-
- try:
- model.evaluate(a=0)
- except ValueError:
- pass
-
- assert model.b == 1 and model.c == 2
-
- if temporary:
- try:
- model.evaluate(a=0, b=2, c=1)
- except ValueError:
- pass
-
- assert model.b == 1 and model.c == 2
- else:
- try:
- model.evaluate(a=0, b=2, c=1)
- except ValueError:
- pass
-
- assert model.b == 2 and model.c == 1
diff --git a/lib/test/lib/utils/test_validation.py b/lib/test/lib/utils/test_validation.py
deleted file mode 100644
index 137b57a5..00000000
--- a/lib/test/lib/utils/test_validation.py
+++ /dev/null
@@ -1 +0,0 @@
-"""TODO"""
diff --git a/lib/test/support/__init__.py b/lib/test/support/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/lib/test/support/assertions.py b/lib/test/support/assertions.py
deleted file mode 100644
index ef56d6fc..00000000
--- a/lib/test/support/assertions.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import numpy as np
-from numpy.testing import assert_allclose, assert_almost_equal
-
-def assert_equal(a, b):
- assert_allclose(a, b, rtol=1e-3)
-
-def assert_not_equal(a, b):
- assert not np.allclose(a, b, rtol=1e-3)
-
-def assert_all_equal(A, B):
- for a, b in zip(A, B):
- assert_equal(a, b)
-
-def assert_all_not_equal(A, B):
- for a, b in zip(A, B):
- assert_not_equal(a, b)
-
-def assert_distribution(a):
- if a.ndim == 1:
- assert_almost_equal(a.sum(), 1., decimal=5)
- elif a.ndim == 2:
- assert_almost_equal(a.sum(axis=1), np.ones(len(a)))
diff --git a/lib/test/support/itertools.py b/lib/test/support/itertools.py
deleted file mode 100644
index 7265baf6..00000000
--- a/lib/test/support/itertools.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from itertools import chain, combinations as combinations_
-
-def combinations(string):
- return map(lambda params: ''.join(params), chain.from_iterable(combinations_(string, i) for i in range(1, len(string))))
\ No newline at end of file
diff --git a/make/__init__.py b/make/__init__.py
new file mode 100644
index 00000000..f25c9764
--- /dev/null
+++ b/make/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Tasks for package development."""
+
+from . import cov, docs, lint, release, tests
diff --git a/make/cov.py b/make/cov.py
new file mode 100644
index 00000000..bd10475c
--- /dev/null
+++ b/make/cov.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Tasks for running coverage checks."""
+
+from invoke.config import Config
+from invoke.tasks import task
+
+
+@task
+def install(c: Config) -> None:
+ """Install package with core and coverage dependencies."""
+ c.run("poetry install --sync --only base,main,cov")
diff --git a/make/docs.py b/make/docs.py
new file mode 100644
index 00000000..592e69c4
--- /dev/null
+++ b/make/docs.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Tasks for generating Sphinx documentation."""
+
+from invoke.config import Config
+from invoke.tasks import task
+
+
+@task
+def install(c: Config) -> None:
+ """Install package with core and docs dependencies."""
+ c.run("poetry install --sync --only base,main,docs")
+
+
+@task
+def build(c: Config, *, watch: bool = True) -> None:
+ """Build package Sphinx documentation."""
+ if watch:
+ command = (
+ "poetry run sphinx-autobuild "
+ "docs/source/ docs/build/html/ "
+ "--watch docs/source/ --watch sequentia/ "
+ "--ignore sequentia/_internal/"
+ )
+ else:
+ command = "cd docs && make html"
+ c.run(command)
diff --git a/make/lint.py b/make/lint.py
new file mode 100644
index 00000000..0bd9dd7d
--- /dev/null
+++ b/make/lint.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Tasks for running linting and formatting."""
+
+from __future__ import annotations
+
+from invoke.config import Config
+from invoke.tasks import task
+
+
+@task
+def install(c: Config) -> None:
+ """Install package with core and dev dependencies."""
+ c.run("poetry install --sync --only base,main,lint")
+
+
+@task
+def check(c: Config) -> None:
+ """Lint Python files."""
+ commands: list[str] = [
+ "poetry run ruff check .",
+ "poetry run ruff format --check .",
+ # "poetry run pydoclint .",
+ ]
+ for command in commands:
+ c.run(command)
+
+
+@task(name="format")
+def format_(c: Config) -> None:
+ """Format Python files."""
+ commands: list[str] = [
+ "poetry run ruff --fix .",
+ "poetry run ruff format .",
+ ]
+ for command in commands:
+ c.run(command)
diff --git a/make/release.py b/make/release.py
new file mode 100644
index 00000000..91759055
--- /dev/null
+++ b/make/release.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Tasks for bumping the package version."""
+
+import re
+from pathlib import Path
+
+from invoke.config import Config
+from invoke.tasks import task
+
+
+@task
+def build(c: Config, *, v: str) -> None:
+ """Build release."""
+ root: Path = Path.cwd()
+
+ # bump Sphinx documentation version - docs/source/conf.py
+ conf_path: Path = root / "docs" / "source" / "conf.py"
+ with open(conf_path) as f:
+ conf: str = f.read()
+ with open(conf_path, "w") as f:
+ f.write(re.sub(r'release = ".*"', f'release = "{v}"', conf))
+
+ # bump package version - sequentia/version.py
+ init_path: Path = root / "sequentia" / "version.py"
+ with open(init_path) as f:
+ init: str = f.read()
+ with open(init_path, "w") as f:
+ f.write(re.sub(r'VERSION = ".*"', f'VERSION = "{v}"', init))
+
+ # bump project version - pyproject.toml
+ c.run(f"poetry version -q {v}")
diff --git a/make/tests.py b/make/tests.py
new file mode 100644
index 00000000..84111ee9
--- /dev/null
+++ b/make/tests.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Tasks for running tests."""
+
+from __future__ import annotations
+
+from invoke.config import Config
+from invoke.tasks import task
+
+
+@task
+def install(c: Config) -> None:
+ """Install package with core and test dependencies."""
+ c.run("poetry install --sync --only base,main,tests")
+
+
+@task
+def unit(c: Config, *, cov: bool = False) -> None:
+ """Run unit tests."""
+ command: str = "poetry run pytest tests/"
+
+ if cov:
+ command = f"{command} --cov sequentia --cov-report xml"
+
+ c.run(command)
diff --git a/notice.py b/notice.py
new file mode 100644
index 00000000..564a3a9c
--- /dev/null
+++ b/notice.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Adds a notice to the top of all Python source code files.
+
+This script is based on:
+https://github.com/fatiando/maintenance/issues/10#issuecomment-718754908
+"""
+
+from pathlib import Path
+
+notice = """
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+""".strip()
+
+
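+# Prepend the notice to every Python file that does not already start with
+# it, skipping hidden directories (paths whose string form begins with ".").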
+for f in Path(".").glob("**/*.py"):
+ if not str(f).startswith("."):
+ code = f.read_text()
+ if not code.startswith(notice):
+ f.write_text(f"{notice}\n\n{code}")
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..ed662593
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,204 @@
+[tool.poetry]
+name = "sequentia"
+version = "2.0.0a1"
+license = "MIT"
+authors = ["Edwin Onuonga "]
+maintainers = ["Edwin Onuonga "]
+description = "Scikit-Learn compatible HMM and DTW based sequence machine learning algorithms in Python."
+readme = "README.md"
+homepage = "https://github.com/eonu/sequentia"
+repository = "https://github.com/eonu/sequentia"
+documentation = "https://sequentia.readthedocs.io/en/latest"
+classifiers = [
+ "Development Status :: 5 - Production/Stable",
+ "Environment :: Console",
+ "Framework :: Pydantic :: 2",
+ "Intended Audience :: Developers",
+ "Intended Audience :: Information Technology",
+ "Intended Audience :: Science/Research",
+ "License :: OSI Approved :: MIT License",
+ "Operating System :: OS Independent",
+ "Programming Language :: Python",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3 :: Only",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: Implementation :: CPython",
+ "Topic :: Scientific/Engineering",
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
+ "Topic :: Software Development :: Libraries :: Python Modules",
+ "Typing :: Typed",
+]
+keywords = [
+ "python",
+ "machine-learning",
+ "time-series",
+ "hmm",
+ "hidden-markov-models",
+ "dtw",
+ "dynamic-time-warping",
+ "knn",
+ "k-nearest-neighbors",
+ "sequence-classification",
+ "time-series-classification",
+ "multivariate-time-series",
+ "variable-length",
+ "classification-algorithms",
+]
+packages = [{ include = "sequentia" }]
+include = [
+ "sequentia",
+ "make",
+ "tests",
+ "CHANGELOG.md",
+ "LICENSE",
+ "Makefile",
+ "pyproject.toml",
+ "README.md",
+ "tasks.py",
+ "tox.ini",
+]
+
+[build-system]
+requires = ["poetry-core~=1.0", "Cython>=0.28.5"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.poetry.dependencies]
+python = "^3.11"
+numba = ">=0.56,<1"
+numpy = "^1.19.5"
+hmmlearn = ">=0.2.8,<1"
+dtaidistance = "^2.3.10"
+scikit-learn = "^1.4"
+scipy = "^1.6"
+joblib = "^1.2"
+pydantic = "^2"
+
+[tool.poetry.group.base.dependencies]
+invoke = "2.2.0"
+tox = "4.11.3"
+
+[tool.poetry.group.dev.dependencies]
+pre-commit = ">=3"
+
+[tool.poetry.group.lint.dependencies]
+ruff = "0.1.3"
+pydoclint = "0.3.8"
+
+[tool.poetry.group.docs.dependencies]
+sphinx = { version = "^7.2.4" }
+# numpydoc = { version = "^1.6" }
+sphinx-autobuild = { version = "^2021.3.14" }
+enum-tools = { version = ">=0.11,<1", extras = ["sphinx"] }
+
+[tool.poetry.group.tests.dependencies]
+pytest = { version = "^7.4.0" }
+pytest-cov = { version = "^4.1.0" }
+
+[tool.ruff]
+required-version = "0.1.3"
+select = [
+ "F", # pyflakes: https://pypi.org/project/pyflakes/
+ "E", # pycodestyle (error): https://pypi.org/project/pycodestyle/
+ "W", # pycodestyle (warning): https://pypi.org/project/pycodestyle/
+ "I", # isort: https://pypi.org/project/isort/
+ "N", # pep8-naming: https://pypi.org/project/pep8-naming/
+ "D", # pydocstyle: https://pypi.org/project/pydocstyle/
+ "UP", # pyupgrade: https://pypi.org/project/pyupgrade/
+ "YTT", # flake8-2020: https://pypi.org/project/flake8-2020/
+ "ANN", # flake8-annotations: https://pypi.org/project/flake8-annotations/
+ "S", # flake8-bandit: https://pypi.org/project/flake8-bandit/
+ "BLE", # flake8-blind-except: https://pypi.org/project/flake8-blind-except/
+ "FBT", # flake8-boolean-trap: https://pypi.org/project/flake8-boolean-trap/
+ "B", # flake8-bugbear: https://pypi.org/project/flake8-bugbear/
+ "A", # flake8-builtins: https://pypi.org/project/flake8-builtins/
+ "COM", # flake8-commas: https://pypi.org/project/flake8-commas/
+ "C4", # flake8-comprehensions: https://pypi.org/project/flake8-comprehensions/
+ "T10", # flake8-debugger: https://pypi.org/project/flake8-debugger/
+ "EM", # flake8-errmsg: https://pypi.org/project/flake8-errmsg/
+ "FA", # flake8-future-annotations: https://pypi.org/project/flake8-future-annotations/
+ "ISC", # flake8-implicit-str-concat: https://pypi.org/project/flake8-implicit-str-concat/
+ "ICN", # flake8-import-conventions: https://github.com/joaopalmeiro/flake8-import-conventions/
+ "G", # flake8-logging-format: https://pypi.org/project/flake8-logging-format/
+ "INP", # flake8-no-pep420: https://pypi.org/project/flake8-no-pep420/
+ "PIE", # flake8-pie: https://pypi.org/project/flake8-pie/
+ "T20", # flake8-print: https://pypi.org/project/flake8-print/
+ "PT", # flake8-pytest-style: https://pypi.org/project/flake8-pytest-style/
+ "Q", # flake8-quotes: https://pypi.org/project/flake8-quotes/
+ "RSE", # flake8-raise: https://pypi.org/project/flake8-raise/
+ "RET", # flake8-return: https://pypi.org/project/flake8-return/
+ "SLF", # flake8-self: https://pypi.org/project/flake8-self/
+ "SIM", # flake8-simplify: https://pypi.org/project/flake8-simplify/
+ "TID", # flake8-tidy-imports: https://pypi.org/project/flake8-tidy-imports/
+ "ARG", # flake8-unused-arguments: https://pypi.org/project/flake8-unused-arguments/
+ "TD", # flake8-todos: https://github.com/orsinium-labs/flake8-todos/
+ "ERA", # eradicate: https://pypi.org/project/eradicate/
+ "PGH", # pygrep-hooks: https://github.com/pre-commit/pygrep-hooks/
+ "PL", # pylint: https://pypi.org/project/pylint/
+ "TRY", # tryceratops: https://pypi.org/project/tryceratops/
+ "FLY", # flynt: https://pypi.org/project/flynt/
+ "PERF", # perflint: https://pypi.org/project/perflint/
+ "RUF", # ruff
+]
+ignore = [
+ "ANN401", # https://beta.ruff.rs/docs/rules/any-type/
+ "B905", # https://beta.ruff.rs/docs/rules/zip-without-explicit-strict/
+ "TD003", # https://beta.ruff.rs/docs/rules/missing-todo-link/
+ "PLR0913", # https://docs.astral.sh/ruff/rules/too-many-arguments/
+ "PLR0912", # https://docs.astral.sh/ruff/rules/too-many-branches/
+ "D205", # 1 blank line required between summary line and description
+ "PLR0911", # Too many return statements
+ "PLR2004", # Magic value used in comparison, consider replacing * with a constant variable"
+ "COM812", # ruff format conflict
+ "ISC001", # ruff format conflict
+ "ERA001", # Found commented-out code
+ "N802", # Function name should be lowercase
+ "N803", # Argument name should be lowercase
+ "N806", # Variable in function should be lowercase
+ "C408", # Unnecessary `dict` call (rewrite as a literal)
+ "D401", # First line of docstring should be in imperative mood
+]
+ignore-init-module-imports = true # allow unused imports in __init__.py
+line-length = 79
+
+[tool.ruff.pydocstyle]
+convention = "numpy"
+
+[tool.ruff.flake8-annotations]
+allow-star-arg-any = true
+
+[tool.ruff.extend-per-file-ignores]
+"__init__.py" = ["PLC0414", "F403", "F401", "F405"]
+"sequentia/datasets/*.py" = ["B006"]
+"sequentia/enums.py" = ["E501"]
+"sequentia/preprocessing/transforms.py" = [
+ "E",
+ "ANN",
+ "TRY",
+ "EM",
+ "E",
+ "B",
+ "SLF",
+ "ARG",
+]
+"tests/**/*.py" = ["D", "E", "S101"]
+# "tests/**/test_*.py" = ["ARG001", "S101", "D", "FA100", "FA102", "PLR0915"]
+"tests/**/test_*.py" = [
+ "SLF001",
+ "S101",
+ "D",
+ "PLR0915",
+ "ANN",
+ "TID252",
+ "ARG001",
+]
+
+[tool.pydoclint]
+style = "numpy"
+exclude = ".git|.tox|sequentia/_internal|tests" # temporary
+check-return-types = false
+arg-type-hints-in-docstring = false
+quiet = true
+
+[tool.pytest.ini_options]
+addopts = ["--import-mode=importlib"]
diff --git a/sequentia/__init__.py b/sequentia/__init__.py
new file mode 100644
index 00000000..ee898a01
--- /dev/null
+++ b/sequentia/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Scikit-Learn compatible HMM and DTW based
+sequence machine learning algorithms in Python.
+"""
+
+import sklearn
+
+from sequentia import datasets, enums, models, preprocessing, version
+
+__all__ = ["datasets", "models", "preprocessing", "enums", "version"]
+
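+# Enable scikit-learn metadata routing globally so that the sequence
+# `lengths` keyword can be routed through meta-estimators to fit/predict.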
+sklearn.set_config(enable_metadata_routing=True)
diff --git a/sequentia/_internal/__init__.py b/sequentia/_internal/__init__.py
new file mode 100644
index 00000000..cd11e405
--- /dev/null
+++ b/sequentia/_internal/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/sequentia/_internal/_data.py b/sequentia/_internal/_data.py
new file mode 100644
index 00000000..9d577868
--- /dev/null
+++ b/sequentia/_internal/_data.py
@@ -0,0 +1,24 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+import typing as t
+
+import numpy as np
+
+from sequentia._internal._typing import Array, IntArray
+
+__all__ = ["get_idxs", "iter_X"]
+
+
+def get_idxs(lengths: IntArray, /) -> IntArray:
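+ # Convert sequence lengths into (start, end) index pairs, e.g.
+ # lengths=[3, 2] -> [[0, 3], [3, 5]], so each row slices one sequence.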
+ ends = lengths.cumsum()
+ starts = np.zeros_like(ends)
+ starts[1:] = ends[:-1]
+ return np.c_[starts, ends]
+
+
+def iter_X(X: Array, /, *, idxs: IntArray) -> t.Iterator[Array]:
+ for start, end in idxs:
+ yield X[start:end]
diff --git a/sequentia/_internal/_hmm/__init__.py b/sequentia/_internal/_hmm/__init__.py
new file mode 100644
index 00000000..017d0518
--- /dev/null
+++ b/sequentia/_internal/_hmm/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+from sequentia._internal._hmm import topologies
diff --git a/sequentia/_internal/_hmm/topologies.py b/sequentia/_internal/_hmm/topologies.py
new file mode 100644
index 00000000..6b605bb7
--- /dev/null
+++ b/sequentia/_internal/_hmm/topologies.py
@@ -0,0 +1,343 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+from __future__ import annotations
+
+import warnings
+
+import numpy as np
+
+from sequentia._internal._validation import FloatArray
+from sequentia.enums import TopologyMode
+
+__all__ = [
+ "ErgodicTopology",
+ "LeftRightTopology",
+ "LinearTopology",
+ "TOPOLOGY_MAP",
+]
+
+
+class BaseTopology:
+ """Represents a topology for a HMM, imposing restrictions on the
+ transition matrix and initial state distribution.
+
+ Parameters
+ ----------
+ n_states: int
+ Number of states in the HMM.
+
+ random_state: numpy.random.RandomState
+ A random state object for reproducible randomness.
+ """
+
+ mode: TopologyMode
+
+ def __init__(
+ self: BaseTopology,
+ *,
+ n_states: int,
+ random_state: np.random.RandomState,
+ ) -> BaseTopology:
+ self.n_states = n_states
+ self.random_state = random_state
+
+ def uniform_start_probs(self: BaseTopology) -> FloatArray:
+ """Set the initial state distribution as a discrete uniform
+ distribution.
+
+ Returns
+ -------
+ initial: :class:`numpy:numpy.ndarray` (float)
+ The initial state distribution of shape `(n_states,)`.
+ """
+ return np.ones(self.n_states) / self.n_states
+
+ def random_start_probs(self: BaseTopology) -> FloatArray:
+ """Set the initial state distribution by randomly sampling
+ probabilities generated by a Dirichlet distribution.
+
+ Returns
+ -------
+ initial: :class:`numpy:numpy.ndarray` (float)
+ The initial state distribution of shape `(n_states,)`.
+ """
+ return self.random_state.dirichlet(
+ np.ones(self.n_states),
+ size=1,
+ ).flatten()
+
+ def uniform_transition_probs(self: BaseTopology) -> FloatArray:
+ """Set the transition matrix as uniform (equal probability of
+ transitioning to all other possible states from each state)
+ corresponding to the topology.
+
+ Returns
+ -------
+ transitions: :class:`numpy:numpy.ndarray` (float)
+ The uniform transition matrix of shape `(n_states, n_states)`.
+ """
+ raise NotImplementedError
+
+ def random_transition_probs(self: BaseTopology) -> FloatArray:
+ """Set the transition matrix as random (random probability of
+ transitioning to all other possible states from each state) by
+ sampling probabilities from a Dirichlet distribution, according
+ to the topology.
+
+ Returns
+ -------
+ transitions: :class:`numpy:numpy.ndarray` (float)
+ The random transition matrix of shape `(n_states, n_states)`.
+ """
+ raise NotImplementedError
+
+ def check_start_probs(
+ self: BaseTopology, initial: FloatArray, /
+ ) -> FloatArray:
+ """Validate an initial state distribution according to the
+ topology's restrictions.
+
+ Parameters
+ ----------
+ initial: numpy.ndarray (float)
+ The initial state distribution to validate.
+ """
+ if not isinstance(initial, np.ndarray):
+ msg = "Initial state distribution must be a numpy.ndarray"
+ raise TypeError(msg)
+ if initial.shape != (self.n_states,):
+ msg = "Initial state distribution must be of shape (n_states,)"
+ raise ValueError(msg)
+ if not np.isclose(initial.sum(), 1):
+ msg = "Initial state distribution must sum to one"
+ raise ValueError(msg)
+ return initial
+
+ def check_transition_probs(
+ self: BaseTopology, transitions: FloatArray, /
+ ) -> FloatArray:
+ """Validate a transition matrix according to the topology's
+ restrictions.
+
+ Parameters
+ ----------
+ transitions: numpy.ndarray (float)
+ The transition matrix to validate.
+ """
+ if not isinstance(transitions, np.ndarray):
+ msg = "Transition matrix must be a numpy.ndarray"
+ raise TypeError(msg)
+ if transitions.shape != (self.n_states, self.n_states):
+ msg = "Transition matrix must be of shape (n_states, n_states)"
+ raise ValueError(msg)
+ if not np.allclose(transitions.sum(axis=1), np.ones(self.n_states)):
+ msg = "Transition probabilities out of each state must sum to one"
+ raise ValueError(msg)
+ return transitions
+
+
+class ErgodicTopology(BaseTopology):
+ """Represents the topology for an ergodic HMM, imposing non-zero
+ probabilities in the transition matrix.
+
+ Parameters
+ ----------
+ n_states: int
+ Number of states in the HMM.
+
+ random_state: numpy.random.RandomState
+ A random state object for reproducible randomness.
+ """
+
+ mode: TopologyMode = TopologyMode.ERGODIC
+
+ def uniform_transition_probs(self: ErgodicTopology) -> FloatArray:
+ """Set the transition matrix as uniform (equal probability of
+ transitioning to all other possible states from each state)
+ corresponding to the topology.
+
+ Returns
+ -------
+ transitions: :class:`numpy:numpy.ndarray` (float)
+ The uniform transition matrix of shape `(n_states, n_states)`.
+ """
+ return np.ones((self.n_states, self.n_states)) / self.n_states
+
+ def random_transition_probs(self: ErgodicTopology) -> FloatArray:
+ """Set the transition matrix as random (random probability of
+ transitioning to all other possible states from each state) by
+ sampling probabilities from a Dirichlet distribution, according
+ to the topology.
+
+ Returns
+ -------
+ transitions: :class:`numpy:numpy.ndarray` (float)
+ The random transition matrix of shape `(n_states, n_states)`.
+ """
+ return self.random_state.dirichlet(
+ np.ones(self.n_states),
+ size=self.n_states,
+ )
+
+ def check_transition_probs(
+ self: ErgodicTopology, transitions: FloatArray, /
+ ) -> FloatArray:
+ """Validate a transition matrix according to the topology's
+ restrictions.
+
+ Parameters
+ ----------
+ transitions: numpy.ndarray (float)
+ The transition matrix to validate.
+ """
+ super().check_transition_probs(transitions)
+ if not np.all(transitions > 0):
+ msg = (
+ "Zero probabilities in ergodic transition matrix - "
+ "these transition probabilities will not be learned"
+ )
+ warnings.warn(msg, stacklevel=1)
+ return transitions
+
+
+class LeftRightTopology(BaseTopology):
+ """Represents the topology for a left-right HMM, imposing an
+ upper-triangular transition matrix.
+
+ Parameters
+ ----------
+ n_states: int
+ Number of states in the HMM.
+
+ random_state: numpy.random.RandomState
+ A random state object for reproducible randomness.
+ """
+
+ mode: TopologyMode = TopologyMode.LEFT_RIGHT
+
+ def uniform_transition_probs(self: LeftRightTopology) -> FloatArray:
+ """Set the transition matrix as uniform (equal probability of
+ transitioning to all other possible states from each state)
+ corresponding to the topology.
+
+ Returns
+ -------
+ transitions: :class:`numpy:numpy.ndarray` (float)
+ The uniform transition matrix of shape `(n_states, n_states)`.
+ """
+ upper_ones = np.triu(np.ones((self.n_states, self.n_states)))
+ upper_divisors = np.triu(
+ np.tile(np.arange(self.n_states, 0, -1), (self.n_states, 1)).T
+ )
+ lower_ones = np.tril(np.ones(self.n_states), k=-1)
+ return upper_ones / (upper_divisors + lower_ones)
+
+ def random_transition_probs(self: LeftRightTopology) -> FloatArray:
+ """Set the transition matrix as random (random probability of
+ transitioning to all other possible states from each state) by
+ sampling probabilities from a Dirichlet distribution, according
+ to the topology.
+
+ Returns
+ -------
+ transitions: :class:`numpy:numpy.ndarray` (float)
+ The random transition matrix of shape `(n_states, n_states)`.
+ """
+ transitions = np.zeros((self.n_states, self.n_states))
+ for i, row in enumerate(transitions):
+ row[i:] = self.random_state.dirichlet(np.ones(self.n_states - i))
+ return transitions
+
+ def check_transition_probs(
+ self: LeftRightTopology, transitions: FloatArray, /
+ ) -> FloatArray:
+ """Validate a transition matrix according to the topology's
+ restrictions.
+
+ Parameters
+ ----------
+ transitions: numpy.ndarray (float)
+ The transition matrix to validate.
+ """
+ super().check_transition_probs(transitions)
+ if not np.allclose(transitions, np.triu(transitions)):
+ msg = "Left-right transition matrix must be upper-triangular"
+ raise ValueError(msg)
+ return transitions
+
+
+class LinearTopology(LeftRightTopology):
+ """Represents the topology for a linear HMM.
+
+ Parameters
+ ----------
+ n_states: int
+ Number of states in the HMM.
+
+ random_state: numpy.random.RandomState
+ A random state object for reproducible randomness.
+ """
+
+ mode: TopologyMode = TopologyMode.LINEAR
+
+ def uniform_transition_probs(self: LinearTopology) -> FloatArray:
+ """Set the transition matrix as uniform (equal probability of
+ transitioning to all other possible states from each state)
+ corresponding to the topology.
+
+ Returns
+ -------
+ transitions: :class:`numpy:numpy.ndarray` (float)
+ The uniform transition matrix of shape `(n_states, n_states)`.
+ """
+ transitions = np.zeros((self.n_states, self.n_states))
+ for i, row in enumerate(transitions):
+ size = min(2, self.n_states - i)
+ row[i : (i + size)] = np.ones(size) / size
+ return transitions
+
+ def random_transition_probs(self: LinearTopology) -> FloatArray:
+ """Set the transition matrix as random (random probability of
+ transitioning to all other possible states from each state) by
+ sampling probabilities from a Dirichlet distribution, according to the
+ topology.
+
+ Returns
+ -------
+ transitions: :class:`numpy:numpy.ndarray` (float)
+ The random transition matrix of shape `(n_states, n_states)`.
+ """
+ transitions = np.zeros((self.n_states, self.n_states))
+ for i, row in enumerate(transitions):
+ size = min(2, self.n_states - i)
+ row[i : (i + size)] = self.random_state.dirichlet(np.ones(size))
+ return transitions
+
+ def check_transition_probs(
+ self: LinearTopology, transitions: FloatArray, /
+ ) -> FloatArray:
+ """Validate a transition matrix according to the topology's
+ restrictions.
+
+ Parameters
+ ----------
+ transitions: numpy.ndarray (float)
+ The transition matrix to validate.
+ """
+ super().check_transition_probs(transitions)
+ diagonal = np.diag(np.diag(transitions))
+ upper_diagonal = np.diag(np.diag(transitions, k=1), k=1)
+ if not np.allclose(transitions, diagonal + upper_diagonal):
+ msg = (
+ "Linear transition matrix must only consist of "
+ "a diagonal and upper diagonal"
+ )
+ raise ValueError(msg)
+ return transitions
+
+
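+# Lookup from each TopologyMode value to its corresponding topology class.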
+TOPOLOGY_MAP = {
+ topology.mode: topology
+ for topology in (ErgodicTopology, LeftRightTopology, LinearTopology)
+}
diff --git a/sequentia/_internal/_multiprocessing.py b/sequentia/_internal/_multiprocessing.py
new file mode 100644
index 00000000..b0260d08
--- /dev/null
+++ b/sequentia/_internal/_multiprocessing.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+from __future__ import annotations
+
+import joblib
+
+from sequentia._internal._typing import IntArray
+
+__all__ = ["effective_n_jobs"]
+
+
+def effective_n_jobs(n_jobs: int, *, x: IntArray | None = None) -> int:
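+ # Clamp the worker count so that joblib never runs more workers than
+ # there are sequences; with no sequence data, parallelism is pointless.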
+ if x is None:
+ return 1
+ return min(joblib.effective_n_jobs(n_jobs), len(x))
diff --git a/sequentia/_internal/_typing.py b/sequentia/_internal/_typing.py
new file mode 100644
index 00000000..d9db94bd
--- /dev/null
+++ b/sequentia/_internal/_typing.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+import numpy as np
+import numpy.typing as npt
+
+__all__ = ["FloatArray", "IntArray", "Array"]
+
+FloatArray = npt.NDArray[np.float_]
+IntArray = npt.NDArray[np.int_]
+Array = FloatArray | IntArray
diff --git a/sequentia/_internal/_validation.py b/sequentia/_internal/_validation.py
new file mode 100644
index 00000000..04c8122a
--- /dev/null
+++ b/sequentia/_internal/_validation.py
@@ -0,0 +1,233 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+from __future__ import annotations
+
+import functools
+import typing as t
+import warnings
+
+import numpy as np
+from sklearn.base import BaseEstimator
+from sklearn.multiclass import check_classification_targets
+from sklearn.utils import check_random_state
+from sklearn.utils._param_validation import InvalidParameterError
+from sklearn.utils.multiclass import unique_labels
+from sklearn.utils.validation import NotFittedError
+
+from sequentia._internal._typing import Array, FloatArray, IntArray
+
+__all__ = [
+ "check_random_state",
+ "check_is_fitted",
+ "requires_fit",
+ "check_classes",
+ "check_X",
+ "check_X_lengths",
+ "check_y",
+ "check_weighting",
+ "check_use_c",
+]
+
+
+def check_is_fitted(
+ estimator: BaseEstimator,
+ *,
+ attributes: list[str] | None = None,
+ return_: bool = False,
+) -> bool | None:
+ fitted = False
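+ # Follow the scikit-learn convention: an estimator counts as fitted
+ # if it has at least one attribute ending in "_" (e.g. `classes_`),
+ # unless a specific list of attributes to check is given.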
+ if attributes is None:
+ keys = estimator.__dict__
+ fitted = any(attr.endswith("_") for attr in keys if "__" not in attr)
+ else:
+ fitted = all(hasattr(estimator, attr) for attr in attributes)
+
+ if return_:
+ return fitted
+
+ if not fitted:
+ msg = (
+ f"This {type(estimator).__name__!r} instance is not fitted yet. "
+ "Call 'fit' with appropriate arguments before using this method."
+ )
+ raise NotFittedError(msg)
+
+ return None
+
+
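+# Decorator: raise NotFittedError unless the estimator has been fitted
+# before the wrapped method is called.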
+def requires_fit(function: t.Callable) -> t.Callable:
+ @functools.wraps(function)
+ def wrapper(self: t.Self, *args: t.Any, **kwargs: t.Any) -> t.Any:
+ check_is_fitted(self)
+ return function(self, *args, **kwargs)
+
+ return wrapper
+
+
+def check_classes(
+ y: t.Iterable,
+ *,
+ classes: t.Iterable[int] | None = None,
+) -> IntArray:
+ check_classification_targets(y)
+ unique_y = unique_labels(y)
+
+ classes_ = None
+ if classes is None:
+ classes_ = unique_y
+ else:
+ classes_np = np.array(classes).flatten()
+ if not np.issubdtype(classes_np.dtype, np.integer):
+ msg = "Expected classes to be integers"
+ raise TypeError(msg)
+
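+ # deduplicate the provided classes while preserving their given order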
+ _, idx = np.unique(classes_np, return_index=True)
+ classes_ = classes_np[np.sort(idx)]
+ if unseen_labels := set(unique_y) - set(classes_np):
+ msg = (
+ "Encountered label(s) in `y`"
+ f"not present in specified classes - {unseen_labels}"
+ )
+ raise ValueError(msg)
+
+ return classes_.astype(np.int8)
+
+
+def check_X(
+ X: t.Iterable[int] | t.Iterable[float],
+ /,
+ *,
+ dtype: np.float_ | np.int_,
+ univariate: bool = False,
+) -> Array:
+ if not isinstance(X, np.ndarray):
+ try:
+ X = np.array(X).astype(dtype)
+ except Exception as e: # noqa: BLE001
+ type_ = type(X).__name__
+ msg = f"Expected value to be a numpy.ndarray, got {type_!r}"
+ raise TypeError(msg) from e
+ if (dtype_ := X.dtype) != dtype:
+ try:
+ X = X.astype(dtype)
+ except Exception as e: # noqa: BLE001
+ msg = f"Expected array to have dtype {dtype}, got {dtype_}"
+ raise TypeError(msg) from e
+ if (ndim_ := X.ndim) != 2:
+ msg = f"Expected array to have two dimensions, got {ndim_}"
+ raise ValueError(msg)
+ if (len_ := len(X)) == 0:
+ msg = f"Expected array to have be at least length 1, got {len_}"
+ raise ValueError(msg)
+ if univariate and (n_features := X.shape[-1]) > 1:
+ msg = f"Expected array to be univariate, got {n_features} features"
+ raise ValueError(msg)
+ return X
+
+
+def check_X_lengths(
+ X: t.Iterable[int] | t.Iterable[float],
+ /,
+ *,
+ lengths: t.Iterable[int] | None,
+ dtype: np.float_ | np.int_,
+ univariate: bool = False,
+) -> tuple[Array, IntArray]:
+ # validate observations
+ X = check_X(X, dtype=dtype, univariate=univariate)
+
+ # treat whole input as one sequence if no lengths given
+ if lengths is None:
+ lengths = [len(X)]
+
+ # convert to numpy.ndarray and cast to integer
+ lengths = np.array(lengths).astype(int)
+
+ # check that there is at least one sequence
+ if len(lengths) == 0:
+ msg = "Expected at least one sequence"
+ raise ValueError(msg)
+
+ # check that lengths are one-dimensional
+ if (ndim := lengths.ndim) != 1:
+ msg = f"Expected lengths to have one dimension, got {ndim}"
+ raise ValueError(msg)
+
+ # validate sequence lengths
+ if (true_total := len(X)) != (given_total := lengths.sum()):
+ msg = (
+ f"Total of provided lengths ({given_total}) "
+ f"does not match the length of X ({true_total})"
+ )
+ raise ValueError(msg)
+
+ return X, lengths
+
+
+def check_y(
+ y: t.Iterable[int] | t.Iterable[float] | None,
+ /,
+ *,
+ lengths: IntArray,
+ dtype: np.float_ | np.int_ | None = None,
+) -> Array:
+ if y is None:
+ msg = "No output values `y` provided"
+ raise InvalidParameterError(msg)
+
+ # convert to numpy.ndarray and flatten
+ y = np.array(y).flatten()
+
+ # cast to dtype
+ if dtype:
+ y = y.astype(dtype)
+
+ # validate against lengths
+ if (len_y := len(y)) != (n_seqs := len(lengths)):
+ msg = (
+ f"Expected size of y ({len_y}) "
+ f"to be the same as the size of lengths ({n_seqs})"
+ )
+ raise ValueError(msg)
+
+ return y
+
+
+def check_weighting(
+ weighting: t.Callable[[FloatArray], FloatArray] | None,
+ /,
+) -> None:
+ if weighting is None:
+ return
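+ # Probe the callable with a dummy vector: it must return a float
+ # numpy array with the same shape as its input.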
+ try:
+ x = np.random.rand(10)
+ weights = weighting(x)
+ if not isinstance(weights, np.ndarray):
+ msg = "Weights should be an numpy.ndarray"
+ raise TypeError(msg) # noqa: TRY301
+ if not np.issubdtype(weights.dtype, np.floating):
+ msg = "Weights should be floating point values"
+ raise TypeError(msg) # noqa: TRY301
+ if x.shape != weights.shape:
+ msg = "Weights should have the same shape as inputs"
+ raise ValueError(msg) # noqa: TRY301
+ except Exception as e: # noqa: BLE001
+ msg = "Invalid weighting function"
+ raise ValueError(msg) from e
+
+
+def check_use_c(use_c: bool, /) -> bool: # noqa: FBT001
+ if not use_c:
+ return use_c
+
+ import importlib.util
+
+ if importlib.util.find_spec("dtaidistance.dtw_cc"):
+ return True
+
+ msg = "DTAIDistance C library not available - using Python implementation"
+ warnings.warn(msg, ImportWarning, stacklevel=1)
+ return False
diff --git a/sequentia/datasets/__init__.py b/sequentia/datasets/__init__.py
new file mode 100644
index 00000000..66efacc0
--- /dev/null
+++ b/sequentia/datasets/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Utilities for creating and loading sample sequential datasets."""
+
+from sequentia.datasets import data
+from sequentia.datasets.base import SequentialDataset
+from sequentia.datasets.digits import load_digits
+from sequentia.datasets.gene_families import load_gene_families
+
+__all__ = ["data", "load_digits", "load_gene_families", "SequentialDataset"]
diff --git a/sequentia/datasets/base.py b/sequentia/datasets/base.py
new file mode 100644
index 00000000..18984f2d
--- /dev/null
+++ b/sequentia/datasets/base.py
@@ -0,0 +1,442 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Utility wrapper for a generic sequential dataset."""
+
+from __future__ import annotations
+
+import copy
+import pathlib
+import typing as t
+import warnings
+
+import numpy as np
+import pydantic as pyd
+from sklearn.model_selection import train_test_split
+
+from sequentia._internal import _data, _validation
+from sequentia._internal._typing import Array, IntArray
+
+__all__ = ["SequentialDataset"]
+
+
+class SequentialDataset:
+ """Utility wrapper for a generic sequential dataset."""
+
+ def __init__(
+ self: SequentialDataset,
+ X: Array,
+ y: Array | None = None,
+ *,
+ lengths: IntArray | None = None,
+ classes: list[int] | None = None,
+ ) -> SequentialDataset:
+ """Initialize a :class:`.SequentialDataset`.
+
+ Parameters
+ ----------
+ self: SequentialDataset
+
+ X:
+ Sequence(s).
+
+ y:
+ Outputs corresponding to sequence(s) in ``X``.
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ classes:
+ Set of possible class labels
+ (only if ``y`` was provided with categorical values).
+
+ If not provided, these will be determined from the training
+ data labels.
+ """
+ X, lengths = _validation.check_X_lengths(
+ X,
+ lengths=lengths,
+ dtype=X.dtype,
+ )
+ if y is not None:
+ y = _validation.check_y(y, lengths=lengths)
+
+ self._X = X
+ self._y = y
+ self._lengths = lengths
+
+ self._classes = None
+ if self._y is not None and np.issubdtype(self._y.dtype, np.integer):
+ self._classes = _validation.check_classes(
+ self._y,
+ classes=classes,
+ )
+
+ self._idxs = _data.get_idxs(self.lengths)
+
+ def split(
+ self: SequentialDataset,
+ *,
+ test_size: (
+ pyd.NonNegativeInt | pyd.confloat(ge=0, le=1) | None
+ ) = None, # placeholder
+ train_size: (
+ pyd.NonNegativeInt | pyd.confloat(ge=0, le=1) | None
+ ) = None, # placeholder
+ random_state: (
+ pyd.NonNegativeInt | np.random.RandomState | None
+ ) = None, # placeholder
+ shuffle: bool = True,
+ stratify: bool = False,
+ ) -> tuple[SequentialDataset, SequentialDataset]:
+ """Split the dataset into two partitions (train/test).
+
+ See :func:`sklearn:sklearn.model_selection.train_test_split`.
+
+ Parameters
+ ----------
+ self: SequentialDataset
+
+ test_size:
+ Size of the test partition.
+
+ train_size:
+ Size of the training partition.
+
+ random_state:
+ Seed or :class:`numpy:numpy.random.RandomState` object for
+ reproducible pseudo-randomness.
+
+ shuffle:
+ Whether or not to shuffle the data before splitting.
+ If ``shuffle=False`` then ``stratify`` must be ``False``.
+
+ stratify:
+ Whether or not to stratify the partitions by class label.
+
+ Returns
+ -------
+ tuple[SequentialDataset, SequentialDataset]
+ Dataset partitions.
+ """
+ # use a separate name so the labels don't shadow the boolean flag
+ stratify_ = None
+ if stratify:
+ if self._y is None:
+ msg = "Cannot stratify with no provided outputs"
+ warnings.warn(msg, stacklevel=1)
+ elif self._classes is None:
+ msg = "Cannot stratify on non-categorical outputs"
+ warnings.warn(msg, stacklevel=1)
+ else:
+ stratify_ = self._y
+
+ idxs = np.arange(len(self._lengths))
+ train_idxs, test_idxs = train_test_split(
+ idxs,
+ test_size=test_size,
+ train_size=train_size,
+ random_state=random_state,
+ shuffle=shuffle,
+ stratify=stratify_,
+ )
+
+ if self._y is None:
+ X_train, y_train = self[train_idxs], None
+ X_test, y_test = self[test_idxs], None
+ else:
+ X_train, y_train = self[train_idxs]
+ X_test, y_test = self[test_idxs]
+
+ lengths_train = self._lengths[train_idxs]
+ lengths_test = self._lengths[test_idxs]
+ classes = self._classes
+
+ data_train = SequentialDataset(
+ np.vstack(X_train),
+ y_train,
+ lengths=lengths_train,
+ classes=classes,
+ )
+ data_test = SequentialDataset(
+ np.vstack(X_test),
+ y_test,
+ lengths=lengths_test,
+ classes=classes,
+ )
+
+ return data_train, data_test
+
+ def iter_by_class(
+ self: SequentialDataset,
+ ) -> t.Generator[tuple[Array, Array, int]]:
+ """Subset the observation sequences by class.
+
+ Returns
+ -------
+ typing.Generator[tuple[numpy.ndarray, numpy.ndarray, int]]
+ Generator iterating over classes, yielding:
+
+ - ``X`` subset of sequences belonging to the class.
+ - Lengths corresponding to the ``X`` subset.
+ - Class used to subset ``X``.
+
+ Raises
+ ------
+ AttributeError
+ If ``y`` was not provided to :func:`__init__`.
+
+ TypeError
+ If ``y`` was provided but was not categorical.
+ """
+ if self._y is None:
+ msg = "No `y` values were provided during initialization"
+ raise AttributeError(msg)
+
+ if self._classes is None:
+ msg = "Cannot iterate by class on real-valued targets"
+ raise TypeError(msg)
+
+ for c in self._classes:
+ ind = np.argwhere(self._y == c).flatten()
+ X, _ = self[ind]
+ lengths = self._lengths[ind]
+ yield np.vstack(X), lengths, c
+
+ def __len__(self: SequentialDataset) -> int:
+ """Return the number of sequences in the dataset."""
+ return len(self._lengths)
+
+ def __getitem__(
+ self: SequentialDataset,
+ /,
+ i: int,
+ ) -> Array | tuple[Array, Array]:
+ """Slice observation sequences and corresponding outputs."""
+ idxs = np.atleast_2d(self._idxs[i])
+ X = list(_data.iter_X(self._X, idxs=idxs))
+ X = X[0] if isinstance(i, int) and len(X) == 1 else X
+ return X if self._y is None else (X, self._y[i])
+
+ def __iter__(
+ self: SequentialDataset,
+ ) -> t.Generator[Array | tuple[Array, Array]]:
+ """Create a generator over sequences and their corresponding
+ outputs.
+ """
+ for i in range(len(self)):
+ yield self[i]
+
+ @property
+ def X(self: SequentialDataset) -> Array:
+ """Observation sequences.
+
+ Returns
+ -------
+ numpy.ndarray
+ Observation sequences.
+ """
+ return self._X
+
+ @property
+ def y(self: SequentialDataset) -> Array:
+ """Outputs corresponding to ``X``.
+
+ Returns
+ -------
+ numpy.ndarray
+ Sequence outputs.
+
+ Raises
+ ------
+ AttributeError
+ If ``y`` was not provided to :func:`__init__`.
+ """
+ if self._y is None:
+ msg = "No `y` values were provided during initialization"
+ raise AttributeError(msg)
+ return self._y
+
+ @property
+ def lengths(self: SequentialDataset) -> IntArray:
+ """Lengths corresponding to ``X``.
+
+ Returns
+ -------
+ numpy.ndarray
+ Lengths for each sequence in ``X``.
+ """
+ return self._lengths
+
+ @property
+ def classes(self: SequentialDataset) -> IntArray | None:
+ """Set of unique classes in ``y``.
+
+ Returns
+ -------
+ numpy.ndarray | None
+ Unique classes if ``y`` is categorical.
+ """
+ return self._classes
+
+ @property
+ def idxs(self: SequentialDataset) -> IntArray:
+ """Observation sequence start and end indices.
+
+ Returns
+ -------
+ numpy.ndarray
+ Start and end indices for each sequence in ``X``.
+ """
+ return self._idxs
+
+ @property
+ def X_y(self: SequentialDataset) -> dict[str, Array]:
+ """Observation sequences and corresponding outputs.
+
+ Returns
+ -------
+ dict[str, numpy.ndarray]
+ Mapping with keys:
+
+ - ``"X"`` for observation sequences,
+ - ``"y"`` for outputs.
+
+ Raises
+ ------
+ AttributeError
+ If ``y`` was not provided to :func:`__init__`.
+ """
+ if self._y is None:
+ msg = "No `y` values were provided during initialization"
+ raise AttributeError(msg)
+ return {"X": self._X, "y": self._y}
+
+ @property
+ def X_lengths(self: SequentialDataset) -> dict[str, Array]:
+ """Observation sequences and corresponding lengths.
+
+ Returns
+ -------
+ dict[str, numpy.ndarray]
+ Mapping with keys:
+
+ - ``"X"`` for observation sequences,
+ - ``"lengths"`` for lengths.
+ """
+ return {"X": self._X, "lengths": self._lengths}
+
+ @property
+ def X_y_lengths(self: SequentialDataset) -> dict[str, Array]:
+ """Observation sequences and corresponding outputs and lengths.
+
+ Returns
+ -------
+ dict[str, numpy.ndarray]
+ Mapping with keys:
+
+ - ``"X"`` for observation sequences,
+ - ``"y"`` for outputs,
+ - ``"lengths"`` for lengths.
+
+ Raises
+ ------
+ AttributeError
+ If ``y`` was not provided to :func:`__init__`.
+ """
+ if self._y is None:
+ msg = "No `y` values were provided during initialization"
+ raise AttributeError(msg)
+ return {"X": self._X, "y": self._y, "lengths": self._lengths}
+
+ def save(
+ self: SequentialDataset,
+ path: str | pathlib.Path | t.IO,
+ /,
+ *,
+ compress: bool = True,
+ ) -> None:
+ """Store the dataset in ``.npz`` format.
+
+ See :func:`numpy:numpy.savez` and :func:`numpy:numpy.savez_compressed`.
+
+ Parameters
+ ----------
+ path
+ Location to store the dataset.
+
+ compress
+ Whether or not to compress the dataset.
+
+ See Also
+ --------
+ load:
+ Loads a stored dataset in ``.npz`` format.
+ """
+ arrs = self.X_lengths
+
+ if self._y is not None:
+ arrs["y"] = self._y
+
+ if self._classes is not None:
+ arrs["classes"] = self._classes
+
+ save_fun = np.savez_compressed if compress else np.savez
+ save_fun(path, **arrs)
+
+ @classmethod
+ def load(
+ cls: type[SequentialDataset], path: str | pathlib.Path | t.IO, /
+ ) -> SequentialDataset:
+ """Load a stored dataset in ``.npz`` format.
+
+ See :func:`numpy:numpy.load`.
+
+ Parameters
+ ----------
+ path:
+ Location of the stored dataset.
+
+ Returns
+ -------
+ SequentialDataset
+ The loaded dataset.
+
+ See Also
+ --------
+ save:
+ Stores the dataset in ``.npz`` format.
+ """
+ return cls(**np.load(path))
+
+ def copy(self: SequentialDataset) -> SequentialDataset:
+ """Create a copy of the dataset.
+
+ Returns
+ -------
+ SequentialDataset
+ Dataset copy.
+ """
+ params = {
+ "X": copy.deepcopy(self._X),
+ "y": None,
+ "lengths": copy.deepcopy(self._lengths),
+ "classes": None,
+ }
+
+ if self._y is not None:
+ params["y"] = copy.deepcopy(self._y)
+
+ if self._classes is not None:
+ params["classes"] = copy.deepcopy(self._classes)
+
+ return SequentialDataset(
+ params["X"],
+ params["y"],
+ lengths=params["lengths"],
+ classes=params["classes"],
+ )
diff --git a/sequentia/datasets/data/__init__.py b/sequentia/datasets/data/__init__.py
new file mode 100644
index 00000000..e2068632
--- /dev/null
+++ b/sequentia/datasets/data/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Storage for sample datasets."""
diff --git a/lib/sequentia/datasets/data/digits.npz b/sequentia/datasets/data/digits.npz
similarity index 100%
rename from lib/sequentia/datasets/data/digits.npz
rename to sequentia/datasets/data/digits.npz
diff --git a/lib/sequentia/datasets/data/gene_families.npz b/sequentia/datasets/data/gene_families.npz
similarity index 100%
rename from lib/sequentia/datasets/data/gene_families.npz
rename to sequentia/datasets/data/gene_families.npz
diff --git a/sequentia/datasets/digits.py b/sequentia/datasets/digits.py
new file mode 100644
index 00000000..defe4914
--- /dev/null
+++ b/sequentia/datasets/digits.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Free Spoken Digit Dataset."""
+
+from __future__ import annotations
+
+import importlib.resources
+import operator
+
+import numpy as np
+import pydantic as pyd
+
+import sequentia.datasets.data
+from sequentia._internal import _data
+from sequentia.datasets.base import SequentialDataset
+
+__all__ = ["load_digits"]
+
+
+@pyd.validate_call
+def load_digits(
+ *, digits: set[pyd.conint(ge=0, le=9)] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
+) -> SequentialDataset:
+ """Load a dataset of MFCC features of spoken digit audio samples from the
+ Free Spoken Digit Dataset.
+
+ The `Free Spoken Digit Dataset (FSDD) `_
+ consists of 3000 recordings of the spoken digits 0-9.
+
+ This version consists of 13 MFCC features of 50 recordings for each digit
+ by 6 individual speakers.
+
+ Parameters
+ ----------
+ digits:
+ Subset of digits to include in the dataset.
+
+ Returns
+ -------
+ SequentialDataset
+ A dataset object representing the loaded digits.
+ """
+ # Load the dataset from compressed numpy file
+ path = importlib.resources.files(sequentia.datasets.data)
+ data = np.load(path / "digits.npz")
+
+ # Fetch arrays from loaded file
+ X, y, lengths = operator.itemgetter("X", "y", "lengths")(data)
+
+ # Create a dataset only with sequences having the specified labels
+ idx = np.argwhere(np.isin(y, sorted(digits))).flatten()
+ ranges = _data.get_idxs(lengths)[idx]
+ return SequentialDataset(
+ np.vstack(list(_data.iter_X(X, idxs=ranges))),
+ y[idx],
+ lengths=lengths[idx],
+ )
diff --git a/sequentia/datasets/gene_families.py b/sequentia/datasets/gene_families.py
new file mode 100644
index 00000000..21515689
--- /dev/null
+++ b/sequentia/datasets/gene_families.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Gene families dataset."""
+
+from __future__ import annotations
+
+import importlib.resources
+import operator
+
+import numpy as np
+import pydantic as pyd
+from sklearn.preprocessing import LabelEncoder
+
+import sequentia.datasets.data
+from sequentia._internal import _data
+from sequentia.datasets.base import SequentialDataset
+
+__all__ = ["load_gene_families"]
+
+
+@pyd.validate_call
+def load_gene_families(
+ *, families: set[pyd.conint(ge=0, le=6)] = {0, 1, 2, 3, 4, 5, 6}
+) -> tuple[SequentialDataset, LabelEncoder]:
+ """Load a dataset of human DNA sequences grouped by gene family.
+
+ The `Human DNA Sequences `_
+ dataset consists of 4380 DNA sequences belonging to 7 gene families.
+
+ This dataset has imbalanced classes, and uses an
+ :class:`sklearn:sklearn.preprocessing.LabelEncoder` to encode the
+ original symbols (``A``, ``T``, ``C``, ``G``, ``N``) that form the DNA
+ sequences, into integers.
+
+ The gene families have the following class labels:
+
+ - G protein coupled receptors: ``0``
+ - Tyrosine kinase: ``1``
+ - Tyrosine phosphatase: ``2``
+ - Synthetase: ``3``
+ - Synthase: ``4``
+ - Ion channel: ``5``
+ - Transcription: ``6``
+
+ Parameters
+ ----------
+ families:
+ Subset of gene families to include in the dataset.
+
+ Returns
+ -------
+ tuple[SequentialDataset, sklearn.preprocessing.LabelEncoder]
+ - A dataset object representing the loaded genetic data.
+ - Label encoder used to encode the observation symbols into integers.
+ """
+ # Load the dataset from compressed numpy file
+ path = importlib.resources.files(sequentia.datasets.data)
+ data = np.load(path / "gene_families.npz")
+
+ # Fetch arrays from loaded file
+ X, y, lengths = operator.itemgetter("X", "y", "lengths")(data)
+
+ # Encode the observation symbols into integers
+ enc = LabelEncoder()
+ X = np.expand_dims(enc.fit_transform(X.flatten()), axis=-1)
+
+ # Create a dataset only with sequences having the specified labels
+ idx = np.argwhere(np.isin(y, sorted(families))).flatten()
+ ranges = _data.get_idxs(lengths)[idx]
+ data = SequentialDataset(
+ np.vstack(list(_data.iter_X(X, idxs=ranges))),
+ y[idx],
+ lengths=lengths[idx],
+ )
+
+ return data, enc
diff --git a/sequentia/enums.py b/sequentia/enums.py
new file mode 100644
index 00000000..4d3128f2
--- /dev/null
+++ b/sequentia/enums.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Configuration values for Sequentia classes and functions."""
+
+import enum
+
+__all__ = ["TopologyMode", "CovarianceMode", "TransitionMode", "PriorMode"]
+
+
+class TopologyMode(enum.StrEnum):
+ """Topology types for :ref:`hmms`."""
+
+ ERGODIC = "ergodic"
+ """All states have a non-zero probability of transitioning to any
+ state."""
+
+ LEFT_RIGHT = "left-right"
+ """States are arranged in a way such that any state may only
+ transition to itself or any state ahead of it, but not to any
+ previous state."""
+
+ LINEAR = "linear"
+ """Same as :py:enum:mem:`+TopologyMode.LEFT_RIGHT`,
+ but states are only permitted to transition to the next state."""
+
+
+class CovarianceMode(enum.StrEnum):
+ """Covariance matrix types for
+ :class:`~sequentia.models.hmm.variants.gaussian_mixture.GaussianMixtureHMM`.
+ """
+
+ FULL = "full"
+ """All values are fully learnable independently for each component."""
+
+ DIAGONAL = "diag"
+ """Only values along the diagonal may be learned independently
+ for each component."""
+
+ SPHERICAL = "spherical"
+ """Same as :py:enum:mem:`+CovarianceMode.DIAGONAL`,
+ with a single value shared along the diagonal for each component."""
+
+ TIED = "tied"
+ """Same as :py:enum:mem:`+CovarianceMode.FULL`,
+ with all components sharing the same single covariance matrix."""
+
+
+class TransitionMode(enum.StrEnum):
+ """Initial state and transition probability types for :ref:`hmms`."""
+
+ UNIFORM = "uniform"
+ """Equal probability of starting in or transitioning to each state
+ according to the topology."""
+
+ RANDOM = "random"
+ """Random probability of starting in or transitioning to each state
+ according to the topology. State probabilities are sampled from a
+ Dirichlet distribution with unit concentration parameters."""
+
+
+class PriorMode(enum.StrEnum):
+ """Prior probability types for
+ :class:`~sequentia.models.hmm.classifier.HMMClassifier`.
+ """
+
+ UNIFORM = "uniform"
+ """Equal probability for each class."""
+
+ FREQUENCY = "frequency"
+ """Inverse count of the occurrences of the class in the training data."""
+
+
+try:
+ # add enum documentation for Sphinx
+ import enum_tools.documentation
+
+ TopologyMode = enum_tools.documentation.document_enum(TopologyMode)
+ CovarianceMode = enum_tools.documentation.document_enum(CovarianceMode)
+ TransitionMode = enum_tools.documentation.document_enum(TransitionMode)
+ PriorMode = enum_tools.documentation.document_enum(PriorMode)
+except ImportError:
+ pass
diff --git a/sequentia/models/__init__.py b/sequentia/models/__init__.py
new file mode 100644
index 00000000..bbff6eb3
--- /dev/null
+++ b/sequentia/models/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Machine learning algorithms for sequence classification and regression."""
+
+from sequentia.models.hmm import (
+ CategoricalHMM,
+ GaussianMixtureHMM,
+ HMMClassifier,
+)
+from sequentia.models.knn import KNNClassifier, KNNRegressor
+
+__all__ = [
+ "CategoricalHMM",
+ "GaussianMixtureHMM",
+ "HMMClassifier",
+ "KNNClassifier",
+ "KNNRegressor",
+]
diff --git a/sequentia/models/base.py b/sequentia/models/base.py
new file mode 100644
index 00000000..7a412d51
--- /dev/null
+++ b/sequentia/models/base.py
@@ -0,0 +1,249 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Base classifier and regressor mixin classes."""
+
+from __future__ import annotations
+
+import abc
+
+import numpy as np
+import sklearn.base
+import sklearn.metrics
+
+from sequentia._internal import _validation
+from sequentia._internal._typing import Array, FloatArray, IntArray
+
+__all__ = ["ClassifierMixin", "RegressorMixin"]
+
+
+class ClassifierMixin(
+ sklearn.base.BaseEstimator,
+ sklearn.base.ClassifierMixin,
+ metaclass=abc.ABCMeta,
+):
+ """Represents a generic sequential classifier."""
+
+ @abc.abstractmethod
+ def fit(
+ self: ClassifierMixin,
+ X: Array,
+ y: IntArray,
+ *,
+ lengths: IntArray | None = None,
+ ) -> ClassifierMixin:
+ """Fit the classifier with the provided sequences and outputs."""
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def predict(
+ self: ClassifierMixin,
+ X: Array,
+ *,
+ lengths: IntArray | None = None,
+ ) -> IntArray:
+ """Predict outputs for the provided sequences."""
+ raise NotImplementedError
+
+ def fit_predict(
+ self: ClassifierMixin,
+ X: Array,
+ y: IntArray,
+ *,
+ lengths: IntArray | None = None,
+ ) -> IntArray:
+ """Fit the model to the sequence(s) in ``X`` and predicts outputs for
+ ``X``.
+
+ Parameters
+ ----------
+ self: ClassifierMixin
+
+ X:
+ Sequence(s).
+
+ y:
+ Outputs corresponding to sequence(s) in ``X``.
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ Returns
+ -------
+ numpy.ndarray
+ Output predictions.
+ """
+ return self.fit(X, y, lengths=lengths).predict(X, lengths=lengths)
+
+ @abc.abstractmethod
+ def predict_proba(
+ self: ClassifierMixin,
+ X: Array,
+ *,
+ lengths: IntArray | None = None,
+ ) -> FloatArray:
+ """Predict class probabilities for the provided sequences."""
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def predict_scores(
+ self: ClassifierMixin,
+ X: Array,
+ *,
+ lengths: IntArray | None = None,
+ ) -> FloatArray:
+ """Predict class scores for the provided sequences."""
+ raise NotImplementedError
+
+ @_validation.requires_fit
+ def score(
+ self: ClassifierMixin,
+ X: Array,
+ y: IntArray,
+ *,
+ lengths: IntArray | None = None,
+ normalize: bool = True,
+ sample_weight: Array | None = None,
+ ) -> float:
+ """Calculate the predictive accuracy for the sequence(s) in ``X``.
+
+ Parameters
+ ----------
+ self: ClassifierMixin
+
+ X:
+ Sequence(s).
+
+ y:
+ Outputs corresponding to sequence(s) in ``X``.
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ normalize:
+ See :func:`sklearn:sklearn.metrics.accuracy_score`.
+
+ sample_weight:
+ See :func:`sklearn:sklearn.metrics.accuracy_score`.
+
+ Returns
+ -------
+ float
+ Predictive accuracy.
+
+ Notes
+ -----
+ This method requires a trained classifier — see :func:`fit`.
+ """
+ y = _validation.check_y(y, lengths=lengths, dtype=np.int8)
+ y_pred = self.predict(X, lengths=lengths)
+ return sklearn.metrics.accuracy_score(
+ y, y_pred, normalize=normalize, sample_weight=sample_weight
+ )
+
+
+class RegressorMixin(
+ sklearn.base.BaseEstimator,
+ sklearn.base.RegressorMixin,
+ metaclass=abc.ABCMeta,
+):
+ """Represents a generic sequential regressor."""
+
+ @abc.abstractmethod
+ def fit(
+ self: RegressorMixin,
+ X: FloatArray,
+ y: FloatArray,
+ *,
+ lengths: IntArray | None = None,
+ ) -> RegressorMixin:
+ """Fit the regressor with the provided sequences and outputs."""
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def predict(
+ self: RegressorMixin, X: FloatArray, *, lengths: IntArray | None = None
+ ) -> FloatArray:
+ """Predict outputs for the provided sequences."""
+ raise NotImplementedError
+
+ def fit_predict(
+ self: RegressorMixin,
+ X: FloatArray,
+ y: FloatArray,
+ *,
+ lengths: IntArray | None = None,
+ ) -> FloatArray:
+ """Fit the model to the sequence(s) in ``X`` and predicts outputs for
+ ``X``.
+
+ Parameters
+ ----------
+ self: RegressorMixin
+
+ X:
+ Sequence(s).
+
+ y:
+ Outputs corresponding to sequence(s) in ``X``.
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ Returns
+ -------
+ numpy.ndarray
+ Output predictions.
+ """
+ return self.fit(X, y, lengths=lengths).predict(X, lengths=lengths)
+
+ @_validation.requires_fit
+ def score(
+ self: RegressorMixin,
+ X: FloatArray,
+ y: FloatArray,
+ *,
+ lengths: IntArray | None = None,
+ sample_weight: Array | None = None,
+ ) -> float:
+ r"""Calculate the predictive coefficient of determination
+ (R\ :sup:`2`) for the sequence(s) in ``X``.
+
+ Parameters
+ ----------
+ self: RegressorMixin
+
+ X:
+ Sequence(s).
+
+ y:
+ Outputs corresponding to sequence(s) in ``X``.
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ sample_weight:
+ See :func:`sklearn:sklearn.metrics.r2_score`.
+
+ Returns
+ -------
+ float
+ Coefficient of determination.
+
+ Notes
+ -----
+ This method requires a trained regressor — see :func:`fit`.
+ """
+ y = _validation.check_y(y, lengths=lengths, dtype=np.float64)
+ y_pred = self.predict(X, lengths=lengths)
+ return sklearn.metrics.r2_score(y, y_pred, sample_weight=sample_weight)
diff --git a/sequentia/models/hmm/__init__.py b/sequentia/models/hmm/__init__.py
new file mode 100644
index 00000000..480da35d
--- /dev/null
+++ b/sequentia/models/hmm/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Hidden Markov model based machine learning algorithms."""
+
+from sequentia.models.hmm.classifier import HMMClassifier
+from sequentia.models.hmm.variants import CategoricalHMM, GaussianMixtureHMM
+
+__all__ = ["CategoricalHMM", "GaussianMixtureHMM", "HMMClassifier"]
diff --git a/sequentia/models/hmm/classifier.py b/sequentia/models/hmm/classifier.py
new file mode 100644
index 00000000..be0f31e4
--- /dev/null
+++ b/sequentia/models/hmm/classifier.py
@@ -0,0 +1,570 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""A classifier consisting of HMMs, each trained independently to recognize
+sequences of a single class.
+"""
+
+from __future__ import annotations
+
+import pathlib
+import typing as t
+
+import joblib
+import numpy as np
+import pydantic as pyd
+from sklearn.utils.validation import NotFittedError
+
+from sequentia._internal import _data, _multiprocessing, _validation
+from sequentia._internal._typing import Array, FloatArray, IntArray
+from sequentia.datasets.base import SequentialDataset
+from sequentia.enums import PriorMode
+from sequentia.models.base import ClassifierMixin
+from sequentia.models.hmm.variants.base import BaseHMM
+
+
+class HMMClassifier(ClassifierMixin):
+ """A classifier consisting of HMMs, each trained independently to
+ recognize sequences of a single class.
+
+ The predicted class for a given observation sequence is the class
+ represented by the HMM which produces the maximum posterior
+ probability for the observation sequence.
+
+ Examples
+ --------
+ Using a :class:`.HMMClassifier` (with :class:`.GaussianMixtureHMM`
+ models) to classify spoken digits. ::
+
+ import numpy as np
+ from sequentia.datasets import load_digits
+ from sequentia.models.hmm import GaussianMixtureHMM, HMMClassifier
+
+ # Seed for reproducible pseudo-randomness
+ random_state = np.random.RandomState(1)
+
+ # Fetch MFCCs of spoken digits
+ data = load_digits()
+ train_data, test_data = data.split(test_size=0.2, random_state=random_state)
+
+ # Create a HMMClassifier using a class frequency prior
+ clf = HMMClassifier(prior='frequency')
+
+ # Add an untrained HMM for each class
+ for label in data.classes:
+ model = GaussianMixtureHMM(random_state=random_state)
+ clf.add_model(model, label=label)
+
+ # Fit the HMMs by providing training observation sequences for all classes
+ clf.fit(train_data.X, train_data.y, lengths=train_data.lengths)
+
+ # Predict classes for the test observation sequences
+ y_pred = clf.predict(test_data.X, lengths=test_data.lengths)
+
+ As done in the above example, we can provide unfitted HMMs using
+ :func:`add_model` or :func:`add_models`, then provide training
+ observation sequences for all classes to :func:`fit`, which will
+ automatically train each HMM on the appropriate subset of data.
+
+ Alternatively, we may provide pre-fitted HMMs and call :func:`fit` with
+ no arguments. ::
+
+ # Create a HMMClassifier using a class frequency prior
+ clf = HMMClassifier(prior='frequency')
+
+ # Manually fit each HMM on its own subset of data
+ for X_train, lengths_train, label in train_data.iter_by_class():
+ model = GaussianMixtureHMM(random_state=random_state)
+ model.fit(X_train, lengths=lengths_train)
+ clf.add_model(model, label=label)
+
+ # Fit the classifier
+ clf.fit()
+ """ # noqa: E501
+
+ @pyd.validate_call(config=dict(arbitrary_types_allowed=True))
+ def __init__(
+ self: pyd.SkipValidation,
+ *,
+ prior: (
+ PriorMode | dict[int, pyd.confloat(ge=0, le=1)]
+ ) = PriorMode.UNIFORM, # placeholder
+ classes: list[int] | None = None,
+ n_jobs: pyd.PositiveInt | pyd.NegativeInt = 1,
+ ) -> pyd.SkipValidation:
+ """Initialize a :class:`.HMMClassifier`.
+
+ Parameters
+ ----------
+ self: HMMClassifier
+
+ prior:
+ Type of prior probability to assign to each HMM.
+
+ - If ``"uniform"``, a uniform prior will be used, making each HMM
+ equally likely.
+ - If ``"frequency"``, the prior probability of each HMM is equal
+ to the fraction of total observation sequences that the HMM was
+ fitted with.
+ - If a ``dict``, custom prior probabilities can be assigned to
+ each HMM. The keys should be the label of the class represented
+ by the HMM, and the value should be the prior probability for
+ the HMM.
+
+ classes:
+ Set of possible class labels.
+
+ - If not provided, these will be determined from the training
+ data labels.
+ - If provided, output from methods such as :func:`predict_proba`
+ and :func:`predict_scores` will follow the ordering of the
+ class labels provided here.
+
+ n_jobs:
+ Maximum number of concurrently running workers.
+
+ - If 1, no parallelism is used at all (useful for debugging).
+ - If -1, all CPUs are used.
+ - If < -1, ``(n_cpus + 1 + n_jobs)`` are used — e.g.
+ ``n_jobs=-2`` uses all but one.
+
+ Returns
+ -------
+ HMMClassifier
+ """
+ #: Type of prior probability to assign to each HMM.
+ self.prior: PriorMode | dict[int, pyd.confloat(ge=0, le=1)] = prior
+ #: Set of possible class labels.
+ self.classes: list[int] | None = classes
+ #: Maximum number of concurrently running workers.
+ self.n_jobs: pyd.PositiveInt | pyd.NegativeInt = n_jobs
+ #: HMMs constituting the :class:`.HMMClassifier`.
+ self.models: dict[int, BaseHMM] = {}
+ # Allow metadata routing for lengths
+ self.set_fit_request(lengths=True)
+ self.set_predict_request(lengths=True)
+ self.set_predict_proba_request(lengths=True)
+ self.set_predict_log_proba_request(lengths=True)
+ self.set_score_request(
+ lengths=True,
+ normalize=True,
+ sample_weight=True,
+ )
+
+ @pyd.validate_call(config=dict(arbitrary_types_allowed=True))
+ def add_model(
+ self: pyd.SkipValidation,
+ model: BaseHMM,
+ /,
+ *,
+ label: int,
+ ) -> pyd.SkipValidation:
+ """Add a single HMM to the classifier.
+
+ Parameters
+ ----------
+ self: HMMClassifier
+
+ model:
+ HMM to add to the classifier.
+
+ label:
+ Class represented by the HMM.
+
+ Returns
+ -------
+ HMMClassifier
+ The classifier.
+
+ Notes
+ -----
+ All models added to the classifier must be of the same type — either
+ :class:`.GaussianMixtureHMM` or :class:`.CategoricalHMM`.
+ """
+ if len(self.models) > 0 and not isinstance(
+ model, type(next(iter(self.models.values())))
+ ):
+ msg = (
+ f"Model of type {type(model).__name__} must be the same "
+ "as the models already provided to this "
+ f"{type(self).__name__} instance"
+ )
+ raise TypeError(msg)
+ self.models[int(label)] = model
+ return self
+
+ @pyd.validate_call(config=dict(arbitrary_types_allowed=True))
+ def add_models(
+ self: pyd.SkipValidation,
+ models: dict[int, BaseHMM],
+ /,
+ ) -> pyd.SkipValidation:
+ """Add HMMs to the classifier.
+
+ Parameters
+ ----------
+ self: HMMClassifier
+
+ models:
+ HMMs to add to the classifier. The key for each HMM should be the
+ label of the class represented by the HMM.
+
+ Returns
+ -------
+ HMMClassifier
+ The classifier.
+
+ Notes
+ -----
+ All models added to the classifier must be of the same type — either
+ :class:`.GaussianMixtureHMM` or :class:`.CategoricalHMM`.
+ """
+ for label, model in models.items():
+ self.add_model(model, label=label)
+ return self
+
+ def fit(
+ self: HMMClassifier,
+ X: Array | None = None,
+ y: IntArray | None = None,
+ *,
+ lengths: IntArray | None = None,
+ ) -> HMMClassifier:
+ """Fit the HMMs to the sequence(s) in ``X``.
+
+ - If fitted models were provided with :func:`add_model` or
+ :func:`add_models`, no arguments should be passed to :func:`fit`.
+ - If unfitted models were provided with :func:`add_model` or
+ :func:`add_models`, training data ``X``, ``y`` and ``lengths``
+ must be provided to :func:`fit`.
+
+ Parameters
+ ----------
+ self: HMMClassifier
+
+ X:
+ Sequence(s).
+
+ y:
+ Classes corresponding to sequence(s) in ``X``.
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ Returns
+ -------
+ HMMClassifier
+ The fitted classifier.
+ """
+ if X is None or y is None:
+ if len(self.models) == 0:
+ msg = (
+ "Fitted models must be provided if no training data is "
+ "provided - use add_model() to add fitted models to the "
+ "classifier object"
+ )
+ raise RuntimeError(msg)
+
+ for label, model in self.models.items():
+ if not _validation.check_is_fitted(model, return_=True):
+ msg = (
+ f"The model corresponding to label {label} must be "
+ "pre-fitted if no training data is provided"
+ )
+ raise NotFittedError(msg)
+
+ if self.classes is not None:
+ self.classes_ = _validation.check_classes(
+ self.classes, classes=self.classes
+ )
+ else:
+ # Fetch classes from provided models
+ self.classes_ = np.array(list(self.models.keys()))
+ else:
+ y = _validation.check_y(y, lengths=lengths, dtype=np.int8)
+ self.classes_ = _validation.check_classes(y, classes=self.classes)
+
+ # Check that each label has a HMM (and vice versa)
+ if set(self.models.keys()) != set(self.classes_):
+ msg = (
+ "Classes in the dataset are not consistent with the added "
+ "models - ensure that every added model corresponds to a "
+ "class in the dataset"
+ )
+ raise ValueError(msg)
+
+ if X is not None and y is not None:
+ # Iterate through dataset by class and fit the corresponding model
+ dataset = SequentialDataset(
+ X,
+ y,
+ lengths=lengths,
+ classes=self.classes_,
+ )
+ for X_c, lengths_c, c in dataset.iter_by_class():
+ self.models[c].fit(X_c, lengths=lengths_c)
+
+ # Set class priors
+        models: t.ItemsView[int, BaseHMM] = self.models.items()
+ if self.prior == PriorMode.UNIFORM:
+ self.prior_ = {c: 1 / len(self.classes_) for c, _ in models}
+ elif self.prior == PriorMode.FREQUENCY:
+ total_seqs = sum(mod.n_seqs_ for _, mod in models)
+ self.prior_ = {c: mod.n_seqs_ / total_seqs for c, mod in models}
+ elif isinstance(self.prior, dict):
+ if set(self.prior.keys()) != set(self.classes_):
+ msg = (
+ "Classes in the dataset are not consistent with the "
+ "classes in `prior` - ensure that every provided class "
+ "prior corresponds to a class in the dataset"
+ )
+ raise ValueError(msg)
+ self.prior_ = self.prior
+
+ return self
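+
+    # Worked example (editor's note): with prior='frequency' and two
+    # classes fitted on 30 and 70 sequences respectively, the fitted priors
+    # become prior_ == {0: 0.3, 1: 0.7}; with prior='uniform' both are 0.5.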
+
+ @_validation.requires_fit
+ def predict(
+ self: HMMClassifier,
+ X: Array,
+ *,
+ lengths: IntArray | None = None,
+ ) -> IntArray:
+ """Predict classes for the sequence(s) in ``X``.
+
+ Parameters
+ ----------
+ self: HMMClassifier
+
+ X:
+ Sequence(s).
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ Returns
+ -------
+ numpy.ndarray
+ Class predictions.
+
+ Notes
+ -----
+ This method requires a trained classifier — see :func:`fit`.
+ """
+ scores = self.predict_scores(X, lengths=lengths)
+ max_score_idxs = scores.argmax(axis=1)
+ return self.classes_[max_score_idxs]
+
+ @_validation.requires_fit
+ def predict_log_proba(
+ self: HMMClassifier, X: Array, *, lengths: IntArray | None = None
+ ) -> FloatArray:
+ """Predict log un-normalized posterior probabilities for the
+        sequence(s) in ``X``.
+
+ Parameters
+ ----------
+ self: HMMClassifier
+
+ X:
+ Sequence(s).
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ Returns
+ -------
+ numpy.ndarray:
+ Log probabilities.
+
+ Notes
+ -----
+ This method requires a trained classifier — see :func:`fit`.
+ """
+ return self.predict_scores(X, lengths=lengths)
+
+ @_validation.requires_fit
+ def predict_proba(
+ self: HMMClassifier, X: Array, *, lengths: IntArray | None = None
+ ) -> FloatArray:
+ """Predict class probabilities for the sequence(s) in ``X``.
+
+ Probabilities are calculated as the posterior probability of each
+ HMM generating the sequence.
+
+ Parameters
+ ----------
+ self: HMMClassifier
+
+ X:
+ Sequence(s).
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ Returns
+ -------
+ numpy.ndarray:
+ Class membership probabilities.
+
+ Notes
+ -----
+ This method requires a trained classifier — see :func:`fit`.
+ """
+ proba = self.predict_log_proba(X, lengths=lengths)
+ proba -= proba.max(axis=1, keepdims=True)
+ proba = np.exp(proba)
+ proba /= proba.sum(axis=1, keepdims=True)
+ return proba
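+
+    # Editor's note: subtracting the row-wise maximum before exponentiating
+    # is the standard log-sum-exp stabilization; the shift cancels in the
+    # final normalization, so the probabilities are unchanged but np.exp
+    # cannot overflow.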
+
+ @_validation.requires_fit
+ def predict_scores(
+ self: HMMClassifier, X: Array, *, lengths: IntArray | None = None
+ ) -> FloatArray:
+ """Predict class scores for the sequence(s) in ``X``.
+
+ Scores are calculated as the log posterior probability of each HMM
+ generating the sequence.
+
+ Parameters
+ ----------
+ self: HMMClassifier
+
+ X:
+ Sequence(s).
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ Returns
+ -------
+ numpy.ndarray:
+ Class scores.
+
+ Notes
+ -----
+ This method requires a trained classifier — see :func:`fit`.
+ """
+ model: BaseHMM = next(iter(self.models.values()))
+ X, lengths = _validation.check_X_lengths(
+ X,
+ lengths=lengths,
+ dtype=model._DTYPE, # noqa: SLF001
+ )
+ n_jobs = _multiprocessing.effective_n_jobs(self.n_jobs, x=lengths)
+ chunk_idxs = np.array_split(_data.get_idxs(lengths), n_jobs)
+ return np.concatenate(
+ joblib.Parallel(n_jobs=n_jobs, max_nbytes=None)(
+ joblib.delayed(self._compute_scores_chunk)(X, idxs=idxs)
+ for idxs in chunk_idxs
+ )
+ )
+
+ @_validation.requires_fit
+ def save(self: HMMClassifier, path: str | pathlib.Path | t.IO, /) -> None:
+ """Serialize and save a fitted HMM classifier.
+
+ Parameters
+ ----------
+ self: HMMClassifier
+
+ path:
+ Location to save the serialized classifier.
+
+ Notes
+ -----
+ This method requires a trained classifier — see :func:`fit`.
+
+ See Also
+ --------
+ load:
+ Load and deserialize a fitted HMM classifier.
+ """
+ # Fetch main parameters and fitted values
+ dict_ = self.__dict__.items()
+ state = {
+ "params": self.get_params(),
+ "models": self.models,
+ "fitted": {k: v for k, v in dict_ if k.endswith("_")},
+ }
+
+ # Serialize model
+ joblib.dump(state, path)
+
+ @classmethod
+ def load(
+ cls: type[HMMClassifier],
+ path: str | pathlib.Path | t.IO,
+ /,
+ ) -> HMMClassifier:
+ """Load and deserialize a fitted HMM classifier.
+
+ Parameters
+ ----------
+ cls: type[HMMClassifier]
+
+ path:
+ Location to load the serialized classifier from.
+
+ Returns
+ -------
+ HMMClassifier
+ Fitted HMM classifier.
+
+ See Also
+ --------
+ save:
+ Serialize and save a fitted HMM classifier.
+ """
+ state = joblib.load(path)
+
+ # Set main parameters
+ model = cls(**state["params"])
+ model.models = state["models"]
+
+ # Set fitted values
+ for k, v in state["fitted"].items():
+ setattr(model, k, v)
+
+ # Return deserialized model
+ return model
+
+ def _compute_scores_chunk(
+ self: HMMClassifier, X: Array, /, *, idxs: IntArray
+ ) -> FloatArray:
+ """Compute log posterior probabilities for a chunk of sequences."""
+ scores = np.zeros((len(idxs), len(self.classes_)))
+ for i, x in enumerate(_data.iter_X(X, idxs=idxs)):
+ scores[i] = self._compute_log_posterior(x)
+ return scores
+
+ def _compute_log_posterior(
+ self: HMMClassifier,
+ x: Array,
+ /,
+ ) -> FloatArray:
+ """Compute log posterior probabilities for each class."""
+ log_posterior = np.full(len(self.classes_), -np.inf)
+ for i, k in enumerate(self.classes_):
+ model = self.models[k]
+ log_prior = np.log(self.prior_[k])
+ log_likelihood = model.score(x)
+ log_posterior[i] = log_prior + log_likelihood
+ return log_posterior
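+
+    # Editor's note: this is Bayes' rule in log space,
+    #   log P(c | x) = log P(x | c) + log P(c) - log P(x),
+    # where the constant log P(x) is dropped as it does not change the
+    # ranking of classes.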
diff --git a/sequentia/models/hmm/variants/__init__.py b/sequentia/models/hmm/variants/__init__.py
new file mode 100644
index 00000000..b40b57f1
--- /dev/null
+++ b/sequentia/models/hmm/variants/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Supported hidden Markov Model variants."""
+
+from sequentia.models.hmm.variants.categorical import CategoricalHMM
+from sequentia.models.hmm.variants.gaussian_mixture import GaussianMixtureHMM
+
+__all__ = ["CategoricalHMM", "GaussianMixtureHMM"]
diff --git a/sequentia/models/hmm/variants/base.py b/sequentia/models/hmm/variants/base.py
new file mode 100644
index 00000000..d80d4e72
--- /dev/null
+++ b/sequentia/models/hmm/variants/base.py
@@ -0,0 +1,478 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Wrapper for a generic hidden Markov Model variant."""
+
+from __future__ import annotations
+
+import abc
+import copy
+import re
+import typing as t
+import warnings
+
+import hmmlearn.base
+import numpy as np
+import pydantic as pyd
+from sklearn.base import BaseEstimator
+from sklearn.utils import check_random_state
+
+from sequentia import enums
+from sequentia._internal import _hmm, _validation
+from sequentia._internal._typing import Array, FloatArray, IntArray
+
+__all__ = ["BaseHMM"]
+
+
+class BaseHMM(BaseEstimator, metaclass=abc.ABCMeta):
+ """Wrapper for a generic hidden Markov Model variant."""
+
+ _DTYPE: type
+ _UNIVARIATE: bool
+
+ @abc.abstractmethod
+ def __init__(
+ self: BaseHMM,
+ *,
+ n_states: pyd.PositiveInt,
+ topology: enums.TopologyMode | None,
+ random_state: pyd.NonNegativeInt | np.random.RandomState | None,
+ hmmlearn_kwargs: dict[str, t.Any] | None,
+ ) -> BaseHMM:
+ self.n_states: int = n_states
+ """Number of states in the Markov chain."""
+
+ self.topology: enums.TopologyMode = topology
+ """Transition topology of the Markov chain — see :ref:`topologies`."""
+
+ self.random_state: int | np.random.RandomState | None = random_state
+ """Seed or :class:`numpy:numpy.random.RandomState` object for
+ reproducible pseudo-randomness."""
+
+ self.hmmlearn_kwargs: dict[str, t.Any] = self._check_hmmlearn_kwargs(
+ hmmlearn_kwargs
+ )
+ """Additional key-word arguments provided to the
+        `hmmlearn <https://hmmlearn.readthedocs.io/en/latest/>`__ HMM
+ constructor."""
+
+ self.model: hmmlearn.base.BaseHMM = None
+ """Underlying HMM object from
+        `hmmlearn <https://hmmlearn.readthedocs.io/en/latest/>`__ — only set
+ after :func:`fit`."""
+
+ self._skip_init_params = set()
+ self._skip_params = set()
+
+ def fit(
+ self: BaseHMM,
+ X: Array,
+ *,
+ lengths: IntArray | None = None,
+ ) -> BaseHMM:
+ """Fit the HMM to the sequences in ``X``, using the Baum—Welch
+ algorithm.
+
+ Parameters
+ ----------
+ self: BaseHMM
+
+ X:
+ Sequence(s).
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ Returns
+ -------
+ BaseHMM
+ The fitted HMM.
+ """
+ X, lengths = _validation.check_X_lengths(
+ X, lengths=lengths, dtype=self._DTYPE, univariate=self._UNIVARIATE
+ )
+ self.random_state_ = _validation.check_random_state(self.random_state)
+ if self.topology is None:
+ self.topology_ = None
+ else:
+ self.topology_ = _hmm.topologies.TOPOLOGY_MAP[self.topology](
+ n_states=self.n_states,
+ random_state=self.random_state_,
+ )
+ self._check_init_params()
+
+ kwargs = copy.deepcopy(self.hmmlearn_kwargs)
+ kwargs["init_params"] = "".join(
+ set(kwargs["init_params"]) - self._skip_init_params
+ )
+ kwargs["params"] = "".join(set(kwargs["params"]) - self._skip_params)
+ self.model = self._init_hmm(**kwargs)
+
+ for attr in self._hmmlearn_params():
+ if hasattr(self, f"_{attr}"):
+ setattr(self.model, f"{attr}_", getattr(self, f"_{attr}"))
+
+ self.model.fit(X, lengths=lengths)
+ self.n_seqs_ = len(lengths)
+
+ return self
+
+ @_validation.requires_fit
+ def score(self: BaseHMM, x: Array, /) -> float:
+ """Calculate the log-likelihood of the HMM generating a single
+ observation sequence.
+
+ Parameters
+ ----------
+ self: BaseHMM
+
+ x:
+ Sequence.
+
+ Returns
+ -------
+ float:
+ The log-likelihood.
+
+ Notes
+ -----
+ This method requires a trained model — see :func:`fit`.
+ """
+ x = _validation.check_X(
+ x,
+ dtype=self._DTYPE,
+ univariate=self._UNIVARIATE,
+ )
+ return self.model.score(x)
+
+    @property
+    @abc.abstractmethod
+ @_validation.requires_fit
+ def n_params(self: BaseHMM) -> int:
+ """Number of trainable parameters — requires :func:`fit`."""
+ n_params = 0
+ if "s" not in self._skip_params:
+ n_params += self.model.startprob_.size
+ if "t" not in self._skip_params:
+ n_params += self.model.transmat_.size
+ return n_params
+
+ @_validation.requires_fit
+ def bic(
+ self: BaseHMM,
+ X: Array,
+ *,
+ lengths: IntArray | None = None,
+ ) -> float:
+ """The Bayesian information criterion of the model, evaluated with
+ the maximum likelihood of ``X``.
+
+ Parameters
+ ----------
+ self: BaseHMM
+
+ X:
+ Sequence(s).
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ Returns
+ -------
+ float:
+ The Bayesian information criterion.
+
+ Notes
+ -----
+ This method requires a trained model — see :func:`fit`.
+ """
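+        # Editor's note: BIC = k * ln(n) - 2 * ln(L), where k is the number
+        # of trainable parameters, n the number of sequences, and ln(L) the
+        # maximum log-likelihood; score() already returns the
+        # log-likelihood, so it is used directly below.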
+        max_log_likelihood = self.model.score(X, lengths=lengths)
+        n_params = self.n_params
+        n_seqs = len(lengths) if lengths is not None else 1
+        return n_params * np.log(n_seqs) - 2 * max_log_likelihood
+
+ @_validation.requires_fit
+ def aic(
+ self: BaseHMM,
+ X: Array,
+ *,
+ lengths: IntArray | None = None,
+ ) -> float:
+ """The Akaike information criterion of the model, evaluated with the
+ maximum likelihood of ``X``.
+
+ Parameters
+ ----------
+ self: BaseHMM
+
+ X:
+ Sequence(s).
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ Returns
+ -------
+ float:
+ The Akaike information criterion.
+
+ Notes
+ -----
+ This method requires a trained model — see :func:`fit`.
+ """
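+        # Editor's note: AIC = 2 * k - 2 * ln(L); score() already returns
+        # the log-likelihood, so no further log transform is applied.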
+ max_log_likelihood = self.model.score(X, lengths=lengths)
+ n_params = self.n_params
+        return 2 * (n_params - max_log_likelihood)
+
+ @pyd.validate_call(config=dict(arbitrary_types_allowed=True))
+ def set_state_start_probs(
+ self: pyd.SkipValidation,
+ probs: (
+ FloatArray | enums.TransitionMode
+        ) = enums.TransitionMode.RANDOM,
+ /,
+ ) -> None:
+ """Set the initial state probabilities.
+
+ If this method is **not** called, initial state probabilities are
+ initialized depending on the value of ``topology`` provided to
+ :func:`__init__`.
+
+ - If ``topology`` was set to ``'ergodic'``, ``'left-right'`` or
+ ``'linear'``, then random probabilities will be assigned
+ according to the topology by calling :func:`set_state_start_probs`
+ with ``probs='random'``.
+ - If ``topology`` was set to ``None``, then initial state
+ probabilities will be initialized by
+          `hmmlearn <https://hmmlearn.readthedocs.io/en/latest/>`__.
+
+ Parameters
+ ----------
+ self: BaseHMM
+
+ probs:
+ Probabilities or probability type to assign as initial state
+ probabilities.
+
+ - If an ``Array``, should be a vector of starting probabilities
+ for each state.
+ - If ``'uniform'``, there is an equal probability of starting in
+ any state.
+ - If ``'random'``, the vector of initial state probabilities is
+ sampled from a Dirichlet distribution with unit concentration
+ parameters.
+
+ Notes
+ -----
+ If used, this method should normally be called before :func:`fit`.
+ """
+        if isinstance(probs, enums.TransitionMode):
+            self._startprob = probs
+        else:
+            self._startprob = np.array(probs, dtype=np.float64)
+        self._skip_init_params |= set("s")
+
+ @pyd.validate_call(config=dict(arbitrary_types_allowed=True))
+ def set_state_transition_probs(
+ self: pyd.SkipValidation,
+ probs: (
+ FloatArray | enums.TransitionMode
+        ) = enums.TransitionMode.RANDOM,
+ /,
+ ) -> None:
+ """Set the transition probability matrix.
+
+ If this method is **not** called, transition probabilities are
+ initialized depending on the value of ``topology`` provided to
+ :func:`__init__`:
+
+ - If ``topology`` was set to ``'ergodic'``, ``'left-right'`` or
+ ``'linear'``, then random probabilities will be assigned according
+ to the topology by calling :func:`set_state_transition_probs` with
+          ``probs='random'``.
+        - If ``topology`` was set to ``None``, then transition
+          probabilities will be initialized by
+          `hmmlearn <https://hmmlearn.readthedocs.io/en/latest/>`__.
+
+ Parameters
+ ----------
+ self: BaseHMM
+
+ probs:
+ Probabilities or probability type to assign as state transition
+ probabilities.
+
+ - If an ``Array``, should be a matrix of probabilities where each
+          row must sum to one and represents the probabilities of
+ transitioning out of a state.
+ - If ``'uniform'``, for each state there is an equal probability
+ of transitioning to any state permitted by the topology.
+ - If ``'random'``, the vector of transition probabilities for
+ each row is sampled from a Dirichlet distribution with unit
+ concentration parameters, according to the shape of the
+ topology.
+
+ Notes
+ -----
+ If used, this method should normally be called before :func:`fit`.
+ """
+        if isinstance(probs, enums.TransitionMode):
+            self._transmat = probs
+        else:
+            self._transmat = np.array(probs, dtype=np.float64)
+        self._skip_init_params |= set("t")
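+        # Example (editor's sketch): for a 3-state left-right topology, a
+        # valid transition matrix would be
+        #   [[0.8, 0.2, 0.0],
+        #    [0.0, 0.7, 0.3],
+        #    [0.0, 0.0, 1.0]]
+        # where each row sums to one and no backward transitions occur.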
+
+ @abc.abstractmethod
+ def freeze(self: BaseHMM, params: str | None, /) -> None:
+ """Freeze the trainable parameters of the HMM,
+        preventing them from being updated during the Baum—Welch algorithm.
+ """
+ defaults = self._hmmlearn_kwargs_defaults()["params"]
+ self._skip_params |= set(self._modify_params(params or defaults))
+
+ @abc.abstractmethod
+ def unfreeze(self: BaseHMM, params: str | None, /) -> None:
+ """Unfreeze the trainable parameters of the HMM,
+ allowing them to be updated during the Baum—Welch algorithm.
+ """
+ defaults = self._hmmlearn_kwargs_defaults()["params"]
+ self._skip_params -= set(self._modify_params(params or defaults))
+
+ def _modify_params(self: BaseHMM, params: str) -> str:
+ """Validate parameters to be frozen/unfrozen."""
+ defaults = self._hmmlearn_kwargs_defaults()["params"]
+ msg = (
+ "Expected a string consisting of any combination of "
+            f"{defaults!r}"
+ )
+ if isinstance(params, str):
+ if bool(re.compile(rf"[^{defaults}]").search(params)):
+ raise ValueError(msg)
+ else:
+ raise TypeError(msg)
+ return params
+
+ def _check_init_params(self: BaseHMM) -> None:
+ """Validate hmmlearn init_params argument."""
+ topology = self.topology_ or _hmm.topologies.ErgodicTopology(
+ n_states=self.n_states,
+ random_state=check_random_state(self.random_state),
+ )
+
+ if "s" in self._skip_init_params:
+ if isinstance(self._startprob, enums.TransitionMode):
+ if self._startprob == enums.TransitionMode.UNIFORM:
+ self._startprob = topology.uniform_start_probs()
+ elif self._startprob == enums.TransitionMode.RANDOM:
+ self._startprob = topology.random_start_probs()
+ elif isinstance(self._startprob, np.ndarray):
+ self._startprob = topology.check_start_probs(
+ self._startprob,
+ )
+ elif self.topology_ is not None:
+ self.set_state_start_probs(topology.random_start_probs())
+
+ if "t" in self._skip_init_params:
+ if isinstance(self._transmat, enums.TransitionMode):
+ if self._transmat == enums.TransitionMode.UNIFORM:
+ self._transmat = topology.uniform_transition_probs()
+ elif self._transmat == enums.TransitionMode.RANDOM:
+ self._transmat = topology.random_transition_probs()
+ elif isinstance(self._transmat, np.ndarray):
+ self._transmat = topology.check_transition_probs(
+ self._transmat,
+ )
+ elif self.topology_ is not None:
+ self.set_state_transition_probs(
+ topology.random_transition_probs(),
+ )
+
+ @classmethod
+ def _check_hmmlearn_kwargs(
+ cls: type[BaseHMM], kwargs: dict[str, t.Any] | None
+ ) -> dict[str, t.Any]:
+ """Check hmmlearn forwarded key-word arguments."""
+ defaults: dict[str, t.Any] = cls._hmmlearn_kwargs_defaults()
+ kwargs: dict[str, t.Any] = kwargs or defaults
+ kwargs = copy.deepcopy(kwargs)
+
+ setter_methods = [
+ f"{func}()" for func in dir(cls) if func.startswith("set_state")
+ ]
+
+        for param in list(kwargs):
+ if param in cls._unsettable_hmmlearn_kwargs():
+ if param == "init_params":
+ init_params_defaults = defaults["init_params"]
+ if set(kwargs[param]) != set(init_params_defaults):
+ kwargs[param] = init_params_defaults
+ msg = (
+ "The `init_params` hmmlearn argument cannot be "
+ "overridden manually - defaulting to all "
+ f"parameters {init_params_defaults!r}. "
+ "Use the following method to initialize model "
+ f"parameters: {', '.join(setter_methods)}."
+ )
+ warnings.warn(msg, stacklevel=1)
+ elif param == "params":
+ params_defaults = defaults["params"]
+ if set(kwargs[param]) != set(params_defaults):
+ kwargs[param] = params_defaults
+ msg = (
+ "The `params` hmmlearn argument cannot be "
+ "overridden manually - defaulting to all "
+ f"parameters {params_defaults!r}. "
+ "Use the freeze() and unfreeze() methods to "
+ "specify the learnable model parameters."
+ )
+ warnings.warn(msg, stacklevel=1)
+ else:
+ del kwargs[param]
+ msg = (
+ f"The {param!r} hmmlearn argument cannot be "
+ f"overridden manually - use the {cls.__name__!r} "
+ "constructor to specify this argument."
+ )
+ warnings.warn(msg, stacklevel=1)
+
+        if "init_params" not in kwargs:
+            kwargs["init_params"] = defaults["init_params"]
+
+        if "params" not in kwargs:
+            kwargs["params"] = defaults["params"]
+
+ return kwargs
+
+ @abc.abstractmethod
+ def _init_hmm(self: BaseHMM, **kwargs: t.Any) -> hmmlearn.base.BaseHMM:
+ """Initialize the hmmlearn model."""
+ raise NotImplementedError
+
+    @staticmethod
+    @abc.abstractmethod
+ def _hmmlearn_kwargs_defaults() -> dict[str, t.Any]:
+ """Default values for hmmlearn key-word arguments."""
+ raise NotImplementedError
+
+ @staticmethod
+ def _unsettable_hmmlearn_kwargs() -> list[str]:
+ """Arguments that should not be provided in `hmmlearn_kwargs` in
+ :func:`__init__`.
+ """
+ return ["random_state", "init_params", "params"]
+
+ @staticmethod
+ def _hmmlearn_params() -> list[str]:
+ """Names of trainable hmmlearn parameters."""
+ return ["startprob", "transmat"]
diff --git a/sequentia/models/hmm/variants/categorical.py b/sequentia/models/hmm/variants/categorical.py
new file mode 100644
index 00000000..fdc9404b
--- /dev/null
+++ b/sequentia/models/hmm/variants/categorical.py
@@ -0,0 +1,211 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""A hidden Markov model with univariate categorical emissions."""
+
+from __future__ import annotations
+
+import typing as t
+
+import hmmlearn.hmm
+import numpy as np
+import pydantic as pyd
+
+from sequentia import enums
+from sequentia._internal import _validation
+from sequentia._internal._typing import FloatArray
+from sequentia.models.hmm.variants.base import BaseHMM
+
+__all__ = ["CategoricalHMM"]
+
+
+class CategoricalHMM(BaseHMM):
+ """A hidden Markov model with univariate categorical emissions.
+
+ Examples
+ --------
+ Using a :class:`.CategoricalHMM` to learn how to recognize DNA sequences
+ from the synthetase gene family.
+
+ See :func:`.load_gene_families` for more information on the sample
+ dataset used in this example.
+
+ ::
+
+ import numpy as np
+ from sequentia.datasets import load_gene_families
+ from sequentia.models.hmm import CategoricalHMM
+
+ # Seed for reproducible pseudo-randomness
+ random_state = np.random.RandomState(1)
+
+ # Fetch DNA sequences for the synthetase gene family (no. 4)
+ data, enc = load_gene_families(families=[4])
+ train_data, test_data = data.split(test_size=0.2, random_state=random_state)
+
+ # Create and train a CategoricalHMM to recognize the synthetase DNA sequences
+ model = CategoricalHMM(random_state=random_state)
+ model.fit(train_data.X, lengths=train_data.lengths)
+
+ # Calculate the log-likelihood of the first test sample being generated by this model
+ x, y = test_data[0]
+ model.score(x)
+ """ # noqa: E501
+
+ _DTYPE: type = np.int8
+ _UNIVARIATE: bool = True
+
+ @pyd.validate_call(config=dict(arbitrary_types_allowed=True))
+ def __init__(
+ self: pyd.SkipValidation,
+ *,
+ n_states: pyd.PositiveInt = 5,
+ topology: enums.TopologyMode | None = enums.TopologyMode.LEFT_RIGHT,
+ random_state: pyd.NonNegativeInt | np.random.RandomState | None = None,
+ hmmlearn_kwargs: dict[str, t.Any] | None = None,
+ ) -> pyd.SkipValidation:
+ """Initializes the :class:`.CategoricalHMM`.
+
+ Parameters
+ ----------
+ self: CategoricalHMM
+
+ n_states:
+ Number of states in the Markov chain.
+
+ topology:
+ Transition topology of the Markov chain — see :ref:`topologies`.
+
+ If ``None``, behaves the same as ``'ergodic'`` but with
+            `hmmlearn <https://hmmlearn.readthedocs.io/en/latest/>`__
+ initialization.
+
+ random_state:
+ Seed or :class:`numpy:numpy.random.RandomState` object for
+ reproducible pseudo-randomness.
+
+ hmmlearn_kwargs:
+ Additional key-word arguments provided to the
+            `hmmlearn <https://hmmlearn.readthedocs.io/en/latest/>`__ HMM
+ constructor.
+
+ Returns
+ -------
+ CategoricalHMM
+ """
+ super().__init__(
+ n_states=n_states,
+ topology=topology,
+ random_state=random_state,
+ hmmlearn_kwargs=hmmlearn_kwargs,
+ )
+
+ @property
+ @_validation.requires_fit
+ def n_params(self: CategoricalHMM) -> int:
+ """Number of trainable parameters — requires :func:`fit`."""
+ n_params = super().n_params
+ if "e" not in self._skip_params:
+ n_params += self.model.emissionprob_.size
+ return n_params
+
+ def set_state_emission_probs(
+ self: CategoricalHMM,
+ probs: FloatArray,
+ /,
+ ) -> None:
+ """Set the state emission distribution of the HMM's emission model.
+
+ If this method is **not** called, emission probabilities will be
+ initialized by
+        `hmmlearn <https://hmmlearn.readthedocs.io/en/latest/>`__.
+
+ Parameters
+ ----------
+ self: CategoricalHMM
+
+ probs:
+ Array of emission probabilities.
+
+ Notes
+ -----
+ If used, this method should normally be called before :func:`fit`.
+ """
+ self._emissionprob = np.array(probs, dtype=np.float64)
+ self._skip_init_params |= set("e")
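+        # Example (editor's sketch): for a 2-state model over a 3-symbol
+        # alphabet, probs could be [[0.6, 0.3, 0.1], [0.2, 0.3, 0.5]],
+        # one row of emission probabilities per state, each summing to one.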
+
+ def freeze(self: CategoricalHMM, params: str | None = None, /) -> None:
+ """Freeze the trainable parameters of the HMM,
+ preventing them from being updated during the Baum—Welch algorithm.
+
+ Parameters
+ ----------
+ self: CategoricalHMM
+
+ params:
+ A string specifying which parameters to freeze.
+ Can contain a combination of:
+
+ - ``'s'`` for initial state probabilities,
+ - ``'t'`` for transition probabilities,
+            - ``'e'`` for emission probabilities.
+
+ Notes
+ -----
+ If used, this method should normally be called before :func:`fit`.
+
+ See Also
+ --------
+ unfreeze:
+ Unfreeze the trainable parameters of the HMM,
+ allowing them to be updated during the Baum—Welch algorithm.
+ """
+ super().freeze(params)
+
+ def unfreeze(self: CategoricalHMM, params: str | None = None, /) -> None:
+ """Unfreeze the trainable parameters of the HMM,
+ allowing them to be updated during the Baum—Welch algorithm.
+
+ Parameters
+ ----------
+ self: CategoricalHMM
+
+ params:
+ A string specifying which parameters to unfreeze.
+ Can contain a combination of:
+
+ - ``'s'`` for initial state probabilities,
+ - ``'t'`` for transition probabilities,
+            - ``'e'`` for emission probabilities.
+
+ See Also
+ --------
+ freeze:
+ Freeze the trainable parameters of the HMM,
+ preventing them from being updated during the Baum—Welch
+ algorithm.
+ """
+ super().unfreeze(params)
+
+ def _init_hmm(
+ self: CategoricalHMM,
+ **kwargs: t.Any,
+ ) -> hmmlearn.hmm.CategoricalHMM:
+ """Initialize the hmmlearn model."""
+ return hmmlearn.hmm.CategoricalHMM(
+ n_components=self.n_states,
+ random_state=self.random_state_,
+ **kwargs,
+ )
+
+ @staticmethod
+ def _hmmlearn_kwargs_defaults() -> dict[str, t.Any]:
+ """Default values for hmmlearn key-word arguments."""
+ return {"init_params": "ste", "params": "ste"}
+
+ @staticmethod
+ def _hmmlearn_params() -> list[str]:
+ """Names of trainable hmmlearn parameters."""
+ return [*BaseHMM._hmmlearn_params(), "emissionprob"] # noqa: SLF001
diff --git a/sequentia/models/hmm/variants/gaussian_mixture.py b/sequentia/models/hmm/variants/gaussian_mixture.py
new file mode 100644
index 00000000..87bfcf86
--- /dev/null
+++ b/sequentia/models/hmm/variants/gaussian_mixture.py
@@ -0,0 +1,310 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""A hidden Markov model with multivariate Gaussian mixture emissions."""
+
+from __future__ import annotations
+
+import typing as t
+
+import hmmlearn.hmm
+import numpy as np
+import pydantic as pyd
+
+from sequentia import enums
+from sequentia._internal import _validation
+from sequentia._internal._typing import FloatArray
+from sequentia.models.hmm.variants.base import BaseHMM
+
+__all__ = ["GaussianMixtureHMM"]
+
+
+class GaussianMixtureHMM(BaseHMM):
+ """A hidden Markov model with multivariate Gaussian mixture emissions.
+
+ Examples
+ --------
+ Using a :class:`.GaussianMixtureHMM` to learn how to recognize spoken
+ samples of the digit 3.
+
+ See :func:`.load_digits` for more information on the sample dataset
+ used in this example.
+
+ ::
+
+ import numpy as np
+ from sequentia.datasets import load_digits
+ from sequentia.models.hmm import GaussianMixtureHMM
+
+ # Seed for reproducible pseudo-randomness
+ random_state = np.random.RandomState(1)
+
+ # Fetch MFCCs of spoken samples for the digit 3
+ data = load_digits(digits=[3])
+ train_data, test_data = data.split(test_size=0.2, random_state=random_state)
+
+ # Create and train a GaussianMixtureHMM to recognize the digit 3
+ model = GaussianMixtureHMM(random_state=random_state)
+        model.fit(train_data.X, lengths=train_data.lengths)
+
+ # Calculate the log-likelihood of the first test sample being generated by this model
+ x, y = test_data[0]
+ model.score(x)
+ """ # noqa: E501
+
+ _DTYPE: type = np.float64
+ _UNIVARIATE: bool = False
+
+ @pyd.validate_call(config=dict(arbitrary_types_allowed=True))
+ def __init__(
+ self: pyd.SkipValidation,
+ *,
+ n_states: pyd.PositiveInt = 5,
+ n_components: pyd.PositiveInt = 3,
+ covariance: enums.CovarianceMode = enums.CovarianceMode.SPHERICAL,
+ topology: enums.TopologyMode | None = enums.TopologyMode.LEFT_RIGHT,
+ random_state: pyd.NonNegativeInt | np.random.RandomState | None = None,
+ hmmlearn_kwargs: dict[str, t.Any] | None = None,
+ ) -> pyd.SkipValidation:
+ """Initializes the :class:`.GaussianMixtureHMM`.
+
+ Parameters
+ ----------
+ self: GaussianMixtureHMM
+
+ n_states:
+ Number of states in the Markov chain.
+
+ n_components:
+ Number of Gaussian components in the mixture emission
+ distribution for each state.
+
+ covariance:
+ Type of covariance matrix in the mixture emission distribution
+ for each state - see :ref:`covariance_types`.
+
+ topology:
+ Transition topology of the Markov chain — see :ref:`topologies`.
+
+ If ``None``, behaves the same as ``'ergodic'`` but with
+            `hmmlearn <https://hmmlearn.readthedocs.io/en/latest/>`__
+ initialization.
+
+ random_state:
+ Seed or :class:`numpy:numpy.random.RandomState` object for
+ reproducible pseudo-randomness.
+
+ hmmlearn_kwargs:
+ Additional key-word arguments provided to the
+            `hmmlearn <https://hmmlearn.readthedocs.io/en/latest/>`__ HMM
+ constructor.
+
+ Returns
+ -------
+ GaussianMixtureHMM
+ """
+ super().__init__(
+ n_states=n_states,
+ topology=topology,
+ random_state=random_state,
+ hmmlearn_kwargs=hmmlearn_kwargs,
+ )
+ self.n_components: int = n_components
+ """Number of Gaussian components in the emission model mixture
+ distribution for each state."""
+
+ self.covariance: enums.CovarianceMode = covariance
+ """Type of covariance matrix in the emission model mixture
+ distribution for each state."""
+
+ @property
+ @_validation.requires_fit
+ def n_params(self: GaussianMixtureHMM) -> int:
+ """Number of trainable parameters — requires :func:`fit`."""
+        n_params = super().n_params
+ if "m" not in self._skip_params:
+ n_params += self.model.means_.size
+ if "c" not in self._skip_params:
+ n_params += self.model.covars_.size
+ if "w" not in self._skip_params:
+ n_params += self.model.weights_.size
+ return n_params
+
+ def set_state_means(
+ self: GaussianMixtureHMM,
+ means: FloatArray,
+ /,
+ ) -> None:
+ """Set the mean vectors of the state emission distributions.
+
+ If this method is **not** called, mean vectors will be
+ initialized by
+        `hmmlearn <https://hmmlearn.readthedocs.io/en/latest/>`__.
+
+ Parameters
+ ----------
+ self: GaussianMixtureHMM
+
+ means:
+ Array of mean values.
+
+ Notes
+ -----
+ If used, this method should normally be called before :func:`fit`.
+ """
+ self._means = np.array(means, dtype=np.float64)
+ self._skip_init_params |= set("m")
+
+ def set_state_covars(
+ self: GaussianMixtureHMM,
+ covars: FloatArray,
+ /,
+ ) -> None:
+ """Set the covariance matrices of the state emission distributions.
+
+ If this method is **not** called, covariance matrices will be
+ initialized by
+        `hmmlearn <https://hmmlearn.readthedocs.io/en/latest/>`__.
+
+ Parameters
+ ----------
+ self: GaussianMixtureHMM
+
+ covars:
+ Array of covariance values.
+
+ Notes
+ -----
+ If used, this method should normally be called before :func:`fit`.
+ """
+ self._covars = np.array(covars, dtype=np.float64)
+ self._skip_init_params |= set("c")
+
+ def set_state_weights(
+ self: GaussianMixtureHMM,
+ weights: FloatArray,
+ /,
+ ) -> None:
+ """Set the component mixture weights of the state emission
+ distributions.
+
+ If this method is **not** called, component mixture weights will be
+ initialized by
+        `hmmlearn <https://hmmlearn.readthedocs.io/en/latest/>`__.
+
+ Parameters
+ ----------
+ self: GaussianMixtureHMM
+
+ weights:
+ Array of component mixture weights.
+
+ Notes
+ -----
+ If used, this method should normally be called before :func:`fit`.
+ """
+ self._weights = np.array(weights, dtype=np.float64)
+ self._skip_init_params |= set("w")
+
+ def freeze(
+ self: GaussianMixtureHMM,
+ params: str | None = None,
+ /,
+ ) -> None:
+ """Freeze the trainable parameters of the HMM,
+        preventing them from being updated during the Baum—Welch algorithm.
+
+ Parameters
+ ----------
+ self: GaussianMixtureHMM
+
+ params:
+ A string specifying which parameters to freeze. Can contain a
+ combination of:
+
+ - ``'s'`` for initial state probabilities,
+ - ``'t'`` for transition probabilities,
+ - ``'m'`` for emission distribution means,
+ - ``'c'`` for emission distribution covariances,
+ - ``'w'`` for emission distribution mixture weights.
+
+ See Also
+ --------
+ unfreeze:
+ Unfreeze the trainable parameters of the HMM,
+ allowing them to be updated during the Baum—Welch algorithm.
+ """
+ super().freeze(params)
+
+ def unfreeze(
+ self: GaussianMixtureHMM,
+ params: str | None = None,
+ /,
+ ) -> None:
+ """Unfreeze the trainable parameters of the HMM,
+ allowing them to be updated during the Baum—Welch algorithm.
+
+ Parameters
+ ----------
+ self: GaussianMixtureHMM
+
+ params:
+ A string specifying which parameters to unfreeze. Can contain
+ a combination of:
+
+ - ``'s'`` for initial state probabilities,
+ - ``'t'`` for transition probabilities,
+ - ``'m'`` for emission distribution means,
+ - ``'c'`` for emission distribution covariances,
+ - ``'w'`` for emission distribution mixture weights.
+
+ See Also
+ --------
+ freeze:
+ Freeze the trainable parameters of the HMM,
+            preventing them from being updated during the Baum—Welch
+            algorithm.
+ """
+ super().unfreeze(params)
+
+ def _init_hmm(
+ self: GaussianMixtureHMM,
+ **kwargs: t.Any,
+ ) -> hmmlearn.hmm.GMMHMM:
+ """Initialize the hmmlearn model."""
+ return hmmlearn.hmm.GMMHMM(
+ n_components=self.n_states,
+ n_mix=self.n_components,
+ covariance_type=self.covariance.value,
+ random_state=self.random_state_,
+ **kwargs,
+ )
+
+ @staticmethod
+ def _hmmlearn_kwargs_defaults() -> dict[str, t.Any]:
+ """Default values for hmmlearn key-word arguments."""
+ return {"init_params": "stmcw", "params": "stmcw"}
+
+ @staticmethod
+ def _unsettable_hmmlearn_kwargs() -> list[str]:
+ """Arguments that should not be provided in `hmmlearn_kwargs` in
+ :func:`__init__`.
+ """
+ return [
+ *BaseHMM._unsettable_hmmlearn_kwargs(), # noqa: SLF001
+ "n_components",
+ "n_mix",
+ "covariance_type",
+ ]
+
+ @staticmethod
+ def _hmmlearn_params() -> list[str]:
+ """Names of trainable hmmlearn parameters."""
+ return [
+ *BaseHMM._hmmlearn_params(), # noqa: SLF001
+ "means",
+ "covars",
+ "weights",
+ ]
diff --git a/sequentia/models/knn/__init__.py b/sequentia/models/knn/__init__.py
new file mode 100644
index 00000000..8e882748
--- /dev/null
+++ b/sequentia/models/knn/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""K-nearest neighbor and dynamic time warping based machine learning
+algorithms.
+"""
+
+from sequentia.models.knn.classifier import KNNClassifier
+from sequentia.models.knn.regressor import KNNRegressor
+
+__all__ = ["KNNClassifier", "KNNRegressor"]
diff --git a/sequentia/models/knn/base.py b/sequentia/models/knn/base.py
new file mode 100644
index 00000000..f60687ca
--- /dev/null
+++ b/sequentia/models/knn/base.py
@@ -0,0 +1,357 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Generic mixin class for k-nearest neighbor based models."""
+
+from __future__ import annotations
+
+import marshal
+import pathlib
+import types
+import typing as t
+
+import dtaidistance.dtw
+import dtaidistance.dtw_ndim
+import joblib
+import numpy as np
+
+from sequentia._internal import _data, _multiprocessing, _validation
+from sequentia._internal._typing import Array, FloatArray, IntArray
+
+__all__ = ["KNNMixin"]
+
+
+class KNNMixin:
+    """Generic mixin class for k-nearest neighbor based models."""
+
+ _DTYPE: type = np.float64
+
+ @_validation.requires_fit
+ def query_neighbors(
+ self: KNNMixin,
+ X: FloatArray,
+ *,
+ lengths: IntArray | None = None,
+ sort: bool = True,
+ ) -> tuple[IntArray, FloatArray, Array]:
+ """Query the k-nearest training observation sequences to each
+ sequence in ``X``.
+
+ Parameters
+ ----------
+ self: KNNMixin
+
+ X:
+ Sequence(s).
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ sort:
+ Whether to sort the neighbors in order of nearest to furthest.
+
+ Returns
+ -------
+ tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]:
+ K-nearest neighbors for each sequence in ``X``.
+
+ - Indices of the k-nearest training sequences.
+ - DTW distances of the k-nearest training sequences.
+ - Corresponding outputs of the k-nearest training sequences.
+
+ Notes
+ -----
+ This method requires a trained model — see :func:`fit`.
+ """
+ distances = self.compute_distance_matrix(X, lengths=lengths)
+ if distances.shape[1] == 1:
+ # only one training sequence
+ # (return for all query sequences)
+ k_idxs = np.zeros_like(distances, dtype=int)
+ k_distances = distances
+ else:
+ if distances.shape[0] == 1:
+ # only one query sequence
+ # (use np.argsort instead of np.argpartition)
+ k_idxs = distances.argsort(axis=1)[:, : self.k]
+ else:
+ # multiple query/training sequences
+ # (use np.argpartition)
+ partition_by = range(self.k) if sort else self.k
+ k_idxs = np.argpartition(
+ distances,
+ partition_by,
+ axis=1,
+ )[:, : self.k]
+ k_distances = np.take_along_axis(distances, k_idxs, axis=1)
+ k_outputs = self.y_[k_idxs]
+ return k_idxs, k_distances, k_outputs
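+
+    # Illustrative query (editor's sketch): for a fitted estimator with
+    # k=3,
+    #   idxs, dists, outputs = est.query_neighbors(x, sort=True)
+    # returns, per query sequence, the indices, DTW distances and training
+    # outputs of its three nearest training sequences.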
+
+ @_validation.requires_fit
+ def compute_distance_matrix(
+ self: KNNMixin,
+ X: FloatArray,
+ *,
+ lengths: IntArray | None = None,
+ ) -> FloatArray:
+ """Calculate a matrix of DTW distances between the sequences in
+ ``X`` and the training sequences.
+
+ Parameters
+ ----------
+ self: KNNMixin
+
+ X:
+ Sequence(s).
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ Returns
+ -------
+ numpy.ndarray:
+ DTW distance matrix.
+
+ Notes
+ -----
+ This method requires a trained model — see :func:`fit`.
+ """
+ # validate input
+ X, lengths = _validation.check_X_lengths(
+ X,
+ lengths=lengths,
+ dtype=self._DTYPE,
+ )
+
+ # get number of jobs
+ n_jobs = _multiprocessing.effective_n_jobs(self.n_jobs, x=lengths)
+
+ # get DTW callable
+ dtw = self._dtw()
+
+ # prepare indices for multiprocessed DTW calculation
+ row_chunk_idxs = np.array_split(_data.get_idxs(lengths), n_jobs)
+ col_chunk_idxs = np.array_split(self.idxs_, n_jobs)
+
+ # multiprocessed DTW calculation
+ return np.vstack(
+ joblib.Parallel(n_jobs=n_jobs, max_nbytes=None)(
+ joblib.delayed(self._distance_matrix_row_chunk)(
+ row_idxs, col_chunk_idxs, X, n_jobs, dtw
+ )
+ for row_idxs in row_chunk_idxs
+ )
+ )
+
+ @_validation.requires_fit
+ def dtw(self: KNNMixin, A: FloatArray, B: FloatArray) -> float:
+ """Calculate the DTW distance between two observation sequences.
+
+ Parameters
+ ----------
+ self: KNNMixin
+
+ A:
+ The first sequence.
+
+ B:
+ The second sequence.
+
+ Returns
+ -------
+        float:
+ DTW distance.
+
+ Notes
+ -----
+ This method requires a trained model — see :func:`fit`.
+ """
+ A = _validation.check_X(A, dtype=self._DTYPE)
+ B = _validation.check_X(B, dtype=self._DTYPE)
+ return self._dtw()(A, B)
+
+ def _dtw1d(
+ self: KNNMixin,
+ a: FloatArray,
+ b: FloatArray,
+ *,
+ window: int,
+ ) -> float:
+ """Compute the DTW distance between two univariate sequences."""
+ return dtaidistance.dtw.distance(
+ a,
+ b,
+ use_c=self.use_c_,
+ window=window,
+ )
+
+ def _window(self: KNNMixin, A: FloatArray, B: FloatArray) -> int:
+ """Calculate the absolute DTW window size."""
+ return int(self.window * min(len(A), len(B)))
+
+ def _dtwi(self: KNNMixin, A: FloatArray, B: FloatArray) -> float:
+ """Compute the multivariate DTW distance as the sum of the pairwise
+ per-feature DTW distances, allowing each feature to be warped
+ independently.
+ """
+ window = self._window(A, B)
+
+ def dtw(a: FloatArray, b: FloatArray) -> float:
+ """Windowed DTW wrapper function."""
+            return self._dtw1d(a, b, window=window)
+
+ return np.sum([dtw(A[:, i], B[:, i]) for i in range(A.shape[1])])
+
+ def _dtwd(self: KNNMixin, A: FloatArray, B: FloatArray) -> float:
+ """Compute the multivariate DTW distance so that the warping of the
+ features depends on each other, by modifying the local distance
+ measure.
+ """
+ window = self._window(A, B)
+ return dtaidistance.dtw_ndim.distance(
+ A,
+ B,
+ use_c=self.use_c_,
+ window=window,
+ )
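+
+    # Editor's note: _dtwd computes a single warping path under a
+    # multivariate local distance ("dependent" DTW), whereas _dtwi above
+    # sums per-feature univariate DTW distances ("independent" DTW).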
+
+    def _dtw(self: KNNMixin) -> t.Callable[[FloatArray, FloatArray], float]:
+ """Conditional DTW callable."""
+ return self._dtwi if self.independent else self._dtwd
+
+ def _weighting(self: KNNMixin) -> t.Callable[[FloatArray], FloatArray]:
+ """Weighting function - use equal weighting if not provided."""
+ if callable(self.weighting):
+ return self.weighting
+ return np.ones_like
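+
+    # Example (editor's sketch): a common distance weighting is an
+    # exponential kernel, e.g. weighting=lambda d: np.exp(-0.1 * d), which
+    # smoothly favors nearer neighbors over more distant ones.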
+
+ def _distance_matrix_row_chunk(
+ self: KNNMixin,
+ row_idxs: IntArray,
+ col_chunk_idxs: list[IntArray],
+ X: FloatArray,
+ n_jobs: int,
+        dtw: t.Callable[[FloatArray, FloatArray], float],
+ ) -> FloatArray:
+ """Calculate a distance sub-matrix for a subset of rows over all
+ columns.
+ """
+ return np.hstack(
+ joblib.Parallel(n_jobs=n_jobs, max_nbytes=None)(
+ joblib.delayed(self._distance_matrix_row_col_chunk)(
+ col_idxs, row_idxs, X, dtw
+ )
+ for col_idxs in col_chunk_idxs
+ )
+ )
+
+ def _distance_matrix_row_col_chunk(
+ self: KNNMixin,
+ col_idxs: IntArray,
+ row_idxs: IntArray,
+ X: FloatArray,
+        dtw: t.Callable[[FloatArray, FloatArray], float],
+ ) -> FloatArray:
+ """Calculate a distance sub-matrix for a subset of rows and
+ columns.
+ """
+ distances = np.zeros((len(row_idxs), len(col_idxs)))
+ for i, x_row in enumerate(_data.iter_X(X, idxs=row_idxs)):
+ for j, x_col in enumerate(_data.iter_X(self.X_, idxs=col_idxs)):
+ distances[i, j] = dtw(x_row, x_col)
+ return distances
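+
+    # Editor's note: the full distance matrix is assembled as a vstack of
+    # row chunks, each built as an hstack of row-by-column chunks, so the
+    # pairwise DTW computations are parallelized across both axes.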
+
+ @_validation.requires_fit
+ def save(
+ self: KNNMixin,
+ path: str | pathlib.Path | t.IO,
+ /,
+ ) -> None:
+ """Serialize and save a fitted KNN estimator.
+
+ Parameters
+ ----------
+ self: KNNMixin
+
+ path:
+ Location to save the serialized estimator.
+
+ Notes
+ -----
+ This method requires a trained model — see :func:`fit`.
+
+ See Also
+ --------
+ load:
+ Load and deserialize a fitted KNN estimator.
+ """
+ # fetch main parameters and fitted values
+ dict_ = self.__dict__.items()
+ state = {
+ "params": self.get_params(),
+ "fitted": {k: v for k, v in dict_ if k.endswith("_")},
+ }
+
+ # serialize weighting function
+ if self.weighting is None:
+ state["params"]["weighting"] = self.weighting
+ else:
+ state["params"]["weighting"] = marshal.dumps(
+ (self.weighting.__code__, self.weighting.__name__)
+ )
+
+ # serialize model
+ joblib.dump(state, path)
+
+ @classmethod
+ def load(
+ cls: type[KNNMixin],
+ path: str | pathlib.Path | t.IO,
+ /,
+ ) -> KNNMixin:
+ """Load and deserialize a fitted KNN estimator.
+
+ Parameters
+ ----------
+ cls: type[KNNMixin]
+
+ path:
+ Location to load the serialized estimator from.
+
+ Returns
+ -------
+ KNNMixin:
+ Fitted KNN estimator.
+
+ See Also
+ --------
+ save:
+ Serialize and save a fitted KNN estimator.
+ """
+ state = joblib.load(path)
+
+ # deserialize weighting function
+ if state["params"]["weighting"] is not None:
+ weighting_ = state["params"]["weighting"]
+ weighting, name = marshal.loads(weighting_) # noqa: S302
+ state["params"]["weighting"] = types.FunctionType(
+ weighting, globals(), name
+ )
+
+ # set main parameters
+ model = cls(**state["params"])
+
+ # set fitted values
+ for k, v in state["fitted"].items():
+ setattr(model, k, v)
+
+ # return deserialized model
+ return model
diff --git a/sequentia/models/knn/classifier.py b/sequentia/models/knn/classifier.py
new file mode 100644
index 00000000..4d6a4163
--- /dev/null
+++ b/sequentia/models/knn/classifier.py
@@ -0,0 +1,434 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""A k-nearest neighbor classifier that uses DTW as a distance measure for
+sequence comparison.
+"""
+
+from __future__ import annotations
+
+import typing as t
+
+import joblib
+import numba
+import numpy as np
+import pydantic as pyd
+
+from sequentia._internal import _data, _multiprocessing, _validation
+from sequentia._internal._typing import Array, FloatArray, IntArray
+from sequentia.models.base import ClassifierMixin
+from sequentia.models.knn.base import KNNMixin
+
+__all__ = ["KNNClassifier"]
+
+
+class KNNClassifier(KNNMixin, ClassifierMixin):
+ """A k-nearest neighbor classifier that uses DTW as a distance measure for
+ sequence comparison.
+
+ The classifier computes the score for each class as the total of the
+ distance weightings of every sequence belonging to that class,
+ within the DTW k-neighborhood of the sequence being classified.
+
+ Examples
+ --------
+ Using a :class:`.KNNClassifier` to classify spoken digits. ::
+
+ import numpy as np
+ from sequentia.datasets import load_digits
+ from sequentia.models.knn import KNNClassifier
+
+ # Seed for reproducible pseudo-randomness
+ random_state = np.random.RandomState(1)
+
+ # Fetch MFCCs of spoken digits
+ data = load_digits()
+ train_data, test_data = data.split(test_size=0.2, random_state=random_state)
+
+        # Create a KNNClassifier
+ clf = KNNClassifier()
+
+ # Fit the classifier
+ clf.fit(train_data.X, train_data.y, lengths=train_data.lengths)
+
+ # Predict classes for the test observation sequences
+ y_pred = clf.predict(test_data.X, lengths=test_data.lengths)
+ """ # noqa: E501
+
+ @pyd.validate_call(config=dict(arbitrary_types_allowed=True))
+ def __init__(
+ self: pyd.SkipValidation,
+ *,
+ k: pyd.PositiveInt = 1,
+ weighting: t.Callable[[FloatArray], FloatArray] | None = None,
+ window: pyd.confloat(ge=0.0, le=1.0) = 1.0,
+ independent: bool = False,
+ use_c: bool = False,
+ n_jobs: pyd.PositiveInt | pyd.NegativeInt = 1,
+ random_state: pyd.NonNegativeInt | np.random.RandomState | None = None,
+ classes: list[int] | None = None,
+ ) -> pyd.SkipValidation:
+ """Initializes the :class:`.KNNClassifier`.
+
+ Parameters
+ ----------
+ self: KNNClassifier
+
+ k:
+ Number of neighbors.
+
+ weighting:
+ A callable that specifies how distance weighting should be
+ performed.
+
+ The callable should accept a :class:`numpy:numpy.ndarray` of DTW
+ distances, apply an element-wise weighting transformation to the
+ matrix of DTW distances, then return an equally-sized
+ :class:`numpy:numpy.ndarray` of weightings.
+
+ If ``None``, then a uniform weighting of 1 will be applied to all
+ distances.
+
+ window:
+            The size of the Sakoe—Chiba band global constraint as a fraction
+ of the length of the shortest of the two sequences being compared.
+
+ - A larger window will give more freedom to the DTW alignment,
+ allowing more deviation but leading to potentially slower
+ computation.
+ A window of 1 is equivalent to full DTW computation with no
+ global constraint applied.
+ - A smaller window will restrict the DTW alignment, and possibly
+ speed up the DTW computation.
+ A window of 0 is equivalent to Euclidean distance.
+
+ independent:
+ Whether or not to allow features to be warped independently from
+ each other. See [#dtw_multi]_ for an overview of independent and
+ dependent dynamic time warping.
+
+ use_c:
+ Whether or not to use fast pure C compiled functions from
+            `dtaidistance <https://github.com/wannesm/dtaidistance>`__ to
+ perform the DTW computations.
+
+ n_jobs:
+ Maximum number of concurrently running workers.
+
+ - If 1, no parallelism is used at all (useful for debugging).
+ - If -1, all CPUs are used.
+ - If < -1, ``(n_cpus + 1 + n_jobs)`` are used — e.g. ``n_jobs=-2``
+ uses all but one.
+
+ random_state:
+ Seed or :class:`numpy:numpy.random.RandomState` object for
+ reproducible pseudo-randomness.
+
+ classes:
+ Set of possible class labels.
+
+ - If not provided, these will be determined from the training data
+ labels.
+ - If provided, output from methods such as :func:`predict_proba`
+ and :func:`predict_scores` will follow the ordering of the class
+ labels provided here.
+
+ Returns
+ -------
+ KNNClassifier
+ """
+ self.k: int = k
+ """Number of neighbors."""
+
+ self.weighting: t.Callable[[np.ndarray], np.ndarray] | None = (
+            weighting
+ )
+ """A callable that specifies how distance weighting should be
+ performed."""
+
+ self.window: float = window
+        """The size of the Sakoe—Chiba band global constraint as a fraction of
+ the length of the shortest of the two sequences being compared."""
+
+ self.independent: bool = independent
+ """Whether or not to allow features to be warped independently from
+ each other."""
+
+ self.use_c: bool = use_c
+ """Whether or not to use fast pure C compiled functions from
+        `dtaidistance <https://github.com/wannesm/dtaidistance>`__ to
+ perform the DTW computations."""
+
+ self.n_jobs: int = n_jobs
+ """Maximum number of concurrently running workers."""
+
+ self.random_state: int | np.random.RandomState | None = random_state
+ """Seed or :class:`numpy:numpy.random.RandomState` object for
+ reproducible pseudo-randomness."""
+
+ self.classes: list[int] | None = classes
+ """Set of possible class labels."""
+
+ # Allow metadata routing for lengths
+ self.set_fit_request(lengths=True)
+ self.set_predict_request(lengths=True)
+ self.set_predict_log_proba_request(lengths=True)
+ self.set_predict_proba_request(lengths=True)
+ self.set_score_request(
+ lengths=True,
+ normalize=True,
+ sample_weight=True,
+ )
+
+ def fit(
+ self: KNNClassifier,
+ X: FloatArray,
+ y: IntArray,
+ *,
+ lengths: IntArray | None = None,
+ ) -> KNNClassifier:
+ """Fit the classifier to the sequence(s) in ``X``.
+
+ Parameters
+ ----------
+ self: KNNClassifier
+
+ X:
+ Sequence(s).
+
+ y:
+ Classes corresponding to sequence(s) in ``X``.
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ Returns
+ -------
+ KNNClassifier:
+ The fitted classifier.
+ """
+ self.X_, self.lengths_ = _validation.check_X_lengths(
+ X, lengths=lengths, dtype=self._DTYPE
+ )
+ self.y_ = _validation.check_y(
+ y,
+ lengths=self.lengths_,
+ dtype=np.int8,
+ )
+ self.idxs_ = _data.get_idxs(self.lengths_)
+ self.use_c_ = _validation.check_use_c(self.use_c)
+ self.random_state_ = _validation.check_random_state(self.random_state)
+ self.classes_ = _validation.check_classes(
+ self.y_,
+ classes=self.classes,
+ )
+ _validation.check_weighting(self.weighting)
+ return self
+
+ @_validation.requires_fit
+ def predict(
+ self: KNNClassifier,
+ X: FloatArray,
+ *,
+ lengths: IntArray | None = None,
+ ) -> IntArray:
+ """Predict classes for the sequence(s) in ``X``.
+
+ Parameters
+ ----------
+ self: KNNClassifier
+
+ X:
+ Sequence(s).
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ Returns
+ -------
+ numpy.ndarray:
+ Class predictions.
+
+ Notes
+ -----
+ This method requires a trained classifier — see :func:`fit`.
+ """
+ class_scores = self.predict_scores(X, lengths=lengths)
+ return self._find_max_labels(class_scores)
+
+ @_validation.requires_fit
+ def predict_log_proba(
+ self: KNNClassifier,
+ X: FloatArray,
+ *,
+ lengths: IntArray | None = None,
+ ) -> FloatArray:
+ """Predict log class probabilities for the sequence(s) in ``X``.
+
+ Probabilities are calculated as normalized class scores.
+
+ Parameters
+ ----------
+ self: KNNClassifier
+
+ X:
+ Sequence(s).
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ Returns
+ -------
+ numpy.ndarray:
+ Class membership log-probabilities.
+
+ Notes
+ -----
+ This method requires a trained classifier — see :func:`fit`.
+ """
+ return np.log(self.predict_scores(X, lengths=lengths))
+
+ @_validation.requires_fit
+ def predict_proba(
+ self: KNNClassifier,
+ X: FloatArray,
+ *,
+ lengths: IntArray | None = None,
+ ) -> FloatArray:
+ """Predict class probabilities for the sequence(s) in ``X``.
+
+ Probabilities are calculated as normalized class scores.
+
+ Parameters
+ ----------
+ self: KNNClassifier
+
+ X:
+ Sequence(s).
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ Returns
+ -------
+ numpy.ndarray:
+ Class membership probabilities.
+
+ Notes
+ -----
+ This method requires a trained classifier — see :func:`fit`.
+ """
+ class_scores = self.predict_scores(X, lengths=lengths)
+ return class_scores / class_scores.sum(axis=1, keepdims=True)
+
+ @_validation.requires_fit
+ def predict_scores(
+ self: KNNClassifier,
+ X: FloatArray,
+ *,
+ lengths: IntArray | None = None,
+ ) -> FloatArray:
+ """Predict class scores for the sequence(s) in ``X``.
+
+ Scores are calculated as the class distance weighting sums of all
+ training sequences in the k-neighborhood.
+
+ Parameters
+ ----------
+ self: KNNClassifier
+
+ X:
+ Sequence(s).
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ Returns
+ -------
+ numpy.ndarray:
+ Class scores.
+
+ Notes
+ -----
+ This method requires a trained classifier — see :func:`fit`.
+ """
+ _, k_distances, k_labels = self.query_neighbors(
+ X,
+ lengths=lengths,
+ sort=False,
+ )
+ k_weightings = self._weighting()(k_distances)
+ return self._compute_scores(k_labels, k_weightings)
+
+ def _compute_scores(
+ self: KNNClassifier, labels: IntArray, weightings: FloatArray
+ ) -> FloatArray:
+ """Calculate the sum of the weightings for each label group."""
+ scores = np.zeros((len(labels), len(self.classes_)))
+ for i, k in enumerate(self.classes_):
+ scores[:, i] = np.einsum("ij,ij->i", labels == k, weightings)
+ return scores
+
+ def _find_max_labels(
+ self: KNNClassifier,
+ scores: FloatArray,
+ /,
+ ) -> IntArray:
+ """Return the label of the k nearest neighbors with the highest score
+ for each example.
+ """
+ n_jobs = _multiprocessing.effective_n_jobs(self.n_jobs, x=scores)
+ score_chunks = np.array_split(scores, n_jobs)
+ return np.concatenate(
+ joblib.Parallel(n_jobs=n_jobs, max_nbytes=None)(
+ joblib.delayed(self._find_max_labels_chunk)(score_chunk)
+ for score_chunk in score_chunks
+ )
+ )
+
+ def _find_max_labels_chunk(
+ self: KNNClassifier, score_chunk: FloatArray, /
+ ) -> IntArray:
+ """Return the label with the highest score for each item in the
+ chunk.
+ """
+ max_labels = np.zeros(len(score_chunk), dtype=int)
+ for i, scores in enumerate(score_chunk):
+ max_score_idxs = self._multi_argmax(scores)
+ max_labels[i] = self.random_state_.choice(
+ self.classes_[max_score_idxs], size=1
+ ).item()
+ return max_labels
+
+ @staticmethod
+ @numba.njit
+ def _multi_argmax(arr: Array, /) -> IntArray:
+ """Same as numpy.argmax but returns all occurrences of the maximum
+ and only requires a single pass.
+
+ From: https://stackoverflow.com/a/58652335
+ """
+ all_, max_ = [0], arr[0]
+ for i in numba.prange(1, len(arr)):
+ if arr[i] > max_:
+ all_, max_ = [i], arr[i]
+ elif arr[i] == max_:
+ all_.append(i)
+ return np.array(all_)
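Before moving on, a small sketch may help tie these pieces together: probabilities are per-row normalizations of the class scores, and ties between top-scoring classes are broken at random via `_multi_argmax`. The snippet below is illustrative only — a two-pass NumPy analogue of the single-pass numba routine, not part of the library API.

import numpy as np

# Two-pass NumPy analogue of _multi_argmax (the numba version above
# collects all maxima in a single pass over the array).
def multi_argmax_np(arr: np.ndarray) -> np.ndarray:
    return np.flatnonzero(arr == arr.max())

scores = np.array([[3.0, 1.0, 3.0]])  # one example, three classes
proba = scores / scores.sum(axis=1, keepdims=True)
print(proba)                       # [[0.42857143 0.14285714 0.42857143]]
print(multi_argmax_np(scores[0]))  # [0 2] -- classes 0 and 2 are tied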
diff --git a/sequentia/models/knn/regressor.py b/sequentia/models/knn/regressor.py
new file mode 100644
index 00000000..88ed9baf
--- /dev/null
+++ b/sequentia/models/knn/regressor.py
@@ -0,0 +1,220 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""A k-nearest neighbor regressor that uses DTW as a distance measure for
+sequence comparison.
+"""
+
+from __future__ import annotations
+
+import typing as t
+
+import numpy as np
+import pydantic as pyd
+
+from sequentia._internal import _data, _validation
+from sequentia._internal._typing import FloatArray, IntArray
+from sequentia.models.base import RegressorMixin
+from sequentia.models.knn.base import KNNMixin
+
+__all__ = ["KNNRegressor"]
+
+
+class KNNRegressor(KNNMixin, RegressorMixin):
+ """A k-nearest neighbor regressor that uses DTW as a distance measure for
+ sequence comparison.
+
+ The regressor computes the output as a distance-weighted average of the
+ outputs of the sequences within the DTW k-neighborhood of the sequence
+ being predicted.
+ """
+
+ @pyd.validate_call(config=dict(arbitrary_types_allowed=True))
+ def __init__(
+ self: pyd.SkipValidation,
+ *,
+ k: pyd.PositiveInt = 1,
+ weighting: t.Callable[[FloatArray], FloatArray] | None = None,
+ window: pyd.confloat(ge=0.0, le=1.0) = 1.0,
+ independent: bool = False,
+ use_c: bool = False,
+ n_jobs: pyd.PositiveInt | pyd.NegativeInt = 1,
+ random_state: pyd.NonNegativeInt | np.random.RandomState | None = None,
+ ) -> pyd.SkipValidation:
+ """Initializes the :class:`.KNNRegressor`.
+
+ Parameters
+ ----------
+ self: KNNRegressor
+
+ k:
+ Number of neighbors.
+
+ weighting:
+ A callable that specifies how distance weighting should be
+ performed.
+
+ The callable should accept a :class:`numpy:numpy.ndarray` of DTW
+ distances, apply an element-wise weighting transformation to the
+ matrix of DTW distances, then return an equally-sized
+ :class:`numpy:numpy.ndarray` of weightings.
+
+ If ``None``, then a uniform weighting of 1 will be applied to all
+ distances.
+
+ window:
+ The size of the Sakoe-Chiba band global constraint as a
+ fraction of the length of the shortest of the two sequences being
+ compared.
+
+ - A larger window will give more freedom to the DTW alignment,
+ allowing more deviation but leading to potentially slower
+ computation.
+ A window of 1 is equivalent to full DTW computation with no
+ global constraint applied.
+ - A smaller window will restrict the DTW alignment, and possibly
+ speed up the DTW computation.
+ A window of 0 is equivalent to Euclidean distance.
+
+ independent:
+ Whether or not to allow features to be warped independently from
+ each other. See [#dtw_multi]_ for an overview of independent and
+ dependent dynamic time warping.
+
+ use_c:
+ Whether or not to use fast pure C compiled functions from
+ `dtaidistance <https://github.com/wannesm/dtaidistance>`__ to
+ perform the DTW computations.
+
+ n_jobs:
+ Maximum number of concurrently running workers.
+
+ - If 1, no parallelism is used at all (useful for debugging).
+ - If -1, all CPUs are used.
+ - If < -1, ``(n_cpus + 1 + n_jobs)`` are used — e.g. ``n_jobs=-2``
+ uses all but one.
+
+ random_state:
+ Seed or :class:`numpy:numpy.random.RandomState` object for
+ reproducible pseudo-randomness.
+
+ Returns
+ -------
+ KNNRegressor
+ """
+ self.k: int = k
+ """Number of neighbors."""
+
+ self.weighting: t.Callable[[np.ndarray], np.ndarray] | None = (
+ weighting
+ )
+ """A callable that specifies how distance weighting should be
+ performed."""
+
+ self.window: float = window
+ """The size of the Sakoe—Chiba band global constrant as a fraction of
+ the length of the shortest of the two sequences being compared."""
+
+ self.independent: bool = independent
+ """Whether or not to allow features to be warped independently from
+ each other."""
+
+ self.use_c: bool = use_c
+ """Set of possible class labels."""
+
+ self.n_jobs: int = n_jobs
+ """Maximum number of concurrently running workers."""
+
+ self.random_state = random_state
+ """Seed or :class:`numpy:numpy.random.RandomState` object for
+ reproducible pseudo-randomness."""
+
+ # Allow metadata routing for lengths
+ self.set_fit_request(lengths=True)
+ self.set_predict_request(lengths=True)
+ self.set_score_request(lengths=True, sample_weight=True)
+
+ def fit(
+ self: KNNRegressor,
+ X: FloatArray,
+ y: FloatArray,
+ *,
+ lengths: IntArray | None = None,
+ ) -> KNNRegressor:
+ """Fits the regressor to the sequence(s) in ``X``.
+
+ Parameters
+ ----------
+ self: KNNRegressor
+
+ X:
+ Sequence(s).
+
+ y:
+ Outputs corresponding to sequence(s) in ``X``.
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ Returns
+ -------
+ KNNRegressor:
+ The fitted regressor.
+ """
+ self.X_, self.lengths_ = _validation.check_X_lengths(
+ X, lengths=lengths, dtype=self._DTYPE
+ )
+ self.y_ = _validation.check_y(
+ y,
+ lengths=self.lengths_,
+ dtype=np.float64,
+ )
+ self.idxs_ = _data.get_idxs(self.lengths_)
+ self.use_c_ = _validation.check_use_c(self.use_c)
+ self.random_state_ = _validation.check_random_state(self.random_state)
+ _validation.check_weighting(self.weighting)
+ return self
+
+ @_validation.requires_fit
+ def predict(
+ self: KNNRegressor,
+ X: FloatArray,
+ *,
+ lengths: IntArray | None = None,
+ ) -> FloatArray:
+ """Predicts outputs for the sequence(s) in ``X``.
+
+ Parameters
+ ----------
+ self: KNNRegressor
+
+ X:
+ Sequence(s).
+
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
+
+ Returns
+ -------
+ numpy.ndarray:
+ Output predictions.
+
+ Notes
+ -----
+ This method requires a trained regressor — see :func:`fit`.
+ """
+ _, k_distances, k_outputs = self.query_neighbors(
+ X,
+ lengths=lengths,
+ sort=False,
+ )
+ k_weightings = self._weighting()(k_distances)
+ total_weights = k_weightings.sum(axis=1)
+ return (k_outputs * k_weightings).sum(axis=1) / total_weights
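A toy example of the prediction rule above (illustrative numbers, not the library API):

import numpy as np

# One query sequence with k=3 neighbors: the prediction is the
# weighting-normalized average of the neighbors' outputs.
k_outputs = np.array([[1.0, 2.0, 4.0]])
k_weightings = np.array([[0.5, 0.3, 0.2]])
pred = (k_outputs * k_weightings).sum(axis=1) / k_weightings.sum(axis=1)
print(pred)  # [1.9] == (0.5*1 + 0.3*2 + 0.2*4) / 1.0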
diff --git a/sequentia/preprocessing/__init__.py b/sequentia/preprocessing/__init__.py
new file mode 100644
index 00000000..236a880e
--- /dev/null
+++ b/sequentia/preprocessing/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Preprocessing utilities."""
+
+from sequentia.preprocessing.transforms import (
+ IndependentFunctionTransformer,
+ mean_filter,
+ median_filter,
+)
+
+__all__ = ["IndependentFunctionTransformer", "mean_filter", "median_filter"]
diff --git a/lib/sequentia/preprocessing/transforms.py b/sequentia/preprocessing/transforms.py
similarity index 51%
rename from lib/sequentia/preprocessing/transforms.py
rename to sequentia/preprocessing/transforms.py
index 1bacae55..d609d60a 100644
--- a/lib/sequentia/preprocessing/transforms.py
+++ b/sequentia/preprocessing/transforms.py
@@ -1,3 +1,8 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
"""
IndependentFunctionTransformer is an adapted version of FunctionTransformer
from the sklearn.preprocessing module, and largely relies on its source code.
@@ -41,21 +46,20 @@
from __future__ import annotations
import warnings
-from typing import Callable, Optional, Dict, Any
import numpy as np
-from pydantic import PositiveInt
+import scipy.signal
+import sklearn.base
+from sklearn.preprocessing import FunctionTransformer
from sklearn.utils.validation import _allclose_dense_sparse, check_array
-from scipy.signal import medfilt2d, convolve
-from sequentia.preprocessing.base import Transform
-from sequentia.utils.validation import _BaseSequenceValidator, Array
-from sequentia.utils.data import SequentialDataset
+from sequentia._internal import _data, _validation
+from sequentia._internal._typing import Array, FloatArray, IntArray
__all__ = ["IndependentFunctionTransformer", "mean_filter", "median_filter"]
-class IndependentFunctionTransformer(Transform):
+class IndependentFunctionTransformer(FunctionTransformer):
"""Constructs a transformer from an arbitrary callable,
applying the transform independently to each sequence.
@@ -94,178 +98,248 @@ class IndependentFunctionTransformer(Transform):
transform = IndependentFunctionTransformer(minmax_scale)
# Apply the transform to the data
- Xt = transform.transform(data.X, data.lengths)
+ Xt = transform.transform(data.X, lengths=data.lengths)
"""
-
def __init__(
self,
- func: Optional[Callable] = None,
- inverse_func: Optional[Callable] = None,
+ func=None,
+ inverse_func=None,
*,
- validate: bool = False,
- check_inverse: bool = True,
- kw_args =None,
+ validate=False,
+ accept_sparse=False,
+ check_inverse=True,
+ feature_names_out=None,
+ kw_args=None,
inv_kw_args=None,
- ) -> IndependentFunctionTransformer:
- """Initializes the :class:`.IndependentFunctionTransformer`.
-
- :param func: The callable to use for the transformation.
- This will be passed the same arguments as transform, with args and kwargs forwarded.
- If ``None``, then ``func`` will be the identity function.
-
- :param inverse_func: The callable to use for the inverse transformation.
- This will be passed the same arguments as inverse transform, with args and kwargs forwarded.
- If ``None``, then ``inverse_func`` will be the identity function.
-
- :param validate: Indicates whether the input ``X`` array should be checked before calling ``func``.
-
- - If ``False``, there is no input validation.
- - If ``True``, then ``X`` will be converted to a 2-dimensional NumPy array.
- If the conversion is not possible an exception is raised.
-
- :param check_inverse: Whether to check that or ``func`` followed by ``inverse_func`` leads to the original inputs.
- It can be used for a sanity check, raising a warning when the condition is not fulfilled.
-
- :param kw_args: Dictionary of additional keyword arguments to pass to ``func``.
-
- :param inv_kw_args: Dictionary of additional keyword arguments to pass to ``inverse_func``.
- """
+ ):
+ """See :class:`sklearn:sklearn.preprocessing.FunctionTransformer`."""
self.func = func
self.inverse_func = inverse_func
self.validate = validate
+ self.accept_sparse = accept_sparse
self.check_inverse = check_inverse
- self.kw_args: Optional[Dict[str, Any]] = kw_args
- self.inv_kw_args: Optional[Dict[str, Any]] = inv_kw_args
-
- def _check_input(self, X, lengths):
- data = _BaseSequenceValidator(X=X, lengths=lengths)
- return data.X, data.lengths
+ self.feature_names_out = feature_names_out
+ self.kw_args = kw_args
+ self.inv_kw_args = inv_kw_args
+ # Allow metadata routing for lengths
+ self.set_fit_request(lengths=True)
+ self.set_transform_request(lengths=True)
+ self.set_inverse_transform_request(lengths=True)
+
+ def _check_input(self, X, *, lengths, reset):
+ if self.validate:
+ X, lengths = _validation.check_X_lengths(
+ X, lengths=lengths, dtype=X.dtype
+ )
+ return (
+ self._validate_data(
+ X, accept_sparse=self.accept_sparse, reset=reset
+ ),
+ lengths,
+ )
+ return X, lengths
- def _check_inverse_transform(self, X, lengths):
+ def _check_inverse_transform(self, X, *, lengths):
"""Check that func and inverse_func are the inverse."""
idx_selected = slice(None, None, max(1, X.shape[0] // 100))
- X_round_trip = self.inverse_transform(self.transform(X[idx_selected], lengths), lengths)
-
- if not np.issubdtype(X.dtype, np.number):
+ X_round_trip = self.inverse_transform(
+ self.transform(X[idx_selected], lengths=lengths),
+ lengths=lengths,
+ )
+
+ if hasattr(X, "dtype"):
+ dtypes = [X.dtype]
+ elif hasattr(X, "dtypes"):
+ # Dataframes can have multiple dtypes
+ dtypes = X.dtypes
+
+ if not all(np.issubdtype(d, np.number) for d in dtypes):
raise ValueError(
- "'check_inverse' is only supported when all the elements in `X` are numerical."
+ "'check_inverse' is only supported when all the elements in `X` is"
+ " numerical."
)
if not _allclose_dense_sparse(X[idx_selected], X_round_trip):
warnings.warn(
- "The provided functions are not strictly"
- " inverse of each other. If you are sure you"
- " want to proceed regardless, set"
- " 'check_inverse=False'.",
+ (
+ "The provided functions are not strictly"
+ " inverse of each other. If you are sure you"
+ " want to proceed regardless, set"
+ " 'check_inverse=False'."
+ ),
UserWarning,
)
+ @sklearn.base._fit_context(prefer_skip_nested_validation=True)
def fit(
- self,
+ self: IndependentFunctionTransformer,
X: Array,
- lengths: Optional[Array] = None
+ y: Array | None = None,
+ *,
+ lengths: IntArray | None = None,
) -> IndependentFunctionTransformer:
"""Fits the transformer to ``X``.
- :param X: Univariate or multivariate observation sequence(s).
+ Parameters
+ ----------
+ self: IndependentFunctionTransformer
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
+ X:
+ Sequence(s).
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
+ y:
+ Outputs corresponding to sequence(s) in ``X``.
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
+
+ - If ``None``, then ``X`` is assumed to be a single sequence.
- ``len(X)`` should be equal to ``sum(lengths)``.
- :return: The fitted transformer.
+ Returns
+ -------
+ IndependentFunctionTransformer
+ The fitted transformer.
"""
- X, lengths = self._check_input(X, lengths)
- if self.check_inverse and not (self.func is None or self.inverse_func is None):
- self._check_inverse_transform(X, lengths)
+ X, lengths = self._check_input(X, lengths=lengths, reset=True)
+ if self.check_inverse and not (
+ self.func is None or self.inverse_func is None
+ ):
+ self._check_inverse_transform(X, lengths=lengths)
return self
def transform(
- self,
+ self: IndependentFunctionTransformer,
X: Array,
- lengths: Optional[Array] = None
+ *,
+ lengths: IntArray | None = None,
) -> Array:
- """Applies the transformation to ``X``, producing a transformed version of ``X``.
+ """Applies the transformation to ``X``,
+ producing a transformed version of ``X``.
- :param X: Univariate or multivariate observation sequence(s).
+ Parameters
+ ----------
+ self: IndependentFunctionTransformer
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
+ X:
+ Sequence(s).
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
+ - If ``None``, then ``X`` is assumed to be a single sequence.
- ``len(X)`` should be equal to ``sum(lengths)``.
- :return: The transformed array.
+ Returns
+ -------
+ numpy.ndarray:
+ The transformed array.
"""
- X, lengths = self._check_input(X, lengths)
- return self._transform(X, lengths, func=self.func, kw_args=self.kw_args)
+ X, lengths = self._check_input(X, lengths=lengths, reset=False)
+ return self._transform(
+ X, lengths=lengths, func=self.func, kw_args=self.kw_args
+ )
def inverse_transform(
- self,
+ self: IndependentFunctionTransformer,
X: Array,
- lengths: Optional[Array] = None
+ *,
+ lengths: IntArray | None = None,
) -> Array:
"""Applies the inverse transformation to ``X``.
- :param X: Univariate or multivariate observation sequence(s).
+ Parameters
+ ----------
+ self: IndependentFunctionTransformer
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- - Should be a concatenated sequence if multiple sequences are provided,
- with respective sequence lengths being provided in the ``lengths`` argument for decoding the original sequences.
+ X:
+ Sequence(s).
- :param lengths: Lengths of the observation sequence(s) provided in ``X``.
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
- - If ``None``, then ``X`` is assumed to be a single observation sequence.
+ - If ``None``, then ``X`` is assumed to be a single sequence.
- ``len(X)`` should be equal to ``sum(lengths)``.
- :return: The inverse transformed array.
+ Returns
+ -------
+ numpy.ndarray:
+ The inverse transformed array.
"""
- X, lengths = self._check_input(X, lengths)
if self.validate:
X = check_array(X, accept_sparse=False)
- return self._transform(X, lengths, func=self.inverse_func, kw_args=self.inv_kw_args)
+ X, lengths = _validation.check_X_lengths(
+ X, lengths=lengths, dtype=X.dtype
+ )
+ return self._transform(
+ X,
+ lengths=lengths,
+ func=self.inverse_func,
+ kw_args=self.inv_kw_args,
+ )
+
+ def fit_transform(
+ self: IndependentFunctionTransformer,
+ X: Array,
+ y: Array | None = None,
+ *,
+ lengths: IntArray | None = None,
+ ) -> Array:
+ """Fits the transformer to the sequence(s) in ``X`` and returns a
+ transformed version of ``X``.
- def _transform(self, X, lengths, func=None, kw_args=None):
- if func is None:
- return X
- apply = lambda x: func(x, **(kw_args if kw_args else {}))
- idxs = SequentialDataset._get_idxs(lengths)
- return np.vstack([apply(x) for x in SequentialDataset._iter_X(X, idxs)])
+ Parameters
+ ----------
+ self: IndependentFunctionTransformer
- def __sklearn_is_fitted__(self):
- """Return True since FunctionTransfomer is stateless."""
- return True
+ X:
+ Sequence(s).
- def _more_tags(self):
- return {"no_validation": not self.validate, "stateless": True}
+ y:
+ Outputs corresponding to sequence(s) in ``X``.
+ lengths:
+ Lengths of the sequence(s) provided in ``X``.
-def mean_filter(x: Array, k: PositiveInt = 5) -> Array:
- """Applies a mean filter of size ``k`` independently to each feature of the sequence,
- retaining the original input shape by using appropriate padding.
+ - If ``None``, then ``X`` is assumed to be a single sequence.
+ - ``len(X)`` should be equal to ``sum(lengths)``.
- This is implemented as a 1D convolution with a kernel of size ``k`` and values ``1 / k``.
+ Returns
+ -------
+ numpy.ndarray:
+ The transformed data.
+ """
+ return self.fit(X, lengths=lengths).transform(X, lengths=lengths)
- :param x: Univariate or multivariate observation sequence.
+ def _transform(self, X, *, lengths, func=None, kw_args=None):
+ if func is None:
+ return X
+ apply = lambda x: func(x, **(kw_args if kw_args else {}))
+ idxs = _data.get_idxs(lengths)
+ return np.vstack([apply(x) for x in _data.iter_X(X, idxs=idxs)])
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
- :param k: Width of the filter.
+def mean_filter(x: FloatArray, *, k: int = 5) -> FloatArray:
+ """Applies a mean filter of size ``k`` independently to each feature of
+ the sequence, retaining the original input shape by using appropriate
+ padding.
- :return: The filtered array.
+ This is implemented as a 1D convolution with a kernel of size ``k`` and
+ values ``1 / k``.
+
+ Parameters
+ ----------
+ x:
+ Observation sequence.
+
+ k:
+ Width of the filter.
+
+ Returns
+ -------
+ numpy.ndarray:
+ The filtered array.
Examples
--------
@@ -286,24 +360,28 @@ def mean_filter(x: Array, k: PositiveInt = 5) -> Array:
transform = IndependentFunctionTransformer(mean_filter, kw_args={"k": 7})
# Apply the transform to all sequences
- Xt = transform.transform(data.X, data.lengths)
+ Xt = transform.transform(data.X, lengths=data.lengths)
"""
- data = _BaseSequenceValidator(X=x)
- return convolve(data.X, np.ones((k, 1)) / k, mode="same")
-
+ return scipy.signal.convolve(x, np.ones((k, 1)) / k, mode="same")
-def median_filter(x: Array, k: PositiveInt = 5) -> Array:
- """Applies a median filter of size ``k`` independently to each feature of the sequence,
- retaining the original input shape by using appropriate padding.
- :param x: Univariate or multivariate observation sequence.
+def median_filter(x: FloatArray, *, k: int = 5) -> FloatArray:
+ """Applies a median filter of size ``k`` independently to each feature of
+ the sequence, retaining the original input shape by using appropriate
+ padding.
- - Should be a single 1D or 2D array.
- - Should have length as the 1st dimension and features as the 2nd dimension.
+ Parameters
+ ----------
+ x:
+ Observation sequence.
- :param k: Width of the filter.
+ k:
+ Width of the filter.
- :return: The filtered array.
+ Returns
+ -------
+ numpy.ndarray:
+ The filtered array.
Examples
--------
@@ -324,7 +402,6 @@ def median_filter(x: Array, k: PositiveInt = 5) -> Array:
transform = IndependentFunctionTransformer(median_filter, kw_args={"k": 7})
# Apply the transform to all sequences
- Xt = transform.transform(data.X, data.lengths)
+ Xt = transform.transform(data.X, lengths=data.lengths)
"""
- data = _BaseSequenceValidator(X=x)
- return medfilt2d(data.X, kernel_size=(k, 1))
+ return scipy.signal.medfilt2d(x, kernel_size=(k, 1))
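To make the per-sequence behaviour concrete, here is a minimal usage sketch (hedged: the call signatures are as introduced in this diff, and the toy data is made up):

import numpy as np

from sequentia.preprocessing import (
    IndependentFunctionTransformer,
    mean_filter,
)

# Two concatenated sequences (lengths 3 and 4) with a single feature.
X = np.arange(7, dtype=np.float64).reshape(-1, 1)
lengths = np.array([3, 4])

# The transform is applied per sequence, so smoothing from mean_filter
# never bleeds across the boundary between the two sequences.
transform = IndependentFunctionTransformer(mean_filter, kw_args={"k": 3})
Xt = transform.transform(X, lengths=lengths)
assert Xt.shape == X.shape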
diff --git a/sequentia/version.py b/sequentia/version.py
new file mode 100644
index 00000000..25370404
--- /dev/null
+++ b/sequentia/version.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Version information for Sequentia.
+
+Source code modified from pydantic (https://github.com/pydantic/pydantic).
+
+ The MIT License (MIT)
+
+ Copyright (c) 2017 to present Pydantic Services Inc. and individual
+ contributors.
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files (the "Software"),
+ to deal in the Software without restriction, including without limitation
+ the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ and/or sell copies of the Software, and to permit persons to whom the
+ Software is furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE.
+"""
+
+__all__ = ["VERSION", "version_info"]
+
+VERSION = "2.0.0a1"
+
+
+def version_info() -> str:
+ """Return complete version information for Sequentia and its
+ dependencies.
+ """
+ import importlib.metadata
+ import importlib.util
+ import platform
+ import sys
+ from pathlib import Path
+
+ # get data about packages that:
+ # - are closely related to Sequentia,
+ # - use Sequentia,
+ # - often conflict with Sequentia.
+ package_names = {
+ "numba",
+ "numpy",
+ "hmmlearn",
+ "dtaidistance",
+ "scikit-learn",
+ "scipy",
+ "joblib",
+ "pydantic",
+ }
+ related_packages = []
+
+ for dist in importlib.metadata.distributions():
+ name = dist.metadata["Name"]
+ if name in package_names:
+ entry = f"{name}-{dist.version}"
+ if name == "dtaidistance":
+ clib = bool(importlib.util.find_spec("dtaidistance.dtw_cc"))
+ entry = f"{entry} (c={clib})"
+ related_packages.append(entry)
+
+ info = {
+ "sequentia version": VERSION,
+ "install path": Path(__file__).resolve().parent,
+ "python version": sys.version,
+ "platform": platform.platform(),
+ "related packages": ", ".join(related_packages),
+ }
+ return "\n".join(
+ "{:>30} {}".format(k + ":", str(v).replace("\n", " ")) #
+ for k, v in info.items()
+ )
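For quick reference, the function can be used as follows (a minimal sketch, assuming the package is installed and importable as `sequentia`):

# Minimal sketch -- assumes sequentia is installed in the environment.
from sequentia.version import VERSION, version_info

print(VERSION)         # "2.0.0a1"
print(version_info())  # right-aligned table of environment details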
diff --git a/setup.py b/setup.py
deleted file mode 100644
index b6c13f53..00000000
--- a/setup.py
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-
-import re
-from setuptools import setup, find_packages
-from pathlib import Path
-
-with open('README.md', 'r', encoding='utf8') as fh:
- long_description = fh.read()
-
-init = Path(__file__).parent / "lib" / "sequentia" / "__init__.py"
-def load_meta(meta):
- with open(init, "r") as file:
- info = re.search(rf'^__{meta}__\s*=\s*[\'"]([^\'"]*)[\'"]', file.read(), re.MULTILINE).group(1)
- if not info:
- raise RuntimeError(f"Could not load {repr(meta)} metadata")
- return info
-
-setup(
- name = load_meta("name"),
- version = load_meta("version"),
- author = load_meta("author"),
- author_email = load_meta("email"),
- description = 'HMM and DTW-based sequence machine learning algorithms in Python following an sklearn-like interface.',
- long_description = long_description,
- long_description_content_type = 'text/markdown',
- url = 'https://github.com/eonu/sequentia',
- project_urls = {
- 'Documentation': 'https://sequentia.readthedocs.io/en/latest',
- 'Bug Tracker': 'https://github.com/eonu/sequentia/issues',
- 'Source Code': 'https://github.com/eonu/sequentia',
- },
- license = 'MIT',
- package_dir = {'': 'lib'},
- packages = find_packages(where='lib'),
- package_data={
- 'sequentia': [
- 'datasets/data/digits.npz',
- 'datasets/data/gene_familites.npz',
- ]
- },
- classifiers = [
- 'Development Status :: 5 - Production/Stable',
- 'Programming Language :: Python :: 3.8',
- 'Programming Language :: Python :: 3.9',
- 'Programming Language :: Python :: 3.10',
- 'License :: OSI Approved :: MIT License',
- 'Operating System :: Unix',
- 'Operating System :: MacOS',
- 'Intended Audience :: Science/Research',
- 'Topic :: Software Development',
- 'Topic :: Scientific/Engineering',
- 'Natural Language :: English',
- ],
- python_requires = '>=3.8',
- setup_requires = [
- 'Cython>=0.28.5',
- 'numpy>=1.18,<1.24',
- 'scipy>=1.3',
- ],
- install_requires = [
- 'numba>=0.56',
- 'numpy>=1.18,<1.24',
- 'hmmlearn>=0.2.8',
- 'dtaidistance>=2.3.10', # [numpy]
- 'scikit-learn>=1.0',
- 'joblib>=0.14',
- 'pydantic<1.9',
- ],
- extras_require = {
- 'dev': [
- 'sphinx',
- 'numpydoc',
- 'sphinx_rtd_theme',
- 'sphinx-autodoc-typehints',
- 'sphinx-autobuild',
- 'm2r2',
- 'mistune==0.8.4',
- 'Jinja2',
- 'pytest',
- ]
- }
-)
diff --git a/tasks.py b/tasks.py
new file mode 100644
index 00000000..b0755953
--- /dev/null
+++ b/tasks.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Main invoke task collection."""
+
+from __future__ import annotations
+
+from invoke.collection import Collection
+from invoke.config import Config
+from invoke.tasks import task
+
+from make import cov, docs, lint, release, tests
+
+
+@task
+def install(c: Config) -> None:
+ """Install package with pre-commit hooks and core, dev, docs, & test
+ dependencies.
+ """
+ # install dependencies
+ # NOTE: only including docs/tests dependencies to please editors
+ c.run("poetry install --sync --only base,main,dev,docs,tests")
+ # install pre-commit hooks
+ c.run("pre-commit install --install-hooks")
+
+
+@task
+def clean(c: Config) -> None:
+ """Clean temporary files, local cache and build artifacts."""
+ commands: list[str] = [
+ "rm -rf `find . -name __pycache__`",
+ "rm -f `find . -type f -name '*.py[co]'`",
+ "rm -f `find . -type f -name '*~'`",
+ "rm -f `find . -type f -name '.*~'`",
+ "rm -rf .cache",
+ "rm -rf .pytest_cache",
+ "rm -rf .ruff_cache",
+ "rm -rf .tox",
+ "rm -rf htmlcov",
+ "rm -rf *.egg-info",
+ "rm -f .coverage",
+ "rm -f .coverage.*",
+ "rm -rf build",
+ "rm -rf dist",
+ "rm -rf site",
+ "rm -rf docs/build",
+ "rm -rf coverage.xml",
+ ]
+ for command in commands:
+ c.run(command)
+
+
+# create top-level namespace
+namespace = Collection()
+
+# register top-level commands
+for t in (install, clean):
+ namespace.add_task(t)
+
+# register namespaces
+for module in (docs, tests, cov, lint, release):
+ collection = Collection.from_module(module)
+ namespace.add_collection(collection)
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..8d3537b2
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Unit tests."""
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 00000000..db78dcc3
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Unit test configuration."""
+
+from __future__ import annotations
+
+import itertools
+import typing as t
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose, assert_almost_equal
+
+from sequentia._internal._typing import Array
+
+
+class Helpers:
+ """Utility functions to be accessed via a fixture."""
+
+ @staticmethod
+ def combinations(string: str, /) -> t.Iterable[str]:
+ return map( # noqa: C417
+ lambda params: "".join(params),
+ itertools.chain.from_iterable(
+ itertools.combinations(string, i)
+ for i in range(1, len(string))
+ ),
+ )
+
+ @staticmethod
+ def assert_equal(a: Array, b: Array, /) -> None:
+ assert_allclose(a, b, rtol=1e-3)
+
+ @staticmethod
+ def assert_not_equal(a: Array, b: Array, /) -> None:
+ assert not np.allclose(a, b, rtol=1e-3)
+
+ @classmethod
+ def assert_all_equal(cls: type[Helpers], A: Array, B: Array, /) -> None:
+ for a, b in zip(A, B):
+ cls.assert_equal(a, b)
+
+ @classmethod
+ def assert_all_not_equal(
+ cls: type[Helpers],
+ A: Array,
+ B: Array,
+ /,
+ ) -> None:
+ for a, b in zip(A, B):
+ cls.assert_not_equal(a, b)
+
+ @staticmethod
+ def assert_distribution(x: Array, /) -> None:
+ if x.ndim == 1:
+ assert_almost_equal(x.sum(), 1.0, decimal=5)
+ elif x.ndim == 2:
+ assert_almost_equal(x.sum(axis=1), np.ones(len(x)))
+
+
+@pytest.fixture()
+def helpers() -> type[Helpers]:
+ return Helpers
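As an aside, `Helpers.combinations` yields every non-empty proper subsequence of the given string; a standalone sketch (illustrative, not part of the test suite):

import itertools

# Standalone equivalent of Helpers.combinations -- note that
# range(1, len(string)) deliberately excludes the full string itself.
def combinations(string: str):
    return map(
        "".join,
        itertools.chain.from_iterable(
            itertools.combinations(string, i)
            for i in range(1, len(string))
        ),
    )

print(list(combinations("abc")))  # ['a', 'b', 'c', 'ab', 'ac', 'bc']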
diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py
new file mode 100644
index 00000000..cd11e405
--- /dev/null
+++ b/tests/unit/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_datasets/__init__.py b/tests/unit/test_datasets/__init__.py
new file mode 100644
index 00000000..cd11e405
--- /dev/null
+++ b/tests/unit/test_datasets/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_datasets/test_base.py b/tests/unit/test_datasets/test_base.py
new file mode 100644
index 00000000..17bbe1ba
--- /dev/null
+++ b/tests/unit/test_datasets/test_base.py
@@ -0,0 +1,205 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+import os
+import tempfile
+import typing as t
+
+import numpy as np
+import pytest
+from _pytest.fixtures import SubRequest
+
+from sequentia._internal import _data
+from sequentia.datasets.base import SequentialDataset
+
+
+@pytest.mark.parametrize("y_type", [int, float, None])
+@pytest.mark.parametrize("use_lengths", [True, False])
+def test_data(
+ request: SubRequest, helpers: t.Any, y_type: type, *, use_lengths: bool
+) -> None:
+ X = np.atleast_2d(np.arange(10)).T
+
+ if y_type == int:
+ y = [10, 15, 10]
+ elif y_type == float:
+ y = [10.1, 15.2, 20.3]
+ elif y_type is None:
+ y = None
+
+ if use_lengths:
+ lengths = [2, 3, 5]
+ else:
+ lengths = None
+ if y_type:
+ y = y[:1]
+
+ data = SequentialDataset(X, y, lengths=lengths)
+
+ # X
+ helpers.assert_equal(data.X, X)
+
+ # y, classes
+ if y_type == int:
+ assert np.issubdtype(data.y.dtype, np.integer)
+ helpers.assert_equal(data.y, np.array(y))
+ helpers.assert_equal(data.classes, [10, 15] if lengths else [10])
+ elif y_type == float:
+ assert np.issubdtype(data.y.dtype, np.floating)
+ helpers.assert_equal(data.y, np.array(y))
+ assert data.classes is None
+ elif y_type is None:
+ for prop in ("y", "X_y", "X_y_lengths"):
+ with pytest.raises(AttributeError):
+ getattr(data, prop)
+ assert data.classes is None
+
+ # idxs
+ if lengths:
+ helpers.assert_equal(
+ data.idxs,
+ [
+ [0, 2],
+ [2, 5],
+ [5, 10],
+ ],
+ )
+ else:
+ helpers.assert_equal(data.idxs, [[0, 10]])
+
+ # _data.iter_X
+ helpers.assert_equal(
+ data.X,
+ np.vstack(list(_data.iter_X(data.X, idxs=data.idxs))),
+ )
+
+ # __getitem__
+ if y_type:
+ if lengths:
+ # [0]
+ x, y_ = data[0]
+ helpers.assert_equal(x, np.atleast_2d([0, 1]).T)
+ assert y_ == y[0]
+ # [:1]
+ xs, ys = data[:1]
+ helpers.assert_all_equal(xs, [np.atleast_2d([0, 1]).T])
+ helpers.assert_equal(ys, y[:1])
+ # [1:3]
+ xs, ys = data[1:3]
+ helpers.assert_all_equal(
+ xs,
+ [np.atleast_2d([2, 3, 4]).T, np.atleast_2d([5, 6, 7, 8, 9]).T],
+ )
+ helpers.assert_equal(ys, y[1:3])
+ # [-1]
+ x, y_ = data[-1]
+ helpers.assert_equal(x, np.atleast_2d([5, 6, 7, 8, 9]).T)
+ assert y_ == y[-1]
+ # [-2:]
+ xs, ys = data[-2:]
+ helpers.assert_all_equal(
+ xs,
+ [np.atleast_2d([2, 3, 4]).T, np.atleast_2d([5, 6, 7, 8, 9]).T],
+ )
+ helpers.assert_equal(ys, y[-2:])
+ else:
+ # [0]
+ x, y_ = data[0]
+ helpers.assert_equal(x, X)
+ assert y_ == y
+ # [:1]
+ xs, ys = data[:1]
+ helpers.assert_all_equal(xs, [X])
+ helpers.assert_equal(ys, y)
+ elif lengths:
+ # [0]
+ x = data[0]
+ helpers.assert_equal(x, np.atleast_2d([0, 1]).T)
+ # [:1]
+ xs = data[:1]
+ helpers.assert_all_equal(xs, [np.atleast_2d([0, 1]).T])
+ # [1:3]
+ xs = data[1:3]
+ helpers.assert_all_equal(
+ xs, [np.atleast_2d([2, 3, 4]).T, np.atleast_2d([5, 6, 7, 8, 9]).T]
+ )
+ # [-1]
+ x = data[-1]
+ helpers.assert_equal(x, np.atleast_2d([5, 6, 7, 8, 9]).T)
+ # [-2:]
+ xs = data[-2:]
+ helpers.assert_all_equal(
+ xs, [np.atleast_2d([2, 3, 4]).T, np.atleast_2d([5, 6, 7, 8, 9]).T]
+ )
+ else:
+ # [0]
+ x = data[0]
+ helpers.assert_equal(x, X)
+ # [:1]
+ xs = data[:1]
+ helpers.assert_all_equal(xs, [X])
+
+ # split
+ if y and lengths:
+ train, test = data.split(test_size=1 / 3, shuffle=False)
+ assert len(train) == 2
+ assert len(test) == 1
+ helpers.assert_equal(train.lengths, data.lengths[: len(train)])
+ helpers.assert_equal(test.lengths, data.lengths[-len(test) :])
+ helpers.assert_equal(train.X, data.X[: train.lengths.sum()])
+ helpers.assert_equal(test.X, data.X[-test.lengths.sum() :])
+ helpers.assert_equal(train.y, data.y[: len(train)])
+ helpers.assert_equal(test.y, data.y[-len(test) :])
+
+ # iter_by_class
+ if y_type == int and lengths:
+ for X_, lengths_, c in data.iter_by_class():
+ if c == 10:
+ helpers.assert_equal(lengths_, [2, 5])
+ helpers.assert_equal(X_, np.vstack([data.X[:2], data.X[-5:]]))
+ elif c == 15:
+ helpers.assert_equal(lengths_, [3])
+ helpers.assert_equal(X_, data.X[2:5])
+
+ # check serialization/deserialization
+ with tempfile.TemporaryDirectory() as temp_dir:
+ data_path = f"{temp_dir}/{request.node.originalname}.npz"
+ # check that save works
+ data.save(data_path)
+ assert os.path.isfile(data_path)
+ # check that load works
+ data_load = SequentialDataset.load(data_path)
+ # check that stored values are the same
+ helpers.assert_equal(data._X, data_load._X)
+ helpers.assert_equal(data._lengths, data_load._lengths)
+ if y:
+ helpers.assert_equal(data._y, data_load._y)
+ else:
+ assert data._y is None
+ assert data_load._y is None
+ if data._classes is not None:
+ helpers.assert_equal(data._classes, data_load._classes)
+ else:
+ assert data._classes is None
+ assert data_load._classes is None
+
+ # copy - check that stored values are the same
+ data_copy = data.copy()
+ helpers.assert_equal(data._X, data_copy._X)
+ assert not np.shares_memory(data._X, data_copy._X)
+ helpers.assert_equal(data._lengths, data_copy._lengths)
+ assert not np.shares_memory(data._lengths, data_copy._lengths)
+ if y:
+ helpers.assert_equal(data._y, data_copy._y)
+ assert not np.shares_memory(data._y, data_copy._y)
+ else:
+ assert data._y is None
+ assert data_copy._y is None
+ if data._classes is not None:
+ helpers.assert_equal(data._classes, data_copy._classes)
+ assert not np.shares_memory(data._classes, data_copy._classes)
+ else:
+ assert data._classes is None
+ assert data_copy._classes is None
diff --git a/tests/unit/test_datasets/test_digits.py b/tests/unit/test_datasets/test_digits.py
new file mode 100644
index 00000000..6aab914f
--- /dev/null
+++ b/tests/unit/test_datasets/test_digits.py
@@ -0,0 +1,24 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+from __future__ import annotations
+
+import typing as t
+
+import pytest
+
+from sequentia.datasets import load_digits
+
+
+@pytest.mark.parametrize("digits", [list(range(10)), [2, 5]])
+def test_digits(helpers: t.Any, digits: list[int]) -> None:
+ data = load_digits(digits=digits)
+
+ assert len(data) == 300 * len(digits)
+ helpers.assert_equal(data.classes, digits)
+ assert set(data.y) == set(digits)
+
+ for _, lengths, _ in data.iter_by_class():
+ assert len(lengths) == 300
diff --git a/tests/unit/test_datasets/test_gene_families.py b/tests/unit/test_datasets/test_gene_families.py
new file mode 100644
index 00000000..2baae105
--- /dev/null
+++ b/tests/unit/test_datasets/test_gene_families.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+from __future__ import annotations
+
+import typing as t
+
+import pytest
+
+from sequentia.datasets import load_gene_families
+
+counts = {0: 531, 1: 534, 2: 349, 3: 672, 4: 711, 5: 240, 6: 1343}
+
+
+@pytest.mark.parametrize("families", [list(range(7)), [2, 5]])
+def test_gene_families(helpers: t.Any, families: list[int]) -> None:
+ data, enc = load_gene_families(families=families)
+
+ assert set(enc.classes_) == {"A", "C", "G", "N", "T"}
+
+ helpers.assert_equal(data.classes, families)
+ assert set(data.y) == set(families)
+
+ for family in families:
+ assert (data.y == family).sum() == counts[family]
diff --git a/tests/unit/test_internal/__init__.py b/tests/unit/test_internal/__init__.py
new file mode 100644
index 00000000..cd11e405
--- /dev/null
+++ b/tests/unit/test_internal/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_internal/test_data.py b/tests/unit/test_internal/test_data.py
new file mode 100644
index 00000000..b0421f9b
--- /dev/null
+++ b/tests/unit/test_internal/test_data.py
@@ -0,0 +1,6 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Covered in tests/unit/test_datasets/test_base.py"""
diff --git a/tests/unit/test_internal/test_hmm/__init__.py b/tests/unit/test_internal/test_hmm/__init__.py
new file mode 100644
index 00000000..cd11e405
--- /dev/null
+++ b/tests/unit/test_internal/test_hmm/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_internal/test_hmm/test_topologies.py b/tests/unit/test_internal/test_hmm/test_topologies.py
new file mode 100644
index 00000000..9b7e7ce8
--- /dev/null
+++ b/tests/unit/test_internal/test_hmm/test_topologies.py
@@ -0,0 +1,474 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+import typing as t
+
+import numpy as np
+import pytest
+
+from sequentia._internal._hmm import topologies
+
+
+@pytest.fixture(scope="module")
+def random_state() -> np.random.RandomState:
+ return np.random.RandomState(0)
+
+
+def test_base_uniform_start_probs_min(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a uniform initial state distribution with the minimum number of states"""
+ topology = topologies.BaseTopology(n_states=1, random_state=random_state)
+ start_probs = topology.uniform_start_probs()
+ helpers.assert_distribution(start_probs)
+ helpers.assert_equal(start_probs, np.array([1.0]))
+
+
+def test_base_uniform_start_probs_small(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a uniform initial state distribution with a few states"""
+ topology = topologies.BaseTopology(n_states=2, random_state=random_state)
+ start_probs = topology.uniform_start_probs()
+ helpers.assert_distribution(start_probs)
+ helpers.assert_equal(start_probs, np.array([0.5, 0.5]))
+
+
+def test_base_uniform_start_probs_many(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a uniform initial state distribution with many states"""
+ topology = topologies.BaseTopology(n_states=5, random_state=random_state)
+ start_probs = topology.uniform_start_probs()
+ helpers.assert_distribution(start_probs)
+ helpers.assert_equal(start_probs, np.array([0.2, 0.2, 0.2, 0.2, 0.2]))
+
+
+def test_base_random_start_probs_min(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a random initial state distribution with minimal states"""
+ topology = topologies.BaseTopology(n_states=1, random_state=random_state)
+ start_probs = topology.random_start_probs()
+ helpers.assert_distribution(start_probs)
+ helpers.assert_equal(start_probs, np.array([1.0]))
+
+
+def test_base_random_start_probs_small(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a random initial state distribution with few states"""
+ topology = topologies.BaseTopology(n_states=2, random_state=random_state)
+ start_probs = topology.random_start_probs()
+ helpers.assert_distribution(start_probs)
+ helpers.assert_equal(start_probs, np.array([0.57633871, 0.42366129]))
+
+
+def test_base_random_initial_many(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a random initial state distribution with many states"""
+ topology = topologies.BaseTopology(n_states=5, random_state=random_state)
+ start_probs = topology.random_start_probs()
+ helpers.assert_distribution(start_probs)
+ helpers.assert_equal(
+ start_probs,
+ np.array([0.15210286, 0.10647349, 0.20059295, 0.11120171, 0.42962898]),
+ )
+
+
+def test_left_right_uniform_transitions_min(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a uniform left-right transition matrix with minimal states"""
+ topology = topologies.LeftRightTopology(
+ n_states=1, random_state=random_state
+ )
+ transitions = topology.uniform_transition_probs()
+ helpers.assert_distribution(transitions)
+ helpers.assert_equal(transitions, np.array([[1.0]]))
+
+
+def test_left_right_uniform_transitions_small(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a uniform left-right transition matrix with few states"""
+ topology = topologies.LeftRightTopology(
+ n_states=2, random_state=random_state
+ )
+ transitions = topology.uniform_transition_probs()
+ helpers.assert_distribution(transitions)
+ helpers.assert_equal(
+ transitions,
+ np.array(
+ [
+ [0.5, 0.5],
+ [0.0, 1.0],
+ ]
+ ),
+ )
+
+
+def test_left_right_uniform_transitions_many(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a uniform left-right transition matrix with many states"""
+ topology = topologies.LeftRightTopology(
+ n_states=5, random_state=random_state
+ )
+ transitions = topology.uniform_transition_probs()
+ helpers.assert_distribution(transitions)
+ helpers.assert_equal(
+ transitions,
+ np.array(
+ [
+ [0.2, 0.2, 0.2, 0.2, 0.2],
+ [0.0, 0.25, 0.25, 0.25, 0.25],
+ [0.0, 0.0, 0.33333333, 0.33333333, 0.33333333],
+ [0.0, 0.0, 0.0, 0.5, 0.5],
+ [0.0, 0.0, 0.0, 0.0, 1.0],
+ ]
+ ),
+ )
+
+
+def test_left_right_random_transitions_min(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a random left-right transition matrix with minimal states"""
+ topology = topologies.LeftRightTopology(
+ n_states=1, random_state=random_state
+ )
+ transitions = topology.random_transition_probs()
+ helpers.assert_distribution(transitions)
+ helpers.assert_equal(transitions, np.array([[1.0]]))
+
+
+def test_left_right_random_transitions_small(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a random left-right transition matrix with few states"""
+ topology = topologies.LeftRightTopology(
+ n_states=2, random_state=random_state
+ )
+ transitions = topology.random_transition_probs()
+ helpers.assert_distribution(transitions)
+ helpers.assert_equal(
+ transitions,
+ np.array(
+ [
+ [0.23561633, 0.76438367],
+ [0.0, 1.0],
+ ]
+ ),
+ )
+
+
+def test_left_right_random_transitions_many(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a random left-right transition matrix with many states"""
+ topology = topologies.LeftRightTopology(
+ n_states=5, random_state=random_state
+ )
+ transitions = topology.random_transition_probs()
+ helpers.assert_distribution(transitions)
+ helpers.assert_equal(
+ transitions,
+ np.array(
+ [
+ [0.23169814, 0.71716356, 0.02033845, 0.02516204, 0.00563782],
+ [0.0, 0.19474072, 0.16405008, 0.22228532, 0.41892388],
+ [0.0, 0.0, 0.42912755, 0.16545797, 0.40541448],
+ [0.0, 0.0, 0.0, 0.109713, 0.890287],
+ [0.0, 0.0, 0.0, 0.0, 1.0],
+ ]
+ ),
+ )
+
+
+def test_left_right_check_transitions_invalid(
+ random_state: np.random.RandomState,
+) -> None:
+ """Validate an invalid left-right transition matrix"""
+ topology = topologies.LeftRightTopology(
+ n_states=5, random_state=random_state
+ )
+ transitions = topologies.ErgodicTopology(
+ n_states=5, random_state=random_state
+ ).random_transition_probs()
+ with pytest.raises(ValueError) as e: # noqa: PT011
+ topology.check_transition_probs(transitions)
+ assert (
+ str(e.value) == "Left-right transition matrix must be upper-triangular"
+ )
+
+
+def test_left_right_check_transitions_valid(
+ random_state: np.random.RandomState,
+) -> None:
+ """Validate a valid left-right transition matrix"""
+ topology = topologies.LeftRightTopology(
+ n_states=5, random_state=random_state
+ )
+ transitions = topology.random_transition_probs()
+ topology.check_transition_probs(transitions)
+
+
+def test_ergodic_uniform_transitions_min(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a uniform ergodic transition matrix with minimal states"""
+ topology = topologies.ErgodicTopology(
+ n_states=1, random_state=random_state
+ )
+ transitions = topology.uniform_transition_probs()
+ helpers.assert_distribution(transitions)
+ helpers.assert_equal(transitions, np.array([[1.0]]))
+
+
+def test_ergodic_uniform_transitions_small(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a uniform ergodic transition matrix with few states"""
+ topology = topologies.ErgodicTopology(
+ n_states=2, random_state=random_state
+ )
+ transitions = topology.uniform_transition_probs()
+ helpers.assert_distribution(transitions)
+ helpers.assert_equal(
+ transitions,
+ np.array(
+ [
+ [0.5, 0.5],
+ [0.5, 0.5],
+ ]
+ ),
+ )
+
+
+def test_ergodic_uniform_transitions_many(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a uniform ergodic transition matrix with many states"""
+ topology = topologies.ErgodicTopology(
+ n_states=5, random_state=random_state
+ )
+ transitions = topology.uniform_transition_probs()
+ helpers.assert_distribution(transitions)
+ helpers.assert_equal(
+ transitions,
+ np.array(
+ [
+ [0.2, 0.2, 0.2, 0.2, 0.2],
+ [0.2, 0.2, 0.2, 0.2, 0.2],
+ [0.2, 0.2, 0.2, 0.2, 0.2],
+ [0.2, 0.2, 0.2, 0.2, 0.2],
+ [0.2, 0.2, 0.2, 0.2, 0.2],
+ ]
+ ),
+ )
+
+
+def test_ergodic_random_transitions_min(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a random ergodic transition matrix with minimal states"""
+ topology = topologies.ErgodicTopology(
+ n_states=1, random_state=random_state
+ )
+ transitions = topology.random_transition_probs()
+ helpers.assert_distribution(transitions)
+ helpers.assert_equal(transitions, np.array([[1.0]]))
+
+
+def test_ergodic_random_transitions_small(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a random ergodic transition matrix with few states"""
+ topology = topologies.ErgodicTopology(
+ n_states=2, random_state=random_state
+ )
+ transitions = topology.random_transition_probs()
+ helpers.assert_distribution(transitions)
+ helpers.assert_equal(
+ transitions,
+ np.array(
+ [
+ [0.9474011, 0.0525989],
+ [0.85567599, 0.14432401],
+ ]
+ ),
+ )
+
+
+def test_ergodic_random_transitions_many(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a random ergodic transition matrix with many states"""
+ topology = topologies.ErgodicTopology(
+ n_states=5, random_state=random_state
+ )
+ transitions = topology.random_transition_probs()
+ helpers.assert_distribution(transitions)
+ helpers.assert_equal(
+ transitions,
+ np.array(
+ [
+ [0.58715548, 0.14491542, 0.20980762, 0.00623944, 0.05188205],
+ [0.0840705, 0.23055049, 0.08297536, 0.25124688, 0.35115677],
+ [0.02117615, 0.37664662, 0.26705912, 0.09851123, 0.23660688],
+ [0.01938041, 0.16853843, 0.52046123, 0.07535256, 0.21626737],
+ [0.04996846, 0.44545843, 0.12079423, 0.07154241, 0.31223646],
+ ]
+ ),
+ )
+
+
+def test_ergodic_check_transitions_invalid(
+ random_state: np.random.RandomState
+) -> None:
+ """Validate an invalid ergodic transition matrix"""
+ topology = topologies.ErgodicTopology(
+ n_states=5, random_state=random_state
+ )
+ transitions = topologies.LeftRightTopology(
+ n_states=5, random_state=random_state
+ ).random_transition_probs()
+ with pytest.warns(UserWarning):
+ topology.check_transition_probs(transitions)
+
+
+def test_ergodic_check_transitions_valid(
+ random_state: np.random.RandomState
+) -> None:
+ """Validate a valid ergodic transition matrix"""
+ topology = topologies.ErgodicTopology(
+ n_states=5, random_state=random_state
+ )
+ transitions = topology.random_transition_probs()
+ topology.check_transition_probs(transitions)
+
+
+def test_linear_uniform_transitions_min(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a uniform linear transition matrix with minimal states"""
+ topology = topologies.LinearTopology(n_states=1, random_state=random_state)
+ transitions = topology.uniform_transition_probs()
+ helpers.assert_distribution(transitions)
+ helpers.assert_equal(transitions, np.array([[1.0]]))
+
+
+def test_linear_uniform_transitions_small(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a uniform linear transition matrix with few states"""
+ topology = topologies.LinearTopology(n_states=2, random_state=random_state)
+ transitions = topology.uniform_transition_probs()
+ helpers.assert_distribution(transitions)
+ helpers.assert_equal(
+ transitions,
+ np.array(
+ [
+ [0.5, 0.5],
+ [0.0, 1.0],
+ ]
+ ),
+ )
+
+
+def test_linear_uniform_transitions_many(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a uniform linear transition matrix with many states"""
+ topology = topologies.LinearTopology(n_states=5, random_state=random_state)
+ transitions = topology.uniform_transition_probs()
+ helpers.assert_distribution(transitions)
+ helpers.assert_equal(
+ transitions,
+ np.array(
+ [
+ [0.5, 0.5, 0.0, 0.0, 0.0],
+ [0.0, 0.5, 0.5, 0.0, 0.0],
+ [0.0, 0.0, 0.5, 0.5, 0.0],
+ [0.0, 0.0, 0.0, 0.5, 0.5],
+ [0.0, 0.0, 0.0, 0.0, 1.0],
+ ]
+ ),
+ )
+
+
+def test_linear_random_transitions_min(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a random linear transition matrix with minimal states"""
+ topology = topologies.LinearTopology(n_states=1, random_state=random_state)
+ transitions = topology.random_transition_probs()
+ helpers.assert_distribution(transitions)
+ helpers.assert_equal(transitions, np.array([[1.0]]))
+
+
+def test_linear_random_transitions_small(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a random linear transition matrix with few states"""
+ topology = topologies.LinearTopology(n_states=2, random_state=random_state)
+ transitions = topology.random_transition_probs()
+ helpers.assert_distribution(transitions)
+ helpers.assert_equal(
+ transitions,
+ np.array(
+ [
+ [0.65157396, 0.34842604],
+ [0.0, 1.0],
+ ]
+ ),
+ )
+
+
+def test_linear_random_transitions_many(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+ """Generate a random linear transition matrix with many states"""
+ topology = topologies.LinearTopology(n_states=5, random_state=random_state)
+ transitions = topology.random_transition_probs()
+ helpers.assert_distribution(transitions)
+ helpers.assert_equal(
+ transitions,
+ np.array(
+ [
+ [0.44455421, 0.55544579, 0.0, 0.0, 0.0],
+ [0.0, 0.57553614, 0.42446386, 0.0, 0.0],
+ [0.0, 0.0, 0.92014965, 0.07985035, 0.0],
+ [0.0, 0.0, 0.0, 0.66790982, 0.33209018],
+ [0.0, 0.0, 0.0, 0.0, 1.0],
+ ]
+ ),
+ )
+
+
+def test_linear_check_transitions_invalid(
+ random_state: np.random.RandomState
+) -> None:
+ """Validate an invalid linear transition matrix"""
+ topology = topologies.LinearTopology(n_states=5, random_state=random_state)
+ transitions = topologies.ErgodicTopology(
+ n_states=5, random_state=random_state
+ ).random_transition_probs()
+ with pytest.raises(ValueError) as e: # noqa: PT011
+ topology.check_transition_probs(transitions)
+ assert (
+ str(e.value) == "Left-right transition matrix must be upper-triangular"
+ )
+
+
+def test_linear_check_transitions_valid(
+ random_state: np.random.RandomState
+) -> None:
+ """Validate a valid linear transition matrix"""
+ topology = topologies.LinearTopology(n_states=5, random_state=random_state)
+ transitions = topology.random_transition_probs()
+ topology.check_transition_probs(transitions)
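The expected matrices in these tests follow a simple pattern; a sketch of how the uniform left-right case can be reproduced (illustrative only, not the library's implementation):

import numpy as np

# Each row i of a uniform left-right matrix spreads probability evenly
# over states i..n-1, giving the upper-triangular structure tested above.
n = 5
upper = np.triu(np.ones((n, n)))
transitions = upper / upper.sum(axis=1, keepdims=True)
print(transitions[2])  # [0. 0. 0.33333333 0.33333333 0.33333333]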
diff --git a/tests/unit/test_models/__init__.py b/tests/unit/test_models/__init__.py
new file mode 100644
index 00000000..cd11e405
--- /dev/null
+++ b/tests/unit/test_models/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_models/hmm/__init__.py b/tests/unit/test_models/hmm/__init__.py
new file mode 100644
index 00000000..cd11e405
--- /dev/null
+++ b/tests/unit/test_models/hmm/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_models/hmm/test_classifier.py b/tests/unit/test_models/hmm/test_classifier.py
new file mode 100644
index 00000000..0ca4dbd4
--- /dev/null
+++ b/tests/unit/test_models/hmm/test_classifier.py
@@ -0,0 +1,203 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+from __future__ import annotations
+
+import copy
+import os
+import tempfile
+import typing as t
+from unittest.mock import Mock
+
+import numpy as np
+import pytest
+from _pytest.fixtures import SubRequest
+
+from sequentia import enums
+from sequentia._internal import _validation
+from sequentia.datasets import (
+ SequentialDataset,
+ load_digits,
+ load_gene_families,
+)
+from sequentia.models.hmm import (
+ CategoricalHMM,
+ GaussianMixtureHMM,
+ HMMClassifier,
+)
+from sequentia.models.hmm.variants.base import BaseHMM
+
+from .variants.test_categorical import assert_fit as assert_categorical_fit
+from .variants.test_gaussian_mixture import (
+ assert_fit as assert_gaussian_mixture_fit,
+)
+
+n_classes = 7
+
+
+@pytest.fixture(scope="module")
+def random_state(request: SubRequest) -> np.random.RandomState:
+ return np.random.RandomState(1)
+
+
+@pytest.fixture(scope="module")
+def dataset(request: SubRequest) -> SequentialDataset | None:
+ if request.param == "digits":
+ return load_digits(digits=range(n_classes))
+ if request.param == "gene_families":
+ data, _ = load_gene_families()
+ return data
+ return None
+
+
+@pytest.fixture(scope="module")
+def model(
+ random_state: np.random.RandomState, request: SubRequest
+) -> BaseHMM | None:
+ if request.param == "gaussian_mixture":
+ return GaussianMixtureHMM(
+ topology=enums.TopologyMode.LEFT_RIGHT,
+ n_states=2,
+ n_components=1,
+ random_state=random_state,
+ )
+ if request.param == "categorical":
+ return CategoricalHMM(
+ topology=enums.TopologyMode.LEFT_RIGHT,
+ n_states=2,
+ random_state=random_state,
+ )
+ return None
+
+
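+# Minimal stand-in for SequentialDataset: exposes only len() and a
+# lengths attribute, which is all that assert_fit needs below.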
+class MockData:
+    def __init__(self, length: int) -> None:
+        self.length = length
+
+    def __len__(self) -> int:
+        return self.length
+
+    @property
+    def lengths(self) -> MockData:
+        return MockData(self.length)
+
+
+def assert_fit(clf: HMMClassifier) -> None:
+ assert hasattr(clf, "prior_")
+ assert hasattr(clf, "classes_")
+ assert _validation.check_is_fitted(clf, return_=True)
+
+ for hmm in clf.models.values():
+ data = MockData(hmm.n_seqs_)
+ if isinstance(hmm, GaussianMixtureHMM):
+ assert_gaussian_mixture_fit(hmm, data=data)
+ elif isinstance(hmm, CategoricalHMM):
+ assert_categorical_fit(hmm, data=data)
+
+
+@pytest.mark.parametrize(
+ "model, dataset", # noqa: PT006
+ [("gaussian_mixture", "digits"), ("categorical", "gene_families")],
+ indirect=True,
+)
+@pytest.mark.parametrize(
+ "prior",
+ [
+ enums.PriorMode.UNIFORM,
+ enums.PriorMode.FREQUENCY,
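+        # custom prior: probabilities proportional to (class + 1),
+        # normalized by the triangular number so that they sum to 1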
+ {
+ i: (i + 1) / (n_classes * (n_classes + 1) / 2)
+ for i in range(n_classes)
+ },
+ ],
+)
+@pytest.mark.parametrize("prefit", [True, False])
+def test_classifier_e2e(
+ request: SubRequest,
+ helpers: t.Any,
+ model: BaseHMM,
+ dataset: SequentialDataset,
+ prior: enums.PriorMode | dict[int, float],
+ random_state: np.random.RandomState,
+ *,
+ prefit: bool,
+) -> None:
+ clf = HMMClassifier(prior=prior)
+ clf.add_models({i: copy.deepcopy(model) for i in range(n_classes)})
+
+ assert clf.prior == prior
+ assert len(clf.models) == n_classes
+ assert set(clf.models) == set(range(n_classes))
+ assert all(isinstance(hmm, type(model)) for hmm in clf.models.values())
+
+ subset, _ = dataset.split(
+ test_size=0.6, random_state=random_state, stratify=True
+ )
+ train, test = subset.split(
+ test_size=0.2, random_state=random_state, stratify=True
+ )
+
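+    # prefit: fit each per-class HMM directly, then call fit() without
+    # data so that the classifier only finalizes its priors and classes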
+ if prefit:
+ for X, lengths, c in train.iter_by_class():
+ clf.models[c].fit(X, lengths=lengths)
+ assert_fit(clf.fit())
+ else:
+ assert_fit(clf.fit(**train.X_y_lengths))
+
+ scores_pred = clf.predict_scores(**test.X_lengths)
+ assert scores_pred.shape == (len(test), n_classes)
+
+ proba_pred = clf.predict_proba(**test.X_lengths)
+ assert proba_pred.shape == (len(test), n_classes)
+ helpers.assert_equal(proba_pred.sum(axis=1), 1)
+ assert ((proba_pred >= 0) & (proba_pred <= 1)).all()
+
+ y_pred = clf.predict(**test.X_lengths)
+ assert y_pred.shape == (len(test),)
+ assert set(y_pred).issubset(set(range(n_classes)))
+
+ acc = clf.score(**test.X_y_lengths)
+ assert 0 <= acc <= 1
+
+ # check serialization/deserialization
+ with tempfile.TemporaryDirectory() as temp_dir:
+ model_path = f"{temp_dir}/{request.node.originalname}.model"
+ # check that save works
+ clf.save(model_path)
+ assert os.path.isfile(model_path)
+ # check that load works
+ clf = HMMClassifier.load(model_path)
+ # check that loaded model is fitted
+ assert_fit(clf)
+ y_pred_load = clf.predict(**test.X_lengths)
+ # check that predictions are the same as before serialization
+ helpers.assert_equal(y_pred, y_pred_load)
+
+
+@pytest.mark.parametrize("classes", [[0, 1, 2], [2, 0, 1]])
+def test_classifier_compute_log_posterior(
+ helpers: t.Any, classes: list[int]
+) -> None:
+ clf = HMMClassifier()
+ clf.classes_ = np.array(classes)
+ clf.prior_ = {i: np.exp(i) for i in clf.classes_}
+ clf.models = {
+ i: Mock(score=Mock(side_effect=lambda _: 0)) for i in clf.classes_
+ }
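+    # with log-prior i (prior_ = e^i) and mocked scores of 0, the log
+    # posterior for class i should be exactly i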
+ helpers.assert_equal(clf._compute_log_posterior(None), clf.classes_)
+
+
+def test_classifier_compute_scores_chunk(helpers: t.Any) -> None:
+ clf = HMMClassifier()
+ clf.classes_ = np.arange(3)
+ clf.prior_ = {i: np.exp(i) for i in clf.classes_}
+ clf.models = {i: Mock(score=Mock(side_effect=len)) for i in clf.classes_}
+ X = np.expand_dims(np.arange(10), axis=-1)
+ idxs = np.array([[0, 0], [1, 2], [3, 5], [6, 9]]) # lengths = 0, 1, 2, 3
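+    # expected score for chunk j and class i: log-prior i plus the
+    # mocked score len(chunk) = j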
+ helpers.assert_equal(
+ clf._compute_scores_chunk(X, idxs=idxs),
+ np.tile(np.expand_dims(clf.classes_, axis=-1), len(idxs)).T
+ + np.expand_dims(np.arange(len(idxs)), axis=-1),
+ )
diff --git a/tests/unit/test_models/hmm/variants/__init__.py b/tests/unit/test_models/hmm/variants/__init__.py
new file mode 100644
index 00000000..cd11e405
--- /dev/null
+++ b/tests/unit/test_models/hmm/variants/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_models/hmm/variants/test_categorical.py b/tests/unit/test_models/hmm/variants/test_categorical.py
new file mode 100644
index 00000000..ba35326e
--- /dev/null
+++ b/tests/unit/test_models/hmm/variants/test_categorical.py
@@ -0,0 +1,234 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+from __future__ import annotations
+
+import typing as t
+
+import hmmlearn
+import numpy as np
+import pytest
+from _pytest.fixtures import SubRequest
+
+from sequentia import enums
+from sequentia._internal import _validation
+from sequentia._internal._hmm.topologies import TOPOLOGY_MAP, BaseTopology
+from sequentia.datasets import SequentialDataset, load_gene_families
+from sequentia.models import CategoricalHMM
+
+from .....conftest import Helpers
+
+
+@pytest.fixture(scope="module")
+def random_state() -> np.random.RandomState:
+ return np.random.RandomState(0)
+
+
+@pytest.fixture(scope="module")
+def data(random_state: np.random.RandomState) -> SequentialDataset:
+ data_, _ = load_gene_families(families=[0])
+ _, subset = data_.split(
+ test_size=0.2, random_state=random_state, stratify=True
+ )
+ return subset
+
+
+@pytest.fixture(scope="module")
+def topology(request: SubRequest) -> BaseTopology:
+ return TOPOLOGY_MAP[request.param]
+
+
+def assert_fit(hmm: CategoricalHMM, /, *, data: SequentialDataset) -> None:
+ assert hmm.n_seqs_ == len(data.lengths)
+ assert (hmm.topology_ is not None) == (hmm.topology is not None)
+ assert isinstance(hmm.model, hmmlearn.hmm.CategoricalHMM)
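+    # hmmlearn's monitor_ records EM iterations, so a fitted model
+    # should have a non-empty history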
+ assert len(hmm.model.monitor_.history) > 0
+ assert _validation.check_is_fitted(hmm, return_=True)
+
+
+def test_categorical_fit_n_states(
+ data: SequentialDataset, random_state: np.random.RandomState
+) -> None:
+ hmm = CategoricalHMM(n_states=7, random_state=random_state)
+
+ assert_fit(hmm.fit(**data.X_lengths), data=data)
+
+ assert hmm.n_states == 7
+
+ assert hmm.model.startprob_.shape == (hmm.n_states,)
+ assert hmm.model.transmat_.shape == (hmm.n_states, hmm.n_states)
+
+
+def test_categorical_fit_no_topology(
+ data: SequentialDataset, random_state: np.random.RandomState
+) -> None:
+ hmm = CategoricalHMM(topology=None, random_state=random_state)
+
+ assert_fit(hmm.fit(**data.X_lengths), data=data)
+
+ assert hmm.topology is None
+ assert hmm.topology_ is None
+
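+    # hmmlearn parameter codes: s = startprob, t = transmat,
+    # e = emissionprob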
+ assert set(hmm.model.init_params) == set("ste")
+ assert set(hmm.model.params) == set("ste")
+
+ assert not hasattr(hmm, "_startprob")
+ assert not hasattr(hmm, "_transmat")
+
+
+@pytest.mark.parametrize("topology", list(enums.TopologyMode), indirect=True)
+@pytest.mark.parametrize(
+ "start_probs_mode", [*list(enums.TransitionMode), None]
+) # None = custom
+def test_categorical_fit_set_state_start_probs(
+ helpers: t.Any,
+ data: SequentialDataset,
+ random_state: np.random.RandomState,
+ topology: BaseTopology,
+ start_probs_mode: enums.TransitionMode | None,
+) -> None:
+ hmm = CategoricalHMM(topology=topology.mode, random_state=random_state)
+ hmm.set_state_start_probs(
+ start_probs_mode
+ or topology(
+ n_states=hmm.n_states, random_state=random_state
+ ).random_start_probs()
+ )
+
+ assert hmm.topology == topology.mode
+ if start_probs_mode is not None:
+ assert hmm._startprob == start_probs_mode
+
+ assert_fit(hmm.fit(**data.X_lengths), data=data)
+
+ assert isinstance(hmm.topology_, topology)
+
+ assert set(hmm.model.init_params) == set("e")
+ assert set(hmm.model.params) == set("ste")
+
+    hmm.topology_.check_start_probs(
+        hmm._startprob
+    ) # initial state dist. before fit
+    hmm.topology_.check_start_probs(
+        hmm.model.startprob_
+    ) # initial state dist. after fit
+
+ if start_probs_mode == enums.TransitionMode.UNIFORM:
+ init_startprob = hmm.topology_.uniform_start_probs()
+ helpers.assert_equal(
+ hmm._startprob, init_startprob
+ ) # initial state probabilities should be uniform
+
+ helpers.assert_not_equal(
+ hmm._startprob, hmm.model.startprob_
+ ) # should update probabilities
+ helpers.assert_equal(
+ hmm._startprob == 0, hmm.model.startprob_ == 0
+ ) # but locations of zeros (if any) shouldn't change
+
+
+@pytest.mark.parametrize("topology", list(enums.TopologyMode), indirect=True)
+@pytest.mark.parametrize(
+ "transition_mode", [*list(enums.TransitionMode), None]
+) # None = custom
+def test_categorical_fit_set_state_transition_probs(
+ helpers: t.Any,
+ data: SequentialDataset,
+ random_state: np.random.RandomState,
+ topology: BaseTopology,
+ transition_mode: enums.TransitionMode | None,
+) -> None:
+ hmm = CategoricalHMM(topology=topology.mode, random_state=random_state)
+ hmm.set_state_transition_probs(
+ transition_mode
+ or topology(
+ n_states=hmm.n_states, random_state=random_state
+ ).random_transition_probs()
+ )
+
+ assert hmm.topology == topology.mode
+ if transition_mode is not None:
+ assert hmm._transmat == transition_mode
+
+ assert_fit(hmm.fit(**data.X_lengths), data=data)
+
+ assert isinstance(hmm.topology_, topology)
+
+ assert set(hmm.model.init_params) == set("e")
+ assert set(hmm.model.params) == set("ste")
+
+ hmm.topology_.check_transition_probs(
+ hmm._transmat
+ ) # transition matrix before fit
+ hmm.topology_.check_transition_probs(
+ hmm.model.transmat_
+ ) # transition matrix after fit
+
+ if transition_mode == enums.TransitionMode.UNIFORM:
+ init_transmat = hmm.topology_.uniform_transition_probs()
+ helpers.assert_equal(
+ hmm._transmat, init_transmat
+ ) # transition probabilities should be uniform
+
+ helpers.assert_not_equal(
+ hmm._transmat, hmm.model.transmat_
+ ) # should update probabilities
+ helpers.assert_equal(
+ hmm._transmat == 0, hmm.model.transmat_ == 0
+ ) # but locations of zeros (if any) shouldn't change
+
+
+@pytest.mark.parametrize("freeze_params", Helpers.combinations("ste"))
+def test_categorical_fit_freeze_unfreeze(
+ helpers: t.Any,
+ data: SequentialDataset,
+ random_state: np.random.RandomState,
+ freeze_params: str,
+) -> None:
+ hmm = CategoricalHMM(
+ topology=enums.TopologyMode.LINEAR,
+ n_states=2,
+ random_state=random_state,
+ )
+ hmm.freeze(freeze_params)
+
+ assert_fit(hmm.fit(**data.X_lengths), data=data)
+
+ assert set(hmm.model.params) == set("ste") - set(freeze_params)
+
+ hmm.topology_.check_start_probs(
+ hmm._startprob
+ ) # initial state dist. before fit
+ hmm.topology_.check_start_probs(
+ hmm.model.startprob_
+ ) # initial state dist. after fit
+ assertion = (
+ helpers.assert_equal
+ if "s" in freeze_params
+ else helpers.assert_not_equal
+ )
+ assertion(hmm._startprob, hmm.model.startprob_)
+
+ hmm.topology_.check_transition_probs(
+ hmm._transmat
+ ) # transition matrix before fit
+ hmm.topology_.check_transition_probs(
+ hmm.model.transmat_
+ ) # transition matrix after fit
+ assertion = (
+ helpers.assert_equal
+ if "t" in freeze_params
+ else helpers.assert_not_equal
+ )
+ assertion(hmm._transmat, hmm.model.transmat_)
+
+ hmm.unfreeze(freeze_params)
+
+ assert_fit(hmm.fit(**data.X_lengths), data=data)
+
+ assert set(hmm.model.params) == set("ste")
+
+ helpers.assert_not_equal(hmm._startprob, hmm.model.startprob_)
+ helpers.assert_not_equal(hmm._transmat, hmm.model.transmat_)
diff --git a/tests/unit/test_models/hmm/variants/test_gaussian_mixture.py b/tests/unit/test_models/hmm/variants/test_gaussian_mixture.py
new file mode 100644
index 00000000..d091099a
--- /dev/null
+++ b/tests/unit/test_models/hmm/variants/test_gaussian_mixture.py
@@ -0,0 +1,304 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+from __future__ import annotations
+
+import typing as t
+
+import hmmlearn
+import numpy as np
+import pytest
+from _pytest.fixtures import SubRequest
+
+from sequentia import enums
+from sequentia._internal import _validation
+from sequentia._internal._hmm.topologies import TOPOLOGY_MAP, BaseTopology
+from sequentia.datasets import SequentialDataset, load_digits
+from sequentia.models import GaussianMixtureHMM
+
+from .....conftest import Helpers
+
+
+@pytest.fixture(scope="module")
+def random_state() -> np.random.RandomState:
+ return np.random.RandomState(0)
+
+
+@pytest.fixture(scope="module")
+def data(random_state: np.random.RandomState) -> SequentialDataset:
+ data_ = load_digits(digits=[0])
+ _, subset = data_.split(
+ test_size=0.2, random_state=random_state, stratify=True
+ )
+ return subset
+
+
+@pytest.fixture(scope="module")
+def topology(request: SubRequest) -> BaseTopology:
+ return TOPOLOGY_MAP[request.param]
+
+
+def assert_fit(hmm: GaussianMixtureHMM, /, *, data: SequentialDataset) -> None:
+ assert hmm.n_seqs_ == len(data.lengths)
+ assert (hmm.topology_ is not None) == (hmm.topology is not None)
+ assert isinstance(hmm.model, hmmlearn.hmm.GMMHMM)
+ assert len(hmm.model.monitor_.history) > 0
+ assert _validation.check_is_fitted(hmm, return_=True)
+
+
+def test_gaussian_mixture_fit_n_states(
+ data: SequentialDataset,
+ random_state: np.random.RandomState,
+) -> None:
+ hmm = GaussianMixtureHMM(n_states=7, random_state=random_state)
+
+ assert_fit(hmm.fit(**data.X_lengths), data=data)
+
+ assert hmm.n_states == 7
+
+ assert hmm.model.startprob_.shape == (hmm.n_states,)
+ assert hmm.model.transmat_.shape == (hmm.n_states, hmm.n_states)
+
+
+def test_gaussian_mixture_fit_n_components(
+ data: SequentialDataset,
+ random_state: np.random.RandomState,
+) -> None:
+ hmm = GaussianMixtureHMM(n_components=2, random_state=random_state)
+
+ assert_fit(hmm.fit(**data.X_lengths), data=data)
+
+ assert hmm.n_components == 2
+
+ assert hmm.model.startprob_.shape == (hmm.n_states,)
+ assert hmm.model.transmat_.shape == (hmm.n_states, hmm.n_states)
+
+ n_features = data.X.shape[1]
+
+ assert hmm.model.means_.shape == (
+ hmm.n_states,
+ hmm.n_components,
+ n_features,
+ )
+ assert hmm.model.covars_.shape == (hmm.n_states, hmm.n_components)
+ assert hmm.model.weights_.shape == (hmm.n_states, hmm.n_components)
+
+
+@pytest.mark.parametrize("covariance", list(enums.CovarianceMode))
+def test_gaussian_mixture_fit_covariance(
+ data: SequentialDataset,
+ random_state: np.random.RandomState,
+ covariance: enums.CovarianceMode,
+) -> None:
+ hmm = GaussianMixtureHMM(covariance=covariance, random_state=random_state)
+
+ assert_fit(hmm.fit(**data.X_lengths), data=data)
+
+ assert hmm.covariance == covariance
+
+ assert hmm.model.startprob_.shape == (hmm.n_states,)
+ assert hmm.model.transmat_.shape == (hmm.n_states, hmm.n_states)
+
+ n_features = data.X.shape[1]
+
+ assert hmm.model.means_.shape == (
+ hmm.n_states,
+ hmm.n_components,
+ n_features,
+ )
+ assert hmm.model.weights_.shape == (hmm.n_states, hmm.n_components)
+
+ match covariance:
+ case enums.CovarianceMode.SPHERICAL:
+ assert hmm.model.covars_.shape == (hmm.n_states, hmm.n_components)
+ case enums.CovarianceMode.DIAGONAL:
+ assert hmm.model.covars_.shape == (
+ hmm.n_states,
+ hmm.n_components,
+ n_features,
+ )
+ case enums.CovarianceMode.FULL:
+ assert hmm.model.covars_.shape == (
+ hmm.n_states,
+ hmm.n_components,
+ n_features,
+ n_features,
+ )
+ case enums.CovarianceMode.TIED:
+ assert hmm.model.covars_.shape == (
+ hmm.n_states,
+ n_features,
+ n_features,
+ )
+
+
+def test_gaussian_mixture_fit_no_topology(
+ data: SequentialDataset, random_state: np.random.RandomState
+) -> None:
+ hmm = GaussianMixtureHMM(topology=None, random_state=random_state)
+
+ assert_fit(hmm.fit(**data.X_lengths), data=data)
+
+ assert hmm.topology is None
+ assert hmm.topology_ is None
+
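+    # hmmlearn parameter codes: s = startprob, t = transmat, m = means,
+    # c = covars, w = mixture weights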
+ assert set(hmm.model.init_params) == set("stmcw")
+ assert set(hmm.model.params) == set("stmcw")
+
+ assert not hasattr(hmm, "_startprob")
+ assert not hasattr(hmm, "_transmat")
+
+
+@pytest.mark.parametrize("topology", list(enums.TopologyMode), indirect=True)
+@pytest.mark.parametrize(
+ "start_probs_mode", [*list(enums.TransitionMode), None]
+) # None = custom
+def test_gaussian_mixture_fit_set_state_start_probs(
+ helpers: t.Any,
+ data: SequentialDataset,
+ random_state: np.random.RandomState,
+ topology: BaseTopology,
+ start_probs_mode: enums.TransitionMode | None,
+) -> None:
+ hmm = GaussianMixtureHMM(topology=topology.mode, random_state=random_state)
+ hmm.set_state_start_probs(
+ start_probs_mode
+ or topology(
+ n_states=hmm.n_states, random_state=random_state
+ ).random_start_probs()
+ )
+
+ assert hmm.topology == topology.mode
+ if start_probs_mode is not None:
+ assert hmm._startprob == start_probs_mode
+
+ assert_fit(hmm.fit(**data.X_lengths), data=data)
+
+ assert isinstance(hmm.topology_, topology)
+
+ assert set(hmm.model.init_params) == set("mcw")
+ assert set(hmm.model.params) == set("stmcw")
+
+    hmm.topology_.check_start_probs(
+        hmm._startprob
+    ) # initial state dist. before fit
+    hmm.topology_.check_start_probs(
+        hmm.model.startprob_
+    ) # initial state dist. after fit
+
+ if start_probs_mode == enums.TransitionMode.UNIFORM:
+ init_startprob = hmm.topology_.uniform_start_probs()
+ helpers.assert_equal(
+ hmm._startprob, init_startprob
+ ) # initial state probabilities should be uniform
+
+ helpers.assert_not_equal(
+ hmm._startprob, hmm.model.startprob_
+ ) # should update probabilities
+ # helpers.assert_equal(hmm._startprob == 0, hmm.model.startprob_ == 0) # but locations of zeros (if any) shouldn't change
+
+
+@pytest.mark.parametrize("topology", list(enums.TopologyMode), indirect=True)
+@pytest.mark.parametrize(
+ "transition_mode", [*list(enums.TransitionMode), None]
+) # None = custom
+def test_gaussian_mixture_fit_set_state_transition_probs(
+ helpers: t.Any,
+ data: SequentialDataset,
+ random_state: np.random.RandomState,
+ topology: BaseTopology,
+ transition_mode: enums.TransitionMode | None,
+) -> None:
+ hmm = GaussianMixtureHMM(topology=topology.mode, random_state=random_state)
+ hmm.set_state_transition_probs(
+ transition_mode
+ or topology(
+ n_states=hmm.n_states, random_state=random_state
+ ).random_transition_probs()
+ )
+
+ assert hmm.topology == topology.mode
+ if transition_mode is not None:
+ assert hmm._transmat == transition_mode
+
+ assert_fit(hmm.fit(**data.X_lengths), data=data)
+
+ assert isinstance(hmm.topology_, topology)
+
+ assert set(hmm.model.init_params) == set("mcw")
+ assert set(hmm.model.params) == set("stmcw")
+
+ hmm.topology_.check_transition_probs(
+ hmm._transmat
+ ) # transition matrix before fit
+ hmm.topology_.check_transition_probs(
+ hmm.model.transmat_
+ ) # transition matrix after fit
+
+ if transition_mode == enums.TransitionMode.UNIFORM:
+ init_transmat = hmm.topology_.uniform_transition_probs()
+ helpers.assert_equal(
+ hmm._transmat, init_transmat
+ ) # transition probabilities should be uniform
+
+ helpers.assert_not_equal(
+ hmm._transmat, hmm.model.transmat_
+ ) # should update probabilities
+    # helpers.assert_equal(hmm._transmat == 0, hmm.model.transmat_ == 0) # but locations of zeros (if any) shouldn't change
+
+
+@pytest.mark.parametrize("freeze_params", Helpers.combinations("stmcw"))
+def test_gaussian_mixture_fit_freeze_unfreeze(
+ helpers: t.Any,
+ data: SequentialDataset,
+ random_state: np.random.RandomState,
+ freeze_params: str,
+) -> None:
+ hmm = GaussianMixtureHMM(
+ topology=enums.TopologyMode.LINEAR,
+ n_components=2,
+ n_states=2,
+ random_state=random_state,
+ )
+ hmm.freeze(freeze_params)
+
+ assert_fit(hmm.fit(**data.X_lengths), data=data)
+
+ assert set(hmm.model.params) == set("stmcw") - set(freeze_params)
+
+ hmm.topology_.check_start_probs(
+ hmm._startprob
+ ) # initial state dist. before fit
+ hmm.topology_.check_start_probs(
+ hmm.model.startprob_
+ ) # initial state dist. after fit
+ assertion = (
+ helpers.assert_equal
+ if "s" in freeze_params
+ else helpers.assert_not_equal
+ )
+ assertion(hmm._startprob, hmm.model.startprob_)
+
+ hmm.topology_.check_transition_probs(
+ hmm._transmat
+ ) # transition matrix before fit
+ hmm.topology_.check_transition_probs(
+ hmm.model.transmat_
+ ) # transition matrix after fit
+ assertion = (
+ helpers.assert_equal
+ if "t" in freeze_params
+ else helpers.assert_not_equal
+ )
+ assertion(hmm._transmat, hmm.model.transmat_)
+
+ hmm.unfreeze(freeze_params)
+
+ assert_fit(hmm.fit(**data.X_lengths), data=data)
+
+ assert set(hmm.model.params) == set("stmcw")
+
+ helpers.assert_not_equal(hmm._startprob, hmm.model.startprob_)
+ helpers.assert_not_equal(hmm._transmat, hmm.model.transmat_)
diff --git a/tests/unit/test_models/knn/__init__.py b/tests/unit/test_models/knn/__init__.py
new file mode 100644
index 00000000..cd11e405
--- /dev/null
+++ b/tests/unit/test_models/knn/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_models/knn/test_classifier.py b/tests/unit/test_models/knn/test_classifier.py
new file mode 100644
index 00000000..ce3b0e87
--- /dev/null
+++ b/tests/unit/test_models/knn/test_classifier.py
@@ -0,0 +1,279 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+from __future__ import annotations
+
+import math
+import os
+import tempfile
+import typing as t
+
+import numpy as np
+import pytest
+from _pytest.fixtures import SubRequest
+
+from sequentia._internal import _validation
+from sequentia.datasets import SequentialDataset, load_digits
+from sequentia.models.knn import KNNClassifier
+
+from ....conftest import Helpers
+
+n_classes = 3
+
+
+@pytest.fixture(scope="module")
+def random_state(request: SubRequest) -> np.random.RandomState:
+ return np.random.RandomState(1)
+
+
+@pytest.fixture(scope="module")
+def dataset() -> SequentialDataset:
+ return load_digits(digits=range(n_classes))
+
+
+def assert_fit(clf: KNNClassifier, /, *, data: SequentialDataset) -> None:
+ assert hasattr(clf, "X_")
+ assert hasattr(clf, "y_")
+ assert hasattr(clf, "lengths_")
+ assert hasattr(clf, "idxs_")
+ assert _validation.check_is_fitted(clf, return_=True)
+ Helpers.assert_equal(clf.X_, data.X)
+ Helpers.assert_equal(clf.y_, data.y)
+ Helpers.assert_equal(clf.lengths_, data.lengths)
+
+
+@pytest.mark.parametrize("k", [1, 2, 5])
+@pytest.mark.parametrize("weighting", [None, lambda x: np.exp(-x)])
+def test_classifier_e2e(
+ helpers: t.Any,
+ request: SubRequest,
+ k: int,
+ weighting: t.Callable | None,
+ dataset: SequentialDataset,
+ random_state: np.random.RandomState,
+) -> None:
+ clf = KNNClassifier(k=k, weighting=weighting, random_state=random_state)
+
+ assert clf.k == k
+ assert clf.weighting == weighting
+
+ data = dataset.copy()
+ data._X = data._X[:, :1] # only use one feature
+ subset, _ = data.split(
+ test_size=0.98, random_state=random_state, stratify=True
+ )
+ train, test = subset.split(
+ test_size=0.2, random_state=random_state, stratify=True
+ )
+
+ assert_fit(clf.fit(**train.X_y_lengths), data=train)
+ params = clf.get_params()
+
+ scores_pred = clf.predict_scores(**test.X_lengths)
+ assert scores_pred.shape == (len(test), n_classes)
+
+ proba_pred = clf.predict_proba(**test.X_lengths)
+ assert proba_pred.shape == (len(test), n_classes)
+ helpers.assert_equal(proba_pred.sum(axis=1), 1)
+ assert ((proba_pred >= 0) & (proba_pred <= 1)).all()
+
+ y_pred = clf.predict(**test.X_lengths)
+ assert np.issubdtype(y_pred.dtype, np.integer)
+ assert y_pred.shape == (len(test),)
+ assert set(y_pred).issubset(set(range(n_classes)))
+
+ acc = clf.score(**test.X_y_lengths)
+ assert 0 <= acc <= 1
+
+ # check serialization/deserialization
+ with tempfile.TemporaryDirectory() as temp_dir:
+ model_path = f"{temp_dir}/{request.node.originalname}.model"
+ # check that save works
+ clf.save(model_path)
+ assert os.path.isfile(model_path)
+ # check that load works
+ clf = KNNClassifier.load(model_path)
+ assert (set(clf.get_params()) - {"weighting"}) == (
+ set(params) - {"weighting"}
+ )
+ # sanity check that custom weighting functions are the same
+ if weighting:
+ x = random_state.rand(100)
+ helpers.assert_equal(weighting(x), clf.weighting(x))
+ # check that loaded model is fitted and can make predictions
+ assert_fit(clf, data=train)
+ y_pred_load = clf.predict(**test.X_lengths)
+ if k == 1:
+ # predictions should be same as before
+ helpers.assert_equal(y_pred, y_pred_load)
+
+
+def test_classifier_predict_train(
+ dataset: SequentialDataset, random_state: np.random.RandomState
+) -> None:
+ """Should be able to perfectly predict training data with k=1"""
+ clf = KNNClassifier(k=1, random_state=random_state)
+
+ data = dataset.copy()
+ data._X = data._X[:, :1] # only use one feature
+ train, _ = data.split(
+ train_size=0.05, random_state=random_state, stratify=True
+ )
+
+ assert_fit(clf.fit(**train.X_y_lengths), data=train)
+ assert math.isclose(clf.score(**train.X_y_lengths), 1.0)
+
+
+@pytest.mark.parametrize("classes", [[0, 1, 2], [2, 0, 1]])
+def test_classifier_compute_scores(
+ helpers: t.Any, classes: list[int], random_state: np.random.RandomState
+) -> None:
+ clf = KNNClassifier(k=5)
+ clf.random_state_ = random_state
+ clf.classes_ = np.array(classes)
+
+ labels = np.array(
+ [[0, 2, 1, 2, 2], [0, 1, 1, 2, 0], [1, 0, 0, 1, 2], [0, 0, 0, 1, 1]]
+ )
+ weightings = np.ones_like(labels)
+
+ scores = clf._compute_scores(labels, weightings)
+ if np.allclose(classes, [0, 1, 2]):
+ helpers.assert_equal(
+ scores, [[1, 1, 3], [2, 2, 1], [2, 2, 1], [3, 2, 0]]
+ )
+ elif np.allclose(classes, [2, 0, 1]):
+ helpers.assert_equal(
+ scores, [[3, 1, 1], [1, 2, 2], [1, 2, 2], [0, 3, 2]]
+ )
+
+
+@pytest.mark.parametrize("classes", [[0, 1, 2], [2, 0, 1]])
+def test_classifier_find_max_labels_chunk(
+ classes: list[int], random_state: np.random.RandomState
+) -> None:
+ clf = KNNClassifier()
+ clf.random_state_ = random_state
+ clf.classes_ = np.array(classes)
+
+ score_chunk = np.array(
+ [[10, 20, 20], [10, 30, 20], [10, 10, 10], [10, 10, 20]]
+ )
+
+ max_labels = clf._find_max_labels_chunk(score_chunk)
+ if np.allclose(classes, [0, 1, 2]):
+ assert max_labels[0] in (1, 2)
+ assert max_labels[1] == 1
+ assert max_labels[2] in (0, 1, 2)
+ assert max_labels[3] == 2
+ elif np.allclose(classes, [2, 0, 1]):
+ assert max_labels[0] in (0, 1)
+ assert max_labels[1] == 0
+ assert max_labels[2] in (0, 1, 2)
+ assert max_labels[3] == 1
+
+
+@pytest.mark.parametrize("tie", [True, False])
+def test_classifier_multi_argmax(helpers: t.Any, *, tie: bool) -> None:
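+    # _multi_argmax should return the indices of all elements tying for
+    # the maximum value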
+ if tie:
+ arr = np.array([3, 2, 4, 1, 3, 4, 4, 0, 2, 4])
+ helpers.assert_equal(
+ KNNClassifier._multi_argmax(arr), np.array([2, 5, 6, 9])
+ )
+ else:
+ arr = np.array([3, 2, 1, 1, 3, 4, 1, 0, 2, 0])
+ helpers.assert_equal(KNNClassifier._multi_argmax(arr), np.array([5]))
+
+
+@pytest.mark.parametrize("k", [1, 2, 5])
+@pytest.mark.parametrize("sort", [True, False])
+def test_classifier_query_neighbors(
+ helpers: t.Any,
+ k: int,
+ dataset: SequentialDataset,
+ random_state: np.random.RandomState,
+ *,
+ sort: bool,
+) -> None:
+ clf = KNNClassifier(k=k, random_state=random_state)
+
+ data = dataset.copy()
+ data._X = data._X[:, :1] # only use one feature
+ subset, _ = data.split(
+ test_size=0.98, random_state=random_state, stratify=True
+ )
+ train, test = subset.split(
+ test_size=0.2, random_state=random_state, stratify=True
+ )
+
+ assert_fit(clf.fit(**train.X_y_lengths), data=train)
+
+ k_idxs, k_distances, k_labels = clf.query_neighbors(
+ **test.X_lengths, sort=sort
+ )
+
+ # check that indices are between 0 and len(train)
+ assert np.issubdtype(k_idxs.dtype, np.integer)
+ assert k_idxs.shape == (len(test), clf.k)
+ assert set(k_idxs.flatten()).issubset(set(np.arange(len(train))))
+
+ # check that distances are sorted if sort=True
+    assert np.issubdtype(k_distances.dtype, np.floating)
+ assert k_distances.shape == (len(test), clf.k)
+ if sort and k > 1:
+ assert (k_distances[:, 1:] >= k_distances[:, :-1]).all()
+
+ # check that labels are a subset of training labels + check that labels match indices
+ assert np.issubdtype(k_labels.dtype, np.integer)
+ assert k_labels.shape == (len(test), clf.k)
+ assert set(k_labels.flatten()).issubset(set(train.y))
+ helpers.assert_equal(train.y[k_idxs], k_labels)
+
+
+def test_classifier_compute_distance_matrix(
+ dataset: SequentialDataset, random_state: np.random.RandomState
+) -> None:
+ clf = KNNClassifier()
+
+ data = dataset.copy()
+ data._X = data._X[:, :1] # only use one feature
+ subset, _ = data.split(
+ test_size=0.98, random_state=random_state, stratify=True
+ )
+ train, test = subset.split(
+ test_size=0.2, random_state=random_state, stratify=True
+ )
+
+ assert_fit(clf.fit(**train.X_y_lengths), data=train)
+
+ distances = clf.compute_distance_matrix(**test.X_lengths)
+ assert distances.shape == (len(test), len(train))
+
+
+def test_classifier_distance_matrix_row_col_chunk(helpers: t.Any) -> None:
+ clf = KNNClassifier()
+
+ clf.X_ = np.expand_dims(np.arange(7), axis=-1)
+ col_idxs = np.array([[0, 1], [1, 3], [4, 7]]) # lengths = 1, 2, 3
+
+ X = np.expand_dims(np.arange(14), axis=-1)
+ row_idxs = np.array(
+ [[0, 2], [2, 5], [5, 9], [9, 14]]
+ ) # lengths = 2, 3, 4, 5
+
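+    # with distance len(x1) - len(x2), entry (i, j) should equal the
+    # i-th row length minus the j-th column length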
+ distances = clf._distance_matrix_row_col_chunk(
+ col_idxs, row_idxs, X, lambda x1, x2: len(x1) - len(x2)
+ )
+ helpers.assert_equal(
+ distances,
+ np.array(
+ [
+ [1, 0, -1],
+ [2, 1, 0],
+ [3, 2, 1],
+ [4, 3, 2],
+ ]
+ ),
+ )
diff --git a/tests/unit/test_models/knn/test_regressor.py b/tests/unit/test_models/knn/test_regressor.py
new file mode 100644
index 00000000..715cd16e
--- /dev/null
+++ b/tests/unit/test_models/knn/test_regressor.py
@@ -0,0 +1,193 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+from __future__ import annotations
+
+import math
+import os
+import tempfile
+import typing as t
+from unittest.mock import Mock
+
+import numpy as np
+import pytest
+from _pytest.fixtures import SubRequest
+
+from sequentia._internal import _validation
+from sequentia.datasets import SequentialDataset, load_digits
+from sequentia.models.knn import KNNRegressor
+
+from ....conftest import Helpers
+
+n_classes = 3
+
+
+@pytest.fixture(scope="module")
+def random_state(request: SubRequest) -> np.random.RandomState:
+ return np.random.RandomState(1)
+
+
+@pytest.fixture(scope="module")
+def dataset() -> SequentialDataset:
+ return load_digits(digits=range(n_classes))
+
+
+def assert_fit(reg: KNNRegressor, /, *, data: SequentialDataset) -> None:
+ assert hasattr(reg, "X_")
+ assert hasattr(reg, "y_")
+ assert hasattr(reg, "lengths_")
+ assert hasattr(reg, "idxs_")
+ assert _validation.check_is_fitted(reg, return_=True)
+ Helpers.assert_equal(reg.X_, data.X)
+ Helpers.assert_equal(reg.y_, data.y)
+ Helpers.assert_equal(reg.lengths_, data.lengths)
+
+
+@pytest.mark.parametrize("k", [1, 2, 5])
+@pytest.mark.parametrize("weighting", [None, lambda x: np.exp(-x)])
+def test_regressor_e2e(
+ request: SubRequest,
+ helpers: t.Any,
+ k: int,
+ weighting: t.Callable | None,
+ dataset: SequentialDataset,
+ random_state: np.random.RandomState,
+) -> None:
+ reg = KNNRegressor(k=k, weighting=weighting, random_state=random_state)
+
+ assert reg.k == k
+ assert reg.weighting == weighting
+
+ data = dataset.copy()
+ data._X = data._X[:, :1] # only use one feature
+ subset, _ = data.split(
+ test_size=0.98, random_state=random_state, stratify=True
+ )
+ train, test = subset.split(
+ test_size=0.2, random_state=random_state, stratify=True
+ )
+
+ assert_fit(reg.fit(**train.X_y_lengths), data=train)
+ params = reg.get_params()
+
+ y_pred = reg.predict(**test.X_lengths)
+ assert np.issubdtype(y_pred.dtype, np.floating)
+ assert y_pred.shape == (len(test),)
+
+ reg.score(**test.X_y_lengths)
+
+ # check serialization/deserialization
+ with tempfile.TemporaryDirectory() as temp_dir:
+ model_path = f"{temp_dir}/{request.node.originalname}.model"
+ # check that save works
+ reg.save(model_path)
+ assert os.path.isfile(model_path)
+ # check that load works
+ reg = KNNRegressor.load(model_path)
+ assert (set(reg.get_params()) - {"weighting"}) == (
+ set(params) - {"weighting"}
+ )
+ # sanity check that custom weighting functions are the same
+ if weighting:
+ x = random_state.rand(100)
+ helpers.assert_equal(weighting(x), reg.weighting(x))
+ # check that loaded model is fitted and can make predictions
+ assert_fit(reg, data=train)
+ y_pred_load = reg.predict(**test.X_lengths)
+ if k == 1:
+ # predictions should be same as before
+ helpers.assert_equal(y_pred, y_pred_load)
+
+
+def test_regressor_predict_train(
+ dataset: SequentialDataset, random_state: np.random.RandomState
+) -> None:
+ """Should be able to perfectly predict training data with k=1"""
+    reg = KNNRegressor(k=1, random_state=random_state)
+
+ data = dataset.copy()
+ data._X = data._X[:, :1] # only use one feature
+ train, _ = data.split(
+ train_size=0.05, random_state=random_state, stratify=True
+ )
+
+    assert_fit(reg.fit(**train.X_y_lengths), data=train)
+    assert math.isclose(reg.score(**train.X_y_lengths), 1.0)
+
+
+def test_regressor_weighting(
+ helpers: t.Any, random_state: np.random.RandomState
+) -> None:
+    reg = KNNRegressor(k=3, weighting=lambda x: np.where(x > 10, 0.5, 1))
+    reg.random_state_ = random_state
+
+    reg.query_neighbors = Mock(
+ return_value=(
+ None,
+ np.array(
+ [
+ [1.5, 2, 1],
+ [2.5, 1, 0.5],
+ ]
+ ),
+ np.array([[10.2, 11.5, 10.4], [8.0, 6.5, 5.5]]),
+ )
+ )
+
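+    # the weights are constant within each row, so they cancel and each
+    # prediction reduces to the plain mean of its neighbours' outputs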
+    helpers.assert_equal(
+        reg.predict(None, lengths=None),
+        np.array(
+            [
+                (10.2 * 0.5 + 11.5 * 0.5 + 10.4 * 0.5) / (0.5 * reg.k),
+                (8.0 * 1 + 6.5 * 1 + 5.5 * 1) / (1 * reg.k),
+            ]
+        ),
+    )
+
+
+@pytest.mark.parametrize("k", [1, 2, 5])
+@pytest.mark.parametrize("sort", [True, False])
+def test_regressor_query_neighbors(
+ helpers: t.Any,
+ k: int,
+ dataset: SequentialDataset,
+ random_state: np.random.RandomState,
+ *,
+ sort: bool,
+) -> None:
+    reg = KNNRegressor(k=k, random_state=random_state)
+
+ data = dataset.copy()
+ data._X = data._X[:, :1] # only use one feature
+
+ subset, _ = data.split(
+ test_size=0.98, random_state=random_state, stratify=True
+ )
+ train, test = subset.split(
+ test_size=0.2, random_state=random_state, stratify=True
+ )
+
+    reg.fit(**train.X_y_lengths)
+
+    k_idxs, k_distances, k_outputs = reg.query_neighbors(
+ **test.X_lengths, sort=sort
+ )
+
+ # check that indices are between 0 and len(train)
+ assert np.issubdtype(k_idxs.dtype, np.integer)
+    assert k_idxs.shape == (len(test), reg.k)
+ assert set(k_idxs.flatten()).issubset(set(np.arange(len(train))))
+
+ # check that distances are sorted if sort=True
+    assert np.issubdtype(k_distances.dtype, np.floating)
+    assert k_distances.shape == (len(test), reg.k)
+ if sort and k > 1:
+ assert (k_distances[:, 1:] >= k_distances[:, :-1]).all()
+
+    # check that outputs are a subset of the training outputs + check that outputs match indices
+ assert np.issubdtype(k_outputs.dtype, np.floating)
+    assert k_outputs.shape == (len(test), reg.k)
+ assert set(k_outputs.flatten()).issubset(set(train.y))
+ helpers.assert_equal(train.y[k_idxs], k_outputs)
diff --git a/tests/unit/test_pipeline.py b/tests/unit/test_pipeline.py
new file mode 100644
index 00000000..dfdbff04
--- /dev/null
+++ b/tests/unit/test_pipeline.py
@@ -0,0 +1,238 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+import typing as t
+
+import numpy as np
+import pytest
+from _pytest.fixtures import SubRequest
+from sklearn.decomposition import PCA
+from sklearn.exceptions import NotFittedError
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import scale
+from sklearn.utils._param_validation import InvalidParameterError
+from sklearn.utils.validation import check_is_fitted
+
+from sequentia.datasets import SequentialDataset, load_digits
+from sequentia.models import KNNClassifier
+from sequentia.preprocessing import IndependentFunctionTransformer
+
+
+@pytest.fixture(scope="module")
+def random_state(request: SubRequest) -> np.random.RandomState:
+ return np.random.RandomState(0)
+
+
+@pytest.fixture(scope="module")
+def data(random_state: np.random.RandomState) -> SequentialDataset:
+ data_ = load_digits(digits=[0])
+ _, subset = data_.split(
+ test_size=0.05, random_state=random_state, stratify=True
+ )
+ return subset
+
+
+def test_pipeline_with_transforms(
+ helpers: t.Any, data: SequentialDataset
+) -> None:
+ # create pipeline with a stateless and stateful transform
+ pipeline = Pipeline(
+ [
+ (
+ "scale",
+ IndependentFunctionTransformer(
+ scale, inverse_func=lambda x: x, check_inverse=False
+ ),
+ ),
+ ("pca", PCA(n_components=1)),
+ ]
+ )
+
+ # check that transforming without fitting doesn't work
+ with pytest.raises(NotFittedError):
+ pipeline.transform(**data.X_lengths)
+
+ # check that fitting without y works
+ check_is_fitted(pipeline.fit(**data.X_lengths))
+ for estimator in pipeline.named_steps.values():
+ check_is_fitted(estimator)
+
+ # check that fitting with y works
+ check_is_fitted(pipeline.fit(**data.X_y_lengths))
+ for estimator in pipeline.named_steps.values():
+ check_is_fitted(estimator)
+
+ # check that transforming after fit works
+ Xt = pipeline.transform(**data.X_lengths)
+ helpers.assert_not_equal(data.X, Xt)
+ assert Xt.shape == (len(data.X), 1)
+
+ # check that fit_transform works
+ Xt = pipeline.fit_transform(**data.X_lengths)
+ helpers.assert_not_equal(data.X, Xt)
+ assert Xt.shape == (len(data.X), 1)
+
+ # check that inverse_transform works
+ Xi = pipeline.inverse_transform(Xt, lengths=data.lengths)
+ helpers.assert_not_equal(Xt, Xi)
+
+ # check that prediction functions relying on X and lengths don't work
+ for func in ("predict", "predict_proba"):
+ with pytest.raises(AttributeError):
+ getattr(pipeline, func)(**data.X_lengths)
+
+ # check that fit_predict doesn't work
+ with pytest.raises(AttributeError):
+ pipeline.fit_predict(**data.X_y_lengths)
+
+ # TODO @eonu: currently broken for some reason
+ # check that score works if the final transform implements it, with y
+ # pipeline.score(data.X, data.y, lengths=data.lengths)
+ # pipeline.score(**data.X_y_lengths)
+
+ # TODO @eonu: currently broken for some reason
+ # check that score works if the final transform implements it, without y
+ # pipeline.score(**data.X_lengths)
+
+
+def test_pipeline_with_estimator(data: SequentialDataset) -> None:
+ pipeline = Pipeline(
+ [
+ ("knn", KNNClassifier(k=1)),
+ ]
+ )
+
+ # check that transforming doesn't work
+ with pytest.raises(AttributeError):
+ pipeline.transform(**data.X_lengths)
+
+ # check that fitting without y doesn't work
+ with pytest.raises(InvalidParameterError):
+ pipeline.fit(**data.X_lengths)
+
+ # check that fitting with y works
+ check_is_fitted(pipeline.fit(**data.X_y_lengths))
+ for estimator in pipeline.named_steps.values():
+ check_is_fitted(estimator)
+
+ # check that transforming doesn't work
+ with pytest.raises(AttributeError):
+ pipeline.transform(**data.X_lengths)
+
+ # check that fit_transform doesn't work
+ with pytest.raises(AttributeError):
+ pipeline.fit_transform(**data.X_lengths)
+
+ # check that inverse_transform doesn't work
+ with pytest.raises(AttributeError):
+ pipeline.inverse_transform(**data.X_lengths)
+
+ # check that predict works
+ y_pred = pipeline.predict(**data.X_lengths)
+ assert y_pred.shape == data.y.shape
+ assert set(y_pred) == set(data.classes)
+
+ # check that predict_log_proba works
+ log_proba_pred = pipeline.predict_log_proba(**data.X_lengths)
+ assert log_proba_pred.shape == (len(data), len(data.classes))
+
+ # check that predict_proba works
+ proba_pred = pipeline.predict_proba(**data.X_lengths)
+ assert proba_pred.shape == (len(data), len(data.classes))
+
+ # check that fit_predict works
+ y_pred = pipeline.fit_predict(**data.X_y_lengths)
+ # check that all steps are fitted
+ check_is_fitted(pipeline.fit(**data.X_y_lengths))
+ for estimator in pipeline.named_steps.values():
+ check_is_fitted(estimator)
+ # check that predictions are valid
+ assert y_pred.shape == data.y.shape
+ assert set(y_pred) == set(data.classes)
+
+ # check that score with y works
+ pipeline.score(**data.X_y_lengths)
+
+ # check that score without y doesn't work
+ with pytest.raises(InvalidParameterError):
+ pipeline.score(**data.X_lengths)
+
+
+def test_pipeline_with_transforms_and_estimator(
+ helpers: t.Any, data: SequentialDataset
+) -> None:
+ pipeline = Pipeline(
+ [
+ (
+ "scale",
+ IndependentFunctionTransformer(
+ scale, inverse_func=lambda x: x, check_inverse=False
+ ),
+ ),
+ ("pca", PCA(n_components=1)),
+ ("knn", KNNClassifier(k=1)),
+ ]
+ )
+
+ # check that transforming doesn't work
+ with pytest.raises(AttributeError):
+ pipeline.transform(**data.X_lengths)
+
+ # check that fitting without y doesn't work
+ with pytest.raises(InvalidParameterError):
+ pipeline.fit(**data.X_lengths)
+
+ # check that fitting with y works
+ check_is_fitted(pipeline.fit(**data.X_y_lengths))
+ for estimator in pipeline.named_steps.values():
+ check_is_fitted(estimator)
+ # check that X values were transformed
+ helpers.assert_not_equal(data.X, pipeline[-1].X_)
+ assert pipeline[-1].X_.shape == (len(data.X), 1)
+
+ # check that transforming doesn't work
+ with pytest.raises(AttributeError):
+ pipeline.transform(**data.X_lengths)
+
+ # check that fit_transform doesn't work
+ with pytest.raises(AttributeError):
+ pipeline.fit_transform(**data.X_lengths)
+
+ # check that inverse_transform doesn't work
+ with pytest.raises(AttributeError):
+ pipeline.inverse_transform(**data.X_lengths)
+
+ # check that predict works
+ y_pred = pipeline.predict(**data.X_lengths)
+ assert y_pred.shape == data.y.shape
+ assert set(y_pred) == set(data.classes)
+
+ # check that predict_log_proba works
+ log_proba_pred = pipeline.predict_log_proba(**data.X_lengths)
+ assert log_proba_pred.shape == (len(data), len(data.classes))
+
+ # check that predict_proba works
+ proba_pred = pipeline.predict_proba(**data.X_lengths)
+ assert proba_pred.shape == (len(data), len(data.classes))
+
+ # check that fit_predict works
+ y_pred = pipeline.fit_predict(**data.X_y_lengths)
+ # check that all steps are fitted
+ check_is_fitted(pipeline.fit(**data.X_y_lengths))
+ for estimator in pipeline.named_steps.values():
+ check_is_fitted(estimator)
+ # check that predictions are valid
+ assert y_pred.shape == data.y.shape
+ assert set(y_pred) == set(data.classes)
+ # check that X values were transformed
+ helpers.assert_not_equal(data.X, pipeline[-1].X_)
+ assert pipeline[-1].X_.shape == (len(data.X), 1)
+
+ # check that score with y works
+ pipeline.score(**data.X_y_lengths)
+
+ # check that score without y doesn't work
+ with pytest.raises(InvalidParameterError):
+ pipeline.score(**data.X_lengths)
diff --git a/tests/unit/test_preprocessing/__init__.py b/tests/unit/test_preprocessing/__init__.py
new file mode 100644
index 00000000..cd11e405
--- /dev/null
+++ b/tests/unit/test_preprocessing/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_preprocessing/test_transforms.py b/tests/unit/test_preprocessing/test_transforms.py
new file mode 100644
index 00000000..229ad05c
--- /dev/null
+++ b/tests/unit/test_preprocessing/test_transforms.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2019-2025 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+import typing as t
+
+import numpy as np
+import pytest
+from _pytest.fixtures import SubRequest
+from sklearn.preprocessing import minmax_scale
+
+from sequentia._internal import _data
+from sequentia._internal._typing import Array
+from sequentia.datasets import SequentialDataset, load_digits
+from sequentia.preprocessing import transforms
+
+from ...conftest import Helpers
+
+
+@pytest.fixture(scope="module")
+def random_state(request: SubRequest) -> np.random.RandomState:
+ return np.random.RandomState(1)
+
+
+@pytest.fixture(scope="module")
+def data(random_state: np.random.RandomState) -> SequentialDataset:
+ data_ = load_digits(digits=[0])
+ _, subset = data_.split(
+ test_size=0.2, random_state=random_state, stratify=True
+ )
+ return subset
+
+
+def check_filter(x: Array, xt: Array, func: t.Callable, k: int) -> None:
+    """Check that filtering preserves the sequence length and that the
+    output at the centre of the first window equals ``func`` applied
+    over the first ``k`` frames. NOTE: only works for odd k.
+    """
+    assert len(x) == len(xt)
+    Helpers.assert_equal(xt[k // 2], func(x[:k], axis=0))
+
+
+def test_function_transformer(helpers: t.Any, data: SequentialDataset) -> None:
+ # create the transform
+ transform = transforms.IndependentFunctionTransformer(minmax_scale)
+ # check that fit works - should do nothing
+ transform.fit(**data.X_lengths)
+ # check that fit_transform works - shouldn't do anything on fit, but should transform
+ X_fit_transform = transform.fit_transform(**data.X_lengths)
+ # check that transform works
+ X_transform = transform.transform(**data.X_lengths)
+ # check that fit_transform and transform produce the same transformed data
+ helpers.assert_equal(X_fit_transform, X_transform)
+ # check that features of each sequence are independently scaled to [0, 1]
+ for xt in _data.iter_X(X_transform, idxs=data.idxs):
+ helpers.assert_equal(xt.min(axis=0), np.zeros(xt.shape[1]))
+ helpers.assert_equal(xt.max(axis=0), np.ones(xt.shape[1]))
+
+
+@pytest.mark.parametrize("avg", ["mean", "median"])
+@pytest.mark.parametrize("k", [3, 5])
+def test_filters(
+ data: SequentialDataset,
+ random_state: np.random.RandomState,
+ avg: t.Literal["mean", "median"],
+ k: int,
+) -> None:
+ filter_ = getattr(transforms, f"{avg}_filter")
+
+    def check_filter_(x: Array, xt: Array) -> None:
+        check_filter(x, xt, getattr(np, avg), k)
+
+ # check that filters are correctly applied for a single sequence
+ n_features = 2
+ x = random_state.rand(10 * n_features).reshape(-1, n_features)
+ xt = filter_(x, k=k)
+ check_filter_(x, xt)
+
+ # create a transform using the filter, passing k
+ transform = transforms.IndependentFunctionTransformer(
+ filter_, kw_args={"k": k}
+ )
+ Xt = transform.transform(**data.X_lengths)
+
+ # check that filters are correctly applied for multiple sequences
+ idxs = _data.get_idxs(data.lengths)
+ for x, xt in zip(
+ *map(lambda X: _data.iter_X(X, idxs=idxs), (data.X, Xt)) # noqa: C417
+ ):
+ check_filter_(x, xt)
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 00000000..41f75ec6
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,18 @@
+[tox]
+isolated_build = True
+
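+# Each environment delegates to a poetry-managed invoke task;
+# {posargs} are forwarded to the task.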
+[testenv:tests]
+allowlist_externals = poetry
+commands = poetry run invoke tests.install tests.unit {posargs}
+
+[testenv:docs]
+allowlist_externals = poetry
+commands = poetry run invoke docs.install docs.build {posargs}
+
+[testenv:lint]
+allowlist_externals = poetry
+commands = poetry run invoke lint.install lint.check
+
+[testenv:format]
+allowlist_externals = poetry
+commands = poetry run invoke lint.install lint.format