Skip to content

Commit

Permalink
Installation through PyPI, set default --min-kmers to 20
Browse files Browse the repository at this point in the history
Co-authored-by: aziele <[email protected]>
  • Loading branch information
agudys and aziele authored Oct 22, 2024
1 parent 928b415 commit e60a35a
Show file tree
Hide file tree
Showing 8 changed files with 170 additions and 36 deletions.
2 changes: 1 addition & 1 deletion 3rd_party/clusty
2 changes: 1 addition & 1 deletion 3rd_party/lz-ani
Submodule lz-ani updated 3 files
+3 −0 README.md
+3 −2 src/defs.h
+9 −0 src/lz-ani.cpp
2 changes: 1 addition & 1 deletion 3rd_party/ref-utils
4 changes: 4 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
include README.md
include LICENSE
recursive-include bin *
recursive-exclude 3rd_party *
20 changes: 11 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
# <img src="./images/logo.svg" alt="Vclust logo" /> Vclust

![version](https://img.shields.io/badge/version-1.2.7-blue.svg)
[![GitHub downloads](https://img.shields.io/github/downloads/refresh-bio/vclust/total.svg?style=flag&label=GitHub%20downloads)](https://github.com/refresh-bio/vclust/releases)
[![Bioconda downloads](https://img.shields.io/conda/dn/bioconda/vclust.svg?style=flag&label=Bioconda%20downloads)](https://anaconda.org/bioconda/vclust)
![version](https://img.shields.io/badge/version-1.2.8-blue.svg)
![PyPI - Version](https://img.shields.io/pypi/v/vclust?label=PyPI%20version&color=blue)
[![Build and tests](../../workflows/Build%20and%20tests/badge.svg)](../../actions/workflows/main.yml)
[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)

![PyPI - Downloads](https://img.shields.io/pypi/dm/vclust?label=PyPI%20downloads)
[![GitHub downloads](https://img.shields.io/github/downloads/refresh-bio/vclust/total.svg?style=flag&label=GitHub%20downloads)](https://github.com/refresh-bio/vclust/releases)
[![Bioconda downloads](https://img.shields.io/conda/dn/bioconda/vclust.svg?style=flag&label=Bioconda%20downloads)](https://anaconda.org/bioconda/vclust)

![x86-64](https://img.shields.io/static/v1?label=%E2%80%8B&message=x86-64&color=yellow&logo=PCGamingWiki&logoColor=white)
![ARM](https://img.shields.io/static/v1?label=%E2%80%8B&message=ARM&color=yellow&logo=Raspberry%20Pi&logoColor=white)
![Apple M](https://img.shields.io/static/v1?label=%E2%80%8B&message=Apple%20M&color=yellow&logo=Apple&logoColor=white)
Expand Down Expand Up @@ -51,18 +54,17 @@ For datasets containing up to 1000 viral genomes, Vclust is available at [http:/
## Quick start

```bash
# Clone repository and build Vclust
git clone --recurse-submodules https://github.com/refresh-bio/vclust
cd vclust && make -j
# Install Vclust (requires Python >= 3.7)
pip install vclust

# Prefilter similar genome sequence pairs before conducting pairwise alignments.
./vclust.py prefilter -i example/multifasta.fna -o fltr.txt
vclust prefilter -i example/multifasta.fna -o fltr.txt

# Align similar genome sequence pairs and calculate pairwise ANI measures.
./vclust.py align -i example/multifasta.fna -o ani.tsv --filter fltr.txt
vclust align -i example/multifasta.fna -o ani.tsv --filter fltr.txt

# Cluster genome sequences based on given ANI measure and minimum threshold.
./vclust.py cluster -i ani.tsv -o clusters.tsv --ids ani.ids.tsv --metric ani --ani 0.95
vclust cluster -i ani.tsv -o clusters.tsv --ids ani.ids.tsv --metric ani --ani 0.95
```
## Documentation

Expand Down
56 changes: 56 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
[build-system]
requires = ["setuptools>=61.0.0", "wheel"]
build-backend = "setuptools.build_meta"

[tool.setuptools]
py-modules = ["vclust"]

[tool.setuptools.packages.find]
where = ["./"]

[project]
# Distribution name on PyPI. It must match the `pip install vclust` command
# and the PyPI badges shown in README.md.
name = "vclust"
description = """Fast and accurate tool for calculating \
Average Nucleotide Identity (ANI) and clustering virus \
genomes and metagenomic contigs"""
readme = "README.md"
license = { file = "LICENSE" }
authors = [
    { name = "Andrzej Zielezinski", email = "[email protected]" },
    { name = "Adam Gudyś", email = "[email protected]" },
    { name = "Sebastian Deorowicz", email = "[email protected]" },
]
requires-python = ">=3.7"
# The version is not hard-coded here; it is resolved by setuptools from the
# attribute configured under [tool.setuptools.dynamic].
dynamic = ["version"]
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Natural Language :: English",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering",
    "Topic :: Scientific/Engineering :: Bio-Informatics",
    "Operating System :: POSIX :: Linux",
    "Operating System :: MacOS",
    # Kept in agreement with the GPL v3 license badge in README.md.
    "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.7",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
]

[tool.setuptools.dynamic]
version = { attr = "vclust.__version__" }

[tool.setuptools.package-data]
"*" = ["bin/*"]

[project.scripts]
vclust = "vclust:main"

[project.urls]
Homepage = "https://github.com/refresh-bio/vclust"
Documentation = "https://github.com/refresh-bio/vclust/wiki"
Website = "http://vclust.org"
118 changes: 95 additions & 23 deletions vclust.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@
import multiprocessing
import os
import pathlib
import platform
import shutil
import subprocess
import sys
import typing
import uuid

__version__ = '1.2.7'
__version__ = '1.2.8'

DEFAULT_THREAD_COUNT = min(multiprocessing.cpu_count(), 64)

Expand Down Expand Up @@ -59,7 +60,7 @@ def ranged_float_type(value):
return f

parser = argparse.ArgumentParser(
description=f'%(prog)s v.{__version__}: calculate ANI and cluster '
description=f'%(prog)s v{__version__}: calculate ANI and cluster '
'virus (meta)genome sequences',
add_help=False,
)
Expand Down Expand Up @@ -117,7 +118,7 @@ def ranged_float_type(value):
'--min-kmers',
metavar="<int>",
type=int,
default=10,
default=20,
help='Filter genome pairs based on minimum number of shared k-mers '
'[%(default)s]'
)
Expand Down Expand Up @@ -531,7 +532,7 @@ def ranged_float_type(value):
'--bin',
metavar='<file>',
type=pathlib.Path,
dest="BIN_CLUSTY",
dest="bin_clusty",
default=f'{BIN_CLUSTY}',
help='Path to the Clusty binary [%(default)s]'
)
Expand Down Expand Up @@ -603,8 +604,8 @@ def get_uuid() -> str:
return f'vclust-{str(uuid.uuid4().hex)[:10]}'


def validate_binary(bin_path: pathlib.Path) -> pathlib.Path:
"""Validates the existence and executability of a binary file.
def _validate_binary(bin_path: pathlib.Path) -> pathlib.Path:
"""Validates the presence and executability of a binary file.
This function checks if the provided path points to an existing binary file
and if it is executable. It also attempts to run the binary to ensure it
Expand All @@ -618,16 +619,16 @@ def validate_binary(bin_path: pathlib.Path) -> pathlib.Path:
pathlib.Path: The resolved path to the binary file.
Raises:
SystemExit: If the binary file does not exist, is not executable, or
if running the binary encounters an error.
RuntimeError: If the binary file does not exist, is not executable,
or if running the binary encounters an error.
"""
bin_path = bin_path.resolve()

if not bin_path.exists():
exit(f'error: Executable not found: {bin_path}')
raise RuntimeError(f'File not found: {bin_path}')

if not bin_path.is_file() or not os.access(bin_path, os.X_OK):
exit(f'error: Binary file not executable: {bin_path}')
raise RuntimeError(f'Binary file not executable: {bin_path}')

try:
subprocess.run(
Expand All @@ -638,14 +639,21 @@ def validate_binary(bin_path: pathlib.Path) -> pathlib.Path:
check=True
)
except subprocess.CalledProcessError as e:
exit(f'error: Running {bin_path} failed with message: {e.stderr}')
raise RuntimeError(f'Running {bin_path} failed with message: {e.stderr}')
except OSError as e:
exit(f'error: OSError in {bin_path} - {e}')
raise RuntimeError(f'OSError in {bin_path} - {e}')
except Exception as e:
exit(f'error: Unexpected error in binary {bin_path} - {e}')
raise RuntimeError(f'Unexpected error in binary {bin_path} - {e}')
return bin_path


def validate_binary(bin_path: pathlib.Path) -> pathlib.Path:
    """Validates a binary file, terminating the program on failure.

    Delegates the actual checks to `_validate_binary` and converts the
    `RuntimeError` it raises into a process exit with an `error: ...` message.

    Args:
        bin_path (pathlib.Path): The path to the binary file.

    Returns:
        pathlib.Path: The value returned by `_validate_binary`.

    Raises:
        SystemExit: If `_validate_binary` raises a RuntimeError.
    """
    try:
        checked_path = _validate_binary(bin_path)
    except RuntimeError as err:
        sys.exit(f'error: {err}')
    else:
        return checked_path


def validate_args_fasta_input(args, parser) -> argparse.Namespace:
"""Validates the arguments for FASTA input."""
args.is_multifasta = True
Expand Down Expand Up @@ -732,13 +740,13 @@ def run(
)
except subprocess.CalledProcessError as e:
logger.error(f'Process {" ".join(cmd)} failed with message: {e.stderr}')
exit(1)
sys.exit(1)
except OSError as e:
logger.error(f'OSError: {" ".join(cmd)} failed with message: {e}')
exit(1)
sys.exit(1)
except Exception as e:
logger.error(f'Unexpected: {" ".join(cmd)} failed with message: {e}')
exit(1)
sys.exit(1)
logger.info(f'Done')
return process

Expand Down Expand Up @@ -1145,11 +1153,75 @@ def cmd_clusty(
return cmd


def vclust_info():
print(f'Vclust {__version__}')
for bin_path in [BIN_KMERDB, BIN_FASTASPLIT, BIN_LZANI, BIN_CLUSTY]:
validate_binary(bin_path)
print(f'{bin_path.name:<20} ok')
def vclust_info() -> None:
    """Displays the Vclust version, installation paths, and dependencies.

    Each required binary is checked with `_validate_binary` and then queried
    for its version string. The results for all binaries are collected and
    printed as one report, so a single failing binary does not hide the
    status of the remaining ones.

    Returns:
        None

    Raises:
        SystemExit: With exit status 1 if validating any binary dependency,
            or querying its version, fails.
    """
    # ANSI color codes for terminal output. They are only emitted when stdout
    # is an interactive terminal, so that piped or redirected output does not
    # contain raw escape sequences.
    use_color = sys.stdout.isatty()
    GREEN = '\033[92m' if use_color else ''
    RED = '\033[91m' if use_color else ''
    RESET = '\033[0m' if use_color else ''

    binaries = {
        'Kmer-db': BIN_KMERDB,
        'LZ-ANI': BIN_LZANI,
        'Clusty': BIN_CLUSTY,
        'multi-fasta-split': BIN_FASTASPLIT,
    }

    output_lines = [
        f'Vclust version {__version__} (Python {platform.python_version()})',
        '',
        'Installed at:',
        f' {pathlib.Path(__file__).resolve()}',
        f' {BIN_DIR.resolve()}',
        '',
        'Binary dependencies:',
    ]

    errors = []  # (name, exception) pairs collected during binary checks.

    # Check each binary's presence and version.
    for name, path in binaries.items():
        # Kmer-db is queried with a single-dash flag; the others use
        # the double-dash form.
        version_flag = '-version' if name == 'Kmer-db' else '--version'
        try:
            _validate_binary(path)
            result = subprocess.run(
                [str(path), version_flag],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=True,
            )
            # The version is read from stderr; fall back to stdout so that a
            # binary printing its version there does not yield an empty value.
            version = result.stderr.strip() or result.stdout.strip()
            output_lines.append(f' {name:<20} v{version:<10}')
        except Exception as e:
            # Deliberately broad: this is a reporting boundary, and any
            # failure is recorded and shown in the summary instead of
            # aborting the remaining checks.
            output_lines.append(f' {name:<20} [error]')
            errors.append((name, e))

    # Append the status summary based on any encountered errors.
    output_lines.append('')

    if errors:
        output_lines.append(f'{RED}Status: error{RESET}')
        output_lines.extend(f" - {name}: {error}" for name, error in errors)
    else:
        output_lines.append(f'{GREEN}Status: ok{RESET}')

    # Output the complete information.
    print('\n'.join(output_lines))

    if errors:
        sys.exit(1)


class CustomHelpFormatter(argparse.HelpFormatter):
Expand Down Expand Up @@ -1324,7 +1396,7 @@ def main():

# Cluster
elif args.command == 'cluster':
args.BIN_CLUSTY = validate_binary(args.BIN_CLUSTY)
args.bin_clusty = validate_binary(args.bin_clusty)
args = validate_args_cluster(args, parser)

cmd = cmd_clusty(
Expand All @@ -1344,7 +1416,7 @@ def main():
leiden_resolution=args.leiden_resolution,
leiden_beta=args.leiden_beta,
leiden_iterations=args.leiden_iterations,
bin_path=args.BIN_CLUSTY,
bin_path=args.bin_clusty,
)
p = run(cmd, args.verbose, logger)

Expand Down

0 comments on commit e60a35a

Please sign in to comment.