Skip to content

Added compressed parameter to read_10x_mtx to support STARsolo output #3564

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Apr 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/release-notes/3564.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added a new `compressed` parameter to `read_10x_mtx` to support reading uncompressed matrix files produced by tools like STARsolo, which do not gzip their output by default.
11 changes: 10 additions & 1 deletion src/scanpy/readwrite.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,7 @@ def read_10x_mtx(
cache_compression: Literal["gzip", "lzf"] | None | Empty = _empty,
gex_only: bool = True,
prefix: str | None = None,
compressed: bool = True,
) -> AnnData:
"""Read 10x-Genomics-formatted mtx directory.

Expand All @@ -579,6 +580,11 @@ def read_10x_mtx(
if the files are named `patientA_matrix.mtx`, `patientA_genes.tsv` and
`patientA_barcodes.tsv` the prefix is `patientA_`.
(Default: no prefix)
compressed
Whether to expect Cell Ranger v3+ files (.mtx, features.tsv, barcodes.tsv)
to be gzipped. If True, '.gz' suffix is appended to filenames.
Set to False for STARsolo output.
Has no effect on legacy (v2-) files.

Returns
-------
Expand All @@ -596,6 +602,7 @@ def read_10x_mtx(
cache_compression=cache_compression,
prefix=prefix,
is_legacy=is_legacy,
compressed=compressed,
)
if is_legacy or not gex_only:
return adata
Expand All @@ -612,9 +619,11 @@ def _read_10x_mtx(
cache_compression: Literal["gzip", "lzf"] | None | Empty = _empty,
prefix: str = "",
is_legacy: bool,
compressed: bool = True,
) -> AnnData:
"""Read mex from output from Cell Ranger v2- or v3+."""
suffix = "" if is_legacy else ".gz"
# Only append .gz if not a legacy file AND compression is requested
suffix = "" if is_legacy else (".gz" if compressed else "")
adata = read(
path / f"{prefix}matrix.mtx{suffix}",
cache=cache,
Expand Down
25 changes: 25 additions & 0 deletions tests/test_read_10x.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,28 @@ def test_10x_probe_barcode_read():
assert set(probe_anndata.obs.columns) == {"filtered_barcodes"}
assert probe_anndata.shape == (4987, 1000)
assert probe_anndata.X.nnz == 858


def test_read_10x_compressed_parameter(tmp_path):
    """Check that ``compressed=False`` reads un-gzipped files identically.

    Every ``*.gz`` file of the ``3.0.0`` fixture directory is decompressed
    into a temporary directory, which is then read with ``compressed=False``.
    The result must equal reading the original gzipped directory with
    ``compressed=True``.
    """
    # Imported once at function scope rather than on every loop iteration.
    import gzip

    mtx_path_v3 = ROOT / "3.0.0" / "filtered_feature_bc_matrix"
    test_path = tmp_path / "test_compressed"
    test_path.mkdir()

    # Write an uncompressed copy of each gzipped fixture file.
    for file in mtx_path_v3.glob("*.gz"):
        with gzip.open(file, "rb") as f_in:
            # `file.stem` drops only the trailing ".gz" suffix
            # (e.g. "matrix.mtx.gz" -> "matrix.mtx").
            (test_path / file.stem).write_bytes(f_in.read())

    # Read the uncompressed copy and the original compressed fixture.
    adata_uncompressed = sc.read_10x_mtx(test_path, compressed=False)
    adata_compressed = sc.read_10x_mtx(mtx_path_v3, compressed=True)

    # Both routes must yield the same AnnData object.
    assert_anndata_equal(adata_uncompressed, adata_compressed)
Loading