diff --git a/docs/release-notes/3564.feature.md b/docs/release-notes/3564.feature.md new file mode 100644 index 0000000000..b4dffe5850 --- /dev/null +++ b/docs/release-notes/3564.feature.md @@ -0,0 +1 @@ +Added a new `compressed` parameter to the `read_10x_mtx` function to support reading uncompressed matrix files produced by tools like STARsolo, which do not gzip their output by default. Pass `compressed=False` to read such files; the default (`compressed=True`) still expects gzipped files. diff --git a/src/scanpy/readwrite.py b/src/scanpy/readwrite.py index 164c8c79c7..8d63934064 100644 --- a/src/scanpy/readwrite.py +++ b/src/scanpy/readwrite.py @@ -553,6 +553,7 @@ def read_10x_mtx( cache_compression: Literal["gzip", "lzf"] | None | Empty = _empty, gex_only: bool = True, prefix: str | None = None, + compressed: bool = True, ) -> AnnData: """Read 10x-Genomics-formatted mtx directory. @@ -579,6 +580,11 @@ def read_10x_mtx( if the files are named `patientA_matrix.mtx`, `patientA_genes.tsv` and `patientA_barcodes.tsv` the prefix is `patientA_`. (Default: no prefix) + compressed + Whether to expect Cell Ranger v3+ files (.mtx, features.tsv, barcodes.tsv) + to be gzipped. If True, '.gz' suffix is appended to filenames. + Set to False for STARsolo output. + Has no effect on legacy (v2-) files. 
Returns ------- @@ -596,6 +602,7 @@ def read_10x_mtx( cache_compression=cache_compression, prefix=prefix, is_legacy=is_legacy, + compressed=compressed, ) if is_legacy or not gex_only: return adata @@ -612,9 +619,11 @@ def _read_10x_mtx( cache_compression: Literal["gzip", "lzf"] | None | Empty = _empty, prefix: str = "", is_legacy: bool, + compressed: bool = True, ) -> AnnData: """Read mex from output from Cell Ranger v2- or v3+.""" - suffix = "" if is_legacy else ".gz" + # Only append .gz if not a legacy file AND compression is requested + suffix = "" if is_legacy else (".gz" if compressed else "") adata = read( path / f"{prefix}matrix.mtx{suffix}", cache=cache, diff --git a/tests/test_read_10x.py b/tests/test_read_10x.py index ed836a95a1..1f62924992 100644 --- a/tests/test_read_10x.py +++ b/tests/test_read_10x.py @@ -175,3 +175,28 @@ def test_10x_probe_barcode_read(): assert set(probe_anndata.obs.columns) == {"filtered_barcodes"} assert probe_anndata.shape == (4987, 1000) assert probe_anndata.X.nnz == 858 + + +def test_read_10x_compressed_parameter(tmp_path): + """Check that `compressed=False` reads gunzipped v3 files like the gzipped ones.""" + # Locate the gzipped v3 fixture and create an empty temp directory + mtx_path_v3 = ROOT / "3.0.0" / "filtered_feature_bc_matrix" + test_path = tmp_path / "test_compressed" + test_path.mkdir() + + # Create uncompressed copies of the compressed files + for file in mtx_path_v3.glob("*.gz"): + import gzip + + with gzip.open(file, "rb") as f_in: + content = f_in.read() + dest_file = test_path / file.name[:-3] # Removes .gz extension + with dest_file.open("wb") as f_out: + f_out.write(content) + + # Read the uncompressed data + adata_uncompressed = sc.read_10x_mtx(test_path, compressed=False) + # Read the compressed data + adata_compressed = sc.read_10x_mtx(mtx_path_v3, compressed=True) + # Check that the two AnnData objects are equal + assert_anndata_equal(adata_uncompressed, adata_compressed)