diff --git a/apis/python/src/tiledbsoma/io/ingest.py b/apis/python/src/tiledbsoma/io/ingest.py
index 7d87659a5a..28b34eac9b 100644
--- a/apis/python/src/tiledbsoma/io/ingest.py
+++ b/apis/python/src/tiledbsoma/io/ingest.py
@@ -13,6 +13,7 @@
 import time
 from typing import (
     Any,
+    ContextManager,
     List,
     Mapping,
     Optional,
@@ -24,6 +25,7 @@
     cast,
     overload,
 )
+from unittest import mock
 
 import anndata as ad
 import h5py
@@ -32,6 +34,7 @@
 import pyarrow as pa
 import scipy.sparse as sp
 import tiledb
+from anndata._core import file_backing
 from anndata._core.sparse_dataset import SparseDataset
 from somacore.options import PlatformConfig
 
@@ -104,6 +107,55 @@ def __init__(self, ingest_mode: str) -> None:
         )
 
 
+# This trick lets us ingest H5AD with "r" (backed mode) from S3 URIs. While h5py
+# supports any file-like object, AnnData specifically wants only an `os.PathLike`
+# object. The only thing it does with the PathLike is to use it to get the filename.
+class _FSPathWrapper:
+    """Tricks anndata into thinking a file-like object is an ``os.PathLike``.
+
+    While h5py supports any file-like object, anndata specifically wants
+    an ``os.PathLike`` object, which it uses *exclusively* to get the "filename"
+    of the opened file.
+
+    We need to provide ``__fspath__`` on the class itself: Python looks up
+    special methods on the type, so simply setting
+    ``some_file_obj.__fspath__ = lambda: "some/path"`` won't work. Instead
+    we proxy all attributes except ``__fspath__`` to the wrapped object.
+    """
+
+    def __init__(self, obj: object, path: Path) -> None:
+        self._obj = obj
+        self._path = path
+
+    def __fspath__(self) -> str:
+        # The ``os.PathLike`` protocol requires ``str`` or ``bytes`` here;
+        # ``os.fspath()`` raises ``TypeError`` for anything else, including a
+        # ``pathlib.Path``.
+        return str(self._path)
+
+    def __getattr__(self, name: str) -> object:
+        # Only invoked when normal lookup fails, so ``_obj``, ``_path`` and
+        # ``__fspath__`` are served by the wrapper itself.
+        return getattr(self._obj, name)
+
+
+def _hack_patch_anndata() -> ContextManager[object]:
+    """Part Two of the ``_FSPathWrapper`` trick.
+
+    Returns a context manager that, while active, swaps
+    ``AnnDataFileManager.filename`` for a copy of the property whose setter
+    stores the value exactly as given. Presumably the stock setter would
+    otherwise coerce or reject the wrapper -- confirm against the anndata
+    version in use.
+    """
+
+    @file_backing.AnnDataFileManager.filename.setter  # type: ignore
+    def filename(self: file_backing.AnnDataFileManager, filename: Path) -> None:
+        self._filename = filename
+
+    return mock.patch.object(file_backing.AnnDataFileManager, "filename", filename)
+
+
 # ----------------------------------------------------------------
 def from_h5ad(
     experiment_uri: str,
@@ -168,22 +220,21 @@ def from_h5ad(
     s = _util.get_start_stamp()
     logging.log_io(None, f"START Experiment.from_h5ad {input_path}")
 
-    logging.log_io(None, f"START READING {input_path}")
-
-    anndata = ad.read_h5ad(input_path, backed="r")
-
-    logging.log_io(None, _util.format_elapsed(s, f"FINISH READING {input_path}"))
-
-    uri = from_anndata(
-        experiment_uri,
-        anndata,
-        measurement_name,
-        context=context,
-        platform_config=platform_config,
-        ingest_mode=ingest_mode,
-        use_relative_uri=use_relative_uri,
-        X_kind=X_kind,
-    )
+    with tiledb.VFS(ctx=context.tiledb_ctx).open(input_path) as input_handle:
+        logging.log_io(None, f"START READING {input_path}")
+        with _hack_patch_anndata():
+            anndata = ad.read_h5ad(_FSPathWrapper(input_handle, input_path), "r")
+        logging.log_io(None, _util.format_elapsed(s, f"FINISH READING {input_path}"))
+        uri = from_anndata(
+            experiment_uri,
+            anndata,
+            measurement_name,
+            context=context,
+            platform_config=platform_config,
+            ingest_mode=ingest_mode,
+            use_relative_uri=use_relative_uri,
+            X_kind=X_kind,
+        )
 
     logging.log_io(
         None, _util.format_elapsed(s, f"FINISH Experiment.from_h5ad {input_path} {uri}")
diff --git a/apis/python/tests/test_basic_anndata_io.py b/apis/python/tests/test_basic_anndata_io.py
index c882df5111..b0be11a784 100644
--- a/apis/python/tests/test_basic_anndata_io.py
+++ b/apis/python/tests/test_basic_anndata_io.py
@@ -219,16 +219,16 @@ def test_resume_mode(adata, resume_mode_h5ad_file):
     tempdir1 = tempfile.TemporaryDirectory()
     output_path1 = tempdir1.name
     tiledbsoma.io.from_h5ad(
-        output_path1, resume_mode_h5ad_file, "RNA", ingest_mode="write"
+        output_path1, resume_mode_h5ad_file.as_posix(), "RNA", ingest_mode="write"
     )
 
     tempdir2 = tempfile.TemporaryDirectory()
     output_path2 = tempdir2.name
     tiledbsoma.io.from_h5ad(
-        output_path2, resume_mode_h5ad_file, "RNA", ingest_mode="write"
+        output_path2, resume_mode_h5ad_file.as_posix(), "RNA", ingest_mode="write"
     )
     tiledbsoma.io.from_h5ad(
-        output_path2, resume_mode_h5ad_file, "RNA", ingest_mode="resume"
+        output_path2, resume_mode_h5ad_file.as_posix(), "RNA", ingest_mode="resume"
     )
 
     exp1 = _factory.open(output_path1)
@@ -274,7 +274,7 @@ def test_ingest_relative(h5ad_file_extended, use_relative_uri):
 
     tiledbsoma.io.from_h5ad(
         output_path,
-        h5ad_file_extended,
+        h5ad_file_extended.as_posix(),
         measurement_name="RNA",
         use_relative_uri=use_relative_uri,
     )
@@ -356,7 +356,7 @@ def test_ingest_uns_string_array(h5ad_file_uns_string_array):
 
     tiledbsoma.io.from_h5ad(
         output_path,
-        h5ad_file_uns_string_array,
+        h5ad_file_uns_string_array.as_posix(),
         measurement_name="RNA",
     )
 