Skip to content

Commit

Permalink
[python] Support H5AD ingest directly from S3 in backed mode (#1629)
Browse files Browse the repository at this point in the history
* [python] Support H5AD ingest directly from S3 in backed mode

* code-review feedback, and more
  • Loading branch information
johnkerl authored Aug 22, 2023
1 parent 158546c commit 1b2c3e7
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 21 deletions.
70 changes: 54 additions & 16 deletions apis/python/src/tiledbsoma/io/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import time
from typing import (
Any,
ContextManager,
List,
Mapping,
Optional,
Expand All @@ -24,6 +25,7 @@
cast,
overload,
)
from unittest import mock

import anndata as ad
import h5py
Expand All @@ -32,6 +34,7 @@
import pyarrow as pa
import scipy.sparse as sp
import tiledb
from anndata._core import file_backing
from anndata._core.sparse_dataset import SparseDataset
from somacore.options import PlatformConfig

Expand Down Expand Up @@ -104,6 +107,42 @@ def __init__(self, ingest_mode: str) -> None:
)


# This trick lets us ingest H5AD with "r" (backed mode) from S3 URIs. While h5py
# supports any file-like object, AnnData specifically wants only an `os.PathLike`
# object. The only thing it does with the PathLike is to use it to get the filename.
class _FSPathWrapper:
    """Tricks anndata into thinking a file-like object is an ``os.PathLike``.

    While h5py supports any file-like object, anndata specifically wants
    an ``os.PathLike`` object, which it uses *exclusively* to get the
    "filename" of the opened file.

    ``__fspath__`` has to be defined on the class itself, so simply setting
    ``some_file_obj.__fspath__ = lambda: "some/path"`` on an instance won't
    work. Instead this class defines ``__fspath__`` and proxies every other
    attribute to the wrapped object.

    Args:
        obj: The file-like object that attribute access is forwarded to.
        path: The path reported by ``__fspath__``.
    """

    def __init__(self, obj: object, path: Path) -> None:
        self._obj = obj
        self._path = path

    def __fspath__(self) -> str:
        """Return the stored path in the form the ``os.PathLike`` protocol requires.

        ``os.fspath`` rejects any ``__fspath__`` result that is not ``str`` or
        ``bytes``, so a ``pathlib.Path`` given to the constructor is converted
        here rather than returned as-is. ``str`` (and ``bytes``) values pass
        through unchanged.
        """
        # Local import: keeps this class self-contained without touching the
        # module-level import block.
        import os

        return os.fspath(self._path)

    def __getattr__(self, name: str) -> object:
        """Forward any attribute not found on the wrapper to the wrapped object."""
        # ``__getattr__`` only runs after normal lookup has failed.  If ``_obj``
        # itself is missing (an instance created without ``__init__``), looking
        # it up below would re-enter this method forever, so fail cleanly.
        if name == "_obj":
            raise AttributeError(name)
        return getattr(self._obj, name)


def _hack_patch_anndata() -> ContextManager[object]:
    """Part Two of the ``_FSPathWrapper`` trick.

    Builds a replacement for ``AnnDataFileManager.filename`` whose setter
    stores the value it is given in ``_filename`` unchanged, and returns a
    ``mock.patch.object`` context manager that installs that replacement on
    the class for the duration of the ``with`` block.
    """
    manager_cls = file_backing.AnnDataFileManager

    def _store_verbatim(
        self: file_backing.AnnDataFileManager, filename: Path
    ) -> None:
        self._filename = filename

    # Derive the replacement from the existing ``filename`` attribute via its
    # ``.setter`` so that only the setter is swapped out.
    replacement = manager_cls.filename.setter(_store_verbatim)  # type: ignore

    return mock.patch.object(manager_cls, "filename", replacement)


# ----------------------------------------------------------------
def from_h5ad(
experiment_uri: str,
Expand Down Expand Up @@ -168,22 +207,21 @@ def from_h5ad(
s = _util.get_start_stamp()
logging.log_io(None, f"START Experiment.from_h5ad {input_path}")

logging.log_io(None, f"START READING {input_path}")

anndata = ad.read_h5ad(input_path, backed="r")

logging.log_io(None, _util.format_elapsed(s, f"FINISH READING {input_path}"))

uri = from_anndata(
experiment_uri,
anndata,
measurement_name,
context=context,
platform_config=platform_config,
ingest_mode=ingest_mode,
use_relative_uri=use_relative_uri,
X_kind=X_kind,
)
with tiledb.VFS(ctx=context.tiledb_ctx).open(input_path) as input_handle:
logging.log_io(None, f"START READING {input_path}")
with _hack_patch_anndata():
anndata = ad.read_h5ad(_FSPathWrapper(input_handle, input_path), "r")
logging.log_io(None, _util.format_elapsed(s, f"FINISH READING {input_path}"))
uri = from_anndata(
experiment_uri,
anndata,
measurement_name,
context=context,
platform_config=platform_config,
ingest_mode=ingest_mode,
use_relative_uri=use_relative_uri,
X_kind=X_kind,
)

logging.log_io(
None, _util.format_elapsed(s, f"FINISH Experiment.from_h5ad {input_path} {uri}")
Expand Down
10 changes: 5 additions & 5 deletions apis/python/tests/test_basic_anndata_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,16 +219,16 @@ def test_resume_mode(adata, resume_mode_h5ad_file):
tempdir1 = tempfile.TemporaryDirectory()
output_path1 = tempdir1.name
tiledbsoma.io.from_h5ad(
output_path1, resume_mode_h5ad_file, "RNA", ingest_mode="write"
output_path1, resume_mode_h5ad_file.as_posix(), "RNA", ingest_mode="write"
)

tempdir2 = tempfile.TemporaryDirectory()
output_path2 = tempdir2.name
tiledbsoma.io.from_h5ad(
output_path2, resume_mode_h5ad_file, "RNA", ingest_mode="write"
output_path2, resume_mode_h5ad_file.as_posix(), "RNA", ingest_mode="write"
)
tiledbsoma.io.from_h5ad(
output_path2, resume_mode_h5ad_file, "RNA", ingest_mode="resume"
output_path2, resume_mode_h5ad_file.as_posix(), "RNA", ingest_mode="resume"
)

exp1 = _factory.open(output_path1)
Expand Down Expand Up @@ -274,7 +274,7 @@ def test_ingest_relative(h5ad_file_extended, use_relative_uri):

tiledbsoma.io.from_h5ad(
output_path,
h5ad_file_extended,
h5ad_file_extended.as_posix(),
measurement_name="RNA",
use_relative_uri=use_relative_uri,
)
Expand Down Expand Up @@ -356,7 +356,7 @@ def test_ingest_uns_string_array(h5ad_file_uns_string_array):

tiledbsoma.io.from_h5ad(
output_path,
h5ad_file_uns_string_array,
h5ad_file_uns_string_array.as_posix(),
measurement_name="RNA",
)

Expand Down

0 comments on commit 1b2c3e7

Please sign in to comment.