Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement the H5StoreManager class. #129

Merged
merged 15 commits into from
Feb 25, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion changelog.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ Highlights
Added
+++++

- Adds an ``H5Store`` class, useful for storing array-like data with an HDF5 backend. Accessible via ``with job.data:`` or ``with project.data:``.
- Adds an ``H5Store`` and an ``H5StoreManager`` class, useful for storing array-like data with an HDF5 backend. Those classes are exposed as part of the ``job.data``, ``job.stores``, ``project.data``, and ``project.stores`` properties.
- Adds the ``signac.get_job()`` and the ``signac.Project.get_job()`` functions which allow users to get a job handle by switching into or providing the job's workspace directory.
- Add automatic cast of numpy arrays to lists when storing them within a `JSONDict`, e.g., a `job.statepoint` or `job.document`.
- Enable `Collection` class to manage collections stored in gzip files.
Expand Down
51 changes: 37 additions & 14 deletions signac/contrib/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from ..core.json import json, CustomJSONEncoder
from ..core.attrdict import SyncedAttrDict
from ..core.jsondict import JSONDict
from ..core.h5store import H5Store
from ..core.h5store import H5StoreManager
from .hashing import calc_id
from .utility import _mkdir_p
from .errors import DestinationExistsError, JobsCorruptedError
Expand Down Expand Up @@ -53,8 +53,7 @@ class Job(object):
FN_DOCUMENT = 'signac_job_document.json'
"The job's document filename."

FN_DATA = 'signac_data.h5'
"The job's datastore filename."
KEY_DATA = 'signac_data'

def __init__(self, project, statepoint, _id=None):
self._project = project
Expand All @@ -77,9 +76,8 @@ def __init__(self, project, statepoint, _id=None):
self._fn_doc = os.path.join(self._wd, self.FN_DOCUMENT)
self._document = None

# Prepare job datastore
self._fn_data = os.path.join(self._wd, self.FN_DATA)
self._data = None
# Prepare job h5-stores
self._stores = H5StoreManager(self._wd)

# Prepare current working directory for context management
self._cwd = list()
Expand Down Expand Up @@ -163,7 +161,6 @@ def reset_statepoint(self, new_statepoint):
self._wd = dst._wd
self._fn_doc = dst._fn_doc
self._document = None
self._fn_data = dst._fn_data
self._data = None
self._cwd = list()
logger.info("Moved '{}' -> '{}'.".format(self, dst))
Expand Down Expand Up @@ -263,21 +260,47 @@ def doc(self):
def doc(self, new_doc):
self.document = new_doc

@property
def stores(self):
"""Access HDF5-stores associated with this job.

Use this property to access an HDF5-file within the job's workspace
directory using the H5Store dict-like interface.

This is an example for accessing an HDF5-file called 'my_data.h5' within
the job's workspace:

job.stores['my_data']['array'] = np.random.rand(32, 4)

This is equivalent to:

H5Store(job.fn('my_data.h5'))['array'] = np.random.rand(32, 4)

Both the `job.stores` and the `H5Store` itself support attribute
access. The above example could therefore also be expressed as

job.stores.my_data.array = np.random.rand(32, 4)

:return: The HDF5-Store manager for this job.
:rtype: :class:`~signac.core.h5store.H5StoreManager`
"""
return self.init()._stores

@property
def data(self):
"""The data associated with this job.

Equivalent to:

return job.stores['signac_data']

:return: An HDF5-backed datastore.
:rtype: :class:`~signac.core.h5store.H5Store`"""
if self._data is None:
self.init()
self._data = H5Store(filename=self._fn_data)
return self._data
:rtype: :class:`~..core.h5store.H5Store`"""
return self.stores[self.KEY_DATA]

@data.setter
def data(self, new_data):
self._data.clear()
self._data.update(new_data)
self.stores[self.KEY_DATA] = new_data

def _init(self, force=False):
fn_manifest = os.path.join(self._wd, self.FN_MANIFEST)
Expand Down
49 changes: 36 additions & 13 deletions signac/contrib/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from .. import syncutil
from ..core.json import json
from ..core.jsondict import JSONDict
from ..core.h5store import H5Store
from ..core.h5store import H5StoreManager
from .collection import Collection
from ..common import six
from ..common.config import load_config
Expand Down Expand Up @@ -124,8 +124,8 @@ class Project(object):
FN_DOCUMENT = 'signac_project_document.json'
"The project's document filename."

FN_DATA = 'signac_data.h5'
"The project's datastore filename."
KEY_DATA = 'signac_data'
"The project's datastore key."

FN_STATEPOINTS = 'signac_statepoints.json'
"The default filename to read from and write statepoints to."
Expand All @@ -147,10 +147,6 @@ def __init__(self, config=None):
self._fn_doc = os.path.join(self._rd, self.FN_DOCUMENT)
self._document = None

# Prepare project datastore
self._fn_data = os.path.join(self._rd, self.FN_DATA)
self._data = None

# Internal caches
self._index_cache = dict()
self._sp_cache = dict()
Expand Down Expand Up @@ -305,21 +301,48 @@ def doc(self):
def doc(self, new_doc):
self.document = new_doc

@property
def stores(self):
"""Access HDF5-stores associated with this project.

Use this property to access an HDF5-file within the project's root
directory using the H5Store dict-like interface.

This is an example for accessing an HDF5-file called 'my_data.h5' within
the project's root directory:

project.stores['my_data']['array'] = np.random.rand(32, 4)

This is equivalent to:

H5Store(project.fn('my_data.h5'))['array'] = np.random.rand(32, 4)

Both the `project.stores` and the `H5Store` itself support attribute
access. The above example could therefore also be expressed as

project.stores.my_data.array = np.random.rand(32, 4)

:return: The HDF5-Store manager for this project.
:rtype: :class:`~signac.core.h5store.H5StoreManager`
"""
return H5StoreManager(self._rd)

@property
def data(self):
"""The data associated with this project.

Equivalent to:

return project.stores['signac_data']

:return: An HDF5-backed datastore.
:rtype: :class:`~signac.core.h5store.H5Store`
:rtype: :class:`~..core.h5store.H5Store`
"""
if self._data is None:
self._data = H5Store(filename=self._fn_data)
return self._data
return self.stores[self.KEY_DATA]

@data.setter
def data(self, new_data):
self._data.clear()
self._data.update(new_data)
self.stores[self.KEY_DATA] = new_data

def open_job(self, statepoint=None, id=None):
"""Get a job handle associated with a statepoint.
Expand Down
115 changes: 115 additions & 0 deletions signac/core/dict_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Copyright (c) 2019 The Regents of the University of Michigan
# All rights reserved.
# This software is licensed under the BSD 3-Clause License.
"Basic wrapper to access multiple different data stores."
import os
import re
import errno
import uuid

from ..common import six


# Base class for managers that expose the files below a prefix directory as
# dict-like stores, addressed by key (item access) or by attribute name.
class DictManager(object):

# Store type instantiated for each key; it is called with the path of the
# backing file. Must be set by subclasses (checked in __init__).
cls = None
# Suffix appended to a key to build the backing file name.
# Must be set by subclasses (checked in __init__).
suffix = None

# Names listed here are handled as regular instance attributes by
# __getattr__/__setattr__/__delattr__; every other name is treated as a key.
__slots__ = ['_prefix', '_dict_registry']

def __init__(self, prefix):
    """Set up a manager for the stores located below ``prefix``.

    :param prefix: Directory that contains the backing files; it is stored
        as an absolute path.
    """
    # Both class attributes have to be provided by the concrete subclass.
    assert self.cls is not None, "Subclasses of DictManager must define the cls variable."
    assert self.suffix is not None, "Subclasses of DictManager must define the suffix variable."
    absolute_prefix = os.path.abspath(prefix)
    # Cache of already instantiated stores, keyed by store key.
    self._dict_registry = dict()
    self._prefix = absolute_prefix

@property
def prefix(self):
    """The directory, as stored by the constructor, that holds the managed files."""
    stored_prefix = self._prefix
    return stored_prefix

def __eq__(self, other):
    """Compare two managers by resolved prefix directory and file suffix.

    :param other: The object to compare with.
    :return: True if both objects refer to the same real directory and use
        the same suffix, False otherwise. ``NotImplemented`` is returned
        if ``other`` does not provide ``prefix`` and ``suffix``.
    """
    try:
        other_prefix, other_suffix = other.prefix, other.suffix
    except AttributeError:
        # Not a manager-like object: let Python fall back to its default
        # comparison instead of failing with an AttributeError.
        return NotImplemented
    return (os.path.realpath(self.prefix) == os.path.realpath(other_prefix)
            and self.suffix == other_suffix)

def __repr__(self):
    """Return ``ClassName(prefix='...')`` with the prefix relative to the cwd."""
    class_name = type(self).__name__
    relative_prefix = os.path.relpath(self.prefix)
    return "{}(prefix='{}')".format(class_name, relative_prefix)

# The informal string representation is the same as the formal one.
__str__ = __repr__

def __getitem__(self, key):
    """Return the store for ``key``, creating and caching it on first access.

    The store is constructed from the path ``<prefix>/<key><suffix>``.
    """
    registry = self._dict_registry
    if key in registry:
        return registry[key]
    store_path = os.path.join(self.prefix, key) + self.suffix
    registry[key] = self.cls(store_path)
    return registry[key]

def __setitem__(self, key, value):
    """Replace the store under ``key`` with the content of ``value``.

    The data is first written to a store with a random temporary key and
    the resulting file is then moved over the file that belongs to ``key``.

    :param key: The key of the store to (re)place.
    :param value: The data passed to the ``update()`` method of the store.
    :raises ValueError: If ``value`` is empty and no temporary file was
        created that could be moved into place.
    """
    tmp_key = str(uuid.uuid4())
    # os.replace() is not available on Python 2, fall back to os.rename().
    move = getattr(os, 'replace', os.rename)
    try:
        self[tmp_key].update(value)
        move(self[tmp_key].filename, self[key].filename)
    except Exception as error:
        # Best-effort removal of the temporary file, so that a failed
        # assignment does not leave an orphan that shows up as a key.
        try:
            del self[tmp_key]
        except (KeyError, IOError, OSError):
            pass
        if isinstance(error, (IOError, OSError)) and \
                error.errno == errno.ENOENT and not len(value):
            raise ValueError("Cannot asssign empty value!")
        # Explicit re-raise: a bare ``raise`` would re-raise the exception
        # handled in the nested block on Python 2.
        raise error
    else:
        # Drop the cached store so that the next access creates a fresh
        # object for the replaced file.
        del self._dict_registry[key]
    finally:
        # The temporary key must never remain in the registry.
        self._dict_registry.pop(tmp_key, None)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't follow why this is necessary. The file corresponding to key was successfully created if we reach this point, why does it need to be removed from the tracked list?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not really necessary, I figured it might be a good idea to replace any references of "old" H5Store under the same key, but there is virtually no difference. I'll remove that.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Turns out this is actually necessary. I tried to delete this line and the test_assign and test_assign_data unit tests started to fail.


def __delitem__(self, key):
    """Delete the file that backs the store for ``key``.

    :raises KeyError: If the file does not exist.
    """
    try:
        os.unlink(self[key].filename)
    except (IOError, OSError) as error:
        # Only a missing file is reported as a missing key.
        if error.errno != errno.ENOENT:
            raise error
        raise KeyError(key)

def __getattr__(self, name):
    """Resolve ``name`` as a regular attribute first and as a store key second.

    :raises AttributeError: If ``name`` is a dunder or slot name that is not
        set, or if the item lookup raises a KeyError.
    """
    try:
        return super(DictManager, self).__getattribute__(name)
    except AttributeError:
        is_internal = name.startswith('__') or name in self.__slots__
        if is_internal:
            # Internal names are never interpreted as store keys.
            raise
        try:
            store = self.__getitem__(name)
        except KeyError:
            raise AttributeError(name)
        return store

def __setattr__(self, name, value):
    """Assign a store via attribute syntax; internal names are set normally."""
    if not (name.startswith('__') or name in self.__slots__):
        # Any non-internal name is treated as a store key.
        self.__setitem__(name, value)
        return
    super(DictManager, self).__setattr__(name, value)

def __delattr__(self, name):
    """Delete a store via attribute syntax; internal names are deleted normally."""
    if not (name.startswith('__') or name in self.__slots__):
        # Any non-internal name is treated as a store key.
        self.__delitem__(name)
        return
    super(DictManager, self).__delattr__(name)

def __iter__(self):
    """Iterate over the keys of all stores found in the prefix directory.

    A key is the name of a file that ends with the manager's suffix, with
    that suffix removed. Files with other names are skipped.
    """
    suffix = self.suffix
    for fn in os.listdir(self.prefix):
        # A literal suffix comparison is used on purpose: interpolating the
        # suffix into a regular expression would treat '.' as a wildcard
        # and also match names that merely contain the suffix.
        if fn.endswith(suffix):
            yield fn[:len(fn) - len(suffix)]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using re seems like overkill here, I'd switch this to using glob.glob: yield from glob.glob("*.{}".format(self.suffix)) (without using yield from for py2 compatibility).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't directly extract the key name with glob.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose you'd still have to do something like ...replace(self.suffix, '') to get just the key without the extension, that's all that you're referring to right? I personally think that's still clearer than using a regex since you're really just parsing out an extension from a set of filenames, but I'm OK with leaving it as is if you prefer since it doesn't simplify the code that much.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just tried it. I would also need to strip off the directory name, so I'm sticking to the original implementation.

However, should we explicitly skip hidden h5-files?

Copy link
Contributor

@vyasr vyasr Feb 25, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that's necessary. I view creating and editing hidden hd5 files as a feature, not a bug. If users explicitly access a hidden file I think it's reasonable to expect that they know what they're doing.


def keys(self):
    """Return an iterator over the keys of this manager."""
    key_iterator = iter(self)
    return key_iterator

def __len__(self):
    """Return the number of keys, determined by exhausting ``keys()``."""
    return sum(1 for _ in self.keys())

def __getstate__(self):
    """Return the instance state as a dict with one entry per slot."""
    return {
        '_prefix': self._prefix,
        '_dict_registry': self._dict_registry,
    }

def __setstate__(self, d):
    """Restore the instance state from a dict with one entry per slot."""
    # Same order as the slots: the prefix first, then the registry.
    for slot in ('_prefix', '_dict_registry'):
        setattr(self, slot, d[slot])
21 changes: 20 additions & 1 deletion signac/core/h5store.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from threading import RLock

from ..common import six
from .dict_manager import DictManager

if six.PY2:
from collections import Mapping
Expand Down Expand Up @@ -284,6 +285,10 @@ def __init__(self, filename, **kwargs):
self._file = None
self._kwargs = kwargs

@property
def filename(self):
    """The filename this store was created with."""
    stored_filename = self._filename
    return stored_filename

def __repr__(self):
    """Return ``<ClassName(filename=...)>`` with the path relative to the cwd."""
    class_name = type(self).__name__
    relative_path = os.path.relpath(self._filename)
    return "<{}(filename={})>".format(class_name, relative_path)

Expand Down Expand Up @@ -359,7 +364,10 @@ def mode(self):

def flush(self):
"""Flush the underlying HDF5-file."""
self._file.flush()
if self._file is None:
raise H5StoreClosedError(self._filename)
else:
self._file.flush()

def __getitem__(self, key):
key = key if key.startswith('/') else '/' + key
Expand Down Expand Up @@ -392,6 +400,12 @@ def __setattr__(self, key, value):
else:
self.__setitem__(key, value)

def __delattr__(self, key):
    """Delete an item via attribute syntax; internal names are deleted normally."""
    if not (key.startswith('__') or key in self.__slots__):
        # Any non-internal name is forwarded to item deletion.
        self.__delitem__(key)
        return
    super(H5Store, self).__delattr__(key)

def __iter__(self):
with _ensure_open(self):
# The generator below should be refactored to use 'yield from'
Expand Down Expand Up @@ -428,3 +442,8 @@ def clear(self):
"""
with _ensure_open(self):
self._file.clear()


# DictManager specialization that manages H5Store objects backed by '.h5' files.
class H5StoreManager(DictManager):
# Store type instantiated for each key.
cls = H5Store
# Suffix appended to a key to obtain the name of the backing file.
suffix = '.h5'
Loading