Skip to content

Commit

Permalink
CalcJob: allow directories in local_copy_list (#5115)
Browse files Browse the repository at this point in the history
Up till now, the `local_copy_list` could only be used to specify
individual files that should be copied. However, very often, one may
have an input node from whose repository an entire directory, or sub
directory should be copied. The only way to do this was to manually
iterate over the contents of that repository directory and add the files
to the `local_copy_list` one by one.

Here we extend the syntax of the `local_copy_list` and the second
argument can now also point to a directory in the repository of the
source node. Its entire contents will be copied to the relative path
defined by the third element in the tuple. Note that the directory
itself won't be copied, just its contents.
  • Loading branch information
sphuber authored Sep 16, 2021
1 parent 7f84b53 commit a295616
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 18 deletions.
44 changes: 35 additions & 9 deletions aiida/engine/daemon/execmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from collections.abc import Mapping
from logging import LoggerAdapter
import os
import pathlib
import shutil
from tempfile import NamedTemporaryFile
from typing import Any, List, Optional, Mapping as MappingType, Tuple, Union
Expand All @@ -26,6 +27,7 @@
from aiida.common.links import LinkType
from aiida.orm import load_node, CalcJobNode, Code, FolderData, Node, RemoteData
from aiida.orm.utils.log import get_dblogger_extra
from aiida.repository.common import FileType
from aiida.schedulers.datastructures import JobState
from aiida.transports import Transport

Expand Down Expand Up @@ -181,7 +183,7 @@ def upload_calculation(
transport.put(handle.name, filename)
transport.chmod(code.get_local_executable(), 0o755) # rwxr-xr-x

# local_copy_list is a list of tuples, each with (uuid, dest_rel_path)
# local_copy_list is a list of tuples, each with (uuid, source_rel_path, target_rel_path)
# NOTE: validation of these lists are done inside calculation.presubmit()
local_copy_list = calc_info.local_copy_list or []
remote_copy_list = calc_info.remote_copy_list or []
Expand All @@ -199,13 +201,26 @@ def upload_calculation(
if data_node is None:
logger.warning(f'failed to load Node<{uuid}> specified in the `local_copy_list`')
else:
dirname = os.path.dirname(target)
if dirname:
os.makedirs(os.path.join(folder.abspath, dirname), exist_ok=True)
with folder.open(target, 'wb') as handle:
with data_node.open(filename, 'rb') as source:
shutil.copyfileobj(source, handle)
provenance_exclude_list.append(target)

# If no explicit source filename is defined, we assume the top-level directory
filename_source = filename or '.'
filename_target = target or ''

# Make the target filepath absolute and create any intermediate directories if they don't yet exist
filepath_target = pathlib.Path(folder.abspath) / filename_target
filepath_target.parent.mkdir(parents=True, exist_ok=True)

if data_node.get_object(filename_source).file_type == FileType.DIRECTORY:
# If the source object is a directory, we copy its entire contents
data_node.copy_tree(filepath_target, filename_source)
provenance_exclude_list.extend(data_node.list_object_names(filename_source))
else:
# Otherwise, simply copy the file
with folder.open(target, 'wb') as handle:
with data_node.open(filename, 'rb') as source:
shutil.copyfileobj(source, handle)

provenance_exclude_list.append(target)

# In a dry_run, the working directory is the raw input folder, which will already contain these resources
if not dry_run:
Expand Down Expand Up @@ -288,7 +303,18 @@ def upload_calculation(
for filename in filenames:
filepath = os.path.join(root, filename)
relpath = os.path.normpath(os.path.relpath(filepath, folder.abspath))
if relpath not in provenance_exclude_list:
dirname = os.path.dirname(relpath)

# Construct a list of all (partial) filepaths
# For example, if `relpath == 'some/sub/directory/file.txt'` then the list of relative directory paths is
# ['some', 'some/sub', 'some/sub/directory']
# This is necessary, because if any of these paths is in the `provenance_exclude_list` the file should not
# be copied over.
components = dirname.split(os.sep)
dirnames = [os.path.join(*components[:i]) for i in range(1, len(components) + 1)]
if relpath not in provenance_exclude_list and all(
dirname not in provenance_exclude_list for dirname in dirnames
):
with open(filepath, 'rb') as handle:
node._repository.put_object_from_filelike(handle, relpath) # pylint: disable=protected-access

Expand Down
43 changes: 40 additions & 3 deletions docs/source/topics/calculations/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -229,11 +229,11 @@ File lists
Local copy list
~~~~~~~~~~~~~~~
The local copy list takes tuples of length three, each of which represents a file to be copied, defined through the following items:
The local copy list takes tuples of length three, each of which represents a file or directory to be copied, defined through the following items:
* `node uuid`: the node whose repository contains the file, typically a ``SinglefileData`` or ``FolderData`` node
* `source relative path`: the relative path of the file within the node repository
* `target relative path`: the relative path within the working directory to which to copy the file
* `source relative path`: the relative path of the file or directory within the node repository
* `target relative path`: the relative path within the working directory to which to copy the file or directory contents
As an example, consider a ``CalcJob`` implementation that receives a ``SinglefileData`` node as input with the name ``pseudopotential``. To copy its contents, one can specify:
Expand All @@ -251,6 +251,43 @@ If instead, you need to transfer a specific file from a ``FolderData``, you can
Note that the filenames in the relative source and target path need not be the same.
This depends fully on how the files are stored in the node's repository and what files need to be written to the working directory.
To copy the contents of a directory of the source node, simply define it as the `source relative path`.
For example, imagine we have a ``FolderData`` node that is passed as the ``folder`` input, which has the following repository virtual hierarchy:
.. code:: bash
├─ sub
│ └─ file_b.txt
└─ file_a.txt
If the entire content needs to be copied over, specify the `local_copy_list` as follows:
.. code:: python
calc_info.local_copy_list = [(self.inputs.folder.uuid, '.', None)]
The ``'.'`` here indicates that the entire contents need to be copied over.
Alternatively, one can specify a sub directory, e.g.:
.. code:: python
calc_info.local_copy_list = [(self.inputs.folder.uuid, 'sub', None)]
Finally, the `target relative path` can be used to write the contents of the source repository to a particular sub directory in the working directory.
For example, the following statement:
.. code:: python
calc_info.local_copy_list = [(self.inputs.folder.uuid, 'sub', 'relative/target')]
will result in the following file hierarchy in the working directory of the calculation:
.. code:: bash
└─ relative
└─ target
└─ file_b.txt
One might wonder what the purpose of the list is, when one could just as easily use the normal API to write the file to the ``folder`` sandbox folder.
It is true that in this way the file will be copied to the working directory; however, it will then *also* be copied into the repository of the calculation node.
Since in this case it is merely a direct one-to-one copy of the file that is already part of one of the input nodes (in an unaltered form), this duplication is unnecessary and adds useless weight to the file repository.
Expand Down
28 changes: 22 additions & 6 deletions tests/engine/daemon/test_execmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,17 +131,22 @@ def test_retrieve_files_from_list(


@pytest.mark.usefixtures('clear_database_before_test')
def test_upload_local_copy_list(fixture_sandbox, aiida_localhost, aiida_local_code_factory):
def test_upload_local_copy_list(fixture_sandbox, aiida_localhost, aiida_local_code_factory, file_hierarchy, tmp_path):
"""Test the ``local_copy_list`` functionality in ``upload_calculation``.
Specifically, verify that files in the ``local_copy_list`` do not end up in the repository of the node.
"""
from aiida.common.datastructures import CalcInfo, CodeInfo
from aiida.orm import CalcJobNode, SinglefileData
from aiida.orm import CalcJobNode, SinglefileData, FolderData

create_file_hierarchy(file_hierarchy, tmp_path)
folder = FolderData()
folder.put_object_from_tree(tmp_path)

inputs = {
'file_a': SinglefileData(io.BytesIO(b'content_a')).store(),
'file_b': SinglefileData(io.BytesIO(b'content_b')).store(),
'file_x': SinglefileData(io.BytesIO(b'content_x')).store(),
'file_y': SinglefileData(io.BytesIO(b'content_y')).store(),
'folder': folder.store(),
}

node = CalcJobNode(computer=aiida_localhost)
Expand All @@ -155,11 +160,22 @@ def test_upload_local_copy_list(fixture_sandbox, aiida_localhost, aiida_local_co
calc_info.uuid = node.uuid
calc_info.codes_info = [code_info]
calc_info.local_copy_list = [
(inputs['file_a'].uuid, inputs['file_a'].filename, './files/file_a'),
(inputs['file_a'].uuid, inputs['file_a'].filename, './files/file_b'),
(inputs['file_x'].uuid, inputs['file_x'].filename, './files/file_x'),
(inputs['file_y'].uuid, inputs['file_y'].filename, './files/file_y'),
(inputs['folder'].uuid, None, '.'),
]

with LocalTransport() as transport:
execmanager.upload_calculation(node, transport, calc_info, fixture_sandbox)

# Check that none of the files were written to the repository of the calculation node, since they were communicated
# through the ``local_copy_list``.
assert node.list_object_names() == []

# Now check that all contents were successfully written to the sandbox
written_hierarchy = serialize_file_hierarchy(pathlib.Path(fixture_sandbox.abspath))
expected_hierarchy = file_hierarchy
expected_hierarchy['files'] = {}
expected_hierarchy['files']['file_x'] = 'content_x'
expected_hierarchy['files']['file_y'] = 'content_y'
assert expected_hierarchy == written_hierarchy

0 comments on commit a295616

Please sign in to comment.