Skip to content

Commit

Permalink
Merge pull request #712 from ImperialCollegeLondon/711-the-dataload_d…
Browse files Browse the repository at this point in the history
…ata_config-and-loaders-open-and-close-each-file-for-each-variable

Make data loading more efficient by opening each source file once only
  • Loading branch information
davidorme authored Feb 4, 2025
2 parents e5fe458 + c74e4b3 commit a47852b
Show file tree
Hide file tree
Showing 6 changed files with 235 additions and 145 deletions.
18 changes: 13 additions & 5 deletions docs/source/using_the_ve/data/data.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,16 @@ two methods:
{class}`~virtual_ecosystem.core.data.Data` instance just using the standard
dictionary assignment: ``data['var_name'] = data_array``. The Virtual Ecosystem
{mod}`~virtual_ecosystem.core.readers` module provides the
function {func}`~virtual_ecosystem.core.readers.load_to_dataarray` to read data into
a DataArray from supported file formats. This can then be added directly to a Data
instance:
function {func}`~virtual_ecosystem.core.readers.load_to_dataarray` to read a list of
variables from a file in a supported format into DataArrays. The returned value
is a dictionary of DataArrays keyed by variable name, and each array can then be
added directly to a Data instance:

```{code-block} ipython3
data["var_name"] = load_to_dataarray("path/to/file.nc", var_name="temperature")
loaded_data = load_to_dataarray("path/to/file.nc", var_names=["temperature"])
# iterate over the dictionary of variable names and arrays
for var_name, data_array in loaded_data.items():
data[var_name] = data_array
```

1. The {meth}`~virtual_ecosystem.core.data.Data.load_data_config` method takes a
Expand Down Expand Up @@ -186,7 +190,11 @@ configured grid.
```{code-cell} ipython3
# Load data from a file
file_path = Path("../../data/xy_dim.nc")
data["temp"] = load_to_dataarray(file_path, var_name="temp")
loaded_data = load_to_dataarray(file_path, var_names=["temp"])
# iterate over the dictionary of variable names and arrays
for var_name, data_array in loaded_data.items():
data[var_name] = data_array
```

```{code-cell} ipython3
Expand Down
65 changes: 32 additions & 33 deletions tests/core/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,27 +229,27 @@ def test_Data_contains(fixture_data, var_name, expected):


@pytest.mark.parametrize(
argnames=["name", "exp_log"],
argnames=["var_names", "exp_log"],
argvalues=[
pytest.param(
"temp",
["temp"],
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
),
id="simple_load",
),
pytest.param(
"elev",
["elev"],
(
(INFO, "Loading variable 'elev' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Replacing data array for 'elev'"),
),
id="load_and_replace",
),
],
)
def test_Data_load_to_dataarray_naming(caplog, shared_datadir, name, exp_log):
def test_Data_load_to_dataarray_naming(caplog, shared_datadir, var_names, exp_log):
"""Test the coding of the name handling and replacement."""

# Setup a Data instance to match the example files generated in tests/core/data
Expand All @@ -258,6 +258,8 @@ def test_Data_load_to_dataarray_naming(caplog, shared_datadir, name, exp_log):
from virtual_ecosystem.core.grid import Grid
from virtual_ecosystem.core.readers import load_to_dataarray

caplog.clear()

grid = Grid(
grid_type="square",
cell_nx=10,
Expand All @@ -275,11 +277,14 @@ def test_Data_load_to_dataarray_naming(caplog, shared_datadir, name, exp_log):
# Load the data from file
datafile = shared_datadir / "cellid_coords.nc"

data[name] = load_to_dataarray(file=datafile, var_name=name)
results = load_to_dataarray(file=datafile, var_names=var_names)
for ky, val in results.items():
data[ky] = val

# Check the naming has worked and the data are loaded
assert name in data
assert data[name].sum() == (20 * 100)
for name in var_names:
# Check the naming has worked and the data are loaded
assert name in data
assert data[name].sum() == (20 * 100)

# Check the error reports
log_check(caplog, exp_log)
Expand Down Expand Up @@ -328,7 +333,7 @@ def fixture_load_data_grids(request):
does_not_raise(),
None,
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
),
20 * 100,
Expand All @@ -340,7 +345,7 @@ def fixture_load_data_grids(request):
pytest.raises(ValueError),
"Grid defines 100 cells, data provides 60",
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
(CRITICAL, "Grid defines 100 cells, data provides 60"),
),
Expand All @@ -353,7 +358,7 @@ def fixture_load_data_grids(request):
pytest.raises(ValueError),
"Grid defines 100 cells, data provides 200",
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
(CRITICAL, "Grid defines 100 cells, data provides 200"),
),
Expand All @@ -366,7 +371,7 @@ def fixture_load_data_grids(request):
does_not_raise(),
None,
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
),
20 * 100,
Expand All @@ -378,7 +383,7 @@ def fixture_load_data_grids(request):
pytest.raises(ValueError),
"The data cell ids do not provide a one-to-one map onto grid cell ids.",
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
(
CRITICAL,
Expand All @@ -395,7 +400,7 @@ def fixture_load_data_grids(request):
pytest.raises(ValueError),
"The data cell ids do not provide a one-to-one map onto grid cell ids.",
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
(
CRITICAL,
Expand All @@ -412,7 +417,7 @@ def fixture_load_data_grids(request):
does_not_raise(),
None,
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
),
20 * 100,
Expand All @@ -424,7 +429,7 @@ def fixture_load_data_grids(request):
pytest.raises(ValueError),
"Data XY dimensions do not match square grid",
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
(CRITICAL, "Data XY dimensions do not match square grid"),
),
Expand All @@ -437,7 +442,7 @@ def fixture_load_data_grids(request):
does_not_raise(),
None,
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
),
20 * 100,
Expand All @@ -449,7 +454,7 @@ def fixture_load_data_grids(request):
pytest.raises(ValueError),
"Mapped points do not cover all cells.",
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
(CRITICAL, "Mapped points do not cover all cells."),
),
Expand All @@ -462,7 +467,7 @@ def fixture_load_data_grids(request):
pytest.raises(ValueError),
"Mapped points fall outside grid.",
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
(CRITICAL, "Mapped points fall outside grid."),
),
Expand Down Expand Up @@ -499,6 +504,8 @@ def test_Data_load_to_dataarray_data_handling(
from virtual_ecosystem.core.data import Data
from virtual_ecosystem.core.readers import load_to_dataarray

caplog.clear()

# Skip combinations where the validator does not support this grid
if not (
("__any__" in supported_grids)
Expand All @@ -510,7 +517,8 @@ def test_Data_load_to_dataarray_data_handling(
datafile = shared_datadir / filename

with exp_error as err:
data["temp"] = load_to_dataarray(file=datafile, var_name="temp")
results = load_to_dataarray(file=datafile, var_names=["temp"])
data["temp"] = results["temp"]

# Check the data is in fact loaded and that a simple sum of values matches
assert "temp" in data
Expand All @@ -521,8 +529,6 @@ def test_Data_load_to_dataarray_data_handling(

log_check(caplog, exp_log)

return


@pytest.mark.parametrize(
argnames=["cfg_strings", "exp_error", "exp_msg", "exp_log"],
Expand All @@ -546,13 +552,10 @@ def test_Data_load_to_dataarray_data_handling(
None,
(
(INFO, "Loading data from configuration"),
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
(INFO, "Loading variable 'prec' from file:"),
(INFO, "Adding data array for 'prec'"),
(INFO, "Loading variable 'elev' from file:"),
(INFO, "Adding data array for 'elev'"),
(INFO, "Loading variable 'vapd' from file:"),
(INFO, "Adding data array for 'vapd'"),
),
id="valid config",
Expand Down Expand Up @@ -587,14 +590,10 @@ def test_Data_load_to_dataarray_data_handling(
(
(INFO, "Loading data from configuration"),
(ERROR, "Duplicate variable names in data configuration"),
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
(INFO, "Loading variable 'prec' from file:"),
(INFO, "Adding data array for 'prec'"),
(INFO, "Loading variable 'elev' from file:"),
(INFO, "Adding data array for 'elev'"),
(INFO, "Loading variable 'elev' from file:"),
(INFO, "Replacing data array for 'elev'"),
(CRITICAL, "Data configuration did not load cleanly - check log"),
),
id="repeated names",
Expand Down
Loading

0 comments on commit a47852b

Please sign in to comment.