Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

After xu.concat(): selection and depth conversion issues #80

Closed
Sonia-Heye opened this issue Apr 25, 2023 · 4 comments
Closed

After xu.concat(): selection and depth conversion issues #80

Sonia-Heye opened this issue Apr 25, 2023 · 4 comments

Comments

@Sonia-Heye
Copy link

Sonia-Heye commented Apr 25, 2023

Hey,
I've found an issue when using concatenated datasets, following xu.concat().
Firstly, my ds.ugrid.sel() function gives me an index error with the concatenated data.
Secondly, my depth conversion from sigma to meters (dfmt.get_Dataset_atdepths()) only works before the data is concatenated - and also does not work on the point-selected output from the first issue.
An example: P:\11208479-sequestration-seaweed\Oosterschelde_DFM_hydro_waq\dflowfm3d-oosterschelde-wq\scripts\issue_minimal_working_example.ipynb

Or:

import os
from pathlib import Path
import geopandas
import pandas as pd
import xugrid as xu
import dfm_tools as dfmt
from shapely.geometry import Point

## Reading output from 2 years, for 3 partitions

basedir = r"p:\archivedprojects\11206044-002ospareutrophication\final_results_Snellius_latestversion_20220210"
f1 = [os.path.join(basedir, r'A16b_Numtopsiguniform1_2015_pH\DFM_OUTPUT_DCSM-FM_0_5nm_waq',f'DCSM-FM_0_5nm_waq_{i:04d}_map.nc') for i in [146,147,148]]
f2 = [os.path.join(basedir, r'A16b_Numtopsiguniform1_2016_pH\DFM_OUTPUT_DCSM-FM_0_5nm_waq',f'DCSM-FM_0_5nm_waq_{i:04d}_map.nc') for i in [146,147,148]]

waq_xr1 = dfmt.open_partitioned_dataset(f1)       # opens every .map file in this folder
waq_xr2 = dfmt.open_partitioned_dataset(f2)

waq_xr1 = dfmt.rename_waqvars(waq_xr1)             # renames the water quality variables
waq_xr2 = dfmt.rename_waqvars(waq_xr2)

waq_xr = xu.concat([waq_xr1, waq_xr2], dim='time') # concatenate the two years


list_plifiles = [Path(r'p:\11208479-sequestration-seaweed\Oosterschelde_DFM_hydro_waq\dflowfm3d-oosterschelde-wq\boundary_conditions\zee-zuid.pli')]


## Loop over each boundary
# BUGFIX: the original `for i, file in enumerate(...)` re-used `i` as the
# inner point-loop index, shadowing the outer index, and then indexed
# `list_plifiles[i]` instead of using the loop variable directly.
for pli_path in list_plifiles:
    print(pli_path.stem)  # boundary name without the .pli extension

    # Read points within this offshore boundary:
    pli_file = pd.read_csv(pli_path, skiprows=1, sep=' ')   # read pli file
    pli_file = pli_file.dropna(axis=1)                      # drop empty columns

    # Build shapely points by zipping over the x/y columns once, instead of
    # calling .to_list()[i] twice per row (quadratic in the original).
    xcol, ycol, loccol = pli_file.columns[:3]
    points = [Point(x, y) for x, y in zip(pli_file[xcol], pli_file[ycol])]
    d = {'location': pli_file[loccol].to_list(), 'geometry': points}

    # Convert dutch crs to decimal degrees (for DCSM)
    gdf = geopandas.GeoDataFrame(d, crs='epsg:28992')   # set Dutch crs
    gdf = gdf.to_crs(4326)                              # convert to decimal degrees


point = 0  # index of the boundary point used in the selections below


## Issue when using .sel from concatenated ds: 
waq_xr.ugrid.sel(x=gdf.geometry[point].x, y=gdf.geometry[point].y)

## Note: no issue with pre-concatenated data
ds_point = waq_xr1.ugrid.sel(x=gdf.geometry[point].x, y=gdf.geometry[point].y)

## Issue with depth conversion from sigma to meters:
dfmt.get_Dataset_atdepths(data_xr=ds_point, depths = waq_xr.mesh2d_layer_z[0,:].values)  ## using point selected from above

dfmt.get_Dataset_atdepths(data_xr=waq_xr, depths = waq_xr.mesh2d_layer_z[0,:].values)  # Also issue when using concat data

## But no issue when using 1 year (pre-concat and pre-point-selection):
dfmt.get_Dataset_atdepths(data_xr=waq_xr1, depths=waq_xr.mesh2d_layer_z[0,:].values.tolist(), reference='waterlevel')
@Sonia-Heye
Copy link
Author

Sonia-Heye commented Apr 25, 2023

Three MWE's including corresponding errors:

# --- MWE 1: concat with plain xarray, then wrap in a UgridDataset ---------
# NOTE(review): assumes `import xarray as xr` and `import xugrid as xu` were
# executed earlier in the session; this snippet does not import them itself.
#xr.open_dataset, xr.concat, xu.UgridDataset
chunks = {'time':1}  # lazy opening, one chunk per timestep
y1p1 = xr.open_dataset(r'p:\\archivedprojects\\11206044-002ospareutrophication\\final_results_Snellius_latestversion_20220210\\A16b_Numtopsiguniform1_2015_pH\\DFM_OUTPUT_DCSM-FM_0_5nm_waq\\DCSM-FM_0_5nm_waq_0146_map.nc',chunks=chunks)
y2p1 = xr.open_dataset(r'p:\\archivedprojects\\11206044-002ospareutrophication\\final_results_Snellius_latestversion_20220210\\A16b_Numtopsiguniform1_2016_pH\\DFM_OUTPUT_DCSM-FM_0_5nm_waq\\DCSM-FM_0_5nm_waq_0146_map.nc',chunks=chunks)
# y1p2/y2p2 are opened here but only used from MWE 2 onwards.
y1p2 = xr.open_dataset(r'p:\\archivedprojects\\11206044-002ospareutrophication\\final_results_Snellius_latestversion_20220210\\A16b_Numtopsiguniform1_2015_pH\\DFM_OUTPUT_DCSM-FM_0_5nm_waq\\DCSM-FM_0_5nm_waq_0147_map.nc',chunks=chunks)
y2p2 = xr.open_dataset(r'p:\\archivedprojects\\11206044-002ospareutrophication\\final_results_Snellius_latestversion_20220210\\A16b_Numtopsiguniform1_2016_pH\\DFM_OUTPUT_DCSM-FM_0_5nm_waq\\DCSM-FM_0_5nm_waq_0147_map.nc',chunks=chunks)
p1_xr = xr.concat([y1p1,y2p1], dim='time')  # concatenate both years of partition 0146
p1_xu = xu.UgridDataset(p1_xr)  # wrapping the concatenated dataset is what fails
#raises "IndexError: boolean index did not match indexed array along dimension 1; dimension is 3094 but corresponding boolean dimension is 4"
# --- MWE 2: xu.concat per partition, then merge the partitions ------------
#xu.open_dataset, xu.concat, xu.merge_partitions
chunks = {'time':1}
y1p1 = xu.open_dataset(r'p:\\archivedprojects\\11206044-002ospareutrophication\\final_results_Snellius_latestversion_20220210\\A16b_Numtopsiguniform1_2015_pH\\DFM_OUTPUT_DCSM-FM_0_5nm_waq\\DCSM-FM_0_5nm_waq_0146_map.nc',chunks=chunks)
y2p1 = xu.open_dataset(r'p:\\archivedprojects\\11206044-002ospareutrophication\\final_results_Snellius_latestversion_20220210\\A16b_Numtopsiguniform1_2016_pH\\DFM_OUTPUT_DCSM-FM_0_5nm_waq\\DCSM-FM_0_5nm_waq_0146_map.nc',chunks=chunks)
y1p2 = xu.open_dataset(r'p:\\archivedprojects\\11206044-002ospareutrophication\\final_results_Snellius_latestversion_20220210\\A16b_Numtopsiguniform1_2015_pH\\DFM_OUTPUT_DCSM-FM_0_5nm_waq\\DCSM-FM_0_5nm_waq_0147_map.nc',chunks=chunks)
y2p2 = xu.open_dataset(r'p:\\archivedprojects\\11206044-002ospareutrophication\\final_results_Snellius_latestversion_20220210\\A16b_Numtopsiguniform1_2016_pH\\DFM_OUTPUT_DCSM-FM_0_5nm_waq\\DCSM-FM_0_5nm_waq_0147_map.nc',chunks=chunks)
p1_xu = xu.concat([y1p1,y2p1], dim='time')  # both years of partition 0146
p2_xu = xu.concat([y1p2,y2p2], dim='time')  # both years of partition 0147
y1_xu = xu.merge_partitions([y1p1,y1p2]) #works
all_xu = xu.merge_partitions([p1_xu,p2_xu]) #fails
#raises "ValueError: Expected 2 UGRID topologies for 2 partitions, received: defaultdict(<class 'list'>, {'mesh2d': [<xarray.Dataset> [...]"
# --- MWE 3: merge partitions per year first, then xu.concat the years -----
#xu.open_dataset, xu.merge_partitions, xu.concat, xu.ugrid.sel
chunks = {'time':1}
y1p1 = xu.open_dataset(r'p:\\archivedprojects\\11206044-002ospareutrophication\\final_results_Snellius_latestversion_20220210\\A16b_Numtopsiguniform1_2015_pH\\DFM_OUTPUT_DCSM-FM_0_5nm_waq\\DCSM-FM_0_5nm_waq_0146_map.nc',chunks=chunks)
y2p1 = xu.open_dataset(r'p:\\archivedprojects\\11206044-002ospareutrophication\\final_results_Snellius_latestversion_20220210\\A16b_Numtopsiguniform1_2016_pH\\DFM_OUTPUT_DCSM-FM_0_5nm_waq\\DCSM-FM_0_5nm_waq_0146_map.nc',chunks=chunks)
y1p2 = xu.open_dataset(r'p:\\archivedprojects\\11206044-002ospareutrophication\\final_results_Snellius_latestversion_20220210\\A16b_Numtopsiguniform1_2015_pH\\DFM_OUTPUT_DCSM-FM_0_5nm_waq\\DCSM-FM_0_5nm_waq_0147_map.nc',chunks=chunks)
y2p2 = xu.open_dataset(r'p:\\archivedprojects\\11206044-002ospareutrophication\\final_results_Snellius_latestversion_20220210\\A16b_Numtopsiguniform1_2016_pH\\DFM_OUTPUT_DCSM-FM_0_5nm_waq\\DCSM-FM_0_5nm_waq_0147_map.nc',chunks=chunks)
y1_xu = xu.merge_partitions([y1p1,y1p2])  # 2015, both partitions
y2_xu = xu.merge_partitions([y2p1,y2p2])  # 2016, both partitions
all_xu = xu.concat([y1_xu,y2_xu], dim='time')
all_xu_sel = all_xu.ugrid.sel(x=3.5051096780337727, y=51.56855173004132)  # point selection fails
#raises "IndexError: index 4851 is out of bounds for axis 0 with size 1"

This one does work, with xu.open_mfdataset instead of xu.concat:

# Working alternative: open each partition's yearly files together with
# xu.open_mfdataset (instead of opening them separately and concatenating),
# then merge the partitions; .ugrid.sel on the result succeeds.
# NOTE(review): assumes `import xugrid as xu` was executed earlier.
#xu.open_mfdataset, xu.merge_partitions, .ugrid.sel
chunks = {'time':1}  # lazy opening, one chunk per timestep
files_p1 = [r'p:\\archivedprojects\\11206044-002ospareutrophication\\final_results_Snellius_latestversion_20220210\\A16b_Numtopsiguniform1_2015_pH\\DFM_OUTPUT_DCSM-FM_0_5nm_waq\\DCSM-FM_0_5nm_waq_0146_map.nc',r'p:\\archivedprojects\\11206044-002ospareutrophication\\final_results_Snellius_latestversion_20220210\\A16b_Numtopsiguniform1_2016_pH\\DFM_OUTPUT_DCSM-FM_0_5nm_waq\\DCSM-FM_0_5nm_waq_0146_map.nc']
files_p2 = [r'p:\\archivedprojects\\11206044-002ospareutrophication\\final_results_Snellius_latestversion_20220210\\A16b_Numtopsiguniform1_2015_pH\\DFM_OUTPUT_DCSM-FM_0_5nm_waq\\DCSM-FM_0_5nm_waq_0147_map.nc',r'p:\\archivedprojects\\11206044-002ospareutrophication\\final_results_Snellius_latestversion_20220210\\A16b_Numtopsiguniform1_2016_pH\\DFM_OUTPUT_DCSM-FM_0_5nm_waq\\DCSM-FM_0_5nm_waq_0147_map.nc']
all_p1 = xu.open_mfdataset(files_p1,chunks=chunks)  # partition 0146, both years
all_p2 = xu.open_mfdataset(files_p2,chunks=chunks)  # partition 0147, both years
all_xu = xu.merge_partitions([all_p1,all_p2])
all_xu_sel = all_xu.ugrid.sel(x=3.5051096780337727, y=51.56855173004132)

@veenstrajelmer
Copy link
Collaborator

I ran the code in the issue description again. Most of the lines work without errors. However, there is still one issue left. When interpolating the merged dataset (waq_xr) to depths, it raises "ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()". This is because the mesh2d_interface_z variable (and possibly others) has an additional time dimension. Reproducible code:

import os
import xugrid as xu
import dfm_tools as dfmt

# Two model years, partitions 146-148 of each year.
basedir = r"p:\archivedprojects\11206044-002ospareutrophication\final_results_Snellius_latestversion_20220210"
partitions = [146, 147, 148]
f1 = []
f2 = []
for part in partitions:
    fname = f'DCSM-FM_0_5nm_waq_{part:04d}_map.nc'
    f1.append(os.path.join(basedir, r'A16b_Numtopsiguniform1_2015_pH\DFM_OUTPUT_DCSM-FM_0_5nm_waq', fname))
    f2.append(os.path.join(basedir, r'A16b_Numtopsiguniform1_2016_pH\DFM_OUTPUT_DCSM-FM_0_5nm_waq', fname))

# Open the partitioned map files of each year as one dataset.
waq_xr1 = dfmt.open_partitioned_dataset(f1)
waq_xr2 = dfmt.open_partitioned_dataset(f2)

# Rename the water-quality variables.
waq_xr1 = dfmt.rename_waqvars(waq_xr1)
waq_xr2 = dfmt.rename_waqvars(waq_xr2)

# Concatenate both years along the time dimension.
waq_xr = xu.concat([waq_xr1, waq_xr2], dim='time')

# this raises "ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()"
# dfmt.get_Dataset_atdepths(data_xr=waq_xr, depths = waq_xr.mesh2d_layer_z[0,:].values)

# After concat, mesh2d_interface_z unexpectedly gains a time dimension.
print(waq_xr2['mesh2d_interface_z'].dims) # ('mesh2d_nInterfaces',)
print(waq_xr['mesh2d_interface_z'].dims) # ('time', 'mesh2d_nInterfaces')

Prints:

('mesh2d_nInterfaces',)
('time', 'mesh2d_nInterfaces')

Is this a bug or should one use a different approach to concatenate the datasets?

@Huite
Copy link
Collaborator

Huite commented Sep 5, 2024

I think it needs a different option for one of the keywords:

xarray.concat

@veenstrajelmer
Copy link
Collaborator

veenstrajelmer commented Sep 5, 2024

Thanks, you are right, I missed data_vars='minimal'. This avoids the time dimension being broadcast onto variables that do not have it.

This code should do all the intended things and gives no errors, at least not with the current xugrid release (0.12.0):

import os
import geopandas
import pandas as pd
import xugrid as xu
import dfm_tools as dfmt
from shapely.geometry import Point

## Reading output from 2 years, for 3 partitions
basedir = r"p:\archivedprojects\11206044-002ospareutrophication\final_results_Snellius_latestversion_20220210"
f1 = [os.path.join(basedir, r'A16b_Numtopsiguniform1_2015_pH\DFM_OUTPUT_DCSM-FM_0_5nm_waq',f'DCSM-FM_0_5nm_waq_{i:04d}_map.nc') for i in [146,147,148]]
f2 = [os.path.join(basedir, r'A16b_Numtopsiguniform1_2016_pH\DFM_OUTPUT_DCSM-FM_0_5nm_waq',f'DCSM-FM_0_5nm_waq_{i:04d}_map.nc') for i in [146,147,148]]

# opens every .map file in this folder
waq_xr1 = dfmt.open_partitioned_dataset(f1)
waq_xr2 = dfmt.open_partitioned_dataset(f2)

# rename the water quality variables
waq_xr1 = dfmt.rename_waqvars(waq_xr1)
waq_xr2 = dfmt.rename_waqvars(waq_xr2)

# concatenate the two years
waq_xr = xu.concat([waq_xr1, waq_xr2], 
                   dim='time', # dimension along which to concatenate
                   data_vars='minimal' # to avoid expanding of non-time variables with time dimension
                   )

file_pli = r'p:\11208479-sequestration-seaweed\Oosterschelde_DFM_hydro_waq\dflowfm3d-oosterschelde-wq\boundary_conditions\zee-zuid.pli'

# Read points within this offshore boundary:
pli_file = pd.read_csv(file_pli, skiprows=1, sep=' ')  # read pli file
pli_file = pli_file.dropna(axis=1)  # drop empty columns

# Build shapely points by zipping over the x/y columns once, instead of
# calling .to_list()[i] twice per row (quadratic in the original).
xcol, ycol, loccol = pli_file.columns[:3]
points = [Point(x, y) for x, y in zip(pli_file[xcol], pli_file[ycol])]
d = {'location': pli_file[loccol].to_list(), 'geometry': points}

# Convert dutch crs to decimal degrees (for DCSM)
gdf = geopandas.GeoDataFrame(d, crs='epsg:28992')  # set Dutch crs
gdf = gdf.to_crs(4326)  # convert to decimal degrees

point = 0  # index of the boundary point to select

# select the dataset at one boundary point
ds_point = waq_xr.ugrid.sel(x=gdf.geometry[point].x, y=gdf.geometry[point].y)

# interpolate from sigma layers to fixed depths, for the point and full datasets
ds_point_atdepths = dfmt.get_Dataset_atdepths(data_xr=ds_point, depths=waq_xr.mesh2d_layer_z[0].values)

waq_xr_atdepths = dfmt.get_Dataset_atdepths(data_xr=waq_xr, depths=waq_xr.mesh2d_layer_z[0].values)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

3 participants