Merge pull request #80 from cerfacs-globc/feature/user-indices

Feature/user indices
cerfacs-globc · Aug 11, 2021 · d548eb9 · d548eb9
2 parents cb61cd7 + 1fc699b
commit d548eb9
Show file tree

Hide file tree

Showing 21 changed files with 1,797 additions and 271 deletions.
diff --git a/.gitignore b/.gitignore
@@ -74,4 +74,6 @@ PET0.ESMF_LogFile
 /.project
 /.pydevproject
 
-.vscode
+.vscode
+
+.idea
diff --git a/icclim/icclim_new.py b/icclim/icclim_new.py
@@ -1,30 +1,30 @@
 # -*- Coding: latin-1 -*-
 #  Copyright CERFACS (http://cerfacs.fr/)
 #  Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-#
-#  Author: Natalia Tatarinova
-#  Additions from Christian Page (2015-2017)
 
+import datetime
+from typing import Callable, List, Optional, Union
+
+import xarray
+import xclim.core.calendar as calendar
+from xarray.core.dataarray import DataArray
 from xarray.core.dataset import Dataset
+
 from icclim import indices
-from icclim.models.frequency import Frequency, SliceMode, build_frequency
-from icclim.util import logging_info
 from icclim.indices import IndiceConfig
-from xarray.core.dataarray import DataArray
-import xarray
-from typing import Callable, List, Union
-import datetime
-import xclim.core.calendar as calendar
+from icclim.models.frequency import Frequency, SliceMode, build_frequency
+from icclim.models.indice_config import CfVariable
+from icclim.user_indice.bridge import compute_user_indice
+from icclim.user_indice.user_indice import UserIndiceConfig
 
 
 def indice(
     in_files: Union[str, List[str]],
-    var_name: List[str],
+    var_name: Union[str, List[str]],
     indice_name: str = None,
     slice_mode: SliceMode = Frequency.YEAR,
     time_range: List[datetime.datetime] = None,
-    # TODO re-add default file name
-    out_file: str = None,
+    out_file: str = "icclim_out.nc",
     threshold: Union[float, List[float]] = None,
     N_lev: int = None,
     # TODO See if it still makes sense with xarray
@@ -40,153 +40,206 @@ def indice(
     ignore_Feb29th: bool = False,
     # TODO should be an enumeration
     interpolation: str = "linear",
-    # TODO probably unecessary with xclim unit handling
+    # TODO probably unnecessary with xclim unit handling, but necessary for user-indices
     out_unit: str = "days",
     # TODO use an enum and re-add default value
     netcdf_version=None,
-    # TODO see if we can make use of a more sophisticated type than dict
     user_indice: dict = None,
-    # TODO ease to extract from percentile_doy
     save_percentile: bool = False,
-):
+) -> Dataset:
     """
-    :param indice_name: Climate index name. 
-    :type indice_name: str    
-    
+    :param indice_name: Climate index name.
+    :type indice_name: str
+
     :param in_files: Absolute path(s) to NetCDF dataset(s) (including OPeNDAP URLs).
     :type in_files: str OR list of str OR list of lists
 
     :param var_name: Target variable name to process corresponding to ``in_files``.
     :type var_name: str OR list of str
-         
+
     :param slice_mode: Type of temporal aggregation: "year", "month", "DJF", "MAM", "JJA", "SON", "ONDJFM" or "AMJJAS". If ``None``, the index will be calculated as monthly values.
     :type slice_mode: str
-    
+
     :param time_range: Temporal range: upper and lower bounds for temporal subsetting. If ``None``, whole period of input files will be processed.
     :type time_range: [datetime.datetime, datetime.datetime]
 
     :param out_file: Output NetCDF file name (default: "icclim_out.nc" in the current directory).
     :type out_file: str
-       
+
     :param threshold: User defined threshold for certain indices.
     :type threshold: float or list of floats
-    
+
     :param N_lev: Level number if 4D variable.
     :type N_lev: int
 
     :param lev_dim_pos: Position of Level dimension, either 0 or 1. 0 is leftmost dimension, 1 is second to the leftmost. Default 1.
     :type lev_dim_pos: int
-    
+
     :param transfer_limit_Mbytes: Maximum OPeNDAP/THREDDS request limit in Mbytes in case of OPeNDAP datasets.
     :type transfer_limit_Mbytes: float
-    
-    :param callback: Progress bar printing. If ``None``, progress bar will not be printed. 
+
+    :param callback: Progress bar printing. If ``None``, progress bar will not be printed.
     :type callback: :func:`callback.defaultCallback`
-    
+
     :param callback_percentage_start_value: Initial value of percentage of the progress bar (default: 0).
     :type callback_percentage_start_value: int
-    
-    :param callback_percentage_total: Total persentage value (default: 100).   
+
+    :param callback_percentage_total: Total persentage value (default: 100).
     :type callback_percentage_total: int
 
-    :param base_period_time_range: Temporal range of the base period. 
+    :param base_period_time_range: Temporal range of the base period.
     :type base_period_time_range: [datetime.datetime, datetime.datetime]
-    
+
     :param window_width: Window width, must be odd (default: 5).
     :type window_width: int
-   
+
     :param only_leap_years: Option for February 29th (default: False).
     :type only_leap_years: bool
-   
+
     :param ignore_Feb29th: Ignoring or not February 29th (default: False).
     :type ignore_Feb29th: bool
-   
+
     :param interpolation: Interpolation method to compute percentile values: "linear" or "hyndman_fan" (default: "hyndman_fan").
     :type interpolation: str
-    
+
     :param out_unit: Output unit for certain indices: "days" or "%" (default: "days").
     :type out_unit: str
-    
+
     :param user_indice: A dictionary with parameters for user defined index
     :type user_indice: dict
-    
+
     :param netcdf_version: NetCDF version to create (default: "NETCDF3_CLASSIC").
     :type netcdf_version: str
-    
+
     :rtype: path to NetCDF file
 
     .. warning:: If ``out_file`` already exists, icclim will overwrite it!
 
     """
-
-    logging_info.start_message()
-    ds = xarray.open_mfdataset(in_files)
+    if isinstance(in_files, str):
+        ds = xarray.open_dataset(in_files)
+    else:
+        ds = xarray.open_mfdataset(in_files)
     config = IndiceConfig()
-    config.data_arrays = []
-    config.data_arrays_in_base = []
-    sampling_frequency = build_frequency(slice_mode)
+    config.freq = build_frequency(slice_mode)
     if isinstance(var_name, str):
         var_name = [var_name]
-    for cf_var in var_name:
-        da = build_data_array(ds[cf_var], time_range, ignore_Feb29th)
-        config.data_arrays.append(da)
-        if base_period_time_range is not None:
-            config.data_arrays_in_base.append(
-                build_in_base_da(ds[cf_var], base_period_time_range, only_leap_years)
-            )
-
-    config.freq = sampling_frequency.panda_freq
+    config.cf_variables = [
+        _build_cf_variable(
+            ds[cf_var_name],
+            time_range,
+            ignore_Feb29th,
+            base_period_time_range,
+            only_leap_years,
+        )
+        for cf_var_name in var_name
+    ]
     config.window = window_width
+    config.save_percentile = save_percentile
+    config.is_percent = out_unit == "%"
+    # TODO add attributes to dataset
+    if user_indice is not None:
+        result_ds = _build_user_indice_dataset(config, save_percentile, user_indice)
+    else:
+        result_ds = _build_indice_dataset(config, indice_name, threshold)
+    result_ds.to_netcdf(out_file)
+    return result_ds
+
+
+def _build_indice_dataset(config: IndiceConfig, indice_name: str, threshold) -> Dataset:
     result_ds = Dataset()
-    # TODO add global attributes to dataset
+    if indice_name is None:
+        raise Exception("indice_name must be provided.")  # user input error
     if isinstance(threshold, list):
         for th in threshold:
-            result_ds[f"{indice_name}_threshold_{th}"] = compute_indice(
-                indice_name, config, sampling_frequency, th,
-            )
+            # TODO: in v4 threshold was a dimension
+            da = _compute_indice(indice_name, config, th)
+            if config.freq.resampler is not None:
+                resampled_da, time_bounds = config.freq.resampler(da)
+                result_ds[f"{indice_name}_threshold_{th}"] = resampled_da
+                result_ds["time_bounds"] = time_bounds
+            else:
+                result_ds[f"{indice_name}_threshold_{th}"] = da
     else:
-        result_ds[indice_name] = compute_indice(
-            indice_name, config, sampling_frequency, threshold, indice_name
-        )
-    result_ds.to_netcdf(out_file)
+        da = _compute_indice(indice_name, config, threshold)
+        if config.freq.resampler is not None:
+            resampled_da, time_bounds = config.freq.resampler(da)
+            result_ds[indice_name] = resampled_da
+            result_ds["time_bounds"] = time_bounds
+        else:
+            result_ds[indice_name] = da
+    return result_ds
+
+
+def _build_user_indice_dataset(
+    config: IndiceConfig, save_percentile: bool, user_indice: dict
+) -> Dataset:
+    result_ds = Dataset()
+    user_indice_config: UserIndiceConfig = UserIndiceConfig(
+        **user_indice,
+        freq=config.freq,
+        cf_vars=config.cf_variables,
+        is_percent=config.is_percent,
+        save_percentile=save_percentile,
+    )
+    user_indice_da = compute_user_indice(user_indice_config)
+    if config.freq.resampler is not None:
+        user_indice_da, time_bounds = config.freq.resampler(user_indice_da)
+        result_ds[user_indice_config.indice_name] = user_indice_da
+        result_ds["time_bounds"] = time_bounds
+    else:
+        result_ds[user_indice_config.indice_name] = user_indice_da
+    return result_ds
 
 
-def compute_indice(indice_name, config, sampling_frequency, th):
-    config.threshold = to_celcius(th)
+def _compute_indice(indice_name: str, config: IndiceConfig, threshold: Optional[float]):
+    config.threshold = _to_celsius(threshold)
     da = indices.indice_from_string(indice_name).compute(config)
-    if sampling_frequency.resampler is not None:
-        da = sampling_frequency.resampler(da)
     return da
 
 
-def build_data_array(
-    da: DataArray, time_range: List[datetime.datetime], ignore_Feb29th: bool
+def _build_cf_variable(
+    da: DataArray,
+    time_range: Optional[List[datetime.datetime]],
+    ignore_Feb29th: bool,
+    base_period_time_range: Optional[List[datetime.datetime]],
+    only_leap_years: bool,
+) -> CfVariable:
+    cf_var = CfVariable(_build_data_array(da, time_range, ignore_Feb29th))
+    if base_period_time_range is not None:
+        cf_var.in_base_da = _build_in_base_da(
+            da, base_period_time_range, only_leap_years
+        )
+    return cf_var
+
+
+def _build_data_array(
+    da: DataArray, time_range: Optional[List[datetime.datetime]], ignore_Feb29th: bool
 ) -> DataArray:
-    if len(time_range) != 2:
-        raise Exception("Not a valid time range")
-    time_range = slice(time_range[0], time_range[1])
-    da = da.sel(time=time_range)
+    if time_range is not None:
+        if len(time_range) != 2:
+            raise Exception("Not a valid time range")
+        da = da.sel(time=slice(time_range[0], time_range[1]))
     if ignore_Feb29th:
-        da = calendar.convert_calendar(da, "noleap")
+        da = calendar.convert_calendar(da, "noleap")  # type:ignore
     return da
 
 
-def build_in_base_da(
+def _build_in_base_da(
     da: DataArray,
     base_period_time_range: List[datetime.datetime],
     only_leap_years: bool,
 ) -> DataArray:
     if len(base_period_time_range) != 2:
         raise Exception("Not a valid time range")
-    base_period_time_range = slice(base_period_time_range[0], base_period_time_range[1])
-    da = da.sel(time=base_period_time_range)
+    da = da.sel(time=slice(base_period_time_range[0], base_period_time_range[1]))
     if only_leap_years:
-        da = reduce_only_leap_years(da)
+        da = _reduce_only_leap_years(da)
     return da
 
 
-def reduce_only_leap_years(da: DataArray):
-    reduced_list = []
+def _reduce_only_leap_years(da: DataArray) -> DataArray:
+    reduced_list: List[DataArray] = []
     for _, val in da.groupby(da.time.dt.year):
         if val.time.dt.dayofyear.max() == 366:
             reduced_list.append(val)
@@ -197,8 +250,7 @@ def reduce_only_leap_years(da: DataArray):
     return xarray.concat(reduced_list, "time")
 
 
-def to_celcius(threshold: float) -> Union[None, str]:
+def _to_celsius(threshold: Optional[float]) -> Optional[str]:
     if threshold is not None:
         return f"{threshold} degC"
     return None
-
-Original file line number
+Diff line change
@@ Expand Up / @@ -74,4 +74,6 @@ PET0.ESMF_LogFile @@
     /.project
     /.pydevproject
-    .vscode
+    .vscode
+    .idea