Doc strings, type hints, code revisions (#34)

rileyhales · web-flow · commit cb6f080b473d · 2024-04-17T13:41:44.000-06:00
diff --git a/docs/api-documentation.rst b/docs/api-documentation.rst
@@ -9,4 +9,5 @@ API Documentation
    api-documentation/bias
    api-documentation/plots
    api-documentation/analyze
+   api-documentation/streams
    api-documentation/streamflow
diff --git a/docs/api-documentation/analyze.rst b/docs/api-documentation/analyze.rst
@@ -2,8 +2,6 @@
 geoglows.analyze
 ================
 
-Analyze
-~~~~~~~
 Functions which post process results from the streamflow data service into additional, useful products
 
 .. automodule:: geoglows.analyze
diff --git a/docs/api-documentation/streams.rst b/docs/api-documentation/streams.rst
@@ -0,0 +1,14 @@
+================
+geoglows.streams
+================
+
+The functions in this module look up metadata for rivers using a table of metadata about the GEOGLOWS model. This needs
+to be downloaded or it can be retrieved and cached by the metadata table function in the data module.
+
+If you download the table in advance, you can specify it with the PYGEOGLOWS_METADATA_TABLE_PATH environment variable
+which will be checked at runtime. If it is not set, you need to restart the runtime or use the download function to
+retrieve it.
+
+.. automodule:: geoglows.streams
+    :members:
+        river_to_vpu, latlon_to_river, river_to_latlon
diff --git a/docs/index.rst b/docs/index.rst
@@ -1,32 +1,25 @@
 geoglows
 ========
-.. image:: https://anaconda.org/geoglows/geoglows/badges/platforms.svg
+.. image:: https://anaconda.org/conda-forge/geoglows/badges/platforms.svg
         :target: https://anaconda.org/geoglows/geoglows
 .. image:: https://img.shields.io/pypi/v/geoglows
         :target: https://pypi.org/project/geoglows
-.. image:: https://anaconda.org/geoglows/geoglows/badges/latest_release_date.svg
+.. image:: https://anaconda.org/conda-forge/geoglows/badges/latest_release_date.svg
         :target: https://anaconda.org/geoglows/geoglows
 
 The geoglows Python package enables access to data, APIs, and code developed for the `GEOGLOWS Streamflow Model <https://geoglows.ecmwf.int>`_.
 Read more about GEOGLOWS at `<https://geoglows.org>`_
 
-Demos
-=====
-These links will be maintained to reference the most updated versions of the tutorials.
-The tutorials are GitHub Gists which you can copy and launch in a Google Collaboratory setting directly from the GitHub.
-
-- Retrieve & plot GEOGLOWS model data: `<https://gist.github.com/rileyhales/873896e426a5bd1c4e68120b286bc029>`_
-- Finding Stream ID #'s programmatically: `<https://gist.github.com/rileyhales/ad92d1fce3aa36ef5873f2f7c2632d31>`_
-- Bias Evaluation and Calibration at a point: `<https://gist.github.com/rileyhales/d5290e12b5858d59960d0898fbd0ed69>`_
-- Generate/Download High Res Plot Images: `<https://gist.github.com/rileyhales/9b5bbb0c5f307eb14b9f1ced39d641e4>`_
+For demos, tutorials, and other training materials for GEOGLOWS and the geoglows Python package, please visit
+`<https://data.geoglows.org>`_.
 
 About GEOGLOWS ECMWF Streamflow
 ===============================
-GEOGLOWS ECMWF Streamflow Project: This project provides access to the results of a hydrologic model that is run each
+GEOGLOWS ECMWF Streamflow Project: This project provides access to the results of a hydrological model that is run each
 day. The model is based on a group of unique weather forecasts, known as an ensemble, from ECMWF. Each unique
 precipitation forecast, known as an ensemble member, produces a unique streamflow forecast. There are 52 members of the
 ensemble that drives the model each day. The ERA-5 historical precipitation dataset is also used to produce a
-hindcasted streamflow on each river. `Read more here <https://geoglows.ecmwf.int>`_.
+retrospective streamflow on each river. `Read more here <https://geoglows.ecmwf.int>`_.
 
 .. toctree::
     :caption: Table of Contents
diff --git a/geoglows/__init__.py b/geoglows/__init__.py
@@ -12,6 +12,6 @@
     'bias', 'plots', 'data', 'analyze', 'streams', 'tables', 'streamflow',
     'METADATA_TABLE_PATH'
 ]
-__version__ = '1.2.0'
+__version__ = '1.2.1'
 __author__ = 'Riley Hales'
 __license__ = 'BSD 3-Clause Clear License'
diff --git a/geoglows/data.py b/geoglows/data.py
@@ -6,6 +6,7 @@
 import requests
 import s3fs
 import xarray as xr
+import numpy as np
 
 from ._constants import METADATA_TABLE_PATH
 from .analyze import (
@@ -85,7 +86,7 @@ def from_aws(*args, **kwargs):
         df = ds.to_dataframe().round(2).reset_index()
 
         # rename columns to match the REST API
-        if isinstance(river_id, int):
+        if isinstance(river_id, int) or isinstance(river_id, np.int64):
             df = df.pivot(index='time', columns='ensemble', values='Qout')
         else:
             df = df.pivot(index=['time', 'rivid'], columns='ensemble', values='Qout')
@@ -120,7 +121,7 @@ def from_rest(*args, **kwargs):
         endpoint = f'https://{endpoint}' if not endpoint.startswith(('https://', 'http://')) else endpoint
 
         version = kwargs.get('version', DEFAULT_REST_ENDPOINT_VERSION)
-        assert version in ('v1', 'v2', ), ValueError(f'Unrecognized model version parameter: {version}')
+        assert version in ('v2', ), ValueError(f'Unrecognized model version parameter: {version}')
 
         product_name = function.__name__.replace("_", "").lower()
 
@@ -131,7 +132,7 @@ def from_rest(*args, **kwargs):
                              'Use data_source="aws" and version="v2" for multiple river_ids.')
         river_id = int(river_id) if river_id else None
         if river_id and version == 'v2':
-            assert river_id < 1_000_000_000 and river_id >= 110_000_000, ValueError('River ID must be a 9 digit integer')
+            assert 1_000_000_000 > river_id >= 110_000_000, ValueError('River ID must be a 9 digit integer')
 
         # request parameter validation before submitting
         for key in ('endpoint', 'version', 'river_id'):
@@ -178,8 +179,7 @@ def main(*args, **kwargs):
         assert source in ('rest', 'aws'), ValueError(f'Unrecognized data source requested: {source}')
         if source == 'rest':
             return from_rest(*args, **kwargs)
-        else:
-            return from_aws(*args, **kwargs)
+        return from_aws(*args, **kwargs)
     main.__doc__ = function.__doc__  # necessary for code documentation auto generators
     return main
 
@@ -191,7 +191,7 @@ def dates(**kwargs) -> dict or str:
     Gets a list of available forecast product dates
 
     Keyword Args:
-        data_source: location to query for data, either 'rest' or 'aws'. default is aws.
+        data_source (str): location to query for data, either 'rest' or 'aws'. default is aws.
 
     Returns:
         dict or str
@@ -204,14 +204,14 @@ def dates(**kwargs) -> dict or str:
 
 @_forecast_endpoint_decorator
 def forecast(*, river_id: int, date: str, format: str, data_source: str,
-             **kwargs) -> pd.DataFrame or dict or str:
+             **kwargs) -> pd.DataFrame or xr.Dataset:
     """
     Gets the average forecasted flow for a certain river_id on a certain date
 
     Keyword Args:
-        river_id (str): the ID of a stream, should be a 9 digit integer
+        river_id (int): the ID of a stream, should be a 9 digit integer
         date (str): a string specifying the date to request in YYYYMMDD format, returns the latest available if not specified
-        format (str): csv, json, or url, default csv
+        format: if data_source=="rest": csv, json, or url, default csv. if data_source=="aws": df or xarray
         data_source (str): location to query for data, either 'rest' or 'aws'. default is aws.
 
     Returns:
@@ -222,16 +222,16 @@ def forecast(*, river_id: int, date: str, format: str, data_source: str,
 
 @_forecast_endpoint_decorator
 def forecast_stats(*, river_id: int, date: str, format: str, data_source: str,
-                   **kwargs) -> pd.DataFrame or dict or str:
+                   **kwargs) -> pd.DataFrame or xr.Dataset:
     """
     Retrieves the min, 25%, mean, median, 75%, and max river discharge of the 51 ensembles members for a river_id
     The 52nd higher resolution member is excluded
 
     Keyword Args:
-        river_id: the ID of a stream, should be a 9 digit integer
-        date: a string specifying the date to request in YYYYMMDD format, returns the latest available if not specified
-        format: if data_source=="rest": csv, json, or url, default csv. if data_source=="aws": df or xarray
-        data_source: location to query for data, either 'rest' or 'aws'. default is aws.
+        river_id (int): the ID of a stream, should be a 9 digit integer
+        date (str): a string specifying the date to request in YYYYMMDD format, returns the latest available if not specified
+        format (str): if data_source=="rest": csv, json, or url, default csv. if data_source=="aws": df or xarray
+        data_source (str): location to query for data, either 'rest' or 'aws'. default is aws.
 
     Returns:
         pd.DataFrame or dict or str
@@ -241,15 +241,15 @@ def forecast_stats(*, river_id: int, date: str, format: str, data_source: str,
 
 @_forecast_endpoint_decorator
 def forecast_ensembles(*, river_id: int, date: str, format: str, data_source: str,
-                       **kwargs) -> pd.DataFrame or dict or str:
+                       **kwargs) -> pd.DataFrame or xr.Dataset:
     """
     Retrieves each of 52 time series of forecasted discharge for a river_id on a certain date
 
     Keyword Args:
-        river_id: the ID of a stream, should be a 9 digit integer
-        date: a string specifying the date to request in YYYYMMDD format, returns the latest available if not specified
-        format: if data_source=="rest": csv, json, or url, default csv. if data_source=="aws": df or xarray
-        data_source: location to query for data, either 'rest' or 'aws'. default is aws.
+        river_id (int): the ID of a stream, should be a 9 digit integer
+        date (str): a string specifying the date to request in YYYYMMDD format, returns the latest available if not specified
+        format (str): if data_source=="rest": csv, json, or url, default csv. if data_source=="aws": df or xarray
+        data_source (str): location to query for data, either 'rest' or 'aws'. default is aws.
 
     Returns:
         pd.DataFrame or dict or str
@@ -258,17 +258,16 @@ def forecast_ensembles(*, river_id: int, date: str, format: str, data_source: st
 
 
 @_forecast_endpoint_decorator
-def forecast_records(*, river_id: int, start_date: str, end_date: str, format: str, data_source: str,
+def forecast_records(*, river_id: int, start_date: str, end_date: str, format: str,
                      **kwargs) -> pd.DataFrame or dict or str:
     """
     Retrieves a csv showing the ensemble average forecasted flow for the year from January 1 to the current date
 
     Keyword Args:
-        river_id: the ID of a stream, should be a 9 digit integer
-        start_date: a YYYYMMDD string giving the earliest date this year to include, defaults to 14 days ago.
-        end_date: a YYYYMMDD string giving the latest date this year to include, defaults to latest available
-        data_source: location to query for data, either 'rest' or 'aws'. default is aws.
-        format: if data_source=="rest": csv, json, or url, default csv. if data_source=="aws": df or xarray
+        river_id (int): the ID of a stream, should be a 9 digit integer
+        start_date (str): a YYYYMMDD string giving the earliest date this year to include, defaults to 14 days ago.
+        end_date (str): a YYYYMMDD string giving the latest date this year to include, defaults to latest available
+        format (str): csv, json, or url, default csv.
 
     Returns:
         pd.DataFrame or dict or str
@@ -280,11 +279,11 @@ def forecast_records(*, river_id: int, start_date: str, end_date: str, format: s
 def retrospective(river_id: int or list, format: str = 'df') -> pd.DataFrame or xr.Dataset:
     """
     Retrieves the retrospective simulation of streamflow for a given river_id from the
-    AWS Open Data Program GEOGloWS V2 S3 bucket
+    AWS Open Data Program GEOGLOWS V2 S3 bucket
 
     Args:
-        river_id: the ID of a stream, should be a 9 digit integer
-        format: the format to return the data, either 'df' or 'xarray'. default is 'df'
+        river_id (int): the ID of a stream, should be a 9 digit integer
+        format (str): the format to return the data, either 'df' or 'xarray'. default is 'df'
 
     Returns:
         pd.DataFrame
@@ -302,12 +301,12 @@ def historical(*args, **kwargs):
     return retrospective(*args, **kwargs)
 
 
-def daily_averages(river_id: int or list) -> pd.DataFrame or xr.Dataset:
+def daily_averages(river_id: int or list) -> pd.DataFrame:
     """
     Retrieves daily average streamflow for a given river_id
 
     Args:
-        river_id: the ID of a stream, should be a 9 digit integer
+        river_id (int): the ID of a stream, should be a 9 digit integer
 
     Returns:
         pd.DataFrame
@@ -321,7 +320,7 @@ def monthly_averages(river_id: int or list) -> pd.DataFrame:
     Retrieves monthly average streamflow for a given river_id
 
     Args:
-        river_id: the ID of a stream, should be a 9 digit integer
+        river_id (int): the ID of a stream, should be a 9 digit integer
 
     Returns:
         pd.DataFrame
@@ -335,7 +334,7 @@ def annual_averages(river_id: int or list) -> pd.DataFrame:
     Retrieves annual average streamflow for a given river_id
 
     Args:
-        river_id: the ID of a stream, should be a 9 digit integer
+        river_id (int): the ID of a stream, should be a 9 digit integer
 
     Returns:
         pd.DataFrame
@@ -344,13 +343,13 @@ def annual_averages(river_id: int or list) -> pd.DataFrame:
     return calc_annual_averages(df)
 
 
-def return_periods(river_id: int or list, format: str = 'df') -> pd.DataFrame:
+def return_periods(river_id: int or list, format: str = 'df') -> pd.DataFrame or xr.Dataset:
     """
     Retrieves the return period thresholds based on a specified historic simulation forcing on a certain river_id.
 
     Args:
-        river_id: the ID of a stream, should be a 9 digit integer
-        format: the format to return the data, either 'df' or 'xarray'. default is 'df'
+        river_id (int): the ID of a stream, should be a 9 digit integer
+        format (str): the format to return the data, either 'df' or 'xarray'. default is 'df'
 
     Returns:
         pd.DataFrame
@@ -369,7 +368,7 @@ def metadata_tables(columns: list = None) -> pd.DataFrame:
     """
     Retrieves the master table of rivers metadata and properties as a pandas DataFrame
     Args:
-        columns: optional subset of columns names to read from the parquet
+        columns (list): optional subset of columns names to read from the parquet
 
     Returns:
         pd.DataFrame
@@ -379,6 +378,7 @@ def metadata_tables(columns: list = None) -> pd.DataFrame:
     warn = f"""
     Local copy of geoglows v2 metadata table not found. You should download a copy for optimal performance and 
     to make the data available when you are offline. A copy of the table will be cached at {METADATA_TABLE_PATH}.
+    Alternatively, set the environment variable PYGEOGLOWS_METADATA_TABLE_PATH to the path of the table.
     """
     warnings.warn(warn)
     df = pd.read_parquet('https://geoglows-v2.s3-website-us-west-2.amazonaws.com/tables/package-metadata-table.parquet')
diff --git a/geoglows/streams.py b/geoglows/streams.py
@@ -5,7 +5,16 @@
 __all__ = ['river_to_vpu', 'latlon_to_river', 'river_to_latlon', ]
 
 
-def river_to_vpu(river_id: int) -> str or int:
+def river_to_vpu(river_id: int) -> int:
+    """
+    Gives the VPU number for a given River ID number
+
+    Args:
+        river_id (int): a 9 digit integer that is a valid GEOGLOWS River ID number
+
+    Returns:
+        int: a 3 digit integer that is the VPU number for the given River ID number
+    """
     return (
         metadata_tables(columns=['LINKNO', 'VPUCode'])
         .loc[lambda x: x['LINKNO'] == river_id, 'VPUCode']
@@ -14,12 +23,30 @@ def river_to_vpu(river_id: int) -> str or int:
 
 
 def latlon_to_river(lat: float, lon: float) -> int:
+    """
+    Gives the River ID number whose outlet is nearest the given lat and lon
+    Args:
+        lat (float): a latitude
+        lon (float): a longitude
+
+    Returns:
+        int: a 9 digit integer that is a valid GEOGLOWS River ID number
+    """
     df = metadata_tables(columns=['LINKNO', 'lat', 'lon'])
     df['dist'] = ((df['lat'] - lat) ** 2 + (df['lon'] - lon) ** 2) ** 0.5
     return df.loc[lambda x: x['dist'] == df['dist'].min(), 'LINKNO'].values[0]
 
 
 def river_to_latlon(river_id: int) -> np.ndarray:
+    """
+    Gives the lat and lon of the outlet of the river with the given River ID number
+
+    Args:
+        river_id (int): a 9 digit integer that is a valid GEOGLOWS River ID number
+
+    Returns:
+        np.ndarray: a numpy array of floats, [lat, lon]
+    """
     return (
         metadata_tables(columns=['LINKNO', 'lat', 'lon'])
         .loc[lambda x: x['LINKNO'] == river_id, ['lat', 'lon']]

Original file line number	Diff line number	Diff line change
`@@ -12,6 +12,6 @@`
`12`	`12`	`'bias', 'plots', 'data', 'analyze', 'streams', 'tables', 'streamflow',`
`13`	`13`	`'METADATA_TABLE_PATH'`
`14`	`14`	`]`
`15`		`-__version__ = '1.2.0'`
	`15`	`+__version__ = '1.2.1'`
`16`	`16`	`__author__ = 'Riley Hales'`
`17`	`17`	`__license__ = 'BSD 3-Clause Clear License'`