# -*- coding: utf-8 -*-

import pandas as pd
+import numpy as np
+import pyarrow as pa
from pyarrow import dataset
import logging
-from .unzip_and_stack import get_variables
logging.basicConfig(level=logging.INFO, format='%(message)s')


+def get_variables(v):
+    """
+
+    Get correct data types for each field in a data table
+
+    Parameters
+    --------
+    v: A pandas table containing variable definitions
+
+    Return
+    --------
+    A pyarrow schema for data types based on the variables file
+
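+    Example
+    --------
+    An illustrative sketch; the variables table below is hypothetical, not a
+    real NEON variables file:
+
+    >>> v = pd.DataFrame({"fieldName": ["siteID", "meanTemp"],
+    ...                   "dataType": ["string", "real"],
+    ...                   "pubFormat": ["asIs", "*.##(round)"]})
+    >>> vschema = get_variables(v)
+
+    Here vschema is a pyarrow schema mapping siteID to string and meanTemp
+    to double.
+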
+    Created on Wed Apr 17 2024
+
+    @author: Zachary Nickerson
+    """
+
+    # function assumes variables are loaded as a pandas data frame.
+
+    # create pyarrow schema by translating NEON data types to pyarrow types
+    for i in range(0, len(v)):
+        nm = v.fieldName[i]
+        typ = pa.string()
+        if v.dataType[i] == "real":
+            typ = pa.float64()
+        if v.dataType[i] in ["integer", "unsigned integer", "signed integer"]:
+            typ = pa.int64()
+        if v.dataType[i] in ["string", "uri"]:
+            typ = pa.string()
+        if v.dataType[i] == "dateTime":
+            if v.pubFormat[i] in ["yyyy-MM-dd'T'HH:mm:ss'Z'(floor)", "yyyy-MM-dd'T'HH:mm:ss'Z'", "yyyy-MM-dd'T'HH:mm:ss'Z'(round)"]:
+                typ = pa.timestamp("s", tz="UTC")
+            elif v.pubFormat[i] in ["yyyy-MM-dd(floor)", "yyyy-MM-dd"]:
+                typ = pa.date64()
+            elif v.pubFormat[i] in ["yyyy(floor)", "yyyy(round)"]:
+                typ = pa.int64()
+            else:
+                typ = pa.string()
+        if i == 0:
+            vschema = pa.schema([(nm, typ)])
+        else:
+            nfield = pa.field(nm, typ)
+            vschema = vschema.append(nfield)
+
+    return vschema
+
+
def read_table_neon(data_file,
                    var_file
                    ):
@@ -100,16 +151,129 @@ def date_convert(dates):
    """

    try:
-        dout = pd.to_datetime(dates, format="%Y-%m-%dT%H:%M:%S", utc=True)
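+        # format="ISO8601" requires pandas >= 2.0 and parses NEON's
+        # mixed-precision ISO timestamps in a single pass; the nested
+        # fallbacks below cover cases where this first attempt raises.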
+        dout = pd.to_datetime(dates, format="ISO8601", utc=True)
    except Exception:
        try:
-            dout = pd.to_datetime(dates, format="%Y-%m-%dT%H:%M", utc=True)
+            dout = pd.to_datetime(dates, format="%Y-%m-%dT%H:%M:%S", utc=True)
        except Exception:
            try:
-                dout = pd.to_datetime(dates, format="%Y-%m-%dT%H", utc=True)
+                dout = pd.to_datetime(dates, format="%Y-%m-%dT%H:%M", utc=True)
            except Exception:
                try:
-                    dout = pd.to_datetime(dates, format="%Y-%m-%d", utc=True)
+                    dout = pd.to_datetime(dates, format="%Y-%m-%dT%H", utc=True)
                except Exception:
-                    dout = dates
+                    try:
+                        dout = pd.to_datetime(dates, format="%Y-%m-%d", utc=True)
+                    except Exception:
+                        dout = dates
    return dout
+
+
+def get_variables_pandas(v):
+    """
+
+    Get correct data types for each field in a data table
+
+    Parameters
+    --------
+    v: A pandas table containing variable definitions
+
+    Return
+    --------
+    A dictionary of field names and pandas data types based on the variables file
+
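+    Example
+    --------
+    An illustrative sketch; the variables table below is hypothetical, not a
+    real NEON variables file:
+
+    >>> v = pd.DataFrame({"fieldName": ["siteID", "individualCount"],
+    ...                   "dataType": ["string", "integer"]})
+    >>> get_variables_pandas(v)
+    {'siteID': 'string', 'individualCount': 'Int64'}
+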
+    Created on Oct 31 2024
+
+    @author: Claire Lunch
+    """
+
+    dtdict = {}
+    vf = v["fieldName"]
+    for i in range(0, len(vf)):
+        nm = vf[i]
+        # default to string for any unrecognized data type
+        typ = "string"
+        if v["dataType"][i] == "real":
+            typ = "Float64"
+        if v["dataType"][i] in ["integer", "unsigned integer", "signed integer"]:
+            typ = "Int64"
+        if v["dataType"][i] in ["string", "uri"]:
+            typ = "string"
+        if v["dataType"][i] == "dateTime":
+            typ = "datetime64[ns, UTC]"
+        dtdict[nm] = typ
+
+    return dtdict
+
+
+def cast_table_neon(data_table,
+                    var_table
+                    ):
+    """
+
+    Cast a NEON data table to the correct data types for each variable, if possible.
+
+    Parameters
+    --------
+    data_table: NEON data table as a pandas table.
+    var_table: NEON variables file as a pandas table.
+
+    Return
+    --------
+    A data frame of a NEON data table, with column classes assigned by data type.
+
+    Example
+    --------
+    >>> dattab = cast_table_neon(data_table=brd_perpoint,
+                                 var_table=variables_10003)
+
+    Created on Oct 30 2024
+
+    @author: Claire Lunch
+    """
+
+    # Check inputs formatting
+    if not isinstance(data_table, pd.DataFrame):
+        logging.info("Data table input is not a pandas data frame.")
+        return None
+
+    if not isinstance(var_table, pd.DataFrame):
+        logging.info("Variables table input is not a pandas data frame.")
+        return None
+
+    # Check this is a valid variables file
+    if any(x in ['category', 'system', 'stat'] for x in list(var_table.columns)):
+        logging.info('var_table appears to match DP4.00200.001. Data wrangling for surface-atmosphere exchange data is currently only available in the R package version of neonUtilities.')
+        return None
+    else:
+        if not any(x in ['table', 'fieldName', 'dataType'] for x in list(var_table.columns)):
+            logging.info('var_table is not a variables file, or is missing critical values.')
+            return None
+
+    # get data types
+    vdt = get_variables_pandas(var_table)
+
+    # get field names from the data table
+    tabcols = list(data_table.columns)
+    cast_table = data_table
+
+    # iterate over columns and try to cast each
+    for i in tabcols:
+        if i not in vdt.keys():
+            continue
+        else:
+            if vdt[i] in ["Float64", "Int64"]:
+                try:
+                    dtemp = cast_table[i].replace(r'^\s*$', np.nan, regex=True)
+                    cast_table[i] = dtemp.astype(vdt[i])
+                except Exception:
+                    logging.info(f"Field {i} could not be cast to type {vdt[i]}. Data read as string type.")
+                    cast_table[i] = data_table[i]
+                    continue
+            if vdt[i] == "datetime64[ns, UTC]" and not i == "publicationDate":
+                try:
+                    cast_table[i] = date_convert(data_table[i])
+                except Exception:
+                    logging.info(f"Field {i} could not be cast to type {vdt[i]}. Data read as string type.")
+                    cast_table[i] = data_table[i]
+                    continue
+
+    return cast_table
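+
+
+# Illustrative usage (a sketch, not part of the package API): assuming a NEON
+# data table and its matching variables file have already been saved as CSVs
+# (the file names below are hypothetical), casting would look like:
+#
+#     dat = pd.read_csv("brd_perpoint.csv", dtype=str)
+#     var = pd.read_csv("variables_10003.csv")
+#     dat = cast_table_neon(data_table=dat, var_table=var)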