Skip to content

Commit 7447e90

Browse files
committed
post-stacking cast function
1 parent a076f61 commit 7447e90

4 files changed

+207
-38
lines changed
1.03 KB
Binary file not shown.

dist/neonutilities-1.0.1.tar.gz

1.03 KB
Binary file not shown.

src/neonutilities/read_table_neon.py

+170-6
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,63 @@
22
# -*- coding: utf-8 -*-
33

44
import pandas as pd
5+
import numpy as np
6+
import pyarrow as pa
57
from pyarrow import dataset
68
import logging
7-
from .unzip_and_stack import get_variables
89
logging.basicConfig(level=logging.INFO, format='%(message)s')
910

1011

12+
def get_variables(v):
    """

    Get correct data types for each field in a data table

    Parameters
    --------
    v: A pandas table containing variable definitions. The fieldName and
        dataType columns are required; pubFormat is consulted for dateTime
        fields and, if absent, dateTime fields are typed as string.

    Return
    --------
    A pyarrow schema for data types based on the variables file. An empty
    variables table yields an empty schema.

    Created on Wed Apr 17 2024

    @author: Zachary Nickerson
    """

    # function assumes variables are loaded as a pandas data frame.

    # publication formats of dateTime fields, grouped by target pyarrow type
    timestamp_formats = ("yyyy-MM-dd'T'HH:mm:ss'Z'(floor)",
                         "yyyy-MM-dd'T'HH:mm:ss'Z'",
                         "yyyy-MM-dd'T'HH:mm:ss'Z'(round)")
    date_formats = ("yyyy-MM-dd(floor)", "yyyy-MM-dd")
    year_formats = ("yyyy(floor)", "yyyy(round)")
    integer_types = ("integer", "unsigned integer", "signed integer")

    # pull columns out as plain lists so rows are matched by position;
    # a filtered variables table with a non-default index is handled correctly
    names = list(v["fieldName"])
    dtypes = list(v["dataType"])
    if "pubFormat" in v.columns:
        pubformats = list(v["pubFormat"])
    else:
        pubformats = [None] * len(names)

    # create pyarrow schema by translating NEON data types to pyarrow types
    fields = []
    for nm, dtype, pubfmt in zip(names, dtypes, pubformats):
        if dtype == "real":
            typ = pa.float64()
        elif dtype in integer_types:
            typ = pa.int64()
        elif dtype == "dateTime":
            if pubfmt in timestamp_formats:
                typ = pa.timestamp("s", tz="UTC")
            elif pubfmt in date_formats:
                typ = pa.date64()
            elif pubfmt in year_formats:
                typ = pa.int64()
            else:
                typ = pa.string()
        else:
            # "string", "uri", and any unrecognized data type
            typ = pa.string()
        fields.append(pa.field(nm, typ))

    # building the schema in one call also covers the zero-row case
    return pa.schema(fields)
60+
61+
1162
def read_table_neon(data_file,
1263
var_file
1364
):
@@ -100,16 +151,129 @@ def date_convert(dates):
100151
"""
101152

102153
try:
103-
dout = pd.to_datetime(dates, format="%Y-%m-%dT%H:%M:%S", utc=True)
154+
dout = pd.to_datetime(dates, format="ISO8601", utc=True)
104155
except Exception:
105156
try:
106-
dout = pd.to_datetime(dates, format="%Y-%m-%dT%H:%M", utc=True)
157+
dout = pd.to_datetime(dates, format="%Y-%m-%dT%H:%M:%S", utc=True)
107158
except Exception:
108159
try:
109-
dout = pd.to_datetime(dates, format="%Y-%m-%dT%H", utc=True)
160+
dout = pd.to_datetime(dates, format="%Y-%m-%dT%H:%M", utc=True)
110161
except Exception:
111162
try:
112-
dout = pd.to_datetime(dates, format="%Y-%m-%d", utc=True)
163+
dout = pd.to_datetime(dates, format="%Y-%m-%dT%H", utc=True)
113164
except Exception:
114-
dout = dates
165+
try:
166+
dout = pd.to_datetime(dates, format="%Y-%m-%d", utc=True)
167+
except Exception:
168+
dout = dates
115169
return dout
170+
171+
172+
def get_variables_pandas(v):
    """

    Get correct data types for each field in a data table

    Parameters
    --------
    v: A pandas table containing variable definitions. Must contain the
        fieldName and dataType columns.

    Return
    --------
    A dictionary of field names and pandas data types based on the variables file.
    Fields with an unrecognized data type are assigned "string".

    Created on Oct 31 2024

    @author: Claire Lunch
    """

    integer_types = ("integer", "unsigned integer", "signed integer")

    dtdict = {}
    # iterate by position so a filtered variables table (non-default index)
    # is handled the same way as a freshly loaded one
    for nm, dtype in zip(v["fieldName"], v["dataType"]):
        if dtype == "real":
            typ = "Float64"
        elif dtype in integer_types:
            typ = "Int64"
        elif dtype == "dateTime":
            typ = "datetime64[ns, UTC]"
        else:
            # "string", "uri", and anything unrecognized: never inherit
            # the type assigned to the previous field
            typ = "string"
        dtdict[nm] = typ

    return dtdict
205+
206+
207+
def cast_table_neon(data_table,
                    var_table
                    ):
    """

    Cast a NEON data table to the correct data types for each variable, if possible.

    Parameters
    --------
    data_table: NEON data table as a pandas table.
    var_table: NEON variables file as a pandas table.

    Return
    --------
    A data frame of a NEON data table, with column classes assigned by data type.
    The input data_table is not modified; a copy is returned. Returns None
    (after logging a message) if either input is not a pandas data frame or
    var_table is not a usable variables file.

    Example
    --------
    >>> dattab = cast_table_neon(data_table=brd_perpoint,
                                 var_table=variables_10003)

    Created on Oct 30 2024

    @author: Claire Lunch
    """

    # Check inputs formatting
    if not isinstance(data_table, pd.DataFrame):
        logging.info("Data table input is not a pandas data frame.")
        return None

    if not isinstance(var_table, pd.DataFrame):
        logging.info("Variables table input is not a pandas data frame.")
        return None

    # Check this is a valid variables file
    varcols = set(var_table.columns)
    if varcols & {'category', 'system', 'stat'}:
        logging.info('var_table appears to match DP4.00200.001. Data wrangling for surface-atmosphere exchange data is currently only available in the R package version of neonUtilities.')
        return None
    # every column the type lookup reads must be present, not just one of them
    if not {'fieldName', 'dataType'} <= varcols:
        logging.info('var_table is not a variables file, or is missing critical values.')
        return None

    # get data types
    vdt = get_variables_pandas(var_table)

    # work on a copy so the caller's table is left untouched and the
    # original column is still available if a cast fails
    cast_table = data_table.copy()

    # iterate over columns and try to cast each
    for col in list(data_table.columns):
        # columns not described in the variables file are left as-is
        typ = vdt.get(col)

        if typ in ("Float64", "Int64"):
            try:
                # blank / whitespace-only entries become missing values
                dtemp = cast_table[col].replace(r'^\s*$', np.nan, regex=True)
                cast_table[col] = dtemp.astype(typ)
            except Exception:
                logging.info(f"Field {col} could not be cast to type {typ}. Data read as string type.")
                cast_table[col] = data_table[col]

        # publicationDate is deliberately left unconverted
        elif typ == "datetime64[ns, UTC]" and col != "publicationDate":
            try:
                cast_table[col] = date_convert(data_table[col])
            except Exception:
                logging.info(f"Field {col} could not be cast to type {typ}. Data read as string type.")
                cast_table[col] = data_table[col]

    return cast_table

src/neonutilities/unzip_and_stack.py

+37-32
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from .get_issue_log import get_issue_log
1717
from .citation import get_citation
1818
from .helper_mods.api_helpers import readme_url
19+
from .read_table_neon import get_variables, cast_table_neon
1920
from . import __resources__
2021
import logging
2122
logging.basicConfig(level=logging.INFO, format='%(message)s')
@@ -164,51 +165,31 @@ def get_recent_publication(filepaths):
164165
return recent_files
165166

166167

167-
def get_variables(v):
168+
169+
def string_schema(v):
    """

    Assign all variables to string. Fallback option if table read with true schema fails

    Parameters
    --------
    v: A pandas table containing variable definitions. Only the fieldName
        column is used.

    Return
    --------
    A pyarrow schema for all string data types based on the variable names.
    An empty variables table yields an empty schema.

    Created on Oct 29 2024

    @author: Claire Lunch
    """

    # iterate over the column values directly so rows are taken by position,
    # regardless of the table's index, and a zero-row table is handled
    fields = [pa.field(nm, pa.string()) for nm in v["fieldName"]]

    return pa.schema(fields)
@@ -838,15 +819,39 @@ def stack_data_files_parallel(folder,
838819
repl="", string=b) for b in tablepaths]
839820
dat = dataset.dataset(source=tablebuckets, filesystem=gcs,
840821
format="csv", schema=tableschema)
841-
842822
else:
843823
dat = dataset.dataset(source=tablepaths,
844824
format="csv", schema=tableschema)
845825

846826
cols = tableschema.names
847827
cols.append("__filename")
848-
dattab = dat.to_table(columns=cols)
828+
829+
# attempt to stack to table. if it fails, stack as all string fields and warn
830+
stringset = False
831+
try:
832+
dattab = dat.to_table(columns=cols)
833+
except Exception:
834+
try:
835+
stringschema = string_schema(tablepkgvar)
836+
if cloud_mode:
837+
dat = dataset.dataset(source=tablebuckets, filesystem=gcs,
838+
format="csv", schema=stringschema)
839+
else:
840+
dat = dataset.dataset(source=tablepaths,
841+
format="csv", schema=stringschema)
842+
dattab = dat.to_table(columns=cols)
843+
logging.info(f"Table {j} schema did not match data; all variable types set to string. Data type casting will be attempted after stacking step.")
844+
stringset = True
845+
except Exception:
846+
logging.info(f"Failed to stack table {j}. Check input data and variables file.")
847+
continue
848+
849849
pdat = dattab.to_pandas()
850+
if stringset:
851+
try:
852+
pdat = cast_table_neon(pdat, tablepkgvar)
853+
except Exception:
854+
logging.info(f"Data type casting failed for table {j}. Variable types set to string.")
850855

851856
# append publication date
852857
pubr = re.compile("20[0-9]{6}T[0-9]{6}Z")

0 commit comments

Comments
 (0)