
Json normalize nan support #25619

Merged (12 commits) on Mar 13, 2019
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.25.0.rst
@@ -214,10 +214,10 @@ I/O
- Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`)
- Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`)
- Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to Timestamp, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`)
- Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data were filled in the resulting ``DataFrame`` with the string "nan" instead of ``numpy.nan`` (:issue:`25468`)
- :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AssertionError`` (:issue:`25608`)
- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`)
-
-


Plotting
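As context for the ``json_normalize`` entry above (GH25468), a minimal sketch of the behaviour being fixed. The sample records and column names are illustrative only, not taken from the diff itself.

```python
# Hypothetical records: the second one is missing the 'name' metadata key.
import numpy as np
from pandas.io.json import json_normalize  # import path used in pandas 0.24/0.25

data = [
    {"name": "Alice", "addresses": [{"city": "Massillon"}]},
    {"addresses": [{"city": "Elizabethton"}]},
]

result = json_normalize(data, record_path="addresses", meta="name",
                        errors="ignore")

# Before the fix the missing metadata surfaced as the string "nan";
# after the fix it is a real missing value (numpy.nan), so this holds:
assert result["name"].isna().iloc[1]
```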
3 changes: 2 additions & 1 deletion pandas/io/json/normalize.py
@@ -281,6 +281,7 @@ def _recursive_extract(data, path, seen_meta, level=0):
raise ValueError('Conflicting metadata name {name}, '
'need distinguishing prefix '.format(name=k))

result[k] = np.array(v).repeat(lengths)
# forcing dtype to object to avoid the metadata being cast to string
result[k] = np.array(v, dtype=object).repeat(lengths)

return result
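The comment above is the heart of the fix: without ``dtype=object``, NumPy promotes a mixed string/NaN metadata list to a string dtype. A small sketch of that behaviour, with made-up values:

```python
import numpy as np

v = ['Alice', np.nan]   # metadata values, one of them missing
lengths = [1, 1]        # one record per metadata value

# Default inference promotes everything to a Unicode dtype,
# turning np.nan into the literal string 'nan':
np.array(v).repeat(lengths)                # array(['Alice', 'nan'], dtype='<U32')

# Forcing object dtype keeps np.nan intact, so pandas can treat it as missing:
np.array(v, dtype=object).repeat(lengths)  # array(['Alice', nan], dtype=object)
```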
114 changes: 59 additions & 55 deletions pandas/tests/io/json/test_normalize.py
@@ -66,6 +66,25 @@ def author_missing_data():
}]


@pytest.fixture
def missing_metadata():
return [
{'name': 'Alice',
'addresses': [{'number': 9562,
'street': 'Morris St.',
'city': 'Massillon',
'state': 'OH',
'zip': 44646}]
},
{'addresses': [{'number': 8449,
'street': 'Spring St.',
'city': 'Elizabethton',
'state': 'TN',
'zip': 37643}]
}
]


class TestJSONNormalize(object):

def test_simple_records(self):
@@ -318,66 +337,51 @@ def test_nested_flattens(self):

assert result == expected

def test_json_normalize_errors(self):
# GH14583: If meta keys are not always present
# a new option to set errors='ignore' has been implemented
i = {
"Trades": [{
"general": {
"tradeid": 100,
"trade_version": 1,
"stocks": [{

"symbol": "AAPL",
"name": "Apple",
"price": "0"
}, {
"symbol": "GOOG",
"name": "Google",
"price": "0"
}
]
}
}, {
"general": {
"tradeid": 100,
"stocks": [{
"symbol": "AAPL",
"name": "Apple",
"price": "0"
}, {
"symbol": "GOOG",
"name": "Google",
"price": "0"
}
]
}
}
]
}
j = json_normalize(data=i['Trades'],
record_path=[['general', 'stocks']],
meta=[['general', 'tradeid'],
['general', 'trade_version']],
errors='ignore')
expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
'price': {0: '0', 1: '0', 2: '0', 3: '0'},
'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}

assert j.fillna('').to_dict() == expected

msg = ("Try running with errors='ignore' as key 'trade_version'"
def test_json_normalize_errors(self, missing_metadata):
# GH14583:
# If meta keys are not always present a new option to set
# errors='ignore' has been implemented

msg = ("Try running with errors='ignore' as key 'name'"
" is not always present")
with pytest.raises(KeyError, match=msg):
json_normalize(
data=i['Trades'],
record_path=[['general', 'stocks']],
meta=[['general', 'tradeid'],
['general', 'trade_version']],
data=missing_metadata,
record_path='addresses',
meta='name',
errors='raise')

def test_missing_meta(self, missing_metadata):
# GH25468
# If metadata is nullable with errors set to ignore, the null values
# should be numpy.nan values
result = json_normalize(
data=missing_metadata,
record_path='addresses',
meta='name',
errors='ignore')
ex_data = [
    ['Massillon', 9562, 'OH', 'Morris St.', 44646, 'Alice'],
    ['Elizabethton', 8449, 'TN', 'Spring St.', 37643, np.nan]
]
columns = ['city', 'number', 'state', 'street', 'zip', 'name']
expected = DataFrame(ex_data, columns=columns)
tm.assert_frame_equal(result, expected)

def test_donot_drop_nonevalues(self):
# GH21356
data = [