Flatten json #94
Changes from 2 commits
@@ -3,7 +3,8 @@
 from typing import Any, Dict, Iterable, Optional

 from arche import SH_URL
-from arche.tools import pandas, api
+from arche.tools import api
+from flatten_json import flatten
 import numpy as np
 import pandas as pd
 from scrapinghub import ScrapinghubClient
@@ -13,23 +14,25 @@


 class Items:
-    def __init__(self, raw: RawItems, df: pd.DataFrame, expand: bool = False):
+    def __init__(self, raw: RawItems, df: pd.DataFrame, expand: bool):
         self.raw = raw
         self.df = self.process_df(df)
         self._flat_df = None
         self.expand = expand

-    def __len__(self):
+    def __len__(self) -> int:
         return len(self.df)

     @property
-    def flat_df(self):
+    def flat_df(self) -> pd.DataFrame:
         if self._flat_df is None:
             if self.expand:
-                self._flat_df, self._columns_map = pandas.flatten_df(self.df)
+                self._flat_df = pd.DataFrame(flatten(i) for i in self.raw)
+                self._flat_df["_key"] = self.df.get(
+                    "_key", [str(i) for i in range(len(self))]
+                )
             else:
                 self._flat_df = self.df
-                self._columns_map = {}
         return self._flat_df

     @staticmethod
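For context, a minimal sketch (with made-up items) of what the new `flat_df` construction does: `flatten_json.flatten` turns each nested item into a flat dict, joining nested keys and list indices with `_`, and the flat dicts are then fed into a DataFrame, mirroring the `pd.DataFrame(flatten(i) for i in self.raw)` line above.

```python
from flatten_json import flatten
import pandas as pd

# Hypothetical raw items with a nested list field.
raw = [
    {"_key": "0", "tags": ["western", "comedy"]},
    {"_key": "1", "tags": ["drama", "history"]},
]

# Nested keys and list indices are joined with "_" (tags -> tags_0, tags_1).
flat_df = pd.DataFrame(flatten(i) for i in raw)
print(flat_df.columns.tolist())  # ['_key', 'tags_0', 'tags_1']
```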
@@ -40,8 +43,12 @@ def process_df(df: pd.DataFrame) -> pd.DataFrame:
         df["_type"] = df["_type"].astype("category")
         return df

-    def get_origin_column_name(self, column_name: str) -> str:
-        return self._columns_map.get(column_name, column_name)
+    def origin_column_name(self, new: str) -> str:
+        if new in self.df.columns:
+            return new
+        for column in self.df.columns:
+            if column in new:
+                return column

     @classmethod
     def from_df(cls, df: pd.DataFrame, expand: bool = True):

Reviewer comment on `origin_column_name`: KISS approach. I believe it will work in most cases.
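A standalone copy of the new lookup (simplified for illustration to work on a plain list of column names) shows how flattened names map back to original columns, and why the reviewer calls it a KISS approach: it only relies on the original column name being a substring of the flattened one.

```python
# Simplified stand-in for Items.origin_column_name, for illustration only.
def origin_column_name(columns, new):
    if new in columns:
        return new
    for column in columns:
        if column in new:
            return column

columns = ["name", "tags", "links"]
print(origin_column_name(columns, "name"))               # name (exact match)
print(origin_column_name(columns, "tags_0"))             # tags
print(origin_column_name(columns, "links_0_Instagram"))  # links
```

One caveat, hinted at by the corner case in the tests below: if an original column name happens to be a substring of an unrelated flattened name, the first match wins, so the mapping is best-effort rather than exact.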
This file was deleted.
@@ -7,12 +7,11 @@


 @pytest.mark.parametrize(
-    "name, expected_name", [("address", "address"), ("address_0", "address")]
+    "name, expected_name", [("price", "price"), ("name_0", "name")]
 )
-def test_get_origin_column_name(get_cloud_items, name, expected_name):
+def test_origin_column_name(get_cloud_items, name, expected_name):
     items = Items.from_df(pd.DataFrame(get_cloud_items))
-    items._columns_map = {"address_0": "address"}
-    assert items.get_origin_column_name(name) == expected_name
+    assert items.origin_column_name(name) == expected_name


 @pytest.mark.parametrize(
@@ -134,3 +133,78 @@ def test_process_df():
     exp_df = pd.DataFrame([[np.nan, np.nan, "NameItem"]], columns=["a", "b", "_type"])
     exp_df["_type"] = exp_df["_type"].astype("category")
     pd.testing.assert_frame_equal(df, exp_df)
+
+
+flat_df_inputs = [
+    (
+        [{"name": "Bob", "alive": True, "_key": 0, "following": None}],
+        {"_key": [0], "alive": [True], "following": [None], "name": ["Bob"]},
+        {"name": "name", "alive": "alive", "_key": "_key", "following": "following"},
+    ),
+    (
+        [{"tags": ["western", "comedy"]}, {"tags": ["drama", "history"]}],
+        {
+            "tags_0": ["western", "drama"],
+            "tags_1": ["comedy", "history"],
+            "_key": ["0", "1"],
+        },
+        {"tags_0": "tags", "tags_1": "tags"},
+    ),
+    (
+        [
+            {
+                "links": [
+                    {"Instagram": "http://www.instagram.com"},
+                    {"ITW website": "http://www.itw.com"},
+                ]
+            }
+        ],
+        {
+            "links_0_Instagram": ["http://www.instagram.com"],
+            "links_1_ITW website": ["http://www.itw.com"],
+            "_key": ["0"],
+        },
+        {"links_0_Instagram": "links", "links_1_ITW website": "links"},
+    ),
+    (
+        [
+            {
+                "links": [
+                    {"Instagram": ["http://www.instagram.com"]},
+                    {"ITW website": ["http://www.itw.com"]},
+                ]
+            }
+        ],
+        {
+            "links_0_Instagram_0": ["http://www.instagram.com"],
+            "links_1_ITW website_0": ["http://www.itw.com"],
+            "_key": ["0"],
+        },
+        {"links_0_Instagram_0": "links", "links_1_ITW website_0": "links"},
+    ),
+    # Corner case https://github.com/amirziai/flatten/issues/48
+    (
+        [
+            {"type": [0], "str": "k", "type_0": 5},
+            {"type": [0, [2, 3]], "str": "s", "type_0": 6},
+        ],
+        {
+            "str": ["k", "s"],
+            "type_0": [5, 6],
+            "type_1_0": [np.nan, 2.0],
+            "type_1_1": [np.nan, 3.0],
+            "_key": ["0", "1"],
+        },
+        {"type_0": "type_0", "str": "str", "type_1_0": "type", "type_1_1": "type"},
+    ),
+]
+
+
+@pytest.mark.parametrize("data, expected_data, expected_map", flat_df_inputs)
+def test_flat_df(data, expected_data, expected_map):
+    i = Items.from_array(data, expand=True)
+    pd.testing.assert_frame_equal(
+        i.flat_df, pd.DataFrame(expected_data), check_like=False
+    )
+    for new, old in expected_map.items():
+        assert i.origin_column_name(new) == old

Reviewer comment on the corner case: Here we lose information from `type`, because of the collision with the literal `type_0` key.
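The corner case is easy to reproduce in isolation; a quick check (assuming flatten_json's default `_` separator, and reusing the second item from the parametrized data) shows where the collision happens:

```python
from flatten_json import flatten

# "type" flattens to type_0 / type_1_0 / type_1_1, but the item already has a
# literal "type_0" key, so the two collide; per the expected data above, the
# literal value (6) is the one that survives.
item = {"type": [0, [2, 3]], "str": "s", "type_0": 6}
print(flatten(item))
# -> {'type_0': 6, 'type_1_0': 2, 'type_1_1': 3, 'str': 's'} (key order may differ)
```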
This file was deleted.
Reviewer comment: We need the same formatted keys as in the original df.
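This comment presumably refers to the `_key` fallback in `flat_df` (`self.df.get("_key", [str(i) for i in range(len(self))])`): when the raw items carry no `_key`, string indices are generated so they are formatted the same way as the keys in the original df. A tiny illustration with made-up rows:

```python
import pandas as pd

df = pd.DataFrame([{"name": "Bob"}, {"name": "Sue"}])

# DataFrame.get returns the column if present, otherwise the default;
# the generated fallback keys are strings ("0", "1"), not ints.
keys = df.get("_key", [str(i) for i in range(len(df))])
print(list(keys))  # ['0', '1']
```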