
Commit

Merge pull request #94 from scrapinghub/flatten_json
Flatten json
manycoding authored May 27, 2019
2 parents 94ea065 + 5bf6383 commit 7fee5f2
Showing 9 changed files with 103 additions and 175 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
@@ -17,6 +17,7 @@ Note that the top-most release is changes in the unreleased master branch on Git
- `Arche.glance()`, #88
- Item links in Schema validation errors, #89
- Empty NAN bars on category graphs, #93
- `expand=True`, which enables nested data flattening, is more than 100x faster and consumes ~2x less memory than before, #94
### Removed


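The `expand=True` entry above is the user-facing side of this PR. Below is a minimal usage sketch, assuming `Items.from_df` (shown in the items.py diff further down) passes the dataframe records through to the new flattening path; the sample records are invented for illustration.

```python
import pandas as pd
from arche.readers.items import Items

# Invented sample with a nested list field.
records = [{"name": "Book", "tags": ["fiction", "drama"]}]

items = Items.from_df(pd.DataFrame(records), expand=True)

# With expand=True, flat_df is built lazily from the raw items and exposes
# nested values as flattened columns such as tags_0 and tags_1.
print(items.flat_df.columns.tolist())
```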
4 changes: 4 additions & 0 deletions Pipfile
@@ -5,6 +5,7 @@ name = "pypi"

[packages]
pandas = "*"
flatten_json = "*"
scrapinghub = {extras = ["msgpack"],version = "*"}
plotly = "*"
genson = "*"
@@ -38,6 +39,9 @@ sphinx-autoapi = {git = "https://github.com/rtfd/sphinx-autoapi"}
nbsphinx = "*"
memory-profiler = "*"
jupyter-console = "*"
matplotlib = "*"
pyarrow = "*"
cufflinks = "*"

[requires]
python_version = "3.7"
1 change: 1 addition & 0 deletions setup.cfg
@@ -21,6 +21,7 @@ packages = find:
setup_requires = setuptools_scm
install_requires =
pandas
flatten_json
scrapinghub[msgpack]
plotly>=3.8.0
genson
23 changes: 15 additions & 8 deletions src/arche/readers/items.py
@@ -3,7 +3,8 @@
from typing import Any, Dict, Iterable, Optional

from arche import SH_URL
from arche.tools import pandas, api
from arche.tools import api
from flatten_json import flatten
import numpy as np
import pandas as pd
from scrapinghub import ScrapinghubClient
@@ -13,23 +14,25 @@


class Items:
def __init__(self, raw: RawItems, df: pd.DataFrame, expand: bool = False):
def __init__(self, raw: RawItems, df: pd.DataFrame, expand: bool):
self.raw = raw
self.df = self.process_df(df)
self._flat_df = None
self.expand = expand

def __len__(self):
def __len__(self) -> int:
return len(self.df)

@property
def flat_df(self):
def flat_df(self) -> pd.DataFrame:
if self._flat_df is None:
if self.expand:
self._flat_df, self._columns_map = pandas.flatten_df(self.df)
self._flat_df = pd.DataFrame(flatten(i) for i in self.raw)
self._flat_df["_key"] = self.df.get(
"_key", [str(i) for i in range(len(self))]
)
else:
self._flat_df = self.df
self._columns_map = {}
return self._flat_df

@staticmethod
@@ -40,8 +43,12 @@ def process_df(df: pd.DataFrame) -> pd.DataFrame:
df["_type"] = df["_type"].astype("category")
return df

def get_origin_column_name(self, column_name: str) -> str:
return self._columns_map.get(column_name, column_name)
def origin_column_name(self, new: str) -> str:
if new in self.df.columns:
return new
for column in self.df.columns:
if column in new:
return column

@classmethod
def from_df(cls, df: pd.DataFrame, expand: bool = True):
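For reference, a standalone sketch of what the new `flat_df` property does, per the diff above: each raw item is flattened with `flatten_json` and a `_key` column is attached. The sample items below are illustrative only.

```python
import pandas as pd
from flatten_json import flatten

raw_items = [
    {"name": "Bob", "tags": ["western", "comedy"]},
    {"name": "Ann", "tags": ["drama", "history"]},
]

# flatten() turns nested lists/dicts into underscore-joined keys,
# e.g. tags -> tags_0, tags_1.
flat_df = pd.DataFrame(flatten(i) for i in raw_items)

# Mirror the df.get("_key", ...) fallback in the property: stringified
# positions when the source items carry no _key field.
flat_df["_key"] = [str(i) for i in range(len(raw_items))]

print(flat_df.columns.tolist())  # ['name', 'tags_0', 'tags_1', '_key']
```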
2 changes: 1 addition & 1 deletion src/arche/rules/others.py
@@ -93,7 +93,7 @@ def garbage_symbols(items: Items) -> Result:
matches = matches[["spaces", "html_entities", "css", "html_tags"]]
if not matches.empty:
error_keys = items.flat_df.iloc[matches.unstack().index.values]["_key"]
original_column = items.get_origin_column_name(column)
original_column = items.origin_column_name(column)
bad_texts = matches.stack().value_counts().index.sort_values().tolist()
error = (
f"{len(error_keys)/len(items)*100:.1f}% of '{original_column}' "
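The `origin_column_name` call above replaces the old `_columns_map` lookup with a substring match against the original columns, so error messages still report the source field name. A minimal sketch of that lookup in isolation; the dataframe contents are illustrative.

```python
import pandas as pd

df = pd.DataFrame({"name": ["Bob"], "tags": [["western", "comedy"]]})

def origin_column_name(new: str) -> str:
    # Columns that already exist (e.g. "name") are returned unchanged;
    # flattened names such as "tags_0" map back to the column whose name
    # they contain.
    if new in df.columns:
        return new
    for column in df.columns:
        if column in new:
            return column

print(origin_column_name("name"))    # name
print(origin_column_name("tags_0"))  # tags
```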
79 changes: 0 additions & 79 deletions src/arche/tools/pandas.py

This file was deleted.

6 changes: 3 additions & 3 deletions tests/conftest.py
@@ -124,9 +124,9 @@ def get_job(request):
return Job(*request.param)


@pytest.fixture(scope="function", params=[(cloud_items)])
def get_collection(request):
return Collection(*request.param)
@pytest.fixture(scope="function")
def get_collection():
return Collection()


@pytest.fixture(scope="function")
82 changes: 78 additions & 4 deletions tests/readers/test_items.py
@@ -7,12 +7,11 @@


@pytest.mark.parametrize(
"name, expected_name", [("address", "address"), ("address_0", "address")]
"name, expected_name", [("price", "price"), ("name_0", "name")]
)
def test_get_origin_column_name(get_cloud_items, name, expected_name):
def test_origin_column_name(get_cloud_items, name, expected_name):
items = Items.from_df(pd.DataFrame(get_cloud_items))
items._columns_map = {"address_0": "address"}
assert items.get_origin_column_name(name) == expected_name
assert items.origin_column_name(name) == expected_name


@pytest.mark.parametrize(
@@ -134,3 +133,78 @@ def test_process_df():
exp_df = pd.DataFrame([[np.nan, np.nan, "NameItem"]], columns=["a", "b", "_type"])
exp_df["_type"] = exp_df["_type"].astype("category")
pd.testing.assert_frame_equal(df, exp_df)


flat_df_inputs = [
(
[{"name": "Bob", "alive": True, "_key": 0, "following": None}],
{"_key": [0], "alive": [True], "following": [None], "name": ["Bob"]},
{"name": "name", "alive": "alive", "_key": "_key", "following": "following"},
),
(
[{"tags": ["western", "comedy"]}, {"tags": ["drama", "history"]}],
{
"tags_0": ["western", "drama"],
"tags_1": ["comedy", "history"],
"_key": ["0", "1"],
},
{"tags_0": "tags", "tags_1": "tags"},
),
(
[
{
"links": [
{"Instagram": "http://www.instagram.com"},
{"ITW website": "http://www.itw.com"},
]
}
],
{
"links_0_Instagram": ["http://www.instagram.com"],
"links_1_ITW website": ["http://www.itw.com"],
"_key": ["0"],
},
{"links_0_Instagram": "links", "links_1_ITW website": "links"},
),
(
[
{
"links": [
{"Instagram": ["http://www.instagram.com"]},
{"ITW website": ["http://www.itw.com"]},
]
}
],
{
"links_0_Instagram_0": ["http://www.instagram.com"],
"links_1_ITW website_0": ["http://www.itw.com"],
"_key": ["0"],
},
{"links_0_Instagram_0": "links", "links_1_ITW website_0": "links"},
),
# Corner case https://github.com/amirziai/flatten/issues/48
(
[
{"type": [0], "str": "k", "type_0": 5},
{"type": [0, [2, 3]], "str": "s", "type_0": 6},
],
{
"str": ["k", "s"],
"type_0": [5, 6],
"type_1_0": [np.nan, 2.0],
"type_1_1": [np.nan, 3.0],
"_key": ["0", "1"],
},
{"type_0": "type_0", "str": "str", "type_1_0": "type", "type_1_1": "type"},
),
]


@pytest.mark.parametrize("data, expected_data, expected_map", flat_df_inputs)
def test_flat_df(data, expected_data, expected_map):
i = Items.from_array(data, expand=True)
pd.testing.assert_frame_equal(
i.flat_df, pd.DataFrame(expected_data), check_like=False
)
for new, old in expected_map.items():
assert i.origin_column_name(new) == old
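To make the parametrised cases above easier to read, this is what `flatten_json` produces on its own for the nested `links` record (default separator `_`). The last case in the list exercises the key-collision corner case from flatten issue #48, where a literal `type_0` field collides with a flattened `type` list.

```python
from flatten_json import flatten

item = {
    "links": [
        {"Instagram": "http://www.instagram.com"},
        {"ITW website": "http://www.itw.com"},
    ]
}

# List indices and nested dict keys are joined with "_", including keys
# that contain spaces, which is why the expected columns above read
# "links_0_Instagram" and "links_1_ITW website".
print(flatten(item))
# {'links_0_Instagram': 'http://www.instagram.com',
#  'links_1_ITW website': 'http://www.itw.com'}
```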
80 changes: 0 additions & 80 deletions tests/tools/test_pandas.py

This file was deleted.
