
Commit

Merge pull request #94 from scrapinghub/flatten_json
Flatten json
manycoding authored May 27, 2019
2 parents 94ea065 + 5bf6383 commit 7fee5f2
Showing 9 changed files with 103 additions and 175 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
@@ -17,6 +17,7 @@ Note that the top-most release is changes in the unreleased master branch on Git
- `Arche.glance()`, #88
- Item links in Schema validation errors, #89
- Empty NAN bars on category graphs, #93
- `expand=True`, which enables nested data flattening, is more than 100x faster and consumes ~2x less memory than before, #94
### Removed


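The `expand=True` entry above is the user-facing side of this PR. Below is a minimal usage sketch, assuming `Items.from_df` (shown in the items.py diff further down) passes the dataframe records through to the new flattening path; the sample records are invented for illustration.

```python
import pandas as pd
from arche.readers.items import Items

# Invented sample with a nested list field.
records = [{"name": "Book", "tags": ["fiction", "drama"]}]

items = Items.from_df(pd.DataFrame(records), expand=True)

# With expand=True, flat_df is built lazily from the raw items and exposes
# nested values as flattened columns such as tags_0 and tags_1.
print(items.flat_df.columns.tolist())
```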
4 changes: 4 additions & 0 deletions Pipfile
@@ -5,6 +5,7 @@ name = "pypi"

[packages]
pandas = "*"
flatten_json = "*"
scrapinghub = {extras = ["msgpack"],version = "*"}
plotly = "*"
genson = "*"
@@ -38,6 +39,9 @@ sphinx-autoapi = {git = "https://github.com/rtfd/sphinx-autoapi"}
nbsphinx = "*"
memory-profiler = "*"
jupyter-console = "*"
matplotlib = "*"
pyarrow = "*"
cufflinks = "*"

[requires]
python_version = "3.7"
1 change: 1 addition & 0 deletions setup.cfg
@@ -21,6 +21,7 @@ packages = find:
setup_requires = setuptools_scm
install_requires =
pandas
flatten_json
scrapinghub[msgpack]
plotly>=3.8.0
genson
23 changes: 15 additions & 8 deletions src/arche/readers/items.py
@@ -3,7 +3,8 @@
from typing import Any, Dict, Iterable, Optional

from arche import SH_URL
from arche.tools import pandas, api
from arche.tools import api
from flatten_json import flatten
import numpy as np
import pandas as pd
from scrapinghub import ScrapinghubClient
@@ -13,23 +14,25 @@


class Items:
def __init__(self, raw: RawItems, df: pd.DataFrame, expand: bool = False):
def __init__(self, raw: RawItems, df: pd.DataFrame, expand: bool):
self.raw = raw
self.df = self.process_df(df)
self._flat_df = None
self.expand = expand

def __len__(self):
def __len__(self) -> int:
return len(self.df)

@property
def flat_df(self):
def flat_df(self) -> pd.DataFrame:
if self._flat_df is None:
if self.expand:
self._flat_df, self._columns_map = pandas.flatten_df(self.df)
self._flat_df = pd.DataFrame(flatten(i) for i in self.raw)
self._flat_df["_key"] = self.df.get(
"_key", [str(i) for i in range(len(self))]
)
else:
self._flat_df = self.df
self._columns_map = {}
return self._flat_df

@staticmethod
@@ -40,8 +43,12 @@ def process_df(df: pd.DataFrame) -> pd.DataFrame:
df["_type"] = df["_type"].astype("category")
return df

def get_origin_column_name(self, column_name: str) -> str:
return self._columns_map.get(column_name, column_name)
def origin_column_name(self, new: str) -> str:
if new in self.df.columns:
return new
for column in self.df.columns:
if column in new:
return column

@classmethod
def from_df(cls, df: pd.DataFrame, expand: bool = True):
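For reference, a standalone sketch of what the new `flat_df` property does, per the diff above: each raw item is flattened with `flatten_json` and a `_key` column is attached. The sample items below are illustrative only.

```python
import pandas as pd
from flatten_json import flatten

raw_items = [
    {"name": "Bob", "tags": ["western", "comedy"]},
    {"name": "Ann", "tags": ["drama", "history"]},
]

# flatten() turns nested lists/dicts into underscore-joined keys,
# e.g. tags -> tags_0, tags_1.
flat_df = pd.DataFrame(flatten(i) for i in raw_items)

# Mirror the df.get("_key", ...) fallback in the property: stringified
# positions when the source items carry no _key field.
flat_df["_key"] = [str(i) for i in range(len(raw_items))]

print(flat_df.columns.tolist())  # ['name', 'tags_0', 'tags_1', '_key']
```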
2 changes: 1 addition & 1 deletion src/arche/rules/others.py
@@ -93,7 +93,7 @@ def garbage_symbols(items: Items) -> Result:
matches = matches[["spaces", "html_entities", "css", "html_tags"]]
if not matches.empty:
error_keys = items.flat_df.iloc[matches.unstack().index.values]["_key"]
original_column = items.get_origin_column_name(column)
original_column = items.origin_column_name(column)
bad_texts = matches.stack().value_counts().index.sort_values().tolist()
error = (
f"{len(error_keys)/len(items)*100:.1f}% of '{original_column}' "
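The `origin_column_name` call above replaces the old `_columns_map` lookup with a substring match against the original columns, so error messages still report the source field name. A minimal sketch of that lookup in isolation; the dataframe contents are illustrative.

```python
import pandas as pd

df = pd.DataFrame({"name": ["Bob"], "tags": [["western", "comedy"]]})

def origin_column_name(new: str) -> str:
    # Columns that already exist (e.g. "name") are returned unchanged;
    # flattened names such as "tags_0" map back to the column whose name
    # they contain.
    if new in df.columns:
        return new
    for column in df.columns:
        if column in new:
            return column

print(origin_column_name("name"))    # name
print(origin_column_name("tags_0"))  # tags
```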
79 changes: 0 additions & 79 deletions src/arche/tools/pandas.py

This file was deleted.

6 changes: 3 additions & 3 deletions tests/conftest.py
@@ -124,9 +124,9 @@ def get_job(request):
return Job(*request.param)


@pytest.fixture(scope="function", params=[(cloud_items)])
def get_collection(request):
return Collection(*request.param)
@pytest.fixture(scope="function")
def get_collection():
return Collection()


@pytest.fixture(scope="function")
82 changes: 78 additions & 4 deletions tests/readers/test_items.py
@@ -7,12 +7,11 @@


@pytest.mark.parametrize(
"name, expected_name", [("address", "address"), ("address_0", "address")]
"name, expected_name", [("price", "price"), ("name_0", "name")]
)
def test_get_origin_column_name(get_cloud_items, name, expected_name):
def test_origin_column_name(get_cloud_items, name, expected_name):
items = Items.from_df(pd.DataFrame(get_cloud_items))
items._columns_map = {"address_0": "address"}
assert items.get_origin_column_name(name) == expected_name
assert items.origin_column_name(name) == expected_name


@pytest.mark.parametrize(
@@ -134,3 +133,78 @@ def test_process_df():
exp_df = pd.DataFrame([[np.nan, np.nan, "NameItem"]], columns=["a", "b", "_type"])
exp_df["_type"] = exp_df["_type"].astype("category")
pd.testing.assert_frame_equal(df, exp_df)


flat_df_inputs = [
(
[{"name": "Bob", "alive": True, "_key": 0, "following": None}],
{"_key": [0], "alive": [True], "following": [None], "name": ["Bob"]},
{"name": "name", "alive": "alive", "_key": "_key", "following": "following"},
),
(
[{"tags": ["western", "comedy"]}, {"tags": ["drama", "history"]}],
{
"tags_0": ["western", "drama"],
"tags_1": ["comedy", "history"],
"_key": ["0", "1"],
},
{"tags_0": "tags", "tags_1": "tags"},
),
(
[
{
"links": [
{"Instagram": "http://www.instagram.com"},
{"ITW website": "http://www.itw.com"},
]
}
],
{
"links_0_Instagram": ["http://www.instagram.com"],
"links_1_ITW website": ["http://www.itw.com"],
"_key": ["0"],
},
{"links_0_Instagram": "links", "links_1_ITW website": "links"},
),
(
[
{
"links": [
{"Instagram": ["http://www.instagram.com"]},
{"ITW website": ["http://www.itw.com"]},
]
}
],
{
"links_0_Instagram_0": ["http://www.instagram.com"],
"links_1_ITW website_0": ["http://www.itw.com"],
"_key": ["0"],
},
{"links_0_Instagram_0": "links", "links_1_ITW website_0": "links"},
),
# Corner case https://github.com/amirziai/flatten/issues/48
(
[
{"type": [0], "str": "k", "type_0": 5},
{"type": [0, [2, 3]], "str": "s", "type_0": 6},
],
{
"str": ["k", "s"],
"type_0": [5, 6],
"type_1_0": [np.nan, 2.0],
"type_1_1": [np.nan, 3.0],
"_key": ["0", "1"],
},
{"type_0": "type_0", "str": "str", "type_1_0": "type", "type_1_1": "type"},
),
]


@pytest.mark.parametrize("data, expected_data, expected_map", flat_df_inputs)
def test_flat_df(data, expected_data, expected_map):
i = Items.from_array(data, expand=True)
pd.testing.assert_frame_equal(
i.flat_df, pd.DataFrame(expected_data), check_like=False
)
for new, old in expected_map.items():
assert i.origin_column_name(new) == old
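To make the parametrised cases above easier to read, this is what `flatten_json` produces on its own for the nested `links` record (default separator `_`). The last case in the list exercises the key-collision corner case from flatten issue #48, where a literal `type_0` field collides with a flattened `type` list.

```python
from flatten_json import flatten

item = {
    "links": [
        {"Instagram": "http://www.instagram.com"},
        {"ITW website": "http://www.itw.com"},
    ]
}

# List indices and nested dict keys are joined with "_", including keys
# that contain spaces, which is why the expected columns above read
# "links_0_Instagram" and "links_1_ITW website".
print(flatten(item))
# {'links_0_Instagram': 'http://www.instagram.com',
#  'links_1_ITW website': 'http://www.itw.com'}
```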
80 changes: 0 additions & 80 deletions tests/tools/test_pandas.py

This file was deleted.
