From b8d47ca3ea6b32ba43cef418891d121df7c49366 Mon Sep 17 00:00:00 2001 From: Elizabeth Thompson Date: Thu, 26 Jan 2023 16:43:46 -0800 Subject: [PATCH 1/2] edit pyarrow stringify to better handle emojis and accents --- superset/result_set.py | 9 +- .../utils/pandas_postprocessing/boxplot.py | 8 +- .../utils/pandas_postprocessing/flatten.py | 2 +- tests/unit_tests/dataframe_test.py | 82 ++++++++++++++++++- tests/unit_tests/result_set_test.py | 12 +-- 5 files changed, 99 insertions(+), 14 deletions(-) diff --git a/superset/result_set.py b/superset/result_set.py index 3d29673b9fcb9..170de1869c830 100644 --- a/superset/result_set.py +++ b/superset/result_set.py @@ -70,9 +70,14 @@ def stringify_values(array: NDArray[Any]) -> NDArray[Any]: for obj in it: if na_obj := pd.isna(obj): # pandas type cannot be converted to string - obj[na_obj] = None # type: ignore + obj[na_obj] = None else: - obj[...] = stringify(obj) # type: ignore + try: + # for simple string conversions + # this handles odd character types better + obj[...] = obj.astype(str) + except ValueError: + obj[...] = stringify(obj) return result diff --git a/superset/utils/pandas_postprocessing/boxplot.py b/superset/utils/pandas_postprocessing/boxplot.py index 673c39ebf3836..e2706345b1ea9 100644 --- a/superset/utils/pandas_postprocessing/boxplot.py +++ b/superset/utils/pandas_postprocessing/boxplot.py @@ -57,10 +57,10 @@ def boxplot( """ def quartile1(series: Series) -> float: - return np.nanpercentile(series, 25, interpolation="midpoint") # type: ignore + return np.nanpercentile(series, 25, interpolation="midpoint") def quartile3(series: Series) -> float: - return np.nanpercentile(series, 75, interpolation="midpoint") # type: ignore + return np.nanpercentile(series, 75, interpolation="midpoint") if whisker_type == PostProcessingBoxplotWhiskerType.TUKEY: @@ -99,8 +99,8 @@ def whisker_low(series: Series) -> float: return np.nanpercentile(series, low) else: - whisker_high = np.max # type: ignore - whisker_low = np.min # type: ignore + whisker_high = np.max + whisker_low = np.min def outliers(series: Series) -> Set[float]: above = series[series > whisker_high(series)] diff --git a/superset/utils/pandas_postprocessing/flatten.py b/superset/utils/pandas_postprocessing/flatten.py index 1026164e454ee..db783c4bed264 100644 --- a/superset/utils/pandas_postprocessing/flatten.py +++ b/superset/utils/pandas_postprocessing/flatten.py @@ -85,7 +85,7 @@ def flatten( _columns = [] for series in df.columns.to_flat_index(): _cells = [] - for cell in series if is_sequence(series) else [series]: # type: ignore + for cell in series if is_sequence(series) else [series]: if pd.notnull(cell): # every cell should be converted to string and escape comma _cells.append(escape_separator(str(cell))) diff --git a/tests/unit_tests/dataframe_test.py b/tests/unit_tests/dataframe_test.py index 3d8bd15aeff33..f0d9bc31b064b 100644 --- a/tests/unit_tests/dataframe_test.py +++ b/tests/unit_tests/dataframe_test.py @@ -55,7 +55,87 @@ def test_df_to_records_NaT_type() -> None: assert df_to_records(df) == [ {"date": None}, - {"date": '"2023-01-06T20:50:31.749000+00:00"'}, + {"date": "2023-01-06 20:50:31.749000+00:00"}, + ] + + +def test_df_to_records_mixed_emoji_type() -> None: + from superset.db_engine_specs import BaseEngineSpec + from superset.result_set import SupersetResultSet + + data = [ + ("What's up?", "This is a string text", 1), + ("What's up?", "This is a string with an 😍 added", 2), + ("What's up?", NaT, 3), + ("What's up?", "Last emoji 😁", 4), + ] + + cursor_descr: DbapiDescription = [ + ("question", "varchar", None, None, None, None, False), + ("response", "varchar", None, None, None, None, False), + ("count", "integer", None, None, None, None, False), + ] + + results = SupersetResultSet(data, cursor_descr, BaseEngineSpec) + df = results.to_pandas_df() + + assert df_to_records(df) == [ + {"question": "What's up?", "response": "This is a string text", "count": 1}, + { + "question": "What's up?", + "response": "This is a string with an 😍 added", + "count": 2, + }, + { + "question": "What's up?", + "response": None, + "count": 3, + }, + { + "question": "What's up?", + "response": "Last emoji 😁", + "count": 4, + }, + ] + + +def test_df_to_records_mixed_accent_type() -> None: + from superset.db_engine_specs import BaseEngineSpec + from superset.result_set import SupersetResultSet + + data = [ + ("What's up?", "This is a string text", 1), + ("What's up?", "This is a string with áccent", 2), + ("What's up?", NaT, 3), + ("What's up?", "móre áccent", 4), + ] + + cursor_descr: DbapiDescription = [ + ("question", "varchar", None, None, None, None, False), + ("response", "varchar", None, None, None, None, False), + ("count", "integer", None, None, None, None, False), + ] + + results = SupersetResultSet(data, cursor_descr, BaseEngineSpec) + df = results.to_pandas_df() + + assert df_to_records(df) == [ + {"question": "What's up?", "response": "This is a string text", "count": 1}, + { + "question": "What's up?", + "response": "This is a string with áccent", + "count": 2, + }, + { + "question": "What's up?", + "response": None, + "count": 3, + }, + { + "question": "What's up?", + "response": "móre áccent", + "count": 4, + }, ] diff --git a/tests/unit_tests/result_set_test.py b/tests/unit_tests/result_set_test.py index 0a78e0a5edd0a..331810bb1ed62 100644 --- a/tests/unit_tests/result_set_test.py +++ b/tests/unit_tests/result_set_test.py @@ -98,10 +98,10 @@ def test_stringify_with_null_integers(): expected = np.array( [ - array(['"foo"', '"foo"', '"foo"'], dtype=object), - array(['"bar"', '"bar"', '"bar"'], dtype=object), + array(["foo", "foo", "foo"], dtype=object), + array(["bar", "bar", "bar"], dtype=object), array([None, None, None], dtype=object), - array([None, "true", None], dtype=object), + array([None, "True", None], dtype=object), ] ) @@ -132,10 +132,10 @@ def test_stringify_with_null_timestamps(): expected = np.array( [ - array(['"foo"', '"foo"', '"foo"'], dtype=object), - array(['"bar"', '"bar"', '"bar"'], dtype=object), + array(["foo", "foo", "foo"], dtype=object), + array(["bar", "bar", "bar"], dtype=object), array([None, None, None], dtype=object), - array([None, "true", None], dtype=object), + array([None, "True", None], dtype=object), ] ) From ebdc33776973f71e133a00fef7cb39a2b7dd4f6b Mon Sep 17 00:00:00 2001 From: Elizabeth Thompson Date: Mon, 30 Jan 2023 09:25:12 -0800 Subject: [PATCH 2/2] fix tests --- superset/result_set.py | 6 +++--- superset/utils/pandas_postprocessing/boxplot.py | 8 ++++---- superset/utils/pandas_postprocessing/flatten.py | 2 +- tests/integration_tests/result_set_tests.py | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/superset/result_set.py b/superset/result_set.py index 170de1869c830..1c4ae98dc9112 100644 --- a/superset/result_set.py +++ b/superset/result_set.py @@ -70,14 +70,14 @@ def stringify_values(array: NDArray[Any]) -> NDArray[Any]: for obj in it: if na_obj := pd.isna(obj): # pandas type cannot be converted to string - obj[na_obj] = None + obj[na_obj] = None # type: ignore else: try: # for simple string conversions # this handles odd character types better - obj[...] = obj.astype(str) + obj[...] = obj.astype(str) # type: ignore except ValueError: - obj[...] = stringify(obj) + obj[...] = stringify(obj) # type: ignore return result diff --git a/superset/utils/pandas_postprocessing/boxplot.py b/superset/utils/pandas_postprocessing/boxplot.py index e2706345b1ea9..d4c78bf15e8c8 100644 --- a/superset/utils/pandas_postprocessing/boxplot.py +++ b/superset/utils/pandas_postprocessing/boxplot.py @@ -57,10 +57,10 @@ def boxplot( """ def quartile1(series: Series) -> float: - return np.nanpercentile(series, 25, interpolation="midpoint") + return np.nanpercentile(series, 25, method="midpoint") def quartile3(series: Series) -> float: - return np.nanpercentile(series, 75, interpolation="midpoint") + return np.nanpercentile(series, 75, method="midpoint") if whisker_type == PostProcessingBoxplotWhiskerType.TUKEY: @@ -99,8 +99,8 @@ def whisker_low(series: Series) -> float: return np.nanpercentile(series, low) else: - whisker_high = np.max - whisker_low = np.min + whisker_high = np.max # type: ignore + whisker_low = np.min # type: ignore def outliers(series: Series) -> Set[float]: above = series[series > whisker_high(series)] diff --git a/superset/utils/pandas_postprocessing/flatten.py b/superset/utils/pandas_postprocessing/flatten.py index db783c4bed264..1026164e454ee 100644 --- a/superset/utils/pandas_postprocessing/flatten.py +++ b/superset/utils/pandas_postprocessing/flatten.py @@ -85,7 +85,7 @@ def flatten( _columns = [] for series in df.columns.to_flat_index(): _cells = [] - for cell in series if is_sequence(series) else [series]: + for cell in series if is_sequence(series) else [series]: # type: ignore if pd.notnull(cell): # every cell should be converted to string and escape comma _cells.append(escape_separator(str(cell))) diff --git a/tests/integration_tests/result_set_tests.py b/tests/integration_tests/result_set_tests.py index 626468fc5aae0..18135c486dbea 100644 --- a/tests/integration_tests/result_set_tests.py +++ b/tests/integration_tests/result_set_tests.py @@ -169,13 +169,13 @@ def test_nested_types(self): "id": 4, "dict_arr": '[{"table_name": "unicode_test", "database_id": 1}]', "num_arr": "[1, 2, 3]", - "map_col": '{"chart_name": "scatter"}', + "map_col": "{'chart_name': 'scatter'}", }, { "id": 3, "dict_arr": '[{"table_name": "birth_names", "database_id": 1}]', "num_arr": "[4, 5, 6]", - "map_col": '{"chart_name": "plot"}', + "map_col": "{'chart_name': 'plot'}", }, ], )