From b8d47ca3ea6b32ba43cef418891d121df7c49366 Mon Sep 17 00:00:00 2001
From: Elizabeth Thompson <eschutho@gmail.com>
Date: Thu, 26 Jan 2023 16:43:46 -0800
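Subject: [PATCH 1/2] edit pyarrow stringify to better handle emojis and
 accents

stringify_values() in superset/result_set.py now tries a plain
astype(str) conversion first and only falls back to stringify() when
that raises ValueError. The new unit tests cover columns that mix NaT
with strings containing emojis or accented characters and expect those
strings to come back unchanged.

As the updated tests show, this changes the stringified output: string
values are no longer wrapped in double quotes ('"foo"' becomes "foo"),
booleans become "True" instead of "true", and timestamps use the str()
form ("2023-01-06 20:50:31.749000+00:00") instead of the quoted
"2023-01-06T20:50:31.749000+00:00" form.

Several "# type: ignore" comments are also dropped in result_set.py,
boxplot.py and flatten.py.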

---
 superset/result_set.py                        |  9 +-
 .../utils/pandas_postprocessing/boxplot.py    |  8 +-
 .../utils/pandas_postprocessing/flatten.py    |  2 +-
 tests/unit_tests/dataframe_test.py            | 82 ++++++++++++++++++-
 tests/unit_tests/result_set_test.py           | 12 +--
 5 files changed, 99 insertions(+), 14 deletions(-)
diff --git a/superset/result_set.py b/superset/result_set.py
index 3d29673b9fcb9..170de1869c830 100644
--- a/superset/result_set.py
+++ b/superset/result_set.py
@@ -70,9 +70,14 @@ def stringify_values(array: NDArray[Any]) -> NDArray[Any]:
         for obj in it:
             if na_obj := pd.isna(obj):
                 # pandas <NA> type cannot be converted to string
-                obj[na_obj] = None  # type: ignore
+                obj[na_obj] = None
             else:
-                obj[...] = stringify(obj)  # type: ignore
+                try:
+                    # for simple string conversions
+                    # this handles odd character types better
+                    obj[...] = obj.astype(str)
+                except ValueError:
+                    obj[...] = stringify(obj)
 
     return result
 
diff --git a/superset/utils/pandas_postprocessing/boxplot.py b/superset/utils/pandas_postprocessing/boxplot.py
index 673c39ebf3836..e2706345b1ea9 100644
--- a/superset/utils/pandas_postprocessing/boxplot.py
+++ b/superset/utils/pandas_postprocessing/boxplot.py
@@ -57,10 +57,10 @@ def boxplot(
     """
 
     def quartile1(series: Series) -> float:
-        return np.nanpercentile(series, 25, interpolation="midpoint")  # type: ignore
+        return np.nanpercentile(series, 25, interpolation="midpoint")
 
     def quartile3(series: Series) -> float:
-        return np.nanpercentile(series, 75, interpolation="midpoint")  # type: ignore
+        return np.nanpercentile(series, 75, interpolation="midpoint")
 
     if whisker_type == PostProcessingBoxplotWhiskerType.TUKEY:
 
@@ -99,8 +99,8 @@ def whisker_low(series: Series) -> float:
             return np.nanpercentile(series, low)
 
     else:
-        whisker_high = np.max  # type: ignore
-        whisker_low = np.min  # type: ignore
+        whisker_high = np.max
+        whisker_low = np.min
 
     def outliers(series: Series) -> Set[float]:
         above = series[series > whisker_high(series)]
diff --git a/superset/utils/pandas_postprocessing/flatten.py b/superset/utils/pandas_postprocessing/flatten.py
index 1026164e454ee..db783c4bed264 100644
--- a/superset/utils/pandas_postprocessing/flatten.py
+++ b/superset/utils/pandas_postprocessing/flatten.py
@@ -85,7 +85,7 @@ def flatten(
         _columns = []
         for series in df.columns.to_flat_index():
             _cells = []
-            for cell in series if is_sequence(series) else [series]:  # type: ignore
+            for cell in series if is_sequence(series) else [series]:
                 if pd.notnull(cell):
                     # every cell should be converted to string and escape comma
                     _cells.append(escape_separator(str(cell)))
diff --git a/tests/unit_tests/dataframe_test.py b/tests/unit_tests/dataframe_test.py
index 3d8bd15aeff33..f0d9bc31b064b 100644
--- a/tests/unit_tests/dataframe_test.py
+++ b/tests/unit_tests/dataframe_test.py
@@ -55,7 +55,87 @@ def test_df_to_records_NaT_type() -> None:
 
     assert df_to_records(df) == [
         {"date": None},
-        {"date": '"2023-01-06T20:50:31.749000+00:00"'},
+        {"date": "2023-01-06 20:50:31.749000+00:00"},
+    ]
+
+
+def test_df_to_records_mixed_emoji_type() -> None:
+    from superset.db_engine_specs import BaseEngineSpec
+    from superset.result_set import SupersetResultSet
+
+    data = [
+        ("What's up?", "This is a string text", 1),
+        ("What's up?", "This is a string with an 😍 added", 2),
+        ("What's up?", NaT, 3),
+        ("What's up?", "Last emoji 😁", 4),
+    ]
+
+    cursor_descr: DbapiDescription = [
+        ("question", "varchar", None, None, None, None, False),
+        ("response", "varchar", None, None, None, None, False),
+        ("count", "integer", None, None, None, None, False),
+    ]
+
+    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+    df = results.to_pandas_df()
+
+    assert df_to_records(df) == [
+        {"question": "What's up?", "response": "This is a string text", "count": 1},
+        {
+            "question": "What's up?",
+            "response": "This is a string with an 😍 added",
+            "count": 2,
+        },
+        {
+            "question": "What's up?",
+            "response": None,
+            "count": 3,
+        },
+        {
+            "question": "What's up?",
+            "response": "Last emoji 😁",
+            "count": 4,
+        },
+    ]
+
+
+def test_df_to_records_mixed_accent_type() -> None:
+    from superset.db_engine_specs import BaseEngineSpec
+    from superset.result_set import SupersetResultSet
+
+    data = [
+        ("What's up?", "This is a string text", 1),
+        ("What's up?", "This is a string with áccent", 2),
+        ("What's up?", NaT, 3),
+        ("What's up?", "móre áccent", 4),
+    ]
+
+    cursor_descr: DbapiDescription = [
+        ("question", "varchar", None, None, None, None, False),
+        ("response", "varchar", None, None, None, None, False),
+        ("count", "integer", None, None, None, None, False),
+    ]
+
+    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+    df = results.to_pandas_df()
+
+    assert df_to_records(df) == [
+        {"question": "What's up?", "response": "This is a string text", "count": 1},
+        {
+            "question": "What's up?",
+            "response": "This is a string with áccent",
+            "count": 2,
+        },
+        {
+            "question": "What's up?",
+            "response": None,
+            "count": 3,
+        },
+        {
+            "question": "What's up?",
+            "response": "móre áccent",
+            "count": 4,
+        },
     ]
 
 
diff --git a/tests/unit_tests/result_set_test.py b/tests/unit_tests/result_set_test.py
index 0a78e0a5edd0a..331810bb1ed62 100644
--- a/tests/unit_tests/result_set_test.py
+++ b/tests/unit_tests/result_set_test.py
@@ -98,10 +98,10 @@ def test_stringify_with_null_integers():
 
     expected = np.array(
         [
-            array(['"foo"', '"foo"', '"foo"'], dtype=object),
-            array(['"bar"', '"bar"', '"bar"'], dtype=object),
+            array(["foo", "foo", "foo"], dtype=object),
+            array(["bar", "bar", "bar"], dtype=object),
             array([None, None, None], dtype=object),
-            array([None, "true", None], dtype=object),
+            array([None, "True", None], dtype=object),
         ]
     )
 
@@ -132,10 +132,10 @@ def test_stringify_with_null_timestamps():
 
     expected = np.array(
         [
-            array(['"foo"', '"foo"', '"foo"'], dtype=object),
-            array(['"bar"', '"bar"', '"bar"'], dtype=object),
+            array(["foo", "foo", "foo"], dtype=object),
+            array(["bar", "bar", "bar"], dtype=object),
             array([None, None, None], dtype=object),
-            array([None, "true", None], dtype=object),
+            array([None, "True", None], dtype=object),
         ]
     )
 

From ebdc33776973f71e133a00fef7cb39a2b7dd4f6b Mon Sep 17 00:00:00 2001
From: Elizabeth Thompson <eschutho@gmail.com>
Date: Mon, 30 Jan 2023 09:25:12 -0800
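Subject: [PATCH 2/2] fix tests

Re-add "# type: ignore" comments in result_set.py, boxplot.py (the
np.max / np.min assignments) and flatten.py, pass method="midpoint"
instead of interpolation="midpoint" to np.nanpercentile, and update the
map_col expectations in the nested-types integration test to the
single-quoted form ("{'chart_name': 'scatter'}").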

---
 superset/result_set.py                          | 6 +++---
 superset/utils/pandas_postprocessing/boxplot.py | 8 ++++----
 superset/utils/pandas_postprocessing/flatten.py | 2 +-
 tests/integration_tests/result_set_tests.py     | 4 ++--
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/superset/result_set.py b/superset/result_set.py
index 170de1869c830..1c4ae98dc9112 100644
--- a/superset/result_set.py
+++ b/superset/result_set.py
@@ -70,14 +70,14 @@ def stringify_values(array: NDArray[Any]) -> NDArray[Any]:
         for obj in it:
             if na_obj := pd.isna(obj):
                 # pandas <NA> type cannot be converted to string
-                obj[na_obj] = None
+                obj[na_obj] = None  # type: ignore
             else:
                 try:
                     # for simple string conversions
                     # this handles odd character types better
-                    obj[...] = obj.astype(str)
+                    obj[...] = obj.astype(str)  # type: ignore
                 except ValueError:
-                    obj[...] = stringify(obj)
+                    obj[...] = stringify(obj)  # type: ignore
 
     return result
 
diff --git a/superset/utils/pandas_postprocessing/boxplot.py b/superset/utils/pandas_postprocessing/boxplot.py
index e2706345b1ea9..d4c78bf15e8c8 100644
--- a/superset/utils/pandas_postprocessing/boxplot.py
+++ b/superset/utils/pandas_postprocessing/boxplot.py
@@ -57,10 +57,10 @@ def boxplot(
     """
 
     def quartile1(series: Series) -> float:
-        return np.nanpercentile(series, 25, interpolation="midpoint")
+        return np.nanpercentile(series, 25, method="midpoint")
 
     def quartile3(series: Series) -> float:
-        return np.nanpercentile(series, 75, interpolation="midpoint")
+        return np.nanpercentile(series, 75, method="midpoint")
 
     if whisker_type == PostProcessingBoxplotWhiskerType.TUKEY:
 
@@ -99,8 +99,8 @@ def whisker_low(series: Series) -> float:
             return np.nanpercentile(series, low)
 
     else:
-        whisker_high = np.max
-        whisker_low = np.min
+        whisker_high = np.max  # type: ignore
+        whisker_low = np.min  # type: ignore
 
     def outliers(series: Series) -> Set[float]:
         above = series[series > whisker_high(series)]
diff --git a/superset/utils/pandas_postprocessing/flatten.py b/superset/utils/pandas_postprocessing/flatten.py
index db783c4bed264..1026164e454ee 100644
--- a/superset/utils/pandas_postprocessing/flatten.py
+++ b/superset/utils/pandas_postprocessing/flatten.py
@@ -85,7 +85,7 @@ def flatten(
         _columns = []
         for series in df.columns.to_flat_index():
             _cells = []
-            for cell in series if is_sequence(series) else [series]:
+            for cell in series if is_sequence(series) else [series]:  # type: ignore
                 if pd.notnull(cell):
                     # every cell should be converted to string and escape comma
                     _cells.append(escape_separator(str(cell)))
diff --git a/tests/integration_tests/result_set_tests.py b/tests/integration_tests/result_set_tests.py
index 626468fc5aae0..18135c486dbea 100644
--- a/tests/integration_tests/result_set_tests.py
+++ b/tests/integration_tests/result_set_tests.py
@@ -169,13 +169,13 @@ def test_nested_types(self):
                     "id": 4,
                     "dict_arr": '[{"table_name": "unicode_test", "database_id": 1}]',
                     "num_arr": "[1, 2, 3]",
-                    "map_col": '{"chart_name": "scatter"}',
+                    "map_col": "{'chart_name': 'scatter'}",
                 },
                 {
                     "id": 3,
                     "dict_arr": '[{"table_name": "birth_names", "database_id": 1}]',
                     "num_arr": "[4, 5, 6]",
-                    "map_col": '{"chart_name": "plot"}',
+                    "map_col": "{'chart_name': 'plot'}",
                 },
             ],
         )