Skip to content

Commit

Permalink
edit pyarrow stringify to better handle emojis and accents
Browse files Browse the repository at this point in the history
  • Loading branch information
eschutho committed Jan 30, 2023
1 parent c839d0d commit b8d47ca
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 14 deletions.
9 changes: 7 additions & 2 deletions superset/result_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,14 @@ def stringify_values(array: NDArray[Any]) -> NDArray[Any]:
for obj in it:
if na_obj := pd.isna(obj):
# pandas <NA> type cannot be converted to string
obj[na_obj] = None # type: ignore
obj[na_obj] = None
else:
obj[...] = stringify(obj) # type: ignore
try:
# for simple string conversions
# this handles odd character types better
obj[...] = obj.astype(str)
except ValueError:
obj[...] = stringify(obj)

return result

Expand Down
8 changes: 4 additions & 4 deletions superset/utils/pandas_postprocessing/boxplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,10 @@ def boxplot(
"""

def quartile1(series: Series) -> float:
return np.nanpercentile(series, 25, interpolation="midpoint") # type: ignore
return np.nanpercentile(series, 25, interpolation="midpoint")

def quartile3(series: Series) -> float:
return np.nanpercentile(series, 75, interpolation="midpoint") # type: ignore
return np.nanpercentile(series, 75, interpolation="midpoint")

if whisker_type == PostProcessingBoxplotWhiskerType.TUKEY:

Expand Down Expand Up @@ -99,8 +99,8 @@ def whisker_low(series: Series) -> float:
return np.nanpercentile(series, low)

else:
whisker_high = np.max # type: ignore
whisker_low = np.min # type: ignore
whisker_high = np.max
whisker_low = np.min

def outliers(series: Series) -> Set[float]:
above = series[series > whisker_high(series)]
Expand Down
2 changes: 1 addition & 1 deletion superset/utils/pandas_postprocessing/flatten.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def flatten(
_columns = []
for series in df.columns.to_flat_index():
_cells = []
for cell in series if is_sequence(series) else [series]: # type: ignore
for cell in series if is_sequence(series) else [series]:
if pd.notnull(cell):
# every cell should be converted to string and escape comma
_cells.append(escape_separator(str(cell)))
Expand Down
82 changes: 81 additions & 1 deletion tests/unit_tests/dataframe_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,87 @@ def test_df_to_records_NaT_type() -> None:

assert df_to_records(df) == [
{"date": None},
{"date": '"2023-01-06T20:50:31.749000+00:00"'},
{"date": "2023-01-06 20:50:31.749000+00:00"},
]


def test_df_to_records_mixed_emoji_type() -> None:
    """
    Check ``df_to_records`` on a varchar column mixing plain text, emoji and NaT.

    The rows are loaded through ``SupersetResultSet`` and converted to a pandas
    DataFrame; the emoji strings are expected back unchanged and the ``NaT``
    cell is expected to come back as ``None``.
    """
    from superset.db_engine_specs import BaseEngineSpec
    from superset.result_set import SupersetResultSet

    rows = [
        ("What's up?", "This is a string text", 1),
        ("What's up?", "This is a string with an 😍 added", 2),
        ("What's up?", NaT, 3),
        ("What's up?", "Last emoji 😁", 4),
    ]
    description: DbapiDescription = [
        ("question", "varchar", None, None, None, None, False),
        ("response", "varchar", None, None, None, None, False),
        ("count", "integer", None, None, None, None, False),
    ]

    frame = SupersetResultSet(rows, description, BaseEngineSpec).to_pandas_df()

    expected_records = [
        {
            "question": "What's up?",
            "response": "This is a string text",
            "count": 1,
        },
        {
            "question": "What's up?",
            "response": "This is a string with an 😍 added",
            "count": 2,
        },
        # the NaT input cell is expected to surface as None
        {
            "question": "What's up?",
            "response": None,
            "count": 3,
        },
        {
            "question": "What's up?",
            "response": "Last emoji 😁",
            "count": 4,
        },
    ]
    assert df_to_records(frame) == expected_records


def test_df_to_records_mixed_accent_type() -> None:
    """
    Check ``df_to_records`` on a varchar column mixing plain text, accents and NaT.

    The rows are loaded through ``SupersetResultSet`` and converted to a pandas
    DataFrame; the accented strings are expected back unchanged and the ``NaT``
    cell is expected to come back as ``None``.
    """
    from superset.db_engine_specs import BaseEngineSpec
    from superset.result_set import SupersetResultSet

    rows = [
        ("What's up?", "This is a string text", 1),
        ("What's up?", "This is a string with áccent", 2),
        ("What's up?", NaT, 3),
        ("What's up?", "móre áccent", 4),
    ]
    description: DbapiDescription = [
        ("question", "varchar", None, None, None, None, False),
        ("response", "varchar", None, None, None, None, False),
        ("count", "integer", None, None, None, None, False),
    ]

    frame = SupersetResultSet(rows, description, BaseEngineSpec).to_pandas_df()

    expected_records = [
        {
            "question": "What's up?",
            "response": "This is a string text",
            "count": 1,
        },
        {
            "question": "What's up?",
            "response": "This is a string with áccent",
            "count": 2,
        },
        # the NaT input cell is expected to surface as None
        {
            "question": "What's up?",
            "response": None,
            "count": 3,
        },
        {
            "question": "What's up?",
            "response": "móre áccent",
            "count": 4,
        },
    ]
    assert df_to_records(frame) == expected_records


Expand Down
12 changes: 6 additions & 6 deletions tests/unit_tests/result_set_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,10 @@ def test_stringify_with_null_integers():

expected = np.array(
[
array(['"foo"', '"foo"', '"foo"'], dtype=object),
array(['"bar"', '"bar"', '"bar"'], dtype=object),
array(["foo", "foo", "foo"], dtype=object),
array(["bar", "bar", "bar"], dtype=object),
array([None, None, None], dtype=object),
array([None, "true", None], dtype=object),
array([None, "True", None], dtype=object),
]
)

Expand Down Expand Up @@ -132,10 +132,10 @@ def test_stringify_with_null_timestamps():

expected = np.array(
[
array(['"foo"', '"foo"', '"foo"'], dtype=object),
array(['"bar"', '"bar"', '"bar"'], dtype=object),
array(["foo", "foo", "foo"], dtype=object),
array(["bar", "bar", "bar"], dtype=object),
array([None, None, None], dtype=object),
array([None, "true", None], dtype=object),
array([None, "True", None], dtype=object),
]
)

Expand Down

0 comments on commit b8d47ca

Please sign in to comment.