Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automatically downcast data types in from_geopandas #195

Merged
merged 2 commits into from
Nov 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion lonboard/_layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from lonboard._constants import EPSG_4326, EXTENSION_NAME, OGC_84
from lonboard._geoarrow.geopandas_interop import geopandas_to_geoarrow
from lonboard._serialization import infer_rows_per_chunk
from lonboard._utils import auto_downcast as _auto_downcast
from lonboard.traits import ColorAccessor, FloatAccessor, PyarrowTableTrait

if TYPE_CHECKING:
Expand Down Expand Up @@ -82,7 +83,9 @@ def _default_rows_per_chunk(self):
return infer_rows_per_chunk(self.table)

@classmethod
def from_geopandas(cls, gdf: gpd.GeoDataFrame, **kwargs) -> Self:
def from_geopandas(
cls, gdf: gpd.GeoDataFrame, *, auto_downcast: bool = True, **kwargs
) -> Self:
"""Construct a Layer from a geopandas GeoDataFrame.

The GeoDataFrame will be reprojected to EPSG:4326 if it is not already in that
Expand All @@ -91,13 +94,23 @@ def from_geopandas(cls, gdf: gpd.GeoDataFrame, **kwargs) -> Self:
Args:
gdf: The GeoDataFrame to set on the layer.

Other parameters:
auto_downcast: If `True`, automatically downcast to smaller-size data types
if possible without loss of precision. This calls
[pandas.DataFrame.convert_dtypes][pandas.DataFrame.convert_dtypes] and
[pandas.to_numeric][pandas.to_numeric] under the hood.

Returns:
A Layer with the initialized data.
"""
if gdf.crs and gdf.crs not in [EPSG_4326, OGC_84]:
warnings.warn("GeoDataFrame being reprojected to EPSG:4326")
gdf = gdf.to_crs(OGC_84) # type: ignore

if auto_downcast:
# Note: we don't deep copy because we don't need to clone geometries
gdf = _auto_downcast(gdf.copy())

table = geopandas_to_geoarrow(gdf)
return cls(table=table, **kwargs)

Expand Down
53 changes: 53 additions & 0 deletions lonboard/_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
from typing import TypeVar

import numpy as np
import pandas as pd
import pyarrow as pa

from lonboard._constants import EXTENSION_NAME

DF = TypeVar("DF", bound=pd.DataFrame)

GEOARROW_EXTENSION_TYPE_NAMES = {e.value for e in EXTENSION_NAME}


Expand All @@ -17,3 +23,50 @@ def get_geometry_column_index(schema: pa.Schema) -> int:
return field_idx

raise ValueError("No geometry column in table schema.")


def auto_downcast(df: DF) -> DF:
    """Downcast the non-geometry columns of ``df`` to smaller data types.

    The frame is modified **in place** and the same object is returned, so
    callers that need to keep their original data should pass a copy.

    Args:
        df: pandas DataFrame or geopandas GeoDataFrame.

    Returns:
        The same ``df`` object, with non-geometry columns converted to
        pyarrow-backed dtypes and numeric columns downcast where
        ``pandas.to_numeric`` is able to do so.
    """
    # Convert objects to pyarrow-backed nullable types where possible.
    # Note: we have to exclude geometry because
    # `convert_dtypes(dtype_backend="pyarrow")` fails on the geometry column, but we
    # also have to manually cast to a non-geo data frame because it'll fail to
    # convert dtypes on a GeoDataFrame without a geom col.
    non_geometry = pd.DataFrame(df.select_dtypes(exclude="geometry"))
    converted = non_geometry.convert_dtypes(
        infer_objects=True,
        convert_string=True,
        convert_integer=True,
        convert_boolean=True,
        convert_floating=True,
        dtype_backend="pyarrow",
    )
    df[converted.columns] = converted

    # Ordered downcast passes; the order matters:
    #   1. Try to turn _every_ integer column into the smallest unsigned type.
    #      `to_numeric` only downcasts when the values fit, so columns that
    #      contain negative integers simply keep their signed dtype.
    #   2. Columns that are still signed are shrunk to the smallest signed type.
    #   3. Floating point columns are shrunk to a smaller float type.
    # `errors="ignore"` is intentionally not passed: it only governs parsing
    # failures (these columns are already numeric) and has been deprecated
    # since pandas 2.2.
    downcast_passes = (
        (np.integer, "unsigned"),
        (np.signedinteger, "signed"),
        (np.floating, "float"),
    )
    for dtype_selector, downcast in downcast_passes:
        # Columns are re-selected for every pass so that each pass sees the
        # dtypes produced by the previous one.
        for col_name in df.select_dtypes(dtype_selector).columns:  # type: ignore
            df[col_name] = pd.to_numeric(
                df[col_name],
                downcast=downcast,  # type: ignore
                dtype_backend="pyarrow",
            )

    return df