Skip to content

Commit

Permalink
[feat] improvements to duckDB column type handling (#2970)
Browse files Browse the repository at this point in the history
This PR aims to preserve column types across the different ways data is ingested into Kepler and DuckDB.

- timestamps stored as strings from Arrow tables are recognized as timestamps. 
- apply extra metadata from table.schema.metadata (geoparquet files). 
- DuckDB geometry is automatically cast to WKB and properly marked with geoarrow extensions.
- DuckDB column types and query result Arrow table types consolidation.
- Apply extra logic only to the last select query.
- move geoarrow constants to the constants module
- add getSampleForTypeAnalyzeArrow to support Arrow data without failing
- arrowSchemaToFields accepts extra info from DuckDB table schemas. JSON type gets GEOMETRY_FROM_STRING type, GEOMETRY with geoarrow metadata gets GEOMETRY type, timestamp ...
- fix in validateInputData - check analyzerType only for current field
- fix in validateInputData - support arrow input data

---------

Signed-off-by: Ihor Dykhta <[email protected]>
  • Loading branch information
igorDykhta authored Feb 18, 2025
1 parent d30a95b commit 221b243
Show file tree
Hide file tree
Showing 20 changed files with 715 additions and 181 deletions.
50 changes: 49 additions & 1 deletion src/common-utils/src/data-type.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// Copyright contributors to the kepler.gl project

import {Analyzer, DATA_TYPES as AnalyzerDATA_TYPES} from 'type-analyzer';
import {RowData, Field} from '@kepler.gl/types';
import {ArrowTableInterface, ApacheVectorInterface, RowData, Field} from '@kepler.gl/types';
import {ALL_FIELD_TYPES} from '@kepler.gl/constants';
import {console as globalConsole} from 'global/window';
import {range} from 'd3-array';
Expand Down Expand Up @@ -80,6 +80,54 @@ export function getSampleForTypeAnalyze({
return sample;
}

/**
 * Collect sample rows for field type analysis from Arrow data.
 *
 * For every field, the column at the same index is scanned from the top and the
 * first `sampleCount` non-null values are taken; string values are trimmed.
 * When a column runs out of non-null values (or has no vector at all), the
 * remaining sample slots for that field are filled with `null`.
 *
 * @param table Arrow table or an array of vectors. For an array, the row count
 *   is taken from the first vector.
 * @param fields Field names; `fields[i]` is read from the column at index `i`.
 * @param sampleCount Maximum number of sample rows to get.
 * @returns Sample rows keyed by field name, or an empty array when there is no data.
 */
export function getSampleForTypeAnalyzeArrow(
  table: ArrowTableInterface | ApacheVectorInterface[],
  fields: string[],
  sampleCount: number = 50
): any[] {
  // An empty vector array has no first vector to read the length from: treat it as 0 rows.
  const numRows = Array.isArray(table) ? table[0]?.length ?? 0 : table.numRows;
  if (numRows < 1) {
    return [];
  }

  const getVector = (index: number) =>
    Array.isArray(table) ? table[index] : table.getChildAt(index);

  const total = Math.min(sampleCount, numRows);
  const sample: Record<string, unknown>[] = Array.from({length: total}, () => ({}));

  // collect sample data for each field
  fields.forEach((field, fieldIdx) => {
    // resolve the column once per field instead of once per row
    const vector = getVector(fieldIdx);
    // a missing column contributes no values, so skip the scan entirely
    const rowLimit = vector ? numRows : 0;

    let rowIndex = 0;
    let sampleIndex = 0;

    while (sampleIndex < total && rowIndex < rowLimit) {
      const value = vector?.get(rowIndex);
      rowIndex++;
      // null / undefined cells are skipped so the analyzer sees real values only
      if (value !== null && value !== undefined) {
        sample[sampleIndex][field] = typeof value === 'string' ? value.trim() : value;
        sampleIndex++;
      }
    }

    // data pool depleted: pad the remaining sample rows
    for (; sampleIndex < total; sampleIndex++) {
      sample[sampleIndex][field] = null;
    }
  });

  return sample;
}

/**
* Convert type-analyzer output to kepler.gl field types
*
Expand Down
2 changes: 1 addition & 1 deletion src/components/src/common/data-table/header-cell.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ const HeaderCellFactory = (FieldToken: React.FC<FieldTokenProps>) => {
const firstCell = columnIndex === 0;
const isFormatted = Boolean(colMeta[column]?.displayFormat);
const formatLabels = isFormatted ? getFieldFormatLabels(colMeta[column].type) : [];
const onSortTable = useCallback(() => sortTableColumn(column), [sortTableColumn, column]);
const onSortTable = useCallback(() => sortTableColumn?.(column), [sortTableColumn, column]);
const onToggleOptionMenu = useCallback(
() => toggleMoreOptions(column),
[toggleMoreOptions, column]
Expand Down
15 changes: 15 additions & 0 deletions src/constants/src/default-settings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1578,3 +1578,18 @@ export const SYNC_TIMELINE_MODES: Record<string, SyncTimelineMode> = {
start: 0,
end: 1
};

/**
 * Arrow field metadata key under which the extension type name is stored
 * (for GeoArrow columns the stored value starts with `geoarrow`).
 */
export const GEOARROW_METADATA_KEY = 'ARROW:extension:name';

/**
 * Enum holding GeoArrow extension type names.
 *
 * Each member's value is the extension name string for one geometry encoding
 * (for example `geoarrow.point`). The arrow layers pass these values to
 * `getGeometryVector` to look up the matching geometry column of a table.
 */
export enum GEOARROW_EXTENSIONS {
POINT = 'geoarrow.point',
LINESTRING = 'geoarrow.linestring',
POLYGON = 'geoarrow.polygon',
MULTIPOINT = 'geoarrow.multipoint',
MULTILINESTRING = 'geoarrow.multilinestring',
MULTIPOLYGON = 'geoarrow.multipolygon',
WKB = 'geoarrow.wkb'
}
1 change: 1 addition & 0 deletions src/deckgl-arrow-layers/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
],
"dependencies": {
"@geoarrow/geoarrow-js": "^0.3.0",
"@kepler.gl/constants": "^3.1.0",
"@math.gl/core": "^4.0.0",
"@math.gl/polygon": "^4.0.0",
"@math.gl/types": "^4.0.0",
Expand Down
13 changes: 0 additions & 13 deletions src/deckgl-arrow-layers/src/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,4 @@
// SPDX-License-Identifier: MIT
// Copyright (c) vis.gl contributors

/**
 * Enum holding GeoArrow extension type names.
 *
 * Each member's value is the extension name string for one geometry encoding
 * (for example `geoarrow.point`). The arrow layers pass these values to
 * `getGeometryVector` to look up the matching geometry column of a table.
 */
export enum EXTENSION_NAME {
POINT = 'geoarrow.point',
LINESTRING = 'geoarrow.linestring',
POLYGON = 'geoarrow.polygon',
MULTIPOINT = 'geoarrow.multipoint',
MULTILINESTRING = 'geoarrow.multilinestring',
MULTIPOLYGON = 'geoarrow.multipolygon',
WKB = 'geoarrow.wkb'
}

// Default color as a 4-number tuple.
// NOTE(review): presumably [r, g, b, a] in 0-255, i.e. opaque black — confirm against layer usage.
export const DEFAULT_COLOR: [number, number, number, number] = [0, 0, 0, 255];
2 changes: 0 additions & 2 deletions src/deckgl-arrow-layers/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
// SPDX-License-Identifier: MIT
// Copyright contributors to the kepler.gl project

export {EXTENSION_NAME} from './constants';

export {GeoArrowScatterplotLayer} from './layers/geo-arrow-scatterplot-layer';
export {GeoArrowTextLayer} from './layers/geo-arrow-text-layer';
export {GeoArrowArcLayer} from './layers/geo-arrow-arc-layer';
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ import {ScatterplotLayer} from '@deck.gl/layers/typed';
import type {ScatterplotLayerProps} from '@deck.gl/layers/typed';
import * as arrow from 'apache-arrow';
import * as ga from '@geoarrow/geoarrow-js';

import {GEOARROW_EXTENSIONS} from '@kepler.gl/constants';

import {
assignAccessor,
extractAccessorsFromProps,
Expand All @@ -26,7 +29,6 @@ import {
} from '../utils/utils';
import {GeoArrowExtraPickingProps, computeChunkOffsets, getPickingInfo} from '../utils/picking';
import {ColorAccessor, FloatAccessor, GeoArrowPickingInfo, ExtensionProps} from '../types';
import {EXTENSION_NAME} from '../constants';
import {validateAccessors} from '../utils/validate';

/** All properties supported by GeoArrowScatterplotLayer */
Expand Down Expand Up @@ -121,12 +123,12 @@ export class GeoArrowScatterplotLayer<ExtraProps extends object = object> extend

throw new Error('getPosition should pass in an arrow Vector of Point or MultiPoint type');
} else {
const pointVector = getGeometryVector(table, EXTENSION_NAME.POINT);
const pointVector = getGeometryVector(table, GEOARROW_EXTENSIONS.POINT);
if (pointVector !== null) {
return this._renderLayersPoint(pointVector);
}

const multiPointVector = getGeometryVector(table, EXTENSION_NAME.MULTIPOINT);
const multiPointVector = getGeometryVector(table, GEOARROW_EXTENSIONS.MULTIPOINT);
if (multiPointVector !== null) {
return this._renderLayersMultiPoint(multiPointVector);
}
Expand Down
6 changes: 4 additions & 2 deletions src/deckgl-arrow-layers/src/layers/geo-arrow-text-layer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ import {TextLayer} from '@deck.gl/layers/typed';
import type {TextLayerProps} from '@deck.gl/layers';
import * as arrow from 'apache-arrow';
import * as ga from '@geoarrow/geoarrow-js';

import {GEOARROW_EXTENSIONS} from '@kepler.gl/constants';

import {
assignAccessor,
expandArrayToCoords,
Expand All @@ -26,7 +29,6 @@ import {
} from '../utils/utils';
import {GeoArrowExtraPickingProps, computeChunkOffsets, getPickingInfo} from '../utils/picking';
import {ColorAccessor, FloatAccessor, GeoArrowPickingInfo, ExtensionProps} from '../types';
import {EXTENSION_NAME} from '../constants';
import {validateAccessors} from '../utils/validate';

/** All properties supported by GeoArrowTextLayer */
Expand Down Expand Up @@ -167,7 +169,7 @@ export class GeoArrowTextLayer<ExtraProps extends object = object> extends Compo

throw new Error('getPosition should pass in an arrow Vector of Point type');
} else {
const pointVector = getGeometryVector(table, EXTENSION_NAME.POINT);
const pointVector = getGeometryVector(table, GEOARROW_EXTENSIONS.POINT);
if (pointVector !== null) {
return this._renderLayersPoint(pointVector);
}
Expand Down
22 changes: 15 additions & 7 deletions src/duckdb/src/components/preview-data-panel.tsx
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
// SPDX-License-Identifier: MIT
// Copyright contributors to the kepler.gl project

import * as arrow from 'apache-arrow';
import React, {useCallback, useMemo, useState, CSSProperties} from 'react';
import {withTheme} from 'styled-components';

import {DataTable, renderedSize} from '@kepler.gl/components';
import {parseFieldValue, createDataContainer} from '@kepler.gl/utils';
import {arrowSchemaToFields} from '@kepler.gl/processors';
import {DataForm} from '@kepler.gl/utils';
import {withTheme} from 'styled-components';
import {parseFieldValue, createDataContainer, DataForm} from '@kepler.gl/utils';

type BaseComponentProps = {
className?: string;
Expand Down Expand Up @@ -39,14 +39,19 @@ export type DataTableStyle = {
optionsButton?: number;
};

/**
 * Query result consumed by the preview data panel: the Arrow table plus the
 * DuckDB type information that is passed alongside it to `arrowSchemaToFields`.
 */
export type QueryResult = {
// Arrow table with the result data; its columns are read via `numCols` / `getChildAt`.
table: arrow.Table;
// DuckDB types as strings; presumably keyed by column name — TODO confirm against the producer.
tableDuckDBTypes: Record<string, string>;
};

export type PreviewDataPanelProps = BaseComponentProps & {
result: any;
result: QueryResult;
rowsToCalculatePreview?: number;
theme?: any;
setColumnDisplayFormat?: (formats: {[key: string]: string}) => void;
defaultPinnedColumns?: string[];
dataTableStyle: DataTableStyle;
onAddResultToMap: (result: any) => void;
onAddResultToMap: (result: QueryResult) => void;
};

const PreviewDataPanelWOTheme: React.FC<PreviewDataPanelProps> = ({
Expand All @@ -57,9 +62,12 @@ const PreviewDataPanelWOTheme: React.FC<PreviewDataPanelProps> = ({
theme
}) => {
const [pinnedColumns, setPinnedColumns] = useState<string[]>(defaultPinnedColumns);
const fields = useMemo(() => arrowSchemaToFields(result.schema), [result.schema]);
const fields = useMemo(
() => arrowSchemaToFields(result.table, result.tableDuckDBTypes),
[result]
);
const dataContainer = useMemo(() => {
const cols = [...Array(result.numCols).keys()].map(i => result.getChildAt(i));
const cols = [...Array(result.table.numCols).keys()].map(i => result.table.getChildAt(i));

const dataContainer = createDataContainer(cols, {
fields,
Expand Down
25 changes: 14 additions & 11 deletions src/duckdb/src/components/schema-panel.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@
import React, {useCallback, useEffect, useState} from 'react';
import {useSelector} from 'react-redux';
import styled from 'styled-components';
import {arrowDataTypeToFieldType} from '@kepler.gl/utils';
import {ALL_FIELD_TYPES} from '@kepler.gl/constants';
import {AsyncDuckDBConnection} from '@duckdb/duckdb-wasm';

import {arrowSchemaToFields} from '@kepler.gl/processors';
import {VisState} from '@kepler.gl/schemas';

import {Tree, DatasetNode, ColumnNode, TreeNodeData} from './tree';
import {getDuckDB} from '../init';
import {getDuckDBColumnTypes, getDuckDBColumnTypesMap} from '../table/duckdb-table-utils';

// TODO: note that demo state is available in demo-app, but not when these modules are added as dependencies of a custom map
type State = {
Expand All @@ -28,26 +31,26 @@ const StyledSchemaPanel = styled.div`
font-family: ${props => props.theme.fontFamily};
`;

async function getColumnSchema(c, name) {
const columnResult = await c.query(`Select * from '${name}' LIMIT 1;`);
async function getColumnSchema(connection: AsyncDuckDBConnection, tableName: string) {
const columnResult = await connection.query(`Select * from '${tableName}' LIMIT 1;`);

const columnDescribe = await getDuckDBColumnTypes(connection, tableName);
const keplerFields = arrowSchemaToFields(columnResult, getDuckDBColumnTypesMap(columnDescribe));

return {
key: name,
key: tableName,
object: {
type: 'dataset',
tableName: name
tableName: tableName
},
children: columnResult.schema.fields.map(field => {
const isGeoArrowColumn = field.metadata.get('ARROW:extension:name')?.startsWith('geoarrow');
children: columnResult.schema.fields.map((field, fieldIndex) => {
return {
key: field.name,
object: {
type: 'column',
name: field.name,
arrowType: field.type,
fieldType: isGeoArrowColumn
? ALL_FIELD_TYPES.geoarrow
: arrowDataTypeToFieldType(field.type)
fieldType: keplerFields[fieldIndex].type
}
};
})
Expand Down
Loading

0 comments on commit 221b243

Please sign in to comment.