From 0b838f31f223e9827ca821e8e5c509bfa5f9b3c9 Mon Sep 17 00:00:00 2001 From: Raoul Date: Mon, 20 Jan 2025 12:09:22 +0100 Subject: [PATCH 1/4] fix(VirtualDataframe): fixing virtual dataframe name conflict --- pandasai/dataframe/base.py | 29 +++++++++++++------------ pandasai/dataframe/virtual_dataframe.py | 15 ++++++++----- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/pandasai/dataframe/base.py b/pandasai/dataframe/base.py index 345225419..7099ea1f3 100644 --- a/pandasai/dataframe/base.py +++ b/pandasai/dataframe/base.py @@ -37,14 +37,14 @@ class DataFrame(pd.DataFrame): config (Config): Configuration settings """ - _metadata: ClassVar[list] = [ - "name", - "description", - "schema", - "path", - "config", + _metadata = [ "_agent", "_column_hash", + "config", + "description", + "name", + "path", + "schema", ] def __init__( @@ -56,21 +56,22 @@ def __init__( copy: bool | None = None, **kwargs, ) -> None: + _name: Optional[str] = kwargs.pop("name", None) + _schema: Optional[SemanticLayerSchema] = kwargs.pop("schema", None) + _description: Optional[str] = kwargs.pop("description", None) + _path: Optional[str] = kwargs.pop("path", None) + super().__init__( data=data, index=index, columns=columns, dtype=dtype, copy=copy ) - self.name: Optional[str] = kwargs.pop("name", None) self._column_hash = self._calculate_column_hash() - if not self.name: - self.name = f"table_{self._column_hash}" - - schema: Optional[SemanticLayerSchema] = kwargs.pop("schema", None) - self.schema = schema or DataFrame.get_default_schema(self) + self.name = _name or f"table_{self._column_hash}" + self.schema = _schema or DataFrame.get_default_schema(self) + self.description = _description + self.path = _path - self.description: Optional[str] = kwargs.pop("description", None) - self.path: Optional[str] = kwargs.pop("path", None) self.config = pai.config.get() self._agent: Optional[Agent] = None diff --git a/pandasai/dataframe/virtual_dataframe.py b/pandasai/dataframe/virtual_dataframe.py index 5baac91c4..a6c571b83 100644 --- a/pandasai/dataframe/virtual_dataframe.py +++ b/pandasai/dataframe/virtual_dataframe.py @@ -13,14 +13,17 @@ class VirtualDataFrame(DataFrame): - _metadata: ClassVar[list] = [ + _metadata = [ + "_agent", + "_column_hash", + "_head", "_loader", + "config", + "description", "head", - "_head", + "name", + "path", "schema", - "config", - "_agent", - "_column_hash", ] def __init__(self, *args, **kwargs): @@ -34,6 +37,7 @@ def __init__(self, *args, **kwargs): raise VirtualizationError("Schema is required for virtualization!") table_name = schema.source.table + description = schema.description super().__init__( @@ -47,7 +51,6 @@ def __init__(self, *args, **kwargs): def head(self): if self._head is None: self._head = self._loader.load_head() - return self._head @property From 65dc12ad5ec49e27c5d725dcad1802bb91160755 Mon Sep 17 00:00:00 2001 From: Raoul Date: Mon, 20 Jan 2025 15:44:29 +0100 Subject: [PATCH 2/4] feature(View): enabling view in SemanticLayerSchema --- docs/v3/semantic-layer.mdx | 76 ++++++++++++++--- pandasai/data_loader/semantic_layer_schema.py | 85 +++++++++++++++++-- .../dataframe/test_semantic_layer_schema.py | 79 +++++++++++++++++ 3 files changed, 225 insertions(+), 15 deletions(-) diff --git a/docs/v3/semantic-layer.mdx b/docs/v3/semantic-layer.mdx index b393a393c..e71efb8fd 100644 --- a/docs/v3/semantic-layer.mdx +++ b/docs/v3/semantic-layer.mdx @@ -238,15 +238,22 @@ columns: ``` **Type**: `list[dict]` -- Each dictionary represents a column -- `name` (str): Name of the column -- `type` (str): Data type of the column - - "string": IDs, names, categories - - "integer": counts, whole numbers - - "float": prices, percentages - - "datetime": timestamps, dates - - "boolean": flags, true/false values -- `description` (str): Clear explanation of what the column represents +- Each dictionary represents a column. +- **Fields**: + - `name` (str): Name of the column. + - For tables: Use simple column names (e.g., `transaction_id`). + - `type` (str): Data type of the column. + - Supported types: + - `"string"`: IDs, names, categories. + - `"integer"`: Counts, whole numbers. + - `"float"`: Prices, percentages. + - `"datetime"`: Timestamps, dates. + - `"boolean"`: Flags, true/false values. + - `description` (str): Clear explanation of what the column represents. + +**Constraints**: +1. Column names must be unique. +2. For views, all column names must be in the format `[table].[column]`. #### transformations Apply transformations to your data to clean, convert, or anonymize it. @@ -350,4 +357,53 @@ Specify the maximum number of records to load. **Type**: `int` ```yaml -limit: 1000 \ No newline at end of file +limit: 1000 +``` + +### View Configuration + +The following sections detail all available configurations for view options in your `schema.yaml` file. Similar to views in SQL, you can define multiple tables and the relationships between them. + +#### Example Configuration + +```yaml +name: table_heart +source: + type: postgres + connection: + host: localhost + port: 5432 + database: test + user: test + password: test + view: true +columns: +- name: parents.id +- name: parents.name +- name: parents.age +- name: children.name +- name: children.age +relations: +- name: parent_to_children + description: Relation linking the parent to its children + from: parents.id + to: children.id +``` + +--- + +#### Constraints + +1. **Mutual Exclusivity**: + - A schema cannot define both `table` and `view` simultaneously. + - If `source.view` is `true`, then the schema represents a view. + +2. **Column Format**: + - For views: + - All columns must follow the format `[table].[column]`. + - `from` and `to` fields in `relations` must follow the `[table].[column]` format. + - Example: `parents.id`, `children.name`. + +3. **Relationships for Views**: + - Each table referenced in `columns` must have at least one relationship defined in `relations`. + - Relationships must specify `from` and `to` attributes in the `[table].[column]` format. diff --git a/pandasai/data_loader/semantic_layer_schema.py b/pandasai/data_loader/semantic_layer_schema.py index b5c922188..b9e8e4373 100644 --- a/pandasai/data_loader/semantic_layer_schema.py +++ b/pandasai/data_loader/semantic_layer_schema.py @@ -1,4 +1,6 @@ import json +import re +from functools import partial from typing import Any, Dict, List, Optional, Union import yaml @@ -32,6 +34,17 @@ def is_column_type_supported(cls, type: str) -> str: return type +class Relation(BaseModel): + name: Optional[str] = Field(None, description="Name of the relationship.") + description: Optional[str] = Field( + None, description="Description of the relationship." + ) + from_: str = Field( + ..., alias="from", description="Source column for the relationship." + ) + to: str = Field(..., description="Target column for the relationship.") + + class Transformation(BaseModel): type: str = Field(..., description="Type of transformation to be applied.") params: Optional[Dict[str, str]] = Field( @@ -48,11 +61,12 @@ def is_transformation_type_supported(cls, type: str) -> str: class Source(BaseModel): type: str = Field(..., description="Type of the data source.") + path: Optional[str] = Field(None, description="Path of the local data source.") connection: Optional[Dict[str, Union[str, int]]] = Field( None, description="Connection object of the data source." ) - path: Optional[str] = Field(None, description="Path of the local data source.") table: Optional[str] = Field(None, description="Table of the data source.") + view: Optional[bool] = Field(False, description="Whether table is a view") @model_validator(mode="before") @classmethod @@ -60,6 +74,7 @@ def validate_type_and_fields(cls, values): _type = values.get("type") path = values.get("path") table = values.get("table") + view = values.get("view") connection = values.get("connection") if _type in LOCAL_SOURCE_TYPES: @@ -67,15 +82,17 @@ def validate_type_and_fields(cls, values): raise ValueError( f"For local source type '{_type}', 'path' must be defined." ) + if view: + raise ValueError("For local source type you can't use a view.") elif _type in REMOTE_SOURCE_TYPES: if not connection: raise ValueError( f"For remote source type '{_type}', 'connection' must be defined." ) - if not table: - raise ValueError( - f"For remote source type '{_type}', 'table' must be defined." - ) + if table and view: + raise ValueError("Only one of 'table' or 'view' can be defined.") + if not table and not view: + raise ValueError("Either 'table' or 'view' must be defined.") else: raise ValueError(f"Unsupported source type: {_type}") @@ -104,6 +121,9 @@ class SemanticLayerSchema(BaseModel): columns: Optional[List[Column]] = Field( None, description="Structure and metadata of your dataset’s columns" ) + relations: Optional[List[Relation]] = Field( + None, description="Relationships between columns and tables." + ) order_by: Optional[List[str]] = Field( None, description="Ordering criteria for the dataset." ) @@ -120,6 +140,61 @@ class SemanticLayerSchema(BaseModel): None, description="Frequency of dataset updates." ) + @model_validator(mode="after") + def check_columns_relations(self): + column_re_check = r"^[a-zA-Z_]+\.[a-zA-Z_]+$" + is_view_column_name = partial(re.match, column_re_check) + + # unpack columns info + _columns = self.columns + _column_names = [col.name for col in _columns or ()] + _tables_names_in_columns = { + column_name.split(".")[0] for column_name in _column_names or () + } + + if len(_column_names) != len(set(_column_names)): + raise ValueError("Column names must be unique. Duplicate names found.") + + if self.source.view: + # unpack relations info + _relations = self.relations + _column_names_in_relations = { + table + for relation in _relations or () + for table in (relation.from_, relation.to) + } + _tables_names_in_relations = { + column_name.split(".")[0] + for column_name in _column_names_in_relations or () + } + + if not all( + is_view_column_name(column_name) for column_name in _column_names + ): + raise ValueError( + "All columns in a view must be in the format '[table].[column]'." + ) + + if not all( + is_view_column_name(column_name) + for column_name in _column_names_in_relations + ): + raise ValueError( + "All params 'from' and 'to' in the relations must be in the format '[table].[column]'." + ) + + if ( + uncovered_tables := _tables_names_in_columns + - _tables_names_in_relations + ): + raise ValueError( + f"No relations provided for the following tables {uncovered_tables}." + ) + + elif any(is_view_column_name(column_name) for column_name in _column_names): + raise ValueError("All columns in a table must be in the format '[column]'.") + return self + def to_dict(self) -> dict[str, Any]: return self.model_dump(exclude_none=True) diff --git a/tests/unit_tests/dataframe/test_semantic_layer_schema.py b/tests/unit_tests/dataframe/test_semantic_layer_schema.py index 87d053245..0d7233c29 100644 --- a/tests/unit_tests/dataframe/test_semantic_layer_schema.py +++ b/tests/unit_tests/dataframe/test_semantic_layer_schema.py @@ -91,6 +91,29 @@ def mysql_schema(self): }, } + @pytest.fixture + def mysql_view_schema(self): + return { + "name": "Users", + "columns": [ + {"name": "parents.id"}, + {"name": "parents.name"}, + {"name": "children.name"}, + ], + "relations": [{"from": "parents.id", "to": "children.id"}], + "source": { + "type": "mysql", + "connection": { + "host": "localhost", + "port": 3306, + "database": "test_db", + "user": "test_user", + "password": "test_password", + }, + "view": "true", + }, + } + def test_valid_schema(self, sample_schema): schema = SemanticLayerSchema(**sample_schema) @@ -113,6 +136,14 @@ def test_valid_mysql_schema(self, mysql_schema): assert len(schema.transformations) == 2 assert schema.source.type == "mysql" + def test_valid_mysql_view_schema(self, mysql_view_schema): + schema = SemanticLayerSchema(**mysql_view_schema) + + assert schema.name == "Users" + assert len(schema.columns) == 3 + assert schema.source.view == True + assert schema.source.type == "mysql" + def test_missing_source_path(self, sample_schema): sample_schema["source"].pop("path") @@ -203,3 +234,51 @@ def test_is_schema_source_same_false(self, mysql_schema, sample_schema): schema2 = SemanticLayerSchema(**sample_schema) assert is_schema_source_same(schema1, schema2) is False + + def test_invalid_source_view_for_local_type(self, sample_schema): + sample_schema["source"]["view"] = True + + with pytest.raises(ValidationError): + SemanticLayerSchema(**sample_schema) + + def test_invalid_source_view_and_table(self, mysql_schema): + mysql_schema["source"]["view"] = True + + with pytest.raises(ValidationError): + SemanticLayerSchema(**mysql_schema) + + def test_invalid_source_missing_view_or_table(self, mysql_schema): + mysql_schema["source"].pop("table") + + with pytest.raises(ValidationError): + SemanticLayerSchema(**mysql_schema) + + def test_invalid_duplicated_columns(self, sample_schema): + sample_schema["columns"].append(sample_schema["columns"][0]) + + with pytest.raises(ValidationError): + SemanticLayerSchema(**sample_schema) + + def test_invalid_wrong_column_format_in_view(self, mysql_view_schema): + mysql_view_schema["columns"][0]["name"] = "parentsid" + + with pytest.raises(ValidationError): + SemanticLayerSchema(**mysql_view_schema) + + def test_invalid_wrong_column_format(self, sample_schema): + sample_schema["columns"][0]["name"] = "parents.id" + + with pytest.raises(ValidationError): + SemanticLayerSchema(**sample_schema) + + def test_invalid_wrong_relation_format_in_view(self, mysql_view_schema): + mysql_view_schema["relations"][0]["to"] = "parentsid" + + with pytest.raises(ValidationError): + SemanticLayerSchema(**mysql_view_schema) + + def test_invalid_uncovered_columns_in_view(self, mysql_view_schema): + mysql_view_schema.pop("relations") + + with pytest.raises(ValidationError): + SemanticLayerSchema(**mysql_view_schema) From 57b890038d6291998c8ba72b63db70de8c30a613 Mon Sep 17 00:00:00 2001 From: Raoul Scalise <36519284+scaliseraoul@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:57:17 +0100 Subject: [PATCH 3/4] Update pandasai/data_loader/semantic_layer_schema.py Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> --- pandasai/data_loader/semantic_layer_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandasai/data_loader/semantic_layer_schema.py b/pandasai/data_loader/semantic_layer_schema.py index b9e8e4373..65baf81ed 100644 --- a/pandasai/data_loader/semantic_layer_schema.py +++ b/pandasai/data_loader/semantic_layer_schema.py @@ -83,7 +83,7 @@ def validate_type_and_fields(cls, values): f"For local source type '{_type}', 'path' must be defined." ) if view: - raise ValueError("For local source type you can't use a view.") + raise ValueError("A view cannot be used with a local source type.") elif _type in REMOTE_SOURCE_TYPES: if not connection: raise ValueError( From e6ce56de80184a24f7a537a5a05facd4a2e81a7d Mon Sep 17 00:00:00 2001 From: "ellipsis-dev[bot]" <65095814+ellipsis-dev[bot]@users.noreply.github.com> Date: Mon, 20 Jan 2025 14:59:18 +0000 Subject: [PATCH 4/4] address comments left by @scaliseraoul on #1532 (feature(View): enabling view in SemanticLayerSchema); --- pandasai/data_loader/semantic_layer_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandasai/data_loader/semantic_layer_schema.py b/pandasai/data_loader/semantic_layer_schema.py index 65baf81ed..263f75910 100644 --- a/pandasai/data_loader/semantic_layer_schema.py +++ b/pandasai/data_loader/semantic_layer_schema.py @@ -142,7 +142,7 @@ class SemanticLayerSchema(BaseModel): @model_validator(mode="after") def check_columns_relations(self): - column_re_check = r"^[a-zA-Z_]+\.[a-zA-Z_]+$" + column_re_check = r"^[a-zA-Z0-9_]+\.[a-zA-Z0-9_]+$" is_view_column_name = partial(re.match, column_re_check) # unpack columns info