diff --git a/docs/source/_static/config_schema.html b/docs/source/_static/config_schema.html index d81eb9c..e891b63 100644 --- a/docs/source/_static/config_schema.html +++ b/docs/source/_static/config_schema.html @@ -1 +1 @@ - SQLSynthGen Config

SQLSynthGen Config

Type: object

A SQLSynthGen configuration YAML file

No Additional Properties

Type: boolean

Type: string

Type: string

Type: array

Each item of this array must be:

Type: object
No Additional Properties

Type: string

Type: string

Type: number

Type: number

Type: object

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^(?!censor_dims).*$
Type: object

Type: array of object

Each item of this array must be:

Type: object
No Additional Properties

Type: integer

Type: object

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: .*
Type: object
No Additional Properties

Type: boolean

Type: integer

Type: array of object

Each item of this array must be:

Type: object

Type: array of string or string

Each item of this array must be:

\ No newline at end of file + SQLSynthGen Config

SQLSynthGen Config

Type: object

A SQLSynthGen configuration YAML file

No Additional Properties

Type: boolean

Run source-statistics queries using asyncpg.

Type: string

The name of a local Python module of row generators (excluding .py).

Type: string

The name of a local Python module of story generators (excluding .py).

Type: array

An array of source statistics queries.

Each item of this array must be:

Type: object
No Additional Properties

Type: string

A name for the query, which will be used in the stats file.

Type: string

A SQL query.

Type: string

A SmartNoise SQL query.

Type: number

The differential privacy epsilon value for the DP query.

Type: number

The differential privacy delta value for the DP query.

Type: object

See https://docs.smartnoise.org/sql/metadata.html#yaml-format.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^(?!(max_ids|row_privacy|sample_max_ids|censor_dims|clamp_counts|clamp_columns|use_dpsu)).*$
Type: object
No Additional Properties

Type: array of object

An array of story generators.

Each item of this array must be:

Type: object
No Additional Properties

Type: string

The full name of a story generator (e.g. my_story_generators.short_story).

Type: array

Positional arguments to pass to the story generator.

Type: object

Keyword arguments to pass to the story generator.

Type: integer

The number of times to call the story generator per pass.

Type: integer

The maximum number of tries to respect a uniqueness constraint.

Type: object

Table configurations.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: .*
Type: object

A table configuration.

No Additional Properties

Type: boolean

Whether to export the table data.

Type: integer

The number of rows to generate per pass.

Type: array of object

An array of row generators to create column values.

Each item of this array must be:

Type: object

Type: string

The name of a (built-in or custom) function (e.g. max or my_row_generators.my_gen).

Type: array

Positional arguments to pass to the function.

Type: object

Keyword arguments to pass to the function.

Type: array of string or string

One or more columns to assign the return value to.

Each item of this array must be:

\ No newline at end of file diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst index 848d514..0798fd3 100644 --- a/docs/source/configuration.rst +++ b/docs/source/configuration.rst @@ -1,5 +1,11 @@ Configuration ------------- +SqlSynthGen is configured using a YAML file, which is passed to several commands with the ``--config`` option. +Throughout the docs, we will refer to this file as ``config.yaml`` but it can be called anything (the exception being that there will be a naming conflict if you have a vocabulary table called ``config``). + +Below, we see the schema for the configuration file. +Note that our config file format includes a section of SmartNoise SQL metadata, which is explained more fully `here <https://docs.smartnoise.org/sql/metadata.html#yaml-format>`_. + .. raw:: html :file: _static/config_schema.html diff --git a/sqlsynthgen/json_schemas/config_schema.json b/sqlsynthgen/json_schemas/config_schema.json index 7972577..f3e3c65 100644 --- a/sqlsynthgen/json_schemas/config_schema.json +++ b/sqlsynthgen/json_schemas/config_schema.json @@ -6,56 +6,94 @@ "additionalProperties": false, "properties": { "use-asyncio": { + "description": "Run source-statistics queries using asyncpg.", "type": "boolean" }, "row_generators_module": { + "description": "The name of a local Python module of row generators (excluding .py).", "type": "string" }, "story_generators_module": { + "description": "The name of a local Python module of story generators (excluding .py).", "type": "string" }, - "src-stats": { + "description": "An array of source statistics queries.", "type": "array", "items": { "additionalProperties": false, "required": ["name", "query"], "properties": { "name": { + "description": "A name for the query, which will be used in the stats file.", "type": "string" }, "query": { + "description": "A SQL query.", "type": "string" }, "dp-query": { + "description": "A SmartNoise SQL query.", "type": "string" }, "epsilon": { + "description": "The differential privacy epsilon value for the DP query.", 
"type": "number" }, "delta": { + "description": "The differential privacy delta value for the DP query.", "type": "number" }, - "snsql-metadata": { + "description": "See https://docs.smartnoise.org/sql/metadata.html#yaml-format.", "type": "object", "properties": { + "max_ids": { + "type": "integer" + }, + "row_privacy": { + "type": "boolean" + }, + "sample_max_ids": { + "type": "boolean" + }, "censor_dims": { "type": "boolean" + }, + "clamp_counts": { + "type": "boolean" + }, + "clamp_columns": { + "type": "boolean" + }, + "use_dpsu": { + "type": "boolean" } }, "patternProperties": { - "^(?!censor_dims).*$": { + "^(?!(max_ids|row_privacy|sample_max_ids|censor_dims|clamp_counts|clamp_columns|use_dpsu)).*$": { "type": "object", + "additionalProperties": false, + "required": ["type"], "properties": { - "name": { - "type": "string" - }, "type": { "type": "string" }, "private_id": { "type": "boolean" + }, + "lower": { + "type": "number" + }, + "upper": { + "type": "number" + }, + "nullable": { + "type": "boolean" + }, + "missing_value": {}, + "sensitivity": { + "type": "number" } } } @@ -66,6 +104,7 @@ }, "story_generators": { + "description": "An array of story generators.", "type": "array", "items": { "type": "object", @@ -73,15 +112,19 @@ "required": ["name", "num_stories_per_pass"], "properties": { "name": { + "description": "The full name of a story generator (e.g. 
my_story_generators.short_story).", "type": "string" }, "args": { + "description": "Positional arguments to pass to the story generator.", "type": "array" }, "kwargs": { + "description": "Keyword arguments to pass to the story generator.", "type": "object" }, "num_stories_per_pass": { + "description": "The number of times to call the story generator per pass.", "type": "integer" } } @@ -89,37 +132,48 @@ }, "max-unique-constraint-tries": { + "description": "The maximum number of tries to respect a uniqueness constraint.", "type": "integer" }, "tables": { + "description": "Table configurations.", "type": "object", "patternProperties": { ".*": { + "description": "A table configuration.", "additionalProperties": false, "type": "object", "properties": { "vocabulary_table": { + "description": "Whether to export the table data.", "type": "boolean" }, "num_rows_per_pass": { + "description": "The number of rows to generate per pass.", "type": "integer" }, "row_generators": { + "description": "An array of row generators to create column values.", "type": "array", "items": { "type": "object", + "required": ["name", "columns_assigned"], "properties": { "name": { + "description": "The name of a (built-in or custom) function (e.g. max or my_row_generators.my_gen).", "type": "string" }, "args": { + "description": "Positional arguments to pass to the function.", "type": "array" }, "kwargs": { + "description": "Keyword arguments to pass to the function.", "type": "object" }, "columns_assigned": { + "description": "One or more columns to assign the return value to.", "type": ["array", "string"], "items": { "type": "string" diff --git a/tests/examples/example_config.yaml b/tests/examples/example_config.yaml index fe3b458..8bb7fdd 100644 --- a/tests/examples/example_config.yaml +++ b/tests/examples/example_config.yaml @@ -25,13 +25,22 @@ src-stats: snsql-metadata: # You may well want censor_dims to be on, but we turn it off for the # tests to silence a smartnoise-sql nag warning. 
+ max_ids: 10 + row_privacy: False + sample_max_ids: True censor_dims: False + clamp_counts: False + clamp_columns: False + use_dpsu: False person_id: - name: person_id type: int private_id: True + lower: 10 + upper: 100 + nullable: True + sensitivity: 1 + # missing_value: breaks things research_opt_out: - name: research_opt_out type: boolean private_id: False