Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SNS metadata spec and update some of ours. #136

Merged
merged 6 commits into from
Aug 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/_static/config_schema.html

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions docs/source/configuration.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
Configuration
-------------

SqlSynthGen is configured using a YAML file, which is passed to several commands with the ``--config`` option.
Throughout the docs, we will refer to this file as ``config.yaml``, but it can be called anything. The one exception is that there will be a naming conflict if you have a vocabulary table called ``config``.

Below, we see the schema for the configuration file.
Note that our config file format includes a section of SmartNoise SQL metadata, which is explained more fully `here <https://docs.smartnoise.org/sql/metadata.html#yaml-format>`_.

.. raw:: html
   :file: _static/config_schema.html
66 changes: 60 additions & 6 deletions sqlsynthgen/json_schemas/config_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,56 +6,94 @@
"additionalProperties": false,
"properties": {
"use-asyncio": {
"description": "Run source-statistics queries using asyncpg.",
"type": "boolean"
},
"row_generators_module": {
"description": "The name of a local Python module of row generators (excluding .py).",
"type": "string"
},
"story_generators_module": {
"description": "The name of a local Python module of story generators (excluding .py).",
"type": "string"
},

"src-stats": {
"description": "An array of source statistics queries.",
"type": "array",
"items": {
"additionalProperties": false,
"required": ["name", "query"],
"properties": {
"name": {
"description": "A name for the query, which will be used in the stats file.",
"type": "string"
},
"query": {
"description": "A SQL query.",
"type": "string"
},
"dp-query": {
"description": "A SmartNoise SQL query.",
"type": "string"
},
"epsilon": {
"description": "The differential privacy epsilon value for the DP query.",
"type": "number"
},
"delta": {
"description": "The differential privacy delta value for the DP query.",
"type": "number"
},

"snsql-metadata": {
"description": "See https://docs.smartnoise.org/sql/metadata.html#yaml-format.",
"type": "object",
"properties": {
"max_ids": {
"type": "integer"
},
"row_privacy": {
"type": "boolean"
},
"sample_max_ids": {
"type": "boolean"
},
"censor_dims": {
"type": "boolean"
},
"clamp_counts": {
"type": "boolean"
},
"clamp_columns": {
"type": "boolean"
},
"use_dpsu": {
"type": "boolean"
}
},
"patternProperties": {
"^(?!censor_dims).*$": {
"^(?!(max_ids|row_privacy|sample_max_ids|censor_dims|clamp_counts|clamp_columns|use_dpsu)$).*$": {
"type": "object",
"additionalProperties": false,
"required": ["type"],
"properties": {
"name": {
"type": "string"
},
"type": {
"type": "string"
},
"private_id": {
"type": "boolean"
},
"lower": {
"type": "number"
},
"upper": {
"type": "number"
},
"nullable": {
"type": "boolean"
},
"missing_value": {},
"sensitivity": {
"type": "number"
}
}
}
Expand All @@ -66,60 +104,76 @@
},

"story_generators": {
"description": "An array of story generators.",
"type": "array",
"items": {
"type": "object",
"additionalProperties": false,
"required": ["name", "num_stories_per_pass"],
"properties": {
"name": {
"description": "The full name of a story generator (e.g. my_story_generators.short_story).",
"type": "string"
},
"args": {
"description": "Positional arguments to pass to the story generator.",
"type": "array"
},
"kwargs": {
"description": "Keyword arguments to pass to the story generator.",
"type": "object"
},
"num_stories_per_pass": {
"description": "The number of times to call the story generator per pass.",
"type": "integer"
}
}
}
},

"max-unique-constraint-tries": {
"description": "The maximum number of tries to respect a uniqueness constraint.",
"type": "integer"
},

"tables": {
"description": "Table configurations.",
"type": "object",
"patternProperties": {
".*": {
"description": "A table configuration.",
"additionalProperties": false,
"type": "object",
"properties": {
"vocabulary_table": {
"description": "Whether to export the table data.",
"type": "boolean"
},
"num_rows_per_pass": {
"description": "The number of rows to generate per pass.",
"type": "integer"
},
"row_generators": {
"description": "An array of row generators to create column values.",
"type": "array",
"items": {
"type": "object",
"required": ["name", "columns_assigned"],
"properties": {
"name": {
"description": "The name of a (built-in or custom) function (e.g. max or my_row_generators.my_gen).",
"type": "string"
},
"args": {
"description": "Positional arguments to pass to the function.",
"type": "array"
},
"kwargs": {
"description": "Keyword arguments to pass to the function.",
"type": "object"
},
"columns_assigned": {
"description": "One or more columns to assign the return value to.",
"type": ["array", "string"],
"items": {
"type": "string"
Expand Down
13 changes: 11 additions & 2 deletions tests/examples/example_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,22 @@ src-stats:
snsql-metadata:
  # You may well want censor_dims to be on, but we turn it off for the
  # tests to silence a smartnoise-sql nag warning.
  max_ids: 10
  row_privacy: False
  sample_max_ids: True
  censor_dims: False
  clamp_counts: False
  clamp_columns: False
  use_dpsu: False
  person_id:
    name: person_id
    type: int
    private_id: True
    lower: 10
    upper: 100
    nullable: True
    sensitivity: 1
    # missing_value: breaks things
  research_opt_out:
    name: research_opt_out
    type: boolean
    private_id: False

Expand Down