🐛 Source S3: fixed bug in spec so that Format field displays in UI correctly #5135

Merged 2 commits on Aug 2, 2021
@@ -2,6 +2,6 @@
   "sourceDefinitionId": "69589781-7828-43c5-9f63-8925b1c1ccc2",
   "name": "S3",
   "dockerRepository": "airbyte/source-s3",
-  "dockerImageTag": "0.1.1",
+  "dockerImageTag": "0.1.2",
   "documentationUrl": "https://hub.docker.com/r/airbyte/source-s3"
 }
@@ -78,7 +78,7 @@
 - sourceDefinitionId: 69589781-7828-43c5-9f63-8925b1c1ccc2
   name: S3
   dockerRepository: airbyte/source-s3
-  dockerImageTag: 0.1.1
+  dockerImageTag: 0.1.2
   documentationUrl: https://hub.docker.com/r/airbyte/source-s3
 - sourceDefinitionId: fbb5fbe2-16ad-4cf4-af7d-ff9d9c316c87
   name: Sendgrid
2 changes: 1 addition & 1 deletion airbyte-integrations/connectors/source-s3/Dockerfile
@@ -12,5 +12,5 @@ RUN pip install .
 ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
 ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]

-LABEL io.airbyte.version=0.1.1
+LABEL io.airbyte.version=0.1.2
 LABEL io.airbyte.name=airbyte/source-s3
281 changes: 150 additions & 131 deletions airbyte-integrations/connectors/source-s3/integration_tests/spec.json
@@ -1,143 +1,162 @@ Resulting file shown below (the functional change is the added "type": "object" on the "format" property; the remaining additions and deletions are re-indentation):
{
  "documentationUrl": "https://docs.airbyte.io/integrations/sources/s3",
  "changelogUrl": "https://docs.airbyte.io/integrations/sources/s3",
  "connectionSpecification": {
    "title": "S3 Source Spec",
    "type": "object",
    "properties": {
      "dataset": {
        "title": "Dataset",
        "description": "This source creates one table per connection, this field is the name of that table. This should include only letters, numbers, dash and underscores. Note that this may be altered according to destination.",
        "pattern": "^([A-Za-z0-9-_]+)$",
        "type": "string"
      },
      "path_pattern": {
        "title": "Path Pattern",
        "description": "Add at least 1 pattern here to match filepaths against. Use | to separate multiple patterns. Airbyte uses these patterns to determine which files to pick up from the provider storage. See <a href=\"https://facelessuser.github.io/wcmatch/glob/\" target=\"_blank\">wcmatch.glob</a> to understand pattern syntax (GLOBSTAR and SPLIT flags are enabled). Use pattern <strong>**</strong> to pick up all files.",
        "examples": [
          "**",
          "myFolder/myTableFiles/*.csv|myFolder/myOtherTableFiles/*.csv"
        ],
        "type": "string"
      },
      "schema": {
        "title": "Schema",
        "description": "Optionally provide a schema to enforce, as a valid JSON string. Ensure this is a mapping of <strong>{ \"column\" : \"type\" }</strong>, where types are valid <a href=\"https://json-schema.org/understanding-json-schema/reference/type.html\" target=\"_blank\">JSON Schema datatypes</a>. Leave as {} to auto-infer the schema.",
        "default": "{}",
        "examples": [
          "{\"column_1\": \"number\", \"column_2\": \"string\", \"column_3\": \"array\", \"column_4\": \"object\", \"column_5\": \"boolean\"}"
        ],
        "type": "string"
      },
      "format": {
        "title": "Format",
        "default": "csv",
        "oneOf": [
          {
            "title": "csv",
            "type": "object",
            "properties": {
              "filetype": {
                "title": "CsvFiletype",
                "description": "This connector utilises <a href=\"https://arrow.apache.org/docs/python/generated/pyarrow.csv.open_csv.html\" target=\"_blank\">PyArrow (Apache Arrow)</a> for CSV parsing.",
                "enum": [
                  "csv"
                ],
                "type": "string"
              },
              "delimiter": {
                "title": "Delimiter",
                "description": "The character delimiting individual cells in the CSV data. This may only be a 1-character string.",
                "default": ",",
                "minLength": 1,
                "type": "string"
              },
              "quote_char": {
                "title": "Quote Char",
                "description": "The character used optionally for quoting CSV values. To disallow quoting, make this field blank.",
                "default": "\"",
                "type": "string"
              },
              "escape_char": {
                "title": "Escape Char",
                "description": "The character used optionally for escaping special characters. To disallow escaping, leave this field blank.",
                "type": "string"
              },
              "encoding": {
                "title": "Encoding",
                "description": "The character encoding of the CSV data. Leave blank to default to <strong>UTF-8</strong>. See <a href=\"https://docs.python.org/3/library/codecs.html#standard-encodings\" target=\"_blank\">list of python encodings</a> for allowable options.",
                "type": "string"
              },
              "double_quote": {
                "title": "Double Quote",
                "description": "Whether two quotes in a quoted CSV value denote a single quote in the data.",
                "default": true,
                "type": "boolean"
              },
              "newlines_in_values": {
                "title": "Newlines In Values",
                "description": "Whether newline characters are allowed in CSV values. Turning this on may affect performance. Leave blank to default to False.",
                "default": false,
                "type": "boolean"
              },
              "additional_reader_options": {
                "title": "Additional Reader Options",
                "description": "Optionally add a valid JSON string here to provide additional options to the csv reader. Mappings must correspond to options <a href=\"https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html#pyarrow.csv.ConvertOptions\" target=\"_blank\">detailed here</a>. 'column_types' is used internally to handle schema so overriding that would likely cause problems.",
                "default": "{}",
                "examples": [
                  "{\"timestamp_parsers\": [\"%m/%d/%Y %H:%M\", \"%Y/%m/%d %H:%M\"], \"strings_can_be_null\": true, \"null_values\": [\"NA\", \"NULL\"]}"
                ],
                "type": "string"
              }
            },
            "required": [
              "filetype"
            ]
          },
          {
            "title": "Coming Soon...",
            "type": "object",
            "properties": {
              "filetype": {
                "title": "ParquetFiletype",
                "description": "An enumeration.",
                "enum": [
                  "parquet"
                ],
                "type": "string"
              }
            },
            "required": [
              "filetype"
            ]
          }
        ],
        "type": "object"
      },
      "provider": {
        "title": "S3: Amazon Web Services",
        "type": "object",
        "properties": {
          "bucket": {
            "title": "Bucket",
            "description": "Name of the S3 bucket where the file(s) exist.",
            "type": "string"
          },
          "aws_access_key_id": {
            "title": "Aws Access Key Id",
            "description": "In order to access private Buckets stored on AWS S3, this connector requires credentials with the proper permissions. If accessing publicly available data, this field is not necessary.",
            "airbyte_secret": true,
            "type": "string"
          },
          "aws_secret_access_key": {
            "title": "Aws Secret Access Key",
            "description": "In order to access private Buckets stored on AWS S3, this connector requires credentials with the proper permissions. If accessing publicly available data, this field is not necessary.",
            "airbyte_secret": true,
            "type": "string"
          },
          "path_prefix": {
            "title": "Path Prefix",
            "description": "By providing a path-like prefix (e.g. myFolder/thisTable/) under which all the relevant files sit, we can optimise finding these in S3. This is optional but recommended if your bucket contains many folders/files.",
            "default": "",
            "type": "string"
          }
        },
        "required": [
          "bucket"
        ]
      }
    },
    "required": [
      "dataset",
      "path_pattern",
      "provider"
    ]
  },
  "supportsIncremental": true,
  "supported_destination_sync_modes": [
    "overwrite",
    "append",
    "append_dedup"
  ]
}
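As a hedged illustration of the spec above (not part of this PR): the Path Pattern field explicitly references wcmatch.glob with the GLOBSTAR and SPLIT flags, and with the fixed "format" property a csv config should validate against the connectionSpecification. The sketch below exercises both, assuming the wcmatch and jsonschema packages are installed; the sample config values are hypothetical.

import json

import jsonschema
from wcmatch import glob

# path_pattern semantics: GLOBSTAR enables "**" across directories, SPLIT
# allows "|" to separate multiple patterns, as the field description states.
pattern = "myFolder/myTableFiles/*.csv|myFolder/myOtherTableFiles/*.csv"
flags = glob.GLOBSTAR | glob.SPLIT
print(glob.globmatch("myFolder/myTableFiles/users.csv", pattern, flags=flags))  # True
print(glob.globmatch("otherFolder/users.csv", pattern, flags=flags))            # False
print(glob.globmatch("deep/nested/path/users.csv", "**", flags=flags))          # True

# With the PR's fix, "format" is a typed object carrying a oneOf of file-type
# options, so a csv config like this hypothetical one validates cleanly.
with open("airbyte-integrations/connectors/source-s3/integration_tests/spec.json") as f:
    spec = json.load(f)

config = {
    "dataset": "my_table",
    "path_pattern": "**",
    "format": {"filetype": "csv"},
    "provider": {"bucket": "my-test-bucket"},
}
jsonschema.validate(instance=config, schema=spec["connectionSpecification"])  # raises on mismatch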
@@ -138,6 +138,7 @@ class SourceFilesAbstractSpec(BaseModel):
     @staticmethod
     def change_format_to_oneOf(schema: dict) -> dict:
         schema["properties"]["format"]["oneOf"] = deepcopy(schema["properties"]["format"]["anyOf"])
+        schema["properties"]["format"]["type"] = "object"
         del schema["properties"]["format"]["anyOf"]
         return schema
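For context on why this one-line fix works: a Pydantic Union field is emitted as a JSON Schema anyOf with no accompanying type, and the UI needs a oneOf plus an explicit object type to render the file-format picker. A minimal sketch of the same rewrite, assuming Pydantic v1 behavior and using hypothetical CsvFormat/ParquetFormat models rather than the connector's real ones:

from copy import deepcopy
from typing import Union

from pydantic import BaseModel, Field


class CsvFormat(BaseModel):
    filetype: str = Field("csv", const=True)


class ParquetFormat(BaseModel):
    filetype: str = Field("parquet", const=True)


class Spec(BaseModel):
    format: Union[CsvFormat, ParquetFormat]


schema = Spec.schema()
fmt = schema["properties"]["format"]
assert "anyOf" in fmt and "type" not in fmt  # what Pydantic emits: anyOf, no "type"

# The same rewrite as change_format_to_oneOf above: switch to oneOf and pin
# the missing "type" so the UI displays the Format field correctly.
fmt["oneOf"] = deepcopy(fmt["anyOf"])
fmt["type"] = "object"  # the one-line fix in this PR
del fmt["anyOf"]

assert fmt["type"] == "object" and "anyOf" not in fmt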

1 change: 1 addition & 0 deletions docs/integrations/sources/s3.md
@@ -181,5 +181,6 @@ You can find details on [available options here](https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html#pyarrow.csv.ConvertOptions)

 | Version | Date | Pull Request | Subject |
 | :------ | :-------- | :----- | :------ |
+| 0.1.2 | 2021-08-02 | [5135](https://github.com/airbytehq/airbyte/pull/5135) | Fixed bug in spec so it displays in UI correctly |
 | 0.1.1 | 2021-07-30 | [4990](https://github.com/airbytehq/airbyte/pull/4990/commits/ff5f70662c5f84eabc03526cddfcc9d73c58c0f4) | Fixed documentation url in source definition |
 | 0.1.0 | 2021-07-30 | [4990](https://github.com/airbytehq/airbyte/pull/4990) | Created S3 source connector |