meltano · edgarrmondragon · Oct 6, 2023 · Oct 4, 2023 · Oct 4, 2023 · Oct 4, 2023
@@ -185,6 +185,101 @@ def flatten_schema(
         }
       }
     }
+
+    >>> nullable_leaves_schema = {
+    ...     "type": "object",
+    ...     "properties": {
+    ...         "id": {
+    ...             "type": "string"
+    ...         },
+    ...         "foo": {
+    ...             "type": ["object", "null"],
+    ...             "properties": {
+    ...                 "bar": {
+    ...                     "type": ["object", "null"],
+    ...                     "properties": {
+    ...                         "baz": {
+    ...                             "type": ["object", "null"],
+    ...                             "properties": {
+    ...                                 "qux": {
+    ...                                     "type": "string"
+    ...                                 }
+    ...                             }
+    ...                         }
+    ...                     }
+    ...                 }
+    ...             }
+    ...         }
+    ...     }
+    ... }
+    >>> print(json.dumps(flatten_schema(nullable_leaves_schema, 0), indent=2))
+    {
+      "type": "object",
+      "properties": {
+        "id": {
+          "type": "string"
+        },
+        "foo": {
+          "type": [
+            "object",
+            "null"
+          ],
+          "properties": {
+            "bar": {
+              "type": [
+                "object",
+                "null"
+              ],
+              "properties": {
+                "baz": {
+                  "type": [
+                    "object",
+                    "null"
+                  ],
+                  "properties": {
+                    "qux": {
+                      "type": "string"
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+    >>> print(json.dumps(flatten_schema(nullable_leaves_schema, 1), indent=2))
+    {
+      "type": "object",
+      "properties": {
+        "id": {
+          "type": "string"
+        },
+        "foo__bar": {
+          "type": [
+            "string",
+            "null"
+          ]
+        }
+      }
+    }
+
+    >>> print(json.dumps(flatten_schema(nullable_leaves_schema, 2), indent=2))
+    {
+      "type": "object",
+      "properties": {
+        "id": {
+          "type": "string"
+        },
+        "foo__bar__baz": {
+          "type": [
+            "string",
+            "null"
+          ]
+        }
+      }
+    }
     """
     new_schema = deepcopy(schema)
     new_schema["properties"] = _flatten_schema(
@@ -221,42 +316,55 @@ def _flatten_schema(  # noqa: C901, PLR0912
     if "properties" not in schema_node:
         return {}
 
-    for k, v in schema_node["properties"].items():
-        new_key = flatten_key(k, parent_keys, separator)
-        if "type" in v:
-            if "object" in v["type"] and "properties" in v and level < max_level:
+    for field_name, field_schema in schema_node["properties"].items():
+        new_key = flatten_key(field_name, parent_keys, separator)
+        if "type" in field_schema:
+            if (
+                "object" in field_schema["type"]
+                and "properties" in field_schema
+                and level < max_level
+            ):
                 items.extend(
                     _flatten_schema(
-                        v,
-                        [*parent_keys, k],
+                        field_schema,
+                        [*parent_keys, field_name],
                         separator=separator,
                         level=level + 1,
                         max_level=max_level,
                     ).items(),
                 )
-            elif "array" in v["type"] or "object" in v["type"] and max_level > 0:
-                items.append((new_key, {"type": "string"}))
+            elif (
+                "array" in field_schema["type"]
+                or "object" in field_schema["type"]
+                and max_level > 0
+            ):
+                types = (
+                    ["string", "null"] if "null" in field_schema["type"] else "string"
+                )
+                items.append((new_key, {"type": types}))
             else:
-                items.append((new_key, v))
-        elif len(v.values()) > 0:
-            if next(iter(v.values()))[0]["type"] == "string":
-                next(iter(v.values()))[0]["type"] = ["null", "string"]
-                items.append((new_key, next(iter(v.values()))[0]))
-            elif next(iter(v.values()))[0]["type"] == "array":
-                next(iter(v.values()))[0]["type"] = ["null", "array"]
-                items.append((new_key, next(iter(v.values()))[0]))
-            elif next(iter(v.values()))[0]["type"] == "object":
-                next(iter(v.values()))[0]["type"] = ["null", "object"]
-                items.append((new_key, next(iter(v.values()))[0]))
+                items.append((new_key, field_schema))
+        # TODO: Figure out what this really does, try breaking it.
+        # If it's not needed, remove it.
+        elif len(field_schema.values()) > 0:
+            if next(iter(field_schema.values()))[0]["type"] == "string":
+                next(iter(field_schema.values()))[0]["type"] = ["null", "string"]
+                items.append((new_key, next(iter(field_schema.values()))[0]))
+            elif next(iter(field_schema.values()))[0]["type"] == "array":
+                next(iter(field_schema.values()))[0]["type"] = ["null", "array"]
+                items.append((new_key, next(iter(field_schema.values()))[0]))
+            elif next(iter(field_schema.values()))[0]["type"] == "object":
+                next(iter(field_schema.values()))[0]["type"] = ["null", "object"]
+                items.append((new_key, next(iter(field_schema.values()))[0]))
 
     # Sort and check for duplicates
     def _key_func(item):
         # Each entry in `items` is a (key name, schema) tuple; sorting and
         # grouping below are done on the key name only.
         return item[0]  # the first element of the tuple is the key name.
 
     sorted_items = sorted(items, key=_key_func)
-    for k, g in itertools.groupby(sorted_items, key=_key_func):
+    for field_name, g in itertools.groupby(sorted_items, key=_key_func):
         if len(list(g)) > 1:
-            msg = f"Duplicate column name produced in schema: {k}"
+            msg = f"Duplicate column name produced in schema: {field_name}"
             raise ValueError(msg)
 
     # Return the (unsorted) result as a dict.

@@ -1,5 +1,5 @@
 {"type": "STATE", "value": {}}
-{"type": "SCHEMA", "stream": "mystream", "schema": {"properties": {"email": {"type": ["string", "null"]}, "count": {"type": ["integer", "null"]}, "user__id": {"type": ["integer", "null"]}, "user__sub__num": {"type": ["integer", "null"]}, "user__some_numbers": {"type": "string"}}, "type": "object"}, "key_properties": []}
+{"type": "SCHEMA", "stream": "mystream", "schema": {"properties": {"email": {"type": ["string", "null"]}, "count": {"type": ["integer", "null"]}, "user__id": {"type": ["integer", "null"]}, "user__sub__num": {"type": ["integer", "null"]}, "user__some_numbers": {"type": ["string", "null"]}}, "type": "object"}, "key_properties": []}
 {"type": "RECORD", "stream": "mystream", "record": {"email": "[email protected]", "count": 21, "user__id": 1, "user__sub__num": 1, "user__some_numbers": "[3.14, 2.718]"}, "time_extracted": "2022-01-01T00:00:00+00:00"}
 {"type": "RECORD", "stream": "mystream", "record": {"email": "[email protected]", "count": 13, "user__id": 2, "user__sub__num": 2, "user__some_numbers": "[10.32, 1.618]"}, "time_extracted": "2022-01-01T00:00:00+00:00"}
 {"type": "RECORD", "stream": "mystream", "record": {"email": "[email protected]", "count": 19, "user__id": 3, "user__sub__num": 3, "user__some_numbers": "[1.414, 1.732]"}, "time_extracted": "2022-01-01T00:00:00+00:00"}

@@ -1,5 +1,5 @@
 {"type": "STATE", "value": {}}
-{"type": "SCHEMA", "stream": "mystream", "schema": {"properties": {"email": {"type": ["string", "null"]}, "count": {"type": ["integer", "null"]}, "user__id": {"type": ["integer", "null"]}, "user__sub": {"type": "string"}, "user__some_numbers": {"type": "string"}}, "type": "object"}, "key_properties": []}
+{"type": "SCHEMA", "stream": "mystream", "schema": {"properties": {"email": {"type": ["string", "null"]}, "count": {"type": ["integer", "null"]}, "user__id": {"type": ["integer", "null"]}, "user__sub": {"type": ["string", "null"]}, "user__some_numbers": {"type": ["string", "null"]}}, "type": "object"}, "key_properties": []}
 {"type": "RECORD", "stream": "mystream", "record": {"email": "[email protected]", "count": 21, "user__id": 1, "user__sub": "{\"num\": 1}", "user__some_numbers": "[3.14, 2.718]"}, "time_extracted": "2022-01-01T00:00:00+00:00"}
 {"type": "RECORD", "stream": "mystream", "record": {"email": "[email protected]", "count": 13, "user__id": 2, "user__sub": "{\"num\": 2}", "user__some_numbers": "[10.32, 1.618]"}, "time_extracted": "2022-01-01T00:00:00+00:00"}
 {"type": "RECORD", "stream": "mystream", "record": {"email": "[email protected]", "count": 19, "user__id": 3, "user__sub": "{\"num\": 3}", "user__some_numbers": "[1.414, 1.732]"}, "time_extracted": "2022-01-01T00:00:00+00:00"}

@@ -1,5 +1,5 @@
 {"type": "STATE", "value": {}}
-{"type": "SCHEMA", "stream": "mystream", "schema": {"properties": {"email": {"type": ["string", "null"]}, "count": {"type": ["integer", "null"]}, "user__id": {"type": ["integer", "null"]}, "user__sub__num": {"type": ["integer", "null"]}, "user__some_numbers": {"type": "string"}, "email_hash": {"type": ["string", "null"]}}, "type": "object"}, "key_properties": ["email_hash"]}
+{"type": "SCHEMA", "stream": "mystream", "schema": {"properties": {"email": {"type": ["string", "null"]}, "count": {"type": ["integer", "null"]}, "user__id": {"type": ["integer", "null"]}, "user__sub__num": {"type": ["integer", "null"]}, "user__some_numbers": {"type": ["string", "null"]}, "email_hash": {"type": ["string", "null"]}}, "type": "object"}, "key_properties": ["email_hash"]}
 {"type": "RECORD", "stream": "mystream", "record": {"email": "[email protected]", "count": 21, "user__id": 1, "user__sub__num": 1, "user__some_numbers": "[3.14, 2.718]", "email_hash": "c160f8cc69a4f0bf2b0362752353d060"}, "time_extracted": "2022-01-01T00:00:00+00:00"}
 {"type": "RECORD", "stream": "mystream", "record": {"email": "[email protected]", "count": 13, "user__id": 2, "user__sub__num": 2, "user__some_numbers": "[10.32, 1.618]", "email_hash": "4b9bb80620f03eb3719e0a061c14283d"}, "time_extracted": "2022-01-01T00:00:00+00:00"}
 {"type": "RECORD", "stream": "mystream", "record": {"email": "[email protected]", "count": 19, "user__id": 3, "user__sub__num": 3, "user__some_numbers": "[1.414, 1.732]", "email_hash": "426b189df1e2f359efe6ee90f2d2030f"}, "time_extracted": "2022-01-01T00:00:00+00:00"}