-
Notifications
You must be signed in to change notification settings - Fork 106
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fixed bug where nested json inside pandas wouldn't be ingested correctly #568
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,22 +1,20 @@ | ||
import gzip | ||
import ipaddress | ||
import os | ||
import tempfile | ||
import time | ||
import uuid | ||
from abc import ABCMeta, abstractmethod | ||
from copy import copy | ||
from enum import Enum | ||
from io import TextIOWrapper | ||
from typing import TYPE_CHECKING, Union, IO, AnyStr, Optional, Tuple | ||
from urllib.parse import urlparse | ||
|
||
from azure.kusto.data.data_format import DataFormat | ||
from azure.kusto.data.exceptions import KustoClosedError | ||
|
||
from .descriptors import FileDescriptor, StreamDescriptor | ||
from .ingestion_properties import IngestionProperties | ||
|
||
|
||
if TYPE_CHECKING: | ||
import pandas | ||
|
||
|
@@ -117,12 +115,11 @@ def ingest_from_dataframe(self, df: "pandas.DataFrame", ingestion_properties: In | |
if not isinstance(df, DataFrame): | ||
raise ValueError("Expected DataFrame instance, found {}".format(type(df))) | ||
|
||
file_name = "df_{id}_{timestamp}_{uid}.csv.gz".format(id=id(df), timestamp=int(time.time()), uid=uuid.uuid4()) | ||
file_name = "df_{id}_{timestamp}_{uid}.json.gz".format(id=id(df), timestamp=int(time.time()), uid=uuid.uuid4()) | ||
temp_file_path = os.path.join(tempfile.gettempdir(), file_name) | ||
|
||
df.to_csv(temp_file_path, index=False, encoding="utf-8", header=False, compression="gzip") | ||
|
||
ingestion_properties.format = DataFormat.CSV | ||
with gzip.open(temp_file_path, "wb") as temp_file: | ||
df.to_json(temp_file, orient="records", date_format="iso", lines=True) | ||
ingestion_properties.format = DataFormat.JSON | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thought JSON was obsolete. |
||
|
||
try: | ||
return self.ingest_from_file(temp_file_path, ingestion_properties) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -529,15 +529,20 @@ async def test_streaming_ingest_from_dataframe(self): | |
"xtextWithNulls", | ||
"xdynamicWithNulls", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: but column name is not correct anymore (not sure if the table is created ad-hoc or not) |
||
] | ||
rows = [ | ||
[0, "00000000-0000-0000-0001-020304050607", 0.0, 0.0, 0, 0, 0, 0, 0, 0, 0, 0, "2014-01-01T01:01:01Z", "Zero", "Zero", "0", "00:00:00", None, ""] | ||
] | ||
|
||
guid = uuid.uuid4() | ||
|
||
dynamic_value = ["[email protected]", "[email protected]", "[email protected]"] | ||
rows = [[0, str(guid), 0.0, 0.0, 0, 0, 0, 0, 0, 0, 0, 0, "2014-01-01T01:01:01Z", "Zero", "Zero", "0", "00:00:00", None, dynamic_value]] | ||
df = DataFrame(data=rows, columns=fields) | ||
ingestion_properties = IngestionProperties(database=self.test_db, table=self.test_table, flush_immediately=True, data_format=DataFormat.CSV) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Leftover CSV There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's actually good to have, to make sure it forces the right format (or do we want to throw in this case) |
||
self.ingest_client.ingest_from_dataframe(df, ingestion_properties) | ||
|
||
await self.assert_rows_added(1, timeout=120) | ||
|
||
a = self.client.execute(self.test_db, f"{self.test_table} | where rowguid == '{guid}'") | ||
assert a.primary_results[0].rows[0]["xdynamicWithNulls"] == dynamic_value | ||
|
||
@pytest.mark.asyncio | ||
async def test_streaming_ingest_from_blob(self, is_managed_streaming): | ||
ingestion_properties = IngestionProperties( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is a breaking change - as this won't work the same for users who configured a CSV mapping
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's a good point, but before we had:
So if a user used a different format mapping it wouldn't work.
I can roll it into the next breaking version (with a note in the release notes),
Or I can check for a mapping and error? Or if the format doesn't match error?
I don't think we can get around not converting it to json
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note: there's another hidden breaking change here.
CSV mapping is commonly ordinal, while JSON mapping is by name.
This means that if the dataframe column names do not match the table column names, the mapping will fail.