Commit e713bc3
update cargo.toml in python crate and fix unit test due to hash joins (#483)
* update cargo.toml
* fix group by
* remove unused imports
1 parent e82d053
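This commit pins the Python crate to a newer DataFusion revision in which joins and group-by aggregates execute as hash-partitioned operators, so DataFrame.collect() may return several RecordBatches rather than exactly one. The unit tests are updated to merge batches before asserting. A minimal pyarrow-only sketch of that merge-then-sort pattern (batch contents here are illustrative, not taken from the tests):

import pyarrow as pa

# Two batches, as a partitioned hash aggregate might emit them.
batch1 = pa.RecordBatch.from_arrays(
    [pa.array([1, 2]), pa.array([3, 1])], names=["key", "count"]
)
batch2 = pa.RecordBatch.from_arrays(
    [pa.array([3]), pa.array([2])], names=["key", "count"]
)

# Row order across batches is not guaranteed, so merge into one
# table and sort before comparing against expected values.
table = pa.Table.from_batches([batch1, batch2])
rows = sorted(zip(table.column("key").to_pylist(),
                  table.column("count").to_pylist()))
assert rows == [(1, 3), (2, 1), (3, 2)]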

File tree

5 files changed: +23 -29 lines

  python/Cargo.toml
  python/tests/generic.py
  python/tests/test_df.py
  python/tests/test_sql.py
  python/tests/test_udaf.py

python/Cargo.toml (+1, -1)

@@ -31,7 +31,7 @@ libc = "0.2"
 tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] }
 rand = "0.7"
 pyo3 = { version = "0.13.2", features = ["extension-module"] }
-datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "2423ff0d" }
+datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "c3fc0c75af5ff2ebb99dba197d9d2ccd83eb5952" }

 [lib]
 name = "datafusion"
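The full 40-character rev pins the bindings to a DataFusion revision that includes the hash-partitioned join and aggregate changes the test updates below accommodate.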

python/tests/generic.py (-6)

@@ -15,15 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.

-import unittest
-import tempfile
 import datetime
-import os.path
-import shutil
-
 import numpy
 import pyarrow
-import datafusion

 # used to write parquet files
 import pyarrow.parquet

python/tests/test_df.py (+9, -15)

@@ -19,11 +19,11 @@

 import pyarrow as pa
 import datafusion
+
 f = datafusion.functions


 class TestCase(unittest.TestCase):
-
     def _prepare(self):
         ctx = datafusion.ExecutionContext()

@@ -51,12 +51,10 @@ def test_select(self):
     def test_filter(self):
         df = self._prepare()

-        df = df \
-            .select(
-                f.col("a") + f.col("b"),
-                f.col("a") - f.col("b"),
-            ) \
-            .filter(f.col("a") > f.lit(2))
+        df = df.select(
+            f.col("a") + f.col("b"),
+            f.col("a") - f.col("b"),
+        ).filter(f.col("a") > f.lit(2))

         # execute and collect the first (and only) batch
         result = df.collect()[0]

@@ -66,12 +64,10 @@ def test_filter(self):

     def test_sort(self):
         df = self._prepare()
-        df = df.sort([
-            f.col("b").sort(ascending=False)
-        ])
+        df = df.sort([f.col("b").sort(ascending=False)])

         table = pa.Table.from_batches(df.collect())
-        expected = {'a': [3, 2, 1], 'b': [6, 5, 4]}
+        expected = {"a": [3, 2, 1], "b": [6, 5, 4]}
         self.assertEqual(table.to_pydict(), expected)

     def test_limit(self):

@@ -111,10 +107,8 @@ def test_join(self):
         df1 = ctx.create_dataframe([[batch]])

         df = df.join(df1, on="a", how="inner")
-        df = df.sort([
-            f.col("a").sort(ascending=True)
-        ])
+        df = df.sort([f.col("a").sort(ascending=True)])
         table = pa.Table.from_batches(df.collect())

-        expected = {'a': [1, 2], 'c': [8, 10], 'b': [4, 5]}
+        expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]}
         self.assertEqual(table.to_pydict(), expected)

python/tests/test_sql.py (+9, -3)

@@ -82,12 +82,18 @@ def test_execute(self):
         )

         # group by
-        result = ctx.sql(
+        results = ctx.sql(
             "SELECT CAST(a as int), COUNT(a) FROM t GROUP BY CAST(a as int)"
         ).collect()

-        result_keys = result[0].to_pydict()["CAST(a AS Int32)"]
-        result_values = result[0].to_pydict()["COUNT(a)"]
+        # group by returns batches
+        result_keys = []
+        result_values = []
+        for result in results:
+            pydict = result.to_pydict()
+            result_keys.extend(pydict["CAST(a AS Int32)"])
+            result_values.extend(pydict["COUNT(a)"])
+
         result_keys, result_values = (
             list(t) for t in zip(*sorted(zip(result_keys, result_values)))
         )
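An equivalent way to flatten the per-batch group-by output (a sketch, not part of this commit) is to let pyarrow.Table.from_batches do the concatenation; the column names mirror the query above, and the batch contents are hypothetical:

import pyarrow as pa

# Hypothetical stand-in for results = ctx.sql(...).collect(),
# which now yields a list of RecordBatches.
results = [
    pa.RecordBatch.from_arrays(
        [pa.array([1, 2]), pa.array([2, 1])],
        names=["CAST(a AS Int32)", "COUNT(a)"],
    ),
    pa.RecordBatch.from_arrays(
        [pa.array([3]), pa.array([1])],
        names=["CAST(a AS Int32)", "COUNT(a)"],
    ),
]

# from_batches() concatenates the batches into one logical table.
pydict = pa.Table.from_batches(results).to_pydict()
result_keys = pydict["CAST(a AS Int32)"]
result_values = pydict["COUNT(a)"]
assert sorted(result_keys) == [1, 2, 3]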

python/tests/test_udaf.py (+4, -4)

@@ -16,7 +16,6 @@
 # under the License.

 import unittest
-
 import pyarrow
 import pyarrow.compute
 import datafusion

@@ -86,6 +85,7 @@ def test_group_by(self):
         df = df.aggregate([f.col("b")], [udaf(f.col("a"))])

         # execute and collect the first (and only) batch
-        result = df.collect()[0]
-
-        self.assertEqual(result.column(1), pyarrow.array([1.0 + 2.0, 3.0]))
+        batches = df.collect()
+        arrays = [batch.column(1) for batch in batches]
+        joined = pyarrow.concat_arrays(arrays)
+        self.assertEqual(joined, pyarrow.array([1.0 + 2.0, 3.0]))
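pyarrow.concat_arrays stitches the per-batch result columns back into one contiguous array, so a single assertEqual still covers all groups. A standalone sketch of the call, with illustrative values:

import pyarrow

# One chunk per collected batch.
chunk1 = pyarrow.array([1.0 + 2.0])
chunk2 = pyarrow.array([3.0])
joined = pyarrow.concat_arrays([chunk1, chunk2])
assert joined.equals(pyarrow.array([3.0, 3.0]))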
