Github user BryanCutler commented on a diff in the pull request:
https://github.com/apache/spark/pull/15821#discussion_r123121124
--- Diff: python/pyspark/sql/tests.py ---
@@ -2473,6 +2482,76 @@ def range_frame_match():
importlib.reload(window)
+
+@unittest.skipIf(not _have_arrow, "Arrow not installed")
+class ArrowTests(ReusedPySparkTestCase):
+
+ @classmethod
+ def setUpClass(cls):
+ ReusedPySparkTestCase.setUpClass()
+ cls.spark = SparkSession(cls.sc)
+ cls.spark.conf.set("spark.sql.execution.arrow.enable", "true")
+ cls.schema = StructType([
+ StructField("1_str_t", StringType(), True),
+ StructField("2_int_t", IntegerType(), True),
+ StructField("3_long_t", LongType(), True),
+ StructField("4_float_t", FloatType(), True),
+ StructField("5_double_t", DoubleType(), True)])
+ cls.data = [("a", 1, 10, 0.2, 2.0),
+ ("b", 2, 20, 0.4, 4.0),
+ ("c", 3, 30, 0.8, 6.0)]
+
+ def assertFramesEqual(self, df_with_arrow, df_without):
+ msg = ("DataFrame from Arrow is not equal" +
+ ("\n\nWith Arrow:\n%s\n%s" % (df_with_arrow,
df_with_arrow.dtypes)) +
+ ("\n\nWithout:\n%s\n%s" % (df_without, df_without.dtypes)))
+ self.assertTrue(df_without.equals(df_with_arrow), msg=msg)
+
+ def test_unsupported_datatype(self):
+ schema = StructType([StructField("array", ArrayType(IntegerType(), False), True)])
+ df = self.spark.createDataFrame([([1, 2, 3],)], schema=schema)
+ with QuietTest(self.sc):
+ self.assertRaises(Exception, lambda: df.toPandas())
+
+ def test_null_conversion(self):
+ df_null = self.spark.createDataFrame([tuple([None for _ in range(len(self.data[0]))])] +
+ self.data)
+ pdf = df_null.toPandas()
+ null_counts = pdf.isnull().sum().tolist()
+ self.assertTrue(all([c == 1 for c in null_counts]))
+
+ def test_toPandas_arrow_toggle(self):
+ df = self.spark.createDataFrame(self.data, schema=self.schema)
+ # NOTE - toPandas() without pyarrow will infer standard python data types
--- End diff --
What I meant by this is: if your Spark DataFrame has a column of
`IntegerType` and `toPandas` is called without Arrow, Spark pickles the data
to standard Python objects (Python has only a single int type). A pandas
DataFrame is then created from those Python ints using `from_records`, and
that constructor forces pandas to infer the data type, so it chooses
`int64`. With Arrow, the Spark schema is carried over, so pandas knows the
correct data type is `int32`. If this test compared columns of `IntegerType`
or `FloatType` with and without Arrow, the schemas would therefore not
match. The data is the same either way, but I would say the schema is now
correct with Arrow.
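
A minimal sketch of that inference difference in plain pandas (no Spark
involved; the column name `2_int_t` is just borrowed from the test schema
above, and the final `astype` cast only stands in for what Arrow's
carried-over schema does):

```python
import pandas as pd

records = [(1,), (2,), (3,)]  # plain Python ints, as produced by pickling

# Without Arrow: from_records has no schema, so pandas infers the dtype
# and typically picks int64 for Python ints.
inferred = pd.DataFrame.from_records(records, columns=["2_int_t"])
print(inferred.dtypes["2_int_t"])  # int64

# With Arrow: the Spark schema is carried over, so an IntegerType column
# keeps its 32-bit width. The explicit cast here just simulates that.
typed = inferred.astype({"2_int_t": "int32"})
print(typed.dtypes["2_int_t"])  # int32
```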