Github user cloud-fan commented on a diff in the pull request:

    https://github.com/apache/spark/pull/15821#discussion_r123654149
  
    --- Diff: python/pyspark/sql/tests.py ---
    @@ -2620,6 +2629,74 @@ def range_frame_match():
     
             importlib.reload(window)
     
    +
    +@unittest.skipIf(not _have_arrow, "Arrow not installed")
    +class ArrowTests(ReusedPySparkTestCase):
    +
    +    @classmethod
    +    def setUpClass(cls):
    +        ReusedPySparkTestCase.setUpClass()
    +        cls.spark = SparkSession(cls.sc)
    +        cls.spark.conf.set("spark.sql.execution.arrow.enable", "true")
    +        cls.schema = StructType([
    +            StructField("1_str_t", StringType(), True),
    +            StructField("2_int_t", IntegerType(), True),
    +            StructField("3_long_t", LongType(), True),
    +            StructField("4_float_t", FloatType(), True),
    +            StructField("5_double_t", DoubleType(), True)])
    +        cls.data = [("a", 1, 10, 0.2, 2.0),
    +                    ("b", 2, 20, 0.4, 4.0),
    +                    ("c", 3, 30, 0.8, 6.0)]
    +
    +    def assertFramesEqual(self, df_with_arrow, df_without):
    +        msg = ("DataFrame from Arrow is not equal" +
    +               ("\n\nWith Arrow:\n%s\n%s" % (df_with_arrow, 
df_with_arrow.dtypes)) +
    +               ("\n\nWithout:\n%s\n%s" % (df_without, df_without.dtypes)))
    +        self.assertTrue(df_without.equals(df_with_arrow), msg=msg)
    +
    +    def test_unsupported_datatype(self):
    +        schema = StructType([StructField("array", ArrayType(IntegerType(), 
False), True)])
    +        df = self.spark.createDataFrame([([1, 2, 3],)], schema=schema)
    +        with QuietTest(self.sc):
    +            self.assertRaises(Exception, lambda: df.toPandas())
    +
    +    def test_null_conversion(self):
    +        df_null = self.spark.createDataFrame([tuple([None for _ in 
range(len(self.data[0]))])] +
    +                                             self.data)
    +        pdf = df_null.toPandas()
    +        null_counts = pdf.isnull().sum().tolist()
    +        self.assertTrue(all([c == 1 for c in null_counts]))
    +
    +    def test_toPandas_arrow_toggle(self):
    +        df = self.spark.createDataFrame(self.data, schema=self.schema)
    +        self.spark.conf.set("spark.sql.execution.arrow.enable", "false")
    +        pdf = df.toPandas()
    +        self.spark.conf.set("spark.sql.execution.arrow.enable", "true")
    +        pdf_arrow = df.toPandas()
    +        self.assertFramesEqual(pdf_arrow, pdf)
    +
    +    def test_pandas_round_trip(self):
    +        import pandas as pd
    +        import numpy as np
    +        data_dict = {}
    +        for j, name in enumerate(self.schema.names):
    +            data_dict[name] = [self.data[i][j] for i in 
range(len(self.data))]
    +        # need to convert these to numpy types first
    --- End diff --
    
    is this still needed?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to