[GitHub] spark pull request #19060: [WIP][SQL] Add DataSourceSuite validating data so...

gatorsmile Tue, 05 Sep 2017 09:59:31 -0700

Github user gatorsmile commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19060#discussion_r137052603
  
    --- Diff: sql/hive/src/test/scala/org/apache/spark/sql/sources/DataSourceSuite.scala ---
    @@ -0,0 +1,125 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.spark.sql.sources
    +
    +import java.sql.{Date, Timestamp}
    +
    +import org.apache.orc.OrcConf
    +
    +import org.apache.spark.sql.{Dataset, QueryTest, Row}
    +import org.apache.spark.sql.hive.test.TestHiveSingleton
    +import org.apache.spark.sql.internal.SQLConf
    +import org.apache.spark.sql.test.SQLTestUtils
    +
    +/**
    + * Data Source qualification as Apache Spark Data Sources.
    + * - Apache Spark Data Type Value Limits: CSV, JSON, ORC, Parquet
    + * - Predicate Push Down: ORC
    + */
    +class DataSourceSuite
    +  extends QueryTest
    +  with SQLTestUtils
    +  with TestHiveSingleton {
    +
    +  import testImplicits._
    +
    +  var df: Dataset[Row] = _
    +
    +  override def beforeAll(): Unit = {
    +    super.beforeAll()
    +    spark.conf.set("spark.sql.session.timeZone", "GMT")
    +
    +    df = ((
    +      false,
    +      true,
    +      Byte.MinValue,
    +      Byte.MaxValue,
    +      Short.MinValue,
    +      Short.MaxValue,
    +      Int.MinValue,
    +      Int.MaxValue,
    +      Long.MinValue,
    +      Long.MaxValue,
    +      Float.MinValue,
    +      Float.MaxValue,
    +      Double.MinValue,
    +      Double.MaxValue,
    +      Date.valueOf("0001-01-01"),
    +      Date.valueOf("9999-12-31"),
    +      new Timestamp(-62135769600000L), // 0001-01-01 00:00:00.000
    +      new Timestamp(253402300799999L)  // 9999-12-31 23:59:59.999
    +    ) :: Nil).toDF()
    +  }
    +
    +  override def afterAll(): Unit = {
    +    try {
    +      spark.conf.unset("spark.sql.session.timeZone")
    +    } finally {
    +      super.afterAll()
    +    }
    +  }
    +
    +  Seq("parquet", "orc", "json", "csv").foreach { dataSource =>
    +    test(s"$dataSource - data type value limit") {
    +      withTempPath { dir =>
    +        df.write.format(dataSource).save(dir.getCanonicalPath)
    +
    +        // Use the same schema for saving/loading
    +        checkAnswer(
    +          
spark.read.format(dataSource).schema(df.schema).load(dir.getCanonicalPath),
    +          df)
    +
    +        // Use schema inference, but skip text-based format due to its 
limitation
    +        if (Seq("parquet", "orc").contains(dataSource)) {
    +          withTable("tab1") {
    +            sql(s"CREATE TABLE tab1 USING $dataSource LOCATION 
'${dir.toURI}'")
    +            checkAnswer(sql(s"SELECT ${df.schema.fieldNames.mkString(",")} 
FROM tab1"), df)
    +          }
    +        }
    +      }
    +    }
    +  }
    +
    +  Seq("orc").foreach { dataSource =>
    +    test(s"$dataSource - predicate push down") {
    +      withSQLConf(
    +        SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true",
    +        SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true") {
    +        withTempPath { dir =>
    +          // write 4000 rows with the integer and the string in a single 
orc file with stride 1000
    +          spark
    +            .range(4000)
    +            .map(i => (i, s"$i"))
    +            .toDF("i", "s")
    +            .repartition(1)
    +            .write
    +            .option(OrcConf.ROW_INDEX_STRIDE.getAttribute, 1000)
    +            // TODO: Add Parquet option, too.
    +            .format(dataSource)
    +            .save(dir.getCanonicalPath)
    +
    +          val df = spark.read.format(dataSource).load(dir.getCanonicalPath)
    +            .where(s"i BETWEEN 1500 AND 1999")
    --- End diff --
    
    Parquet is the default format. It is used by most of our Spark users. We
have already received many related JIRA issues and fixed or blocked them. You
were also involved in some of those PRs.
    
    To avoid repeating the same issue in ORC, we should improve the coverage 
before turning on ORC predicate pushdown by default.



---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request #19060: [WIP][SQL] Add DataSourceSuite validating data so...

Reply via email to