[GitHub] spark pull request #19060: [WIP][SQL] Add DataSourceSuite validating data so...

gatorsmile Tue, 05 Sep 2017 10:27:01 -0700

Github user gatorsmile commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19060#discussion_r137060959
  
    --- Diff: 
sql/hive/src/test/scala/org/apache/spark/sql/sources/DataSourceSuite.scala ---
    @@ -0,0 +1,125 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.spark.sql.sources
    +
    +import java.sql.{Date, Timestamp}
    +
    +import org.apache.orc.OrcConf
    +
    +import org.apache.spark.sql.{Dataset, QueryTest, Row}
    +import org.apache.spark.sql.hive.test.TestHiveSingleton
    +import org.apache.spark.sql.internal.SQLConf
    +import org.apache.spark.sql.test.SQLTestUtils
    +
    +/**
    + * Data Source qualification as Apache Spark Data Sources.
    + * - Apache Spark Data Type Value Limits: CSV, JSON, ORC, Parquet
    + * - Predicate Push Down: ORC
    + */
    +class DataSourceSuite
    +  extends QueryTest
    +  with SQLTestUtils
    +  with TestHiveSingleton {
    +
    +  import testImplicits._
    +
    +  var df: Dataset[Row] = _
    +
    +  override def beforeAll(): Unit = {
    +    super.beforeAll()
    +    spark.conf.set("spark.sql.session.timeZone", "GMT")
    +
    +    df = ((
    +      false,
    +      true,
    +      Byte.MinValue,
    +      Byte.MaxValue,
    +      Short.MinValue,
    +      Short.MaxValue,
    +      Int.MinValue,
    +      Int.MaxValue,
    +      Long.MinValue,
    +      Long.MaxValue,
    +      Float.MinValue,
    +      Float.MaxValue,
    +      Double.MinValue,
    +      Double.MaxValue,
    +      Date.valueOf("0001-01-01"),
    +      Date.valueOf("9999-12-31"),
    +      new Timestamp(-62135769600000L), // 0001-01-01 00:00:00.000
    +      new Timestamp(253402300799999L)  // 9999-12-31 23:59:59.999
    +    ) :: Nil).toDF()
    +  }
    +
    +  override def afterAll(): Unit = {
    +    try {
    +      spark.conf.unset("spark.sql.session.timeZone")
    +    } finally {
    +      super.afterAll()
    +    }
    +  }
    +
    +  Seq("parquet", "orc", "json", "csv").foreach { dataSource =>
    +    test(s"$dataSource - data type value limit") {
    +      withTempPath { dir =>
    +        df.write.format(dataSource).save(dir.getCanonicalPath)
    +
    +        // Use the same schema for saving/loading
    +        checkAnswer(
    +          
spark.read.format(dataSource).schema(df.schema).load(dir.getCanonicalPath),
    +          df)
    +
    +        // Use schema inference, but skip text-based format due to its 
limitation
    +        if (Seq("parquet", "orc").contains(dataSource)) {
    +          withTable("tab1") {
    +            sql(s"CREATE TABLE tab1 USING $dataSource LOCATION 
'${dir.toURI}'")
    +            checkAnswer(sql(s"SELECT ${df.schema.fieldNames.mkString(",")} 
FROM tab1"), df)
    +          }
    +        }
    +      }
    +    }
    +  }
    +
    +  Seq("orc").foreach { dataSource =>
    +    test(s"$dataSource - predicate push down") {
    +      withSQLConf(
    +        SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true",
    +        SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true") {
    +        withTempPath { dir =>
    +          // write 4000 rows with the integer and the string in a single 
orc file with stride 1000
    +          spark
    +            .range(4000)
    +            .map(i => (i, s"$i"))
    +            .toDF("i", "s")
    +            .repartition(1)
    +            .write
    +            .option(OrcConf.ROW_INDEX_STRIDE.getAttribute, 1000)
    +            // TODO: Add Parquet option, too.
    +            .format(dataSource)
    +            .save(dir.getCanonicalPath)
    +
    +          val df = spark.read.format(dataSource).load(dir.getCanonicalPath)
    +            .where(s"i BETWEEN 1500 AND 1999")
    --- End diff --
    
    I have different philosophy. I worked for mission-critical enterprise 
software. I do not trust anybody's codes unless they can pass the corresponding 
tests. 
    
    The initial ORC PPD does not have enough test coverage. To enable it by 
default, we should improve the test coverage now. The new test case framework 
for PPD should cover all the sources. We can have source-speicific ones, if it 
is not applicable to the other sources.



---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request #19060: [WIP][SQL] Add DataSourceSuite validating data so...

Reply via email to