Github user dongjoon-hyun commented on a diff in the pull request:
https://github.com/apache/spark/pull/19060#discussion_r137064595
--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/sources/DataSourceSuite.scala ---
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.sources
+
+import java.sql.{Date, Timestamp}
+
+import org.apache.orc.OrcConf
+
+import org.apache.spark.sql.{Dataset, QueryTest, Row}
+import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.test.SQLTestUtils
+
+/**
+ * Data Source qualification as Apache Spark Data Sources.
+ * - Apache Spark Data Type Value Limits: CSV, JSON, ORC, Parquet
+ * - Predicate Push Down: ORC
+ */
+class DataSourceSuite
+ extends QueryTest
+ with SQLTestUtils
+ with TestHiveSingleton {
+
+ import testImplicits._
+
+ var df: Dataset[Row] = _
+
+ override def beforeAll(): Unit = {
+ super.beforeAll()
+ spark.conf.set("spark.sql.session.timeZone", "GMT")
+
+ df = ((
+ false,
+ true,
+ Byte.MinValue,
+ Byte.MaxValue,
+ Short.MinValue,
+ Short.MaxValue,
+ Int.MinValue,
+ Int.MaxValue,
+ Long.MinValue,
+ Long.MaxValue,
+ Float.MinValue,
+ Float.MaxValue,
+ Double.MinValue,
+ Double.MaxValue,
+ Date.valueOf("0001-01-01"),
+ Date.valueOf("9999-12-31"),
+ new Timestamp(-62135769600000L), // 0001-01-01 00:00:00.000
+ new Timestamp(253402300799999L) // 9999-12-31 23:59:59.999
+ ) :: Nil).toDF()
+ }
+
+ override def afterAll(): Unit = {
+ try {
+ spark.conf.unset("spark.sql.session.timeZone")
+ } finally {
+ super.afterAll()
+ }
+ }
+
+ Seq("parquet", "orc", "json", "csv").foreach { dataSource =>
+ test(s"$dataSource - data type value limit") {
+ withTempPath { dir =>
+ df.write.format(dataSource).save(dir.getCanonicalPath)
+
+ // Use the same schema for saving/loading
+ checkAnswer(
+ spark.read.format(dataSource).schema(df.schema).load(dir.getCanonicalPath),
+ df)
+
+ // Use schema inference, but skip text-based formats due to their limitations
+ if (Seq("parquet", "orc").contains(dataSource)) {
+ withTable("tab1") {
+ sql(s"CREATE TABLE tab1 USING $dataSource LOCATION
'${dir.toURI}'")
+ checkAnswer(sql(s"SELECT ${df.schema.fieldNames.mkString(",")}
FROM tab1"), df)
+ }
+ }
+ }
+ }
+ }
+
+ Seq("orc").foreach { dataSource =>
+ test(s"$dataSource - predicate push down") {
+ withSQLConf(
+ SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true",
+ SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true") {
+ withTempPath { dir =>
+ // Write 4000 rows of (integer, string) pairs into a single ORC file with row index stride 1000
+ spark
+ .range(4000)
+ .map(i => (i, s"$i"))
+ .toDF("i", "s")
+ .repartition(1)
+ .write
+ .option(OrcConf.ROW_INDEX_STRIDE.getAttribute, 1000)
+ // TODO: Add Parquet option, too.
+ .format(dataSource)
+ .save(dir.getCanonicalPath)
+
+ val df = spark.read.format(dataSource).load(dir.getCanonicalPath)
+ .where(s"i BETWEEN 1500 AND 1999")
--- End diff ---
Yep. This PR is based on your guidelines below, and I hope it will continue to evolve. :)
- Handle all data sources (for the value ranges, I tested Spark's min/max values instead of each data source's min/max)
- Use high-level end-to-end test cases
For predicate push down (PPD), Parquet and ORC are the only data sources that support it, aren't they?
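One direction the PPD test could evolve: assert that the predicate actually reaches the ORC reader by inspecting the scan node, instead of only checking the returned rows. A minimal sketch against the test's filtered `df` above; it assumes `FileSourceScanExec.metadata` exposes a `PushedFilters` entry and that the filters render as `GreaterThanOrEqual(i,1500)` etc., both of which may differ across Spark versions:

```scala
import org.apache.spark.sql.execution.FileSourceScanExec

// Collect the file scan node from the executed physical plan.
val scans = df.queryExecution.executedPlan.collect {
  case scan: FileSourceScanExec => scan
}
assert(scans.nonEmpty, "expected a FileSourceScanExec in the physical plan")

// BETWEEN is rewritten to >= and <=, so both bounds should be pushed down.
val pushed = scans.head.metadata("PushedFilters")
assert(pushed.contains("GreaterThanOrEqual(i,1500)"))
assert(pushed.contains("LessThanOrEqual(i,1999)"))
```

This would fail fast if a future change silently stopped pushing the filters, whereas `checkAnswer` alone would still pass because Spark re-applies the predicate after the scan.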
---