Github user cloud-fan commented on a diff in the pull request:
https://github.com/apache/spark/pull/21696#discussion_r199991109
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala ---
@@ -19,166 +19,186 @@ package org.apache.spark.sql.execution.datasources.parquet
import java.sql.Date
+import scala.collection.JavaConverters._
+
import org.apache.parquet.filter2.predicate._
import org.apache.parquet.filter2.predicate.FilterApi._
import org.apache.parquet.io.api.Binary
-import org.apache.parquet.schema.PrimitiveComparator
+import org.apache.parquet.schema._
+import org.apache.parquet.schema.OriginalType._
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.catalyst.util.DateTimeUtils.SQLDate
import org.apache.spark.sql.sources
-import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
/**
* Some utility function to convert Spark data source filters to Parquet filters.
*/
private[parquet] class ParquetFilters(pushDownDate: Boolean, pushDownStartWith: Boolean) {
+ case class ParquetSchemaType(
+ originalType: OriginalType,
+ primitiveTypeName: PrimitiveType.PrimitiveTypeName,
+ decimalMetadata: DecimalMetadata)
+
private def dateToDays(date: Date): SQLDate = {
DateTimeUtils.fromJavaDate(date)
}
- private val makeEq: PartialFunction[DataType, (String, Any) => FilterPredicate] = {
- case BooleanType =>
+ private val makeEq: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = {
+ // BooleanType
+ case ParquetSchemaType(null, BOOLEAN, null) =>
(n: String, v: Any) => FilterApi.eq(booleanColumn(n), v.asInstanceOf[java.lang.Boolean])
- case IntegerType =>
+ // IntegerType
+ case ParquetSchemaType(null, INT32, null) =>
--- End diff ---
The safest way is to look at both the file's type and Spark's type, and deal
with any mismatch between them. We can do that later since it's an existing
problem. Currently Spark tries its best to guarantee the types match (except
for missing/extra columns). The only case I can think of that may break this
assumption is: the Parquet files have conflicting schemas, and Spark reads
them with a user-specified schema (so that schema inference can be skipped)
that doesn't match all of the Parquet files.
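
For illustration, a minimal sketch of that conflicting-schema scenario (the path and the values here are made up):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._

val spark = SparkSession.builder().master("local").getOrCreate()
import spark.implicits._

val path = "/tmp/mixed_parquet"  // hypothetical location
// Two writes leave files with conflicting physical types for column "a":
// the first write stores INT32, the appended write stores INT64.
Seq(1, 2, 3).toDF("a").write.parquet(path)
Seq(4L, 5L).toDF("a").write.mode("append").parquet(path)

// A user-specified schema skips inference, so a LongType filter can be
// pushed down against files whose physical type is actually INT32.
val df = spark.read
  .schema(StructType(Seq(StructField("a", LongType))))
  .parquet(path)
  .filter($"a" > 2L)
```

And a rough sketch of the "look at both types" idea (the helper name and the set of cases are hypothetical, not what this PR implements):

```scala
import org.apache.parquet.schema.PrimitiveType
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._
import org.apache.spark.sql.types._

// Push a filter down only when Spark's catalyst type agrees with the
// physical type recorded in the Parquet file footer.
def typesAgree(catalystType: DataType, parquetType: PrimitiveType): Boolean =
  (catalystType, parquetType.getPrimitiveTypeName) match {
    case (BooleanType, BOOLEAN) => true
    case (IntegerType, INT32)   => true
    case (LongType, INT64)      => true
    case (FloatType, FLOAT)     => true
    case (DoubleType, DOUBLE)   => true
    case _                      => false  // mismatch: skip pushdown
  }
```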
---