Github user cloud-fan commented on a diff in the pull request:

    https://github.com/apache/spark/pull/21696#discussion_r199991109

--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala ---
@@ -19,166 +19,186 @@ package org.apache.spark.sql.execution.datasources.parquet

 import java.sql.Date

+import scala.collection.JavaConverters._
+
 import org.apache.parquet.filter2.predicate._
 import org.apache.parquet.filter2.predicate.FilterApi._
 import org.apache.parquet.io.api.Binary
-import org.apache.parquet.schema.PrimitiveComparator
+import org.apache.parquet.schema._
+import org.apache.parquet.schema.OriginalType._
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._

 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.catalyst.util.DateTimeUtils.SQLDate
 import org.apache.spark.sql.sources
-import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String

 /**
  * Some utility function to convert Spark data source filters to Parquet filters.
  */
 private[parquet] class ParquetFilters(pushDownDate: Boolean, pushDownStartWith: Boolean) {

+  case class ParquetSchemaType(
+      originalType: OriginalType,
+      primitiveTypeName: PrimitiveType.PrimitiveTypeName,
+      decimalMetadata: DecimalMetadata)
+
   private def dateToDays(date: Date): SQLDate = {
     DateTimeUtils.fromJavaDate(date)
   }

-  private val makeEq: PartialFunction[DataType, (String, Any) => FilterPredicate] = {
-    case BooleanType =>
+  private val makeEq: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = {
+    // BooleanType
+    case ParquetSchemaType(null, BOOLEAN, null) =>
       (n: String, v: Any) => FilterApi.eq(booleanColumn(n), v.asInstanceOf[java.lang.Boolean])
-    case IntegerType =>
+    // IntegerType
+    case ParquetSchemaType(null, INT32, null) =>
--- End diff --

The safest way is to look at both the file's type and Spark's type, and deal with any type mismatch. We can do that later, since it's an existing problem.

Currently, Spark tries its best to guarantee that the types match (except for missing/extra columns). The only case I can think of that may break this assumption is: the Parquet files have conflicting schemas, and Spark reads them using a user-specified schema (so that schema inference can be skipped) that doesn't match all of the Parquet files.
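For context, here is a minimal sketch of the scenario described above: two Parquet files with conflicting physical types for the same column, read back with a user-specified schema so that schema inference is skipped. The paths, column name, and object name are hypothetical, and the exact failure mode (error vs. unexpected behavior) may vary by Spark version.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{LongType, StructField, StructType}

object ConflictingParquetSchemas {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("conflicting-parquet-schemas")
      .getOrCreate()
    import spark.implicits._

    // One file stores column "a" as INT32, the other as INT64.
    Seq(1, 2, 3).toDF("a").write.mode("overwrite").parquet("/tmp/conflict/int32")
    Seq(10L, 20L, 30L).toDF("a").write.mode("overwrite").parquet("/tmp/conflict/int64")

    // A user-specified schema skips inference (and schema merging), so the
    // declared Spark type (LongType) matches only one of the two files.
    val userSchema = StructType(Seq(StructField("a", LongType)))
    val df = spark.read
      .schema(userSchema)
      .parquet("/tmp/conflict/int32", "/tmp/conflict/int64")

    // A filter built from the Spark type may be pushed down against a Parquet
    // column whose physical type is INT32; depending on the Spark version this
    // read may fail or behave unexpectedly, which is the mismatch discussed above.
    df.filter($"a" > 5L).show()

    spark.stop()
  }
}
```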