This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-2.4 by this push:
     new 98aebf4  [SPARK-28371][SQL] Make Parquet "StartsWith" filter null-safe
98aebf4 is described below

commit 98aebf44f943050eccbdcc6f7213b384cd17cc1d
Author: Marcelo Vanzin <van...@cloudera.com>
AuthorDate: Sat Jul 13 11:38:54 2019 -0700

    [SPARK-28371][SQL] Make Parquet "StartsWith" filter null-safe

    Parquet may call the filter with a null value to check whether nulls are
    accepted. While it seems Spark avoids that path in Parquet with 1.10, in
    1.11 that causes Spark unit tests to fail.

    Tested with Parquet 1.11 (and new unit test).

    Closes #25140 from vanzin/SPARK-28371.

    Authored-by: Marcelo Vanzin <van...@cloudera.com>
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
    (cherry picked from commit 7f9da2b7f8a2331ce403cd7afecfd874f8049c04)
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
 .../spark/sql/execution/datasources/parquet/ParquetFilters.scala | 2 +-
 .../sql/execution/datasources/parquet/ParquetFilterSuite.scala | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala
index 0c286de..7e420d3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala
@@ -541,7 +541,7 @@ private[parquet] class ParquetFilters(
           }

           override def keep(value: Binary): Boolean = {
-            UTF8String.fromBytes(value.getBytes).startsWith(
+            value != null && UTF8String.fromBytes(value.getBytes).startsWith(
               UTF8String.fromBytes(strToBinary.getBytes))
           }
         }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala
index 7ebb750..0f04e82 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala
@@ -955,6 +955,14 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex
       }
     }

+    // SPARK-28371: make sure filter is null-safe.
+    withParquetDataFrame(Seq(Tuple1[String](null))) { implicit df =>
+      checkFilterPredicate(
+        '_1.startsWith("blah").asInstanceOf[Predicate],
+        classOf[UserDefinedByInstance[_, _]],
+        Seq.empty[Row])
+    }
+
     import testImplicits._
     // Test canDrop() has taken effect
     testStringStartsWith(spark.range(1024).map(_.toString).toDF(), "value like 'a%'")

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org