[
https://issues.apache.org/jira/browse/PARQUET-1336?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16521326#comment-16521326
]
ASF GitHub Bot commented on PARQUET-1336:
-----------------------------------------
wangyum opened a new pull request #497: PARQUET-1336: BinaryComparator should
implements Serializable
URL: https://github.com/apache/parquet-mr/pull/497
`BinaryComparator` should implement `Serializable`. Otherwise, the
following `UserDefinedPredicate` will throw `NotSerializableException`:
```scala
new UserDefinedPredicate[Binary] with Serializable {
private val strToBinary = Binary.fromReusedByteArray(v.getBytes)
private val size = strToBinary.length
val comparator =
PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR
override def canDrop(statistics: Statistics[Binary]): Boolean = {
val max = statistics.getMax
val min = statistics.getMin
comparator.compare(max.slice(0, math.min(size, max.length)),
strToBinary) < 0 ||
comparator.compare(min.slice(0, math.min(size, min.length)),
strToBinary) > 0
}
override def inverseCanDrop(statistics: Statistics[Binary]): Boolean =
false
override def keep(value: Binary): Boolean =
UTF8String.fromBytes(value.getBytes).startsWith(UTF8String.fromString(v))
}
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> BinaryComparator should implements Serializable
> ------------------------------------------------
>
> Key: PARQUET-1336
> URL: https://issues.apache.org/jira/browse/PARQUET-1336
> Project: Parquet
> Issue Type: Improvement
> Components: parquet-mr
> Affects Versions: 1.10.0
> Reporter: Yuming Wang
> Priority: Major
> Labels: pull-request-available
>
> {code:java}
> [info] Cause: java.lang.RuntimeException: java.io.NotSerializableException:
> org.apache.parquet.schema.PrimitiveComparator$8
> [info] at
> org.apache.parquet.hadoop.ParquetInputFormat.setFilterPredicate(ParquetInputFormat.java:211)
> [info] at
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReaderWithPartitionValues$1.apply(ParquetFileFormat.scala:399)
> [info] at
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReaderWithPartitionValues$1.apply(ParquetFileFormat.scala:349)
> [info] at
> org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:128)
> [info] at
> org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:182)
> [info] at
> org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:109)
> [info] at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
> [info] at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
> [info] at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
> [info] at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
> [info] at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
> [info] at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1791)
> [info] at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1162)
> [info] at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1162)
> [info] at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2071)
> [info] at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2071)
> [info] at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
> [info] at org.apache.spark.scheduler.Task.run(Task.scala:109)
> [info] at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:367)
> {code}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)