[
https://issues.apache.org/jira/browse/SPARK-4182?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Cheng Lian updated SPARK-4182:
------------------------------
Description:
If a table contains a column whose type is binary, array, struct, map, or, for
some reason, boolean, in-memory columnar caching doesn't work, because
{{NoopColumnStats}} is used to collect statistics for columns of these types.
{{NoopColumnStats}} returns an empty statistics row, which breaks
{{InMemoryRelation}} statistics calculation.
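As a minimal, self-contained illustration of the failure mode (plain Scala, not
Spark code; the class name, statistics layout, and ordinal are made up), the
size-estimation step reads a per-column statistic by ordinal, which blows up as
soon as the statistics row is empty:
{code}
// Illustrative sketch only: mimics reading a column statistic by ordinal,
// roughly what InMemoryRelation's size estimation does.
object EmptyStatsRowSketch {
  // Stand-in for a statistics row: one slot per collected statistic.
  final class StatsRow(values: Array[Any]) {
    def apply(ordinal: Int): Any = values(ordinal)
  }

  def main(args: Array[String]): Unit = {
    // A populated row, with sizeInBytes placed at ordinal 4 for illustration.
    val populated = new StatsRow(Array(null, null, 0, 2, 2L))
    // What a no-op stats collector effectively produces: an empty row.
    val empty = new StatsRow(Array.empty)

    println(populated(4)) // 2
    println(empty(4))     // java.lang.ArrayIndexOutOfBoundsException: 4
  }
}
{code}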
Reproduction steps:
{code}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.types._
val sparkContext = sc
import sparkContext._
val sqlContext = new SQLContext(sparkContext)
import sqlContext._
case class BoolField(flag: Boolean)
val schemaRDD = parallelize(true :: false :: Nil).map(BoolField(_)).toSchemaRDD
schemaRDD.cache().count()  // materializes the in-memory columnar cache
schemaRDD.count()          // querying the cached table throws the exception below
{code}
Exception thrown:
{code}
java.lang.ArrayIndexOutOfBoundsException: 4
at org.apache.spark.sql.catalyst.expressions.GenericRow.apply(Row.scala:142)
at org.apache.spark.sql.catalyst.expressions.BoundReference.eval(BoundAttribute.scala:37)
at org.apache.spark.sql.columnar.InMemoryRelation$$anonfun$computeSizeInBytes$1.apply(InMemoryColumnarTableScan.scala:66)
at org.apache.spark.sql.columnar.InMemoryRelation$$anonfun$computeSizeInBytes$1.apply(InMemoryColumnarTableScan.scala:66)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
at scala.collection.AbstractTraversable.map(Traversable.scala:105)
at org.apache.spark.sql.columnar.InMemoryRelation.computeSizeInBytes(InMemoryColumnarTableScan.scala:66)
at org.apache.spark.sql.columnar.InMemoryRelation.statistics(InMemoryColumnarTableScan.scala:87)
at org.apache.spark.sql.columnar.InMemoryRelation.statisticsToBePropagated(InMemoryColumnarTableScan.scala:73)
at org.apache.spark.sql.columnar.InMemoryRelation.withOutput(InMemoryColumnarTableScan.scala:147)
at org.apache.spark.sql.CacheManager$$anonfun$useCachedData$1$$anonfun$applyOrElse$1.apply(CacheManager.scala:122)
at org.apache.spark.sql.CacheManager$$anonfun$useCachedData$1$$anonfun$applyOrElse$1.apply(CacheManager.scala:122)
...
{code}
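A possible direction for a fix is to give these types real statistics collectors
instead of {{NoopColumnStats}}, so the statistics row is always fully populated.
The sketch below covers only the boolean case and is a standalone mock: it does
not use Spark's actual {{ColumnStats}} trait, and the trait name, method names,
and statistics layout are all assumptions.
{code}
// Hypothetical sketch, not Spark's ColumnStats API.
trait SimpleColumnStats {
  def gatherStats(value: Any): Unit
  // Must always return one entry per statistic, never an empty row.
  def collectedStatistics: Seq[Any]
}

class SimpleBooleanColumnStats extends SimpleColumnStats {
  private var nullCount = 0
  private var count = 0
  private var sizeInBytes = 0L

  override def gatherStats(value: Any): Unit = {
    count += 1
    if (value == null) nullCount += 1
    else sizeInBytes += 1 // assume one byte per boolean value
  }

  // Assumed layout: lowerBound, upperBound, nullCount, count, sizeInBytes.
  override def collectedStatistics: Seq[Any] =
    Seq(null, null, nullCount, count, sizeInBytes)
}
{code}
With a collector like this substituted for the no-op version, the size
estimation in {{computeSizeInBytes}} would read a well-formed row instead of
failing.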
was:
If a table contains a column whose type is binary, array, struct, map, or, for
some reason, boolean, in-memory columnar caching doesn't work, because
{{NoopColumnStats}} is used to collect statistics for columns of these types.
{{NoopColumnStats}} returns an empty statistics row, which breaks
{{InMemoryRelation}} statistics calculation.
{code}
{code}
> Caching tables containing boolean, binary, array, struct and/or map columns doesn't work
> ----------------------------------------------------------------------------------------
>
> Key: SPARK-4182
> URL: https://issues.apache.org/jira/browse/SPARK-4182
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Affects Versions: 1.1.1
> Reporter: Cheng Lian
> Assignee: Cheng Lian
> Priority: Blocker
>
> If a table contains a column whose type is binary, array, struct, map, or, for
> some reason, boolean, in-memory columnar caching doesn't work, because
> {{NoopColumnStats}} is used to collect statistics for columns of these types.
> {{NoopColumnStats}} returns an empty statistics row, which breaks
> {{InMemoryRelation}} statistics calculation.
> Reproduction steps:
> {code}
> import org.apache.spark.sql.SQLContext
> import org.apache.spark.sql.catalyst.types._
> val sparkContext = sc
> import sparkContext._
> val sqlContext = new SQLContext(sparkContext)
> import sqlContext._
> case class BoolField(flag: Boolean)
> val schemaRDD = parallelize(true :: false :: Nil).map(BoolField(_)).toSchemaRDD
> schemaRDD.cache().count()
> schemaRDD.count()
> {code}
> Exception thrown:
> {code}
> java.lang.ArrayIndexOutOfBoundsException: 4
> at org.apache.spark.sql.catalyst.expressions.GenericRow.apply(Row.scala:142)
> at org.apache.spark.sql.catalyst.expressions.BoundReference.eval(BoundAttribute.scala:37)
> at org.apache.spark.sql.columnar.InMemoryRelation$$anonfun$computeSizeInBytes$1.apply(InMemoryColumnarTableScan.scala:66)
> at org.apache.spark.sql.columnar.InMemoryRelation$$anonfun$computeSizeInBytes$1.apply(InMemoryColumnarTableScan.scala:66)
> at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
> at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
> at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
> at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
> at scala.collection.AbstractTraversable.map(Traversable.scala:105)
> at org.apache.spark.sql.columnar.InMemoryRelation.computeSizeInBytes(InMemoryColumnarTableScan.scala:66)
> at org.apache.spark.sql.columnar.InMemoryRelation.statistics(InMemoryColumnarTableScan.scala:87)
> at org.apache.spark.sql.columnar.InMemoryRelation.statisticsToBePropagated(InMemoryColumnarTableScan.scala:73)
> at org.apache.spark.sql.columnar.InMemoryRelation.withOutput(InMemoryColumnarTableScan.scala:147)
> at org.apache.spark.sql.CacheManager$$anonfun$useCachedData$1$$anonfun$applyOrElse$1.apply(CacheManager.scala:122)
> at org.apache.spark.sql.CacheManager$$anonfun$useCachedData$1$$anonfun$applyOrElse$1.apply(CacheManager.scala:122)
> ...
> {code}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]