[ https://issues.apache.org/jira/browse/SPARK-8128?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14576027#comment-14576027 ]
Brad Willard commented on SPARK-8128:
-------------------------------------
Initially I thought this was a bug related to the same attribute name being
reused in the nested structure. I've now experienced it with other columns
that have unique names. With a data file of 25 million records of this schema
I was able to get the same exception. I also want to note that I'm building
these parquet files with Spark itself from JSON files.
This seems like a definite bug in the schema merging.
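For context, the pipeline looks roughly like this (a sketch only: the paths
and batch names are placeholders, not the real job). Each JSON batch can
infer a slightly different schema, which is what forces the merge when the
folder is read back:
# Sketch with placeholder paths: build parquet from JSON with Spark itself,
# then load the whole folder so the per-file schemas get merged.
json_df = sql_context.jsonFile('/json/batch_0001')                # schema inferred from JSON
json_df.saveAsParquetFile('/parquet/big_data_folder/batch_0001')  # one parquet dir per batch
sdf = sql_context.parquet('/parquet/big_data_folder')             # merged schema across files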
sdf.printSchema()
root
|-- _id_s: string (nullable = true)
|-- className_s: string (nullable = true)
|-- collectionId_s: string (nullable = true)
|-- data_m: struct (nullable = true)
| |-- html_s_1: string (nullable = true)
| |-- layout_s_1: string (nullable = true)
| |-- raw_s_1: string (nullable = true)
| |-- updatedOn_s_1: string (nullable = true)
|-- updatedOn_s: string (nullable = true)
|-- websiteId_s: string (nullable = true)
|-- brickSize_i: long (nullable = true)
|-- bricks_l: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- block_m_1: struct (nullable = true)
| | | |-- className_s_2: string (nullable = true)
| | | |-- imageId_s_2: string (nullable = true)
| | | |-- lightbox_s_2: string (nullable = true)
| | | |-- newWindow_s_2: string (nullable = true)
| | | |-- stretch_s_2: string (nullable = true)
| | | |-- type_i_2: long (nullable = true)
| | | |-- vSize_i_2: long (nullable = true)
| | |-- column_i_1: long (nullable = true)
| | |-- height_i_1: long (nullable = true)
| | |-- row_i_1: long (nullable = true)
| | |-- width_i_1: long (nullable = true)
|-- unitsPerRow_i: long (nullable = true)
|-- key: string (nullable = true)
sdf.where(sdf.unitsPerRow_i > 0).count()
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-6-41af5d8cf204> in <module>()
----> 1 sdf.where(sdf.unitsPerRow_i > 0).count()
/root/spark/python/pyspark/sql/dataframe.pyc in count(self)
282 2L
283 """
--> 284 return self._jdf.count()
285
286 def collect(self):
/root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling o53.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 44 in
stage 4.0 failed 30 times, most recent failure: Lost task 44.29 in stage 4.0
(TID 4700, XXXXXXXXXX): java.lang.IllegalArgumentException: Column
[unitsPerRow_i] was not found in schema!
at parquet.Preconditions.checkArgument(Preconditions.java:47)
at parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:172)
at parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:160)
at parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:142)
at parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100)
at parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:41)
at parquet.filter2.predicate.Operators$Gt.accept(Operators.java:217)
at parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:46)
at parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:41)
at parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:22)
at parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:108)
at parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:28)
at parquet.hadoop.ParquetRecordReader.initializeInternalReader(ParquetRecordReader.java:158)
at parquet.hadoop.ParquetRecordReader.initialize(ParquetRecordReader.java:138)
at org.apache.spark.rdd.NewHadoopRDD$$anon$1.<init>(NewHadoopRDD.scala:133)
at org.apache.spark.rdd.NewHadoopRDD.compute(NewHadoopRDD.scala:104)
at org.apache.spark.rdd.NewHadoopRDD.compute(NewHadoopRDD.scala:66)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:68)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:64)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1204)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1193)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1192)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
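One more observation: the failure comes out of Parquet's row-group filter
(SchemaCompatibilityValidator), so the pushed-down predicate is being
validated against each individual file's schema rather than the merged one;
presumably some of the files simply don't contain the column. As a possible
workaround (a sketch, not a fix), disabling filter pushdown should skip that
validation path:
# Workaround sketch: turn off Parquet filter pushdown so the predicate is
# evaluated by Spark after the rows are read, instead of being checked
# against each file's own Parquet schema. Costs row-group skipping.
sql_context.setConf('spark.sql.parquet.filterPushdown', 'false')
sdf.where(sdf.unitsPerRow_i > 0).count()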
> Dataframe Fails to Recognize Column in Schema
> ---------------------------------------------
>
> Key: SPARK-8128
> URL: https://issues.apache.org/jira/browse/SPARK-8128
> Project: Spark
> Issue Type: Bug
> Components: PySpark, Spark Core
> Affects Versions: 1.3.1
> Reporter: Brad Willard
>
> I'm loading a folder of about 600 parquet files into one dataframe, so
> schema merging is involved. There is some bug in the schema merging:
> printing the schema shows the attribute, but running a query that filters
> on that attribute errors saying it's not in the schema.
> I think this bug could be related to an attribute name being reused in a
> nested object. "mediaProcessingState" appears twice in the schema and is the
> problem.
> sdf = sql_context.parquet('/parquet/big_data_folder')
> sdf.printSchema()
> root
> |-- _id: string (nullable = true)
> |-- addedOn: string (nullable = true)
> |-- attachment: string (nullable = true)
> .......
> |-- items: array (nullable = true)
> | |-- element: struct (containsNull = true)
> | | |-- _id: string (nullable = true)
> | | |-- addedOn: string (nullable = true)
> | | |-- authorId: string (nullable = true)
> | | |-- mediaProcessingState: long (nullable = true)
> |-- mediaProcessingState: long (nullable = true)
> |-- title: string (nullable = true)
> |-- key: string (nullable = true)
> sdf.filter(sdf.mediaProcessingState == 3).count()
> causes this exception
> Py4JJavaError: An error occurred while calling o67.count.
> : org.apache.spark.SparkException: Job aborted due to stage failure: Task
> 1106 in stage 4.0 failed 30 times, most recent failure: Lost task 1106.29 in
> stage 4.0 (TID 70565, XXXXXXXXXXXXXXX): java.lang.IllegalArgumentException:
> Column [mediaProcessingState] was not found in schema!
> at parquet.Preconditions.checkArgument(Preconditions.java:47)
> at parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:172)
> at parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:160)
> at parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:142)
> at parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:76)
> at parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:41)
> at parquet.filter2.predicate.Operators$Eq.accept(Operators.java:162)
> at parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:46)
> at parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:41)
> at parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:22)
> at parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:108)
> at parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:28)
> at parquet.hadoop.ParquetRecordReader.initializeInternalReader(ParquetRecordReader.java:158)
> at parquet.hadoop.ParquetRecordReader.initialize(ParquetRecordReader.java:138)
> at org.apache.spark.rdd.NewHadoopRDD$$anon$1.<init>(NewHadoopRDD.scala:133)
> at org.apache.spark.rdd.NewHadoopRDD.compute(NewHadoopRDD.scala:104)
> at org.apache.spark.rdd.NewHadoopRDD.compute(NewHadoopRDD.scala:66)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
> at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:68)
> at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
> at org.apache.spark.scheduler.Task.run(Task.scala:64)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
> at java.lang.Thread.run(Thread.java:745)
> You also get the same error if you register the dataframe as a temp table
> and execute the same query via SQL.
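> For example (a sketch; the temp table name is arbitrary):
> sdf.registerTempTable('records')
> sql_context.sql('SELECT COUNT(*) FROM records WHERE mediaProcessingState = 3').collect()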