[ https://issues.apache.org/jira/browse/SPARK-8128?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14576027#comment-14576027 ]
Brad Willard edited comment on SPARK-8128 at 6/11/15 9:36 PM:
--------------------------------------------------------------

Initially I thought this was a bug related to the same attribute name being reused in the nested structure. I've now hit it with columns that have unique names as well: with a data file of 25 million records of the schema below, I get the same exception. I also want to note that I'm building these Parquet files with Spark itself, from JSON files. This seems like a definite bug in the schema merging.

sdf.printSchema()

root
 |-- _id_s: string (nullable = true)
 |-- className_s: string (nullable = true)
 |-- collectionId_s: string (nullable = true)
 |-- data_m: struct (nullable = true)
 |    |-- html_s_1: string (nullable = true)
 |    |-- layout_s_1: string (nullable = true)
 |    |-- raw_s_1: string (nullable = true)
 |    |-- updatedOn_s_1: string (nullable = true)
 |-- updatedOn_s: string (nullable = true)
 |-- websiteId_s: string (nullable = true)
 |-- brickSize_i: long (nullable = true)
 |-- bricks_l: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- block_m_1: struct (nullable = true)
 |    |    |    |-- className_s_2: string (nullable = true)
 |    |    |    |-- imageId_s_2: string (nullable = true)
 |    |    |    |-- lightbox_s_2: string (nullable = true)
 |    |    |    |-- newWindow_s_2: string (nullable = true)
 |    |    |    |-- stretch_s_2: string (nullable = true)
 |    |    |    |-- type_i_2: long (nullable = true)
 |    |    |    |-- vSize_i_2: long (nullable = true)
 |    |    |-- column_i_1: long (nullable = true)
 |    |    |-- height_i_1: long (nullable = true)
 |    |    |-- row_i_1: long (nullable = true)
 |    |    |-- width_i_1: long (nullable = true)
 |-- unitsPerRow_i: long (nullable = true)
 |-- key: string (nullable = true)

sdf.where(sdf.unitsPerRow_i > 0).count()

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-6-41af5d8cf204> in <module>()
----> 1 sdf.where(sdf.unitsPerRow_i > 0).count()

/root/spark/python/pyspark/sql/dataframe.pyc in count(self)
    282         2L
    283         """
--> 284         return self._jdf.count()
    285
    286     def collect(self):

/root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
    536         answer = self.gateway_client.send_command(command)
    537         return_value = get_return_value(answer, self.gateway_client,
--> 538             self.target_id, self.name)
    539
    540         for temp_arg in temp_args:

/root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    298             raise Py4JJavaError(
    299                 'An error occurred while calling {0}{1}{2}.\n'.
--> 300                 format(target_id, '.', name), value)
    301         else:
    302             raise Py4JError(

Py4JJavaError: An error occurred while calling o53.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 44 in stage 4.0 failed 30 times, most recent failure: Lost task 44.29 in stage 4.0 (TID 4700, XXXXXXXXXX): java.lang.IllegalArgumentException: Column [unitsPerRow_i] was not found in schema!
    at parquet.Preconditions.checkArgument(Preconditions.java:47)
    at parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:172)
    at parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:160)
    at parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:142)
    at parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100)
    at parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:41)
    at parquet.filter2.predicate.Operators$Gt.accept(Operators.java:217)
    at parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:46)
    at parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:41)
    at parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:22)
    at parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:108)
    at parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:28)
    at parquet.hadoop.ParquetRecordReader.initializeInternalReader(ParquetRecordReader.java:158)
    at parquet.hadoop.ParquetRecordReader.initialize(ParquetRecordReader.java:138)
    at org.apache.spark.rdd.NewHadoopRDD$$anon$1.<init>(NewHadoopRDD.scala:133)
    at org.apache.spark.rdd.NewHadoopRDD.compute(NewHadoopRDD.scala:104)
    at org.apache.spark.rdd.NewHadoopRDD.compute(NewHadoopRDD.scala:66)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:68)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
    at org.apache.spark.scheduler.Task.run(Task.scala:64)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1204)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1193)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1192)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
    at scala.Option.foreach(Option.scala:236)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
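For completeness, the failure should be reproducible from scratch along these lines. This is a minimal sketch, not my exact pipeline: the paths and app name are hypothetical, it uses the Spark 1.3/1.4-era APIs (jsonRDD, saveAsParquetFile, parquetFile), and it assumes schema merging is active and Parquet filter pushdown is enabled via spark.sql.parquet.filterPushdown, since the pushed-down row-group filter is the code path in the trace above.

from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext(appName="spark8128_repro")  # hypothetical app name
sql_context = SQLContext(sc)

# The failing path is the pushed-down row-group filter; make sure it is on
# (the default for this conf differs across 1.3/1.4).
sql_context.setConf("spark.sql.parquet.filterPushdown", "true")

# Build two Parquet parts from JSON, as in the report; only part2 carries
# unitsPerRow_i, so the merged schema is a superset of part1's file schema.
sql_context.jsonRDD(sc.parallelize(['{"key": "a"}'])) \
    .saveAsParquetFile("/tmp/big_data_folder/part1")
sql_context.jsonRDD(sc.parallelize(['{"key": "b", "unitsPerRow_i": 4}'])) \
    .saveAsParquetFile("/tmp/big_data_folder/part2")

# The merged schema shows unitsPerRow_i ...
sdf = sql_context.parquetFile("/tmp/big_data_folder/part1",
                              "/tmp/big_data_folder/part2")
sdf.printSchema()

# ... but the Gt predicate is validated against part1's own file schema,
# which lacks the column, raising the IllegalArgumentException above.
sdf.where(sdf.unitsPerRow_i > 0).count()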
> Schema Merging Broken: Dataframe Fails to Recognize Column in Schema
> ---------------------------------------------------------------------
>
>                 Key: SPARK-8128
>                 URL: https://issues.apache.org/jira/browse/SPARK-8128
>             Project: Spark
>          Issue Type: Bug
>          Components: PySpark, Spark Core
>    Affects Versions: 1.3.0, 1.3.1, 1.4.0
>            Reporter: Brad Willard
>
> I'm loading a folder of about 600 parquet files into one dataframe, so schema merging is involved. There is some bug in the schema merging: printing the schema shows the attribute, but running a query that filters on that attribute errors, saying it's not in the schema.
> I think this bug could be related to an attribute name being reused in a nested object. "mediaProcessingState" appears twice in the schema and is the problem.
>
> sdf = sql_context.parquet('/parquet/big_data_folder')
> sdf.printSchema()
>
> root
>  |-- _id: string (nullable = true)
>  |-- addedOn: string (nullable = true)
>  |-- attachment: string (nullable = true)
> .......
>  |-- items: array (nullable = true)
>  |    |-- element: struct (containsNull = true)
>  |    |    |-- _id: string (nullable = true)
>  |    |    |-- addedOn: string (nullable = true)
>  |    |    |-- authorId: string (nullable = true)
>  |    |    |-- mediaProcessingState: long (nullable = true)
>  |-- mediaProcessingState: long (nullable = true)
>  |-- title: string (nullable = true)
>  |-- key: string (nullable = true)
>
> sdf.filter(sdf.mediaProcessingState == 3).count()
>
> causes this exception:
>
> Py4JJavaError: An error occurred while calling o67.count.
> : org.apache.spark.SparkException: Job aborted due to stage failure: Task 1106 in stage 4.0 failed 30 times, most recent failure: Lost task 1106.29 in stage 4.0 (TID 70565, XXXXXXXXXXXXXXX): java.lang.IllegalArgumentException: Column [mediaProcessingState] was not found in schema!
>     at parquet.Preconditions.checkArgument(Preconditions.java:47)
>     at parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:172)
>     at parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:160)
>     at parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:142)
>     at parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:76)
>     at parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:41)
>     at parquet.filter2.predicate.Operators$Eq.accept(Operators.java:162)
>     at parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:46)
>     at parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:41)
>     at parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:22)
>     at parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:108)
>     at parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:28)
>     at parquet.hadoop.ParquetRecordReader.initializeInternalReader(ParquetRecordReader.java:158)
>     at parquet.hadoop.ParquetRecordReader.initialize(ParquetRecordReader.java:138)
>     at org.apache.spark.rdd.NewHadoopRDD$$anon$1.<init>(NewHadoopRDD.scala:133)
>     at org.apache.spark.rdd.NewHadoopRDD.compute(NewHadoopRDD.scala:104)
>     at org.apache.spark.rdd.NewHadoopRDD.compute(NewHadoopRDD.scala:66)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
>     at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
>     at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
>     at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
>     at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
>     at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
>     at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:68)
>     at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
>     at org.apache.spark.scheduler.Task.run(Task.scala:64)
>     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>     at java.lang.Thread.run(Thread.java:745)
>
> You also get the same error if you register it as a temp table and try to execute the same SQL query.
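A possible mitigation while the merging bug stands, as a sketch only (not verified against these datasets; it reuses the sql_context and sdf from the quoted report, and "docs" is a hypothetical table name): disable Parquet filter pushdown so the predicate is evaluated by Spark against the merged schema instead of being validated per-file by parquet's SchemaCompatibilityValidator, which is where the trace above fails.

# Workaround sketch: with pushdown off, the predicate never reaches
# parquet.filter2's RowGroupFilter / SchemaCompatibilityValidator.
sql_context.setConf("spark.sql.parquet.filterPushdown", "false")

sdf = sql_context.parquetFile('/parquet/big_data_folder')
sdf.filter(sdf.mediaProcessingState == 3).count()

# The temp-table SQL form mentioned above should be unblocked the same way.
sdf.registerTempTable("docs")
sql_context.sql("SELECT COUNT(*) FROM docs WHERE mediaProcessingState = 3").collect()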