[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15851965#comment-15851965 ] Cheng Lian commented on SPARK-18539: SPARK-19409 upgraded parquet-mr to 1.8.2, which fixes this issue. > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15840857#comment-15840857 ] Liang-Chi Hsieh commented on SPARK-18539: - [~lian cheng] Yea, I see. The term {{optional}} is more proper and won't cause misunderstanding. If we can upgrade to 1.8.2, that is great: we can then remove the workaround. The workaround is actually a hacky solution. > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15840186#comment-15840186 ] Cheng Lian commented on SPARK-18539: [~viirya], sorry for the (super) late reply. What I mentioned was a *nullable* column instead of a *null* column. To be more specific, say we have two Parquet files: - File {{A}} has columns {{a, b}} - File {{B}} has columns {{a, b, c}}, where {{c}} is marked as nullable (or {{optional}} in Parquet terms) Then it should be fine to treat these two files as a single dataset with a merged schema {{a, b, c}} and you should be able to push down predicates involving {{c}}. BTW, the Parquet community just made a patch release 1.8.2 that includes a fix for PARQUET-389 and we probably will upgrade to 1.8.2 in 2.2.0. Then we'll have a proper fix for this issue and remove the workaround we did while doing schema merging. > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: 
Column [b] was not found in schema! > at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at >
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15727542#comment-15727542 ] Liang-Chi Hsieh commented on SPARK-18539: - [~lian cheng], in Parquet's code, it looks like a null column can still have its ColumnChunkMetaData. It won't cause a problem even before PARQUET-389, because Parquet will check if all values in the chunk are null. PARQUET-389 resolves the case where there is no ColumnChunkMetaData for a column, i.e., the column is missing from the Parquet file. So what I am not sure about is: in a Parquet file, can a nullable column have no ColumnChunkMetaData, as you said? I'd appreciate it if you could clarify. Thanks. > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at >
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15724110#comment-15724110 ] Liang-Chi Hsieh commented on SPARK-18539: - That's cool. > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:86) >
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15724013#comment-15724013 ] Cheng Lian commented on SPARK-18539: [~xwu0226], thanks for the new use case! [~viirya], I do think this is a valid use case as long as all the missing columns are nullable. The only reason that this use case doesn't work right now is PARQUET-389. I have a vague idea about a possible cleaner fix for this issue. Will post it later. > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at >
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15723960#comment-15723960 ] Liang-Chi Hsieh commented on SPARK-18539: - Actually I am not sure if this is a valid usage. I tend to think it is not. You specify a nonexistent column, and the system reports that the column was not found in the schema. That looks reasonable to me. > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15723949#comment-15723949 ] Liang-Chi Hsieh commented on SPARK-18539: - Because we respect the user-specified schema, we won't infer the schema, and schema merging won't step in, of course. > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15723888#comment-15723888 ] Xin Wu commented on SPARK-18539: I think we will hit the issue if we use user-specified schema. Here is what I tried in spark-shell built from master branch: {code} val df = spark.range(1).coalesce(1) df.selectExpr("id AS a").write.parquet("/Users/xinwu/spark-test/data/spark-18539") val schema = StructType(Seq(StructField("a", IntegerType), StructField("b", IntegerType))) spark.read.option("mergeSchema", "true").schema(schema).parquet("/Users/xinwu/spark-test/data/spark-18539").filter("b < 0").count() {code} The exception is {code} Caused by: java.lang.IllegalArgumentException: Column [b] was not found in schema! at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) at org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:181) at org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:169) at org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:151) at org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:91) at org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:58) at org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) at org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:121) at org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:58) at org.apache.parquet.filter2.predicate.Operators$And.accept(Operators.java:308) at org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:63) at 
org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) at org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) at org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) at org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) at org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:377) {code} Here I have one parquet file that is missing column b, and I query it with the user-specified schema (a, b). > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at >
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15723781#comment-15723781 ] Cheng Lian commented on SPARK-18539: Please remind me if I missed anything important; otherwise, we can resolve this ticket as "Not a Problem". > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15723747#comment-15723747 ] Cheng Lian commented on SPARK-18539: [~v-gerasimov], [~smilegator], and [~xwu0226], after some investigation, I don't think this is a bug now. Just tested the master branch using the following test cases: {code} for { useVectorizedReader <- Seq(true, false) mergeSchema <- Seq(true, false) } { test(s"foo - mergeSchema: $mergeSchema - vectorized: $useVectorizedReader") { withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> useVectorizedReader.toString) { withTempPath { dir => val path = dir.getCanonicalPath val df = spark.range(1).coalesce(1) df.selectExpr("id AS a", "id AS b").write.parquet(path) df.selectExpr("id AS a").write.mode("append").parquet(path) assertResult(0) { spark.read .option("mergeSchema", mergeSchema.toString) .parquet(path) .filter("b < 0") .count() } } } } } {code} It turned out that this issue only happens when schema merging is turned off. This also explains why PR #9940 doesn't prevent PARQUET-389: because the trick PR #9940 employs happens during the schema merging phase. On the other hand, you can't expect missing columns to be properly read when schema merging is turned off. Therefore, I don't think it's a bug. The fix for the snippet mentioned in the ticket description is easy, just add {{.option("mergeSchema", "true")}} to enable schema merging. 
> Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at >
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15723718#comment-15723718 ] Cheng Lian commented on SPARK-18539: As commented on GitHub, there're two issues right now: # This bug also affects the normal Parquet reader code path, where {{ParquetRecordReader}} is a 3rd party class closed for modification. Therefore, we can't capture the exception there. # [PR #9940|https://github.com/apache/spark/pull/9940] should have already fixed this issue. But somehow it is broken right now. > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at >
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15723473#comment-15723473 ] Apache Spark commented on SPARK-18539: -- User 'xwu0226' has created a pull request for this issue: https://github.com/apache/spark/pull/16156 > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15723296#comment-15723296 ] Xin Wu commented on SPARK-18539: Yes. I have the fix and will submit a PR and cc everyone for review. > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15723259#comment-15723259 ] Xiao Li commented on SPARK-18539: - [~lian cheng] [~rxin] We might be able to capture and process the Parquet-issued exception in our reader. [~xwu0226] will submit a very tiny PR to resolve this issue. > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15722891#comment-15722891 ] Cheng Lian commented on SPARK-18539: Haven't looked deeply into this issue, but my hunch is that this is related to https://issues.apache.org/jira/browse/PARQUET-389, which was fixed in parquet-mr 1.9.0, while Spark is still using 1.8 (in 2.1) and 1.7 (in 2.0). > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15721507#comment-15721507 ] Xiao Li commented on SPARK-18539: - Below is the test case you can try. {noformat} Seq("parquet").foreach { format => withTempPath { path => Seq((1, "abc"), (2, "hello")).toDF("a", "b").write.format(format).save(path.toString) // user-specified schema contains nonexistent columns val schema = StructType( Seq(StructField("a", IntegerType), StructField("b", StringType), StructField("c", IntegerType))) val readDf = spark.read.schema(schema).format(format).load(path.toString) // Read the table without any filter checkAnswer(readDf, Row(1, "abc", null) :: Row(2, "hello", null) :: Nil) // Read the table with a filter on existing columns checkAnswer(readDf.filter("a < 2"), Row(1, "abc", null) :: Nil) val e = intercept[SparkException] { // Read the table with a filter on nonexistent columns readDf.filter("c < 2").show() }.getMessage assert(e.contains("Column [c] was not found in schema")) withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "false") { checkAnswer(readDf.filter("c < 2"), Nil) } } } {noformat} > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, 
nullable = true > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! > at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at >
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15721499#comment-15721499 ] Xiao Li commented on SPARK-18539: - The error is from Parquet. {noformat} 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) java.lang.IllegalArgumentException: Column [b] was not found in schema! at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) at org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) at org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) {noformat} If users specify the schema, we do not check whether the user-specified schema is right or wrong. We always respect the schema. If the schema contains non-existent columns, we push the filters down to Parquet, and then Parquet returns the above error to us. Below is the test case you can try. 
{noformat} Seq("parquet").foreach { format => withTempPath { path => Seq((1, "abc"), (2, "hello")).toDF("a", "b").write.format(format).save(path.toString) // user-specified schema contains nonexistent columns val schema = StructType( Seq(StructField("a", IntegerType), StructField("b", StringType), StructField("c", IntegerType))) val readDf = spark.read.schema(schema).format(format).load(path.toString) // Read the table without any filter checkAnswer(readDf, Row(1, "abc", null) :: Row(2, "hello", null) :: Nil) // Read the table with a filter on existing columns checkAnswer(readDf.filter("a < 2"), Row(1, "abc", null) :: Nil) val e = intercept[SparkException] { // Read the table with a filter on nonexistent columns readDf.filter("c < 2").show() }.getMessage assert(e.contains("Column [c] was not found in schema")) withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "false") { checkAnswer(readDf.filter("c < 2"), Nil) } } } {noformat} > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in 
stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! > at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at >
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15721488#comment-15721488 ] Reynold Xin commented on SPARK-18539: - Why don't we fix the parquet reader so it can tolerate non-existent columns? > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) >
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15721481#comment-15721481 ] Xiao Li commented on SPARK-18539: - The default of `spark.sql.parquet.mergeSchema` is false. To figure out the schema of parquet, the default behavior is based on a much cheaper solution {noformat} // Always tries the summary files first if users don't require a merged schema. In this case, // "_common_metadata" is more preferable than "_metadata" because it doesn't contain row // groups information, and could be much smaller for large Parquet files with lots of row // groups. If no summary file is available, falls back to some random part-file. {noformat} It might not always resolve your case. If we introduce such a parameter to always infer the schema and compare the inferred schema with user-specified schemas, we can do extra checking and processing on the schema (e.g., type promotion, schema merging between user-specified schema and actual data schema). The scope will be much bigger. This is a design decision. 
cc [~rxin] [~marmbrus] [~lian cheng] [~cloud_fan] [~ekhliang] > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at >
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15721430#comment-15721430 ] Vitaly Gerasimov commented on SPARK-18539: -- I think this is another reason why we should make user-specified schemas work correctly with Parquet and ORC; it would be great to have the same behavior for all formats. > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15721423#comment-15721423 ] Vitaly Gerasimov commented on SPARK-18539: -- If we can neglect the performance cost when we use schema merging, maybe we can also neglect it when we use user-specified schemas; possibly we should add a new parameter to enable this. I think small files are not a very big problem in this case, so the real problem is that I can't normally use user-specified schemas with the filter pushdown optimization. What do you think? > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) >
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15721414#comment-15721414 ] Xiao Li commented on SPARK-18539: - FYI, I checked the other formats, CSV and JSON work as expected. No error is issued. Just an empty data set. However, ORC is even worse. It does not tolerate it if users specify a schema with extra non-existent columns, no matter whether it is used in the filter or not. > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15721392#comment-15721392 ] Xiao Li commented on SPARK-18539: - Yeah. It is very slow when you have many many small parquet files. > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15721385#comment-15721385 ] Vitaly Gerasimov commented on SPARK-18539: -- Hmm... How does it work when we use schema merging? Isn't that also very expensive when we need to scan many files? > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15721376#comment-15721376 ] Xiao Li commented on SPARK-18539: - Basically, we have to know whether a column exists or not before executing/processing a query. Otherwise, we are unable to know whether a filter can be pushed down or not. To completely resolve the issue, we need to infer the schema every time when we need to read the external file. Schema inference is very expensive when we need to scan many many (small) files. Thus, it does not make sense for Spark to do it. Right? > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at >
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15721372#comment-15721372 ] Vitaly Gerasimov commented on SPARK-18539: -- If I turn off `spark.sql.parquet.filterPushdown` it works correctly. But I would prefer to use filter pushdown optimization in my case. > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15721316#comment-15721316 ] Xiao Li commented on SPARK-18539: - Could you turn off `spark.sql.parquet.filterPushdown`? Is that acceptable in your use case? > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) >
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15721310#comment-15721310 ] Dongjoon Hyun commented on SPARK-18539: --- The use case makes sense. I see now! > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15721300#comment-15721300 ] Vitaly Gerasimov commented on SPARK-18539: -- Thank you for your reply. I think we need to make user-specified schemas (that do not match the actual data schema) work correctly. For example, I have parquet files that are generated every day and have field `a`; at some point I add a nullable field `b`. I don't want to add that field to the previous files, but I want my application to be able to read both of them. It's like schema merging. > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803)
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15721065#comment-15721065 ] Dongjoon Hyun commented on SPARK-18539: --- Thank you so much! > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:86) >
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15720964#comment-15720964 ] Xiao Li commented on SPARK-18539: - The parquet filter push-down of Spark 2.x is different from that of Spark 1.x. Since Spark 2.0, parquet scan uses vectorization. Unfortunately, Spark 2.0.0 had a serious bug in filter push-down: https://issues.apache.org/jira/browse/SPARK-15639 After the fix was merged into Spark 2.0.1, you hit the behavior difference. We always respect user-specified schemas. When a user-specified schema does not match the actual data schema, the behaviors are not well-defined. You might hit different errors or unexpected results. The behaviors could be different when you choose different formats. Let me dig deeper into this specific issue; I might be able to provide you a better answer later. > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in 
schema! > at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at >
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15720726#comment-15720726 ] Dongjoon Hyun commented on SPARK-18539: --- It looks like the predicates are pushed down to Parquet and Parquet is screaming about that. I'm not sure the original behavior is desirable. [~smilegator], what do you think about the previous behavior? > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15699367#comment-15699367 ] Vitaly Gerasimov commented on SPARK-18539: -- [~dongjoon] I don't know. However, it works fine in previous versions of Spark, including Spark 2.0.0. I think it is normal to apply a schema with nullable fields. > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at
[jira] [Commented] (SPARK-18539) Cannot filter by nonexisting column in parquet file
[ https://issues.apache.org/jira/browse/SPARK-18539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15697793#comment-15697793 ] Dongjoon Hyun commented on SPARK-18539: --- Interesting. Is it valid? {code} sc.read .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", IntegerType, nullable = true)))) .load("/tmp/test") .createOrReplaceTempView("table") {code} > Cannot filter by nonexisting column in parquet file > --- > > Key: SPARK-18539 > URL: https://issues.apache.org/jira/browse/SPARK-18539 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.1, 2.0.2 >Reporter: Vitaly Gerasimov >Priority: Critical > > {code} > import org.apache.spark.SparkConf > import org.apache.spark.sql.SparkSession > import org.apache.spark.sql.types.DataTypes._ > import org.apache.spark.sql.types.{StructField, StructType} > val sc = SparkSession.builder().config(new > SparkConf().setMaster("local")).getOrCreate() > val jsonRDD = sc.sparkContext.parallelize(Seq("""{"a":1}""")) > sc.read > .schema(StructType(Seq(StructField("a", IntegerType)))) > .json(jsonRDD) > .write > .parquet("/tmp/test") > sc.read > .schema(StructType(Seq(StructField("a", IntegerType), StructField("b", > IntegerType, nullable = true)))) > .load("/tmp/test") > .createOrReplaceTempView("table") > sc.sql("select b from table where b is not null").show() > {code} > returns: > {code} > 16/11/22 17:43:47 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1) > java.lang.IllegalArgumentException: Column [b] was not found in schema! 
> at org.apache.parquet.Preconditions.checkArgument(Preconditions.java:55) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.getColumnDescriptor(SchemaCompatibilityValidator.java:190) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:178) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:100) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59) > at > org.apache.parquet.filter2.predicate.Operators$NotEq.accept(Operators.java:194) > at > org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59) > at > org.apache.parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40) > at > org.apache.parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126) > at > org.apache.parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46) > at > org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:110) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:367) > at > org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:341) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) > at > 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) >