[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r460374936 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala ## @@ -199,4 +205,25 @@ object OrcUtils extends Logging { s"map<${orcTypeDescriptionString(m.keyType)},${orcTypeDescriptionString(m.valueType)}>" case _ => dt.catalogString } + + /** + * @return Returns the result schema string based on the canPruneCols flag. + * resultSchemaString will be created using resultsSchema in case of + * canPruneCols is true and for canPruneCols as false value + * resultSchemaString will be created using the actual dataSchema. Review comment: @gatorsmile - This is the new helper method that we have added as part of this PR. Sure, I will update the description in the follow-up PR. Shall I raise the PR against a new Jira or with this same Jira, since this Jira is already resolved? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r460374936 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala ## @@ -199,4 +205,25 @@ object OrcUtils extends Logging { s"map<${orcTypeDescriptionString(m.keyType)},${orcTypeDescriptionString(m.valueType)}>" case _ => dt.catalogString } + + /** + * @return Returns the result schema string based on the canPruneCols flag. + * resultSchemaString will be created using resultsSchema in case of + * canPruneCols is true and for canPruneCols as false value + * resultSchemaString will be created using the actual dataSchema. Review comment: @gatorsmile - This is the new helper method that we have added as part of this PR. Sure, I will update the description of the PR. Shall I raise the PR against a new Jira or with this same Jira, since this Jira is already resolved? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r455213803 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcPartitionReaderFactory.scala ## @@ -112,25 +112,26 @@ case class OrcPartitionReaderFactory( override def buildColumnarReader(file: PartitionedFile): PartitionReader[ColumnarBatch] = { val conf = broadcastedConf.value.value -val resultSchemaString = OrcUtils.orcTypeDescriptionString(resultSchema) -OrcConf.MAPRED_INPUT_SCHEMA.setString(conf, resultSchemaString) OrcConf.IS_SCHEMA_EVOLUTION_CASE_SENSITIVE.setBoolean(conf, isCaseSensitive) val filePath = new Path(new URI(file.filePath)) val fs = filePath.getFileSystem(conf) val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) -val requestedColIdsOrEmptyFile = +val resultedColPruneInfo = Utils.tryWithResource(OrcFile.createReader(filePath, readerOptions)) { reader => OrcUtils.requestedColumnIds( isCaseSensitive, dataSchema, readDataSchema, reader, conf) } -if (requestedColIdsOrEmptyFile.isEmpty) { +if (resultedColPruneInfo.isEmpty) { new EmptyPartitionReader } else { - val requestedColIds = requestedColIdsOrEmptyFile.get ++ Array.fill(partitionSchema.length)(-1) - assert(requestedColIds.length == resultSchema.length, + val (requestedColIds, canPruneCols) = resultedColPruneInfo.get + val resultSchemaString = OrcUtils.orcResultSchemaString(canPruneCols, +dataSchema, resultSchema, partitionSchema, conf) + val requestedDataColIds = requestedColIds ++ Array.fill(partitionSchema.length)(-1) Review comment: switched the name This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r455118080 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala ## @@ -116,15 +116,16 @@ object OrcUtils extends Logging { } /** - * Returns the requested column ids from the given ORC file. Column id can be -1, which means the - * requested column doesn't exist in the ORC file. Returns None if the given ORC file is empty. + * @return Returns the requested column ids from the given ORC file and Boolean flag to use actual Review comment: updated the comment This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r455117586 ## File path: sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala ## @@ -288,4 +288,33 @@ class HiveOrcQuerySuite extends OrcQueryTest with TestHiveSingleton { } } } + + test("SPARK-32234: orc data created by the hive tables having _col fields name" + Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r455117835 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcPartitionReaderFactory.scala ## @@ -66,24 +66,28 @@ case class OrcPartitionReaderFactory( override def buildReader(file: PartitionedFile): PartitionReader[InternalRow] = { val conf = broadcastedConf.value.value -val resultSchemaString = OrcUtils.orcTypeDescriptionString(resultSchema) -OrcConf.MAPRED_INPUT_SCHEMA.setString(conf, resultSchemaString) OrcConf.IS_SCHEMA_EVOLUTION_CASE_SENSITIVE.setBoolean(conf, isCaseSensitive) val filePath = new Path(new URI(file.filePath)) val fs = filePath.getFileSystem(conf) val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) -val requestedColIdsOrEmptyFile = +val resultedColPruneInfo = Utils.tryWithResource(OrcFile.createReader(filePath, readerOptions)) { reader => OrcUtils.requestedColumnIds( isCaseSensitive, dataSchema, readDataSchema, reader, conf) } -if (requestedColIdsOrEmptyFile.isEmpty) { +if (resultedColPruneInfo.isEmpty) { new EmptyPartitionReader[InternalRow] } else { - val requestedColIds = requestedColIdsOrEmptyFile.get + val (requestedColIds, canPruneCols) = resultedColPruneInfo.get + val resultSchemaString = if (canPruneCols) { +OrcUtils.orcTypeDescriptionString(resultSchema) + } else { +OrcUtils.orcTypeDescriptionString(StructType(dataSchema.fields ++ partitionSchema.fields)) + } + OrcConf.MAPRED_INPUT_SCHEMA.setString(conf, resultSchemaString) Review comment: Added the helper method This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r455018000 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcPartitionReaderFactory.scala ## @@ -66,24 +66,28 @@ case class OrcPartitionReaderFactory( override def buildReader(file: PartitionedFile): PartitionReader[InternalRow] = { val conf = broadcastedConf.value.value -val resultSchemaString = OrcUtils.orcTypeDescriptionString(resultSchema) -OrcConf.MAPRED_INPUT_SCHEMA.setString(conf, resultSchemaString) OrcConf.IS_SCHEMA_EVOLUTION_CASE_SENSITIVE.setBoolean(conf, isCaseSensitive) val filePath = new Path(new URI(file.filePath)) val fs = filePath.getFileSystem(conf) val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) -val requestedColIdsOrEmptyFile = +val resultedColPruneInfo = Utils.tryWithResource(OrcFile.createReader(filePath, readerOptions)) { reader => OrcUtils.requestedColumnIds( isCaseSensitive, dataSchema, readDataSchema, reader, conf) } -if (requestedColIdsOrEmptyFile.isEmpty) { +if (resultedColPruneInfo.isEmpty) { new EmptyPartitionReader[InternalRow] } else { - val requestedColIds = requestedColIdsOrEmptyFile.get + val (requestedColIds, canPruneCols) = resultedColPruneInfo.get + val resultSchemaString = if (canPruneCols) { +OrcUtils.orcTypeDescriptionString(resultSchema) + } else { +OrcUtils.orcTypeDescriptionString(StructType(dataSchema.fields ++ partitionSchema.fields)) + } + OrcConf.MAPRED_INPUT_SCHEMA.setString(conf, resultSchemaString) Review comment: @cloud-fan - In this we need to return the resultSchemaString from this method Option[(Array[Int], String)] which is for else if (orcFieldNames.forall(_.startsWith("_col"))) { val resultSchemaString = OrcUtils.orcTypeDescriptionString(StructType(dataSchema.fields ++ partitionSchema.fields) else val resultSchemaString = OrcUtils.orcTypeDescriptionString(StructType(requiredSchema.fields ++ partitionSchema.fields))) since we are 
using this resultSchemaString in batchReader.initBatch( TypeDescription.fromString(resultSchemaString), resultSchema.fields, shall we make this change or create some helper method from the code in orc utils val resultSchemaString =someMethod() someMethod(): String { val resultSchemaString = if (canPruneCols) { OrcUtils.orcTypeDescriptionString(resultSchema) } else { OrcUtils.orcTypeDescriptionString(StructType(dataSchema.fields ++ partitionSchema.fields)) } OrcConf.MAPRED_INPUT_SCHEMA.setString(conf, resultSchemaString) resultSchemaString } This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r455018000 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcPartitionReaderFactory.scala ## @@ -66,24 +66,28 @@ case class OrcPartitionReaderFactory( override def buildReader(file: PartitionedFile): PartitionReader[InternalRow] = { val conf = broadcastedConf.value.value -val resultSchemaString = OrcUtils.orcTypeDescriptionString(resultSchema) -OrcConf.MAPRED_INPUT_SCHEMA.setString(conf, resultSchemaString) OrcConf.IS_SCHEMA_EVOLUTION_CASE_SENSITIVE.setBoolean(conf, isCaseSensitive) val filePath = new Path(new URI(file.filePath)) val fs = filePath.getFileSystem(conf) val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) -val requestedColIdsOrEmptyFile = +val resultedColPruneInfo = Utils.tryWithResource(OrcFile.createReader(filePath, readerOptions)) { reader => OrcUtils.requestedColumnIds( isCaseSensitive, dataSchema, readDataSchema, reader, conf) } -if (requestedColIdsOrEmptyFile.isEmpty) { +if (resultedColPruneInfo.isEmpty) { new EmptyPartitionReader[InternalRow] } else { - val requestedColIds = requestedColIdsOrEmptyFile.get + val (requestedColIds, canPruneCols) = resultedColPruneInfo.get + val resultSchemaString = if (canPruneCols) { +OrcUtils.orcTypeDescriptionString(resultSchema) + } else { +OrcUtils.orcTypeDescriptionString(StructType(dataSchema.fields ++ partitionSchema.fields)) + } + OrcConf.MAPRED_INPUT_SCHEMA.setString(conf, resultSchemaString) Review comment: @cloud-fan - In this we need to return the resultSchemaString from this method Option[(Array[Int], String)] which is for else if (orcFieldNames.forall(_.startsWith("_col"))) { val resultSchemaString = OrcUtils.orcTypeDescriptionString(StructType(dataSchema.fields ++ partitionSchema.fields) else val resultSchemaString = OrcUtils.orcTypeDescriptionString(StructType(requiredSchema.fields ++ partitionSchema.fields))) since we are 
using this resultSchemaString in batchReader.initBatch( TypeDescription.fromString(resultSchemaString), resultSchema.fields, shall we make this change or create some helper method from the code in orc utils val resultSchemaString =someMethod() someMethod(): String { val resultSchemaString = if (canPruneCols) { OrcUtils.orcTypeDescriptionString(resultSchema) } else { OrcUtils.orcTypeDescriptionString(StructType(dataSchema.fields ++ partitionSchema.fields)) } OrcConf.MAPRED_INPUT_SCHEMA.setString(conf, resultSchemaString) } This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r455018000 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcPartitionReaderFactory.scala ## @@ -66,24 +66,28 @@ case class OrcPartitionReaderFactory( override def buildReader(file: PartitionedFile): PartitionReader[InternalRow] = { val conf = broadcastedConf.value.value -val resultSchemaString = OrcUtils.orcTypeDescriptionString(resultSchema) -OrcConf.MAPRED_INPUT_SCHEMA.setString(conf, resultSchemaString) OrcConf.IS_SCHEMA_EVOLUTION_CASE_SENSITIVE.setBoolean(conf, isCaseSensitive) val filePath = new Path(new URI(file.filePath)) val fs = filePath.getFileSystem(conf) val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) -val requestedColIdsOrEmptyFile = +val resultedColPruneInfo = Utils.tryWithResource(OrcFile.createReader(filePath, readerOptions)) { reader => OrcUtils.requestedColumnIds( isCaseSensitive, dataSchema, readDataSchema, reader, conf) } -if (requestedColIdsOrEmptyFile.isEmpty) { +if (resultedColPruneInfo.isEmpty) { new EmptyPartitionReader[InternalRow] } else { - val requestedColIds = requestedColIdsOrEmptyFile.get + val (requestedColIds, canPruneCols) = resultedColPruneInfo.get + val resultSchemaString = if (canPruneCols) { +OrcUtils.orcTypeDescriptionString(resultSchema) + } else { +OrcUtils.orcTypeDescriptionString(StructType(dataSchema.fields ++ partitionSchema.fields)) + } + OrcConf.MAPRED_INPUT_SCHEMA.setString(conf, resultSchemaString) Review comment: @cloud-fan - In this we need to return the resultSchemaString from this method Option[(Array[Int], String)] which is for else if (orcFieldNames.forall(_.startsWith("_col"))) { val resultSchemaString = OrcUtils.orcTypeDescriptionString(StructType(dataSchema.fields ++ partitionSchema.fields) else val resultSchemaString = OrcUtils.orcTypeDescriptionString(StructType(requiredSchema.fields ++ partitionSchema.fields))) since we are 
using this resultSchemaString in batchReader.initBatch( TypeDescription.fromString(resultSchemaString), resultSchema.fields, This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454901819 ## File path: sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala ## @@ -288,4 +288,35 @@ class HiveOrcQuerySuite extends OrcQueryTest with TestHiveSingleton { } } } + + test("SPARK-32234: orc data created by the hive tables having _col fields name" + +" for ORC_IMPLEMENTATION") { +Seq("native", "hive").foreach { orcImpl => + Seq("false", "true").foreach { vectorized => +withSQLConf( + SQLConf.ORC_IMPLEMENTATION.key -> orcImpl, + SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> vectorized) { + withTempPath { dir => Review comment: Removed it This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454891107 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala ## @@ -116,47 +116,53 @@ object OrcUtils extends Logging { } /** - * Returns the requested column ids from the given ORC file. Column id can be -1, which means the - * requested column doesn't exist in the ORC file. Returns None if the given ORC file is empty. + * @return Returns the requested column ids from the given ORC file and Boolean flag to use actual + * schema or result schema. Column id can be -1, which means the requested column doesn't + * exist in the ORC file. Returns None if the given ORC file is empty. */ def requestedColumnIds( isCaseSensitive: Boolean, dataSchema: StructType, requiredSchema: StructType, reader: Reader, - conf: Configuration): Option[Array[Int]] = { + conf: Configuration): (Option[Array[Int]], Boolean) = { +var canPruneCols = true Review comment: Removed it This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454891001 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala ## @@ -116,47 +116,53 @@ object OrcUtils extends Logging { } /** - * Returns the requested column ids from the given ORC file. Column id can be -1, which means the - * requested column doesn't exist in the ORC file. Returns None if the given ORC file is empty. + * @return Returns the requested column ids from the given ORC file and Boolean flag to use actual + * schema or result schema. Column id can be -1, which means the requested column doesn't + * exist in the ORC file. Returns None if the given ORC file is empty. */ def requestedColumnIds( isCaseSensitive: Boolean, dataSchema: StructType, requiredSchema: StructType, reader: Reader, - conf: Configuration): Option[Array[Int]] = { + conf: Configuration): (Option[Array[Int]], Boolean) = { Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454890891 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala ## @@ -179,12 +179,17 @@ class OrcFileFormat val fs = filePath.getFileSystem(conf) val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) - val requestedColIdsOrEmptyFile = + val (requestedColIdsOrEmptyFile, canPruneCols) = Utils.tryWithResource(OrcFile.createReader(filePath, readerOptions)) { reader => OrcUtils.requestedColumnIds( isCaseSensitive, dataSchema, requiredSchema, reader, conf) } + if (!canPruneCols) { Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454573994 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala ## @@ -116,47 +116,53 @@ object OrcUtils extends Logging { } /** - * Returns the requested column ids from the given ORC file. Column id can be -1, which means the - * requested column doesn't exist in the ORC file. Returns None if the given ORC file is empty. + * @return Returns the requested column ids from the given ORC file and Boolean flag to use actual + * schema or result schema. Column id can be -1, which means the requested column doesn't + * exist in the ORC file. Returns None if the given ORC file is empty. */ def requestedColumnIds( isCaseSensitive: Boolean, dataSchema: StructType, requiredSchema: StructType, reader: Reader, - conf: Configuration): Option[Array[Int]] = { + conf: Configuration): (Option[Array[Int]], Boolean) = { Review comment: I did similar change by returning the one attribute in the return type. I was asked to make it two attributes to return from this method Please find the review comment for that - https://github.com/apache/spark/pull/29045#discussion_r453335630 So I change it after this review comment This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454570883 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala ## @@ -116,47 +116,53 @@ object OrcUtils extends Logging { } /** - * Returns the requested column ids from the given ORC file. Column id can be -1, which means the - * requested column doesn't exist in the ORC file. Returns None if the given ORC file is empty. + * @return Returns the requested column ids from the given ORC file and Boolean flag to use actual + * schema or result schema. Column id can be -1, which means the requested column doesn't + * exist in the ORC file. Returns None if the given ORC file is empty. */ def requestedColumnIds( isCaseSensitive: Boolean, dataSchema: StructType, requiredSchema: StructType, reader: Reader, - conf: Configuration): Option[Array[Int]] = { + conf: Configuration): (Option[Array[Int]], Boolean) = { +var sendActualSchema = false val orcFieldNames = reader.getSchema.getFieldNames.asScala if (orcFieldNames.isEmpty) { // SPARK-8501: Some old empty ORC files always have an empty schema stored in their footer. - None + (None, sendActualSchema) } else { if (orcFieldNames.forall(_.startsWith("_col"))) { // This is a ORC file written by Hive, no field names in the physical schema, assume the // physical schema maps to the data scheme by index. 
assert(orcFieldNames.length <= dataSchema.length, "The given data schema " + s"${dataSchema.catalogString} has less fields than the actual ORC physical schema, " + "no idea which columns were dropped, fail to read.") -Some(requiredSchema.fieldNames.map { name => +(Some(requiredSchema.fieldNames.map { name => val index = dataSchema.fieldIndex(name) if (index < orcFieldNames.length) { +// for ORC file written by Hive, no field names +// in the physical schema, there is a need to send the +// entire dataSchema instead of required schema +sendActualSchema = true index } else { -1 Review comment: added for -1 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454559642 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala ## @@ -116,47 +116,53 @@ object OrcUtils extends Logging { } /** - * Returns the requested column ids from the given ORC file. Column id can be -1, which means the - * requested column doesn't exist in the ORC file. Returns None if the given ORC file is empty. + * @return Returns the requested column ids from the given ORC file and Boolean flag to use actual + * schema or result schema. Column id can be -1, which means the requested column doesn't + * exist in the ORC file. Returns None if the given ORC file is empty. */ def requestedColumnIds( isCaseSensitive: Boolean, dataSchema: StructType, requiredSchema: StructType, reader: Reader, - conf: Configuration): Option[Array[Int]] = { + conf: Configuration): (Option[Array[Int]], Boolean) = { +var sendActualSchema = false val orcFieldNames = reader.getSchema.getFieldNames.asScala if (orcFieldNames.isEmpty) { // SPARK-8501: Some old empty ORC files always have an empty schema stored in their footer. - None + (None, sendActualSchema) } else { if (orcFieldNames.forall(_.startsWith("_col"))) { // This is a ORC file written by Hive, no field names in the physical schema, assume the // physical schema maps to the data scheme by index. 
assert(orcFieldNames.length <= dataSchema.length, "The given data schema " + s"${dataSchema.catalogString} has less fields than the actual ORC physical schema, " + "no idea which columns were dropped, fail to read.") -Some(requiredSchema.fieldNames.map { name => +(Some(requiredSchema.fieldNames.map { name => val index = dataSchema.fieldIndex(name) if (index < orcFieldNames.length) { +// for ORC file written by Hive, no field names +// in the physical schema, there is a need to send the +// entire dataSchema instead of required schema +sendActualSchema = true index } else { -1 Review comment: for -1 also we need to send true This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454558890 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcPartitionReaderFactory.scala ## @@ -74,7 +74,7 @@ case class OrcPartitionReaderFactory( val fs = filePath.getFileSystem(conf) val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) -val requestedColIdsOrEmptyFile = +val (requestedColIdsOrEmptyFile, _) = Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454524703 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala ## @@ -116,47 +116,53 @@ object OrcUtils extends Logging { } /** - * Returns the requested column ids from the given ORC file. Column id can be -1, which means the - * requested column doesn't exist in the ORC file. Returns None if the given ORC file is empty. + * @return Returns the requested column ids from the given ORC file and Boolean flag to use actual + * schema or result schema. Column id can be -1, which means the requested column doesn't + * exist in the ORC file. Returns None if the given ORC file is empty. */ def requestedColumnIds( isCaseSensitive: Boolean, dataSchema: StructType, requiredSchema: StructType, reader: Reader, - conf: Configuration): Option[Array[Int]] = { + conf: Configuration): (Option[Array[Int]], Boolean) = { +var sendActualSchema = false val orcFieldNames = reader.getSchema.getFieldNames.asScala if (orcFieldNames.isEmpty) { Review comment: This will not give the correct result. After changing the code as suggested in the above comment, the results show empty rows: ``` scala> val u = """select date_dim.d_date_id from date_dim limit 5""" u: String = select date_dim.d_date_id from date_dim limit 5 scala> spark.sql(u).collect res1: Array[org.apache.spark.sql.Row] = Array() scala> val u = """select * from date_dim limit 5""" u: String = select * from date_dim limit 5 scala> spark.sql(u).collect res2: Array[org.apache.spark.sql.Row] = Array() ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454524703 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala ## @@ -116,47 +116,53 @@ object OrcUtils extends Logging { } /** - * Returns the requested column ids from the given ORC file. Column id can be -1, which means the - * requested column doesn't exist in the ORC file. Returns None if the given ORC file is empty. + * @return Returns the requested column ids from the given ORC file and Boolean flag to use actual + * schema or result schema. Column id can be -1, which means the requested column doesn't + * exist in the ORC file. Returns None if the given ORC file is empty. */ def requestedColumnIds( isCaseSensitive: Boolean, dataSchema: StructType, requiredSchema: StructType, reader: Reader, - conf: Configuration): Option[Array[Int]] = { + conf: Configuration): (Option[Array[Int]], Boolean) = { +var sendActualSchema = false val orcFieldNames = reader.getSchema.getFieldNames.asScala if (orcFieldNames.isEmpty) { Review comment: This will not give correct result, On changing the above comment, results are showing empty rows ``` scala> val u = """select date_dim.d_date_id from date_dim limit 5""" u: String = select date_dim.d_date_id from date_dim limit 5 scala> spark.sql(u).collect res1: Array[org.apache.spark.sql.Row] = Array() scala> val u = """select * from date_dim limit 5""" u: String = select * from date_dim limit 5 scala> spark.sql(u).collect res2: Array[org.apache.spark.sql.Row] = Array() ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454515273 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala ## @@ -160,12 +160,12 @@ class OrcFileFormat } val resultSchema = StructType(requiredSchema.fields ++ partitionSchema.fields) +val actualSchema = StructType(dataSchema.fields ++ partitionSchema.fields) val sqlConf = sparkSession.sessionState.conf val enableVectorizedReader = supportBatch(sparkSession, resultSchema) val capacity = sqlConf.orcVectorizedReaderBatchSize -val resultSchemaString = OrcUtils.orcTypeDescriptionString(resultSchema) -OrcConf.MAPRED_INPUT_SCHEMA.setString(hadoopConf, resultSchemaString) Review comment: @cloud-fan - Do we need to make the similar change in these file OrcPartitionReaderFactory override def buildReader(file: PartitionedFile): PartitionReader[InternalRow] = { and override def buildColumnarReader(file: PartitionedFile): PartitionReader[ColumnarBatch] = { This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454491048 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala ## @@ -160,12 +160,12 @@ class OrcFileFormat } val resultSchema = StructType(requiredSchema.fields ++ partitionSchema.fields) +val actualSchema = StructType(dataSchema.fields ++ partitionSchema.fields) val sqlConf = sparkSession.sessionState.conf val enableVectorizedReader = supportBatch(sparkSession, resultSchema) val capacity = sqlConf.orcVectorizedReaderBatchSize -val resultSchemaString = OrcUtils.orcTypeDescriptionString(resultSchema) -OrcConf.MAPRED_INPUT_SCHEMA.setString(hadoopConf, resultSchemaString) Review comment: will this not help I have already added this in the code ``` if (sendActualSchema) { resultSchemaString = OrcUtils.orcTypeDescriptionString(actualSchema) } OrcConf.MAPRED_INPUT_SCHEMA.setString(conf, resultSchemaString) ``` here the resultschema is having the value of actual schema This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454491048 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala ## @@ -160,12 +160,12 @@ class OrcFileFormat } val resultSchema = StructType(requiredSchema.fields ++ partitionSchema.fields) +val actualSchema = StructType(dataSchema.fields ++ partitionSchema.fields) val sqlConf = sparkSession.sessionState.conf val enableVectorizedReader = supportBatch(sparkSession, resultSchema) val capacity = sqlConf.orcVectorizedReaderBatchSize -val resultSchemaString = OrcUtils.orcTypeDescriptionString(resultSchema) -OrcConf.MAPRED_INPUT_SCHEMA.setString(hadoopConf, resultSchemaString) Review comment: will this not help I have already added this in the code `OrcConf.MAPRED_INPUT_SCHEMA.setString(conf, resultSchemaString)` here the resultschema is having the value of actual schema This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454493959 ## File path: sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala ## @@ -288,4 +288,35 @@ class HiveOrcQuerySuite extends OrcQueryTest with TestHiveSingleton { } } } + + test("SPARK-32234: orc data created by the hive tables having _col fields name" + +" for ORC_IMPLEMENTATION") { +Seq("native", "hive").foreach { orcImpl => + Seq("false", "true").foreach { vectorized => +withSQLConf( + SQLConf.ORC_IMPLEMENTATION.key -> orcImpl, + SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> vectorized) { + withTempPath { dir => +withTable("test_hive_orc_impl") { + spark.sql( +s""" + | CREATE TABLE test_hive_orc_impl + | (_col1 INT, _col2 STRING, _col3 INT) Review comment: yes, this can be reproduced by this also, but I have attached the date_dim tpcds orc data in the jira itself This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454491048 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala ## @@ -160,12 +160,12 @@ class OrcFileFormat } val resultSchema = StructType(requiredSchema.fields ++ partitionSchema.fields) +val actualSchema = StructType(dataSchema.fields ++ partitionSchema.fields) val sqlConf = sparkSession.sessionState.conf val enableVectorizedReader = supportBatch(sparkSession, resultSchema) val capacity = sqlConf.orcVectorizedReaderBatchSize -val resultSchemaString = OrcUtils.orcTypeDescriptionString(resultSchema) -OrcConf.MAPRED_INPUT_SCHEMA.setString(hadoopConf, resultSchemaString) Review comment: will this not help I have already added this in the code `OrcConf.MAPRED_INPUT_SCHEMA.setString(conf, resultSchemaString)` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454410994 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala ## @@ -116,47 +116,53 @@ object OrcUtils extends Logging { } /** - * Returns the requested column ids from the given ORC file. Column id can be -1, which means the - * requested column doesn't exist in the ORC file. Returns None if the given ORC file is empty. + * @return Returns the requested column ids from the given ORC file and Boolean flag to use actual + * schema or result schema. Column id can be -1, which means the requested column doesn't + * exist in the ORC file. Returns None if the given ORC file is empty. */ def requestedColumnIds( isCaseSensitive: Boolean, dataSchema: StructType, requiredSchema: StructType, reader: Reader, - conf: Configuration): Option[Array[Int]] = { + conf: Configuration): (Option[Array[Int]], Boolean) = { +var sendActualSchema = false val orcFieldNames = reader.getSchema.getFieldNames.asScala Review comment: In this case code follows this path https://github.com/apache/spark/blob/d6a68e0b67ff7de58073c176dd097070e88ac831/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala#L158 Now requiredSchema (result schema) is used for getting the column id instead of entire dataschema . So in this scenario required requiredSchema(requiredSchema) is used to create the VectorizedRowBatchWrap wrap = new VectorizedRowBatchWrap(orcSchema.createRowBatch(capacity)); Where as in failure case column id got from the dataschema and VectorizedRowBatchWrap is created using the requiredSchema (result schema). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454359574 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala ## @@ -116,47 +116,53 @@ object OrcUtils extends Logging { } /** - * Returns the requested column ids from the given ORC file. Column id can be -1, which means the - * requested column doesn't exist in the ORC file. Returns None if the given ORC file is empty. + * @return Returns the requested column ids from the given ORC file and Boolean flag to use actual + * schema or result schema. Column id can be -1, which means the requested column doesn't + * exist in the ORC file. Returns None if the given ORC file is empty. */ def requestedColumnIds( isCaseSensitive: Boolean, dataSchema: StructType, requiredSchema: StructType, reader: Reader, - conf: Configuration): Option[Array[Int]] = { + conf: Configuration): (Option[Array[Int]], Boolean) = { +var sendActualSchema = false val orcFieldNames = reader.getSchema.getFieldNames.asScala Review comment: yes, if that would be the case then it will be same as how the data created from the spark Application using orc. And it follows the code flow as its working today for spark orc data source tables. 
So this only failing for the orc data created by hive, So if I create the data from this using the spark orc datasource , This error is not coming ``` val u = """select * from date_dim limit 5""" scala> spark.sql(u).write.format("orc").save("/Users/tpcdsdata/testFS/testorc") val table = """CREATE TABLE `date_dim345` ( | `d_date_sk` INT, | `d_date_id` STRING, | `d_date` TIMESTAMP, | `d_month_seq` INT, | `d_week_seq` INT, | `d_quarter_seq` INT, | `d_year` INT, | `d_dow` INT, | `d_moy` INT, | `d_dom` INT, | `d_qoy` INT, | `d_fy_year` INT, | `d_fy_quarter_seq` INT, | `d_fy_week_seq` INT, | `d_day_name` STRING, | `d_quarter_name` STRING, | `d_holiday` STRING, | `d_weekend` STRING, | `d_following_holiday` STRING, | `d_first_dom` INT, | `d_last_dom` INT, | `d_same_day_ly` INT, | `d_same_day_lq` INT, | `d_current_day` STRING, | `d_current_week` STRING, | `d_current_month` STRING, | `d_current_quarter` STRING, | `d_current_year` STRING) | USING orc | LOCATION '/Users/tpcdsdata/testFS/testorc/'""" spark.sql(table).collect val u = """select d_date_id from date_dim345 limit 5""" ``` Now this is having the correct value of physical orc file not _col1, _col2 etc ``` orcFieldNames = {Wrappers$JListWrapper@19940} "Wrappers$JListWrapper" size = 28 1 = "d_date_id" 9 = "d_dom" 7 = "d_dow" 11 = "d_fy_year" 5 = "d_quarter_seq" 0 = "d_date_sk" 8 = "d_moy" 10 = "d_qoy" 4 = "d_week_seq" 6 = "d_year" 3 = "d_month_seq" 2 = "d_date" ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454359574 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala ## @@ -116,47 +116,53 @@ object OrcUtils extends Logging { } /** - * Returns the requested column ids from the given ORC file. Column id can be -1, which means the - * requested column doesn't exist in the ORC file. Returns None if the given ORC file is empty. + * @return Returns the requested column ids from the given ORC file and Boolean flag to use actual + * schema or result schema. Column id can be -1, which means the requested column doesn't + * exist in the ORC file. Returns None if the given ORC file is empty. */ def requestedColumnIds( isCaseSensitive: Boolean, dataSchema: StructType, requiredSchema: StructType, reader: Reader, - conf: Configuration): Option[Array[Int]] = { + conf: Configuration): (Option[Array[Int]], Boolean) = { +var sendActualSchema = false val orcFieldNames = reader.getSchema.getFieldNames.asScala Review comment: yes, if that would be the case then it will be same as how the data created from the spark Application using orc. And it follows the code flow as its working today for spark orc data source tables. 
So this only failing for the orc data created by hive, So if I create the data from this using the spark orc datasource , This error is not coming ``` val u = """select * from date_dim limit 5""" scala> spark.sql(u).write.format("orc").save("/Users/tpcdsdata/testFS/testorc") val table = """CREATE TABLE `date_dim345` ( | `d_date_sk` INT, | `d_date_id` STRING, | `d_date` TIMESTAMP, | `d_month_seq` INT, | `d_week_seq` INT, | `d_quarter_seq` INT, | `d_year` INT, | `d_dow` INT, | `d_moy` INT, | `d_dom` INT, | `d_qoy` INT, | `d_fy_year` INT, | `d_fy_quarter_seq` INT, | `d_fy_week_seq` INT, | `d_day_name` STRING, | `d_quarter_name` STRING, | `d_holiday` STRING, | `d_weekend` STRING, | `d_following_holiday` STRING, | `d_first_dom` INT, | `d_last_dom` INT, | `d_same_day_ly` INT, | `d_same_day_lq` INT, | `d_current_day` STRING, | `d_current_week` STRING, | `d_current_month` STRING, | `d_current_quarter` STRING, | `d_current_year` STRING) | USING orc | LOCATION '/Users/tpcdsdata/testFS/testorc/'""" spark.sql(table).collect val u = """select date_dim.d_date_id from date_dim345 limit 5""" ``` Now this is having the correct value of physical orc file not _col1, _col2 etc ``` orcFieldNames = {Wrappers$JListWrapper@19940} "Wrappers$JListWrapper" size = 28 1 = "d_date_id" 9 = "d_dom" 7 = "d_dow" 11 = "d_fy_year" 5 = "d_quarter_seq" 0 = "d_date_sk" 8 = "d_moy" 10 = "d_qoy" 4 = "d_week_seq" 6 = "d_year" 3 = "d_month_seq" 2 = "d_date" ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454234413 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala ## @@ -116,47 +116,53 @@ object OrcUtils extends Logging { } /** - * Returns the requested column ids from the given ORC file. Column id can be -1, which means the - * requested column doesn't exist in the ORC file. Returns None if the given ORC file is empty. + * @return Returns the requested column ids from the given ORC file and Boolean flag to use actual + * schema or result schema. Column id can be -1, which means the requested column doesn't + * exist in the ORC file. Returns None if the given ORC file is empty. */ def requestedColumnIds( isCaseSensitive: Boolean, dataSchema: StructType, requiredSchema: StructType, reader: Reader, - conf: Configuration): Option[Array[Int]] = { + conf: Configuration): (Option[Array[Int]], Boolean) = { +var sendActualSchema = false val orcFieldNames = reader.getSchema.getFieldNames.asScala Review comment: In this code , this is the place where we are getting exception https://github.com/apache/spark/blob/d6a68e0b67ff7de58073c176dd097070e88ac831/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java#L183 So the code is creating a wrap VectorizedRowBatchWrap using the result schema which is ``` result = {StructType@16990} "StructType" size = 1 0 = {StructField@17128} "StructField(d_year,IntegerType,true)" ``` https://github.com/apache/spark/blob/d6a68e0b67ff7de58073c176dd097070e88ac831/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java#L149 Now wrap is created with size 1 and in this code ` code orcVectorWrappers[i] = new OrcColumnVector(dt, wrap.batch().cols[colId]);` value of colId will be 6 (which is the requestedColumnIds will be [6]) Now the size of wrap is 1 and code is requesting the fetch 
wrap.batch().cols[6], so here it is getting an ArrayIndexOutOfBoundsException This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454128710 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala ## @@ -116,47 +116,53 @@ object OrcUtils extends Logging { } /** - * Returns the requested column ids from the given ORC file. Column id can be -1, which means the - * requested column doesn't exist in the ORC file. Returns None if the given ORC file is empty. + * @return Returns the requested column ids from the given ORC file and Boolean flag to use actual + * schema or result schema. Column id can be -1, which means the requested column doesn't + * exist in the ORC file. Returns None if the given ORC file is empty. */ def requestedColumnIds( isCaseSensitive: Boolean, dataSchema: StructType, requiredSchema: StructType, reader: Reader, - conf: Configuration): Option[Array[Int]] = { + conf: Configuration): (Option[Array[Int]], Boolean) = { +var sendActualSchema = false val orcFieldNames = reader.getSchema.getFieldNames.asScala Review comment: For this example query ``` val u = """select date_dim.d_year from date_dim limit 5""" spark.sql(u).collect ``` value of the orcFieldNames ``` val of orcFieldNames = {Wrappers$JListWrapper@13621} "Wrappers$JListWrapper" size = 28 0 = "_col0" 1 = "_col1" 2 = "_col2" 3 = "_col3" 4 = "_col4" 5 = "_col5" 6 = "_col6" 7 = "_col7" 8 = "_col8" 9 = "_col9" 10 = "_col10" ``` value of dataSchema ``` value of dataSchema = {StructType@13616} "StructType" size = 28 0 = {StructField@16487} "StructField(d_date_sk,IntegerType,true)" 1 = {StructField@16488} "StructField(d_date_id,StringType,true)" 2 = {StructField@16489} "StructField(d_date,TimestampType,true)" 3 = {StructField@16490} "StructField(d_month_seq,IntegerType,true)" 4 = {StructField@16491} "StructField(d_week_seq,IntegerType,true)" 5 = {StructField@16492} "StructField(d_quarter_seq,IntegerType,true)" 6 = {StructField@16493} 
"StructField(d_year,IntegerType,true)" 7 = {StructField@16494} "StructField(d_dow,IntegerType,true)" 8 = {StructField@16495} "StructField(d_moy,IntegerType,true)" 9 = {StructField@16496} "StructField(d_dom,IntegerType,true)" 10 = {StructField@16497} "StructField(d_qoy,IntegerType,true)" ``` value of requiredSchema ``` result = {StructType@16990} "StructType" size = 1 0 = {StructField@17128} "StructField(d_year,IntegerType,true)" ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r454120426 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala ## @@ -179,12 +179,17 @@ class OrcFileFormat val fs = filePath.getFileSystem(conf) val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) - val requestedColIdsOrEmptyFile = + val (requestedColIdsOrEmptyFile, sendActualSchema) = Utils.tryWithResource(OrcFile.createReader(filePath, readerOptions)) { reader => OrcUtils.requestedColumnIds( isCaseSensitive, dataSchema, requiredSchema, reader, conf) } + if (sendActualSchema) { +resultSchemaString = OrcUtils.orcTypeDescriptionString(actualSchema) Review comment: Yes, for this case we cannot do it, so as a result we have to give the actual schema This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r453453601 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcPartitionReaderFactory.scala ## @@ -80,10 +80,10 @@ case class OrcPartitionReaderFactory( isCaseSensitive, dataSchema, readDataSchema, reader, conf) } -if (requestedColIdsOrEmptyFile.isEmpty) { +if (requestedColIdsOrEmptyFile._1.isEmpty) { Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r453437981 ## File path: sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala ## @@ -288,4 +288,56 @@ class HiveOrcQuerySuite extends OrcQueryTest with TestHiveSingleton { } } } + + test("SPARK-32234: orc data created by the hive tables having _col fields" + +" name for vectorized reader") { +Seq(false, true).foreach { vectorized => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> vectorized.toString) { +withTable("test_hive_orc_vect_read") { + spark.sql( +""" + | CREATE TABLE test_hive_orc_vect_read + | (_col1 INT, _col2 STRING, _col3 INT) + | USING orc +""".stripMargin) + spark.sql( +""" + | INSERT INTO + | test_hive_orc_vect_read + | VALUES(9, '12', 2020) +""".stripMargin) + + val df = spark.sql("SELECT _col2 FROM test_hive_orc_vect_read") + checkAnswer(df, Row("12")) +} + } +} + } + + test("SPARK-32234: orc data created by the hive tables having _col fields name" + Review comment: Merged the test This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r453437981 ## File path: sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala ## @@ -288,4 +288,56 @@ class HiveOrcQuerySuite extends OrcQueryTest with TestHiveSingleton { } } } + + test("SPARK-32234: orc data created by the hive tables having _col fields" + +" name for vectorized reader") { +Seq(false, true).foreach { vectorized => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> vectorized.toString) { +withTable("test_hive_orc_vect_read") { + spark.sql( +""" + | CREATE TABLE test_hive_orc_vect_read + | (_col1 INT, _col2 STRING, _col3 INT) + | USING orc +""".stripMargin) + spark.sql( +""" + | INSERT INTO + | test_hive_orc_vect_read + | VALUES(9, '12', 2020) +""".stripMargin) + + val df = spark.sql("SELECT _col2 FROM test_hive_orc_vect_read") + checkAnswer(df, Row("12")) +} + } +} + } + + test("SPARK-32234: orc data created by the hive tables having _col fields name" + Review comment: Merge the test This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r453438056 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala ## @@ -185,10 +185,15 @@ class OrcFileFormat isCaseSensitive, dataSchema, requiredSchema, reader, conf) } - if (requestedColIdsOrEmptyFile.isEmpty) { + if (requestedColIdsOrEmptyFile._2) { +resultSchemaString = OrcUtils.orcTypeDescriptionString(actualSchema) + } + OrcConf.MAPRED_INPUT_SCHEMA.setString(conf, resultSchemaString) + + if (requestedColIdsOrEmptyFile._1.isEmpty) { Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r453437937 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala ## @@ -116,47 +116,53 @@ object OrcUtils extends Logging { } /** - * Returns the requested column ids from the given ORC file. Column id can be -1, which means the - * requested column doesn't exist in the ORC file. Returns None if the given ORC file is empty. + * Returns the requested column ids from the given ORC file and Boolean flag to use actual + * schema or result schema. Column id can be -1, which means the requested column doesn't + * exist in the ORC file. Returns None if the given ORC file is empty. Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r453278778 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala ## @@ -181,10 +183,19 @@ class OrcFileFormat val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) val requestedColIdsOrEmptyFile = Utils.tryWithResource(OrcFile.createReader(filePath, readerOptions)) { reader => + // for ORC file written by Hive, no field names + // in the physical schema, there is a need to send the + // entire dataSchema instead of required schema + val orcFieldNames = reader.getSchema.getFieldNames.asScala + if (orcFieldNames.forall(_.startsWith("_col"))) { +resultSchemaString = OrcUtils.orcTypeDescriptionString(actualSchema) + } Review comment: I have refactored the code an include this in OrcUtils.requestedColumnIds ## File path: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReaderSuite.scala ## @@ -77,4 +77,44 @@ class OrcColumnarBatchReaderSuite extends QueryTest with SharedSparkSession { assert(p1.getUTF8String(0) === partitionValues.getUTF8String(0)) } } + + test("SPARK-32234: orc data created by the hive tables having _col fields name") { +withTable("test_date_hive_orc") { + spark.sql( +""" + | CREATE TABLE test_date_hive_orc + | (_col1 INT, _col2 STRING, _col3 INT) + | USING orc +""".stripMargin) + spark.sql( +""" + | INSERT INTO + | test_date_hive_orc + | VALUES(9, '12', 2020) +""".stripMargin) + + val df = spark.sql("SELECT _col2 from test_date_hive_orc") + checkAnswer(df, Row("12")) +} + } + + test("SPARK-32234: orc data created by the spark having proper fields name") { Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r453171980 ## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala ## @@ -181,10 +183,19 @@ class OrcFileFormat val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) val requestedColIdsOrEmptyFile = Utils.tryWithResource(OrcFile.createReader(filePath, readerOptions)) { reader => + // for ORC file written by Hive, no field names + // in the physical schema, there is a need to send the + // entire dataSchema instead of required schema + val orcFieldNames = reader.getSchema.getFieldNames.asScala + if (orcFieldNames.forall(_.startsWith("_col"))) { Review comment: So this is for an ORC file written by Hive, which has no field names in the physical schema. In that case it has names like _col1, _col2, etc. Check this code for reference https://github.com/apache/spark/blob/84db660ebef4f9c543ab2709103c4542b407a829/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala#L133 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r453171512 ## File path: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReaderSuite.scala ## @@ -77,4 +77,43 @@ class OrcColumnarBatchReaderSuite extends QueryTest with SharedSparkSession { assert(p1.getUTF8String(0) === partitionValues.getUTF8String(0)) } } + + test("SPARK-32234: orc data created by the hive tables having _col fields name") { +var error: Throwable = null +withTable("test_date_hive_orc") { + spark.sql( +""" + |CREATE TABLE `test_date_hive_orc` Review comment: not required ## File path: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReaderSuite.scala ## @@ -77,4 +77,43 @@ class OrcColumnarBatchReaderSuite extends QueryTest with SharedSparkSession { assert(p1.getUTF8String(0) === partitionValues.getUTF8String(0)) } } + + test("SPARK-32234: orc data created by the hive tables having _col fields name") { +var error: Throwable = null +withTable("test_date_hive_orc") { + spark.sql( +""" + |CREATE TABLE `test_date_hive_orc` + | (`_col1` INT,`_col2` STRING,`_col3` INT) + | USING orc +""".stripMargin) + spark.sql( +"""insert into + | test_date_hive_orc + | values(9, '12', 2020) +""".stripMargin) + + val df = spark.sql("select _col2 from test_date_hive_orc") + checkAnswer(df, Row("12")) +} + } + + test("SPARK-32234: orc data created by the spark having proper fields name") { +withTable("test_date_spark_orc") { + spark.sql( +""" + |CREATE TABLE `test_date_spark_orc` + | (`d_date_sk` INT,`d_date_id` STRING,`d_year` INT) + | USING orc +""".stripMargin) + spark.sql( +"""insert into + | test_date_spark_orc + | values(9, '12', 2020) +""".stripMargin) + + val df = spark.sql("select d_date_id from test_date_spark_orc") Review comment: Done This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r453171504 ## File path: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReaderSuite.scala ## @@ -77,4 +77,43 @@ class OrcColumnarBatchReaderSuite extends QueryTest with SharedSparkSession { assert(p1.getUTF8String(0) === partitionValues.getUTF8String(0)) } } + + test("SPARK-32234: orc data created by the hive tables having _col fields name") { +var error: Throwable = null Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r453164571 ## File path: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReaderSuite.scala ## @@ -77,4 +77,64 @@ class OrcColumnarBatchReaderSuite extends QueryTest with SharedSparkSession { assert(p1.getUTF8String(0) === partitionValues.getUTF8String(0)) } } + + test("SPARK-32234: orc data created by the hive tables having _col fields name") { +var error: Throwable = null +withTable("test_date_hive_orc") { + spark.sql( +""" + |CREATE TABLE `test_date_hive_orc` + | (`_col1` INT,`_col2` STRING,`_col3` INT) + | USING orc +""".stripMargin) + spark.sql( +"""insert into + | test_date_hive_orc + | values(9, '12', 2020) +""".stripMargin) + try { +val df = spark.sql("select _col2 from test_date_hive_orc") +checkAnswer(df, Row("12")) + } catch { +case e: Throwable => + error = e + } + assert(error == null) Review comment: Yes, not required; it is already handled in the framework by checkAnswer. I have removed it. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r453164571 ## File path: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReaderSuite.scala ## @@ -77,4 +77,64 @@ class OrcColumnarBatchReaderSuite extends QueryTest with SharedSparkSession { assert(p1.getUTF8String(0) === partitionValues.getUTF8String(0)) } } + + test("SPARK-32234: orc data created by the hive tables having _col fields name") { +var error: Throwable = null +withTable("test_date_hive_orc") { + spark.sql( +""" + |CREATE TABLE `test_date_hive_orc` + | (`_col1` INT,`_col2` STRING,`_col3` INT) + | USING orc +""".stripMargin) + spark.sql( +"""insert into + | test_date_hive_orc + | values(9, '12', 2020) +""".stripMargin) + try { +val df = spark.sql("select _col2 from test_date_hive_orc") +checkAnswer(df, Row("12")) + } catch { +case e: Throwable => + error = e + } + assert(error == null) Review comment: Yes not required already handled in the framework in checkAnswer This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r453164516 ## File path: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReaderSuite.scala ## @@ -77,4 +77,64 @@ class OrcColumnarBatchReaderSuite extends QueryTest with SharedSparkSession { assert(p1.getUTF8String(0) === partitionValues.getUTF8String(0)) } } + + test("SPARK-32234: orc data created by the hive tables having _col fields name") { +var error: Throwable = null +withTable("test_date_hive_orc") { + spark.sql( +""" + |CREATE TABLE `test_date_hive_orc` + | (`_col1` INT,`_col2` STRING,`_col3` INT) + | USING orc +""".stripMargin) + spark.sql( +"""insert into + | test_date_hive_orc + | values(9, '12', 2020) +""".stripMargin) + try { +val df = spark.sql("select _col2 from test_date_hive_orc") +checkAnswer(df, Row("12")) + } catch { +case e: Throwable => + error = e + } + assert(error == null) + spark.sql( +s""" + |DROP TABLE IF Review comment: Yes, it is handled in withTable; I removed this drop-table statement. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r452727689 ## File path: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReaderSuite.scala ## @@ -77,4 +77,43 @@ class OrcColumnarBatchReaderSuite extends QueryTest with SharedSparkSession { assert(p1.getUTF8String(0) === partitionValues.getUTF8String(0)) } } + + test("orc data created by the hive tables having _col fields name") { +var error: Throwable = null +val table = """CREATE TABLE `test_date_hive_orc` + | (`_col1` INT,`_col2` STRING,`_col3` INT) + | USING orc""".stripMargin +spark.sql(table).collect +spark.sql("insert into test_date_hive_orc values(9, '12', 2020)").collect +val df = spark.sql("select _col2 from test_date_hive_orc") +try { + val data = df.collect() + assert(data.length == 1) +} catch { + case e: Throwable => +error = e +} +assert(error == null) +spark.sql(s"DROP TABLE IF EXISTS test_date_hive_orc") + } + + test("orc data created by the spark having proper fields name") { Review comment: done ## File path: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReaderSuite.scala ## @@ -77,4 +77,43 @@ class OrcColumnarBatchReaderSuite extends QueryTest with SharedSparkSession { assert(p1.getUTF8String(0) === partitionValues.getUTF8String(0)) } } + + test("orc data created by the hive tables having _col fields name") { +var error: Throwable = null +val table = """CREATE TABLE `test_date_hive_orc` + | (`_col1` INT,`_col2` STRING,`_col3` INT) + | USING orc""".stripMargin +spark.sql(table).collect +spark.sql("insert into test_date_hive_orc values(9, '12', 2020)").collect +val df = spark.sql("select _col2 from test_date_hive_orc") +try { + val data = df.collect() + assert(data.length == 1) +} catch { + case e: Throwable => +error = e +} +assert(error == null) +spark.sql(s"DROP TABLE IF EXISTS test_date_hive_orc") + } + + test("orc data created by the spark 
having proper fields name") { +var error: Throwable = null +val table = """CREATE TABLE `test_date_spark_orc` + | (`d_date_sk` INT,`d_date_id` STRING,`d_year` INT) + | USING orc""".stripMargin +spark.sql(table).collect +spark.sql("insert into test_date_spark_orc values(9, '12', 2020)").collect +val df = spark.sql("select d_date_id from test_date_spark_orc") +try { + val data = df.collect() + assert(data.length == 1) +} catch { + case e: Throwable => +error = e +} +assert(error == null) +spark.sql(s"DROP TABLE IF EXISTS test_date_spark_orc") + } + Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r452727485 ## File path: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReaderSuite.scala ## @@ -77,4 +77,43 @@ class OrcColumnarBatchReaderSuite extends QueryTest with SharedSparkSession { assert(p1.getUTF8String(0) === partitionValues.getUTF8String(0)) } } + + test("orc data created by the hive tables having _col fields name") { Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] SaurabhChawla100 commented on a change in pull request #29045: [SPARK-32234][SQL] Spark sql commands are failing on selecting the orc tables
SaurabhChawla100 commented on a change in pull request #29045: URL: https://github.com/apache/spark/pull/29045#discussion_r452727623 ## File path: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReaderSuite.scala ## @@ -77,4 +77,43 @@ class OrcColumnarBatchReaderSuite extends QueryTest with SharedSparkSession { assert(p1.getUTF8String(0) === partitionValues.getUTF8String(0)) } } + + test("orc data created by the hive tables having _col fields name") { +var error: Throwable = null +val table = """CREATE TABLE `test_date_hive_orc` + | (`_col1` INT,`_col2` STRING,`_col3` INT) + | USING orc""".stripMargin +spark.sql(table).collect +spark.sql("insert into test_date_hive_orc values(9, '12', 2020)").collect +val df = spark.sql("select _col2 from test_date_hive_orc") +try { + val data = df.collect() + assert(data.length == 1) +} catch { + case e: Throwable => +error = e +} +assert(error == null) +spark.sql(s"DROP TABLE IF EXISTS test_date_hive_orc") Review comment: Refactored the unit test This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org