Github user viirya commented on a diff in the pull request:
https://github.com/apache/spark/pull/19571#discussion_r146751242
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala ---
@@ -39,4 +45,33 @@ private[sql] object OrcFileFormat {
     schema.fieldNames.foreach(checkFieldName)
     schema
   }
+
+  def getSchemaString(schema: StructType): String = {
+    schema.fields.map(f => s"${f.name}:${f.dataType.catalogString}").mkString("struct<", ",", ">")
+  }
+
+  private def readSchema(file: Path, conf: ReaderOptions): Option[TypeDescription] = {
+    try {
+      val reader = OrcFile.createReader(file, conf)
+      val schema = reader.getSchema
+      if (schema.getFieldNames.size == 0) {
+        None
+      } else {
+        Some(schema)
+      }
+    } catch {
+      case _: IOException => None
+    }
+  }
+
+  def readSchema(sparkSession: SparkSession, files: Seq[FileStatus]): Option[StructType] = {
+    val conf = sparkSession.sparkContext.hadoopConfiguration
+    val fs = FileSystem.get(conf)
+    val options = OrcFile.readerOptions(conf).filesystem(fs)
+    files.map(_.getPath).flatMap(readSchema(_, options))
+      .headOption.map { schema =>
--- End diff ---
Seems that you just take the first available schema here. If so, we don't
need to keep reading the other files once the first available schema is
found.
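A minimal sketch of the short-circuiting this suggests, assuming the same
per-file readSchema helper as in the diff above. The object and method
names (LazySchemaSketch, firstAvailableSchema) are hypothetical; the point
is that Iterator.map/flatMap are lazy, so only as many ORC footers are read
as it takes to find the first non-empty schema:

    import java.io.IOException

    import org.apache.hadoop.fs.{FileStatus, Path}
    import org.apache.orc.{OrcFile, TypeDescription}
    import org.apache.orc.OrcFile.ReaderOptions

    object LazySchemaSketch {
      // Same per-file logic as the private readSchema helper in the diff.
      private def readSchema(file: Path, conf: ReaderOptions): Option[TypeDescription] = {
        try {
          val schema = OrcFile.createReader(file, conf).getSchema
          if (schema.getFieldNames.size == 0) None else Some(schema)
        } catch {
          case _: IOException => None
        }
      }

      def firstAvailableSchema(files: Seq[FileStatus], options: ReaderOptions): Option[TypeDescription] = {
        // Iterator is lazy and single-pass: hasNext advances only far
        // enough to find the first file whose footer yields a non-empty
        // schema, so the remaining files are never opened.
        val schemas = files.iterator.map(_.getPath).flatMap(readSchema(_, options))
        if (schemas.hasNext) Some(schemas.next()) else None
      }
    }

In contrast, flatMap over a strict Seq (as in the diff) eagerly opens every
file before headOption picks the first result.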