ulysses-you commented on code in PR #39277: URL: https://github.com/apache/spark/pull/39277#discussion_r1062052748
########## sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/V1WritesHiveUtils.scala: ########## @@ -105,4 +112,164 @@ trait V1WritesHiveUtils { .map(_ => Map(BucketingUtils.optionForHiveCompatibleBucketWrite -> "true")) .getOrElse(Map.empty) } + + def setupCompression( + fileSinkConf: FileSinkDesc, + hadoopConf: Configuration, + sparkSession: SparkSession): Unit = { + val isCompressed = + fileSinkConf.getTableInfo.getOutputFileFormatClassName.toLowerCase(Locale.ROOT) match { + case formatName if formatName.endsWith("orcoutputformat") => + // For ORC,"mapreduce.output.fileoutputformat.compress", + // "mapreduce.output.fileoutputformat.compress.codec", and + // "mapreduce.output.fileoutputformat.compress.type" + // have no impact because it uses table properties to store compression information. + false + case _ => hadoopConf.get("hive.exec.compress.output", "false").toBoolean + } + + if (isCompressed) { + hadoopConf.set("mapreduce.output.fileoutputformat.compress", "true") + fileSinkConf.setCompressed(true) + fileSinkConf.setCompressCodec(hadoopConf + .get("mapreduce.output.fileoutputformat.compress.codec")) + fileSinkConf.setCompressType(hadoopConf + .get("mapreduce.output.fileoutputformat.compress.type")) + } else { + // Set compression by priority + HiveOptions.getHiveWriteCompression(fileSinkConf.getTableInfo, sparkSession.sessionState.conf) + .foreach { case (compression, codec) => hadoopConf.set(compression, codec) } + } + } + + /** + * Return two paths: + * 1. The first path is `stagingDir` which can be the parent path of `externalTmpPath` + * 2. The second path is `externalTmpPath`, e.g. `$stagingDir/-ext-10000` + * The call side should create `stagingDir` before using `externalTmpPath` and + * delete `stagingDir` at the end. Review Comment: Wrapped these paths in `HiveTempPath`, since it will be used by `InsertIntoHiveDirCommand`. -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org