sadikovi commented on code in PR #37654:
URL: https://github.com/apache/spark/pull/37654#discussion_r982881638
##########
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala:
##########
@@ -72,87 +69,9 @@ class ParquetFileFormat
       job: Job,
       options: Map[String, String],
       dataSchema: StructType): OutputWriterFactory = {
-    val parquetOptions = new ParquetOptions(options, sparkSession.sessionState.conf)
-
-    val conf = ContextUtil.getConfiguration(job)
-
-    val committerClass =
-      conf.getClass(
-        SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key,
-        classOf[ParquetOutputCommitter],
-        classOf[OutputCommitter])
-
-    if (conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) == null) {
-      logInfo("Using default output committer for Parquet: " +
-        classOf[ParquetOutputCommitter].getCanonicalName)
-    } else {
-      logInfo("Using user defined output committer for Parquet: " + committerClass.getCanonicalName)
-    }
-
-    conf.setClass(
-      SQLConf.OUTPUT_COMMITTER_CLASS.key,
-      committerClass,
-      classOf[OutputCommitter])
-
-    // We're not really using `ParquetOutputFormat[Row]` for writing data here, because we override
-    // it in `ParquetOutputWriter` to support appending and dynamic partitioning. The reason why
-    // we set it here is to setup the output committer class to `ParquetOutputCommitter`, which is
-    // bundled with `ParquetOutputFormat[Row]`.
-    job.setOutputFormatClass(classOf[ParquetOutputFormat[Row]])
-
-    ParquetOutputFormat.setWriteSupportClass(job, classOf[ParquetWriteSupport])
-
-    // This metadata is useful for keeping UDTs like Vector/Matrix.
-    ParquetWriteSupport.setSchema(dataSchema, conf)
-
-    // Sets flags for `ParquetWriteSupport`, which converts Catalyst schema to Parquet
-    // schema and writes actual rows to Parquet files.
-    conf.set(
-      SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key,
-      sparkSession.sessionState.conf.writeLegacyParquetFormat.toString)
-
-    conf.set(
-      SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key,
-      sparkSession.sessionState.conf.parquetOutputTimestampType.toString)
-
-    conf.set(
-      SQLConf.PARQUET_FIELD_ID_WRITE_ENABLED.key,
-      sparkSession.sessionState.conf.parquetFieldIdWriteEnabled.toString)
-
-    conf.set(
-      SQLConf.PARQUET_TIMESTAMP_NTZ_ENABLED.key,
-      sparkSession.sessionState.conf.parquetTimestampNTZEnabled.toString)
-
-    // Sets compression scheme
-    conf.set(ParquetOutputFormat.COMPRESSION, parquetOptions.compressionCodecClassName)
-
-    // SPARK-15719: Disables writing Parquet summary files by default.
-    if (conf.get(ParquetOutputFormat.JOB_SUMMARY_LEVEL) == null
-        && conf.get(ParquetOutputFormat.ENABLE_JOB_SUMMARY) == null) {
-      conf.setEnum(ParquetOutputFormat.JOB_SUMMARY_LEVEL, JobSummaryLevel.NONE)
-    }
-
-    if (ParquetOutputFormat.getJobSummaryLevel(conf) != JobSummaryLevel.NONE
Review Comment:
Yes, the DSv1 version is the correct one. We essentially disallow creating job
summaries with an unknown output committer.
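
For readers following along, here is a minimal sketch of the guard being
discussed, condensed from the DSv1 `prepareWrite` path (the trailing `if` in
the diff above). `conf` and `committerClass` stand for the values computed
earlier in that method, and the warning text is paraphrased rather than quoted
from upstream:

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.parquet.hadoop.{ParquetOutputCommitter, ParquetOutputFormat}
import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel

// Sketch: a job summary was requested (level != NONE), but the configured
// committer is not a ParquetOutputCommitter, so it cannot write the
// _metadata / _common_metadata summary files. Spark logs a warning via its
// Logging trait; println keeps this sketch self-contained.
def warnIfSummaryNotSupported(conf: Configuration, committerClass: Class[_]): Unit = {
  if (ParquetOutputFormat.getJobSummaryLevel(conf) != JobSummaryLevel.NONE &&
      !classOf[ParquetOutputCommitter].isAssignableFrom(committerClass)) {
    println(s"Committer $committerClass is not a ParquetOutputCommitter and cannot" +
      s" create job summaries. Set ${ParquetOutputFormat.JOB_SUMMARY_LEVEL} to NONE.")
  }
}
```

This is why the DSv1 check is the one to keep: an arbitrary `OutputCommitter`
is "unknown" to Parquet, so a non-`NONE` summary level cannot be honored.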