Github user srowen commented on a diff in the pull request:
https://github.com/apache/spark/pull/23052#discussion_r237210777
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
---
@@ -169,13 +169,18 @@ private[csv] class CsvOutputWriter(
context: TaskAttemptContext,
params: CSVOptions) extends OutputWriter with Logging {
- private val charset = Charset.forName(params.charset)
+ private var univocityGenerator: Option[UnivocityGenerator] = None
- private val writer = CodecStreams.createOutputStreamWriter(context, new
Path(path), charset)
-
- private val gen = new UnivocityGenerator(dataSchema, writer, params)
+ override def write(row: InternalRow): Unit = {
+ val gen = univocityGenerator.getOrElse {
+ val charset = Charset.forName(params.charset)
+ val os = CodecStreams.createOutputStreamWriter(context, new
Path(path), charset)
+ new UnivocityGenerator(dataSchema, os, params)
+ }
+ univocityGenerator = Some(gen)
--- End diff --
Doesn't this assignment need to be inside the getOrElse block? Although it
doesn't matter functionally, as written it sets the field to itself on every
call, and that's a little bit of overhead worth avoiding.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]