Github user MaxGekk commented on a diff in the pull request:
https://github.com/apache/spark/pull/21247#discussion_r194147472
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
---
@@ -138,3 +121,40 @@ private[sql] class JSONOptions(
factory.configure(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS,
allowUnquotedControlChars)
}
}
+
+private[sql] class JSONOptionsInRead(
+ @transient override val parameters: CaseInsensitiveMap[String],
+ defaultTimeZoneId: String,
+ defaultColumnNameOfCorruptRecord: String)
+ extends JSONOptions(parameters, defaultTimeZoneId,
defaultColumnNameOfCorruptRecord) {
+
+ def this(
+ parameters: Map[String, String],
+ defaultTimeZoneId: String,
+ defaultColumnNameOfCorruptRecord: String = "") = {
+ this(
+ CaseInsensitiveMap(parameters),
+ defaultTimeZoneId,
+ defaultColumnNameOfCorruptRecord)
+ }
+
+ protected override def checkedEncoding(enc: String): String = {
+ // The following encodings are not supported in per-line mode
(multiline is false)
+ // because they cause problems when reading files with a BOM, which is
expected to
+ // be present in files with such encodings. After splitting input
files by lines,
+ // only the first line will have the BOM, which makes it impossible
to read
+ // the remaining lines. Besides that, the lineSep option would have to contain the
BOM in such
+ // encodings, and a BOM can never be present between lines.
+ val blacklist = Seq(Charset.forName("UTF-16"),
Charset.forName("UTF-32"))
+ val isBlacklisted = blacklist.contains(Charset.forName(enc))
+ require(multiLine || !isBlacklisted,
--- End diff --
@HyukjinKwon I have already implemented `lineSep` detection for different
encodings (for `UTF-16` and `UTF-32` in particular). At the moment I am writing
tests for that. I will prepare a PR soon.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]