Repository: spark
Updated Branches:
refs/heads/branch-2.0 790de600b -> 9e7e2f916
[SPARK-15585][SQL] Fix NULL handling along with a spark-csv behaviour
## What changes were proposed in this pull request?
This PR fixes the behaviour of `format("csv").option("quote", null)` so that it
matches the behaviour of spark-csv.
Also, it explicitly sets default values for CSV options in python.
## How was this patch tested?
Added tests in CSVSuite.
Author: Takeshi YAMAMURO <[email protected]>
Closes #13372 from maropu/SPARK-15585.
(cherry picked from commit b7e8d1cb3ce932ba4a784be59744af8a8ef027ce)
Signed-off-by: Reynold Xin <[email protected]>
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9e7e2f91
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9e7e2f91
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9e7e2f91
Branch: refs/heads/branch-2.0
Commit: 9e7e2f9164e0b3bd555e795b871626057b4fed31
Parents: 790de60
Author: Takeshi YAMAMURO <[email protected]>
Authored: Sun Jun 5 23:35:04 2016 -0700
Committer: Reynold Xin <[email protected]>
Committed: Sun Jun 5 23:35:10 2016 -0700
----------------------------------------------------------------------
python/pyspark/sql/readwriter.py | 81 ++++++++++----------
.../execution/datasources/csv/CSVOptions.scala | 11 ++-
.../execution/datasources/csv/CSVSuite.scala | 11 +++
3 files changed, 55 insertions(+), 48 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/9e7e2f91/python/pyspark/sql/readwriter.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 9208a52..19aa8dd 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -303,10 +303,11 @@ class DataFrameReader(object):
return
self._df(self._jreader.text(self._spark._sc._jvm.PythonUtils.toSeq(path)))
@since(2.0)
- def csv(self, path, schema=None, sep=None, encoding=None, quote=None,
escape=None,
- comment=None, header=None, ignoreLeadingWhiteSpace=None,
ignoreTrailingWhiteSpace=None,
- nullValue=None, nanValue=None, positiveInf=None, negativeInf=None,
dateFormat=None,
- maxColumns=None, maxCharsPerColumn=None, mode=None):
+ def csv(self, path, schema=None, sep=u',', encoding=u'UTF-8', quote=u'\"',
escape=u'\\',
+ comment=None, header='false', ignoreLeadingWhiteSpace='false',
+ ignoreTrailingWhiteSpace='false', nullValue='', nanValue='NaN',
positiveInf='Inf',
+ negativeInf='Inf', dateFormat=None, maxColumns='20480',
maxCharsPerColumn='1000000',
+ mode='PERMISSIVE'):
"""Loads a CSV file and returns the result as a [[DataFrame]].
This function goes through the input once to determine the input
schema. To avoid going
@@ -315,44 +316,41 @@ class DataFrameReader(object):
:param path: string, or list of strings, for input path(s).
:param schema: an optional :class:`StructType` for the input schema.
:param sep: sets the single character as a separator for each field
and value.
- If None is set, it uses the default value, ``,``.
- :param encoding: decodes the CSV files by the given encoding type. If
None is set,
- it uses the default value, ``UTF-8``.
+ The default value is ``,``.
+ :param encoding: decodes the CSV files by the given encoding type.
+ The default value is ``UTF-8``.
:param quote: sets the single character used for escaping quoted
values where the
- separator can be part of the value. If None is set, it
uses the default
- value, ``"``.
+ separator can be part of the value. The default value is
``"``.
:param escape: sets the single character used for escaping quotes
inside an already
- quoted value. If None is set, it uses the default
value, ``\``.
+ quoted value. The default value is ``\``.
:param comment: sets the single character used for skipping lines
beginning with this
character. By default (None), it is disabled.
- :param header: uses the first line as names of columns. If None is
set, it uses the
- default value, ``false``.
+ :param header: uses the first line as names of columns. The default
value is ``false``.
:param ignoreLeadingWhiteSpace: defines whether or not leading
whitespaces from values
- being read should be skipped. If None
is set, it uses
- the default value, ``false``.
+ being read should be skipped. The
default value is
+ ``false``.
:param ignoreTrailingWhiteSpace: defines whether or not trailing
whitespaces from values
- being read should be skipped. If None
is set, it uses
- the default value, ``false``.
- :param nullValue: sets the string representation of a null value. If
None is set, it uses
- the default value, empty string.
- :param nanValue: sets the string representation of a non-number value.
If None is set, it
- uses the default value, ``NaN``.
- :param positiveInf: sets the string representation of a positive
infinity value. If None
- is set, it uses the default value, ``Inf``.
- :param negativeInf: sets the string representation of a negative
infinity value. If None
- is set, it uses the default value, ``Inf``.
+ being read should be skipped. The
default value is
+ ``false``.
+ :param nullValue: sets the string representation of a null value. The
default value is a
+ empty string.
+ :param nanValue: sets the string representation of a non-number value.
The default value is
+ ``NaN``.
+ :param positiveInf: sets the string representation of a positive
infinity value. The default
+ value is ``Inf``.
+ :param negativeInf: sets the string representation of a negative
infinity value. The default
+ value is ``Inf``.
:param dateFormat: sets the string that indicates a date format.
Custom date formats
follow the formats at
``java.text.SimpleDateFormat``. This
applies to both date type and timestamp type. By
default, it is None
which means trying to parse times and date by
``java.sql.Timestamp.valueOf()`` and
``java.sql.Date.valueOf()``.
- :param maxColumns: defines a hard limit of how many columns a record
can have. If None is
- set, it uses the default value, ``20480``.
+ :param maxColumns: defines a hard limit of how many columns a record
can have. The default
+ value is ``20480``.
:param maxCharsPerColumn: defines the maximum number of characters
allowed for any given
- value being read. If None is set, it uses
the default value,
- ``1000000``.
- :param mode: allows a mode for dealing with corrupt records during
parsing. If None is
- set, it uses the default value, ``PERMISSIVE``.
+ value being read. The default value is
``1000000``.
+ :param mode: allows a mode for dealing with corrupt records during
parsing. The default
+ value is ``PERMISSIVE``.
* ``PERMISSIVE`` : sets other fields to ``null`` when it meets
a corrupted record.
When a schema is set by user, it sets ``null`` for extra
fields.
@@ -785,8 +783,8 @@ class DataFrameWriter(object):
self._jwrite.text(path)
@since(2.0)
- def csv(self, path, mode=None, compression=None, sep=None, quote=None,
escape=None,
- header=None, nullValue=None, escapeQuotes=None):
+ def csv(self, path, mode='error', compression=None, sep=',', quote=u'\"',
escape='\\',
+ header='false', nullValue='', escapeQuotes='true'):
"""Saves the content of the [[DataFrame]] in CSV format at the
specified path.
:param path: the path in any Hadoop supported file system
@@ -800,20 +798,19 @@ class DataFrameWriter(object):
:param compression: compression codec to use when saving to file. This
can be one of the
known case-insensitive shorten names (none, bzip2,
gzip, lz4,
snappy and deflate).
- :param sep: sets the single character as a separator for each field
and value. If None is
- set, it uses the default value, ``,``.
+ :param sep: sets the single character as a separator for each field
and value. The default
+ value is ``,``.
:param quote: sets the single character used for escaping quoted
values where the
- separator can be part of the value. If None is set, it
uses the default
- value, ``"``.
+ separator can be part of the value. The default value is
``"``.
:param escape: sets the single character used for escaping quotes
inside an already
- quoted value. If None is set, it uses the default
value, ``\``
+ quoted value. The default value is ``\``
:param escapeQuotes: A flag indicating whether values containing
quotes should always
be enclosed in quotes. If None is set, it uses
the default value
``true``, escaping all values containing a quote
character.
- :param header: writes the names of columns as the first line. If None
is set, it uses
- the default value, ``false``.
- :param nullValue: sets the string representation of a null value. If
None is set, it uses
- the default value, empty string.
+ :param header: writes the names of columns as the first line. The
default value is
+ ``false``.
+ :param nullValue: sets the string representation of a null value. The
default value is a
+ empty string.
>>> df.write.csv(os.path.join(tempfile.mkdtemp(), 'data'))
"""
@@ -831,7 +828,7 @@ class DataFrameWriter(object):
if nullValue is not None:
self.option("nullValue", nullValue)
if escapeQuotes is not None:
- self.option("escapeQuotes", nullValue)
+ self.option("escapeQuotes", escapeQuotes)
self._jwrite.csv(path)
@since(1.5)
http://git-wip-us.apache.org/repos/asf/spark/blob/9e7e2f91/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
----------------------------------------------------------------------
diff --git
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
index 9f4ce83..044ada2 100644
---
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
+++
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
@@ -30,8 +30,7 @@ private[sql] class CSVOptions(@transient private val
parameters: Map[String, Str
val paramValue = parameters.get(paramName)
paramValue match {
case None => default
- case Some(null) => default
- case Some(value) if value.length == 0 => '\u0000'
+ case Some(value) if value == null || value.length == 0 => '\u0000'
case Some(value) if value.length == 1 => value.charAt(0)
case _ => throw new RuntimeException(s"$paramName cannot be more than
one character")
}
@@ -52,12 +51,12 @@ private[sql] class CSVOptions(@transient private val
parameters: Map[String, Str
}
private def getBool(paramName: String, default: Boolean = false): Boolean = {
- val param = parameters.getOrElse(paramName, default.toString)
- if (param == null) {
+ val paramValue = parameters.getOrElse(paramName, default.toString)
+ if (paramValue == null) {
default
- } else if (param.toLowerCase == "true") {
+ } else if (paramValue.toLowerCase == "true") {
true
- } else if (param.toLowerCase == "false") {
+ } else if (paramValue.toLowerCase == "false") {
false
} else {
throw new Exception(s"$paramName flag can be true or false")
http://git-wip-us.apache.org/repos/asf/spark/blob/9e7e2f91/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
----------------------------------------------------------------------
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index bc95446..b26fcea 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -655,4 +655,15 @@ class CSVSuite extends QueryTest with SharedSQLContext
with SQLTestUtils {
assert(msg.contains("CSV data source does not support array<string> data
type"))
}
}
+
+ test("SPARK-15585 set null at quote") {
+ val cars = spark.read
+ .format("csv")
+ .option("header", "true")
+ .option("quote", null)
+ .load(testFile(carsUnbalancedQuotesFile))
+
+ verifyCars(cars, withHeader = true, checkValues = false)
+ }
+
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]