spark git commit: Revert "[SPARK-15585][SQL] Fix NULL handling along with a spark-csv behaviour"

rxin Sun, 05 Jun 2016 23:40:42 -0700

Repository: spark
Updated Branches:
  refs/heads/master b7e8d1cb3 -> 32f2f95db



Revert "[SPARK-15585][SQL] Fix NULL handling along with a spark-csv behaviour"

This reverts commit b7e8d1cb3ce932ba4a784be59744af8a8ef027ce.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/32f2f95d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/32f2f95d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/32f2f95d

Branch: refs/heads/master
Commit: 32f2f95dbdfb21491e46d4b608fd4e8ac7ab8973
Parents: b7e8d1c
Author: Reynold Xin <r...@databricks.com>
Authored: Sun Jun 5 23:40:13 2016 -0700
Committer: Reynold Xin <r...@databricks.com>
Committed: Sun Jun 5 23:40:13 2016 -0700

----------------------------------------------------------------------
 python/pyspark/sql/readwriter.py                | 81 ++++++++++----------
 .../execution/datasources/csv/CSVOptions.scala  | 11 +--
 .../execution/datasources/csv/CSVSuite.scala    | 11 ---
 3 files changed, 48 insertions(+), 55 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/32f2f95d/python/pyspark/sql/readwriter.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 19aa8dd..9208a52 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -303,11 +303,10 @@ class DataFrameReader(object):
         return 
self._df(self._jreader.text(self._spark._sc._jvm.PythonUtils.toSeq(path)))
 
     @since(2.0)
-    def csv(self, path, schema=None, sep=u',', encoding=u'UTF-8', quote=u'\"', 
escape=u'\\',
-            comment=None, header='false', ignoreLeadingWhiteSpace='false',
-            ignoreTrailingWhiteSpace='false', nullValue='', nanValue='NaN', 
positiveInf='Inf',
-            negativeInf='Inf', dateFormat=None, maxColumns='20480', 
maxCharsPerColumn='1000000',
-            mode='PERMISSIVE'):
+    def csv(self, path, schema=None, sep=None, encoding=None, quote=None, 
escape=None,
+            comment=None, header=None, ignoreLeadingWhiteSpace=None, 
ignoreTrailingWhiteSpace=None,
+            nullValue=None, nanValue=None, positiveInf=None, negativeInf=None, 
dateFormat=None,
+            maxColumns=None, maxCharsPerColumn=None, mode=None):
         """Loads a CSV file and returns the result as a [[DataFrame]].
 
         This function goes through the input once to determine the input 
schema. To avoid going
@@ -316,41 +315,44 @@ class DataFrameReader(object):
         :param path: string, or list of strings, for input path(s).
         :param schema: an optional :class:`StructType` for the input schema.
         :param sep: sets the single character as a separator for each field 
and value.
-                    The default value is ``,``.
-        :param encoding: decodes the CSV files by the given encoding type.
-                    The default value is ``UTF-8``.
+                    If None is set, it uses the default value, ``,``.
+        :param encoding: decodes the CSV files by the given encoding type. If 
None is set,
+                         it uses the default value, ``UTF-8``.
         :param quote: sets the single character used for escaping quoted 
values where the
-                      separator can be part of the value. The default value is 
``"``.
+                      separator can be part of the value. If None is set, it 
uses the default
+                      value, ``"``.
         :param escape: sets the single character used for escaping quotes 
inside an already
-                       quoted value. The default value is ``\``.
+                       quoted value. If None is set, it uses the default 
value, ``\``.
         :param comment: sets the single character used for skipping lines 
beginning with this
                         character. By default (None), it is disabled.
-        :param header: uses the first line as names of columns. The default 
value is ``false``.
+        :param header: uses the first line as names of columns. If None is 
set, it uses the
+                       default value, ``false``.
         :param ignoreLeadingWhiteSpace: defines whether or not leading 
whitespaces from values
-                                        being read should be skipped. The 
default value is
-                                        ``false``.
+                                        being read should be skipped. If None 
is set, it uses
+                                        the default value, ``false``.
         :param ignoreTrailingWhiteSpace: defines whether or not trailing 
whitespaces from values
-                                         being read should be skipped. The 
default value is
-                                         ``false``.
-        :param nullValue: sets the string representation of a null value. The 
default value is a
-                          empty string.
-        :param nanValue: sets the string representation of a non-number value. 
The default value is
-                         ``NaN``.
-        :param positiveInf: sets the string representation of a positive 
infinity value. The default
-                            value is ``Inf``.
-        :param negativeInf: sets the string representation of a negative 
infinity value. The default
-                            value is ``Inf``.
+                                         being read should be skipped. If None 
is set, it uses
+                                         the default value, ``false``.
+        :param nullValue: sets the string representation of a null value. If 
None is set, it uses
+                          the default value, empty string.
+        :param nanValue: sets the string representation of a non-number value. 
If None is set, it
+                         uses the default value, ``NaN``.
+        :param positiveInf: sets the string representation of a positive 
infinity value. If None
+                            is set, it uses the default value, ``Inf``.
+        :param negativeInf: sets the string representation of a negative 
infinity value. If None
+                            is set, it uses the default value, ``Inf``.
         :param dateFormat: sets the string that indicates a date format. 
Custom date formats
                            follow the formats at 
``java.text.SimpleDateFormat``. This
                            applies to both date type and timestamp type. By 
default, it is None
                            which means trying to parse times and date by
                            ``java.sql.Timestamp.valueOf()`` and 
``java.sql.Date.valueOf()``.
-        :param maxColumns: defines a hard limit of how many columns a record 
can have. The default
-                           value is ``20480``.
+        :param maxColumns: defines a hard limit of how many columns a record 
can have. If None is
+                           set, it uses the default value, ``20480``.
         :param maxCharsPerColumn: defines the maximum number of characters 
allowed for any given
-                                  value being read. The default value is 
``1000000``.
-        :param mode: allows a mode for dealing with corrupt records during 
parsing. The default
-                     value is ``PERMISSIVE``.
+                                  value being read. If None is set, it uses 
the default value,
+                                  ``1000000``.
+        :param mode: allows a mode for dealing with corrupt records during 
parsing. If None is
+                     set, it uses the default value, ``PERMISSIVE``.
 
                 * ``PERMISSIVE`` : sets other fields to ``null`` when it meets 
a corrupted record.
                     When a schema is set by user, it sets ``null`` for extra 
fields.
@@ -783,8 +785,8 @@ class DataFrameWriter(object):
         self._jwrite.text(path)
 
     @since(2.0)
-    def csv(self, path, mode='error', compression=None, sep=',', quote=u'\"', 
escape='\\',
-            header='false', nullValue='', escapeQuotes='true'):
+    def csv(self, path, mode=None, compression=None, sep=None, quote=None, 
escape=None,
+            header=None, nullValue=None, escapeQuotes=None):
         """Saves the content of the [[DataFrame]] in CSV format at the 
specified path.
 
         :param path: the path in any Hadoop supported file system
@@ -798,19 +800,20 @@ class DataFrameWriter(object):
         :param compression: compression codec to use when saving to file. This 
can be one of the
                             known case-insensitive shorten names (none, bzip2, 
gzip, lz4,
                             snappy and deflate).
-        :param sep: sets the single character as a separator for each field 
and value. The default
-                    value is ``,``.
+        :param sep: sets the single character as a separator for each field 
and value. If None is
+                    set, it uses the default value, ``,``.
         :param quote: sets the single character used for escaping quoted 
values where the
-                      separator can be part of the value. The default value is 
``"``.
+                      separator can be part of the value. If None is set, it 
uses the default
+                      value, ``"``.
         :param escape: sets the single character used for escaping quotes 
inside an already
-                       quoted value. The default value is ``\``
+                       quoted value. If None is set, it uses the default 
value, ``\``
         :param escapeQuotes: A flag indicating whether values containing 
quotes should always
                              be enclosed in quotes. If None is set, it uses 
the default value
                              ``true``, escaping all values containing a quote 
character.
-        :param header: writes the names of columns as the first line. The 
default value is
-                       ``false``.
-        :param nullValue: sets the string representation of a null value. The 
default value is a
-                          empty string.
+        :param header: writes the names of columns as the first line. If None 
is set, it uses
+                       the default value, ``false``.
+        :param nullValue: sets the string representation of a null value. If 
None is set, it uses
+                          the default value, empty string.
 
         >>> df.write.csv(os.path.join(tempfile.mkdtemp(), 'data'))
         """
@@ -828,7 +831,7 @@ class DataFrameWriter(object):
         if nullValue is not None:
             self.option("nullValue", nullValue)
         if escapeQuotes is not None:
-            self.option("escapeQuotes", escapeQuotes)
+            self.option("escapeQuotes", nullValue)
         self._jwrite.csv(path)
 
     @since(1.5)

http://git-wip-us.apache.org/repos/asf/spark/blob/32f2f95d/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
index 044ada2..9f4ce83 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
@@ -30,7 +30,8 @@ private[sql] class CSVOptions(@transient private val parameters: Map[String, Str
     val paramValue = parameters.get(paramName)
     paramValue match {
       case None => default
-      case Some(value) if value == null || value.length == 0 => '\u0000'
+      case Some(null) => default
+      case Some(value) if value.length == 0 => '\u0000'
       case Some(value) if value.length == 1 => value.charAt(0)
       case _ => throw new RuntimeException(s"$paramName cannot be more than 
one character")
     }
@@ -51,12 +52,12 @@ private[sql] class CSVOptions(@transient private val parameters: Map[String, Str
   }
 
   private def getBool(paramName: String, default: Boolean = false): Boolean = {
-    val paramValue = parameters.getOrElse(paramName, default.toString)
-    if (paramValue == null) {
+    val param = parameters.getOrElse(paramName, default.toString)
+    if (param == null) {
       default
-    } else if (paramValue.toLowerCase == "true") {
+    } else if (param.toLowerCase == "true") {
       true
-    } else if (paramValue.toLowerCase == "false") {
+    } else if (param.toLowerCase == "false") {
       false
     } else {
       throw new Exception(s"$paramName flag can be true or false")

http://git-wip-us.apache.org/repos/asf/spark/blob/32f2f95d/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index b26fcea..bc95446 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -655,15 +655,4 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
       assert(msg.contains("CSV data source does not support array<string> data 
type"))
     }
   }
-
-  test("SPARK-15585 set null at quote") {
-    val cars = spark.read
-      .format("csv")
-      .option("header", "true")
-      .option("quote", null)
-      .load(testFile(carsUnbalancedQuotesFile))
-
-    verifyCars(cars, withHeader = true, checkValues = false)
-  }
-
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: Revert "[SPARK-15585][SQL] Fix NULL handling along with a spark-csv behaviour"

Reply via email to