Repository: spark Updated Branches: refs/heads/master ac527b520 -> 64ad7b841
[SPARK-23772][FOLLOW-UP][SQL] Provide an option to ignore column of all null values or empty array during JSON schema inference ## What changes were proposed in this pull request? The `dropFieldIfAllNull` parameter of the `json` method wasn't set as an option. This PR fixes that. ## How was this patch tested? I added a test to `sql/tests.py` Author: Maxim Gekk <maxim.g...@databricks.com> Closes #22002 from MaxGekk/drop-field-if-all-null. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/64ad7b84 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/64ad7b84 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/64ad7b84 Branch: refs/heads/master Commit: 64ad7b841d1efa979041358ee2a19aea7382d737 Parents: ac527b52 Author: Maxim Gekk <maxim.g...@databricks.com> Authored: Mon Aug 6 16:46:55 2018 +0800 Committer: hyukjinkwon <gurwls...@apache.org> Committed: Mon Aug 6 16:46:55 2018 +0800 ---------------------------------------------------------------------- python/pyspark/sql/readwriter.py | 2 +- python/pyspark/sql/tests.py | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/64ad7b84/python/pyspark/sql/readwriter.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 98b2cd9..abf878a 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -267,7 +267,7 @@ class DataFrameReader(OptionUtils): mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, dateFormat=dateFormat, timestampFormat=timestampFormat, multiLine=multiLine, allowUnquotedControlChars=allowUnquotedControlChars, lineSep=lineSep, - samplingRatio=samplingRatio, encoding=encoding) + samplingRatio=samplingRatio, dropFieldIfAllNull=dropFieldIfAllNull, 
encoding=encoding) if isinstance(path, basestring): path = [path] if type(path) == list: http://git-wip-us.apache.org/repos/asf/spark/blob/64ad7b84/python/pyspark/sql/tests.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index a294d70..ed97a63 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3351,6 +3351,22 @@ class SQLTests(ReusedSQLTestCase): finally: shutil.rmtree(path) + def test_ignore_column_of_all_nulls(self): + path = tempfile.mkdtemp() + shutil.rmtree(path) + try: + df = self.spark.createDataFrame([["""{"a":null, "b":1, "c":3.0}"""], + ["""{"a":null, "b":null, "c":"string"}"""], + ["""{"a":null, "b":null, "c":null}"""]]) + df.write.text(path) + schema = StructType([ + StructField('b', LongType(), nullable=True), + StructField('c', StringType(), nullable=True)]) + readback = self.spark.read.json(path, dropFieldIfAllNull=True) + self.assertEquals(readback.schema, schema) + finally: + shutil.rmtree(path) + def test_repr_behaviors(self): import re pattern = re.compile(r'^ *\|', re.MULTILINE) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org