[spark] branch branch-3.0 updated: Revert "[SPARK-30791][SQL][PYTHON] Add 'sameSemantics' and 'sementicHash' methods in Dataset"

weichenxu123 Mon, 17 Feb 2020 18:45:13 -0800

This is an automated email from the ASF dual-hosted git repository.

weichenxu123 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new cb890d9  Revert "[SPARK-30791][SQL][PYTHON] Add 'sameSemantics' and 'sementicHash' methods in Dataset"
cb890d9 is described below

commit cb890d96bc38860988dba97efaf6d88cc8c09288
Author: WeichenXu <[email protected]>
AuthorDate: Tue Feb 18 10:41:49 2020 +0800

    Revert "[SPARK-30791][SQL][PYTHON] Add 'sameSemantics' and 'sementicHash' methods in Dataset"
    
    This reverts commit ba9141592d0f0ce23c207efb21ae84ac7cc4670a.
---
 python/pyspark/sql/dataframe.py                    | 46 ----------------------
 python/pyspark/sql/tests/test_dataframe.py         |  5 ---
 .../main/scala/org/apache/spark/sql/Dataset.scala  | 28 -------------
 .../scala/org/apache/spark/sql/DatasetSuite.scala  | 15 -------
 4 files changed, 94 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 8325b68..2432b81 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -2153,52 +2153,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
                                               "should have been DataFrame." % type(result)
         return result
 
-    @since(3.1)
-    def sameSemantics(self, other):
-        """
-        Returns `True` when the logical query plans inside both :class:`DataFrame`\\s are equal and
-        therefore return same results.
-
-        .. note:: The equality comparison here is simplified by tolerating the cosmetic differences
-            such as attribute names.
-
-        .. note:: This API can compare both :class:`DataFrame`\\s very fast but can still return
-            `False` on the :class:`DataFrame` that return the same results, for instance, from
-            different plans. Such false negative semantic can be useful when caching as an example.
-
-        .. note:: DeveloperApi
-
-        >>> df1 = spark.range(10)
-        >>> df2 = spark.range(10)
-        >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col1", df2.id * 2))
-        True
-        >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col1", df2.id + 2))
-        False
-        >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col0", df2.id * 2))
-        True
-        """
-        if not isinstance(other, DataFrame):
-            raise ValueError("other parameter should be of DataFrame; however, got %s"
-                             % type(other))
-        return self._jdf.sameSemantics(other._jdf)
-
-    @since(3.1)
-    def semanticHash(self):
-        """
-        Returns a hash code of the logical query plan against this :class:`DataFrame`.
-
-        .. note:: Unlike the standard hash code, the hash is calculated against the query plan
-            simplified by tolerating the cosmetic differences such as attribute names.
-
-        .. note:: DeveloperApi
-
-        >>> spark.range(10).selectExpr("id as col0").semanticHash()  # doctest: +SKIP
-        1855039936
-        >>> spark.range(10).selectExpr("id as col1").semanticHash()  # doctest: +SKIP
-        1855039936
-        """
-        return self._jdf.semanticHash()
-
     where = copy_func(
         filter,
         sinceversion=1.3,
diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py
index 942cd4b..d738449 100644
--- a/python/pyspark/sql/tests/test_dataframe.py
+++ b/python/pyspark/sql/tests/test_dataframe.py
@@ -782,11 +782,6 @@ class DataFrameTests(ReusedSQLTestCase):
                     break
             self.assertEqual(df.take(8), result)
 
-    def test_same_semantics_error(self):
-        with QuietTest(self.sc):
-            with self.assertRaisesRegexp(ValueError, "should be of DataFrame.*int"):
-                self.spark.range(10).sameSemantics(1)
-
 
 class QueryExecutionListenerTests(unittest.TestCase, SQLTestUtils):
     # These tests are separate because it uses 'spark.sql.queryExecutionListeners' which is
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 5cd2583..42f3535 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -3310,34 +3310,6 @@ class Dataset[T] private[sql](
     files.toSet.toArray
   }
 
-  /**
-   * Returns `true` when the logical query plans inside both [[Dataset]]s are equal and
-   * therefore return same results.
-   *
-   * @note The equality comparison here is simplified by tolerating the cosmetic differences
-   *       such as attribute names.
-   * @note This API can compare both [[Dataset]]s very fast but can still return `false` on
-   *       the [[Dataset]] that return the same results, for instance, from different plans. Such
-   *       false negative semantic can be useful when caching as an example.
-   * @since 3.1.0
-   */
-  @DeveloperApi
-  def sameSemantics(other: Dataset[T]): Boolean = {
-    queryExecution.analyzed.sameResult(other.queryExecution.analyzed)
-  }
-
-  /**
-   * Returns a `hashCode` of the logical query plan against this [[Dataset]].
-   *
-   * @note Unlike the standard `hashCode`, the hash is calculated against the query plan
-   *       simplified by tolerating the cosmetic differences such as attribute names.
-   * @since 3.1.0
-   */
-  @DeveloperApi
-  def semanticHash(): Int = {
-    queryExecution.analyzed.semanticHash()
-  }
-
   ////////////////////////////////////////////////////////////////////////////
   // For Python API
   ////////////////////////////////////////////////////////////////////////////
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index b4ed4ec..b0bd612 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -1909,21 +1909,6 @@ class DatasetSuite extends QueryTest
 
     assert(active eq SparkSession.getActiveSession.get)
   }
-
-  test("SPARK-30791: sameSemantics and semanticHash work") {
-    val df1 = Seq((1, 2), (4, 5)).toDF("col1", "col2")
-    val df2 = Seq((1, 2), (4, 5)).toDF("col1", "col2")
-    val df3 = Seq((0, 2), (4, 5)).toDF("col1", "col2")
-    val df4 = Seq((0, 2), (4, 5)).toDF("col0", "col2")
-
-    assert(df1.sameSemantics(df2) === true)
-    assert(df1.sameSemantics(df3) === false)
-    assert(df3.sameSemantics(df4) === true)
-
-    assert(df1.semanticHash === df2.semanticHash)
-    assert(df1.semanticHash !== df3.semanticHash)
-    assert(df3.semanticHash === df4.semanticHash)
-  }
 }
 
 object AssertExecutionId {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[spark] branch branch-3.0 updated: Revert "[SPARK-30791][SQL][PYTHON] Add 'sameSemantics' and 'sementicHash' methods in Dataset"

Reply via email to