This is an automated email from the ASF dual-hosted git repository.
weichenxu123 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new cb890d9 Revert "[SPARK-30791][SQL][PYTHON] Add 'sameSemantics' and 'sementicHash' methods in Dataset"
cb890d9 is described below
commit cb890d96bc38860988dba97efaf6d88cc8c09288
Author: WeichenXu <[email protected]>
AuthorDate: Tue Feb 18 10:41:49 2020 +0800
Revert "[SPARK-30791][SQL][PYTHON] Add 'sameSemantics' and 'sementicHash' methods in Dataset"
This reverts commit ba9141592d0f0ce23c207efb21ae84ac7cc4670a.
---
python/pyspark/sql/dataframe.py | 46 ----------------------
python/pyspark/sql/tests/test_dataframe.py | 5 ---
.../main/scala/org/apache/spark/sql/Dataset.scala | 28 -------------
.../scala/org/apache/spark/sql/DatasetSuite.scala | 15 -------
4 files changed, 94 deletions(-)
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 8325b68..2432b81 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -2153,52 +2153,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
"should have been DataFrame." %
type(result)
return result
- @since(3.1)
- def sameSemantics(self, other):
- """
- Returns `True` when the logical query plans inside both :class:`DataFrame`\\s are equal and
- therefore return same results.
-
- .. note:: The equality comparison here is simplified by tolerating the cosmetic differences
- such as attribute names.
-
- .. note:: This API can compare both :class:`DataFrame`\\s very fast but can still return
- `False` on the :class:`DataFrame` that return the same results, for instance, from
- different plans. Such false negative semantic can be useful when caching as an example.
-
- .. note:: DeveloperApi
-
- >>> df1 = spark.range(10)
- >>> df2 = spark.range(10)
- >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col1", df2.id * 2))
- True
- >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col1", df2.id + 2))
- False
- >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col0", df2.id * 2))
- True
- """
- if not isinstance(other, DataFrame):
- raise ValueError("other parameter should be of DataFrame; however, got %s"
- % type(other))
- return self._jdf.sameSemantics(other._jdf)
-
- @since(3.1)
- def semanticHash(self):
- """
- Returns a hash code of the logical query plan against this :class:`DataFrame`.
-
- .. note:: Unlike the standard hash code, the hash is calculated against the query plan
- simplified by tolerating the cosmetic differences such as attribute names.
-
- .. note:: DeveloperApi
-
- >>> spark.range(10).selectExpr("id as col0").semanticHash() # doctest: +SKIP
- 1855039936
- >>> spark.range(10).selectExpr("id as col1").semanticHash() # doctest: +SKIP
- 1855039936
- """
- return self._jdf.semanticHash()
-
where = copy_func(
filter,
sinceversion=1.3,
diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py
index 942cd4b..d738449 100644
--- a/python/pyspark/sql/tests/test_dataframe.py
+++ b/python/pyspark/sql/tests/test_dataframe.py
@@ -782,11 +782,6 @@ class DataFrameTests(ReusedSQLTestCase):
break
self.assertEqual(df.take(8), result)
- def test_same_semantics_error(self):
- with QuietTest(self.sc):
- with self.assertRaisesRegexp(ValueError, "should be of DataFrame.*int"):
- self.spark.range(10).sameSemantics(1)
-
class QueryExecutionListenerTests(unittest.TestCase, SQLTestUtils):
# These tests are separate because it uses 'spark.sql.queryExecutionListeners' which is
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 5cd2583..42f3535 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -3310,34 +3310,6 @@ class Dataset[T] private[sql](
files.toSet.toArray
}
- /**
- * Returns `true` when the logical query plans inside both [[Dataset]]s are equal and
- * therefore return same results.
- *
- * @note The equality comparison here is simplified by tolerating the cosmetic differences
- * such as attribute names.
- * @note This API can compare both [[Dataset]]s very fast but can still return `false` on
- * the [[Dataset]] that return the same results, for instance, from different plans. Such
- * false negative semantic can be useful when caching as an example.
- * @since 3.1.0
- */
- @DeveloperApi
- def sameSemantics(other: Dataset[T]): Boolean = {
- queryExecution.analyzed.sameResult(other.queryExecution.analyzed)
- }
-
- /**
- * Returns a `hashCode` of the logical query plan against this [[Dataset]].
- *
- * @note Unlike the standard `hashCode`, the hash is calculated against the query plan
- * simplified by tolerating the cosmetic differences such as attribute names.
- * @since 3.1.0
- */
- @DeveloperApi
- def semanticHash(): Int = {
- queryExecution.analyzed.semanticHash()
- }
-
////////////////////////////////////////////////////////////////////////////
// For Python API
////////////////////////////////////////////////////////////////////////////
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index b4ed4ec..b0bd612 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -1909,21 +1909,6 @@ class DatasetSuite extends QueryTest
assert(active eq SparkSession.getActiveSession.get)
}
-
- test("SPARK-30791: sameSemantics and semanticHash work") {
- val df1 = Seq((1, 2), (4, 5)).toDF("col1", "col2")
- val df2 = Seq((1, 2), (4, 5)).toDF("col1", "col2")
- val df3 = Seq((0, 2), (4, 5)).toDF("col1", "col2")
- val df4 = Seq((0, 2), (4, 5)).toDF("col0", "col2")
-
- assert(df1.sameSemantics(df2) === true)
- assert(df1.sameSemantics(df3) === false)
- assert(df3.sameSemantics(df4) === true)
-
- assert(df1.semanticHash === df2.semanticHash)
- assert(df1.semanticHash !== df3.semanticHash)
- assert(df3.semanticHash === df4.semanticHash)
- }
}
object AssertExecutionId {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]