Maxim Gekk created SPARK-33941:
----------------------------------

             Summary: ALTER TABLE .. DROP PARTITION doesn't invalidate the cache
                 Key: SPARK-33941
                 URL: https://issues.apache.org/jira/browse/SPARK-33941
             Project: Spark
          Issue Type: Bug
          Components: SQL
    Affects Versions: 3.0.1, 3.1.0, 3.2.0
            Reporter: Maxim Gekk


1. Create a partitioned table:
{code:scala}
scala> sql("CREATE TABLE tbl2 (id int, part int) USING parquet PARTITIONED BY (part)")
res31: org.apache.spark.sql.DataFrame = []
{code}
2. Create partitions:
{code:scala}
scala> sql("INSERT INTO tbl2 PARTITION (part=0) SELECT 0")
res32: org.apache.spark.sql.DataFrame = []

scala> sql("INSERT INTO tbl2 PARTITION (part=1) SELECT 1")
res33: org.apache.spark.sql.DataFrame = []
{code}
3. Create dataframes from the table:
{code:scala}
scala> val df1 = spark.table("tbl2")
df1: org.apache.spark.sql.DataFrame = [id: int, part: int]

scala> val df2 = spark.table("tbl2")
df2: org.apache.spark.sql.DataFrame = [id: int, part: int]
{code}
4. Cache df2 and fill in the cache:
{code:scala}
scala> df2.cache
res34: df2.type = [id: int, part: int]

scala> df1.show(false)
+---+----+
|id |part|
+---+----+
|0  |0   |
|1  |1   |
+---+----+


scala> df2.show(false)
+---+----+
|id |part|
+---+----+
|0  |0   |
|1  |1   |
+---+----+
{code}
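5. Drop a partition. Afterwards both dataframes still return the row from the dropped partition (part=0), i.e. the cached data is not invalidated: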
{code:scala}
scala> sql("ALTER TABLE tbl2 DROP PARTITION(part=0)")
res37: org.apache.spark.sql.DataFrame = []

scala> df1.show(false)
+---+----+
|id |part|
+---+----+
|0  |0   |
|1  |1   |
+---+----+


scala> df2.show(false)
+---+----+
|id |part|
+---+----+
|0  |0   |
|1  |1   |
+---+----+
{code}

The same steps without caching give the expected result (the row from the dropped partition is no longer returned):
{code:scala}
scala> sql("CREATE TABLE tbl3 (id int, part int) USING parquet PARTITIONED BY (part)")
res40: org.apache.spark.sql.DataFrame = []

scala> sql("INSERT INTO tbl3 PARTITION (part=0) SELECT 0")
res41: org.apache.spark.sql.DataFrame = []

scala> sql("INSERT INTO tbl3 PARTITION (part=1) SELECT 1")
res42: org.apache.spark.sql.DataFrame = []

scala> val df3 = spark.table("tbl3")
df3: org.apache.spark.sql.DataFrame = [id: int, part: int]

scala> df3.show(false)
+---+----+
|id |part|
+---+----+
|0  |0   |
|1  |1   |
+---+----+


scala> sql("ALTER TABLE tbl3 DROP PARTITION(part=0)")
res44: org.apache.spark.sql.DataFrame = []

scala> df3.show(false)
+---+----+
|id |part|
+---+----+
|1  |1   |
+---+----+
{code}




--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to