anishshri-db commented on code in PR #53874:
URL: https://github.com/apache/spark/pull/53874#discussion_r2748633255


##########
sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/OfflineStateRepartitionIntegrationSuite.scala:
##########
@@ -134,23 +152,49 @@ class OfflineStateRepartitionIntegrationSuite
       val repartitionBatchId = lastBatchId + 1
       val hadoopConf = spark.sessionState.newHadoopConf()
 
-      // Step 4: Verify offset and commit logs are updated correctly
-      verifyRepartitionBatch(
+      // Step 4: Verify offset and commit logs
+      OfflineStateRepartitionTestUtils.verifyRepartitionBatch(
         repartitionBatchId, checkpointMetadata, hadoopConf,
         checkpointDir.getAbsolutePath, newPartitions, spark)
 
-      // Step 5: Validate state data matches after repartition
-      validateStateDataAfterRepartition(
-        checkpointDir.getAbsolutePath, stateBeforeRepartition.collect(),
-        repartitionBatchId)
+      // Step 5: Validate state for each store and column family after repartition
+      val stateAfterRepartition = readStateDataByStoreName(
+        checkpointDir.getAbsolutePath, repartitionBatchId, storeToColumnFamilyToStateSourceOptions)
+
+      storeToColumnFamilyToStateSourceOptions.foreach { case (storeName, columnFamilies) =>
+        columnFamilies.keys.foreach { cfName =>
+          val beforeState = stateBeforeRepartition(storeName)(cfName)
+          val afterState = stateAfterRepartition(storeName)(cfName)
+
+          // Validate row count
+          assert(beforeState.length == afterState.length,
+            s"Store '$storeName', CF '$cfName': State row count mismatch: " +
+              s"before=${beforeState.length}, after=${afterState.length}")
+
+          // Extract (key, value) pairs and compare
+          val beforeByKey = extractKeyValuePairs(beforeState)
+          val afterByKey = extractKeyValuePairs(afterState)
+
+          // Compare each (key, value) pair
+          beforeByKey.zip(afterByKey).zipWithIndex.foreach {
+            case (((keyBefore, valueBefore), (keyAfter, valueAfter)), idx) =>
+              assert(keyBefore == keyAfter,
                s"Store '$storeName', CF '$cfName': Key mismatch at index $idx")
+              assert(valueBefore == valueAfter,
                s"Store '$storeName', CF '$cfName': Value mismatch for key $keyBefore")
+          }
+        }
+      }
 
       // Step 6: Resume query with new input and verify
       verifyResumedQuery(inputData, checkpointDir.getAbsolutePath, clock)
     }
   }
 
   def testWithChangelogConfig(testName: String)(testFun: => Unit): Unit = {
-    Seq(true, false).foreach { changelogCheckpointingEnabled =>
+    // TODO: add test with changelog checkpointing disabled after SPARK increases its test timeout

Review Comment:
   Can we create a SPARK JIRA for this and tag it here @zifeif2 ?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to