anishshri-db commented on code in PR #49156:
URL: https://github.com/apache/spark/pull/49156#discussion_r1887719393
##########
python/pyspark/sql/tests/pandas/test_pandas_transform_with_state.py:
##########
@@ -907,6 +923,184 @@ def test_transform_with_state_in_pandas_batch_query_initial_state(self):
Row(id="1", value=str(146 + 346)),
}
+ def test_transform_with_state_metadata(self):
+ checkpoint_path = tempfile.mktemp()
+
+ def check_results(batch_df, batch_id):
+ if batch_id == 0:
+ assert set(batch_df.sort("id").collect()) == {
+ Row(id="0", countAsString="2"),
+ Row(id="1", countAsString="2"),
+ }
+ else:
+ # check for state metadata source
+ metadata_df =
self.spark.read.format("state-metadata").load(checkpoint_path)
+ assert set(
+ metadata_df.select(
+ "operatorId",
+ "operatorName",
+ "stateStoreName",
+ "numPartitions",
+ "minBatchId",
+ "maxBatchId",
+ ).collect()
+ ) == {
+ Row(
+ operatorId=0,
+ operatorName="transformWithStateInPandasExec",
+ stateStoreName="default",
+ numPartitions=5,
+ minBatchId=0,
+ maxBatchId=0,
+ )
+ }
+ operator_properties_json_obj = json.loads(
+ metadata_df.select("operatorProperties").collect()[0][0]
+ )
+ assert operator_properties_json_obj["timeMode"] ==
"ProcessingTime"
+ assert operator_properties_json_obj["outputMode"] == "Update"
+
+ state_var_list = operator_properties_json_obj["stateVariables"]
+ assert len(state_var_list) == 3
+ for state_var in state_var_list:
+ if state_var["stateName"] == "mapState":
+ assert state_var["stateVariableType"] == "MapState"
+ assert state_var["ttlEnabled"]
+ elif state_var["stateName"] == "listState":
+ assert state_var["stateVariableType"] == "ListState"
+ assert not state_var["ttlEnabled"]
+ else:
+ assert state_var["stateName"] ==
"$procTimers_keyToTimestamp"
+ assert state_var["stateVariableType"] == "TimerState"
+
+ # check for state data source
Review Comment:
Are we covering all variations here?
- all supported composite types
- options supported by the state data source reader?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]