[
https://issues.apache.org/jira/browse/SPARK-35602?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17356614#comment-17356614
]
Kousuke Saruta commented on SPARK-35602:
----------------------------------------
[[email protected]]
Thank you for reporting. I understand the reason.
As a workaround, you can set spark.sql.streaming.stateStore.stateSchemaCheck
to false.
> Job crashes with java.io.UTFDataFormatException: encoded string too long
> ------------------------------------------------------------------------
>
> Key: SPARK-35602
> URL: https://issues.apache.org/jira/browse/SPARK-35602
> Project: Spark
> Issue Type: Bug
> Components: Structured Streaming
> Affects Versions: 3.1.1
> Environment: AWS emr-6.3.0
> Reporter: dejan miljkovic
> Priority: Major
>
> Running a stateful Structured Streaming app using Java. When running on Spark
> 3.1.1, the app crashes with java.io.UTFDataFormatException: encoded string too
> long. I do not get this problem when running on Spark 3.0.1.
> 21/06/01 17:50:35 WARN DAGScheduler: Broadcasting large task binary with size
> 1986.3 KiB
> 21/06/01 17:50:35 WARN DAGScheduler: Broadcasting large task binary with size
> 1986.3 KiB
> 21/06/01 17:50:37 WARN TaskSetManager: Lost task 0.0 in stage 1.0 (TID 0)
> (ip-10-64-12-189.eu-west-1.compute.internal executor 1):
> java.io.UTFDataFormatException: encoded string too long: 156449 bytes at
> java.io.DataOutputStream.writeUTF(DataOutputStream.java:364) at
> java.io.DataOutputStream.writeUTF(DataOutputStream.java:323) at
> org.apache.spark.sql.execution.streaming.state.StateSchemaCompatibilityChecker.createSchemaFile(StateSchemaCompatibilityChecker.scala:102)
> at
> org.apache.spark.sql.execution.streaming.state.StateSchemaCompatibilityChecker.check(StateSchemaCompatibilityChecker.scala:67)
> at
> org.apache.spark.sql.execution.streaming.state.StateStore$.$anonfun$getStateStoreProvider$2(StateStore.scala:487)
> at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at
> scala.util.Try$.apply(Try.scala:213) at
> org.apache.spark.sql.execution.streaming.state.StateStore$.$anonfun$getStateStoreProvider$1(StateStore.scala:487)
> at scala.collection.mutable.HashMap.getOrElseUpdate(HashMap.scala:86) at
> org.apache.spark.sql.execution.streaming.state.StateStore$.getStateStoreProvider(StateStore.scala:483)
> at
> org.apache.spark.sql.execution.streaming.state.StateStore$.get(StateStore.scala:468)
> at
> org.apache.spark.sql.execution.streaming.state.StateStoreRDD.compute(StateStoreRDD.scala:125)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) at
> org.apache.spark.rdd.RDD.iterator(RDD.scala:337) at
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at
> org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) at
> org.apache.spark.rdd.RDD.iterator(RDD.scala:337) at
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at
> org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) at
> org.apache.spark.rdd.RDD.iterator(RDD.scala:337) at
> org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) at
> org.apache.spark.scheduler.Task.run(Task.scala:131) at
> org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
> at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439) at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500) at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> 21/06/01 17:50:40 ERROR TaskSetManager: Task 0 in stage 1.0 failed 4 times;
> aborting job
> 21/06/01 17:50:40 ERROR WriteToDataSourceV2Exec: Data source write support
> org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@2745b41f is
> aborting.
> 21/06/01 17:50:40 ERROR WriteToDataSourceV2Exec: Data source write support
> org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@2745b41f
> aborted.
> 21/06/01 17:50:40 ERROR MicroBatchExecution: Query [id =
> adcf4f93-8c51-4a14-9d9d-1e7a858c8a8c, runId =
> 86b6c41c-a32f-485d-bbf3-24b844c27739] terminated with error
> org.apache.spark.SparkException: Writing job aborted. at
> org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:388)
> at
> org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2$(WriteToDataSourceV2Exec.scala:336)
> at
> org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec.writeWithV2(WriteToDataSourceV2Exec.scala:297)
> at
> org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec.run(WriteToDataSourceV2Exec.scala:304)
> at
> org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:40)
> at
> org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:40)
> at
> org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:46)
> at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3733) at
> org.apache.spark.sql.Dataset.$anonfun$collect$1(Dataset.scala:3005) at
> org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3724) at
> org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:107)
> at
> org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:232)
> at
> org.apache.spark.sql.execution.SQLExecution$.executeQuery$1(SQLExecution.scala:110)
> at
> org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:135)
> at
> org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:107)
> at
> org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:232)
> at
> org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:135)
> at
> org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:253)
> at
> org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:134)
> at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:772) at
> org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:68)
> at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3722) at
> org.apache.spark.sql.Dataset.collect(Dataset.scala:3005) at
> org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$16(MicroBatchExecution.scala:589)
> at
> org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:107)
> at
> org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:232)
> at
> org.apache.spark.sql.execution.SQLExecution$.executeQuery$1(SQLExecution.scala:110)
> at
> org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:135)
> at
> org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:107)
> at
> org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:232)
> at
> org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:135)
> at
> org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:253)
> at
> org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:134)
> at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:772) at
> org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:68)
> at
> org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$15(MicroBatchExecution.scala:584)
> at
> org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:357)
> at
> org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:355)
> at
> org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:68)
> at
> org.apache.spark.sql.execution.streaming.MicroBatchExecution.runBatch(MicroBatchExecution.scala:584)
> at
> org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$2(MicroBatchExecution.scala:226)
> at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at
> org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:357)
> at
> org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:355)
> at
> org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:68)
> at
> org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$1(MicroBatchExecution.scala:194)
> at
> org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:57)
> at
> org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:188)
> at
> org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:333)
> at
> org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:244)
--
This message was sent by Atlassian Jira
(v8.3.4#803005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]