Hello,
I am having an issue with Linear Regression when trying to fit training data to
the model. The code below used to work, but it stopped recently. Spark is
version 3.2.1.
# Split Data into train and test data
train, test = data.randomSplit([0.9, 0.1])
y = 'Build_Rate'
# Perform regression with train data
assembler = VectorAssembler(inputCols=feature_cols, outputCol="Features")
vtrain = assembler.transform(train).select('Features', y)
lin_reg = LinearRegression(regParam = 0.0, elasticNetParam = 0.0,
solver='normal', featuresCol = 'Features', labelCol = y)
model = lin_reg.fit(vtrain)  # <-- FAILS HERE
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in
stage 388.0 failed 4 times, most recent failure: Lost task 0.3 in stage 388.0
(TID 422) (10.139.64.4 executor 0): org.apache.spark.SparkUpgradeException: You
may get a different result due to the upgrading of Spark 3.0: Fail to recognize
'MMM dd, yyyy hh:mm:ss aa' pattern in the DateTimeFormatter. 1) You can
set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before
Spark 3.0. 2) You can form a valid datetime pattern with the guide from
https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
The full traceback is attached.
The error is confusing because there are no datetime columns in "train";
"vtrain" is just "train" with the feature columns assembled into dense vector form.
[cid:[email protected]]
Does anyone know how to fix this error?
Thanks,
Ken Bassett
Data Scientist
1451 Marvin Griffin Rd.
Augusta, GA 30906
(m) (706) 469-0696
[email protected]<mailto:[email protected]>
[2019 E-mail Signature]
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<command-2567512786910096> in <module>
----> 1 model = lin_reg.fit(vtrain)
/databricks/python_shell/dbruntime/MLWorkloadsInstrumentation/_pyspark.py in
patched_method(self, *args, **kwargs)
28 call_succeeded = False
29 try:
---> 30 result = original_method(self, *args, **kwargs)
31 call_succeeded = True
32 return result
/databricks/spark/python/pyspark/ml/base.py in fit(self, dataset, params)
159 return self.copy(params)._fit(dataset)
160 else:
--> 161 return self._fit(dataset)
162 else:
163 raise TypeError("Params must be either a param map or a
list/tuple of param maps, "
/databricks/spark/python/pyspark/ml/wrapper.py in _fit(self, dataset)
333
334 def _fit(self, dataset):
--> 335 java_model = self._fit_java(dataset)
336 model = self._create_model(java_model)
337 return self._copyValues(model)
/databricks/spark/python/pyspark/ml/wrapper.py in _fit_java(self, dataset)
330 """
331 self._transfer_params_to_java()
--> 332 return self._java_obj.fit(dataset._jdf)
333
334 def _fit(self, dataset):
/databricks/spark/python/lib/py4j-0.10.9.1-src.zip/py4j/java_gateway.py in
__call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
115 def deco(*a, **kw):
116 try:
--> 117 return f(*a, **kw)
118 except py4j.protocol.Py4JJavaError as e:
119 converted = convert_exception(e.java_exception)
/databricks/spark/python/lib/py4j-0.10.9.1-src.zip/py4j/protocol.py in
get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o1033.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in
stage 413.0 failed 4 times, most recent failure: Lost task 0.3 in stage 413.0
(TID 461) (10.139.64.4 executor 0): org.apache.spark.SparkUpgradeException: You
may get a different result due to the upgrading of Spark 3.0: Fail to recognize
'MMM dd, yyyy hh:mm:ss aa' pattern in the DateTimeFormatter. 1) You can set
spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before
Spark 3.0. 2) You can form a valid datetime pattern with the guide from
https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
at
org.apache.spark.sql.errors.QueryExecutionErrors$.failToRecognizePatternAfterUpgradeError(QueryExecutionErrors.scala:1054)
at
org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkLegacyFormatter$1.applyOrElse(DateTimeFormatterHelper.scala:187)
at
org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkLegacyFormatter$1.applyOrElse(DateTimeFormatterHelper.scala:180)
at
scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
at
org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.validatePatternString(TimestampFormatter.scala:172)
at
org.apache.spark.sql.catalyst.util.TimestampFormatter$.getFormatter(TimestampFormatter.scala:405)
at
org.apache.spark.sql.catalyst.util.TimestampFormatter$.apply(TimestampFormatter.scala:466)
at
org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.getFormatter(datetimeExpressions.scala:94)
at
org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.getFormatter$(datetimeExpressions.scala:88)
at
org.apache.spark.sql.catalyst.expressions.ToTimestamp.getFormatter(datetimeExpressions.scala:1174)
at
org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.$anonfun$formatterOption$1(datetimeExpressions.scala:85)
at scala.Option.map(Option.scala:230)
at
org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.formatterOption(datetimeExpressions.scala:85)
at
org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.formatterOption$(datetimeExpressions.scala:83)
at
org.apache.spark.sql.catalyst.expressions.ToTimestamp.formatterOption$lzycompute(datetimeExpressions.scala:1174)
at
org.apache.spark.sql.catalyst.expressions.ToTimestamp.formatterOption(datetimeExpressions.scala:1174)
at
org.apache.spark.sql.catalyst.expressions.ToTimestamp.eval(datetimeExpressions.scala:1216)
at
org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:168)
at
org.apache.spark.sql.catalyst.expressions.InterpretedUnsafeProjection.apply(InterpretedUnsafeProjection.scala:90)
at
org.apache.spark.sql.execution.aggregate.AggregationIterator.$anonfun$generateResultProjection$5(AggregationIterator.scala:260)
at
org.apache.spark.sql.execution.aggregate.SortBasedAggregationIterator.next(SortBasedAggregationIterator.scala:154)
at
org.apache.spark.sql.execution.aggregate.SortBasedAggregationIterator.next(SortBasedAggregationIterator.scala:30)
at
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage18.hashAgg_doAggregateWithKeys_0$(Unknown
Source)
at
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage18.processNext(Unknown
Source)
at
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at
org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at
org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
at
org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at
org.apache.spark.scheduler.ShuffleMapTask.$anonfun$runTask$3(ShuffleMapTask.scala:81)
at
com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at
org.apache.spark.scheduler.ShuffleMapTask.$anonfun$runTask$1(ShuffleMapTask.scala:81)
at
com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:153)
at org.apache.spark.scheduler.Task.$anonfun$run$1(Task.scala:122)
at
com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.Task.run(Task.scala:93)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$13(Executor.scala:824)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1641)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:827)
at
scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at
com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:683)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalArgumentException: Too many pattern letters: a
at
java.time.format.DateTimeFormatterBuilder.parseField(DateTimeFormatterBuilder.java:1774)
at
java.time.format.DateTimeFormatterBuilder.parsePattern(DateTimeFormatterBuilder.java:1613)
at
java.time.format.DateTimeFormatterBuilder.appendPattern(DateTimeFormatterBuilder.java:1581)
at
org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$.buildFormatter(DateTimeFormatterHelper.scala:250)
at
org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper.getOrCreateFormatter(DateTimeFormatterHelper.scala:126)
at
org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper.getOrCreateFormatter$(DateTimeFormatterHelper.scala:117)
at
org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.getOrCreateFormatter(TimestampFormatter.scala:108)
at
org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.formatter$lzycompute(TimestampFormatter.scala:117)
at
org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.formatter(TimestampFormatter.scala:116)
at
org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.validatePatternString(TimestampFormatter.scala:171)
... 43 more
Driver stacktrace:
at
org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2979)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2926)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2920)
at
scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at
scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2920)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1340)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1340)
at scala.Option.foreach(Option.scala:407)
at
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1340)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3188)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3129)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3117)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: org.apache.spark.SparkUpgradeException: You may get a different
result due to the upgrading of Spark 3.0: Fail to recognize 'MMM dd, yyyy
hh:mm:ss aa' pattern in the DateTimeFormatter. 1) You can set
spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before
Spark 3.0. 2) You can form a valid datetime pattern with the guide from
https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
at
org.apache.spark.sql.errors.QueryExecutionErrors$.failToRecognizePatternAfterUpgradeError(QueryExecutionErrors.scala:1054)
at
org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkLegacyFormatter$1.applyOrElse(DateTimeFormatterHelper.scala:187)
at
org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkLegacyFormatter$1.applyOrElse(DateTimeFormatterHelper.scala:180)
at
scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
at
org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.validatePatternString(TimestampFormatter.scala:172)
at
org.apache.spark.sql.catalyst.util.TimestampFormatter$.getFormatter(TimestampFormatter.scala:405)
at
org.apache.spark.sql.catalyst.util.TimestampFormatter$.apply(TimestampFormatter.scala:466)
at
org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.getFormatter(datetimeExpressions.scala:94)
at
org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.getFormatter$(datetimeExpressions.scala:88)
at
org.apache.spark.sql.catalyst.expressions.ToTimestamp.getFormatter(datetimeExpressions.scala:1174)
at
org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.$anonfun$formatterOption$1(datetimeExpressions.scala:85)
at scala.Option.map(Option.scala:230)
at
org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.formatterOption(datetimeExpressions.scala:85)
at
org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.formatterOption$(datetimeExpressions.scala:83)
at
org.apache.spark.sql.catalyst.expressions.ToTimestamp.formatterOption$lzycompute(datetimeExpressions.scala:1174)
at
org.apache.spark.sql.catalyst.expressions.ToTimestamp.formatterOption(datetimeExpressions.scala:1174)
at
org.apache.spark.sql.catalyst.expressions.ToTimestamp.eval(datetimeExpressions.scala:1216)
at
org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:168)
at
org.apache.spark.sql.catalyst.expressions.InterpretedUnsafeProjection.apply(InterpretedUnsafeProjection.scala:90)
at
org.apache.spark.sql.execution.aggregate.AggregationIterator.$anonfun$generateResultProjection$5(AggregationIterator.scala:260)
at
org.apache.spark.sql.execution.aggregate.SortBasedAggregationIterator.next(SortBasedAggregationIterator.scala:154)
at
org.apache.spark.sql.execution.aggregate.SortBasedAggregationIterator.next(SortBasedAggregationIterator.scala:30)
at
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage18.hashAgg_doAggregateWithKeys_0$(Unknown
Source)
at
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage18.processNext(Unknown
Source)
at
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at
org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at
org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
at
org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at
org.apache.spark.scheduler.ShuffleMapTask.$anonfun$runTask$3(ShuffleMapTask.scala:81)
at
com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at
org.apache.spark.scheduler.ShuffleMapTask.$anonfun$runTask$1(ShuffleMapTask.scala:81)
at
com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:153)
at org.apache.spark.scheduler.Task.$anonfun$run$1(Task.scala:122)
at
com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.Task.run(Task.scala:93)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$13(Executor.scala:824)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1641)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:827)
at
scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at
com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:683)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalArgumentException: Too many pattern letters: a
at
java.time.format.DateTimeFormatterBuilder.parseField(DateTimeFormatterBuilder.java:1774)
at
java.time.format.DateTimeFormatterBuilder.parsePattern(DateTimeFormatterBuilder.java:1613)
at
java.time.format.DateTimeFormatterBuilder.appendPattern(DateTimeFormatterBuilder.java:1581)
at
org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$.buildFormatter(DateTimeFormatterHelper.scala:250)
at
org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper.getOrCreateFormatter(DateTimeFormatterHelper.scala:126)
at
org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper.getOrCreateFormatter$(DateTimeFormatterHelper.scala:117)
at
org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.getOrCreateFormatter(TimestampFormatter.scala:108)
at
org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.formatter$lzycompute(TimestampFormatter.scala:117)
at
org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.formatter(TimestampFormatter.scala:116)
at
org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.validatePatternString(TimestampFormatter.scala:171)
... 43 more
---------------------------------------------------------------------
To unsubscribe e-mail: [email protected]