mahesh2247 commented on issue #7688:
URL: https://github.com/apache/hudi/issues/7688#issuecomment-1386594913
Resulting in
```
23/01/18 06:44:20 ERROR ProcessLauncher: Error from Python:Traceback (most
recent call last):
File "/tmp/glue_job_script.py", line 77, in <module>
glueContext.forEachBatch(frame = data_frame_DataSource0, batch_function
= processBatch, options = {"windowSize": "10 seconds", "checkpointLocation":
s3_path_spark})
File "/opt/amazon/lib/python3.6/site-packages/awsglue/context.py", line
678, in forEachBatch
raise e
File "/opt/amazon/lib/python3.6/site-packages/awsglue/context.py", line
668, in forEachBatch
query.start().awaitTermination()
File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/streaming.py",
line 101, in awaitTermination
return self._jsq.awaitTermination()
File
"/opt/amazon/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line
1305, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line
117, in deco
raise converted from None
pyspark.sql.utils.StreamingQueryException: An exception was raised by the
Python Proxy. Return Message: Traceback (most recent call last):
File
"/opt/amazon/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line
2442, in _call_proxy
return_value = getattr(self.pool[obj_id], method)(*params)
File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line
196, in call
raise e
File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line
193, in call
self.func(DataFrame(jdf, self.sql_ctx), batch_id)
File "/opt/amazon/lib/python3.6/site-packages/awsglue/context.py", line
653, in batch_function_with_persist
batch_function(data_frame, batchId)
File "/tmp/glue_job_script.py", line 69, in processBatch
inputDf = inputDf.filter("eventName !='REMOVE' AND id !='{}'".format(abc['id']))
File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/dataframe.py",
line 1717, in filter
jdf = self._jdf.filter(condition)
File
"/opt/amazon/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line
1305, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line
117, in deco
raise converted from None
pyspark.sql.utils.ParseException:
extraneous input ''>'' expecting <EOF>(line 1, pos 41)
== SQL ==
eventName !='REMOVE' AND id !='Column<'id'>'
-----------------------------------------^^^
=== Streaming Query ===
Identifier: [id = 35151fa1-0206-4f5d-b966-d79ead941926, runId =
e1e910b1-7a33-49e0-ac03-7fada17d6138] Current Committed Offsets: {} Current
Available Offsets: {KinesisSource[hudi_demo_stream_424f5840]:
{"metadata":{"streamName":"hudi_demo_stream_424f5840","batchId":"0"},"shardId-000000000000":{"iteratorType":"TRIM_HORIZON","iteratorPosition":""}}} Current
State: ACTIVE Thread State: RUNNABLE Logical Plan: Project [cast(data#18 as
string) AS $json$data_infer_schema$_temporary$#27] +- Project [UDF(data#5) AS
data#18, streamName#6, partitionKey#7, sequenceNumber#8,
approximateArrivalTimestamp#9] +- StreamingExecutionRelation
KinesisSource[hudi_demo_stream_424f5840], [data#5, streamName#6,
partitionKey#7, sequenceNumber#8, approximateArrivalTimestamp#9] | 23/01/18
06:44:20 ERROR ProcessLauncher: Error from Python:Traceback (most recent call
last): File "/tmp/glue_job_script.py", line 77, in <module>
glueContext.forEachBatch(frame = data_frame_DataSource0, batch_function =
processBatch, options = {"windowSize": "10 seconds", "checkpointLocation":
s3_path_spark}) File
"/opt/amazon/lib/python3.6/site-packages/awsglue/context.py", line 678, in
forEachBatch raise e File
"/opt/amazon/lib/python3.6/site-packages/awsglue/context.py", line 668, in
forEachBatch query.start().awaitTermination() File
"/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/streaming.py", line 101,
in awaitTermination return self._jsq.awaitTermination() File
"/opt/amazon/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line
1305, in __call__ answer, self.gateway_client, self.target_id, self.name) File
"/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 117, in
deco raise converted from None pyspark.sql.utils.StreamingQueryException: An
exception was raised by the Python Proxy. Return Message: Traceback (most
recent call last): File
"/opt/amazon/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line
2442, in _call_proxy return_value = getattr(self.pool[obj_id],
method)(*params) File
"/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 196, in
call raise e File
"/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 193, in
call self.func(DataFrame(jdf, self.sql_ctx), batch_id) File
"/opt/amazon/lib/python3.6/site-packages/awsglue/context.py", line 653, in
batch_function_with_persist batch_function(data_frame, batchId) File
"/tmp/glue_job_script.py", line 69, in processBatch inputDf =
inputDf.filter("eventName !='REMOVE' AND id !='{}'".format(abc['id'])) File
"/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/dataframe.py", line 1717,
in filter jdf = self._jdf.filter(condition) File
"/opt/amazon/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line
1305, in __call__ answer, self.gateway_client, self.target_id, self.name) File
"/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 117, in
deco raise converted from None pyspark.sql.utils.ParseException: extraneous
input ''>'' expecting <EOF>(line 1, pos 41) == SQL == eventName !='REMOVE' AND id
!='Column<'id'>' -----------------------------------------^^^ === Streaming
Query === Identifier: [id = 35151fa1-0206-4f5d-b966-d79ead941926, runId =
e1e910b1-7a33-49e0-ac03-7fada17d6138] Current Committed Offsets: {} Current
Available Offsets: {KinesisSource[hudi_demo_stream_424f5840]:
{"metadata":{"streamName":"hudi_demo_stream_424f5840","batchId":"0"},"shardId-000000000000":{"iteratorType":"TRIM_HORIZON","iteratorPosition":""}}}
Current State: ACTIVE Thread State: RUNNABLE Logical Plan: Project
[cast(data#18 as string) AS $json$data_infer_schema$_temporary$#27] +- Project
[UDF(data#5) AS data#18, streamName#6, partitionKey#7, sequenceNumber#8,
approximateArrivalTimestamp#9] +- StreamingExecutionRelation
KinesisSource[hudi_demo_stream_424f5840], [data#5, streamName#6,
partitionKey#7, sequenceNumber#8, approximateArrivalTimestamp#9]
-- | --
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]