[
https://issues.apache.org/jira/browse/SPARK-24409?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16494647#comment-16494647
]
Liang-Chi Hsieh commented on SPARK-24409:
-----------------------------------------
Seems you use AWS Glue Data Catalog as the Metastore for Hive. And the too long
partition filtering expressions cause this exception.
> exception when sending large list in filter(col(x).isin(list))
> --------------------------------------------------------------
>
> Key: SPARK-24409
> URL: https://issues.apache.org/jira/browse/SPARK-24409
> Project: Spark
> Issue Type: Bug
> Components: PySpark
> Affects Versions: 2.2.1
> Reporter: Janet Levin
> Priority: Major
>
> This is the error we get:
>
> /mnt/yarn/usercache/hadoop/appcache/application_1526466002571_8701/container_1526466002571_8701_01_000001/pyspark.zip/pyspark/sql/dataframe.py",
> line 88, in rdd
> File
> "/mnt/yarn/usercache/hadoop/appcache/application_1526466002571_8701/container_1526466002571_8701_01_000001/py4j-0.10.6-src.zip/py4j/java_gateway.py",
> line 1160, in __call__
> File
> "/mnt/yarn/usercache/hadoop/appcache/application_1526466002571_8701/container_1526466002571_8701_01_000001/pyspark.zip/pyspark/sql/utils.py",
> line 63, in deco
> File
> "/mnt/yarn/usercache/hadoop/appcache/application_1526466002571_8701/container_1526466002571_8701_01_000001/py4j-0.10.6-src.zip/py4j/protocol.py",
> line 320, in get_return_value
> py4j.protocol.Py4JJavaError: An error occurred while calling
> o605.javaToPython.
> : java.lang.RuntimeException: Caught Hive MetaException attempting to get
> partition metadata by filter from Hive. You can set the Spark configuration
> setting spark.sql.hive.manageFilesourcePartitions to false to work around
> this problem, however this will result in degraded performance. Please report
> a bug: https://issues.apache.org/jira/browse/SPARK
> at
> org.apache.spark.sql.hive.client.Shim_v0_13.getPartitionsByFilter(HiveShim.scala:741)
> at
> org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$getPartitionsByFilter$1.apply(HiveClientImpl.scala:655)
> at
> org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$getPartitionsByFilter$1.apply(HiveClientImpl.scala:653)
> at
> org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$withHiveState$1.apply(HiveClientImpl.scala:272)
> at
> org.apache.spark.sql.hive.client.HiveClientImpl.liftedTree1$1(HiveClientImpl.scala:210)
> at
> org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:209)
> at
> org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:255)
> at
> org.apache.spark.sql.hive.client.HiveClientImpl.getPartitionsByFilter(HiveClientImpl.scala:653)
> at
> org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$listPartitionsByFilter$1.apply(HiveExternalCatalog.scala:1218)
> at
> org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$listPartitionsByFilter$1.apply(HiveExternalCatalog.scala:1211)
> at
> org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:97)
> at
> org.apache.spark.sql.hive.HiveExternalCatalog.listPartitionsByFilter(HiveExternalCatalog.scala:1211)
> at
> org.apache.spark.sql.catalyst.catalog.SessionCatalog.listPartitionsByFilter(SessionCatalog.scala:925)
> at
> org.apache.spark.sql.execution.datasources.CatalogFileIndex.filterPartitions(CatalogFileIndex.scala:73)
> at
> org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions$$anonfun$apply$1.applyOrElse(PruneFileSourcePartitions.scala:61)
> at
> org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions$$anonfun$apply$1.applyOrElse(PruneFileSourcePartitions.scala:27)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267)
> at
> org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:266)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:272)
> at
> org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions$.apply(PruneFileSourcePartitions.scala:27)
> at
> org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions$.apply(PruneFileSourcePartitions.scala:26)
> at
> org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:87)
> at
> org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:84)
> at
> scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57)
> at
> scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66)
> at scala.collection.mutable.WrappedArray.foldLeft(WrappedArray.scala:35)
> at
> org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:84)
> at
> org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:76)
> at scala.collection.immutable.List.foreach(List.scala:381)
> at
> org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:76)
> at
> org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:66)
> at
> org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:66)
> at
> org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:72)
> at
> org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:68)
> at
> org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:77)
> at
> org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:77)
> at
> org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
> at
> org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
> at org.apache.spark.sql.Dataset.javaToPython(Dataset.scala:3186)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
> at
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:498)
> at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
> at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
> at py4j.Gateway.invoke(Gateway.java:282)
> at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
> at py4j.commands.CallCommand.execute(CallCommand.java:79)
> at py4j.GatewayConnection.run(GatewayConnection.java:214)
> at java.lang.Thread.run(Thread.java:748)
> Caused by: java.lang.reflect.InvocationTargetException
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
> at
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:498)
> at
> org.apache.spark.sql.hive.client.Shim_v0_13.getPartitionsByFilter(HiveShim.scala:728)
> ... 56 more
> Caused by: MetaException(message:1 validation error detected: Value '(x
> ='foo' or x='bar' ...)
> at 'expression' failed to satisfy constraint: Member must have length less
> than or equal to 2048 (Service: AWSGlue; Status Code: 400; Error Code:
> ValidationException; Request ID: cce3a44f-630f-11e8-9e23-13fd64997c0c))
> at
> com.amazonaws.glue.catalog.converters.CatalogToHiveConverter.getHiveException(CatalogToHiveConverter.java:98)
> at
> com.amazonaws.glue.catalog.converters.CatalogToHiveConverter.wrapInHiveException(CatalogToHiveConverter.java:86)
> at
> com.amazonaws.glue.catalog.metastore.GlueMetastoreClientDelegate.getCatalogPartitions(GlueMetastoreClientDelegate.java:865)
> at
> com.amazonaws.glue.catalog.metastore.GlueMetastoreClientDelegate.getPartitions(GlueMetastoreClientDelegate.java:828)
> at
> com.amazonaws.glue.catalog.metastore.AWSCatalogMetastoreClient.listPartitionsByFilter(AWSCatalogMetastoreClient.java:1250)
> at
> org.apache.hadoop.hive.ql.metadata.Hive.getPartitionsByFilter(Hive.java:2255)
> ... 61 more
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]