[
https://issues.apache.org/jira/browse/HADOOP-16458?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17380756#comment-17380756
]
Matteo Martignon edited comment on HADOOP-16458 at 7/14/21, 5:26 PM:
---------------------------------------------------------------------
We are facing the same issue of intermittent failures on GCP with the DataFusion
service; the stack trace is below, and a minimal glob-check sketch follows it.
{quote}"org.apache.hadoop.mapreduce.lib.input.InvalidInputException: Input
Pattern gs://<bucket_name>/\*/\*/\*/*.parquet" matches 0 files.
at
org.apache.hadoop.mapred.LocatedFileStatusFetcher.getFileStatuses(LocatedFileStatusFetcher.java:152)
~[hadoop-mapreduce-client-core-2.9.2.jar:na]
at
org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:277)
~[hadoop-mapreduce-client-core-2.9.2.jar:na]
at
org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat.getSplits(CombineFileInputFormat.java:216)
~[hadoop-mapreduce-client-core-2.9.2.jar:na]
at
io.cdap.plugin.format.parquet.input.CombineParquetInputFormat.access$001(CombineParquetInputFormat.java:38)
~[1626278937395-0/:na]
at
io.cdap.plugin.format.parquet.input.CombineParquetInputFormat.lambda$getSplits$0(CombineParquetInputFormat.java:43)
~[1626278937395-0/:na]
at
io.cdap.plugin.common.batch.JobUtils.applyWithExtraClassLoader(JobUtils.java:63)
~[na:na]
at
io.cdap.plugin.format.parquet.input.CombineParquetInputFormat.getSplits(CombineParquetInputFormat.java:42)
~[1626278937395-0/:na]
at
org.apache.spark.rdd.NewHadoopRDD.getPartitions(NewHadoopRDD.scala:127)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at scala.Option.getOrElse(Option.scala:121)
~[scala-library-2.11.8.jar:na]
at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
[spark-core_2.11-2.3.4.jar:2.3.4]
at
io.cdap.cdap.app.runtime.spark.data.DatasetRDD.getPartitions(DatasetRDD.scala:61)
~[na:na]
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at scala.Option.getOrElse(Option.scala:121)
~[scala-library-2.11.8.jar:na]
at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
[spark-core_2.11-2.3.4.jar:2.3.4]
at
org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:46)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at scala.Option.getOrElse(Option.scala:121)
~[scala-library-2.11.8.jar:na]
at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
[spark-core_2.11-2.3.4.jar:2.3.4]
at
org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:46)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at scala.Option.getOrElse(Option.scala:121)
~[scala-library-2.11.8.jar:na]
at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
[spark-core_2.11-2.3.4.jar:2.3.4]
at
org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:46)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at scala.Option.getOrElse(Option.scala:121)
~[scala-library-2.11.8.jar:na]
at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
[spark-core_2.11-2.3.4.jar:2.3.4]
at
org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:46)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at scala.Option.getOrElse(Option.scala:121)
~[scala-library-2.11.8.jar:na]
at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
[spark-core_2.11-2.3.4.jar:2.3.4]
at
org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:46)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at scala.Option.getOrElse(Option.scala:121)
~[scala-library-2.11.8.jar:na]
at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
[spark-core_2.11-2.3.4.jar:2.3.4]
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087)
~[na:2.3.4]
at
org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:78)
~[spark-core_2.11-2.3.4.jar:2.3.4]
at
org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1083)
[spark-core_2.11-2.3.4.jar:2.3.4]
at
org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1081)
[spark-core_2.11-2.3.4.jar:2.3.4]
at
org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1081)
[spark-core_2.11-2.3.4.jar:2.3.4]
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
[spark-core_2.11-2.3.4.jar:2.3.4]
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
[spark-core_2.11-2.3.4.jar:2.3.4]
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
[spark-core_2.11-2.3.4.jar:2.3.4]
at
org.apache.spark.rdd.PairRDDFunctions.saveAsNewAPIHadoopDataset(PairRDDFunctions.scala:1081)
[spark-core_2.11-2.3.4.jar:2.3.4]
at
org.apache.spark.api.java.JavaPairRDD.saveAsNewAPIHadoopDataset(JavaPairRDD.scala:831)
[spark-core_2.11-2.3.4.jar:2.3.4]
at
io.cdap.cdap.etl.spark.batch.SparkBatchSinkFactory.saveUsingOutputFormat(SparkBatchSinkFactory.java:198)
[hydrator-spark-core2_2.11-6.1.4.jar:na]
at
io.cdap.cdap.etl.spark.batch.SparkBatchSinkFactory.writeFromRDD(SparkBatchSinkFactory.java:174)
[hydrator-spark-core2_2.11-6.1.4.jar:na]
at
io.cdap.cdap.etl.spark.batch.BaseRDDCollection$1.run(BaseRDDCollection.java:230)
[hydrator-spark-core2_2.11-6.1.4.jar:na]
at
io.cdap.cdap.etl.spark.SparkPipelineRunner.runPipeline(SparkPipelineRunner.java:370)
[hydrator-spark-core2_2.11-6.1.4.jar:na]
at
io.cdap.cdap.etl.spark.batch.BatchSparkPipelineDriver.run(BatchSparkPipelineDriver.java:150)
[hydrator-spark-core2_2.11-6.1.4.jar:na]
at
io.cdap.cdap.app.runtime.spark.SparkTransactional$2.run(SparkTransactional.java:236)
[na:na]
at
io.cdap.cdap.app.runtime.spark.SparkTransactional.execute(SparkTransactional.java:208)
[na:na]
at
io.cdap.cdap.app.runtime.spark.SparkTransactional.execute(SparkTransactional.java:138)
[na:na]
at
io.cdap.cdap.app.runtime.spark.AbstractSparkExecutionContext.execute(AbstractSparkExecutionContext.scala:228)
[na:na]
at
io.cdap.cdap.app.runtime.spark.SerializableSparkExecutionContext.execute(SerializableSparkExecutionContext.scala:61)
[na:na]
at
io.cdap.cdap.app.runtime.spark.DefaultJavaSparkExecutionContext.execute(DefaultJavaSparkExecutionContext.scala:89)
[na:na]
at io.cdap.cdap.api.Transactionals.execute(Transactionals.java:63)
[na:na]
at
io.cdap.cdap.etl.spark.batch.BatchSparkPipelineDriver.run(BatchSparkPipelineDriver.java:116)
[hydrator-spark-core2_2.11-6.1.4.jar:na]
at
io.cdap.cdap.app.runtime.spark.SparkMainWrapper$.main(SparkMainWrapper.scala:87)
[na:na]
at
io.cdap.cdap.app.runtime.spark.SparkMainWrapper.main(SparkMainWrapper.scala)
[na:na]
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
~[na:1.8.0_275]
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
~[na:1.8.0_275]
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
~[na:1.8.0_275]
at java.lang.reflect.Method.invoke(Method.java:498) ~[na:1.8.0_275]
at
org.apache.spark.deploy.yarn.ApplicationMaster$$anon$4.run(ApplicationMaster.scala:721)
[spark-yarn_2.11-2.3.4.jar:2.3.4]
{quote}
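For reference, a minimal sketch of checking the same glob directly through FileSystem.globStatus, which is the call LocatedFileStatusFetcher makes internally. The bucket name and directory layout are placeholders, and it assumes the GCS connector is configured on the classpath:
{code:java}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobCheck {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Placeholder bucket and layout, mirroring the failing input pattern in the trace above.
    Path pattern = new Path("gs://my-bucket/*/*/*/*.parquet");
    FileSystem fs = pattern.getFileSystem(conf);

    // globStatus is what the job-side listing ultimately relies on; an empty
    // array (or, as reported in this issue, occasionally null) is what surfaces
    // as "Input Pattern ... matches 0 files".
    FileStatus[] matches = fs.globStatus(pattern);
    int count = (matches == null) ? 0 : matches.length;
    System.out.println("Glob matched " + count + " file(s)");
  }
}
{code}
Running such a check repeatedly against the same, unchanged dataset would at least show whether the zero-match result is reproducible outside the Spark/MapReduce job.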
was (Author: mmart__):
We are facing the same issue of intermittent failures on GCP with the DataFusion
service.
{quote}org.apache.hadoop.mapreduce.lib.input.InvalidInputException: Input
Pattern gs://<bucket_name>/\*/\*/\*/*.parquet matches 0 files.{quote}
> LocatedFileStatusFetcher scans failing intermittently against S3 store
> ----------------------------------------------------------------------
>
> Key: HADOOP-16458
> URL: https://issues.apache.org/jira/browse/HADOOP-16458
> Project: Hadoop Common
> Issue Type: Sub-task
> Components: fs/s3
> Affects Versions: 3.3.0
> Environment: S3 + S3Guard
> Reporter: Steve Loughran
> Assignee: Steve Loughran
> Priority: Major
> Fix For: 3.3.0
>
>
> Intermittent failure of LocatedFileStatusFetcher.getFileStatuses(), which uses
> globStatus to find files.
> I'd say "turn S3Guard on", except that already appears to be the case, and the
> dataset being read is over 1h old.
> Which means it is harder than I'd like to blame S3 for what sounds like an
> inconsistency.
> We're hampered by the globber code containing approximately no debug-level
> statements; there's no debugging to turn on. All we know is that globFiles
> returns null without any explanation.
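To illustrate the description above, here is a simplified paraphrase (not the actual Hadoop source; names and structure are illustrative) of the input-path validation that turns a null or empty glob result into the InvalidInputException seen in the stack traces, with nothing logged about why the glob came back empty:
{code:java}
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapreduce.lib.input.InvalidInputException;

public class GlobValidationSketch {

  /** Collects per-path problems and rethrows them as one InvalidInputException. */
  static List<FileStatus> listMatches(FileSystem fs, Path[] inputDirs, PathFilter filter)
      throws IOException {
    List<IOException> errors = new ArrayList<>();
    List<FileStatus> result = new ArrayList<>();
    for (Path p : inputDirs) {
      FileStatus[] matches = fs.globStatus(p, filter);
      if (matches == null) {
        errors.add(new IOException("Input path does not exist: " + p));
      } else if (matches.length == 0) {
        // This is the condition behind "Input Pattern ... matches 0 files";
        // nothing records what the store actually returned or why.
        errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
      } else {
        result.addAll(Arrays.asList(matches));
      }
    }
    if (!errors.isEmpty()) {
      throw new InvalidInputException(errors);
    }
    return result;
  }
}
{code}
Because the thrown exception is the only signal, a transient empty or null listing from the object store is indistinguishable from a genuinely empty input directory.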