prodeezy opened a new issue #128: Iceberg Table Loaded using relative path fails to find snapshots/manifest data
URL: https://github.com/apache/incubator-iceberg/issues/128
 
 
   The Iceberg API seems to always assume the $CWD at the time of table creation as the root path when reading the manifest list from metadata.
   
   This happens because, when the Iceberg data is created, the snapshot manifests are written with paths resolved relative to the $CWD at creation time. If the table is later read from a different $CWD, the reader fails to find the snapshots.
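
   A plausible shape for a fix (a minimal sketch, not Iceberg's actual code) would be to fully qualify a relative location before any paths are embedded in metadata, for example with Hadoop's `FileSystem.makeQualified`:

   ```scala
   // Sketch only: qualify a possibly-relative table location up front so the
   // paths written into metadata no longer depend on the writer's $CWD.
   import org.apache.hadoop.conf.Configuration
   import org.apache.hadoop.fs.{FileSystem, Path}

   val location = new Path("test/iceberg-people-nestedfield-metrics")
   val fs: FileSystem = location.getFileSystem(new Configuration())
   // Resolves against the filesystem URI and working directory, yielding e.g.
   // file:/home/user/test/iceberg-people-nestedfield-metrics
   val qualified: Path = fs.makeQualified(location)
   ```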
   
   
   Example:
   ```
   bash-3.2$ ls -d test/iceberg-people-nestedfield-metrics
   test/iceberg-people-nestedfield-metrics
   
   
   spark-shell  --jars runtime/build/libs/iceberg-runtime.jar
   
   
   // load any pre-generated Iceberg table using a relative path
   val iceDf = spark.read.format("iceberg").load("test/iceberg-people-nestedfield-metrics")
   
   iceDf.createOrReplaceTempView("iceberg_people_nestedfield_metrics")
   
   // fails here 
   spark.sql("select * from iceberg_people_nestedfield_metrics where age = 30").show()
   
   
   
   
   com.netflix.iceberg.exceptions.RuntimeIOException: Failed to open input stream for file: iceberg-people-nestedfield-metrics/metadata/snap-7605852248950368081-1-c6ed79f5-333d-43ba-acae-86f963ab45c9.avro
     at com.netflix.iceberg.hadoop.HadoopInputFile.newStream(HadoopInputFile.java:127)
     at com.netflix.iceberg.avro.AvroIterable.newFileReader(AvroIterable.java:95)
     at com.netflix.iceberg.avro.AvroIterable.iterator(AvroIterable.java:77)
     at com.google.common.collect.Lists.newLinkedList(Lists.java:223)
     at com.netflix.iceberg.BaseSnapshot.manifests(BaseSnapshot.java:123)
     at com.netflix.iceberg.BaseTableScan.planFiles(BaseTableScan.java:171)
     at com.netflix.iceberg.BaseTableScan.planTasks(BaseTableScan.java:211)
     at com.netflix.iceberg.spark.source.Reader.tasks(Reader.java:219)
     at com.netflix.iceberg.spark.source.Reader.planInputPartitions(Reader.java:146)
     at org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanExec.partitions$lzycompute(DataSourceV2ScanExec.scala:76)
     at org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanExec.partitions(DataSourceV2ScanExec.scala:75)
     at org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanExec.outputPartitioning(DataSourceV2ScanExec.scala:65)
     at org.apache.spark.sql.execution.exchange.EnsureRequirements$$anonfun$org$apache$spark$sql$execution$exchange$EnsureRequirements$$ensureDistributionAndOrdering$1.apply(EnsureRequirements.scala:150)
     at org.apache.spark.sql.execution.exchange.EnsureRequirements$$anonfun$org$apache$spark$sql$execution$exchange$EnsureRequirements$$ensureDistributionAndOrdering$1.apply(EnsureRequirements.scala:149)
     at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
     at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
     at scala.collection.immutable.List.foreach(List.scala:392)
     at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
     at scala.collection.immutable.List.map(List.scala:296)
     at org.apache.spark.sql.execution.exchange.EnsureRequirements.org$apache$spark$sql$execution$exchange$EnsureRequirements$$ensureDistributionAndOrdering(EnsureRequirements.scala:149)
     at org.apache.spark.sql.execution.exchange.EnsureRequirements$$anonfun$apply$1.applyOrElse(EnsureRequirements.scala:304)
     at org.apache.spark.sql.execution.exchange.EnsureRequirements$$anonfun$apply$1.applyOrElse(EnsureRequirements.scala:296)
     at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:278)
     at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:278)
     at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
     at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:277)
     at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:275)
     at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:275)
     at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:326)
     at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
     at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:324)
     at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:275)
     at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:275)
     at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:275)
     at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:326)
     at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
     at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:324)
     at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:275)
     at org.apache.spark.sql.execution.exchange.EnsureRequirements.apply(EnsureRequirements.scala:296)
     at org.apache.spark.sql.execution.exchange.EnsureRequirements.apply(EnsureRequirements.scala:38)
     at org.apache.spark.sql.execution.QueryExecution$$anonfun$prepareForExecution$1.apply(QueryExecution.scala:87)
     at org.apache.spark.sql.execution.QueryExecution$$anonfun$prepareForExecution$1.apply(QueryExecution.scala:87)
     at scala.collection.LinearSeqOptimized$class.foldLeft(LinearSeqOptimized.scala:124)
     at scala.collection.immutable.List.foldLeft(List.scala:84)
     at org.apache.spark.sql.execution.QueryExecution.prepareForExecution(QueryExecution.scala:87)
     at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:77)
     at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:77)
     at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3359)
     at org.apache.spark.sql.Dataset.head(Dataset.scala:2544)
     at org.apache.spark.sql.Dataset.take(Dataset.scala:2758)
     at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
     at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
     at org.apache.spark.sql.Dataset.show(Dataset.scala:745)
     at org.apache.spark.sql.Dataset.show(Dataset.scala:704)
     at org.apache.spark.sql.Dataset.show(Dataset.scala:713)
     ... 53 elided
   Caused by: java.io.FileNotFoundException: File iceberg-people-nestedfield-metrics/metadata/snap-7605852248950368081-1-c6ed79f5-333d-43ba-acae-86f963ab45c9.avro does not exist
     at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:539)
     at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:752)
     at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:529)
     at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:409)
     at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:142)
     at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:346)
     at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:766)
     at com.netflix.iceberg.hadoop.HadoopInputFile.newStream(HadoopInputFile.java:125)
     ... 107 more
   
   ```
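
   As a workaround until this is addressed, the table appears to load correctly only when spark-shell is started from the same directory the table was written from. Alternatively, resolving the path to an absolute one before writing should make the recorded manifest locations independent of $CWD. This is an untested sketch; `df` is a placeholder for whatever DataFrame produced the table, and it assumes the Spark writer accepts a path the same way the reader does:

   ```scala
   // Untested workaround sketch: write and read with an absolute path so the
   // manifest locations recorded in metadata do not depend on $CWD.
   val absLocation = new java.io.File("test/iceberg-people-nestedfield-metrics").getAbsolutePath
   df.write.format("iceberg").save(absLocation)   // `df` is a placeholder
   val iceDf = spark.read.format("iceberg").load(absLocation)
   ```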
   
   
   Gist: https://gist.github.com/prodeezy/cd8257899e90a52dfa37d5c58b3cc08f
