kingeasternsun commented on a change in pull request #3876:
URL: https://github.com/apache/iceberg/pull/3876#discussion_r790391805



##########
File path: data/src/main/java/org/apache/iceberg/data/TableMigrationUtil.java
##########
@@ -74,111 +77,106 @@ private TableMigrationUtil() {
   public static List<DataFile> listPartition(Map<String, String> partition, 
String uri, String format,
                                              PartitionSpec spec, Configuration 
conf, MetricsConfig metricsConfig,
                                              NameMapping mapping) {
-    if (format.contains("avro")) {
-      return listAvroPartition(partition, uri, spec, conf);
-    } else if (format.contains("parquet")) {
-      return listParquetPartition(partition, uri, spec, conf, metricsConfig, 
mapping);
-    } else if (format.contains("orc")) {
-      return listOrcPartition(partition, uri, spec, conf, metricsConfig, 
mapping);
-    } else {
-      throw new UnsupportedOperationException("Unknown partition format: " + 
format);
-    }
+    return listPartition(partition, uri, format, spec, conf, metricsConfig, 
mapping, 1);
   }
 
-  private static List<DataFile> listAvroPartition(Map<String, String> 
partitionPath, String partitionUri,
-                                                  PartitionSpec spec, 
Configuration conf) {
+  public static List<DataFile> listPartition(Map<String, String> 
partitionPath, String partitionUri, String format,
+                                             PartitionSpec spec, Configuration 
conf, MetricsConfig metricsSpec,
+                                             NameMapping mapping, int 
parallelism) {
     try {
+      String partitionKey = spec.fields().stream()
+              .map(PartitionField::name)
+              .map(name -> String.format("%s=%s", name, 
partitionPath.get(name)))
+              .collect(Collectors.joining("/"));
+      
       Path partition = new Path(partitionUri);
       FileSystem fs = partition.getFileSystem(conf);
-      return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER))
-          .filter(FileStatus::isFile)
-          .map(stat -> {
-            InputFile file = 
HadoopInputFile.fromLocation(stat.getPath().toString(), conf);
-            long rowCount = Avro.rowCount(file);
-            Metrics metrics = new Metrics(rowCount, null, null, null, null);
-            String partitionKey = spec.fields().stream()
-                .map(PartitionField::name)
-                .map(name -> String.format("%s=%s", name, 
partitionPath.get(name)))
-                .collect(Collectors.joining("/"));
-
-            return DataFiles.builder(spec)
-                .withPath(stat.getPath().toString())
-                .withFormat("avro")
-                .withFileSizeInBytes(stat.getLen())
-                .withMetrics(metrics)
-                .withPartitionPath(partitionKey)
-                .build();
-
-          }).collect(Collectors.toList());
+      List<FileStatus> fileStatus = Arrays.stream(fs.listStatus(partition, 
HIDDEN_PATH_FILTER))
+              .filter(FileStatus::isFile)
+              .collect(Collectors.toList());
+      DataFile[] datafiles = new DataFile[fileStatus.size()];
+      Tasks.Builder<Integer> task = Tasks.range(fileStatus.size())
+              .stopOnFailure()
+              .throwFailureWhenFinished();
+
+      if (parallelism > 1) {
+        task.executeWith(migrationService(parallelism));
+      }
+      
+      if (format.contains("avro")) {
+        task.run(index -> {
+          Metrics metrics = getAvroMerics(fileStatus.get(index), conf);
+          datafiles[index] = buildDataFile(fileStatus.get(index), 
partitionKey, spec, metrics, "avro");
+        });
+      } else if (format.contains("parquet")) {
+        task.run(index -> {
+          Metrics metrics = getParquetMerics(fileStatus.get(index), conf, 
metricsSpec, mapping);
+          datafiles[index] = buildDataFile(fileStatus.get(index), 
partitionKey, spec, metrics, "parquet");
+        });
+      } else if (format.contains("orc")) {
+        task.run(index -> {
+          Metrics metrics = getOrcMerics(fileStatus.get(index), conf, 
metricsSpec, mapping);
+          datafiles[index] = buildDataFile(fileStatus.get(index), 
partitionKey, spec, metrics, "avro");
+        });
+      } else {
+        throw new UnsupportedOperationException("Unknown partition format: " + 
format);
+      }
+      return Arrays.asList(datafiles);
     } catch (IOException e) {
       throw new RuntimeException("Unable to list files in partition: " + 
partitionUri, e);
     }
   }
 
-  private static List<DataFile> listParquetPartition(Map<String, String> 
partitionPath, String partitionUri,
-                                                     PartitionSpec spec, 
Configuration conf,
-                                                     MetricsConfig 
metricsSpec, NameMapping mapping) {
+  private static Metrics getAvroMerics(FileStatus stat,  Configuration conf) {

Review comment:
       > > Looks weird, All these failed cases
   > > ```shell
   > >     Caused by: org.apache.avro.InvalidAvroMagicException: Not an Avro 
data file
   > >          at 
org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:70)
   > >          at 
org.apache.iceberg.avro.AvroIterable.newFileReader(AvroIterable.java:100)
   > > ```
   > 
   > Yeah something wrong with the format matching? Odd, for example the one I 
looked at is failing because an ORC file is being read as an AVRO one.
   
   
   ```shell
   Error: eckstyle] [ERROR] 
/home/runner/work/iceberg/iceberg/data/src/main/java/org/apache/iceberg/data/TableMigrationUtil.java:91:
 Trailing whitespace is not allowed. [RegexpSinglelineJava]
   Error: eckstyle] [ERROR] 
/home/runner/work/iceberg/iceberg/data/src/main/java/org/apache/iceberg/data/TableMigrationUtil.java:91:
 Whitespace at end-of-line [RegexpSingleline]
   Error: eckstyle] [ERROR] 
/home/runner/work/iceberg/iceberg/data/src/main/java/org/apache/iceberg/data/TableMigrationUtil.java:105:
 Trailing whitespace is not allowed. [RegexpSinglelineJava]
   > Task :iceberg-data:checkstyleMain FAILED
   Error: eckstyle] [ERROR] 
/home/runner/work/iceberg/iceberg/data/src/main/java/org/apache/iceberg/data/TableMigrationUtil.java:105:
 Whitespace at end-of-line [RegexpSingleline]
   ```
   
   For this checkstyle error, can I just remove the empty line?
   




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to