This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git
The following commit(s) were added to refs/heads/master by this push:
new 5e115b1fb PARQUET-2283: Remove Hadoop HiddenFileFilter (#1072)
5e115b1fb is described below
commit 5e115b1fb4581cbd0a90aa4a8970eb1694082075
Author: Fokko Driesprong <[email protected]>
AuthorDate: Tue Apr 18 09:41:27 2023 +0200
PARQUET-2283: Remove Hadoop HiddenFileFilter (#1072)
For Iceberg/Flink we would like to run without the hadoop dependencies.
The use of the HiddenFileFilter is blocking this. This replaces the filter
with a nice stream.
---
.../apache/parquet/hadoop/ParquetFileReader.java | 25 +++++++++++++++-------
1 file changed, 17 insertions(+), 8 deletions(-)
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
index 7fa71cb61..b50149cdb 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
@@ -53,6 +53,8 @@ import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
import java.util.zip.CRC32;
import org.apache.hadoop.conf.Configuration;
@@ -99,7 +101,6 @@ import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;
-import org.apache.parquet.hadoop.util.HiddenFileFilter;
import org.apache.parquet.hadoop.util.counters.BenchmarkCounter;
import org.apache.parquet.internal.column.columnindex.ColumnIndex;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;
@@ -374,17 +375,25 @@ public class ParquetFileReader implements Closeable {
    return readAllFootersInParallelUsingSummaryFiles(configuration, files, skipRowGroups);
}
+ static boolean filterHiddenFiles(FileStatus file) {
+ final char c = file.getPath().getName().charAt(0);
+ return c != '.' && c != '_';
+ }
+
  private static List<FileStatus> listFiles(Configuration conf, FileStatus fileStatus) throws IOException {
if (fileStatus.isDir()) {
FileSystem fs = fileStatus.getPath().getFileSystem(conf);
-      FileStatus[] list = fs.listStatus(fileStatus.getPath(), HiddenFileFilter.INSTANCE);
- List<FileStatus> result = new ArrayList<FileStatus>();
- for (FileStatus sub : list) {
- result.addAll(listFiles(conf, sub));
- }
- return result;
+ return Arrays.stream(fs.listStatus(fileStatus.getPath()))
+ .filter(ParquetFileReader::filterHiddenFiles)
+ .flatMap(sub -> {
+ try {
+ return listFiles(conf, sub).stream();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }).collect(Collectors.toList());
} else {
- return Arrays.asList(fileStatus);
+ return Collections.singletonList(fileStatus);
}
}