This is an automated email from the ASF dual-hosted git repository.
sivabalan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new afa6bc0 [HUDI-1723] Fix path selector listing files with the same mod
date (#2845)
afa6bc0 is described below
commit afa6bc0b100450b5d80a27bf5b87cbd4a0fbb3a5
Author: Raymond Xu <[email protected]>
AuthorDate: Tue May 25 07:19:10 2021 -0700
[HUDI-1723] Fix path selector listing files with the same mod date (#2845)
---
.../hudi/common/testutils/FileCreateUtils.java | 8 +
.../utilities/sources/helpers/DFSPathSelector.java | 12 +-
.../sources/helpers/DatePartitionPathSelector.java | 13 +-
.../helpers/TestDFSPathSelectorCommonMethods.java | 161 +++++++++++++++++++++
4 files changed, 184 insertions(+), 10 deletions(-)
diff --git
a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java
b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java
index 35f2cfe..b7754b0 100644
---
a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java
+++
b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java
@@ -45,6 +45,8 @@ import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
+import java.nio.file.attribute.FileTime;
+import java.time.Instant;
import java.util.HashMap;
import java.util.Map;
@@ -224,6 +226,11 @@ public class FileCreateUtils {
public static void createBaseFile(String basePath, String partitionPath,
String instantTime, String fileId, long length)
throws Exception {
+ createBaseFile(basePath, partitionPath, instantTime, fileId, length,
Instant.now().toEpochMilli());
+ }
+
+ public static void createBaseFile(String basePath, String partitionPath,
String instantTime, String fileId, long length, long lastModificationTimeMilli)
+ throws Exception {
Path parentPath = Paths.get(basePath, partitionPath);
Files.createDirectories(parentPath);
Path baseFilePath = parentPath.resolve(baseFileName(instantTime, fileId));
@@ -231,6 +238,7 @@ public class FileCreateUtils {
Files.createFile(baseFilePath);
}
new RandomAccessFile(baseFilePath.toFile(), "rw").setLength(length);
+ Files.setLastModifiedTime(baseFilePath,
FileTime.fromMillis(lastModificationTimeMilli));
}
public static void createLogFile(String basePath, String partitionPath,
String instantTime, String fileId, int version)
diff --git
a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/DFSPathSelector.java
b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/DFSPathSelector.java
index d9d3444..9301bf3 100644
---
a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/DFSPathSelector.java
+++
b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/DFSPathSelector.java
@@ -121,28 +121,30 @@ public class DFSPathSelector implements Serializable {
eligibleFiles.sort(Comparator.comparingLong(FileStatus::getModificationTime));
// Filter based on checkpoint & input size, if needed
long currentBytes = 0;
- long maxModificationTime = Long.MIN_VALUE;
+ long newCheckpointTime = lastCheckpointTime;
List<FileStatus> filteredFiles = new ArrayList<>();
for (FileStatus f : eligibleFiles) {
- if (currentBytes + f.getLen() >= sourceLimit) {
+ if (currentBytes + f.getLen() >= sourceLimit &&
f.getModificationTime() > newCheckpointTime) {
// we have enough data, we are done
+ // Also, we've read up to a file with a newer modification time
+ // so that some files with the same modification time won't be
skipped in next read
break;
}
- maxModificationTime = f.getModificationTime();
+ newCheckpointTime = f.getModificationTime();
currentBytes += f.getLen();
filteredFiles.add(f);
}
// no data to read
if (filteredFiles.isEmpty()) {
- return new ImmutablePair<>(Option.empty(),
lastCheckpointStr.orElseGet(() -> String.valueOf(Long.MIN_VALUE)));
+ return new ImmutablePair<>(Option.empty(),
String.valueOf(newCheckpointTime));
}
// read the files out.
String pathStr = filteredFiles.stream().map(f ->
f.getPath().toString()).collect(Collectors.joining(","));
- return new ImmutablePair<>(Option.ofNullable(pathStr),
String.valueOf(maxModificationTime));
+ return new ImmutablePair<>(Option.ofNullable(pathStr),
String.valueOf(newCheckpointTime));
} catch (IOException ioe) {
throw new HoodieIOException("Unable to read from source from checkpoint:
" + lastCheckpointStr, ioe);
}
diff --git
a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/DatePartitionPathSelector.java
b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/DatePartitionPathSelector.java
index 97106de..71e6a57 100644
---
a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/DatePartitionPathSelector.java
+++
b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/DatePartitionPathSelector.java
@@ -144,27 +144,30 @@ public class DatePartitionPathSelector extends
DFSPathSelector {
// Filter based on checkpoint & input size, if needed
long currentBytes = 0;
+ long newCheckpointTime = lastCheckpointTime;
List<FileStatus> filteredFiles = new ArrayList<>();
for (FileStatus f : sortedEligibleFiles) {
- if (currentBytes + f.getLen() >= sourceLimit) {
+ if (currentBytes + f.getLen() >= sourceLimit && f.getModificationTime()
> newCheckpointTime) {
// we have enough data, we are done
+ // Also, we've read up to a file with a newer modification time
+ // so that some files with the same modification time won't be skipped
in next read
break;
}
+ newCheckpointTime = f.getModificationTime();
currentBytes += f.getLen();
filteredFiles.add(f);
}
// no data to read
if (filteredFiles.isEmpty()) {
- return new ImmutablePair<>(
- Option.empty(), lastCheckpointStr.orElseGet(() ->
String.valueOf(Long.MIN_VALUE)));
+ return new ImmutablePair<>(Option.empty(),
String.valueOf(newCheckpointTime));
}
// read the files out.
String pathStr = filteredFiles.stream().map(f ->
f.getPath().toString()).collect(Collectors.joining(","));
- long maxModificationTime = filteredFiles.get(filteredFiles.size() -
1).getModificationTime();
- return new ImmutablePair<>(Option.ofNullable(pathStr),
String.valueOf(maxModificationTime));
+
+ return new ImmutablePair<>(Option.ofNullable(pathStr),
String.valueOf(newCheckpointTime));
}
/**
diff --git
a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestDFSPathSelectorCommonMethods.java
b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestDFSPathSelectorCommonMethods.java
new file mode 100644
index 0000000..08acc87
--- /dev/null
+++
b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestDFSPathSelectorCommonMethods.java
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.utilities.sources.helpers;
+
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ReflectionUtils;
+import org.apache.hudi.common.util.collection.Pair;
+import org.apache.hudi.testutils.HoodieClientTestHarness;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import static org.apache.hudi.common.testutils.FileCreateUtils.createBaseFile;
+import static
org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP;
+import static
org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.PARTITIONS_LIST_PARALLELISM;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class TestDFSPathSelectorCommonMethods extends HoodieClientTestHarness {
+
+ TypedProperties props;
+ Path inputPath;
+
+ @BeforeEach
+ void setUp() {
+ initSparkContexts();
+ initPath();
+ initFileSystem();
+ props = new TypedProperties();
+ props.setProperty(ROOT_INPUT_PATH_PROP, basePath);
+ props.setProperty(PARTITIONS_LIST_PARALLELISM, "1");
+ inputPath = new Path(basePath);
+ }
+
+ @AfterEach
+ public void teardown() throws Exception {
+ cleanupResources();
+ }
+
+ @ParameterizedTest
+ @ValueSource(classes = {DFSPathSelector.class,
DatePartitionPathSelector.class})
+ public void listEligibleFilesShouldIgnoreCertainPrefixes(Class<?> clazz)
throws Exception {
+ DFSPathSelector selector = (DFSPathSelector)
ReflectionUtils.loadClass(clazz.getName(), props, hadoopConf);
+ createBaseFile(basePath, "p1", "000", "foo1", 1);
+ createBaseFile(basePath, "p1", "000", ".foo2", 1);
+ createBaseFile(basePath, "p1", "000", "_foo3", 1);
+
+ List<FileStatus> eligibleFiles = selector.listEligibleFiles(fs, inputPath,
0);
+ assertEquals(1, eligibleFiles.size());
+ assertTrue(eligibleFiles.get(0).getPath().getName().startsWith("foo1"));
+ }
+
+ @ParameterizedTest
+ @ValueSource(classes = {DFSPathSelector.class,
DatePartitionPathSelector.class})
+ public void listEligibleFilesShouldIgnore0LengthFiles(Class<?> clazz) throws
Exception {
+ DFSPathSelector selector = (DFSPathSelector)
ReflectionUtils.loadClass(clazz.getName(), props, hadoopConf);
+ createBaseFile(basePath, "p1", "000", "foo1", 1);
+ createBaseFile(basePath, "p1", "000", "foo2", 0);
+ createBaseFile(basePath, "p1", "000", "foo3", 0);
+
+ List<FileStatus> eligibleFiles = selector.listEligibleFiles(fs, inputPath,
0);
+ assertEquals(1, eligibleFiles.size());
+ assertTrue(eligibleFiles.get(0).getPath().getName().startsWith("foo1"));
+ }
+
+ @ParameterizedTest
+ @ValueSource(classes = {DFSPathSelector.class,
DatePartitionPathSelector.class})
+ public void
listEligibleFilesShouldIgnoreFilesEarlierThanCheckpointTime(Class<?> clazz)
throws Exception {
+ DFSPathSelector selector = (DFSPathSelector)
ReflectionUtils.loadClass(clazz.getName(), props, hadoopConf);
+ createBaseFile(basePath, "p1", "000", "foo1", 1);
+ createBaseFile(basePath, "p1", "000", "foo2", 1);
+ createBaseFile(basePath, "p1", "000", "foo3", 1);
+
+ List<FileStatus> eligibleFiles = selector.listEligibleFiles(fs, inputPath,
Long.MAX_VALUE);
+ assertEquals(0, eligibleFiles.size());
+ }
+
+ @ParameterizedTest
+ @ValueSource(classes = {DFSPathSelector.class,
DatePartitionPathSelector.class})
+ public void
getNextFilePathsAndMaxModificationTimeShouldRespectSourceLimit(Class<?> clazz)
throws Exception {
+ DFSPathSelector selector = (DFSPathSelector)
ReflectionUtils.loadClass(clazz.getName(), props, hadoopConf);
+ createBaseFile(basePath, "p1", "000", "foo1", 10, 1000);
+ createBaseFile(basePath, "p1", "000", "foo2", 10, 2000);
+ createBaseFile(basePath, "p1", "000", "foo3", 10, 3000);
+ createBaseFile(basePath, "p1", "000", "foo4", 10, 4000);
+ createBaseFile(basePath, "p1", "000", "foo5", 10, 5000);
+ Pair<Option<String>, String> nextFilePathsAndCheckpoint = selector
+ .getNextFilePathsAndMaxModificationTime(jsc, Option.empty(), 30);
+ List<String> fileNames = Arrays
+ .stream(nextFilePathsAndCheckpoint.getLeft().get().split(","))
+ .map(p -> Paths.get(p).toFile().getName())
+ .sorted().collect(Collectors.toList());
+ assertEquals(2, fileNames.size());
+ assertTrue(fileNames.get(0).startsWith("foo1"));
+ assertTrue(fileNames.get(1).startsWith("foo2"));
+ String checkpointStr1stRead = nextFilePathsAndCheckpoint.getRight();
+ assertEquals(2000L, Long.parseLong(checkpointStr1stRead), "should read up
to foo2 (inclusive)");
+ }
+
+ @ParameterizedTest
+ @ValueSource(classes = {DFSPathSelector.class,
DatePartitionPathSelector.class})
+ public void
getNextFilePathsAndMaxModificationTimeShouldIgnoreSourceLimitIfSameModTimeFilesPresent(Class<?>
clazz) throws Exception {
+ DFSPathSelector selector = (DFSPathSelector)
ReflectionUtils.loadClass(clazz.getName(), props, hadoopConf);
+ createBaseFile(basePath, "p1", "000", "foo1", 10, 1000);
+ createBaseFile(basePath, "p1", "000", "foo2", 10, 1000);
+ createBaseFile(basePath, "p1", "000", "foo3", 10, 1000);
+ createBaseFile(basePath, "p1", "000", "foo4", 10, 2000);
+ createBaseFile(basePath, "p1", "000", "foo5", 10, 2000);
+ Pair<Option<String>, String> nextFilePathsAndCheckpoint = selector
+ .getNextFilePathsAndMaxModificationTime(jsc, Option.empty(), 20);
+ List<String> fileNames1stRead = Arrays
+ .stream(nextFilePathsAndCheckpoint.getLeft().get().split(","))
+ .map(p -> Paths.get(p).toFile().getName())
+ .sorted().collect(Collectors.toList());
+ assertEquals(3, fileNames1stRead.size());
+ assertTrue(fileNames1stRead.get(0).startsWith("foo1"));
+ assertTrue(fileNames1stRead.get(1).startsWith("foo2"));
+ assertTrue(fileNames1stRead.get(2).startsWith("foo3"));
+ String checkpointStr1stRead = nextFilePathsAndCheckpoint.getRight();
+ assertEquals(1000L, Long.parseLong(checkpointStr1stRead), "should read up
to foo3 (inclusive)");
+
+ nextFilePathsAndCheckpoint = selector
+ .getNextFilePathsAndMaxModificationTime(jsc,
Option.of(checkpointStr1stRead), 20);
+ List<String> fileNames2ndRead = Arrays
+ .stream(nextFilePathsAndCheckpoint.getLeft().get().split(","))
+ .map(p -> Paths.get(p).toFile().getName())
+ .sorted().collect(Collectors.toList());
+ assertEquals(2, fileNames2ndRead.size());
+ assertTrue(fileNames2ndRead.get(0).startsWith("foo4"));
+ assertTrue(fileNames2ndRead.get(1).startsWith("foo5"));
+ String checkpointStr2ndRead = nextFilePathsAndCheckpoint.getRight();
+ assertEquals(2000L, Long.parseLong(checkpointStr2ndRead), "should read up
to foo5 (inclusive)");
+ }
+}