yihua commented on code in PR #12039:
URL: https://github.com/apache/hudi/pull/12039#discussion_r1796179118
##########
hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestSparkNonBlockingConcurrencyControl.java:
##########
@@ -206,6 +213,98 @@ public void
testNonBlockingConcurrencyControlWithInflightInstant() throws Except
checkWrittenData(result, 1);
}
+ //Prove that multiple writers will only produce base files for bulk insert
Review Comment:
```suggestion
// Validate that multiple writers will only produce base files for bulk insert
```
##########
hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestSparkNonBlockingConcurrencyControl.java:
##########
@@ -206,6 +213,98 @@ public void
testNonBlockingConcurrencyControlWithInflightInstant() throws Except
checkWrittenData(result, 1);
}
+ //Prove that multiple writers will only produce base files for bulk insert
+ @ParameterizedTest
+ @ValueSource(booleans = {true, false})
+ public void testMultiBaseFile(boolean bulkInsertFirst) throws Exception {
+ HoodieWriteConfig config = createHoodieWriteConfig(true);
+ metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ,
config.getProps());
+ //there should only be a single filegroup, so we will verify that it is
consistent
+ String fileID = null;
+
+ // if there is not a bulk insert first, then we will write to log files
for a filegroup
+ // without a base file. Having a base file adds the possibility of small
file handling
+ // which we want to ensure doesn't happen.
+ if (bulkInsertFirst) {
+ SparkRDDWriteClient client0 = getHoodieWriteClient(config);
+ List<String> dataset0 = Collections.singletonList("id0,Danny,0,0,par1");
+ String insertTime0 = client0.createNewInstantTime();
+ List<WriteStatus> writeStatuses0 = writeData(client0, insertTime0,
dataset0, false, WriteOperationType.BULK_INSERT, true);
+ client0.commitStats(
+ insertTime0,
+ context().parallelize(writeStatuses0, 1),
+
writeStatuses0.stream().map(WriteStatus::getStat).collect(Collectors.toList()),
+ Option.empty(),
+ metaClient.getCommitActionType());
+ for (WriteStatus status : writeStatuses0) {
+ if (fileID == null) {
+ fileID = status.getFileId();
+ } else {
+ assertEquals(fileID, status.getFileId());
+ }
+
assertFalse(status.getStat().getPath().contains(HoodieFileFormat.HOODIE_LOG.getFileExtension()));
Review Comment:
Use `FSUtils.isLogFile`
##########
hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestSparkNonBlockingConcurrencyControl.java:
##########
@@ -206,6 +213,98 @@ public void
testNonBlockingConcurrencyControlWithInflightInstant() throws Except
checkWrittenData(result, 1);
}
+ //Prove that multiple writers will only produce base files for bulk insert
+ @ParameterizedTest
+ @ValueSource(booleans = {true, false})
+ public void testMultiBaseFile(boolean bulkInsertFirst) throws Exception {
+ HoodieWriteConfig config = createHoodieWriteConfig(true);
+ metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ,
config.getProps());
+ //there should only be a single filegroup, so we will verify that it is
consistent
Review Comment:
```suggestion
// there should only be a single filegroup, so we will verify that it is consistent
```
##########
hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestSparkNonBlockingConcurrencyControl.java:
##########
@@ -206,6 +213,98 @@ public void
testNonBlockingConcurrencyControlWithInflightInstant() throws Except
checkWrittenData(result, 1);
}
+ //Prove that multiple writers will only produce base files for bulk insert
+ @ParameterizedTest
+ @ValueSource(booleans = {true, false})
+ public void testMultiBaseFile(boolean bulkInsertFirst) throws Exception {
+ HoodieWriteConfig config = createHoodieWriteConfig(true);
+ metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ,
config.getProps());
+ //there should only be a single filegroup, so we will verify that it is
consistent
+ String fileID = null;
+
+ // if there is not a bulk insert first, then we will write to log files
for a filegroup
+ // without a base file. Having a base file adds the possibility of small
file handling
+ // which we want to ensure doesn't happen.
+ if (bulkInsertFirst) {
+ SparkRDDWriteClient client0 = getHoodieWriteClient(config);
+ List<String> dataset0 = Collections.singletonList("id0,Danny,0,0,par1");
+ String insertTime0 = client0.createNewInstantTime();
+ List<WriteStatus> writeStatuses0 = writeData(client0, insertTime0,
dataset0, false, WriteOperationType.BULK_INSERT, true);
+ client0.commitStats(
+ insertTime0,
+ context().parallelize(writeStatuses0, 1),
+
writeStatuses0.stream().map(WriteStatus::getStat).collect(Collectors.toList()),
+ Option.empty(),
+ metaClient.getCommitActionType());
+ for (WriteStatus status : writeStatuses0) {
+ if (fileID == null) {
+ fileID = status.getFileId();
+ } else {
+ assertEquals(fileID, status.getFileId());
+ }
+
assertFalse(status.getStat().getPath().contains(HoodieFileFormat.HOODIE_LOG.getFileExtension()));
+ }
+ client0.close();
+ }
+
+ SparkRDDWriteClient client1 = getHoodieWriteClient(config);
+ List<String> dataset1 = Collections.singletonList("id1,Danny,22,1,par1");
+ String insertTime1 = client1.createNewInstantTime();
+ List<WriteStatus> writeStatuses1 = writeData(client1, insertTime1,
dataset1, false, WriteOperationType.INSERT, true);
+ for (WriteStatus status : writeStatuses1) {
+ if (fileID == null) {
+ fileID = status.getFileId();
+ } else {
+ assertEquals(fileID, status.getFileId());
+ }
+
assertTrue(status.getStat().getPath().contains(HoodieFileFormat.HOODIE_LOG.getFileExtension()));
Review Comment:
Same here: use `FSUtils.isLogFile` instead of checking the path for the log file extension.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]