nsivabalan commented on code in PR #9367: URL: https://github.com/apache/hudi/pull/9367#discussion_r1284898855
########## hudi-common/src/main/java/org/apache/hudi/common/util/ExternalFilePathUtil.java: ########## @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util; + +public class ExternalFilePathUtil { + private static final String EXTERNAL_FILE_PREFIX = "hudiext_"; + + public static String prefixExternalFileWithCommitTime(String fileName, String commitTime) { Review Comment: naming: convertExternalFileToHudiFilePath ########## hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java: ########## @@ -324,33 +325,45 @@ public static HoodieRecord<HoodieMetadataPayload> createPartitionListRecord(List * @param partition The name of the partition * @param filesAdded Mapping of files to their sizes for files which have been added to this partition * @param filesDeleted List of files which have been deleted from this partition + * @param instantTime Commit time of the commit responsible for adding and/or deleting these files, will be empty during bootstrapping of the metadata table */ public static HoodieRecord<HoodieMetadataPayload> createPartitionFilesRecord(String partition, - Option<Map<String, Long>> filesAdded, - Option<List<String>> filesDeleted) { - Map<String, HoodieMetadataFileInfo> fileInfo = new HashMap<>(); - filesAdded.ifPresent(filesMap -> - fileInfo.putAll( - filesMap.entrySet().stream().collect( - Collectors.toMap(Map.Entry::getKey, (entry) -> { - long fileSize = entry.getValue(); - // Assert that the file-size of the file being added is positive, since Hudi - // should not be creating empty files - checkState(fileSize > 0); - return new HoodieMetadataFileInfo(fileSize, false); - }))) - ); - filesDeleted.ifPresent(filesList -> - fileInfo.putAll( - filesList.stream().collect( - Collectors.toMap(Function.identity(), (ignored) -> new HoodieMetadataFileInfo(0L, true)))) - ); + Map<String, Long> filesAdded, + List<String> filesDeleted, + Option<String> instantTime) { + int size = filesAdded.size() + filesDeleted.size(); + Map<String, HoodieMetadataFileInfo> fileInfo = new HashMap<>(size, 1); + filesAdded.forEach((fileName, fileSize) -> { + // Assert that the file-size of the file being added is positive, since Hudi + // should not be creating empty files + checkState(fileSize > 0); + fileInfo.put(handleFileName(fileName, instantTime), new HoodieMetadataFileInfo(fileSize, false)); + }); + + filesDeleted.forEach(fileName -> fileInfo.put(handleFileName(fileName, instantTime), DELETE_FILE_METADATA)); HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.getPartitionPath()); HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_FILE_LIST, fileInfo); return new HoodieAvroRecord<>(key, payload); } + /** + * In the case where a file was created by something other than a Hudi writer, the file name will not contain the commit time. We will prefix the file name with hudiext_[commitTime] before storing Review Comment: lets try to use the same terminology everywhere. if we standardize on "externally created files", lets try to use the same phrase everywhere to avoid confusion ########## hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java: ########## @@ -312,6 +312,13 @@ public final class HoodieMetadataConfig extends HoodieConfig { .withDocumentation("Maximum size in bytes of a single log file. Larger log files can contain larger log blocks " + "thereby reducing the number of blocks to search for keys"); + public static final ConfigProperty<Boolean> DISABLE_FILESYSTEM_BOOTSTRAP = ConfigProperty + .key(METADATA_PREFIX + ".filesystem.bootstrap.disabled") + .defaultValue(false) + .sinceVersion("0.14.0") + .withDocumentation("Disable bootstrapping metadata table from the file system when the table is first created. " Review Comment: @yihua : is our config generation skips configs with "_" automatically ? ########## hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java: ########## @@ -177,6 +177,7 @@ public class HoodieMetadataPayload implements HoodieRecordPayload<HoodieMetadata * You can find more details in HUDI-3834. */ private static final Lazy<HoodieMetadataColumnStats.Builder> METADATA_COLUMN_STATS_BUILDER_STUB = Lazy.lazily(HoodieMetadataColumnStats::newBuilder); + private static final HoodieMetadataFileInfo DELETE_FILE_METADATA = new HoodieMetadataFileInfo(0L, true); Review Comment: 👍 ########## hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java: ########## @@ -324,33 +325,45 @@ public static HoodieRecord<HoodieMetadataPayload> createPartitionListRecord(List * @param partition The name of the partition * @param filesAdded Mapping of files to their sizes for files which have been added to this partition * @param filesDeleted List of files which have been deleted from this partition + * @param instantTime Commit time of the commit responsible for adding and/or deleting these files, will be empty during bootstrapping of the metadata table */ public static HoodieRecord<HoodieMetadataPayload> createPartitionFilesRecord(String partition, - Option<Map<String, Long>> filesAdded, - Option<List<String>> filesDeleted) { - Map<String, HoodieMetadataFileInfo> fileInfo = new HashMap<>(); - filesAdded.ifPresent(filesMap -> - fileInfo.putAll( - filesMap.entrySet().stream().collect( - Collectors.toMap(Map.Entry::getKey, (entry) -> { - long fileSize = entry.getValue(); - // Assert that the file-size of the file being added is positive, since Hudi - // should not be creating empty files - checkState(fileSize > 0); - return new HoodieMetadataFileInfo(fileSize, false); - }))) - ); - filesDeleted.ifPresent(filesList -> - fileInfo.putAll( - filesList.stream().collect( - Collectors.toMap(Function.identity(), (ignored) -> new HoodieMetadataFileInfo(0L, true)))) - ); + Map<String, Long> filesAdded, + List<String> filesDeleted, + Option<String> instantTime) { + int size = filesAdded.size() + filesDeleted.size(); + Map<String, HoodieMetadataFileInfo> fileInfo = new HashMap<>(size, 1); + filesAdded.forEach((fileName, fileSize) -> { + // Assert that the file-size of the file being added is positive, since Hudi + // should not be creating empty files + checkState(fileSize > 0); + fileInfo.put(handleFileName(fileName, instantTime), new HoodieMetadataFileInfo(fileSize, false)); + }); + + filesDeleted.forEach(fileName -> fileInfo.put(handleFileName(fileName, instantTime), DELETE_FILE_METADATA)); HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.getPartitionPath()); HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_FILE_LIST, fileInfo); return new HoodieAvroRecord<>(key, payload); } + /** + * In the case where a file was created by something other than a Hudi writer, the file name will not contain the commit time. We will prefix the file name with hudiext_[commitTime] before storing + * in the metadata table. The constructor for {@link org.apache.hudi.common.model.HoodieBaseFile} will properly handle this prefix. + * @param fileName incoming file name + * @param commitTime time of the commit (will be empty during bootstrap operations) + * @return file name with commit time prefix if the input file name does not contain the commit time, otherwise returns the original input + */ + private static String handleFileName(String fileName, Option<String> commitTime) { + return commitTime.map(commit -> { + if (fileName.contains(commit) || FSUtils.isLogFile(fileName)) { Review Comment: can't we avoid this. if it possible to write in an arugment from the caller only. Just looking to avoid these parsing logic for every file. if possible ########## hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java: ########## @@ -177,6 +177,7 @@ public class HoodieMetadataPayload implements HoodieRecordPayload<HoodieMetadata * You can find more details in HUDI-3834. */ private static final Lazy<HoodieMetadataColumnStats.Builder> METADATA_COLUMN_STATS_BUILDER_STUB = Lazy.lazily(HoodieMetadataColumnStats::newBuilder); + private static final HoodieMetadataFileInfo DELETE_FILE_METADATA = new HoodieMetadataFileInfo(0L, true); Review Comment: 👍 ########## hudi-common/src/main/java/org/apache/hudi/common/model/HoodieBaseFile.java: ########## @@ -61,16 +62,39 @@ public HoodieBaseFile(String filePath) { public HoodieBaseFile(String filePath, BaseFile bootstrapBaseFile) { super(filePath); this.bootstrapBaseFile = Option.ofNullable(bootstrapBaseFile); - String[] fileIdAndCommitTime = getFileIdAndCommitTimeFromFileName(); + String[] fileIdAndCommitTime = getFileIdAndCommitTimeFromFileName(getFileName()); this.fileId = fileIdAndCommitTime[0]; this.commitTime = fileIdAndCommitTime[1]; } + public HoodieBaseFile(String filePath, String fileId, String commitTime, BaseFile bootstrapBaseFile) { + super(filePath); + this.bootstrapBaseFile = Option.ofNullable(bootstrapBaseFile); + this.fileId = fileId; + this.commitTime = commitTime; + } + + private HoodieBaseFile(FileStatus fileStatus, String[] fileIdAndCommitTime, BaseFile bootstrapBaseFile) { + this(fileStatus, fileIdAndCommitTime[0], fileIdAndCommitTime[1], bootstrapBaseFile); + } + + public HoodieBaseFile(FileStatus fileStatus, String fileId, String commitTime, BaseFile bootstrapBaseFile) { + super(handleExternallyGeneratedFileName(fileStatus, fileId)); Review Comment: so, this constructor should be used only for externally created files right. should we add validation on this? I don't see we do any pattern matching in handleExternallyGeneratedFileName. So, it could silently fail. ########## hudi-common/src/main/java/org/apache/hudi/common/util/ExternalFilePathUtil.java: ########## @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util; + +public class ExternalFilePathUtil { Review Comment: java docs ########## hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java: ########## @@ -324,33 +325,45 @@ public static HoodieRecord<HoodieMetadataPayload> createPartitionListRecord(List * @param partition The name of the partition * @param filesAdded Mapping of files to their sizes for files which have been added to this partition * @param filesDeleted List of files which have been deleted from this partition + * @param instantTime Commit time of the commit responsible for adding and/or deleting these files, will be empty during bootstrapping of the metadata table */ public static HoodieRecord<HoodieMetadataPayload> createPartitionFilesRecord(String partition, - Option<Map<String, Long>> filesAdded, - Option<List<String>> filesDeleted) { - Map<String, HoodieMetadataFileInfo> fileInfo = new HashMap<>(); - filesAdded.ifPresent(filesMap -> - fileInfo.putAll( - filesMap.entrySet().stream().collect( - Collectors.toMap(Map.Entry::getKey, (entry) -> { - long fileSize = entry.getValue(); - // Assert that the file-size of the file being added is positive, since Hudi - // should not be creating empty files - checkState(fileSize > 0); - return new HoodieMetadataFileInfo(fileSize, false); - }))) - ); - filesDeleted.ifPresent(filesList -> - fileInfo.putAll( - filesList.stream().collect( - Collectors.toMap(Function.identity(), (ignored) -> new HoodieMetadataFileInfo(0L, true)))) - ); + Map<String, Long> filesAdded, + List<String> filesDeleted, + Option<String> instantTime) { + int size = filesAdded.size() + filesDeleted.size(); + Map<String, HoodieMetadataFileInfo> fileInfo = new HashMap<>(size, 1); + filesAdded.forEach((fileName, fileSize) -> { + // Assert that the file-size of the file being added is positive, since Hudi + // should not be creating empty files + checkState(fileSize > 0); + fileInfo.put(handleFileName(fileName, instantTime), new HoodieMetadataFileInfo(fileSize, false)); + }); + + filesDeleted.forEach(fileName -> fileInfo.put(handleFileName(fileName, instantTime), DELETE_FILE_METADATA)); HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.getPartitionPath()); HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_FILE_LIST, fileInfo); return new HoodieAvroRecord<>(key, payload); } + /** + * In the case where a file was created by something other than a Hudi writer, the file name will not contain the commit time. We will prefix the file name with hudiext_[commitTime] before storing + * in the metadata table. The constructor for {@link org.apache.hudi.common.model.HoodieBaseFile} will properly handle this prefix. + * @param fileName incoming file name + * @param commitTime time of the commit (will be empty during bootstrap operations) + * @return file name with commit time prefix if the input file name does not contain the commit time, otherwise returns the original input + */ + private static String handleFileName(String fileName, Option<String> commitTime) { Review Comment: So, we wanted to support the use-case where we start as external sync and later move to writing directly via hudi? and so we did not want to introduce a write config or table property to deduce the file paths/name ? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
