prashantwason commented on code in PR #8758:
URL: https://github.com/apache/hudi/pull/8758#discussion_r1229171599
##########
hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java:
##########
@@ -1378,6 +1340,254 @@ public static Set<String>
getInflightAndCompletedMetadataPartitions(HoodieTableC
*/
public static boolean isIndexingCommit(String instantTime) {
return instantTime.length() == MILLIS_INSTANT_ID_LENGTH +
METADATA_INDEXER_TIME_SUFFIX.length()
- && instantTime.endsWith(METADATA_INDEXER_TIME_SUFFIX);
+ && instantTime.endsWith(METADATA_INDEXER_TIME_SUFFIX);
+ }
+
+ /**
+ * Delete the metadata table for the dataset and backup if required.
+ *
+ * @param dataMetaClient {@code HoodieTableMetaClient} of the dataset for
which metadata table is to be deleted
+ * @param context instance of {@link HoodieEngineContext}.
+ * @param backup Whether metadata table should be backed up before
deletion. If true, the table is backed up to the
+ * directory with name .metadata_<current_timestamp>.
+ * @return The backup directory if backup was requested and the rename succeeded, null otherwise
+ */
+ public static String deleteMetadataTable(HoodieTableMetaClient
dataMetaClient, HoodieEngineContext context, boolean backup) {
+ final Path metadataTablePath =
HoodieTableMetadata.getMetadataTableBasePath(dataMetaClient.getBasePathV2());
+ FileSystem fs = FSUtils.getFs(metadataTablePath.toString(),
context.getHadoopConf().get());
+ dataMetaClient.getTableConfig().setMetadataPartitionState(dataMetaClient,
MetadataPartitionType.FILES, false);
+ try {
+ if (!fs.exists(metadataTablePath)) {
+ return null;
+ }
+ } catch (FileNotFoundException e) {
+ // Ignoring exception as metadata table already does not exist
+ return null;
+ } catch (IOException e) {
+ throw new HoodieMetadataException("Failed to check metadata table
existence", e);
+ }
+
+ if (backup) {
+ final Path metadataBackupPath = new Path(metadataTablePath.getParent(),
".metadata_" + HoodieActiveTimeline.createNewInstantTime());
+ LOG.info("Backing up metadata directory to " + metadataBackupPath + "
before deletion");
+ try {
+ if (fs.rename(metadataTablePath, metadataBackupPath)) {
+ return metadataBackupPath.toString();
+ }
+ } catch (Exception e) {
+ // If rename fails, we will ignore the backup and still delete the MDT
+ LOG.error("Failed to backup metadata table using rename", e);
+ }
+ }
+
+ LOG.info("Deleting metadata table from " + metadataTablePath);
+ try {
+ fs.delete(metadataTablePath, true);
+ } catch (Exception e) {
+ throw new HoodieMetadataException("Failed to delete metadata table from
path " + metadataTablePath, e);
+ }
+
+ return null;
+ }
+
+ /**
+ * Delete a partition within the metadata table.
+ * <p>
+ * This can be used to delete a partition so that it can be re-bootstrapped.
+ *
+ * @param dataMetaClient {@code HoodieTableMetaClient} of the dataset for
which the metadata table partition is to be deleted
+ * @param context instance of {@code HoodieEngineContext}.
+ * @param backup Whether the partition should be backed up before
deletion. If true, the partition is backed up to the
+ * directory with name .metadata_<partition_path>_<current_timestamp>.
+ * @param partitionType The partition to delete
+ * @return The backup directory if backup was requested, null otherwise
+ */
+ public static String deleteMetadataTablePartition(HoodieTableMetaClient
dataMetaClient, HoodieEngineContext context,
+ MetadataPartitionType
partitionType, boolean backup) {
+ if (partitionType.equals(MetadataPartitionType.FILES)) {
+ return deleteMetadataTable(dataMetaClient, context, backup);
+ }
+
+ final Path metadataTablePartitionPath = new
Path(HoodieTableMetadata.getMetadataTableBasePath(dataMetaClient.getBasePath()),
partitionType.getPartitionPath());
+ FileSystem fs = FSUtils.getFs(metadataTablePartitionPath.toString(),
context.getHadoopConf().get());
+ dataMetaClient.getTableConfig().setMetadataPartitionState(dataMetaClient,
partitionType, false);
+ try {
+ if (!fs.exists(metadataTablePartitionPath)) {
+ return null;
+ }
+ } catch (FileNotFoundException e) {
+ // Ignoring exception as the metadata table partition already does not exist
+ LOG.debug("Metadata table partition " + partitionType + " not found at
path " + metadataTablePartitionPath);
+ return null;
+ } catch (Exception e) {
+ throw new HoodieMetadataException(String.format("Failed to check
existence of MDT partition %s at path %s: ", partitionType,
metadataTablePartitionPath), e);
+ }
+
+ if (backup) {
+ final Path metadataPartitionBackupPath = new
Path(metadataTablePartitionPath.getParent().getParent(),
+ String.format(".metadata_%s_%s",
partitionType.getPartitionPath(), HoodieActiveTimeline.createNewInstantTime()));
+ LOG.info(String.format("Backing up MDT partition %s to %s before
deletion", partitionType, metadataPartitionBackupPath));
+ try {
+ if (fs.rename(metadataTablePartitionPath,
metadataPartitionBackupPath)) {
+ return metadataPartitionBackupPath.toString();
+ }
+ } catch (Exception e) {
+ // If rename fails, the error is only logged. Deletion happens only in the
else branch below, so the partition is left in place and null is returned
+ LOG.error(String.format("Failed to backup MDT partition %s using
rename", partitionType), e);
+ }
+ } else {
+ LOG.info("Deleting metadata table partition from " +
metadataTablePartitionPath);
+ try {
+ fs.delete(metadataTablePartitionPath, true);
+ } catch (Exception e) {
+ throw new HoodieMetadataException("Failed to delete metadata table
partition from path " + metadataTablePartitionPath, e);
+ }
+ }
+
+ return null;
+ }
+
+ /**
+ * Return the complete fileID for a file group within a MDT partition.
+ * <p>
+ * MDT fileGroups have the format <fileIDPrefix><index> (e.g. files-0000). The
fileIDPrefix is hardcoded for each MDT partition and index is an integer zero-padded to at least 4 digits.
+ *
+ * @param partitionType The type of the MDT partition
+ * @param index Index of the file group within the partition
+ * @return The fileID
+ */
+ public static String getFileIDForFileGroup(MetadataPartitionType
partitionType, int index) {
+ return String.format("%s%04d", partitionType.getFileIdPrefix(), index);
+ }
+
+ /**
+ * Extract the index from the fileID of a file group in the MDT partition.
See {@code getFileIDForFileGroup} for the format of the fileID.
+ *
+ * @param fileId fileID of a file group.
+ * @return The index of file group
+ */
+ public static int getFileGroupIndexFromFileId(String fileId) {
+ final int endIndex = getFileIdLengthWithoutFileIndex(fileId);
+ final int fromIndex = fileId.lastIndexOf("-", endIndex - 1);
+ return Integer.parseInt(fileId.substring(fromIndex + 1, endIndex));
+ }
+
+ /**
+ * Extract the fileID prefix from the fileID of a file group in the MDT
partition. See {@code getFileIDForFileGroup} for the format of the fileID.
+ *
+ * @param fileId fileID of a file group.
+ * @return The fileID without the file index
+ */
+ public static String getFileGroupPrefix(String fileId) {
+ return fileId.substring(0, getFileIdLengthWithoutFileIndex(fileId));
+ }
+
+ /**
+ * Returns the length of the fileID ignoring the fileIndex suffix
+ * <p>
+ * 0.10 version MDT code added -0 (0th fileIndex) to the fileID. This was
removed later.
+ * <p>
+ * Examples:
+ * 0.11+ version: fileID: files-0000 returns 10
+ * 0.10 version: fileID: files-0000-0 returns 10
+ *
+ * @param fileId The fileID
+ * @return The length of the fileID ignoring the fileIndex suffix
+ */
+ private static int getFileIdLengthWithoutFileIndex(String fileId) {
+ return fileId.endsWith("-0") ? fileId.length() - 2 : fileId.length();
+ }
+
+ /**
+ * Create the timestamp for a clean operation on the metadata table.
+ */
+ public static String createCleanTimestamp(String timestamp) {
+ return timestamp + CLEAN_TIMESTAMP_SUFFIX;
+ }
+
+ /**
+ * Create the timestamp for a compaction operation on the metadata table.
+ */
+ public static String createCompactionTimestamp(String timestamp) {
+ return timestamp + COMPACTION_TIMESTAMP_SUFFIX;
+ }
+
+ /**
+ * Create the timestamp for an index initialization operation on the
metadata table.
+ * <p>
+ * Since many MDT partitions can be initialized one after another, the offset
parameter is used to generate a
+ * unique timestamp for each of them.
+ */
+ public static String createIndexInitTimestamp(String timestamp, int offset) {
+ return String.format("%s%03d", timestamp,
PARTITION_INITIALIZATION_TIME_SUFFIX + offset);
+ }
+
+ /**
+ * Estimates the file group count to use for a MDT partition.
+ *
+ * @param partitionType Type of the partition for which the file
group count is to be estimated.
+ * @param recordCount The number of records expected to be written.
+ * @param averageRecordSize Average size of each record to be written.
+ * @param minFileGroupCount Minimum number of file groups to use.
+ * @param maxFileGroupCount Maximum number of file groups to use.
+ * @param growthFactor By what factor are the records (recordCount)
expected to grow?
+ * @param maxFileGroupSizeBytes Maximum size of the file group.
+ * @return The estimated number of file groups.
+ */
+ public static int estimateFileGroupCount(MetadataPartitionType
partitionType, long recordCount, int averageRecordSize, int minFileGroupCount,
+ int maxFileGroupCount, float growthFactor, int maxFileGroupSizeBytes) {
+ int fileGroupCount;
+
+ // If a fixed number of file groups are desired
+ if ((minFileGroupCount == maxFileGroupCount) && (minFileGroupCount != 0)) {
+ fileGroupCount = minFileGroupCount;
+ } else {
+ // Number of records to estimate for
+ final long expectedNumRecords = (long) Math.ceil((float) recordCount *
growthFactor);
+ // Maximum records that should be written to each file group so that it
does not go over the size limit required
+ final long maxRecordsPerFileGroup = maxFileGroupSizeBytes /
Math.max(averageRecordSize, 1L);
+ final long estimatedFileGroupCount = expectedNumRecords /
maxRecordsPerFileGroup;
+
+ if (estimatedFileGroupCount >= maxFileGroupCount) {
+ fileGroupCount = maxFileGroupCount;
+ } else if (estimatedFileGroupCount <= minFileGroupCount) {
+ fileGroupCount = minFileGroupCount;
+ } else {
+ fileGroupCount = Math.max(1, (int) estimatedFileGroupCount);
+ }
+ }
+
+ LOG.info(String.format("Estimated file group count for MDT partition %s is
%d "
+ + "[recordCount=%d, avgRecordSize=%d, minFileGroupCount=%d,
maxFileGroupCount=%d, growthFactor=%f, "
+ + "maxFileGroupSizeBytes=%d]", partitionType, fileGroupCount,
recordCount, averageRecordSize, minFileGroupCount,
+ maxFileGroupCount, growthFactor, maxFileGroupSizeBytes));
+ return fileGroupCount;
+ }
+
+ /**
+ * Returns true if any enabled metadata partition in the given hoodie table
requires WriteStatus to track the written records.
+ *
+ * @param config MDT config
+ * @param metaClient {@code HoodieTableMetaClient} of the data table
+ * @return true if WriteStatus should track the written records else false.
+ */
+ public static boolean needsWriteStatusTracking(HoodieMetadataConfig config,
HoodieTableMetaClient metaClient) {
Review Comment:
Renamed
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]