anuragrai16 commented on code in PR #17264:
URL: https://github.com/apache/pinot/pull/17264#discussion_r2567378681
##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/CrcUtils.java:
##########
@@ -38,47 +39,80 @@ public class CrcUtils {
private static final Logger LOGGER = LoggerFactory.getLogger(CrcUtils.class);
private static final int BUFFER_SIZE = 65536;
private static final String CRC_FILE_EXTENSTION = ".crc";
+ private static final List<String> DATA_FILE_EXTENSIONS =
Arrays.asList(".fwd", ".dict", ".inv");
+ private static final String METADATA_FILE = "metadata.properties";
private final List<File> _files;
+ private final List<File> _dataFiles;
- private CrcUtils(List<File> files) {
+ private CrcUtils(List<File> files, List<File> dataFiles) {
_files = files;
+ _dataFiles = dataFiles;
}
public static CrcUtils forAllFilesInFolder(File dir) {
- List<File> normalFiles = new ArrayList<>();
- getAllNormalFiles(dir, normalFiles);
- Collections.sort(normalFiles);
- return new CrcUtils(normalFiles);
+ List<File> allNormalFiles = new ArrayList<>();
+ List<File> dataFiles = new ArrayList<>();
+ collectFiles(dir, allNormalFiles, dataFiles);
+ Collections.sort(allNormalFiles);
+ Collections.sort(dataFiles);
+ return new CrcUtils(allNormalFiles, dataFiles);
}
/**
- * Helper method to get all normal (non-directory) files under a directory
recursively.
+ * Helper method to get all files (normal and data files) in the directory
to later compute CRC for them.
* <p>NOTE: do not include the segment creation meta file.
*/
- private static void getAllNormalFiles(File dir, List<File> normalFiles) {
+ private static void collectFiles(File dir, List<File> normalFiles,
List<File> dataFiles) {
File[] files = dir.listFiles();
Preconditions.checkNotNull(files);
for (File file : files) {
if (file.isFile()) {
+ String fileName = file.getName();
// Certain file systems, e.g. HDFS will create .crc files when perform
data copy.
// We should ignore both SEGMENT_CREATION_META and generated '.crc'
files.
- if (!file.getName().equals(V1Constants.SEGMENT_CREATION_META) &&
!file.getName()
+ if (!fileName.equals(V1Constants.SEGMENT_CREATION_META) && !fileName
.endsWith(CRC_FILE_EXTENSTION)) {
+ // add all files to normal files
normalFiles.add(file);
+ //include data extension files and metadata file to dataFiles
+ // Conditionally add to the data-only list
+ if (isDataFile(fileName)) {
+ dataFiles.add(file);
+ }
}
} else {
- getAllNormalFiles(file, normalFiles);
+ collectFiles(file, normalFiles, dataFiles);
}
}
}
- public long computeCrc()
+ /**
+ * Determines if a file is considered a "Data File" (Metadata or one of
".fwd", ".dict", ".inv" file types).
+ */
+ private static boolean isDataFile(String fileName) {
Review Comment:
Agree with not including metadata.properties file, it's not needed. But the
inverted index is indeed needed. When forward index is disabled for a column,
we mandate adding both dictionary and inverted index for the column. The data
for a column with forward-index disabled can be appropriately represented only
using both the dictionary and the inverted index. Removing the inverted index
file would mean we can no longer capture data changes for a column with forward
index disabled. eg. Think of a case where two segments have the same data and
differs only by one value in a column that has forward index disabled. The
dictionary created for both segments is same (unique values are same) - only
the inverted index is disabled.
Lmk if you disagree.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]