Jackie-Jiang commented on code in PR #17264:
URL: https://github.com/apache/pinot/pull/17264#discussion_r2566688579
##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/CrcUtils.java:
##########
@@ -38,47 +39,80 @@ public class CrcUtils {
private static final Logger LOGGER = LoggerFactory.getLogger(CrcUtils.class);
private static final int BUFFER_SIZE = 65536;
private static final String CRC_FILE_EXTENSTION = ".crc";
+ private static final List<String> DATA_FILE_EXTENSIONS =
Arrays.asList(".fwd", ".dict", ".inv");
+ private static final String METADATA_FILE = "metadata.properties";
private final List<File> _files;
+ private final List<File> _dataFiles;
- private CrcUtils(List<File> files) {
+ private CrcUtils(List<File> files, List<File> dataFiles) {
_files = files;
+ _dataFiles = dataFiles;
}
public static CrcUtils forAllFilesInFolder(File dir) {
- List<File> normalFiles = new ArrayList<>();
- getAllNormalFiles(dir, normalFiles);
- Collections.sort(normalFiles);
- return new CrcUtils(normalFiles);
+ List<File> allNormalFiles = new ArrayList<>();
+ List<File> dataFiles = new ArrayList<>();
+ collectFiles(dir, allNormalFiles, dataFiles);
+ Collections.sort(allNormalFiles);
+ Collections.sort(dataFiles);
+ return new CrcUtils(allNormalFiles, dataFiles);
}
/**
- * Helper method to get all normal (non-directory) files under a directory
recursively.
+ * Helper method to get all files (normal and data files) in the directory
to later compute CRC for them.
* <p>NOTE: do not include the segment creation meta file.
*/
- private static void getAllNormalFiles(File dir, List<File> normalFiles) {
+ private static void collectFiles(File dir, List<File> normalFiles,
List<File> dataFiles) {
File[] files = dir.listFiles();
Preconditions.checkNotNull(files);
for (File file : files) {
if (file.isFile()) {
+ String fileName = file.getName();
// Certain file systems, e.g. HDFS will create .crc files when perform
data copy.
// We should ignore both SEGMENT_CREATION_META and generated '.crc'
files.
- if (!file.getName().equals(V1Constants.SEGMENT_CREATION_META) &&
!file.getName()
+ if (!fileName.equals(V1Constants.SEGMENT_CREATION_META) && !fileName
.endsWith(CRC_FILE_EXTENSTION)) {
+ // add all files to normal files
normalFiles.add(file);
+ //include data extension files and metadata file to dataFiles
+ // Conditionally add to the data-only list
+ if (isDataFile(fileName)) {
+ dataFiles.add(file);
+ }
}
} else {
- getAllNormalFiles(file, normalFiles);
+ collectFiles(file, normalFiles, dataFiles);
}
}
}
- public long computeCrc()
+ /**
+ * Determines if a file is considered a "Data File" (Metadata or one of
".fwd", ".dict", ".inv" file types).
+ */
+ private static boolean isDataFile(String fileName) {
Review Comment:
Collecting `.fwd` and `.dict` should be enough. I don't think we need
metadata and inverted index.
When forward index is disabled for any column, we shouldn't generate data
CRC because it cannot correctly represent the data.
##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentIndexCreationDriverImpl.java:
##########
@@ -681,13 +683,14 @@ public ColumnStatistics
getColumnStatisticsCollector(final String columnName)
return _segmentStats.getColumnProfileFor(columnName);
}
- public static void persistCreationMeta(File indexDir, long crc, long
creationTime)
+ public static void persistCreationMeta(File indexDir, long crc, long
dataOnlyCrc, long creationTime)
Review Comment:
(nit) Call it `dataCrc` for consistency
##########
pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/SegmentMetadataImpl.java:
##########
@@ -192,6 +193,13 @@ private void loadCreationMeta(File crcFile)
final DataInputStream ds = new DataInputStream(new
FileInputStream(crcFile));
_crc = ds.readLong();
_creationTime = ds.readLong();
+ try {
+ _dataCrc = ds.readLong();
+ } catch (IOException e) {
+ LOGGER.warn("Could not find data crc, falling back to default LONG_MIN
value");
Review Comment:
Don't log here. For all existing segments, this is common case. Same for
other places
##########
pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/SegmentMetadataImpl.java:
##########
@@ -467,6 +485,7 @@ public JsonNode toJson(@Nullable Set<String> columnFilter) {
segmentMetadata.put("segmentName", _segmentName);
segmentMetadata.put("schemaName", _schema != null ?
_schema.getSchemaName() : null);
segmentMetadata.put("crc", _crc);
+ segmentMetadata.put("dataCrc", _dataCrc);
Review Comment:
Include it only when available
##########
pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/realtime/PinotLLCRealtimeSegmentManager.java:
##########
@@ -2137,6 +2138,7 @@ private void handleLLCUpload(String segmentName, String
rawTableName, SegmentZKM
TableLLCSegmentUploadResponse response, PinotFS pinotFS)
throws Exception {
long currentCrc = currentMetadata.getCrc();
+
Review Comment:
How about data crc?
##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/CrcUtils.java:
##########
@@ -38,47 +39,80 @@ public class CrcUtils {
private static final Logger LOGGER = LoggerFactory.getLogger(CrcUtils.class);
private static final int BUFFER_SIZE = 65536;
private static final String CRC_FILE_EXTENSTION = ".crc";
+ private static final List<String> DATA_FILE_EXTENSIONS =
Arrays.asList(".fwd", ".dict", ".inv");
+ private static final String METADATA_FILE = "metadata.properties";
private final List<File> _files;
+ private final List<File> _dataFiles;
- private CrcUtils(List<File> files) {
+ private CrcUtils(List<File> files, List<File> dataFiles) {
_files = files;
+ _dataFiles = dataFiles;
}
public static CrcUtils forAllFilesInFolder(File dir) {
- List<File> normalFiles = new ArrayList<>();
- getAllNormalFiles(dir, normalFiles);
- Collections.sort(normalFiles);
- return new CrcUtils(normalFiles);
+ List<File> allNormalFiles = new ArrayList<>();
+ List<File> dataFiles = new ArrayList<>();
+ collectFiles(dir, allNormalFiles, dataFiles);
+ Collections.sort(allNormalFiles);
+ Collections.sort(dataFiles);
+ return new CrcUtils(allNormalFiles, dataFiles);
}
/**
- * Helper method to get all normal (non-directory) files under a directory
recursively.
+ * Helper method to get all files (normal and data files) in the directory
to later compute CRC for them.
* <p>NOTE: do not include the segment creation meta file.
*/
- private static void getAllNormalFiles(File dir, List<File> normalFiles) {
+ private static void collectFiles(File dir, List<File> normalFiles,
List<File> dataFiles) {
File[] files = dir.listFiles();
Preconditions.checkNotNull(files);
for (File file : files) {
if (file.isFile()) {
+ String fileName = file.getName();
// Certain file systems, e.g. HDFS will create .crc files when perform
data copy.
// We should ignore both SEGMENT_CREATION_META and generated '.crc'
files.
- if (!file.getName().equals(V1Constants.SEGMENT_CREATION_META) &&
!file.getName()
+ if (!fileName.equals(V1Constants.SEGMENT_CREATION_META) && !fileName
.endsWith(CRC_FILE_EXTENSTION)) {
+ // add all files to normal files
normalFiles.add(file);
+ //include data extension files and metadata file to dataFiles
+ // Conditionally add to the data-only list
+ if (isDataFile(fileName)) {
+ dataFiles.add(file);
+ }
}
} else {
- getAllNormalFiles(file, normalFiles);
+ collectFiles(file, normalFiles, dataFiles);
}
}
}
- public long computeCrc()
+ /**
+ * Determines if a file is considered a "Data File" (Metadata or one of
".fwd", ".dict", ".inv" file types).
+ */
+ private static boolean isDataFile(String fileName) {
+ if (fileName.equals(METADATA_FILE)) {
+ return true;
+ }
+ for (String ext : DATA_FILE_EXTENSIONS) {
+ if (fileName.endsWith(ext)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public long computeCrc(boolean dataOnly)
Review Comment:
Let's separate it into 2 static util methods: `computeDataOnlyCrc()` and
`computeCrc()` for clarity
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]