Jackie-Jiang commented on code in PR #17264:
URL: https://github.com/apache/pinot/pull/17264#discussion_r2566688579


##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/CrcUtils.java:
##########
@@ -38,47 +39,80 @@ public class CrcUtils {
   private static final Logger LOGGER = LoggerFactory.getLogger(CrcUtils.class);
   private static final int BUFFER_SIZE = 65536;
   private static final String CRC_FILE_EXTENSTION = ".crc";
+  private static final List<String> DATA_FILE_EXTENSIONS = 
Arrays.asList(".fwd", ".dict", ".inv");
+  private static final String METADATA_FILE = "metadata.properties";
 
   private final List<File> _files;
+  private final List<File> _dataFiles;
 
-  private CrcUtils(List<File> files) {
+  private CrcUtils(List<File> files, List<File> dataFiles) {
     _files = files;
+    _dataFiles = dataFiles;
   }
 
   public static CrcUtils forAllFilesInFolder(File dir) {
-    List<File> normalFiles = new ArrayList<>();
-    getAllNormalFiles(dir, normalFiles);
-    Collections.sort(normalFiles);
-    return new CrcUtils(normalFiles);
+    List<File> allNormalFiles = new ArrayList<>();
+    List<File> dataFiles = new ArrayList<>();
+    collectFiles(dir, allNormalFiles, dataFiles);
+    Collections.sort(allNormalFiles);
+    Collections.sort(dataFiles);
+    return new CrcUtils(allNormalFiles, dataFiles);
   }
 
   /**
-   * Helper method to get all normal (non-directory) files under a directory 
recursively.
+   * Helper method to get all files (normal and data files) in the directory 
to later compute CRC for them.
    * <p>NOTE: do not include the segment creation meta file.
    */
-  private static void getAllNormalFiles(File dir, List<File> normalFiles) {
+  private static void collectFiles(File dir, List<File> normalFiles, 
List<File> dataFiles) {
     File[] files = dir.listFiles();
     Preconditions.checkNotNull(files);
     for (File file : files) {
       if (file.isFile()) {
+        String fileName = file.getName();
         // Certain file systems, e.g. HDFS will create .crc files when perform 
data copy.
         // We should ignore both SEGMENT_CREATION_META and generated '.crc' 
files.
-        if (!file.getName().equals(V1Constants.SEGMENT_CREATION_META) && 
!file.getName()
+        if (!fileName.equals(V1Constants.SEGMENT_CREATION_META) && !fileName
             .endsWith(CRC_FILE_EXTENSTION)) {
+          // add all files to normal files
           normalFiles.add(file);
+          //include data extension files and metadata file to dataFiles
+          // Conditionally add to the data-only list
+          if (isDataFile(fileName)) {
+            dataFiles.add(file);
+          }
         }
       } else {
-        getAllNormalFiles(file, normalFiles);
+        collectFiles(file, normalFiles, dataFiles);
       }
     }
   }
 
-  public long computeCrc()
+  /**
+   * Determines if a file is considered a "Data File" (Metadata or one of 
".fwd", ".dict", ".inv" file types).
+   */
+  private static boolean isDataFile(String fileName) {

Review Comment:
   Collecting `.fwd` and `.dict` should be enough. I don't think we need 
metadata and inverted index.
   When forward index is disabled for any column, we shouldn't generate data 
CRC because it cannot correctly represent the data.



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentIndexCreationDriverImpl.java:
##########
@@ -681,13 +683,14 @@ public ColumnStatistics 
getColumnStatisticsCollector(final String columnName)
     return _segmentStats.getColumnProfileFor(columnName);
   }
 
-  public static void persistCreationMeta(File indexDir, long crc, long 
creationTime)
+  public static void persistCreationMeta(File indexDir, long crc, long 
dataOnlyCrc, long creationTime)

Review Comment:
   (nit) Call it `dataCrc` for consistency



##########
pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/SegmentMetadataImpl.java:
##########
@@ -192,6 +193,13 @@ private void loadCreationMeta(File crcFile)
       final DataInputStream ds = new DataInputStream(new 
FileInputStream(crcFile));
       _crc = ds.readLong();
       _creationTime = ds.readLong();
+      try {
+        _dataCrc = ds.readLong();
+      } catch (IOException e) {
+        LOGGER.warn("Could not find data crc, falling back to default LONG_MIN 
value");

Review Comment:
   Don't log here. For all existing segments, this is common case. Same for 
other places



##########
pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/SegmentMetadataImpl.java:
##########
@@ -467,6 +485,7 @@ public JsonNode toJson(@Nullable Set<String> columnFilter) {
     segmentMetadata.put("segmentName", _segmentName);
     segmentMetadata.put("schemaName", _schema != null ? 
_schema.getSchemaName() : null);
     segmentMetadata.put("crc", _crc);
+    segmentMetadata.put("dataCrc", _dataCrc);

Review Comment:
   Include it only when available



##########
pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/realtime/PinotLLCRealtimeSegmentManager.java:
##########
@@ -2137,6 +2138,7 @@ private void handleLLCUpload(String segmentName, String 
rawTableName, SegmentZKM
       TableLLCSegmentUploadResponse response, PinotFS pinotFS)
       throws Exception {
     long currentCrc = currentMetadata.getCrc();
+

Review Comment:
   How about data crc?



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/CrcUtils.java:
##########
@@ -38,47 +39,80 @@ public class CrcUtils {
   private static final Logger LOGGER = LoggerFactory.getLogger(CrcUtils.class);
   private static final int BUFFER_SIZE = 65536;
   private static final String CRC_FILE_EXTENSTION = ".crc";
+  private static final List<String> DATA_FILE_EXTENSIONS = 
Arrays.asList(".fwd", ".dict", ".inv");
+  private static final String METADATA_FILE = "metadata.properties";
 
   private final List<File> _files;
+  private final List<File> _dataFiles;
 
-  private CrcUtils(List<File> files) {
+  private CrcUtils(List<File> files, List<File> dataFiles) {
     _files = files;
+    _dataFiles = dataFiles;
   }
 
   public static CrcUtils forAllFilesInFolder(File dir) {
-    List<File> normalFiles = new ArrayList<>();
-    getAllNormalFiles(dir, normalFiles);
-    Collections.sort(normalFiles);
-    return new CrcUtils(normalFiles);
+    List<File> allNormalFiles = new ArrayList<>();
+    List<File> dataFiles = new ArrayList<>();
+    collectFiles(dir, allNormalFiles, dataFiles);
+    Collections.sort(allNormalFiles);
+    Collections.sort(dataFiles);
+    return new CrcUtils(allNormalFiles, dataFiles);
   }
 
   /**
-   * Helper method to get all normal (non-directory) files under a directory 
recursively.
+   * Helper method to get all files (normal and data files) in the directory 
to later compute CRC for them.
    * <p>NOTE: do not include the segment creation meta file.
    */
-  private static void getAllNormalFiles(File dir, List<File> normalFiles) {
+  private static void collectFiles(File dir, List<File> normalFiles, 
List<File> dataFiles) {
     File[] files = dir.listFiles();
     Preconditions.checkNotNull(files);
     for (File file : files) {
       if (file.isFile()) {
+        String fileName = file.getName();
         // Certain file systems, e.g. HDFS will create .crc files when perform 
data copy.
         // We should ignore both SEGMENT_CREATION_META and generated '.crc' 
files.
-        if (!file.getName().equals(V1Constants.SEGMENT_CREATION_META) && 
!file.getName()
+        if (!fileName.equals(V1Constants.SEGMENT_CREATION_META) && !fileName
             .endsWith(CRC_FILE_EXTENSTION)) {
+          // add all files to normal files
           normalFiles.add(file);
+          //include data extension files and metadata file to dataFiles
+          // Conditionally add to the data-only list
+          if (isDataFile(fileName)) {
+            dataFiles.add(file);
+          }
         }
       } else {
-        getAllNormalFiles(file, normalFiles);
+        collectFiles(file, normalFiles, dataFiles);
       }
     }
   }
 
-  public long computeCrc()
+  /**
+   * Determines if a file is considered a "Data File" (Metadata or one of 
".fwd", ".dict", ".inv" file types).
+   */
+  private static boolean isDataFile(String fileName) {
+    if (fileName.equals(METADATA_FILE)) {
+      return true;
+    }
+    for (String ext : DATA_FILE_EXTENSIONS) {
+      if (fileName.endsWith(ext)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  public long computeCrc(boolean dataOnly)

Review Comment:
   Let's separate it into 2 static util methods: `computeDataOnlyCrc()` and 
`computeCrc()` for clarity



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to