akashrn5 commented on a change in pull request #3676: [WIP]Clean up the data file and index files after SI rebuild URL: https://github.com/apache/carbondata/pull/3676#discussion_r398489828
########## File path: integration/spark/src/main/scala/org/apache/spark/sql/secondaryindex/rdd/CarbonSIRebuildRDD.scala ########## @@ -321,6 +324,26 @@ class CarbonSIRebuildRDD[K, V]( LOGGER.info("Closing compaction processor instance to clean up loading resources") processor.close() } + + // delete all the old data files which are used for merging + splits.asScala.foreach { split => + val carbonFile = FileFactory.getCarbonFile(split.getFilePath) + carbonFile.delete() + } + + // delete the indexfile/merge index carbonFile of old data files + val segmentPath = FileFactory.getCarbonFile(indexTable.getSegmentPath(segmentId)) + val indexFiles = segmentPath.listFiles(new CarbonFileFilter { + override def accept(carbonFile: CarbonFile): Boolean = { + (carbonFile.getName.endsWith(CarbonTablePath.INDEX_FILE_EXT) || + carbonFile.getName.endsWith(CarbonTablePath.MERGE_INDEX_FILE_EXT)) && + DataFileUtil.getTimeStampFromFileName(carbonFile.getAbsolutePath).toLong < + carbonLoadModelCopy.getFactTimeStamp + } + }) + indexFiles.foreach { indexFile => + indexFile.delete() Review comment: Clearing the cache after rebuild was already handled. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services