[GitHub] [carbondata] akashrn5 commented on a change in pull request #4072: [CARBONDATA-4110] Support clean files dry run operation and show statistics after clean files operation
akashrn5 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r590016320 ## File path: integration/spark/src/main/scala/org/apache/carbondata/trash/DataTrashManager.scala ## @@ -87,13 +106,53 @@ object DataTrashManager { } } - private def checkAndCleanTrashFolder(carbonTable: CarbonTable, isForceDelete: Boolean): Unit = { + /** + * Checks the size of the segment files as well as datafiles, this method is used before and after + * clean files operation to check how much space is actually freed, during the operation. + */ + def getSizeSnapshot(carbonTable: CarbonTable): Long = { +val metadataDetails = SegmentStatusManager.readLoadMetadata(carbonTable.getMetadataPath) +var size: Long = 0 +val segmentFileLocation = CarbonTablePath.getSegmentFilesLocation(carbonTable.getTablePath) +if (FileFactory.isFileExist(segmentFileLocation)) { + size += FileFactory.getDirectorySize(segmentFileLocation) +} +metadataDetails.foreach(oneLoad => + if (oneLoad.getVisibility.toBoolean) { +size += calculateSegmentSizeForOneLoad(carbonTable, oneLoad, metadataDetails) + } +) +size + } + + /** + * Method to handle the Clean files dry run operation + */ + def cleanFilesDryRunOperation ( + carbonTable: CarbonTable, + isForceDelete: Boolean, + cleanStaleInProgress: Boolean, + showStats: Boolean): (Long, Long) = { +// get size freed from the trash folder +val trashFolderSizeStats = checkAndCleanTrashFolder(carbonTable, isForceDelete, +isDryRun = true, showStats) +// get size that will be deleted (MFD, COmpacted, Inprogress segments) +val expiredSegmentsSizeStats = dryRunOnExpiredSegments(carbonTable, isForceDelete, + cleanStaleInProgress) +(trashFolderSizeStats._1 + expiredSegmentsSizeStats._1, trashFolderSizeStats._2 + +expiredSegmentsSizeStats._2) + } + + private def checkAndCleanTrashFolder(carbonTable: CarbonTable, isForceDelete: Boolean, + isDryRun: Boolean, showStats: Boolean): (Long, Long) = { if (isForceDelete) { // empty the trash folder - TrashUtil.emptyTrash(carbonTable.getTablePath) + val a = TrashUtil.emptyTrash(carbonTable.getTablePath, isDryRun, showStats) + (a.head, a(1)) } else { // clear trash based on timestamp - TrashUtil.deleteExpiredDataFromTrash(carbonTable.getTablePath) + val a = TrashUtil.deleteExpiredDataFromTrash(carbonTable.getTablePath, isDryRun, showStats) Review comment: same as above This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] akashrn5 commented on a change in pull request #4072: [CARBONDATA-4110] Support clean files dry run operation and show statistics after clean files operation
akashrn5 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r590016222 ## File path: integration/spark/src/main/scala/org/apache/carbondata/trash/DataTrashManager.scala ## @@ -87,13 +106,53 @@ object DataTrashManager { } } - private def checkAndCleanTrashFolder(carbonTable: CarbonTable, isForceDelete: Boolean): Unit = { + /** + * Checks the size of the segment files as well as datafiles, this method is used before and after + * clean files operation to check how much space is actually freed, during the operation. + */ + def getSizeSnapshot(carbonTable: CarbonTable): Long = { +val metadataDetails = SegmentStatusManager.readLoadMetadata(carbonTable.getMetadataPath) +var size: Long = 0 +val segmentFileLocation = CarbonTablePath.getSegmentFilesLocation(carbonTable.getTablePath) +if (FileFactory.isFileExist(segmentFileLocation)) { + size += FileFactory.getDirectorySize(segmentFileLocation) +} +metadataDetails.foreach(oneLoad => + if (oneLoad.getVisibility.toBoolean) { +size += calculateSegmentSizeForOneLoad(carbonTable, oneLoad, metadataDetails) + } +) +size + } + + /** + * Method to handle the Clean files dry run operation + */ + def cleanFilesDryRunOperation ( + carbonTable: CarbonTable, + isForceDelete: Boolean, + cleanStaleInProgress: Boolean, + showStats: Boolean): (Long, Long) = { +// get size freed from the trash folder +val trashFolderSizeStats = checkAndCleanTrashFolder(carbonTable, isForceDelete, +isDryRun = true, showStats) +// get size that will be deleted (MFD, COmpacted, Inprogress segments) +val expiredSegmentsSizeStats = dryRunOnExpiredSegments(carbonTable, isForceDelete, + cleanStaleInProgress) +(trashFolderSizeStats._1 + expiredSegmentsSizeStats._1, trashFolderSizeStats._2 + +expiredSegmentsSizeStats._2) + } + + private def checkAndCleanTrashFolder(carbonTable: CarbonTable, isForceDelete: Boolean, + isDryRun: Boolean, showStats: Boolean): (Long, Long) = { if (isForceDelete) { // empty the trash folder - TrashUtil.emptyTrash(carbonTable.getTablePath) + val a = TrashUtil.emptyTrash(carbonTable.getTablePath, isDryRun, showStats) Review comment: give a proper variable name here This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on pull request #4099: [CARBONDATA-4141] Index Server is not caching indexes for external tables with sdk segments.
vikramahuja1001 commented on pull request #4099: URL: https://github.com/apache/carbondata/pull/4099#issuecomment-793477512 please correct the splits path of getSplits in case of External Tables in DistributedPruneRDD as well This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #4072: [CARBONDATA-4110] Support clean files dry run operation and show statistics after clean files operation
vikramahuja1001 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r589996949 ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ## @@ -41,6 +43,26 @@ case class CarbonCleanFilesCommand( extends DataCommand { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + val isDryRun: Boolean = options.getOrElse("dryrun", "false").toBoolean + var showStats: Boolean = options.getOrElse("statistics", "true").toBoolean + if (isInternalCleanCall) { +showStats = false + } + + override def output: Seq[AttributeReference] = { +if (isDryRun) { + // dry run operation + Seq( +AttributeReference("Size Freed", StringType, nullable = false)(), Review comment: Okay i will change to "Size to be freed". Trash data remaining is both the trash data inside and outside the trash folder This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] akashrn5 commented on a change in pull request #4072: [CARBONDATA-4110] Support clean files dry run operation and show statistics after clean files operation
akashrn5 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r589969086 ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ## @@ -41,6 +43,26 @@ case class CarbonCleanFilesCommand( extends DataCommand { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + val isDryRun: Boolean = options.getOrElse("dryrun", "false").toBoolean + var showStats: Boolean = options.getOrElse("statistics", "true").toBoolean + if (isInternalCleanCall) { +showStats = false + } + + override def output: Seq[AttributeReference] = { +if (isDryRun) { + // dry run operation + Seq( +AttributeReference("Size Freed", StringType, nullable = false)(), Review comment: agree with @ajantha-bhat , change the titles accordingly and @ajantha-bhat for clean files just total cleaned size is fine right? anyways we have separated size in dry run, what do you think? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on pull request #4072: [CARBONDATA-4110] Support clean files dry run operation and show statistics after clean files operation
vikramahuja1001 commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-793427100 retest this please This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] Indhumathi27 commented on pull request #4096: [CARBONDATA-4133] Concurrent Insert Overwrite with static partition on Index server fails
Indhumathi27 commented on pull request #4096: URL: https://github.com/apache/carbondata/pull/4096#issuecomment-793425074 retest this please This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] QiangCai commented on pull request #4100: [CARBONDATA-4138] Reordering Carbon Expression instead of Spark Filter
QiangCai commented on pull request #4100: URL: https://github.com/apache/carbondata/pull/4100#issuecomment-793243858 retest this please This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA2 commented on pull request #4096: [CARBONDATA-4133] Concurrent Insert Overwrite with static partition on Index server fails
CarbonDataQA2 commented on pull request #4096: URL: https://github.com/apache/carbondata/pull/4096#issuecomment-793005760 Build Success with Spark 2.4.5, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.5/3358/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA2 commented on pull request #4096: [CARBONDATA-4133] Concurrent Insert Overwrite with static partition on Index server fails
CarbonDataQA2 commented on pull request #4096: URL: https://github.com/apache/carbondata/pull/4096#issuecomment-793005147 Build Success with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/5117/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] Indhumathi27 commented on a change in pull request #4096: [CARBONDATA-4133] Concurrent Insert Overwrite with static partition on Index server fails
Indhumathi27 commented on a change in pull request #4096: URL: https://github.com/apache/carbondata/pull/4096#discussion_r589412990 ## File path: core/src/main/java/org/apache/carbondata/core/indexstore/ExtendedBlocklet.java ## @@ -225,25 +223,15 @@ public void deserializeFields(DataInput in, String[] locations, String tablePath if (in.readBoolean()) { indexUniqueId = in.readUTF(); } -String filePath = getPath(); -boolean isLocalFile = FileFactory.getCarbonFile(filePath) instanceof LocalCarbonFile; - -// For external segment, table path need not be appended to filePath as contains has full path -// Example filepath for ext segment: -// 1. /home/user/carbondata/integration/spark/newsegmentpath -// 2. hdfs://hacluster/opt/newsegmentpath/ -// Example filepath for loaded segment: /Fact/Part/Segment0 -// To identify a filePath as ext segment path, -// for other storage systems (hdfs,s3): filePath doesn't start with File separator. -// for ubuntu storage: it starts with File separator, so check if given path exists or not. -if ((!isLocalFile && filePath.startsWith(File.separator)) || (isLocalFile && !FileFactory -.isFileExist(filePath))) { - setFilePath(tablePath + filePath); -} else { - setFilePath(filePath); -} boolean isSplitPresent = in.readBoolean(); if (isSplitPresent) { + String filePath = getPath(); + boolean isExternalPath = in.readBoolean(); Review comment: ```suggestion boolean isExternalSegment = in.readBoolean(); ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA2 commented on pull request #4096: [CARBONDATA-4133] Concurrent Insert Overwrite with static partition on Index server fails
CarbonDataQA2 commented on pull request #4096: URL: https://github.com/apache/carbondata/pull/4096#issuecomment-792743166 Build Success with Spark 2.4.5, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.5/3356/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA2 commented on pull request #4098: [CARBONDATA-4143] UT with index server
CarbonDataQA2 commented on pull request #4098: URL: https://github.com/apache/carbondata/pull/4098#issuecomment-792736994 Build Success with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/5114/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA2 commented on pull request #4096: [CARBONDATA-4133] Concurrent Insert Overwrite with static partition on Index server fails
CarbonDataQA2 commented on pull request #4096: URL: https://github.com/apache/carbondata/pull/4096#issuecomment-792736871 Build Success with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/5115/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA2 commented on pull request #4099: [CARBONDATA-4141] Index Server is not caching indexes for external tables with sdk segments.
CarbonDataQA2 commented on pull request #4099: URL: https://github.com/apache/carbondata/pull/4099#issuecomment-792735786 Build Success with Spark 2.4.5, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.5/3354/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA2 commented on pull request #4072: [CARBONDATA-4110] Support clean files dry run operation and show statistics after clean files operation
CarbonDataQA2 commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-792734488 Build Success with Spark 2.4.5, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.5/3357/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA2 commented on pull request #4098: [CARBONDATA-4143] UT with index server
CarbonDataQA2 commented on pull request #4098: URL: https://github.com/apache/carbondata/pull/4098#issuecomment-792734276 Build Success with Spark 2.4.5, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.5/3355/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA2 commented on pull request #4099: [CARBONDATA-4141] Index Server is not caching indexes for external tables with sdk segments.
CarbonDataQA2 commented on pull request #4099: URL: https://github.com/apache/carbondata/pull/4099#issuecomment-792733994 Build Success with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/5113/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] CarbonDataQA2 commented on pull request #4072: [CARBONDATA-4110] Support clean files dry run operation and show statistics after clean files operation
CarbonDataQA2 commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-792688228 Build Failed with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/5116/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] Karan980 commented on a change in pull request #4099: [CARBONDATA-4141] Index Server is not caching indexes for external tables with sdk segments.
Karan980 commented on a change in pull request #4099: URL: https://github.com/apache/carbondata/pull/4099#discussion_r589323912 ## File path: core/src/main/java/org/apache/carbondata/core/index/IndexUtil.java ## @@ -112,15 +116,31 @@ public static IndexJob getIndexJob(Configuration configuration) throws IOExcepti */ private static void executeClearIndexJob(IndexJob indexJob, CarbonTable carbonTable, String indexToClear) throws IOException { -SegmentStatusManager.ValidAndInvalidSegmentsInfo validAndInvalidSegmentsInfo = -getValidAndInvalidSegments(carbonTable, FileFactory.getConfiguration()); -List invalidSegment = new ArrayList<>(); -for (Segment segment : validAndInvalidSegmentsInfo.getInvalidSegments()) { - invalidSegment.add(segment.getSegmentNo()); +IndexInputFormat indexInputFormat; +if (!carbonTable.isTransactionalTable()) { Review comment: Done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on pull request #4072: [CARBONDATA-4110] Support clean files dry run operation and show statistics after clean files operation
vikramahuja1001 commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-792624555 retest this please This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] ShreelekhyaG commented on a change in pull request #4096: [CARBONDATA-4133] Concurrent Insert Overwrite with static partition on Index server fails
ShreelekhyaG commented on a change in pull request #4096: URL: https://github.com/apache/carbondata/pull/4096#discussion_r589273743 ## File path: core/src/main/java/org/apache/carbondata/core/indexstore/ExtendedBlockletWrapper.java ## @@ -121,8 +121,9 @@ public ExtendedBlockletWrapper(List extendedBlockletList, Stri DataOutputStream stream = new DataOutputStream(bos); try { for (ExtendedBlocklet extendedBlocklet : extendedBlockletList) { +boolean isExternalPath = !extendedBlocklet.getFilePath().contains(tablePath); Review comment: ok, removed. using isexternalSegment flag instead This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] ShreelekhyaG commented on a change in pull request #4096: [CARBONDATA-4133] Concurrent Insert Overwrite with static partition on Index server fails
ShreelekhyaG commented on a change in pull request #4096: URL: https://github.com/apache/carbondata/pull/4096#discussion_r589273380 ## File path: core/src/main/java/org/apache/carbondata/core/index/Segment.java ## @@ -378,11 +382,10 @@ public void write(DataOutput out) throws IOException { out.writeUTF(segmentString); } out.writeLong(indexSize); -if (segmentPath == null) { - out.writeBoolean(false); -} else { +if (isExternalSegment()) { Review comment: Done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] ShreelekhyaG commented on a change in pull request #4096: [CARBONDATA-4133] Concurrent Insert Overwrite with static partition on Index server fails
ShreelekhyaG commented on a change in pull request #4096: URL: https://github.com/apache/carbondata/pull/4096#discussion_r589272680 ## File path: core/src/main/java/org/apache/carbondata/core/index/Segment.java ## @@ -79,7 +79,9 @@ /** * Path of segment where it exists */ - private String segmentPath; + private transient String segmentPath; + + private boolean isExternalSegment = false; Review comment: Done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] ajantha-bhat commented on pull request #4072: [CARBONDATA-4110] Support clean files dry run operation and show statistics after clean files operation
ajantha-bhat commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-792594384 LGTM This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org