[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516851234 ## File path: core/src/main/java/org/apache/carbondata/core/util/path/TrashUtil.java ## @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.util.path; + +import java.io.File; +import java.io.IOException; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonCommonConstants; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.exception.CarbonFileException; +import org.apache.carbondata.core.util.CarbonUtil; + +import org.apache.commons.io.FileUtils; + +import org.apache.log4j.Logger; + +/** + * Mantains the trash folder in carbondata. This class has methods to copy data to the trash and + * remove data from the trash. 
+ */ +public final class TrashUtil { + + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonUtil.class.getName()); + + /** + * The below method copies the complete a file to the trash folder. Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. + * + * @param carbonTablePath table path of the carbon table + * @param pathOfFileToCopy the files which are to be moved to the trash folder + * @param suffixToAdd timestamp, partition folder(if any) and segment number + * @return + */ + public static void copyDataToTrashFolderByFile(String carbonTablePath, String pathOfFileToCopy, + String suffixToAdd) { +String trashFolderPath = CarbonTablePath.getTrashFolderPath(carbonTablePath) + +CarbonCommonConstants.FILE_SEPARATOR + suffixToAdd; +try { + if (new File(pathOfFileToCopy).exists()) { +FileUtils.copyFileToDirectory(new File(pathOfFileToCopy), new File(trashFolderPath)); +LOGGER.info("File: " + pathOfFileToCopy + " successfully copied to the trash folder: " ++ trashFolderPath); + } +} catch (IOException e) { + LOGGER.error("Unable to copy " + pathOfFileToCopy + " to the trash folder", e); +} + } + + /** + * The below method copies the complete segment folder to the trash folder. Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. 
+ * + * @param carbonTablePath table path of the carbon table + * @param path the folder which are to be moved to the trash folder + * @param suffixToAdd timestamp, partition folder(if any) and segment number + * @return + */ + public static void copyDataToTrashBySegment(CarbonFile path, String carbonTablePath, + String suffixToAdd) { +String trashFolderPath = CarbonTablePath.getTrashFolderPath(carbonTablePath) + +CarbonCommonConstants.FILE_SEPARATOR + suffixToAdd; +try { + FileUtils.copyDirectory(new File(path.getAbsolutePath()), new File(trashFolderPath)); Review comment: in this case while copying directory, should i list the files then move them one by one or is there any other way to copy directory using FileFactory.getDataOutputStream, FileFactory.getDataInputStream, IOUtils.copyBytes This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516850514 ## File path: core/src/main/java/org/apache/carbondata/core/util/path/TrashUtil.java ## @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.util.path; + +import java.io.File; +import java.io.IOException; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonCommonConstants; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.exception.CarbonFileException; +import org.apache.carbondata.core.util.CarbonUtil; + +import org.apache.commons.io.FileUtils; + +import org.apache.log4j.Logger; + +/** + * Mantains the trash folder in carbondata. This class has methods to copy data to the trash and + * remove data from the trash. 
+ */ +public final class TrashUtil { + + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonUtil.class.getName()); + + /** + * The below method copies the complete a file to the trash folder. Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. + * + * @param carbonTablePath table path of the carbon table + * @param pathOfFileToCopy the files which are to be moved to the trash folder + * @param suffixToAdd timestamp, partition folder(if any) and segment number + * @return + */ + public static void copyDataToTrashFolderByFile(String carbonTablePath, String pathOfFileToCopy, + String suffixToAdd) { +String trashFolderPath = CarbonTablePath.getTrashFolderPath(carbonTablePath) + +CarbonCommonConstants.FILE_SEPARATOR + suffixToAdd; +try { + if (new File(pathOfFileToCopy).exists()) { +FileUtils.copyFileToDirectory(new File(pathOfFileToCopy), new File(trashFolderPath)); +LOGGER.info("File: " + pathOfFileToCopy + " successfully copied to the trash folder: " ++ trashFolderPath); + } +} catch (IOException e) { + LOGGER.error("Unable to copy " + pathOfFileToCopy + " to the trash folder", e); +} + } + + /** + * The below method copies the complete segment folder to the trash folder. Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. 
+ * + * @param carbonTablePath table path of the carbon table + * @param path the folder which are to be moved to the trash folder + * @param suffixToAdd timestamp, partition folder(if any) and segment number + * @return + */ + public static void copyDataToTrashBySegment(CarbonFile path, String carbonTablePath, + String suffixToAdd) { +String trashFolderPath = CarbonTablePath.getTrashFolderPath(carbonTablePath) + +CarbonCommonConstants.FILE_SEPARATOR + suffixToAdd; +try { + FileUtils.copyDirectory(new File(path.getAbsolutePath()), new File(trashFolderPath)); + LOGGER.info("Segment: " + path.getAbsolutePath() + " has been copied to the trash folder" + + " successfully"); +} catch (IOException e) { + LOGGER.error("Unable to create the trash folder and copy data to it", e); +} + } + + /** + * The below method deletes timestamp subdirectories in the trash folder which have expired as + * per the user defined expiration time + */ + public static void deleteAllDataFromTrashFolderByTimeStamp(String carbonTablePath, Long timeStamp) + throws IOException { +String pathOfTrashFolder = CarbonTablePath.getTrashFolderPath(carbonTablePath); +// Deleting the timestamp based subdirectories in the trashfolder by the given timestamp. +if (FileFactory.isFileExist(pathOfTrashFolder)) { + try { +List carbonFileList = FileFactory.getFolderList(pathOfTrashFolder); +for (CarbonFile carbonFile : carbonFileList) { + String[]
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516841463 ## File path: core/src/main/java/org/apache/carbondata/core/util/path/TrashUtil.java ## @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.util.path; + +import java.io.File; +import java.io.IOException; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonCommonConstants; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.exception.CarbonFileException; +import org.apache.carbondata.core.util.CarbonUtil; + +import org.apache.commons.io.FileUtils; + +import org.apache.log4j.Logger; + +/** + * Mantains the trash folder in carbondata. This class has methods to copy data to the trash and + * remove data from the trash. 
+ */ +public final class TrashUtil { + + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonUtil.class.getName()); + + /** + * The below method copies the complete a file to the trash folder. Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. + * + * @param carbonTablePath table path of the carbon table + * @param pathOfFileToCopy the files which are to be moved to the trash folder + * @param suffixToAdd timestamp, partition folder(if any) and segment number + * @return + */ + public static void copyDataToTrashFolderByFile(String carbonTablePath, String pathOfFileToCopy, + String suffixToAdd) { +String trashFolderPath = CarbonTablePath.getTrashFolderPath(carbonTablePath) + +CarbonCommonConstants.FILE_SEPARATOR + suffixToAdd; +try { + if (new File(pathOfFileToCopy).exists()) { +FileUtils.copyFileToDirectory(new File(pathOfFileToCopy), new File(trashFolderPath)); +LOGGER.info("File: " + pathOfFileToCopy + " successfully copied to the trash folder: " ++ trashFolderPath); + } +} catch (IOException e) { + LOGGER.error("Unable to copy " + pathOfFileToCopy + " to the trash folder", e); +} + } + + /** + * The below method copies the complete segment folder to the trash folder. Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. 
+ * + * @param carbonTablePath table path of the carbon table + * @param path the folder which are to be moved to the trash folder + * @param suffixToAdd timestamp, partition folder(if any) and segment number + * @return + */ + public static void copyDataToTrashBySegment(CarbonFile path, String carbonTablePath, + String suffixToAdd) { +String trashFolderPath = CarbonTablePath.getTrashFolderPath(carbonTablePath) + +CarbonCommonConstants.FILE_SEPARATOR + suffixToAdd; +try { + FileUtils.copyDirectory(new File(path.getAbsolutePath()), new File(trashFolderPath)); + LOGGER.info("Segment: " + path.getAbsolutePath() + " has been copied to the trash folder" + + " successfully"); +} catch (IOException e) { + LOGGER.error("Unable to create the trash folder and copy data to it", e); +} + } + + /** + * The below method deletes timestamp subdirectories in the trash folder which have expired as + * per the user defined expiration time + */ + public static void deleteAllDataFromTrashFolderByTimeStamp(String carbonTablePath, Long timeStamp) + throws IOException { +String pathOfTrashFolder = CarbonTablePath.getTrashFolderPath(carbonTablePath); +// Deleting the timestamp based subdirectories in the trashfolder by the given timestamp. +if (FileFactory.isFileExist(pathOfTrashFolder)) { + try { +List carbonFileList = FileFactory.getFolderList(pathOfTrashFolder); +for (CarbonFile carbonFile : carbonFileList) { + String[]
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516840202 ## File path: core/src/main/java/org/apache/carbondata/core/util/path/TrashUtil.java ## @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.util.path; + +import java.io.File; +import java.io.IOException; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonCommonConstants; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.exception.CarbonFileException; +import org.apache.carbondata.core.util.CarbonUtil; + +import org.apache.commons.io.FileUtils; + +import org.apache.log4j.Logger; + +/** + * Mantains the trash folder in carbondata. This class has methods to copy data to the trash and + * remove data from the trash. 
+ */ +public final class TrashUtil { + + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonUtil.class.getName()); + + /** + * The below method copies the complete a file to the trash folder. Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. + * + * @param carbonTablePath table path of the carbon table + * @param pathOfFileToCopy the files which are to be moved to the trash folder + * @param suffixToAdd timestamp, partition folder(if any) and segment number + * @return + */ + public static void copyDataToTrashFolderByFile(String carbonTablePath, String pathOfFileToCopy, + String suffixToAdd) { +String trashFolderPath = CarbonTablePath.getTrashFolderPath(carbonTablePath) + +CarbonCommonConstants.FILE_SEPARATOR + suffixToAdd; +try { + if (new File(pathOfFileToCopy).exists()) { +FileUtils.copyFileToDirectory(new File(pathOfFileToCopy), new File(trashFolderPath)); +LOGGER.info("File: " + pathOfFileToCopy + " successfully copied to the trash folder: " ++ trashFolderPath); + } +} catch (IOException e) { + LOGGER.error("Unable to copy " + pathOfFileToCopy + " to the trash folder", e); +} + } + + /** + * The below method copies the complete segment folder to the trash folder. Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. 
+ * + * @param carbonTablePath table path of the carbon table + * @param path the folder which are to be moved to the trash folder + * @param suffixToAdd timestamp, partition folder(if any) and segment number + * @return + */ + public static void copyDataToTrashBySegment(CarbonFile path, String carbonTablePath, + String suffixToAdd) { +String trashFolderPath = CarbonTablePath.getTrashFolderPath(carbonTablePath) + +CarbonCommonConstants.FILE_SEPARATOR + suffixToAdd; +try { + FileUtils.copyDirectory(new File(path.getAbsolutePath()), new File(trashFolderPath)); + LOGGER.info("Segment: " + path.getAbsolutePath() + " has been copied to the trash folder" + + " successfully"); +} catch (IOException e) { + LOGGER.error("Unable to create the trash folder and copy data to it", e); +} + } + + /** + * The below method deletes timestamp subdirectories in the trash folder which have expired as + * per the user defined expiration time + */ + public static void deleteAllDataFromTrashFolderByTimeStamp(String carbonTablePath, Long timeStamp) + throws IOException { +String pathOfTrashFolder = CarbonTablePath.getTrashFolderPath(carbonTablePath); +// Deleting the timestamp based subdirectories in the trashfolder by the given timestamp. +if (FileFactory.isFileExist(pathOfTrashFolder)) { + try { +List carbonFileList = FileFactory.getFolderList(pathOfTrashFolder); +for (CarbonFile carbonFile : carbonFileList) { + String[]
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516828124 ## File path: core/src/main/java/org/apache/carbondata/core/util/path/TrashUtil.java ## @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.util.path; + +import java.io.File; +import java.io.IOException; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonCommonConstants; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.exception.CarbonFileException; +import org.apache.carbondata.core.util.CarbonUtil; + +import org.apache.commons.io.FileUtils; + +import org.apache.log4j.Logger; + +/** + * Mantains the trash folder in carbondata. This class has methods to copy data to the trash and + * remove data from the trash. 
+ */ +public final class TrashUtil { + + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonUtil.class.getName()); + + /** + * The below method copies the complete a file to the trash folder. Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. + * + * @param carbonTablePath table path of the carbon table + * @param pathOfFileToCopy the files which are to be moved to the trash folder + * @param suffixToAdd timestamp, partition folder(if any) and segment number + * @return + */ + public static void copyDataToTrashFolderByFile(String carbonTablePath, String pathOfFileToCopy, + String suffixToAdd) { +String trashFolderPath = CarbonTablePath.getTrashFolderPath(carbonTablePath) + +CarbonCommonConstants.FILE_SEPARATOR + suffixToAdd; +try { + if (new File(pathOfFileToCopy).exists()) { +FileUtils.copyFileToDirectory(new File(pathOfFileToCopy), new File(trashFolderPath)); +LOGGER.info("File: " + pathOfFileToCopy + " successfully copied to the trash folder: " ++ trashFolderPath); + } Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516807830 ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ## @@ -48,13 +54,38 @@ import org.apache.carbondata.view.MVManagerInSpark case class CarbonCleanFilesCommand( databaseNameOp: Option[String], tableName: Option[String], +options: Option[List[(String, String)]], forceTableClean: Boolean = false, isInternalCleanCall: Boolean = false, truncateTable: Boolean = false) extends AtomicRunnableCommand { var carbonTable: CarbonTable = _ var cleanFileCommands: List[CarbonCleanFilesCommand] = List.empty + val optionsMap = options.getOrElse(List.empty[(String, String)]).toMap + var isDryRun: Boolean = false + val dryRun = "isDryRun" + if (optionsMap.contains(dryRun.toLowerCase) ) { +isDryRun = Boolean.parseBoolean(optionsMap(dryRun.toLowerCase).toString) + } + var forceTrashClean: Boolean = false + if (optionsMap.contains("force") ) { +forceTrashClean = Boolean.parseBoolean(optionsMap("force").toString) + } Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516806745 ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ## @@ -80,40 +112,96 @@ case class CarbonCleanFilesCommand( } override def processData(sparkSession: SparkSession): Seq[Row] = { -// if insert overwrite in progress, do not allow delete segment -if (SegmentStatusManager.isOverwriteInProgressInTable(carbonTable)) { - throw new ConcurrentOperationException(carbonTable, "insert overwrite", "clean file") -} -val operationContext = new OperationContext -val cleanFilesPreEvent: CleanFilesPreEvent = - CleanFilesPreEvent(carbonTable, -sparkSession) -OperationListenerBus.getInstance.fireEvent(cleanFilesPreEvent, operationContext) -if (tableName.isDefined) { - Checker.validateTableExists(databaseNameOp, tableName.get, sparkSession) - if (forceTableClean) { -deleteAllData(sparkSession, databaseNameOp, tableName.get) +if (!isDryRun) { Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516803097 ## File path: core/src/main/java/org/apache/carbondata/core/util/CarbonUtil.java ## @@ -3441,4 +3443,33 @@ public static void agingTempFolderForIndexServer(long agingTime)throws }); } } + + /** + * The below method tries to get the segment lock for the given segment. + */ + public static boolean tryGettingSegmentLock(LoadMetadataDetails oneLoad, + AbsoluteTableIdentifier absoluteTableIdentifier) { +ICarbonLock segmentLock = CarbonLockFactory.getCarbonLockObj(absoluteTableIdentifier, +CarbonTablePath.addSegmentPrefix(oneLoad.getLoadName()) + LockUsage.LOCK); +boolean canBeDeleted; +try { + if (segmentLock.lockWithRetries(CarbonCommonConstants + .NUMBER_OF_TRIES_FOR_CARBON_LOCK_DEFAULT, CarbonCommonConstants + .MAX_TIMEOUT_FOR_CARBON_LOCK_DEFAULT)) { Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516802654 ## File path: processing/src/main/java/org/apache/carbondata/processing/loading/TableProcessingOperations.java ## @@ -149,9 +132,167 @@ public static void deletePartialLoadDataIfExist(CarbonTable carbonTable, } finally { carbonTableStatusLock.unlock(); } +} else { + int retryCount = CarbonLockUtil + .getLockProperty(CarbonCommonConstants.NUMBER_OF_TRIES_FOR_CONCURRENT_LOCK, + CarbonCommonConstants.NUMBER_OF_TRIES_FOR_CONCURRENT_LOCK_DEFAULT); + int maxTimeout = CarbonLockUtil + .getLockProperty(CarbonCommonConstants.MAX_TIMEOUT_FOR_CONCURRENT_LOCK, + CarbonCommonConstants.MAX_TIMEOUT_FOR_CONCURRENT_LOCK_DEFAULT); Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516803267 ## File path: core/src/main/java/org/apache/carbondata/core/util/CarbonUtil.java ## @@ -3441,4 +3443,33 @@ public static void agingTempFolderForIndexServer(long agingTime)throws }); } } + + /** + * The below method tries to get the segment lock for the given segment. + */ + public static boolean tryGettingSegmentLock(LoadMetadataDetails oneLoad, + AbsoluteTableIdentifier absoluteTableIdentifier) { +ICarbonLock segmentLock = CarbonLockFactory.getCarbonLockObj(absoluteTableIdentifier, +CarbonTablePath.addSegmentPrefix(oneLoad.getLoadName()) + LockUsage.LOCK); +boolean canBeDeleted; Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516802032 ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ## @@ -80,40 +112,96 @@ case class CarbonCleanFilesCommand( } override def processData(sparkSession: SparkSession): Seq[Row] = { -// if insert overwrite in progress, do not allow delete segment -if (SegmentStatusManager.isOverwriteInProgressInTable(carbonTable)) { - throw new ConcurrentOperationException(carbonTable, "insert overwrite", "clean file") -} -val operationContext = new OperationContext -val cleanFilesPreEvent: CleanFilesPreEvent = - CleanFilesPreEvent(carbonTable, -sparkSession) -OperationListenerBus.getInstance.fireEvent(cleanFilesPreEvent, operationContext) -if (tableName.isDefined) { - Checker.validateTableExists(databaseNameOp, tableName.get, sparkSession) - if (forceTableClean) { -deleteAllData(sparkSession, databaseNameOp, tableName.get) +if (!isDryRun) { + // if insert overwrite in progress, do not allow delete segment + if (SegmentStatusManager.isOverwriteInProgressInTable(carbonTable)) { +throw new ConcurrentOperationException(carbonTable, "insert overwrite", "clean file") + } + val operationContext = new OperationContext + val cleanFilesPreEvent: CleanFilesPreEvent = CleanFilesPreEvent(carbonTable, sparkSession) + OperationListenerBus.getInstance.fireEvent(cleanFilesPreEvent, operationContext) + if (tableName.isDefined) { +Checker.validateTableExists(databaseNameOp, tableName.get, sparkSession) +if (forceTrashClean) { + CleanFilesUtil.deleteDataFromTrashFolder(carbonTable, sparkSession) +} else { + // clear trash based on timestamp + CleanFilesUtil.deleteDataFromTrashFolderByTimeStamp(carbonTable, sparkSession) +} +if (forceTableClean) { + deleteAllData(sparkSession, databaseNameOp, tableName.get) +} else { + cleanGarbageData(sparkSession, databaseNameOp, tableName.get) +} +// delete 
partial load and send them to trash +TableProcessingOperations + .deletePartialLoadDataIfExist(carbonTable, false) +// clean stash in metadata folder too +deleteStashInMetadataFolder(carbonTable) } else { -cleanGarbageData(sparkSession, databaseNameOp, tableName.get) +cleanGarbageDataInAllTables(sparkSession) + } + if (cleanFileCommands != null) { +cleanFileCommands.foreach(_.processData(sparkSession)) } + val cleanFilesPostEvent: CleanFilesPostEvent = +CleanFilesPostEvent(carbonTable, sparkSession) + OperationListenerBus.getInstance.fireEvent(cleanFilesPostEvent, operationContext) + Seq.empty +} else if (isDryRun && tableName.isDefined) { + // dry run, do not clean anything and do not delete trash too + CleanFilesUtil.cleanFilesDryRun(carbonTable, sparkSession) } else { - cleanGarbageDataInAllTables(sparkSession) + Seq.empty } -if (cleanFileCommands != null) { - cleanFileCommands.foreach(_.processData(sparkSession)) + } + + // This method deletes the stale segment files in the segment folder. + def deleteStashInMetadataFolder(carbonTable: CarbonTable): Unit = { +val tableStatusLock = CarbonLockFactory + .getCarbonLockObj(carbonTable.getAbsoluteTableIdentifier, LockUsage.TABLE_STATUS_LOCK) +val carbonLoadModel = new CarbonLoadModel +try { + if (tableStatusLock.lockWithRetries()) { +val tableStatusFilePath = CarbonTablePath + .getTableStatusFilePath(carbonTable.getTablePath) +val loadMetaDataDetails = SegmentStatusManager + .readTableStatusFile(tableStatusFilePath).filter(details => details.getSegmentStatus == Review comment: just reading it once This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516802253 ## File path: processing/src/main/java/org/apache/carbondata/processing/loading/TableProcessingOperations.java ## @@ -149,9 +132,167 @@ public static void deletePartialLoadDataIfExist(CarbonTable carbonTable, } finally { carbonTableStatusLock.unlock(); } +} else { + int retryCount = CarbonLockUtil + .getLockProperty(CarbonCommonConstants.NUMBER_OF_TRIES_FOR_CONCURRENT_LOCK, + CarbonCommonConstants.NUMBER_OF_TRIES_FOR_CONCURRENT_LOCK_DEFAULT); + int maxTimeout = CarbonLockUtil + .getLockProperty(CarbonCommonConstants.MAX_TIMEOUT_FOR_CONCURRENT_LOCK, + CarbonCommonConstants.MAX_TIMEOUT_FOR_CONCURRENT_LOCK_DEFAULT); + ICarbonLock carbonTableStatusLock = CarbonLockFactory + .getCarbonLockObj(carbonTable.getAbsoluteTableIdentifier(), LockUsage.TABLE_STATUS_LOCK); + + try { +if (carbonTableStatusLock.lockWithRetries(retryCount, maxTimeout)) { Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516800475 ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ## @@ -80,40 +112,96 @@ case class CarbonCleanFilesCommand( } override def processData(sparkSession: SparkSession): Seq[Row] = { -// if insert overwrite in progress, do not allow delete segment -if (SegmentStatusManager.isOverwriteInProgressInTable(carbonTable)) { - throw new ConcurrentOperationException(carbonTable, "insert overwrite", "clean file") -} -val operationContext = new OperationContext -val cleanFilesPreEvent: CleanFilesPreEvent = - CleanFilesPreEvent(carbonTable, -sparkSession) -OperationListenerBus.getInstance.fireEvent(cleanFilesPreEvent, operationContext) -if (tableName.isDefined) { - Checker.validateTableExists(databaseNameOp, tableName.get, sparkSession) - if (forceTableClean) { -deleteAllData(sparkSession, databaseNameOp, tableName.get) +if (!isDryRun) { + // if insert overwrite in progress, do not allow delete segment + if (SegmentStatusManager.isOverwriteInProgressInTable(carbonTable)) { +throw new ConcurrentOperationException(carbonTable, "insert overwrite", "clean file") + } + val operationContext = new OperationContext + val cleanFilesPreEvent: CleanFilesPreEvent = CleanFilesPreEvent(carbonTable, sparkSession) + OperationListenerBus.getInstance.fireEvent(cleanFilesPreEvent, operationContext) + if (tableName.isDefined) { +Checker.validateTableExists(databaseNameOp, tableName.get, sparkSession) +if (forceTrashClean) { + CleanFilesUtil.deleteDataFromTrashFolder(carbonTable, sparkSession) +} else { + // clear trash based on timestamp + CleanFilesUtil.deleteDataFromTrashFolderByTimeStamp(carbonTable, sparkSession) +} +if (forceTableClean) { + deleteAllData(sparkSession, databaseNameOp, tableName.get) +} else { + cleanGarbageData(sparkSession, databaseNameOp, tableName.get) +} +// delete 
partial load and send them to trash +TableProcessingOperations + .deletePartialLoadDataIfExist(carbonTable, false) +// clean stash in metadata folder too +deleteStashInMetadataFolder(carbonTable) } else { -cleanGarbageData(sparkSession, databaseNameOp, tableName.get) +cleanGarbageDataInAllTables(sparkSession) + } + if (cleanFileCommands != null) { +cleanFileCommands.foreach(_.processData(sparkSession)) } + val cleanFilesPostEvent: CleanFilesPostEvent = +CleanFilesPostEvent(carbonTable, sparkSession) + OperationListenerBus.getInstance.fireEvent(cleanFilesPostEvent, operationContext) + Seq.empty +} else if (isDryRun && tableName.isDefined) { + // dry run, do not clean anything and do not delete trash too + CleanFilesUtil.cleanFilesDryRun(carbonTable, sparkSession) } else { - cleanGarbageDataInAllTables(sparkSession) + Seq.empty } -if (cleanFileCommands != null) { - cleanFileCommands.foreach(_.processData(sparkSession)) + } + + // This method deletes the stale segment files in the segment folder. + def deleteStashInMetadataFolder(carbonTable: CarbonTable): Unit = { Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516799647 ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ## @@ -48,13 +54,38 @@ import org.apache.carbondata.view.MVManagerInSpark case class CarbonCleanFilesCommand( databaseNameOp: Option[String], tableName: Option[String], +options: Option[List[(String, String)]], forceTableClean: Boolean = false, isInternalCleanCall: Boolean = false, truncateTable: Boolean = false) extends AtomicRunnableCommand { var carbonTable: CarbonTable = _ var cleanFileCommands: List[CarbonCleanFilesCommand] = List.empty + val optionsMap = options.getOrElse(List.empty[(String, String)]).toMap + var isDryRun: Boolean = false + val dryRun = "isDryRun" + if (optionsMap.contains(dryRun.toLowerCase) ) { +isDryRun = Boolean.parseBoolean(optionsMap(dryRun.toLowerCase).toString) + } Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516798767 ## File path: integration/spark/src/main/scala/org/apache/carbondata/cleanfiles/CleanFilesUtil.scala ## @@ -0,0 +1,416 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.cleanfiles + +import java.sql.Timestamp +import java.util +import java.util.concurrent.TimeUnit + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ListBuffer + +import org.apache.spark.sql.{AnalysisException, CarbonEnv, Row, SparkSession} +import org.apache.spark.sql.index.CarbonIndexUtil + +import org.apache.carbondata.common.logging.LogServiceFactory +import org.apache.carbondata.core.constants.CarbonCommonConstants +import org.apache.carbondata.core.datastore.filesystem.CarbonFile +import org.apache.carbondata.core.datastore.impl.FileFactory +import org.apache.carbondata.core.exception.ConcurrentOperationException +import org.apache.carbondata.core.indexstore.PartitionSpec +import org.apache.carbondata.core.locks.{CarbonLockFactory, CarbonLockUtil, ICarbonLock, LockUsage} +import org.apache.carbondata.core.metadata.{AbsoluteTableIdentifier, CarbonMetadata, SegmentFileStore} +import org.apache.carbondata.core.metadata.schema.table.CarbonTable +import org.apache.carbondata.core.mutate.CarbonUpdateUtil +import org.apache.carbondata.core.statusmanager.{LoadMetadataDetails, SegmentStatus, SegmentStatusManager} +import org.apache.carbondata.core.util.{CarbonProperties, CarbonUtil} +import org.apache.carbondata.core.util.path.{CarbonTablePath, TrashUtil} +import org.apache.carbondata.processing.loading.TableProcessingOperations + +object CleanFilesUtil { + private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + + /** + * The method deletes all data if forceTableClean and clean garbage segment + * (MARKED_FOR_DELETE state) if forceTableClean + * + * @param dbName : Database name + * @param tableName : Table name + * @param tablePath : Table path + * @param carbonTable: CarbonTable Object in case of force clean + * @param forceTableClean: for force clean it will delete all data + *it will clean garbage segment (MARKED_FOR_DELETE state) + * @param currentTablePartitions : Hive 
Partitions details + */ + def cleanFiles( +dbName: String, +tableName: String, +tablePath: String, +carbonTable: CarbonTable, +forceTableClean: Boolean, +currentTablePartitions: Option[Seq[PartitionSpec]] = None, +truncateTable: Boolean = false): Unit = { +var carbonCleanFilesLock: ICarbonLock = null +val absoluteTableIdentifier = if (forceTableClean) { + AbsoluteTableIdentifier.from(tablePath, dbName, tableName, tableName) +} else { + carbonTable.getAbsoluteTableIdentifier +} +try { + val errorMsg = "Clean files request is failed for " + +s"$dbName.$tableName" + +". Not able to acquire the clean files lock due to another clean files " + +"operation is running in the background." + // in case of force clean the lock is not required + if (forceTableClean) { +FileFactory.deleteAllCarbonFilesOfDir( + FileFactory.getCarbonFile(absoluteTableIdentifier.getTablePath)) + } else { +carbonCleanFilesLock = + CarbonLockUtil +.getLockObject(absoluteTableIdentifier, LockUsage.CLEAN_FILES_LOCK, errorMsg) +if (truncateTable) { + SegmentStatusManager.truncateTable(carbonTable) +} +SegmentStatusManager.deleteLoadsAndUpdateMetadata( + carbonTable, true, currentTablePartitions.map(_.asJava).orNull) +CarbonUpdateUtil.cleanUpDeltaFiles(carbonTable, true) +currentTablePartitions match { + case Some(partitions) => +SegmentFileStore.cleanSegments( + carbonTable, + currentTablePartitions.map(_.asJava).orNull, + true) + case _ => +} + } +} finally { + if (currentTablePartitions.equ
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516798306 ## File path: integration/spark/src/main/scala/org/apache/carbondata/cleanfiles/CleanFilesUtil.scala ## @@ -0,0 +1,416 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.cleanfiles + +import java.sql.Timestamp +import java.util +import java.util.concurrent.TimeUnit + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ListBuffer + +import org.apache.spark.sql.{AnalysisException, CarbonEnv, Row, SparkSession} +import org.apache.spark.sql.index.CarbonIndexUtil + +import org.apache.carbondata.common.logging.LogServiceFactory +import org.apache.carbondata.core.constants.CarbonCommonConstants +import org.apache.carbondata.core.datastore.filesystem.CarbonFile +import org.apache.carbondata.core.datastore.impl.FileFactory +import org.apache.carbondata.core.exception.ConcurrentOperationException +import org.apache.carbondata.core.indexstore.PartitionSpec +import org.apache.carbondata.core.locks.{CarbonLockFactory, CarbonLockUtil, ICarbonLock, LockUsage} +import org.apache.carbondata.core.metadata.{AbsoluteTableIdentifier, CarbonMetadata, SegmentFileStore} +import org.apache.carbondata.core.metadata.schema.table.CarbonTable +import org.apache.carbondata.core.mutate.CarbonUpdateUtil +import org.apache.carbondata.core.statusmanager.{LoadMetadataDetails, SegmentStatus, SegmentStatusManager} +import org.apache.carbondata.core.util.{CarbonProperties, CarbonUtil} +import org.apache.carbondata.core.util.path.{CarbonTablePath, TrashUtil} +import org.apache.carbondata.processing.loading.TableProcessingOperations + +object CleanFilesUtil { + private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + + /** + * The method deletes all data if forceTableClean and clean garbage segment + * (MARKED_FOR_DELETE state) if forceTableClean + * + * @param dbName : Database name + * @param tableName : Table name + * @param tablePath : Table path + * @param carbonTable: CarbonTable Object in case of force clean + * @param forceTableClean: for force clean it will delete all data + *it will clean garbage segment (MARKED_FOR_DELETE state) + * @param currentTablePartitions : Hive 
Partitions details + */ + def cleanFiles( +dbName: String, +tableName: String, +tablePath: String, +carbonTable: CarbonTable, +forceTableClean: Boolean, +currentTablePartitions: Option[Seq[PartitionSpec]] = None, +truncateTable: Boolean = false): Unit = { +var carbonCleanFilesLock: ICarbonLock = null +val absoluteTableIdentifier = if (forceTableClean) { + AbsoluteTableIdentifier.from(tablePath, dbName, tableName, tableName) +} else { + carbonTable.getAbsoluteTableIdentifier +} +try { + val errorMsg = "Clean files request is failed for " + +s"$dbName.$tableName" + +". Not able to acquire the clean files lock due to another clean files " + +"operation is running in the background." + // in case of force clean the lock is not required + if (forceTableClean) { +FileFactory.deleteAllCarbonFilesOfDir( + FileFactory.getCarbonFile(absoluteTableIdentifier.getTablePath)) + } else { +carbonCleanFilesLock = + CarbonLockUtil +.getLockObject(absoluteTableIdentifier, LockUsage.CLEAN_FILES_LOCK, errorMsg) +if (truncateTable) { + SegmentStatusManager.truncateTable(carbonTable) +} +SegmentStatusManager.deleteLoadsAndUpdateMetadata( + carbonTable, true, currentTablePartitions.map(_.asJava).orNull) +CarbonUpdateUtil.cleanUpDeltaFiles(carbonTable, true) +currentTablePartitions match { + case Some(partitions) => +SegmentFileStore.cleanSegments( + carbonTable, + currentTablePartitions.map(_.asJava).orNull, + true) + case _ => +} + } +} finally { + if (currentTablePartitions.equ
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516797523 ## File path: integration/spark/src/main/scala/org/apache/carbondata/cleanfiles/CleanFilesUtil.scala ## @@ -0,0 +1,416 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.cleanfiles + +import java.sql.Timestamp +import java.util +import java.util.concurrent.TimeUnit + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ListBuffer + +import org.apache.spark.sql.{AnalysisException, CarbonEnv, Row, SparkSession} +import org.apache.spark.sql.index.CarbonIndexUtil + +import org.apache.carbondata.common.logging.LogServiceFactory +import org.apache.carbondata.core.constants.CarbonCommonConstants +import org.apache.carbondata.core.datastore.filesystem.CarbonFile +import org.apache.carbondata.core.datastore.impl.FileFactory +import org.apache.carbondata.core.exception.ConcurrentOperationException +import org.apache.carbondata.core.indexstore.PartitionSpec +import org.apache.carbondata.core.locks.{CarbonLockFactory, CarbonLockUtil, ICarbonLock, LockUsage} +import org.apache.carbondata.core.metadata.{AbsoluteTableIdentifier, CarbonMetadata, SegmentFileStore} +import org.apache.carbondata.core.metadata.schema.table.CarbonTable +import org.apache.carbondata.core.mutate.CarbonUpdateUtil +import org.apache.carbondata.core.statusmanager.{LoadMetadataDetails, SegmentStatus, SegmentStatusManager} +import org.apache.carbondata.core.util.{CarbonProperties, CarbonUtil} +import org.apache.carbondata.core.util.path.{CarbonTablePath, TrashUtil} +import org.apache.carbondata.processing.loading.TableProcessingOperations + +object CleanFilesUtil { + private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + + /** + * The method deletes all data if forceTableClean and clean garbage segment + * (MARKED_FOR_DELETE state) if forceTableClean + * + * @param dbName : Database name + * @param tableName : Table name + * @param tablePath : Table path + * @param carbonTable: CarbonTable Object in case of force clean + * @param forceTableClean: for force clean it will delete all data + *it will clean garbage segment (MARKED_FOR_DELETE state) + * @param currentTablePartitions : Hive 
Partitions details + */ + def cleanFiles( +dbName: String, +tableName: String, +tablePath: String, +carbonTable: CarbonTable, +forceTableClean: Boolean, +currentTablePartitions: Option[Seq[PartitionSpec]] = None, +truncateTable: Boolean = false): Unit = { +var carbonCleanFilesLock: ICarbonLock = null +val absoluteTableIdentifier = if (forceTableClean) { + AbsoluteTableIdentifier.from(tablePath, dbName, tableName, tableName) +} else { + carbonTable.getAbsoluteTableIdentifier +} +try { + val errorMsg = "Clean files request is failed for " + +s"$dbName.$tableName" + +". Not able to acquire the clean files lock due to another clean files " + +"operation is running in the background." + // in case of force clean the lock is not required + if (forceTableClean) { +FileFactory.deleteAllCarbonFilesOfDir( + FileFactory.getCarbonFile(absoluteTableIdentifier.getTablePath)) + } else { +carbonCleanFilesLock = + CarbonLockUtil +.getLockObject(absoluteTableIdentifier, LockUsage.CLEAN_FILES_LOCK, errorMsg) +if (truncateTable) { + SegmentStatusManager.truncateTable(carbonTable) +} +SegmentStatusManager.deleteLoadsAndUpdateMetadata( + carbonTable, true, currentTablePartitions.map(_.asJava).orNull) +CarbonUpdateUtil.cleanUpDeltaFiles(carbonTable, true) +currentTablePartitions match { + case Some(partitions) => +SegmentFileStore.cleanSegments( + carbonTable, + currentTablePartitions.map(_.asJava).orNull, + true) + case _ => +} + } +} finally { + if (currentTablePartitions.equ
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516796678 ## File path: integration/spark/src/main/scala/org/apache/carbondata/cleanfiles/CleanFilesUtil.scala ## @@ -0,0 +1,416 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.cleanfiles + +import java.sql.Timestamp +import java.util +import java.util.concurrent.TimeUnit + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ListBuffer + +import org.apache.spark.sql.{AnalysisException, CarbonEnv, Row, SparkSession} +import org.apache.spark.sql.index.CarbonIndexUtil + +import org.apache.carbondata.common.logging.LogServiceFactory +import org.apache.carbondata.core.constants.CarbonCommonConstants +import org.apache.carbondata.core.datastore.filesystem.CarbonFile +import org.apache.carbondata.core.datastore.impl.FileFactory +import org.apache.carbondata.core.exception.ConcurrentOperationException +import org.apache.carbondata.core.indexstore.PartitionSpec +import org.apache.carbondata.core.locks.{CarbonLockFactory, CarbonLockUtil, ICarbonLock, LockUsage} +import org.apache.carbondata.core.metadata.{AbsoluteTableIdentifier, CarbonMetadata, SegmentFileStore} +import org.apache.carbondata.core.metadata.schema.table.CarbonTable +import org.apache.carbondata.core.mutate.CarbonUpdateUtil +import org.apache.carbondata.core.statusmanager.{LoadMetadataDetails, SegmentStatus, SegmentStatusManager} +import org.apache.carbondata.core.util.{CarbonProperties, CarbonUtil} +import org.apache.carbondata.core.util.path.{CarbonTablePath, TrashUtil} +import org.apache.carbondata.processing.loading.TableProcessingOperations + +object CleanFilesUtil { + private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + + /** + * The method deletes all data if forceTableClean and clean garbage segment + * (MARKED_FOR_DELETE state) if forceTableClean + * + * @param dbName : Database name + * @param tableName : Table name + * @param tablePath : Table path + * @param carbonTable: CarbonTable Object in case of force clean + * @param forceTableClean: for force clean it will delete all data + *it will clean garbage segment (MARKED_FOR_DELETE state) + * @param currentTablePartitions : Hive 
Partitions details + */ + def cleanFiles( +dbName: String, +tableName: String, +tablePath: String, +carbonTable: CarbonTable, +forceTableClean: Boolean, +currentTablePartitions: Option[Seq[PartitionSpec]] = None, +truncateTable: Boolean = false): Unit = { +var carbonCleanFilesLock: ICarbonLock = null +val absoluteTableIdentifier = if (forceTableClean) { + AbsoluteTableIdentifier.from(tablePath, dbName, tableName, tableName) +} else { + carbonTable.getAbsoluteTableIdentifier +} +try { + val errorMsg = "Clean files request is failed for " + +s"$dbName.$tableName" + +". Not able to acquire the clean files lock due to another clean files " + +"operation is running in the background." + // in case of force clean the lock is not required + if (forceTableClean) { +FileFactory.deleteAllCarbonFilesOfDir( + FileFactory.getCarbonFile(absoluteTableIdentifier.getTablePath)) + } else { +carbonCleanFilesLock = + CarbonLockUtil +.getLockObject(absoluteTableIdentifier, LockUsage.CLEAN_FILES_LOCK, errorMsg) +if (truncateTable) { + SegmentStatusManager.truncateTable(carbonTable) +} +SegmentStatusManager.deleteLoadsAndUpdateMetadata( + carbonTable, true, currentTablePartitions.map(_.asJava).orNull) +CarbonUpdateUtil.cleanUpDeltaFiles(carbonTable, true) +currentTablePartitions match { + case Some(partitions) => +SegmentFileStore.cleanSegments( + carbonTable, + currentTablePartitions.map(_.asJava).orNull, + true) + case _ => +} + } +} finally { + if (currentTablePartitions.equ
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516795339 ## File path: core/src/main/java/org/apache/carbondata/core/util/CleanUtil.java ## @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.core.util; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.filesystem.CarbonFileFilter; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.index.IndexStoreManager; +import org.apache.carbondata.core.index.Segment; +import org.apache.carbondata.core.index.TableIndex; +import org.apache.carbondata.core.indexstore.PartitionSpec; +import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier; +import org.apache.carbondata.core.metadata.SegmentFileStore; +import org.apache.carbondata.core.metadata.schema.table.CarbonTable; +import org.apache.carbondata.core.mutate.CarbonUpdateUtil; +import org.apache.carbondata.core.statusmanager.LoadMetadataDetails; +import org.apache.carbondata.core.statusmanager.SegmentStatus; +import org.apache.carbondata.core.statusmanager.SegmentStatusManager; +import org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager; +import org.apache.carbondata.core.util.path.CarbonTablePath; + +import org.apache.log4j.Logger; + +/** + * Mantains the code used in clean files command, to delete the load folders and move the data + * to trash folder + */ +public final class CleanUtil { Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516793794 ## File path: docs/dml-of-carbondata.md ## @@ -560,5 +561,4 @@ CarbonData DML statements are documented here,which includes: Clean the segments which are compacted: ``` - CLEAN FILES FOR TABLE carbon_table - ``` Review comment: added back This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516793523 ## File path: docs/dml-of-carbondata.md ## @@ -27,6 +27,7 @@ CarbonData DML statements are documented here,which includes: * [UPDATE AND DELETE](#update-and-delete) * [COMPACTION](#compaction) * [SEGMENT MANAGEMENT](./segment-management-on-carbondata.md) +* [CLEAN FILES](./cleanfiles.md) Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516792757 ## File path: docs/cleanfiles.md ## @@ -0,0 +1,78 @@ + + + +## CLEAN FILES + +Clean files command is used to remove the Compacted, Marked For Delete ,In Progress which are stale and Partial(Segments which are missing from the table status file but their data is present) + segments from the store. + + Clean Files Command + ``` + CLEAN FILES FOR TABLE TABLE_NAME + ``` + + +### TRASH FOLDER + + Carbondata supports a Trash Folder which is used as a redundant folder where all stale carbondata segments are moved to during clean files operation. + This trash folder is mantained inside the table path. It is a hidden folder(.Trash). The segments that are moved to the trash folder are mantained under a timestamp + subfolder(timestamp at which clean files operation is called). This helps the user to list down segments by timestamp. By default all the timestamp sub-directory have an expiration + time of (3 days since that timestamp) and it can be configured by the user using the following carbon property Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516793022 ## File path: docs/cleanfiles.md ## @@ -0,0 +1,78 @@ + + + +## CLEAN FILES + +Clean files command is used to remove the Compacted, Marked For Delete ,In Progress which are stale and Partial(Segments which are missing from the table status file but their data is present) + segments from the store. + + Clean Files Command + ``` + CLEAN FILES FOR TABLE TABLE_NAME + ``` + + +### TRASH FOLDER + + Carbondata supports a Trash Folder which is used as a redundant folder where all stale carbondata segments are moved to during clean files operation. + This trash folder is mantained inside the table path. It is a hidden folder(.Trash). The segments that are moved to the trash folder are mantained under a timestamp + subfolder(timestamp at which clean files operation is called). This helps the user to list down segments by timestamp. By default all the timestamp sub-directory have an expiration + time of (3 days since that timestamp) and it can be configured by the user using the following carbon property + ``` + carbon.trash.expiration.time = "Number of days" + ``` + Once the timestamp subdirectory is expired as per the configured expiration day value, the subdirectory is deleted from the trash folder in the subsequent clean files command. + + + + +### DRY RUN + Support for dry run is provided before the actual clean files operation. This dry run operation will list down all the segments which are going to be manipulated during + the clean files operation. The dry run result will show the current location of the segment(it can be in FACT folder, Partition folder or trash folder), where that segment + will be moved(to the trash folder or deleted from store) and the number of days left before it expires once the actual operation will be called. 
+ + + ``` + CLEAN FILES FOR TABLE TABLE_NAME options('dry_run'='true') + ``` + +### FORCE DELETE TRASH +The force option with clean files command deletes all the files and folders from the trash folder. Review comment: no other difference, it just force clean trash folder This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516792560 ## File path: docs/cleanfiles.md ## @@ -0,0 +1,78 @@ + + + +## CLEAN FILES + +Clean files command is used to remove the Compacted, Marked For Delete ,In Progress which are stale and Partial(Segments which are missing from the table status file but their data is present) Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516791220 ## File path: core/src/main/java/org/apache/carbondata/core/util/path/TrashUtil.java ## @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.util.path; + +import java.io.File; +import java.io.IOException; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonCommonConstants; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.exception.CarbonFileException; +import org.apache.carbondata.core.util.CarbonUtil; + +import org.apache.commons.io.FileUtils; + +import org.apache.log4j.Logger; + +/** + * Mantains the trash folder in carbondata. This class has methods to copy data to the trash and + * remove data from the trash. 
+ */ +public final class TrashUtil { + + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonUtil.class.getName()); + + /** + * The below method copies the complete a file to the trash folder. Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. + * + * @param carbonTablePath table path of the carbon table + * @param pathOfFileToCopy the files which are to be moved to the trash folder + * @param suffixToAdd timestamp, partition folder(if any) and segment number + * @return + */ + public static void copyDataToTrashFolderByFile(String carbonTablePath, String pathOfFileToCopy, + String suffixToAdd) { +String trashFolderPath = CarbonTablePath.getTrashFolderPath(carbonTablePath) + +CarbonCommonConstants.FILE_SEPARATOR + suffixToAdd; +try { + if (new File(pathOfFileToCopy).exists()) { +FileUtils.copyFileToDirectory(new File(pathOfFileToCopy), new File(trashFolderPath)); +LOGGER.info("File: " + pathOfFileToCopy + " successfully copied to the trash folder: " ++ trashFolderPath); + } +} catch (IOException e) { + LOGGER.error("Unable to copy " + pathOfFileToCopy + " to the trash folder", e); +} + } + + /** + * The below method copies the complete segment folder to the trash folder. Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. 
+ * + * @param carbonTablePath table path of the carbon table + * @param path the folder which are to be moved to the trash folder + * @param suffixToAdd timestamp, partition folder(if any) and segment number + * @return + */ + public static void copyDataToTrashBySegment(CarbonFile path, String carbonTablePath, + String suffixToAdd) { +String trashFolderPath = CarbonTablePath.getTrashFolderPath(carbonTablePath) + +CarbonCommonConstants.FILE_SEPARATOR + suffixToAdd; +try { + FileUtils.copyDirectory(new File(path.getAbsolutePath()), new File(trashFolderPath)); + LOGGER.info("Segment: " + path.getAbsolutePath() + " has been copied to the trash folder" + + " successfully"); +} catch (IOException e) { + LOGGER.error("Unable to create the trash folder and copy data to it", e); +} + } + + /** + * The below method deletes timestamp subdirectories in the trash folder which have expired as + * per the user defined expiration time + */ + public static void deleteAllDataFromTrashFolderByTimeStamp(String carbonTablePath, Long timeStamp) Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516790181 ## File path: core/src/main/java/org/apache/carbondata/core/util/CleanUtil.java ## @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.core.util; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.filesystem.CarbonFileFilter; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.index.IndexStoreManager; +import org.apache.carbondata.core.index.Segment; +import org.apache.carbondata.core.index.TableIndex; +import org.apache.carbondata.core.indexstore.PartitionSpec; +import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier; +import org.apache.carbondata.core.metadata.SegmentFileStore; +import org.apache.carbondata.core.metadata.schema.table.CarbonTable; +import org.apache.carbondata.core.mutate.CarbonUpdateUtil; +import org.apache.carbondata.core.statusmanager.LoadMetadataDetails; +import org.apache.carbondata.core.statusmanager.SegmentStatus; +import org.apache.carbondata.core.statusmanager.SegmentStatusManager; +import org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager; +import org.apache.carbondata.core.util.path.CarbonTablePath; + +import org.apache.log4j.Logger; + +/** + * Mantains the code used in clean files command, to delete the load folders and move the data + * to trash folder + */ +public final class CleanUtil { + + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonUtil.class.getName()); + + public static void physicalFactAndMeasureMetadataDeletion(CarbonTable carbonTable, + LoadMetadataDetails[] newAddedLoadHistoryList, boolean isForceDelete, + List specs) { +LoadMetadataDetails[] currentDetails = +SegmentStatusManager.readLoadMetadata(carbonTable.getMetadataPath()); +physicalFactAndMeasureMetadataDeletion(carbonTable, +currentDetails, +isForceDelete, +specs, +currentDetails); +if (newAddedLoadHistoryList != null && 
newAddedLoadHistoryList.length > 0) { + physicalFactAndMeasureMetadataDeletion(carbonTable, + newAddedLoadHistoryList, + isForceDelete, + specs, + currentDetails); +} + } + + /** + * Delete the invalid data physically from table. + * @param carbonTable table + * @param loadDetails Load details which need clean up + * @param isForceDelete is Force delete requested by user + * @param specs Partition specs + * @param currLoadDetails Current table status load details which are required for update manager. + */ + private static void physicalFactAndMeasureMetadataDeletion(CarbonTable carbonTable, + LoadMetadataDetails[] loadDetails, boolean isForceDelete, List specs, + LoadMetadataDetails[] currLoadDetails) { +List indexes = new ArrayList<>(); +try { + for (TableIndex index : IndexStoreManager.getInstance().getAllCGAndFGIndexes(carbonTable)) { +if (index.getIndexSchema().isIndex()) { + indexes.add(index); +} + } +} catch (IOException e) { + LOGGER.warn(String.format( + "Failed to get indexes for %s.%s, therefore the index files could not be cleaned.", + carbonTable.getAbsoluteTableIdentifier().getDatabaseName(), + carbonTable.getAbsoluteTableIdentifier().getTableName())); +} +SegmentUpdateStatusManager updateStatusManager = +new SegmentUpdateStatusManager(carbonTable, currLoadDetails); +for (final LoadMetadataDetails oneLoad : loadDetails) { + if (checkIfLoadCanBeDeletedPhysically(oneLoad, isForceDelete)) { +try { + if (oneLoad.getSegmentFile() != null) { + SegmentFileStore.deleteSegment(carbonTable.getAbsoluteTableIdentifier().getTablePath(), +new Segment(oneLoad.getLoadName(), oneLoad.getSegmentFile()), specs, +updateStat
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516789854 ## File path: core/src/main/java/org/apache/carbondata/core/util/CarbonProperties.java ## @@ -2116,6 +2087,28 @@ public int getMaxSIRepairLimit(String dbName, String tableName) { return Math.abs(Integer.parseInt(thresholdValue)); } + /** + * The below method returns the microseconds after which the trash folder will expire + */ + public long getTrashFolderExpirationTime() { +String configuredValue = getProperty(CarbonCommonConstants.CARBON_TRASH_EXPIRATION_DAYS, +CarbonCommonConstants.CARBON_TRASH_EXPIRATION_DAYS_DEFAULT); +Integer result = 0; +try { + result = Integer.parseInt(configuredValue); + if (result < 0) { +LOGGER.warn("Value of carbon.trash.expiration.days is negative, taking default value"); +result = Integer.parseInt(CARBON_TRASH_EXPIRATION_DAYS_DEFAULT); + } +} catch (NumberFormatException e) { + LOGGER.error("Invalid value configured for CarbonCommonConstants" + + ".CARBON_TRASH_EXPIRATION_DAYS, considering the default value"); Review comment: done ## File path: core/src/main/java/org/apache/carbondata/core/util/CarbonProperties.java ## @@ -2116,6 +2087,28 @@ public int getMaxSIRepairLimit(String dbName, String tableName) { return Math.abs(Integer.parseInt(thresholdValue)); } + /** + * The below method returns the microseconds after which the trash folder will expire + */ + public long getTrashFolderExpirationTime() { +String configuredValue = getProperty(CarbonCommonConstants.CARBON_TRASH_EXPIRATION_DAYS, +CarbonCommonConstants.CARBON_TRASH_EXPIRATION_DAYS_DEFAULT); +Integer result = 0; +try { + result = Integer.parseInt(configuredValue); + if (result < 0) { +LOGGER.warn("Value of carbon.trash.expiration.days is negative, taking default value"); +result = Integer.parseInt(CARBON_TRASH_EXPIRATION_DAYS_DEFAULT); + } +} catch (NumberFormatException e) { + LOGGER.error("Invalid value configured for CarbonCommonConstants" 
+ + ".CARBON_TRASH_EXPIRATION_DAYS, considering the default value"); + result = Integer.parseInt(CARBON_TRASH_EXPIRATION_DAYS_DEFAULT); +} +Long microSecondsInADay = Long.valueOf(TimeUnit.DAYS.toMillis(1)); +return result * microSecondsInADay; Review comment: done ## File path: core/src/main/java/org/apache/carbondata/core/util/CarbonUtil.java ## @@ -3441,4 +3443,33 @@ public static void agingTempFolderForIndexServer(long agingTime)throws }); } } + + /** + * The below method tries to get the segment lock for the given segment. + */ + public static boolean tryGettingSegmentLock(LoadMetadataDetails oneLoad, + AbsoluteTableIdentifier absoluteTableIdentifier) { +ICarbonLock segmentLock = CarbonLockFactory.getCarbonLockObj(absoluteTableIdentifier, +CarbonTablePath.addSegmentPrefix(oneLoad.getLoadName()) + LockUsage.LOCK); +boolean canBeDeleted; +try { + if (segmentLock.lockWithRetries(CarbonCommonConstants + .NUMBER_OF_TRIES_FOR_CARBON_LOCK_DEFAULT, CarbonCommonConstants + .MAX_TIMEOUT_FOR_CARBON_LOCK_DEFAULT)) { +LOGGER.info("Info: Acquired segment lock on segment: " + oneLoad.getLoadName() + ". It " + +"can be deleted as load is not in progress"); +canBeDeleted = true; + } else { +LOGGER.info("Info: Load in progress for segment" + oneLoad.getLoadName()); +canBeDeleted = false; + } +} finally { + if (segmentLock.unlock()) { +LOGGER.info("Info: Segment lock on segment:" + oneLoad.getLoadName() + " is released"); + } else { +LOGGER.error("Error: Unable to release segment lock on : " + oneLoad.getLoadName()); Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516789762 ## File path: core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java ## @@ -1427,6 +1427,23 @@ private CarbonCommonConstants() { public static final String BITSET_PIPE_LINE_DEFAULT = "true"; + /** + * this is the user defined time(in days), when a specific timestamp subdirectory in + * trash folder will expire + */ + @CarbonProperty + public static final String CARBON_TRASH_EXPIRATION_DAYS = "carbon.trash.expiration.days"; + + /** + * Default expiration time of trash folder is 3 days. + */ + public static final String CARBON_TRASH_EXPIRATION_DAYS_DEFAULT = "3"; Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516789476 ## File path: core/src/main/java/org/apache/carbondata/core/util/CleanUtil.java ## @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.core.util; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.filesystem.CarbonFileFilter; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.index.IndexStoreManager; +import org.apache.carbondata.core.index.Segment; +import org.apache.carbondata.core.index.TableIndex; +import org.apache.carbondata.core.indexstore.PartitionSpec; +import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier; +import org.apache.carbondata.core.metadata.SegmentFileStore; +import org.apache.carbondata.core.metadata.schema.table.CarbonTable; +import org.apache.carbondata.core.mutate.CarbonUpdateUtil; +import org.apache.carbondata.core.statusmanager.LoadMetadataDetails; +import org.apache.carbondata.core.statusmanager.SegmentStatus; +import org.apache.carbondata.core.statusmanager.SegmentStatusManager; +import org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager; +import org.apache.carbondata.core.util.path.CarbonTablePath; + +import org.apache.log4j.Logger; + +/** + * Mantains the code used in clean files command, to delete the load folders and move the data + * to trash folder + */ +public final class CleanUtil { + + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonUtil.class.getName()); + + public static void physicalFactAndMeasureMetadataDeletion(CarbonTable carbonTable, + LoadMetadataDetails[] newAddedLoadHistoryList, boolean isForceDelete, + List specs) { +LoadMetadataDetails[] currentDetails = +SegmentStatusManager.readLoadMetadata(carbonTable.getMetadataPath()); +physicalFactAndMeasureMetadataDeletion(carbonTable, +currentDetails, +isForceDelete, +specs, +currentDetails); +if (newAddedLoadHistoryList != null && 
newAddedLoadHistoryList.length > 0) { + physicalFactAndMeasureMetadataDeletion(carbonTable, + newAddedLoadHistoryList, + isForceDelete, + specs, + currentDetails); +} + } + + /** + * Delete the invalid data physically from table. + * @param carbonTable table + * @param loadDetails Load details which need clean up + * @param isForceDelete is Force delete requested by user + * @param specs Partition specs + * @param currLoadDetails Current table status load details which are required for update manager. + */ + private static void physicalFactAndMeasureMetadataDeletion(CarbonTable carbonTable, + LoadMetadataDetails[] loadDetails, boolean isForceDelete, List specs, + LoadMetadataDetails[] currLoadDetails) { +List indexes = new ArrayList<>(); +try { + for (TableIndex index : IndexStoreManager.getInstance().getAllCGAndFGIndexes(carbonTable)) { +if (index.getIndexSchema().isIndex()) { + indexes.add(index); +} + } +} catch (IOException e) { + LOGGER.warn(String.format( + "Failed to get indexes for %s.%s, therefore the index files could not be cleaned.", + carbonTable.getAbsoluteTableIdentifier().getDatabaseName(), + carbonTable.getAbsoluteTableIdentifier().getTableName())); +} +SegmentUpdateStatusManager updateStatusManager = +new SegmentUpdateStatusManager(carbonTable, currLoadDetails); +for (final LoadMetadataDetails oneLoad : loadDetails) { + if (checkIfLoadCanBeDeletedPhysically(oneLoad, isForceDelete)) { +try { + if (oneLoad.getSegmentFile() != null) { + SegmentFileStore.deleteSegment(carbonTable.getAbsoluteTableIdentifier().getTablePath(), +new Segment(oneLoad.getLoadName(), oneLoad.getSegmentFile()), specs, +updateStat
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516789597 ## File path: core/src/main/java/org/apache/carbondata/core/util/path/TrashUtil.java ## @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.util.path; + +import java.io.File; +import java.io.IOException; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonCommonConstants; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.exception.CarbonFileException; +import org.apache.carbondata.core.util.CarbonUtil; + +import org.apache.commons.io.FileUtils; + +import org.apache.log4j.Logger; + +/** + * Mantains the trash folder in carbondata. This class has methods to copy data to the trash and + * remove data from the trash. 
+ */ +public final class TrashUtil { + + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonUtil.class.getName()); + + /** + * The below method copies the complete a file to the trash folder. Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. + * + * @param carbonTablePath table path of the carbon table + * @param pathOfFileToCopy the files which are to be moved to the trash folder + * @param suffixToAdd timestamp, partition folder(if any) and segment number + * @return + */ + public static void copyDataToTrashFolderByFile(String carbonTablePath, String pathOfFileToCopy, + String suffixToAdd) { +String trashFolderPath = CarbonTablePath.getTrashFolderPath(carbonTablePath) + +CarbonCommonConstants.FILE_SEPARATOR + suffixToAdd; +try { + if (new File(pathOfFileToCopy).exists()) { +FileUtils.copyFileToDirectory(new File(pathOfFileToCopy), new File(trashFolderPath)); +LOGGER.info("File: " + pathOfFileToCopy + " successfully copied to the trash folder: " ++ trashFolderPath); + } +} catch (IOException e) { + LOGGER.error("Unable to copy " + pathOfFileToCopy + " to the trash folder", e); +} + } + + /** + * The below method copies the complete segment folder to the trash folder. Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. 
+ * + * @param carbonTablePath table path of the carbon table + * @param path the folder which are to be moved to the trash folder + * @param suffixToAdd timestamp, partition folder(if any) and segment number + * @return + */ + public static void copyDataToTrashBySegment(CarbonFile path, String carbonTablePath, + String suffixToAdd) { +String trashFolderPath = CarbonTablePath.getTrashFolderPath(carbonTablePath) + +CarbonCommonConstants.FILE_SEPARATOR + suffixToAdd; +try { + FileUtils.copyDirectory(new File(path.getAbsolutePath()), new File(trashFolderPath)); + LOGGER.info("Segment: " + path.getAbsolutePath() + " has been copied to the trash folder" + + " successfully"); +} catch (IOException e) { + LOGGER.error("Unable to create the trash folder and copy data to it", e); +} + } + + /** + * The below method deletes timestamp subdirectories in the trash folder which have expired as + * per the user defined expiration time + */ + public static void deleteAllDataFromTrashFolderByTimeStamp(String carbonTablePath, Long timeStamp) + throws IOException { +String pathOfTrashFolder = CarbonTablePath.getTrashFolderPath(carbonTablePath); +// Deleting the timestamp based subdirectories in the trashfolder by the given timestamp. +if (FileFactory.isFileExist(pathOfTrashFolder)) { + try { +List carbonFileList = FileFactory.getFolderList(pathOfTrashFolder); +for (CarbonFile carbonFile : carbonFileList) { + String[]
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516788672 ## File path: core/src/main/java/org/apache/carbondata/core/util/CleanUtil.java ## @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.core.util; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.filesystem.CarbonFileFilter; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.index.IndexStoreManager; +import org.apache.carbondata.core.index.Segment; +import org.apache.carbondata.core.index.TableIndex; +import org.apache.carbondata.core.indexstore.PartitionSpec; +import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier; +import org.apache.carbondata.core.metadata.SegmentFileStore; +import org.apache.carbondata.core.metadata.schema.table.CarbonTable; +import org.apache.carbondata.core.mutate.CarbonUpdateUtil; +import org.apache.carbondata.core.statusmanager.LoadMetadataDetails; +import org.apache.carbondata.core.statusmanager.SegmentStatus; +import org.apache.carbondata.core.statusmanager.SegmentStatusManager; +import org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager; +import org.apache.carbondata.core.util.path.CarbonTablePath; + +import org.apache.log4j.Logger; + +/** + * Mantains the code used in clean files command, to delete the load folders and move the data + * to trash folder + */ +public final class CleanUtil { + + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonUtil.class.getName()); + + public static void physicalFactAndMeasureMetadataDeletion(CarbonTable carbonTable, + LoadMetadataDetails[] newAddedLoadHistoryList, boolean isForceDelete, + List specs) { +LoadMetadataDetails[] currentDetails = +SegmentStatusManager.readLoadMetadata(carbonTable.getMetadataPath()); +physicalFactAndMeasureMetadataDeletion(carbonTable, +currentDetails, +isForceDelete, +specs, +currentDetails); +if (newAddedLoadHistoryList != null && 
newAddedLoadHistoryList.length > 0) { + physicalFactAndMeasureMetadataDeletion(carbonTable, + newAddedLoadHistoryList, + isForceDelete, + specs, + currentDetails); +} + } + + /** + * Delete the invalid data physically from table. + * @param carbonTable table + * @param loadDetails Load details which need clean up + * @param isForceDelete is Force delete requested by user + * @param specs Partition specs + * @param currLoadDetails Current table status load details which are required for update manager. + */ + private static void physicalFactAndMeasureMetadataDeletion(CarbonTable carbonTable, + LoadMetadataDetails[] loadDetails, boolean isForceDelete, List specs, + LoadMetadataDetails[] currLoadDetails) { +List indexes = new ArrayList<>(); +try { + for (TableIndex index : IndexStoreManager.getInstance().getAllCGAndFGIndexes(carbonTable)) { +if (index.getIndexSchema().isIndex()) { + indexes.add(index); +} + } +} catch (IOException e) { + LOGGER.warn(String.format( + "Failed to get indexes for %s.%s, therefore the index files could not be cleaned.", + carbonTable.getAbsoluteTableIdentifier().getDatabaseName(), + carbonTable.getAbsoluteTableIdentifier().getTableName())); +} +SegmentUpdateStatusManager updateStatusManager = +new SegmentUpdateStatusManager(carbonTable, currLoadDetails); +for (final LoadMetadataDetails oneLoad : loadDetails) { + if (checkIfLoadCanBeDeletedPhysically(oneLoad, isForceDelete)) { +try { + if (oneLoad.getSegmentFile() != null) { + SegmentFileStore.deleteSegment(carbonTable.getAbsoluteTableIdentifier().getTablePath(), +new Segment(oneLoad.getLoadName(), oneLoad.getSegmentFile()), specs, +updateStat
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r516787255 ## File path: core/src/main/java/org/apache/carbondata/core/util/path/TrashUtil.java ## @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.util.path; + +import java.io.File; +import java.io.IOException; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonCommonConstants; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.exception.CarbonFileException; +import org.apache.carbondata.core.util.CarbonUtil; + +import org.apache.commons.io.FileUtils; + +import org.apache.log4j.Logger; + +/** + * Mantains the trash folder in carbondata. This class has methods to copy data to the trash and + * remove data from the trash. 
+ */ +public final class TrashUtil { + + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonUtil.class.getName()); + + /** + * The below method copies the complete a file to the trash folder. Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. + * + * @param carbonTablePath table path of the carbon table + * @param pathOfFileToCopy the files which are to be moved to the trash folder + * @param suffixToAdd timestamp, partition folder(if any) and segment number + * @return + */ + public static void copyDataToTrashFolderByFile(String carbonTablePath, String pathOfFileToCopy, + String suffixToAdd) { +String trashFolderPath = CarbonTablePath.getTrashFolderPath(carbonTablePath) + +CarbonCommonConstants.FILE_SEPARATOR + suffixToAdd; +try { + if (new File(pathOfFileToCopy).exists()) { +FileUtils.copyFileToDirectory(new File(pathOfFileToCopy), new File(trashFolderPath)); +LOGGER.info("File: " + pathOfFileToCopy + " successfully copied to the trash folder: " ++ trashFolderPath); + } +} catch (IOException e) { + LOGGER.error("Unable to copy " + pathOfFileToCopy + " to the trash folder", e); +} + } + + /** + * The below method copies the complete segment folder to the trash folder. Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. 
+ * + * @param carbonTablePath table path of the carbon table + * @param path the folder which are to be moved to the trash folder + * @param suffixToAdd timestamp, partition folder(if any) and segment number + * @return + */ + public static void copyDataToTrashBySegment(CarbonFile path, String carbonTablePath, + String suffixToAdd) { +String trashFolderPath = CarbonTablePath.getTrashFolderPath(carbonTablePath) + +CarbonCommonConstants.FILE_SEPARATOR + suffixToAdd; +try { + FileUtils.copyDirectory(new File(path.getAbsolutePath()), new File(trashFolderPath)); + LOGGER.info("Segment: " + path.getAbsolutePath() + " has been copied to the trash folder" + + " successfully"); +} catch (IOException e) { + LOGGER.error("Unable to create the trash folder and copy data to it", e); +} + } + + /** + * The below method deletes timestamp subdirectories in the trash folder which have expired as + * per the user defined expiration time + */ + public static void deleteAllDataFromTrashFolderByTimeStamp(String carbonTablePath, Long timeStamp) + throws IOException { +String pathOfTrashFolder = CarbonTablePath.getTrashFolderPath(carbonTablePath); +// Deleting the timestamp based subdirectories in the trashfolder by the given timestamp. +if (FileFactory.isFileExist(pathOfTrashFolder)) { + try { +List carbonFileList = FileFactory.getFolderList(pathOfTrashFolder); +for (CarbonFile carbonFile : carbonFileList) { + String[]
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r515789003 ## File path: docs/cleanfiles.md ## @@ -0,0 +1,78 @@ + + + +## CLEAN FILES + +Clean files command is used to remove the Compacted, Marked For Delete, In Progress which are stale and Partial (Segments which are missing from the table status file but their data is present) + segments from the store. + + Clean Files Command + ``` + CLEAN FILES FOR TABLE TABLE_NAME + ``` + + +### TRASH FOLDER + + Carbondata supports a Trash Folder which is used as a redundant folder where all the unnecessary files and folders are moved to during clean files operation. Review comment: done ## File path: docs/cleanfiles.md ## @@ -0,0 +1,78 @@ + + + +## CLEAN FILES + +Clean files command is used to remove the Compacted, Marked For Delete, In Progress which are stale and Partial (Segments which are missing from the table status file but their data is present) + segments from the store. + + Clean Files Command + ``` + CLEAN FILES FOR TABLE TABLE_NAME + ``` + + +### TRASH FOLDER + + Carbondata supports a Trash Folder which is used as a redundant folder where all the unnecessary files and folders are moved to during clean files operation. + This trash folder is maintained inside the table path. It is a hidden folder (.Trash). The segments that are moved to the trash folder are maintained under a timestamp + subfolder (timestamp at which clean files operation is called). This helps the user to list down segments by timestamp. By default all the timestamp sub-directories have an expiration + time of (3 days since that timestamp) and it can be configured by the user using the following carbon property + ``` + carbon.trash.expiration.time = "Number of days" + ``` + Once the timestamp subdirectory is expired as per the configured expiration day value, the subdirectory is deleted from the trash folder in the subsequent clean files command.
+ + + + +### DRY RUN + Support for dry run is provided before the actual clean files operation. This dry run operation will list down all the segments which are going to be manipulated during + the clean files operation. The dry run result will show the current location of the segment (it can be in FACT folder, Partition folder or trash folder) and where that segment Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r514429233 ## File path: integration/spark/src/main/scala/org/apache/carbondata/cleanfiles/CleanFilesUtil.scala ## @@ -0,0 +1,400 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.cleanfiles + +import java.util + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ListBuffer + +import org.apache.spark.sql.{AnalysisException, CarbonEnv, Row, SparkSession} +import org.apache.spark.sql.index.CarbonIndexUtil + +import org.apache.carbondata.common.logging.LogServiceFactory +import org.apache.carbondata.core.constants.CarbonCommonConstants +import org.apache.carbondata.core.datastore.filesystem.CarbonFile +import org.apache.carbondata.core.datastore.impl.FileFactory +import org.apache.carbondata.core.exception.ConcurrentOperationException +import org.apache.carbondata.core.indexstore.PartitionSpec +import org.apache.carbondata.core.locks.{CarbonLockFactory, CarbonLockUtil, ICarbonLock, LockUsage} +import org.apache.carbondata.core.metadata.{AbsoluteTableIdentifier, CarbonMetadata, SegmentFileStore} +import org.apache.carbondata.core.metadata.schema.table.CarbonTable +import org.apache.carbondata.core.mutate.CarbonUpdateUtil +import org.apache.carbondata.core.statusmanager.{LoadMetadataDetails, SegmentStatus, SegmentStatusManager} +import org.apache.carbondata.core.util.{CarbonProperties, CarbonUtil} +import org.apache.carbondata.core.util.path.{CarbonTablePath, TrashUtil} +import org.apache.carbondata.processing.loading.TableProcessingOperations + +object CleanFilesUtil { + private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + + /** + * The method deletes all data if forceTableClean and clean garbage segment + * (MARKED_FOR_DELETE state) if forceTableClean + * + * @param dbName : Database name + * @param tableName : Table name + * @param tablePath : Table path + * @param carbonTable: CarbonTable Object in case of force clean + * @param forceTableClean: for force clean it will delete all data + *it will clean garbage segment (MARKED_FOR_DELETE state) + * @param currentTablePartitions : Hive Partitions details + */ + def cleanFiles( +dbName: String, 
+tableName: String, +tablePath: String, +carbonTable: CarbonTable, +forceTableClean: Boolean, +currentTablePartitions: Option[Seq[PartitionSpec]] = None, +truncateTable: Boolean = false): Unit = { +var carbonCleanFilesLock: ICarbonLock = null +val absoluteTableIdentifier = if (forceTableClean) { + AbsoluteTableIdentifier.from(tablePath, dbName, tableName, tableName) +} else { + carbonTable.getAbsoluteTableIdentifier +} +try { + val errorMsg = "Clean files request is failed for " + +s"$dbName.$tableName" + +". Not able to acquire the clean files lock due to another clean files " + +"operation is running in the background." + // in case of force clean the lock is not required + if (forceTableClean) { +FileFactory.deleteAllCarbonFilesOfDir( + FileFactory.getCarbonFile(absoluteTableIdentifier.getTablePath)) + } else { +carbonCleanFilesLock = + CarbonLockUtil +.getLockObject(absoluteTableIdentifier, LockUsage.CLEAN_FILES_LOCK, errorMsg) +if (truncateTable) { + SegmentStatusManager.truncateTable(carbonTable) +} +SegmentStatusManager.deleteLoadsAndUpdateMetadata( + carbonTable, true, currentTablePartitions.map(_.asJava).orNull) +CarbonUpdateUtil.cleanUpDeltaFiles(carbonTable, true) Review comment: no, we are copying the complete segment to trash, so no issues with delta files. I added a test case too with delete delta This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For querie
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r514271886 ## File path: core/src/main/java/org/apache/carbondata/core/statusmanager/SegmentStatusManager.java ## @@ -1136,7 +1137,8 @@ public static void deleteLoadsAndUpdateMetadata(CarbonTable carbonTable, boolean if (updateCompletionStatus) { DeleteLoadFolders .physicalFactAndMeasureMetadataDeletion(carbonTable, newAddedLoadHistoryList, -isForceDeletion, partitionSpecs); +isForceDeletion, partitionSpecs, String.valueOf(new Timestamp(System Review comment: yes, moved This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r513252427 ## File path: core/src/main/java/org/apache/carbondata/core/util/DeleteLoadFolders.java ## @@ -138,8 +143,19 @@ public boolean accept(CarbonFile file) { if (filesToBeDeleted.length == 0) { status = true; } else { - for (CarbonFile eachFile : filesToBeDeleted) { + // If the file to be deleted is a carbondata file, index file, index merge file + // or a delta file, copy that file to the trash folder. + if ((eachFile.getName().endsWith(CarbonCommonConstants.FACT_FILE_EXT) || Review comment: changed, it's not needed anymore This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r513243540 ## File path: integration/spark/src/main/scala/org/apache/carbondata/cleanfiles/CleanFilesUtil.scala ## @@ -0,0 +1,327 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.cleanfiles + +import scala.collection.JavaConverters._ + +import org.apache.spark.sql.{AnalysisException, CarbonEnv, Row, SparkSession} +import org.apache.spark.sql.index.CarbonIndexUtil + +import org.apache.carbondata.common.logging.LogServiceFactory +import org.apache.carbondata.core.constants.CarbonCommonConstants +import org.apache.carbondata.core.datastore.filesystem.CarbonFile +import org.apache.carbondata.core.datastore.impl.FileFactory +import org.apache.carbondata.core.exception.ConcurrentOperationException +import org.apache.carbondata.core.indexstore.PartitionSpec +import org.apache.carbondata.core.locks.{CarbonLockFactory, CarbonLockUtil, ICarbonLock, LockUsage} +import org.apache.carbondata.core.metadata.{AbsoluteTableIdentifier, CarbonMetadata, SegmentFileStore} +import org.apache.carbondata.core.metadata.schema.table.CarbonTable +import org.apache.carbondata.core.mutate.CarbonUpdateUtil +import org.apache.carbondata.core.statusmanager.{LoadMetadataDetails, SegmentStatus, SegmentStatusManager} +import org.apache.carbondata.core.util.{CarbonProperties, CarbonUtil} +import org.apache.carbondata.core.util.path.{CarbonTablePath, TrashUtil} +import org.apache.carbondata.processing.loading.TableProcessingOperations +import org.apache.carbondata.processing.loading.model.CarbonLoadModel + +object CleanFilesUtil { + private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + + /** + * The method deletes all data if forceTableClean and clean garbage segment + * (MARKED_FOR_DELETE state) if forceTableClean + * + * @param dbName : Database name + * @param tableName : Table name + * @param tablePath : Table path + * @param carbonTable: CarbonTable Object in case of force clean + * @param forceTableClean: for force clean it will delete all data + *it will clean garbage segment (MARKED_FOR_DELETE state) + * @param currentTablePartitions : Hive Partitions details + */ + def cleanFiles( + dbName: String,
+ tableName: String, + tablePath: String, + carbonTable: CarbonTable, + forceTableClean: Boolean, + currentTablePartitions: Option[Seq[PartitionSpec]] = None, + truncateTable: Boolean = false): Unit = { +var carbonCleanFilesLock: ICarbonLock = null +val absoluteTableIdentifier = if (forceTableClean) { + AbsoluteTableIdentifier.from(tablePath, dbName, tableName, tableName) +} else { + carbonTable.getAbsoluteTableIdentifier +} +try { + val errorMsg = "Clean files request is failed for " + +s"$dbName.$tableName" + +". Not able to acquire the clean files lock due to another clean files " + +"operation is running in the background." + // in case of force clean the lock is not required + if (forceTableClean) { +FileFactory.deleteAllCarbonFilesOfDir( + FileFactory.getCarbonFile(absoluteTableIdentifier.getTablePath)) + } else { +carbonCleanFilesLock = + CarbonLockUtil +.getLockObject(absoluteTableIdentifier, LockUsage.CLEAN_FILES_LOCK, errorMsg) +if (truncateTable) { + SegmentStatusManager.truncateTable(carbonTable) +} +SegmentStatusManager.deleteLoadsAndUpdateMetadata( + carbonTable, true, currentTablePartitions.map(_.asJava).orNull) +CarbonUpdateUtil.cleanUpDeltaFiles(carbonTable, true) +currentTablePartitions match { + case Some(partitions) => +SegmentFileStore.cleanSegments( + carbonTable, + currentTablePartitions.map(_.asJava).orNull, + true) + case _ => +} + } +} finally
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r513229197 ## File path: core/src/main/java/org/apache/carbondata/core/util/DeleteLoadFolders.java ## @@ -113,12 +116,24 @@ private static void physicalFactAndMeasureMetadataDeletion(CarbonTable carbonTab SegmentUpdateStatusManager updateStatusManager = new SegmentUpdateStatusManager(carbonTable, currLoadDetails); for (final LoadMetadataDetails oneLoad : loadDetails) { - if (checkIfLoadCanBeDeletedPhysically(oneLoad, isForceDelete)) { + if (checkIfLoadCanBeDeletedPhysically(oneLoad, isForceDelete, carbonTable + .getAbsoluteTableIdentifier())) { try { + // if insert in progress, then move it to trash + if (oneLoad.getSegmentStatus() == SegmentStatus.INSERT_IN_PROGRESS && !carbonTable + .isHivePartitionTable()) { +// move this segment to trash + TrashUtil.copyDataToTrashBySegment(FileFactory.getCarbonFile(CarbonTablePath +.getFactDir(carbonTable.getTablePath()) + "/Part0/Segment_" + oneLoad Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r513225060 ## File path: core/src/main/java/org/apache/carbondata/core/util/path/TrashUtil.java ## @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.util.path; + +import java.io.File; +import java.io.IOException; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonCommonConstants; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.exception.CarbonFileException; +import org.apache.carbondata.core.util.CarbonUtil; + +import org.apache.commons.io.FileUtils; + +import org.apache.log4j.Logger; + +public final class TrashUtil { + + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonUtil.class.getName()); + + /** + * The below method copies the complete a file to the trash folder. 
Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. + */ + public static void copyDataToTrashFolderByFile(String carbonTablePath, String pathOfFileToCopy, + String suffixToAdd) { +String trashFolderPath = CarbonTablePath.getTrashFolderPath(carbonTablePath) + +CarbonCommonConstants.FILE_SEPARATOR + suffixToAdd; +try { + if (new File(pathOfFileToCopy).exists()) { +FileUtils.copyFileToDirectory(new File(pathOfFileToCopy), new File(trashFolderPath)); +LOGGER.info("File: " + pathOfFileToCopy + " successfully copied to the trash folder: " ++ trashFolderPath); + } +} catch (IOException e) { + LOGGER.error("Unable to copy " + pathOfFileToCopy + " to the trash folder", e); +} + } + + /** + * The below method copies the complete segment folder to the trash folder. Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. + */ + public static void copyDataToTrashBySegment(CarbonFile path, String carbonTablePath, Review comment: done ## File path: core/src/main/java/org/apache/carbondata/core/util/path/TrashUtil.java ## @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.util.path; + +import java.io.File; +import java.io.IOException; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonCommonConstants; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.exception.CarbonFileException; +import org.apache.carbondata.core.util.CarbonUtil; + +import org.apache.commons.io.FileUtils; + +import org.apache.log4j.Logger; + +public final class TrashUtil { + + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonUtil.class.getName()); + + /** + * The below method copies the complete a file to the trash folder. Pr
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r513220391 ## File path: core/src/main/java/org/apache/carbondata/core/util/path/TrashUtil.java ## @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.util.path; + +import java.io.File; +import java.io.IOException; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonCommonConstants; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.exception.CarbonFileException; +import org.apache.carbondata.core.util.CarbonUtil; + +import org.apache.commons.io.FileUtils; + +import org.apache.log4j.Logger; + +public final class TrashUtil { Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r513218983 ## File path: integration/spark/src/main/scala/org/apache/carbondata/cleanfiles/CleanFilesUtil.scala ## @@ -0,0 +1,406 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.cleanfiles + +import java.sql.Timestamp +import java.util + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ListBuffer + +import org.apache.spark.sql.{AnalysisException, CarbonEnv, Row, SparkSession} +import org.apache.spark.sql.index.CarbonIndexUtil + +import org.apache.carbondata.common.logging.LogServiceFactory +import org.apache.carbondata.core.constants.CarbonCommonConstants +import org.apache.carbondata.core.datastore.filesystem.CarbonFile +import org.apache.carbondata.core.datastore.impl.FileFactory +import org.apache.carbondata.core.exception.ConcurrentOperationException +import org.apache.carbondata.core.indexstore.PartitionSpec +import org.apache.carbondata.core.locks.{CarbonLockFactory, CarbonLockUtil, ICarbonLock, LockUsage} +import org.apache.carbondata.core.metadata.{AbsoluteTableIdentifier, CarbonMetadata, SegmentFileStore} +import org.apache.carbondata.core.metadata.schema.table.CarbonTable +import org.apache.carbondata.core.mutate.CarbonUpdateUtil +import org.apache.carbondata.core.statusmanager.{LoadMetadataDetails, SegmentStatus, SegmentStatusManager} +import org.apache.carbondata.core.util.{CarbonProperties, CarbonUtil} +import org.apache.carbondata.core.util.path.{CarbonTablePath, TrashUtil} +import org.apache.carbondata.processing.loading.TableProcessingOperations +import org.apache.carbondata.processing.loading.model.CarbonLoadModel + +object CleanFilesUtil { + private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + + /** + * The method deletes all data if forceTableClean and clean garbage segment + * (MARKED_FOR_DELETE state) if forceTableClean + * + * @param dbName : Database name + * @param tableName : Table name + * @param tablePath : Table path + * @param carbonTable: CarbonTable Object in case of force clean + * @param forceTableClean: for force clean it will delete all data + *it will clean garbage segment (MARKED_FOR_DELETE state) + * 
@param currentTablePartitions : Hive Partitions details + */ + def cleanFiles( +dbName: String, +tableName: String, +tablePath: String, +carbonTable: CarbonTable, +forceTableClean: Boolean, +currentTablePartitions: Option[Seq[PartitionSpec]] = None, +truncateTable: Boolean = false): Unit = { +var carbonCleanFilesLock: ICarbonLock = null +val absoluteTableIdentifier = if (forceTableClean) { + AbsoluteTableIdentifier.from(tablePath, dbName, tableName, tableName) +} else { + carbonTable.getAbsoluteTableIdentifier +} +try { + val errorMsg = "Clean files request is failed for " + +s"$dbName.$tableName" + +". Not able to acquire the clean files lock due to another clean files " + +"operation is running in the background." + // in case of force clean the lock is not required + if (forceTableClean) { +FileFactory.deleteAllCarbonFilesOfDir( + FileFactory.getCarbonFile(absoluteTableIdentifier.getTablePath)) + } else { +carbonCleanFilesLock = + CarbonLockUtil +.getLockObject(absoluteTableIdentifier, LockUsage.CLEAN_FILES_LOCK, errorMsg) +if (truncateTable) { + SegmentStatusManager.truncateTable(carbonTable) +} +SegmentStatusManager.deleteLoadsAndUpdateMetadata( + carbonTable, true, currentTablePartitions.map(_.asJava).orNull) +CarbonUpdateUtil.cleanUpDeltaFiles(carbonTable, true) +currentTablePartitions match { + case Some(partitions) => +SegmentFileStore.cleanSegments( + carbonTable, + currentTablePartitions.map(_.asJava).orNull, + true) + case _ => +} + } +} finally { +
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r513217586 ## File path: integration/spark/src/main/scala/org/apache/carbondata/cleanfiles/CleanFilesUtil.scala ## @@ -0,0 +1,406 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.cleanfiles + +import java.sql.Timestamp +import java.util + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ListBuffer + +import org.apache.spark.sql.{AnalysisException, CarbonEnv, Row, SparkSession} +import org.apache.spark.sql.index.CarbonIndexUtil + +import org.apache.carbondata.common.logging.LogServiceFactory +import org.apache.carbondata.core.constants.CarbonCommonConstants +import org.apache.carbondata.core.datastore.filesystem.CarbonFile +import org.apache.carbondata.core.datastore.impl.FileFactory +import org.apache.carbondata.core.exception.ConcurrentOperationException +import org.apache.carbondata.core.indexstore.PartitionSpec +import org.apache.carbondata.core.locks.{CarbonLockFactory, CarbonLockUtil, ICarbonLock, LockUsage} +import org.apache.carbondata.core.metadata.{AbsoluteTableIdentifier, CarbonMetadata, SegmentFileStore} +import org.apache.carbondata.core.metadata.schema.table.CarbonTable +import org.apache.carbondata.core.mutate.CarbonUpdateUtil +import org.apache.carbondata.core.statusmanager.{LoadMetadataDetails, SegmentStatus, SegmentStatusManager} +import org.apache.carbondata.core.util.{CarbonProperties, CarbonUtil} +import org.apache.carbondata.core.util.path.{CarbonTablePath, TrashUtil} +import org.apache.carbondata.processing.loading.TableProcessingOperations +import org.apache.carbondata.processing.loading.model.CarbonLoadModel + +object CleanFilesUtil { + private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + + /** + * The method deletes all data if forceTableClean and clean garbage segment + * (MARKED_FOR_DELETE state) if forceTableClean + * + * @param dbName : Database name + * @param tableName : Table name + * @param tablePath : Table path + * @param carbonTable: CarbonTable Object in case of force clean + * @param forceTableClean: for force clean it will delete all data + *it will clean garbage segment (MARKED_FOR_DELETE state) + * 
@param currentTablePartitions : Hive Partitions details + */ + def cleanFiles( +dbName: String, +tableName: String, +tablePath: String, +carbonTable: CarbonTable, +forceTableClean: Boolean, +currentTablePartitions: Option[Seq[PartitionSpec]] = None, +truncateTable: Boolean = false): Unit = { +var carbonCleanFilesLock: ICarbonLock = null +val absoluteTableIdentifier = if (forceTableClean) { + AbsoluteTableIdentifier.from(tablePath, dbName, tableName, tableName) +} else { + carbonTable.getAbsoluteTableIdentifier +} +try { + val errorMsg = "Clean files request is failed for " + +s"$dbName.$tableName" + +". Not able to acquire the clean files lock due to another clean files " + +"operation is running in the background." + // in case of force clean the lock is not required + if (forceTableClean) { +FileFactory.deleteAllCarbonFilesOfDir( + FileFactory.getCarbonFile(absoluteTableIdentifier.getTablePath)) + } else { +carbonCleanFilesLock = + CarbonLockUtil +.getLockObject(absoluteTableIdentifier, LockUsage.CLEAN_FILES_LOCK, errorMsg) +if (truncateTable) { + SegmentStatusManager.truncateTable(carbonTable) +} +SegmentStatusManager.deleteLoadsAndUpdateMetadata( + carbonTable, true, currentTablePartitions.map(_.asJava).orNull) +CarbonUpdateUtil.cleanUpDeltaFiles(carbonTable, true) +currentTablePartitions match { + case Some(partitions) => +SegmentFileStore.cleanSegments( + carbonTable, + currentTablePartitions.map(_.asJava).orNull, + true) + case _ => +} + } +} finally { +
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r513217211 ## File path: integration/spark/src/main/scala/org/apache/carbondata/cleanfiles/CleanFilesUtil.scala ## @@ -0,0 +1,406 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.cleanfiles + +import java.sql.Timestamp +import java.util + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ListBuffer + +import org.apache.spark.sql.{AnalysisException, CarbonEnv, Row, SparkSession} +import org.apache.spark.sql.index.CarbonIndexUtil + +import org.apache.carbondata.common.logging.LogServiceFactory +import org.apache.carbondata.core.constants.CarbonCommonConstants +import org.apache.carbondata.core.datastore.filesystem.CarbonFile +import org.apache.carbondata.core.datastore.impl.FileFactory +import org.apache.carbondata.core.exception.ConcurrentOperationException +import org.apache.carbondata.core.indexstore.PartitionSpec +import org.apache.carbondata.core.locks.{CarbonLockFactory, CarbonLockUtil, ICarbonLock, LockUsage} +import org.apache.carbondata.core.metadata.{AbsoluteTableIdentifier, CarbonMetadata, SegmentFileStore} +import org.apache.carbondata.core.metadata.schema.table.CarbonTable +import org.apache.carbondata.core.mutate.CarbonUpdateUtil +import org.apache.carbondata.core.statusmanager.{LoadMetadataDetails, SegmentStatus, SegmentStatusManager} +import org.apache.carbondata.core.util.{CarbonProperties, CarbonUtil} Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r513215028 ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ## @@ -80,40 +113,98 @@ case class CarbonCleanFilesCommand( } override def processData(sparkSession: SparkSession): Seq[Row] = { -// if insert overwrite in progress, do not allow delete segment -if (SegmentStatusManager.isOverwriteInProgressInTable(carbonTable)) { - throw new ConcurrentOperationException(carbonTable, "insert overwrite", "clean file") +if (!isDryRun) { + // if insert overwrite in progress, do not allow delete segment + if (SegmentStatusManager.isOverwriteInProgressInTable(carbonTable)) { +throw new ConcurrentOperationException(carbonTable, "insert overwrite", "clean file") + } + val operationContext = new OperationContext + val cleanFilesPreEvent: CleanFilesPreEvent = +CleanFilesPreEvent(carbonTable, + sparkSession) Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r513215239 ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ## @@ -80,40 +113,98 @@ case class CarbonCleanFilesCommand( } override def processData(sparkSession: SparkSession): Seq[Row] = { -// if insert overwrite in progress, do not allow delete segment -if (SegmentStatusManager.isOverwriteInProgressInTable(carbonTable)) { - throw new ConcurrentOperationException(carbonTable, "insert overwrite", "clean file") +if (!isDryRun) { + // if insert overwrite in progress, do not allow delete segment + if (SegmentStatusManager.isOverwriteInProgressInTable(carbonTable)) { +throw new ConcurrentOperationException(carbonTable, "insert overwrite", "clean file") + } + val operationContext = new OperationContext + val cleanFilesPreEvent: CleanFilesPreEvent = +CleanFilesPreEvent(carbonTable, + sparkSession) + OperationListenerBus.getInstance.fireEvent(cleanFilesPreEvent, operationContext) + if (tableName.isDefined) { +Checker.validateTableExists(databaseNameOp, tableName.get, sparkSession) +if (forceTrashClean) { + CleanFilesUtil.deleteDataFromTrashFolder(carbonTable, sparkSession) +} else { + // clear trash based on timestamp + CleanFilesUtil.deleteDataFromTrashFolderByTimeStamp(carbonTable, sparkSession) +} +if (forceTableClean) { + deleteAllData(sparkSession, databaseNameOp, tableName.get) +} else { + cleanGarbageData(sparkSession, databaseNameOp, tableName.get) +} +// delete partial load and send them to trash +TableProcessingOperations + .deletePartialLoadDataIfExist(carbonTable, false) +// clean stash in metadata folder too +deleteStashInMetadataFolder(carbonTable) + } else { +cleanGarbageDataInAllTables(sparkSession) + } + if (cleanFileCommands != null) { +cleanFileCommands.foreach(_.processData(sparkSession)) + } + val cleanFilesPostEvent: CleanFilesPostEvent = 
+CleanFilesPostEvent(carbonTable, sparkSession) + OperationListenerBus.getInstance.fireEvent(cleanFilesPostEvent, operationContext) + Seq.empty +} else if (isDryRun && tableName.isDefined) { + // dry run, do not clean anything and do not delete trash too + CleanFilesUtil.cleanFilesDryRun(carbonTable, sparkSession) +} +else { Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r513214833 ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ## @@ -80,40 +113,98 @@ case class CarbonCleanFilesCommand( } override def processData(sparkSession: SparkSession): Seq[Row] = { -// if insert overwrite in progress, do not allow delete segment -if (SegmentStatusManager.isOverwriteInProgressInTable(carbonTable)) { - throw new ConcurrentOperationException(carbonTable, "insert overwrite", "clean file") +if (!isDryRun) { + // if insert overwrite in progress, do not allow delete segment + if (SegmentStatusManager.isOverwriteInProgressInTable(carbonTable)) { +throw new ConcurrentOperationException(carbonTable, "insert overwrite", "clean file") + } + val operationContext = new OperationContext + val cleanFilesPreEvent: CleanFilesPreEvent = +CleanFilesPreEvent(carbonTable, + sparkSession) + OperationListenerBus.getInstance.fireEvent(cleanFilesPreEvent, operationContext) + if (tableName.isDefined) { +Checker.validateTableExists(databaseNameOp, tableName.get, sparkSession) +if (forceTrashClean) { + CleanFilesUtil.deleteDataFromTrashFolder(carbonTable, sparkSession) +} else { + // clear trash based on timestamp + CleanFilesUtil.deleteDataFromTrashFolderByTimeStamp(carbonTable, sparkSession) +} +if (forceTableClean) { + deleteAllData(sparkSession, databaseNameOp, tableName.get) +} else { + cleanGarbageData(sparkSession, databaseNameOp, tableName.get) +} +// delete partial load and send them to trash +TableProcessingOperations + .deletePartialLoadDataIfExist(carbonTable, false) +// clean stash in metadata folder too +deleteStashInMetadataFolder(carbonTable) + } else { +cleanGarbageDataInAllTables(sparkSession) + } + if (cleanFileCommands != null) { +cleanFileCommands.foreach(_.processData(sparkSession)) + } + val cleanFilesPostEvent: CleanFilesPostEvent = 
+CleanFilesPostEvent(carbonTable, sparkSession) + OperationListenerBus.getInstance.fireEvent(cleanFilesPostEvent, operationContext) + Seq.empty +} else if (isDryRun && tableName.isDefined) { + // dry run, do not clean anything and do not delete trash too + CleanFilesUtil.cleanFilesDryRun(carbonTable, sparkSession) +} +else { + Seq.empty } -val operationContext = new OperationContext -val cleanFilesPreEvent: CleanFilesPreEvent = - CleanFilesPreEvent(carbonTable, -sparkSession) -OperationListenerBus.getInstance.fireEvent(cleanFilesPreEvent, operationContext) -if (tableName.isDefined) { - Checker.validateTableExists(databaseNameOp, tableName.get, sparkSession) - if (forceTableClean) { -deleteAllData(sparkSession, databaseNameOp, tableName.get) + } + + def deleteStashInMetadataFolder(carbonTable: CarbonTable): Unit = { Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r513209768 ## File path: core/src/main/java/org/apache/carbondata/core/util/path/CarbonTablePath.java ## @@ -47,6 +47,7 @@ public static final String BATCH_PREFIX = "_batchno"; private static final String LOCK_DIR = "LockFiles"; + public static final String SEGMENTS_METADATA_FOLDER = "segments"; Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r513205026 ## File path: processing/src/main/java/org/apache/carbondata/processing/loading/TableProcessingOperations.java ## @@ -149,9 +132,171 @@ public static void deletePartialLoadDataIfExist(CarbonTable carbonTable, } finally { carbonTableStatusLock.unlock(); } +} else { + + int retryCount = CarbonLockUtil + .getLockProperty(CarbonCommonConstants.NUMBER_OF_TRIES_FOR_CONCURRENT_LOCK, + CarbonCommonConstants.NUMBER_OF_TRIES_FOR_CONCURRENT_LOCK_DEFAULT); + int maxTimeout = CarbonLockUtil + .getLockProperty(CarbonCommonConstants.MAX_TIMEOUT_FOR_CONCURRENT_LOCK, + CarbonCommonConstants.MAX_TIMEOUT_FOR_CONCURRENT_LOCK_DEFAULT); + ICarbonLock carbonTableStatusLock = CarbonLockFactory + .getCarbonLockObj(carbonTable.getAbsoluteTableIdentifier(), LockUsage.TABLE_STATUS_LOCK); + + try { +if (carbonTableStatusLock.lockWithRetries(retryCount, maxTimeout)) { + Review comment: done ## File path: processing/src/main/java/org/apache/carbondata/processing/loading/TableProcessingOperations.java ## @@ -149,9 +132,171 @@ public static void deletePartialLoadDataIfExist(CarbonTable carbonTable, } finally { carbonTableStatusLock.unlock(); } +} else { + Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r513205119 ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ## @@ -80,40 +113,98 @@ case class CarbonCleanFilesCommand( } override def processData(sparkSession: SparkSession): Seq[Row] = { -// if insert overwrite in progress, do not allow delete segment -if (SegmentStatusManager.isOverwriteInProgressInTable(carbonTable)) { - throw new ConcurrentOperationException(carbonTable, "insert overwrite", "clean file") +if (!isDryRun) { + // if insert overwrite in progress, do not allow delete segment + if (SegmentStatusManager.isOverwriteInProgressInTable(carbonTable)) { +throw new ConcurrentOperationException(carbonTable, "insert overwrite", "clean file") + } + val operationContext = new OperationContext + val cleanFilesPreEvent: CleanFilesPreEvent = +CleanFilesPreEvent(carbonTable, + sparkSession) + OperationListenerBus.getInstance.fireEvent(cleanFilesPreEvent, operationContext) + if (tableName.isDefined) { +Checker.validateTableExists(databaseNameOp, tableName.get, sparkSession) +if (forceTrashClean) { + CleanFilesUtil.deleteDataFromTrashFolder(carbonTable, sparkSession) +} else { + // clear trash based on timestamp + CleanFilesUtil.deleteDataFromTrashFolderByTimeStamp(carbonTable, sparkSession) +} +if (forceTableClean) { + deleteAllData(sparkSession, databaseNameOp, tableName.get) +} else { + cleanGarbageData(sparkSession, databaseNameOp, tableName.get) +} +// delete partial load and send them to trash +TableProcessingOperations + .deletePartialLoadDataIfExist(carbonTable, false) +// clean stash in metadata folder too +deleteStashInMetadataFolder(carbonTable) + } else { +cleanGarbageDataInAllTables(sparkSession) + } + if (cleanFileCommands != null) { +cleanFileCommands.foreach(_.processData(sparkSession)) + } + val cleanFilesPostEvent: CleanFilesPostEvent = 
+CleanFilesPostEvent(carbonTable, sparkSession) + OperationListenerBus.getInstance.fireEvent(cleanFilesPostEvent, operationContext) + Seq.empty +} else if (isDryRun && tableName.isDefined) { + // dry run, do not clean anything and do not delete trash too + CleanFilesUtil.cleanFilesDryRun(carbonTable, sparkSession) +} +else { + Seq.empty } -val operationContext = new OperationContext -val cleanFilesPreEvent: CleanFilesPreEvent = - CleanFilesPreEvent(carbonTable, -sparkSession) -OperationListenerBus.getInstance.fireEvent(cleanFilesPreEvent, operationContext) -if (tableName.isDefined) { - Checker.validateTableExists(databaseNameOp, tableName.get, sparkSession) - if (forceTableClean) { -deleteAllData(sparkSession, databaseNameOp, tableName.get) + } + + def deleteStashInMetadataFolder(carbonTable: CarbonTable): Unit = { +val tableStatusLock = CarbonLockFactory + .getCarbonLockObj(carbonTable.getAbsoluteTableIdentifier, LockUsage.TABLE_STATUS_LOCK) +val carbonLoadModel = new CarbonLoadModel +try { + if (tableStatusLock.lockWithRetries()) { +val tableStatusFilePath = CarbonTablePath + .getTableStatusFilePath(carbonTable.getTablePath) +val loadMetaDataDetails = SegmentStatusManager + .readTableStatusFile(tableStatusFilePath).filter(details => details.getSegmentStatus == + SegmentStatus.SUCCESS || details.getSegmentStatus == SegmentStatus.LOAD_PARTIAL_SUCCESS) + .sortWith(_.getLoadName < _.getLoadName) + carbonLoadModel.setLoadMetadataDetails(loadMetaDataDetails.toList.asJava) } else { -cleanGarbageData(sparkSession, databaseNameOp, tableName.get) +throw new ConcurrentOperationException(carbonTable.getDatabaseName, + carbonTable.getTableName, "table status read", "clean files command") } -} else { - cleanGarbageDataInAllTables(sparkSession) +} finally { + tableStatusLock.unlock() } -if (cleanFileCommands != null) { - cleanFileCommands.foreach(_.processData(sparkSession)) +val loadMetaDataDetails = carbonLoadModel.getLoadMetadataDetails.asScala +val segmentFileList = 
loadMetaDataDetails.map(f => CarbonTablePath.getSegmentFilesLocation( + carbonTable.getTablePath) + CarbonCommonConstants.FILE_SEPARATOR + f.getSegmentFile) + +val metaDataPath = CarbonTablePath.getMetadataPath(carbonTable.getTablePath) + + CarbonCommonConstants.FILE_SEPARATOR + "segments" Review comment: done This is an automated message from the Apache Git Service
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r513204008 ## File path: integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/cleanfiles/TestCleanFileCommand.scala ## @@ -0,0 +1,540 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.spark.testsuite.cleanfiles + +import java.io.{File, PrintWriter} + +import scala.io.Source + +import org.apache.spark.sql.{CarbonEnv, Row} +import org.apache.spark.sql.test.util.QueryTest +import org.scalatest.BeforeAndAfterAll + +import org.apache.carbondata.core.constants.CarbonCommonConstants +import org.apache.carbondata.core.datastore.impl.FileFactory + +class TestCleanFileCommand extends QueryTest with BeforeAndAfterAll { + + var count = 0 + + test("clean up table and test trash folder with In Progress segments") { +sql("""DROP TABLE IF EXISTS CLEANTEST""") +sql("""DROP TABLE IF EXISTS CLEANTEST1""") +sql( + """ +| CREATE TABLE cleantest (name String, id Int) +| STORED AS carbondata + """.stripMargin) +sql(s"""INSERT INTO CLEANTEST SELECT "abc", 1""") +sql(s"""INSERT INTO CLEANTEST SELECT "abc", 1""") +sql(s"""INSERT INTO CLEANTEST SELECT "abc", 1""") +// run a select query before deletion +checkAnswer(sql(s"""select count(*) from cleantest"""), + Seq(Row(3))) + +val path = CarbonEnv.getCarbonTable(Some("default"), "cleantest")(sqlContext.sparkSession) + .getTablePath +val tableStatusFilePath = path + CarbonCommonConstants.FILE_SEPARATOR + "Metadata" + + CarbonCommonConstants.FILE_SEPARATOR + "tableStatus" +editTableStatusFile(path) +val trashFolderPath = path + CarbonCommonConstants.FILE_SEPARATOR + + CarbonCommonConstants.CARBON_TRASH_FOLDER_NAME + +assert(!FileFactory.isFileExist(trashFolderPath)) +val dryRun = sql(s"CLEAN FILES FOR TABLE cleantest OPTIONS('isDryRun'='true')").count() +// dry run shows 3 segments to move to trash +assert(dryRun == 3) + +sql(s"CLEAN FILES FOR TABLE cleantest").show + +checkAnswer(sql(s"""select count(*) from cleantest"""), + Seq(Row(0))) +assert(FileFactory.isFileExist(trashFolderPath)) +var list = getFileCountInTrashFolder(trashFolderPath) +assert(list == 6) + +val dryRun1 = sql(s"CLEAN FILES FOR TABLE cleantest OPTIONS('isDryRun'='true')").count() +sql(s"CLEAN FILES FOR TABLE 
cleantest").show + +count = 0 +list = getFileCountInTrashFolder(trashFolderPath) +// no carbondata file is added to the trash +assert(list == 6) + + +val timeStamp = getTimestampFolderName(trashFolderPath) + +// recovering data from trash folder +sql( + """ +| CREATE TABLE cleantest1 (name String, id Int) +| STORED AS carbondata + """.stripMargin) + +val segment0Path = trashFolderPath + CarbonCommonConstants.FILE_SEPARATOR + timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + '0' +val segment1Path = trashFolderPath + CarbonCommonConstants.FILE_SEPARATOR + timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + '1' +val segment2Path = trashFolderPath + CarbonCommonConstants.FILE_SEPARATOR + timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + '2' + +sql(s"alter table cleantest1 add segment options('path'='$segment0Path'," + + s"'format'='carbon')").show() +sql(s"alter table cleantest1 add segment options('path'='$segment1Path'," + + s"'format'='carbon')").show() +sql(s"alter table cleantest1 add segment options('path'='$segment2Path'," + + s"'format'='carbon')").show() +sql(s"""INSERT INTO CLEANTEST SELECT * from cleantest1""") + +// test after recovering data from trash +checkAnswer(sql(s"""select count(*) from cleantest"""), + Seq(Row(3))) + +sql(s"CLEAN FILES FOR TABLE cleantest options('force'='true')").show +count = 0 +list = getFileCountInTrashFolder(trashFolderPath) +// no carbondata file is added to the trash +assert(list == 0) +sql("""DROP TABLE IF EXIS
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r513198165 ## File path: core/src/main/java/org/apache/carbondata/core/util/CarbonProperties.java ## @@ -2116,6 +2087,26 @@ public int getMaxSIRepairLimit(String dbName, String tableName) { return Math.abs(Integer.parseInt(thresholdValue)); } + /** + * The below method returns the microseconds after which the trash folder will expire + */ + public long getTrashFolderExpirationTime() { +String configuredValue = getProperty(CarbonCommonConstants.CARBON_TRASH_EXPIRATION_DAYS, +CarbonCommonConstants.CARBON_TRASH_EXPIRATION_DAYS_DEFAULT); +Integer result = 0; +try { + result = Integer.parseInt(configuredValue); + if (result < 0) { +LOGGER.warn("Value of carbon.trash.expiration.days is negative, taking default value"); +result = Integer.parseInt(CARBON_TRASH_EXPIRATION_DAYS_DEFAULT); + } +} catch (NumberFormatException e) { + LOGGER.error("Error happened while parsing", e); Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r513197605 ## File path: core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java ## @@ -1143,28 +1148,62 @@ public static void cleanSegments(CarbonTable table, * @throws IOException */ public static void deleteSegment(String tablePath, Segment segment, - List partitionSpecs, - SegmentUpdateStatusManager updateStatusManager) throws Exception { + List partitionSpecs, SegmentUpdateStatusManager updateStatusManager, + SegmentStatus segmentStatus, Boolean isPartitionTable, String timeStampForTrashFolder) + throws Exception { SegmentFileStore fileStore = new SegmentFileStore(tablePath, segment.getSegmentFileName()); List indexOrMergeFiles = fileStore.readIndexFiles(SegmentStatus.SUCCESS, true, FileFactory.getConfiguration()); +List filesToDelete = new ArrayList<>(); Map> indexFilesMap = fileStore.getIndexFilesMap(); for (Map.Entry> entry : indexFilesMap.entrySet()) { - FileFactory.deleteFile(entry.getKey()); + // Move the file to the trash folder in case the segment status is insert in progress + if (segmentStatus == SegmentStatus.INSERT_IN_PROGRESS && isPartitionTable) { +TrashUtil.copyDataToTrashFolderByFile(tablePath, entry.getKey(), timeStampForTrashFolder + Review comment: changed This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r512553653 ## File path: docs/dml-of-carbondata.md ## @@ -562,3 +563,50 @@ CarbonData DML statements are documented here,which includes: ``` CLEAN FILES FOR TABLE carbon_table ``` + +## CLEAN FILES + + Clean files command is used to remove the Compacted and Marked + For Delete Segments from the store. Carbondata also supports Trash + Folder where all the stale data is moved to after clean files + is called + + There are several types of compaction + + ``` + CLEAN FILES ON TABLE TableName + ``` + + - **Minor Compaction** Review comment: removed ## File path: docs/dml-of-carbondata.md ## @@ -562,3 +563,50 @@ CarbonData DML statements are documented here,which includes: ``` CLEAN FILES FOR TABLE carbon_table ``` + +## CLEAN FILES + + Clean files command is used to remove the Compacted and Marked Review comment: linked ## File path: integration/spark/src/main/scala/org/apache/carbondata/cleanfiles/CleanFilesUtil.scala ## @@ -0,0 +1,409 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.cleanfiles + +import java.util + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ListBuffer + +import org.apache.spark.sql.{AnalysisException, CarbonEnv, Row, SparkSession} +import org.apache.spark.sql.index.CarbonIndexUtil + +import org.apache.carbondata.common.logging.LogServiceFactory +import org.apache.carbondata.core.constants.CarbonCommonConstants +import org.apache.carbondata.core.datastore.filesystem.CarbonFile +import org.apache.carbondata.core.datastore.impl.FileFactory +import org.apache.carbondata.core.exception.ConcurrentOperationException +import org.apache.carbondata.core.indexstore.PartitionSpec +import org.apache.carbondata.core.locks.{CarbonLockFactory, CarbonLockUtil, ICarbonLock, LockUsage} +import org.apache.carbondata.core.metadata.{AbsoluteTableIdentifier, CarbonMetadata, SegmentFileStore} +import org.apache.carbondata.core.metadata.schema.table.CarbonTable +import org.apache.carbondata.core.mutate.CarbonUpdateUtil +import org.apache.carbondata.core.statusmanager.{LoadMetadataDetails, SegmentStatus, SegmentStatusManager} +import org.apache.carbondata.core.util.{CarbonProperties, CarbonUtil} +import org.apache.carbondata.core.util.path.{CarbonTablePath, TrashUtil} +import org.apache.carbondata.processing.loading.TableProcessingOperations +import org.apache.carbondata.processing.loading.model.CarbonLoadModel + +object CleanFilesUtil { + private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + + /** + * The method deletes all data if forceTableClean and clean garbage segment + * (MARKED_FOR_DELETE state) if forceTableClean + * + * @param dbName : Database name + * @param tableName : Table name + * @param tablePath : Table path + * @param carbonTable: CarbonTable Object in case of force clean + * @param forceTableClean: for force clean it will delete all data + *it will clean garbage segment (MARKED_FOR_DELETE state) + * @param currentTablePartitions : 
Hive Partitions details + */ + def cleanFiles( +dbName: String, +tableName: String, +tablePath: String, +timeStamp: String, +carbonTable: CarbonTable, +forceTableClean: Boolean, +currentTablePartitions: Option[Seq[PartitionSpec]] = None, +truncateTable: Boolean = false): Unit = { +var carbonCleanFilesLock: ICarbonLock = null +val absoluteTableIdentifier = if (forceTableClean) { + AbsoluteTableIdentifier.from(tablePath, dbName, tableName, tableName) +} else { + carbonTable.getAbsoluteTableIdentifier +} +try { + val errorMsg = "Clean files request is failed for " + +s"$dbName.$tableName" + +". Not able to acquire the clean files lock due to another clean files " + +"operation is running in the background." + // in case of force clean the lock is not required + if (forceT
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r512553372 ## File path: core/src/main/java/org/apache/carbondata/core/util/path/TrashUtil.java ## @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.util.path; + +import java.io.File; +import java.io.IOException; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonCommonConstants; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.exception.CarbonFileException; +import org.apache.carbondata.core.util.CarbonUtil; + +import org.apache.commons.io.FileUtils; + +import org.apache.log4j.Logger; + +public final class TrashUtil { + + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonUtil.class.getName()); + + /** + * The below method copies the complete a file to the trash folder. 
Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. + */ + public static void copyDataToTrashFolderByFile(String carbonTablePath, String pathOfFileToCopy, + String suffixToAdd) { +String trashFolderPath = CarbonTablePath.getTrashFolder(carbonTablePath) + +CarbonCommonConstants.FILE_SEPARATOR + suffixToAdd; +try { + if (new File(pathOfFileToCopy).exists()) { +FileUtils.copyFileToDirectory(new File(pathOfFileToCopy), new File(trashFolderPath)); +LOGGER.info("File: " + pathOfFileToCopy + " successfully copied to the trash folder: " ++ trashFolderPath); + } +} catch (IOException e) { + LOGGER.error("Unable to copy " + pathOfFileToCopy + " to the trash folder", e); +} + } + + /** + * The below method copies the complete segment folder to the trash folder. Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. + */ + public static void copyDataToTrashBySegment(CarbonFile path, String carbonTablePath, + String suffixToAdd) { +String trashFolderPath = CarbonTablePath.getTrashFolder(carbonTablePath) + +CarbonCommonConstants.FILE_SEPARATOR + suffixToAdd; +try { + FileUtils.copyDirectory(new File(path.getAbsolutePath()), new File(trashFolderPath)); + LOGGER.info("Segment: " + path.getAbsolutePath() + " has been copied to the trash folder" + + " successfully"); +} catch (IOException e) { + LOGGER.error("Unable to create the trash folder and copy data to it", e); +} + } + + /** + * The below method deletes timestamp subdirectories in the trash folder which have expired as + * per the user defined expiration time + */ + public static void deleteAllDataFromTrashFolderByTimeStamp(String carbonTablePath, Long timeStamp) + throws IOException { +String pathOfTrashFolder = CarbonTablePath.getTrashFolder(carbonTablePath); +// Deleting the timestamp based subdirectories in the trashfolder by the given 
timestamp. +if (FileFactory.isFileExist(pathOfTrashFolder)) { + try { +List carbonFileList = FileFactory.getFolderList(pathOfTrashFolder); +for (CarbonFile carbonFile : carbonFileList) { + String[] aB = carbonFile.getAbsolutePath().split(CarbonCommonConstants.FILE_SEPARATOR); Review comment: different names for partition tables and normal tables, changed the variable name though ## File path: core/src/main/java/org/apache/carbondata/core/util/path/TrashUtil.java ## @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "Lice
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r512552552 ## File path: core/src/main/java/org/apache/carbondata/core/statusmanager/SegmentStatusManager.java ## @@ -1049,7 +1049,7 @@ private static ReturnTuple isUpdateRequired(boolean isForceDeletion, CarbonTable } public static void deleteLoadsAndUpdateMetadata(CarbonTable carbonTable, boolean isForceDeletion, - List partitionSpecs) throws IOException { + List partitionSpecs, String timeStamp) throws IOException { Review comment: i have changed this behaviour, after this change even one clean files command can create multiple timestamp subdirectories. The user can use tree command to list the files and use the timestamp subfolder as he desires. ## File path: core/src/main/java/org/apache/carbondata/core/util/CarbonProperties.java ## @@ -2116,6 +2086,20 @@ public int getMaxSIRepairLimit(String dbName, String tableName) { return Math.abs(Integer.parseInt(thresholdValue)); } + /** + * The below method returns the microseconds after which the trash folder will expire + */ + public long getTrashFolderExpirationTime() { +String configuredValue = getProperty(CarbonCommonConstants.TRASH_EXPIRATION_DAYS, +CarbonCommonConstants.TRASH_EXPIRATION_DAYS_DEFAULT); +int result = Integer.parseInt(configuredValue); +if (result < 0) { + result = Integer.parseInt(TRASH_EXPIRATION_DAYS_DEFAULT); Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r512552754 ## File path: core/src/main/java/org/apache/carbondata/core/util/path/CarbonTablePath.java ## @@ -47,6 +47,7 @@ public static final String BATCH_PREFIX = "_batchno"; private static final String LOCK_DIR = "LockFiles"; + public static final String SEGMENTS_FOLDER = "segments"; Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r512551259 ## File path: core/src/main/java/org/apache/carbondata/core/util/DeleteLoadFolders.java ## @@ -192,11 +208,17 @@ private static boolean checkIfLoadCanBeDeleted(LoadMetadataDetails oneLoad, } private static boolean checkIfLoadCanBeDeletedPhysically(LoadMetadataDetails oneLoad, - boolean isForceDelete) { + boolean isForceDelete, AbsoluteTableIdentifier absoluteTableIdentifier) { // Check if the segment is added externally and path is set then do not delete it if ((SegmentStatus.MARKED_FOR_DELETE == oneLoad.getSegmentStatus() -|| SegmentStatus.COMPACTED == oneLoad.getSegmentStatus()) && (oneLoad.getPath() == null +|| SegmentStatus.COMPACTED == oneLoad.getSegmentStatus() || SegmentStatus +.INSERT_IN_PROGRESS == oneLoad.getSegmentStatus()) && (oneLoad.getPath() == null Review comment: i am not sure about this. maybe we can discuss with @ajantha-bhat or @akashrn5 once? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r512551385 ## File path: core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java ## @@ -1427,6 +1428,25 @@ private CarbonCommonConstants() { public static final String BITSET_PIPE_LINE_DEFAULT = "true"; + public static final long MILLIS_SECONDS_IN_A_DAY = TimeUnit.DAYS.toMillis(1); Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r512551780 ## File path: core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java ## @@ -1105,28 +1109,79 @@ public static void cleanSegments(CarbonTable table, List partitio * @throws IOException */ public static void deleteSegment(String tablePath, Segment segment, - List partitionSpecs, - SegmentUpdateStatusManager updateStatusManager) throws Exception { + List partitionSpecs, SegmentUpdateStatusManager updateStatusManager, + SegmentStatus segmentStatus, Boolean isPartitionTable, String timeStamp) + throws Exception { SegmentFileStore fileStore = new SegmentFileStore(tablePath, segment.getSegmentFileName()); List indexOrMergeFiles = fileStore.readIndexFiles(SegmentStatus.SUCCESS, true, FileFactory.getConfiguration()); +List filesToDelete = new ArrayList<>(); Map> indexFilesMap = fileStore.getIndexFilesMap(); for (Map.Entry> entry : indexFilesMap.entrySet()) { - FileFactory.deleteFile(entry.getKey()); + // Move the file to the trash folder in case the segment status is insert in progress + if (segmentStatus == SegmentStatus.INSERT_IN_PROGRESS) { +if (!isPartitionTable) { + TrashUtil.copyDataToTrashFolderByFile(tablePath, entry.getKey(), timeStamp + Review comment: copying it whole segment wise for normal tables, but in case of partition table, doing it file level. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r512550744 ## File path: core/src/main/java/org/apache/carbondata/core/util/DeleteLoadFolders.java ## @@ -138,8 +143,19 @@ public boolean accept(CarbonFile file) { if (filesToBeDeleted.length == 0) { status = true; } else { - for (CarbonFile eachFile : filesToBeDeleted) { + // If the file to be deleted is a carbondata file, index file, index merge file + // or a delta file, copy that file to the trash folder. + if ((eachFile.getName().endsWith(CarbonCommonConstants.FACT_FILE_EXT) || Review comment: copying segment-wise in the case of a normal table; in the case of the partition flow, have kept it file by file. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r512550325 ## File path: core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java ## @@ -1105,28 +1109,79 @@ public static void cleanSegments(CarbonTable table, List partitio * @throws IOException */ public static void deleteSegment(String tablePath, Segment segment, - List partitionSpecs, - SegmentUpdateStatusManager updateStatusManager) throws Exception { + List partitionSpecs, SegmentUpdateStatusManager updateStatusManager, + SegmentStatus segmentStatus, Boolean isPartitionTable, String timeStamp) + throws Exception { SegmentFileStore fileStore = new SegmentFileStore(tablePath, segment.getSegmentFileName()); List indexOrMergeFiles = fileStore.readIndexFiles(SegmentStatus.SUCCESS, true, FileFactory.getConfiguration()); +List filesToDelete = new ArrayList<>(); Map> indexFilesMap = fileStore.getIndexFilesMap(); for (Map.Entry> entry : indexFilesMap.entrySet()) { - FileFactory.deleteFile(entry.getKey()); + // Move the file to the trash folder in case the segment status is insert in progress + if (segmentStatus == SegmentStatus.INSERT_IN_PROGRESS) { Review comment: for the normal table flow, i have changed it to copy to trash by segment, but in case of partition table copying to trash by file because i will have to read the segment file to get the desired carbondata and the index files per segment, which will increase the IO time. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r512549332 ## File path: integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/cleanfiles/TestCleanFileCommand.scala ## @@ -0,0 +1,484 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +package org.apache.carbondata.spark.testsuite.cleanfiles + +import java.io.{File, PrintWriter} +import java.util +import java.util.List + +import org.apache.carbondata.cleanfiles.CleanFilesUtil +import org.apache.carbondata.core.constants.CarbonCommonConstants +import org.apache.carbondata.core.datastore.filesystem.CarbonFile +import org.apache.carbondata.core.datastore.impl.FileFactory +import org.apache.carbondata.core.util.CarbonUtil +import org.apache.spark.sql.{CarbonEnv, Row} +import org.apache.spark.sql.test.util.QueryTest +import org.scalatest.BeforeAndAfterAll + +import scala.io.Source + +class TestCleanFileCommand extends QueryTest with BeforeAndAfterAll { + + var count = 0 + + test("clean up table and test trash folder with In Progress segments") { +sql("""DROP TABLE IF EXISTS CLEANTEST""") +sql("""DROP TABLE IF EXISTS CLEANTEST1""") +sql( + """ +| CREATE TABLE cleantest (name String, id Int) +| STORED AS carbondata + """.stripMargin) +sql(s"""INSERT INTO CLEANTEST SELECT "abc", 1""") +sql(s"""INSERT INTO CLEANTEST SELECT "abc", 1""") +sql(s"""INSERT INTO CLEANTEST SELECT "abc", 1""") +// run a select query before deletion +checkAnswer(sql(s"""select count(*) from cleantest"""), + Seq(Row(3))) + +val path = CarbonEnv.getCarbonTable(Some("default"), "cleantest")(sqlContext.sparkSession) + .getTablePath +val tableStatusFilePath = path + CarbonCommonConstants.FILE_SEPARATOR + "Metadata" + + CarbonCommonConstants.FILE_SEPARATOR + "tableStatus" +editTableStatusFile(path) +val trashFolderPath = path + CarbonCommonConstants.FILE_SEPARATOR + + CarbonCommonConstants.CARBON_TRASH_FOLDER_NAME + +assert(!FileFactory.isFileExist(trashFolderPath)) +val dryRun = sql(s"CLEAN FILES FOR TABLE cleantest OPTIONS('isDryRun'='true')").count() +// dry run shows 3 segments to move to trash +assert(dryRun == 3) + +sql(s"CLEAN FILES FOR TABLE cleantest").show + +checkAnswer(sql(s"""select count(*) from cleantest"""), + Seq(Row(0))) 
+assert(FileFactory.isFileExist(trashFolderPath)) +var list = getFileCountInTrashFolder(trashFolderPath) +assert(list == 6) + +val dryRun1 = sql(s"CLEAN FILES FOR TABLE cleantest OPTIONS('isDryRun'='true')").count() +sql(s"CLEAN FILES FOR TABLE cleantest").show + +count = 0 +list = getFileCountInTrashFolder(trashFolderPath) +// no carbondata file is added to the trash +assert(list == 6) + + +val timeStamp = getTimestampFolderName(trashFolderPath) + +// recovering data from trash folder +sql( + """ +| CREATE TABLE cleantest1 (name String, id Int) +| STORED AS carbondata + """.stripMargin) + +val segment0Path = trashFolderPath + CarbonCommonConstants.FILE_SEPARATOR + timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + '0' +val segment1Path = trashFolderPath + CarbonCommonConstants.FILE_SEPARATOR + timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + '1' +val segment2Path = trashFolderPath + CarbonCommonConstants.FILE_SEPARATOR + timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + '2' + +sql(s"alter table cleantest1 add segment options('path'='$segment0Path'," + + s"'format'='carbon')").show() +sql(s"alter table cleantest1 add segment options('path'='$segment1Path'," + + s"'format'='carbon')").show() +sql(s"alter table cleantest1 add segment options('path'='$segment2Path'," + + s"'format'='carbon')").show() +sql(s"""INSERT INTO CLEANTEST SELECT * from cleantest1""") + +// test after recovering data from trash +checkAnswer(sql(s"""select count(*) from cleantest"""), + Seq(Row(3))) + +sql(s"CLEAN FILES FOR TA
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r512548768 ## File path: docs/cleanfiles.md ## @@ -0,0 +1,78 @@ + + + +## CLEAN FILES + +Clean files command is used to remove the Compacted, Marked For Delete, In Progress which are stale and Partial(Segments which are missing from the table status file but their data is present) + segments from the store. + + Clean Files Command + ``` + CLEAN FILES ON TABLE TABLE_NAME + ``` + + +### TRASH FOLDER + + Carbondata supports a Trash Folder which is used as a redundant folder where all the unnecessary files and folders are moved to during clean files operation. + This trash folder is maintained inside the table path. It is a hidden folder(.Trash). The segments that are moved to the trash folder are maintained under a timestamp + subfolder(timestamp at which clean files operation is called). This helps the user to list down segments by timestamp. By default all the timestamp sub-directories have an expiration + time of (3 days since that timestamp) and it can be configured by the user using the following carbon property + ``` + carbon.trash.expiration.time = "Number of days" + ``` + Once the timestamp subdirectory is expired as per the configured expiration day value, the subdirectory is deleted from the trash folder in the subsequent clean files command. + + + + +### DRY RUN + Support for dry run is provided before the actual clean files operation. This dry run operation will list down all the segments which are going to be manipulated during + the clean files operation. The dry run result will show the current location of the segment(it can be in FACT folder, Partition folder or trash folder) and where that segment + will be moved(to the trash folder or deleted from store) once the actual operation will be called. 
+ + + ``` + CLEAN FILES ON TABLE TABLE_NAME options('dry_run'='true') + ``` + +### FORCE DELETE TRASH +The force option with clean files command deletes all the files and folders from the trash folder. + + ``` + CLEAN FILES ON TABLE TABLE_NAME options('force'='true') + ``` + +### DATA RECOVERY FROM THE TRASH FOLDER + +The segments can be recovered from the trash folder by creating an external table from the desired segment location Review comment: changed ## File path: docs/dml-of-carbondata.md ## @@ -552,3 +553,50 @@ CarbonData DML statements are documented here, which includes: ``` CLEAN FILES FOR TABLE carbon_table ``` + +## CLEAN FILES + + Clean files command is used to remove the Compacted and Marked + For Delete Segments from the store. Carbondata also supports Trash + Folder where all the stale data is moved to after clean files + is called + + There are several types of compaction + + ``` + CLEAN FILES ON TABLE TableName + ``` Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r512545772 ## File path: docs/cleanfiles.md ## @@ -0,0 +1,78 @@ + + + +## CLEAN FILES + +Clean files command is used to remove the Compacted, Marked For Delete ,In Progress which are stale and Partial(Segments which are missing from the table status file but their data is present) + segments from the store. + + Clean Files Command + ``` + CLEAN FILES ON TABLE TABLE_NAME Review comment: changed This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r510659146 ## File path: core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java ## @@ -1105,28 +1109,79 @@ public static void cleanSegments(CarbonTable table, List partitio * @throws IOException */ public static void deleteSegment(String tablePath, Segment segment, - List partitionSpecs, - SegmentUpdateStatusManager updateStatusManager) throws Exception { + List partitionSpecs, SegmentUpdateStatusManager updateStatusManager, + SegmentStatus segmentStatus, Boolean isPartitionTable, String timeStamp) + throws Exception { SegmentFileStore fileStore = new SegmentFileStore(tablePath, segment.getSegmentFileName()); List indexOrMergeFiles = fileStore.readIndexFiles(SegmentStatus.SUCCESS, true, FileFactory.getConfiguration()); +List filesToDelete = new ArrayList<>(); Map> indexFilesMap = fileStore.getIndexFilesMap(); for (Map.Entry> entry : indexFilesMap.entrySet()) { - FileFactory.deleteFile(entry.getKey()); + // Move the file to the trash folder in case the segment status is insert in progress + if (segmentStatus == SegmentStatus.INSERT_IN_PROGRESS) { +if (!isPartitionTable) { + TrashUtil.moveDataToTrashFolderByFile(tablePath, entry.getKey(), timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment + .getSegmentNo()); +} else { + TrashUtil.moveDataToTrashFolderByFile(tablePath, entry.getKey(), timeStamp + Review comment: For normal table, we do like: timestamp/Segment_#, there is no use of having Fact and Part0 folders in trash For partition table, we do like: timestamp/Segment_#/partition_folder, the segment number is added so as the recovery can be segment wise. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r510658251 ## File path: processing/src/main/java/org/apache/carbondata/processing/loading/TableProcessingOperations.java ## @@ -53,12 +52,14 @@ private static final Logger LOGGER = LogServiceFactory.getLogService(CarbonLoaderUtil.class.getName()); + private static List filesInTrashFolder = new ArrayList(); + /** * delete folder which metadata no exist in tablestatus * this method don't check tablestatus history. */ public static void deletePartialLoadDataIfExist(CarbonTable carbonTable, Review comment: This method is being called from CarbonCleanFIlesCommand class This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r510647379 ## File path: core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java ## @@ -1105,28 +1109,79 @@ public static void cleanSegments(CarbonTable table, List partitio * @throws IOException */ public static void deleteSegment(String tablePath, Segment segment, - List partitionSpecs, - SegmentUpdateStatusManager updateStatusManager) throws Exception { + List partitionSpecs, SegmentUpdateStatusManager updateStatusManager, + SegmentStatus segmentStatus, Boolean isPartitionTable, String timeStamp) + throws Exception { SegmentFileStore fileStore = new SegmentFileStore(tablePath, segment.getSegmentFileName()); List indexOrMergeFiles = fileStore.readIndexFiles(SegmentStatus.SUCCESS, true, FileFactory.getConfiguration()); +List filesToDelete = new ArrayList<>(); Map> indexFilesMap = fileStore.getIndexFilesMap(); for (Map.Entry> entry : indexFilesMap.entrySet()) { - FileFactory.deleteFile(entry.getKey()); + // Move the file to the trash folder in case the segment status is insert in progress + if (segmentStatus == SegmentStatus.INSERT_IN_PROGRESS) { +if (!isPartitionTable) { + TrashUtil.moveDataToTrashFolderByFile(tablePath, entry.getKey(), timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment + .getSegmentNo()); +} else { + TrashUtil.moveDataToTrashFolderByFile(tablePath, entry.getKey(), timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment + .getSegmentNo() + CarbonCommonConstants.FILE_SEPARATOR + entry.getKey().substring( +tablePath.length() + 1).split(CarbonCommonConstants.FILE_SEPARATOR)[0]); +} + } + // add the file to the filesToDelete map to delete it after the complete segment + // has been copied. 
+ filesToDelete.add(entry.getKey()); for (String file : entry.getValue()) { String[] deltaFilePaths = updateStatusManager.getDeleteDeltaFilePath(file, segment.getSegmentNo()); for (String deltaFilePath : deltaFilePaths) { - FileFactory.deleteFile(deltaFilePath); + // Move the file to the trash folder in case the segment status is insert in progress + if (segmentStatus == SegmentStatus.INSERT_IN_PROGRESS) { +if (!isPartitionTable) { + TrashUtil.moveDataToTrashFolderByFile(tablePath, deltaFilePath, timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment + .getSegmentNo()); +} else { + TrashUtil.moveDataToTrashFolderByFile(tablePath, deltaFilePath, timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment + .getSegmentNo() + CarbonCommonConstants.FILE_SEPARATOR + deltaFilePath.substring( +tablePath.length() + 1).split(CarbonCommonConstants.FILE_SEPARATOR)[0]); +} + } + filesToDelete.add(deltaFilePath); +} +// If the file to be deleted is a carbondata file, copy that file to the trash folder. 
+if (file.endsWith(CarbonCommonConstants.FACT_FILE_EXT) && segmentStatus == +SegmentStatus.INSERT_IN_PROGRESS) { + if (!isPartitionTable) { +TrashUtil.moveDataToTrashFolderByFile(tablePath, file, timeStamp + +CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment +.getSegmentNo()); + } else { +TrashUtil.moveDataToTrashFolderByFile(tablePath, file, timeStamp + +CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment +.getSegmentNo() + CarbonCommonConstants.FILE_SEPARATOR + file.substring(tablePath +.length() + 1).split(CarbonCommonConstants.FILE_SEPARATOR)[0]); + } } -FileFactory.deleteFile(file); +filesToDelete.add(file); } } -deletePhysicalPartition(partitionSpecs, indexFilesMap, indexOrMergeFiles, tablePath); +LoadMetadataDetails loadMetaDataDetail = new LoadMetadataDetails(); +loadMetaDataDetail.setSegmentStatus(segmentStatus); +loadMetaDataDetail.setLoadName(segment.getSegmentNo()); +deletePhysicalPartition(partitionSpecs, indexFilesMap, indexOrMergeFiles, tablePath, +loadMetaDataDetail, filesToDelete, timeStamp); String segmentFilePath = CarbonTablePath.getSegmentFilePath(tablePath, segment.getSegmentFileName()); // Deletes the physical segment file FileFactory.deleteFile(segmentFilePath); Review comment: no, do
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r510647134 ## File path: core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java ## @@ -1427,6 +1427,25 @@ private CarbonCommonConstants() { public static final String BITSET_PIPE_LINE_DEFAULT = "true"; + public static final String MICROSECONDS_IN_A_DAY = "8640"; Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r509138245 ## File path: core/src/main/java/org/apache/carbondata/core/util/path/TrashUtil.java ## @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.core.util.path; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonCommonConstants; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.util.CarbonUtil; + +import org.apache.commons.io.FileUtils; + +import org.apache.hadoop.fs.permission.FsAction; +import org.apache.hadoop.fs.permission.FsPermission; + +import org.apache.log4j.Logger; + +public final class TrashUtil { + + /** + * Attribute for Carbon LOGGER + */ + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonUtil.class.getName()); + + private TrashUtil() { + + } + + public static void copyDataToTrashFolder(String carbonTablePath, String pathOfFileToCopy, + String suffixToAdd) throws IOException { +String trashFolderPath = carbonTablePath + CarbonCommonConstants.FILE_SEPARATOR + +CarbonCommonConstants.CARBON_TRASH_FOLDER_NAME + CarbonCommonConstants.FILE_SEPARATOR ++ suffixToAdd; +try { + if (new File(pathOfFileToCopy).exists()) { +if (!FileFactory.isFileExist(trashFolderPath)) { + LOGGER.info("Creating Trash folder at:" + trashFolderPath); + FileFactory.createDirectoryAndSetPermission(trashFolderPath, + new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL)); +} +FileUtils.copyFileToDirectory(new File(pathOfFileToCopy), +new File(trashFolderPath)); + } +} catch (IOException e) { + LOGGER.error("Unable to copy " + pathOfFileToCopy + " to the trash folder"); +} + } + + public static void copyDataRecursivelyToTrashFolder(CarbonFile path, String carbonTablePath, + String segmentNo) throws IOException { +if (!path.isDirectory()) { + // copy data to trash + copyDataToTrashFolder(carbonTablePath, path.getAbsolutePath(), segmentNo); + return; +} +CarbonFile[] files = path.listFiles(); Review 
comment: changed logic ## File path: core/src/main/java/org/apache/carbondata/core/util/path/TrashUtil.java ## @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.util.path; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonCommonConstants; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.util.CarbonUtil; + +import org.apache.commons.io.FileUtils; + +import org.apache.hadoop.fs.permission.FsAction; +import org.apache.hadoop.fs.permission.FsPermission; + +import org.apache.log4j.Logger; + +public final class TrashUtil { + + /** + * Att
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r509135611 ## File path: core/src/main/java/org/apache/carbondata/core/util/path/TrashUtil.java ## @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.core.util.path; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonCommonConstants; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.util.CarbonUtil; + +import org.apache.commons.io.FileUtils; + +import org.apache.hadoop.fs.permission.FsAction; +import org.apache.hadoop.fs.permission.FsPermission; + +import org.apache.log4j.Logger; + +public final class TrashUtil { + + /** + * Attribute for Carbon LOGGER + */ + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonUtil.class.getName()); + + private TrashUtil() { + + } + + public static void copyDataToTrashFolder(String carbonTablePath, String pathOfFileToCopy, + String suffixToAdd) throws IOException { +String trashFolderPath = carbonTablePath + CarbonCommonConstants.FILE_SEPARATOR + +CarbonCommonConstants.CARBON_TRASH_FOLDER_NAME + CarbonCommonConstants.FILE_SEPARATOR ++ suffixToAdd; +try { + if (new File(pathOfFileToCopy).exists()) { +if (!FileFactory.isFileExist(trashFolderPath)) { + LOGGER.info("Creating Trash folder at:" + trashFolderPath); + FileFactory.createDirectoryAndSetPermission(trashFolderPath, + new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL)); +} +FileUtils.copyFileToDirectory(new File(pathOfFileToCopy), Review comment: using copy, because if anything crashes while moving files, cannot recover them. 
So, copying all the files of a segment and then deleting them after copying is success ## File path: processing/src/main/java/org/apache/carbondata/processing/loading/TableProcessingOperations.java ## @@ -152,6 +123,41 @@ public static void deletePartialLoadDataIfExist(CarbonTable carbonTable, } } + public static HashMap getStaleSegments(LoadMetadataDetails[] details, Review comment: done ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/mutation/CarbonTruncateCommand.scala ## @@ -45,9 +45,11 @@ case class CarbonTruncateCommand(child: TruncateTableCommand) extends DataComman throw new MalformedCarbonCommandException( "Unsupported truncate table with specified partition") } +val optionList = List.empty[(String, String)] + CarbonCleanFilesCommand( databaseNameOp = Option(dbName), - tableName = Option(tableName), + tableName = Option(tableName), Option(optionList), Review comment: done ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/mutation/CarbonTruncateCommand.scala ## @@ -45,9 +45,11 @@ case class CarbonTruncateCommand(child: TruncateTableCommand) extends DataComman throw new MalformedCarbonCommandException( "Unsupported truncate table with specified partition") } +val optionList = List.empty[(String, String)] Review comment: done ## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonLoadDataCommand.scala ## @@ -108,7 +108,7 @@ case class CarbonLoadDataCommand(databaseNameOp: Option[String], // Delete stale segment folders that are not in table status but are physically present in // the Fact folder LOGGER.info(s"Deleting stale folders if present for table $dbName.$tableName") -TableProcessingOperations.deletePartialLoadDataIfExist(table, false) +// TableProcessingOperations.deletePartialLoadDataIfExist(table, false) Review comment: done ## File path: inte
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r509138090 ## File path: core/src/main/java/org/apache/carbondata/core/util/path/TrashUtil.java ## @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.util.path; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonCommonConstants; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.util.CarbonUtil; + +import org.apache.commons.io.FileUtils; + +import org.apache.hadoop.fs.permission.FsAction; +import org.apache.hadoop.fs.permission.FsPermission; + +import org.apache.log4j.Logger; + +public final class TrashUtil { + + /** + * Attribute for Carbon LOGGER + */ Review comment: done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [carbondata] vikramahuja1001 commented on a change in pull request #3917: [CARBONDATA-3978] Clean Files Refactor and support for trash folder in carbondata
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r503370601 ## File path: core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java ## @@ -1106,23 +1107,55 @@ public static void cleanSegments(CarbonTable table, List partitio */ public static void deleteSegment(String tablePath, Segment segment, List partitionSpecs, - SegmentUpdateStatusManager updateStatusManager) throws Exception { + SegmentUpdateStatusManager updateStatusManager, String tableName, String DatabaseName, + SegmentStatus segmentStatus, Boolean isPartitionTable) + throws Exception { SegmentFileStore fileStore = new SegmentFileStore(tablePath, segment.getSegmentFileName()); -List indexOrMergeFiles = fileStore.readIndexFiles(SegmentStatus.SUCCESS, true, -FileFactory.getConfiguration()); +List indexOrMergeFiles = fileStore.readIndexFiles(SegmentStatus.SUCCESS, +true, FileFactory.getConfiguration()); Map> indexFilesMap = fileStore.getIndexFilesMap(); for (Map.Entry> entry : indexFilesMap.entrySet()) { + // If the file to be deleted is a carbondata file, copy that file to the trash folder. + if (segmentStatus == SegmentStatus.INSERT_IN_PROGRESS) { +if (!isPartitionTable) { + TrashUtil.copyDataToTrashFolder(tablePath, entry.getKey(), CarbonCommonConstants + .LOAD_FOLDER + segment.getSegmentNo()); +} else { + TrashUtil.copyDataToTrashFolder(tablePath, entry.getKey(), CarbonCommonConstants + .LOAD_FOLDER + segment.getSegmentNo() + CarbonCommonConstants.FILE_SEPARATOR + + entry.getKey().substring(tablePath.length() + 1, entry.getKey().length())); +} + } FileFactory.deleteFile(entry.getKey()); for (String file : entry.getValue()) { String[] deltaFilePaths = updateStatusManager.getDeleteDeltaFilePath(file, segment.getSegmentNo()); for (String deltaFilePath : deltaFilePaths) { + // If the file to be deleted is a carbondata file, copy that file to the trash folder. 
+ if (segmentStatus == SegmentStatus + .INSERT_IN_PROGRESS) { +TrashUtil.copyDataToTrashFolder(tablePath, deltaFilePath, deltaFilePath +.substring(tablePath.length() + 1, deltaFilePath.length())); + } FileFactory.deleteFile(deltaFilePath); } +// If the file to be deleted is a carbondata file, copy that file to the trash folder. +if (file.endsWith(CarbonCommonConstants.FACT_FILE_EXT) && segmentStatus == +SegmentStatus.INSERT_IN_PROGRESS) { Review comment: the index file map will contain both the index files and the .carbondata files. `file` comes from entry.getValue(), and entry is from the index file map, which also contains the .carbondata files. So there can be cases where this condition is true. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org