This is an automated email from the ASF dual-hosted git repository.
indhumuthumurugesh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/carbondata.git
The following commit(s) were added to refs/heads/master by this push:
new 9aaeba5 [CARBONDATA-4251][CARBONDATA-4253] Optimize Clean Files
Performance
9aaeba5 is described below
commit 9aaeba5af810b51d1958b2c20de2e32993ff7e10
Author: marchpure <[email protected]>
AuthorDate: Mon Jul 26 10:54:43 2021 +0800
[CARBONDATA-4251][CARBONDATA-4253] Optimize Clean Files Performance
Why is this PR needed?
1) When the clean files command is executed, it tries to clean up every
carbonindex and carbonmergeindex file that once existed, even though the
carbonindex files have already been
merged into a carbonmergeindex file and deleted. When there are tens of
thousands
of carbonindex files that once existed after the completion of compaction,
the clean files command takes several hours to clean index files
which
actually no longer exist. We only need to clean up the files that still
exist, whether carbonmergeindex or carbonindex files.
2) The rename command lists the partitions of the table, but the partition
information is not actually used. If the table has hundreds of thousands
of partitions, the performance of rename table degrades a lot.
What changes were proposed in this PR?
1) The variable 'indexOrMergeFiles' holds all index files that currently
exist.
The CLEAN FILES command now deletes these existing files instead of deleting all
files in 'indexFilesMap', which actually contains every '.carbonindex' file
that once
existed. Cleaning only 'indexOrMergeFiles' improves CLEAN FILES
performance a lot.
2) The rename command listed the partitions of the table, but the
partition
information was not actually used. If the table has hundreds of thousands
of partitions, this degrades rename table performance a lot, so the unused
partition listing is removed.
This closes #4183
---
.../org/apache/carbondata/core/metadata/SegmentFileStore.java | 6 ++++--
.../execution/command/schema/CarbonAlterTableRenameCommand.scala | 5 -----
.../spark/testsuite/cleanfiles/TestCleanFileCommand.scala | 9 +++++++++
3 files changed, 13 insertions(+), 7 deletions(-)
diff --git
a/core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java
b/core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java
index 0bd517c..67c061b 100644
---
a/core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java
+++
b/core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java
@@ -1206,9 +1206,11 @@ public class SegmentFileStore {
FileFactory.getConfiguration());
Map<String, List<String>> indexFilesMap = fileStore.getIndexFilesMap();
List<String> deletedFiles = new ArrayList<>();
+ for (String indexFilePath : indexOrMergeFiles) {
+ FileFactory.deleteFile(indexFilePath);
+ deletedFiles.add(indexFilePath);
+ }
for (Map.Entry<String, List<String>> entry : indexFilesMap.entrySet()) {
- FileFactory.deleteFile(entry.getKey());
- deletedFiles.add(entry.getKey());
for (String file : entry.getValue()) {
String[] deltaFilePaths =
updateStatusManager.getDeleteDeltaFilePath(file,
segment.getSegmentNo());
diff --git
a/integration/spark/src/main/scala/org/apache/spark/sql/execution/command/schema/CarbonAlterTableRenameCommand.scala
b/integration/spark/src/main/scala/org/apache/spark/sql/execution/command/schema/CarbonAlterTableRenameCommand.scala
index 4b8b1a4..a899572 100644
---
a/integration/spark/src/main/scala/org/apache/spark/sql/execution/command/schema/CarbonAlterTableRenameCommand.scala
+++
b/integration/spark/src/main/scala/org/apache/spark/sql/execution/command/schema/CarbonAlterTableRenameCommand.scala
@@ -124,11 +124,6 @@ private[sql] case class CarbonAlterTableRenameCommand(
val newCarbonTableIdentifier = new CarbonTableIdentifier(oldDatabaseName,
newTableName, carbonTable.getCarbonTableIdentifier.getTableId)
metastore.removeTableFromMetadata(oldDatabaseName, oldTableName)
- var partitions: Seq[CatalogTablePartition] = Seq.empty
- if (carbonTable.isHivePartitionTable) {
- partitions =
- sparkSession.sessionState.catalog.listPartitions(oldTableIdentifier)
- }
sparkSession.catalog.refreshTable(oldTableIdentifier.quotedString)
CarbonSessionCatalogUtil.alterTableRename(
oldTableIdentifier,
diff --git
a/integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/cleanfiles/TestCleanFileCommand.scala
b/integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/cleanfiles/TestCleanFileCommand.scala
index e7cc230..b36872c 100644
---
a/integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/cleanfiles/TestCleanFileCommand.scala
+++
b/integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/cleanfiles/TestCleanFileCommand.scala
@@ -402,10 +402,19 @@ class TestCleanFileCommand extends QueryTest with
BeforeAndAfterAll {
sql(s"alter table addsegment1 add segment " +
s"options('path'='${ newPath + i }', 'format'='carbon')").collect()
}
+
checkAnswer(sql("select count(*) from addsegment1"), Seq(Row(80)))
sql("alter table addsegment1 compact 'minor'").collect()
+ for (i <- 0 until 2) {
+ assert(CarbonTestUtil.getIndexFileCount("default_addsegment1",
i.toString,
+ CarbonTablePath.MERGE_INDEX_FILE_EXT) == 1)
+ }
checkAnswer(sql("select count(*) from addsegment1"), Seq(Row(80)))
sql("clean files for table addsegment1 OPTIONS('force'='true')")
+ for (i <- 0 until 2) {
+ assert(CarbonTestUtil.getIndexFileCount("default_addsegment1",
i.toString,
+ CarbonTablePath.MERGE_INDEX_FILE_EXT) == 0)
+ }
checkAnswer(sql("select count(*) from addsegment1"), Seq(Row(80)))
sql(s"alter table addsegment1 add segment " +
s"options('path'='${ newPath + 0 }', 'format'='carbon')").collect()