This is an automated email from the ASF dual-hosted git repository. jackylk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/carbondata.git
The following commit(s) were added to refs/heads/master by this push: new 2418317 [CARBONDATA-3544] cli support summary statistics for all columns 2418317 is described below commit 2418317e4b4a918441e5c120766f12ad0d2fb089 Author: QiangCai <qiang...@qq.com> AuthorDate: Thu Oct 10 19:42:55 2019 +0800 [CARBONDATA-3544] cli support summary statistics for all columns add option -C to show statistics for all columns This closes #3409 --- .../command/management/CarbonCliCommand.scala | 21 ++++++++++----- .../java/org/apache/carbondata/tool/CarbonCli.java | 5 ++++ .../org/apache/carbondata/tool/DataSummary.java | 30 +++++++++++++++++++++- .../org/apache/carbondata/tool/FileCollector.java | 9 ++++++- .../org/apache/carbondata/tool/CarbonCliTest.java | 29 ++++++++++++++++++--- 5 files changed, 83 insertions(+), 11 deletions(-) diff --git a/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCliCommand.scala b/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCliCommand.scala index 5dd0c12..e4fb725 100644 --- a/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCliCommand.scala +++ b/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCliCommand.scala @@ -49,12 +49,21 @@ case class CarbonCliCommand( val carbonTable = CarbonEnv.getCarbonTable(databaseNameOp, tableName)(sparkSession) setAuditTable(carbonTable) setAuditInfo(Map("options" -> commandOptions)) - val commandArgs: Seq[String] = commandOptions.split("\\s+") - val finalCommands = commandArgs.collect { - case a if a.trim.equalsIgnoreCase("summary") || a.trim.equalsIgnoreCase("benchmark") => - Seq(a, "-p", carbonTable.getTablePath) - case x => Seq(x.trim) - }.flatten + val commandArgs: Seq[String] = commandOptions.split("\\s+").map(_.trim) + val finalCommands = commandArgs.exists(_.equalsIgnoreCase("-p")) match { + case true => + commandArgs + case false => + val needPath = commandArgs.exists { command => + command.equalsIgnoreCase("summary") || command.equalsIgnoreCase("benchmark") + } + needPath match { + case true => + commandArgs ++ Seq("-p", carbonTable.getTablePath) + case false => + commandArgs + } + } val summaryOutput = new util.ArrayList[String]() CarbonCli.run(finalCommands.toArray, summaryOutput, false) summaryOutput.asScala.map(x => diff --git a/tools/cli/src/main/java/org/apache/carbondata/tool/CarbonCli.java b/tools/cli/src/main/java/org/apache/carbondata/tool/CarbonCli.java index ef9a50e..6cef91a 100644 --- a/tools/cli/src/main/java/org/apache/carbondata/tool/CarbonCli.java +++ b/tools/cli/src/main/java/org/apache/carbondata/tool/CarbonCli.java @@ -79,6 +79,10 @@ public class CarbonCli { .withDescription("column to print statistics") .withLongOpt("column") .create("c"); + Option columns = OptionBuilder + .withDescription("print statistics for all columns") + .withLongOpt("columns") + .create("C"); Option blockletDetail = OptionBuilder.withArgName("limitSize").hasOptionalArg() .withDescription("print blocklet size detail").withLongOpt("limitSize") @@ -101,6 +105,7 @@ public class CarbonCli { options.addOption(blockletDetail); options.addOption(columnMeta); options.addOption(columnName); + options.addOption(columns); options.addOption(version); options.addOption(blockLevelDetail); return options; diff --git a/tools/cli/src/main/java/org/apache/carbondata/tool/DataSummary.java b/tools/cli/src/main/java/org/apache/carbondata/tool/DataSummary.java index 1930bf5..31ab535 100644 --- a/tools/cli/src/main/java/org/apache/carbondata/tool/DataSummary.java +++ b/tools/cli/src/main/java/org/apache/carbondata/tool/DataSummary.java @@ -108,7 +108,9 @@ class DataSummary implements Command { collectColumnChunkMeta(columName); } } - + if (line.hasOption("C")) { + printAllColumnStats(); + } collector.close(); for (DataFile file : dataFiles.values()) { file.close(); @@ -371,6 +373,32 @@ class DataSummary implements Command { printer.collectFormattedData(); } + private void printAllColumnStats() { + if (!dataFiles.isEmpty()) { + outPuts.add(""); + outPuts.add("## Statistics for All Columns"); + String[] header = + new String[] { "Block", "Blocklet", "Column Name", "Meta Size", "Data Size" }; + ShardPrinter printer = new ShardPrinter(header, outPuts); + for (Map.Entry<String, DataFile> entry : dataFiles.entrySet()) { + DataFile dataFile = entry.getValue(); + List<ColumnSchema> columns = dataFile.getSchema(); + int columnNum = columns.size(); + int blockletNum = dataFile.getNumBlocklet(); + for (int j = 0; j < blockletNum; j++) { + for (int i = 0; i < columnNum; i++) { + printer.addRow(dataFile.getShardName(), + new String[] { dataFile.getPartNo(), String.valueOf(j), + columns.get(i).getColumnName(), + Strings.formatSize(dataFile.getColumnMetaSizeInBytes(j, i)), + Strings.formatSize(dataFile.getColumnDataSizeInBytes(j, i)) }); + } + } + } + printer.collectFormattedData(); + } + } + private void collectStats(String columnName) throws IOException, MemoryException { if (!collected) { for (DataFile dataFile : dataFiles.values()) { diff --git a/tools/cli/src/main/java/org/apache/carbondata/tool/FileCollector.java b/tools/cli/src/main/java/org/apache/carbondata/tool/FileCollector.java index 6c7eaf9..66daa0d 100644 --- a/tools/cli/src/main/java/org/apache/carbondata/tool/FileCollector.java +++ b/tools/cli/src/main/java/org/apache/carbondata/tool/FileCollector.java @@ -57,7 +57,14 @@ class FileCollector { void collectFiles(String dataFolder) throws IOException { Set<String> shards = new HashSet<>(); CarbonFile folder = FileFactory.getCarbonFile(dataFolder); - List<CarbonFile> files = folder.listFiles(true); + List<CarbonFile> files = new ArrayList<>(); + if (folder.exists()) { + if (folder.isDirectory()) { + files = folder.listFiles(true); + } else { + files.add(folder); + } + } List<DataFile> unsortedFiles = new ArrayList<>(); for (CarbonFile file : files) { if (isColumnarFile(file.getName())) { diff --git a/tools/cli/src/test/java/org/apache/carbondata/tool/CarbonCliTest.java b/tools/cli/src/test/java/org/apache/carbondata/tool/CarbonCliTest.java index 4d89777..f6ff49c 100644 --- a/tools/cli/src/test/java/org/apache/carbondata/tool/CarbonCliTest.java +++ b/tools/cli/src/test/java/org/apache/carbondata/tool/CarbonCliTest.java @@ -24,8 +24,11 @@ import java.io.PrintStream; import org.apache.carbondata.common.exceptions.sql.InvalidLoadOptionException; import org.apache.carbondata.core.constants.CarbonVersionConstants; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; import org.apache.carbondata.core.metadata.datatype.DataTypes; import org.apache.carbondata.core.util.CarbonUtil; +import org.apache.carbondata.core.util.path.CarbonTablePath; import org.apache.carbondata.sdk.file.*; import org.apache.commons.io.FileUtils; @@ -253,7 +256,6 @@ public class CarbonCliTest { PrintStream stream = new PrintStream(out); CarbonCli.run(args, stream); String output = new String(out.toByteArray()); - System.out.println(output); String expectedOutput = buildLines( "Blocklet 0:", "Page 0 (offset 0, length 9): DataChunk2(chunk_meta:ChunkCompressionMeta(compression_codec:DEPRECATED, total_uncompressed_size:96000, total_compressed_size:9, compressor_name:snappy), rowMajor:false, data_page_length:5, rle_page_length:4, presence:PresenceMeta(represents_presence:false, present_bit_stream:00), sort_state:SORT_NATIVE, encoders:[RLE], encoder_meta:[], min_max:BlockletMinMaxIndex(min_values:[72 6F 62 6F 74 30], max_values:[72 6F 62 6F 74 30], min_max_presence:[true] [...] @@ -261,13 +263,34 @@ public class CarbonCliTest { } @Test + public void testSummaryAllColumns() { + String[] args = { "-cmd", "summary", "-p", path, "-C" }; + ByteArrayOutputStream out = new ByteArrayOutputStream(); + PrintStream stream = new PrintStream(out); + CarbonCli.run(args, stream); + String output = new String(out.toByteArray()); + Assert.assertTrue(output.contains("Block Blocklet Column Name Meta Size Data Size")); + } + + @Test + public void testSummaryAllColumnsForOneFile() { + CarbonFile folder = FileFactory.getCarbonFile(path); + CarbonFile[] carbonFiles = + folder.listFiles(file -> file.getName().endsWith(CarbonTablePath.CARBON_DATA_EXT)); + String[] args = { "-cmd", "summary", "-p", carbonFiles[0].getCanonicalPath(), "-C" }; + ByteArrayOutputStream out = new ByteArrayOutputStream(); + PrintStream stream = new PrintStream(out); + CarbonCli.run(args, stream); + String output = new String(out.toByteArray()); + Assert.assertTrue(output.contains("Block Blocklet Column Name Meta Size Data Size")); + } + + @Test public void testBenchmark() { String[] args = {"-cmd", "benchmark", "-p", path, "-a", "-c", "name"}; ByteArrayOutputStream out = new ByteArrayOutputStream(); PrintStream stream = new PrintStream(out); CarbonCli.run(args, stream); - String output = new String(out.toByteArray()); - System.out.println(output); } @Test