Github user jackylk commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2683#discussion_r216981868
--- Diff:
tools/cli/src/main/java/org/apache/carbondata/tool/DataSummary.java ---
@@ -0,0 +1,360 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.tool;
+
+import java.io.IOException;
+import java.io.PrintStream;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.carbondata.common.Strings;
+import org.apache.carbondata.core.datastore.filesystem.CarbonFile;
+import org.apache.carbondata.core.datastore.impl.FileFactory;
+import org.apache.carbondata.core.memory.MemoryException;
+import org.apache.carbondata.core.metadata.datatype.DataTypes;
+import
org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema;
+import org.apache.carbondata.core.reader.CarbonHeaderReader;
+import org.apache.carbondata.core.statusmanager.LoadMetadataDetails;
+import org.apache.carbondata.core.statusmanager.SegmentStatusManager;
+import org.apache.carbondata.core.util.CarbonUtil;
+import org.apache.carbondata.core.util.path.CarbonTablePath;
+import org.apache.carbondata.format.BlockletInfo3;
+import org.apache.carbondata.format.FileFooter3;
+import org.apache.carbondata.format.FileHeader;
+import org.apache.carbondata.format.TableInfo;
+
+import static
org.apache.carbondata.core.constants.CarbonCommonConstants.DEFAULT_CHARSET;
+
+/**
+ * Data Summary command implementation for {@link CarbonCli}
+ */
+class DataSummary {
+ private String dataFolder;
+ private PrintStream out;
+
+ private long numBlock;
+ private long numShard;
+ private long numBlocklet;
+ private long numPage;
+ private long numRow;
+ private long totalDataSize;
+
+ // file path mapping to file object
+ private LinkedHashMap<String, DataFile> dataFiles = new
LinkedHashMap<>();
+ private CarbonFile tableStatusFile;
+ private CarbonFile schemaFile;
+
+ DataSummary(String dataFolder, PrintStream out) throws IOException {
+ this.dataFolder = dataFolder;
+ this.out = out;
+ collectDataFiles();
+ }
+
+ private boolean isColumnarFile(String fileName) {
+ // if the timestamp in file name is "0", it is a streaming file
+ return fileName.endsWith(CarbonTablePath.CARBON_DATA_EXT) &&
+
!CarbonTablePath.DataFileUtil.getTimeStampFromFileName(fileName).equals("0");
+ }
+
+ private boolean isStreamFile(String fileName) {
+ // if the timestamp in file name is "0", it is a streaming file
+ return fileName.endsWith(CarbonTablePath.CARBON_DATA_EXT) &&
+
CarbonTablePath.DataFileUtil.getTimeStampFromFileName(fileName).equals("0");
+ }
+
+ private void collectDataFiles() throws IOException {
+ Set<String> shards = new HashSet<>();
+ CarbonFile folder = FileFactory.getCarbonFile(dataFolder);
+ List<CarbonFile> files = folder.listFiles(true);
+ List<DataFile> unsortedFiles = new ArrayList<>();
+ for (CarbonFile file : files) {
+ if (isColumnarFile(file.getName())) {
+ DataFile dataFile = new DataFile(file);
+ unsortedFiles.add(dataFile);
+ collectNum(dataFile.getFooter());
+ shards.add(dataFile.getShardName());
+ totalDataSize += file.getSize();
+ } else if
(file.getName().endsWith(CarbonTablePath.TABLE_STATUS_FILE)) {
+ tableStatusFile = file;
+ } else if (file.getName().startsWith(CarbonTablePath.SCHEMA_FILE)) {
+ schemaFile = file;
+ } else if (isStreamFile(file.getName())) {
+ out.println("WARN: input path contains streaming file, this tool
does not support it yet, "
+ + "skipping it...");
+ }
+ }
+ unsortedFiles.sort((o1, o2) -> {
+ if (o1.getShardName().equalsIgnoreCase(o2.getShardName())) {
+ return Integer.parseInt(o1.getPartNo()) -
Integer.parseInt(o2.getPartNo());
+ } else {
+ return o1.getShardName().hashCode() - o2.getShardName().hashCode();
--- End diff --
Fixed, will use `o1.getShardName().compareTo(o2.getShardName())`
---