[GitHub] carbondata pull request #2683: [CARBONDATA-2916] Add CarbonCli tool for data...
Github user asfgit closed the pull request at: https://github.com/apache/carbondata/pull/2683 ---
[GitHub] carbondata pull request #2683: [CARBONDATA-2916] Add CarbonCli tool for data...
Github user jackylk commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2683#discussion_r217589388 --- Diff: core/src/main/java/org/apache/carbondata/core/util/DataTypeUtil.java --- @@ -168,6 +168,65 @@ public static Object getMeasureObjectBasedOnDataType(ColumnPage measurePage, int } } + /** + * Calculate data percentage in [min, max] scope based on data type + * @param data data to calculate the percentage + * @param min min value + * @param max max value + * @param column column schema including data type + * @return result + */ + public static double computePercentage(byte[] data, byte[] min, byte[] max, ColumnSchema column) { --- End diff -- ok ---
[GitHub] carbondata pull request #2683: [CARBONDATA-2916] Add CarbonCli tool for data...
Github user jackylk commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2683#discussion_r217589323 --- Diff: store/sdk/src/main/java/org/apache/carbondata/sdk/file/TestUtil.java --- @@ -36,38 +36,34 @@ import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.io.DecoderFactory; -import org.apache.avro.io.Encoder; import org.apache.avro.io.JsonDecoder; import org.apache.hadoop.conf.Configuration; -import org.junit.Assert; +@InterfaceAudience.Developer("Test") public class TestUtil { - public static Configuration configuration = new Configuration(); + public static final Configuration configuration = new Configuration(); --- End diff -- Because now CLI is a separate module and testcase in CLI need to use TestUtil to write carbonfiles using SDK, so I have to move the TestUtil to src so that CLI testcase can use it. Is there other way? ---
[GitHub] carbondata pull request #2683: [CARBONDATA-2916] Add CarbonCli tool for data...
Github user ravipesala commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2683#discussion_r217323443 --- Diff: store/sdk/src/main/java/org/apache/carbondata/sdk/file/TestUtil.java --- @@ -36,38 +36,34 @@ import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.io.DecoderFactory; -import org.apache.avro.io.Encoder; import org.apache.avro.io.JsonDecoder; import org.apache.hadoop.conf.Configuration; -import org.junit.Assert; +@InterfaceAudience.Developer("Test") public class TestUtil { - public static Configuration configuration = new Configuration(); + public static final Configuration configuration = new Configuration(); --- End diff -- Why is this test class changed? How it is related to CLI tool ---
[GitHub] carbondata pull request #2683: [CARBONDATA-2916] Add CarbonCli tool for data...
Github user ravipesala commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2683#discussion_r217322845 --- Diff: core/src/main/java/org/apache/carbondata/core/util/DataTypeUtil.java --- @@ -168,6 +168,65 @@ public static Object getMeasureObjectBasedOnDataType(ColumnPage measurePage, int } } + /** + * Calculate data percentage in [min, max] scope based on data type + * @param data data to calculate the percentage + * @param min min value + * @param max max value + * @param column column schema including data type + * @return result + */ + public static double computePercentage(byte[] data, byte[] min, byte[] max, ColumnSchema column) { --- End diff -- Better move this method to the CLI tool only as it is used only there ---
[GitHub] carbondata pull request #2683: [CARBONDATA-2916] Add CarbonCli tool for data...
Github user jackylk commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2683#discussion_r216982870 --- Diff: core/src/main/java/org/apache/carbondata/core/util/DataTypeUtil.java --- @@ -168,6 +168,65 @@ public static Object getMeasureObjectBasedOnDataType(ColumnPage measurePage, int } } + /** + * Calculate data percentage in [min, max] scope based on data type + * @param data data to calculate the percentage + * @param min min value + * @param max max value + * @param column column schema including data type + * @return result + */ + public static double computePercentage(byte[] data, byte[] min, byte[] max, ColumnSchema column) { +if (column.getDataType() == DataTypes.STRING) { + // for string, we do not calculate + return 0; +} else if (DataTypes.isDecimal(column.getDataType())) { + BigDecimal minValue = DataTypeUtil.byteToBigDecimal(min); + BigDecimal dataValue = DataTypeUtil.byteToBigDecimal(data).subtract(minValue); + BigDecimal factorValue = DataTypeUtil.byteToBigDecimal(max).subtract(minValue); + return dataValue.divide(factorValue).doubleValue(); +} +double dataValue, minValue, factorValue; +if (column.getDataType() == DataTypes.SHORT) { + minValue = ByteUtil.toShort(min, 0); + dataValue = ByteUtil.toShort(data, 0) - minValue; + factorValue = ByteUtil.toShort(max, 0) - ByteUtil.toShort(min, 0); +} else if (column.getDataType() == DataTypes.INT) { + if (column.isSortColumn()) { +minValue = ByteUtil.toXorInt(min, 0, min.length); +dataValue = ByteUtil.toXorInt(data, 0, data.length) - minValue; +factorValue = ByteUtil.toXorInt(max, 0, max.length) - ByteUtil.toXorInt(min, 0, min.length); + } else { +minValue = ByteUtil.toLong(min, 0, min.length); +dataValue = ByteUtil.toLong(data, 0, data.length) - minValue; +factorValue = ByteUtil.toLong(max, 0, max.length) - ByteUtil.toLong(min, 0, min.length); + } +} else if (column.getDataType() == DataTypes.LONG) { + minValue = ByteUtil.toLong(min, 0, min.length); + dataValue = 
ByteUtil.toLong(data, 0, data.length) - minValue; + factorValue = ByteUtil.toLong(max, 0, max.length) - ByteUtil.toLong(min, 0, min.length); +} else if (column.getDataType() == DataTypes.DATE) { + minValue = ByteUtil.toInt(min, 0, min.length); + dataValue = ByteUtil.toInt(data, 0, data.length) - minValue; + factorValue = ByteUtil.toInt(max, 0, max.length) - ByteUtil.toInt(min, 0, min.length); +} else if (column.getDataType() == DataTypes.TIMESTAMP) { + minValue = ByteUtil.toLong(min, 0, min.length); + dataValue = ByteUtil.toLong(data, 0, data.length) - minValue; + factorValue = ByteUtil.toLong(max, 0, max.length) - ByteUtil.toLong(min, 0, min.length); +} else if (column.getDataType() == DataTypes.DOUBLE) { + minValue = ByteUtil.toDouble(min, 0, min.length); + dataValue = ByteUtil.toDouble(data, 0, data.length) - minValue; + factorValue = ByteUtil.toDouble(max, 0, max.length) - ByteUtil.toDouble(min, 0, min.length); +} else { + throw new UnsupportedOperationException("data type: " + column.getDataType()); +} + +if (factorValue == 0d) { + return Double.MIN_VALUE; --- End diff -- fixed ---
[GitHub] carbondata pull request #2683: [CARBONDATA-2916] Add CarbonCli tool for data...
Github user jackylk commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2683#discussion_r216981868 --- Diff: tools/cli/src/main/java/org/apache/carbondata/tool/DataSummary.java --- @@ -0,0 +1,360 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.tool; + +import java.io.IOException; +import java.io.PrintStream; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.carbondata.common.Strings; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.memory.MemoryException; +import org.apache.carbondata.core.metadata.datatype.DataTypes; +import org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema; +import org.apache.carbondata.core.reader.CarbonHeaderReader; +import org.apache.carbondata.core.statusmanager.LoadMetadataDetails; +import org.apache.carbondata.core.statusmanager.SegmentStatusManager; +import org.apache.carbondata.core.util.CarbonUtil; +import org.apache.carbondata.core.util.path.CarbonTablePath; +import org.apache.carbondata.format.BlockletInfo3; +import org.apache.carbondata.format.FileFooter3; +import org.apache.carbondata.format.FileHeader; +import org.apache.carbondata.format.TableInfo; + +import static org.apache.carbondata.core.constants.CarbonCommonConstants.DEFAULT_CHARSET; + +/** + * Data Summary command implementation for {@link CarbonCli} + */ +class DataSummary { + private String dataFolder; + private PrintStream out; + + private long numBlock; + private long numShard; + private long numBlocklet; + private long numPage; + private long numRow; + private long totalDataSize; + + // file path mapping to file object + private LinkedHashMap dataFiles = new LinkedHashMap<>(); + private CarbonFile tableStatusFile; + private CarbonFile schemaFile; + + DataSummary(String dataFolder, PrintStream out) throws IOException { +this.dataFolder = dataFolder; +this.out = out; +collectDataFiles(); + } + 
+ private boolean isColumnarFile(String fileName) { +// if the timestamp in file name is "0", it is a streaming file +return fileName.endsWith(CarbonTablePath.CARBON_DATA_EXT) && + !CarbonTablePath.DataFileUtil.getTimeStampFromFileName(fileName).equals("0"); + } + + private boolean isStreamFile(String fileName) { +// if the timestamp in file name is "0", it is a streaming file +return fileName.endsWith(CarbonTablePath.CARBON_DATA_EXT) && + CarbonTablePath.DataFileUtil.getTimeStampFromFileName(fileName).equals("0"); + } + + private void collectDataFiles() throws IOException { +Set shards = new HashSet<>(); +CarbonFile folder = FileFactory.getCarbonFile(dataFolder); +List files = folder.listFiles(true); +List unsortedFiles = new ArrayList<>(); +for (CarbonFile file : files) { + if (isColumnarFile(file.getName())) { +DataFile dataFile = new DataFile(file); +unsortedFiles.add(dataFile); +collectNum(dataFile.getFooter()); +shards.add(dataFile.getShardName()); +totalDataSize += file.getSize(); + } else if (file.getName().endsWith(CarbonTablePath.TABLE_STATUS_FILE)) { +tableStatusFile = file; + } else if (file.getName().startsWith(CarbonTablePath.SCHEMA_FILE)) { +schemaFile = file; + } else if (isStreamFile(file.getName())) { +out.println("WARN: input path contains streaming file, this tool does not support it yet, " +
[GitHub] carbondata pull request #2683: [CARBONDATA-2916] Add CarbonCli tool for data...
Github user jackylk commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2683#discussion_r216981072 --- Diff: store/sdk/src/main/java/org/apache/carbondata/sdk/file/TestUtil.java --- @@ -136,33 +136,40 @@ public static void writeFilesAndVerify(int rows, Schema schema, String path, Str CarbonWriter writer = builder.buildWriterForCSVInput(schema, configuration); for (int i = 0; i < rows; i++) { -writer.write(new String[]{"robot" + (i % 10), String.valueOf(i), String.valueOf((double) i / 2)}); +writer.write(new String[]{ +"robot" + (i % 10), String.valueOf(i % 300), String.valueOf((double) i / 2)}); } writer.close(); -} catch (IOException e) { +} catch (Exception e) { e.printStackTrace(); --- End diff -- fixed ---
[GitHub] carbondata pull request #2683: [CARBONDATA-2916] Add CarbonCli tool for data...
Github user xuchuanyin commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2683#discussion_r216965853 --- Diff: core/src/main/java/org/apache/carbondata/core/util/DataTypeUtil.java --- @@ -168,6 +168,65 @@ public static Object getMeasureObjectBasedOnDataType(ColumnPage measurePage, int } } + /** + * Calculate data percentage in [min, max] scope based on data type + * @param data data to calculate the percentage + * @param min min value + * @param max max value + * @param column column schema including data type + * @return result + */ + public static double computePercentage(byte[] data, byte[] min, byte[] max, ColumnSchema column) { +if (column.getDataType() == DataTypes.STRING) { + // for string, we do not calculate + return 0; +} else if (DataTypes.isDecimal(column.getDataType())) { + BigDecimal minValue = DataTypeUtil.byteToBigDecimal(min); + BigDecimal dataValue = DataTypeUtil.byteToBigDecimal(data).subtract(minValue); + BigDecimal factorValue = DataTypeUtil.byteToBigDecimal(max).subtract(minValue); + return dataValue.divide(factorValue).doubleValue(); +} +double dataValue, minValue, factorValue; +if (column.getDataType() == DataTypes.SHORT) { + minValue = ByteUtil.toShort(min, 0); + dataValue = ByteUtil.toShort(data, 0) - minValue; + factorValue = ByteUtil.toShort(max, 0) - ByteUtil.toShort(min, 0); +} else if (column.getDataType() == DataTypes.INT) { + if (column.isSortColumn()) { +minValue = ByteUtil.toXorInt(min, 0, min.length); +dataValue = ByteUtil.toXorInt(data, 0, data.length) - minValue; +factorValue = ByteUtil.toXorInt(max, 0, max.length) - ByteUtil.toXorInt(min, 0, min.length); + } else { +minValue = ByteUtil.toLong(min, 0, min.length); +dataValue = ByteUtil.toLong(data, 0, data.length) - minValue; +factorValue = ByteUtil.toLong(max, 0, max.length) - ByteUtil.toLong(min, 0, min.length); + } +} else if (column.getDataType() == DataTypes.LONG) { + minValue = ByteUtil.toLong(min, 0, min.length); + dataValue = 
ByteUtil.toLong(data, 0, data.length) - minValue; + factorValue = ByteUtil.toLong(max, 0, max.length) - ByteUtil.toLong(min, 0, min.length); +} else if (column.getDataType() == DataTypes.DATE) { + minValue = ByteUtil.toInt(min, 0, min.length); + dataValue = ByteUtil.toInt(data, 0, data.length) - minValue; + factorValue = ByteUtil.toInt(max, 0, max.length) - ByteUtil.toInt(min, 0, min.length); +} else if (column.getDataType() == DataTypes.TIMESTAMP) { + minValue = ByteUtil.toLong(min, 0, min.length); + dataValue = ByteUtil.toLong(data, 0, data.length) - minValue; + factorValue = ByteUtil.toLong(max, 0, max.length) - ByteUtil.toLong(min, 0, min.length); +} else if (column.getDataType() == DataTypes.DOUBLE) { + minValue = ByteUtil.toDouble(min, 0, min.length); + dataValue = ByteUtil.toDouble(data, 0, data.length) - minValue; + factorValue = ByteUtil.toDouble(max, 0, max.length) - ByteUtil.toDouble(min, 0, min.length); +} else { + throw new UnsupportedOperationException("data type: " + column.getDataType()); +} + +if (factorValue == 0d) { + return Double.MIN_VALUE; --- End diff -- If the value for the column is constant, the 'factorValue' here will be '0'. And I think the percentage should be '1' instead of 'Double.MIN_VALUE'. ---
[GitHub] carbondata pull request #2683: [CARBONDATA-2916] Add CarbonCli tool for data...
Github user xuchuanyin commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2683#discussion_r216966619 --- Diff: store/sdk/src/main/java/org/apache/carbondata/sdk/file/TestUtil.java --- @@ -136,33 +136,40 @@ public static void writeFilesAndVerify(int rows, Schema schema, String path, Str CarbonWriter writer = builder.buildWriterForCSVInput(schema, configuration); for (int i = 0; i < rows; i++) { -writer.write(new String[]{"robot" + (i % 10), String.valueOf(i), String.valueOf((double) i / 2)}); +writer.write(new String[]{ +"robot" + (i % 10), String.valueOf(i % 300), String.valueOf((double) i / 2)}); } writer.close(); -} catch (IOException e) { +} catch (Exception e) { e.printStackTrace(); --- End diff -- It's not recommended to print the stack trace like this. ---
[GitHub] carbondata pull request #2683: [CARBONDATA-2916] Add CarbonCli tool for data...
Github user xuchuanyin commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2683#discussion_r216968250 --- Diff: tools/cli/src/main/java/org/apache/carbondata/tool/DataSummary.java --- @@ -0,0 +1,360 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.tool; + +import java.io.IOException; +import java.io.PrintStream; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.carbondata.common.Strings; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.memory.MemoryException; +import org.apache.carbondata.core.metadata.datatype.DataTypes; +import org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema; +import org.apache.carbondata.core.reader.CarbonHeaderReader; +import org.apache.carbondata.core.statusmanager.LoadMetadataDetails; +import org.apache.carbondata.core.statusmanager.SegmentStatusManager; +import org.apache.carbondata.core.util.CarbonUtil; +import org.apache.carbondata.core.util.path.CarbonTablePath; +import org.apache.carbondata.format.BlockletInfo3; +import org.apache.carbondata.format.FileFooter3; +import org.apache.carbondata.format.FileHeader; +import org.apache.carbondata.format.TableInfo; + +import static org.apache.carbondata.core.constants.CarbonCommonConstants.DEFAULT_CHARSET; + +/** + * Data Summary command implementation for {@link CarbonCli} + */ +class DataSummary { + private String dataFolder; + private PrintStream out; + + private long numBlock; + private long numShard; + private long numBlocklet; + private long numPage; + private long numRow; + private long totalDataSize; + + // file path mapping to file object + private LinkedHashMap dataFiles = new LinkedHashMap<>(); + private CarbonFile tableStatusFile; + private CarbonFile schemaFile; + + DataSummary(String dataFolder, PrintStream out) throws IOException { +this.dataFolder = dataFolder; +this.out = out; +collectDataFiles(); + } + 
+ private boolean isColumnarFile(String fileName) { +// if the timestamp in file name is "0", it is a streaming file +return fileName.endsWith(CarbonTablePath.CARBON_DATA_EXT) && + !CarbonTablePath.DataFileUtil.getTimeStampFromFileName(fileName).equals("0"); + } + + private boolean isStreamFile(String fileName) { +// if the timestamp in file name is "0", it is a streaming file +return fileName.endsWith(CarbonTablePath.CARBON_DATA_EXT) && + CarbonTablePath.DataFileUtil.getTimeStampFromFileName(fileName).equals("0"); + } + + private void collectDataFiles() throws IOException { +Set shards = new HashSet<>(); +CarbonFile folder = FileFactory.getCarbonFile(dataFolder); +List files = folder.listFiles(true); +List unsortedFiles = new ArrayList<>(); +for (CarbonFile file : files) { + if (isColumnarFile(file.getName())) { +DataFile dataFile = new DataFile(file); +unsortedFiles.add(dataFile); +collectNum(dataFile.getFooter()); +shards.add(dataFile.getShardName()); +totalDataSize += file.getSize(); + } else if (file.getName().endsWith(CarbonTablePath.TABLE_STATUS_FILE)) { +tableStatusFile = file; + } else if (file.getName().startsWith(CarbonTablePath.SCHEMA_FILE)) { +schemaFile = file; + } else if (isStreamFile(file.getName())) { +out.println("WARN: input path contains streaming file, this tool does not support it yet, " +
[GitHub] carbondata pull request #2683: [CARBONDATA-2916] Add CarbonCli tool for data...
Github user jackylk commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2683#discussion_r216316121 --- Diff: pom.xml --- @@ -706,6 +706,12 @@ datamap/mv/core + + tool --- End diff -- ok, fixed ---
[GitHub] carbondata pull request #2683: [CARBONDATA-2916] Add CarbonCli tool for data...
Github user chenliang613 commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2683#discussion_r215669915 --- Diff: pom.xml --- @@ -706,6 +706,12 @@ datamap/mv/core + + tool --- End diff -- suggest using "tools" ---