Github user ajantha-bhat commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228477978
--- Diff:
examples/spark2/src/main/java/org/apache/carbondata/benchmark/SDKReaderExampleForBigData.java
---
@@ -0,0 +1,262 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.benchmark;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.sql.Timestamp;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+
+import
org.apache.carbondata.common.exceptions.sql.InvalidLoadOptionException;
+import org.apache.carbondata.core.constants.CarbonCommonConstants;
+import org.apache.carbondata.core.util.CarbonProperties;
+import org.apache.carbondata.sdk.file.*;
+
+/**
+ * Test SDK read example for big data
+ */
+public class SDKReaderExampleForBigData {
+ public static void main(String[] args) throws InterruptedException,
InvalidLoadOptionException, IOException {
+ System.out.println("start to read data");
+ String path = "../../../../Downloads/carbon-data-big";
+ if (args.length > 0) {
+ path = args[0];
+ }
+ double num = 1000000000.0;
+ String originPath = "../../../../Downloads/carbon-data";
+ String newPath = "../../../../Downloads/carbon-data-big";
+ boolean writeNewData = false;
+ if (writeNewData) {
+ extendData(originPath, newPath);
+ }
+
+ Configuration conf = new Configuration();
+ if (args.length > 3) {
+ conf.set("fs.s3a.access.key", args[1]);
+ conf.set("fs.s3a.secret.key", args[2]);
+ conf.set("fs.s3a.endpoint", args[3]);
+ }
+ readNextBatchRow(path, num, conf, 100000, 100000);
+ readNextRow(path, num, conf, 100000);
+ }
+
+ public static void readNextRow(String path, double num, Configuration
conf, int printNum) {
+ System.out.println("readNextRow");
+ try {
+ // Read data
+ Long startTime = System.nanoTime();
+ CarbonReader reader = CarbonReader
+ .builder(path, "_temp")
+ .withHadoopConf(conf)
+ .build();
+
+ Long startReadTime = System.nanoTime();
+ System.out.println("build time is " + (startReadTime - startTime) /
num);
+ int i = 0;
+ while (reader.hasNext()) {
+ Object[] data = (Object[]) reader.readNextRow();
+ i++;
+ if (i % printNum == 0) {
+ Long point = System.nanoTime();
+ System.out.print(i + ": time is " + (point - startReadTime) / num
+ + " s, speed is " + (i / ((point - startReadTime) / num)) +
" records/s \t");
+ for (int j = 0; j < data.length; j++) {
+ System.out.print(data[j] + "\t\t");
+ }
+ System.out.println();
+ }
+ }
+ Long endReadTime = System.nanoTime();
+ System.out.println("total lines is " + i + ", build time is " +
(startReadTime - startTime) / num
+ + " s, \ttotal read time is " + (endReadTime - startReadTime) /
num
+ + " s, \taverage speed is " + (i / ((endReadTime -
startReadTime) / num))
+ + " records/s.");
+ reader.close();
+ } catch (Throwable e) {
+ e.printStackTrace();
+ }
+ }
+
+ /**
+ * read next batch row
+ *
+ * @param path data path
+ * @param num number for time
+ * @param conf configuration
+ * @param batch batch size
+ * @param printNum print number for each batch
+ */
+ public static void readNextBatchRow(String path, double num,
Configuration conf, int batch, int printNum) {
+ System.out.println("readNextBatchRow");
+ try {
+ // Read data
+ Long startTime = System.nanoTime();
+ CarbonReader reader = CarbonReader
+ .builder(path, "_temp")
+ .withHadoopConf(conf)
+ .withBatch(batch)
+ .build();
+
+ Long startReadTime = System.nanoTime();
+ Long startBatchReadTime = startReadTime;
+ System.out.println("build time is " + (startBatchReadTime -
startTime) / num);
+ int i = 0;
+ long startHasNext = System.nanoTime();
+ while (reader.hasNext()) {
+ Long endHasNext = System.nanoTime();
+
+ Object[] batchRow = reader.readNextBatchRow();
+ for (int k = 0; k < batchRow.length; k++) {
+ Object[] data = (Object[]) batchRow[k];
+ i++;
+ if (i > 0 && i % printNum == 0) {
+ Long point = System.nanoTime();
+ System.out.print(i + ": time is " + (point -
startBatchReadTime) / num
+ + " s, \tspeed is " + (printNum / ((point -
startBatchReadTime) / num))
+ + " records/s, \thasNext time is " + (endHasNext -
startHasNext) / num + " s\t");
+ for (int j = 0; j < data.length; j++) {
+ System.out.print(data[j] + "\t\t");
+ }
+ System.out.println();
+ startBatchReadTime = System.nanoTime();
+ }
+ }
+ startHasNext = System.nanoTime();
+ }
+ Long endReadTime = System.nanoTime();
+ System.out.println("total lines is " + i + ", build time is " +
(startReadTime - startTime) / num
+ + " s, \ttotal read time is " + (endReadTime - startReadTime) /
num
+ + " s, \taverage speed is " + (i / ((endReadTime -
startReadTime) / num))
+ + " records/s.");
+ reader.close();
+ } catch (Throwable e) {
+ e.printStackTrace();
+ }
+ }
+
+ public static Schema readSchema(String path) throws IOException {
+ File[] dataFiles = new File(path).listFiles(new FilenameFilter() {
+ @Override
+ public boolean accept(File dir, String name) {
+ if (name == null) {
+ return false;
+ }
+ return name.endsWith("carbondata");
+ }
+ });
+ if (dataFiles == null || dataFiles.length < 1) {
+ throw new RuntimeException("Carbon index file not exists.");
+ }
+ Schema schema = CarbonSchemaReader
+ .readSchemaInDataFile(dataFiles[0].getAbsolutePath())
+ .asOriginOrder();
+ return schema;
+ }
+
+ /**
+ * extend data
+ * read origin path data and generate new data in new path,
+ * the new data is bigger than origin data
+ *
+ * @param originPath origin path of data
+ * @param newPath new path of data
+ * @throws IOException
+ * @throws InterruptedException
+ * @throws InvalidLoadOptionException
+ */
+ public static void extendData(String originPath, String newPath)
+ throws IOException, InterruptedException, InvalidLoadOptionException
{
--- End diff --
we don't need this.
1) For benchmarking read performance, just have a simple example that takes
the path of the files, reads them, and prints the time. Why is extendData
required?
2) Also, below, readSchema(originPath) is called for writing carbon files. We
should not write any files.
You can just keep an example that reads from a path and prints the time. That's all.
---