Github user ajantha-bhat commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228477978
--- Diff:
examples/spark2/src/main/java/org/apache/carbondata/benchmark/SDKReaderExampleForBigData.java
---
@@ -0,0 +1,262 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.benchmark;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.sql.Timestamp;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+
+import
org.apache.carbondata.common.exceptions.sql.InvalidLoadOptionException;
+import org.apache.carbondata.core.constants.CarbonCommonConstants;
+import org.apache.carbondata.core.util.CarbonProperties;
+import org.apache.carbondata.sdk.file.*;
+
+/**
+ * Test SDK read example for big data
+ */
+public class SDKReaderExampleForBigData {
+ public static void main(String[] args) throws InterruptedException,
InvalidLoadOptionException, IOException {
+ System.out.println("start to read data");
+ String path = "../../../../Downloads/carbon-data-big";
+ if (args.length > 0) {
+ path = args[0];
+ }
+ double num = 1000000000.0;
+ String originPath = "../../../../Downloads/carbon-data";
+ String newPath = "../../../../Downloads/carbon-data-big";
+ boolean writeNewData = false;
+ if (writeNewData) {
+ extendData(originPath, newPath);
+ }
+
+ Configuration conf = new Configuration();
+ if (args.length > 3) {
+ conf.set("fs.s3a.access.key", args[1]);
+ conf.set("fs.s3a.secret.key", args[2]);
+ conf.set("fs.s3a.endpoint", args[3]);
+ }
+ readNextBatchRow(path, num, conf, 100000, 100000);
+ readNextRow(path, num, conf, 100000);
+ }
+
+ public static void readNextRow(String path, double num, Configuration
conf, int printNum) {
+ System.out.println("readNextRow");
+ try {
+ // Read data
+ Long startTime = System.nanoTime();
+ CarbonReader reader = CarbonReader
+ .builder(path, "_temp")
+ .withHadoopConf(conf)
+ .build();
+
+ Long startReadTime = System.nanoTime();
+ System.out.println("build time is " + (startReadTime - startTime) /
num);
+ int i = 0;
+ while (reader.hasNext()) {
+ Object[] data = (Object[]) reader.readNextRow();
+ i++;
+ if (i % printNum == 0) {
+ Long point = System.nanoTime();
+ System.out.print(i + ": time is " + (point - startReadTime) / num
+ + " s, speed is " + (i / ((point - startReadTime) / num)) +
" records/s \t");
+ for (int j = 0; j < data.length; j++) {
+ System.out.print(data[j] + "\t\t");
+ }
+ System.out.println();
+ }
+ }
+ Long endReadTime = System.nanoTime();
+ System.out.println("total lines is " + i + ", build time is " +
(startReadTime - startTime) / num
+ + " s, \ttotal read time is " + (endReadTime - startReadTime) /
num
+ + " s, \taverage speed is " + (i / ((endReadTime -
startReadTime) / num))
+ + " records/s.");
+ reader.close();
+ } catch (Throwable e) {
+ e.printStackTrace();
+ }
+ }
+
+ /**
+ * read next batch row
+ *
+ * @param path data path
+ * @param num number for time
+ * @param conf configuration
+ * @param batch batch size
+ * @param printNum print number for each batch
+ */
+ public static void readNextBatchRow(String path, double num,
Configuration conf, int batch, int printNum) {
+ System.out.println("readNextBatchRow");
+ try {
+ // Read data
+ Long startTime = System.nanoTime();
+ CarbonReader reader = CarbonReader
+ .builder(path, "_temp")
+ .withHadoopConf(conf)
+ .withBatch(batch)
+ .build();
+
+ Long startReadTime = System.nanoTime();
+ Long startBatchReadTime = startReadTime;
+ System.out.println("build time is " + (startBatchReadTime -
startTime) / num);
+ int i = 0;
+ long startHasNext = System.nanoTime();
+ while (reader.hasNext()) {
+ Long endHasNext = System.nanoTime();
+
+ Object[] batchRow = reader.readNextBatchRow();
+ for (int k = 0; k < batchRow.length; k++) {
+ Object[] data = (Object[]) batchRow[k];
+ i++;
+ if (i > 0 && i % printNum == 0) {
+ Long point = System.nanoTime();
+ System.out.print(i + ": time is " + (point -
startBatchReadTime) / num
+ + " s, \tspeed is " + (printNum / ((point -
startBatchReadTime) / num))
+ + " records/s, \thasNext time is " + (endHasNext -
startHasNext) / num + " s\t");
+ for (int j = 0; j < data.length; j++) {
+ System.out.print(data[j] + "\t\t");
+ }
+ System.out.println();
+ startBatchReadTime = System.nanoTime();
+ }
+ }
+ startHasNext = System.nanoTime();
+ }
+ Long endReadTime = System.nanoTime();
+ System.out.println("total lines is " + i + ", build time is " +
(startReadTime - startTime) / num
+ + " s, \ttotal read time is " + (endReadTime - startReadTime) /
num
+ + " s, \taverage speed is " + (i / ((endReadTime -
startReadTime) / num))
+ + " records/s.");
+ reader.close();
+ } catch (Throwable e) {
+ e.printStackTrace();
+ }
+ }
+
+ public static Schema readSchema(String path) throws IOException {
+ File[] dataFiles = new File(path).listFiles(new FilenameFilter() {
+ @Override
+ public boolean accept(File dir, String name) {
+ if (name == null) {
+ return false;
+ }
+ return name.endsWith("carbondata");
+ }
+ });
+ if (dataFiles == null || dataFiles.length < 1) {
+ throw new RuntimeException("Carbon index file not exists.");
+ }
+ Schema schema = CarbonSchemaReader
+ .readSchemaInDataFile(dataFiles[0].getAbsolutePath())
+ .asOriginOrder();
+ return schema;
+ }
+
+ /**
+ * extend data
+ * read origin path data and generate new data in new path,
+ * the new data is bigger than origin data
+ *
+ * @param originPath origin path of data
+ * @param newPath new path of data
+ * @throws IOException
+ * @throws InterruptedException
+ * @throws InvalidLoadOptionException
+ */
+ public static void extendData(String originPath, String newPath)
+ throws IOException, InterruptedException, InvalidLoadOptionException
{
--- End diff --
we don't need this.
1) For benchmarking read performance, just have a simple example that takes
the path of the files, reads them, and prints the time. Why is extendData
required?
2) Also, below, readSchema(originPath) is called for writing carbon files. We
should not write any files.
You can just keep an example that reads from a path and prints the time. That's all.
---