Github user omalley commented on a diff in the pull request: https://github.com/apache/orc/pull/252#discussion_r185845008 --- Diff: java/bench/src/java/org/apache/orc/bench/DecimalBench.java --- @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.orc.bench; + +import com.google.gson.JsonStreamParser; +import org.apache.avro.file.DataFileReader; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.DatumReader; +import org.apache.avro.mapred.FsInput; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.TrackingLocalFileSystem; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.Decimal64ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport; +import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Reporter; +import org.apache.orc.CompressionKind; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.RecordReader; +import org.apache.orc.TypeDescription; +import org.apache.orc.Writer; +import org.apache.orc.bench.convert.BatchReader; +import org.apache.orc.bench.convert.GenerateVariants; +import org.apache.orc.bench.convert.csv.CsvReader; +import org.apache.parquet.hadoop.ParquetInputFormat; +import org.openjdk.jmh.annotations.AuxCounters; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; 
+import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.options.OptionsBuilder; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URI; +import java.nio.charset.StandardCharsets; +import java.util.concurrent.TimeUnit; + +@BenchmarkMode(Mode.AverageTime) +@Warmup(iterations=2, time=30, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations=10, time=30, timeUnit = TimeUnit.SECONDS) +@State(Scope.Thread) +@OutputTimeUnit(TimeUnit.MICROSECONDS) +@Fork(2) +public class DecimalBench { + + private static final String ROOT_ENVIRONMENT_NAME = "bench.root.dir"; + private static final Path root; + static { + String value = System.getProperty(ROOT_ENVIRONMENT_NAME); + root = value == null ? null : new Path(value); + } + + /** + * Abstract out whether we are writing short or long decimals + */ + interface Loader { + /** + * Load the data from the values array into the ColumnVector. 
+ * @param vector the output + * @param values the intput + * @param offset the first input value + * @param length the number of values to copy + */ + void loadData(ColumnVector vector, long[] values, int offset, int length); + } + + static class Decimal64Loader implements Loader { + final int scale; + final int precision; + + Decimal64Loader(int precision, int scale) { + this.precision = precision; + this.scale = scale; + } + + @Override + public void loadData(ColumnVector vector, long[] values, int offset, int length) { + Decimal64ColumnVector v = (Decimal64ColumnVector) vector; + v.ensureSize(length, false); + v.noNulls = true; + for(int p=0; p < length; ++p) { + v.vector[p] = values[p + offset]; + } + v.precision = (short) precision; + v.scale = (short) scale; + } + } + + static class DecimalLoader implements Loader { + final int scale; + final int precision; + + DecimalLoader(int precision, int scale) { + this.precision = precision; + this.scale = scale; + } + + @Override + public void loadData(ColumnVector vector, long[] values, int offset, int length) { + DecimalColumnVector v = (DecimalColumnVector) vector; + v.noNulls = true; + for(int p=0; p < length; ++p) { + v.vector[p].setFromLongAndScale(values[offset + p], scale); + } + v.precision = (short) precision; + v.scale = (short) scale; + } + } + + @State(Scope.Thread) + public static class OutputState { + + // try both short and long decimals + @Param({"8", "19"}) + public int precision; + + long[] total_amount = new long[1024 * 1024]; + Configuration conf = new Configuration(); + FileSystem fs = new NullFileSystem(); + TypeDescription schema; + VectorizedRowBatch batch; + Loader loader; + + @Setup + public void setup() throws IOException { + schema = TypeDescription.createDecimal() + .withScale(2) + .withPrecision(precision); + loader = precision <= 18 ? 
+ new Decimal64Loader(precision, 2) : + new DecimalLoader(precision, 2); + readCsvData(total_amount, root, "total_amount", conf); + batch = schema.createRowBatchV2(); + } + } + + @Benchmark + public void write(OutputState state) throws Exception { + Writer writer = OrcFile.createWriter(new Path("null"), --- End diff -- I started that way, but the problem is that I do want to measure the close as part of the benchmark, since a lot of the work is deferred until then, and it felt wrong to have mismatched open/close calls.
---