This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-1.8
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/branch-1.8 by this push:
     new 8daa92eb3 ORC-1139: Benchmark for ORC-1136 that combines multiple individual close reads into a single read
8daa92eb3 is described below

commit 8daa92eb391e783d73f4e70291cafeb24e900601
Author: Pavan Lanka <[email protected]>
AuthorDate: Mon May 9 09:42:34 2022 -0700

    ORC-1139: Benchmark for ORC-1136 that combines multiple individual close reads into a single read
    
    Adds a new benchmark to estimate the impact of combining multiple close reads into a single read.
    
    The benchmark also serves to confirm that this change does not introduce any performance penalties.
    
    Tested by running and verifying the benchmark results.
    
    Closes #1112
    
    Signed-off-by: Dongjoon Hyun <[email protected]>
    (cherry picked from commit 1ff438813333b75585c49828768e41199de5a088)
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 .../apache/orc/bench/core/filter/FilterBench.java  |  35 ++---
 .../apache/orc/bench/core/impl/ChunkReadBench.java | 121 +++++++++++++++++
 .../apache/orc/bench/core/impl/ChunkReadUtil.java  | 148 +++++++++++++++++++++
 .../orc/bench/core/impl/ChunkReadUtilTest.java     |  90 +++++++++++++
 4 files changed, 379 insertions(+), 15 deletions(-)

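For context, the ORC-1136 behavior under test is driven by two reader settings that ChunkReadUtil.setConf toggles below. A minimal sketch of enabling them on a read, assuming an illustrative file path and reusing the calls from ChunkReadUtil.readORCFile further down:

    // Sketch only: the values match the @Param sweep in ChunkReadBench,
    // and "example.orc" is a hypothetical path.
    Configuration conf = new Configuration();
    OrcConf.ORC_MIN_DISK_SEEK_SIZE.setInt(conf, 4 * 1024 * 1024);      // benchmark sweeps 0 and 4194304
    OrcConf.ORC_MIN_DISK_SEEK_SIZE_TOLERANCE.setDouble(conf, 10.0);    // benchmark sweeps 0.0 and 10.0
    Reader reader = OrcFile.createReader(new Path("example.orc"), OrcFile.readerOptions(conf));
    VectorizedRowBatch batch = reader.getSchema().createRowBatch();
    RecordReader rows = reader.rows(reader.options());
    long rowCount = 0;
    while (rows.nextBatch(batch)) {
      rowCount += batch.size;
    }
    rows.close();
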
diff --git a/java/bench/core/src/java/org/apache/orc/bench/core/filter/FilterBench.java b/java/bench/core/src/java/org/apache/orc/bench/core/filter/FilterBench.java
index c984e6d23..e3d88b119 100644
--- a/java/bench/core/src/java/org/apache/orc/bench/core/filter/FilterBench.java
+++ b/java/bench/core/src/java/org/apache/orc/bench/core/filter/FilterBench.java
@@ -81,7 +81,7 @@ public class FilterBench implements OrcBenchmark {
     new Runner(parseOptions(args)).run();
   }
 
-  private static CommandLine parseCommandLine(String[] args) {
+  public static CommandLine parseCommandLine(String[] args, boolean needsArgs) {
     org.apache.commons.cli.Options options = new org.apache.commons.cli.Options()
       .addOption("h", HELP, false, "Provide help")
       .addOption("i", ITERATIONS, true, "Number of iterations")
@@ -98,7 +98,7 @@ public class FilterBench implements OrcBenchmark {
       System.err.println("Argument exception - " + pe.getMessage());
       result = null;
     }
-    if (result == null || result.hasOption(HELP) || result.getArgs().length == 0) {
+    if (result == null || result.hasOption(HELP) || (needsArgs && result.getArgs().length == 0)) {
       new HelpFormatter().printHelp("java -jar <jar> <command> <options> <sub_cmd>\n"
                                     + "sub_cmd:\nsimple\ncomplex\n",
                                     options);
@@ -108,20 +108,8 @@ public class FilterBench implements OrcBenchmark {
     return result;
   }
 
-  public static Options parseOptions(String[] args) {
-    CommandLine options = parseCommandLine(args);
-    String cmd = options.getArgs()[0];
+  public static OptionsBuilder optionsBuilder(CommandLine options) {
     OptionsBuilder builder = new OptionsBuilder();
-    switch (cmd) {
-      case "simple":
-        builder.include(SimpleFilter.class.getSimpleName());
-        break;
-      case "complex":
-        builder.include(ComplexFilter.class.getSimpleName());
-        break;
-      default:
-        throw new UnsupportedOperationException(String.format("Command %s is not supported", cmd));
-    }
     if (options.hasOption(GC)) {
       builder.addProfiler("hs_gc");
     }
@@ -147,6 +135,23 @@ public class FilterBench implements OrcBenchmark {
     String maxMemory = options.getOptionValue(MAX_MEMORY, "2g");
     builder.jvmArgs("-server",
                     "-Xms" + minMemory, "-Xmx" + maxMemory);
+    return builder;
+  }
+
+  public static Options parseOptions(String[] args) {
+    CommandLine options = parseCommandLine(args, true);
+    OptionsBuilder builder = optionsBuilder(options);
+    String cmd = options.getArgs()[0];
+    switch (cmd) {
+      case "simple":
+        builder.include(SimpleFilter.class.getSimpleName());
+        break;
+      case "complex":
+        builder.include(ComplexFilter.class.getSimpleName());
+        break;
+      default:
+        throw new UnsupportedOperationException(String.format("Command %s is not supported", cmd));
+    }
     return builder.build();
   }
 
diff --git a/java/bench/core/src/java/org/apache/orc/bench/core/impl/ChunkReadBench.java b/java/bench/core/src/java/org/apache/orc/bench/core/impl/ChunkReadBench.java
new file mode 100644
index 000000000..a33618449
--- /dev/null
+++ b/java/bench/core/src/java/org/apache/orc/bench/core/impl/ChunkReadBench.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.core.impl;
+
+import com.google.auto.service.AutoService;
+import org.apache.commons.cli.CommandLine;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.orc.bench.core.OrcBenchmark;
+import org.apache.orc.bench.core.filter.FilterBench;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.concurrent.TimeUnit;
+
+@OutputTimeUnit(value = TimeUnit.SECONDS)
+@Warmup(iterations = 20, time = 1)
+@BenchmarkMode(value = Mode.AverageTime)
+@Fork(value = 1)
+@State(value = Scope.Benchmark)
+@Measurement(iterations = 20, time = 1)
+@AutoService(OrcBenchmark.class)
+public class ChunkReadBench implements OrcBenchmark {
+  @Override
+  public String getName() {
+    return "chunk_read";
+  }
+
+  @Override
+  public String getDescription() {
+    return "Perform chunk read bench";
+  }
+
+  @Override
+  public void run(String[] args) throws Exception {
+    CommandLine options = FilterBench.parseCommandLine(args, false);
+    OptionsBuilder builder = FilterBench.optionsBuilder(options);
+    builder.include(getClass().getSimpleName());
+    new Runner(builder.build()).run();
+  }
+
+  private final Path workDir = new Path(System.getProperty("test.tmp.dir",
+                                                           "target" + File.separator + "test"
+                                                           + File.separator + "tmp"));
+  private final Path filePath = new Path(workDir, "perf_chunk_read_file.orc");
+  private final Configuration conf = new Configuration();
+  @Param( {"128"})
+  private int colCount;
+
+  @Param( {"65536"})
+  private int rowCount;
+
+  @Param( {"true", "false"})
+  private boolean alternate;
+
+  @Param( {"0", "4194304"})
+  private int minSeekSize;
+
+  @Param( {"0.0", "10.0"})
+  private double extraByteTolerance;
+
+  private long readRows = 0;
+
+  @Setup
+  public void setup() throws IOException {
+    if (minSeekSize == 0 && extraByteTolerance > 0) {
+      throw new IllegalArgumentException("Ignore extraByteTolerance variations 
with seekSize is"
+                                         + " 0");
+    }
+    FileSystem fs = FileSystem.get(conf);
+    if (!fs.exists(filePath)) {
+      ChunkReadUtil.createORCFile(colCount, rowCount, filePath);
+    }
+    ChunkReadUtil.setConf(conf, minSeekSize, extraByteTolerance);
+  }
+
+  @Benchmark
+  public long read() throws IOException {
+    readRows = ChunkReadUtil.readORCFile(filePath, conf, alternate);
+    return readRows;
+  }
+
+  @TearDown
+  public void tearDown() {
+    if (readRows != rowCount) {
+      throw new IllegalArgumentException(String.format(
+        "readRows %d is not equal to expected rows %d", readRows, rowCount));
+    }
+  }
+}
diff --git a/java/bench/core/src/java/org/apache/orc/bench/core/impl/ChunkReadUtil.java b/java/bench/core/src/java/org/apache/orc/bench/core/impl/ChunkReadUtil.java
new file mode 100644
index 000000000..6877c9e34
--- /dev/null
+++ b/java/bench/core/src/java/org/apache/orc/bench/core/impl/ChunkReadUtil.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.core.impl;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.orc.OrcConf;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.RecordReader;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Random;
+
+public class ChunkReadUtil {
+  private static final int SCALE = 6;
+
+  static void setConf(Configuration conf, int minSeekSize, double extraByteTolerance) {
+    OrcConf.ORC_MIN_DISK_SEEK_SIZE.setInt(conf, minSeekSize);
+    OrcConf.ORC_MIN_DISK_SEEK_SIZE_TOLERANCE.setDouble(conf, extraByteTolerance);
+  }
+
+  static long readORCFile(Path file, Configuration conf, boolean alternate)
+    throws IOException {
+    Reader r = OrcFile.createReader(file, OrcFile.readerOptions(conf));
+    long rowCount = 0;
+    VectorizedRowBatch batch = r.getSchema().createRowBatch();
+    Reader.Options o = r.options();
+    if (alternate) {
+      o.include(includeAlternate(r.getSchema()));
+    }
+    RecordReader rr = r.rows(o);
+    while (rr.nextBatch(batch)) {
+      rowCount += batch.size;
+    }
+    return rowCount;
+  }
+
+  private static boolean[] includeAlternate(TypeDescription schema) {
+    boolean[] includes = new boolean[schema.getMaximumId() + 1];
+    for (int i = 1; i < includes.length; i += 2) {
+      includes[i] = true;
+    }
+    includes[0] = true;
+    return includes;
+  }
+
+  static long createORCFile(int colCount, int rowCount, Path file) throws IOException {
+    TypeDescription schema = createSchema(colCount);
+    return writeFile(schema, rowCount, file);
+  }
+
+  private static long writeFile(TypeDescription schema, int rowCount, Path path)
+    throws IOException {
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(conf);
+
+    try (Writer writer = OrcFile.createWriter(path,
+                                              OrcFile.writerOptions(conf)
+                                                .fileSystem(fs)
+                                                .overwrite(true)
+                                                .rowIndexStride(8192)
+                                                .setSchema(schema)
+                                                .overwrite(true))) {
+      Random rnd = new Random(1024);
+      VectorizedRowBatch b = schema.createRowBatch();
+      for (int rowIdx = 0; rowIdx < rowCount; rowIdx++) {
+        ((LongColumnVector) b.cols[0]).vector[b.size] = rowIdx;
+        long v = rnd.nextLong();
+        for (int colIdx = 1; colIdx < schema.getChildren().size() - 1; colIdx++) {
+          switch (schema.getChildren().get(colIdx).getCategory()) {
+            case LONG:
+              ((LongColumnVector) b.cols[colIdx]).vector[b.size] = v;
+              break;
+            case DECIMAL:
+              HiveDecimalWritable d = new HiveDecimalWritable();
+              d.setFromLongAndScale(v, SCALE);
+              ((DecimalColumnVector) b.cols[colIdx]).vector[b.size] = d;
+              break;
+            case STRING:
+              ((BytesColumnVector) b.cols[colIdx]).setVal(b.size,
+                                                          String.valueOf(v)
+                                                            .getBytes(StandardCharsets.UTF_8));
+              break;
+            default:
+              throw new IllegalArgumentException();
+          }
+        }
+        b.size += 1;
+        if (b.size == b.getMaxSize()) {
+          writer.addRowBatch(b);
+          b.reset();
+        }
+      }
+      if (b.size > 0) {
+        writer.addRowBatch(b);
+        b.reset();
+      }
+    }
+    return fs.getFileStatus(path).getLen();
+  }
+
+  private static TypeDescription createSchema(int colCount) {
+    TypeDescription schema = TypeDescription.createStruct()
+      .addField("id", TypeDescription.createLong());
+    for (int i = 1; i <= colCount; i++) {
+      TypeDescription fieldType;
+      switch (i % 3) {
+        case 0:
+          fieldType = TypeDescription.createString();
+          break;
+        case 1:
+          fieldType = TypeDescription.createDecimal().withPrecision(20).withScale(SCALE);
+          break;
+        default:
+          fieldType = TypeDescription.createLong();
+          break;
+      }
+      schema.addField("f_" + i, fieldType);
+    }
+    return schema;
+  }
+}
diff --git a/java/bench/core/src/test/org/apache/orc/bench/core/impl/ChunkReadUtilTest.java b/java/bench/core/src/test/org/apache/orc/bench/core/impl/ChunkReadUtilTest.java
new file mode 100644
index 000000000..1169998d8
--- /dev/null
+++ b/java/bench/core/src/test/org/apache/orc/bench/core/impl/ChunkReadUtilTest.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.core.impl;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import java.io.File;
+import java.io.IOException;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+class ChunkReadUtilTest {
+  private static final Path workDir = new Path(System.getProperty("test.tmp.dir",
+                                                                  "target" + File.separator + "test"
+                                                                  + File.separator + "tmp"));
+  private static final Path filePath = new Path(workDir, "chunk_read_file.orc");
+  private static long fileLength;
+  private static final int ROW_COUNT = 524288;
+  private static final int COL_COUNT = 16;
+
+  @BeforeAll
+  public static void setup() throws IOException {
+    fileLength = ChunkReadUtil.createORCFile(COL_COUNT, ROW_COUNT, filePath);
+  }
+
+  private static void readStart() {
+    FileSystem.clearStatistics();
+  }
+
+  private static FileSystem.Statistics readEnd() {
+    return FileSystem.getAllStatistics().get(0);
+  }
+
+  @Test
+  public void testReadAll() throws IOException {
+    Configuration conf = new Configuration();
+    readStart();
+    assertEquals(ROW_COUNT, ChunkReadUtil.readORCFile(filePath, conf, false));
+    assertTrue((readEnd().getBytesRead() / (double) fileLength) > 1);
+  }
+
+  @Test
+  public void testReadAlternate() throws IOException {
+    Configuration conf = new Configuration();
+    readStart();
+    assertEquals(ROW_COUNT, ChunkReadUtil.readORCFile(filePath, conf, true));
+    assertTrue((readEnd().getBytesRead() / (double) fileLength) < .5);
+  }
+
+  @Test
+  public void testReadAlternateWMinSeekSize() throws IOException {
+    Configuration conf = new Configuration();
+    ChunkReadUtil.setConf(conf, 4 * 1024 * 1024, 10);
+    readStart();
+    assertEquals(ROW_COUNT, ChunkReadUtil.readORCFile(filePath, conf, true));
+    double readFraction = readEnd().getBytesRead() / (double) fileLength;
+    assertTrue(readFraction > 1 && readFraction < 1.01);
+  }
+
+  @Test
+  public void testReadAlternateWMinSeekSizeDrop() throws IOException {
+    Configuration conf = new Configuration();
+    ChunkReadUtil.setConf(conf, 4 * 1024 * 1024, 0);
+    readStart();
+    assertEquals(ROW_COUNT, ChunkReadUtil.readORCFile(filePath, conf, true));
+    double readFraction = readEnd().getBytesRead() / (double) fileLength;
+    assertTrue(readFraction > 1 && readFraction < 1.01);
+  }
+}
\ No newline at end of file
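
The new benchmark is registered through @AutoService(OrcBenchmark.class) under the name "chunk_read", so with the benchmark uber jar it should be runnable like the existing commands; an invocation would look roughly like the following (jar name illustrative; -i is the iterations option parsed in FilterBench.parseCommandLine):

    java -jar <jar> chunk_read -i 10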
