This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 5937e7530 ORC-1667: Add `check` tool to check the index of the 
specified column
5937e7530 is described below

commit 5937e7530751852c76195dfbef492fdef65f9a1a
Author: sychen <syc...@ctrip.com>
AuthorDate: Fri Mar 29 10:57:10 2024 -0700

    ORC-1667: Add `check` tool to check the index of the specified column
    
    ### What changes were proposed in this pull request?
    This PR aims to check the index of the specified column.
    
    We can test the filtering effect by specifying different types.
    
    `check --type stat`  -  Only use column statistics.
    `check --type bloom-filter` -  Only use bloom filter.
    `check --type predicate`  - Used in combination with column statistics and 
bloom filter.
    
    ### Why are the changes needed?
    ORC supports specifying multiple columns to generate bloom filter indexes, 
but it lacks a convenient tool to verify the effect of bloom filter.
    
    Parquet also has similar commands.
    [PARQUET-2138](https://issues.apache.org/jira/browse/PARQUET-2138): Add 
ShowBloomFilterCommand to parquet-cli
    
    ### How was this patch tested?
    Add UT
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #1862 from cxzl25/ORC-1667.
    
    Authored-by: sychen <syc...@ctrip.com>
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
    (cherry picked from commit 3b5b2a6286df48a4ab471aece74bc7b7947042ad)
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
---
 .../src/java/org/apache/orc/tools/CheckTool.java   | 336 +++++++++++++++++++++
 .../src/java/org/apache/orc/tools/Driver.java      |   4 +
 .../orc/tools/bloomfilter/TestCheckTool.java       | 212 +++++++++++++
 site/_docs/java-tools.md                           |  19 ++
 4 files changed, 571 insertions(+)

diff --git a/java/tools/src/java/org/apache/orc/tools/CheckTool.java 
b/java/tools/src/java/org/apache/orc/tools/CheckTool.java
new file mode 100644
index 000000000..2d90241bd
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/CheckTool.java
@@ -0,0 +1,336 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentImpl;
+import org.apache.orc.ColumnStatistics;
+import org.apache.orc.OrcFile;
+import org.apache.orc.OrcProto;
+import org.apache.orc.OrcUtils;
+import org.apache.orc.Reader;
+import org.apache.orc.StripeInformation;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.impl.ColumnStatisticsImpl;
+import org.apache.orc.impl.OrcIndex;
+import org.apache.orc.impl.RecordReaderImpl;
+import org.apache.orc.util.BloomFilter;
+import org.apache.orc.util.BloomFilterIO;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Check whether the specified column of multiple ORC files can filter the 
specified value.
+ */
+public class CheckTool {
+
+  private static final String CHECK_TYPE_PREDICATE = "predicate";
+  private static final String CHECK_TYPE_STAT = "stat";
+  private static final String CHECK_TYPE_BLOOM_FILTER = "bloom-filter";
+
+  public static void main(Configuration conf, String[] args) throws Exception {
+    Options opts = createOptions();
+    CommandLine cli = new DefaultParser().parse(opts, args);
+    HelpFormatter formatter = new HelpFormatter();
+    if (cli.hasOption('h')) {
+      formatter.printHelp("check", opts);
+      return;
+    }
+
+    String type = cli.getOptionValue("type");
+    if (type == null ||
+        (!type.equals(CHECK_TYPE_PREDICATE) &&
+            !type.equals(CHECK_TYPE_STAT) &&
+            !type.equals(CHECK_TYPE_BLOOM_FILTER))) {
+      System.err.printf("type %s not support %n", type);
+      formatter.printHelp("check", opts);
+      return;
+    }
+    String column = cli.getOptionValue("column");
+    if (column == null || column.isEmpty()) {
+      System.err.println("column is null");
+      formatter.printHelp("check", opts);
+      return;
+    }
+    String[] values = cli.getOptionValues("values");
+    if (values == null || values.length == 0) {
+      System.err.println("values is null");
+      formatter.printHelp("check", opts);
+      return;
+    }
+    boolean ignoreExtension = cli.hasOption("ignoreExtension");
+
+    List<Path> inputFiles = new ArrayList<>();
+    String[] files = cli.getArgs();
+    for (String root : files) {
+      Path rootPath = new Path(root);
+      FileSystem fs = rootPath.getFileSystem(conf);
+      for (RemoteIterator<LocatedFileStatus> itr = fs.listFiles(rootPath, 
true); itr.hasNext(); ) {
+        LocatedFileStatus status = itr.next();
+        if (status.isFile() && (ignoreExtension || 
status.getPath().getName().endsWith(".orc"))) {
+          inputFiles.add(status.getPath());
+        }
+      }
+    }
+    if (inputFiles.isEmpty()) {
+      System.err.println("No files found.");
+      System.exit(1);
+    }
+
+    for (Path inputFile : inputFiles) {
+      System.out.println("input file: " + inputFile);
+      FileSystem fs = inputFile.getFileSystem(conf);
+      try (Reader reader = OrcFile.createReader(inputFile,
+          OrcFile.readerOptions(conf).filesystem(fs))) {
+        RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
+        TypeDescription schema = reader.getSchema();
+        boolean[] includedColumns = OrcUtils.includeColumns(column, schema);
+        int colIndex = -1;
+        for (int i = 0; i < includedColumns.length; i++) {
+          if (includedColumns[i]) {
+            colIndex = i;
+            break;
+          }
+        }
+        if (colIndex == -1) {
+          System.err.printf("column: %s not found in file: %s%n", column, 
inputFile);
+          continue;
+        }
+        int stripeIndex = -1;
+        for (StripeInformation stripe : reader.getStripes()) {
+          ++stripeIndex;
+
+          OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
+
+          OrcProto.ColumnEncoding columnEncoding = footer.getColumns(colIndex);
+          TypeDescription subtype = reader.getSchema().findSubtype(colIndex);
+          TypeDescription.Category columnCategory = subtype.getCategory();
+          OrcIndex indices = rows.readRowIndex(stripeIndex, null, 
includedColumns);
+          if (type.equals(CHECK_TYPE_BLOOM_FILTER)) {
+            checkBloomFilter(inputFile, reader, indices, stripeIndex,
+                colIndex, column, columnEncoding, columnCategory, values);
+          } else {
+            checkStatOrPredicate(inputFile, reader, indices, stripeIndex,
+                colIndex, column, columnEncoding, subtype, columnCategory, 
values, type);
+          }
+        }
+      }
+    }
+  }
+
+  private static void checkStatOrPredicate(Path inputFile,
+      Reader reader,
+      OrcIndex indices,
+      int stripeIndex,
+      int colIndex,
+      String column,
+      OrcProto.ColumnEncoding columnEncoding,
+      TypeDescription subtype,
+      TypeDescription.Category columnCategory,
+      String[] values,
+      String type) {
+    OrcProto.RowIndex rowGroupIndex = indices.getRowGroupIndex()[colIndex];
+    int entryCount = rowGroupIndex.getEntryCount();
+    boolean hasBloomFilter = true;
+    OrcProto.BloomFilterIndex[] bloomFilterIndices = 
indices.getBloomFilterIndex();
+    OrcProto.BloomFilterIndex bloomFilterIndex = bloomFilterIndices[colIndex];
+    if (bloomFilterIndex == null || 
bloomFilterIndex.getBloomFilterList().isEmpty()) {
+      hasBloomFilter = false;
+    }
+    for (int i = 0; i < entryCount; i++) {
+      OrcProto.ColumnStatistics statistics = 
rowGroupIndex.getEntry(i).getStatistics();
+      ColumnStatistics cs = ColumnStatisticsImpl.deserialize(subtype,
+          statistics,
+          reader.writerUsedProlepticGregorian(),
+          reader.getConvertToProlepticGregorian());
+
+      BloomFilter bloomFilter = null;
+      if (type.equals(CHECK_TYPE_PREDICATE) && hasBloomFilter) {
+        bloomFilter = BloomFilterIO.deserialize(
+            indices.getBloomFilterKinds()[colIndex], columnEncoding,
+            reader.getWriterVersion(), columnCategory, 
bloomFilterIndex.getBloomFilter(i));
+      }
+
+      for (String value : values) {
+        PredicateLeaf predicateLeaf = 
createPredicateLeaf(PredicateLeaf.Operator.EQUALS,
+            getPredicateLeafType(columnCategory), column, 
convert(columnCategory, value));
+        SearchArgument.TruthValue truthValue = 
RecordReaderImpl.evaluatePredicate(
+            cs, predicateLeaf, bloomFilter);
+        System.out.printf("stripe: %d, rowIndex: %d, value: %s, test value: 
%s%n",
+            stripeIndex, i, value, truthValue);
+      }
+    }
+  }
+
+  private static void checkBloomFilter(Path inputFile,
+      Reader reader,
+      OrcIndex indices,
+      int stripeIndex,
+      int colIndex,
+      String column,
+      OrcProto.ColumnEncoding columnEncoding,
+      TypeDescription.Category columnCategory,
+      String[] values) {
+    OrcProto.BloomFilterIndex[] bloomFilterIndices = 
indices.getBloomFilterIndex();
+    OrcProto.BloomFilterIndex bloomFilterIndex = bloomFilterIndices[colIndex];
+    if (bloomFilterIndex == null || 
bloomFilterIndex.getBloomFilterList().isEmpty()) {
+      System.err.printf("The bloom filter index for column: %s is not found in 
file: %s%n",
+          column, inputFile);
+      return;
+    }
+    List<OrcProto.BloomFilter> bloomFilterList = 
bloomFilterIndex.getBloomFilterList();
+    for (int i = 0; i < bloomFilterList.size(); i++) {
+      OrcProto.BloomFilter bf = bloomFilterList.get(i);
+      org.apache.orc.util.BloomFilter bloomFilter = BloomFilterIO.deserialize(
+          indices.getBloomFilterKinds()[colIndex], columnEncoding,
+          reader.getWriterVersion(), columnCategory, bf);
+      for (String value : values) {
+        boolean testResult = test(bloomFilter, columnCategory, value);
+        if (testResult) {
+          System.out.printf("stripe: %d, rowIndex: %d, value: %s, bloom 
filter: maybe exist%n",
+              stripeIndex, i, value);
+        } else {
+          System.out.printf("stripe: %d, rowIndex: %d, value: %s, bloom 
filter: not exist%n",
+              stripeIndex, i, value);
+        }
+      }
+    }
+  }
+
+  private static boolean test(BloomFilter bloomFilter,
+      TypeDescription.Category columnCategory, String value) {
+    switch (columnCategory){
+      case BYTE:
+      case SHORT:
+      case INT:
+      case LONG:
+      case DATE:
+      case TIMESTAMP:
+        return bloomFilter.testLong(Long.parseLong(value));
+      case FLOAT:
+      case DOUBLE:
+        return bloomFilter.testDouble(Double.parseDouble(value));
+      case STRING:
+      case CHAR:
+      case VARCHAR:
+      case DECIMAL:
+        return bloomFilter.testString(value);
+      default:
+        throw new IllegalStateException("Not supported type:" + 
columnCategory);
+    }
+  }
+
+  private static Object convert(
+      TypeDescription.Category columnCategory, String value) {
+    switch (columnCategory) {
+      case BYTE:
+      case SHORT:
+      case INT:
+      case LONG:
+      case DATE:
+      case TIMESTAMP:
+        return Long.parseLong(value);
+      case FLOAT:
+      case DOUBLE:
+        return Double.parseDouble(value);
+      case STRING:
+      case CHAR:
+      case VARCHAR:
+      case DECIMAL:
+        return value;
+      default:
+        throw new IllegalStateException("Not supported type:" + 
columnCategory);
+    }
+  }
+
+  private static PredicateLeaf.Type 
getPredicateLeafType(TypeDescription.Category columnCategory) {
+    switch (columnCategory){
+      case BOOLEAN:
+        return PredicateLeaf.Type.BOOLEAN;
+      case BYTE:
+      case SHORT:
+      case INT:
+      case LONG:
+        return PredicateLeaf.Type.LONG;
+      case DATE:
+        return PredicateLeaf.Type.DATE;
+      case TIMESTAMP:
+        return  PredicateLeaf.Type.TIMESTAMP;
+      case FLOAT:
+      case DOUBLE:
+        return  PredicateLeaf.Type.FLOAT;
+      case STRING:
+      case CHAR:
+      case VARCHAR:
+      case DECIMAL:
+        return PredicateLeaf.Type.STRING;
+      default:
+        throw new IllegalStateException("Not supported type:" + 
columnCategory);
+    }
+  }
+
+  private static PredicateLeaf createPredicateLeaf(PredicateLeaf.Operator 
operator,
+      PredicateLeaf.Type type,
+      String columnName,
+      Object literal) {
+    return new SearchArgumentImpl.PredicateLeafImpl(operator, type, columnName,
+        literal, null);
+  }
+
+  private static Options createOptions() {
+    Options result = new Options();
+
+    result.addOption(Option.builder("t")
+        .longOpt("type")
+        .desc(String.format("check type = {%s, %s, %s}",
+            CHECK_TYPE_PREDICATE, CHECK_TYPE_STAT, CHECK_TYPE_BLOOM_FILTER))
+        .hasArg()
+        .build());
+
+    result.addOption(Option.builder("col")
+        .longOpt("column")
+        .desc("column name")
+        .hasArg()
+        .build());
+
+    result.addOption(Option.builder("v")
+        .longOpt("values")
+        .desc("test values")
+        .hasArgs()
+        .build());
+
+    result.addOption(Option.builder("h")
+        .longOpt("help")
+        .desc("print help message")
+        .build());
+    return result;
+  }
+}
diff --git a/java/tools/src/java/org/apache/orc/tools/Driver.java 
b/java/tools/src/java/org/apache/orc/tools/Driver.java
index 5b993c2e9..0d2778b41 100644
--- a/java/tools/src/java/org/apache/orc/tools/Driver.java
+++ b/java/tools/src/java/org/apache/orc/tools/Driver.java
@@ -86,6 +86,7 @@ public class Driver {
           " [--define X=Y] <command> <args>");
       System.err.println();
       System.err.println("Commands:");
+      System.err.println("   check - check the index of the specified column");
       System.err.println("   convert - convert CSV/JSON/ORC files to ORC");
       System.err.println("   count - recursively find *.orc and print the 
number of rows");
       System.err.println("   data - print the data from the ORC file");
@@ -106,6 +107,9 @@ public class Driver {
       conf.set(pair.getKey().toString(), pair.getValue().toString());
     }
     switch (options.command) {
+      case "check":
+        CheckTool.main(conf, options.commandArgs);
+        break;
       case "convert":
         ConvertTool.main(conf, options.commandArgs);
         break;
diff --git 
a/java/tools/src/test/org/apache/orc/tools/bloomfilter/TestCheckTool.java 
b/java/tools/src/test/org/apache/orc/tools/bloomfilter/TestCheckTool.java
new file mode 100644
index 000000000..ada80a569
--- /dev/null
+++ b/java/tools/src/test/org/apache/orc/tools/bloomfilter/TestCheckTool.java
@@ -0,0 +1,212 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.bloomfilter;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.OrcFile;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+import org.apache.orc.tools.CheckTool;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class TestCheckTool {
+  private Path workDir = new Path(System.getProperty("test.tmp.dir"));
+  private Configuration conf;
+  private FileSystem fs;
+  private Path testFilePath;
+
+  @BeforeEach
+  public void openFileSystem() throws Exception {
+    conf = new Configuration();
+    fs = FileSystem.getLocal(conf);
+    fs.setWorkingDirectory(workDir);
+    testFilePath = new Path("TestCheckTool.testCheckTool.orc");
+    fs.delete(testFilePath, false);
+    createFile();
+  }
+
+  private void createFile() throws IOException {
+    TypeDescription schema = 
TypeDescription.fromString("struct<x:int,y:string,z:string>");
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .bloomFilterColumns("x,y")
+            .rowIndexStride(5000)
+            .setSchema(schema));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    LongColumnVector x = (LongColumnVector) batch.cols[0];
+    BytesColumnVector y = (BytesColumnVector) batch.cols[1];
+    BytesColumnVector z = (BytesColumnVector) batch.cols[2];
+    for (int r = 0; r < 10000; ++r) {
+      int row = batch.size++;
+      x.vector[row] = r;
+      byte[] yBuffer = ("y-byte-" + r).getBytes();
+      byte[] zBuffer = ("z-byte-" + r).getBytes();
+      y.setRef(row, yBuffer, 0, yBuffer.length);
+      z.setRef(row, zBuffer, 0, zBuffer.length);
+      if (batch.size == batch.getMaxSize()) {
+        writer.addRowBatch(batch);
+        batch.reset();
+      }
+    }
+    if (batch.size != 0) {
+      writer.addRowBatch(batch);
+    }
+    writer.close();
+  }
+
+  @Test
+  public void testPredicate() throws Exception {
+    PrintStream origOut = System.out;
+    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+    System.setOut(new PrintStream(myOut, false, StandardCharsets.UTF_8));
+
+    CheckTool.main(conf, new String[]{
+        "--type", "predicate",
+        "--values", "0", "--values", "5566",
+        "--column", "x",
+        testFilePath.toString()});
+
+    CheckTool.main(conf, new String[]{
+        "--type", "predicate",
+        "--values", "y-byte-1234", "--values", "y-byte-5566",
+        "--column", "y",
+        testFilePath.toString()});
+
+    CheckTool.main(conf, new String[]{
+        "--type", "predicate",
+        "--values", "z-byte-1234", "--values", "z-byte-5566",
+        "--column", "z",
+        testFilePath.toString()});
+
+    System.out.flush();
+    System.setOut(origOut);
+    String output = myOut.toString(StandardCharsets.UTF_8);
+
+    assertTrue(output.contains("stripe: 0, rowIndex: 0, value: 0, test value: 
YES_NO"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 0, value: 5566, test 
value: NO"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 1, value: 0, test value: 
NO"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 1, value: 5566, test 
value: YES_NO"));
+
+    assertTrue(output.contains("stripe: 0, rowIndex: 0, value: y-byte-1234, 
test value: YES_NO"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 0, value: y-byte-5566, 
test value: NO"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 1, value: y-byte-1234, 
test value: NO"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 1, value: y-byte-5566, 
test value: YES_NO"));
+
+    assertTrue(output.contains("stripe: 0, rowIndex: 0, value: z-byte-1234, 
test value: YES_NO"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 0, value: z-byte-5566, 
test value: YES_NO"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 1, value: z-byte-1234, 
test value: NO"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 1, value: z-byte-5566, 
test value: YES_NO"));
+  }
+
+  @Test
+  public void testStatistics() throws Exception {
+    PrintStream origOut = System.out;
+    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+    System.setOut(new PrintStream(myOut, false, StandardCharsets.UTF_8));
+
+    CheckTool.main(conf, new String[]{
+        "--type", "stat",
+        "--values", "0", "--values", "5566",
+        "--column", "x",
+        testFilePath.toString()});
+
+    CheckTool.main(conf, new String[]{
+        "--type", "stat",
+        "--values", "y-byte-1234", "--values", "y-byte-5566",
+        "--column", "y",
+        testFilePath.toString()});
+
+    CheckTool.main(conf, new String[]{
+        "--type", "stat",
+        "--values", "z-byte-1234", "--values", "z-byte-5566",
+        "--column", "z",
+        testFilePath.toString()});
+
+    System.out.flush();
+    System.setOut(origOut);
+    String output = myOut.toString(StandardCharsets.UTF_8);
+
+    assertTrue(output.contains("stripe: 0, rowIndex: 0, value: 0, test value: 
YES_NO"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 0, value: 5566, test 
value: NO"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 1, value: 0, test value: 
NO"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 1, value: 5566, test 
value: YES_NO"));
+
+    assertTrue(output.contains("stripe: 0, rowIndex: 0, value: y-byte-1234, 
test value: YES_NO"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 0, value: y-byte-5566, 
test value: YES_NO"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 1, value: y-byte-1234, 
test value: NO"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 1, value: y-byte-5566, 
test value: YES_NO"));
+
+    assertTrue(output.contains("stripe: 0, rowIndex: 0, value: z-byte-1234, 
test value: YES_NO"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 0, value: z-byte-5566, 
test value: YES_NO"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 1, value: z-byte-1234, 
test value: NO"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 1, value: z-byte-5566, 
test value: YES_NO"));
+  }
+
+  @Test
+  public void testBloomFilter() throws Exception {
+    PrintStream origOut = System.out;
+    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+    System.setOut(new PrintStream(myOut, false, StandardCharsets.UTF_8));
+
+    CheckTool.main(conf, new String[]{
+        "--type", "bloom-filter",
+        "--values", "0", "--values", "5566",
+        "--column", "x",
+        testFilePath.toString()});
+
+    CheckTool.main(conf, new String[]{
+        "--type", "bloom-filter",
+        "--values", "y-byte-1234", "--values", "y-byte-5566",
+        "--column", "y",
+        testFilePath.toString()});
+
+    CheckTool.main(conf, new String[]{
+        "--type", "bloom-filter",
+        "--values", "z-byte-1234", "--values", "z-byte-5566",
+        "--column", "z",
+        testFilePath.toString()});
+
+    System.out.flush();
+    System.setOut(origOut);
+
+    String output = myOut.toString(StandardCharsets.UTF_8);
+    assertTrue(output.contains("stripe: 0, rowIndex: 0, value: 0, bloom 
filter: maybe exist"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 0, value: 5566, bloom 
filter: not exist"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 1, value: 0, bloom 
filter: maybe exist"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 1, value: 5566, bloom 
filter: maybe exist"));
+
+    assertTrue(output.contains("stripe: 0, rowIndex: 0, value: y-byte-1234, 
bloom filter: maybe exist"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 0, value: y-byte-5566, 
bloom filter: not exist"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 1, value: y-byte-1234, 
bloom filter: not exist"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 1, value: y-byte-5566, 
bloom filter: maybe exist"));
+  }
+}
diff --git a/site/_docs/java-tools.md b/site/_docs/java-tools.md
index 92a876a5f..f53720113 100644
--- a/site/_docs/java-tools.md
+++ b/site/_docs/java-tools.md
@@ -11,6 +11,7 @@ supports both the local file system and HDFS.
 
 The subcommands for the tools are:
 
+  * check (since ORC 2.0.1) - check the index of the specified column
   * convert (since ORC 1.4) - convert CSV/JSON/ORC files to ORC
   * count (since ORC 1.6) - recursively find *.orc and print the number of rows
   * data - print the data of an ORC file
@@ -27,6 +28,24 @@ The command line looks like:
 ~~~ shell
 % java -jar orc-tools-X.Y.Z-uber.jar <sub-command> <args>
 ~~~
+## Java Check
+
+The check command checks whether the specified values of a given column can be filtered out by the indexes of one or more ORC files.
+
+Check statistics and bloom filter index on x column.
+~~~ shell
+% java -jar orc-tools-X.Y.Z-uber.jar check --type predicate 
/path/to/example.orc --values 1234 --values 5566 --column x
+~~~
+
+Check statistics on x column.
+~~~ shell
+% java -jar orc-tools-X.Y.Z-uber.jar check --type stat /path/to/example.orc 
--values 1234 --values 5566 --column x
+~~~
+
+Check bloom filter index on x column.
+~~~ shell
+% java -jar orc-tools-X.Y.Z-uber.jar check --type bloom-filter 
/path/to/example.orc --values 1234 --values 5566 --column x
+~~~
 
 ## Java Convert
 

Reply via email to