This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 5937e7530 ORC-1667: Add `check` tool to check the index of the
specified column
5937e7530 is described below
commit 5937e7530751852c76195dfbef492fdef65f9a1a
Author: sychen <[email protected]>
AuthorDate: Fri Mar 29 10:57:10 2024 -0700
ORC-1667: Add `check` tool to check the index of the specified column
### What changes were proposed in this pull request?
This PR aims to check the index of the specified column.
We can test the filtering effect by specifying different types.
`check --type stat` - Only use column statistics.
`check --type bloom-filter` - Only use bloom filter.
`check --type predicate` - Used in combination with column statistics and
bloom filter.
### Why are the changes needed?
ORC supports specifying multiple columns to generate bloom filter indexes,
but it lacks a convenient tool to verify the effect of bloom filter.
Parquet also has similar commands.
[PARQUET-2138](https://issues.apache.org/jira/browse/PARQUET-2138): Add
ShowBloomFilterCommand to parquet-cli
### How was this patch tested?
Add UT
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #1862 from cxzl25/ORC-1667.
Authored-by: sychen <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
(cherry picked from commit 3b5b2a6286df48a4ab471aece74bc7b7947042ad)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../src/java/org/apache/orc/tools/CheckTool.java | 336 +++++++++++++++++++++
.../src/java/org/apache/orc/tools/Driver.java | 4 +
.../orc/tools/bloomfilter/TestCheckTool.java | 212 +++++++++++++
site/_docs/java-tools.md | 19 ++
4 files changed, 571 insertions(+)
diff --git a/java/tools/src/java/org/apache/orc/tools/CheckTool.java
b/java/tools/src/java/org/apache/orc/tools/CheckTool.java
new file mode 100644
index 000000000..2d90241bd
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/CheckTool.java
@@ -0,0 +1,336 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentImpl;
+import org.apache.orc.ColumnStatistics;
+import org.apache.orc.OrcFile;
+import org.apache.orc.OrcProto;
+import org.apache.orc.OrcUtils;
+import org.apache.orc.Reader;
+import org.apache.orc.StripeInformation;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.impl.ColumnStatisticsImpl;
+import org.apache.orc.impl.OrcIndex;
+import org.apache.orc.impl.RecordReaderImpl;
+import org.apache.orc.util.BloomFilter;
+import org.apache.orc.util.BloomFilterIO;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Check whether the specified column of multiple ORC files can filter the
specified value.
+ */
+public class CheckTool {
+
+ private static final String CHECK_TYPE_PREDICATE = "predicate";
+ private static final String CHECK_TYPE_STAT = "stat";
+ private static final String CHECK_TYPE_BLOOM_FILTER = "bloom-filter";
+
+ public static void main(Configuration conf, String[] args) throws Exception {
+ Options opts = createOptions();
+ CommandLine cli = new DefaultParser().parse(opts, args);
+ HelpFormatter formatter = new HelpFormatter();
+ if (cli.hasOption('h')) {
+ formatter.printHelp("check", opts);
+ return;
+ }
+
+ String type = cli.getOptionValue("type");
+ if (type == null ||
+ (!type.equals(CHECK_TYPE_PREDICATE) &&
+ !type.equals(CHECK_TYPE_STAT) &&
+ !type.equals(CHECK_TYPE_BLOOM_FILTER))) {
+ System.err.printf("type %s not support %n", type);
+ formatter.printHelp("check", opts);
+ return;
+ }
+ String column = cli.getOptionValue("column");
+ if (column == null || column.isEmpty()) {
+ System.err.println("column is null");
+ formatter.printHelp("check", opts);
+ return;
+ }
+ String[] values = cli.getOptionValues("values");
+ if (values == null || values.length == 0) {
+ System.err.println("values is null");
+ formatter.printHelp("check", opts);
+ return;
+ }
+ boolean ignoreExtension = cli.hasOption("ignoreExtension");
+
+ List<Path> inputFiles = new ArrayList<>();
+ String[] files = cli.getArgs();
+ for (String root : files) {
+ Path rootPath = new Path(root);
+ FileSystem fs = rootPath.getFileSystem(conf);
+ for (RemoteIterator<LocatedFileStatus> itr = fs.listFiles(rootPath,
true); itr.hasNext(); ) {
+ LocatedFileStatus status = itr.next();
+ if (status.isFile() && (ignoreExtension ||
status.getPath().getName().endsWith(".orc"))) {
+ inputFiles.add(status.getPath());
+ }
+ }
+ }
+ if (inputFiles.isEmpty()) {
+ System.err.println("No files found.");
+ System.exit(1);
+ }
+
+ for (Path inputFile : inputFiles) {
+ System.out.println("input file: " + inputFile);
+ FileSystem fs = inputFile.getFileSystem(conf);
+ try (Reader reader = OrcFile.createReader(inputFile,
+ OrcFile.readerOptions(conf).filesystem(fs))) {
+ RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
+ TypeDescription schema = reader.getSchema();
+ boolean[] includedColumns = OrcUtils.includeColumns(column, schema);
+ int colIndex = -1;
+ for (int i = 0; i < includedColumns.length; i++) {
+ if (includedColumns[i]) {
+ colIndex = i;
+ break;
+ }
+ }
+ if (colIndex == -1) {
+ System.err.printf("column: %s not found in file: %s%n", column,
inputFile);
+ continue;
+ }
+ int stripeIndex = -1;
+ for (StripeInformation stripe : reader.getStripes()) {
+ ++stripeIndex;
+
+ OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
+
+ OrcProto.ColumnEncoding columnEncoding = footer.getColumns(colIndex);
+ TypeDescription subtype = reader.getSchema().findSubtype(colIndex);
+ TypeDescription.Category columnCategory = subtype.getCategory();
+ OrcIndex indices = rows.readRowIndex(stripeIndex, null,
includedColumns);
+ if (type.equals(CHECK_TYPE_BLOOM_FILTER)) {
+ checkBloomFilter(inputFile, reader, indices, stripeIndex,
+ colIndex, column, columnEncoding, columnCategory, values);
+ } else {
+ checkStatOrPredicate(inputFile, reader, indices, stripeIndex,
+ colIndex, column, columnEncoding, subtype, columnCategory,
values, type);
+ }
+ }
+ }
+ }
+ }
+
+ private static void checkStatOrPredicate(Path inputFile,
+ Reader reader,
+ OrcIndex indices,
+ int stripeIndex,
+ int colIndex,
+ String column,
+ OrcProto.ColumnEncoding columnEncoding,
+ TypeDescription subtype,
+ TypeDescription.Category columnCategory,
+ String[] values,
+ String type) {
+ OrcProto.RowIndex rowGroupIndex = indices.getRowGroupIndex()[colIndex];
+ int entryCount = rowGroupIndex.getEntryCount();
+ boolean hasBloomFilter = true;
+ OrcProto.BloomFilterIndex[] bloomFilterIndices =
indices.getBloomFilterIndex();
+ OrcProto.BloomFilterIndex bloomFilterIndex = bloomFilterIndices[colIndex];
+ if (bloomFilterIndex == null ||
bloomFilterIndex.getBloomFilterList().isEmpty()) {
+ hasBloomFilter = false;
+ }
+ for (int i = 0; i < entryCount; i++) {
+ OrcProto.ColumnStatistics statistics =
rowGroupIndex.getEntry(i).getStatistics();
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(subtype,
+ statistics,
+ reader.writerUsedProlepticGregorian(),
+ reader.getConvertToProlepticGregorian());
+
+ BloomFilter bloomFilter = null;
+ if (type.equals(CHECK_TYPE_PREDICATE) && hasBloomFilter) {
+ bloomFilter = BloomFilterIO.deserialize(
+ indices.getBloomFilterKinds()[colIndex], columnEncoding,
+ reader.getWriterVersion(), columnCategory,
bloomFilterIndex.getBloomFilter(i));
+ }
+
+ for (String value : values) {
+ PredicateLeaf predicateLeaf =
createPredicateLeaf(PredicateLeaf.Operator.EQUALS,
+ getPredicateLeafType(columnCategory), column,
convert(columnCategory, value));
+ SearchArgument.TruthValue truthValue =
RecordReaderImpl.evaluatePredicate(
+ cs, predicateLeaf, bloomFilter);
+ System.out.printf("stripe: %d, rowIndex: %d, value: %s, test value:
%s%n",
+ stripeIndex, i, value, truthValue);
+ }
+ }
+ }
+
+ private static void checkBloomFilter(Path inputFile,
+ Reader reader,
+ OrcIndex indices,
+ int stripeIndex,
+ int colIndex,
+ String column,
+ OrcProto.ColumnEncoding columnEncoding,
+ TypeDescription.Category columnCategory,
+ String[] values) {
+ OrcProto.BloomFilterIndex[] bloomFilterIndices =
indices.getBloomFilterIndex();
+ OrcProto.BloomFilterIndex bloomFilterIndex = bloomFilterIndices[colIndex];
+ if (bloomFilterIndex == null ||
bloomFilterIndex.getBloomFilterList().isEmpty()) {
+ System.err.printf("The bloom filter index for column: %s is not found in
file: %s%n",
+ column, inputFile);
+ return;
+ }
+ List<OrcProto.BloomFilter> bloomFilterList =
bloomFilterIndex.getBloomFilterList();
+ for (int i = 0; i < bloomFilterList.size(); i++) {
+ OrcProto.BloomFilter bf = bloomFilterList.get(i);
+ org.apache.orc.util.BloomFilter bloomFilter = BloomFilterIO.deserialize(
+ indices.getBloomFilterKinds()[colIndex], columnEncoding,
+ reader.getWriterVersion(), columnCategory, bf);
+ for (String value : values) {
+ boolean testResult = test(bloomFilter, columnCategory, value);
+ if (testResult) {
+ System.out.printf("stripe: %d, rowIndex: %d, value: %s, bloom
filter: maybe exist%n",
+ stripeIndex, i, value);
+ } else {
+ System.out.printf("stripe: %d, rowIndex: %d, value: %s, bloom
filter: not exist%n",
+ stripeIndex, i, value);
+ }
+ }
+ }
+ }
+
+ private static boolean test(BloomFilter bloomFilter,
+ TypeDescription.Category columnCategory, String value) {
+ switch (columnCategory){
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ case DATE:
+ case TIMESTAMP:
+ return bloomFilter.testLong(Long.parseLong(value));
+ case FLOAT:
+ case DOUBLE:
+ return bloomFilter.testDouble(Double.parseDouble(value));
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ case DECIMAL:
+ return bloomFilter.testString(value);
+ default:
+ throw new IllegalStateException("Not supported type:" +
columnCategory);
+ }
+ }
+
+ private static Object convert(
+ TypeDescription.Category columnCategory, String value) {
+ switch (columnCategory) {
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ case DATE:
+ case TIMESTAMP:
+ return Long.parseLong(value);
+ case FLOAT:
+ case DOUBLE:
+ return Double.parseDouble(value);
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ case DECIMAL:
+ return value;
+ default:
+ throw new IllegalStateException("Not supported type:" +
columnCategory);
+ }
+ }
+
+ private static PredicateLeaf.Type
getPredicateLeafType(TypeDescription.Category columnCategory) {
+ switch (columnCategory){
+ case BOOLEAN:
+ return PredicateLeaf.Type.BOOLEAN;
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ return PredicateLeaf.Type.LONG;
+ case DATE:
+ return PredicateLeaf.Type.DATE;
+ case TIMESTAMP:
+ return PredicateLeaf.Type.TIMESTAMP;
+ case FLOAT:
+ case DOUBLE:
+ return PredicateLeaf.Type.FLOAT;
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ case DECIMAL:
+ return PredicateLeaf.Type.STRING;
+ default:
+ throw new IllegalStateException("Not supported type:" +
columnCategory);
+ }
+ }
+
+ private static PredicateLeaf createPredicateLeaf(PredicateLeaf.Operator
operator,
+ PredicateLeaf.Type type,
+ String columnName,
+ Object literal) {
+ return new SearchArgumentImpl.PredicateLeafImpl(operator, type, columnName,
+ literal, null);
+ }
+
+ private static Options createOptions() {
+ Options result = new Options();
+
+ result.addOption(Option.builder("t")
+ .longOpt("type")
+ .desc(String.format("check type = {%s, %s, %s}",
+ CHECK_TYPE_PREDICATE, CHECK_TYPE_STAT, CHECK_TYPE_BLOOM_FILTER))
+ .hasArg()
+ .build());
+
+ result.addOption(Option.builder("col")
+ .longOpt("column")
+ .desc("column name")
+ .hasArg()
+ .build());
+
+ result.addOption(Option.builder("v")
+ .longOpt("values")
+ .desc("test values")
+ .hasArgs()
+ .build());
+
+ result.addOption(Option.builder("h")
+ .longOpt("help")
+ .desc("print help message")
+ .build());
+ return result;
+ }
+}
diff --git a/java/tools/src/java/org/apache/orc/tools/Driver.java
b/java/tools/src/java/org/apache/orc/tools/Driver.java
index 5b993c2e9..0d2778b41 100644
--- a/java/tools/src/java/org/apache/orc/tools/Driver.java
+++ b/java/tools/src/java/org/apache/orc/tools/Driver.java
@@ -86,6 +86,7 @@ public class Driver {
" [--define X=Y] <command> <args>");
System.err.println();
System.err.println("Commands:");
+ System.err.println(" check - check the index of the specified column");
System.err.println(" convert - convert CSV/JSON/ORC files to ORC");
System.err.println(" count - recursively find *.orc and print the
number of rows");
System.err.println(" data - print the data from the ORC file");
@@ -106,6 +107,9 @@ public class Driver {
conf.set(pair.getKey().toString(), pair.getValue().toString());
}
switch (options.command) {
+ case "check":
+ CheckTool.main(conf, options.commandArgs);
+ break;
case "convert":
ConvertTool.main(conf, options.commandArgs);
break;
diff --git
a/java/tools/src/test/org/apache/orc/tools/bloomfilter/TestCheckTool.java
b/java/tools/src/test/org/apache/orc/tools/bloomfilter/TestCheckTool.java
new file mode 100644
index 000000000..ada80a569
--- /dev/null
+++ b/java/tools/src/test/org/apache/orc/tools/bloomfilter/TestCheckTool.java
@@ -0,0 +1,212 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.bloomfilter;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.OrcFile;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+import org.apache.orc.tools.CheckTool;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class TestCheckTool {
+ private Path workDir = new Path(System.getProperty("test.tmp.dir"));
+ private Configuration conf;
+ private FileSystem fs;
+ private Path testFilePath;
+
+ @BeforeEach
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ fs.setWorkingDirectory(workDir);
+ testFilePath = new Path("TestCheckTool.testCheckTool.orc");
+ fs.delete(testFilePath, false);
+ createFile();
+ }
+
+ private void createFile() throws IOException {
+ TypeDescription schema =
TypeDescription.fromString("struct<x:int,y:string,z:string>");
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .bloomFilterColumns("x,y")
+ .rowIndexStride(5000)
+ .setSchema(schema));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ LongColumnVector x = (LongColumnVector) batch.cols[0];
+ BytesColumnVector y = (BytesColumnVector) batch.cols[1];
+ BytesColumnVector z = (BytesColumnVector) batch.cols[2];
+ for (int r = 0; r < 10000; ++r) {
+ int row = batch.size++;
+ x.vector[row] = r;
+ byte[] yBuffer = ("y-byte-" + r).getBytes();
+ byte[] zBuffer = ("z-byte-" + r).getBytes();
+ y.setRef(row, yBuffer, 0, yBuffer.length);
+ z.setRef(row, zBuffer, 0, zBuffer.length);
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ }
+ if (batch.size != 0) {
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ }
+
+ @Test
+ public void testPredicate() throws Exception {
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(myOut, false, StandardCharsets.UTF_8));
+
+ CheckTool.main(conf, new String[]{
+ "--type", "predicate",
+ "--values", "0", "--values", "5566",
+ "--column", "x",
+ testFilePath.toString()});
+
+ CheckTool.main(conf, new String[]{
+ "--type", "predicate",
+ "--values", "y-byte-1234", "--values", "y-byte-5566",
+ "--column", "y",
+ testFilePath.toString()});
+
+ CheckTool.main(conf, new String[]{
+ "--type", "predicate",
+ "--values", "z-byte-1234", "--values", "z-byte-5566",
+ "--column", "z",
+ testFilePath.toString()});
+
+ System.out.flush();
+ System.setOut(origOut);
+ String output = myOut.toString(StandardCharsets.UTF_8);
+
+ assertTrue(output.contains("stripe: 0, rowIndex: 0, value: 0, test value:
YES_NO"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 0, value: 5566, test
value: NO"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 1, value: 0, test value:
NO"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 1, value: 5566, test
value: YES_NO"));
+
+ assertTrue(output.contains("stripe: 0, rowIndex: 0, value: y-byte-1234,
test value: YES_NO"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 0, value: y-byte-5566,
test value: NO"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 1, value: y-byte-1234,
test value: NO"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 1, value: y-byte-5566,
test value: YES_NO"));
+
+ assertTrue(output.contains("stripe: 0, rowIndex: 0, value: z-byte-1234,
test value: YES_NO"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 0, value: z-byte-5566,
test value: YES_NO"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 1, value: z-byte-1234,
test value: NO"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 1, value: z-byte-5566,
test value: YES_NO"));
+ }
+
+ @Test
+ public void testStatistics() throws Exception {
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(myOut, false, StandardCharsets.UTF_8));
+
+ CheckTool.main(conf, new String[]{
+ "--type", "stat",
+ "--values", "0", "--values", "5566",
+ "--column", "x",
+ testFilePath.toString()});
+
+ CheckTool.main(conf, new String[]{
+ "--type", "stat",
+ "--values", "y-byte-1234", "--values", "y-byte-5566",
+ "--column", "y",
+ testFilePath.toString()});
+
+ CheckTool.main(conf, new String[]{
+ "--type", "stat",
+ "--values", "z-byte-1234", "--values", "z-byte-5566",
+ "--column", "z",
+ testFilePath.toString()});
+
+ System.out.flush();
+ System.setOut(origOut);
+ String output = myOut.toString(StandardCharsets.UTF_8);
+
+ assertTrue(output.contains("stripe: 0, rowIndex: 0, value: 0, test value:
YES_NO"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 0, value: 5566, test
value: NO"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 1, value: 0, test value:
NO"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 1, value: 5566, test
value: YES_NO"));
+
+ assertTrue(output.contains("stripe: 0, rowIndex: 0, value: y-byte-1234,
test value: YES_NO"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 0, value: y-byte-5566,
test value: YES_NO"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 1, value: y-byte-1234,
test value: NO"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 1, value: y-byte-5566,
test value: YES_NO"));
+
+ assertTrue(output.contains("stripe: 0, rowIndex: 0, value: z-byte-1234,
test value: YES_NO"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 0, value: z-byte-5566,
test value: YES_NO"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 1, value: z-byte-1234,
test value: NO"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 1, value: z-byte-5566,
test value: YES_NO"));
+ }
+
+ @Test
+ public void testBloomFilter() throws Exception {
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(myOut, false, StandardCharsets.UTF_8));
+
+ CheckTool.main(conf, new String[]{
+ "--type", "bloom-filter",
+ "--values", "0", "--values", "5566",
+ "--column", "x",
+ testFilePath.toString()});
+
+ CheckTool.main(conf, new String[]{
+ "--type", "bloom-filter",
+ "--values", "y-byte-1234", "--values", "y-byte-5566",
+ "--column", "y",
+ testFilePath.toString()});
+
+ CheckTool.main(conf, new String[]{
+ "--type", "bloom-filter",
+ "--values", "z-byte-1234", "--values", "z-byte-5566",
+ "--column", "z",
+ testFilePath.toString()});
+
+ System.out.flush();
+ System.setOut(origOut);
+
+ String output = myOut.toString(StandardCharsets.UTF_8);
+ assertTrue(output.contains("stripe: 0, rowIndex: 0, value: 0, bloom
filter: maybe exist"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 0, value: 5566, bloom
filter: not exist"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 1, value: 0, bloom
filter: maybe exist"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 1, value: 5566, bloom
filter: maybe exist"));
+
+ assertTrue(output.contains("stripe: 0, rowIndex: 0, value: y-byte-1234,
bloom filter: maybe exist"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 0, value: y-byte-5566,
bloom filter: not exist"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 1, value: y-byte-1234,
bloom filter: not exist"));
+ assertTrue(output.contains("stripe: 0, rowIndex: 1, value: y-byte-5566,
bloom filter: maybe exist"));
+ }
+}
diff --git a/site/_docs/java-tools.md b/site/_docs/java-tools.md
index 92a876a5f..f53720113 100644
--- a/site/_docs/java-tools.md
+++ b/site/_docs/java-tools.md
@@ -11,6 +11,7 @@ supports both the local file system and HDFS.
The subcommands for the tools are:
+ * check (since ORC 2.0.1) - check the index of the specified column
* convert (since ORC 1.4) - convert CSV/JSON/ORC files to ORC
* count (since ORC 1.6) - recursively find *.orc and print the number of rows
* data - print the data of an ORC file
@@ -27,6 +28,24 @@ The command line looks like:
~~~ shell
% java -jar orc-tools-X.Y.Z-uber.jar <sub-command> <args>
~~~
+## Java Check
+
+The check command checks whether the given values of a specified column can be filtered by the indexes (column statistics and bloom filters) of one or more ORC files.
+
+Check statistics and bloom filter index on x column.
+~~~ shell
+% java -jar orc-tools-X.Y.Z-uber.jar check --type predicate
/path/to/example.orc --values 1234 --values 5566 --column x
+~~~
+
+Check statistics on x column.
+~~~ shell
+% java -jar orc-tools-X.Y.Z-uber.jar check --type stat /path/to/example.orc
--values 1234 --values 5566 --column x
+~~~
+
+Check bloom filter index on x column.
+~~~ shell
+% java -jar orc-tools-X.Y.Z-uber.jar check --type bloom-filter
/path/to/example.orc --values 1234 --values 5566 --column x
+~~~
## Java Convert