ORC-1 Import of ORC code from Hive. (omalley reviewed by prasanthj) Fixes #23
Signed-off-by: Owen O'Malley <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/3283d238 Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/3283d238 Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/3283d238 Branch: refs/heads/master Commit: 3283d2381c62c446f7faf6d34513cbc627c588c8 Parents: 3c30fe8 Author: Owen O'Malley <[email protected]> Authored: Mon Apr 11 21:08:39 2016 -0700 Committer: Owen O'Malley <[email protected]> Committed: Fri May 13 12:49:52 2016 -0700 ---------------------------------------------------------------------- .gitignore | 2 + java/core/pom.xml | 179 ++ .../org/apache/orc/BinaryColumnStatistics.java | 27 + .../src/java/org/apache/orc/BloomFilterIO.java | 43 + .../org/apache/orc/BooleanColumnStatistics.java | 29 + .../java/org/apache/orc/ColumnStatistics.java | 36 + .../java/org/apache/orc/CompressionCodec.java | 69 + .../java/org/apache/orc/CompressionKind.java | 27 + .../src/java/org/apache/orc/DataReader.java | 76 + .../org/apache/orc/DateColumnStatistics.java | 39 + .../org/apache/orc/DecimalColumnStatistics.java | 46 + .../org/apache/orc/DoubleColumnStatistics.java | 46 + .../org/apache/orc/FileFormatException.java | 30 + .../src/java/org/apache/orc/FileMetaInfo.java | 64 + .../src/java/org/apache/orc/FileMetadata.java | 64 + .../org/apache/orc/IntegerColumnStatistics.java | 52 + java/core/src/java/org/apache/orc/OrcConf.java | 205 ++ java/core/src/java/org/apache/orc/OrcFile.java | 562 ++++ java/core/src/java/org/apache/orc/OrcUtils.java | 530 ++++ java/core/src/java/org/apache/orc/Reader.java | 368 +++ .../src/java/org/apache/orc/RecordReader.java | 64 + .../org/apache/orc/StringColumnStatistics.java | 43 + .../java/org/apache/orc/StripeInformation.java | 59 + .../java/org/apache/orc/StripeStatistics.java | 44 + .../apache/orc/TimestampColumnStatistics.java | 38 + .../java/org/apache/orc/TypeDescription.java | 791 +++++ 
java/core/src/java/org/apache/orc/Writer.java | 114 + .../src/java/org/apache/orc/impl/AcidStats.java | 60 + .../org/apache/orc/impl/BitFieldReader.java | 217 ++ .../org/apache/orc/impl/BitFieldWriter.java | 73 + .../java/org/apache/orc/impl/BufferChunk.java | 85 + .../apache/orc/impl/ColumnStatisticsImpl.java | 1097 +++++++ .../orc/impl/ConvertTreeReaderFactory.java | 2840 +++++++++++++++++ .../apache/orc/impl/DataReaderProperties.java | 107 + .../orc/impl/DirectDecompressionCodec.java | 28 + .../org/apache/orc/impl/DynamicByteArray.java | 303 ++ .../org/apache/orc/impl/DynamicIntArray.java | 142 + .../java/org/apache/orc/impl/HadoopShims.java | 143 + .../org/apache/orc/impl/HadoopShimsCurrent.java | 92 + .../org/apache/orc/impl/HadoopShims_2_2.java | 101 + .../src/java/org/apache/orc/impl/InStream.java | 498 +++ .../java/org/apache/orc/impl/IntegerReader.java | 82 + .../java/org/apache/orc/impl/IntegerWriter.java | 47 + .../java/org/apache/orc/impl/MemoryManager.java | 214 ++ .../java/org/apache/orc/impl/OrcAcidUtils.java | 85 + .../src/java/org/apache/orc/impl/OrcIndex.java | 43 + .../src/java/org/apache/orc/impl/OutStream.java | 289 ++ .../org/apache/orc/impl/PositionProvider.java | 26 + .../org/apache/orc/impl/PositionRecorder.java | 25 + .../apache/orc/impl/PositionedOutputStream.java | 39 + .../java/org/apache/orc/impl/ReaderImpl.java | 758 +++++ .../org/apache/orc/impl/RecordReaderImpl.java | 1215 ++++++++ .../org/apache/orc/impl/RecordReaderUtils.java | 578 ++++ .../java/org/apache/orc/impl/RedBlackTree.java | 311 ++ .../apache/orc/impl/RunLengthByteReader.java | 174 ++ .../apache/orc/impl/RunLengthByteWriter.java | 106 + .../apache/orc/impl/RunLengthIntegerReader.java | 173 ++ .../orc/impl/RunLengthIntegerReaderV2.java | 406 +++ .../apache/orc/impl/RunLengthIntegerWriter.java | 143 + .../orc/impl/RunLengthIntegerWriterV2.java | 831 +++++ .../org/apache/orc/impl/SchemaEvolution.java | 190 ++ .../org/apache/orc/impl/SerializationUtils.java | 1311 ++++++++ 
.../orc/impl/SettableUncompressedStream.java | 44 + .../java/org/apache/orc/impl/SnappyCodec.java | 108 + .../java/org/apache/orc/impl/StreamName.java | 97 + .../org/apache/orc/impl/StringRedBlackTree.java | 210 ++ .../org/apache/orc/impl/TreeReaderFactory.java | 2093 +++++++++++++ .../java/org/apache/orc/impl/WriterImpl.java | 2916 ++++++++++++++++++ .../java/org/apache/orc/impl/ZeroCopyShims.java | 89 + .../src/java/org/apache/orc/impl/ZlibCodec.java | 169 + .../src/java/org/apache/orc/tools/FileDump.java | 934 ++++++ .../java/org/apache/orc/tools/JsonFileDump.java | 406 +++ .../org/apache/orc/TestColumnStatistics.java | 364 +++ .../org/apache/orc/TestNewIntegerEncoding.java | 1373 +++++++++ .../org/apache/orc/TestOrcNullOptimization.java | 415 +++ .../test/org/apache/orc/TestOrcTimezone1.java | 189 ++ .../test/org/apache/orc/TestOrcTimezone2.java | 143 + .../org/apache/orc/TestStringDictionary.java | 290 ++ .../org/apache/orc/TestTypeDescription.java | 153 + .../org/apache/orc/TestUnrolledBitPack.java | 114 + .../test/org/apache/orc/TestVectorOrcFile.java | 2782 +++++++++++++++++ .../org/apache/orc/impl/TestBitFieldReader.java | 145 + .../test/org/apache/orc/impl/TestBitPack.java | 279 ++ .../orc/impl/TestColumnStatisticsImpl.java | 64 + .../orc/impl/TestDataReaderProperties.java | 69 + .../org/apache/orc/impl/TestDynamicArray.java | 90 + .../test/org/apache/orc/impl/TestInStream.java | 314 ++ .../orc/impl/TestIntegerCompressionReader.java | 130 + .../org/apache/orc/impl/TestMemoryManager.java | 133 + .../org/apache/orc/impl/TestOrcWideTable.java | 64 + .../test/org/apache/orc/impl/TestOutStream.java | 43 + .../src/test/org/apache/orc/impl/TestRLEv2.java | 307 ++ .../org/apache/orc/impl/TestReaderImpl.java | 152 + .../apache/orc/impl/TestRecordReaderImpl.java | 1693 ++++++++++ .../orc/impl/TestRunLengthByteReader.java | 143 + .../orc/impl/TestRunLengthIntegerReader.java | 125 + .../apache/orc/impl/TestSerializationUtils.java | 201 ++ 
.../org/apache/orc/impl/TestStreamName.java | 49 + .../apache/orc/impl/TestStringRedBlackTree.java | 234 ++ .../src/test/org/apache/orc/impl/TestZlib.java | 56 + .../test/org/apache/orc/tools/TestFileDump.java | 486 +++ .../org/apache/orc/tools/TestJsonFileDump.java | 150 + .../src/test/resources/orc-file-11-format.orc | Bin 0 -> 373336 bytes .../resources/orc-file-dump-bloomfilter.out | 179 ++ .../resources/orc-file-dump-bloomfilter2.out | 179 ++ .../orc-file-dump-dictionary-threshold.out | 190 ++ java/core/src/test/resources/orc-file-dump.json | 1355 ++++++++ java/core/src/test/resources/orc-file-dump.out | 195 ++ .../src/test/resources/orc-file-has-null.out | 112 + java/mapreduce/pom.xml | 148 + .../org/apache/orc/mapred/OrcInputFormat.java | 131 + .../src/java/org/apache/orc/mapred/OrcList.java | 74 + .../src/java/org/apache/orc/mapred/OrcMap.java | 86 + .../org/apache/orc/mapred/OrcOutputFormat.java | 65 + .../org/apache/orc/mapred/OrcRecordReader.java | 547 ++++ .../org/apache/orc/mapred/OrcRecordWriter.java | 277 ++ .../java/org/apache/orc/mapred/OrcStruct.java | 188 ++ .../org/apache/orc/mapred/OrcTimestamp.java | 64 + .../java/org/apache/orc/mapred/OrcUnion.java | 103 + .../org/apache/orc/mapred/package-info.java | 82 + .../test/org/apache/orc/mapred/TestOrcList.java | 65 + .../test/org/apache/orc/mapred/TestOrcMap.java | 57 + .../org/apache/orc/mapred/TestOrcStruct.java | 96 + .../org/apache/orc/mapred/TestOrcTimestamp.java | 41 + .../org/apache/orc/mapred/TestOrcUnion.java | 58 + .../orc/mapred/other/TestOrcOutputFormat.java | 249 ++ java/pom.xml | 110 + java/storage-api/pom.xml | 76 + .../hadoop/hive/common/DiskRangeInfo.java | 59 + .../org/apache/hadoop/hive/common/Pool.java | 33 + .../apache/hadoop/hive/common/io/Allocator.java | 53 + .../apache/hadoop/hive/common/io/DataCache.java | 100 + .../apache/hadoop/hive/common/io/DiskRange.java | 102 + .../hadoop/hive/common/io/DiskRangeList.java | 210 ++ .../common/io/encoded/EncodedColumnBatch.java | 135 + 
.../hive/common/io/encoded/MemoryBuffer.java | 28 + .../hadoop/hive/common/type/HiveDecimal.java | 332 ++ .../hive/common/type/HiveIntervalDayTime.java | 253 ++ .../hadoop/hive/common/type/RandomTypeUtil.java | 115 + .../hive/ql/exec/vector/BytesColumnVector.java | 389 +++ .../hive/ql/exec/vector/ColumnVector.java | 217 ++ .../ql/exec/vector/DecimalColumnVector.java | 156 + .../hive/ql/exec/vector/DoubleColumnVector.java | 178 ++ .../vector/IntervalDayTimeColumnVector.java | 348 +++ .../hive/ql/exec/vector/ListColumnVector.java | 119 + .../hive/ql/exec/vector/LongColumnVector.java | 224 ++ .../hive/ql/exec/vector/MapColumnVector.java | 131 + .../ql/exec/vector/MultiValuedColumnVector.java | 150 + .../hive/ql/exec/vector/StructColumnVector.java | 132 + .../ql/exec/vector/TimestampColumnVector.java | 400 +++ .../hive/ql/exec/vector/UnionColumnVector.java | 140 + .../hive/ql/exec/vector/VectorizedRowBatch.java | 218 ++ .../ql/exec/vector/expressions/StringExpr.java | 354 +++ .../hadoop/hive/ql/io/sarg/ExpressionTree.java | 160 + .../hadoop/hive/ql/io/sarg/PredicateLeaf.java | 102 + .../hadoop/hive/ql/io/sarg/SearchArgument.java | 287 ++ .../hive/ql/io/sarg/SearchArgumentFactory.java | 31 + .../hive/ql/io/sarg/SearchArgumentImpl.java | 703 +++++ .../hadoop/hive/ql/util/JavaDataModel.java | 335 ++ .../hadoop/hive/ql/util/TimestampUtils.java | 94 + .../hadoop/hive/serde2/io/DateWritable.java | 179 ++ .../hive/serde2/io/HiveDecimalWritable.java | 170 + .../apache/hive/common/util/BloomFilter.java | 309 ++ .../hive/common/util/IntervalDayTimeUtils.java | 77 + .../org/apache/hive/common/util/Murmur3.java | 335 ++ .../ql/exec/vector/TestListColumnVector.java | 200 ++ .../ql/exec/vector/TestMapColumnVector.java | 224 ++ .../ql/exec/vector/TestStructColumnVector.java | 95 + .../exec/vector/TestTimestampColumnVector.java | 117 + .../ql/exec/vector/TestUnionColumnVector.java | 93 + .../apache/hive/common/util/TestMurmur3.java | 224 ++ proto/orc_proto.proto | 5 +- 172 files 
changed, 47337 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/.gitignore ---------------------------------------------------------------------- diff --git a/.gitignore b/.gitignore index 4f54128..167ac88 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ build target *~ +*.iml +.idea .DS_Store http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/pom.xml ---------------------------------------------------------------------- diff --git a/java/core/pom.xml b/java/core/pom.xml new file mode 100644 index 0000000..85d9ae6 --- /dev/null +++ b/java/core/pom.xml @@ -0,0 +1,179 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.orc</groupId> + <artifactId>orc</artifactId> + <version>1.1.0-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + + <artifactId>orc-core</artifactId> + <packaging>jar</packaging> + <name>ORC Core</name> + + <dependencies> + <dependency> + <groupId>org.apache.hive</groupId> + <artifactId>hive-storage-api</artifactId> + <version>2.0.0.1-SNAPSHOT</version> + </dependency> + + <!-- inter-project --> + <dependency> + <groupId>com.google.protobuf</groupId> + <artifactId>protobuf-java</artifactId> + <version>${protobuf.version}</version> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-common</artifactId> + <version>${hadoop.version}</version> + <exclusions> + <exclusion> + <groupId>javax.servlet</groupId> + <artifactId>servlet-api</artifactId> + </exclusion> + <exclusion> + <groupId>javax.servlet.jsp</groupId> + <artifactId>jsp-api</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jetty</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jetty-util</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.avro</groupId> + <artifactId>avro</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-hdfs</artifactId> + <version>${hadoop.version}</version> + <exclusions> + <exclusion> + <groupId>javax.servlet</groupId> + <artifactId>servlet-api</artifactId> + </exclusion> + <exclusion> + <groupId>javax.servlet.jsp</groupId> + <artifactId>jsp-api</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jetty</artifactId> + 
</exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jetty-util</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.avro</groupId> + <artifactId>avro</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.iq80.snappy</groupId> + <artifactId>snappy</artifactId> + <version>${snappy.version}</version> + </dependency> + + <!-- test inter-project --> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>${junit.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.mockito</groupId> + <artifactId>mockito-all</artifactId> + <version>${mockito.version}</version> + <scope>test</scope> + </dependency> + </dependencies> + + <build> + <sourceDirectory>${basedir}/src/java</sourceDirectory> + <testSourceDirectory>${basedir}/src/test</testSourceDirectory> + <testResources> + <testResource> + <directory>${basedir}/src/test/resources</directory> + </testResource> + </testResources> + <plugins> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>build-helper-maven-plugin</artifactId> + <executions> + <execution> + <id>add-source</id> + <phase>generate-sources</phase> + <goals> + <goal>add-source</goal> + </goals> + <configuration> + <sources> + <source>target/generated-sources</source> + </sources> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>com.github.os72</groupId> + <artifactId>protoc-jar-maven-plugin</artifactId> + <version>3.0.0-a3</version> + <executions> + <execution> + <phase>generate-sources</phase> + <goals> + <goal>run</goal> + </goals> + <configuration> + <protocVersion>2.5.0</protocVersion> + <addSources>none</addSources> + <includeDirectories> + <include>../../proto</include> + </includeDirectories> + <inputDirectories> + <include>../../proto</include> + </inputDirectories> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + 
<groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <version>3.1</version> + <configuration> + <source>1.7</source> + <target>1.7</target> + </configuration> + </plugin> + </plugins> + </build> +</project> http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/BinaryColumnStatistics.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/BinaryColumnStatistics.java b/java/core/src/java/org/apache/orc/BinaryColumnStatistics.java new file mode 100644 index 0000000..19db98a --- /dev/null +++ b/java/core/src/java/org/apache/orc/BinaryColumnStatistics.java @@ -0,0 +1,27 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc; + +import org.apache.orc.ColumnStatistics; + +/** + * Statistics for binary columns. 
+ */ +public interface BinaryColumnStatistics extends ColumnStatistics { + long getSum(); +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/BloomFilterIO.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/BloomFilterIO.java b/java/core/src/java/org/apache/orc/BloomFilterIO.java new file mode 100644 index 0000000..1406266 --- /dev/null +++ b/java/core/src/java/org/apache/orc/BloomFilterIO.java @@ -0,0 +1,43 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.orc; + +import org.apache.hive.common.util.BloomFilter; + +import com.google.common.primitives.Longs; + +public class BloomFilterIO extends BloomFilter { + + public BloomFilterIO(long expectedEntries) { + super(expectedEntries, DEFAULT_FPP); + } + + public BloomFilterIO(long expectedEntries, double fpp) { + super(expectedEntries, fpp); + } + +/** + * Initializes the BloomFilter from the given Orc BloomFilter + */ + public BloomFilterIO(OrcProto.BloomFilter bloomFilter) { + this.bitSet = new BitSet(Longs.toArray(bloomFilter.getBitsetList())); + this.numHashFunctions = bloomFilter.getNumHashFunctions(); + this.numBits = (int) this.bitSet.bitSize(); + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/BooleanColumnStatistics.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/BooleanColumnStatistics.java b/java/core/src/java/org/apache/orc/BooleanColumnStatistics.java new file mode 100644 index 0000000..af08f06 --- /dev/null +++ b/java/core/src/java/org/apache/orc/BooleanColumnStatistics.java @@ -0,0 +1,29 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.orc; + +import org.apache.orc.ColumnStatistics; + +/** + * Statistics for boolean columns. + */ +public interface BooleanColumnStatistics extends ColumnStatistics { + long getFalseCount(); + + long getTrueCount(); +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/ColumnStatistics.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/ColumnStatistics.java b/java/core/src/java/org/apache/orc/ColumnStatistics.java new file mode 100644 index 0000000..72d8fbf --- /dev/null +++ b/java/core/src/java/org/apache/orc/ColumnStatistics.java @@ -0,0 +1,36 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc; + +/** + * Statistics that are available for all types of columns. + */ +public interface ColumnStatistics { + /** + * Get the number of values in this column. It will differ from the number + * of rows because of NULL values and repeated values. + * @return the number of values + */ + long getNumberOfValues(); + + /** + * Returns true if there are nulls in the scope of column statistics. 
+ * @return true if null present else false + */ + boolean hasNull(); +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/CompressionCodec.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/CompressionCodec.java b/java/core/src/java/org/apache/orc/CompressionCodec.java new file mode 100644 index 0000000..3421969 --- /dev/null +++ b/java/core/src/java/org/apache/orc/CompressionCodec.java @@ -0,0 +1,69 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.EnumSet; + +import javax.annotation.Nullable; + +public interface CompressionCodec { + + enum Modifier { + /* speed/compression tradeoffs */ + FASTEST, + FAST, + DEFAULT, + /* data sensitivity modifiers */ + TEXT, + BINARY + }; + + /** + * Compress the in buffer to the out buffer. 
+ * @param in the bytes to compress + * @param out the uncompressed bytes + * @param overflow put any additional bytes here + * @return true if the output is smaller than input + * @throws IOException + */ + boolean compress(ByteBuffer in, ByteBuffer out, ByteBuffer overflow + ) throws IOException; + + /** + * Decompress the in buffer to the out buffer. + * @param in the bytes to decompress + * @param out the decompressed bytes + * @throws IOException + */ + void decompress(ByteBuffer in, ByteBuffer out) throws IOException; + + /** + * Produce a modified compression codec if the underlying algorithm allows + * modification. + * + * This does not modify the current object, but returns a new object if + * modifications are possible. Returns the same object if no modifications + * are possible. + * @param modifiers compression modifiers + * @return codec for use after optional modification + */ + CompressionCodec modify(@Nullable EnumSet<Modifier> modifiers); + +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/CompressionKind.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/CompressionKind.java b/java/core/src/java/org/apache/orc/CompressionKind.java new file mode 100644 index 0000000..f684bef --- /dev/null +++ b/java/core/src/java/org/apache/orc/CompressionKind.java @@ -0,0 +1,27 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc; + +/** + * An enumeration that lists the generic compression algorithms that + * can be applied to ORC files. + */ +public enum CompressionKind { + NONE, ZLIB, SNAPPY, LZO +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/DataReader.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/DataReader.java b/java/core/src/java/org/apache/orc/DataReader.java new file mode 100644 index 0000000..a5dbb76 --- /dev/null +++ b/java/core/src/java/org/apache/orc/DataReader.java @@ -0,0 +1,76 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.orc; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import org.apache.hadoop.hive.common.io.DiskRangeList; +import org.apache.orc.impl.OrcIndex; + +/** An abstract data reader that IO formats can use to read bytes from underlying storage. */ +public interface DataReader extends AutoCloseable { + + /** Opens the DataReader, making it ready to use. */ + void open() throws IOException; + + OrcIndex readRowIndex(StripeInformation stripe, + OrcProto.StripeFooter footer, + boolean[] included, OrcProto.RowIndex[] indexes, + boolean[] sargColumns, + OrcProto.BloomFilterIndex[] bloomFilterIndices + ) throws IOException; + + OrcProto.StripeFooter readStripeFooter(StripeInformation stripe) throws IOException; + + /** Reads the data. + * + * Note that for the cases such as zero-copy read, caller must release the disk ranges + * produced after being done with them. Call isTrackingDiskRanges to find out if this is needed. + * @param range List if disk ranges to read. Ranges with data will be ignored. + * @param baseOffset Base offset from the start of the file of the ranges in disk range list. + * @param doForceDirect Whether the data should be read into direct buffers. + * @return New or modified list of DiskRange-s, where all the ranges are filled with data. + */ + DiskRangeList readFileData( + DiskRangeList range, long baseOffset, boolean doForceDirect) throws IOException; + + + /** + * Whether the user should release buffers created by readFileData. See readFileData javadoc. + */ + boolean isTrackingDiskRanges(); + + /** + * Releases buffers created by readFileData. See readFileData javadoc. + * @param toRelease The buffer to release. + */ + void releaseBuffer(ByteBuffer toRelease); + + /** + * Clone the entire state of the DataReader with the assumption that the + * clone will be closed at a different time. Thus, any file handles in the + * implementation need to be cloned. 
+ * @return a new instance + */ + DataReader clone(); + + @Override + public void close() throws IOException; +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/DateColumnStatistics.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/DateColumnStatistics.java b/java/core/src/java/org/apache/orc/DateColumnStatistics.java new file mode 100644 index 0000000..cdd01af --- /dev/null +++ b/java/core/src/java/org/apache/orc/DateColumnStatistics.java @@ -0,0 +1,39 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc; + +import org.apache.orc.ColumnStatistics; + +import java.util.Date; + +/** + * Statistics for DATE columns. + */ +public interface DateColumnStatistics extends ColumnStatistics { + /** + * Get the minimum value for the column. + * @return minimum value + */ + Date getMinimum(); + + /** + * Get the maximum value for the column. 
+ * @return maximum value + */ + Date getMaximum(); +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/DecimalColumnStatistics.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/DecimalColumnStatistics.java b/java/core/src/java/org/apache/orc/DecimalColumnStatistics.java new file mode 100644 index 0000000..51b6d7d --- /dev/null +++ b/java/core/src/java/org/apache/orc/DecimalColumnStatistics.java @@ -0,0 +1,46 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc; + +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.orc.ColumnStatistics; + +/** + * Statistics for decimal columns. + */ +public interface DecimalColumnStatistics extends ColumnStatistics { + + /** + * Get the minimum value for the column. + * @return the minimum value + */ + HiveDecimal getMinimum(); + + /** + * Get the maximum value for the column. + * @return the maximum value + */ + HiveDecimal getMaximum(); + + /** + * Get the sum of the values of the column. 
+ * @return the sum + */ + HiveDecimal getSum(); + +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/DoubleColumnStatistics.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/DoubleColumnStatistics.java b/java/core/src/java/org/apache/orc/DoubleColumnStatistics.java new file mode 100644 index 0000000..00c728f --- /dev/null +++ b/java/core/src/java/org/apache/orc/DoubleColumnStatistics.java @@ -0,0 +1,46 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc; + +import org.apache.orc.ColumnStatistics; + +/** + * Statistics for float and double columns. + */ +public interface DoubleColumnStatistics extends ColumnStatistics { + + /** + * Get the smallest value in the column. Only defined if getNumberOfValues + * is non-zero. + * @return the minimum + */ + double getMinimum(); + + /** + * Get the largest value in the column. Only defined if getNumberOfValues + * is non-zero. + * @return the maximum + */ + double getMaximum(); + + /** + * Get the sum of the values in the column. 
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.orc;

import java.io.IOException;

/**
 * Thrown when an invalid file format is encountered.
 */
public class FileFormatException extends IOException {

  // IOException is Serializable; declare an explicit serialVersionUID so the
  // serialized form does not change with unrelated recompilation.
  private static final long serialVersionUID = 1L;

  /**
   * @param errMsg description of the format violation
   */
  public FileFormatException(String errMsg) {
    super(errMsg);
  }
}
package org.apache.orc;

import java.nio.ByteBuffer;
import java.util.List;

/**
 * FileMetaInfo - represents file metadata stored in footer and postscript
 * sections of the file that is useful for Reader implementation.
 */
public class FileMetaInfo {
  // Raw bytes of footer + metadata + postscript; may be null when the info
  // was built from a split (see the short constructor below).
  public ByteBuffer footerMetaAndPsBuffer;
  public final String compressionType;
  public final int bufferSize;
  public final int metadataSize;
  public final ByteBuffer footerBuffer;
  public final List<Integer> versionList;
  public final OrcFile.WriterVersion writerVersion;

  /** Ctor used when reading splits - no version list or full footer buffer. */
  public FileMetaInfo(String compressionType, int bufferSize, int metadataSize,
                      ByteBuffer footerBuffer,
                      OrcFile.WriterVersion writerVersion) {
    // Delegate with the split-specific pieces absent.
    this(compressionType, bufferSize, metadataSize, footerBuffer, null,
        writerVersion, null);
  }

  /** Ctor used when creating file info during init and when getting a new one. */
  public FileMetaInfo(String compressionType, int bufferSize, int metadataSize,
                      ByteBuffer footerBuffer, List<Integer> versionList,
                      OrcFile.WriterVersion writerVersion,
                      ByteBuffer fullFooterBuffer) {
    this.footerMetaAndPsBuffer = fullFooterBuffer;
    this.writerVersion = writerVersion;
    this.versionList = versionList;
    this.footerBuffer = footerBuffer;
    this.metadataSize = metadataSize;
    this.bufferSize = bufferSize;
    this.compressionType = compressionType;
  }

  /** @return the version of the writer that produced the file */
  public OrcFile.WriterVersion getWriterVersion() {
    return writerVersion;
  }

}
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.orc;

import java.util.List;

// NOTE: the redundant imports of CompressionKind, OrcProto, and
// StripeInformation were removed; they live in this package.

/**
 * Cached file metadata. Right now, it caches everything; we don't have to store all the
 * protobuf structs actually, we could just store what we need, but that would require that
 * ORC stop depending on them too. Luckily, they shouldn't be very big.
 */
public interface FileMetadata {
  boolean isOriginalFormat();

  List<StripeInformation> getStripes();

  CompressionKind getCompressionKind();

  int getCompressionBufferSize();

  int getRowIndexStride();

  int getColumnCount();

  int getFlattenedColumnCount();

  /** Opaque key identifying the file (e.g. for caching). */
  Object getFileKey();

  List<Integer> getVersionList();

  int getMetadataSize();

  int getWriterVersionNum();

  List<OrcProto.Type> getTypes();

  List<OrcProto.StripeStatistics> getStripeStats();

  long getContentLength();

  long getNumberOfRows();

  List<OrcProto.ColumnStatistics> getFileStats();
}
+ */ +package org.apache.orc; + +import org.apache.orc.ColumnStatistics; + +/** + * Statistics for all of the integer columns, such as byte, short, int, and + * long. + */ +public interface IntegerColumnStatistics extends ColumnStatistics { + /** + * Get the smallest value in the column. Only defined if getNumberOfValues + * is non-zero. + * @return the minimum + */ + long getMinimum(); + + /** + * Get the largest value in the column. Only defined if getNumberOfValues + * is non-zero. + * @return the maximum + */ + long getMaximum(); + + /** + * Is the sum defined? If the sum overflowed the counter this will be false. + * @return is the sum available + */ + boolean isSumDefined(); + + /** + * Get the sum of the column. Only valid if isSumDefined returns true. + * @return the sum of the column + */ + long getSum(); +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/OrcConf.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/OrcConf.java b/java/core/src/java/org/apache/orc/OrcConf.java new file mode 100644 index 0000000..df1b410 --- /dev/null +++ b/java/core/src/java/org/apache/orc/OrcConf.java @@ -0,0 +1,205 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
package org.apache.orc;

import org.apache.hadoop.conf.Configuration;

import java.util.Properties;

/**
 * Define the configuration properties that Orc understands.
 *
 * Each constant carries both the ORC property name and the legacy Hive
 * property name; lookups try the table properties first, then the
 * configuration under the ORC name, then under the Hive name.
 */
public enum OrcConf {
  STRIPE_SIZE("orc.stripe.size", "hive.exec.orc.default.stripe.size",
      64L * 1024 * 1024,
      "Define the default ORC stripe size, in bytes."),
  BLOCK_SIZE("orc.block.size", "hive.exec.orc.default.block.size",
      256L * 1024 * 1024,
      "Define the default file system block size for ORC files."),
  ENABLE_INDEXES("orc.create.index", "orc.create.index", true,
      "Should the ORC writer create indexes as part of the file."),
  ROW_INDEX_STRIDE("orc.row.index.stride",
      "hive.exec.orc.default.row.index.stride", 10000,
      "Define the default ORC index stride in number of rows. (Stride is the\n"+
          " number of rows an index entry represents.)"),
  BUFFER_SIZE("orc.compress.size", "hive.exec.orc.default.buffer.size",
      256 * 1024, "Define the default ORC buffer size, in bytes."),
  BLOCK_PADDING("orc.block.padding", "hive.exec.orc.default.block.padding",
      true,
      "Define whether stripes should be padded to the HDFS block boundaries."),
  COMPRESS("orc.compress", "hive.exec.orc.default.compress", "ZLIB",
      "Define the default compression codec for ORC file"),
  WRITE_FORMAT("orc.write.format", "hive.exec.orc.write.format", "0.12",
      "Define the version of the file to write. Possible values are 0.11 and\n"+
          " 0.12. If this parameter is not defined, ORC will use the run\n" +
          " length encoding (RLE) introduced in Hive 0.12."),
  ENCODING_STRATEGY("orc.encoding.strategy", "hive.exec.orc.encoding.strategy",
      "SPEED",
      "Define the encoding strategy to use while writing data. Changing this\n"+
          "will only affect the light weight encoding for integers. This\n" +
          "flag will not change the compression level of higher level\n" +
          "compression codec (like ZLIB)."),
  COMPRESSION_STRATEGY("orc.compression.strategy",
      "hive.exec.orc.compression.strategy", "SPEED",
      "Define the compression strategy to use while writing data.\n" +
          "This changes the compression level of higher level compression\n" +
          "codec (like ZLIB)."),
  BLOCK_PADDING_TOLERANCE("orc.block.padding.tolerance",
      "hive.exec.orc.block.padding.tolerance", 0.05,
      "Define the tolerance for block padding as a decimal fraction of\n" +
          "stripe size (for example, the default value 0.05 is 5% of the\n" +
          "stripe size). For the defaults of 64Mb ORC stripe and 256Mb HDFS\n" +
          "blocks, the default block padding tolerance of 5% will\n" +
          "reserve a maximum of 3.2Mb for padding within the 256Mb block.\n" +
          "In that case, if the available size within the block is more than\n"+
          "3.2Mb, a new smaller stripe will be inserted to fit within that\n" +
          "space. This will make sure that no stripe written will cross block\n" +
          " boundaries and cause remote reads within a node local task."),
  BLOOM_FILTER_FPP("orc.bloom.filter.fpp", "orc.default.bloom.fpp", 0.05,
      "Define the default false positive probability for bloom filters."),
  USE_ZEROCOPY("orc.use.zerocopy", "hive.exec.orc.zerocopy", false,
      "Use zerocopy reads with ORC. (This requires Hadoop 2.3 or later.)"),
  SKIP_CORRUPT_DATA("orc.skip.corrupt.data", "hive.exec.orc.skip.corrupt.data",
      false,
      "If ORC reader encounters corrupt data, this value will be used to\n" +
          "determine whether to skip the corrupt data or throw exception.\n" +
          "The default behavior is to throw exception."),
  MEMORY_POOL("orc.memory.pool", "hive.exec.orc.memory.pool", 0.5,
      "Maximum fraction of heap that can be used by ORC file writers"),
  DICTIONARY_KEY_SIZE_THRESHOLD("orc.dictionary.key.threshold",
      "hive.exec.orc.dictionary.key.size.threshold",
      0.8,
      "If the number of distinct keys in a dictionary is greater than this\n" +
          "fraction of the total number of non-null rows, turn off \n" +
          "dictionary encoding. Use 1 to always use dictionary encoding."),
  ROW_INDEX_STRIDE_DICTIONARY_CHECK("orc.dictionary.early.check",
      "hive.orc.row.index.stride.dictionary.check",
      true,
      "If enabled dictionary check will happen after first row index stride\n" +
          "(default 10000 rows) else dictionary check will happen before\n" +
          "writing first stripe. In both cases, the decision to use\n" +
          "dictionary or not will be retained thereafter."),
  BLOOM_FILTER_COLUMNS("orc.bloom.filter.columns", "orc.bloom.filter.columns",
      "", "List of columns to create bloom filters for when writing."),
  MAX_FILE_LENGTH("orc.max.file.length", "orc.max.file.length", Long.MAX_VALUE,
      "The maximum size of the file to read for finding the file tail. This\n" +
          "is primarily used for streaming ingest to read intermediate\n" +
          "footers while the file is still open"),
  SCHEMA("orc.schema", "orc.schema", null,
      "The schema that the user desires to read or write. The values are\n" +
          "interpreted using TypeDescription.fromString."),
  INCLUDE_COLUMNS("orc.include.columns", "hive.io.file.readcolumn.ids", null,
      "The list of comma separated column ids that should be read with 0\n" +
          "being the first column, 1 being the next, and so on."),
  KRYO_SARG("orc.kryo.sarg", "orc.kryo.sarg", null,
      "The kryo and base64 encoded SearchArgument for predicate pushdown."),
  // NOTE(review): the Hive-side name below starts with "org." not "orc.";
  // this looks like a typo, but it is an externally visible config key, so
  // it is left unchanged for compatibility.
  SARG_COLUMNS("orc.sarg.column.names", "org.sarg.column.names", null,
      "The list of column names for the SearchArgument.")
  ;

  // ORC property name, e.g. "orc.stripe.size".
  private final String attribute;
  // Legacy Hive property name, consulted after the ORC name.
  private final String hiveConfName;
  // Default used when neither table properties nor conf define a value.
  private final Object defaultValue;
  private final String description;

  OrcConf(String attribute,
          String hiveConfName,
          Object defaultValue,
          String description) {
    this.attribute = attribute;
    this.hiveConfName = hiveConfName;
    this.defaultValue = defaultValue;
    this.description = description;
  }

  public String getAttribute() {
    return attribute;
  }

  public String getHiveConfName() {
    return hiveConfName;
  }

  public Object getDefaultValue() {
    return defaultValue;
  }

  public String getDescription() {
    return description;
  }

  /**
   * Look up the raw string value: table properties first, then the conf
   * under the ORC name, then the conf under the Hive name. Either source
   * may be null; returns null if the property is unset everywhere.
   */
  private String lookupValue(Properties tbl, Configuration conf) {
    String result = null;
    if (tbl != null) {
      result = tbl.getProperty(attribute);
    }
    if (result == null && conf != null) {
      result = conf.get(attribute);
      if (result == null) {
        result = conf.get(hiveConfName);
      }
    }
    return result;
  }

  public long getLong(Properties tbl, Configuration conf) {
    String value = lookupValue(tbl, conf);
    if (value != null) {
      return Long.parseLong(value);
    }
    return ((Number) defaultValue).longValue();
  }

  public long getLong(Configuration conf) {
    return getLong(null, conf);
  }

  public String getString(Properties tbl, Configuration conf) {
    String value = lookupValue(tbl, conf);
    return value == null ? (String) defaultValue : value;
  }

  public String getString(Configuration conf) {
    return getString(null, conf);
  }

  public boolean getBoolean(Properties tbl, Configuration conf) {
    String value = lookupValue(tbl, conf);
    if (value != null) {
      return Boolean.parseBoolean(value);
    }
    return (Boolean) defaultValue;
  }

  public boolean getBoolean(Configuration conf) {
    return getBoolean(null, conf);
  }

  public double getDouble(Properties tbl, Configuration conf) {
    String value = lookupValue(tbl, conf);
    if (value != null) {
      return Double.parseDouble(value);
    }
    return ((Number) defaultValue).doubleValue();
  }

  public double getDouble(Configuration conf) {
    return getDouble(null, conf);
  }
}
package org.apache.orc;

import java.io.IOException;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.orc.impl.MemoryManager;
import org.apache.orc.impl.ReaderImpl;
import org.apache.orc.impl.WriterImpl;

/**
 * Contains factory methods to read or write ORC files.
 */
public class OrcFile {
  public static final String MAGIC = "ORC";

  /**
   * Create a version number for the ORC file format, so that we can add
   * non-forward compatible changes in the future. To make it easier for users
   * to understand the version numbers, we use the Hive release number that
   * first wrote that version of ORC files.
   *
   * Thus, if you add new encodings or other non-forward compatible changes
   * to ORC files, which prevent the old reader from reading the new format,
   * you should change these variable to reflect the next Hive release number.
   * Non-forward compatible changes should never be added in patch releases.
   *
   * Do not make any changes that break backwards compatibility, which would
   * prevent the new reader from reading ORC files generated by any released
   * version of Hive.
   */
  public enum Version {
    V_0_11("0.11", 0, 11),
    V_0_12("0.12", 0, 12);

    public static final Version CURRENT = V_0_12;

    private final String name;
    private final int major;
    private final int minor;

    Version(String name, int major, int minor) {
      this.name = name;
      this.major = major;
      this.minor = minor;
    }

    /**
     * Look up a version by its human readable name.
     * @throws IllegalArgumentException if the name is unknown
     */
    public static Version byName(String name) {
      for(Version version: values()) {
        if (version.name.equals(name)) {
          return version;
        }
      }
      throw new IllegalArgumentException("Unknown ORC version " + name);
    }

    /**
     * Get the human readable name for the version.
     */
    public String getName() {
      return name;
    }

    /**
     * Get the major version number.
     */
    public int getMajor() {
      return major;
    }

    /**
     * Get the minor version number.
     */
    public int getMinor() {
      return minor;
    }
  }

  /**
   * Records the version of the writer in terms of which bugs have been fixed.
   * For bugs in the writer, but the old readers already read the new data
   * correctly, bump this version instead of the Version.
   */
  public enum WriterVersion {
    ORIGINAL(0),
    HIVE_8732(1), // corrupted stripe/file maximum column statistics
    HIVE_4243(2), // use real column names from Hive tables
    HIVE_12055(3), // vectorized writer
    HIVE_13083(4), // decimal writer updating present stream wrongly

    // Don't use any magic numbers here except for the below:
    FUTURE(Integer.MAX_VALUE); // a version from a future writer

    private final int id;

    public int getId() {
      return id;
    }

    WriterVersion(int id) {
      this.id = id;
    }

    // Reverse lookup table from id to value, built once. Assumes few
    // non-negative values close to zero; FUTURE is excluded from the table.
    private static final WriterVersion[] values;
    static {
      int max = Integer.MIN_VALUE;
      for (WriterVersion v : WriterVersion.values()) {
        if (v.id < 0) {
          throw new AssertionError("Negative WriterVersion id: " + v);
        }
        if (v.id > max && FUTURE.id != v.id) {
          max = v.id;
        }
      }
      values = new WriterVersion[max + 1];
      for (WriterVersion v : WriterVersion.values()) {
        if (v.id < values.length) {
          values[v.id] = v;
        }
      }
    }

    /**
     * Map a numeric writer version from a file back to the enum value.
     * Ids that this reader does not know about (written by a newer writer)
     * are reported as FUTURE instead of failing with an
     * ArrayIndexOutOfBoundsException.
     */
    public static WriterVersion from(int val) {
      if (val == FUTURE.id) {
        return FUTURE; // Special handling for the magic value.
      }
      if (val < 0 || val >= values.length || values[val] == null) {
        // Unknown id: treat as a future writer rather than crashing.
        return FUTURE;
      }
      return values[val];
    }
  }
  public static final WriterVersion CURRENT_WRITER = WriterVersion.HIVE_13083;

  public enum EncodingStrategy {
    SPEED, COMPRESSION
  }

  public enum CompressionStrategy {
    SPEED, COMPRESSION
  }

  // unused - this is a static factory class, never instantiated
  protected OrcFile() {}

  /**
   * Options for reading ORC files.
   */
  public static class ReaderOptions {
    private final Configuration conf;
    private FileSystem filesystem;
    private FileMetaInfo fileMetaInfo; // TODO: this comes from some place.
    private long maxLength = Long.MAX_VALUE;
    private FileMetadata fullFileMetadata; // Propagate from LLAP cache.

    public ReaderOptions(Configuration conf) {
      this.conf = conf;
    }

    public ReaderOptions fileMetaInfo(FileMetaInfo info) {
      fileMetaInfo = info;
      return this;
    }

    /**
     * Provide the filesystem, if the client already has it; otherwise it is
     * resolved from the path.
     */
    public ReaderOptions filesystem(FileSystem fs) {
      this.filesystem = fs;
      return this;
    }

    /**
     * Limit how far into the file to look for the file tail; used by
     * streaming ingest to read intermediate footers.
     */
    public ReaderOptions maxLength(long val) {
      maxLength = val;
      return this;
    }

    public ReaderOptions fileMetadata(FileMetadata metadata) {
      this.fullFileMetadata = metadata;
      return this;
    }

    public Configuration getConfiguration() {
      return conf;
    }

    public FileSystem getFilesystem() {
      return filesystem;
    }

    public FileMetaInfo getFileMetaInfo() {
      return fileMetaInfo;
    }

    public long getMaxLength() {
      return maxLength;
    }

    public FileMetadata getFileMetadata() {
      return fullFileMetadata;
    }
  }

  public static ReaderOptions readerOptions(Configuration conf) {
    return new ReaderOptions(conf);
  }

  public static Reader createReader(Path path,
                                    ReaderOptions options) throws IOException {
    return new ReaderImpl(path, options);
  }

  public interface WriterContext {
    Writer getWriter();
  }

  /** Listener invoked just before a stripe or the file footer is written. */
  public interface WriterCallback {
    void preStripeWrite(WriterContext context) throws IOException;
    void preFooterWrite(WriterContext context) throws IOException;
  }

  /**
   * Options for creating ORC file writers.
   */
  public static class WriterOptions {
    private final Configuration configuration;
    private FileSystem fileSystemValue = null;
    private TypeDescription schema = null;
    private long stripeSizeValue;
    private long blockSizeValue;
    private int rowIndexStrideValue;
    private int bufferSizeValue;
    private boolean enforceBufferSize = false;
    private boolean blockPaddingValue;
    private CompressionKind compressValue;
    private MemoryManager memoryManagerValue;
    private Version versionValue;
    private WriterCallback callback;
    private EncodingStrategy encodingStrategy;
    private CompressionStrategy compressionStrategy;
    private double paddingTolerance;
    private String bloomFilterColumns;
    private double bloomFilterFpp;

    // Defaults are seeded from the table properties and configuration via
    // OrcConf; each can be overridden with the fluent setters below.
    protected WriterOptions(Properties tableProperties, Configuration conf) {
      configuration = conf;
      memoryManagerValue = getStaticMemoryManager(conf);
      stripeSizeValue = OrcConf.STRIPE_SIZE.getLong(tableProperties, conf);
      blockSizeValue = OrcConf.BLOCK_SIZE.getLong(tableProperties, conf);
      rowIndexStrideValue =
          (int) OrcConf.ROW_INDEX_STRIDE.getLong(tableProperties, conf);
      bufferSizeValue = (int) OrcConf.BUFFER_SIZE.getLong(tableProperties,
          conf);
      blockPaddingValue =
          OrcConf.BLOCK_PADDING.getBoolean(tableProperties, conf);
      compressValue =
          CompressionKind.valueOf(OrcConf.COMPRESS.getString(tableProperties,
              conf).toUpperCase());
      String versionName = OrcConf.WRITE_FORMAT.getString(tableProperties,
          conf);
      versionValue = Version.byName(versionName);
      String enString = OrcConf.ENCODING_STRATEGY.getString(tableProperties,
          conf);
      encodingStrategy = EncodingStrategy.valueOf(enString);

      String compString =
          OrcConf.COMPRESSION_STRATEGY.getString(tableProperties, conf);
      compressionStrategy = CompressionStrategy.valueOf(compString);

      paddingTolerance =
          OrcConf.BLOCK_PADDING_TOLERANCE.getDouble(tableProperties, conf);

      bloomFilterColumns = OrcConf.BLOOM_FILTER_COLUMNS.getString(tableProperties,
          conf);
      bloomFilterFpp = OrcConf.BLOOM_FILTER_FPP.getDouble(tableProperties,
          conf);
    }

    /**
     * Provide the filesystem for the path, if the client has it available.
     * If it is not provided, it will be found from the path.
     */
    public WriterOptions fileSystem(FileSystem value) {
      fileSystemValue = value;
      return this;
    }

    /**
     * Set the stripe size for the file. The writer stores the contents of the
     * stripe in memory until this memory limit is reached and the stripe
     * is flushed to the HDFS file and the next stripe started.
     */
    public WriterOptions stripeSize(long value) {
      stripeSizeValue = value;
      return this;
    }

    /**
     * Set the file system block size for the file. For optimal performance,
     * set the block size to be multiple factors of stripe size.
     */
    public WriterOptions blockSize(long value) {
      blockSizeValue = value;
      return this;
    }

    /**
     * Set the distance between entries in the row index. The minimum value is
     * 1000 to prevent the index from overwhelming the data. If the stride is
     * set to 0, no indexes will be included in the file.
     */
    public WriterOptions rowIndexStride(int value) {
      rowIndexStrideValue = value;
      return this;
    }

    /**
     * The size of the memory buffers used for compressing and storing the
     * stripe in memory. NOTE: ORC writer may choose to use smaller buffer
     * size based on stripe size and number of columns for efficient stripe
     * writing and memory utilization. To enforce writer to use the requested
     * buffer size use enforceBufferSize().
     */
    public WriterOptions bufferSize(int value) {
      bufferSizeValue = value;
      return this;
    }

    /**
     * Enforce writer to use requested buffer size instead of estimating
     * buffer size based on stripe size and number of columns.
     * See bufferSize() method for more info.
     * Default: false
     */
    public WriterOptions enforceBufferSize() {
      enforceBufferSize = true;
      return this;
    }

    /**
     * Sets whether the HDFS blocks are padded to prevent stripes from
     * straddling blocks. Padding improves locality and thus the speed of
     * reading, but costs space.
     */
    public WriterOptions blockPadding(boolean value) {
      blockPaddingValue = value;
      return this;
    }

    /**
     * Sets the encoding strategy that is used to encode the data.
     */
    public WriterOptions encodingStrategy(EncodingStrategy strategy) {
      encodingStrategy = strategy;
      return this;
    }

    /**
     * Sets the tolerance for block padding as a percentage of stripe size.
     */
    public WriterOptions paddingTolerance(double value) {
      paddingTolerance = value;
      return this;
    }

    /**
     * Comma separated values of column names for which bloom filter is to be created.
     */
    public WriterOptions bloomFilterColumns(String columns) {
      bloomFilterColumns = columns;
      return this;
    }

    /**
     * Specify the false positive probability for bloom filter.
     * @param fpp - false positive probability
     * @return this
     */
    public WriterOptions bloomFilterFpp(double fpp) {
      bloomFilterFpp = fpp;
      return this;
    }

    /**
     * Sets the generic compression that is used to compress the data.
     */
    public WriterOptions compress(CompressionKind value) {
      compressValue = value;
      return this;
    }

    /**
     * Set the schema for the file. This is a required parameter.
     * @param schema the schema for the file.
     * @return this
     */
    public WriterOptions setSchema(TypeDescription schema) {
      this.schema = schema;
      return this;
    }

    /**
     * Sets the version of the file that will be written.
     */
    public WriterOptions version(Version value) {
      versionValue = value;
      return this;
    }

    /**
     * Add a listener for when the stripe and file are about to be closed.
     * @param callback the object to be called when the stripe is closed
     * @return this
     */
    public WriterOptions callback(WriterCallback callback) {
      this.callback = callback;
      return this;
    }

    /**
     * A package local option to set the memory manager.
     */
    protected WriterOptions memory(MemoryManager value) {
      memoryManagerValue = value;
      return this;
    }

    public boolean getBlockPadding() {
      return blockPaddingValue;
    }

    public long getBlockSize() {
      return blockSizeValue;
    }

    public String getBloomFilterColumns() {
      return bloomFilterColumns;
    }

    public FileSystem getFileSystem() {
      return fileSystemValue;
    }

    public Configuration getConfiguration() {
      return configuration;
    }

    public TypeDescription getSchema() {
      return schema;
    }

    public long getStripeSize() {
      return stripeSizeValue;
    }

    public CompressionKind getCompress() {
      return compressValue;
    }

    public WriterCallback getCallback() {
      return callback;
    }

    public Version getVersion() {
      return versionValue;
    }

    public MemoryManager getMemoryManager() {
      return memoryManagerValue;
    }

    public int getBufferSize() {
      return bufferSizeValue;
    }

    public boolean isEnforceBufferSize() {
      return enforceBufferSize;
    }

    public int getRowIndexStride() {
      return rowIndexStrideValue;
    }

    public CompressionStrategy getCompressionStrategy() {
      return compressionStrategy;
    }

    public EncodingStrategy getEncodingStrategy() {
      return encodingStrategy;
    }

    public double getPaddingTolerance() {
      return paddingTolerance;
    }

    public double getBloomFilterFpp() {
      return bloomFilterFpp;
    }
  }

  /**
   * Create a set of writer options based on a configuration.
   * @param conf the configuration to use for values
   * @return A WriterOptions object that can be modified
   */
  public static WriterOptions writerOptions(Configuration conf) {
    return new WriterOptions(null, conf);
  }

  /**
   * Create a set of write options based on a set of table properties and
   * configuration.
   * @param tableProperties the properties of the table
   * @param conf the configuration of the query
   * @return a WriterOptions object that can be modified
   */
  public static WriterOptions writerOptions(Properties tableProperties,
                                            Configuration conf) {
    return new WriterOptions(tableProperties, conf);
  }

  private static ThreadLocal<MemoryManager> memoryManager = null;

  // NOTE(review): the MemoryManager ThreadLocal is created with the FIRST
  // Configuration passed here; later calls (even with a different conf)
  // reuse that instance. Pre-existing behavior, kept for compatibility.
  private static synchronized MemoryManager getStaticMemoryManager(
      final Configuration conf) {
    if (memoryManager == null) {
      memoryManager = new ThreadLocal<MemoryManager>() {
        @Override
        protected MemoryManager initialValue() {
          return new MemoryManager(conf);
        }
      };
    }
    return memoryManager.get();
  }

  /**
   * Create an ORC file writer. This is the public interface for creating
   * writers going forward and new options will only be added to this method.
   * @param path filename to write to
   * @param opts the options
   * @return a new ORC file writer
   * @throws IOException
   */
  public static Writer createWriter(Path path,
                                    WriterOptions opts
                                    ) throws IOException {
    FileSystem fs = opts.getFileSystem() == null ?
        path.getFileSystem(opts.getConfiguration()) : opts.getFileSystem();

    return new WriterImpl(fs, path, opts);
  }

}
