http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/test/org/apache/orc/TestStringDictionary.java ---------------------------------------------------------------------- diff --git a/java/core/src/test/org/apache/orc/TestStringDictionary.java b/java/core/src/test/org/apache/orc/TestStringDictionary.java new file mode 100644 index 0000000..46209bb --- /dev/null +++ b/java/core/src/test/org/apache/orc/TestStringDictionary.java @@ -0,0 +1,290 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc; + +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.util.Random; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; + +import org.apache.orc.impl.RecordReaderImpl; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestName; + +public class TestStringDictionary { + + Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + + File.separator + "tmp")); + + Configuration conf; + FileSystem fs; + Path testFilePath; + + @Rule + public TestName testCaseName = new TestName(); + + @Before + public void openFileSystem() throws Exception { + conf = new Configuration(); + fs = FileSystem.getLocal(conf); + testFilePath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc"); + fs.delete(testFilePath, false); + } + + @Test + public void testTooManyDistinct() throws Exception { + TypeDescription schema = TypeDescription.createString(); + + Writer writer = OrcFile.createWriter( + testFilePath, + OrcFile.writerOptions(conf).setSchema(schema) + .compress(CompressionKind.NONE) + .bufferSize(10000)); + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector col = (BytesColumnVector) batch.cols[0]; + for (int i = 0; i < 20000; i++) { + if (batch.size == batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + } + col.setVal(batch.size++, String.valueOf(i).getBytes()); + } + writer.addRowBatch(batch); + writer.close(); + + Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); + col = (BytesColumnVector) batch.cols[0]; + int idx = 0; + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(String.valueOf(idx++), col.toString(r)); + } + } + + // make sure the encoding type is correct + for (StripeInformation stripe : reader.getStripes()) { + // hacky but does the job, this casting will work as long this test resides + // within the same package as ORC reader + OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe); + for (int i = 0; i < footer.getColumnsCount(); ++i) { + OrcProto.ColumnEncoding encoding = footer.getColumns(i); + assertEquals(OrcProto.ColumnEncoding.Kind.DIRECT_V2, encoding.getKind()); + } + } + } + + @Test + public void testHalfDistinct() throws Exception { + TypeDescription schema = TypeDescription.createString(); + + Writer writer = OrcFile.createWriter( + testFilePath, + OrcFile.writerOptions(conf).setSchema(schema).compress(CompressionKind.NONE) + .bufferSize(10000)); + Random rand = new Random(123); + int[] input = new int[20000]; + for (int i = 0; i < 20000; i++) { + input[i] = rand.nextInt(10000); + } + + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector col = (BytesColumnVector) batch.cols[0]; + for (int i = 0; i < 20000; i++) { + if (batch.size == batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + } + col.setVal(batch.size++, String.valueOf(input[i]).getBytes()); + } + writer.addRowBatch(batch); + writer.close(); + + Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); + col = (BytesColumnVector) batch.cols[0]; + int idx = 0; + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(String.valueOf(input[idx++]), col.toString(r)); + } + } + + // make sure the encoding type is correct + for (StripeInformation stripe : reader.getStripes()) { + // hacky but does the job, this casting will work as long this test resides + // within the same package as ORC reader + OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe); + for (int i = 0; i < footer.getColumnsCount(); ++i) { + OrcProto.ColumnEncoding encoding = footer.getColumns(i); + assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY_V2, encoding.getKind()); + } + } + } + + @Test + public void testTooManyDistinctCheckDisabled() throws Exception { + TypeDescription schema = TypeDescription.createString(); + + conf.setBoolean(OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getAttribute(), false); + Writer writer = OrcFile.createWriter( + testFilePath, + OrcFile.writerOptions(conf).setSchema(schema).compress(CompressionKind.NONE) + .bufferSize(10000)); + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector string = (BytesColumnVector) batch.cols[0]; + for (int i = 0; i < 20000; i++) { + if (batch.size == batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + } + string.setVal(batch.size++, String.valueOf(i).getBytes()); + } + writer.addRowBatch(batch); + writer.close(); + + Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); + string = (BytesColumnVector) batch.cols[0]; + int idx = 0; + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(String.valueOf(idx++), string.toString(r)); + } + } + + // make sure the encoding type is correct + for (StripeInformation stripe : reader.getStripes()) { + // hacky but does the job, this casting will work as long this test resides + // within the same package as ORC reader + OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe); + for (int i = 0; i < footer.getColumnsCount(); ++i) { + OrcProto.ColumnEncoding encoding = footer.getColumns(i); + assertEquals(OrcProto.ColumnEncoding.Kind.DIRECT_V2, encoding.getKind()); + } + } + } + + @Test + public void testHalfDistinctCheckDisabled() throws Exception { + TypeDescription schema = TypeDescription.createString(); + + conf.setBoolean(OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getAttribute(), + false); + Writer writer = OrcFile.createWriter( + testFilePath, + OrcFile.writerOptions(conf).setSchema(schema) + .compress(CompressionKind.NONE) + .bufferSize(10000)); + Random rand = new Random(123); + int[] input = new int[20000]; + for (int i = 0; i < 20000; i++) { + input[i] = rand.nextInt(10000); + } + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector string = (BytesColumnVector) batch.cols[0]; + for (int i = 0; i < 20000; i++) { + if (batch.size == batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + } + string.setVal(batch.size++, String.valueOf(input[i]).getBytes()); + } + writer.addRowBatch(batch); + writer.close(); + + Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); + string = (BytesColumnVector) batch.cols[0]; + int idx = 0; + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(String.valueOf(input[idx++]), string.toString(r)); + } + } + + // make sure the encoding type is correct + for (StripeInformation stripe : reader.getStripes()) { + // hacky but does the job, this casting will work as long this test resides + // within the same package as ORC reader + OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe); + for (int i = 0; i < footer.getColumnsCount(); ++i) { + OrcProto.ColumnEncoding encoding = footer.getColumns(i); + assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY_V2, encoding.getKind()); + } + } + } + + @Test + public void testTooManyDistinctV11AlwaysDictionary() throws Exception { + TypeDescription schema = TypeDescription.createString(); + + Writer writer = OrcFile.createWriter( + testFilePath, + OrcFile.writerOptions(conf).setSchema(schema) + .compress(CompressionKind.NONE) + .version(OrcFile.Version.V_0_11).bufferSize(10000)); + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector string = (BytesColumnVector) batch.cols[0]; + for (int i = 0; i < 20000; i++) { + if (batch.size == batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + } + string.setVal(batch.size++, String.valueOf(i).getBytes()); + } + writer.addRowBatch(batch); + writer.close(); + + Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + batch = reader.getSchema().createRowBatch(); + string = (BytesColumnVector) batch.cols[0]; + RecordReader rows = reader.rows(); + int idx = 0; + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(String.valueOf(idx++), string.toString(r)); + } + } + + // make sure the encoding type is correct + for (StripeInformation stripe : reader.getStripes()) { + // hacky but does the job, this casting will work as long this test resides + // within the same package as ORC reader + OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe); + for (int i = 0; i < footer.getColumnsCount(); ++i) { + OrcProto.ColumnEncoding encoding = footer.getColumns(i); + assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY, encoding.getKind()); + } + } + + } + +}
http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/test/org/apache/orc/TestTypeDescription.java ---------------------------------------------------------------------- diff --git a/java/core/src/test/org/apache/orc/TestTypeDescription.java b/java/core/src/test/org/apache/orc/TestTypeDescription.java new file mode 100644 index 0000000..1f6f254 --- /dev/null +++ b/java/core/src/test/org/apache/orc/TestTypeDescription.java @@ -0,0 +1,153 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc; + +import static org.junit.Assert.assertEquals; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +public class TestTypeDescription { + @Rule + public ExpectedException thrown= ExpectedException.none(); + + @Test + public void testJson() { + TypeDescription bin = TypeDescription.createBinary(); + assertEquals("{\"category\": \"binary\", \"id\": 0, \"max\": 0}", + bin.toJson()); + assertEquals("binary", bin.toString()); + TypeDescription struct = TypeDescription.createStruct() + .addField("f1", TypeDescription.createInt()) + .addField("f2", TypeDescription.createString()) + .addField("f3", TypeDescription.createDecimal()); + assertEquals("struct<f1:int,f2:string,f3:decimal(38,10)>", + struct.toString()); + assertEquals("{\"category\": \"struct\", \"id\": 0, \"max\": 3, \"fields\": [\n" + + " \"f1\": {\"category\": \"int\", \"id\": 1, \"max\": 1},\n" + + " \"f2\": {\"category\": \"string\", \"id\": 2, \"max\": 2},\n" + + " \"f3\": {\"category\": \"decimal\", \"id\": 3, \"max\": 3, \"precision\": 38, \"scale\": 10}]}", + struct.toJson()); + struct = TypeDescription.createStruct() + .addField("f1", TypeDescription.createUnion() + .addUnionChild(TypeDescription.createByte()) + .addUnionChild(TypeDescription.createDecimal() + .withPrecision(20).withScale(10))) + .addField("f2", TypeDescription.createStruct() + .addField("f3", TypeDescription.createDate()) + .addField("f4", TypeDescription.createDouble()) + .addField("f5", TypeDescription.createBoolean())) + .addField("f6", TypeDescription.createChar().withMaxLength(100)); + assertEquals("struct<f1:uniontype<tinyint,decimal(20,10)>,f2:struct<f3:date,f4:double,f5:boolean>,f6:char(100)>", + struct.toString()); + assertEquals( + "{\"category\": \"struct\", \"id\": 0, \"max\": 8, \"fields\": [\n" + + " \"f1\": {\"category\": \"uniontype\", \"id\": 1, \"max\": 3, \"children\": [\n" + + " {\"category\": \"tinyint\", \"id\": 2, \"max\": 2},\n" + + " {\"category\": \"decimal\", \"id\": 3, \"max\": 3, \"precision\": 20, \"scale\": 10}]},\n" + + " \"f2\": {\"category\": \"struct\", \"id\": 4, \"max\": 7, \"fields\": [\n" + + " \"f3\": {\"category\": \"date\", \"id\": 5, \"max\": 5},\n" + + " \"f4\": {\"category\": \"double\", \"id\": 6, \"max\": 6},\n" + + " \"f5\": {\"category\": \"boolean\", \"id\": 7, \"max\": 7}]},\n" + + " \"f6\": {\"category\": \"char\", \"id\": 8, \"max\": 8, \"length\": 100}]}", + struct.toJson()); + } + + @Test + public void testParserSimple() { + TypeDescription expected = TypeDescription.createStruct() + .addField("b1", TypeDescription.createBinary()) + .addField("b2", TypeDescription.createBoolean()) + .addField("b3", TypeDescription.createByte()) + .addField("c", TypeDescription.createChar().withMaxLength(10)) + .addField("d1", TypeDescription.createDate()) + .addField("d2", TypeDescription.createDecimal().withScale(5).withPrecision(20)) + .addField("d3", TypeDescription.createDouble()) + .addField("fff", TypeDescription.createFloat()) + .addField("int", TypeDescription.createInt()) + .addField("l", TypeDescription.createList + (TypeDescription.createLong())) + .addField("map", TypeDescription.createMap + (TypeDescription.createShort(), TypeDescription.createString())) + .addField("str", TypeDescription.createStruct() + .addField("u", TypeDescription.createUnion() + .addUnionChild(TypeDescription.createTimestamp()) + .addUnionChild(TypeDescription.createVarchar() + .withMaxLength(100)))); + String expectedStr = + "struct<b1:binary,b2:boolean,b3:tinyint,c:char(10),d1:date," + + "d2:decimal(20,5),d3:double,fff:float,int:int,l:array<bigint>," + + "map:map<smallint,string>,str:struct<u:uniontype<timestamp," + + "varchar(100)>>>"; + assertEquals(expectedStr, expected.toString()); + TypeDescription actual = TypeDescription.fromString(expectedStr); + assertEquals(expected, actual); + assertEquals(expectedStr, actual.toString()); + } + + @Test + public void testParserUpper() { + TypeDescription type = TypeDescription.fromString("BIGINT"); + assertEquals(TypeDescription.Category.LONG, type.getCategory()); + type = TypeDescription.fromString("STRUCT<MY_FIELD:INT>"); + assertEquals(TypeDescription.Category.STRUCT, type.getCategory()); + assertEquals("MY_FIELD", type.getFieldNames().get(0)); + assertEquals(TypeDescription.Category.INT, + type.getChildren().get(0).getCategory()); + type = TypeDescription.fromString("UNIONTYPE<STRING>"); + assertEquals(TypeDescription.Category.UNION, type.getCategory()); + assertEquals(TypeDescription.Category.STRING, + type.getChildren().get(0).getCategory()); + } + + @Test + public void testParserUnknownCategory() { + thrown.expect(IllegalArgumentException.class); + thrown.expectMessage("Can't parse category at 'FOOBAR^'"); + TypeDescription.fromString("FOOBAR"); + } + + @Test + public void testParserEmptyCategory() { + thrown.expect(IllegalArgumentException.class); + thrown.expectMessage("Can't parse category at '^<int>'"); + TypeDescription.fromString("<int>"); + } + + @Test + public void testParserMissingInt() { + thrown.expect(IllegalArgumentException.class); + thrown.expectMessage("Missing integer at 'char(^)'"); + TypeDescription.fromString("char()"); + } + + @Test + public void testParserMissingSize() { + thrown.expect(IllegalArgumentException.class); + thrown.expectMessage("Missing required char '(' at 'struct<c:char^>'"); + TypeDescription.fromString("struct<c:char>"); + } + + @Test + public void testParserExtraStuff() { + thrown.expect(IllegalArgumentException.class); + thrown.expectMessage("Extra characters at 'struct<i:int>^,'"); + TypeDescription.fromString("struct<i:int>,"); + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/test/org/apache/orc/TestUnrolledBitPack.java ---------------------------------------------------------------------- diff --git a/java/core/src/test/org/apache/orc/TestUnrolledBitPack.java b/java/core/src/test/org/apache/orc/TestUnrolledBitPack.java new file mode 100644 index 0000000..ef8fcd0 --- /dev/null +++ b/java/core/src/test/org/apache/orc/TestUnrolledBitPack.java @@ -0,0 +1,114 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc; + +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestName; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +import com.google.common.collect.Lists; +import com.google.common.primitives.Longs; + +@RunWith(value = Parameterized.class) +public class TestUnrolledBitPack { + + private long val; + + public TestUnrolledBitPack(long val) { + this.val = val; + } + + @Parameters + public static Collection<Object[]> data() { + Object[][] data = new Object[][] { { -1 }, { 1 }, { 7 }, { -128 }, { 32000 }, { 8300000 }, + { Integer.MAX_VALUE }, { 540000000000L }, { 140000000000000L }, { 36000000000000000L }, + { Long.MAX_VALUE } }; + return Arrays.asList(data); + } + + Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + + File.separator + "tmp")); + + Configuration conf; + FileSystem fs; + Path testFilePath; + + @Rule + public TestName testCaseName = new TestName(); + + @Before + public void openFileSystem() throws Exception { + conf = new Configuration(); + fs = FileSystem.getLocal(conf); + testFilePath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc"); + fs.delete(testFilePath, false); + } + + @Test + public void testBitPacking() throws Exception { + TypeDescription schema = TypeDescription.createLong(); + + long[] inp = new long[] { val, 0, val, val, 0, val, 0, val, val, 0, val, 0, val, val, 0, 0, + val, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, + 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, + 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, + val, 0, val, 0, 0, val, 0, val, 0, 0, val, val }; + List<Long> input = Lists.newArrayList(Longs.asList(inp)); + + Writer writer = OrcFile.createWriter( + testFilePath, + OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) + .compress(CompressionKind.NONE).bufferSize(10000)); + VectorizedRowBatch batch = schema.createRowBatch(); + for (Long l : input) { + int row = batch.size++; + ((LongColumnVector) batch.cols[0]).vector[row] = l; + } + writer.addRowBatch(batch); + writer.close(); + + Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); + int idx = 0; + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + assertEquals(input.get(idx++).longValue(), + ((LongColumnVector) batch.cols[0]).vector[r]); + } + } + } + +}
