http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/mapreduce/src/test/org/apache/orc/mapred/other/TestOrcOutputFormat.java ---------------------------------------------------------------------- diff --git a/java/mapreduce/src/test/org/apache/orc/mapred/other/TestOrcOutputFormat.java b/java/mapreduce/src/test/org/apache/orc/mapred/other/TestOrcOutputFormat.java new file mode 100644 index 0000000..ce5523f --- /dev/null +++ b/java/mapreduce/src/test/org/apache/orc/mapred/other/TestOrcOutputFormat.java @@ -0,0 +1,249 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.mapred.other; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.ByteWritable; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.ShortWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobContext; +import org.apache.hadoop.mapred.OutputCommitter; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.RecordWriter; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.TaskAttemptContext; +import org.apache.orc.CompressionKind; +import org.apache.orc.OrcConf; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.TypeDescription; +import org.apache.orc.mapred.OrcInputFormat; +import org.apache.orc.mapred.OrcList; +import org.apache.orc.mapred.OrcMap; +import org.apache.orc.mapred.OrcOutputFormat; +import org.apache.orc.mapred.OrcStruct; +import org.apache.orc.mapred.OrcTimestamp; +import org.apache.orc.mapred.OrcUnion; +import org.junit.Test; + +import java.io.File; +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.List; + +import static org.junit.Assert.assertEquals; + +public class TestOrcOutputFormat { + + Path workDir = new Path(System.getProperty("test.tmp.dir", + "target" + File.separator + "test" + File.separator + "tmp")); + JobConf conf = new JobConf(); + FileSystem fs; + + { + try { + fs = FileSystem.getLocal(conf).getRaw(); + fs.delete(workDir, true); + fs.mkdirs(workDir); + } catch (IOException e) { + throw new IllegalStateException("bad fs init", e); + } + } + + static class NullOutputCommitter extends 
OutputCommitter { + + @Override + public void setupJob(JobContext jobContext) { + // PASS + } + + @Override + public void setupTask(TaskAttemptContext taskAttemptContext) { + + } + + @Override + public boolean needsTaskCommit(TaskAttemptContext taskAttemptContext) { + return false; + } + + @Override + public void commitTask(TaskAttemptContext taskAttemptContext) { + // PASS + } + + @Override + public void abortTask(TaskAttemptContext taskAttemptContext) { + // PASS + } + } + + @Test + public void testAllTypes() throws Exception { + conf.set("mapreduce.task.attempt.id", "attempt_20160101_0001_m_000001_0"); + conf.setOutputCommitter(NullOutputCommitter.class); + final String typeStr = "struct<b1:binary,b2:boolean,b3:tinyint," + + "c:char(10),d1:date,d2:decimal(20,5),d3:double,fff:float,int:int," + + "l:array<bigint>,map:map<smallint,string>," + + "str:struct<u:uniontype<timestamp,varchar(100)>>,ts:timestamp>"; + conf.set(OrcConf.SCHEMA.getAttribute(), typeStr); + FileOutputFormat.setOutputPath(conf, workDir); + TypeDescription type = TypeDescription.fromString(typeStr); + + // build a row object + OrcStruct row = (OrcStruct) OrcStruct.createValue(type); + ((BytesWritable) row.getFieldValue(0)).set(new byte[]{1,2,3,4}, 0, 4); + ((BooleanWritable) row.getFieldValue(1)).set(true); + ((ByteWritable) row.getFieldValue(2)).set((byte) 23); + ((Text) row.getFieldValue(3)).set("aaabbbcccddd"); + SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd"); + ((DateWritable) row.getFieldValue(4)).set(DateWritable.millisToDays + (format.parse("2016-04-01").getTime())); + ((HiveDecimalWritable) row.getFieldValue(5)).set(new HiveDecimalWritable("1.23")); + ((DoubleWritable) row.getFieldValue(6)).set(1.5); + ((FloatWritable) row.getFieldValue(7)).set(4.5f); + ((IntWritable) row.getFieldValue(8)).set(31415); + OrcList<LongWritable> longList = (OrcList<LongWritable>) row.getFieldValue(9); + longList.add(new LongWritable(123)); + longList.add(new LongWritable(456)); + OrcMap<ShortWritable,Text> map = (OrcMap<ShortWritable,Text>) row.getFieldValue(10); + map.put(new ShortWritable((short) 1000), new Text("aaaa")); + map.put(new ShortWritable((short) 123), new Text("bbbb")); + OrcStruct struct = (OrcStruct) row.getFieldValue(11); + OrcUnion union = (OrcUnion) struct.getFieldValue(0); + union.set((byte) 1, new Text("abcde")); + ((OrcTimestamp) row.getFieldValue(12)).set("1996-12-11 15:00:00"); + NullWritable nada = NullWritable.get(); + RecordWriter<NullWritable, OrcStruct> writer = + new OrcOutputFormat<OrcStruct>().getRecordWriter(fs, conf, "all.orc", + Reporter.NULL); + for(int r=0; r < 10; ++r) { + row.setFieldValue(8, new IntWritable(r * 10)); + writer.write(nada, row); + } + union.set((byte) 0, new OrcTimestamp("2011-12-25 12:34:56")); + for(int r=0; r < 10; ++r) { + row.setFieldValue(8, new IntWritable(r * 10 + 100)); + writer.write(nada, row); + } + OrcStruct row2 = new OrcStruct(type); + writer.write(nada, row2); + row.setFieldValue(8, new IntWritable(210)); + writer.write(nada, row); + writer.close(Reporter.NULL); + + FileSplit split = new FileSplit(new Path(workDir, "all.orc"), 0, 100000, + new String[0]); + RecordReader<NullWritable, OrcStruct> reader = + new OrcInputFormat<OrcStruct>().getRecordReader(split, conf, + Reporter.NULL); + nada = reader.createKey(); + row = reader.createValue(); + for(int r=0; r < 22; ++r) { + assertEquals(true, reader.next(nada, row)); + if (r == 20) { + for(int c=0; c < 12; ++c) { + assertEquals(null, row.getFieldValue(c)); + } + } else { + assertEquals(new 
BytesWritable(new byte[]{1, 2, 3, 4}), row.getFieldValue(0)); + assertEquals(new BooleanWritable(true), row.getFieldValue(1)); + assertEquals(new ByteWritable((byte) 23), row.getFieldValue(2)); + assertEquals(new Text("aaabbbcccd"), row.getFieldValue(3)); + assertEquals(new DateWritable(DateWritable.millisToDays + (format.parse("2016-04-01").getTime())), row.getFieldValue(4)); + assertEquals(new HiveDecimalWritable("1.23"), row.getFieldValue(5)); + assertEquals(new DoubleWritable(1.5), row.getFieldValue(6)); + assertEquals(new FloatWritable(4.5f), row.getFieldValue(7)); + assertEquals(new IntWritable(r * 10), row.getFieldValue(8)); + assertEquals(longList, row.getFieldValue(9)); + assertEquals(map, row.getFieldValue(10)); + if (r < 10) { + union.set((byte) 1, new Text("abcde")); + } else { + union.set((byte) 0, new OrcTimestamp("2011-12-25 12:34:56")); + } + assertEquals("row " + r, struct, row.getFieldValue(11)); + assertEquals("row " + r, new OrcTimestamp("1996-12-11 15:00:00"), + row.getFieldValue(12)); + } + } + assertEquals(false, reader.next(nada, row)); + } + + /** + * Test the case where the top level isn't a struct, but a long. + */ + @Test + public void testLongRoot() throws Exception { + conf.set("mapreduce.task.attempt.id", "attempt_20160101_0001_m_000001_0"); + conf.setOutputCommitter(NullOutputCommitter.class); + conf.set(OrcConf.COMPRESS.getAttribute(), "SNAPPY"); + conf.setInt(OrcConf.ROW_INDEX_STRIDE.getAttribute(), 1000); + conf.setInt(OrcConf.BUFFER_SIZE.getAttribute(), 64 * 1024); + conf.set(OrcConf.WRITE_FORMAT.getAttribute(), "0.11"); + final String typeStr = "bigint"; + conf.set(OrcConf.SCHEMA.getAttribute(), typeStr); + FileOutputFormat.setOutputPath(conf, workDir); + TypeDescription type = TypeDescription.fromString(typeStr); + LongWritable value = new LongWritable(); + NullWritable nada = NullWritable.get(); + RecordWriter<NullWritable, LongWritable> writer = + new OrcOutputFormat<LongWritable>().getRecordWriter(fs, conf, + "long.orc", Reporter.NULL); + for(long lo=0; lo < 2000; ++lo) { + value.set(lo); + writer.write(nada, value); + } + writer.close(Reporter.NULL); + + Path path = new Path(workDir, "long.orc"); + Reader file = OrcFile.createReader(path, OrcFile.readerOptions(conf)); + assertEquals(CompressionKind.SNAPPY, file.getCompressionKind()); + assertEquals(2000, file.getNumberOfRows()); + assertEquals(1000, file.getRowIndexStride()); + assertEquals(64 * 1024, file.getCompressionSize()); + assertEquals(OrcFile.Version.V_0_11, file.getFileVersion()); + FileSplit split = new FileSplit(path, 0, 100000, + new String[0]); + RecordReader<NullWritable, LongWritable> reader = + new OrcInputFormat<LongWritable>().getRecordReader(split, conf, + Reporter.NULL); + nada = reader.createKey(); + value = reader.createValue(); + for(long lo=0; lo < 2000; ++lo) { + assertEquals(true, reader.next(nada, value)); + assertEquals(lo, value.get()); + } + assertEquals(false, reader.next(nada, value)); + } +}
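[Editor's note, not part of the commit] For readers unfamiliar with the new mapred bindings exercised by the test above, the following is a minimal, self-contained sketch of the same write/read round trip. The schema (struct<id:int,name:string>), output directory, and file name are illustrative assumptions; the OrcOutputFormat/OrcInputFormat calls mirror those used in TestOrcOutputFormat.

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.orc.OrcConf;
import org.apache.orc.TypeDescription;
import org.apache.orc.mapred.OrcInputFormat;
import org.apache.orc.mapred.OrcOutputFormat;
import org.apache.orc.mapred.OrcStruct;

public class OrcMapredExample {
  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf();
    FileSystem fs = FileSystem.getLocal(conf).getRaw();
    Path dir = new Path("target/orc-example");   // illustrative output directory

    // The writer takes its schema from OrcConf.SCHEMA and its directory from the
    // mapred output path, exactly as in the test above.
    String schema = "struct<id:int,name:string>";
    conf.set(OrcConf.SCHEMA.getAttribute(), schema);
    FileOutputFormat.setOutputPath(conf, dir);

    // Outside a real MapReduce task the test also sets a task attempt id and a
    // no-op OutputCommitter (see NullOutputCommitter above); the same setup is
    // likely needed when running this standalone.
    conf.set("mapreduce.task.attempt.id", "attempt_20160101_0001_m_000001_0");

    TypeDescription type = TypeDescription.fromString(schema);
    OrcStruct row = (OrcStruct) OrcStruct.createValue(type);
    NullWritable nada = NullWritable.get();

    RecordWriter<NullWritable, OrcStruct> writer =
        new OrcOutputFormat<OrcStruct>().getRecordWriter(fs, conf, "example.orc",
            Reporter.NULL);
    for (int i = 0; i < 3; ++i) {
      row.setFieldValue(0, new IntWritable(i));
      row.setFieldValue(1, new Text("row-" + i));
      writer.write(nada, row);                   // key is ignored; NullWritable
    }
    writer.close(Reporter.NULL);

    // Read the file back through a FileSplit covering the whole file.
    FileSplit split = new FileSplit(new Path(dir, "example.orc"), 0, Long.MAX_VALUE,
        new String[0]);
    RecordReader<NullWritable, OrcStruct> reader =
        new OrcInputFormat<OrcStruct>().getRecordReader(split, conf, Reporter.NULL);
    NullWritable key = reader.createKey();
    OrcStruct value = reader.createValue();
    while (reader.next(key, value)) {
      System.out.println(value.getFieldValue(0) + " " + value.getFieldValue(1));
    }
    reader.close();
  }
}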
http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/pom.xml ---------------------------------------------------------------------- diff --git a/java/pom.xml b/java/pom.xml new file mode 100644 index 0000000..7ce1161 --- /dev/null +++ b/java/pom.xml @@ -0,0 +1,110 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache</groupId> + <artifactId>apache</artifactId> + <version>14</version> + </parent> + <groupId>org.apache.orc</groupId> + <artifactId>orc</artifactId> + <version>1.1.0-SNAPSHOT</version> + <packaging>pom</packaging> + + <name>ORC</name> + <url>http://orc.apache.org</url> + <prerequisites> + <maven>2.2.1</maven> + </prerequisites> + + <modules> + <module>storage-api</module> + <module>core</module> + <module>mapreduce</module> + </modules> + + <properties> + <!-- Build Properties --> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + <maven.compiler.useIncrementalCompilation>false</maven.compiler.useIncrementalCompilation> + <test.tmp.dir>${project.build.directory}/testing-tmp</test.tmp.dir> + + <commons-codec.version>1.4</commons-codec.version> + <hadoop.version>2.6.0</hadoop.version> + <junit.version>4.11</junit.version> + <kryo.version>3.0.3</kryo.version> + <mockito.version>1.9.5</mockito.version> + <protobuf.version>2.5.0</protobuf.version> + <slf4j.version>1.7.5</slf4j.version> + <snappy.version>0.2</snappy.version> + </properties> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-javadoc-plugin</artifactId> + <version>2.10.3</version> + <configuration> + ... 
+ </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-antrun-plugin</artifactId> + <executions> + <execution> + <id>setup-test-dirs</id> + <phase>process-test-resources</phase> + <goals> + <goal>run</goal> + </goals> + <configuration> + <target> + <delete dir="${test.tmp.dir}" /> + <mkdir dir="${test.tmp.dir}" /> + </target> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <configuration> + <reuseForks>false</reuseForks> + <argLine>-Xmx2048m -XX:MaxPermSize=512m</argLine> + <environmentVariables> + <TZ>US/Pacific</TZ> + <LANG>en_US.UTF-8</LANG> + </environmentVariables> + <systemPropertyVariables> + <test.tmp.dir>${test.tmp.dir}</test.tmp.dir> + </systemPropertyVariables> + </configuration> + </plugin> + </plugins> + </build> + + <dependencies> + <!-- global dependencies --> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-api</artifactId> + <version>${slf4j.version}</version> + </dependency> + </dependencies> +</project> http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/pom.xml ---------------------------------------------------------------------- diff --git a/java/storage-api/pom.xml b/java/storage-api/pom.xml new file mode 100644 index 0000000..85bd77c --- /dev/null +++ b/java/storage-api/pom.xml @@ -0,0 +1,76 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <groupId>org.apache.hive</groupId> + <artifactId>hive-storage-api</artifactId> + <version>2.0.0.1-SNAPSHOT</version> + <packaging>jar</packaging> + <name>Hive Storage API</name> + + <dependencies> + <!-- test inter-project --> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-common</artifactId> + <version>2.6.0</version> + <optional>true</optional> + <exclusions> + <exclusion> + <groupId>commmons-logging</groupId> + <artifactId>commons-logging</artifactId> + </exclusion> + <exclusion> + <groupId>javax.servlet</groupId> + <artifactId>servlet-api</artifactId> + </exclusion> + <exclusion> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>4.11</version> + <scope>test</scope> + </dependency> + </dependencies> + + <build> + <sourceDirectory>${basedir}/src/java</sourceDirectory> + <testSourceDirectory>${basedir}/src/test</testSourceDirectory> + <testResources> + <testResource> + <directory>${basedir}/src/test/resources</directory> + </testResource> + </testResources> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <version>3.1</version> + <configuration> + <source>1.7</source> + <target>1.7</target> + </configuration> + </plugin> + </plugins> + </build> +</project> http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/DiskRangeInfo.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/DiskRangeInfo.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/DiskRangeInfo.java new file mode 100644 index 0000000..86b838c --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/DiskRangeInfo.java @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common; + +import java.util.List; + +import org.apache.hadoop.hive.common.io.DiskRange; + +import com.google.common.collect.Lists; + +/** + * Disk range information class containing disk ranges and total length. 
+ */ +public class DiskRangeInfo { + List<DiskRange> diskRanges; // TODO: use DiskRangeList instead + long totalLength; + + public DiskRangeInfo(int indexBaseOffset) { + this.diskRanges = Lists.newArrayList(); + // Some data is missing from the stream for PPD uncompressed read (because index offset is + // relative to the entire stream and we only read part of stream if RGs are filtered; unlike + // with compressed data where PPD only filters CBs, so we always get full CB, and index offset + // is relative to CB). To take care of the case when UncompressedStream goes seeking around by + // its incorrect (relative to partial stream) index offset, we will increase the length by our + // offset-relative-to-the-stream, and also account for it in buffers (see createDiskRangeInfo). + // So, index offset now works; as long as noone seeks into this data before the RG (why would + // they), everything works. This is hacky... Stream shouldn't depend on having all the data. + this.totalLength = indexBaseOffset; + } + + public void addDiskRange(DiskRange diskRange) { + diskRanges.add(diskRange); + totalLength += diskRange.getLength(); + } + + public List<DiskRange> getDiskRanges() { + return diskRanges; + } + + public long getTotalLength() { + return totalLength; + } +} + http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/Pool.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/Pool.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/Pool.java new file mode 100644 index 0000000..272bbdd --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/Pool.java @@ -0,0 +1,33 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common; + +/** Simple object pool to prevent GC on small objects passed between threads. */ +public interface Pool<T> { + /** Object helper for objects stored in the pool. */ + public interface PoolObjectHelper<T> { + /** Called to create an object when one cannot be provided. */ + T create(); + /** Called before the object is put in the pool (regardless of whether put succeeds). 
*/ + void resetBeforeOffer(T t); + } + + T take(); + void offer(T t); + int size(); +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/io/Allocator.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/io/Allocator.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/Allocator.java new file mode 100644 index 0000000..fd9d9c9 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/Allocator.java @@ -0,0 +1,53 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common.io; + +import org.apache.hadoop.hive.common.io.encoded.MemoryBuffer; + +/** An allocator provided externally to storage classes to allocate MemoryBuffer-s. */ +public interface Allocator { + public static class AllocatorOutOfMemoryException extends RuntimeException { + public AllocatorOutOfMemoryException(String msg) { + super(msg); + } + + private static final long serialVersionUID = 268124648177151761L; + } + + /** + * Allocates multiple buffers of a given size. + * @param dest Array where buffers are placed. Objects are reused if already there + * (see createUnallocated), created otherwise. + * @param size Allocation size. + * @throws AllocatorOutOfMemoryException Cannot allocate. + */ + void allocateMultiple(MemoryBuffer[] dest, int size) throws AllocatorOutOfMemoryException; + + /** + * Creates an unallocated memory buffer object. This object can be passed to allocateMultiple + * to allocate; this is useful if data structures are created for separate buffers that can + * later be allocated together. + */ + MemoryBuffer createUnallocated(); + /** Deallocates a memory buffer. */ + void deallocate(MemoryBuffer buffer); + /** Whether the allocator uses direct buffers. */ + boolean isDirectAlloc(); + /** Maximum allocation size supported by this allocator. */ + int getMaxAllocation(); +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DataCache.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DataCache.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DataCache.java new file mode 100644 index 0000000..1273588 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DataCache.java @@ -0,0 +1,100 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.common.io; + +import org.apache.hadoop.hive.common.io.encoded.MemoryBuffer; + +/** An abstract data cache that IO formats can use to retrieve and cache data. */ +public interface DataCache { + public static final class BooleanRef { + public boolean value; + } + + /** Disk range factory used during cache retrieval. */ + public interface DiskRangeListFactory { + DiskRangeList createCacheChunk(MemoryBuffer buffer, long startOffset, long endOffset); + } + + /** + * Gets file data for particular offsets. The range list is modified in place; it is then + * returned (since the list head could have changed). Ranges are replaced with cached ranges. + * + * Any such buffer is locked in cache to prevent eviction, and must therefore be released + * back to cache via a corresponding call (releaseBuffer) when the caller is done with it. + * + * In case of partial overlap with cached data, full cache blocks are always returned; + * there's no capacity for partial matches in return type. The rules are as follows: + * 1) If the requested range starts in the middle of a cached range, that cached range will not + * be returned by default (e.g. if [100,200) and [200,300) are cached, the request for + * [150,300) will only return [200,300) from cache). This may be configurable in impls. + * This is because we assume well-known range start offsets are used (rg/stripe offsets), so + * a request from the middle of the start doesn't make sense. + * 2) If the requested range ends in the middle of a cached range, that entire cached range will + * be returned (e.g. if [100,200) and [200,300) are cached, the request for [100,250) will + * return both ranges). It should really be same as #1, however currently ORC uses estimated + * end offsets; if we don't return the end block, the caller may read it from disk needlessly. + * + * @param fileId Unique ID of the target file on the file system. + * @param range A set of DiskRange-s (linked list) that is to be retrieved. May be modified. + * @param baseOffset base offset for the ranges (stripe/stream offset in case of ORC). + * @param factory A factory to produce DiskRangeList-s out of cached MemoryBuffer-s. + * @param gotAllData An out param - whether all the requested data was found in cache. + * @return The new or modified list of DiskRange-s, where some ranges may contain cached data. + */ + DiskRangeList getFileData(Object fileKey, DiskRangeList range, long baseOffset, + DiskRangeListFactory factory, BooleanRef gotAllData); + + /** + * Puts file data into cache, or gets older data in case of collisions. + * + * The memory buffers provided MUST be allocated via an allocator returned by getAllocator + * method, to allow cache implementations that evict and then de-allocate the buffer. 
+ * + * It is assumed that the caller will use the data immediately, therefore any buffers provided + * to putFileData (or returned due to cache collision) are locked in cache to prevent eviction, + * and must therefore be released back to cache via a corresponding call (releaseBuffer) when the + * caller is done with it. Buffers rejected due to conflict will neither be locked, nor + * automatically deallocated. The caller must take care to discard these buffers. + * + * @param fileId Unique ID of the target file on the file system. + * @param ranges The ranges for which the data is being cached. These objects will not be stored. + * @param data The data for the corresponding ranges. + * @param baseOffset base offset for the ranges (stripe/stream offset in case of ORC). + * @return null if all data was put; bitmask indicating which chunks were not put otherwise; + * the replacement chunks from cache are updated directly in the array. + */ + long[] putFileData(Object fileKey, DiskRange[] ranges, MemoryBuffer[] data, long baseOffset); + + /** + * Releases the buffer returned by getFileData/provided to putFileData back to cache. + * See respective javadocs for details. + */ + void releaseBuffer(MemoryBuffer buffer); + + /** + * Notifies the cache that the buffer returned from getFileData/provided to putFileData will + * be used by another consumer and therefore released multiple times (one more time per call). + */ + void reuseBuffer(MemoryBuffer buffer); + + /** + * Gets the allocator associated with this DataCache. + */ + Allocator getAllocator(); +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DiskRange.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DiskRange.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DiskRange.java new file mode 100644 index 0000000..33aecf5 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DiskRange.java @@ -0,0 +1,102 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common.io; + +import java.nio.ByteBuffer; + +/** + * The sections of a file. + */ +public class DiskRange { + /** The first address. */ + protected long offset; + /** The address afterwards. 
*/ + protected long end; + + public DiskRange(long offset, long end) { + this.offset = offset; + this.end = end; + if (end < offset) { + throw new IllegalArgumentException("invalid range " + this); + } + } + + @Override + public boolean equals(Object other) { + if (other == null || other.getClass() != getClass()) { + return false; + } + return equalRange((DiskRange) other); + } + + public boolean equalRange(DiskRange other) { + return other.offset == offset && other.end == end; + } + + @Override + public int hashCode() { + return (int)(offset ^ (offset >>> 32)) * 31 + (int)(end ^ (end >>> 32)); + } + + @Override + public String toString() { + return "range start: " + offset + " end: " + end; + } + + public long getOffset() { + return offset; + } + + public long getEnd() { + return end; + } + + public int getLength() { + long len = this.end - this.offset; + assert len <= Integer.MAX_VALUE; + return (int)len; + } + + // For subclasses + public boolean hasData() { + return false; + } + + public DiskRange sliceAndShift(long offset, long end, long shiftBy) { + // Rather, unexpected usage exception. + throw new UnsupportedOperationException(); + } + + public ByteBuffer getData() { + throw new UnsupportedOperationException(); + } + + protected boolean merge(long otherOffset, long otherEnd) { + if (!overlap(offset, end, otherOffset, otherEnd)) return false; + offset = Math.min(offset, otherOffset); + end = Math.max(end, otherEnd); + return true; + } + + private static boolean overlap(long leftA, long rightA, long leftB, long rightB) { + if (leftA <= leftB) { + return rightA >= leftB; + } + return rightB >= leftA; + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DiskRangeList.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DiskRangeList.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DiskRangeList.java new file mode 100644 index 0000000..b84aeb5 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DiskRangeList.java @@ -0,0 +1,210 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common.io; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Java linked list iterator interface is convoluted, and moreover concurrent modifications + * of the same list by multiple iterators are impossible. Hence, this. + * Java also doesn't support multiple inheritance, so this cannot be done as "aspect"... 
*/ +public class DiskRangeList extends DiskRange { + private static final Logger LOG = LoggerFactory.getLogger(DiskRangeList.class); + public DiskRangeList prev, next; + + public DiskRangeList(long offset, long end) { + super(offset, end); + } + + /** Replaces this element with another in the list; returns the new element. */ + public DiskRangeList replaceSelfWith(DiskRangeList other) { + other.prev = this.prev; + other.next = this.next; + if (this.prev != null) { + this.prev.next = other; + } + if (this.next != null) { + this.next.prev = other; + } + this.next = this.prev = null; + return other; + } + + /** + * Inserts an intersecting range before current in the list and adjusts offset accordingly. + * @returns the new element. + */ + public DiskRangeList insertPartBefore(DiskRangeList other) { + assert other.end >= this.offset; + this.offset = other.end; + other.prev = this.prev; + other.next = this; + if (this.prev != null) { + this.prev.next = other; + } + this.prev = other; + return other; + } + + /** + * Inserts an element after current in the list. + * @returns the new element. + * */ + public DiskRangeList insertAfter(DiskRangeList other) { + other.next = this.next; + other.prev = this; + if (this.next != null) { + this.next.prev = other; + } + this.next = other; + return other; + } + + /** + * Inserts an intersecting range after current in the list and adjusts offset accordingly. + * @returns the new element. + */ + public DiskRangeList insertPartAfter(DiskRangeList other) { + assert other.offset <= this.end; + this.end = other.offset; + return insertAfter(other); + } + + /** Removes an element after current from the list. */ + public void removeAfter() { + DiskRangeList other = this.next; + this.next = other.next; + if (this.next != null) { + this.next.prev = this; + } + other.next = other.prev = null; + } + + /** Removes the current element from the list. 
*/ + public void removeSelf() { + if (this.prev != null) { + this.prev.next = this.next; + } + if (this.next != null) { + this.next.prev = this.prev; + } + this.next = this.prev = null; + } + + /** Splits current element in the list, using DiskRange::slice */ + public final DiskRangeList split(long cOffset) { + insertAfter((DiskRangeList)this.sliceAndShift(cOffset, end, 0)); + return replaceSelfWith((DiskRangeList)this.sliceAndShift(offset, cOffset, 0)); + } + + public boolean hasContiguousNext() { + return next != null && end == next.offset; + } + + // @VisibleForTesting + public int listSize() { + int result = 1; + DiskRangeList current = this.next; + while (current != null) { + ++result; + current = current.next; + } + return result; + } + + public long getTotalLength() { + long totalLength = getLength(); + DiskRangeList current = next; + while (current != null) { + totalLength += current.getLength(); + current = current.next; + } + return totalLength; + } + + // @VisibleForTesting + public DiskRangeList[] listToArray() { + DiskRangeList[] result = new DiskRangeList[listSize()]; + int i = 0; + DiskRangeList current = this.next; + while (current != null) { + result[i] = current; + ++i; + current = current.next; + } + return result; + } + + public static class CreateHelper { + private DiskRangeList tail = null, head; + + public DiskRangeList getTail() { + return tail; + } + + public void addOrMerge(long offset, long end, boolean doMerge, boolean doLogNew) { + if (doMerge && tail != null && tail.merge(offset, end)) return; + if (doLogNew) { + LOG.info("Creating new range; last range (which can include some previous adds) was " + + tail); + } + DiskRangeList node = new DiskRangeList(offset, end); + if (tail == null) { + head = tail = node; + } else { + tail = tail.insertAfter(node); + } + } + + public DiskRangeList get() { + return head; + } + + public DiskRangeList extract() { + DiskRangeList result = head; + head = null; + return result; + } + } + + /** + * List in-place mutation helper - a bogus first element that is inserted before list head, + * and thus remains constant even if head is replaced with some new range via in-place list + * mutation. extract() can be used to obtain the modified list. + */ + public static class MutateHelper extends DiskRangeList { + public MutateHelper(DiskRangeList head) { + super(-1, -1); + assert head != null; + assert head.prev == null; + this.next = head; + head.prev = this; + } + + public DiskRangeList get() { + return next; + } + + public DiskRangeList extract() { + DiskRangeList result = this.next; + assert result != null; + this.next = result.prev = null; + return result; + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/io/encoded/EncodedColumnBatch.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/io/encoded/EncodedColumnBatch.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/encoded/EncodedColumnBatch.java new file mode 100644 index 0000000..907181e --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/encoded/EncodedColumnBatch.java @@ -0,0 +1,135 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.common.io.encoded; + +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * A block of data for a given section of a file, similar to VRB but in encoded form. + * Stores a set of buffers for each encoded stream that is a part of each column. + */ +public class EncodedColumnBatch<BatchKey> { + /** + * Slice of the data for a stream for some column, stored inside MemoryBuffer's. + * ColumnStreamData can be reused for many EncodedColumnBatch-es (e.g. dictionary stream), so + * it tracks the number of such users via a refcount. + */ + public static class ColumnStreamData { + private List<MemoryBuffer> cacheBuffers; + /** Base offset from the beginning of the indexable unit; for example, for ORC, + * offset from the CB in a compressed file, from the stream in uncompressed file. */ + private int indexBaseOffset = 0; + + /** Reference count. */ + private AtomicInteger refCount = new AtomicInteger(0); + + public void reset() { + cacheBuffers.clear(); + refCount.set(0); + indexBaseOffset = 0; + } + + public void incRef() { + refCount.incrementAndGet(); + } + + public int decRef() { + int i = refCount.decrementAndGet(); + assert i >= 0; + return i; + } + + public List<MemoryBuffer> getCacheBuffers() { + return cacheBuffers; + } + + public void setCacheBuffers(List<MemoryBuffer> cacheBuffers) { + this.cacheBuffers = cacheBuffers; + } + + public int getIndexBaseOffset() { + return indexBaseOffset; + } + + public void setIndexBaseOffset(int indexBaseOffset) { + this.indexBaseOffset = indexBaseOffset; + } + } + + /** The key that is used to map this batch to source location. */ + protected BatchKey batchKey; + /** + * Stream data for each stream, for each included column. + * For each column, streams are indexed by kind, with missing elements being null. + */ + protected ColumnStreamData[][] columnData; + /** Column indexes included in the batch. Correspond to columnData elements. 
*/ + protected int[] columnIxs; + + public void reset() { + if (columnData == null) return; + for (int i = 0; i < columnData.length; ++i) { + if (columnData[i] == null) continue; + for (int j = 0; j < columnData[i].length; ++j) { + columnData[i][j] = null; + } + } + } + + public void initColumn(int colIxMod, int colIx, int streamCount) { + columnIxs[colIxMod] = colIx; + if (columnData[colIxMod] == null || columnData[colIxMod].length != streamCount) { + columnData[colIxMod] = new ColumnStreamData[streamCount]; + } + } + + public void setStreamData(int colIxMod, int streamKind, ColumnStreamData csd) { + columnData[colIxMod][streamKind] = csd; + } + + public void setAllStreamsData(int colIxMod, int colIx, ColumnStreamData[] sbs) { + columnIxs[colIxMod] = colIx; + columnData[colIxMod] = sbs; + } + + public BatchKey getBatchKey() { + return batchKey; + } + + public ColumnStreamData[][] getColumnData() { + return columnData; + } + + public int[] getColumnIxs() { + return columnIxs; + } + + protected void resetColumnArrays(int columnCount) { + if (columnIxs != null && columnCount == columnIxs.length) return; + columnIxs = new int[columnCount]; + ColumnStreamData[][] columnData = new ColumnStreamData[columnCount][]; + if (this.columnData != null) { + for (int i = 0; i < Math.min(columnData.length, this.columnData.length); ++i) { + columnData[i] = this.columnData[i]; + } + } + this.columnData = columnData; + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/io/encoded/MemoryBuffer.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/io/encoded/MemoryBuffer.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/encoded/MemoryBuffer.java new file mode 100644 index 0000000..4475009 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/encoded/MemoryBuffer.java @@ -0,0 +1,28 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.common.io.encoded; + +import java.nio.ByteBuffer; + +/** Abstract interface for any class wrapping a ByteBuffer. */ +public interface MemoryBuffer { + /** Note - raw buffer should not be modified. 
*/ + public ByteBuffer getByteBufferRaw(); + public ByteBuffer getByteBufferDup(); +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/type/HiveDecimal.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/type/HiveDecimal.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/type/HiveDecimal.java new file mode 100644 index 0000000..1c6be91 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/type/HiveDecimal.java @@ -0,0 +1,332 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common.type; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.math.RoundingMode; + +/** + * + * HiveDecimal. Simple wrapper for BigDecimal. Adds fixed max precision and non scientific string + * representation + * + */ +public class HiveDecimal implements Comparable<HiveDecimal> { + public static final int MAX_PRECISION = 38; + public static final int MAX_SCALE = 38; + + /** + * Default precision/scale when user doesn't specify in the column metadata, such as + * decimal and decimal(8). + */ + public static final int USER_DEFAULT_PRECISION = 10; + public static final int USER_DEFAULT_SCALE = 0; + + /** + * Default precision/scale when system is not able to determine them, such as in case + * of a non-generic udf. + */ + public static final int SYSTEM_DEFAULT_PRECISION = 38; + public static final int SYSTEM_DEFAULT_SCALE = 18; + + public static final HiveDecimal ZERO = new HiveDecimal(BigDecimal.ZERO); + public static final HiveDecimal ONE = new HiveDecimal(BigDecimal.ONE); + + public static final int ROUND_FLOOR = BigDecimal.ROUND_FLOOR; + public static final int ROUND_CEILING = BigDecimal.ROUND_CEILING; + public static final int ROUND_HALF_UP = BigDecimal.ROUND_HALF_UP; + public static final int ROUND_HALF_EVEN = BigDecimal.ROUND_HALF_EVEN; + + private BigDecimal bd = BigDecimal.ZERO; + + private HiveDecimal(BigDecimal bd) { + this.bd = bd; + } + + public static HiveDecimal create(BigDecimal b) { + return create(b, true); + } + + public static HiveDecimal create(BigDecimal b, boolean allowRounding) { + BigDecimal bd = normalize(b, allowRounding); + return bd == null ? null : new HiveDecimal(bd); + } + + public static HiveDecimal create(BigInteger unscaled, int scale) { + BigDecimal bd = normalize(new BigDecimal(unscaled, scale), true); + return bd == null ? 
null : new HiveDecimal(bd); + } + + public static HiveDecimal create(String dec) { + BigDecimal bd; + try { + bd = new BigDecimal(dec.trim()); + } catch (NumberFormatException ex) { + return null; + } + + bd = normalize(bd, true); + return bd == null ? null : new HiveDecimal(bd); + } + + public static HiveDecimal create(BigInteger bi) { + BigDecimal bd = normalize(new BigDecimal(bi), true); + return bd == null ? null : new HiveDecimal(bd); + } + + public static HiveDecimal create(int i) { + return new HiveDecimal(new BigDecimal(i)); + } + + public static HiveDecimal create(long l) { + return new HiveDecimal(new BigDecimal(l)); + } + + @Override + public String toString() { + return bd.toPlainString(); + } + + /** + * Return a string representation of the number with the number of decimal digits as + * the given scale. Please note that this is different from toString(). + * @param scale the number of digits after the decimal point + * @return the string representation of exact number of decimal digits + */ + public String toFormatString(int scale) { + return (bd.scale() == scale ? bd : + bd.setScale(scale, RoundingMode.HALF_UP)).toPlainString(); + } + + public HiveDecimal setScale(int i) { + return new HiveDecimal(bd.setScale(i, RoundingMode.HALF_UP)); + } + + @Override + public int compareTo(HiveDecimal dec) { + return bd.compareTo(dec.bd); + } + + @Override + public int hashCode() { + return bd.hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (obj == null || obj.getClass() != getClass()) { + return false; + } + return bd.equals(((HiveDecimal) obj).bd); + } + + public int scale() { + return bd.scale(); + } + + /** + * Returns the number of digits (integer and fractional) in the number, which is equivalent + * to SQL decimal precision. Note that this is different from BigDecimal.precision(), + * which returns the precision of the unscaled value (BigDecimal.valueOf(0.01).precision() = 1, + * whereas HiveDecimal.create("0.01").precision() = 2). + * If you want the BigDecimal precision, use HiveDecimal.bigDecimalValue().precision() + * @return + */ + public int precision() { + int bdPrecision = bd.precision(); + int bdScale = bd.scale(); + + if (bdPrecision < bdScale) { + // This can happen for numbers less than 0.1 + // For 0.001234: bdPrecision=4, bdScale=6 + // In this case, we'll set the type to have the same precision as the scale. 
+ return bdScale; + } + return bdPrecision; + } + + public int intValue() { + return bd.intValue(); + } + + public double doubleValue() { + return bd.doubleValue(); + } + + public long longValue() { + return bd.longValue(); + } + + public short shortValue() { + return bd.shortValue(); + } + + public float floatValue() { + return bd.floatValue(); + } + + public BigDecimal bigDecimalValue() { + return bd; + } + + public byte byteValue() { + return bd.byteValue(); + } + + public HiveDecimal setScale(int adjustedScale, int rm) { + return create(bd.setScale(adjustedScale, rm)); + } + + public HiveDecimal subtract(HiveDecimal dec) { + return create(bd.subtract(dec.bd)); + } + + public HiveDecimal multiply(HiveDecimal dec) { + return create(bd.multiply(dec.bd), false); + } + + public BigInteger unscaledValue() { + return bd.unscaledValue(); + } + + public HiveDecimal scaleByPowerOfTen(int n) { + return create(bd.scaleByPowerOfTen(n)); + } + + public HiveDecimal abs() { + return create(bd.abs()); + } + + public HiveDecimal negate() { + return create(bd.negate()); + } + + public HiveDecimal add(HiveDecimal dec) { + return create(bd.add(dec.bd)); + } + + public HiveDecimal pow(int n) { + BigDecimal result = normalize(bd.pow(n), false); + return result == null ? null : new HiveDecimal(result); + } + + public HiveDecimal remainder(HiveDecimal dec) { + return create(bd.remainder(dec.bd)); + } + + public HiveDecimal divide(HiveDecimal dec) { + return create(bd.divide(dec.bd, MAX_SCALE, RoundingMode.HALF_UP), true); + } + + /** + * Get the sign of the underlying decimal. + * @return 0 if the decimal is equal to 0, -1 if less than zero, and 1 if greater than 0 + */ + public int signum() { + return bd.signum(); + } + + private static BigDecimal trim(BigDecimal d) { + if (d.compareTo(BigDecimal.ZERO) == 0) { + // Special case for 0, because java doesn't strip zeros correctly on that number. + d = BigDecimal.ZERO; + } else { + d = d.stripTrailingZeros(); + if (d.scale() < 0) { + // no negative scale decimals + d = d.setScale(0); + } + } + return d; + } + + private static BigDecimal normalize(BigDecimal bd, boolean allowRounding) { + if (bd == null) { + return null; + } + + bd = trim(bd); + + int intDigits = bd.precision() - bd.scale(); + + if (intDigits > MAX_PRECISION) { + return null; + } + + int maxScale = Math.min(MAX_SCALE, Math.min(MAX_PRECISION - intDigits, bd.scale())); + if (bd.scale() > maxScale ) { + if (allowRounding) { + bd = bd.setScale(maxScale, RoundingMode.HALF_UP); + // Trimming is again necessary, because rounding may introduce new trailing 0's. + bd = trim(bd); + } else { + bd = null; + } + } + + return bd; + } + + private static BigDecimal enforcePrecisionScale(BigDecimal bd, int maxPrecision, int maxScale) { + if (bd == null) { + return null; + } + + /** + * Specially handling the case that bd=0, and we are converting it to a type where precision=scale, + * such as decimal(1, 1). + */ + if (bd.compareTo(BigDecimal.ZERO) == 0 && bd.scale() == 0 && maxPrecision == maxScale) { + return bd.setScale(maxScale); + } + + bd = trim(bd); + + if (bd.scale() > maxScale) { + bd = bd.setScale(maxScale, RoundingMode.HALF_UP); + } + + int maxIntDigits = maxPrecision - maxScale; + int intDigits = bd.precision() - bd.scale(); + if (intDigits > maxIntDigits) { + return null; + } + + return bd; + } + + public static HiveDecimal enforcePrecisionScale(HiveDecimal dec, int maxPrecision, int maxScale) { + if (dec == null) { + return null; + } + + // Minor optimization, avoiding creating new objects. 
+ if (dec.precision() - dec.scale() <= maxPrecision - maxScale && + dec.scale() <= maxScale) { + return dec; + } + + BigDecimal bd = enforcePrecisionScale(dec.bd, maxPrecision, maxScale); + if (bd == null) { + return null; + } + + return HiveDecimal.create(bd); + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/type/HiveIntervalDayTime.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/type/HiveIntervalDayTime.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/type/HiveIntervalDayTime.java new file mode 100644 index 0000000..b891e27 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/type/HiveIntervalDayTime.java @@ -0,0 +1,253 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common.type; + +import java.math.BigDecimal; +import java.sql.Timestamp; +import java.util.Date; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.lang.builder.HashCodeBuilder; +import org.apache.hive.common.util.IntervalDayTimeUtils; + +import sun.util.calendar.BaseCalendar; + +/** + * Day-time interval type representing an offset in days/hours/minutes/seconds, + * with nanosecond precision. + * 1 day = 24 hours = 1440 minutes = 86400 seconds + */ +public class HiveIntervalDayTime implements Comparable<HiveIntervalDayTime> { + + // days/hours/minutes/seconds all represented as seconds + protected long totalSeconds; + protected int nanos; + + public HiveIntervalDayTime() { + } + + public HiveIntervalDayTime(int days, int hours, int minutes, int seconds, int nanos) { + set(days, hours, minutes, seconds, nanos); + } + + public HiveIntervalDayTime(long seconds, int nanos) { + set(seconds, nanos); + } + + public HiveIntervalDayTime(BigDecimal seconds) { + set(seconds); + } + + public HiveIntervalDayTime(HiveIntervalDayTime other) { + set(other.totalSeconds, other.nanos); + } + + public int getDays() { + return (int) TimeUnit.SECONDS.toDays(totalSeconds); + } + + public int getHours() { + return (int) (TimeUnit.SECONDS.toHours(totalSeconds) % TimeUnit.DAYS.toHours(1)); + } + + public int getMinutes() { + return (int) (TimeUnit.SECONDS.toMinutes(totalSeconds) % TimeUnit.HOURS.toMinutes(1)); + } + + public int getSeconds() { + return (int) (totalSeconds % TimeUnit.MINUTES.toSeconds(1)); + } + + public int getNanos() { + return nanos; + } + + /** + * Returns days/hours/minutes all converted into seconds. 
+   * Nanos still need to be retrieved using getNanos()
+   * @return days/hours/minutes of the interval, folded into a total number of seconds
+   */
+  public long getTotalSeconds() {
+    return totalSeconds;
+  }
+
+  /**
+   * @return double representation of the interval day time, accurate to nanoseconds
+   */
+  public double getDouble() {
+    // Floating-point division keeps the fractional (nanosecond) part instead of truncating it.
+    return totalSeconds + nanos / 1000000000.0;
+  }
+
+  /**
+   * Ensures that the seconds and nanoseconds fields have a consistent sign.
+   */
+  protected void normalizeSecondsAndNanos() {
+    if (totalSeconds > 0 && nanos < 0) {
+      --totalSeconds;
+      nanos += IntervalDayTimeUtils.NANOS_PER_SEC;
+    } else if (totalSeconds < 0 && nanos > 0) {
+      ++totalSeconds;
+      nanos -= IntervalDayTimeUtils.NANOS_PER_SEC;
+    }
+  }
+
+  public void set(int days, int hours, int minutes, int seconds, int nanos) {
+    long totalSeconds = seconds;
+    totalSeconds += TimeUnit.DAYS.toSeconds(days);
+    totalSeconds += TimeUnit.HOURS.toSeconds(hours);
+    totalSeconds += TimeUnit.MINUTES.toSeconds(minutes);
+    totalSeconds += TimeUnit.NANOSECONDS.toSeconds(nanos);
+    nanos = nanos % IntervalDayTimeUtils.NANOS_PER_SEC;
+
+    this.totalSeconds = totalSeconds;
+    this.nanos = nanos;
+
+    normalizeSecondsAndNanos();
+  }
+
+  public void set(long seconds, int nanos) {
+    this.totalSeconds = seconds;
+    this.nanos = nanos;
+    normalizeSecondsAndNanos();
+  }
+
+  public void set(BigDecimal totalSecondsBd) {
+    long totalSeconds = totalSecondsBd.longValue();
+    BigDecimal fractionalSecs = totalSecondsBd.remainder(BigDecimal.ONE);
+    int nanos = fractionalSecs.multiply(IntervalDayTimeUtils.NANOS_PER_SEC_BD).intValue();
+    set(totalSeconds, nanos);
+  }
+
+  public void set(HiveIntervalDayTime other) {
+    set(other.getTotalSeconds(), other.getNanos());
+  }
+
+  public HiveIntervalDayTime negate() {
+    return new HiveIntervalDayTime(-getTotalSeconds(), -getNanos());
+  }
+
+  @Override
+  public int compareTo(HiveIntervalDayTime other) {
+    long cmp = this.totalSeconds - other.totalSeconds;
+    if (cmp == 0) {
+      cmp = this.nanos - other.nanos;
+    }
+    if (cmp != 0) {
+      cmp = cmp > 0 ? 1 : -1;
+    }
+    return (int) cmp;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (!(obj instanceof HiveIntervalDayTime)) {
+      return false;
+    }
+    return 0 == compareTo((HiveIntervalDayTime) obj);
+  }
+
+  /**
+   * Return a copy of this object.
+   */
+  public Object clone() {
+    return new HiveIntervalDayTime(totalSeconds, nanos);
+  }
+
+  @Override
+  public int hashCode() {
+    return new HashCodeBuilder().append(totalSeconds).append(nanos).toHashCode();
+  }
+
+  @Override
+  public String toString() {
+    // If normalize() was used, then day-hour-minute-second-nanos should have the same sign.
+    // This is currently working with that assumption.
+    boolean isNegative = (totalSeconds < 0 || nanos < 0);
+    String daySecondSignStr = isNegative ?
"-" : ""; + + return String.format("%s%d %02d:%02d:%02d.%09d", + daySecondSignStr, Math.abs(getDays()), + Math.abs(getHours()), Math.abs(getMinutes()), + Math.abs(getSeconds()), Math.abs(getNanos())); + } + + public static HiveIntervalDayTime valueOf(String strVal) { + HiveIntervalDayTime result = null; + if (strVal == null) { + throw new IllegalArgumentException("Interval day-time string was null"); + } + Matcher patternMatcher = PATTERN_MATCHER.get(); + patternMatcher.reset(strVal); + if (patternMatcher.matches()) { + // Parse out the individual parts + try { + // Sign - whether interval is positive or negative + int sign = 1; + String field = patternMatcher.group(1); + if (field != null && field.equals("-")) { + sign = -1; + } + int days = sign * + IntervalDayTimeUtils.parseNumericValueWithRange("day", patternMatcher.group(2), + 0, Integer.MAX_VALUE); + byte hours = (byte) (sign * + IntervalDayTimeUtils.parseNumericValueWithRange("hour", patternMatcher.group(3), 0, 23)); + byte minutes = (byte) (sign * + IntervalDayTimeUtils.parseNumericValueWithRange("minute", patternMatcher.group(4), 0, 59)); + int seconds = 0; + int nanos = 0; + field = patternMatcher.group(5); + if (field != null) { + BigDecimal bdSeconds = new BigDecimal(field); + if (bdSeconds.compareTo(IntervalDayTimeUtils.MAX_INT_BD) > 0) { + throw new IllegalArgumentException("seconds value of " + bdSeconds + " too large"); + } + seconds = sign * bdSeconds.intValue(); + nanos = sign * bdSeconds.subtract(new BigDecimal(bdSeconds.toBigInteger())) + .multiply(IntervalDayTimeUtils.NANOS_PER_SEC_BD).intValue(); + } + + result = new HiveIntervalDayTime(days, hours, minutes, seconds, nanos); + } catch (Exception err) { + throw new IllegalArgumentException("Error parsing interval day-time string: " + strVal, err); + } + } else { + throw new IllegalArgumentException( + "Interval string does not match day-time format of 'd h:m:s.n': " + strVal); + } + + return result; + } + + // Simple pattern: D H:M:S.nnnnnnnnn + private final static String PARSE_PATTERN = + "([+|-])?(\\d+) (\\d+):(\\d+):((\\d+)(\\.(\\d+))?)"; + + private static final ThreadLocal<Matcher> PATTERN_MATCHER = new ThreadLocal<Matcher>() { + @Override + protected Matcher initialValue() { + return Pattern.compile(PARSE_PATTERN).matcher(""); + } + }; +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/type/RandomTypeUtil.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/type/RandomTypeUtil.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/type/RandomTypeUtil.java new file mode 100644 index 0000000..3fb0cfd --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/type/RandomTypeUtil.java @@ -0,0 +1,115 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common.type; + +import java.sql.Timestamp; +import java.text.DateFormat; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Random; +import java.util.concurrent.TimeUnit; + +public class RandomTypeUtil { + + public static final long NANOSECONDS_PER_SECOND = TimeUnit.SECONDS.toNanos(1); + public static final long MILLISECONDS_PER_SECOND = TimeUnit.SECONDS.toMillis(1); + public static final long NANOSECONDS_PER_MILLISSECOND = TimeUnit.MILLISECONDS.toNanos(1); + + private static ThreadLocal<DateFormat> DATE_FORMAT = + new ThreadLocal<DateFormat>() { + @Override + protected DateFormat initialValue() { + return new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + } + }; + + // We've switched to Joda/Java Calendar which has a more limited time range.... + public static int MIN_YEAR = 1900; + public static int MAX_YEAR = 3000; + private static long MIN_FOUR_DIGIT_YEAR_MILLIS = parseToMillis("1900-01-01 00:00:00"); + private static long MAX_FOUR_DIGIT_YEAR_MILLIS = parseToMillis("3000-01-01 00:00:00"); + + private static long parseToMillis(String s) { + try { + return DATE_FORMAT.get().parse(s).getTime(); + } catch (ParseException ex) { + throw new RuntimeException(ex); + } + } + + public static Timestamp getRandTimestamp(Random r) { + return getRandTimestamp(r, MIN_YEAR, MAX_YEAR); + } + + public static Timestamp getRandTimestamp(Random r, int minYear, int maxYear) { + String optionalNanos = ""; + switch (r.nextInt(4)) { + case 0: + // No nanos. + break; + case 1: + optionalNanos = String.format(".%09d", + Integer.valueOf(r.nextInt((int) NANOSECONDS_PER_SECOND))); + break; + case 2: + // Limit to milliseconds only... + optionalNanos = String.format(".%09d", + Integer.valueOf(r.nextInt((int) MILLISECONDS_PER_SECOND)) * NANOSECONDS_PER_MILLISSECOND); + break; + case 3: + // Limit to below milliseconds only... + optionalNanos = String.format(".%09d", + Integer.valueOf(r.nextInt((int) NANOSECONDS_PER_MILLISSECOND))); + break; + } + String timestampStr = String.format("%04d-%02d-%02d %02d:%02d:%02d%s", + Integer.valueOf(minYear + r.nextInt(maxYear - minYear + 1)), // year + Integer.valueOf(1 + r.nextInt(12)), // month + Integer.valueOf(1 + r.nextInt(28)), // day + Integer.valueOf(0 + r.nextInt(24)), // hour + Integer.valueOf(0 + r.nextInt(60)), // minute + Integer.valueOf(0 + r.nextInt(60)), // second + optionalNanos); + Timestamp timestampVal; + try { + timestampVal = Timestamp.valueOf(timestampStr); + } catch (Exception e) { + System.err.println("Timestamp string " + timestampStr + " did not parse"); + throw e; + } + return timestampVal; + } + + public static long randomMillis(long minMillis, long maxMillis, Random rand) { + return minMillis + (long) ((maxMillis - minMillis) * rand.nextDouble()); + } + + public static long randomMillis(Random rand) { + return randomMillis(MIN_FOUR_DIGIT_YEAR_MILLIS, MAX_FOUR_DIGIT_YEAR_MILLIS, rand); + } + + public static int randomNanos(Random rand, int decimalDigits) { + // Only keep the most significant decimalDigits digits. 
+ int nanos = rand.nextInt((int) NANOSECONDS_PER_SECOND); + return nanos - nanos % (int) Math.pow(10, 9 - decimalDigits); + } + + public static int randomNanos(Random rand) { + return randomNanos(rand, 9); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java b/java/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java new file mode 100644 index 0000000..a6d932c --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java @@ -0,0 +1,389 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + + +/** + * This class supports string and binary data by value reference -- i.e. each field is + * explicitly present, as opposed to provided by a dictionary reference. + * In some cases, all the values will be in the same byte array to begin with, + * but this need not be the case. If each value is in a separate byte + * array to start with, or not all of the values are in the same original + * byte array, you can still assign data by reference into this column vector. + * This gives flexibility to use this in multiple situations. + * <p> + * When setting data by reference, the caller + * is responsible for allocating the byte arrays used to hold the data. + * You can also set data by value, as long as you call the initBuffer() method first. + * You can mix "by value" and "by reference" in the same column vector, + * though that use is probably not typical. + */ +public class BytesColumnVector extends ColumnVector { + public byte[][] vector; + public int[] start; // start offset of each field + + /* + * The length of each field. If the value repeats for every entry, then it is stored + * in vector[0] and isRepeating from the superclass is set to true. + */ + public int[] length; + private byte[] buffer; // optional buffer to use when actually copying in data + private int nextFree; // next free position in buffer + + // Estimate that there will be 16 bytes per entry + static final int DEFAULT_BUFFER_SIZE = 16 * VectorizedRowBatch.DEFAULT_SIZE; + + // Proportion of extra space to provide when allocating more buffer space. + static final float EXTRA_SPACE_FACTOR = (float) 1.2; + + /** + * Use this constructor for normal operation. + * All column vectors should be the default size normally. + */ + public BytesColumnVector() { + this(VectorizedRowBatch.DEFAULT_SIZE); + } + + /** + * Don't call this constructor except for testing purposes. 
+ * + * @param size number of elements in the column vector + */ + public BytesColumnVector(int size) { + super(size); + vector = new byte[size][]; + start = new int[size]; + length = new int[size]; + } + + /** + * Additional reset work for BytesColumnVector (releasing scratch bytes for by value strings). + */ + @Override + public void reset() { + super.reset(); + initBuffer(0); + } + + /** Set a field by reference. + * + * @param elementNum index within column vector to set + * @param sourceBuf container of source data + * @param start start byte position within source + * @param length length of source byte sequence + */ + public void setRef(int elementNum, byte[] sourceBuf, int start, int length) { + vector[elementNum] = sourceBuf; + this.start[elementNum] = start; + this.length[elementNum] = length; + } + + /** + * You must call initBuffer first before using setVal(). + * Provide the estimated number of bytes needed to hold + * a full column vector worth of byte string data. + * + * @param estimatedValueSize Estimated size of buffer space needed + */ + public void initBuffer(int estimatedValueSize) { + nextFree = 0; + + // if buffer is already allocated, keep using it, don't re-allocate + if (buffer != null) { + return; + } + + // allocate a little extra space to limit need to re-allocate + int bufferSize = this.vector.length * (int)(estimatedValueSize * EXTRA_SPACE_FACTOR); + if (bufferSize < DEFAULT_BUFFER_SIZE) { + bufferSize = DEFAULT_BUFFER_SIZE; + } + buffer = new byte[bufferSize]; + } + + /** + * Initialize buffer to default size. + */ + public void initBuffer() { + initBuffer(0); + } + + /** + * @return amount of buffer space currently allocated + */ + public int bufferSize() { + if (buffer == null) { + return 0; + } + return buffer.length; + } + + /** + * Set a field by actually copying in to a local buffer. + * If you must actually copy data in to the array, use this method. + * DO NOT USE this method unless it's not practical to set data by reference with setRef(). + * Setting data by reference tends to run a lot faster than copying data in. + * + * @param elementNum index within column vector to set + * @param sourceBuf container of source data + * @param start start byte position within source + * @param length length of source byte sequence + */ + public void setVal(int elementNum, byte[] sourceBuf, int start, int length) { + if ((nextFree + length) > buffer.length) { + increaseBufferSpace(length); + } + System.arraycopy(sourceBuf, start, buffer, nextFree, length); + vector[elementNum] = buffer; + this.start[elementNum] = nextFree; + this.length[elementNum] = length; + nextFree += length; + } + + /** + * Set a field by actually copying in to a local buffer. + * If you must actually copy data in to the array, use this method. + * DO NOT USE this method unless it's not practical to set data by reference with setRef(). + * Setting data by reference tends to run a lot faster than copying data in. + * + * @param elementNum index within column vector to set + * @param sourceBuf container of source data + */ + public void setVal(int elementNum, byte[] sourceBuf) { + setVal(elementNum, sourceBuf, 0, sourceBuf.length); + } + + /** + * Set a field to the concatenation of two string values. Result data is copied + * into the internal buffer. 
+   *
+   * @param elementNum index within column vector to set
+   * @param leftSourceBuf container of left argument
+   * @param leftStart start of left argument
+   * @param leftLen length of left argument
+   * @param rightSourceBuf container of right argument
+   * @param rightStart start of right argument
+   * @param rightLen length of right argument
+   */
+  public void setConcat(int elementNum, byte[] leftSourceBuf, int leftStart, int leftLen,
+      byte[] rightSourceBuf, int rightStart, int rightLen) {
+    int newLen = leftLen + rightLen;
+    if ((nextFree + newLen) > buffer.length) {
+      increaseBufferSpace(newLen);
+    }
+    vector[elementNum] = buffer;
+    this.start[elementNum] = nextFree;
+    this.length[elementNum] = newLen;
+
+    System.arraycopy(leftSourceBuf, leftStart, buffer, nextFree, leftLen);
+    nextFree += leftLen;
+    System.arraycopy(rightSourceBuf, rightStart, buffer, nextFree, rightLen);
+    nextFree += rightLen;
+  }
+
+  /**
+   * Increase buffer space enough to accommodate the next element.
+   * This uses an exponential growth mechanism so that the buffer quickly
+   * becomes large enough to hold all the data.
+   * As batches get re-loaded, the allocated buffer space will quickly stabilize.
+   *
+   * @param nextElemLength size of the next element to be added
+   */
+  public void increaseBufferSpace(int nextElemLength) {
+
+    // Keep doubling buffer size until there will be enough space for next element.
+    int newLength = 2 * buffer.length;
+    while ((nextFree + nextElemLength) > newLength) {
+      newLength *= 2;
+    }
+
+    // Allocate new buffer, copy data to it, and set buffer to new buffer.
+    byte[] newBuffer = new byte[newLength];
+    System.arraycopy(buffer, 0, newBuffer, 0, nextFree);
+    buffer = newBuffer;
+  }
+
+  /** Copy the current object contents into the output. Only copy selected entries,
+   * as indicated by selectedInUse and the sel array.
+   */
+  public void copySelected(
+      boolean selectedInUse, int[] sel, int size, BytesColumnVector output) {
+
+    // Output has nulls if and only if input has nulls.
+    output.noNulls = noNulls;
+    output.isRepeating = false;
+
+    // Handle repeating case
+    if (isRepeating) {
+      output.setVal(0, vector[0], start[0], length[0]);
+      output.isNull[0] = isNull[0];
+      output.isRepeating = true;
+      return;
+    }
+
+    // Handle normal case
+
+    // Copy data values over
+    if (selectedInUse) {
+      for (int j = 0; j < size; j++) {
+        int i = sel[j];
+        output.setVal(i, vector[i], start[i], length[i]);
+      }
+    }
+    else {
+      for (int i = 0; i < size; i++) {
+        output.setVal(i, vector[i], start[i], length[i]);
+      }
+    }
+
+    // Copy nulls over if needed
+    if (!noNulls) {
+      if (selectedInUse) {
+        for (int j = 0; j < size; j++) {
+          int i = sel[j];
+          output.isNull[i] = isNull[i];
+        }
+      }
+      else {
+        System.arraycopy(isNull, 0, output.isNull, 0, size);
+      }
+    }
+  }
+
+  /** Simplify vector by brute-force flattening noNulls and isRepeating.
+   * This can be used to reduce combinatorial explosion of code paths in VectorExpressions
+   * with many arguments, at the expense of loss of some performance.
+   */
+  public void flatten(boolean selectedInUse, int[] sel, int size) {
+    flattenPush();
+    if (isRepeating) {
+      isRepeating = false;
+
+      // setRef is used below and this is safe, because the reference
+      // is to data owned by this column vector. If this column vector
+      // gets re-used, the whole thing is re-used together so there
+      // is no danger of a dangling reference.
+
+      // Only copy data values if entry is not null. The string value
+      // at position 0 is undefined if the position 0 value is null.
+      if (noNulls || !isNull[0]) {
+
+        // loops start at position 1 because position 0 is already set
+        if (selectedInUse) {
+          for (int j = 1; j < size; j++) {
+            int i = sel[j];
+            this.setRef(i, vector[0], start[0], length[0]);
+          }
+        } else {
+          for (int i = 1; i < size; i++) {
+            this.setRef(i, vector[0], start[0], length[0]);
+          }
+        }
+      }
+      flattenRepeatingNulls(selectedInUse, sel, size);
+    }
+    flattenNoNulls(selectedInUse, sel, size);
+  }
+
+  // Fill all the vector entries with the provided value
+  public void fill(byte[] value) {
+    noNulls = true;
+    isRepeating = true;
+    setRef(0, value, 0, value.length);
+  }
+
+  // Fill the column vector with nulls
+  public void fillWithNulls() {
+    noNulls = false;
+    isRepeating = true;
+    vector[0] = null;
+    isNull[0] = true;
+  }
+
+  @Override
+  public void setElement(int outElementNum, int inputElementNum, ColumnVector inputVector) {
+    if (inputVector.isRepeating) {
+      inputElementNum = 0;
+    }
+    if (inputVector.noNulls || !inputVector.isNull[inputElementNum]) {
+      isNull[outElementNum] = false;
+      BytesColumnVector in = (BytesColumnVector) inputVector;
+      setVal(outElementNum, in.vector[inputElementNum],
+          in.start[inputElementNum], in.length[inputElementNum]);
+    } else {
+      isNull[outElementNum] = true;
+      noNulls = false;
+    }
+  }
+
+  @Override
+  public void init() {
+    initBuffer(0);
+  }
+
+  public String toString(int row) {
+    if (isRepeating) {
+      row = 0;
+    }
+    if (noNulls || !isNull[row]) {
+      return new String(vector[row], start[row], length[row]);
+    } else {
+      return null;
+    }
+  }
+
+  @Override
+  public void stringifyValue(StringBuilder buffer, int row) {
+    if (isRepeating) {
+      row = 0;
+    }
+    if (noNulls || !isNull[row]) {
+      buffer.append('"');
+      // Read from vector[row], which is correct whether the value was set by
+      // reference or copied into the local scratch buffer.
+      buffer.append(new String(vector[row], start[row], length[row]));
+      buffer.append('"');
+    } else {
+      buffer.append("null");
+    }
+  }
+
+  @Override
+  public void ensureSize(int size, boolean preserveData) {
+    super.ensureSize(size, preserveData);
+    if (size > vector.length) {
+      int[] oldStart = start;
+      start = new int[size];
+      int[] oldLength = length;
+      length = new int[size];
+      byte[][] oldVector = vector;
+      vector = new byte[size][];
+      if (preserveData) {
+        if (isRepeating) {
+          vector[0] = oldVector[0];
+          start[0] = oldStart[0];
+          length[0] = oldLength[0];
+        } else {
+          System.arraycopy(oldVector, 0, vector, 0, oldVector.length);
+          System.arraycopy(oldStart, 0, start, 0, oldStart.length);
+          System.arraycopy(oldLength, 0, length, 0, oldLength.length);
+        }
+      }
+    }
+  }
+}
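
Reviewer note: the following is a minimal, hypothetical sketch (not part of this patch) of the two population styles described in the BytesColumnVector class comment above: by reference with setRef(), and by value with initBuffer()/setVal()/setConcat(). The class and variable names are invented for illustration; only the storage-api classes touched by this commit are assumed.

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;

public class BytesColumnVectorSketch {
  public static void main(String[] args) {
    BytesColumnVector col = new BytesColumnVector(4);

    // By reference: the caller keeps ownership of the byte[] and no copy is made.
    byte[] shared = "hello world".getBytes(StandardCharsets.UTF_8);
    col.setRef(0, shared, 0, 5);   // "hello"
    col.setRef(1, shared, 6, 5);   // "world"

    // By value: initBuffer() must be called before setVal()/setConcat();
    // the bytes are copied into the vector's internal scratch buffer.
    col.initBuffer();
    col.setVal(2, "copied".getBytes(StandardCharsets.UTF_8));
    col.setConcat(3, shared, 0, 5, shared, 5, 6);   // "hello world"

    for (int i = 0; i < 4; ++i) {
      System.out.println(i + ": " + col.toString(i));
    }
  }
}

As the javadoc in the patch notes, setRef() avoids copying and is the faster path, while setVal()/setConcat() copy into a shared scratch buffer, which is why initBuffer() has to run first.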
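Similarly, a small hypothetical sanity check (also not part of this patch) for the HiveIntervalDayTime class added above, showing that valueOf() accepts the same "d h:m:s.nnnnnnnnn" layout documented next to PARSE_PATTERN and produced by toString(); the example class name is invented.

import org.apache.hadoop.hive.common.type.HiveIntervalDayTime;

public class IntervalDayTimeSketch {
  public static void main(String[] args) {
    // 2 days, 3 hours, 4 minutes, 5 seconds and 6 nanoseconds.
    HiveIntervalDayTime interval = new HiveIntervalDayTime(2, 3, 4, 5, 6);
    System.out.println(interval);   // 2 03:04:05.000000006

    // valueOf() accepts the same "d h:m:s.n" layout that toString() produces.
    HiveIntervalDayTime parsed = HiveIntervalDayTime.valueOf("-2 03:04:05.000000006");
    System.out.println(parsed.equals(interval.negate()));   // true
  }
}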
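Finally, a hypothetical illustration (again, not in this patch) of how HiveDecimal.enforcePrecisionScale() behaves when a value does or does not fit the target precision and scale. It assumes HiveDecimal.create(String) and the default toString(), which belong to the same class but fall outside the excerpt shown above.

import org.apache.hadoop.hive.common.type.HiveDecimal;

public class EnforcePrecisionScaleSketch {
  public static void main(String[] args) {
    HiveDecimal d = HiveDecimal.create("123.456");   // precision 6, scale 3

    // Fits decimal(7,3) as-is, so the same value comes back unchanged.
    System.out.println(HiveDecimal.enforcePrecisionScale(d, 7, 3));   // 123.456

    // decimal(5,2): the scale is reduced with HALF_UP rounding.
    System.out.println(HiveDecimal.enforcePrecisionScale(d, 5, 2));   // 123.46

    // decimal(4,2): too many integer digits, so the result is null.
    System.out.println(HiveDecimal.enforcePrecisionScale(d, 4, 2));   // null
  }
}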
