http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/mapreduce/src/test/org/apache/orc/mapred/other/TestOrcOutputFormat.java ---------------------------------------------------------------------- diff --git a/java/mapreduce/src/test/org/apache/orc/mapred/other/TestOrcOutputFormat.java b/java/mapreduce/src/test/org/apache/orc/mapred/other/TestOrcOutputFormat.java new file mode 100644 index 0000000..ce5523f --- /dev/null +++ b/java/mapreduce/src/test/org/apache/orc/mapred/other/TestOrcOutputFormat.java @@ -0,0 +1,249 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.mapred.other; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.ByteWritable; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.ShortWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobContext; +import org.apache.hadoop.mapred.OutputCommitter; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.RecordWriter; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.TaskAttemptContext; +import org.apache.orc.CompressionKind; +import org.apache.orc.OrcConf; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.TypeDescription; +import org.apache.orc.mapred.OrcInputFormat; +import org.apache.orc.mapred.OrcList; +import org.apache.orc.mapred.OrcMap; +import org.apache.orc.mapred.OrcOutputFormat; +import org.apache.orc.mapred.OrcStruct; +import org.apache.orc.mapred.OrcTimestamp; +import org.apache.orc.mapred.OrcUnion; +import org.junit.Test; + +import java.io.File; +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.List; + +import static org.junit.Assert.assertEquals; + +public class TestOrcOutputFormat { + + Path workDir = new Path(System.getProperty("test.tmp.dir", + "target" + File.separator + "test" + File.separator + "tmp")); + JobConf conf = new JobConf(); + FileSystem fs; + + { + try { + fs = FileSystem.getLocal(conf).getRaw(); + fs.delete(workDir, true); + fs.mkdirs(workDir); + } catch (IOException e) { + throw new IllegalStateException("bad fs init", e); + } + } + + static class NullOutputCommitter extends 
OutputCommitter { + + @Override + public void setupJob(JobContext jobContext) { + // PASS + } + + @Override + public void setupTask(TaskAttemptContext taskAttemptContext) { + + } + + @Override + public boolean needsTaskCommit(TaskAttemptContext taskAttemptContext) { + return false; + } + + @Override + public void commitTask(TaskAttemptContext taskAttemptContext) { + // PASS + } + + @Override + public void abortTask(TaskAttemptContext taskAttemptContext) { + // PASS + } + } + + @Test + public void testAllTypes() throws Exception { + conf.set("mapreduce.task.attempt.id", "attempt_20160101_0001_m_000001_0"); + conf.setOutputCommitter(NullOutputCommitter.class); + final String typeStr = "struct<b1:binary,b2:boolean,b3:tinyint," + + "c:char(10),d1:date,d2:decimal(20,5),d3:double,fff:float,int:int," + + "l:array<bigint>,map:map<smallint,string>," + + "str:struct<u:uniontype<timestamp,varchar(100)>>,ts:timestamp>"; + conf.set(OrcConf.SCHEMA.getAttribute(), typeStr); + FileOutputFormat.setOutputPath(conf, workDir); + TypeDescription type = TypeDescription.fromString(typeStr); + + // build a row object + OrcStruct row = (OrcStruct) OrcStruct.createValue(type); + ((BytesWritable) row.getFieldValue(0)).set(new byte[]{1,2,3,4}, 0, 4); + ((BooleanWritable) row.getFieldValue(1)).set(true); + ((ByteWritable) row.getFieldValue(2)).set((byte) 23); + ((Text) row.getFieldValue(3)).set("aaabbbcccddd"); + SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd"); + ((DateWritable) row.getFieldValue(4)).set(DateWritable.millisToDays + (format.parse("2016-04-01").getTime())); + ((HiveDecimalWritable) row.getFieldValue(5)).set(new HiveDecimalWritable("1.23")); + ((DoubleWritable) row.getFieldValue(6)).set(1.5); + ((FloatWritable) row.getFieldValue(7)).set(4.5f); + ((IntWritable) row.getFieldValue(8)).set(31415); + OrcList<LongWritable> longList = (OrcList<LongWritable>) row.getFieldValue(9); + longList.add(new LongWritable(123)); + longList.add(new LongWritable(456)); + OrcMap<ShortWritable,Text> map = (OrcMap<ShortWritable,Text>) row.getFieldValue(10); + map.put(new ShortWritable((short) 1000), new Text("aaaa")); + map.put(new ShortWritable((short) 123), new Text("bbbb")); + OrcStruct struct = (OrcStruct) row.getFieldValue(11); + OrcUnion union = (OrcUnion) struct.getFieldValue(0); + union.set((byte) 1, new Text("abcde")); + ((OrcTimestamp) row.getFieldValue(12)).set("1996-12-11 15:00:00"); + NullWritable nada = NullWritable.get(); + RecordWriter<NullWritable, OrcStruct> writer = + new OrcOutputFormat<OrcStruct>().getRecordWriter(fs, conf, "all.orc", + Reporter.NULL); + for(int r=0; r < 10; ++r) { + row.setFieldValue(8, new IntWritable(r * 10)); + writer.write(nada, row); + } + union.set((byte) 0, new OrcTimestamp("2011-12-25 12:34:56")); + for(int r=0; r < 10; ++r) { + row.setFieldValue(8, new IntWritable(r * 10 + 100)); + writer.write(nada, row); + } + OrcStruct row2 = new OrcStruct(type); + writer.write(nada, row2); + row.setFieldValue(8, new IntWritable(210)); + writer.write(nada, row); + writer.close(Reporter.NULL); + + FileSplit split = new FileSplit(new Path(workDir, "all.orc"), 0, 100000, + new String[0]); + RecordReader<NullWritable, OrcStruct> reader = + new OrcInputFormat<OrcStruct>().getRecordReader(split, conf, + Reporter.NULL); + nada = reader.createKey(); + row = reader.createValue(); + for(int r=0; r < 22; ++r) { + assertEquals(true, reader.next(nada, row)); + if (r == 20) { + for(int c=0; c < 12; ++c) { + assertEquals(null, row.getFieldValue(c)); + } + } else { + assertEquals(new 
BytesWritable(new byte[]{1, 2, 3, 4}), row.getFieldValue(0)); + assertEquals(new BooleanWritable(true), row.getFieldValue(1)); + assertEquals(new ByteWritable((byte) 23), row.getFieldValue(2)); + assertEquals(new Text("aaabbbcccd"), row.getFieldValue(3)); + assertEquals(new DateWritable(DateWritable.millisToDays + (format.parse("2016-04-01").getTime())), row.getFieldValue(4)); + assertEquals(new HiveDecimalWritable("1.23"), row.getFieldValue(5)); + assertEquals(new DoubleWritable(1.5), row.getFieldValue(6)); + assertEquals(new FloatWritable(4.5f), row.getFieldValue(7)); + assertEquals(new IntWritable(r * 10), row.getFieldValue(8)); + assertEquals(longList, row.getFieldValue(9)); + assertEquals(map, row.getFieldValue(10)); + if (r < 10) { + union.set((byte) 1, new Text("abcde")); + } else { + union.set((byte) 0, new OrcTimestamp("2011-12-25 12:34:56")); + } + assertEquals("row " + r, struct, row.getFieldValue(11)); + assertEquals("row " + r, new OrcTimestamp("1996-12-11 15:00:00"), + row.getFieldValue(12)); + } + } + assertEquals(false, reader.next(nada, row)); + } + + /** + * Test the case where the top level isn't a struct, but a long. + */ + @Test + public void testLongRoot() throws Exception { + conf.set("mapreduce.task.attempt.id", "attempt_20160101_0001_m_000001_0"); + conf.setOutputCommitter(NullOutputCommitter.class); + conf.set(OrcConf.COMPRESS.getAttribute(), "SNAPPY"); + conf.setInt(OrcConf.ROW_INDEX_STRIDE.getAttribute(), 1000); + conf.setInt(OrcConf.BUFFER_SIZE.getAttribute(), 64 * 1024); + conf.set(OrcConf.WRITE_FORMAT.getAttribute(), "0.11"); + final String typeStr = "bigint"; + conf.set(OrcConf.SCHEMA.getAttribute(), typeStr); + FileOutputFormat.setOutputPath(conf, workDir); + TypeDescription type = TypeDescription.fromString(typeStr); + LongWritable value = new LongWritable(); + NullWritable nada = NullWritable.get(); + RecordWriter<NullWritable, LongWritable> writer = + new OrcOutputFormat<LongWritable>().getRecordWriter(fs, conf, + "long.orc", Reporter.NULL); + for(long lo=0; lo < 2000; ++lo) { + value.set(lo); + writer.write(nada, value); + } + writer.close(Reporter.NULL); + + Path path = new Path(workDir, "long.orc"); + Reader file = OrcFile.createReader(path, OrcFile.readerOptions(conf)); + assertEquals(CompressionKind.SNAPPY, file.getCompressionKind()); + assertEquals(2000, file.getNumberOfRows()); + assertEquals(1000, file.getRowIndexStride()); + assertEquals(64 * 1024, file.getCompressionSize()); + assertEquals(OrcFile.Version.V_0_11, file.getFileVersion()); + FileSplit split = new FileSplit(path, 0, 100000, + new String[0]); + RecordReader<NullWritable, LongWritable> reader = + new OrcInputFormat<LongWritable>().getRecordReader(split, conf, + Reporter.NULL); + nada = reader.createKey(); + value = reader.createValue(); + for(long lo=0; lo < 2000; ++lo) { + assertEquals(true, reader.next(nada, value)); + assertEquals(lo, value.get()); + } + assertEquals(false, reader.next(nada, value)); + } +}
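[Editor's note, not part of the commit] For readers unfamiliar with the new mapred bindings exercised by the test above, the following is a minimal, self-contained sketch of the same write/read round trip. The schema (struct<id:int,name:string>), output directory, and file name are illustrative assumptions; the OrcOutputFormat/OrcInputFormat calls mirror those used in TestOrcOutputFormat.

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.orc.OrcConf;
import org.apache.orc.TypeDescription;
import org.apache.orc.mapred.OrcInputFormat;
import org.apache.orc.mapred.OrcOutputFormat;
import org.apache.orc.mapred.OrcStruct;

public class OrcMapredExample {
  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf();
    FileSystem fs = FileSystem.getLocal(conf).getRaw();
    Path dir = new Path("target/orc-example");   // illustrative output directory

    // The writer takes its schema from OrcConf.SCHEMA and its directory from the
    // mapred output path, exactly as in the test above.
    String schema = "struct<id:int,name:string>";
    conf.set(OrcConf.SCHEMA.getAttribute(), schema);
    FileOutputFormat.setOutputPath(conf, dir);

    // Outside a real MapReduce task the test also sets a task attempt id and a
    // no-op OutputCommitter (see NullOutputCommitter above); the same setup is
    // likely needed when running this standalone.
    conf.set("mapreduce.task.attempt.id", "attempt_20160101_0001_m_000001_0");

    TypeDescription type = TypeDescription.fromString(schema);
    OrcStruct row = (OrcStruct) OrcStruct.createValue(type);
    NullWritable nada = NullWritable.get();

    RecordWriter<NullWritable, OrcStruct> writer =
        new OrcOutputFormat<OrcStruct>().getRecordWriter(fs, conf, "example.orc",
            Reporter.NULL);
    for (int i = 0; i < 3; ++i) {
      row.setFieldValue(0, new IntWritable(i));
      row.setFieldValue(1, new Text("row-" + i));
      writer.write(nada, row);                   // key is ignored; NullWritable
    }
    writer.close(Reporter.NULL);

    // Read the file back through a FileSplit covering the whole file.
    FileSplit split = new FileSplit(new Path(dir, "example.orc"), 0, Long.MAX_VALUE,
        new String[0]);
    RecordReader<NullWritable, OrcStruct> reader =
        new OrcInputFormat<OrcStruct>().getRecordReader(split, conf, Reporter.NULL);
    NullWritable key = reader.createKey();
    OrcStruct value = reader.createValue();
    while (reader.next(key, value)) {
      System.out.println(value.getFieldValue(0) + " " + value.getFieldValue(1));
    }
    reader.close();
  }
}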
http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/pom.xml ---------------------------------------------------------------------- diff --git a/java/pom.xml b/java/pom.xml new file mode 100644 index 0000000..7ce1161 --- /dev/null +++ b/java/pom.xml @@ -0,0 +1,110 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache</groupId> + <artifactId>apache</artifactId> + <version>14</version> + </parent> + <groupId>org.apache.orc</groupId> + <artifactId>orc</artifactId> + <version>1.1.0-SNAPSHOT</version> + <packaging>pom</packaging> + + <name>ORC</name> + <url>http://orc.apache.org</url> + <prerequisites> + <maven>2.2.1</maven> + </prerequisites> + + <modules> + <module>storage-api</module> + <module>core</module> + <module>mapreduce</module> + </modules> + + <properties> + <!-- Build Properties --> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + <maven.compiler.useIncrementalCompilation>false</maven.compiler.useIncrementalCompilation> + <test.tmp.dir>${project.build.directory}/testing-tmp</test.tmp.dir> + + <commons-codec.version>1.4</commons-codec.version> + <hadoop.version>2.6.0</hadoop.version> + <junit.version>4.11</junit.version> + <kryo.version>3.0.3</kryo.version> + <mockito.version>1.9.5</mockito.version> + <protobuf.version>2.5.0</protobuf.version> + <slf4j.version>1.7.5</slf4j.version> + <snappy.version>0.2</snappy.version> + </properties> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-javadoc-plugin</artifactId> + <version>2.10.3</version> + <configuration> + ... 
+ </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-antrun-plugin</artifactId> + <executions> + <execution> + <id>setup-test-dirs</id> + <phase>process-test-resources</phase> + <goals> + <goal>run</goal> + </goals> + <configuration> + <target> + <delete dir="${test.tmp.dir}" /> + <mkdir dir="${test.tmp.dir}" /> + </target> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <configuration> + <reuseForks>false</reuseForks> + <argLine>-Xmx2048m -XX:MaxPermSize=512m</argLine> + <environmentVariables> + <TZ>US/Pacific</TZ> + <LANG>en_US.UTF-8</LANG> + </environmentVariables> + <systemPropertyVariables> + <test.tmp.dir>${test.tmp.dir}</test.tmp.dir> + </systemPropertyVariables> + </configuration> + </plugin> + </plugins> + </build> + + <dependencies> + <!-- global dependencies --> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-api</artifactId> + <version>${slf4j.version}</version> + </dependency> + </dependencies> +</project> http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/pom.xml ---------------------------------------------------------------------- diff --git a/java/storage-api/pom.xml b/java/storage-api/pom.xml new file mode 100644 index 0000000..85bd77c --- /dev/null +++ b/java/storage-api/pom.xml @@ -0,0 +1,76 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <groupId>org.apache.hive</groupId> + <artifactId>hive-storage-api</artifactId> + <version>2.0.0.1-SNAPSHOT</version> + <packaging>jar</packaging> + <name>Hive Storage API</name> + + <dependencies> + <!-- test inter-project --> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-common</artifactId> + <version>2.6.0</version> + <optional>true</optional> + <exclusions> + <exclusion> + <groupId>commmons-logging</groupId> + <artifactId>commons-logging</artifactId> + </exclusion> + <exclusion> + <groupId>javax.servlet</groupId> + <artifactId>servlet-api</artifactId> + </exclusion> + <exclusion> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>4.11</version> + <scope>test</scope> + </dependency> + </dependencies> + + <build> + <sourceDirectory>${basedir}/src/java</sourceDirectory> + <testSourceDirectory>${basedir}/src/test</testSourceDirectory> + <testResources> + <testResource> + <directory>${basedir}/src/test/resources</directory> + </testResource> + </testResources> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <version>3.1</version> + <configuration> + <source>1.7</source> + <target>1.7</target> + </configuration> + </plugin> + </plugins> + </build> +</project> http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/DiskRangeInfo.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/DiskRangeInfo.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/DiskRangeInfo.java new file mode 100644 index 0000000..86b838c --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/DiskRangeInfo.java @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common; + +import java.util.List; + +import org.apache.hadoop.hive.common.io.DiskRange; + +import com.google.common.collect.Lists; + +/** + * Disk range information class containing disk ranges and total length. 
+ */ +public class DiskRangeInfo { + List<DiskRange> diskRanges; // TODO: use DiskRangeList instead + long totalLength; + + public DiskRangeInfo(int indexBaseOffset) { + this.diskRanges = Lists.newArrayList(); + // Some data is missing from the stream for PPD uncompressed read (because index offset is + // relative to the entire stream and we only read part of stream if RGs are filtered; unlike + // with compressed data where PPD only filters CBs, so we always get full CB, and index offset + // is relative to CB). To take care of the case when UncompressedStream goes seeking around by + // its incorrect (relative to partial stream) index offset, we will increase the length by our + // offset-relative-to-the-stream, and also account for it in buffers (see createDiskRangeInfo). + // So, index offset now works; as long as noone seeks into this data before the RG (why would + // they), everything works. This is hacky... Stream shouldn't depend on having all the data. + this.totalLength = indexBaseOffset; + } + + public void addDiskRange(DiskRange diskRange) { + diskRanges.add(diskRange); + totalLength += diskRange.getLength(); + } + + public List<DiskRange> getDiskRanges() { + return diskRanges; + } + + public long getTotalLength() { + return totalLength; + } +} + http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/Pool.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/Pool.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/Pool.java new file mode 100644 index 0000000..272bbdd --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/Pool.java @@ -0,0 +1,33 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common; + +/** Simple object pool to prevent GC on small objects passed between threads. */ +public interface Pool<T> { + /** Object helper for objects stored in the pool. */ + public interface PoolObjectHelper<T> { + /** Called to create an object when one cannot be provided. */ + T create(); + /** Called before the object is put in the pool (regardless of whether put succeeds). 
*/ + void resetBeforeOffer(T t); + } + + T take(); + void offer(T t); + int size(); +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/io/Allocator.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/io/Allocator.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/Allocator.java new file mode 100644 index 0000000..fd9d9c9 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/Allocator.java @@ -0,0 +1,53 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common.io; + +import org.apache.hadoop.hive.common.io.encoded.MemoryBuffer; + +/** An allocator provided externally to storage classes to allocate MemoryBuffer-s. */ +public interface Allocator { + public static class AllocatorOutOfMemoryException extends RuntimeException { + public AllocatorOutOfMemoryException(String msg) { + super(msg); + } + + private static final long serialVersionUID = 268124648177151761L; + } + + /** + * Allocates multiple buffers of a given size. + * @param dest Array where buffers are placed. Objects are reused if already there + * (see createUnallocated), created otherwise. + * @param size Allocation size. + * @throws AllocatorOutOfMemoryException Cannot allocate. + */ + void allocateMultiple(MemoryBuffer[] dest, int size) throws AllocatorOutOfMemoryException; + + /** + * Creates an unallocated memory buffer object. This object can be passed to allocateMultiple + * to allocate; this is useful if data structures are created for separate buffers that can + * later be allocated together. + */ + MemoryBuffer createUnallocated(); + /** Deallocates a memory buffer. */ + void deallocate(MemoryBuffer buffer); + /** Whether the allocator uses direct buffers. */ + boolean isDirectAlloc(); + /** Maximum allocation size supported by this allocator. */ + int getMaxAllocation(); +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DataCache.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DataCache.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DataCache.java new file mode 100644 index 0000000..1273588 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DataCache.java @@ -0,0 +1,100 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.common.io; + +import org.apache.hadoop.hive.common.io.encoded.MemoryBuffer; + +/** An abstract data cache that IO formats can use to retrieve and cache data. */ +public interface DataCache { + public static final class BooleanRef { + public boolean value; + } + + /** Disk range factory used during cache retrieval. */ + public interface DiskRangeListFactory { + DiskRangeList createCacheChunk(MemoryBuffer buffer, long startOffset, long endOffset); + } + + /** + * Gets file data for particular offsets. The range list is modified in place; it is then + * returned (since the list head could have changed). Ranges are replaced with cached ranges. + * + * Any such buffer is locked in cache to prevent eviction, and must therefore be released + * back to cache via a corresponding call (releaseBuffer) when the caller is done with it. + * + * In case of partial overlap with cached data, full cache blocks are always returned; + * there's no capacity for partial matches in return type. The rules are as follows: + * 1) If the requested range starts in the middle of a cached range, that cached range will not + * be returned by default (e.g. if [100,200) and [200,300) are cached, the request for + * [150,300) will only return [200,300) from cache). This may be configurable in impls. + * This is because we assume well-known range start offsets are used (rg/stripe offsets), so + * a request from the middle of the start doesn't make sense. + * 2) If the requested range ends in the middle of a cached range, that entire cached range will + * be returned (e.g. if [100,200) and [200,300) are cached, the request for [100,250) will + * return both ranges). It should really be same as #1, however currently ORC uses estimated + * end offsets; if we don't return the end block, the caller may read it from disk needlessly. + * + * @param fileId Unique ID of the target file on the file system. + * @param range A set of DiskRange-s (linked list) that is to be retrieved. May be modified. + * @param baseOffset base offset for the ranges (stripe/stream offset in case of ORC). + * @param factory A factory to produce DiskRangeList-s out of cached MemoryBuffer-s. + * @param gotAllData An out param - whether all the requested data was found in cache. + * @return The new or modified list of DiskRange-s, where some ranges may contain cached data. + */ + DiskRangeList getFileData(Object fileKey, DiskRangeList range, long baseOffset, + DiskRangeListFactory factory, BooleanRef gotAllData); + + /** + * Puts file data into cache, or gets older data in case of collisions. + * + * The memory buffers provided MUST be allocated via an allocator returned by getAllocator + * method, to allow cache implementations that evict and then de-allocate the buffer. 
+ * + * It is assumed that the caller will use the data immediately, therefore any buffers provided + * to putFileData (or returned due to cache collision) are locked in cache to prevent eviction, + * and must therefore be released back to cache via a corresponding call (releaseBuffer) when the + * caller is done with it. Buffers rejected due to conflict will neither be locked, nor + * automatically deallocated. The caller must take care to discard these buffers. + * + * @param fileId Unique ID of the target file on the file system. + * @param ranges The ranges for which the data is being cached. These objects will not be stored. + * @param data The data for the corresponding ranges. + * @param baseOffset base offset for the ranges (stripe/stream offset in case of ORC). + * @return null if all data was put; bitmask indicating which chunks were not put otherwise; + * the replacement chunks from cache are updated directly in the array. + */ + long[] putFileData(Object fileKey, DiskRange[] ranges, MemoryBuffer[] data, long baseOffset); + + /** + * Releases the buffer returned by getFileData/provided to putFileData back to cache. + * See respective javadocs for details. + */ + void releaseBuffer(MemoryBuffer buffer); + + /** + * Notifies the cache that the buffer returned from getFileData/provided to putFileData will + * be used by another consumer and therefore released multiple times (one more time per call). + */ + void reuseBuffer(MemoryBuffer buffer); + + /** + * Gets the allocator associated with this DataCache. + */ + Allocator getAllocator(); +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DiskRange.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DiskRange.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DiskRange.java new file mode 100644 index 0000000..33aecf5 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DiskRange.java @@ -0,0 +1,102 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common.io; + +import java.nio.ByteBuffer; + +/** + * The sections of a file. + */ +public class DiskRange { + /** The first address. */ + protected long offset; + /** The address afterwards. 
*/ + protected long end; + + public DiskRange(long offset, long end) { + this.offset = offset; + this.end = end; + if (end < offset) { + throw new IllegalArgumentException("invalid range " + this); + } + } + + @Override + public boolean equals(Object other) { + if (other == null || other.getClass() != getClass()) { + return false; + } + return equalRange((DiskRange) other); + } + + public boolean equalRange(DiskRange other) { + return other.offset == offset && other.end == end; + } + + @Override + public int hashCode() { + return (int)(offset ^ (offset >>> 32)) * 31 + (int)(end ^ (end >>> 32)); + } + + @Override + public String toString() { + return "range start: " + offset + " end: " + end; + } + + public long getOffset() { + return offset; + } + + public long getEnd() { + return end; + } + + public int getLength() { + long len = this.end - this.offset; + assert len <= Integer.MAX_VALUE; + return (int)len; + } + + // For subclasses + public boolean hasData() { + return false; + } + + public DiskRange sliceAndShift(long offset, long end, long shiftBy) { + // Rather, unexpected usage exception. + throw new UnsupportedOperationException(); + } + + public ByteBuffer getData() { + throw new UnsupportedOperationException(); + } + + protected boolean merge(long otherOffset, long otherEnd) { + if (!overlap(offset, end, otherOffset, otherEnd)) return false; + offset = Math.min(offset, otherOffset); + end = Math.max(end, otherEnd); + return true; + } + + private static boolean overlap(long leftA, long rightA, long leftB, long rightB) { + if (leftA <= leftB) { + return rightA >= leftB; + } + return rightB >= leftA; + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DiskRangeList.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DiskRangeList.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DiskRangeList.java new file mode 100644 index 0000000..b84aeb5 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/DiskRangeList.java @@ -0,0 +1,210 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common.io; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Java linked list iterator interface is convoluted, and moreover concurrent modifications + * of the same list by multiple iterators are impossible. Hence, this. + * Java also doesn't support multiple inheritance, so this cannot be done as "aspect"... 
*/ +public class DiskRangeList extends DiskRange { + private static final Logger LOG = LoggerFactory.getLogger(DiskRangeList.class); + public DiskRangeList prev, next; + + public DiskRangeList(long offset, long end) { + super(offset, end); + } + + /** Replaces this element with another in the list; returns the new element. */ + public DiskRangeList replaceSelfWith(DiskRangeList other) { + other.prev = this.prev; + other.next = this.next; + if (this.prev != null) { + this.prev.next = other; + } + if (this.next != null) { + this.next.prev = other; + } + this.next = this.prev = null; + return other; + } + + /** + * Inserts an intersecting range before current in the list and adjusts offset accordingly. + * @returns the new element. + */ + public DiskRangeList insertPartBefore(DiskRangeList other) { + assert other.end >= this.offset; + this.offset = other.end; + other.prev = this.prev; + other.next = this; + if (this.prev != null) { + this.prev.next = other; + } + this.prev = other; + return other; + } + + /** + * Inserts an element after current in the list. + * @returns the new element. + * */ + public DiskRangeList insertAfter(DiskRangeList other) { + other.next = this.next; + other.prev = this; + if (this.next != null) { + this.next.prev = other; + } + this.next = other; + return other; + } + + /** + * Inserts an intersecting range after current in the list and adjusts offset accordingly. + * @returns the new element. + */ + public DiskRangeList insertPartAfter(DiskRangeList other) { + assert other.offset <= this.end; + this.end = other.offset; + return insertAfter(other); + } + + /** Removes an element after current from the list. */ + public void removeAfter() { + DiskRangeList other = this.next; + this.next = other.next; + if (this.next != null) { + this.next.prev = this; + } + other.next = other.prev = null; + } + + /** Removes the current element from the list. 
*/ + public void removeSelf() { + if (this.prev != null) { + this.prev.next = this.next; + } + if (this.next != null) { + this.next.prev = this.prev; + } + this.next = this.prev = null; + } + + /** Splits current element in the list, using DiskRange::slice */ + public final DiskRangeList split(long cOffset) { + insertAfter((DiskRangeList)this.sliceAndShift(cOffset, end, 0)); + return replaceSelfWith((DiskRangeList)this.sliceAndShift(offset, cOffset, 0)); + } + + public boolean hasContiguousNext() { + return next != null && end == next.offset; + } + + // @VisibleForTesting + public int listSize() { + int result = 1; + DiskRangeList current = this.next; + while (current != null) { + ++result; + current = current.next; + } + return result; + } + + public long getTotalLength() { + long totalLength = getLength(); + DiskRangeList current = next; + while (current != null) { + totalLength += current.getLength(); + current = current.next; + } + return totalLength; + } + + // @VisibleForTesting + public DiskRangeList[] listToArray() { + DiskRangeList[] result = new DiskRangeList[listSize()]; + int i = 0; + DiskRangeList current = this.next; + while (current != null) { + result[i] = current; + ++i; + current = current.next; + } + return result; + } + + public static class CreateHelper { + private DiskRangeList tail = null, head; + + public DiskRangeList getTail() { + return tail; + } + + public void addOrMerge(long offset, long end, boolean doMerge, boolean doLogNew) { + if (doMerge && tail != null && tail.merge(offset, end)) return; + if (doLogNew) { + LOG.info("Creating new range; last range (which can include some previous adds) was " + + tail); + } + DiskRangeList node = new DiskRangeList(offset, end); + if (tail == null) { + head = tail = node; + } else { + tail = tail.insertAfter(node); + } + } + + public DiskRangeList get() { + return head; + } + + public DiskRangeList extract() { + DiskRangeList result = head; + head = null; + return result; + } + } + + /** + * List in-place mutation helper - a bogus first element that is inserted before list head, + * and thus remains constant even if head is replaced with some new range via in-place list + * mutation. extract() can be used to obtain the modified list. + */ + public static class MutateHelper extends DiskRangeList { + public MutateHelper(DiskRangeList head) { + super(-1, -1); + assert head != null; + assert head.prev == null; + this.next = head; + head.prev = this; + } + + public DiskRangeList get() { + return next; + } + + public DiskRangeList extract() { + DiskRangeList result = this.next; + assert result != null; + this.next = result.prev = null; + return result; + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/io/encoded/EncodedColumnBatch.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/io/encoded/EncodedColumnBatch.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/encoded/EncodedColumnBatch.java new file mode 100644 index 0000000..907181e --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/encoded/EncodedColumnBatch.java @@ -0,0 +1,135 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.common.io.encoded; + +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * A block of data for a given section of a file, similar to VRB but in encoded form. + * Stores a set of buffers for each encoded stream that is a part of each column. + */ +public class EncodedColumnBatch<BatchKey> { + /** + * Slice of the data for a stream for some column, stored inside MemoryBuffer's. + * ColumnStreamData can be reused for many EncodedColumnBatch-es (e.g. dictionary stream), so + * it tracks the number of such users via a refcount. + */ + public static class ColumnStreamData { + private List<MemoryBuffer> cacheBuffers; + /** Base offset from the beginning of the indexable unit; for example, for ORC, + * offset from the CB in a compressed file, from the stream in uncompressed file. */ + private int indexBaseOffset = 0; + + /** Reference count. */ + private AtomicInteger refCount = new AtomicInteger(0); + + public void reset() { + cacheBuffers.clear(); + refCount.set(0); + indexBaseOffset = 0; + } + + public void incRef() { + refCount.incrementAndGet(); + } + + public int decRef() { + int i = refCount.decrementAndGet(); + assert i >= 0; + return i; + } + + public List<MemoryBuffer> getCacheBuffers() { + return cacheBuffers; + } + + public void setCacheBuffers(List<MemoryBuffer> cacheBuffers) { + this.cacheBuffers = cacheBuffers; + } + + public int getIndexBaseOffset() { + return indexBaseOffset; + } + + public void setIndexBaseOffset(int indexBaseOffset) { + this.indexBaseOffset = indexBaseOffset; + } + } + + /** The key that is used to map this batch to source location. */ + protected BatchKey batchKey; + /** + * Stream data for each stream, for each included column. + * For each column, streams are indexed by kind, with missing elements being null. + */ + protected ColumnStreamData[][] columnData; + /** Column indexes included in the batch. Correspond to columnData elements. 
*/ + protected int[] columnIxs; + + public void reset() { + if (columnData == null) return; + for (int i = 0; i < columnData.length; ++i) { + if (columnData[i] == null) continue; + for (int j = 0; j < columnData[i].length; ++j) { + columnData[i][j] = null; + } + } + } + + public void initColumn(int colIxMod, int colIx, int streamCount) { + columnIxs[colIxMod] = colIx; + if (columnData[colIxMod] == null || columnData[colIxMod].length != streamCount) { + columnData[colIxMod] = new ColumnStreamData[streamCount]; + } + } + + public void setStreamData(int colIxMod, int streamKind, ColumnStreamData csd) { + columnData[colIxMod][streamKind] = csd; + } + + public void setAllStreamsData(int colIxMod, int colIx, ColumnStreamData[] sbs) { + columnIxs[colIxMod] = colIx; + columnData[colIxMod] = sbs; + } + + public BatchKey getBatchKey() { + return batchKey; + } + + public ColumnStreamData[][] getColumnData() { + return columnData; + } + + public int[] getColumnIxs() { + return columnIxs; + } + + protected void resetColumnArrays(int columnCount) { + if (columnIxs != null && columnCount == columnIxs.length) return; + columnIxs = new int[columnCount]; + ColumnStreamData[][] columnData = new ColumnStreamData[columnCount][]; + if (this.columnData != null) { + for (int i = 0; i < Math.min(columnData.length, this.columnData.length); ++i) { + columnData[i] = this.columnData[i]; + } + } + this.columnData = columnData; + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/io/encoded/MemoryBuffer.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/io/encoded/MemoryBuffer.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/encoded/MemoryBuffer.java new file mode 100644 index 0000000..4475009 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/io/encoded/MemoryBuffer.java @@ -0,0 +1,28 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.common.io.encoded; + +import java.nio.ByteBuffer; + +/** Abstract interface for any class wrapping a ByteBuffer. */ +public interface MemoryBuffer { + /** Note - raw buffer should not be modified. 
*/ + public ByteBuffer getByteBufferRaw(); + public ByteBuffer getByteBufferDup(); +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/type/HiveDecimal.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/type/HiveDecimal.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/type/HiveDecimal.java new file mode 100644 index 0000000..1c6be91 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/type/HiveDecimal.java @@ -0,0 +1,332 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common.type; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.math.RoundingMode; + +/** + * + * HiveDecimal. Simple wrapper for BigDecimal. Adds fixed max precision and non scientific string + * representation + * + */ +public class HiveDecimal implements Comparable<HiveDecimal> { + public static final int MAX_PRECISION = 38; + public static final int MAX_SCALE = 38; + + /** + * Default precision/scale when user doesn't specify in the column metadata, such as + * decimal and decimal(8). + */ + public static final int USER_DEFAULT_PRECISION = 10; + public static final int USER_DEFAULT_SCALE = 0; + + /** + * Default precision/scale when system is not able to determine them, such as in case + * of a non-generic udf. + */ + public static final int SYSTEM_DEFAULT_PRECISION = 38; + public static final int SYSTEM_DEFAULT_SCALE = 18; + + public static final HiveDecimal ZERO = new HiveDecimal(BigDecimal.ZERO); + public static final HiveDecimal ONE = new HiveDecimal(BigDecimal.ONE); + + public static final int ROUND_FLOOR = BigDecimal.ROUND_FLOOR; + public static final int ROUND_CEILING = BigDecimal.ROUND_CEILING; + public static final int ROUND_HALF_UP = BigDecimal.ROUND_HALF_UP; + public static final int ROUND_HALF_EVEN = BigDecimal.ROUND_HALF_EVEN; + + private BigDecimal bd = BigDecimal.ZERO; + + private HiveDecimal(BigDecimal bd) { + this.bd = bd; + } + + public static HiveDecimal create(BigDecimal b) { + return create(b, true); + } + + public static HiveDecimal create(BigDecimal b, boolean allowRounding) { + BigDecimal bd = normalize(b, allowRounding); + return bd == null ? null : new HiveDecimal(bd); + } + + public static HiveDecimal create(BigInteger unscaled, int scale) { + BigDecimal bd = normalize(new BigDecimal(unscaled, scale), true); + return bd == null ? 
null : new HiveDecimal(bd); + } + + public static HiveDecimal create(String dec) { + BigDecimal bd; + try { + bd = new BigDecimal(dec.trim()); + } catch (NumberFormatException ex) { + return null; + } + + bd = normalize(bd, true); + return bd == null ? null : new HiveDecimal(bd); + } + + public static HiveDecimal create(BigInteger bi) { + BigDecimal bd = normalize(new BigDecimal(bi), true); + return bd == null ? null : new HiveDecimal(bd); + } + + public static HiveDecimal create(int i) { + return new HiveDecimal(new BigDecimal(i)); + } + + public static HiveDecimal create(long l) { + return new HiveDecimal(new BigDecimal(l)); + } + + @Override + public String toString() { + return bd.toPlainString(); + } + + /** + * Return a string representation of the number with the number of decimal digits as + * the given scale. Please note that this is different from toString(). + * @param scale the number of digits after the decimal point + * @return the string representation of exact number of decimal digits + */ + public String toFormatString(int scale) { + return (bd.scale() == scale ? bd : + bd.setScale(scale, RoundingMode.HALF_UP)).toPlainString(); + } + + public HiveDecimal setScale(int i) { + return new HiveDecimal(bd.setScale(i, RoundingMode.HALF_UP)); + } + + @Override + public int compareTo(HiveDecimal dec) { + return bd.compareTo(dec.bd); + } + + @Override + public int hashCode() { + return bd.hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (obj == null || obj.getClass() != getClass()) { + return false; + } + return bd.equals(((HiveDecimal) obj).bd); + } + + public int scale() { + return bd.scale(); + } + + /** + * Returns the number of digits (integer and fractional) in the number, which is equivalent + * to SQL decimal precision. Note that this is different from BigDecimal.precision(), + * which returns the precision of the unscaled value (BigDecimal.valueOf(0.01).precision() = 1, + * whereas HiveDecimal.create("0.01").precision() = 2). + * If you want the BigDecimal precision, use HiveDecimal.bigDecimalValue().precision() + * @return + */ + public int precision() { + int bdPrecision = bd.precision(); + int bdScale = bd.scale(); + + if (bdPrecision < bdScale) { + // This can happen for numbers less than 0.1 + // For 0.001234: bdPrecision=4, bdScale=6 + // In this case, we'll set the type to have the same precision as the scale. 
+ return bdScale; + } + return bdPrecision; + } + + public int intValue() { + return bd.intValue(); + } + + public double doubleValue() { + return bd.doubleValue(); + } + + public long longValue() { + return bd.longValue(); + } + + public short shortValue() { + return bd.shortValue(); + } + + public float floatValue() { + return bd.floatValue(); + } + + public BigDecimal bigDecimalValue() { + return bd; + } + + public byte byteValue() { + return bd.byteValue(); + } + + public HiveDecimal setScale(int adjustedScale, int rm) { + return create(bd.setScale(adjustedScale, rm)); + } + + public HiveDecimal subtract(HiveDecimal dec) { + return create(bd.subtract(dec.bd)); + } + + public HiveDecimal multiply(HiveDecimal dec) { + return create(bd.multiply(dec.bd), false); + } + + public BigInteger unscaledValue() { + return bd.unscaledValue(); + } + + public HiveDecimal scaleByPowerOfTen(int n) { + return create(bd.scaleByPowerOfTen(n)); + } + + public HiveDecimal abs() { + return create(bd.abs()); + } + + public HiveDecimal negate() { + return create(bd.negate()); + } + + public HiveDecimal add(HiveDecimal dec) { + return create(bd.add(dec.bd)); + } + + public HiveDecimal pow(int n) { + BigDecimal result = normalize(bd.pow(n), false); + return result == null ? null : new HiveDecimal(result); + } + + public HiveDecimal remainder(HiveDecimal dec) { + return create(bd.remainder(dec.bd)); + } + + public HiveDecimal divide(HiveDecimal dec) { + return create(bd.divide(dec.bd, MAX_SCALE, RoundingMode.HALF_UP), true); + } + + /** + * Get the sign of the underlying decimal. + * @return 0 if the decimal is equal to 0, -1 if less than zero, and 1 if greater than 0 + */ + public int signum() { + return bd.signum(); + } + + private static BigDecimal trim(BigDecimal d) { + if (d.compareTo(BigDecimal.ZERO) == 0) { + // Special case for 0, because java doesn't strip zeros correctly on that number. + d = BigDecimal.ZERO; + } else { + d = d.stripTrailingZeros(); + if (d.scale() < 0) { + // no negative scale decimals + d = d.setScale(0); + } + } + return d; + } + + private static BigDecimal normalize(BigDecimal bd, boolean allowRounding) { + if (bd == null) { + return null; + } + + bd = trim(bd); + + int intDigits = bd.precision() - bd.scale(); + + if (intDigits > MAX_PRECISION) { + return null; + } + + int maxScale = Math.min(MAX_SCALE, Math.min(MAX_PRECISION - intDigits, bd.scale())); + if (bd.scale() > maxScale ) { + if (allowRounding) { + bd = bd.setScale(maxScale, RoundingMode.HALF_UP); + // Trimming is again necessary, because rounding may introduce new trailing 0's. + bd = trim(bd); + } else { + bd = null; + } + } + + return bd; + } + + private static BigDecimal enforcePrecisionScale(BigDecimal bd, int maxPrecision, int maxScale) { + if (bd == null) { + return null; + } + + /** + * Specially handling the case that bd=0, and we are converting it to a type where precision=scale, + * such as decimal(1, 1). + */ + if (bd.compareTo(BigDecimal.ZERO) == 0 && bd.scale() == 0 && maxPrecision == maxScale) { + return bd.setScale(maxScale); + } + + bd = trim(bd); + + if (bd.scale() > maxScale) { + bd = bd.setScale(maxScale, RoundingMode.HALF_UP); + } + + int maxIntDigits = maxPrecision - maxScale; + int intDigits = bd.precision() - bd.scale(); + if (intDigits > maxIntDigits) { + return null; + } + + return bd; + } + + public static HiveDecimal enforcePrecisionScale(HiveDecimal dec, int maxPrecision, int maxScale) { + if (dec == null) { + return null; + } + + // Minor optimization, avoiding creating new objects. 
+ if (dec.precision() - dec.scale() <= maxPrecision - maxScale && + dec.scale() <= maxScale) { + return dec; + } + + BigDecimal bd = enforcePrecisionScale(dec.bd, maxPrecision, maxScale); + if (bd == null) { + return null; + } + + return HiveDecimal.create(bd); + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/type/HiveIntervalDayTime.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/type/HiveIntervalDayTime.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/type/HiveIntervalDayTime.java new file mode 100644 index 0000000..b891e27 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/type/HiveIntervalDayTime.java @@ -0,0 +1,253 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common.type; + +import java.math.BigDecimal; +import java.sql.Timestamp; +import java.util.Date; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.lang.builder.HashCodeBuilder; +import org.apache.hive.common.util.IntervalDayTimeUtils; + +import sun.util.calendar.BaseCalendar; + +/** + * Day-time interval type representing an offset in days/hours/minutes/seconds, + * with nanosecond precision. + * 1 day = 24 hours = 1440 minutes = 86400 seconds + */ +public class HiveIntervalDayTime implements Comparable<HiveIntervalDayTime> { + + // days/hours/minutes/seconds all represented as seconds + protected long totalSeconds; + protected int nanos; + + public HiveIntervalDayTime() { + } + + public HiveIntervalDayTime(int days, int hours, int minutes, int seconds, int nanos) { + set(days, hours, minutes, seconds, nanos); + } + + public HiveIntervalDayTime(long seconds, int nanos) { + set(seconds, nanos); + } + + public HiveIntervalDayTime(BigDecimal seconds) { + set(seconds); + } + + public HiveIntervalDayTime(HiveIntervalDayTime other) { + set(other.totalSeconds, other.nanos); + } + + public int getDays() { + return (int) TimeUnit.SECONDS.toDays(totalSeconds); + } + + public int getHours() { + return (int) (TimeUnit.SECONDS.toHours(totalSeconds) % TimeUnit.DAYS.toHours(1)); + } + + public int getMinutes() { + return (int) (TimeUnit.SECONDS.toMinutes(totalSeconds) % TimeUnit.HOURS.toMinutes(1)); + } + + public int getSeconds() { + return (int) (totalSeconds % TimeUnit.MINUTES.toSeconds(1)); + } + + public int getNanos() { + return nanos; + } + + /** + * Returns days/hours/minutes all converted into seconds. 
+   * Nanos still need to be retrieved using getNanos()
+   * @return days/hours/minutes of the interval, folded into a total number of seconds
+   */
+  public long getTotalSeconds() {
+    return totalSeconds;
+  }
+
+  /**
+   * @return double representation of the interval day time, accurate to nanoseconds
+   */
+  public double getDouble() {
+    // Floating-point division keeps the fractional (nanosecond) part instead of truncating it.
+    return totalSeconds + nanos / 1000000000.0;
+  }
+
+  /**
+   * Ensures that the seconds and nanoseconds fields have a consistent sign.
+   */
+  protected void normalizeSecondsAndNanos() {
+    if (totalSeconds > 0 && nanos < 0) {
+      --totalSeconds;
+      nanos += IntervalDayTimeUtils.NANOS_PER_SEC;
+    } else if (totalSeconds < 0 && nanos > 0) {
+      ++totalSeconds;
+      nanos -= IntervalDayTimeUtils.NANOS_PER_SEC;
+    }
+  }
+
+  public void set(int days, int hours, int minutes, int seconds, int nanos) {
+    long totalSeconds = seconds;
+    totalSeconds += TimeUnit.DAYS.toSeconds(days);
+    totalSeconds += TimeUnit.HOURS.toSeconds(hours);
+    totalSeconds += TimeUnit.MINUTES.toSeconds(minutes);
+    totalSeconds += TimeUnit.NANOSECONDS.toSeconds(nanos);
+    nanos = nanos % IntervalDayTimeUtils.NANOS_PER_SEC;
+
+    this.totalSeconds = totalSeconds;
+    this.nanos = nanos;
+
+    normalizeSecondsAndNanos();
+  }
+
+  public void set(long seconds, int nanos) {
+    this.totalSeconds = seconds;
+    this.nanos = nanos;
+    normalizeSecondsAndNanos();
+  }
+
+  public void set(BigDecimal totalSecondsBd) {
+    long totalSeconds = totalSecondsBd.longValue();
+    BigDecimal fractionalSecs = totalSecondsBd.remainder(BigDecimal.ONE);
+    int nanos = fractionalSecs.multiply(IntervalDayTimeUtils.NANOS_PER_SEC_BD).intValue();
+    set(totalSeconds, nanos);
+  }
+
+  public void set(HiveIntervalDayTime other) {
+    set(other.getTotalSeconds(), other.getNanos());
+  }
+
+  public HiveIntervalDayTime negate() {
+    return new HiveIntervalDayTime(-getTotalSeconds(), -getNanos());
+  }
+
+  @Override
+  public int compareTo(HiveIntervalDayTime other) {
+    long cmp = this.totalSeconds - other.totalSeconds;
+    if (cmp == 0) {
+      cmp = this.nanos - other.nanos;
+    }
+    if (cmp != 0) {
+      cmp = cmp > 0 ? 1 : -1;
+    }
+    return (int) cmp;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (!(obj instanceof HiveIntervalDayTime)) {
+      return false;
+    }
+    return 0 == compareTo((HiveIntervalDayTime) obj);
+  }
+
+  /**
+   * Return a copy of this object.
+   */
+  public Object clone() {
+    return new HiveIntervalDayTime(totalSeconds, nanos);
+  }
+
+  @Override
+  public int hashCode() {
+    return new HashCodeBuilder().append(totalSeconds).append(nanos).toHashCode();
+  }
+
+  @Override
+  public String toString() {
+    // If normalize() was used, then day-hour-minute-second-nanos should have the same sign.
+    // This is currently working with that assumption.
+    boolean isNegative = (totalSeconds < 0 || nanos < 0);
+    String daySecondSignStr = isNegative ?
"-" : ""; + + return String.format("%s%d %02d:%02d:%02d.%09d", + daySecondSignStr, Math.abs(getDays()), + Math.abs(getHours()), Math.abs(getMinutes()), + Math.abs(getSeconds()), Math.abs(getNanos())); + } + + public static HiveIntervalDayTime valueOf(String strVal) { + HiveIntervalDayTime result = null; + if (strVal == null) { + throw new IllegalArgumentException("Interval day-time string was null"); + } + Matcher patternMatcher = PATTERN_MATCHER.get(); + patternMatcher.reset(strVal); + if (patternMatcher.matches()) { + // Parse out the individual parts + try { + // Sign - whether interval is positive or negative + int sign = 1; + String field = patternMatcher.group(1); + if (field != null && field.equals("-")) { + sign = -1; + } + int days = sign * + IntervalDayTimeUtils.parseNumericValueWithRange("day", patternMatcher.group(2), + 0, Integer.MAX_VALUE); + byte hours = (byte) (sign * + IntervalDayTimeUtils.parseNumericValueWithRange("hour", patternMatcher.group(3), 0, 23)); + byte minutes = (byte) (sign * + IntervalDayTimeUtils.parseNumericValueWithRange("minute", patternMatcher.group(4), 0, 59)); + int seconds = 0; + int nanos = 0; + field = patternMatcher.group(5); + if (field != null) { + BigDecimal bdSeconds = new BigDecimal(field); + if (bdSeconds.compareTo(IntervalDayTimeUtils.MAX_INT_BD) > 0) { + throw new IllegalArgumentException("seconds value of " + bdSeconds + " too large"); + } + seconds = sign * bdSeconds.intValue(); + nanos = sign * bdSeconds.subtract(new BigDecimal(bdSeconds.toBigInteger())) + .multiply(IntervalDayTimeUtils.NANOS_PER_SEC_BD).intValue(); + } + + result = new HiveIntervalDayTime(days, hours, minutes, seconds, nanos); + } catch (Exception err) { + throw new IllegalArgumentException("Error parsing interval day-time string: " + strVal, err); + } + } else { + throw new IllegalArgumentException( + "Interval string does not match day-time format of 'd h:m:s.n': " + strVal); + } + + return result; + } + + // Simple pattern: D H:M:S.nnnnnnnnn + private final static String PARSE_PATTERN = + "([+|-])?(\\d+) (\\d+):(\\d+):((\\d+)(\\.(\\d+))?)"; + + private static final ThreadLocal<Matcher> PATTERN_MATCHER = new ThreadLocal<Matcher>() { + @Override + protected Matcher initialValue() { + return Pattern.compile(PARSE_PATTERN).matcher(""); + } + }; +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/common/type/RandomTypeUtil.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/common/type/RandomTypeUtil.java b/java/storage-api/src/java/org/apache/hadoop/hive/common/type/RandomTypeUtil.java new file mode 100644 index 0000000..3fb0cfd --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/common/type/RandomTypeUtil.java @@ -0,0 +1,115 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common.type; + +import java.sql.Timestamp; +import java.text.DateFormat; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Random; +import java.util.concurrent.TimeUnit; + +public class RandomTypeUtil { + + public static final long NANOSECONDS_PER_SECOND = TimeUnit.SECONDS.toNanos(1); + public static final long MILLISECONDS_PER_SECOND = TimeUnit.SECONDS.toMillis(1); + public static final long NANOSECONDS_PER_MILLISSECOND = TimeUnit.MILLISECONDS.toNanos(1); + + private static ThreadLocal<DateFormat> DATE_FORMAT = + new ThreadLocal<DateFormat>() { + @Override + protected DateFormat initialValue() { + return new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + } + }; + + // We've switched to Joda/Java Calendar which has a more limited time range.... + public static int MIN_YEAR = 1900; + public static int MAX_YEAR = 3000; + private static long MIN_FOUR_DIGIT_YEAR_MILLIS = parseToMillis("1900-01-01 00:00:00"); + private static long MAX_FOUR_DIGIT_YEAR_MILLIS = parseToMillis("3000-01-01 00:00:00"); + + private static long parseToMillis(String s) { + try { + return DATE_FORMAT.get().parse(s).getTime(); + } catch (ParseException ex) { + throw new RuntimeException(ex); + } + } + + public static Timestamp getRandTimestamp(Random r) { + return getRandTimestamp(r, MIN_YEAR, MAX_YEAR); + } + + public static Timestamp getRandTimestamp(Random r, int minYear, int maxYear) { + String optionalNanos = ""; + switch (r.nextInt(4)) { + case 0: + // No nanos. + break; + case 1: + optionalNanos = String.format(".%09d", + Integer.valueOf(r.nextInt((int) NANOSECONDS_PER_SECOND))); + break; + case 2: + // Limit to milliseconds only... + optionalNanos = String.format(".%09d", + Integer.valueOf(r.nextInt((int) MILLISECONDS_PER_SECOND)) * NANOSECONDS_PER_MILLISSECOND); + break; + case 3: + // Limit to below milliseconds only... + optionalNanos = String.format(".%09d", + Integer.valueOf(r.nextInt((int) NANOSECONDS_PER_MILLISSECOND))); + break; + } + String timestampStr = String.format("%04d-%02d-%02d %02d:%02d:%02d%s", + Integer.valueOf(minYear + r.nextInt(maxYear - minYear + 1)), // year + Integer.valueOf(1 + r.nextInt(12)), // month + Integer.valueOf(1 + r.nextInt(28)), // day + Integer.valueOf(0 + r.nextInt(24)), // hour + Integer.valueOf(0 + r.nextInt(60)), // minute + Integer.valueOf(0 + r.nextInt(60)), // second + optionalNanos); + Timestamp timestampVal; + try { + timestampVal = Timestamp.valueOf(timestampStr); + } catch (Exception e) { + System.err.println("Timestamp string " + timestampStr + " did not parse"); + throw e; + } + return timestampVal; + } + + public static long randomMillis(long minMillis, long maxMillis, Random rand) { + return minMillis + (long) ((maxMillis - minMillis) * rand.nextDouble()); + } + + public static long randomMillis(Random rand) { + return randomMillis(MIN_FOUR_DIGIT_YEAR_MILLIS, MAX_FOUR_DIGIT_YEAR_MILLIS, rand); + } + + public static int randomNanos(Random rand, int decimalDigits) { + // Only keep the most significant decimalDigits digits. 
+ int nanos = rand.nextInt((int) NANOSECONDS_PER_SECOND); + return nanos - nanos % (int) Math.pow(10, 9 - decimalDigits); + } + + public static int randomNanos(Random rand) { + return randomNanos(rand, 9); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java b/java/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java new file mode 100644 index 0000000..a6d932c --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java @@ -0,0 +1,389 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + + +/** + * This class supports string and binary data by value reference -- i.e. each field is + * explicitly present, as opposed to provided by a dictionary reference. + * In some cases, all the values will be in the same byte array to begin with, + * but this need not be the case. If each value is in a separate byte + * array to start with, or not all of the values are in the same original + * byte array, you can still assign data by reference into this column vector. + * This gives flexibility to use this in multiple situations. + * <p> + * When setting data by reference, the caller + * is responsible for allocating the byte arrays used to hold the data. + * You can also set data by value, as long as you call the initBuffer() method first. + * You can mix "by value" and "by reference" in the same column vector, + * though that use is probably not typical. + */ +public class BytesColumnVector extends ColumnVector { + public byte[][] vector; + public int[] start; // start offset of each field + + /* + * The length of each field. If the value repeats for every entry, then it is stored + * in vector[0] and isRepeating from the superclass is set to true. + */ + public int[] length; + private byte[] buffer; // optional buffer to use when actually copying in data + private int nextFree; // next free position in buffer + + // Estimate that there will be 16 bytes per entry + static final int DEFAULT_BUFFER_SIZE = 16 * VectorizedRowBatch.DEFAULT_SIZE; + + // Proportion of extra space to provide when allocating more buffer space. + static final float EXTRA_SPACE_FACTOR = (float) 1.2; + + /** + * Use this constructor for normal operation. + * All column vectors should be the default size normally. + */ + public BytesColumnVector() { + this(VectorizedRowBatch.DEFAULT_SIZE); + } + + /** + * Don't call this constructor except for testing purposes. 
+ * + * @param size number of elements in the column vector + */ + public BytesColumnVector(int size) { + super(size); + vector = new byte[size][]; + start = new int[size]; + length = new int[size]; + } + + /** + * Additional reset work for BytesColumnVector (releasing scratch bytes for by value strings). + */ + @Override + public void reset() { + super.reset(); + initBuffer(0); + } + + /** Set a field by reference. + * + * @param elementNum index within column vector to set + * @param sourceBuf container of source data + * @param start start byte position within source + * @param length length of source byte sequence + */ + public void setRef(int elementNum, byte[] sourceBuf, int start, int length) { + vector[elementNum] = sourceBuf; + this.start[elementNum] = start; + this.length[elementNum] = length; + } + + /** + * You must call initBuffer first before using setVal(). + * Provide the estimated number of bytes needed to hold + * a full column vector worth of byte string data. + * + * @param estimatedValueSize Estimated size of buffer space needed + */ + public void initBuffer(int estimatedValueSize) { + nextFree = 0; + + // if buffer is already allocated, keep using it, don't re-allocate + if (buffer != null) { + return; + } + + // allocate a little extra space to limit need to re-allocate + int bufferSize = this.vector.length * (int)(estimatedValueSize * EXTRA_SPACE_FACTOR); + if (bufferSize < DEFAULT_BUFFER_SIZE) { + bufferSize = DEFAULT_BUFFER_SIZE; + } + buffer = new byte[bufferSize]; + } + + /** + * Initialize buffer to default size. + */ + public void initBuffer() { + initBuffer(0); + } + + /** + * @return amount of buffer space currently allocated + */ + public int bufferSize() { + if (buffer == null) { + return 0; + } + return buffer.length; + } + + /** + * Set a field by actually copying in to a local buffer. + * If you must actually copy data in to the array, use this method. + * DO NOT USE this method unless it's not practical to set data by reference with setRef(). + * Setting data by reference tends to run a lot faster than copying data in. + * + * @param elementNum index within column vector to set + * @param sourceBuf container of source data + * @param start start byte position within source + * @param length length of source byte sequence + */ + public void setVal(int elementNum, byte[] sourceBuf, int start, int length) { + if ((nextFree + length) > buffer.length) { + increaseBufferSpace(length); + } + System.arraycopy(sourceBuf, start, buffer, nextFree, length); + vector[elementNum] = buffer; + this.start[elementNum] = nextFree; + this.length[elementNum] = length; + nextFree += length; + } + + /** + * Set a field by actually copying in to a local buffer. + * If you must actually copy data in to the array, use this method. + * DO NOT USE this method unless it's not practical to set data by reference with setRef(). + * Setting data by reference tends to run a lot faster than copying data in. + * + * @param elementNum index within column vector to set + * @param sourceBuf container of source data + */ + public void setVal(int elementNum, byte[] sourceBuf) { + setVal(elementNum, sourceBuf, 0, sourceBuf.length); + } + + /** + * Set a field to the concatenation of two string values. Result data is copied + * into the internal buffer. 
+   *
+   * @param elementNum index within column vector to set
+   * @param leftSourceBuf container of left argument
+   * @param leftStart start of left argument
+   * @param leftLen length of left argument
+   * @param rightSourceBuf container of right argument
+   * @param rightStart start of right argument
+   * @param rightLen length of right argument
+   */
+  public void setConcat(int elementNum, byte[] leftSourceBuf, int leftStart, int leftLen,
+      byte[] rightSourceBuf, int rightStart, int rightLen) {
+    int newLen = leftLen + rightLen;
+    if ((nextFree + newLen) > buffer.length) {
+      increaseBufferSpace(newLen);
+    }
+    vector[elementNum] = buffer;
+    this.start[elementNum] = nextFree;
+    this.length[elementNum] = newLen;
+
+    System.arraycopy(leftSourceBuf, leftStart, buffer, nextFree, leftLen);
+    nextFree += leftLen;
+    System.arraycopy(rightSourceBuf, rightStart, buffer, nextFree, rightLen);
+    nextFree += rightLen;
+  }
+
+  /**
+   * Increase buffer space enough to accommodate the next element.
+   * This uses an exponential growth mechanism so that the buffer quickly
+   * becomes large enough to hold all the data.
+   * As batches get re-loaded, the allocated buffer space will quickly stabilize.
+   *
+   * @param nextElemLength size of the next element to be added
+   */
+  public void increaseBufferSpace(int nextElemLength) {
+
+    // Keep doubling buffer size until there will be enough space for next element.
+    int newLength = 2 * buffer.length;
+    while ((nextFree + nextElemLength) > newLength) {
+      newLength *= 2;
+    }
+
+    // Allocate new buffer, copy data to it, and set buffer to new buffer.
+    byte[] newBuffer = new byte[newLength];
+    System.arraycopy(buffer, 0, newBuffer, 0, nextFree);
+    buffer = newBuffer;
+  }
+
+  /** Copy the current object contents into the output. Only copy selected entries,
+   * as indicated by selectedInUse and the sel array.
+   */
+  public void copySelected(
+      boolean selectedInUse, int[] sel, int size, BytesColumnVector output) {
+
+    // Output has nulls if and only if input has nulls.
+    output.noNulls = noNulls;
+    output.isRepeating = false;
+
+    // Handle repeating case
+    if (isRepeating) {
+      output.setVal(0, vector[0], start[0], length[0]);
+      output.isNull[0] = isNull[0];
+      output.isRepeating = true;
+      return;
+    }
+
+    // Handle normal case
+
+    // Copy data values over
+    if (selectedInUse) {
+      for (int j = 0; j < size; j++) {
+        int i = sel[j];
+        output.setVal(i, vector[i], start[i], length[i]);
+      }
+    }
+    else {
+      for (int i = 0; i < size; i++) {
+        output.setVal(i, vector[i], start[i], length[i]);
+      }
+    }
+
+    // Copy nulls over if needed
+    if (!noNulls) {
+      if (selectedInUse) {
+        for (int j = 0; j < size; j++) {
+          int i = sel[j];
+          output.isNull[i] = isNull[i];
+        }
+      }
+      else {
+        System.arraycopy(isNull, 0, output.isNull, 0, size);
+      }
+    }
+  }
+
+  /** Simplify vector by brute-force flattening noNulls and isRepeating.
+   * This can be used to reduce combinatorial explosion of code paths in VectorExpressions
+   * with many arguments, at the expense of loss of some performance.
+   */
+  public void flatten(boolean selectedInUse, int[] sel, int size) {
+    flattenPush();
+    if (isRepeating) {
+      isRepeating = false;
+
+      // setRef is used below and this is safe, because the reference
+      // is to data owned by this column vector. If this column vector
+      // gets re-used, the whole thing is re-used together so there
+      // is no danger of a dangling reference.
+
+      // Only copy data values if entry is not null. The string value
+      // at position 0 is undefined if the position 0 value is null.
+      if (noNulls || !isNull[0]) {
+
+        // loops start at position 1 because position 0 is already set
+        if (selectedInUse) {
+          for (int j = 1; j < size; j++) {
+            int i = sel[j];
+            this.setRef(i, vector[0], start[0], length[0]);
+          }
+        } else {
+          for (int i = 1; i < size; i++) {
+            this.setRef(i, vector[0], start[0], length[0]);
+          }
+        }
+      }
+      flattenRepeatingNulls(selectedInUse, sel, size);
+    }
+    flattenNoNulls(selectedInUse, sel, size);
+  }
+
+  // Fill all the vector entries with the provided value
+  public void fill(byte[] value) {
+    noNulls = true;
+    isRepeating = true;
+    setRef(0, value, 0, value.length);
+  }
+
+  // Fill the column vector with nulls
+  public void fillWithNulls() {
+    noNulls = false;
+    isRepeating = true;
+    vector[0] = null;
+    isNull[0] = true;
+  }
+
+  @Override
+  public void setElement(int outElementNum, int inputElementNum, ColumnVector inputVector) {
+    if (inputVector.isRepeating) {
+      inputElementNum = 0;
+    }
+    if (inputVector.noNulls || !inputVector.isNull[inputElementNum]) {
+      isNull[outElementNum] = false;
+      BytesColumnVector in = (BytesColumnVector) inputVector;
+      setVal(outElementNum, in.vector[inputElementNum],
+          in.start[inputElementNum], in.length[inputElementNum]);
+    } else {
+      isNull[outElementNum] = true;
+      noNulls = false;
+    }
+  }
+
+  @Override
+  public void init() {
+    initBuffer(0);
+  }
+
+  public String toString(int row) {
+    if (isRepeating) {
+      row = 0;
+    }
+    if (noNulls || !isNull[row]) {
+      return new String(vector[row], start[row], length[row]);
+    } else {
+      return null;
+    }
+  }
+
+  @Override
+  public void stringifyValue(StringBuilder buffer, int row) {
+    if (isRepeating) {
+      row = 0;
+    }
+    if (noNulls || !isNull[row]) {
+      buffer.append('"');
+      // Read from vector[row], which is correct whether the value was set by
+      // reference or copied into the local scratch buffer.
+      buffer.append(new String(vector[row], start[row], length[row]));
+      buffer.append('"');
+    } else {
+      buffer.append("null");
+    }
+  }
+
+  @Override
+  public void ensureSize(int size, boolean preserveData) {
+    super.ensureSize(size, preserveData);
+    if (size > vector.length) {
+      int[] oldStart = start;
+      start = new int[size];
+      int[] oldLength = length;
+      length = new int[size];
+      byte[][] oldVector = vector;
+      vector = new byte[size][];
+      if (preserveData) {
+        if (isRepeating) {
+          vector[0] = oldVector[0];
+          start[0] = oldStart[0];
+          length[0] = oldLength[0];
+        } else {
+          System.arraycopy(oldVector, 0, vector, 0, oldVector.length);
+          System.arraycopy(oldStart, 0, start, 0, oldStart.length);
+          System.arraycopy(oldLength, 0, length, 0, oldLength.length);
+        }
+      }
+    }
+  }
+}
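
Reviewer note: the following is a minimal, hypothetical sketch (not part of this patch) of the two population styles described in the BytesColumnVector class comment above: by reference with setRef(), and by value with initBuffer()/setVal()/setConcat(). The class and variable names are invented for illustration; only the storage-api classes touched by this commit are assumed.

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;

public class BytesColumnVectorSketch {
  public static void main(String[] args) {
    BytesColumnVector col = new BytesColumnVector(4);

    // By reference: the caller keeps ownership of the byte[] and no copy is made.
    byte[] shared = "hello world".getBytes(StandardCharsets.UTF_8);
    col.setRef(0, shared, 0, 5);   // "hello"
    col.setRef(1, shared, 6, 5);   // "world"

    // By value: initBuffer() must be called before setVal()/setConcat();
    // the bytes are copied into the vector's internal scratch buffer.
    col.initBuffer();
    col.setVal(2, "copied".getBytes(StandardCharsets.UTF_8));
    col.setConcat(3, shared, 0, 5, shared, 5, 6);   // "hello world"

    for (int i = 0; i < 4; ++i) {
      System.out.println(i + ": " + col.toString(i));
    }
  }
}

As the javadoc in the patch notes, setRef() avoids copying and is the faster path, while setVal()/setConcat() copy into a shared scratch buffer, which is why initBuffer() has to run first.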
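Similarly, a small hypothetical sanity check (also not part of this patch) for the HiveIntervalDayTime class added above, showing that valueOf() accepts the same "d h:m:s.nnnnnnnnn" layout documented next to PARSE_PATTERN and produced by toString(); the example class name is invented.

import org.apache.hadoop.hive.common.type.HiveIntervalDayTime;

public class IntervalDayTimeSketch {
  public static void main(String[] args) {
    // 2 days, 3 hours, 4 minutes, 5 seconds and 6 nanoseconds.
    HiveIntervalDayTime interval = new HiveIntervalDayTime(2, 3, 4, 5, 6);
    System.out.println(interval);   // 2 03:04:05.000000006

    // valueOf() accepts the same "d h:m:s.n" layout that toString() produces.
    HiveIntervalDayTime parsed = HiveIntervalDayTime.valueOf("-2 03:04:05.000000006");
    System.out.println(parsed.equals(interval.negate()));   // true
  }
}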
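Finally, a hypothetical illustration (again, not in this patch) of how HiveDecimal.enforcePrecisionScale() behaves when a value does or does not fit the target precision and scale. It assumes HiveDecimal.create(String) and the default toString(), which belong to the same class but fall outside the excerpt shown above.

import org.apache.hadoop.hive.common.type.HiveDecimal;

public class EnforcePrecisionScaleSketch {
  public static void main(String[] args) {
    HiveDecimal d = HiveDecimal.create("123.456");   // precision 6, scale 3

    // Fits decimal(7,3) as-is, so the same value comes back unchanged.
    System.out.println(HiveDecimal.enforcePrecisionScale(d, 7, 3));   // 123.456

    // decimal(5,2): the scale is reduced with HALF_UP rounding.
    System.out.println(HiveDecimal.enforcePrecisionScale(d, 5, 2));   // 123.46

    // decimal(4,2): too many integer digits, so the result is null.
    System.out.println(HiveDecimal.enforcePrecisionScale(d, 4, 2));   // null
  }
}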
