http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/OrcUtils.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java new file mode 100644 index 0000000..5845ba6 --- /dev/null +++ b/java/core/src/java/org/apache/orc/OrcUtils.java @@ -0,0 +1,530 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import com.google.common.collect.Lists; + +public class OrcUtils { + + /** + * Returns selected columns as a boolean array with true value set for specified column names. + * The result will contain number of elements equal to flattened number of columns. + * For example: + * selectedColumns - a,b,c + * allColumns - a,b,c,d + * If column c is a complex type, say list<string> and other types are primitives then result will + * be [false, true, true, true, true, true, false] + * Index 0 is the root element of the struct which is set to false by default, index 1,2 + * corresponds to columns a and b. 
Index 3,4 correspond to column c which is list<string> and + * index 5 correspond to column d. After flattening list<string> gets 2 columns. + * + * @param selectedColumns - comma separated list of selected column names + * @param schema - object schema + * @return - boolean array with true value set for the specified column names + */ + public static boolean[] includeColumns(String selectedColumns, + TypeDescription schema) { + int numFlattenedCols = schema.getMaximumId(); + boolean[] results = new boolean[numFlattenedCols + 1]; + if ("*".equals(selectedColumns)) { + Arrays.fill(results, true); + return results; + } + if (selectedColumns != null && + schema.getCategory() == TypeDescription.Category.STRUCT) { + List<String> fieldNames = schema.getFieldNames(); + List<TypeDescription> fields = schema.getChildren(); + for (String column: selectedColumns.split((","))) { + TypeDescription col = findColumn(column, fieldNames, fields); + if (col != null) { + for(int i=col.getId(); i <= col.getMaximumId(); ++i) { + results[i] = true; + } + } + } + } + return results; + } + + private static TypeDescription findColumn(String columnName, + List<String> fieldNames, + List<TypeDescription> fields) { + int i = 0; + for(String fieldName: fieldNames) { + if (fieldName.equalsIgnoreCase(columnName)) { + return fields.get(i); + } else { + i += 1; + } + } + return null; + } + + public static List<OrcProto.Type> getOrcTypes(TypeDescription typeDescr) { + List<OrcProto.Type> result = Lists.newArrayList(); + appendOrcTypes(result, typeDescr); + return result; + } + + private static void appendOrcTypes(List<OrcProto.Type> result, TypeDescription typeDescr) { + OrcProto.Type.Builder type = OrcProto.Type.newBuilder(); + List<TypeDescription> children = typeDescr.getChildren(); + switch (typeDescr.getCategory()) { + case BOOLEAN: + type.setKind(OrcProto.Type.Kind.BOOLEAN); + break; + case BYTE: + type.setKind(OrcProto.Type.Kind.BYTE); + break; + case SHORT: + 
type.setKind(OrcProto.Type.Kind.SHORT); + break; + case INT: + type.setKind(OrcProto.Type.Kind.INT); + break; + case LONG: + type.setKind(OrcProto.Type.Kind.LONG); + break; + case FLOAT: + type.setKind(OrcProto.Type.Kind.FLOAT); + break; + case DOUBLE: + type.setKind(OrcProto.Type.Kind.DOUBLE); + break; + case STRING: + type.setKind(OrcProto.Type.Kind.STRING); + break; + case CHAR: + type.setKind(OrcProto.Type.Kind.CHAR); + type.setMaximumLength(typeDescr.getMaxLength()); + break; + case VARCHAR: + type.setKind(OrcProto.Type.Kind.VARCHAR); + type.setMaximumLength(typeDescr.getMaxLength()); + break; + case BINARY: + type.setKind(OrcProto.Type.Kind.BINARY); + break; + case TIMESTAMP: + type.setKind(OrcProto.Type.Kind.TIMESTAMP); + break; + case DATE: + type.setKind(OrcProto.Type.Kind.DATE); + break; + case DECIMAL: + type.setKind(OrcProto.Type.Kind.DECIMAL); + type.setPrecision(typeDescr.getPrecision()); + type.setScale(typeDescr.getScale()); + break; + case LIST: + type.setKind(OrcProto.Type.Kind.LIST); + type.addSubtypes(children.get(0).getId()); + break; + case MAP: + type.setKind(OrcProto.Type.Kind.MAP); + for(TypeDescription t: children) { + type.addSubtypes(t.getId()); + } + break; + case STRUCT: + type.setKind(OrcProto.Type.Kind.STRUCT); + for(TypeDescription t: children) { + type.addSubtypes(t.getId()); + } + for(String field: typeDescr.getFieldNames()) { + type.addFieldNames(field); + } + break; + case UNION: + type.setKind(OrcProto.Type.Kind.UNION); + for(TypeDescription t: children) { + type.addSubtypes(t.getId()); + } + break; + default: + throw new IllegalArgumentException("Unknown category: " + + typeDescr.getCategory()); + } + result.add(type.build()); + if (children != null) { + for(TypeDescription child: children) { + appendOrcTypes(result, child); + } + } + } + + /** + * NOTE: This method ignores the subtype numbers in the TypeDescription rebuilds the subtype + * numbers based on the length of the result list being appended. 
+ * + * @param result + * @param typeDescr + */ + public static void appendOrcTypesRebuildSubtypes(List<OrcProto.Type> result, + TypeDescription typeDescr) { + + int subtype = result.size(); + OrcProto.Type.Builder type = OrcProto.Type.newBuilder(); + boolean needsAdd = true; + List<TypeDescription> children = typeDescr.getChildren(); + switch (typeDescr.getCategory()) { + case BOOLEAN: + type.setKind(OrcProto.Type.Kind.BOOLEAN); + break; + case BYTE: + type.setKind(OrcProto.Type.Kind.BYTE); + break; + case SHORT: + type.setKind(OrcProto.Type.Kind.SHORT); + break; + case INT: + type.setKind(OrcProto.Type.Kind.INT); + break; + case LONG: + type.setKind(OrcProto.Type.Kind.LONG); + break; + case FLOAT: + type.setKind(OrcProto.Type.Kind.FLOAT); + break; + case DOUBLE: + type.setKind(OrcProto.Type.Kind.DOUBLE); + break; + case STRING: + type.setKind(OrcProto.Type.Kind.STRING); + break; + case CHAR: + type.setKind(OrcProto.Type.Kind.CHAR); + type.setMaximumLength(typeDescr.getMaxLength()); + break; + case VARCHAR: + type.setKind(OrcProto.Type.Kind.VARCHAR); + type.setMaximumLength(typeDescr.getMaxLength()); + break; + case BINARY: + type.setKind(OrcProto.Type.Kind.BINARY); + break; + case TIMESTAMP: + type.setKind(OrcProto.Type.Kind.TIMESTAMP); + break; + case DATE: + type.setKind(OrcProto.Type.Kind.DATE); + break; + case DECIMAL: + type.setKind(OrcProto.Type.Kind.DECIMAL); + type.setPrecision(typeDescr.getPrecision()); + type.setScale(typeDescr.getScale()); + break; + case LIST: + type.setKind(OrcProto.Type.Kind.LIST); + type.addSubtypes(++subtype); + result.add(type.build()); + needsAdd = false; + appendOrcTypesRebuildSubtypes(result, children.get(0)); + break; + case MAP: + { + // Make room for MAP type. + result.add(null); + + // Add MAP type pair in order to determine their subtype values. 
+ appendOrcTypesRebuildSubtypes(result, children.get(0)); + int subtype2 = result.size(); + appendOrcTypesRebuildSubtypes(result, children.get(1)); + type.setKind(OrcProto.Type.Kind.MAP); + type.addSubtypes(subtype + 1); + type.addSubtypes(subtype2); + result.set(subtype, type.build()); + needsAdd = false; + } + break; + case STRUCT: + { + List<String> fieldNames = typeDescr.getFieldNames(); + + // Make room for STRUCT type. + result.add(null); + + List<Integer> fieldSubtypes = new ArrayList<Integer>(fieldNames.size()); + for(TypeDescription child: children) { + int fieldSubtype = result.size(); + fieldSubtypes.add(fieldSubtype); + appendOrcTypesRebuildSubtypes(result, child); + } + + type.setKind(OrcProto.Type.Kind.STRUCT); + + for (int i = 0 ; i < fieldNames.size(); i++) { + type.addSubtypes(fieldSubtypes.get(i)); + type.addFieldNames(fieldNames.get(i)); + } + result.set(subtype, type.build()); + needsAdd = false; + } + break; + case UNION: + { + // Make room for UNION type. + result.add(null); + + List<Integer> unionSubtypes = new ArrayList<Integer>(children.size()); + for(TypeDescription child: children) { + int unionSubtype = result.size(); + unionSubtypes.add(unionSubtype); + appendOrcTypesRebuildSubtypes(result, child); + } + + type.setKind(OrcProto.Type.Kind.UNION); + for (int i = 0 ; i < children.size(); i++) { + type.addSubtypes(unionSubtypes.get(i)); + } + result.set(subtype, type.build()); + needsAdd = false; + } + break; + default: + throw new IllegalArgumentException("Unknown category: " + typeDescr.getCategory()); + } + if (needsAdd) { + result.add(type.build()); + } + } + + /** + * NOTE: This method ignores the subtype numbers in the OrcProto.Type rebuilds the subtype + * numbers based on the length of the result list being appended. 
+ * + * @param result + * @param types + * @param columnId + */ + public static int appendOrcTypesRebuildSubtypes(List<OrcProto.Type> result, + List<OrcProto.Type> types, int columnId) { + + OrcProto.Type oldType = types.get(columnId++); + + int subtype = result.size(); + OrcProto.Type.Builder builder = OrcProto.Type.newBuilder(); + boolean needsAdd = true; + switch (oldType.getKind()) { + case BOOLEAN: + builder.setKind(OrcProto.Type.Kind.BOOLEAN); + break; + case BYTE: + builder.setKind(OrcProto.Type.Kind.BYTE); + break; + case SHORT: + builder.setKind(OrcProto.Type.Kind.SHORT); + break; + case INT: + builder.setKind(OrcProto.Type.Kind.INT); + break; + case LONG: + builder.setKind(OrcProto.Type.Kind.LONG); + break; + case FLOAT: + builder.setKind(OrcProto.Type.Kind.FLOAT); + break; + case DOUBLE: + builder.setKind(OrcProto.Type.Kind.DOUBLE); + break; + case STRING: + builder.setKind(OrcProto.Type.Kind.STRING); + break; + case CHAR: + builder.setKind(OrcProto.Type.Kind.CHAR); + builder.setMaximumLength(oldType.getMaximumLength()); + break; + case VARCHAR: + builder.setKind(OrcProto.Type.Kind.VARCHAR); + builder.setMaximumLength(oldType.getMaximumLength()); + break; + case BINARY: + builder.setKind(OrcProto.Type.Kind.BINARY); + break; + case TIMESTAMP: + builder.setKind(OrcProto.Type.Kind.TIMESTAMP); + break; + case DATE: + builder.setKind(OrcProto.Type.Kind.DATE); + break; + case DECIMAL: + builder.setKind(OrcProto.Type.Kind.DECIMAL); + builder.setPrecision(oldType.getPrecision()); + builder.setScale(oldType.getScale()); + break; + case LIST: + builder.setKind(OrcProto.Type.Kind.LIST); + builder.addSubtypes(++subtype); + result.add(builder.build()); + needsAdd = false; + columnId = appendOrcTypesRebuildSubtypes(result, types, columnId); + break; + case MAP: + { + // Make room for MAP type. + result.add(null); + + // Add MAP type pair in order to determine their subtype values. 
+ columnId = appendOrcTypesRebuildSubtypes(result, types, columnId); + int subtype2 = result.size(); + columnId = appendOrcTypesRebuildSubtypes(result, types, columnId); + builder.setKind(OrcProto.Type.Kind.MAP); + builder.addSubtypes(subtype + 1); + builder.addSubtypes(subtype2); + result.set(subtype, builder.build()); + needsAdd = false; + } + break; + case STRUCT: + { + List<String> fieldNames = oldType.getFieldNamesList(); + + // Make room for STRUCT type. + result.add(null); + + List<Integer> fieldSubtypes = new ArrayList<Integer>(fieldNames.size()); + for(int i = 0 ; i < fieldNames.size(); i++) { + int fieldSubtype = result.size(); + fieldSubtypes.add(fieldSubtype); + columnId = appendOrcTypesRebuildSubtypes(result, types, columnId); + } + + builder.setKind(OrcProto.Type.Kind.STRUCT); + + for (int i = 0 ; i < fieldNames.size(); i++) { + builder.addSubtypes(fieldSubtypes.get(i)); + builder.addFieldNames(fieldNames.get(i)); + } + result.set(subtype, builder.build()); + needsAdd = false; + } + break; + case UNION: + { + int subtypeCount = oldType.getSubtypesCount(); + + // Make room for UNION type. + result.add(null); + + List<Integer> unionSubtypes = new ArrayList<Integer>(subtypeCount); + for(int i = 0 ; i < subtypeCount; i++) { + int unionSubtype = result.size(); + unionSubtypes.add(unionSubtype); + columnId = appendOrcTypesRebuildSubtypes(result, types, columnId); + } + + builder.setKind(OrcProto.Type.Kind.UNION); + for (int i = 0 ; i < subtypeCount; i++) { + builder.addSubtypes(unionSubtypes.get(i)); + } + result.set(subtype, builder.build()); + needsAdd = false; + } + break; + default: + throw new IllegalArgumentException("Unknown category: " + oldType.getKind()); + } + if (needsAdd) { + result.add(builder.build()); + } + return columnId; + } + + /** + * Translate the given rootColumn from the list of types to a TypeDescription. 
+ * @param types all of the types + * @param rootColumn translate this type + * @return a new TypeDescription that matches the given rootColumn + */ + public static + TypeDescription convertTypeFromProtobuf(List<OrcProto.Type> types, + int rootColumn) { + OrcProto.Type type = types.get(rootColumn); + switch (type.getKind()) { + case BOOLEAN: + return TypeDescription.createBoolean(); + case BYTE: + return TypeDescription.createByte(); + case SHORT: + return TypeDescription.createShort(); + case INT: + return TypeDescription.createInt(); + case LONG: + return TypeDescription.createLong(); + case FLOAT: + return TypeDescription.createFloat(); + case DOUBLE: + return TypeDescription.createDouble(); + case STRING: + return TypeDescription.createString(); + case CHAR: + case VARCHAR: { + TypeDescription result = type.getKind() == OrcProto.Type.Kind.CHAR ? + TypeDescription.createChar() : TypeDescription.createVarchar(); + if (type.hasMaximumLength()) { + result.withMaxLength(type.getMaximumLength()); + } + return result; + } + case BINARY: + return TypeDescription.createBinary(); + case TIMESTAMP: + return TypeDescription.createTimestamp(); + case DATE: + return TypeDescription.createDate(); + case DECIMAL: { + TypeDescription result = TypeDescription.createDecimal(); + if (type.hasScale()) { + result.withScale(type.getScale()); + } + if (type.hasPrecision()) { + result.withPrecision(type.getPrecision()); + } + return result; + } + case LIST: + return TypeDescription.createList( + convertTypeFromProtobuf(types, type.getSubtypes(0))); + case MAP: + return TypeDescription.createMap( + convertTypeFromProtobuf(types, type.getSubtypes(0)), + convertTypeFromProtobuf(types, type.getSubtypes(1))); + case STRUCT: { + TypeDescription result = TypeDescription.createStruct(); + for(int f=0; f < type.getSubtypesCount(); ++f) { + result.addField(type.getFieldNames(f), + convertTypeFromProtobuf(types, type.getSubtypes(f))); + } + return result; + } + case UNION: { + TypeDescription 
result = TypeDescription.createUnion(); + for(int f=0; f < type.getSubtypesCount(); ++f) { + result.addUnionChild( + convertTypeFromProtobuf(types, type.getSubtypes(f))); + } + return result; + } + } + throw new IllegalArgumentException("Unknown ORC type " + type.getKind()); + } +}
http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/Reader.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/Reader.java b/java/core/src/java/org/apache/orc/Reader.java new file mode 100644 index 0000000..87f3293 --- /dev/null +++ b/java/core/src/java/org/apache/orc/Reader.java @@ -0,0 +1,368 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; + +import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; + +/** + * The interface for reading ORC files. + * + * One Reader can support multiple concurrent RecordReader. + */ +public interface Reader { + + /** + * Get the number of rows in the file. 
+ * @return the number of rows + */ + long getNumberOfRows(); + + /** + * Get the deserialized data size of the file + * @return raw data size + */ + long getRawDataSize(); + + /** + * Get the deserialized data size of the specified columns + * @param colNames + * @return raw data size of columns + */ + long getRawDataSizeOfColumns(List<String> colNames); + + /** + * Get the deserialized data size of the specified columns ids + * @param colIds - internal column id (check orcfiledump for column ids) + * @return raw data size of columns + */ + long getRawDataSizeFromColIndices(List<Integer> colIds); + + /** + * Get the user metadata keys. + * @return the set of metadata keys + */ + List<String> getMetadataKeys(); + + /** + * Get a user metadata value. + * @param key a key given by the user + * @return the bytes associated with the given key + */ + ByteBuffer getMetadataValue(String key); + + /** + * Did the user set the given metadata value. + * @param key the key to check + * @return true if the metadata value was set + */ + boolean hasMetadataValue(String key); + + /** + * Get the compression kind. + * @return the kind of compression in the file + */ + CompressionKind getCompressionKind(); + + /** + * Get the buffer size for the compression. + * @return number of bytes to buffer for the compression codec. + */ + int getCompressionSize(); + + /** + * Get the number of rows per a entry in the row index. + * @return the number of rows per an entry in the row index or 0 if there + * is no row index. + */ + int getRowIndexStride(); + + /** + * Get the list of stripes. + * @return the information about the stripes in order + */ + List<StripeInformation> getStripes(); + + /** + * Get the length of the file. + * @return the number of bytes in the file + */ + long getContentLength(); + + /** + * Get the statistics about the columns in the file. 
+ * @return the information about the column + */ + ColumnStatistics[] getStatistics(); + + /** + * Get the type of rows in this ORC file. + */ + TypeDescription getSchema(); + + /** + * Get the list of types contained in the file. The root type is the first + * type in the list. + * @return the list of flattened types + * @deprecated use getSchema instead + */ + List<OrcProto.Type> getTypes(); + + /** + * Get the file format version. + */ + OrcFile.Version getFileVersion(); + + /** + * Get the version of the writer of this file. + */ + OrcFile.WriterVersion getWriterVersion(); + + /** + * Options for creating a RecordReader. + */ + public static class Options { + private boolean[] include; + private long offset = 0; + private long length = Long.MAX_VALUE; + private SearchArgument sarg = null; + private String[] columnNames = null; + private Boolean useZeroCopy = null; + private Boolean skipCorruptRecords = null; + private TypeDescription schema = null; + private DataReader dataReader = null; + + /** + * Set the list of columns to read. + * @param include a list of columns to read + * @return this + */ + public Options include(boolean[] include) { + this.include = include; + return this; + } + + /** + * Set the range of bytes to read + * @param offset the starting byte offset + * @param length the number of bytes to read + * @return this + */ + public Options range(long offset, long length) { + this.offset = offset; + this.length = length; + return this; + } + + /** + * Set the schema on read type description. + */ + public Options schema(TypeDescription schema) { + this.schema = schema; + return this; + } + + /** + * Set search argument for predicate push down. + * @param sarg the search argument + * @param columnNames the column names for + * @return this + */ + public Options searchArgument(SearchArgument sarg, String[] columnNames) { + this.sarg = sarg; + this.columnNames = columnNames; + return this; + } + + /** + * Set whether to use zero copy from HDFS. 
+ * @param value the new zero copy flag + * @return this + */ + public Options useZeroCopy(boolean value) { + this.useZeroCopy = value; + return this; + } + + public Options dataReader(DataReader value) { + this.dataReader = value; + return this; + } + + /** + * Set whether to skip corrupt records. + * @param value the new skip corrupt records flag + * @return this + */ + public Options skipCorruptRecords(boolean value) { + this.skipCorruptRecords = value; + return this; + } + + public boolean[] getInclude() { + return include; + } + + public long getOffset() { + return offset; + } + + public long getLength() { + return length; + } + + public TypeDescription getSchema() { + return schema; + } + + public SearchArgument getSearchArgument() { + return sarg; + } + + public String[] getColumnNames() { + return columnNames; + } + + public long getMaxOffset() { + long result = offset + length; + if (result < 0) { + result = Long.MAX_VALUE; + } + return result; + } + + public Boolean getUseZeroCopy() { + return useZeroCopy; + } + + public Boolean getSkipCorruptRecords() { + return skipCorruptRecords; + } + + public DataReader getDataReader() { + return dataReader; + } + + public Options clone() { + Options result = new Options(); + result.include = include; + result.offset = offset; + result.length = length; + result.sarg = sarg; + result.schema = schema; + result.columnNames = columnNames; + result.useZeroCopy = useZeroCopy; + result.skipCorruptRecords = skipCorruptRecords; + result.dataReader = dataReader == null ? 
null : dataReader.clone(); + return result; + } + + @Override + public String toString() { + StringBuilder buffer = new StringBuilder(); + buffer.append("{include: "); + if (include == null) { + buffer.append("null"); + } else { + buffer.append("["); + for(int i=0; i < include.length; ++i) { + if (i != 0) { + buffer.append(", "); + } + buffer.append(include[i]); + } + buffer.append("]"); + } + buffer.append(", offset: "); + buffer.append(offset); + buffer.append(", length: "); + buffer.append(length); + if (sarg != null) { + buffer.append(", sarg: "); + buffer.append(sarg.toString()); + buffer.append(", columns: ["); + for(int i=0; i < columnNames.length; ++i) { + if (i != 0) { + buffer.append(", "); + } + buffer.append("'"); + buffer.append(columnNames[i]); + buffer.append("'"); + } + buffer.append("]"); + } + if (schema != null) { + buffer.append(", schema: "); + schema.printToBuffer(buffer); + } + buffer.append("}"); + return buffer.toString(); + } + } + + /** + * Create a RecordReader that reads everything with the default options. + * @return a new RecordReader + * @throws IOException + */ + RecordReader rows() throws IOException; + + /** + * Create a RecordReader that uses the options given. + * This method can't be named rows, because many callers used rows(null) + * before the rows() method was introduced. + * @param options the options to read with + * @return a new RecordReader + * @throws IOException + */ + RecordReader rows(Options options) throws IOException; + + /** + * @return List of integers representing version of the file, in order from major to minor. + */ + List<Integer> getVersionList(); + + /** + * @return Gets the size of metadata, in bytes. + */ + int getMetadataSize(); + + /** + * @return Stripe statistics, in original protobuf form. + */ + List<OrcProto.StripeStatistics> getOrcProtoStripeStatistics(); + + /** + * @return Stripe statistics. 
+ */ + List<StripeStatistics> getStripeStatistics(); + + /** + * @return File statistics, in original protobuf form. + */ + List<OrcProto.ColumnStatistics> getOrcProtoFileStatistics(); + + /** + * @return Serialized file metadata read from disk for the purposes of caching, etc. + */ + ByteBuffer getSerializedFileFooter(); +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/RecordReader.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/RecordReader.java b/java/core/src/java/org/apache/orc/RecordReader.java new file mode 100644 index 0000000..09ba0f0 --- /dev/null +++ b/java/core/src/java/org/apache/orc/RecordReader.java @@ -0,0 +1,64 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc; + +import java.io.IOException; + +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; + +/** + * A row-by-row iterator for ORC files. + */ +public interface RecordReader { + /** + * Read the next row batch. The size of the batch to read cannot be + * controlled by the callers. Caller need to look at + * VectorizedRowBatch.size of the retunred object to know the batch + * size read. 
+ * @param batch a row batch object to read into + * @return were more rows available to read? + * @throws java.io.IOException + */ + boolean nextBatch(VectorizedRowBatch batch) throws IOException; + + /** + * Get the row number of the row that will be returned by the following + * call to next(). + * @return the row number from 0 to the number of rows in the file + * @throws java.io.IOException + */ + long getRowNumber() throws IOException; + + /** + * Get the progress of the reader through the rows. + * @return a fraction between 0.0 and 1.0 of rows read + * @throws java.io.IOException + */ + float getProgress() throws IOException; + + /** + * Release the resources associated with the given reader. + * @throws java.io.IOException + */ + void close() throws IOException; + + /** + * Seek to a particular row number. + */ + void seekToRow(long rowCount) throws IOException; +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/StringColumnStatistics.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/StringColumnStatistics.java b/java/core/src/java/org/apache/orc/StringColumnStatistics.java new file mode 100644 index 0000000..5a868d0 --- /dev/null +++ b/java/core/src/java/org/apache/orc/StringColumnStatistics.java @@ -0,0 +1,43 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc; + +import org.apache.orc.ColumnStatistics; + +/** + * Statistics for string columns. + */ +public interface StringColumnStatistics extends ColumnStatistics { + /** + * Get the minimum string. + * @return the minimum + */ + String getMinimum(); + + /** + * Get the maximum string. + * @return the maximum + */ + String getMaximum(); + + /** + * Get the total length of all strings + * @return the sum (total length) + */ + long getSum(); +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/StripeInformation.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/StripeInformation.java b/java/core/src/java/org/apache/orc/StripeInformation.java new file mode 100644 index 0000000..38f7eba --- /dev/null +++ b/java/core/src/java/org/apache/orc/StripeInformation.java @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc; + +/** + * Information about the stripes in an ORC file that is provided by the Reader. + */ +public interface StripeInformation { + /** + * Get the byte offset of the start of the stripe. + * @return the bytes from the start of the file + */ + long getOffset(); + + /** + * Get the total length of the stripe in bytes. + * @return the number of bytes in the stripe + */ + long getLength(); + + /** + * Get the length of the stripe's indexes. + * @return the number of bytes in the index + */ + long getIndexLength(); + + /** + * Get the length of the stripe's data. + * @return the number of bytes in the stripe + */ + long getDataLength(); + + /** + * Get the length of the stripe's tail section, which contains its index. + * @return the number of bytes in the tail + */ + long getFooterLength(); + + /** + * Get the number of rows in the stripe. + * @return a count of the number of rows + */ + long getNumberOfRows(); +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/StripeStatistics.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/StripeStatistics.java b/java/core/src/java/org/apache/orc/StripeStatistics.java new file mode 100644 index 0000000..8fc91cb --- /dev/null +++ b/java/core/src/java/org/apache/orc/StripeStatistics.java @@ -0,0 +1,44 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc; + +import org.apache.orc.impl.ColumnStatisticsImpl; + +import java.util.List; + +public class StripeStatistics { + private final List<OrcProto.ColumnStatistics> cs; + + public StripeStatistics(List<OrcProto.ColumnStatistics> list) { + this.cs = list; + } + + /** + * Return list of column statistics + * + * @return column stats + */ + public ColumnStatistics[] getColumnStatistics() { + ColumnStatistics[] result = new ColumnStatistics[cs.size()]; + for (int i = 0; i < result.length; ++i) { + result[i] = ColumnStatisticsImpl.deserialize(cs.get(i)); + } + return result; + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/TimestampColumnStatistics.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/TimestampColumnStatistics.java b/java/core/src/java/org/apache/orc/TimestampColumnStatistics.java new file mode 100644 index 0000000..27dc49f --- /dev/null +++ b/java/core/src/java/org/apache/orc/TimestampColumnStatistics.java @@ -0,0 +1,38 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
/**
 * Statistics for Timestamp columns.
 */
public interface TimestampColumnStatistics extends ColumnStatistics {
  /**
   * Get the minimum value for the column.
   * NOTE(review): presumably null when the column contains no non-null
   * values — confirm against the implementation.
   * @return minimum value
   */
  Timestamp getMinimum();

  /**
   * Get the maximum value for the column.
   * NOTE(review): presumably null when the column contains no non-null
   * values — confirm against the implementation.
   * @return maximum value
   */
  Timestamp getMaximum();
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * This is the description of the types in an ORC file. 
+ */ +public class TypeDescription { + private static final int MAX_PRECISION = 38; + private static final int MAX_SCALE = 38; + private static final int DEFAULT_PRECISION = 38; + private static final int DEFAULT_SCALE = 10; + private static final int DEFAULT_LENGTH = 256; + public enum Category { + BOOLEAN("boolean", true), + BYTE("tinyint", true), + SHORT("smallint", true), + INT("int", true), + LONG("bigint", true), + FLOAT("float", true), + DOUBLE("double", true), + STRING("string", true), + DATE("date", true), + TIMESTAMP("timestamp", true), + BINARY("binary", true), + DECIMAL("decimal", true), + VARCHAR("varchar", true), + CHAR("char", true), + LIST("array", false), + MAP("map", false), + STRUCT("struct", false), + UNION("uniontype", false); + + Category(String name, boolean isPrimitive) { + this.name = name; + this.isPrimitive = isPrimitive; + } + + final boolean isPrimitive; + final String name; + + public boolean isPrimitive() { + return isPrimitive; + } + + public String getName() { + return name; + } + } + + public static TypeDescription createBoolean() { + return new TypeDescription(Category.BOOLEAN); + } + + public static TypeDescription createByte() { + return new TypeDescription(Category.BYTE); + } + + public static TypeDescription createShort() { + return new TypeDescription(Category.SHORT); + } + + public static TypeDescription createInt() { + return new TypeDescription(Category.INT); + } + + public static TypeDescription createLong() { + return new TypeDescription(Category.LONG); + } + + public static TypeDescription createFloat() { + return new TypeDescription(Category.FLOAT); + } + + public static TypeDescription createDouble() { + return new TypeDescription(Category.DOUBLE); + } + + public static TypeDescription createString() { + return new TypeDescription(Category.STRING); + } + + public static TypeDescription createDate() { + return new TypeDescription(Category.DATE); + } + + public static TypeDescription createTimestamp() { + return new 
TypeDescription(Category.TIMESTAMP); + } + + public static TypeDescription createBinary() { + return new TypeDescription(Category.BINARY); + } + + public static TypeDescription createDecimal() { + return new TypeDescription(Category.DECIMAL); + } + + static class StringPosition { + final String value; + int position; + final int length; + + StringPosition(String value) { + this.value = value; + position = 0; + length = value.length(); + } + + @Override + public String toString() { + StringBuilder buffer = new StringBuilder(); + buffer.append('\''); + buffer.append(value.substring(0, position)); + buffer.append('^'); + buffer.append(value.substring(position)); + buffer.append('\''); + return buffer.toString(); + } + } + + static Category parseCategory(StringPosition source) { + int start = source.position; + while (source.position < source.length) { + char ch = source.value.charAt(source.position); + if (!Character.isLetter(ch)) { + break; + } + source.position += 1; + } + if (source.position != start) { + String word = source.value.substring(start, source.position).toLowerCase(); + for (Category cat : Category.values()) { + if (cat.getName().equals(word)) { + return cat; + } + } + } + throw new IllegalArgumentException("Can't parse category at " + source); + } + + static int parseInt(StringPosition source) { + int start = source.position; + int result = 0; + while (source.position < source.length) { + char ch = source.value.charAt(source.position); + if (!Character.isDigit(ch)) { + break; + } + result = result * 10 + (ch - '0'); + source.position += 1; + } + if (source.position == start) { + throw new IllegalArgumentException("Missing integer at " + source); + } + return result; + } + + static String parseName(StringPosition source) { + int start = source.position; + while (source.position < source.length) { + char ch = source.value.charAt(source.position); + if (!Character.isLetterOrDigit(ch) && ch != '.' 
&& ch != '_') { + break; + } + source.position += 1; + } + if (source.position == start) { + throw new IllegalArgumentException("Missing name at " + source); + } + return source.value.substring(start, source.position); + } + + static void requireChar(StringPosition source, char required) { + if (source.position >= source.length || + source.value.charAt(source.position) != required) { + throw new IllegalArgumentException("Missing required char '" + + required + "' at " + source); + } + source.position += 1; + } + + static boolean consumeChar(StringPosition source, char ch) { + boolean result = source.position < source.length && + source.value.charAt(source.position) == ch; + if (result) { + source.position += 1; + } + return result; + } + + static void parseUnion(TypeDescription type, StringPosition source) { + requireChar(source, '<'); + do { + type.addUnionChild(parseType(source)); + } while (consumeChar(source, ',')); + requireChar(source, '>'); + } + + static void parseStruct(TypeDescription type, StringPosition source) { + requireChar(source, '<'); + do { + String fieldName = parseName(source); + requireChar(source, ':'); + type.addField(fieldName, parseType(source)); + } while (consumeChar(source, ',')); + requireChar(source, '>'); + } + + static TypeDescription parseType(StringPosition source) { + TypeDescription result = new TypeDescription(parseCategory(source)); + switch (result.getCategory()) { + case BINARY: + case BOOLEAN: + case BYTE: + case DATE: + case DOUBLE: + case FLOAT: + case INT: + case LONG: + case SHORT: + case STRING: + case TIMESTAMP: + break; + case CHAR: + case VARCHAR: + requireChar(source, '('); + result.withMaxLength(parseInt(source)); + requireChar(source, ')'); + break; + case DECIMAL: { + requireChar(source, '('); + int precision = parseInt(source); + requireChar(source, ','); + result.withScale(parseInt(source)); + result.withPrecision(precision); + requireChar(source, ')'); + break; + } + case LIST: + requireChar(source, '<'); + 
result.children.add(parseType(source)); + requireChar(source, '>'); + break; + case MAP: + requireChar(source, '<'); + result.children.add(parseType(source)); + requireChar(source, ','); + result.children.add(parseType(source)); + requireChar(source, '>'); + break; + case UNION: + parseUnion(result, source); + break; + case STRUCT: + parseStruct(result, source); + break; + default: + throw new IllegalArgumentException("Unknown type " + + result.getCategory() + " at " + source); + } + return result; + } + + /** + * Parse TypeDescription from the Hive type names. This is the inverse + * of TypeDescription.toString() + * @param typeName the name of the type + * @return a new TypeDescription or null if typeName was null + * @throws IllegalArgumentException if the string is badly formed + */ + public static TypeDescription fromString(String typeName) { + if (typeName == null) { + return null; + } + StringPosition source = new StringPosition(typeName); + TypeDescription result = parseType(source); + if (source.position != source.length) { + throw new IllegalArgumentException("Extra characters at " + source); + } + return result; + } + + /** + * For decimal types, set the precision. + * @param precision the new precision + * @return this + */ + public TypeDescription withPrecision(int precision) { + if (category != Category.DECIMAL) { + throw new IllegalArgumentException("precision is only allowed on decimal"+ + " and not " + category.name); + } else if (precision < 1 || precision > MAX_PRECISION || scale > precision){ + throw new IllegalArgumentException("precision " + precision + + " is out of range 1 .. " + scale); + } + this.precision = precision; + return this; + } + + /** + * For decimal types, set the scale. 
+ * @param scale the new scale + * @return this + */ + public TypeDescription withScale(int scale) { + if (category != Category.DECIMAL) { + throw new IllegalArgumentException("scale is only allowed on decimal"+ + " and not " + category.name); + } else if (scale < 0 || scale > MAX_SCALE || scale > precision) { + throw new IllegalArgumentException("scale is out of range at " + scale); + } + this.scale = scale; + return this; + } + + public static TypeDescription createVarchar() { + return new TypeDescription(Category.VARCHAR); + } + + public static TypeDescription createChar() { + return new TypeDescription(Category.CHAR); + } + + /** + * Set the maximum length for char and varchar types. + * @param maxLength the maximum value + * @return this + */ + public TypeDescription withMaxLength(int maxLength) { + if (category != Category.VARCHAR && category != Category.CHAR) { + throw new IllegalArgumentException("maxLength is only allowed on char" + + " and varchar and not " + category.name); + } + this.maxLength = maxLength; + return this; + } + + public static TypeDescription createList(TypeDescription childType) { + TypeDescription result = new TypeDescription(Category.LIST); + result.children.add(childType); + childType.parent = result; + return result; + } + + public static TypeDescription createMap(TypeDescription keyType, + TypeDescription valueType) { + TypeDescription result = new TypeDescription(Category.MAP); + result.children.add(keyType); + result.children.add(valueType); + keyType.parent = result; + valueType.parent = result; + return result; + } + + public static TypeDescription createUnion() { + return new TypeDescription(Category.UNION); + } + + public static TypeDescription createStruct() { + return new TypeDescription(Category.STRUCT); + } + + /** + * Add a child to a union type. + * @param child a new child type to add + * @return the union type. 
+ */ + public TypeDescription addUnionChild(TypeDescription child) { + if (category != Category.UNION) { + throw new IllegalArgumentException("Can only add types to union type" + + " and not " + category); + } + children.add(child); + child.parent = this; + return this; + } + + /** + * Add a field to a struct type as it is built. + * @param field the field name + * @param fieldType the type of the field + * @return the struct type + */ + public TypeDescription addField(String field, TypeDescription fieldType) { + if (category != Category.STRUCT) { + throw new IllegalArgumentException("Can only add fields to struct type" + + " and not " + category); + } + fieldNames.add(field); + children.add(fieldType); + fieldType.parent = this; + return this; + } + + /** + * Get the id for this type. + * The first call will cause all of the the ids in tree to be assigned, so + * it should not be called before the type is completely built. + * @return the sequential id + */ + public int getId() { + // if the id hasn't been assigned, assign all of the ids from the root + if (id == -1) { + TypeDescription root = this; + while (root.parent != null) { + root = root.parent; + } + root.assignIds(0); + } + return id; + } + + public TypeDescription clone() { + TypeDescription result = new TypeDescription(category); + result.maxLength = maxLength; + result.precision = precision; + result.scale = scale; + if (fieldNames != null) { + result.fieldNames.addAll(fieldNames); + } + if (children != null) { + for(TypeDescription child: children) { + TypeDescription clone = child.clone(); + clone.parent = result; + result.children.add(clone); + } + } + return result; + } + + @Override + public int hashCode() { + return getId(); + } + + @Override + public boolean equals(Object other) { + if (other == null || other.getClass() != TypeDescription.class) { + return false; + } + if (other == this) { + return true; + } + TypeDescription castOther = (TypeDescription) other; + if (category != 
castOther.category || + getId() != castOther.getId() || + getMaximumId() != castOther.getMaximumId() || + maxLength != castOther.maxLength || + scale != castOther.scale || + precision != castOther.precision) { + return false; + } + if (children != null) { + if (children.size() != castOther.children.size()) { + return false; + } + for (int i = 0; i < children.size(); ++i) { + if (!children.get(i).equals(castOther.children.get(i))) { + return false; + } + } + } + if (category == Category.STRUCT) { + for(int i=0; i < fieldNames.size(); ++i) { + if (!fieldNames.get(i).equals(castOther.fieldNames.get(i))) { + return false; + } + } + } + return true; + } + + /** + * Get the maximum id assigned to this type or its children. + * The first call will cause all of the the ids in tree to be assigned, so + * it should not be called before the type is completely built. + * @return the maximum id assigned under this type + */ + public int getMaximumId() { + // if the id hasn't been assigned, assign all of the ids from the root + if (maxId == -1) { + TypeDescription root = this; + while (root.parent != null) { + root = root.parent; + } + root.assignIds(0); + } + return maxId; + } + + private ColumnVector createColumn(int maxSize) { + switch (category) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + case DATE: + return new LongColumnVector(maxSize); + case TIMESTAMP: + return new TimestampColumnVector(maxSize); + case FLOAT: + case DOUBLE: + return new DoubleColumnVector(maxSize); + case DECIMAL: + return new DecimalColumnVector(maxSize, precision, scale); + case STRING: + case BINARY: + case CHAR: + case VARCHAR: + return new BytesColumnVector(maxSize); + case STRUCT: { + ColumnVector[] fieldVector = new ColumnVector[children.size()]; + for(int i=0; i < fieldVector.length; ++i) { + fieldVector[i] = children.get(i).createColumn(maxSize); + } + return new StructColumnVector(maxSize, + fieldVector); + } + case UNION: { + ColumnVector[] fieldVector = new 
ColumnVector[children.size()]; + for(int i=0; i < fieldVector.length; ++i) { + fieldVector[i] = children.get(i).createColumn(maxSize); + } + return new UnionColumnVector(maxSize, + fieldVector); + } + case LIST: + return new ListColumnVector(maxSize, + children.get(0).createColumn(maxSize)); + case MAP: + return new MapColumnVector(maxSize, + children.get(0).createColumn(maxSize), + children.get(1).createColumn(maxSize)); + default: + throw new IllegalArgumentException("Unknown type " + category); + } + } + + public VectorizedRowBatch createRowBatch(int maxSize) { + VectorizedRowBatch result; + if (category == Category.STRUCT) { + result = new VectorizedRowBatch(children.size(), maxSize); + for(int i=0; i < result.cols.length; ++i) { + result.cols[i] = children.get(i).createColumn(maxSize); + } + } else { + result = new VectorizedRowBatch(1, maxSize); + result.cols[0] = createColumn(maxSize); + } + result.reset(); + return result; + } + + public VectorizedRowBatch createRowBatch() { + return createRowBatch(VectorizedRowBatch.DEFAULT_SIZE); + } + + /** + * Get the kind of this type. + * @return get the category for this type. + */ + public Category getCategory() { + return category; + } + + /** + * Get the maximum length of the type. Only used for char and varchar types. + * @return the maximum length of the string type + */ + public int getMaxLength() { + return maxLength; + } + + /** + * Get the precision of the decimal type. + * @return the number of digits for the precision. + */ + public int getPrecision() { + return precision; + } + + /** + * Get the scale of the decimal type. + * @return the number of digits for the scale. + */ + public int getScale() { + return scale; + } + + /** + * For struct types, get the list of field names. + * @return the list of field names. + */ + public List<String> getFieldNames() { + return Collections.unmodifiableList(fieldNames); + } + + /** + * Get the subtypes of this type. 
+ * @return the list of children types + */ + public List<TypeDescription> getChildren() { + return children == null ? null : Collections.unmodifiableList(children); + } + + /** + * Assign ids to all of the nodes under this one. + * @param startId the lowest id to assign + * @return the next available id + */ + private int assignIds(int startId) { + id = startId++; + if (children != null) { + for (TypeDescription child : children) { + startId = child.assignIds(startId); + } + } + maxId = startId - 1; + return startId; + } + + private TypeDescription(Category category) { + this.category = category; + if (category.isPrimitive) { + children = null; + } else { + children = new ArrayList<>(); + } + if (category == Category.STRUCT) { + fieldNames = new ArrayList<>(); + } else { + fieldNames = null; + } + } + + private int id = -1; + private int maxId = -1; + private TypeDescription parent; + private final Category category; + private final List<TypeDescription> children; + private final List<String> fieldNames; + private int maxLength = DEFAULT_LENGTH; + private int precision = DEFAULT_PRECISION; + private int scale = DEFAULT_SCALE; + + public void printToBuffer(StringBuilder buffer) { + buffer.append(category.name); + switch (category) { + case DECIMAL: + buffer.append('('); + buffer.append(precision); + buffer.append(','); + buffer.append(scale); + buffer.append(')'); + break; + case CHAR: + case VARCHAR: + buffer.append('('); + buffer.append(maxLength); + buffer.append(')'); + break; + case LIST: + case MAP: + case UNION: + buffer.append('<'); + for(int i=0; i < children.size(); ++i) { + if (i != 0) { + buffer.append(','); + } + children.get(i).printToBuffer(buffer); + } + buffer.append('>'); + break; + case STRUCT: + buffer.append('<'); + for(int i=0; i < children.size(); ++i) { + if (i != 0) { + buffer.append(','); + } + buffer.append(fieldNames.get(i)); + buffer.append(':'); + children.get(i).printToBuffer(buffer); + } + buffer.append('>'); + break; + default: + 
break; + } + } + + public String toString() { + StringBuilder buffer = new StringBuilder(); + printToBuffer(buffer); + return buffer.toString(); + } + + private void printJsonToBuffer(String prefix, StringBuilder buffer, + int indent) { + for(int i=0; i < indent; ++i) { + buffer.append(' '); + } + buffer.append(prefix); + buffer.append("{\"category\": \""); + buffer.append(category.name); + buffer.append("\", \"id\": "); + buffer.append(getId()); + buffer.append(", \"max\": "); + buffer.append(maxId); + switch (category) { + case DECIMAL: + buffer.append(", \"precision\": "); + buffer.append(precision); + buffer.append(", \"scale\": "); + buffer.append(scale); + break; + case CHAR: + case VARCHAR: + buffer.append(", \"length\": "); + buffer.append(maxLength); + break; + case LIST: + case MAP: + case UNION: + buffer.append(", \"children\": ["); + for(int i=0; i < children.size(); ++i) { + buffer.append('\n'); + children.get(i).printJsonToBuffer("", buffer, indent + 2); + if (i != children.size() - 1) { + buffer.append(','); + } + } + buffer.append("]"); + break; + case STRUCT: + buffer.append(", \"fields\": ["); + for(int i=0; i < children.size(); ++i) { + buffer.append('\n'); + children.get(i).printJsonToBuffer("\"" + fieldNames.get(i) + "\": ", + buffer, indent + 2); + if (i != children.size() - 1) { + buffer.append(','); + } + } + buffer.append(']'); + break; + default: + break; + } + buffer.append('}'); + } + + public String toJson() { + StringBuilder buffer = new StringBuilder(); + printJsonToBuffer("", buffer, 0); + return buffer.toString(); + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/Writer.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/Writer.java b/java/core/src/java/org/apache/orc/Writer.java new file mode 100644 index 0000000..4492062 --- /dev/null +++ b/java/core/src/java/org/apache/orc/Writer.java @@ -0,0 +1,114 @@ +/** + * 
Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc; + +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; + +import org.apache.orc.OrcProto; +import org.apache.orc.StripeInformation; +import org.apache.orc.TypeDescription; + +/** + * The interface for writing ORC files. + */ +public interface Writer { + + /** + * Get the schema for this writer + * @return the file schema + */ + TypeDescription getSchema(); + + /** + * Add arbitrary meta-data to the ORC file. This may be called at any point + * until the Writer is closed. If the same key is passed a second time, the + * second value will replace the first. + * @param key a key to label the data with. + * @param value the contents of the metadata. + */ + void addUserMetadata(String key, ByteBuffer value); + + /** + * Add a row batch to the ORC file. + * @param batch the rows to add + */ + void addRowBatch(VectorizedRowBatch batch) throws IOException; + + /** + * Flush all of the buffers and close the file. No methods on this writer + * should be called afterwards. 
+ * @throws IOException + */ + void close() throws IOException; + + /** + * Return the deserialized data size. Raw data size will be compute when + * writing the file footer. Hence raw data size value will be available only + * after closing the writer. + * + * @return raw data size + */ + long getRawDataSize(); + + /** + * Return the number of rows in file. Row count gets updated when flushing + * the stripes. To get accurate row count this method should be called after + * closing the writer. + * + * @return row count + */ + long getNumberOfRows(); + + /** + * Write an intermediate footer on the file such that if the file is + * truncated to the returned offset, it would be a valid ORC file. + * @return the offset that would be a valid end location for an ORC file + */ + long writeIntermediateFooter() throws IOException; + + /** + * Fast stripe append to ORC file. This interface is used for fast ORC file + * merge with other ORC files. When merging, the file to be merged should pass + * stripe in binary form along with stripe information and stripe statistics. + * After appending last stripe of a file, use appendUserMetadata() to append + * any user metadata. + * @param stripe - stripe as byte array + * @param offset - offset within byte array + * @param length - length of stripe within byte array + * @param stripeInfo - stripe information + * @param stripeStatistics - stripe statistics (Protobuf objects can be + * merged directly) + * @throws IOException + */ + public void appendStripe(byte[] stripe, int offset, int length, + StripeInformation stripeInfo, + OrcProto.StripeStatistics stripeStatistics) throws IOException; + + /** + * When fast stripe append is used for merging ORC stripes, after appending + * the last stripe from a file, this interface must be used to merge any + * user metadata. 
+ * @param userMetadata - user metadata + */ + public void appendUserMetadata(List<OrcProto.UserMetadataItem> userMetadata); +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/impl/AcidStats.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/AcidStats.java b/java/core/src/java/org/apache/orc/impl/AcidStats.java new file mode 100644 index 0000000..6657fe9 --- /dev/null +++ b/java/core/src/java/org/apache/orc/impl/AcidStats.java @@ -0,0 +1,60 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Statistics about the ACID operations in an ORC file
 */
public class AcidStats {
  // Mutable public counters, updated directly by the writer.
  public long inserts;
  public long updates;
  public long deletes;

  /** Start with all counters at zero. */
  public AcidStats() {
    this.inserts = 0L;
    this.updates = 0L;
    this.deletes = 0L;
  }

  /**
   * Rebuild the counters from the comma separated form produced by
   * {@link #serialize()}.
   * @param serialized counts as "inserts,updates,deletes"
   */
  public AcidStats(String serialized) {
    String[] fields = serialized.split(",");
    this.inserts = Long.parseLong(fields[0]);
    this.updates = Long.parseLong(fields[1]);
    this.deletes = Long.parseLong(fields[2]);
  }

  /**
   * @return the counters as "inserts,updates,deletes"
   */
  public String serialize() {
    return inserts + "," + updates + "," + deletes;
  }

  @Override
  public String toString() {
    return " inserts: " + inserts
        + " updates: " + updates
        + " deletes: " + deletes;
  }
}
  /**
   * Read the next {@code bitSize}-bit value from the stream, MSB first.
   * Values may straddle byte boundaries; whole buffered bytes are consumed
   * first, then the remaining high bits are taken from a freshly read byte.
   * @return the value, masked to the low {@code bitSize} bits
   * @throws IOException if the underlying stream ends mid-value
   */
  public int next() throws IOException {
    int result = 0;
    int bitsLeftToRead = bitSize;
    // Drain every remaining bit of the current byte while the value still
    // needs more bits than the buffer holds, refilling a byte at a time.
    while (bitsLeftToRead > bitsLeft) {
      result <<= bitsLeft;
      result |= current & ((1 << bitsLeft) - 1);
      bitsLeftToRead -= bitsLeft;
      readByte();
    }
    // Take the final bits from the top of the (partially consumed) byte.
    if (bitsLeftToRead > 0) {
      result <<= bitsLeftToRead;
      bitsLeft -= bitsLeftToRead;
      result |= (current >>> bitsLeft) & ((1 << bitsLeftToRead) - 1);
    }
    return result & mask;
  }
out whether we have a run. Given that runs in booleans are likely it's worth it. + * However it means we'd need to keep track of how many bytes we read, and next/nextVector won't + * work anymore once this is called. These is trivial to fix, but these are never interspersed. + */ + private boolean lastRunValue; + private int lastRunLength = -1; + private void readNextRun(int maxRunLength) throws IOException { + assert bitSize == 1; + if (lastRunLength > 0) return; // last run is not exhausted yet + if (bitsLeft == 0) { + readByte(); + } + // First take care of the partial bits. + boolean hasVal = false; + int runLength = 0; + if (bitsLeft != 8) { + int partialBitsMask = (1 << bitsLeft) - 1; + int partialBits = current & partialBitsMask; + if (partialBits == partialBitsMask || partialBits == 0) { + lastRunValue = (partialBits == partialBitsMask); + if (maxRunLength <= bitsLeft) { + lastRunLength = maxRunLength; + return; + } + maxRunLength -= bitsLeft; + hasVal = true; + runLength = bitsLeft; + bitsLeft = 0; + } else { + // There's no run in partial bits. Return whatever we have. + int prefixBitsCount = 32 - bitsLeft; + runLength = Integer.numberOfLeadingZeros(partialBits) - prefixBitsCount; + lastRunValue = (runLength > 0); + lastRunLength = Math.min(maxRunLength, lastRunValue ? runLength : + (Integer.numberOfLeadingZeros(~(partialBits | ~partialBitsMask)) - prefixBitsCount)); + return; + } + assert bitsLeft == 0; + readByte(); + } + if (!hasVal) { + lastRunValue = ((current >> 7) == 1); + hasVal = true; + } + // Read full bytes until the run ends. + assert bitsLeft == 8; + while (maxRunLength >= 8 + && ((lastRunValue && (current == 0xff)) || (!lastRunValue && (current == 0)))) { + runLength += 8; + maxRunLength -= 8; + readByte(); + } + if (maxRunLength > 0) { + int extraBits = Integer.numberOfLeadingZeros( + lastRunValue ? 
(~(current | ~255)) : current) - 24; + bitsLeft -= extraBits; + runLength += extraBits; + } + lastRunLength = runLength; + } + + public void nextVector(LongColumnVector previous, + long previousLen) throws IOException { + previous.isRepeating = true; + for (int i = 0; i < previousLen; i++) { + if (previous.noNulls || !previous.isNull[i]) { + previous.vector[i] = next(); + } else { + // The default value of null for int types in vectorized + // processing is 1, so set that if the value is null + previous.vector[i] = 1; + } + + // The default value for nulls in Vectorization for int types is 1 + // and given that non null value can also be 1, we need to check for isNull also + // when determining the isRepeating flag. + if (previous.isRepeating + && i > 0 + && ((previous.vector[0] != previous.vector[i]) || + (previous.isNull[0] != previous.isNull[i]))) { + previous.isRepeating = false; + } + } + } + + public void seek(PositionProvider index) throws IOException { + input.seek(index); + int consumed = (int) index.getNext(); + if (consumed > 8) { + throw new IllegalArgumentException("Seek past end of byte at " + + consumed + " in " + input); + } else if (consumed != 0) { + readByte(); + bitsLeft = 8 - consumed; + } else { + bitsLeft = 0; + } + } + + public void skip(long items) throws IOException { + long totalBits = bitSize * items; + if (bitsLeft >= totalBits) { + bitsLeft -= totalBits; + } else { + totalBits -= bitsLeft; + input.skip(totalBits / 8); + current = input.next(); + bitsLeft = (int) (8 - (totalBits % 8)); + } + } + + @Override + public String toString() { + return "bit reader current: " + current + " bits left: " + bitsLeft + + " bit size: " + bitSize + " from " + input; + } + + boolean hasFullByte() { + return bitsLeft == 8 || bitsLeft == 0; + } + + int peekOneBit() throws IOException { + assert bitSize == 1; + if (bitsLeft == 0) { + readByte(); + } + return (current >>> (bitsLeft - 1)) & 1; + } + + int peekFullByte() throws IOException { + assert bitSize 
== 1; + assert bitsLeft == 8 || bitsLeft == 0; + if (bitsLeft == 0) { + readByte(); + } + return current; + } + + void skipInCurrentByte(int bits) throws IOException { + assert bitsLeft >= bits; + bitsLeft -= bits; + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/impl/BitFieldWriter.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/BitFieldWriter.java b/java/core/src/java/org/apache/orc/impl/BitFieldWriter.java new file mode 100644 index 0000000..aa5f886 --- /dev/null +++ b/java/core/src/java/org/apache/orc/impl/BitFieldWriter.java @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.orc.impl; + +import org.apache.orc.impl.PositionRecorder; +import org.apache.orc.impl.PositionedOutputStream; +import org.apache.orc.impl.RunLengthByteWriter; + +import java.io.IOException; + +public class BitFieldWriter { + private RunLengthByteWriter output; + private final int bitSize; + private byte current = 0; + private int bitsLeft = 8; + + public BitFieldWriter(PositionedOutputStream output, + int bitSize) throws IOException { + this.output = new RunLengthByteWriter(output); + this.bitSize = bitSize; + } + + private void writeByte() throws IOException { + output.write(current); + current = 0; + bitsLeft = 8; + } + + public void flush() throws IOException { + if (bitsLeft != 8) { + writeByte(); + } + output.flush(); + } + + public void write(int value) throws IOException { + int bitsToWrite = bitSize; + while (bitsToWrite > bitsLeft) { + // add the bits to the bottom of the current word + current |= value >>> (bitsToWrite - bitsLeft); + // subtract out the bits we just added + bitsToWrite -= bitsLeft; + // zero out the bits above bitsToWrite + value &= (1 << bitsToWrite) - 1; + writeByte(); + } + bitsLeft -= bitsToWrite; + current |= value << bitsLeft; + if (bitsLeft == 0) { + writeByte(); + } + } + + public void getPosition(PositionRecorder recorder) throws IOException { + output.getPosition(recorder); + recorder.addPosition(8 - bitsLeft); + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/impl/BufferChunk.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/BufferChunk.java b/java/core/src/java/org/apache/orc/impl/BufferChunk.java new file mode 100644 index 0000000..da43b96 --- /dev/null +++ b/java/core/src/java/org/apache/orc/impl/BufferChunk.java @@ -0,0 +1,85 @@ +package org.apache.orc.impl; + +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.hadoop.hive.common.io.DiskRange;
import org.apache.hadoop.hive.common.io.DiskRangeList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.ByteBuffer;

/**
 * The sections of stripe that we have read.
 * This might not match diskRange - 1 disk range can be multiple buffer chunks,
 * depending on DFS block boundaries.
 */
public class BufferChunk extends DiskRangeList {

  private static final Logger LOG =
      LoggerFactory.getLogger(BufferChunk.class);
  // Bytes backing the [offset, end) range; may be direct or array-backed.
  final ByteBuffer chunk;

  public BufferChunk(ByteBuffer chunk, long offset) {
    // The range covers exactly the buffer's remaining bytes.
    super(offset, offset + chunk.remaining());
    this.chunk = chunk;
  }

  public ByteBuffer getChunk() {
    return chunk;
  }

  @Override
  public boolean hasData() {
    return chunk != null;
  }

  @Override
  public final String toString() {
    // "(!)" flags an inconsistency between the buffer size and the range
    // length, which should normally be equal.
    boolean makesSense = chunk.remaining() == (end - offset);
    return "data range [" + offset + ", " + end + "), size: " + chunk.remaining() +
        (makesSense ? "" : "(!)") + " type: " +
        (chunk.isDirect() ? "direct" : "array-backed");
  }

  /**
   * Returns a new BufferChunk for the [offset, end) sub-range of this one,
   * relocated by shiftBy. The returned chunk shares this chunk's backing
   * data (ByteBuffer.slice makes no copy); only position/limit are narrowed.
   */
  @Override
  public DiskRange sliceAndShift(long offset, long end, long shiftBy) {
    // The requested sub-range must lie within this chunk, and the shifted
    // start must remain non-negative.
    assert offset <= end && offset >= this.offset && end <= this.end;
    assert offset + shiftBy >= 0;
    ByteBuffer sliceBuf = chunk.slice();
    int newPos = (int) (offset - this.offset);
    int newLimit = newPos + (int) (end - offset);
    try {
      sliceBuf.position(newPos);
      sliceBuf.limit(newLimit);
    } catch (Throwable t) {
      // Log full diagnostic state before rethrowing; position/limit can
      // throw if the computed window is out of bounds.
      LOG.error("Failed to slice buffer chunk with range" + " [" + this.offset + ", " + this.end +
          "), position: " + chunk.position() + " limit: " + chunk.limit() + ", " +
          (chunk.isDirect() ? "direct" : "array") + "; to [" + offset + ", " + end + ") " +
          t.getClass());
      throw new RuntimeException(t);
    }
    return new BufferChunk(sliceBuf, offset + shiftBy);
  }

  @Override
  public ByteBuffer getData() {
    return chunk;
  }
}
