Repository: hive Updated Branches: refs/heads/master ff67cdda1 -> 0b62e6f38
http://git-wip-us.apache.org/repos/asf/hive/blob/0b62e6f3/serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java ---------------------------------------------------------------------- diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java index 472ace7..ee945d4 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazybinary/fast/LazyBinaryDeserializeRead.java @@ -39,7 +39,7 @@ import org.apache.hadoop.io.WritableUtils; * Directly deserialize with the caller reading field-by-field the LazyBinary serialization format. * * The caller is responsible for calling the read method for the right type of each field - * (after calling readCheckNull). + * (after calling readNextField). * * Reading some fields require a results object to receive value information. A separate * results object is created by the caller at initialization per different field even for the same @@ -65,17 +65,12 @@ public final class LazyBinaryDeserializeRead extends DeserializeRead { private VInt tempVInt; private VLong tempVLong; - private boolean readBeyondConfiguredFieldsWarned; - private boolean bufferRangeHasExtraDataWarned; - public LazyBinaryDeserializeRead(TypeInfo[] typeInfos, boolean useExternalBuffer) { super(typeInfos, useExternalBuffer); fieldCount = typeInfos.length; tempVInt = new VInt(); tempVLong = new VLong(); currentExternalBufferNeeded = false; - readBeyondConfiguredFieldsWarned = false; - bufferRangeHasExtraDataWarned = false; } // Not public since we must have the field count so every 8 fields NULL bytes can be navigated. @@ -122,22 +117,19 @@ public final class LazyBinaryDeserializeRead extends DeserializeRead { } /* - * Reads the NULL information for a field. + * Reads the next field. 
+ * + * Afterwards, reading is positioned to the next field. + * + * @return Return true when the field was not null and data is put in the appropriate + * current* member. + * Otherwise, false when the field is null. * - * @return Returns true when the field is NULL; reading is positioned to the next field. - * Otherwise, false when the field is NOT NULL; reading is positioned to the field data. */ @Override - public boolean readCheckNull() throws IOException { + public boolean readNextField() throws IOException { if (fieldIndex >= fieldCount) { - // Reading beyond the specified field count produces NULL. - if (!readBeyondConfiguredFieldsWarned) { - // Warn only once. - LOG.info("Reading beyond configured fields! Configured " + fieldCount + " fields but " - + " reading more (NULLs returned). Ignoring similar problems."); - readBeyondConfiguredFieldsWarned = true; - } - return true; + return false; } fieldStart = offset; @@ -151,12 +143,24 @@ public final class LazyBinaryDeserializeRead extends DeserializeRead { nullByte = bytes[offset++]; } - // NOTE: The bit is set to 1 if a field is NOT NULL. - boolean isNull; + // NOTE: The bit is set to 1 if a field is NOT NULL. boolean isNull; if ((nullByte & (1 << (fieldIndex % 8))) == 0) { - isNull = true; + + // Logically move past this field. + fieldIndex++; + + // Every 8 fields we read a new NULL byte. + if (fieldIndex < fieldCount) { + if ((fieldIndex % 8) == 0) { + // Get next null byte. + if (offset >= end) { + throw new EOFException(); + } + nullByte = bytes[offset++]; + } + } + return false; } else { - isNull = false; // Assume. // Make sure there is at least one byte that can be read for a value. if (offset >= end) { @@ -336,24 +340,30 @@ public final class LazyBinaryDeserializeRead extends DeserializeRead { HiveDecimal decimal = currentHiveDecimalWritable.getHiveDecimal(precision, scale); if (decimal == null) { - isNull = true; - } else { - // Put value back into writable. 
- currentHiveDecimalWritable.set(decimal); + + // Logically move past this field. + fieldIndex++; + + // Every 8 fields we read a new NULL byte. + if (fieldIndex < fieldCount) { + if ((fieldIndex % 8) == 0) { + // Get next null byte. + if (offset >= end) { + throw new EOFException(); + } + nullByte = bytes[offset++]; + } + } + return false; } + // Put value back into writable. + currentHiveDecimalWritable.set(decimal); } break; default: throw new Error("Unexpected primitive category " + primitiveCategories[fieldIndex].name()); } - - /* - * Now that we have read through the field -- did we really want it? - */ - if (columnsToInclude != null && !columnsToInclude[fieldIndex]) { - isNull = true; - } } // Logically move past this field. @@ -370,37 +380,32 @@ public final class LazyBinaryDeserializeRead extends DeserializeRead { } } - return isNull; + return true; } /* - * Call this method after all fields have been read to check for extra fields. + * Reads through an undesired field. + * + * No data values are valid after this call. + * Designed for skipping columns that are not included. */ - public void extraFieldsCheck() { - if (offset < end) { - // We did not consume all of the byte range. - if (!bufferRangeHasExtraDataWarned) { - // Warn only once. - int length = end - start; - int remaining = end - offset; - LOG.info("Not all fields were read in the buffer range! Buffer range " + start - + " for length " + length + " but " + remaining + " bytes remain. " - + "(total buffer length " + bytes.length + ")" - + " Ignoring similar problems."); - bufferRangeHasExtraDataWarned = true; - } - } + public void skipNextField() throws IOException { + // Not a known use case for LazyBinary -- so don't optimize. + readNextField(); } /* - * Read integrity warning flags. + * This method may be called after all the fields have been read to check + * for unread fields. 
+ * + * Note that when optimizing reading to stop reading unneeded include columns, worrying + * about whether all data is consumed is not appropriate (often we aren't reading it all by + * design). + * + * Since LazySimpleDeserializeRead parses the line through the last desired column it does + * support this function. */ - @Override - public boolean readBeyondConfiguredFieldsWarned() { - return readBeyondConfiguredFieldsWarned; - } - @Override - public boolean bufferRangeHasExtraDataWarned() { - return bufferRangeHasExtraDataWarned; + public boolean isEndOfInputReached() { + return (offset == end); } } http://git-wip-us.apache.org/repos/asf/hive/blob/0b62e6f3/serde/src/test/org/apache/hadoop/hive/serde2/VerifyFast.java ---------------------------------------------------------------------- diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/VerifyFast.java b/serde/src/test/org/apache/hadoop/hive/serde2/VerifyFast.java index 52dd5a3..3ac339d 100644 --- a/serde/src/test/org/apache/hadoop/hive/serde2/VerifyFast.java +++ b/serde/src/test/org/apache/hadoop/hive/serde2/VerifyFast.java @@ -65,7 +65,7 @@ public class VerifyFast { boolean isNull; - isNull = deserializeRead.readCheckNull(); + isNull = !deserializeRead.readNextField(); if (isNull) { if (writable != null) { TestCase.fail("Field reports null but object is not null"); http://git-wip-us.apache.org/repos/asf/hive/blob/0b62e6f3/serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableFast.java ---------------------------------------------------------------------- diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableFast.java b/serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableFast.java index 49ee9c6..f1eeb2d 100644 --- a/serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableFast.java +++ b/serde/src/test/org/apache/hadoop/hive/serde2/binarysortable/TestBinarySortableFast.java @@ -120,17 +120,14 @@ 
public class TestBinarySortableFast extends TestCase { /* useExternalBuffer */ false, columnSortOrderIsDesc); - if (useIncludeColumns) { - binarySortableDeserializeRead.setColumnsToInclude(columnsToInclude); - } - BytesWritable bytesWritable = serializeWriteBytes[i]; binarySortableDeserializeRead.set( bytesWritable.getBytes(), 0, bytesWritable.getLength()); for (int index = 0; index < columnCount; index++) { - if (index >= writeColumnCount || - (useIncludeColumns && !columnsToInclude[index])) { + if (useIncludeColumns && !columnsToInclude[index]) { + binarySortableDeserializeRead.skipNextField(); + } else if (index >= writeColumnCount) { // Should come back a null. VerifyFast.verifyDeserializeRead(binarySortableDeserializeRead, primitiveTypeInfos[index], null); } else { @@ -138,9 +135,9 @@ public class TestBinarySortableFast extends TestCase { VerifyFast.verifyDeserializeRead(binarySortableDeserializeRead, primitiveTypeInfos[index], writable); } } - binarySortableDeserializeRead.extraFieldsCheck(); - TestCase.assertTrue(!binarySortableDeserializeRead.readBeyondConfiguredFieldsWarned()); - TestCase.assertTrue(!binarySortableDeserializeRead.bufferRangeHasExtraDataWarned()); + if (writeColumnCount == columnCount) { + TestCase.assertTrue(binarySortableDeserializeRead.isEndOfInputReached()); + } /* * Clip off one byte and expect to get an EOFException on the write field. @@ -151,10 +148,6 @@ public class TestBinarySortableFast extends TestCase { /* useExternalBuffer */ false, columnSortOrderIsDesc); - if (useIncludeColumns) { - binarySortableDeserializeRead2.setColumnsToInclude(columnsToInclude); - } - binarySortableDeserializeRead2.set( bytesWritable.getBytes(), 0, bytesWritable.getLength() - 1); // One fewer byte. 
@@ -172,7 +165,7 @@ public class TestBinarySortableFast extends TestCase { TestCase.assertTrue(threw); } else { if (useIncludeColumns && !columnsToInclude[index]) { - VerifyFast.verifyDeserializeRead(binarySortableDeserializeRead2, primitiveTypeInfos[index], null); + binarySortableDeserializeRead2.skipNextField(); } else { VerifyFast.verifyDeserializeRead(binarySortableDeserializeRead2, primitiveTypeInfos[index], writable); } @@ -258,16 +251,14 @@ public class TestBinarySortableFast extends TestCase { /* useExternalBuffer */ false, columnSortOrderIsDesc); - if (useIncludeColumns) { - binarySortableDeserializeRead.setColumnsToInclude(columnsToInclude); - } BytesWritable bytesWritable = serdeBytes[i]; binarySortableDeserializeRead.set(bytesWritable.getBytes(), 0, bytesWritable.getLength()); for (int index = 0; index < columnCount; index++) { - if (index >= writeColumnCount || - (useIncludeColumns && !columnsToInclude[index])) { + if (useIncludeColumns && !columnsToInclude[index]) { + binarySortableDeserializeRead.skipNextField(); + } else if (index >= writeColumnCount) { // Should come back a null. 
VerifyFast.verifyDeserializeRead(binarySortableDeserializeRead, primitiveTypeInfos[index], null); } else { @@ -275,9 +266,9 @@ public class TestBinarySortableFast extends TestCase { VerifyFast.verifyDeserializeRead(binarySortableDeserializeRead, primitiveTypeInfos[index], writable); } } - binarySortableDeserializeRead.extraFieldsCheck(); - TestCase.assertTrue(!binarySortableDeserializeRead.readBeyondConfiguredFieldsWarned()); - TestCase.assertTrue(!binarySortableDeserializeRead.bufferRangeHasExtraDataWarned()); + if (writeColumnCount == columnCount) { + TestCase.assertTrue(binarySortableDeserializeRead.isEndOfInputReached()); + } } } http://git-wip-us.apache.org/repos/asf/hive/blob/0b62e6f3/serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleFast.java ---------------------------------------------------------------------- diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleFast.java b/serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleFast.java index 8285c06..c857b42 100644 --- a/serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleFast.java +++ b/serde/src/test/org/apache/hadoop/hive/serde2/lazy/TestLazySimpleFast.java @@ -101,10 +101,6 @@ public class TestLazySimpleFast extends TestCase { /* useExternalBuffer */ false, separator, serdeParams); - if (useIncludeColumns) { - lazySimpleDeserializeRead.setColumnsToInclude(columnsToInclude); - } - BytesWritable bytesWritable = serializeWriteBytes[i]; byte[] bytes = bytesWritable.getBytes(); int length = bytesWritable.getLength(); @@ -116,8 +112,9 @@ public class TestLazySimpleFast extends TestCase { } for (int index = 0; index < columnCount; index++) { - if (index >= writeColumnCount || - (useIncludeColumns && !columnsToInclude[index])) { + if (useIncludeColumns && !columnsToInclude[index]) { + lazySimpleDeserializeRead.skipNextField(); + } else if (index >= writeColumnCount) { // Should come back a null. 
VerifyFast.verifyDeserializeRead(lazySimpleDeserializeRead, primitiveTypeInfos[index], null); } else { @@ -125,10 +122,9 @@ public class TestLazySimpleFast extends TestCase { VerifyFast.verifyDeserializeRead(lazySimpleDeserializeRead, primitiveTypeInfos[index], writable); } } - lazySimpleDeserializeRead.extraFieldsCheck(); - TestCase.assertTrue(!lazySimpleDeserializeRead.readBeyondConfiguredFieldsWarned()); - TestCase.assertTrue(!lazySimpleDeserializeRead.bufferRangeHasExtraDataWarned()); - + if (writeColumnCount == columnCount) { + TestCase.assertTrue(lazySimpleDeserializeRead.isEndOfInputReached()); + } } // Try to deserialize using SerDe class our Writable row objects created by SerializeWrite. @@ -193,16 +189,13 @@ public class TestLazySimpleFast extends TestCase { /* useExternalBuffer */ false, separator, serdeParams); - if (useIncludeColumns) { - lazySimpleDeserializeRead.setColumnsToInclude(columnsToInclude); - } - byte[] bytes = serdeBytes[i]; lazySimpleDeserializeRead.set(bytes, 0, bytes.length); for (int index = 0; index < columnCount; index++) { - if (index >= writeColumnCount || - (useIncludeColumns && !columnsToInclude[index])) { + if (useIncludeColumns && !columnsToInclude[index]) { + lazySimpleDeserializeRead.skipNextField(); + } else if (index >= writeColumnCount) { // Should come back a null. 
VerifyFast.verifyDeserializeRead(lazySimpleDeserializeRead, primitiveTypeInfos[index], null); } else { @@ -210,9 +203,9 @@ public class TestLazySimpleFast extends TestCase { VerifyFast.verifyDeserializeRead(lazySimpleDeserializeRead, primitiveTypeInfos[index], writable); } } - lazySimpleDeserializeRead.extraFieldsCheck(); - TestCase.assertTrue(!lazySimpleDeserializeRead.readBeyondConfiguredFieldsWarned()); - TestCase.assertTrue(!lazySimpleDeserializeRead.bufferRangeHasExtraDataWarned()); + if (writeColumnCount == columnCount) { + TestCase.assertTrue(lazySimpleDeserializeRead.isEndOfInputReached()); + } } } http://git-wip-us.apache.org/repos/asf/hive/blob/0b62e6f3/serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinaryFast.java ---------------------------------------------------------------------- diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinaryFast.java b/serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinaryFast.java index e64d67d..a1828c9 100644 --- a/serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinaryFast.java +++ b/serde/src/test/org/apache/hadoop/hive/serde2/lazybinary/TestLazyBinaryFast.java @@ -95,16 +95,13 @@ public class TestLazyBinaryFast extends TestCase { writePrimitiveTypeInfos, /* useExternalBuffer */ false); - if (useIncludeColumns) { - lazyBinaryDeserializeRead.setColumnsToInclude(columnsToInclude); - } - BytesWritable bytesWritable = serializeWriteBytes[i]; lazyBinaryDeserializeRead.set(bytesWritable.getBytes(), 0, bytesWritable.getLength()); for (int index = 0; index < columnCount; index++) { - if (index >= writeColumnCount || - (useIncludeColumns && !columnsToInclude[index])) { + if (useIncludeColumns && !columnsToInclude[index]) { + lazyBinaryDeserializeRead.skipNextField(); + } else if (index >= writeColumnCount) { // Should come back a null. 
VerifyFast.verifyDeserializeRead(lazyBinaryDeserializeRead, primitiveTypeInfos[index], null); } else { @@ -112,13 +109,9 @@ public class TestLazyBinaryFast extends TestCase { VerifyFast.verifyDeserializeRead(lazyBinaryDeserializeRead, primitiveTypeInfos[index], writable); } } - lazyBinaryDeserializeRead.extraFieldsCheck(); - if (doWriteFewerColumns) { - TestCase.assertTrue(lazyBinaryDeserializeRead.readBeyondConfiguredFieldsWarned()); - } else { - TestCase.assertTrue(!lazyBinaryDeserializeRead.readBeyondConfiguredFieldsWarned()); + if (writeColumnCount == columnCount) { + TestCase.assertTrue(lazyBinaryDeserializeRead.isEndOfInputReached()); } - TestCase.assertTrue(!lazyBinaryDeserializeRead.bufferRangeHasExtraDataWarned()); } // Try to deserialize using SerDe class our Writable row objects created by SerializeWrite. @@ -197,16 +190,13 @@ public class TestLazyBinaryFast extends TestCase { primitiveTypeInfos, /* useExternalBuffer */ false); - if (useIncludeColumns) { - lazyBinaryDeserializeRead.setColumnsToInclude(columnsToInclude); - } - BytesWritable bytesWritable = serdeBytes[i]; lazyBinaryDeserializeRead.set(bytesWritable.getBytes(), 0, bytesWritable.getLength()); for (int index = 0; index < columnCount; index++) { - if (index >= writeColumnCount || - (useIncludeColumns && !columnsToInclude[index])) { + if (useIncludeColumns && !columnsToInclude[index]) { + lazyBinaryDeserializeRead.skipNextField(); + } else if (index >= writeColumnCount) { // Should come back a null. 
VerifyFast.verifyDeserializeRead(lazyBinaryDeserializeRead, primitiveTypeInfos[index], null); } else { @@ -214,9 +204,9 @@ public class TestLazyBinaryFast extends TestCase { VerifyFast.verifyDeserializeRead(lazyBinaryDeserializeRead, primitiveTypeInfos[index], writable); } } - lazyBinaryDeserializeRead.extraFieldsCheck(); - TestCase.assertTrue(!lazyBinaryDeserializeRead.readBeyondConfiguredFieldsWarned()); - TestCase.assertTrue(!lazyBinaryDeserializeRead.bufferRangeHasExtraDataWarned()); + if (writeColumnCount == columnCount) { + TestCase.assertTrue(lazyBinaryDeserializeRead.isEndOfInputReached()); + } } }