Author: cutting
Date: Thu Jan 7 19:51:49 2010
New Revision: 896985
URL: http://svn.apache.org/viewvc?rev=896985&view=rev
Log:
AVRO-292. Fix Python skipping of ints and longs. Contributed by Jeff Hammerbacher.
Modified:
hadoop/avro/trunk/CHANGES.txt
hadoop/avro/trunk/src/py/avro/io.py
hadoop/avro/trunk/src/test/py/test_io.py
Modified: hadoop/avro/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/avro/trunk/CHANGES.txt?rev=896985&r1=896984&r2=896985&view=diff
==============================================================================
--- hadoop/avro/trunk/CHANGES.txt (original)
+++ hadoop/avro/trunk/CHANGES.txt Thu Jan 7 19:51:49 2010
@@ -247,6 +247,10 @@
AVRO-280. Fix file header schema in specification. Also fix
"forrestdoc" build target to work on clean checkout.
+ (Jeff Hammerbacher & cutting)
+
+ AVRO-292. Fix Python skipping of ints and longs.
+ (Jeff Hammerbacher via cutting)
Avro 1.2.0 (14 October 2009)
Modified: hadoop/avro/trunk/src/py/avro/io.py
URL: http://svn.apache.org/viewvc/hadoop/avro/trunk/src/py/avro/io.py?rev=896985&r1=896984&r2=896985&view=diff
==============================================================================
--- hadoop/avro/trunk/src/py/avro/io.py (original)
+++ hadoop/avro/trunk/src/py/avro/io.py Thu Jan 7 19:51:49 2010
@@ -129,6 +129,12 @@
# read-only properties
reader = property(lambda self: self._reader)
+ def read(self, n):
+ """
+ Read n bytes.
+ """
+ return self.reader.read(n)
+
def read_null(self):
"""
null is written as zero bytes
@@ -140,7 +146,7 @@
a boolean is written as a single byte
whose value is either 0 (false) or 1 (true).
"""
- return ord(self.reader.read(1)) == 1
+ return ord(self.read(1)) == 1
def read_int(self):
"""
@@ -152,11 +158,11 @@
"""
int and long values are written using variable-length, zig-zag coding.
"""
- b = ord(self.reader.read(1))
+ b = ord(self.read(1))
n = b & 0x7F
shift = 7
while (b & 0x80) != 0:
- b = ord(self.reader.read(1))
+ b = ord(self.read(1))
n |= (b & 0x7F) << shift
shift += 7
datum = (n >> 1) ^ -(n & 1)
@@ -168,10 +174,10 @@
The float is converted into a 32-bit integer using a method equivalent to
Java's floatToIntBits and then encoded in little-endian format.
"""
- bits = (((ord(self.reader.read(1)) & 0xffL)) |
- ((ord(self.reader.read(1)) & 0xffL) << 8) |
- ((ord(self.reader.read(1)) & 0xffL) << 16) |
- ((ord(self.reader.read(1)) & 0xffL) << 24))
+ bits = (((ord(self.read(1)) & 0xffL)) |
+ ((ord(self.read(1)) & 0xffL) << 8) |
+ ((ord(self.read(1)) & 0xffL) << 16) |
+ ((ord(self.read(1)) & 0xffL) << 24))
return STRUCT_FLOAT.unpack(STRUCT_INT.pack(bits))[0]
def read_double(self):
@@ -180,14 +186,14 @@
The double is converted into a 64-bit integer using a method equivalent to
Java's doubleToLongBits and then encoded in little-endian format.
"""
- bits = (((ord(self.reader.read(1)) & 0xffL)) |
- ((ord(self.reader.read(1)) & 0xffL) << 8) |
- ((ord(self.reader.read(1)) & 0xffL) << 16) |
- ((ord(self.reader.read(1)) & 0xffL) << 24) |
- ((ord(self.reader.read(1)) & 0xffL) << 32) |
- ((ord(self.reader.read(1)) & 0xffL) << 40) |
- ((ord(self.reader.read(1)) & 0xffL) << 48) |
- ((ord(self.reader.read(1)) & 0xffL) << 56))
+ bits = (((ord(self.read(1)) & 0xffL)) |
+ ((ord(self.read(1)) & 0xffL) << 8) |
+ ((ord(self.read(1)) & 0xffL) << 16) |
+ ((ord(self.read(1)) & 0xffL) << 24) |
+ ((ord(self.read(1)) & 0xffL) << 32) |
+ ((ord(self.read(1)) & 0xffL) << 40) |
+ ((ord(self.read(1)) & 0xffL) << 48) |
+ ((ord(self.read(1)) & 0xffL) << 56))
return STRUCT_DOUBLE.unpack(STRUCT_LONG.pack(bits))[0]
def read_bytes(self):
@@ -203,25 +209,19 @@
"""
return unicode(self.read_bytes(), "utf-8")
- def read(self, n):
- """
- Read n bytes.
- """
- return struct.unpack('%ds' % n, self.reader.read(n))[0]
-
def skip_null(self):
pass
def skip_boolean(self):
self.skip(1)
- # TODO(hammer): I thought ints were VLE?
def skip_int(self):
- self.skip(4)
+ self.skip_long()
- # TODO(hammer): I thought longs were VLE?
def skip_long(self):
- self.skip(8)
+ b = ord(self.read(1))
+ while (b & 0x80) != 0:
+ b = ord(self.read(1))
def skip_float(self):
self.skip(4)
Modified: hadoop/avro/trunk/src/test/py/test_io.py
URL: http://svn.apache.org/viewvc/hadoop/avro/trunk/src/test/py/test_io.py?rev=896985&r1=896984&r2=896985&view=diff
==============================================================================
--- hadoop/avro/trunk/src/test/py/test_io.py (original)
+++ hadoop/avro/trunk/src/test/py/test_io.py Thu Jan 7 19:51:49 2010
@@ -15,6 +15,7 @@
# limitations under the License.
import unittest
import cStringIO
+from binascii import hexlify
from avro import schema
from avro import io
@@ -49,6 +50,28 @@
""", {'value': {'car': {'value': 'head'}, 'cdr': {'value': None}}}),
)
+BINARY_INT_ENCODINGS = (
+ (0, '00'),
+ (-1, '01'),
+ (1, '02'),
+ (-2, '03'),
+ (2, '04'),
+ (-64, '7f'),
+ (64, '80 01'),
+ (8192, '80 80 01'),
+ (-8193, '81 80 01'),
+)
+
+def avro_hexlify(reader):
+ """Return the hex value, as a string, of a binary-encoded int or long."""
+ bytes = []
+ current_byte = reader.read(1)
+ bytes.append(hexlify(current_byte))
+ while (ord(current_byte) & 0x80) != 0:
+ current_byte = reader.read(1)
+ bytes.append(hexlify(current_byte))
+ return ' '.join(bytes)
+
class TestIO(unittest.TestCase):
def test_validate(self):
print ''
@@ -150,5 +173,121 @@
print ''
self.assertEquals(correct, len(SCHEMAS_TO_VALIDATE))
+ def test_binary_int_encoding(self):
+ print ''
+ print 'TEST BINARY INT ENCODING'
+ print '========================'
+ print ''
+ correct = 0
+ for value, hex_encoding in BINARY_INT_ENCODINGS:
+ print 'Value: %d' % value
+ print 'Correct Encoding: %s' % hex_encoding
+
+ # write datum in binary to string buffer
+ buffer = cStringIO.StringIO()
+ encoder = io.BinaryEncoder(buffer)
+ datum_writer = io.DatumWriter(schema.parse('"int"'))
+ datum_writer.write(value, encoder)
+
+ # read it out of the buffer and hexlify it
+ buffer.seek(0)
+ hex_val = avro_hexlify(buffer)
+
+ # check it
+ print 'Read Encoding: %s' % hex_val
+ if hex_encoding == hex_val: correct += 1
+ print ''
+ self.assertEquals(correct, len(BINARY_INT_ENCODINGS))
+
+ def test_binary_long_encoding(self):
+ print ''
+ print 'TEST BINARY LONG ENCODING'
+ print '========================='
+ print ''
+ correct = 0
+ for value, hex_encoding in BINARY_INT_ENCODINGS:
+ print 'Value: %d' % value
+ print 'Correct Encoding: %s' % hex_encoding
+
+ # write datum in binary to string buffer
+ buffer = cStringIO.StringIO()
+ encoder = io.BinaryEncoder(buffer)
+ datum_writer = io.DatumWriter(schema.parse('"long"'))
+ datum_writer.write(value, encoder)
+
+ # read it out of the buffer and hexlify it
+ buffer.seek(0)
+ hex_val = avro_hexlify(buffer)
+
+ # check it
+ print 'Read Encoding: %s' % hex_val
+ if hex_encoding == hex_val: correct += 1
+ print ''
+ self.assertEquals(correct, len(BINARY_INT_ENCODINGS))
+
+ def test_skip_long(self):
+ print ''
+ print 'TEST SKIP LONG'
+ print '=============='
+ print ''
+ correct = 0
+ for value_to_skip, hex_encoding in BINARY_INT_ENCODINGS:
+ VALUE_TO_READ = 6253
+ print 'Value to Skip: %d' % value_to_skip
+
+ # write some data in binary to string buffer
+ writer = cStringIO.StringIO()
+ encoder = io.BinaryEncoder(writer)
+ datum_writer = io.DatumWriter(schema.parse('"long"'))
+ datum_writer.write(value_to_skip, encoder)
+ datum_writer.write(VALUE_TO_READ, encoder)
+
+ # skip the value
+ reader = cStringIO.StringIO(writer.getvalue())
+ decoder = io.BinaryDecoder(reader)
+ decoder.skip_long()
+
+ # read data from string buffer
+ datum_reader = io.DatumReader(schema.parse('"long"'))
+ read_value = datum_reader.read(decoder)
+
+ # check it
+ print 'Read Value: %d' % read_value
+ if read_value == VALUE_TO_READ: correct += 1
+ print ''
+ self.assertEquals(correct, len(BINARY_INT_ENCODINGS))
+
+ def test_skip_int(self):
+ print ''
+ print 'TEST SKIP INT'
+ print '============='
+ print ''
+ correct = 0
+ for value_to_skip, hex_encoding in BINARY_INT_ENCODINGS:
+ VALUE_TO_READ = 6253
+ print 'Value to Skip: %d' % value_to_skip
+
+ # write some data in binary to string buffer
+ writer = cStringIO.StringIO()
+ encoder = io.BinaryEncoder(writer)
+ datum_writer = io.DatumWriter(schema.parse('"int"'))
+ datum_writer.write(value_to_skip, encoder)
+ datum_writer.write(VALUE_TO_READ, encoder)
+
+ # skip the value
+ reader = cStringIO.StringIO(writer.getvalue())
+ decoder = io.BinaryDecoder(reader)
+ decoder.skip_int()
+
+ # read data from string buffer
+ datum_reader = io.DatumReader(schema.parse('"int"'))
+ read_value = datum_reader.read(decoder)
+
+ # check it
+ print 'Read Value: %d' % read_value
+ if read_value == VALUE_TO_READ: correct += 1
+ print ''
+ self.assertEquals(correct, len(BINARY_INT_ENCODINGS))
+
if __name__ == '__main__':
unittest.main()