[ 
https://issues.apache.org/jira/browse/BEAM-5720?focusedWorklogId=154621&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-154621
 ]

ASF GitHub Bot logged work on BEAM-5720:
----------------------------------------

                Author: ASF GitHub Bot
            Created on: 16/Oct/18 07:20
            Start Date: 16/Oct/18 07:20
    Worklog Time Spent: 10m 
      Work Description: robertwb closed pull request #6659: [BEAM-5720] Fix 
encoding of large python ints in Python 3.
URL: https://github.com/apache/beam/pull/6659
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/sdks/python/apache_beam/coders/coder_impl.pxd 
b/sdks/python/apache_beam/coders/coder_impl.pxd
index 8a85d085448..03d0a56c053 100644
--- a/sdks/python/apache_beam/coders/coder_impl.pxd
+++ b/sdks/python/apache_beam/coders/coder_impl.pxd
@@ -69,12 +69,13 @@ cdef class DeterministicFastPrimitivesCoderImpl(CoderImpl):
 
 
 cdef object NoneType
-cdef char UNKNOWN_TYPE, NONE_TYPE, INT_TYPE, FLOAT_TYPE, BOOL_TYPE
-cdef char BYTES_TYPE, UNICODE_TYPE, LIST_TYPE, TUPLE_TYPE, DICT_TYPE, SET_TYPE
+cdef unsigned char UNKNOWN_TYPE, NONE_TYPE, INT_TYPE, FLOAT_TYPE, BOOL_TYPE
+cdef unsigned char BYTES_TYPE, UNICODE_TYPE, LIST_TYPE, TUPLE_TYPE, DICT_TYPE
+cdef unsigned char SET_TYPE
 
 cdef class FastPrimitivesCoderImpl(StreamCoderImpl):
   cdef CoderImpl fallback_coder_impl
-  @cython.locals(dict_value=dict)
+  @cython.locals(dict_value=dict, int_value=libc.stdint.int64_t)
   cpdef encode_to_stream(self, value, OutputStream stream, bint nested)
 
 
diff --git a/sdks/python/apache_beam/coders/coder_impl.py 
b/sdks/python/apache_beam/coders/coder_impl.py
index 00349a66c91..070e07d0d37 100644
--- a/sdks/python/apache_beam/coders/coder_impl.py
+++ b/sdks/python/apache_beam/coders/coder_impl.py
@@ -62,6 +62,12 @@
   from .slow_stream import OutputStream as create_OutputStream
   from .slow_stream import ByteCountingOutputStream
   from .slow_stream import get_varint_size
+  if False:  # pylint: disable=using-constant-test
+    # This clause is interpreted by the compiler.
+    from cython import compiled as is_compiled
+  else:
+    is_compiled = False
+    fits_in_64_bits = lambda x: -(1 << 63) <= x <= (1 << 63) - 1
 # pylint: enable=wrong-import-order, wrong-import-position, ungrouped-imports
 
 
@@ -293,8 +299,22 @@ def encode_to_stream(self, value, stream, nested):
     if value is None:
       stream.write_byte(NONE_TYPE)
     elif t is int:
-      stream.write_byte(INT_TYPE)
-      stream.write_var_int64(value)
+      # In Python 3, an int may be larger than 64 bits.
+      # We need to check whether value fits into a 64 bit integer before
+      # writing the marker byte.
+      try:
+        # In Cython-compiled code this will throw an overflow error
+        # when value does not fit into int64.
+        int_value = value
+        # If Cython is not used, we must do a (slower) check ourselves.
+        if not is_compiled:
+          if not fits_in_64_bits(value):
+            raise OverflowError()
+        stream.write_byte(INT_TYPE)
+        stream.write_var_int64(int_value)
+      except OverflowError:
+        stream.write_byte(UNKNOWN_TYPE)
+        self.fallback_coder_impl.encode_to_stream(value, stream, nested)
     elif t is float:
       stream.write_byte(FLOAT_TYPE)
       stream.write_bigendian_double(value)
@@ -354,8 +374,10 @@ def decode_from_stream(self, stream, nested):
       return v
     elif t == BOOL_TYPE:
       return not not stream.read_byte()
-
-    return self.fallback_coder_impl.decode_from_stream(stream, nested)
+    elif t == UNKNOWN_TYPE:
+      return self.fallback_coder_impl.decode_from_stream(stream, nested)
+    else:
+      raise ValueError('Unknown type tag %x' % t)
 
 
 class BytesCoderImpl(CoderImpl):
diff --git a/sdks/python/apache_beam/coders/coders_test_common.py 
b/sdks/python/apache_beam/coders/coders_test_common.py
index 607d3e25741..4707c32f846 100644
--- a/sdks/python/apache_beam/coders/coders_test_common.py
+++ b/sdks/python/apache_beam/coders/coders_test_common.py
@@ -143,6 +143,10 @@ def test_fast_primitives_coder(self):
     self.check_coder(coder, len)
     self.check_coder(coders.TupleCoder((coder,)), ('a',), (1,))
 
+  def test_fast_primitives_coder_large_int(self):
+    coder = coders.FastPrimitivesCoder()
+    self.check_coder(coder, 10 ** 100)
+
   def test_bytes_coder(self):
     self.check_coder(coders.BytesCoder(), b'a', b'\0', b'z' * 1000)
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


Issue Time Tracking
-------------------

    Worklog Id:     (was: 154621)
    Time Spent: 1h 40m  (was: 1.5h)

> Default coder breaks with large ints on Python 3
> ------------------------------------------------
>
>                 Key: BEAM-5720
>                 URL: https://issues.apache.org/jira/browse/BEAM-5720
>             Project: Beam
>          Issue Type: Sub-task
>          Components: sdk-py-core
>            Reporter: Robert Bradshaw
>            Assignee: Robert Bradshaw
>            Priority: Minor
>          Time Spent: 1h 40m
>  Remaining Estimate: 0h
>
> The test for `int` includes greater than 64-bit values, which causes an 
> overflow error later in the code. We need to only use that coding scheme for 
> machine-sized ints. 



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to