Cythonize Timestamp- and FloatCoder

robertwb · silviulica · commit 2c4c414056c0 · 2016-03-31T17:43:47.000-07:00
Also add further utilities to our coder stream classes. ----Release Notes---- [] ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=118068607
diff --git a/google/cloud/dataflow/coders/coder_impl.pxd b/google/cloud/dataflow/coders/coder_impl.pxd
@@ -58,6 +58,14 @@ cdef class BytesCoderImpl(CoderImpl):
   pass
 
 
+cdef class FloatCoderImpl(StreamCoderImpl):
+  pass  # Declares no C-level attributes of its own.
+
+
+cdef class TimestampCoderImpl(StreamCoderImpl):
+  cdef object timestamp_class  # Called as timestamp_class(micros=...) on decode.
+
+
 cdef list small_ints
 cdef class VarIntCoderImpl(StreamCoderImpl):
   @cython.locals(ivalue=libc.stdint.int64_t)
diff --git a/google/cloud/dataflow/coders/coder_impl.py b/google/cloud/dataflow/coders/coder_impl.py
@@ -162,6 +162,27 @@ def decode(self, encoded):
     return encoded
 
 
+class FloatCoderImpl(StreamCoderImpl):  # Floats as big-endian doubles.
+
+  def encode_to_stream(self, value, out, nested):
+    out.write_bigendian_double(value)  # Fixed 8 bytes; nested is unused.
+
+  def decode_from_stream(self, in_stream, nested):
+    return in_stream.read_bigendian_double()  # nested is unused here too.
+
+
+class TimestampCoderImpl(StreamCoderImpl):  # Timestamps as big-endian int64.
+
+  def __init__(self, timestamp_class):
+    self.timestamp_class = timestamp_class  # Used to rebuild values on decode.
+
+  def encode_to_stream(self, value, out, nested):
+    out.write_bigendian_int64(value.micros)  # Only .micros is serialized.
+
+  def decode_from_stream(self, in_stream, nested):
+    return self.timestamp_class(micros=in_stream.read_bigendian_int64())
+
+
 small_ints = [chr(_) for _ in range(128)]
 
 
diff --git a/google/cloud/dataflow/coders/coders.py b/google/cloud/dataflow/coders/coders.py
@@ -17,7 +17,6 @@
 import base64
 import collections
 import cPickle as pickle
-import struct
 
 from google.cloud.dataflow.coders import coder_impl
 
@@ -225,29 +224,21 @@ def is_deterministic(self):
     return True
 
 
-# TODO(ccy): Write a Cython implementation of FloatCoder.
-class FloatCoder(Coder):
+class FloatCoder(FastCoder):
   """A coder used for floating-point values."""
 
-  def encode(self, value):
-    return struct.pack('<d', value)
-
-  def decode(self, encoded):
-    return struct.unpack('<d', encoded)[0]
+  def _create_impl(self):
+    return coder_impl.FloatCoderImpl()  # Writes an 8-byte big-endian double.
 
   def is_deterministic(self):
     return True
 
 
-# TODO(ccy): Write a Cython implementation of TimestampCoder.
-class TimestampCoder(Coder):
+class TimestampCoder(FastCoder):
   """A coder used for timeutil.Timestamp values."""
 
-  def encode(self, value):
-    return struct.pack('<q', value.micros)
-
-  def decode(self, encoded):
-    return Timestamp(micros=struct.unpack('<q', encoded)[0])
+  def _create_impl(self):
+    return coder_impl.TimestampCoderImpl(Timestamp)  # Big-endian int64 micros.
 
   def is_deterministic(self):
     return True
diff --git a/google/cloud/dataflow/coders/slow_stream.py b/google/cloud/dataflow/coders/slow_stream.py
@@ -14,6 +14,8 @@
 
 """A pure Python implementation of stream.pyx."""
 
+import struct
+
 
 class OutputStream(object):
   """A pure Python implementation of stream.OutputStream."""
@@ -44,6 +46,15 @@ def write_var_int64(self, v):
       if not v:
         break
 
+  def write_bigendian_int64(self, v):
+    self.write(struct.pack('>q', v))  # '>q': big-endian signed 64-bit.
+
+  def write_bigendian_int32(self, v):
+    self.write(struct.pack('>i', v))  # '>i': big-endian signed 32-bit.
+
+  def write_bigendian_double(self, v):
+    self.write(struct.pack('>d', v))  # '>d': big-endian 8-byte double.
+
   def get(self):
     return ''.join(self.data)
 
@@ -87,3 +98,12 @@ def read_var_int64(self):
     if result >= 1 << 63:
       result -= 1 << 64
     return result
+
+  def read_bigendian_int64(self):
+    return struct.unpack('>q', self.read(8))[0]  # Consumes 8 bytes.
+
+  def read_bigendian_int32(self):
+    return struct.unpack('>i', self.read(4))[0]  # Consumes 4 bytes.
+
+  def read_bigendian_double(self):
+    return struct.unpack('>d', self.read(8))[0]  # Consumes 8 bytes.
diff --git a/google/cloud/dataflow/coders/stream.pxd b/google/cloud/dataflow/coders/stream.pxd
@@ -23,6 +23,9 @@ cdef class OutputStream(object):
   cpdef write(self, bytes b, bint nested=*)
   cpdef write_byte(self, unsigned char val)
   cpdef write_var_int64(self, libc.stdint.int64_t v)
+  cpdef write_bigendian_int64(self, libc.stdint.int64_t signed_v)  # Emits 8 bytes.
+  cpdef write_bigendian_int32(self, libc.stdint.int32_t signed_v)  # Emits 4 bytes.
+  cpdef write_bigendian_double(self, double d)  # Emits 8 bytes.
 
   cpdef bytes get(self)
 
@@ -38,4 +41,7 @@ cdef class InputStream(object):
   cpdef bytes read(self, size_t len)
   cpdef long read_byte(self) except? -1
   cpdef libc.stdint.int64_t read_var_int64(self) except? -1
+  cpdef libc.stdint.int64_t read_bigendian_int64(self) except? -1  # -1 may also be a real value.
+  cpdef libc.stdint.int32_t read_bigendian_int32(self) except? -1  # -1 may also be a real value.
+  cpdef double read_bigendian_double(self) except? -1  # -1.0 may also be a real value.
   cpdef bytes read_all(self, bint nested=*)
diff --git a/google/cloud/dataflow/coders/stream.pyx b/google/cloud/dataflow/coders/stream.pyx
@@ -40,7 +40,6 @@ cdef class OutputStream(object):
     self.pos += blen
 
   cpdef write_byte(self, unsigned char val):
-    assert 0 <= val <= 0xFF
     if  self.size <= self.pos:
       self.extend(1)
     self.data[self.pos] = val
@@ -59,6 +58,33 @@ cdef class OutputStream(object):
       if not v:
         break
 
+  cpdef write_bigendian_int64(self, libc.stdint.int64_t signed_v):  # Appends signed_v as 8 bytes.
+    cdef libc.stdint.uint64_t v = signed_v  # Unsigned copy so the right shifts are logical.
+    if self.size < self.pos + 8:  # Grow unless 8 more bytes fit (was pos - 8: never grew in time).
+      self.extend(8)
+    self.data[self.pos    ] = <unsigned char>(v >> 56)  # Most significant byte first.
+    self.data[self.pos + 1] = <unsigned char>(v >> 48)
+    self.data[self.pos + 2] = <unsigned char>(v >> 40)
+    self.data[self.pos + 3] = <unsigned char>(v >> 32)
+    self.data[self.pos + 4] = <unsigned char>(v >> 24)
+    self.data[self.pos + 5] = <unsigned char>(v >> 16)
+    self.data[self.pos + 6] = <unsigned char>(v >>  8)
+    self.data[self.pos + 7] = <unsigned char>(v      )  # Least significant byte last.
+    self.pos += 8
+
+  cpdef write_bigendian_int32(self, libc.stdint.int32_t signed_v):  # Appends signed_v as 4 bytes.
+    cdef libc.stdint.uint32_t v = signed_v  # Unsigned copy so the right shifts are logical.
+    if self.size < self.pos + 4:  # Grow unless 4 more bytes fit (was pos - 4: never grew in time).
+      self.extend(4)
+    self.data[self.pos    ] = <unsigned char>(v >> 24)  # Most significant byte first.
+    self.data[self.pos + 1] = <unsigned char>(v >> 16)
+    self.data[self.pos + 2] = <unsigned char>(v >>  8)
+    self.data[self.pos + 3] = <unsigned char>(v      )  # Least significant byte last.
+    self.pos += 4
+
+  cpdef write_bigendian_double(self, double d):
+    self.write_bigendian_int64((<libc.stdint.int64_t*><char*>&d)[0])  # Reinterpret the double's bits as int64.
+
   cpdef bytes get(self):
     return self.data[:self.pos]
 
@@ -111,3 +137,25 @@ cdef class InputStream(object):
       if not (byte & 0x80):
         break
     return result
+
+  cpdef libc.stdint.int64_t read_bigendian_int64(self) except? -1:  # NOTE(review): no bounds check visible; confirm 8 bytes remain.
+    self.pos += 8  # Advance first; bytes are indexed back from the new pos.
+    return (<unsigned char>self.allc[self.pos - 1]  # Least significant byte.
+      | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 2] <<  8
+      | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 3] << 16
+      | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 4] << 24
+      | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 5] << 32
+      | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 6] << 40
+      | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 7] << 48
+      | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 8] << 56)  # Most significant byte (first in stream).
+
+  cpdef libc.stdint.int32_t read_bigendian_int32(self) except? -1:  # NOTE(review): no bounds check visible; confirm 4 bytes remain.
+    self.pos += 4  # Advance first; bytes are indexed back from the new pos.
+    return (<unsigned char>self.allc[self.pos - 1]  # Least significant byte.
+      | <libc.stdint.uint32_t><unsigned char>self.allc[self.pos - 2] <<  8
+      | <libc.stdint.uint32_t><unsigned char>self.allc[self.pos - 3] << 16
+      | <libc.stdint.uint32_t><unsigned char>self.allc[self.pos - 4] << 24)  # Most significant byte (first in stream).
+
+  cpdef double read_bigendian_double(self) except? -1:
+    cdef libc.stdint.int64_t as_long = self.read_bigendian_int64()  # Raw 64 bits from the stream.
+    return (<double*><char*>&as_long)[0]  # Reinterpret those bits as a double.
diff --git a/google/cloud/dataflow/coders/stream_test.py b/google/cloud/dataflow/coders/stream_test.py
@@ -72,6 +72,33 @@ def test_medium_var_int64(self):
   def test_large_var_int64(self):
     self.run_read_write_var_int64([0, 2**63 - 1, -2**63, 2**63 - 3])
 
+  def test_read_write_double(self):
+    values = 0, 1, -1, 1e100, 1.0/3, math.pi, float('inf')  # NaN is not covered.
+    out_s = self.OutputStream()
+    for v in values:
+      out_s.write_bigendian_double(v)
+    in_s = self.InputStream(out_s.get())  # Read back from the serialized bytes.
+    for v in values:
+      self.assertEquals(v, in_s.read_bigendian_double())  # Exact round-trip.
+
+  def test_read_write_bigendian_int64(self):
+    values = 0, 1, -1, 2**63-1, -2**63, int(2**61 * math.pi)  # Both int64 extremes.
+    out_s = self.OutputStream()
+    for v in values:
+      out_s.write_bigendian_int64(v)
+    in_s = self.InputStream(out_s.get())  # Read back from the serialized bytes.
+    for v in values:
+      self.assertEquals(v, in_s.read_bigendian_int64())  # Exact round-trip.
+
+  def test_read_write_bigendian_int32(self):
+    values = 0, 1, -1, 2**31-1, -2**31, int(2**29 * math.pi)  # Both int32 extremes.
+    out_s = self.OutputStream()
+    for v in values:
+      out_s.write_bigendian_int32(v)
+    in_s = self.InputStream(out_s.get())  # Read back from the serialized bytes.
+    for v in values:
+      self.assertEquals(v, in_s.read_bigendian_int32())  # Exact round-trip.
+
 
 try:
   # pylint: disable=g-import-not-at-top