Hello community, here is the log from the commit of package python-fastparquet for openSUSE:Factory checked in at 2018-11-26 10:29:27 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/python-fastparquet (Old) and /work/SRC/openSUSE:Factory/.python-fastparquet.new.19453 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-fastparquet" Mon Nov 26 10:29:27 2018 rev:4 rq:651329 version:0.2.0 Changes: -------- --- /work/SRC/openSUSE:Factory/python-fastparquet/python-fastparquet.changes 2018-10-02 19:48:31.629762182 +0200 +++ /work/SRC/openSUSE:Factory/.python-fastparquet.new.19453/python-fastparquet.changes 2018-11-26 10:29:43.165069276 +0100 @@ -1,0 +2,13 @@ +Thu Nov 22 22:47:24 UTC 2018 - Arun Persaud <a...@gmx.de> + +- update to version 0.2.0: + * Don't mutate column list input (#383) (#384) + * Add optional requirements to extras_require (#380) + * Fix "broken link to parquet-format page" (#377) + * Add .c file to repo + * Handle rows split across 2 pages in the case of a map (#369) + * Fixes 370 (#371) + * Handle multi-page maps (#368) + * Handle zero-column files. Closes #361. (#363) + +------------------------------------------------------------------- Old: ---- fastparquet-0.1.6.tar.gz New: ---- fastparquet-0.2.0.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ python-fastparquet.spec ++++++ --- /var/tmp/diff_new_pack.UYGH4o/_old 2018-11-26 10:29:43.761068577 +0100 +++ /var/tmp/diff_new_pack.UYGH4o/_new 2018-11-26 10:29:43.765068572 +0100 @@ -20,7 +20,7 @@ # Test files not included %bcond_with test Name: python-fastparquet -Version: 0.1.6 +Version: 0.2.0 Release: 0 Summary: Python support for Parquet file format License: Apache-2.0 ++++++ fastparquet-0.1.6.tar.gz -> fastparquet-0.2.0.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/PKG-INFO new/fastparquet-0.2.0/PKG-INFO --- old/fastparquet-0.1.6/PKG-INFO 2018-08-20 00:31:06.000000000 +0200 +++ new/fastparquet-0.2.0/PKG-INFO 2018-11-22 17:33:29.000000000 +0100 @@ -1,6 +1,6 @@ -Metadata-Version: 1.2 +Metadata-Version: 2.1 Name: fastparquet -Version: 0.1.6 +Version: 0.2.0 Summary: Python support for Parquet file format Home-page: https://github.com/dask/fastparquet/ Author: Martin Durant @@ -133,3 +133,8 @@ Classifier: Programming Language :: Python :: 3.7 Classifier: Programming Language :: Python :: Implementation :: CPython Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, +Provides-Extra: lz4 +Provides-Extra: zstandard +Provides-Extra: lzo +Provides-Extra: snappy +Provides-Extra: brotli diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/docs/source/index.rst new/fastparquet-0.2.0/docs/source/index.rst --- old/fastparquet-0.1.6/docs/source/index.rst 2018-08-20 00:30:58.000000000 +0200 +++ new/fastparquet-0.2.0/docs/source/index.rst 2018-11-22 17:29:54.000000000 +0100 @@ -6,7 +6,7 @@ Introduction ------------ -The `Parquet format <https://github.com/Parquet/parquet-format>`_ is a common binary data store, used +The `Parquet format <https://github.com/apache/parquet-format>`_ is a common binary data store, used particularly in the Hadoop/big-data sphere. 
It provides several advantages relevant to big-data processing: diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet/__init__.py new/fastparquet-0.2.0/fastparquet/__init__.py --- old/fastparquet-0.1.6/fastparquet/__init__.py 2018-08-20 00:30:58.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet/__init__.py 2018-11-22 17:31:49.000000000 +0100 @@ -11,4 +11,4 @@ from .api import ParquetFile from .util import ParquetException -__version__ = "0.1.6" +__version__ = "0.2.0" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet/api.py new/fastparquet-0.2.0/fastparquet/api.py --- old/fastparquet-0.1.6/fastparquet/api.py 2018-08-20 00:30:58.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet/api.py 2018-11-22 17:29:54.000000000 +0100 @@ -153,8 +153,10 @@ self.group_files.setdefault(i, set()).add(chunk.file_path) self.schema = schema.SchemaHelper(self._schema) self.selfmade = self.created_by.split(' ', 1)[0] == "fastparquet-python" - self.file_scheme = get_file_scheme([rg.columns[0].file_path - for rg in self.row_groups]) + files = [rg.columns[0].file_path + for rg in self.row_groups + if rg.columns] + self.file_scheme = get_file_scheme(files) self._read_partitions() self._dtypes() @@ -215,7 +217,7 @@ for key, v in cats.items()]) def row_group_filename(self, rg): - if rg.columns[0].file_path: + if rg.columns and rg.columns[0].file_path: base = self.fn.replace('_metadata', '').rstrip('/') if base: return join_path(base, rg.columns[0].file_path) @@ -410,7 +412,10 @@ rgs = self.filter_row_groups(filters) size = sum(rg.num_rows for rg in rgs) index = self._get_index(index) - columns = columns or self.columns + if columns is not None: + columns = columns[:] + else: + columns = self.columns if index: columns += [i for i in index if i not in columns] check_column_names(self.columns + list(self.cats), columns, categories) @@ -680,7 +685,7 @@ return d -def sorted_partitioned_columns(pf): +def sorted_partitioned_columns(pf, filters=None): """ The columns that are known to be sorted partition-by-partition @@ -701,6 +706,13 @@ statistics """ s = statistics(pf) + if (filters is not None) & (filters != []): + idx_list = [i for i, rg in enumerate(pf.row_groups) if + not(filter_out_stats(rg, filters, pf.schema)) and + not(filter_out_cats(rg, filters))] + for stat in s.keys(): + for col in s[stat].keys(): + s[stat][col] = [s[stat][col][i] for i in idx_list] columns = pf.columns out = dict() for c in columns: diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet/core.py new/fastparquet-0.2.0/fastparquet/core.py --- old/fastparquet-0.1.6/fastparquet/core.py 2018-08-20 00:30:58.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet/core.py 2018-09-04 16:06:56.000000000 +0200 @@ -225,6 +225,7 @@ my_nan = None num = 0 + row_idx = 0 while True: if (selfmade and hasattr(cmd, 'statistics') and getattr(cmd.statistics, 'null_count', 1) == 0): @@ -249,8 +250,8 @@ null = not schema_helper.is_required(cmd.path_in_schema[0]) null_val = (se.repetition_type != parquet_thrift.FieldRepetitionType.REQUIRED) - num = encoding._assemble_objects(assign, defi, rep, val, dic, d, - null, null_val, max_defi) + row_idx = 1 + encoding._assemble_objects(assign, defi, rep, val, dic, d, + null, null_val, max_defi, row_idx) elif defi is not None: max_defi = schema_helper.max_definition_level(cmd.path_in_schema) part = 
assign[num:num+len(defi)] diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet/encoding.py new/fastparquet-0.2.0/fastparquet/encoding.py --- old/fastparquet-0.1.6/fastparquet/encoding.py 2018-08-20 00:30:58.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet/encoding.py 2018-11-04 19:12:37.000000000 +0100 @@ -224,7 +224,7 @@ Numpy32 = numba.jitclass(spec32)(NumpyIO) -def _assemble_objects(assign, defi, rep, val, dic, d, null, null_val, max_defi): +def _assemble_objects(assign, defi, rep, val, dic, d, null, null_val, max_defi, prev_i): """Dremel-assembly of arrays of values into lists Parameters @@ -245,12 +245,14 @@ can list elements be None max_defi: int value of definition level that corresponds to non-null + prev_i: int + 1 + index where the last row in the previous page was inserted (0 if first page) """ ## TODO: good case for cython if d: # dereference dict values val = dic[val] - i = 0 + i = prev_i vali = 0 part = [] started = False @@ -265,7 +267,11 @@ part = [] i += 1 else: - # first time: no row to save yet + # first time: no row to save yet, unless it's a row continued from previous page + if vali > 0: + assign[i - 1].extend(part) # add the items to previous row + part = [] + # don't increment i since we only filled i-1 started = True if de == max_defi: # append real value to current item @@ -276,7 +282,10 @@ part.append(None) # next object is None as opposed to an object have_null = de == 0 and null - assign[i] = None if have_null else part + if started: # normal case - add the leftovers to the next row + assign[i] = None if have_null else part + else: # can only happen if the only elements in this page are the continuation of the last row from previous page + assign[i - 1].extend(part) return i diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet/speedups.c new/fastparquet-0.2.0/fastparquet/speedups.c --- old/fastparquet-0.1.6/fastparquet/speedups.c 2018-08-20 00:31:06.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet/speedups.c 2018-11-22 17:29:45.000000000 +0100 @@ -1723,7 +1723,6 @@ static const char __pyx_k_RuntimeError[] = "RuntimeError"; static const char __pyx_k_AttributeError[] = "AttributeError"; static const char __pyx_k_pack_byte_array[] = "pack_byte_array"; -static const char __pyx_k_Ran_out_of_input[] = "Ran out of input"; static const char __pyx_k_array_decode_utf8[] = "array_decode_utf8"; static const char __pyx_k_array_encode_utf8[] = "array_encode_utf8"; static const char __pyx_k_unpack_byte_array[] = "unpack_byte_array"; @@ -1736,6 +1735,7 @@ static const char __pyx_k_expected_an_object_array[] = "expected an object array"; static const char __pyx_k_fastparquet_speedups_pyx[] = "fastparquet/speedups.pyx"; static const char __pyx_k_ndarray_is_not_C_contiguous[] = "ndarray is not C contiguous"; +static const char __pyx_k_Ran_out_of_input_after_i_items[] = "Ran out of input after %i items"; static const char __pyx_k_Native_accelerators_for_Parquet[] = "\nNative accelerators for Parquet encoding and decoding.\n"; static const char __pyx_k_numpy_core_multiarray_failed_to[] = "numpy.core.multiarray failed to import"; static const char __pyx_k_unknown_dtype_code_in_numpy_pxd[] = "unknown dtype code in numpy.pxd (%d)"; @@ -1750,7 +1750,7 @@ static PyObject *__pyx_kp_u_Format_string_allocated_too_shor_2; static PyObject *__pyx_n_s_ImportError; static PyObject *__pyx_kp_u_Non_native_byte_order_not_suppor; -static PyObject 
*__pyx_kp_s_Ran_out_of_input; +static PyObject *__pyx_kp_s_Ran_out_of_input_after_i_items; static PyObject *__pyx_n_s_RuntimeError; static PyObject *__pyx_n_s_TypeError; static PyObject *__pyx_n_s_ValueError; @@ -1822,19 +1822,17 @@ static PyObject *__pyx_tuple__13; static PyObject *__pyx_tuple__14; static PyObject *__pyx_tuple__15; -static PyObject *__pyx_tuple__16; static PyObject *__pyx_tuple__17; static PyObject *__pyx_tuple__19; static PyObject *__pyx_tuple__21; static PyObject *__pyx_tuple__23; static PyObject *__pyx_tuple__25; -static PyObject *__pyx_tuple__27; +static PyObject *__pyx_codeobj__16; static PyObject *__pyx_codeobj__18; static PyObject *__pyx_codeobj__20; static PyObject *__pyx_codeobj__22; static PyObject *__pyx_codeobj__24; static PyObject *__pyx_codeobj__26; -static PyObject *__pyx_codeobj__28; /* Late includes */ /* "fastparquet/speedups.pyx":21 @@ -3345,6 +3343,7 @@ Py_ssize_t __pyx_t_4; Py_ssize_t __pyx_t_5; int __pyx_t_6; + PyObject *__pyx_t_7 = NULL; __Pyx_RefNannySetupContext("unpack_byte_array", 0); /* "fastparquet/speedups.pyx":147 @@ -3411,7 +3410,7 @@ * # It is required to check this inside the loop to avoid * # out of bounds array accesses. * if remaining < 0: # <<<<<<<<<<<<<< - * raise RuntimeError("Ran out of input") + * raise RuntimeError("Ran out of input after %i items" % i) * itemlen = (data[0] + (data[1] << 8) + */ __pyx_t_6 = ((__pyx_v_remaining < 0) != 0); @@ -3420,12 +3419,18 @@ /* "fastparquet/speedups.pyx":156 * # out of bounds array accesses. * if remaining < 0: - * raise RuntimeError("Ran out of input") # <<<<<<<<<<<<<< + * raise RuntimeError("Ran out of input after %i items" % i) # <<<<<<<<<<<<<< * itemlen = (data[0] + (data[1] << 8) + * (data[2] << 16) + (data[3] << 24)) */ - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__5, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 156, __pyx_L1_error) + __pyx_t_2 = PyInt_FromSsize_t(__pyx_v_i); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 156, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_7 = __Pyx_PyString_Format(__pyx_kp_s_Ran_out_of_input_after_i_items, __pyx_t_2); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 156, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_builtin_RuntimeError, __pyx_t_7); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 156, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; __Pyx_Raise(__pyx_t_2, 0, 0, 0); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __PYX_ERR(0, 156, __pyx_L1_error) @@ -3434,13 +3439,13 @@ * # It is required to check this inside the loop to avoid * # out of bounds array accesses. 
* if remaining < 0: # <<<<<<<<<<<<<< - * raise RuntimeError("Ran out of input") + * raise RuntimeError("Ran out of input after %i items" % i) * itemlen = (data[0] + (data[1] << 8) + */ } /* "fastparquet/speedups.pyx":158 - * raise RuntimeError("Ran out of input") + * raise RuntimeError("Ran out of input after %i items" % i) * itemlen = (data[0] + (data[1] << 8) + * (data[2] << 16) + (data[3] << 24)) # <<<<<<<<<<<<<< * data += 4 @@ -3462,7 +3467,7 @@ * * remaining -= itemlen # <<<<<<<<<<<<<< * if remaining < 0: - * raise RuntimeError("Ran out of input") + * raise RuntimeError("Ran out of input after %i items" % i) */ __pyx_v_remaining = (__pyx_v_remaining - __pyx_v_itemlen); @@ -3470,7 +3475,7 @@ * * remaining -= itemlen * if remaining < 0: # <<<<<<<<<<<<<< - * raise RuntimeError("Ran out of input") + * raise RuntimeError("Ran out of input after %i items" % i) * out[i] = PyBytes_FromStringAndSize(<char *> data, itemlen) */ __pyx_t_6 = ((__pyx_v_remaining < 0) != 0); @@ -3479,12 +3484,18 @@ /* "fastparquet/speedups.pyx":163 * remaining -= itemlen * if remaining < 0: - * raise RuntimeError("Ran out of input") # <<<<<<<<<<<<<< + * raise RuntimeError("Ran out of input after %i items" % i) # <<<<<<<<<<<<<< * out[i] = PyBytes_FromStringAndSize(<char *> data, itemlen) * data += itemlen */ - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__6, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 163, __pyx_L1_error) + __pyx_t_2 = PyInt_FromSsize_t(__pyx_v_i); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 163, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); + __pyx_t_7 = __Pyx_PyString_Format(__pyx_kp_s_Ran_out_of_input_after_i_items, __pyx_t_2); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 163, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_builtin_RuntimeError, __pyx_t_7); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 163, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; __Pyx_Raise(__pyx_t_2, 0, 0, 0); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __PYX_ERR(0, 163, __pyx_L1_error) @@ -3493,14 +3504,14 @@ * * remaining -= itemlen * if remaining < 0: # <<<<<<<<<<<<<< - * raise RuntimeError("Ran out of input") + * raise RuntimeError("Ran out of input after %i items" % i) * out[i] = PyBytes_FromStringAndSize(<char *> data, itemlen) */ } /* "fastparquet/speedups.pyx":164 * if remaining < 0: - * raise RuntimeError("Ran out of input") + * raise RuntimeError("Ran out of input after %i items" % i) * out[i] = PyBytes_FromStringAndSize(<char *> data, itemlen) # <<<<<<<<<<<<<< * data += itemlen * @@ -3511,7 +3522,7 @@ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; /* "fastparquet/speedups.pyx":165 - * raise RuntimeError("Ran out of input") + * raise RuntimeError("Ran out of input after %i items" % i) * out[i] = PyBytes_FromStringAndSize(<char *> data, itemlen) * data += itemlen # <<<<<<<<<<<<<< * @@ -3541,6 +3552,7 @@ /* function exit code */ __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_7); __Pyx_AddTraceback("fastparquet.speedups.unpack_byte_array", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; @@ -3666,7 +3678,7 @@ * * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) */ - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__7, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 229, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__5, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 229, 
__pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_Raise(__pyx_t_3, 0, 0, 0); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; @@ -3722,7 +3734,7 @@ * * info.buf = PyArray_DATA(self) */ - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__8, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 233, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__6, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 233, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_Raise(__pyx_t_3, 0, 0, 0); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; @@ -3979,7 +3991,7 @@ * if t == NPY_BYTE: f = "b" * elif t == NPY_UBYTE: f = "B" */ - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__9, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 263, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__7, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 263, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_Raise(__pyx_t_3, 0, 0, 0); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; @@ -4859,7 +4871,7 @@ * * if ((child.byteorder == c'>' and little_endian) or */ - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__10, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 810, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__8, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 810, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_Raise(__pyx_t_3, 0, 0, 0); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; @@ -4927,7 +4939,7 @@ * # One could encode it in the format string and have Cython * # complain instead, BUT: < and > in format strings also imply */ - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__11, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 814, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__9, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 814, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_Raise(__pyx_t_3, 0, 0, 0); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; @@ -5036,7 +5048,7 @@ * * # Until ticket #99 is fixed, use integers to avoid warnings */ - __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__12, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 834, __pyx_L1_error) + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__10, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 834, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_Raise(__pyx_t_4, 0, 0, 0); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; @@ -5710,7 +5722,7 @@ * * cdef inline int import_umath() except -1: */ - __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__13, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 1000, __pyx_L5_except_error) + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__11, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 1000, __pyx_L5_except_error) __Pyx_GOTREF(__pyx_t_8); __Pyx_Raise(__pyx_t_8, 0, 0, 0); __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; @@ -5839,7 +5851,7 @@ * * cdef inline int import_ufunc() except -1: */ - __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__14, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 1006, __pyx_L5_except_error) + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__12, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 1006, __pyx_L5_except_error) __Pyx_GOTREF(__pyx_t_8); __Pyx_Raise(__pyx_t_8, 0, 0, 0); __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; @@ -5965,7 +5977,7 @@ * except Exception: * raise ImportError("numpy.core.umath failed to 
import") # <<<<<<<<<<<<<< */ - __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__15, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 1012, __pyx_L5_except_error) + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__13, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 1012, __pyx_L5_except_error) __Pyx_GOTREF(__pyx_t_8); __Pyx_Raise(__pyx_t_8, 0, 0, 0); __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; @@ -6054,7 +6066,7 @@ {&__pyx_kp_u_Format_string_allocated_too_shor_2, __pyx_k_Format_string_allocated_too_shor_2, sizeof(__pyx_k_Format_string_allocated_too_shor_2), 0, 1, 0, 0}, {&__pyx_n_s_ImportError, __pyx_k_ImportError, sizeof(__pyx_k_ImportError), 0, 0, 1, 1}, {&__pyx_kp_u_Non_native_byte_order_not_suppor, __pyx_k_Non_native_byte_order_not_suppor, sizeof(__pyx_k_Non_native_byte_order_not_suppor), 0, 1, 0, 0}, - {&__pyx_kp_s_Ran_out_of_input, __pyx_k_Ran_out_of_input, sizeof(__pyx_k_Ran_out_of_input), 0, 0, 1, 0}, + {&__pyx_kp_s_Ran_out_of_input_after_i_items, __pyx_k_Ran_out_of_input_after_i_items, sizeof(__pyx_k_Ran_out_of_input_after_i_items), 0, 0, 1, 0}, {&__pyx_n_s_RuntimeError, __pyx_k_RuntimeError, sizeof(__pyx_k_RuntimeError), 0, 0, 1, 1}, {&__pyx_n_s_TypeError, __pyx_k_TypeError, sizeof(__pyx_k_TypeError), 0, 0, 1, 1}, {&__pyx_n_s_ValueError, __pyx_k_ValueError, sizeof(__pyx_k_ValueError), 0, 0, 1, 1}, @@ -6166,28 +6178,6 @@ __Pyx_GOTREF(__pyx_tuple__4); __Pyx_GIVEREF(__pyx_tuple__4); - /* "fastparquet/speedups.pyx":156 - * # out of bounds array accesses. - * if remaining < 0: - * raise RuntimeError("Ran out of input") # <<<<<<<<<<<<<< - * itemlen = (data[0] + (data[1] << 8) + - * (data[2] << 16) + (data[3] << 24)) - */ - __pyx_tuple__5 = PyTuple_Pack(1, __pyx_kp_s_Ran_out_of_input); if (unlikely(!__pyx_tuple__5)) __PYX_ERR(0, 156, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__5); - __Pyx_GIVEREF(__pyx_tuple__5); - - /* "fastparquet/speedups.pyx":163 - * remaining -= itemlen - * if remaining < 0: - * raise RuntimeError("Ran out of input") # <<<<<<<<<<<<<< - * out[i] = PyBytes_FromStringAndSize(<char *> data, itemlen) - * data += itemlen - */ - __pyx_tuple__6 = PyTuple_Pack(1, __pyx_kp_s_Ran_out_of_input); if (unlikely(!__pyx_tuple__6)) __PYX_ERR(0, 163, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__6); - __Pyx_GIVEREF(__pyx_tuple__6); - /* "../../anaconda/envs/py36/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":229 * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): @@ -6195,9 +6185,9 @@ * * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) */ - __pyx_tuple__7 = PyTuple_Pack(1, __pyx_kp_u_ndarray_is_not_C_contiguous); if (unlikely(!__pyx_tuple__7)) __PYX_ERR(1, 229, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__7); - __Pyx_GIVEREF(__pyx_tuple__7); + __pyx_tuple__5 = PyTuple_Pack(1, __pyx_kp_u_ndarray_is_not_C_contiguous); if (unlikely(!__pyx_tuple__5)) __PYX_ERR(1, 229, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__5); + __Pyx_GIVEREF(__pyx_tuple__5); /* "../../anaconda/envs/py36/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":233 * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) @@ -6206,9 +6196,9 @@ * * info.buf = PyArray_DATA(self) */ - __pyx_tuple__8 = PyTuple_Pack(1, __pyx_kp_u_ndarray_is_not_Fortran_contiguou); if (unlikely(!__pyx_tuple__8)) __PYX_ERR(1, 233, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__8); - __Pyx_GIVEREF(__pyx_tuple__8); + __pyx_tuple__6 = PyTuple_Pack(1, 
__pyx_kp_u_ndarray_is_not_Fortran_contiguou); if (unlikely(!__pyx_tuple__6)) __PYX_ERR(1, 233, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__6); + __Pyx_GIVEREF(__pyx_tuple__6); /* "../../anaconda/envs/py36/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":263 * if ((descr.byteorder == c'>' and little_endian) or @@ -6217,9 +6207,9 @@ * if t == NPY_BYTE: f = "b" * elif t == NPY_UBYTE: f = "B" */ - __pyx_tuple__9 = PyTuple_Pack(1, __pyx_kp_u_Non_native_byte_order_not_suppor); if (unlikely(!__pyx_tuple__9)) __PYX_ERR(1, 263, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__9); - __Pyx_GIVEREF(__pyx_tuple__9); + __pyx_tuple__7 = PyTuple_Pack(1, __pyx_kp_u_Non_native_byte_order_not_suppor); if (unlikely(!__pyx_tuple__7)) __PYX_ERR(1, 263, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__7); + __Pyx_GIVEREF(__pyx_tuple__7); /* "../../anaconda/envs/py36/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":810 * @@ -6228,9 +6218,9 @@ * * if ((child.byteorder == c'>' and little_endian) or */ - __pyx_tuple__10 = PyTuple_Pack(1, __pyx_kp_u_Format_string_allocated_too_shor); if (unlikely(!__pyx_tuple__10)) __PYX_ERR(1, 810, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__10); - __Pyx_GIVEREF(__pyx_tuple__10); + __pyx_tuple__8 = PyTuple_Pack(1, __pyx_kp_u_Format_string_allocated_too_shor); if (unlikely(!__pyx_tuple__8)) __PYX_ERR(1, 810, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__8); + __Pyx_GIVEREF(__pyx_tuple__8); /* "../../anaconda/envs/py36/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":814 * if ((child.byteorder == c'>' and little_endian) or @@ -6239,9 +6229,9 @@ * # One could encode it in the format string and have Cython * # complain instead, BUT: < and > in format strings also imply */ - __pyx_tuple__11 = PyTuple_Pack(1, __pyx_kp_u_Non_native_byte_order_not_suppor); if (unlikely(!__pyx_tuple__11)) __PYX_ERR(1, 814, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__11); - __Pyx_GIVEREF(__pyx_tuple__11); + __pyx_tuple__9 = PyTuple_Pack(1, __pyx_kp_u_Non_native_byte_order_not_suppor); if (unlikely(!__pyx_tuple__9)) __PYX_ERR(1, 814, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__9); + __Pyx_GIVEREF(__pyx_tuple__9); /* "../../anaconda/envs/py36/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":834 * t = child.type_num @@ -6250,9 +6240,9 @@ * * # Until ticket #99 is fixed, use integers to avoid warnings */ - __pyx_tuple__12 = PyTuple_Pack(1, __pyx_kp_u_Format_string_allocated_too_shor_2); if (unlikely(!__pyx_tuple__12)) __PYX_ERR(1, 834, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__12); - __Pyx_GIVEREF(__pyx_tuple__12); + __pyx_tuple__10 = PyTuple_Pack(1, __pyx_kp_u_Format_string_allocated_too_shor_2); if (unlikely(!__pyx_tuple__10)) __PYX_ERR(1, 834, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__10); + __Pyx_GIVEREF(__pyx_tuple__10); /* "../../anaconda/envs/py36/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1000 * _import_array() @@ -6261,9 +6251,9 @@ * * cdef inline int import_umath() except -1: */ - __pyx_tuple__13 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_multiarray_failed_to); if (unlikely(!__pyx_tuple__13)) __PYX_ERR(1, 1000, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__13); - __Pyx_GIVEREF(__pyx_tuple__13); + __pyx_tuple__11 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_multiarray_failed_to); if (unlikely(!__pyx_tuple__11)) __PYX_ERR(1, 1000, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__11); + __Pyx_GIVEREF(__pyx_tuple__11); /* "../../anaconda/envs/py36/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1006 * _import_umath() @@ -6272,18 
+6262,18 @@ * * cdef inline int import_ufunc() except -1: */ - __pyx_tuple__14 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_umath_failed_to_impor); if (unlikely(!__pyx_tuple__14)) __PYX_ERR(1, 1006, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__14); - __Pyx_GIVEREF(__pyx_tuple__14); + __pyx_tuple__12 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_umath_failed_to_impor); if (unlikely(!__pyx_tuple__12)) __PYX_ERR(1, 1006, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__12); + __Pyx_GIVEREF(__pyx_tuple__12); /* "../../anaconda/envs/py36/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1012 * _import_umath() * except Exception: * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< */ - __pyx_tuple__15 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_umath_failed_to_impor); if (unlikely(!__pyx_tuple__15)) __PYX_ERR(1, 1012, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__15); - __Pyx_GIVEREF(__pyx_tuple__15); + __pyx_tuple__13 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_umath_failed_to_impor); if (unlikely(!__pyx_tuple__13)) __PYX_ERR(1, 1012, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__13); + __Pyx_GIVEREF(__pyx_tuple__13); /* "fastparquet/speedups.pyx":18 * @@ -6292,9 +6282,9 @@ * * */ - __pyx_tuple__16 = PyTuple_Pack(1, __pyx_n_s_object); if (unlikely(!__pyx_tuple__16)) __PYX_ERR(0, 18, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__16); - __Pyx_GIVEREF(__pyx_tuple__16); + __pyx_tuple__14 = PyTuple_Pack(1, __pyx_n_s_object); if (unlikely(!__pyx_tuple__14)) __PYX_ERR(0, 18, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__14); + __Pyx_GIVEREF(__pyx_tuple__14); /* "fastparquet/speedups.pyx":21 * @@ -6303,10 +6293,10 @@ * """ * Convert *obj* (a ndarray-compatible object, e.g. pandas Series) */ - __pyx_tuple__17 = PyTuple_Pack(1, __pyx_n_s_obj); if (unlikely(!__pyx_tuple__17)) __PYX_ERR(0, 21, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__17); - __Pyx_GIVEREF(__pyx_tuple__17); - __pyx_codeobj__18 = (PyObject*)__Pyx_PyCode_New(1, 0, 1, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__17, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_to_array, 21, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__18)) __PYX_ERR(0, 21, __pyx_L1_error) + __pyx_tuple__15 = PyTuple_Pack(1, __pyx_n_s_obj); if (unlikely(!__pyx_tuple__15)) __PYX_ERR(0, 21, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__15); + __Pyx_GIVEREF(__pyx_tuple__15); + __pyx_codeobj__16 = (PyObject*)__Pyx_PyCode_New(1, 0, 1, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__15, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_to_array, 21, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__16)) __PYX_ERR(0, 21, __pyx_L1_error) /* "fastparquet/speedups.pyx":36 * @@ -6315,10 +6305,10 @@ * if arr.ndim != 1: * raise TypeError("expected a 1d array") */ - __pyx_tuple__19 = PyTuple_Pack(1, __pyx_n_s_arr); if (unlikely(!__pyx_tuple__19)) __PYX_ERR(0, 36, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__19); - __Pyx_GIVEREF(__pyx_tuple__19); - __pyx_codeobj__20 = (PyObject*)__Pyx_PyCode_New(1, 0, 1, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__19, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_check_1d_object_array, 36, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__20)) __PYX_ERR(0, 36, __pyx_L1_error) + __pyx_tuple__17 = PyTuple_Pack(1, __pyx_n_s_arr); if (unlikely(!__pyx_tuple__17)) __PYX_ERR(0, 36, 
__pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__17); + __Pyx_GIVEREF(__pyx_tuple__17); + __pyx_codeobj__18 = (PyObject*)__Pyx_PyCode_New(1, 0, 1, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__17, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_check_1d_object_array, 36, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__18)) __PYX_ERR(0, 36, __pyx_L1_error) /* "fastparquet/speedups.pyx":43 * @@ -6327,10 +6317,10 @@ * """ * utf-8 encode all elements of a 1d ndarray of "object" dtype. */ - __pyx_tuple__21 = PyTuple_Pack(5, __pyx_n_s_inp, __pyx_n_s_i, __pyx_n_s_n, __pyx_n_s_arr, __pyx_n_s_result); if (unlikely(!__pyx_tuple__21)) __PYX_ERR(0, 43, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__21); - __Pyx_GIVEREF(__pyx_tuple__21); - __pyx_codeobj__22 = (PyObject*)__Pyx_PyCode_New(1, 0, 5, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__21, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_array_encode_utf8, 43, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__22)) __PYX_ERR(0, 43, __pyx_L1_error) + __pyx_tuple__19 = PyTuple_Pack(5, __pyx_n_s_inp, __pyx_n_s_i, __pyx_n_s_n, __pyx_n_s_arr, __pyx_n_s_result); if (unlikely(!__pyx_tuple__19)) __PYX_ERR(0, 43, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__19); + __Pyx_GIVEREF(__pyx_tuple__19); + __pyx_codeobj__20 = (PyObject*)__Pyx_PyCode_New(1, 0, 5, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__19, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_array_encode_utf8, 43, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__20)) __PYX_ERR(0, 43, __pyx_L1_error) /* "fastparquet/speedups.pyx":65 * @@ -6339,10 +6329,10 @@ * """ * utf-8 decode all elements of a 1d ndarray of "object" dtype. */ - __pyx_tuple__23 = PyTuple_Pack(6, __pyx_n_s_inp, __pyx_n_s_i, __pyx_n_s_n, __pyx_n_s_arr, __pyx_n_s_result, __pyx_n_s_val); if (unlikely(!__pyx_tuple__23)) __PYX_ERR(0, 65, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__23); - __Pyx_GIVEREF(__pyx_tuple__23); - __pyx_codeobj__24 = (PyObject*)__Pyx_PyCode_New(1, 0, 6, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__23, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_array_decode_utf8, 65, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__24)) __PYX_ERR(0, 65, __pyx_L1_error) + __pyx_tuple__21 = PyTuple_Pack(6, __pyx_n_s_inp, __pyx_n_s_i, __pyx_n_s_n, __pyx_n_s_arr, __pyx_n_s_result, __pyx_n_s_val); if (unlikely(!__pyx_tuple__21)) __PYX_ERR(0, 65, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__21); + __Pyx_GIVEREF(__pyx_tuple__21); + __pyx_codeobj__22 = (PyObject*)__Pyx_PyCode_New(1, 0, 6, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__21, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_array_decode_utf8, 65, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__22)) __PYX_ERR(0, 65, __pyx_L1_error) /* "fastparquet/speedups.pyx":95 * @@ -6351,10 +6341,10 @@ * """ * Pack a variable length byte array column. 
*/ - __pyx_tuple__25 = PyTuple_Pack(9, __pyx_n_s_items, __pyx_n_s_i, __pyx_n_s_n, __pyx_n_s_itemlen, __pyx_n_s_total_size, __pyx_n_s_start, __pyx_n_s_data, __pyx_n_s_val, __pyx_n_s_out); if (unlikely(!__pyx_tuple__25)) __PYX_ERR(0, 95, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__25); - __Pyx_GIVEREF(__pyx_tuple__25); - __pyx_codeobj__26 = (PyObject*)__Pyx_PyCode_New(1, 0, 9, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__25, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_pack_byte_array, 95, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__26)) __PYX_ERR(0, 95, __pyx_L1_error) + __pyx_tuple__23 = PyTuple_Pack(9, __pyx_n_s_items, __pyx_n_s_i, __pyx_n_s_n, __pyx_n_s_itemlen, __pyx_n_s_total_size, __pyx_n_s_start, __pyx_n_s_data, __pyx_n_s_val, __pyx_n_s_out); if (unlikely(!__pyx_tuple__23)) __PYX_ERR(0, 95, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__23); + __Pyx_GIVEREF(__pyx_tuple__23); + __pyx_codeobj__24 = (PyObject*)__Pyx_PyCode_New(1, 0, 9, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__23, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_pack_byte_array, 95, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__24)) __PYX_ERR(0, 95, __pyx_L1_error) /* "fastparquet/speedups.pyx":135 * @@ -6363,10 +6353,10 @@ * """ * Unpack a variable length byte array column. */ - __pyx_tuple__27 = PyTuple_Pack(8, __pyx_n_s_raw_bytes, __pyx_n_s_n, __pyx_n_s_i, __pyx_n_s_itemlen, __pyx_n_s_remaining, __pyx_n_s_start, __pyx_n_s_data, __pyx_n_s_out); if (unlikely(!__pyx_tuple__27)) __PYX_ERR(0, 135, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__27); - __Pyx_GIVEREF(__pyx_tuple__27); - __pyx_codeobj__28 = (PyObject*)__Pyx_PyCode_New(2, 0, 8, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__27, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_unpack_byte_array, 135, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__28)) __PYX_ERR(0, 135, __pyx_L1_error) + __pyx_tuple__25 = PyTuple_Pack(8, __pyx_n_s_raw_bytes, __pyx_n_s_n, __pyx_n_s_i, __pyx_n_s_itemlen, __pyx_n_s_remaining, __pyx_n_s_start, __pyx_n_s_data, __pyx_n_s_out); if (unlikely(!__pyx_tuple__25)) __PYX_ERR(0, 135, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__25); + __Pyx_GIVEREF(__pyx_tuple__25); + __pyx_codeobj__26 = (PyObject*)__Pyx_PyCode_New(2, 0, 8, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__25, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_unpack_byte_array, 135, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__26)) __PYX_ERR(0, 135, __pyx_L1_error) __Pyx_RefNannyFinishContext(); return 0; __pyx_L1_error:; @@ -6654,7 +6644,7 @@ * * */ - __pyx_t_1 = __Pyx_PyObject_Call(((PyObject *)__pyx_ptype_5numpy_dtype), __pyx_tuple__16, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 18, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_Call(((PyObject *)__pyx_ptype_5numpy_dtype), __pyx_tuple__14, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 18, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_obj_dtype, __pyx_t_1) < 0) __PYX_ERR(0, 18, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet/test/test_api.py new/fastparquet-0.2.0/fastparquet/test/test_api.py --- 
old/fastparquet-0.1.6/fastparquet/test/test_api.py 2018-08-20 00:30:58.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet/test/test_api.py 2018-11-22 17:29:54.000000000 +0100 @@ -97,6 +97,36 @@ assert result == expected +def test_sorted_row_group_columns_with_filters(tempdir): + dd = pytest.importorskip('dask.dataframe') + # create dummy dataframe + df = pd.DataFrame({'unique': [0, 0, 1, 1, 2, 2, 3, 3], + 'id': ['id1', 'id2', + 'id1', 'id2', + 'id1', 'id2', + 'id1', 'id2']}, + index=[0, 0, 1, 1, 2, 2, 3, 3]) + df = dd.from_pandas(df, npartitions=2) + fn = os.path.join(tempdir, 'foo.parquet') + df.to_parquet(fn, + engine='fastparquet', + partition_on=['id']) + # load ParquetFile + pf = ParquetFile(fn) + filters = [('id', '==', 'id1')] + + # without filters no columns are sorted + result = sorted_partitioned_columns(pf) + expected = {} + assert result == expected + + # with filters both columns are sorted + result = sorted_partitioned_columns(pf, filters=filters) + expected = {'index': {'min': [0, 2], 'max': [1, 3]}, + 'unique': {'min': [0, 2], 'max': [1, 3]}} + assert result == expected + + def test_iter(tempdir): df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': [1.0, 2.0, 1.0, 2.0], @@ -417,6 +447,14 @@ assert out.index.name is None assert out.index.tolist() == [0, 1, 2] +def test_input_column_list_not_mutated(tempdir): + df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + write(tempdir, df, file_scheme='hive') + cols = ['a'] + pf = ParquetFile(tempdir) + out = pf.to_pandas(columns=cols) + assert cols == ['a'] + def test_drill_list(tempdir): df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]}) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet/test/test_read.py new/fastparquet-0.2.0/fastparquet/test/test_read.py --- old/fastparquet-0.1.6/fastparquet/test/test_read.py 2018-08-20 00:30:58.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet/test/test_read.py 2018-09-13 00:09:43.000000000 +0200 @@ -335,3 +335,88 @@ assert dg.index.levels[0].dtype == '<M8[ns]' assert dg.index.levels[1].name == 'b' assert dg.equals(df) + +def test_no_columns(tempdir): + # https://github.com/dask/fastparquet/issues/361 + # Create a non-empty DataFrame, then select no columns. That way we get + # _some_ rows, _no_ columns. 
+ # + # df = pd.DataFrame({"A": [1, 2]})[[]] + # fastparquet.write("test-data/no_columns.parquet", df) + pf = fastparquet.ParquetFile(os.path.join(TEST_DATA, "no_columns.parquet")) + assert pf.count == 2 + assert pf.columns == [] + result = pf.to_pandas() + expected = pd.DataFrame({"A": [1, 2]})[[]] + assert len(result) == 2 + pd.testing.assert_frame_equal(result, expected) + +def test_map_multipage(tempdir): + pf = fastparquet.ParquetFile(os.path.join(TEST_DATA, "map-test.snappy.parquet")) + assert pf.count == 3551 + df = pf.to_pandas() + first_row_keys = [u'FoxNews.com', u'News Network', u'mobile technology', u'broadcast', u'sustainability', + u'collective intelligence', u'radio', u'business law', u'LLC', u'telecommunications', + u'FOX News Network'] + last_row_keys = [u'protests', u'gas mask', u'Pot & Painting Party', u'Denver', u'New Year', u'Anderson Cooper', + u'gas mask bonk', u'digital media', u'marijuana leaf earrings', u'Screengrab', u'gas mask bongs', + u'Randi Kaye', u'Lee Rogers', u'Andy Cohen', u'CNN', u'Times Square', u'Colorado', u'opera', + u'slavery', u'Kathy Griffin', u'marijuana cigarette', u'executive producer'] + + assert len(df) == 3551 + assert sorted(df["topics"].iloc[0].keys()) == sorted(first_row_keys) + assert sorted(df["topics"].iloc[-1].keys()) == sorted(last_row_keys) + assert df.isnull().sum().sum() == 0 # ensure every row got converted + +def test_map_last_row_split(tempdir): + pf = fastparquet.ParquetFile(os.path.join(TEST_DATA, "test-map-last-row-split.parquet")) + assert pf.count == 2428 + df = pf.to_pandas() + # file has 3 pages - rows at index 1210 and 2427 are split in-between neighboring pages + first_split_row_keys = [u'White House', u'State Department', u'Tatverd\xe4chtige', u'financial economics', + u'Hezbollah', u'Bashar Assad', u'break-down', u'paper', u'radio', u'musicals', + u'Vladimir Putin', u'Hill two', u'The New York Times and Washington Post', u'tweet', + u'guest bedroom', u'Susie Tompkins Buell', u'private law', u'Tammy Bruce', + u'Obama Presidential Library', u'Fox News', u'President Trump', u'John Kerry', + u'Vanity Fair', u'government', u'Josh Meyer', u'The Hill', u'Esprit Clothing', + u'Rainer Wendt', u'Fitness', u'u.n.', u'David Brock', u'fleas', u'Trump', u'WORKOUT', + u'Washington', u'Brandenburg Gate', u'Lisa Bloom', u'festgenommen', u'journalist', + u'Kolleg', u'Middle East', u'financial markets', u'gym equipment', u'weight training', + u'reference', u'Solche Taten', u'digital radio', u'Stephen l. 
Miller', u'Belleon Body', + u'harassment', u'East', u'investment', u'creatures', u'Islamic Republic', u'New Year', + u'New York City', u'Media Research center', u'Neue Osnabruecker Zeitung daily newspaper', + u'Berlin', u'gegen diese Taten vorgehen', u'safety', u'Jarrett Blanc', u'Tehran', + u'America', u'Black Lives Matter', u'pussy hats', + u'wurden bislang leider vereinzelt sexuelle \xdcbergriffe gemeldet', u'Roger Cohen', + u'u.s.', u'Donald Trump', u'Emily Shire', u'hardline', u'common law', u'animal workouts', + u'Hamas', u'operas', u'New York Times', u'Amanda Hess', u'Adrian Carrasquillo', + u'Lukas Mikelionis', u'Koi', u'TOUGHEST MUDDER', u'Middle Eastern', u'Erik Wemple', + u'Associated Press', u'Iran', u'out-of-pocket expenses', u'Neue Osnabruecker Zeitung', + u'lizards', u'Carlos Leon', u'Polizei Berlin Einsatz', u'Russia', u'Russian', + u'Berlin Wall', u'Obama', u'The Times', u'The New York Post', u'Mark Halperin', + u'learning programs', u'NBC', u'American', u'Jeff Bell', + u'Heat Street and National Review Online', u'Dan Merica', u'Tel Aviv', + u'Wielding Money', u'anxiety', u'Bell', u'Twitter', u'Hillary Clinton', + u'physical exercise', u'Fellow Times', u'property', u'Paul Krugman', u'FoxNews.com', + u'Times Square New Year', u'Mika Brzezinksi', u'Ayatollah Ali Khamenei', u'Nikki Haley', + u'Obama Library', u'internet-based works', u'Quadriga', u'Washington Post', + u'Angela Merkel', u'Manhattan', u'United Nations', u'information', u'Israel', + u'Wir haben zivile', u'administration', u'United States', u'Maya Kosoff', u'Germany', + u'donor', u'television terminology', u'Bloom', u'The Washington Post', u'Jack Shafer', + u'Bei den Veranstaltungen', u'singles', u'uprising', u'reporting', u'AP', + u'Fox News Opinion', u'celebrity lawyer', u'Dan Gainor', u'CNN', u'Syria', + u'business law', u'inspiration', u'regime', u'Politico', u'Democratic Party', + u'The New York Times', u'websites', u'socio-economics', u'Jerusalem'] + second_split_row_keys = [u'Stockton University', u'Walter Montelione', u'law enforcement', u'shooting', + u'international incidents', u'NYE', u'Linda Kologi', u'criminal law', + u'Long Branch Police Department', u'Kaitlyn Schallhorn', u'Brittany Kologi', u'suspect', + u'teenager', u'Monmouth County', u'television terminology', u'Fox News', u'Long Branch', + u'Monmouth County prosecutor\u2019s Office', u'Galloway Township', u'Dave Farmer', + u'Steven Kologi jr.', u'u.s.', u'incident', u'WCBS-TV', u'Christopher j. 
Gramiccioni', + u"Diane D'Amico", u'New Jersey', u'shooter', u'maritime incidents', + u'Monmouth County Prosecutor', u'Steven Kologi', u'Bryan Llenas', u'Mary Schultz', + u'NJ.com', u'n.j.', u'Veronica Mass'] + assert len(df) == 2428 + assert sorted(df["topics"].iloc[1210].keys()) == sorted(first_split_row_keys) + assert sorted(df["topics"].iloc[2427].keys()) == sorted(second_split_row_keys) + assert df.isnull().sum().sum() == 0 \ No newline at end of file diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet/writer.py new/fastparquet-0.2.0/fastparquet/writer.py --- old/fastparquet-0.1.6/fastparquet/writer.py 2018-08-20 00:30:58.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet/writer.py 2018-09-30 17:02:24.000000000 +0200 @@ -984,7 +984,7 @@ for rg in fmd.row_groups: for col in rg.columns: if ".".join(col.meta_data.path_in_schema) == cat['name']: - ncats = [k.value for k in col.meta_data.key_value_metadata + ncats = [k.value for k in (col.meta_data.key_value_metadata or []) if k.key == 'num_categories'] if ncats and int(ncats[0]) > cat['metadata'][ 'num_categories']: diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet.egg-info/PKG-INFO new/fastparquet-0.2.0/fastparquet.egg-info/PKG-INFO --- old/fastparquet-0.1.6/fastparquet.egg-info/PKG-INFO 2018-08-20 00:31:06.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet.egg-info/PKG-INFO 2018-11-22 17:33:29.000000000 +0100 @@ -1,6 +1,6 @@ -Metadata-Version: 1.2 +Metadata-Version: 2.1 Name: fastparquet -Version: 0.1.6 +Version: 0.2.0 Summary: Python support for Parquet file format Home-page: https://github.com/dask/fastparquet/ Author: Martin Durant @@ -133,3 +133,8 @@ Classifier: Programming Language :: Python :: 3.7 Classifier: Programming Language :: Python :: Implementation :: CPython Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, +Provides-Extra: lz4 +Provides-Extra: zstandard +Provides-Extra: lzo +Provides-Extra: snappy +Provides-Extra: brotli diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet.egg-info/pbr.json new/fastparquet-0.2.0/fastparquet.egg-info/pbr.json --- old/fastparquet-0.1.6/fastparquet.egg-info/pbr.json 2018-08-20 00:31:06.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet.egg-info/pbr.json 2018-11-22 17:33:29.000000000 +0100 @@ -1 +1 @@ -{"git_version": "5f06d4e", "is_release": true} \ No newline at end of file +{"git_version": "65283e2", "is_release": true} \ No newline at end of file diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet.egg-info/requires.txt new/fastparquet-0.2.0/fastparquet.egg-info/requires.txt --- old/fastparquet-0.1.6/fastparquet.egg-info/requires.txt 2018-08-20 00:31:06.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet.egg-info/requires.txt 2018-11-22 17:33:29.000000000 +0100 @@ -4,3 +4,18 @@ thrift>=0.11.0 six pytest-runner + +[brotli] +brotli + +[lz4] +lz4>=0.19.1 + +[lzo] +python-lzo + +[snappy] +python-snappy + +[zstandard] +zstandard diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/setup.py new/fastparquet-0.2.0/setup.py --- old/fastparquet-0.1.6/setup.py 2018-08-20 00:30:58.000000000 +0200 +++ new/fastparquet-0.2.0/setup.py 2018-11-22 17:31:49.000000000 +0100 @@ -54,7 +54,7 @@ setup( name='fastparquet', - version='0.1.6', + 
version='0.2.0', description='Python support for Parquet file format', author='Martin Durant', author_email='mdur...@continuum.io', @@ -81,6 +81,13 @@ # 'pytest-runner', # [p for p in install_requires if p.startswith('numpy')][0] #], + extras_require={ + 'brotli': ['brotli'], + 'lz4': ['lz4 >= 0.19.1'], + 'lzo': ['python-lzo'], + 'snappy': ['python-snappy'], + 'zstandard': ['zstandard'] + }, tests_require=[ 'pytest', 'python-snappy',
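
To make the api.py changes above concrete, here is a minimal sketch exercising the two new behaviours: to_pandas() now copies the caller's column list before appending index columns (#383/#384), and sorted_partitioned_columns() accepts a filters= argument that drops the statistics of filtered-out row groups before the min/max sortedness check. The dataset path and filter value are hypothetical; the calls mirror the new tests in test_api.py:

    import fastparquet
    from fastparquet.api import sorted_partitioned_columns

    pf = fastparquet.ParquetFile('mydata.parq')  # hypothetical partitioned dataset

    # The input list is no longer mutated, even though index columns are
    # still appended internally to the set of columns actually read.
    cols = ['a']
    df = pf.to_pandas(columns=cols)
    assert cols == ['a']

    # Only row groups that survive the filter contribute statistics to
    # the partition-by-partition sortedness check.
    sorted_cols = sorted_partitioned_columns(pf, filters=[('id', '==', 'id1')])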
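
Similarly, the zero-column fix (#361/#363) can be reproduced with the recipe quoted in the comment of test_no_columns above: a non-empty DataFrame with every column dropped, so the file has some rows but no columns (output path hypothetical):

    import pandas as pd
    import fastparquet

    df = pd.DataFrame({"A": [1, 2]})[[]]       # two rows, zero columns
    fastparquet.write('no_columns.parquet', df)

    pf = fastparquet.ParquetFile('no_columns.parquet')
    assert pf.count == 2
    assert pf.columns == []
    assert len(pf.to_pandas()) == 2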
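
Finally, the new extras_require table in setup.py (mirrored in requires.txt above) lets the optional compression codecs be pulled in as pip extras rather than installed by hand. The commands below are ordinary pip extras syntax, not part of the diff:

    pip install "fastparquet[snappy]"          # pulls in python-snappy
    pip install "fastparquet[lz4,zstandard]"   # several extras at once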