Hello community, here is the log from the commit of package python-fastparquet for openSUSE:Factory checked in at 2018-11-26 10:29:27 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/python-fastparquet (Old) and /work/SRC/openSUSE:Factory/.python-fastparquet.new.19453 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-fastparquet" Mon Nov 26 10:29:27 2018 rev:4 rq:651329 version:0.2.0 Changes: -------- --- /work/SRC/openSUSE:Factory/python-fastparquet/python-fastparquet.changes 2018-10-02 19:48:31.629762182 +0200 +++ /work/SRC/openSUSE:Factory/.python-fastparquet.new.19453/python-fastparquet.changes 2018-11-26 10:29:43.165069276 +0100 @@ -1,0 +2,13 @@ +Thu Nov 22 22:47:24 UTC 2018 - Arun Persaud <a...@gmx.de> + +- update to version 0.2.0: + * Don't mutate column list input (#383) (#384) + * Add optional requirements to extras_require (#380) + * Fix "broken link to parquet-format page" (#377) + * Add .c file to repo + * Handle rows split across 2 pages in the case of a map (#369) + * Fixes 370 (#371) + * Handle multi-page maps (#368) + * Handle zero-column files. Closes #361. (#363) + +------------------------------------------------------------------- Old: ---- fastparquet-0.1.6.tar.gz New: ---- fastparquet-0.2.0.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ python-fastparquet.spec ++++++ --- /var/tmp/diff_new_pack.UYGH4o/_old 2018-11-26 10:29:43.761068577 +0100 +++ /var/tmp/diff_new_pack.UYGH4o/_new 2018-11-26 10:29:43.765068572 +0100 @@ -20,7 +20,7 @@ # Test files not included %bcond_with test Name: python-fastparquet -Version: 0.1.6 +Version: 0.2.0 Release: 0 Summary: Python support for Parquet file format License: Apache-2.0 ++++++ fastparquet-0.1.6.tar.gz -> fastparquet-0.2.0.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/PKG-INFO new/fastparquet-0.2.0/PKG-INFO --- old/fastparquet-0.1.6/PKG-INFO 2018-08-20 00:31:06.000000000 +0200 +++ new/fastparquet-0.2.0/PKG-INFO 2018-11-22 17:33:29.000000000 +0100 @@ -1,6 +1,6 @@ -Metadata-Version: 1.2 +Metadata-Version: 2.1 Name: fastparquet -Version: 0.1.6 +Version: 0.2.0 Summary: Python support for Parquet file format Home-page: https://github.com/dask/fastparquet/ Author: Martin Durant @@ -133,3 +133,8 @@ Classifier: Programming Language :: Python :: 3.7 Classifier: Programming Language :: Python :: Implementation :: CPython Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, +Provides-Extra: lz4 +Provides-Extra: zstandard +Provides-Extra: lzo +Provides-Extra: snappy +Provides-Extra: brotli diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/docs/source/index.rst new/fastparquet-0.2.0/docs/source/index.rst --- old/fastparquet-0.1.6/docs/source/index.rst 2018-08-20 00:30:58.000000000 +0200 +++ new/fastparquet-0.2.0/docs/source/index.rst 2018-11-22 17:29:54.000000000 +0100 @@ -6,7 +6,7 @@ Introduction ------------ -The `Parquet format <https://github.com/Parquet/parquet-format>`_ is a common binary data store, used +The `Parquet format <https://github.com/apache/parquet-format>`_ is a common binary data store, used particularly in the Hadoop/big-data sphere. 
It provides several advantages relevant to big-data processing: diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet/__init__.py new/fastparquet-0.2.0/fastparquet/__init__.py --- old/fastparquet-0.1.6/fastparquet/__init__.py 2018-08-20 00:30:58.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet/__init__.py 2018-11-22 17:31:49.000000000 +0100 @@ -11,4 +11,4 @@ from .api import ParquetFile from .util import ParquetException -__version__ = "0.1.6" +__version__ = "0.2.0" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet/api.py new/fastparquet-0.2.0/fastparquet/api.py --- old/fastparquet-0.1.6/fastparquet/api.py 2018-08-20 00:30:58.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet/api.py 2018-11-22 17:29:54.000000000 +0100 @@ -153,8 +153,10 @@ self.group_files.setdefault(i, set()).add(chunk.file_path) self.schema = schema.SchemaHelper(self._schema) self.selfmade = self.created_by.split(' ', 1)[0] == "fastparquet-python" - self.file_scheme = get_file_scheme([rg.columns[0].file_path - for rg in self.row_groups]) + files = [rg.columns[0].file_path + for rg in self.row_groups + if rg.columns] + self.file_scheme = get_file_scheme(files) self._read_partitions() self._dtypes() @@ -215,7 +217,7 @@ for key, v in cats.items()]) def row_group_filename(self, rg): - if rg.columns[0].file_path: + if rg.columns and rg.columns[0].file_path: base = self.fn.replace('_metadata', '').rstrip('/') if base: return join_path(base, rg.columns[0].file_path) @@ -410,7 +412,10 @@ rgs = self.filter_row_groups(filters) size = sum(rg.num_rows for rg in rgs) index = self._get_index(index) - columns = columns or self.columns + if columns is not None: + columns = columns[:] + else: + columns = self.columns if index: columns += [i for i in index if i not in columns] check_column_names(self.columns + list(self.cats), columns, categories) @@ -680,7 +685,7 @@ return d -def sorted_partitioned_columns(pf): +def sorted_partitioned_columns(pf, filters=None): """ The columns that are known to be sorted partition-by-partition @@ -701,6 +706,13 @@ statistics """ s = statistics(pf) + if (filters is not None) & (filters != []): + idx_list = [i for i, rg in enumerate(pf.row_groups) if + not(filter_out_stats(rg, filters, pf.schema)) and + not(filter_out_cats(rg, filters))] + for stat in s.keys(): + for col in s[stat].keys(): + s[stat][col] = [s[stat][col][i] for i in idx_list] columns = pf.columns out = dict() for c in columns: diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet/core.py new/fastparquet-0.2.0/fastparquet/core.py --- old/fastparquet-0.1.6/fastparquet/core.py 2018-08-20 00:30:58.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet/core.py 2018-09-04 16:06:56.000000000 +0200 @@ -225,6 +225,7 @@ my_nan = None num = 0 + row_idx = 0 while True: if (selfmade and hasattr(cmd, 'statistics') and getattr(cmd.statistics, 'null_count', 1) == 0): @@ -249,8 +250,8 @@ null = not schema_helper.is_required(cmd.path_in_schema[0]) null_val = (se.repetition_type != parquet_thrift.FieldRepetitionType.REQUIRED) - num = encoding._assemble_objects(assign, defi, rep, val, dic, d, - null, null_val, max_defi) + row_idx = 1 + encoding._assemble_objects(assign, defi, rep, val, dic, d, + null, null_val, max_defi, row_idx) elif defi is not None: max_defi = schema_helper.max_definition_level(cmd.path_in_schema) part = 
assign[num:num+len(defi)] diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet/encoding.py new/fastparquet-0.2.0/fastparquet/encoding.py --- old/fastparquet-0.1.6/fastparquet/encoding.py 2018-08-20 00:30:58.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet/encoding.py 2018-11-04 19:12:37.000000000 +0100 @@ -224,7 +224,7 @@ Numpy32 = numba.jitclass(spec32)(NumpyIO) -def _assemble_objects(assign, defi, rep, val, dic, d, null, null_val, max_defi): +def _assemble_objects(assign, defi, rep, val, dic, d, null, null_val, max_defi, prev_i): """Dremel-assembly of arrays of values into lists Parameters @@ -245,12 +245,14 @@ can list elements be None max_defi: int value of definition level that corresponds to non-null + prev_i: int + 1 + index where the last row in the previous page was inserted (0 if first page) """ ## TODO: good case for cython if d: # dereference dict values val = dic[val] - i = 0 + i = prev_i vali = 0 part = [] started = False @@ -265,7 +267,11 @@ part = [] i += 1 else: - # first time: no row to save yet + # first time: no row to save yet, unless it's a row continued from previous page + if vali > 0: + assign[i - 1].extend(part) # add the items to previous row + part = [] + # don't increment i since we only filled i-1 started = True if de == max_defi: # append real value to current item @@ -276,7 +282,10 @@ part.append(None) # next object is None as opposed to an object have_null = de == 0 and null - assign[i] = None if have_null else part + if started: # normal case - add the leftovers to the next row + assign[i] = None if have_null else part + else: # can only happen if the only elements in this page are the continuation of the last row from previous page + assign[i - 1].extend(part) return i diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet/speedups.c new/fastparquet-0.2.0/fastparquet/speedups.c --- old/fastparquet-0.1.6/fastparquet/speedups.c 2018-08-20 00:31:06.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet/speedups.c 2018-11-22 17:29:45.000000000 +0100 @@ -1723,7 +1723,6 @@ static const char __pyx_k_RuntimeError[] = "RuntimeError"; static const char __pyx_k_AttributeError[] = "AttributeError"; static const char __pyx_k_pack_byte_array[] = "pack_byte_array"; -static const char __pyx_k_Ran_out_of_input[] = "Ran out of input"; static const char __pyx_k_array_decode_utf8[] = "array_decode_utf8"; static const char __pyx_k_array_encode_utf8[] = "array_encode_utf8"; static const char __pyx_k_unpack_byte_array[] = "unpack_byte_array"; @@ -1736,6 +1735,7 @@ static const char __pyx_k_expected_an_object_array[] = "expected an object array"; static const char __pyx_k_fastparquet_speedups_pyx[] = "fastparquet/speedups.pyx"; static const char __pyx_k_ndarray_is_not_C_contiguous[] = "ndarray is not C contiguous"; +static const char __pyx_k_Ran_out_of_input_after_i_items[] = "Ran out of input after %i items"; static const char __pyx_k_Native_accelerators_for_Parquet[] = "\nNative accelerators for Parquet encoding and decoding.\n"; static const char __pyx_k_numpy_core_multiarray_failed_to[] = "numpy.core.multiarray failed to import"; static const char __pyx_k_unknown_dtype_code_in_numpy_pxd[] = "unknown dtype code in numpy.pxd (%d)"; @@ -1750,7 +1750,7 @@ static PyObject *__pyx_kp_u_Format_string_allocated_too_shor_2; static PyObject *__pyx_n_s_ImportError; static PyObject *__pyx_kp_u_Non_native_byte_order_not_suppor; -static PyObject 
*__pyx_kp_s_Ran_out_of_input; +static PyObject *__pyx_kp_s_Ran_out_of_input_after_i_items; static PyObject *__pyx_n_s_RuntimeError; static PyObject *__pyx_n_s_TypeError; static PyObject *__pyx_n_s_ValueError; @@ -1822,19 +1822,17 @@ static PyObject *__pyx_tuple__13; static PyObject *__pyx_tuple__14; static PyObject *__pyx_tuple__15; -static PyObject *__pyx_tuple__16; static PyObject *__pyx_tuple__17; static PyObject *__pyx_tuple__19; static PyObject *__pyx_tuple__21; static PyObject *__pyx_tuple__23; static PyObject *__pyx_tuple__25; -static PyObject *__pyx_tuple__27; +static PyObject *__pyx_codeobj__16; static PyObject *__pyx_codeobj__18; static PyObject *__pyx_codeobj__20; static PyObject *__pyx_codeobj__22; static PyObject *__pyx_codeobj__24; static PyObject *__pyx_codeobj__26; -static PyObject *__pyx_codeobj__28; /* Late includes */ /* "fastparquet/speedups.pyx":21 @@ -3345,6 +3343,7 @@ Py_ssize_t __pyx_t_4; Py_ssize_t __pyx_t_5; int __pyx_t_6; + PyObject *__pyx_t_7 = NULL; __Pyx_RefNannySetupContext("unpack_byte_array", 0); /* "fastparquet/speedups.pyx":147 @@ -3411,7 +3410,7 @@ * # It is required to check this inside the loop to avoid * # out of bounds array accesses. * if remaining < 0: # <<<<<<<<<<<<<< - * raise RuntimeError("Ran out of input") + * raise RuntimeError("Ran out of input after %i items" % i) * itemlen = (data[0] + (data[1] << 8) + */ __pyx_t_6 = ((__pyx_v_remaining < 0) != 0); @@ -3420,12 +3419,18 @@ /* "fastparquet/speedups.pyx":156 * # out of bounds array accesses. * if remaining < 0: - * raise RuntimeError("Ran out of input") # <<<<<<<<<<<<<< + * raise RuntimeError("Ran out of input after %i items" % i) # <<<<<<<<<<<<<< * itemlen = (data[0] + (data[1] << 8) + * (data[2] << 16) + (data[3] << 24)) */ - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__5, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 156, __pyx_L1_error) + __pyx_t_2 = PyInt_FromSsize_t(__pyx_v_i); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 156, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_7 = __Pyx_PyString_Format(__pyx_kp_s_Ran_out_of_input_after_i_items, __pyx_t_2); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 156, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_builtin_RuntimeError, __pyx_t_7); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 156, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; __Pyx_Raise(__pyx_t_2, 0, 0, 0); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __PYX_ERR(0, 156, __pyx_L1_error) @@ -3434,13 +3439,13 @@ * # It is required to check this inside the loop to avoid * # out of bounds array accesses. 
* if remaining < 0: # <<<<<<<<<<<<<< - * raise RuntimeError("Ran out of input") + * raise RuntimeError("Ran out of input after %i items" % i) * itemlen = (data[0] + (data[1] << 8) + */ } /* "fastparquet/speedups.pyx":158 - * raise RuntimeError("Ran out of input") + * raise RuntimeError("Ran out of input after %i items" % i) * itemlen = (data[0] + (data[1] << 8) + * (data[2] << 16) + (data[3] << 24)) # <<<<<<<<<<<<<< * data += 4 @@ -3462,7 +3467,7 @@ * * remaining -= itemlen # <<<<<<<<<<<<<< * if remaining < 0: - * raise RuntimeError("Ran out of input") + * raise RuntimeError("Ran out of input after %i items" % i) */ __pyx_v_remaining = (__pyx_v_remaining - __pyx_v_itemlen); @@ -3470,7 +3475,7 @@ * * remaining -= itemlen * if remaining < 0: # <<<<<<<<<<<<<< - * raise RuntimeError("Ran out of input") + * raise RuntimeError("Ran out of input after %i items" % i) * out[i] = PyBytes_FromStringAndSize(<char *> data, itemlen) */ __pyx_t_6 = ((__pyx_v_remaining < 0) != 0); @@ -3479,12 +3484,18 @@ /* "fastparquet/speedups.pyx":163 * remaining -= itemlen * if remaining < 0: - * raise RuntimeError("Ran out of input") # <<<<<<<<<<<<<< + * raise RuntimeError("Ran out of input after %i items" % i) # <<<<<<<<<<<<<< * out[i] = PyBytes_FromStringAndSize(<char *> data, itemlen) * data += itemlen */ - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__6, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 163, __pyx_L1_error) + __pyx_t_2 = PyInt_FromSsize_t(__pyx_v_i); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 163, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); + __pyx_t_7 = __Pyx_PyString_Format(__pyx_kp_s_Ran_out_of_input_after_i_items, __pyx_t_2); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 163, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_builtin_RuntimeError, __pyx_t_7); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 163, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; __Pyx_Raise(__pyx_t_2, 0, 0, 0); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __PYX_ERR(0, 163, __pyx_L1_error) @@ -3493,14 +3504,14 @@ * * remaining -= itemlen * if remaining < 0: # <<<<<<<<<<<<<< - * raise RuntimeError("Ran out of input") + * raise RuntimeError("Ran out of input after %i items" % i) * out[i] = PyBytes_FromStringAndSize(<char *> data, itemlen) */ } /* "fastparquet/speedups.pyx":164 * if remaining < 0: - * raise RuntimeError("Ran out of input") + * raise RuntimeError("Ran out of input after %i items" % i) * out[i] = PyBytes_FromStringAndSize(<char *> data, itemlen) # <<<<<<<<<<<<<< * data += itemlen * @@ -3511,7 +3522,7 @@ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; /* "fastparquet/speedups.pyx":165 - * raise RuntimeError("Ran out of input") + * raise RuntimeError("Ran out of input after %i items" % i) * out[i] = PyBytes_FromStringAndSize(<char *> data, itemlen) * data += itemlen # <<<<<<<<<<<<<< * @@ -3541,6 +3552,7 @@ /* function exit code */ __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_7); __Pyx_AddTraceback("fastparquet.speedups.unpack_byte_array", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; @@ -3666,7 +3678,7 @@ * * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) */ - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__7, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 229, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__5, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 229, 
__pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_Raise(__pyx_t_3, 0, 0, 0); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; @@ -3722,7 +3734,7 @@ * * info.buf = PyArray_DATA(self) */ - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__8, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 233, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__6, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 233, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_Raise(__pyx_t_3, 0, 0, 0); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; @@ -3979,7 +3991,7 @@ * if t == NPY_BYTE: f = "b" * elif t == NPY_UBYTE: f = "B" */ - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__9, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 263, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__7, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 263, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_Raise(__pyx_t_3, 0, 0, 0); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; @@ -4859,7 +4871,7 @@ * * if ((child.byteorder == c'>' and little_endian) or */ - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__10, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 810, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__8, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 810, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_Raise(__pyx_t_3, 0, 0, 0); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; @@ -4927,7 +4939,7 @@ * # One could encode it in the format string and have Cython * # complain instead, BUT: < and > in format strings also imply */ - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__11, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 814, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__9, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 814, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_Raise(__pyx_t_3, 0, 0, 0); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; @@ -5036,7 +5048,7 @@ * * # Until ticket #99 is fixed, use integers to avoid warnings */ - __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__12, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 834, __pyx_L1_error) + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__10, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 834, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_Raise(__pyx_t_4, 0, 0, 0); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; @@ -5710,7 +5722,7 @@ * * cdef inline int import_umath() except -1: */ - __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__13, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 1000, __pyx_L5_except_error) + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__11, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 1000, __pyx_L5_except_error) __Pyx_GOTREF(__pyx_t_8); __Pyx_Raise(__pyx_t_8, 0, 0, 0); __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; @@ -5839,7 +5851,7 @@ * * cdef inline int import_ufunc() except -1: */ - __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__14, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 1006, __pyx_L5_except_error) + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__12, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 1006, __pyx_L5_except_error) __Pyx_GOTREF(__pyx_t_8); __Pyx_Raise(__pyx_t_8, 0, 0, 0); __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; @@ -5965,7 +5977,7 @@ * except Exception: * raise ImportError("numpy.core.umath failed to 
import") # <<<<<<<<<<<<<< */ - __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__15, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 1012, __pyx_L5_except_error) + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__13, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 1012, __pyx_L5_except_error) __Pyx_GOTREF(__pyx_t_8); __Pyx_Raise(__pyx_t_8, 0, 0, 0); __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; @@ -6054,7 +6066,7 @@ {&__pyx_kp_u_Format_string_allocated_too_shor_2, __pyx_k_Format_string_allocated_too_shor_2, sizeof(__pyx_k_Format_string_allocated_too_shor_2), 0, 1, 0, 0}, {&__pyx_n_s_ImportError, __pyx_k_ImportError, sizeof(__pyx_k_ImportError), 0, 0, 1, 1}, {&__pyx_kp_u_Non_native_byte_order_not_suppor, __pyx_k_Non_native_byte_order_not_suppor, sizeof(__pyx_k_Non_native_byte_order_not_suppor), 0, 1, 0, 0}, - {&__pyx_kp_s_Ran_out_of_input, __pyx_k_Ran_out_of_input, sizeof(__pyx_k_Ran_out_of_input), 0, 0, 1, 0}, + {&__pyx_kp_s_Ran_out_of_input_after_i_items, __pyx_k_Ran_out_of_input_after_i_items, sizeof(__pyx_k_Ran_out_of_input_after_i_items), 0, 0, 1, 0}, {&__pyx_n_s_RuntimeError, __pyx_k_RuntimeError, sizeof(__pyx_k_RuntimeError), 0, 0, 1, 1}, {&__pyx_n_s_TypeError, __pyx_k_TypeError, sizeof(__pyx_k_TypeError), 0, 0, 1, 1}, {&__pyx_n_s_ValueError, __pyx_k_ValueError, sizeof(__pyx_k_ValueError), 0, 0, 1, 1}, @@ -6166,28 +6178,6 @@ __Pyx_GOTREF(__pyx_tuple__4); __Pyx_GIVEREF(__pyx_tuple__4); - /* "fastparquet/speedups.pyx":156 - * # out of bounds array accesses. - * if remaining < 0: - * raise RuntimeError("Ran out of input") # <<<<<<<<<<<<<< - * itemlen = (data[0] + (data[1] << 8) + - * (data[2] << 16) + (data[3] << 24)) - */ - __pyx_tuple__5 = PyTuple_Pack(1, __pyx_kp_s_Ran_out_of_input); if (unlikely(!__pyx_tuple__5)) __PYX_ERR(0, 156, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__5); - __Pyx_GIVEREF(__pyx_tuple__5); - - /* "fastparquet/speedups.pyx":163 - * remaining -= itemlen - * if remaining < 0: - * raise RuntimeError("Ran out of input") # <<<<<<<<<<<<<< - * out[i] = PyBytes_FromStringAndSize(<char *> data, itemlen) - * data += itemlen - */ - __pyx_tuple__6 = PyTuple_Pack(1, __pyx_kp_s_Ran_out_of_input); if (unlikely(!__pyx_tuple__6)) __PYX_ERR(0, 163, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__6); - __Pyx_GIVEREF(__pyx_tuple__6); - /* "../../anaconda/envs/py36/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":229 * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): @@ -6195,9 +6185,9 @@ * * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) */ - __pyx_tuple__7 = PyTuple_Pack(1, __pyx_kp_u_ndarray_is_not_C_contiguous); if (unlikely(!__pyx_tuple__7)) __PYX_ERR(1, 229, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__7); - __Pyx_GIVEREF(__pyx_tuple__7); + __pyx_tuple__5 = PyTuple_Pack(1, __pyx_kp_u_ndarray_is_not_C_contiguous); if (unlikely(!__pyx_tuple__5)) __PYX_ERR(1, 229, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__5); + __Pyx_GIVEREF(__pyx_tuple__5); /* "../../anaconda/envs/py36/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":233 * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) @@ -6206,9 +6196,9 @@ * * info.buf = PyArray_DATA(self) */ - __pyx_tuple__8 = PyTuple_Pack(1, __pyx_kp_u_ndarray_is_not_Fortran_contiguou); if (unlikely(!__pyx_tuple__8)) __PYX_ERR(1, 233, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__8); - __Pyx_GIVEREF(__pyx_tuple__8); + __pyx_tuple__6 = PyTuple_Pack(1, 
__pyx_kp_u_ndarray_is_not_Fortran_contiguou); if (unlikely(!__pyx_tuple__6)) __PYX_ERR(1, 233, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__6); + __Pyx_GIVEREF(__pyx_tuple__6); /* "../../anaconda/envs/py36/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":263 * if ((descr.byteorder == c'>' and little_endian) or @@ -6217,9 +6207,9 @@ * if t == NPY_BYTE: f = "b" * elif t == NPY_UBYTE: f = "B" */ - __pyx_tuple__9 = PyTuple_Pack(1, __pyx_kp_u_Non_native_byte_order_not_suppor); if (unlikely(!__pyx_tuple__9)) __PYX_ERR(1, 263, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__9); - __Pyx_GIVEREF(__pyx_tuple__9); + __pyx_tuple__7 = PyTuple_Pack(1, __pyx_kp_u_Non_native_byte_order_not_suppor); if (unlikely(!__pyx_tuple__7)) __PYX_ERR(1, 263, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__7); + __Pyx_GIVEREF(__pyx_tuple__7); /* "../../anaconda/envs/py36/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":810 * @@ -6228,9 +6218,9 @@ * * if ((child.byteorder == c'>' and little_endian) or */ - __pyx_tuple__10 = PyTuple_Pack(1, __pyx_kp_u_Format_string_allocated_too_shor); if (unlikely(!__pyx_tuple__10)) __PYX_ERR(1, 810, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__10); - __Pyx_GIVEREF(__pyx_tuple__10); + __pyx_tuple__8 = PyTuple_Pack(1, __pyx_kp_u_Format_string_allocated_too_shor); if (unlikely(!__pyx_tuple__8)) __PYX_ERR(1, 810, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__8); + __Pyx_GIVEREF(__pyx_tuple__8); /* "../../anaconda/envs/py36/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":814 * if ((child.byteorder == c'>' and little_endian) or @@ -6239,9 +6229,9 @@ * # One could encode it in the format string and have Cython * # complain instead, BUT: < and > in format strings also imply */ - __pyx_tuple__11 = PyTuple_Pack(1, __pyx_kp_u_Non_native_byte_order_not_suppor); if (unlikely(!__pyx_tuple__11)) __PYX_ERR(1, 814, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__11); - __Pyx_GIVEREF(__pyx_tuple__11); + __pyx_tuple__9 = PyTuple_Pack(1, __pyx_kp_u_Non_native_byte_order_not_suppor); if (unlikely(!__pyx_tuple__9)) __PYX_ERR(1, 814, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__9); + __Pyx_GIVEREF(__pyx_tuple__9); /* "../../anaconda/envs/py36/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":834 * t = child.type_num @@ -6250,9 +6240,9 @@ * * # Until ticket #99 is fixed, use integers to avoid warnings */ - __pyx_tuple__12 = PyTuple_Pack(1, __pyx_kp_u_Format_string_allocated_too_shor_2); if (unlikely(!__pyx_tuple__12)) __PYX_ERR(1, 834, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__12); - __Pyx_GIVEREF(__pyx_tuple__12); + __pyx_tuple__10 = PyTuple_Pack(1, __pyx_kp_u_Format_string_allocated_too_shor_2); if (unlikely(!__pyx_tuple__10)) __PYX_ERR(1, 834, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__10); + __Pyx_GIVEREF(__pyx_tuple__10); /* "../../anaconda/envs/py36/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1000 * _import_array() @@ -6261,9 +6251,9 @@ * * cdef inline int import_umath() except -1: */ - __pyx_tuple__13 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_multiarray_failed_to); if (unlikely(!__pyx_tuple__13)) __PYX_ERR(1, 1000, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__13); - __Pyx_GIVEREF(__pyx_tuple__13); + __pyx_tuple__11 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_multiarray_failed_to); if (unlikely(!__pyx_tuple__11)) __PYX_ERR(1, 1000, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__11); + __Pyx_GIVEREF(__pyx_tuple__11); /* "../../anaconda/envs/py36/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1006 * _import_umath() @@ -6272,18 
+6262,18 @@ * * cdef inline int import_ufunc() except -1: */ - __pyx_tuple__14 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_umath_failed_to_impor); if (unlikely(!__pyx_tuple__14)) __PYX_ERR(1, 1006, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__14); - __Pyx_GIVEREF(__pyx_tuple__14); + __pyx_tuple__12 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_umath_failed_to_impor); if (unlikely(!__pyx_tuple__12)) __PYX_ERR(1, 1006, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__12); + __Pyx_GIVEREF(__pyx_tuple__12); /* "../../anaconda/envs/py36/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1012 * _import_umath() * except Exception: * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< */ - __pyx_tuple__15 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_umath_failed_to_impor); if (unlikely(!__pyx_tuple__15)) __PYX_ERR(1, 1012, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__15); - __Pyx_GIVEREF(__pyx_tuple__15); + __pyx_tuple__13 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_umath_failed_to_impor); if (unlikely(!__pyx_tuple__13)) __PYX_ERR(1, 1012, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__13); + __Pyx_GIVEREF(__pyx_tuple__13); /* "fastparquet/speedups.pyx":18 * @@ -6292,9 +6282,9 @@ * * */ - __pyx_tuple__16 = PyTuple_Pack(1, __pyx_n_s_object); if (unlikely(!__pyx_tuple__16)) __PYX_ERR(0, 18, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__16); - __Pyx_GIVEREF(__pyx_tuple__16); + __pyx_tuple__14 = PyTuple_Pack(1, __pyx_n_s_object); if (unlikely(!__pyx_tuple__14)) __PYX_ERR(0, 18, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__14); + __Pyx_GIVEREF(__pyx_tuple__14); /* "fastparquet/speedups.pyx":21 * @@ -6303,10 +6293,10 @@ * """ * Convert *obj* (a ndarray-compatible object, e.g. pandas Series) */ - __pyx_tuple__17 = PyTuple_Pack(1, __pyx_n_s_obj); if (unlikely(!__pyx_tuple__17)) __PYX_ERR(0, 21, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__17); - __Pyx_GIVEREF(__pyx_tuple__17); - __pyx_codeobj__18 = (PyObject*)__Pyx_PyCode_New(1, 0, 1, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__17, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_to_array, 21, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__18)) __PYX_ERR(0, 21, __pyx_L1_error) + __pyx_tuple__15 = PyTuple_Pack(1, __pyx_n_s_obj); if (unlikely(!__pyx_tuple__15)) __PYX_ERR(0, 21, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__15); + __Pyx_GIVEREF(__pyx_tuple__15); + __pyx_codeobj__16 = (PyObject*)__Pyx_PyCode_New(1, 0, 1, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__15, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_to_array, 21, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__16)) __PYX_ERR(0, 21, __pyx_L1_error) /* "fastparquet/speedups.pyx":36 * @@ -6315,10 +6305,10 @@ * if arr.ndim != 1: * raise TypeError("expected a 1d array") */ - __pyx_tuple__19 = PyTuple_Pack(1, __pyx_n_s_arr); if (unlikely(!__pyx_tuple__19)) __PYX_ERR(0, 36, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__19); - __Pyx_GIVEREF(__pyx_tuple__19); - __pyx_codeobj__20 = (PyObject*)__Pyx_PyCode_New(1, 0, 1, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__19, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_check_1d_object_array, 36, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__20)) __PYX_ERR(0, 36, __pyx_L1_error) + __pyx_tuple__17 = PyTuple_Pack(1, __pyx_n_s_arr); if (unlikely(!__pyx_tuple__17)) __PYX_ERR(0, 36, 
__pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__17); + __Pyx_GIVEREF(__pyx_tuple__17); + __pyx_codeobj__18 = (PyObject*)__Pyx_PyCode_New(1, 0, 1, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__17, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_check_1d_object_array, 36, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__18)) __PYX_ERR(0, 36, __pyx_L1_error) /* "fastparquet/speedups.pyx":43 * @@ -6327,10 +6317,10 @@ * """ * utf-8 encode all elements of a 1d ndarray of "object" dtype. */ - __pyx_tuple__21 = PyTuple_Pack(5, __pyx_n_s_inp, __pyx_n_s_i, __pyx_n_s_n, __pyx_n_s_arr, __pyx_n_s_result); if (unlikely(!__pyx_tuple__21)) __PYX_ERR(0, 43, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__21); - __Pyx_GIVEREF(__pyx_tuple__21); - __pyx_codeobj__22 = (PyObject*)__Pyx_PyCode_New(1, 0, 5, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__21, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_array_encode_utf8, 43, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__22)) __PYX_ERR(0, 43, __pyx_L1_error) + __pyx_tuple__19 = PyTuple_Pack(5, __pyx_n_s_inp, __pyx_n_s_i, __pyx_n_s_n, __pyx_n_s_arr, __pyx_n_s_result); if (unlikely(!__pyx_tuple__19)) __PYX_ERR(0, 43, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__19); + __Pyx_GIVEREF(__pyx_tuple__19); + __pyx_codeobj__20 = (PyObject*)__Pyx_PyCode_New(1, 0, 5, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__19, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_array_encode_utf8, 43, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__20)) __PYX_ERR(0, 43, __pyx_L1_error) /* "fastparquet/speedups.pyx":65 * @@ -6339,10 +6329,10 @@ * """ * utf-8 decode all elements of a 1d ndarray of "object" dtype. */ - __pyx_tuple__23 = PyTuple_Pack(6, __pyx_n_s_inp, __pyx_n_s_i, __pyx_n_s_n, __pyx_n_s_arr, __pyx_n_s_result, __pyx_n_s_val); if (unlikely(!__pyx_tuple__23)) __PYX_ERR(0, 65, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__23); - __Pyx_GIVEREF(__pyx_tuple__23); - __pyx_codeobj__24 = (PyObject*)__Pyx_PyCode_New(1, 0, 6, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__23, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_array_decode_utf8, 65, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__24)) __PYX_ERR(0, 65, __pyx_L1_error) + __pyx_tuple__21 = PyTuple_Pack(6, __pyx_n_s_inp, __pyx_n_s_i, __pyx_n_s_n, __pyx_n_s_arr, __pyx_n_s_result, __pyx_n_s_val); if (unlikely(!__pyx_tuple__21)) __PYX_ERR(0, 65, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__21); + __Pyx_GIVEREF(__pyx_tuple__21); + __pyx_codeobj__22 = (PyObject*)__Pyx_PyCode_New(1, 0, 6, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__21, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_array_decode_utf8, 65, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__22)) __PYX_ERR(0, 65, __pyx_L1_error) /* "fastparquet/speedups.pyx":95 * @@ -6351,10 +6341,10 @@ * """ * Pack a variable length byte array column. 
*/ - __pyx_tuple__25 = PyTuple_Pack(9, __pyx_n_s_items, __pyx_n_s_i, __pyx_n_s_n, __pyx_n_s_itemlen, __pyx_n_s_total_size, __pyx_n_s_start, __pyx_n_s_data, __pyx_n_s_val, __pyx_n_s_out); if (unlikely(!__pyx_tuple__25)) __PYX_ERR(0, 95, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__25); - __Pyx_GIVEREF(__pyx_tuple__25); - __pyx_codeobj__26 = (PyObject*)__Pyx_PyCode_New(1, 0, 9, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__25, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_pack_byte_array, 95, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__26)) __PYX_ERR(0, 95, __pyx_L1_error) + __pyx_tuple__23 = PyTuple_Pack(9, __pyx_n_s_items, __pyx_n_s_i, __pyx_n_s_n, __pyx_n_s_itemlen, __pyx_n_s_total_size, __pyx_n_s_start, __pyx_n_s_data, __pyx_n_s_val, __pyx_n_s_out); if (unlikely(!__pyx_tuple__23)) __PYX_ERR(0, 95, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__23); + __Pyx_GIVEREF(__pyx_tuple__23); + __pyx_codeobj__24 = (PyObject*)__Pyx_PyCode_New(1, 0, 9, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__23, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_pack_byte_array, 95, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__24)) __PYX_ERR(0, 95, __pyx_L1_error) /* "fastparquet/speedups.pyx":135 * @@ -6363,10 +6353,10 @@ * """ * Unpack a variable length byte array column. */ - __pyx_tuple__27 = PyTuple_Pack(8, __pyx_n_s_raw_bytes, __pyx_n_s_n, __pyx_n_s_i, __pyx_n_s_itemlen, __pyx_n_s_remaining, __pyx_n_s_start, __pyx_n_s_data, __pyx_n_s_out); if (unlikely(!__pyx_tuple__27)) __PYX_ERR(0, 135, __pyx_L1_error) - __Pyx_GOTREF(__pyx_tuple__27); - __Pyx_GIVEREF(__pyx_tuple__27); - __pyx_codeobj__28 = (PyObject*)__Pyx_PyCode_New(2, 0, 8, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__27, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_unpack_byte_array, 135, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__28)) __PYX_ERR(0, 135, __pyx_L1_error) + __pyx_tuple__25 = PyTuple_Pack(8, __pyx_n_s_raw_bytes, __pyx_n_s_n, __pyx_n_s_i, __pyx_n_s_itemlen, __pyx_n_s_remaining, __pyx_n_s_start, __pyx_n_s_data, __pyx_n_s_out); if (unlikely(!__pyx_tuple__25)) __PYX_ERR(0, 135, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__25); + __Pyx_GIVEREF(__pyx_tuple__25); + __pyx_codeobj__26 = (PyObject*)__Pyx_PyCode_New(2, 0, 8, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__25, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fastparquet_speedups_pyx, __pyx_n_s_unpack_byte_array, 135, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__26)) __PYX_ERR(0, 135, __pyx_L1_error) __Pyx_RefNannyFinishContext(); return 0; __pyx_L1_error:; @@ -6654,7 +6644,7 @@ * * */ - __pyx_t_1 = __Pyx_PyObject_Call(((PyObject *)__pyx_ptype_5numpy_dtype), __pyx_tuple__16, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 18, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_Call(((PyObject *)__pyx_ptype_5numpy_dtype), __pyx_tuple__14, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 18, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_obj_dtype, __pyx_t_1) < 0) __PYX_ERR(0, 18, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet/test/test_api.py new/fastparquet-0.2.0/fastparquet/test/test_api.py --- 
old/fastparquet-0.1.6/fastparquet/test/test_api.py 2018-08-20 00:30:58.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet/test/test_api.py 2018-11-22 17:29:54.000000000 +0100 @@ -97,6 +97,36 @@ assert result == expected +def test_sorted_row_group_columns_with_filters(tempdir): + dd = pytest.importorskip('dask.dataframe') + # create dummy dataframe + df = pd.DataFrame({'unique': [0, 0, 1, 1, 2, 2, 3, 3], + 'id': ['id1', 'id2', + 'id1', 'id2', + 'id1', 'id2', + 'id1', 'id2']}, + index=[0, 0, 1, 1, 2, 2, 3, 3]) + df = dd.from_pandas(df, npartitions=2) + fn = os.path.join(tempdir, 'foo.parquet') + df.to_parquet(fn, + engine='fastparquet', + partition_on=['id']) + # load ParquetFile + pf = ParquetFile(fn) + filters = [('id', '==', 'id1')] + + # without filters no columns are sorted + result = sorted_partitioned_columns(pf) + expected = {} + assert result == expected + + # with filters both columns are sorted + result = sorted_partitioned_columns(pf, filters=filters) + expected = {'index': {'min': [0, 2], 'max': [1, 3]}, + 'unique': {'min': [0, 2], 'max': [1, 3]}} + assert result == expected + + def test_iter(tempdir): df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': [1.0, 2.0, 1.0, 2.0], @@ -417,6 +447,14 @@ assert out.index.name is None assert out.index.tolist() == [0, 1, 2] +def test_input_column_list_not_mutated(tempdir): + df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + write(tempdir, df, file_scheme='hive') + cols = ['a'] + pf = ParquetFile(tempdir) + out = pf.to_pandas(columns=cols) + assert cols == ['a'] + def test_drill_list(tempdir): df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]}) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet/test/test_read.py new/fastparquet-0.2.0/fastparquet/test/test_read.py --- old/fastparquet-0.1.6/fastparquet/test/test_read.py 2018-08-20 00:30:58.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet/test/test_read.py 2018-09-13 00:09:43.000000000 +0200 @@ -335,3 +335,88 @@ assert dg.index.levels[0].dtype == '<M8[ns]' assert dg.index.levels[1].name == 'b' assert dg.equals(df) + +def test_no_columns(tempdir): + # https://github.com/dask/fastparquet/issues/361 + # Create a non-empty DataFrame, then select no columns. That way we get + # _some_ rows, _no_ columns. 
+ # + # df = pd.DataFrame({"A": [1, 2]})[[]] + # fastparquet.write("test-data/no_columns.parquet", df) + pf = fastparquet.ParquetFile(os.path.join(TEST_DATA, "no_columns.parquet")) + assert pf.count == 2 + assert pf.columns == [] + result = pf.to_pandas() + expected = pd.DataFrame({"A": [1, 2]})[[]] + assert len(result) == 2 + pd.testing.assert_frame_equal(result, expected) + +def test_map_multipage(tempdir): + pf = fastparquet.ParquetFile(os.path.join(TEST_DATA, "map-test.snappy.parquet")) + assert pf.count == 3551 + df = pf.to_pandas() + first_row_keys = [u'FoxNews.com', u'News Network', u'mobile technology', u'broadcast', u'sustainability', + u'collective intelligence', u'radio', u'business law', u'LLC', u'telecommunications', + u'FOX News Network'] + last_row_keys = [u'protests', u'gas mask', u'Pot & Painting Party', u'Denver', u'New Year', u'Anderson Cooper', + u'gas mask bonk', u'digital media', u'marijuana leaf earrings', u'Screengrab', u'gas mask bongs', + u'Randi Kaye', u'Lee Rogers', u'Andy Cohen', u'CNN', u'Times Square', u'Colorado', u'opera', + u'slavery', u'Kathy Griffin', u'marijuana cigarette', u'executive producer'] + + assert len(df) == 3551 + assert sorted(df["topics"].iloc[0].keys()) == sorted(first_row_keys) + assert sorted(df["topics"].iloc[-1].keys()) == sorted(last_row_keys) + assert df.isnull().sum().sum() == 0 # ensure every row got converted + +def test_map_last_row_split(tempdir): + pf = fastparquet.ParquetFile(os.path.join(TEST_DATA, "test-map-last-row-split.parquet")) + assert pf.count == 2428 + df = pf.to_pandas() + # file has 3 pages - rows at index 1210 and 2427 are split in-between neighboring pages + first_split_row_keys = [u'White House', u'State Department', u'Tatverd\xe4chtige', u'financial economics', + u'Hezbollah', u'Bashar Assad', u'break-down', u'paper', u'radio', u'musicals', + u'Vladimir Putin', u'Hill two', u'The New York Times and Washington Post', u'tweet', + u'guest bedroom', u'Susie Tompkins Buell', u'private law', u'Tammy Bruce', + u'Obama Presidential Library', u'Fox News', u'President Trump', u'John Kerry', + u'Vanity Fair', u'government', u'Josh Meyer', u'The Hill', u'Esprit Clothing', + u'Rainer Wendt', u'Fitness', u'u.n.', u'David Brock', u'fleas', u'Trump', u'WORKOUT', + u'Washington', u'Brandenburg Gate', u'Lisa Bloom', u'festgenommen', u'journalist', + u'Kolleg', u'Middle East', u'financial markets', u'gym equipment', u'weight training', + u'reference', u'Solche Taten', u'digital radio', u'Stephen l. 
Miller', u'Belleon Body', + u'harassment', u'East', u'investment', u'creatures', u'Islamic Republic', u'New Year', + u'New York City', u'Media Research center', u'Neue Osnabruecker Zeitung daily newspaper', + u'Berlin', u'gegen diese Taten vorgehen', u'safety', u'Jarrett Blanc', u'Tehran', + u'America', u'Black Lives Matter', u'pussy hats', + u'wurden bislang leider vereinzelt sexuelle \xdcbergriffe gemeldet', u'Roger Cohen', + u'u.s.', u'Donald Trump', u'Emily Shire', u'hardline', u'common law', u'animal workouts', + u'Hamas', u'operas', u'New York Times', u'Amanda Hess', u'Adrian Carrasquillo', + u'Lukas Mikelionis', u'Koi', u'TOUGHEST MUDDER', u'Middle Eastern', u'Erik Wemple', + u'Associated Press', u'Iran', u'out-of-pocket expenses', u'Neue Osnabruecker Zeitung', + u'lizards', u'Carlos Leon', u'Polizei Berlin Einsatz', u'Russia', u'Russian', + u'Berlin Wall', u'Obama', u'The Times', u'The New York Post', u'Mark Halperin', + u'learning programs', u'NBC', u'American', u'Jeff Bell', + u'Heat Street and National Review Online', u'Dan Merica', u'Tel Aviv', + u'Wielding Money', u'anxiety', u'Bell', u'Twitter', u'Hillary Clinton', + u'physical exercise', u'Fellow Times', u'property', u'Paul Krugman', u'FoxNews.com', + u'Times Square New Year', u'Mika Brzezinksi', u'Ayatollah Ali Khamenei', u'Nikki Haley', + u'Obama Library', u'internet-based works', u'Quadriga', u'Washington Post', + u'Angela Merkel', u'Manhattan', u'United Nations', u'information', u'Israel', + u'Wir haben zivile', u'administration', u'United States', u'Maya Kosoff', u'Germany', + u'donor', u'television terminology', u'Bloom', u'The Washington Post', u'Jack Shafer', + u'Bei den Veranstaltungen', u'singles', u'uprising', u'reporting', u'AP', + u'Fox News Opinion', u'celebrity lawyer', u'Dan Gainor', u'CNN', u'Syria', + u'business law', u'inspiration', u'regime', u'Politico', u'Democratic Party', + u'The New York Times', u'websites', u'socio-economics', u'Jerusalem'] + second_split_row_keys = [u'Stockton University', u'Walter Montelione', u'law enforcement', u'shooting', + u'international incidents', u'NYE', u'Linda Kologi', u'criminal law', + u'Long Branch Police Department', u'Kaitlyn Schallhorn', u'Brittany Kologi', u'suspect', + u'teenager', u'Monmouth County', u'television terminology', u'Fox News', u'Long Branch', + u'Monmouth County prosecutor\u2019s Office', u'Galloway Township', u'Dave Farmer', + u'Steven Kologi jr.', u'u.s.', u'incident', u'WCBS-TV', u'Christopher j. 
Gramiccioni', + u"Diane D'Amico", u'New Jersey', u'shooter', u'maritime incidents', + u'Monmouth County Prosecutor', u'Steven Kologi', u'Bryan Llenas', u'Mary Schultz', + u'NJ.com', u'n.j.', u'Veronica Mass'] + assert len(df) == 2428 + assert sorted(df["topics"].iloc[1210].keys()) == sorted(first_split_row_keys) + assert sorted(df["topics"].iloc[2427].keys()) == sorted(second_split_row_keys) + assert df.isnull().sum().sum() == 0 \ No newline at end of file diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet/writer.py new/fastparquet-0.2.0/fastparquet/writer.py --- old/fastparquet-0.1.6/fastparquet/writer.py 2018-08-20 00:30:58.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet/writer.py 2018-09-30 17:02:24.000000000 +0200 @@ -984,7 +984,7 @@ for rg in fmd.row_groups: for col in rg.columns: if ".".join(col.meta_data.path_in_schema) == cat['name']: - ncats = [k.value for k in col.meta_data.key_value_metadata + ncats = [k.value for k in (col.meta_data.key_value_metadata or []) if k.key == 'num_categories'] if ncats and int(ncats[0]) > cat['metadata'][ 'num_categories']: diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet.egg-info/PKG-INFO new/fastparquet-0.2.0/fastparquet.egg-info/PKG-INFO --- old/fastparquet-0.1.6/fastparquet.egg-info/PKG-INFO 2018-08-20 00:31:06.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet.egg-info/PKG-INFO 2018-11-22 17:33:29.000000000 +0100 @@ -1,6 +1,6 @@ -Metadata-Version: 1.2 +Metadata-Version: 2.1 Name: fastparquet -Version: 0.1.6 +Version: 0.2.0 Summary: Python support for Parquet file format Home-page: https://github.com/dask/fastparquet/ Author: Martin Durant @@ -133,3 +133,8 @@ Classifier: Programming Language :: Python :: 3.7 Classifier: Programming Language :: Python :: Implementation :: CPython Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, +Provides-Extra: lz4 +Provides-Extra: zstandard +Provides-Extra: lzo +Provides-Extra: snappy +Provides-Extra: brotli diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet.egg-info/pbr.json new/fastparquet-0.2.0/fastparquet.egg-info/pbr.json --- old/fastparquet-0.1.6/fastparquet.egg-info/pbr.json 2018-08-20 00:31:06.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet.egg-info/pbr.json 2018-11-22 17:33:29.000000000 +0100 @@ -1 +1 @@ -{"git_version": "5f06d4e", "is_release": true} \ No newline at end of file +{"git_version": "65283e2", "is_release": true} \ No newline at end of file diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/fastparquet.egg-info/requires.txt new/fastparquet-0.2.0/fastparquet.egg-info/requires.txt --- old/fastparquet-0.1.6/fastparquet.egg-info/requires.txt 2018-08-20 00:31:06.000000000 +0200 +++ new/fastparquet-0.2.0/fastparquet.egg-info/requires.txt 2018-11-22 17:33:29.000000000 +0100 @@ -4,3 +4,18 @@ thrift>=0.11.0 six pytest-runner + +[brotli] +brotli + +[lz4] +lz4>=0.19.1 + +[lzo] +python-lzo + +[snappy] +python-snappy + +[zstandard] +zstandard diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/fastparquet-0.1.6/setup.py new/fastparquet-0.2.0/setup.py --- old/fastparquet-0.1.6/setup.py 2018-08-20 00:30:58.000000000 +0200 +++ new/fastparquet-0.2.0/setup.py 2018-11-22 17:31:49.000000000 +0100 @@ -54,7 +54,7 @@ setup( name='fastparquet', - version='0.1.6', + 
version='0.2.0', description='Python support for Parquet file format', author='Martin Durant', author_email='mdur...@continuum.io', @@ -81,6 +81,13 @@ # 'pytest-runner', # [p for p in install_requires if p.startswith('numpy')][0] #], + extras_require={ + 'brotli': ['brotli'], + 'lz4': ['lz4 >= 0.19.1'], + 'lzo': ['python-lzo'], + 'snappy': ['python-snappy'], + 'zstandard': ['zstandard'] + }, tests_require=[ 'pytest', 'python-snappy',
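
To make the api.py changes above concrete, here is a minimal sketch exercising the two new behaviours: to_pandas() now copies the caller's column list before appending index columns (#383/#384), and sorted_partitioned_columns() accepts a filters= argument that drops the statistics of filtered-out row groups before the min/max sortedness check. The dataset path and filter value are hypothetical; the calls mirror the new tests in test_api.py:

    import fastparquet
    from fastparquet.api import sorted_partitioned_columns

    pf = fastparquet.ParquetFile('mydata.parq')  # hypothetical partitioned dataset

    # The input list is no longer mutated, even though index columns are
    # still appended internally to the set of columns actually read.
    cols = ['a']
    df = pf.to_pandas(columns=cols)
    assert cols == ['a']

    # Only row groups that survive the filter contribute statistics to
    # the partition-by-partition sortedness check.
    sorted_cols = sorted_partitioned_columns(pf, filters=[('id', '==', 'id1')])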
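
Similarly, the zero-column fix (#361/#363) can be reproduced with the recipe quoted in the comment of test_no_columns above: a non-empty DataFrame with every column dropped, so the file has some rows but no columns (output path hypothetical):

    import pandas as pd
    import fastparquet

    df = pd.DataFrame({"A": [1, 2]})[[]]       # two rows, zero columns
    fastparquet.write('no_columns.parquet', df)

    pf = fastparquet.ParquetFile('no_columns.parquet')
    assert pf.count == 2
    assert pf.columns == []
    assert len(pf.to_pandas()) == 2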
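
Finally, the new extras_require table in setup.py (mirrored in requires.txt above) lets the optional compression codecs be pulled in as pip extras rather than installed by hand. The commands below are ordinary pip extras syntax, not part of the diff:

    pip install "fastparquet[snappy]"          # pulls in python-snappy
    pip install "fastparquet[lz4,zstandard]"   # several extras at once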