Repository: arrow
Updated Branches:
  refs/heads/master 8f113b4d0 -> 96f3d6176
ARROW-749: [Python] Delete partially-written Feather file when column write fails

This is currently the only place where we are doing an atomic create-file/write-file. We should be mindful of other serialization functions which may yield unreadable files in the future.

Author: Wes McKinney <wes.mckin...@twosigma.com>

Closes #484 from wesm/ARROW-749 and squashes the following commits:

137e235 [Wes McKinney] Delete partially-written Feather file when column write fails

Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/96f3d617
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/96f3d617
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/96f3d617

Branch: refs/heads/master
Commit: 96f3d6176d8c95717f4ff45e4226161de3168b05
Parents: 8f113b4
Author: Wes McKinney <wes.mckin...@twosigma.com>
Authored: Mon Apr 3 08:43:47 2017 +0200
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Mon Apr 3 08:43:47 2017 +0200

----------------------------------------------------------------------
 python/pyarrow/feather.py            | 79 ++++++++++++++++++++-----------
 python/pyarrow/tests/test_feather.py | 16 +++++++
 2 files changed, 67 insertions(+), 28 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/arrow/blob/96f3d617/python/pyarrow/feather.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index f87c7f3..3b5716e 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -15,8 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import six
 from distutils.version import LooseVersion
+import os
+
+import six
 import pandas as pd
 
 from pyarrow.compat import pdapi
@@ -54,45 +56,66 @@ class FeatherReader(ext.FeatherReader):
         return table.to_pandas()
 
 
-def write_feather(df, dest):
-    '''
-    Write a pandas.DataFrame to Feather format
-    '''
-    writer = ext.FeatherWriter()
-    writer.open(dest)
+class FeatherWriter(object):
 
-    if isinstance(df, pd.SparseDataFrame):
-        df = df.to_dense()
+    def __init__(self, dest):
+        self.dest = dest
+        self.writer = ext.FeatherWriter()
+        self.writer.open(dest)
 
-    if not df.columns.is_unique:
-        raise ValueError("cannot serialize duplicate column names")
+    def write(self, df):
+        if isinstance(df, pd.SparseDataFrame):
+            df = df.to_dense()
 
-    # TODO(wesm): pipeline conversion to Arrow memory layout
-    for i, name in enumerate(df.columns):
-        col = df.iloc[:, i]
+        if not df.columns.is_unique:
+            raise ValueError("cannot serialize duplicate column names")
 
-        if pdapi.is_object_dtype(col):
-            inferred_type = pd.lib.infer_dtype(col)
-            msg = ("cannot serialize column {n} "
-                   "named {name} with dtype {dtype}".format(
-                       n=i, name=name, dtype=inferred_type))
+        # TODO(wesm): pipeline conversion to Arrow memory layout
+        for i, name in enumerate(df.columns):
+            col = df.iloc[:, i]
 
-            if inferred_type in ['mixed']:
+            if pdapi.is_object_dtype(col):
+                inferred_type = pd.lib.infer_dtype(col)
+                msg = ("cannot serialize column {n} "
+                       "named {name} with dtype {dtype}".format(
+                           n=i, name=name, dtype=inferred_type))
 
-                # allow columns with nulls + an inferable type
-                inferred_type = pd.lib.infer_dtype(col[col.notnull()])
                 if inferred_type in ['mixed']:
+
+                    # allow columns with nulls + an inferable type
+                    inferred_type = pd.lib.infer_dtype(col[col.notnull()])
+                    if inferred_type in ['mixed']:
+                        raise ValueError(msg)
+
+                elif inferred_type not in ['unicode', 'string']:
                     raise ValueError(msg)
 
-            elif inferred_type not in ['unicode', 'string']:
-                raise ValueError(msg)
+            if not isinstance(name, six.string_types):
+                name = str(name)
 
-        if not isinstance(name, six.string_types):
-            name = str(name)
+            self.writer.write_array(name, col)
 
-        writer.write_array(name, col)
+        self.writer.close()
 
-    writer.close()
+
+def write_feather(df, dest):
+    '''
+    Write a pandas.DataFrame to Feather format
+    '''
+    writer = FeatherWriter(dest)
+    try:
+        writer.write(df)
+    except:
+        # Try to make sure the resource is closed
+        import gc
+        writer = None
+        gc.collect()
+        if isinstance(dest, six.string_types):
+            try:
+                os.remove(dest)
+            except os.error:
+                pass
+        raise
 
 
 def read_feather(source, columns=None):

http://git-wip-us.apache.org/repos/asf/arrow/blob/96f3d617/python/pyarrow/tests/test_feather.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 525da34..c7b4f1e 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -249,6 +249,22 @@ class TestFeatherReader(unittest.TestCase):
         df = pd.DataFrame({'bools': arr})
         self._check_pandas_roundtrip(df, null_counts=[1 * repeats])
 
+    def test_delete_partial_file_on_error(self):
+        # strings will fail
+        df = pd.DataFrame(
+            {
+                'numbers': range(5),
+                'strings': [b'foo', None, u'bar', 'qux', np.nan]},
+            columns=['numbers', 'strings'])
+
+        path = random_path()
+        try:
+            write_feather(df, path)
+        except:
+            pass
+
+        assert not os.path.exists(path)
+
     def test_strings(self):
         repeats = 1000