Repository: arrow
Updated Branches:
  refs/heads/master 20cee707c -> 2615b4703


ARROW-1306: [C++] Use UTF8 filenames in local file error messages

Encoded utf16-le bytes were being written to error messages (which are output 
to UTF-8 consoles), resulting in unintelligible displays.

This also improves the error message when opening the file fails per ARROW-1121.

Author: Wes McKinney <wes.mckin...@twosigma.com>

Closes #951 from wesm/ARROW-1306 and squashes the following commits:

fd0d93f7 [Wes McKinney] Restore utf8_data method
a4aae504 [Wes McKinney] MSVC fixes
b847b66c [Wes McKinney] Change PlatformFilename to be allocated with OSFile
d445fcad [Wes McKinney] Add Python unit test for ARROW-1306
0dc220c2 [Wes McKinney] MSVC fixes
9d80e491 [Wes McKinney] Add PlatformFilename abstraction, write error messages 
with UTF8 filenames


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/2615b470
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/2615b470
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/2615b470

Branch: refs/heads/master
Commit: 2615b47032d58284e0606b21cb216aa4b303a72c
Parents: 20cee70
Author: Wes McKinney <wes.mckin...@twosigma.com>
Authored: Tue Aug 8 11:56:32 2017 -0400
Committer: Wes McKinney <wes.mckin...@twosigma.com>
Committed: Tue Aug 8 11:56:32 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/io/file.cc             | 128 ++++++++++++++++--------------
 cpp/src/arrow/io/file.h              |  23 +++++-
 cpp/src/arrow/io/io-file-test.cc     |   9 ++-
 python/pyarrow/compat.py             |   3 +-
 python/pyarrow/tests/test_parquet.py |  11 +++
 5 files changed, 106 insertions(+), 68 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/2615b470/cpp/src/arrow/io/file.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc
index 82e3ba8..57d30f7 100644
--- a/cpp/src/arrow/io/file.cc
+++ b/cpp/src/arrow/io/file.cc
@@ -118,33 +118,64 @@ namespace io {
 
 // ----------------------------------------------------------------------
 // Cross-platform file compatability layer
+
 #if defined(_MSC_VER)
+
 constexpr const char* kRangeExceptionError =
     "Range exception during wide-char string conversion";
+
+struct PlatformFilename {
+  static Status Init(const std::string& utf8_path, PlatformFilename* out) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> utf16_converter;
+
+    if (!utf8_path.empty()) {
+      try {
+        out->utf16_path = utf16_converter.from_bytes(utf8_path);
+      } catch (const std::range_error&) {
+        return Status::Invalid(kRangeExceptionError);
+      }
+    } else {
+      out->utf16_path = std::wstring();
+    }
+    out->utf8_path = utf8_path;
+    return Status::OK();
+  }
+
+  const char* data() const { return reinterpret_cast<const 
char*>(utf16_path.c_str()); }
+
+  const char* utf8_data() const { return utf8_path.c_str(); }
+
+  size_t length() const { return utf16_path.size(); }
+
+  std::string utf8_path;
+  std::wstring utf16_path;
+};
+
+#else
+
+struct PlatformFilename {
+  static Status Init(const std::string& utf8_path, PlatformFilename* out) {
+    out->utf8_path = utf8_path;
+    return Status::OK();
+  }
+
+  const char* data() const { return utf8_path.c_str(); }
+
+  const char* utf8_data() const { return data(); }
+
+  size_t length() const { return utf8_path.size(); }
+
+  std::string utf8_path;
+};
+
 #endif
 
-static inline Status CheckOpenResult(int ret, int errno_actual, const char* 
filename,
-                                     size_t filename_length) {
+static inline Status CheckOpenResult(int ret, int errno_actual,
+                                     const PlatformFilename& filename) {
   if (ret == -1) {
     // TODO: errno codes to strings
     std::stringstream ss;
-    ss << "Failed to open file: ";
-#if defined(_MSC_VER)
-    // using wchar_t
-
-    // this requires c++11
-    std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter;
-    std::wstring wide_string(reinterpret_cast<const wchar_t*>(filename),
-                             filename_length / sizeof(wchar_t));
-    try {
-      std::string byte_string = converter.to_bytes(wide_string);
-      ss << byte_string;
-    } catch (const std::range_error&) {
-      ss << kRangeExceptionError;
-    }
-#else
-    ss << filename;
-#endif
+    ss << "Failed to open local file: " << filename.utf8_data();
     return Status::IOError(ss.str());
   }
   return Status::OK();
@@ -161,54 +192,27 @@ static inline int64_t lseek64_compat(int fd, int64_t pos, 
int whence) {
 #endif
 }
 
-#if defined(_MSC_VER)
-static inline Status ConvertToUtf16(const std::string& input, std::wstring* 
result) {
-  if (result == nullptr) {
-    return Status::Invalid("Pointer to result is not valid");
-  }
-
-  if (input.empty()) {
-    *result = std::wstring();
-    return Status::OK();
-  }
-
-  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> utf16_converter;
-  try {
-    *result = utf16_converter.from_bytes(input);
-  } catch (const std::range_error&) {
-    return Status::Invalid(kRangeExceptionError);
-  }
-  return Status::OK();
-}
-#endif
-
-static inline Status FileOpenReadable(const std::string& filename, int* fd) {
+static inline Status FileOpenReadable(const PlatformFilename& filename, int* 
fd) {
   int ret;
   errno_t errno_actual = 0;
 #if defined(_MSC_VER)
-  std::wstring wide_filename;
-  RETURN_NOT_OK(ConvertToUtf16(filename, &wide_filename));
-
-  errno_actual =
-      _wsopen_s(fd, wide_filename.c_str(), _O_RDONLY | _O_BINARY, _SH_DENYNO, 
_S_IREAD);
+  errno_actual = _wsopen_s(fd, reinterpret_cast<const 
wchar_t*>(filename.data()),
+                           _O_RDONLY | _O_BINARY, _SH_DENYNO, _S_IREAD);
   ret = *fd;
 #else
-  ret = *fd = open(filename.c_str(), O_RDONLY | O_BINARY);
+  ret = *fd = open(filename.data(), O_RDONLY | O_BINARY);
   errno_actual = errno;
 #endif
 
-  return CheckOpenResult(ret, errno_actual, filename.c_str(), filename.size());
+  return CheckOpenResult(ret, errno_actual, filename);
 }
 
-static inline Status FileOpenWriteable(const std::string& filename, bool 
write_only,
+static inline Status FileOpenWriteable(const PlatformFilename& filename, bool 
write_only,
                                        bool truncate, int* fd) {
   int ret;
   errno_t errno_actual = 0;
 
 #if defined(_MSC_VER)
-  std::wstring wide_filename;
-  RETURN_NOT_OK(ConvertToUtf16(filename, &wide_filename));
-
   int oflag = _O_CREAT | _O_BINARY;
   int pmode = _S_IWRITE;
   if (!write_only) {
@@ -225,7 +229,8 @@ static inline Status FileOpenWriteable(const std::string& 
filename, bool write_o
     oflag |= _O_RDWR;
   }
 
-  errno_actual = _wsopen_s(fd, wide_filename.c_str(), oflag, _SH_DENYNO, 
pmode);
+  errno_actual = _wsopen_s(fd, reinterpret_cast<const 
wchar_t*>(filename.data()), oflag,
+                           _SH_DENYNO, pmode);
   ret = *fd;
 
 #else
@@ -241,9 +246,9 @@ static inline Status FileOpenWriteable(const std::string& 
filename, bool write_o
     oflag |= O_RDWR;
   }
 
-  ret = *fd = open(filename.c_str(), oflag, ARROW_WRITE_SHMODE);
+  ret = *fd = open(filename.data(), oflag, ARROW_WRITE_SHMODE);
 #endif
-  return CheckOpenResult(ret, errno_actual, filename.c_str(), filename.size());
+  return CheckOpenResult(ret, errno_actual, filename);
 }
 
 static inline Status FileTell(int fd, int64_t* pos) {
@@ -352,8 +357,9 @@ class OSFile {
   ~OSFile() {}
 
   Status OpenWriteable(const std::string& path, bool append, bool write_only) {
-    RETURN_NOT_OK(FileOpenWriteable(path, write_only, !append, &fd_));
-    path_ = path;
+    RETURN_NOT_OK(PlatformFilename::Init(path, &path_));
+
+    RETURN_NOT_OK(FileOpenWriteable(path_, write_only, !append, &fd_));
     is_open_ = true;
     mode_ = write_only ? FileMode::WRITE : FileMode::READWRITE;
 
@@ -366,10 +372,11 @@ class OSFile {
   }
 
   Status OpenReadable(const std::string& path) {
-    RETURN_NOT_OK(FileOpenReadable(path, &fd_));
+    RETURN_NOT_OK(PlatformFilename::Init(path, &path_));
+
+    RETURN_NOT_OK(FileOpenReadable(path_, &fd_));
     RETURN_NOT_OK(FileGetSize(fd_, &size_));
 
-    path_ = path;
     is_open_ = true;
     mode_ = FileMode::READ;
     return Status::OK();
@@ -408,14 +415,13 @@ class OSFile {
   int fd() const { return fd_; }
 
   bool is_open() const { return is_open_; }
-  const std::string& path() const { return path_; }
 
   int64_t size() const { return size_; }
 
   FileMode::type mode() const { return mode_; }
 
  protected:
-  std::string path_;
+  PlatformFilename path_;
 
   std::mutex lock_;
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/2615b470/cpp/src/arrow/io/file.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/file.h b/cpp/src/arrow/io/file.h
index ba740f1..2a0e89c 100644
--- a/cpp/src/arrow/io/file.h
+++ b/cpp/src/arrow/io/file.h
@@ -40,10 +40,18 @@ class ARROW_EXPORT FileOutputStream : public OutputStream {
  public:
   ~FileOutputStream();
 
-  // When opening a new file, any existing file with the indicated path is
-  // truncated to 0 bytes, deleting any existing memory
+  /// \brief Open a local file for writing, truncating any existing file
+  /// \param[in] path with UTF8 encoding
+  /// \param[out] file a FileOutputStream instance
+  ///
+  /// When opening a new file, any existing file with the indicated path is
+  /// truncated to 0 bytes, deleting any existing memory
   static Status Open(const std::string& path, 
std::shared_ptr<FileOutputStream>* file);
 
+  /// \brief Open a local file for writing
+  /// \param[in] path with UTF8 encoding
+  /// \param[in] append append to existing file, otherwise truncate to 0 bytes
+  /// \param[out] file a FileOutputStream instance
   static Status Open(const std::string& path, bool append,
                      std::shared_ptr<FileOutputStream>* file);
 
@@ -68,10 +76,17 @@ class ARROW_EXPORT ReadableFile : public RandomAccessFile {
  public:
   ~ReadableFile();
 
-  // Open file, allocate memory (if needed) from default memory pool
+  /// \brief Open a local file for reading
+  /// \param[in] path with UTF8 encoding
+  /// \param[out] file ReadableFile instance
+  /// Open file, allocate memory (if needed) from default memory pool
   static Status Open(const std::string& path, std::shared_ptr<ReadableFile>* 
file);
 
-  // Open file with one's own memory pool for memory allocations
+  /// \brief Open a local file for reading
+  /// \param[in] path with UTF8 encoding
+  /// \param[in] pool a MemoryPool for memory allocations
+  /// \param[out] file ReadableFile instance
+  /// Open file with one's own memory pool for memory allocations
   static Status Open(const std::string& path, MemoryPool* memory_pool,
                      std::shared_ptr<ReadableFile>* file);
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/2615b470/cpp/src/arrow/io/io-file-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/io-file-test.cc b/cpp/src/arrow/io/io-file-test.cc
index 36c3570..630356f 100644
--- a/cpp/src/arrow/io/io-file-test.cc
+++ b/cpp/src/arrow/io/io-file-test.cc
@@ -45,7 +45,7 @@ static bool FileExists(const std::string& path) {
 void InvalidParamHandler(const wchar_t* expr, const wchar_t* func,
                          const wchar_t* source_file, unsigned int source_line,
                          uintptr_t reserved) {
-  wprintf(L"Invalid parameter in funcion %s. Source: %s line %d expression 
%s", func,
+  wprintf(L"Invalid parameter in function %s. Source: %s line %d expression 
%s", func,
           source_file, source_line, expr);
 }
 #endif
@@ -320,7 +320,12 @@ TEST_F(TestReadableFile, ReadAt) {
 }
 
 TEST_F(TestReadableFile, NonExistentFile) {
-  ASSERT_RAISES(IOError, ReadableFile::Open("0xDEADBEEF.txt", &file_));
+  std::string path = "0xDEADBEEF.txt";
+  Status s = ReadableFile::Open(path, &file_);
+  ASSERT_TRUE(s.IsIOError());
+
+  std::string message = s.message();
+  ASSERT_NE(std::string::npos, message.find(path));
 }
 
 class MyMemoryPool : public MemoryPool {

http://git-wip-us.apache.org/repos/asf/arrow/blob/2615b470/python/pyarrow/compat.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py
index 7be35df..2252e85 100644
--- a/python/pyarrow/compat.py
+++ b/python/pyarrow/compat.py
@@ -132,7 +132,6 @@ else:
 
 def encode_file_path(path):
     import os
-    # Windows requires utf-16le encoding for unicode file names
     if isinstance(path, unicode_type):
         # POSIX systems can handle utf-8. UTF8 is converted to utf16-le in
         # libarrow
@@ -140,6 +139,8 @@ def encode_file_path(path):
     else:
         encoded_path = path
 
+    # Windows file system requires utf-16le for file names; Arrow C++ libraries
+    # will convert utf8 to utf16
     return encoded_path
 
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/2615b470/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py 
b/python/pyarrow/tests/test_parquet.py
index 9a570b9..8a20f4c 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -1127,3 +1127,14 @@ def test_write_error_deletes_incomplete_file(tmpdir):
         pass
 
     assert not os.path.exists(filename)
+
+
+@parquet
+def test_read_non_existent_file(tmpdir):
+    import pyarrow.parquet as pq
+
+    path = 'non-existent-file.parquet'
+    try:
+        pq.read_table(path)
+    except Exception as e:
+        assert path in e.args[0]

Reply via email to