Repository: arrow Updated Branches: refs/heads/master 20cee707c -> 2615b4703
ARROW-1306: [C++] Use UTF8 filenames in local file error messages Encoded utf16-le bytes were being written to error messages (which are output to UTF-8 consoles), resulting in unintelligible displays. This also improves the error message when opening the file fails per ARROW-1121. Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #951 from wesm/ARROW-1306 and squashes the following commits: fd0d93f7 [Wes McKinney] Restore utf8_data method a4aae504 [Wes McKinney] MSVC fixes b847b66c [Wes McKinney] Change PlatformFilename to be allocated with OSFile d445fcad [Wes McKinney] Add Python unit test for ARROW-1306 0dc220c2 [Wes McKinney] MSVC fixes 9d80e491 [Wes McKinney] Add PlatformFilename abstraction, write error messages with UTF8 filenames Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/2615b470 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/2615b470 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/2615b470 Branch: refs/heads/master Commit: 2615b47032d58284e0606b21cb216aa4b303a72c Parents: 20cee70 Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Tue Aug 8 11:56:32 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Tue Aug 8 11:56:32 2017 -0400 ---------------------------------------------------------------------- cpp/src/arrow/io/file.cc | 128 ++++++++++++++++-------------- cpp/src/arrow/io/file.h | 23 +++++- cpp/src/arrow/io/io-file-test.cc | 9 ++- python/pyarrow/compat.py | 3 +- python/pyarrow/tests/test_parquet.py | 11 +++ 5 files changed, 106 insertions(+), 68 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/2615b470/cpp/src/arrow/io/file.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc index 82e3ba8..57d30f7 100644 --- a/cpp/src/arrow/io/file.cc +++ b/cpp/src/arrow/io/file.cc @@ -118,33 +118,64 @@ namespace io { // ---------------------------------------------------------------------- // Cross-platform file compatability layer + #if defined(_MSC_VER) + constexpr const char* kRangeExceptionError = "Range exception during wide-char string conversion"; + +struct PlatformFilename { + static Status Init(const std::string& utf8_path, PlatformFilename* out) { + std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> utf16_converter; + + if (!utf8_path.empty()) { + try { + out->utf16_path = utf16_converter.from_bytes(utf8_path); + } catch (const std::range_error&) { + return Status::Invalid(kRangeExceptionError); + } + } else { + out->utf16_path = std::wstring(); + } + out->utf8_path = utf8_path; + return Status::OK(); + } + + const char* data() const { return reinterpret_cast<const char*>(utf16_path.c_str()); } + + const char* utf8_data() const { return utf8_path.c_str(); } + + size_t length() const { return utf16_path.size(); } + + std::string utf8_path; + std::wstring utf16_path; +}; + +#else + +struct PlatformFilename { + static Status Init(const std::string& utf8_path, PlatformFilename* out) { + out->utf8_path = utf8_path; + return Status::OK(); + } + + const char* data() const { return utf8_path.c_str(); } + + const char* utf8_data() const { return data(); } + + size_t length() const { return utf8_path.size(); } + + std::string utf8_path; +}; + #endif -static inline Status CheckOpenResult(int ret, int errno_actual, const char* filename, - size_t filename_length) { +static inline Status CheckOpenResult(int ret, int errno_actual, + const PlatformFilename& filename) { if (ret == -1) { // TODO: errno codes to strings std::stringstream ss; - ss << "Failed to open file: "; -#if defined(_MSC_VER) - // using wchar_t - - // this requires c++11 - std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter; - std::wstring wide_string(reinterpret_cast<const wchar_t*>(filename), - filename_length / sizeof(wchar_t)); - try { - std::string byte_string = converter.to_bytes(wide_string); - ss << byte_string; - } catch (const std::range_error&) { - ss << kRangeExceptionError; - } -#else - ss << filename; -#endif + ss << "Failed to open local file: " << filename.utf8_data(); return Status::IOError(ss.str()); } return Status::OK(); @@ -161,54 +192,27 @@ static inline int64_t lseek64_compat(int fd, int64_t pos, int whence) { #endif } -#if defined(_MSC_VER) -static inline Status ConvertToUtf16(const std::string& input, std::wstring* result) { - if (result == nullptr) { - return Status::Invalid("Pointer to result is not valid"); - } - - if (input.empty()) { - *result = std::wstring(); - return Status::OK(); - } - - std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> utf16_converter; - try { - *result = utf16_converter.from_bytes(input); - } catch (const std::range_error&) { - return Status::Invalid(kRangeExceptionError); - } - return Status::OK(); -} -#endif - -static inline Status FileOpenReadable(const std::string& filename, int* fd) { +static inline Status FileOpenReadable(const PlatformFilename& filename, int* fd) { int ret; errno_t errno_actual = 0; #if defined(_MSC_VER) - std::wstring wide_filename; - RETURN_NOT_OK(ConvertToUtf16(filename, &wide_filename)); - - errno_actual = - _wsopen_s(fd, wide_filename.c_str(), _O_RDONLY | _O_BINARY, _SH_DENYNO, _S_IREAD); + errno_actual = _wsopen_s(fd, reinterpret_cast<const wchar_t*>(filename.data()), + _O_RDONLY | _O_BINARY, _SH_DENYNO, _S_IREAD); ret = *fd; #else - ret = *fd = open(filename.c_str(), O_RDONLY | O_BINARY); + ret = *fd = open(filename.data(), O_RDONLY | O_BINARY); errno_actual = errno; #endif - return CheckOpenResult(ret, errno_actual, filename.c_str(), filename.size()); + return CheckOpenResult(ret, errno_actual, filename); } -static inline Status FileOpenWriteable(const std::string& filename, bool write_only, +static inline Status FileOpenWriteable(const PlatformFilename& filename, bool write_only, bool truncate, int* fd) { int ret; errno_t errno_actual = 0; #if defined(_MSC_VER) - std::wstring wide_filename; - RETURN_NOT_OK(ConvertToUtf16(filename, &wide_filename)); - int oflag = _O_CREAT | _O_BINARY; int pmode = _S_IWRITE; if (!write_only) { @@ -225,7 +229,8 @@ static inline Status FileOpenWriteable(const std::string& filename, bool write_o oflag |= _O_RDWR; } - errno_actual = _wsopen_s(fd, wide_filename.c_str(), oflag, _SH_DENYNO, pmode); + errno_actual = _wsopen_s(fd, reinterpret_cast<const wchar_t*>(filename.data()), oflag, + _SH_DENYNO, pmode); ret = *fd; #else @@ -241,9 +246,9 @@ static inline Status FileOpenWriteable(const std::string& filename, bool write_o oflag |= O_RDWR; } - ret = *fd = open(filename.c_str(), oflag, ARROW_WRITE_SHMODE); + ret = *fd = open(filename.data(), oflag, ARROW_WRITE_SHMODE); #endif - return CheckOpenResult(ret, errno_actual, filename.c_str(), filename.size()); + return CheckOpenResult(ret, errno_actual, filename); } static inline Status FileTell(int fd, int64_t* pos) { @@ -352,8 +357,9 @@ class OSFile { ~OSFile() {} Status OpenWriteable(const std::string& path, bool append, bool write_only) { - RETURN_NOT_OK(FileOpenWriteable(path, write_only, !append, &fd_)); - path_ = path; + RETURN_NOT_OK(PlatformFilename::Init(path, &path_)); + + RETURN_NOT_OK(FileOpenWriteable(path_, write_only, !append, &fd_)); is_open_ = true; mode_ = write_only ? FileMode::WRITE : FileMode::READWRITE; @@ -366,10 +372,11 @@ class OSFile { } Status OpenReadable(const std::string& path) { - RETURN_NOT_OK(FileOpenReadable(path, &fd_)); + RETURN_NOT_OK(PlatformFilename::Init(path, &path_)); + + RETURN_NOT_OK(FileOpenReadable(path_, &fd_)); RETURN_NOT_OK(FileGetSize(fd_, &size_)); - path_ = path; is_open_ = true; mode_ = FileMode::READ; return Status::OK(); @@ -408,14 +415,13 @@ class OSFile { int fd() const { return fd_; } bool is_open() const { return is_open_; } - const std::string& path() const { return path_; } int64_t size() const { return size_; } FileMode::type mode() const { return mode_; } protected: - std::string path_; + PlatformFilename path_; std::mutex lock_; http://git-wip-us.apache.org/repos/asf/arrow/blob/2615b470/cpp/src/arrow/io/file.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/io/file.h b/cpp/src/arrow/io/file.h index ba740f1..2a0e89c 100644 --- a/cpp/src/arrow/io/file.h +++ b/cpp/src/arrow/io/file.h @@ -40,10 +40,18 @@ class ARROW_EXPORT FileOutputStream : public OutputStream { public: ~FileOutputStream(); - // When opening a new file, any existing file with the indicated path is - // truncated to 0 bytes, deleting any existing memory + /// \brief Open a local file for writing, truncating any existing file + /// \param[in] path with UTF8 encoding + /// \param[out] file a FileOutputStream instance + /// + /// When opening a new file, any existing file with the indicated path is + /// truncated to 0 bytes, deleting any existing memory static Status Open(const std::string& path, std::shared_ptr<FileOutputStream>* file); + /// \brief Open a local file for writing + /// \param[in] path with UTF8 encoding + /// \param[in] append append to existing file, otherwise truncate to 0 bytes + /// \param[out] file a FileOutputStream instance static Status Open(const std::string& path, bool append, std::shared_ptr<FileOutputStream>* file); @@ -68,10 +76,17 @@ class ARROW_EXPORT ReadableFile : public RandomAccessFile { public: ~ReadableFile(); - // Open file, allocate memory (if needed) from default memory pool + /// \brief Open a local file for reading + /// \param[in] path with UTF8 encoding + /// \param[out] file ReadableFile instance + /// Open file, allocate memory (if needed) from default memory pool static Status Open(const std::string& path, std::shared_ptr<ReadableFile>* file); - // Open file with one's own memory pool for memory allocations + /// \brief Open a local file for reading + /// \param[in] path with UTF8 encoding + /// \param[in] pool a MemoryPool for memory allocations + /// \param[out] file ReadableFile instance + /// Open file with one's own memory pool for memory allocations static Status Open(const std::string& path, MemoryPool* memory_pool, std::shared_ptr<ReadableFile>* file); http://git-wip-us.apache.org/repos/asf/arrow/blob/2615b470/cpp/src/arrow/io/io-file-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/io/io-file-test.cc b/cpp/src/arrow/io/io-file-test.cc index 36c3570..630356f 100644 --- a/cpp/src/arrow/io/io-file-test.cc +++ b/cpp/src/arrow/io/io-file-test.cc @@ -45,7 +45,7 @@ static bool FileExists(const std::string& path) { void InvalidParamHandler(const wchar_t* expr, const wchar_t* func, const wchar_t* source_file, unsigned int source_line, uintptr_t reserved) { - wprintf(L"Invalid parameter in funcion %s. Source: %s line %d expression %s", func, + wprintf(L"Invalid parameter in function %s. Source: %s line %d expression %s", func, source_file, source_line, expr); } #endif @@ -320,7 +320,12 @@ TEST_F(TestReadableFile, ReadAt) { } TEST_F(TestReadableFile, NonExistentFile) { - ASSERT_RAISES(IOError, ReadableFile::Open("0xDEADBEEF.txt", &file_)); + std::string path = "0xDEADBEEF.txt"; + Status s = ReadableFile::Open(path, &file_); + ASSERT_TRUE(s.IsIOError()); + + std::string message = s.message(); + ASSERT_NE(std::string::npos, message.find(path)); } class MyMemoryPool : public MemoryPool { http://git-wip-us.apache.org/repos/asf/arrow/blob/2615b470/python/pyarrow/compat.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py index 7be35df..2252e85 100644 --- a/python/pyarrow/compat.py +++ b/python/pyarrow/compat.py @@ -132,7 +132,6 @@ else: def encode_file_path(path): import os - # Windows requires utf-16le encoding for unicode file names if isinstance(path, unicode_type): # POSIX systems can handle utf-8. UTF8 is converted to utf16-le in # libarrow @@ -140,6 +139,8 @@ def encode_file_path(path): else: encoded_path = path + # Windows file system requires utf-16le for file names; Arrow C++ libraries + # will convert utf8 to utf16 return encoded_path http://git-wip-us.apache.org/repos/asf/arrow/blob/2615b470/python/pyarrow/tests/test_parquet.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 9a570b9..8a20f4c 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -1127,3 +1127,14 @@ def test_write_error_deletes_incomplete_file(tmpdir): pass assert not os.path.exists(filename) + + +@parquet +def test_read_non_existent_file(tmpdir): + import pyarrow.parquet as pq + + path = 'non-existent-file.parquet' + try: + pq.read_table(path) + except Exception as e: + assert path in e.args[0]