[
https://issues.apache.org/jira/browse/ARROW-1756?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16239226#comment-16239226
]
ASF GitHub Bot commented on ARROW-1756:
---------------------------------------
wesm closed pull request #1276: ARROW-1756: [Python] Fix large file read/write error
URL: https://github.com/apache/arrow/pull/1276
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h
index 8e989064b..7c5f6174f 100644
--- a/cpp/src/arrow/buffer.h
+++ b/cpp/src/arrow/buffer.h
@@ -340,7 +340,7 @@ Status AllocateResizableBuffer(MemoryPool* pool, const
int64_t size,
#ifndef ARROW_NO_DEPRECATED_API
/// \brief Create Buffer referencing std::string memory
-/// \deprecated Since 0.8.0
+/// \note Deprecated since 0.8.0
///
/// Warning: string instance must stay alive
///
diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h
index 27176ed86..df3386e4b 100644
--- a/cpp/src/arrow/compare.h
+++ b/cpp/src/arrow/compare.h
@@ -33,27 +33,27 @@ class Tensor;
#ifndef ARROW_NO_DEPRECATED_API
/// Returns true if the arrays are exactly equal
-/// \deprecated Since 0.8.0
+/// \note Deprecated since 0.8.0
Status ARROW_EXPORT ArrayEquals(const Array& left, const Array& right, bool*
are_equal);
-/// \deprecated Since 0.8.0
+/// \note Deprecated since 0.8.0
Status ARROW_EXPORT TensorEquals(const Tensor& left, const Tensor& right,
bool* are_equal);
/// Returns true if the arrays are approximately equal. For non-floating point
/// types, this is equivalent to ArrayEquals(left, right)
-/// \deprecated Since 0.8.0
+/// \note Deprecated since 0.8.0
Status ARROW_EXPORT ArrayApproxEquals(const Array& left, const Array& right,
bool* are_equal);
/// Returns true if indicated equal-length segment of arrays is exactly equal
-/// \deprecated Since 0.8.0
+/// \note Deprecated since 0.8.0
Status ARROW_EXPORT ArrayRangeEquals(const Array& left, const Array& right,
int64_t start_idx, int64_t end_idx,
int64_t other_start_idx, bool* are_equal);
/// Returns true if the type metadata are exactly equal
-/// \deprecated Since 0.8.0
+/// \note Deprecated since 0.8.0
Status ARROW_EXPORT TypeEquals(const DataType& left, const DataType& right,
bool* are_equal);
#endif
diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc
index 74c6c09e6..057cad111 100644
--- a/cpp/src/arrow/io/file.cc
+++ b/cpp/src/arrow/io/file.cc
@@ -22,6 +22,21 @@
#define _FILE_OFFSET_BITS 64
+// define max read/write count
+#if defined(_MSC_VER)
+#define ARROW_MAX_IO_CHUNKSIZE INT32_MAX
+#else
+
+#ifdef __APPLE__
+// due to macOS bug, we need to set read/write max
+#define ARROW_MAX_IO_CHUNKSIZE INT32_MAX
+#else
+// see notes on Linux read/write manpage
+#define ARROW_MAX_IO_CHUNKSIZE 0x7ffff000
+#endif
+
+#endif
+
#include "arrow/io/file.h"
#if _WIN32 || _WIN64
@@ -238,39 +253,68 @@ static inline Status FileSeek(int fd, int64_t pos) {
return Status::OK();
}
-static inline Status FileRead(int fd, uint8_t* buffer, int64_t nbytes,
+static inline Status FileRead(const int fd, uint8_t* buffer, const int64_t
nbytes,
int64_t* bytes_read) {
#if defined(_MSC_VER)
- if (nbytes > INT32_MAX) {
+ if (nbytes > ARROW_MAX_IO_CHUNKSIZE) {
return Status::IOError("Unable to read > 2GB blocks yet");
}
*bytes_read = static_cast<int64_t>(_read(fd, buffer,
static_cast<uint32_t>(nbytes)));
#else
- *bytes_read = static_cast<int64_t>(read(fd, buffer,
static_cast<size_t>(nbytes)));
+ *bytes_read = 0;
+
+ while (*bytes_read != -1 && *bytes_read < nbytes) {
+ int64_t chunksize =
+ std::min(static_cast<int64_t>(ARROW_MAX_IO_CHUNKSIZE), nbytes -
*bytes_read);
+ int64_t ret = static_cast<int64_t>(
+ read(fd, buffer + *bytes_read, static_cast<size_t>(chunksize)));
+
+ if (ret != -1) {
+ *bytes_read += ret;
+ if (ret < chunksize) {
+ // EOF
+ break;
+ }
+ } else {
+ *bytes_read = ret;
+ }
+ }
#endif
if (*bytes_read == -1) {
- // TODO(wesm): errno to string
- return Status::IOError("Error reading bytes from file");
+ return Status::IOError(std::string("Error reading bytes from file: ") +
+ std::string(strerror(errno)));
}
return Status::OK();
}
-static inline Status FileWrite(int fd, const uint8_t* buffer, int64_t nbytes) {
- int ret;
+static inline Status FileWrite(const int fd, const uint8_t* buffer,
+ const int64_t nbytes) {
+ int ret = 0;
#if defined(_MSC_VER)
- if (nbytes > INT32_MAX) {
+ if (nbytes > ARROW_MAX_IO_CHUNKSIZE) {
return Status::IOError("Unable to write > 2GB blocks to file yet");
}
ret = static_cast<int>(_write(fd, buffer, static_cast<uint32_t>(nbytes)));
#else
- ret = static_cast<int>(write(fd, buffer, static_cast<size_t>(nbytes)));
+ int64_t bytes_written = 0;
+
+ while (ret != -1 && bytes_written < nbytes) {
+ int64_t chunksize =
+ std::min(static_cast<int64_t>(ARROW_MAX_IO_CHUNKSIZE), nbytes -
bytes_written);
+ ret = static_cast<int>(
+ write(fd, buffer + bytes_written, static_cast<size_t>(chunksize)));
+
+ if (ret != -1) {
+ bytes_written += ret;
+ }
+ }
#endif
if (ret == -1) {
- // TODO(wesm): errno to string
- return Status::IOError("Error writing bytes to file");
+ return Status::IOError(std::string("Error writing bytes from file: ") +
+ std::string(strerror(errno)));
}
return Status::OK();
}
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index c6bd6c9b3..e27682232 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.
-from pytest import skip
+from pytest import skip, mark
groups = [
@@ -70,6 +70,18 @@ def pytest_addoption(parser):
default=False,
help=('Run only the {0} test group'.format(group)))
+ parser.addoption('--runslow', action='store_true',
+ default=False, help='run slow tests')
+
+
+def pytest_collection_modifyitems(config, items):
+ if not config.getoption('--runslow'):
+ skip_slow = mark.skip(reason='need --runslow option to run')
+
+ for item in items:
+ if 'slow' in item.keywords:
+ item.add_marker(skip_slow)
+
def pytest_runtest_setup(item):
only_set = False
diff --git a/python/pyarrow/tests/test_feather.py
b/python/pyarrow/tests/test_feather.py
index 9e7fc8863..b0764fdec 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -50,7 +50,7 @@ def tearDown(self):
pass
def test_file_not_exist(self):
- with self.assertRaises(pa.ArrowIOError):
+ with pytest.raises(pa.ArrowIOError):
FeatherReader('test_invalid_file')
def _get_null_counts(self, path, columns=None):
@@ -98,7 +98,7 @@ def _assert_error_on_write(self, df, exc, path=None):
def f():
write_feather(df, path)
- self.assertRaises(exc, f)
+ pytest.raises(exc, f)
def test_num_rows_attr(self):
df = pd.DataFrame({'foo': [1, 2, 3, 4, 5]})
@@ -466,3 +466,8 @@ def test_unsupported(self):
# non-strings
df = pd.DataFrame({'a': ['a', 1, 2.0]})
self._assert_error_on_write(df, ValueError)
+
+ @pytest.mark.slow
+ def test_large_dataframe(self):
+ df = pd.DataFrame({'A': np.arange(400000000)})
+ self._check_pandas_roundtrip(df)
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
> [Python] Observed int32 overflow in Feather write/read path
> -----------------------------------------------------------
>
> Key: ARROW-1756
> URL: https://issues.apache.org/jira/browse/ARROW-1756
> Project: Apache Arrow
> Issue Type: Bug
> Components: Python
> Reporter: Wes McKinney
> Assignee: Licht Takeuchi
> Priority: Major
> Labels: pull-request-available
> Fix For: 0.8.0
>
>
> See downstream report
> https://github.com/wesm/feather/issues/321
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)