[
https://issues.apache.org/jira/browse/ARROW-1908?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16285075#comment-16285075
]
ASF GitHub Bot commented on ARROW-1908:
---------------------------------------
wesm closed pull request #1405: ARROW-1908: [Python] Construction of arrow
table from pandas DataFrame with duplicate column names crashes
URL: https://github.com/apache/arrow/pull/1405
This is a PR merged from a forked repository (a foreign pull request).
Because GitHub hides the original diff of such a pull request once it is
merged, the diff is reproduced below for the sake of provenance:
diff --git a/cpp/src/arrow/python/builtin_convert.cc
b/cpp/src/arrow/python/builtin_convert.cc
index 08cbae7ab2..cd88d557d4 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -148,15 +148,14 @@ static constexpr int MAX_NESTING_LEVELS = 32;
// SeqVisitor is used to infer the type.
class SeqVisitor {
public:
- SeqVisitor() : max_nesting_level_(0), max_observed_level_(0) {
- memset(nesting_histogram_, 0, MAX_NESTING_LEVELS * sizeof(int));
+ SeqVisitor() : max_nesting_level_(0), max_observed_level_(0),
nesting_histogram_() {
+ std::fill(nesting_histogram_, nesting_histogram_ + MAX_NESTING_LEVELS, 0);
}
// co-recursive with VisitElem
Status Visit(PyObject* obj, int level = 0) {
- if (level > max_nesting_level_) {
- max_nesting_level_ = level;
- }
+ max_nesting_level_ = std::max(max_nesting_level_, level);
+
// Loop through either a sequence or an iterator.
if (PySequence_Check(obj)) {
Py_ssize_t size = PySequence_Size(obj);
@@ -165,18 +164,26 @@ class SeqVisitor {
if (PyArray_Check(obj)) {
auto array = reinterpret_cast<PyArrayObject*>(obj);
auto ptr = reinterpret_cast<const char*>(PyArray_GETPTR1(array, i));
+
ref.reset(PyArray_GETITEM(array, ptr));
+ RETURN_IF_PYERROR();
+
RETURN_NOT_OK(VisitElem(ref, level));
} else {
ref.reset(PySequence_GetItem(obj, i));
+ RETURN_IF_PYERROR();
RETURN_NOT_OK(VisitElem(ref, level));
}
}
} else if (PyObject_HasAttrString(obj, "__iter__")) {
- OwnedRef iter = OwnedRef(PyObject_GetIter(obj));
- PyObject* item;
+ OwnedRef iter(PyObject_GetIter(obj));
+ RETURN_IF_PYERROR();
+
+ PyObject* item = NULLPTR;
while ((item = PyIter_Next(iter.obj()))) {
- OwnedRef ref = OwnedRef(item);
+ RETURN_IF_PYERROR();
+
+ OwnedRef ref(item);
RETURN_NOT_OK(VisitElem(ref, level));
}
} else {
@@ -242,6 +249,7 @@ class SeqVisitor {
// Visits a specific element (inner part of the loop).
Status VisitElem(const OwnedRef& item_ref, int level) {
+ DCHECK_NE(item_ref.obj(), NULLPTR);
if (PyList_Check(item_ref.obj())) {
RETURN_NOT_OK(Visit(item_ref.obj(), level + 1));
} else if (PyDict_Check(item_ref.obj())) {
@@ -323,7 +331,7 @@ class SeqConverter {
virtual Status AppendData(PyObject* seq, int64_t size) = 0;
- virtual ~SeqConverter() {}
+ virtual ~SeqConverter() = default;
protected:
ArrayBuilder* builder_;
@@ -459,13 +467,13 @@ class UInt8Converter : public
TypedConverterVisitor<UInt8Builder, UInt8Converter
public:
Status AppendItem(const OwnedRef& item) {
const auto val = static_cast<uint64_t>(PyLong_AsLongLong(item.obj()));
+ RETURN_IF_PYERROR();
if (ARROW_PREDICT_FALSE(val > std::numeric_limits<uint8_t>::max())) {
return Status::Invalid(
"Cannot coerce values to array type that would "
"lose data");
}
- RETURN_IF_PYERROR();
return typed_builder_->Append(static_cast<uint8_t>(val));
}
};
@@ -474,13 +482,13 @@ class UInt16Converter : public
TypedConverterVisitor<UInt16Builder, UInt16Conver
public:
Status AppendItem(const OwnedRef& item) {
const auto val = static_cast<uint64_t>(PyLong_AsLongLong(item.obj()));
+ RETURN_IF_PYERROR();
if (ARROW_PREDICT_FALSE(val > std::numeric_limits<uint16_t>::max())) {
return Status::Invalid(
"Cannot coerce values to array type that would "
"lose data");
}
- RETURN_IF_PYERROR();
return typed_builder_->Append(static_cast<uint16_t>(val));
}
};
@@ -489,13 +497,13 @@ class UInt32Converter : public
TypedConverterVisitor<UInt32Builder, UInt32Conver
public:
Status AppendItem(const OwnedRef& item) {
const auto val = static_cast<uint64_t>(PyLong_AsLongLong(item.obj()));
+ RETURN_IF_PYERROR();
if (ARROW_PREDICT_FALSE(val > std::numeric_limits<uint32_t>::max())) {
return Status::Invalid(
"Cannot coerce values to array type that would "
"lose data");
}
- RETURN_IF_PYERROR();
return typed_builder_->Append(static_cast<uint32_t>(val));
}
};
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 8459ec31be..48384ad478 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -281,6 +281,12 @@ def dataframe_to_arrays(df, schema, preserve_index,
nthreads=1):
columns_to_convert = []
convert_types = []
+
+ if not df.columns.is_unique:
+ raise ValueError(
+ 'Duplicate column names found: {}'.format(list(df.columns))
+ )
+
for name in df.columns:
col = df[name]
if not isinstance(name, six.string_types):
diff --git a/python/pyarrow/tests/test_convert_pandas.py
b/python/pyarrow/tests/test_convert_pandas.py
index e94ee4608e..1231844e35 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -227,6 +227,11 @@ def test_zero_copy_success(self):
result = pa.array([0, 1, 2]).to_pandas(zero_copy_only=True)
npt.assert_array_equal(result, [0, 1, 2])
+ def test_duplicate_column_names_does_not_crash(self):
+ df = pd.DataFrame([(1, 'a'), (2, 'b')], columns=list('aa'))
+ with pytest.raises(ValueError):
+ pa.Table.from_pandas(df)
+
def test_dictionary_indices_boundscheck(self):
# ARROW-1658. No validation of indices leads to segfaults in pandas
indices = [[0, 1], [0, -1]]
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
> [Python] Construction of arrow table from pandas DataFrame with duplicate
> column names crashes
> ----------------------------------------------------------------------------------------------
>
> Key: ARROW-1908
> URL: https://issues.apache.org/jira/browse/ARROW-1908
> Project: Apache Arrow
> Issue Type: Bug
> Components: Python
> Affects Versions: 0.7.1
> Reporter: Phillip Cloud
> Assignee: Phillip Cloud
> Labels: pandas, pull-request-available, python
> Fix For: 0.8.0
>
>
> [~jorisvandenbossche]'s example here:
> https://github.com/pandas-dev/pandas/pull/18201#issuecomment-350259248 shows
> that a {{pyarrow.Table}} with duplicate column names can be constructed.
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)