[ 
https://issues.apache.org/jira/browse/ARROW-602?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17086017#comment-17086017
 ] 

Scott Wilson commented on ARROW-602:
------------------------------------

[~wesm] I've taken a stab at implementing ChunkedArray covers and iterators to 
support STL interfaces. My initial implementation returned std::optional<T>, 
but I didn't like the way this interacted with most stl numeric algorithms 
which expect iterator dereferences to be double or the like, and 
std::optional<double> doesn't automatically cast to double. I added a IsNull() 
member to the iterators, so at least this info is accessible.

I'm not quite sure how to handle the issue of mutability, so I've simply 
punted. (Is there some description of the expected semantics of immutability? 
Is it really the case that I shouldn't write into a raw_value if the array is 
immutable?) Arrays that return proxys for the underlying memory, e.g. string, 
require a strange _cached variable in order to return a dereferenced iterator.

This my first pass at this code. I've done little testing, and it is not 
comprehensive. I just wanted to verify that I'm not doing something abnormally 
stupid ;)
{code:c++}
#include <cstdint>
#include <memory>
#include <numeric>
#include <string>
#include <iostream>
#include <optional>
#include <vector>
#include <xutility>

#include <arrow/api.h>
#include <arrow/filesystem/localfs.h>
#include <arrow/csv/api.h>
#include <arrow/result.h>
#include <arrow/builder.h>

#include <boost/iterator/iterator_facade.hpp>
#include <boost/range/iterator_range.hpp>

using namespace std;
using namespace arrow;

// SBW 2020.04.15 For ArrayCoverRaw::iterator, we can simply use the the 
pointer interface.
// Wes suggests returning std::optional<T>, but sizeof(double) < 
sizeof(std::optional<double>) and
// is not a drop-in replacement for T, i.e. optional<T> can't be used in 
expression, need optional<T>.value().

// STL container-like cover for arrow::Array.
// Only works for Array types that support raw_values().
template<typename ArrType>
class ArrayCoverRaw
{
public:
        using T = typename ArrType::value_type;
        using pointer = T*;
        using const_pointer = const T*;
        using reference = T&;
        using const_reference = const T&;
        // Match size_type to Array offsets rather than using size_t and 
ptrdiff_t.
        using size_type = int64_t;
        using difference_type = int64_t;
        using iterator = pointer;
        using const_iterator = const_pointer;
        using reverse_iterator = pointer;
        using const_reverse_iterator = const_pointer;

        ArrayCoverRaw(std::shared_ptr<ArrType>& array) : _array(array) {}

        size_type size() const { return _array->length(); }

        // Should non-const versions fail if Array is immutable?
        iterator begin() { return const_cast<pointer>(_array->raw_values()); }
        iterator end() { return 
const_cast<pointer>(_array->raw_values()+_array->length()); }
        reverse_iterator rbegin() { return 
const_cast<pointer>(_array->raw_values()+_array->length()-1); }
        reverse_iterator rend() { return 
const_cast<pointer>(_array->raw_values()-1); }
        const_iterator cbegin() const { return _array->raw_values(); }
        const_iterator cend() const { return 
_array->raw_values()+_array->length(); }
        const_reverse_iterator crbegin() const { return 
_array->raw_values()+_array->length()-1; }
        const_reverse_iterator crend() const { return _array->raw_values()-1; }

        // We could return std::optional<T> to encapsulate IsNull() info, but 
this would seem to break the expected semantics.
        reference operator[](const difference_type off) { 
assert(_array->data()->is_mutable()); return _array->raw_values()+off; }
        const_reference operator[](const difference_type off) const { return 
_array->raw_values()+off; }
        // ISSUE: is there an interface for setting IsNull() if array is 
mutable.
        bool IsNull(difference_type off) const { return _array->IsNull(off); }

protected:
        std::shared_ptr<ArrType> _array;
};

// TODO: Add ArrayCoverString and iterators, perhaps others.

// Use template on Value so we can create iterator and const_iterator by 
changing Value.
// Only works for Array types that support raw_values().
template <typename ArrType, class Value>
class ChunkedArrayIteratorRaw
        : public boost::iterator_facade<ChunkedArrayIteratorRaw<ArrType, 
Value>, Value, boost::random_access_traversal_tag>
{
public:
        using difference_type = int64_t;
        using T = typename ArrType::value_type;
        using pointer = T*;

        explicit ChunkedArrayIteratorRaw(std::shared_ptr<arrow::ChunkedArray> 
ch_arr = 0, difference_type pos = 0)
                : _ch_arr(ch_arr)
        {
                set_position(pos);
        }

        bool IsNull() const
        {
                auto arr = _ch_arr->chunk(_chunk_index);
                return arr->IsNull(_current-_first);
        }

private:
        friend class boost::iterator_core_access;

    bool equal(ChunkedArrayIteratorRaw<ArrType, Value> const& other) const
    {
        return this->_position == other._position;
    }

    void increment()
        {
                _position++;
                // Need to move to next chunk?
                if ((_current == _last) && ((_chunk_index+1) < 
_ch_arr->num_chunks()))
                {
                        _chunk_index++;
                        auto arr = _ch_arr->chunk(_chunk_index);
                        auto typed_arr = std::static_pointer_cast<ArrType>(arr);
                        _first = const_cast<pointer>(typed_arr->raw_values());
                        _last = _first + arr->length() - 1;
                        _current = _first;
                }
                else
                {
                        _current++;
                }
        }

        void decrement()
        {
                _position--;
                // Need to move to previous chunk?
                if ((_current == _first) && (_chunk_index > 0))
                {
                        _chunk_index--;
                        auto arr = _ch_arr->chunk(_chunk_index);
                        auto typed_arr = std::static_pointer_cast<ArrType>(arr);
                        _first = const_cast<pointer>(typed_arr->raw_values());
                        _last = _first + arr->length() - 1;
                        _current = _last;
                }
                else
                {
                        _current--;
                }
        }

    Value& dereference() const { return *_current; }

        void advance(difference_type n)
        {
                _position += n;
                while (n > 0)
                {
                        difference_type max_delta = _last - _current;
                        if ((max_delta >= n) || ((_chunk_index+1) == 
_ch_arr->num_chunks()))
                        {
                                _current += n;
                                return;
                        }
                        // Move to next chunk.
                        n -= max_delta;
                        _chunk_index++;
                        auto arr = _ch_arr->chunk(_chunk_index);
                        auto typed_arr = std::static_pointer_cast<ArrType>(arr);
                        _first = const_cast<pointer>(typed_arr->raw_values());
                        _last = _first + arr->length() - 1;
                        _current = _first;
                }
                while (n < 0)
                {
                        difference_type max_delta = _first - _current;
                        if ((max_delta <= n) || (_chunk_index == 0))
                        {
                                _current += n;
                                return;
                        }
                        // Move to previous chunk.
                        n -= max_delta;
                        _chunk_index--;
                        assert(_chunk_index >= 0);
                        auto arr = _ch_arr->chunk(_chunk_index);
                        auto typed_arr = std::static_pointer_cast<ArrType>(arr);
                        _first = const_cast<pointer>(typed_arr->raw_values());
                        _last = _first + arr->length() - 1;
                        _current = _last;
                }
        }

        difference_type distance_to(ChunkedArrayIteratorRaw<ArrType, Value> 
const& other)
        {
                return other._position - this->_position;
        }

        // Helper
        void set_position(difference_type pos)
        {
                _position = pos;
                const int nchunks = _ch_arr->num_chunks();
                int64_t offset = 0;
                for (_chunk_index = 0; _chunk_index < nchunks; _chunk_index++)
                {
                        auto arr = _ch_arr->chunk(_chunk_index);
                        int64_t arr_rows = arr->length();
                        if (((offset+arr_rows) > pos) || 
((_chunk_index+1)==nchunks))
                        {
                                auto typed_arr = 
std::static_pointer_cast<ArrType>(arr);
                                _first = 
const_cast<T*>(typed_arr->raw_values());
                                _last = _first + arr_rows - 1;
                                _current = _first + (pos-offset);
                                return;
                        }
                        offset += arr_rows;
                }
                assert(false);
        }

        std::shared_ptr<arrow::ChunkedArray> _ch_arr;
        // Which Array we're looking at.
        int _chunk_index = 0;
        // Pointers into current Array. Use first/last rather than begin/end 
for symmetry of moving forward/backward.
        pointer _first = 0;
        pointer _current = 0;
        pointer _last = 0;
        // Cache position across all chunks for support of random access.
        difference_type _position = 0;
};

// This implementation is a subclass for Arrays that use GetView(i), 
GetString(i), etc.
// Concrete subclass only needs to implement dereference(i).
template <typename ArrType>
class ChunkedArrayIteratorIndexImpl
{
public:
        using difference_type = int64_t;

        explicit 
ChunkedArrayIteratorIndexImpl(std::shared_ptr<arrow::ChunkedArray> ch_arr = 0, 
difference_type pos = 0)
                : _ch_arr(ch_arr)
        {
                set_position(pos);
        }

        bool IsNull() const
        {
                auto arr = _ch_arr->chunk(_chunk_index);
                return arr->IsNull(_current);
        }

protected:
        friend class boost::iterator_core_access;

    bool equal(ChunkedArrayIteratorIndexImpl<ArrType> const& other) const
    {
        return this->_position == other._position;
    }

    void increment()
        {
                _position++;
                // Need to move to next chunk?
                if ((_current == _last) && ((_chunk_index+1) < 
_ch_arr->num_chunks()))
                {
                        _chunk_index++;
                        auto arr = _ch_arr->chunk(_chunk_index);
                        _typed_arr = std::static_pointer_cast<ArrType>(arr);
                        _last = arr->length() - 1;
                        _current = 0;
                }
                else
                {
                        _current++;
                }
        }

        void decrement()
        {
                _position--;
                // Need to move to previous chunk?
                if ((_current == _first) && (_chunk_index > 0))
                {
                        _chunk_index--;
                        auto arr = _ch_arr->chunk(_chunk_index);
                        _typed_arr = std::static_pointer_cast<ArrType>(arr);
                        _last = arr->length() - 1;
                        _current = _last;
                }
                else
                {
                        _current--;
                }
        }

    // Value& dereference() const { return *_current; }

        void advance(difference_type n)
        {
                _position += n;
                while (n > 0)
                {
                        difference_type max_delta = _last - _current;
                        if ((max_delta >= n) || ((_chunk_index+1) == 
_ch_arr->num_chunks()))
                        {
                                _current += n;
                                return;
                        }
                        // Move to next chunk.
                        n -= max_delta;
                        _chunk_index++;
                        auto arr = _ch_arr->chunk(_chunk_index);
                        _typed_arr = std::static_pointer_cast<ArrType>(arr);
                        _last = arr->length() - 1;
                        _current = 0;
                }
                while (n < 0)
                {
                        difference_type max_delta = 0 - _current;
                        if ((max_delta <= n) || (_chunk_index == 0))
                        {
                                _current += n;
                                return;
                        }
                        // Move to previous chunk.
                        n -= max_delta;
                        _chunk_index--;
                        auto arr = _ch_arr->chunk(_chunk_index);
                        _typed_arr = std::static_pointer_cast<ArrType>(arr);
                        _last = arr->length() - 1;
                        _current = _last;
                }
        }

        difference_type distance_to(ChunkedArrayIteratorIndexImpl<ArrType> 
const& other)
        {
                return other._position - this->_position;
        }

        // Helper
        void set_position(difference_type pos)
        {
                _position = pos;
                const int nchunks = _ch_arr->num_chunks();
                int64_t offset = 0;
                for (_chunk_index = 0; _chunk_index < nchunks; _chunk_index++)
                {
                        auto arr = _ch_arr->chunk(_chunk_index);
                        int64_t arr_rows = arr->length();
                        if (((offset+arr_rows) > pos) || 
((_chunk_index+1)==nchunks))
                        {
                                _typed_arr = 
std::static_pointer_cast<ArrType>(arr);
                                _last = arr_rows - 1;
                                _current = (pos-offset);
                                return;
                        }
                        offset += arr_rows;
                }
                assert(false);
        }

        std::shared_ptr<arrow::ChunkedArray> _ch_arr;
        // Which Array we're looking at.
        int _chunk_index = 0;
        // Current Array. Use first/last rather than begin/end for symmetry of 
moving forward/backward.
        std::shared_ptr<ArrType> _typed_arr;
        difference_type _current = 0;
        difference_type _last = 0;
        // Cache position across all chunks for support of random access.
        difference_type _position = 0;
};

class ChunkedArrayIteratorString :
        public ChunkedArrayIteratorIndexImpl<StringArray>,
        public boost::iterator_facade<ChunkedArrayIteratorString, const 
std::string, boost::random_access_traversal_tag>
{
public:
        using difference_type = int64_t;
        using Value = const std::string;

        explicit 
ChunkedArrayIteratorString(std::shared_ptr<arrow::ChunkedArray> ch_arr = 0, 
difference_type pos = 0)
                : ChunkedArrayIteratorIndexImpl(ch_arr, pos)
        {
        }

        // Cache value to avoid returning pointer to temp.
    Value& dereference() const { _cached = _typed_arr->GetString(_current); 
return _cached; }

private:
        mutable std::string _cached;
};

class ChunkedArrayIteratorBoolean :
        public ChunkedArrayIteratorIndexImpl<BooleanArray>,
        public boost::iterator_facade<ChunkedArrayIteratorBoolean, const bool, 
boost::random_access_traversal_tag>
{
public:
        using difference_type = int64_t;
        using Value = const bool;

        explicit 
ChunkedArrayIteratorBoolean(std::shared_ptr<arrow::ChunkedArray> ch_arr = 0, 
difference_type pos = 0)
                : ChunkedArrayIteratorIndexImpl(ch_arr, pos)
        {
        }

        // Cache value to avoid returning pointer to temp.
    Value& dereference() const { _cached = _typed_arr->GetView(_current); 
return _cached; }

private:
        mutable bool _cached;
};

// STL container-like cover for arrow::ChunkedArray.
// Only works for ChunkedArrays composed of Array types that support 
raw_values().
template<typename ArrType>
class ChunkedArrayCoverRaw
{
public:
        using T = typename ArrType::value_type;
        // Match size_type to Array offsets rather than using size_t and 
ptrdiff_t.
        using size_type = int64_t;
        using difference_type = int64_t;
        using iterator = typename ChunkedArrayIteratorRaw<ArrType, T>;
        using const_iterator = typename ChunkedArrayIteratorRaw<ArrType, const 
T>;;
        using reverse_iterator = iterator;
        using const_reverse_iterator = const_iterator;

        ChunkedArrayCoverRaw(std::shared_ptr<ChunkedArray>& array) : 
_array(array) {}

        size_type size() const { return _array->length(); }

        // Should non-const versions fail if Array is immutable?
        iterator begin() { return iterator(_array); }
        iterator end() { return iterator(_array, size()); }
        reverse_iterator rbegin() { return iterator(_array, size()-1); }
        reverse_iterator rend() { return iterator(_array, -1); }
        const_iterator cbegin() const { return const_iterator(_array); }
        const_iterator cend() const { return const_iterator(_array, size()); }
        const_reverse_iterator crbegin() const { return const_iterator(_array, 
size()-1); }
        const_reverse_iterator crend() const { return const_iterator(_array, 
-1); }

protected:
        std::shared_ptr<ChunkedArray> _array;
};


int main(int argc, char *argv[])
{
        auto fs = make_shared<fs::LocalFileSystem>();
        auto r_input = fs->OpenInputStream("c:/temp/_DatasetP14Seizures.csv");

        auto pool = default_memory_pool();
        auto read_options = arrow::csv::ReadOptions::Defaults();
        auto parse_options = arrow::csv::ParseOptions::Defaults();
        auto convert_options = arrow::csv::ConvertOptions::Defaults();

        auto r_table_reader = csv::TableReader::Make(pool, r_input.ValueOrDie(),
                read_options, parse_options, convert_options);
        auto r_read = r_table_reader.ValueOrDie()->Read();
        auto pTable = r_read.ValueOrDie();

        PrettyPrintOptions options{0};
        arrow::PrettyPrint(*pTable, options, &std::cout);

        // Test covers and iterators.
        const Table& tlb = *pTable;
        const int64_t rows = tlb.num_rows();
        const int cols = tlb.num_columns();
        for (int c = 0; c < cols; c++)
        {
                auto f = tlb.field(c);
                const string& name = f->name();
                int type_id = f->type()->id();
                auto ch_arr = tlb.column(c);
                switch (type_id)
                {
                case Type::DOUBLE:
                        {
#if 0
                                using iterator = 
ChunkedArrayIteratorRaw<arrow::DoubleArray, double>;
                                iterator it(ch_arr, 2);
                                cout << it.IsNull() << endl;
                                boost::iterator_range<iterator> range(it-2, 
it+8);
                                for (double val : range)
                                        cout << val << endl;
#else
                                using cover = 
ChunkedArrayCoverRaw<arrow::DoubleArray>;
                                using iterator = typename cover::iterator;
                                using range = typename 
boost::iterator_range<iterator>;
                                cover cvr(ch_arr);
                                auto begin = cvr.begin();
                                auto end = cvr.end();
                                auto rbegin = cvr.rbegin();
                                auto rend = cvr.rend();
                                auto it = begin;
                                it += 2;
                                cout << it.IsNull() << endl;
                                range rng(it-2, it+8);
                                for (double val : rng)
                                        cout << val << endl;
#endif
                        }
                        break;
                case Type::STRING:
                        {
                                using iterator = ChunkedArrayIteratorString;
                                iterator it(ch_arr, 2);
                                cout << it.IsNull() << endl;
                                boost::iterator_range<iterator> range(it-2, 
it+8);
                                for (std::string val : range)
                                        cout << val << endl;
                        }
                        break;
                case Type::BOOL:
                        {
                                using iterator = ChunkedArrayIteratorBoolean;
                                iterator it(ch_arr, 2);
                                cout << it.IsNull() << endl;
                                boost::iterator_range<iterator> range(it-2, 
it+8);
                                for (bool val : range)
                                        cout << val << endl;
                        }
                        break;

                default:
                        break;
                }
        }

        return 1;
}


{code}

> [C++] Provide iterator access to primitive elements inside a 
> Column/ChunkedArray
> --------------------------------------------------------------------------------
>
>                 Key: ARROW-602
>                 URL: https://issues.apache.org/jira/browse/ARROW-602
>             Project: Apache Arrow
>          Issue Type: Improvement
>          Components: C++
>            Reporter: Uwe Korn
>            Assignee: Ben Kietzman
>            Priority: Major
>              Labels: beginner, newbie
>
> Given a ChunkedArray, an Arrow user must currently iterate over all its 
> chunks and then cast them to their types to extract the primitive memory 
> regions to access the values. A convenient way to access the underlying 
> values would be to offer a function that takes a ChunkedArray and returns a 
> C++ iterator over all elements.
> While this may not be the most performant way to access the underlying data, 
> it should have sufficient performance and adds a convenience layer for new 
> users.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to