Ritchie created ARROW-10618: ------------------------------- Summary: Invalid write of size 1 in StringBuilder Key: ARROW-10618 URL: https://issues.apache.org/jira/browse/ARROW-10618 Project: Apache Arrow Issue Type: Bug Reporter: Ritchie
h1. What is the problem? I encounter memory errors when using the safe API of StringBuilder. I used the exact same code with PrimitiveBuilders and did not encounter the issue with them. h1. How to reproduce? I encounter it when creating multiple builders whilst reading a very large CSV. The CSV I used is this Kaggle dataset: https://www.kaggle.com/colinmorris/reddit-usernames {code:c++} use arrow::array::StringBuilder; fn main () { let batch_size = 1024; let file = std::fs::File::open("users.csv").unwrap(); let mut rdr = csv::Reader::from_reader(file); // to make sure exceeding limit is not the cause of invalid mem write let mut builder = StringBuilder::new(batch_size * 2); for (i, result) in rdr.records().enumerate() { let record = result.unwrap(); builder.append_value(record.get(0).unwrap()).unwrap(); if i % batch_size == 0 { builder.finish(); builder = StringBuilder::new(batch_size * 2) } } } {code} h2. Cargo.toml {code:c} [package] name = "memcheck" version = "0.1.0" authors = ["ritchie46 <ritchi...@gmail.com>"] edition = "2018" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] arrow = {version = "2", default_features = false} csv = "1.1" {code} h1. Valgrind output {code:c} ==11917== Memcheck, a memory error detector ==11917== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al. 
==11917== Using Valgrind-3.13.0 and LibVEX; rerun with -h for copyright info ==11917== Command: ./memcheck ==11917== ==11917== Invalid read of size 1 ==11917== at 0x13C022: arrow::util::bit_util::set_bits_raw (bit_util.rs:128) ==11917== by 0x150B71: <arrow::array::builder::BufferBuilder<arrow::datatypes::BooleanType> as arrow::array::builder::BufferBuilderTrait<arrow::datatypes::BooleanType>>::append_n (builder.rs:374) ==11917== by 0x151164: arrow::array::builder::PrimitiveBuilder<T>::append_slice (builder.rs:596) ==11917== by 0x152417: arrow::array::builder::StringBuilder::append_value (builder.rs:1771) ==11917== by 0x12D0ED: memcheck::main (exec.rs:773) ==11917== by 0x12C3CA: core::ops::function::FnOnce::call_once (dfa.rs:794) ==11917== by 0x12CACD: std::sys_common::backtrace::__rust_begin_short_backtrace (dfa.rs:840) ==11917== by 0x1307E0: std::rt::lang_start::{{closure}} (rt.rs:66) ==11917== by 0x25F746: call_once<(),Fn<()>> (function.rs:259) ==11917== by 0x25F746: do_call<&Fn<()>,i32> (panicking.rs:381) ==11917== by 0x25F746: try<i32,&Fn<()>> (panicking.rs:345) ==11917== by 0x25F746: catch_unwind<&Fn<()>,i32> (panic.rs:396) ==11917== by 0x25F746: std::rt::lang_start_internal (rt.rs:51) ==11917== by 0x1307B6: std::rt::lang_start (rt.rs:65) ==11917== by 0x12D369: main (in /home/ritchie46/code/polars/target/debug/memcheck) ==11917== Address 0x5f01e80 is 0 bytes after a block of size 1,024 alloc'd ==11917== at 0x4C31E76: memalign (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so) ==11917== by 0x4C31F91: posix_memalign (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so) ==11917== by 0x25E453: aligned_malloc (alloc.rs:95) ==11917== by 0x25E453: alloc (alloc.rs:22) ==11917== by 0x25E453: realloc_fallback (alloc.rs:41) ==11917== by 0x25E453: realloc (alloc.rs:50) ==11917== by 0x25E453: __rdl_realloc (alloc.rs:378) ==11917== by 0x136D5C: alloc::alloc::realloc (alloc.rs:120) ==11917== by 0x13B811: arrow::memory::reallocate (memory.rs:187) ==11917== by 
0x188A54: arrow::buffer::MutableBuffer::reserve (buffer.rs:666) ==11917== by 0x150CBD: <arrow::array::builder::BufferBuilder<arrow::datatypes::BooleanType> as arrow::array::builder::BufferBuilderTrait<arrow::datatypes::BooleanType>>::reserve (builder.rs:402) ==11917== by 0x150A1E: <arrow::array::builder::BufferBuilder<arrow::datatypes::BooleanType> as arrow::array::builder::BufferBuilderTrait<arrow::datatypes::BooleanType>>::append_n (builder.rs:371) ==11917== by 0x151164: arrow::array::builder::PrimitiveBuilder<T>::append_slice (builder.rs:596) ==11917== by 0x152417: arrow::array::builder::StringBuilder::append_value (builder.rs:1771) ==11917== by 0x12D0ED: memcheck::main (exec.rs:773) ==11917== by 0x12C3CA: core::ops::function::FnOnce::call_once (dfa.rs:794) ==11917== ==11917== Invalid write of size 1 ==11917== at 0x13C024: arrow::util::bit_util::set_bits_raw (bit_util.rs:128) ==11917== by 0x150B71: <arrow::array::builder::BufferBuilder<arrow::datatypes::BooleanType> as arrow::array::builder::BufferBuilderTrait<arrow::datatypes::BooleanType>>::append_n (builder.rs:374) ==11917== by 0x151164: arrow::array::builder::PrimitiveBuilder<T>::append_slice (builder.rs:596) ==11917== by 0x152417: arrow::array::builder::StringBuilder::append_value (builder.rs:1771) ==11917== by 0x12D0ED: memcheck::main (exec.rs:773) ==11917== by 0x12C3CA: core::ops::function::FnOnce::call_once (dfa.rs:794) ==11917== by 0x12CACD: std::sys_common::backtrace::__rust_begin_short_backtrace (dfa.rs:840) ==11917== by 0x1307E0: std::rt::lang_start::{{closure}} (rt.rs:66) ==11917== by 0x25F746: call_once<(),Fn<()>> (function.rs:259) ==11917== by 0x25F746: do_call<&Fn<()>,i32> (panicking.rs:381) ==11917== by 0x25F746: try<i32,&Fn<()>> (panicking.rs:345) ==11917== by 0x25F746: catch_unwind<&Fn<()>,i32> (panic.rs:396) ==11917== by 0x25F746: std::rt::lang_start_internal (rt.rs:51) ==11917== by 0x1307B6: std::rt::lang_start (rt.rs:65) ==11917== by 0x12D369: main (in 
/home/ritchie46/code/polars/target/debug/memcheck) ==11917== Address 0x5f01e80 is 0 bytes after a block of size 1,024 alloc'd ==11917== at 0x4C31E76: memalign (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so) ==11917== by 0x4C31F91: posix_memalign (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so) ==11917== by 0x25E453: aligned_malloc (alloc.rs:95) ==11917== by 0x25E453: alloc (alloc.rs:22) ==11917== by 0x25E453: realloc_fallback (alloc.rs:41) ==11917== by 0x25E453: realloc (alloc.rs:50) ==11917== by 0x25E453: __rdl_realloc (alloc.rs:378) ==11917== by 0x136D5C: alloc::alloc::realloc (alloc.rs:120) ==11917== by 0x13B811: arrow::memory::reallocate (memory.rs:187) ==11917== by 0x188A54: arrow::buffer::MutableBuffer::reserve (buffer.rs:666) ==11917== by 0x150CBD: <arrow::array::builder::BufferBuilder<arrow::datatypes::BooleanType> as arrow::array::builder::BufferBuilderTrait<arrow::datatypes::BooleanType>>::reserve (builder.rs:402) ==11917== by 0x150A1E: <arrow::array::builder::BufferBuilder<arrow::datatypes::BooleanType> as arrow::array::builder::BufferBuilderTrait<arrow::datatypes::BooleanType>>::append_n (builder.rs:371) ==11917== by 0x151164: arrow::array::builder::PrimitiveBuilder<T>::append_slice (builder.rs:596) ==11917== by 0x152417: arrow::array::builder::StringBuilder::append_value (builder.rs:1771) ==11917== by 0x12D0ED: memcheck::main (exec.rs:773) ==11917== by 0x12C3CA: core::ops::function::FnOnce::call_once (dfa.rs:794) {code} h1. Environment. Confirmed invalid write on Ubuntu 18.04 and a Segfault 11 on MacOs. cargo 1.49.0-nightly (d5556aeb8 2020-11-04) -- This message was sent by Atlassian Jira (v8.3.4#803005)