matthewmturner commented on issue #208:
URL: https://github.com/apache/arrow-rs/issues/208#issuecomment-919341814
Hi @nevi-me - I've been building up a test that I could use to compare the
IPC size pre and post the fix, but I haven't been able to produce the expected
results. Code is below:
```
pub fn compare_ipc() {
let arr_data = vec![1, 2, 3, 4, 5];
let val_data = vec![5, 6, 7, 8, 9];
let id_arr = Int32Array::from(arr_data);
let val_arr = Int32Array::from(val_data);
let id_arr_slice = id_arr.slice(1, 3);
let val_arr_slice = val_arr.slice(1, 3);
let schema = Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("val", DataType::Int32, false),
]);
let raw_batch = RecordBatch::try_new(
Arc::new(schema.clone()),
vec![Arc::new(id_arr), Arc::new(val_arr)],
)
.unwrap();
println!("{:?}", raw_batch);
let slice_batch =
RecordBatch::try_new(Arc::new(schema.clone()), vec![id_arr_slice,
val_arr_slice]).unwrap();
println!("{:?}", slice_batch);
println!("Running first test");
raw_batch
.columns()
.iter()
.zip(slice_batch.columns())
.for_each(|(a, b)| {
println!("{:?} : {:?}", a.data(), b.data());
assert_eq!(a.data_type(), b.data_type());
assert_eq!(a.data().buffers()[0], b.data().buffers()[0]);
});
let raw_path = "raw_data.arrow";
let slice_path = "slice_data.arrow";
{
let raw_file = File::create(raw_path).unwrap();
let mut raw_writer = FileWriter::try_new(raw_file, &schema).unwrap();
raw_writer.write(&raw_batch).unwrap();
raw_writer.finish().unwrap();
}
{
let slice_file = File::create(slice_path).unwrap();
let mut slice_writer = FileWriter::try_new(slice_file,
&schema).unwrap();
slice_writer.write(&slice_batch).unwrap();
slice_writer.finish().unwrap();
}
let raw_file = File::open(raw_path).unwrap();
let slice_file = File::open(slice_path).unwrap();
let mut raw_reader = FileReader::try_new(raw_file).unwrap();
let mut slice_reader = FileReader::try_new(slice_file).unwrap();
while let Some(Ok(raw_ipc_batch)) = raw_reader.next() {
println!("{:?}", raw_ipc_batch);
while let Some(Ok(slice_ipc_batch)) = slice_reader.next() {
println!("{:?}", slice_ipc_batch);
raw_ipc_batch
.columns()
.iter()
.zip(slice_ipc_batch.columns())
.for_each(|(a, b)| {
println!("{:?} : {:?}", a.data(), b.data());
assert_eq!(a.data_type(), b.data_type());
assert_eq!(a.data().buffers()[0], b.data().buffers()[0]);
});
}
}
}
```
Which produces the following output:
```
RecordBatch { schema: Schema { fields: [Field { name: "id", data_type:
Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None },
Field { name: "val", data_type: Int32, nullable: false, dict_id: 0,
dict_is_ordered: false, metadata: None }], metadata: {} }, columns:
[PrimitiveArray<Int32>
[
1,
2,
3,
4,
5,
], PrimitiveArray<Int32>
[
5,
6,
7,
8,
9,
]] }
RecordBatch { schema: Schema { fields: [Field { name: "id", data_type:
Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None },
Field { name: "val", data_type: Int32, nullable: false, dict_id: 0,
dict_is_ordered: false, metadata: None }], metadata: {} }, columns:
[PrimitiveArray<Int32>
[
2,
3,
4,
], PrimitiveArray<Int32>
[
6,
7,
8,
]] }
Running first test
ArrayData { data_type: Int32, len: 5, null_count: 0, offset: 0, buffers:
[Buffer { data: Bytes { ptr: 0x11d606c40, len: 20, data: [1, 0, 0, 0, 2, 0, 0,
0, 3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0] }, offset: 0 }], child_data: [],
null_bitmap: None } : ArrayData { data_type: Int32, len: 3, null_count: 0,
offset: 1, buffers: [Buffer { data: Bytes { ptr: 0x11d606c40, len: 20, data:
[1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0] }, offset: 0 }],
child_data: [], null_bitmap: None }
ArrayData { data_type: Int32, len: 5, null_count: 0, offset: 0, buffers:
[Buffer { data: Bytes { ptr: 0x11d606d00, len: 20, data: [5, 0, 0, 0, 6, 0, 0,
0, 7, 0, 0, 0, 8, 0, 0, 0, 9, 0, 0, 0] }, offset: 0 }], child_data: [],
null_bitmap: None } : ArrayData { data_type: Int32, len: 3, null_count: 0,
offset: 1, buffers: [Buffer { data: Bytes { ptr: 0x11d606d00, len: 20, data:
[5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 8, 0, 0, 0, 9, 0, 0, 0] }, offset: 0 }],
child_data: [], null_bitmap: None }
RecordBatch { schema: Schema { fields: [Field { name: "id", data_type:
Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None },
Field { name: "val", data_type: Int32, nullable: false, dict_id: 0,
dict_is_ordered: false, metadata: None }], metadata: {} }, columns:
[PrimitiveArray<Int32>
[
1,
2,
3,
4,
5,
], PrimitiveArray<Int32>
[
5,
6,
7,
8,
9,
]] }
RecordBatch { schema: Schema { fields: [Field { name: "id", data_type:
Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None },
Field { name: "val", data_type: Int32, nullable: false, dict_id: 0,
dict_is_ordered: false, metadata: None }], metadata: {} }, columns:
[PrimitiveArray<Int32>
[
null,
null,
5,
], PrimitiveArray<Int32>
[
null,
null,
9,
]] }
ArrayData { data_type: Int32, len: 5, null_count: 0, offset: 0, buffers:
[Buffer { data: Bytes { ptr: 0x11d607ac0, len: 24, data: [1, 0, 0, 0, 2, 0, 0,
0, 3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0] }, offset: 0 }], child_data:
[], null_bitmap: None } : ArrayData { data_type: Int32, len: 3, null_count: 2,
offset: 0, buffers: [Buffer { data: Bytes { ptr: 0x11d704240, len: 12, data:
[0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0] }, offset: 0 }], child_data: [],
null_bitmap: Some(Bitmap { bits: Buffer { data: Bytes { ptr: 0x11d7041c0, len:
1, data: [4] }, offset: 0 } }) }
thread 'main' panicked at 'assertion failed: `(left == right)`
left: `Buffer { data: Bytes { ptr: 0x11d607ac0, len: 24, data: [1, 0, 0,
0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0] }, offset: 0 }`,
right: `Buffer { data: Bytes { ptr: 0x11d704240, len: 12, data: [0, 0, 0,
0, 0, 0, 0, 0, 5, 0, 0, 0] }, offset: 0 }`',
src/flight_sends_too_much_data.rs:149:21
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
```
Basically, I'm just trying to compare the buffers from two batches (one batch
is a slice of the other) after reading their IPC files, by comparing the value
buffers. Given what we are working on, I was expecting the data to be the same
(I guess the assertion would still fail after reading the IPC files, as they
would have different pointers, but I expected the value arrays to have the same
values). But the value arrays were different (the full `ArrayData` values are
above):
```
left: `Buffer { data: Bytes { ptr: 0x11d607ac0, len: 24, data: [1, 0, 0, 0,
2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0] }, offset: 0 }`,
right: `Buffer { data: Bytes { ptr: 0x11d704240, len: 12, data: [0, 0, 0, 0,
0, 0, 0, 0, 5, 0, 0, 0] }, offset: 0 }`',
src/flight_sends_too_much_data.rs:149:21
```
I'm going to keep playing around with this but wanted to get your thoughts
on if I am approaching this the right way.
Thanks again for all your help - much appreciated.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]