JakeDern commented on code in PR #10122:
URL: https://github.com/apache/arrow-rs/pull/10122#discussion_r3398945731
##########
arrow-ipc/benches/ipc_writer.rs:
##########
@@ -69,6 +72,119 @@ fn criterion_benchmark(c: &mut Criterion) {
writer.finish().unwrap();
})
});
+
+ group.bench_function("StreamWriter/write_10/dict", |b| {
+ let batches = create_unique_dict_batches(10, 8192);
+ let schema = batches[0].schema();
+ let mut buffer = Vec::with_capacity(2 * 1024 * 1024);
+ b.iter(move || {
+ buffer.clear();
+ let mut writer = StreamWriter::try_new(&mut buffer,
schema.as_ref()).unwrap();
+ for batch in &batches {
+ writer.write(batch).unwrap();
+ }
+ writer.finish().unwrap();
+ })
+ });
+
+ group.bench_function("StreamWriter/write_10/dict/delta", |b| {
+ let batches = create_delta_dict_batches(10, 8192);
+ let schema = batches[0].schema();
+ let mut buffer = Vec::with_capacity(2 * 1024 * 1024);
+ let options =
+
IpcWriteOptions::default().with_dictionary_handling(DictionaryHandling::Delta);
+
+ b.iter(move || {
+ buffer.clear();
+
+ let mut writer =
+ StreamWriter::try_new_with_options(&mut buffer,
schema.as_ref(), options.clone())
+ .unwrap();
+
+ for batch in &batches {
+ writer.write(batch).unwrap();
+ }
+
+ writer.finish().unwrap();
+ })
+ });
+
+ // The file writer rejects dictionary replacement, so only the delta case
is
+ // exercised here (growing dictionaries that are prefixes of one another).
+ group.bench_function("FileWriter/write_10/dict/delta", |b| {
+ let batches = create_delta_dict_batches(10, 8192);
+ let schema = batches[0].schema();
+ let mut buffer = Vec::with_capacity(2 * 1024 * 1024);
+ let options =
+
IpcWriteOptions::default().with_dictionary_handling(DictionaryHandling::Delta);
+
+ b.iter(move || {
+ buffer.clear();
+
+ let mut writer =
+ FileWriter::try_new_with_options(&mut buffer, schema.as_ref(),
options.clone())
+ .unwrap();
+
+ for batch in &batches {
+ writer.write(batch).unwrap();
+ }
+
+ writer.finish().unwrap();
+ })
+ });
+}
+
+/// Build `n` record batches with a single dictionary column whose dictionary
+/// grows across batches. A single builder is reused with
`finish_preserve_values`
+/// so each batch's dictionary has the previous batch's as a prefix which
allows
+/// us to emit deltas.
+fn create_delta_dict_batches(n: usize, num_rows: usize) -> Vec<RecordBatch> {
+ let schema = Arc::new(Schema::new(vec![Field::new(
+ "d0",
+ DataType::Dictionary(Box::new(DataType::UInt32),
Box::new(DataType::Utf8)),
+ false,
+ )]));
+ let mut builder = StringDictionaryBuilder::<UInt32Type>::new();
+
+ let mut batches = Vec::with_capacity(n);
+ for i in 0..n {
+ // 3/4 of the rows reuse values shared by every batch, the other 1/4
+ // introduce values unique to this batch which extends the dictionary.
+ for r in 0..num_rows {
+ if r < num_rows / 4 {
+ builder.append_value(format!("batch {i} value {}", r));
+ } else {
+ builder.append_value(format!("shared {r}"));
+ }
+ }
+
+ // Preserve the values builder so the dictionary accumulates across
batches.
+ let dict = builder.finish_preserve_values();
+ batches.push(RecordBatch::try_new(schema.clone(),
vec![Arc::new(dict)]).unwrap());
+ }
+
+ batches
+}
+
+/// Build `n` record batches each with a completely distinct dictionary for
each batch.
+fn create_unique_dict_batches(n: usize, num_rows: usize) -> Vec<RecordBatch> {
+ let schema = Arc::new(Schema::new(vec![Field::new(
+ "d0",
+ DataType::Dictionary(Box::new(DataType::UInt32),
Box::new(DataType::Utf8)),
+ false,
+ )]));
+
+ let mut batches = Vec::with_capacity(n);
+ for i in 0..n {
+ let mut builder = StringDictionaryBuilder::<UInt32Type>::new();
+ for r in 0..num_rows {
+ builder.append_value(format!("batch {i} value {}", r % (num_rows /
2)));
Review Comment:
Thank you for the thoughtful review!
> Varying cardinality helps detect whether the encoder is doing O(N) work
(proportional to total rows) when it should be doing O(K) work (proportional to
unique values).
I would clarify that we are doing O(K + N) work as there is still per row
overhead in encoding the dictionary keys as well as per unique value cost in
encoding the dictionary values.
I agree that benchmarks varying the amount of unique values could yield
useful information, but these simple benchmarks can still answer whether we've
meaningfully changed the amount of work required to emit dictionary batches.
I think until those additional parameters are needed to demonstrate
something the current set cannot, I favor the simplicity of omitting them.
Just my .02 of course, happy to take more feedback on this.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]