This is an automated email from the ASF dual-hosted git repository.
jorgecarleitao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new ab185d5 ARROW-10999: [Rust] [Benchmarks] Use signed ints for TPC-H schema
ab185d5 is described below
commit ab185d52ccac689a4ff3c3cb22a35f4fb201b7b8
Author: Andy Grove <[email protected]>
AuthorDate: Tue Dec 22 05:31:27 2020 +0000
ARROW-10999: [Rust] [Benchmarks] Use signed ints for TPC-H schema
The TPC-H parquet files generated by the benchmark crate could not be read
by Apache Spark because they used unsigned ints, which Spark does not support
(JVM only has signed ints).
I would like to use the same data sets for benchmarking DataFusion, Apache
Spark, and other tools, so I have changed the schema to use signed ints.
Closes #8980 from andygrove/tpch-signed-ints
Authored-by: Andy Grove <[email protected]>
Signed-off-by: Jorge C. Leitao <[email protected]>
---
rust/benchmarks/src/bin/tpch.rs | 42 ++++++++++++++++++++++-------------------
1 file changed, 23 insertions(+), 19 deletions(-)
diff --git a/rust/benchmarks/src/bin/tpch.rs b/rust/benchmarks/src/bin/tpch.rs
index 37eb0bc..b78ea62 100644
--- a/rust/benchmarks/src/bin/tpch.rs
+++ b/rust/benchmarks/src/bin/tpch.rs
@@ -1095,42 +1095,46 @@ fn get_table(
}
fn get_schema(table: &str) -> Schema {
+ // note that the schema intentionally uses signed integers so that any generated Parquet
+ // files can also be used to benchmark tools that only support signed integers, such as
+ // Apache Spark
+
match table {
"part" => Schema::new(vec![
- Field::new("p_partkey", DataType::UInt32, false),
+ Field::new("p_partkey", DataType::Int32, false),
Field::new("p_name", DataType::Utf8, false),
Field::new("p_mfgr", DataType::Utf8, false),
Field::new("p_brand", DataType::Utf8, false),
Field::new("p_type", DataType::Utf8, false),
- Field::new("p_size", DataType::UInt32, false),
+ Field::new("p_size", DataType::Int32, false),
Field::new("p_container", DataType::Utf8, false),
Field::new("p_retailprice", DataType::Float64, false), // decimal
Field::new("p_comment", DataType::Utf8, false),
]),
"supplier" => Schema::new(vec![
- Field::new("s_suppkey", DataType::UInt32, false),
+ Field::new("s_suppkey", DataType::Int32, false),
Field::new("s_name", DataType::Utf8, false),
Field::new("s_address", DataType::Utf8, false),
- Field::new("s_nationkey", DataType::UInt32, false),
+ Field::new("s_nationkey", DataType::Int32, false),
Field::new("s_phone", DataType::Utf8, false),
Field::new("s_acctbal", DataType::Float64, false), // decimal
Field::new("s_comment", DataType::Utf8, false),
]),
"partsupp" => Schema::new(vec![
- Field::new("ps_partkey", DataType::UInt32, false),
- Field::new("ps_suppkey", DataType::UInt32, false),
- Field::new("ps_availqty", DataType::UInt32, false),
+ Field::new("ps_partkey", DataType::Int32, false),
+ Field::new("ps_suppkey", DataType::Int32, false),
+ Field::new("ps_availqty", DataType::Int32, false),
Field::new("ps_supplycost", DataType::Float64, false), // decimal
Field::new("ps_comment", DataType::Utf8, false),
]),
"customer" => Schema::new(vec![
- Field::new("c_custkey", DataType::UInt32, false),
+ Field::new("c_custkey", DataType::Int32, false),
Field::new("c_name", DataType::Utf8, false),
Field::new("c_address", DataType::Utf8, false),
- Field::new("c_nationkey", DataType::UInt32, false),
+ Field::new("c_nationkey", DataType::Int32, false),
Field::new("c_phone", DataType::Utf8, false),
Field::new("c_acctbal", DataType::Float64, false), // decimal
Field::new("c_mktsegment", DataType::Utf8, false),
@@ -1138,22 +1142,22 @@ fn get_schema(table: &str) -> Schema {
]),
"orders" => Schema::new(vec![
- Field::new("o_orderkey", DataType::UInt32, false),
- Field::new("o_custkey", DataType::UInt32, false),
+ Field::new("o_orderkey", DataType::Int32, false),
+ Field::new("o_custkey", DataType::Int32, false),
Field::new("o_orderstatus", DataType::Utf8, false),
Field::new("o_totalprice", DataType::Float64, false), // decimal
Field::new("o_orderdate", DataType::Date32(DateUnit::Day), false),
Field::new("o_orderpriority", DataType::Utf8, false),
Field::new("o_clerk", DataType::Utf8, false),
- Field::new("o_shippriority", DataType::UInt32, false),
+ Field::new("o_shippriority", DataType::Int32, false),
Field::new("o_comment", DataType::Utf8, false),
]),
"lineitem" => Schema::new(vec![
- Field::new("l_orderkey", DataType::UInt32, false),
- Field::new("l_partkey", DataType::UInt32, false),
- Field::new("l_suppkey", DataType::UInt32, false),
- Field::new("l_linenumber", DataType::UInt32, false),
+ Field::new("l_orderkey", DataType::Int32, false),
+ Field::new("l_partkey", DataType::Int32, false),
+ Field::new("l_suppkey", DataType::Int32, false),
+ Field::new("l_linenumber", DataType::Int32, false),
Field::new("l_quantity", DataType::Float64, false), // decimal
Field::new("l_extendedprice", DataType::Float64, false), // decimal
Field::new("l_discount", DataType::Float64, false), // decimal
@@ -1169,14 +1173,14 @@ fn get_schema(table: &str) -> Schema {
]),
"nation" => Schema::new(vec![
- Field::new("n_nationkey", DataType::UInt32, false),
+ Field::new("n_nationkey", DataType::Int32, false),
Field::new("n_name", DataType::Utf8, false),
- Field::new("n_regionkey", DataType::UInt32, false),
+ Field::new("n_regionkey", DataType::Int32, false),
Field::new("n_comment", DataType::Utf8, false),
]),
"region" => Schema::new(vec![
- Field::new("r_regionkey", DataType::UInt32, false),
+ Field::new("r_regionkey", DataType::Int32, false),
Field::new("r_name", DataType::Utf8, false),
Field::new("r_comment", DataType::Utf8, false),
]),