This is an automated email from the ASF dual-hosted git repository.

jorgecarleitao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new ab185d5  ARROW-10999: [Rust] [Benchmarks] Use signed ints for TPC-H schema
ab185d5 is described below

commit ab185d52ccac689a4ff3c3cb22a35f4fb201b7b8
Author: Andy Grove <[email protected]>
AuthorDate: Tue Dec 22 05:31:27 2020 +0000

    ARROW-10999: [Rust] [Benchmarks] Use signed ints for TPC-H schema
    
    The TPC-H Parquet files generated by the benchmark crate could not be read
    by Apache Spark because they used unsigned ints, which Spark does not
    support (the JVM only has signed ints).
    
    I would like to use the same data sets for benchmarking DataFusion, Apache
    Spark, and other tools, so I have changed the schema to use signed ints.
    
    Closes #8980 from andygrove/tpch-signed-ints
    
    Authored-by: Andy Grove <[email protected]>
    Signed-off-by: Jorge C. Leitao <[email protected]>
---
 rust/benchmarks/src/bin/tpch.rs | 42 ++++++++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/rust/benchmarks/src/bin/tpch.rs b/rust/benchmarks/src/bin/tpch.rs
index 37eb0bc..b78ea62 100644
--- a/rust/benchmarks/src/bin/tpch.rs
+++ b/rust/benchmarks/src/bin/tpch.rs
@@ -1095,42 +1095,46 @@ fn get_table(
 }
 
 fn get_schema(table: &str) -> Schema {
+    // note that the schema intentionally uses signed integers so that any generated Parquet
+    // files can also be used to benchmark tools that only support signed integers, such as
+    // Apache Spark
+
     match table {
         "part" => Schema::new(vec![
-            Field::new("p_partkey", DataType::UInt32, false),
+            Field::new("p_partkey", DataType::Int32, false),
             Field::new("p_name", DataType::Utf8, false),
             Field::new("p_mfgr", DataType::Utf8, false),
             Field::new("p_brand", DataType::Utf8, false),
             Field::new("p_type", DataType::Utf8, false),
-            Field::new("p_size", DataType::UInt32, false),
+            Field::new("p_size", DataType::Int32, false),
             Field::new("p_container", DataType::Utf8, false),
             Field::new("p_retailprice", DataType::Float64, false), // decimal
             Field::new("p_comment", DataType::Utf8, false),
         ]),
 
         "supplier" => Schema::new(vec![
-            Field::new("s_suppkey", DataType::UInt32, false),
+            Field::new("s_suppkey", DataType::Int32, false),
             Field::new("s_name", DataType::Utf8, false),
             Field::new("s_address", DataType::Utf8, false),
-            Field::new("s_nationkey", DataType::UInt32, false),
+            Field::new("s_nationkey", DataType::Int32, false),
             Field::new("s_phone", DataType::Utf8, false),
             Field::new("s_acctbal", DataType::Float64, false), // decimal
             Field::new("s_comment", DataType::Utf8, false),
         ]),
 
         "partsupp" => Schema::new(vec![
-            Field::new("ps_partkey", DataType::UInt32, false),
-            Field::new("ps_suppkey", DataType::UInt32, false),
-            Field::new("ps_availqty", DataType::UInt32, false),
+            Field::new("ps_partkey", DataType::Int32, false),
+            Field::new("ps_suppkey", DataType::Int32, false),
+            Field::new("ps_availqty", DataType::Int32, false),
             Field::new("ps_supplycost", DataType::Float64, false), // decimal
             Field::new("ps_comment", DataType::Utf8, false),
         ]),
 
         "customer" => Schema::new(vec![
-            Field::new("c_custkey", DataType::UInt32, false),
+            Field::new("c_custkey", DataType::Int32, false),
             Field::new("c_name", DataType::Utf8, false),
             Field::new("c_address", DataType::Utf8, false),
-            Field::new("c_nationkey", DataType::UInt32, false),
+            Field::new("c_nationkey", DataType::Int32, false),
             Field::new("c_phone", DataType::Utf8, false),
             Field::new("c_acctbal", DataType::Float64, false), // decimal
             Field::new("c_mktsegment", DataType::Utf8, false),
@@ -1138,22 +1142,22 @@ fn get_schema(table: &str) -> Schema {
         ]),
 
         "orders" => Schema::new(vec![
-            Field::new("o_orderkey", DataType::UInt32, false),
-            Field::new("o_custkey", DataType::UInt32, false),
+            Field::new("o_orderkey", DataType::Int32, false),
+            Field::new("o_custkey", DataType::Int32, false),
             Field::new("o_orderstatus", DataType::Utf8, false),
             Field::new("o_totalprice", DataType::Float64, false), // decimal
             Field::new("o_orderdate", DataType::Date32(DateUnit::Day), false),
             Field::new("o_orderpriority", DataType::Utf8, false),
             Field::new("o_clerk", DataType::Utf8, false),
-            Field::new("o_shippriority", DataType::UInt32, false),
+            Field::new("o_shippriority", DataType::Int32, false),
             Field::new("o_comment", DataType::Utf8, false),
         ]),
 
         "lineitem" => Schema::new(vec![
-            Field::new("l_orderkey", DataType::UInt32, false),
-            Field::new("l_partkey", DataType::UInt32, false),
-            Field::new("l_suppkey", DataType::UInt32, false),
-            Field::new("l_linenumber", DataType::UInt32, false),
+            Field::new("l_orderkey", DataType::Int32, false),
+            Field::new("l_partkey", DataType::Int32, false),
+            Field::new("l_suppkey", DataType::Int32, false),
+            Field::new("l_linenumber", DataType::Int32, false),
             Field::new("l_quantity", DataType::Float64, false), // decimal
             Field::new("l_extendedprice", DataType::Float64, false), // decimal
             Field::new("l_discount", DataType::Float64, false), // decimal
@@ -1169,14 +1173,14 @@ fn get_schema(table: &str) -> Schema {
         ]),
 
         "nation" => Schema::new(vec![
-            Field::new("n_nationkey", DataType::UInt32, false),
+            Field::new("n_nationkey", DataType::Int32, false),
             Field::new("n_name", DataType::Utf8, false),
-            Field::new("n_regionkey", DataType::UInt32, false),
+            Field::new("n_regionkey", DataType::Int32, false),
             Field::new("n_comment", DataType::Utf8, false),
         ]),
 
         "region" => Schema::new(vec![
-            Field::new("r_regionkey", DataType::UInt32, false),
+            Field::new("r_regionkey", DataType::Int32, false),
             Field::new("r_name", DataType::Utf8, false),
             Field::new("r_comment", DataType::Utf8, false),
         ]),

Reply via email to