This is an automated email from the ASF dual-hosted git repository.
xudong963 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new fb850499b Output sqllogictests with arrow display rather than CSV
writer (#4578)
fb850499b is described below
commit fb850499b72aab1596ac7eb6d079d789d42809a6
Author: Andrew Lamb <[email protected]>
AuthorDate: Sun Dec 11 05:46:32 2022 -0500
Output sqllogictests with arrow display rather than CSV writer (#4578)
* Output sqllogictests with arrow display
* Improve normalization clarity
---
datafusion/core/tests/sqllogictests/src/main.rs | 40 +---
.../core/tests/sqllogictests/src/normalize.rs | 112 ++++++----
.../tests/sqllogictests/test_files/aggregate.slt | 248 ++++++++++-----------
.../core/tests/sqllogictests/test_files/ddl.slt | 16 +-
.../tests/sqllogictests/test_files/prepare.slt | 3 +-
.../tests/sqllogictests/test_files/timestamps.slt | 26 +--
6 files changed, 214 insertions(+), 231 deletions(-)
diff --git a/datafusion/core/tests/sqllogictests/src/main.rs
b/datafusion/core/tests/sqllogictests/src/main.rs
index 47ae8f80d..3d2fe492c 100644
--- a/datafusion/core/tests/sqllogictests/src/main.rs
+++ b/datafusion/core/tests/sqllogictests/src/main.rs
@@ -16,13 +16,12 @@
// under the License.
use async_trait::async_trait;
-use datafusion::arrow::csv::WriterBuilder;
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::prelude::{SessionConfig, SessionContext};
use datafusion_sql::parser::{DFParser, Statement};
use log::info;
-use normalize::normalize_batch;
-use sqllogictest::{ColumnType, DBOutput};
+use normalize::convert_batches;
+use sqllogictest::DBOutput;
use sqlparser::ast::Statement as SQLStatement;
use std::path::{Path, PathBuf};
use std::time::Duration;
@@ -173,41 +172,6 @@ async fn context_for_test_file(file_name: &str) ->
SessionContext {
}
}
-fn convert_batches(batches: Vec<RecordBatch>) -> Result<DBOutput> {
- let mut bytes = vec![];
- if batches.is_empty() {
- return Ok(DBOutput::StatementComplete(0));
- }
- // TODO: use the actual types
- let types = vec![ColumnType::Any; batches[0].num_columns()];
-
- {
- let builder = WriterBuilder::new()
- .has_headers(false)
- .with_delimiter(b'\t');
- let mut writer = builder.build(&mut bytes);
- for batch in batches {
- writer.write(&normalize_batch(batch)).unwrap();
- }
- }
- let res = String::from_utf8(bytes).unwrap();
- let rows = res
- .lines()
- .map(|s| {
- s.split('\t')
- .map(|s| {
- if s.is_empty() {
- "NULL".to_string()
- } else {
- s.to_string()
- }
- })
- .collect()
- })
- .collect();
- Ok(DBOutput::Rows { types, rows })
-}
-
async fn run_query(ctx: &SessionContext, sql: impl Into<String>) ->
Result<DBOutput> {
let sql = sql.into();
// Check if the sql is `insert`
diff --git a/datafusion/core/tests/sqllogictests/src/normalize.rs
b/datafusion/core/tests/sqllogictests/src/normalize.rs
index 512de350e..474ec6e85 100644
--- a/datafusion/core/tests/sqllogictests/src/normalize.rs
+++ b/datafusion/core/tests/sqllogictests/src/normalize.rs
@@ -15,61 +15,81 @@
// specific language governing permissions and limitations
// under the License.
-use std::sync::Arc;
+use crate::error::{DFSqlLogicTestError, Result};
+use arrow::{array::ArrayRef, datatypes::DataType, record_batch::RecordBatch};
+use datafusion::error::DataFusionError;
+use sqllogictest::{ColumnType, DBOutput};
-use arrow::{
- array::{
- as_largestring_array, as_string_array, ArrayRef, LargeStringArray,
StringArray,
- },
- datatypes::DataType,
- record_batch::RecordBatch,
-};
+/// Converts `batches` to a DBOutput as expected by sqllogictest.
+///
+/// Assumes empty record batches are a successful statement completion
+///
+pub fn convert_batches(batches: Vec<RecordBatch>) -> Result<DBOutput> {
+ if batches.is_empty() {
+ // DataFusion doesn't report number of rows complete
+ return Ok(DBOutput::StatementComplete(0));
+ }
+
+ let schema = batches[0].schema();
-/// Normalizes the content of a RecordBatch prior to printing.
+    // TODO: report the actual types of the result
+ // https://github.com/apache/arrow-datafusion/issues/4499
+ let types = vec![ColumnType::Any; batches[0].num_columns()];
+
+ let mut rows = vec![];
+ for batch in batches {
+ // Verify schema
+ if schema != batch.schema() {
+ return
Err(DFSqlLogicTestError::DataFusion(DataFusionError::Internal(
+ format!(
+ "Schema mismatch. Previously had\n{:#?}\n\nGot:\n{:#?}",
+ schema,
+ batch.schema()
+ ),
+ )));
+ }
+ rows.append(&mut convert_batch(batch)?);
+ }
+
+ Ok(DBOutput::Rows { types, rows })
+}
+
+/// Convert a single batch to a `Vec<Vec<String>>` for comparison
+fn convert_batch(batch: RecordBatch) -> Result<Vec<Vec<String>>> {
+ (0..batch.num_rows())
+ .map(|row| {
+ batch
+ .columns()
+ .iter()
+ .map(|col| cell_to_string(col, row))
+ .collect::<Result<Vec<String>>>()
+ })
+ .collect()
+}
+
+/// Normalizes the content of a single cell in RecordBatch prior to printing.
///
/// This is to make the output comparable to the semi-standard .slt format
///
/// Normalizations applied to [NULL Values and empty strings]
///
/// [NULL Values and empty strings]:
https://duckdb.org/dev/sqllogictest/result_verification#null-values-and-empty-strings
-pub fn normalize_batch(batch: RecordBatch) -> RecordBatch {
- let new_columns = batch
- .columns()
- .iter()
- .map(|array| {
- match array.data_type() {
- DataType::Utf8 => {
- let arr: StringArray = as_string_array(array.as_ref())
- .iter()
- .map(normalize_string)
- .collect();
- Arc::new(arr) as ArrayRef
- }
- DataType::LargeUtf8 => {
- let arr: LargeStringArray =
as_largestring_array(array.as_ref())
- .iter()
- .map(normalize_string)
- .collect();
- Arc::new(arr) as ArrayRef
- }
- // todo normalize dictionary values
-
- // no normalization on this type
- _ => array.clone(),
- }
- })
- .collect();
+///
+pub fn cell_to_string(col: &ArrayRef, row: usize) -> Result<String> {
+ // represent any null value with the string "NULL"
+ if !col.is_valid(row) {
+ return Ok("NULL".into());
+ }
- RecordBatch::try_new(batch.schema(), new_columns).expect("creating
normalized batch")
-}
+ // Convert to normal string representation
+ let mut s = arrow::util::display::array_value_to_string(col, row)
+ .map_err(DFSqlLogicTestError::Arrow)?;
-fn normalize_string(v: Option<&str>) -> Option<&str> {
- v.map(|v| {
+    // apply subsequent normalization depending on the column type
+ if matches!(col.data_type(), DataType::Utf8 | DataType::LargeUtf8) &&
s.is_empty() {
// All empty strings are replaced with this value
- if v.is_empty() {
- "(empty)"
- } else {
- v
- }
- })
+ s = "(empty)".to_string();
+ }
+
+ Ok(s)
}
diff --git a/datafusion/core/tests/sqllogictests/test_files/aggregate.slt
b/datafusion/core/tests/sqllogictests/test_files/aggregate.slt
index 26c4b258b..90241e3e8 100644
--- a/datafusion/core/tests/sqllogictests/test_files/aggregate.slt
+++ b/datafusion/core/tests/sqllogictests/test_files/aggregate.slt
@@ -52,7 +52,7 @@ SELECT var_pop(c2) FROM aggregate_test_100
query R
SELECT var_pop(c6) FROM aggregate_test_100
----
-2.615633434202189e37
+26156334342021890000000000000000000000
# csv_query_variance_3
query R
@@ -82,7 +82,7 @@ SELECT stddev_pop(c2) FROM aggregate_test_100
query R
SELECT stddev_pop(c6) FROM aggregate_test_100
----
-5.114326382039172e18
+5114326382039172000
# csv_query_stddev_3
query R
@@ -523,98 +523,98 @@ true
query TIR
SELECT c1, c2, AVG(c3) FROM aggregate_test_100_by_sql GROUP BY CUBE (c1, c2)
ORDER BY c1, c2
----
-a 1 -17.6
-a 2 -15.333333333333334
-a 3 -4.5
-a 4 -32.0
-a 5 -32.0
-a NULL -18.333333333333332
-b 1 31.666666666666668
-b 2 25.5
-b 3 -42.0
-b 4 -44.6
-b 5 -0.2
-b NULL -5.842105263157895
-c 1 47.5
-c 2 -55.57142857142857
-c 3 47.5
-c 4 -10.75
-c 5 12.0
-c NULL -1.3333333333333333
-d 1 -8.142857142857142
-d 2 109.33333333333333
-d 3 41.333333333333336
-d 4 54.0
-d 5 -49.5
-d NULL 25.444444444444443
-e 1 75.66666666666667
-e 2 37.8
-e 3 48.0
-e 4 37.285714285714285
-e 5 -11.0
-e NULL 40.333333333333336
-NULL 1 16.681818181818183
-NULL 2 8.363636363636363
-NULL 3 20.789473684210527
-NULL 4 1.2608695652173914
-NULL 5 -13.857142857142858
+a 1 -17.6
+a 2 -15.333333333333334
+a 3 -4.5
+a 4 -32
+a 5 -32
+a NULL -18.333333333333332
+b 1 31.666666666666668
+b 2 25.5
+b 3 -42
+b 4 -44.6
+b 5 -0.2
+b NULL -5.842105263157895
+c 1 47.5
+c 2 -55.57142857142857
+c 3 47.5
+c 4 -10.75
+c 5 12
+c NULL -1.3333333333333333
+d 1 -8.142857142857142
+d 2 109.33333333333333
+d 3 41.333333333333336
+d 4 54
+d 5 -49.5
+d NULL 25.444444444444443
+e 1 75.66666666666667
+e 2 37.8
+e 3 48
+e 4 37.285714285714285
+e 5 -11
+e NULL 40.333333333333336
+NULL 1 16.681818181818183
+NULL 2 8.363636363636363
+NULL 3 20.789473684210527
+NULL 4 1.2608695652173914
+NULL 5 -13.857142857142858
NULL NULL 7.81
# csv_query_rollup_avg
query TIIR
SELECT c1, c2, c3, AVG(c4) FROM aggregate_test_100_by_sql WHERE c1 IN ('a',
'b', NULL) GROUP BY ROLLUP (c1, c2, c3) ORDER BY c1, c2, c3
----
-a 1 -85 -15154.0
-a 1 -56 8692.0
-a 1 -25 15295.0
-a 1 -5 12636.0
-a 1 83 -14704.0
-a 1 NULL 1353.0
-a 2 -48 -18025.0
-a 2 -43 13080.0
-a 2 45 15673.0
-a 2 NULL 3576.0
-a 3 -72 -11122.0
-a 3 -12 -9168.0
-a 3 13 22338.5
-a 3 14 28162.0
-a 3 17 -22796.0
-a 3 NULL 4958.833333333333
-a 4 -101 11640.0
-a 4 -54 -2376.0
-a 4 -38 20744.0
-a 4 65 -28462.0
-a 4 NULL 386.5
-a 5 -101 -12484.0
-a 5 -31 -12907.0
-a 5 36 -16974.0
-a 5 NULL -14121.666666666666
-a NULL NULL 306.04761904761904
-b 1 12 7652.0
-b 1 29 -18218.0
-b 1 54 -18410.0
-b 1 NULL -9658.666666666666
-b 2 -60 -21739.0
-b 2 31 23127.0
-b 2 63 21456.0
-b 2 68 15874.0
-b 2 NULL 9679.5
-b 3 -101 -13217.0
-b 3 17 14457.0
-b 3 NULL 620.0
-b 4 -117 19316.0
-b 4 -111 -1967.0
-b 4 -59 25286.0
-b 4 17 -28070.0
-b 4 47 20690.0
-b 4 NULL 7051.0
-b 5 -82 22080.0
-b 5 -44 15788.0
-b 5 -5 24896.0
-b 5 62 16337.0
-b 5 68 21576.0
-b 5 NULL 20135.4
-b NULL NULL 7732.315789473684
+a 1 -85 -15154
+a 1 -56 8692
+a 1 -25 15295
+a 1 -5 12636
+a 1 83 -14704
+a 1 NULL 1353
+a 2 -48 -18025
+a 2 -43 13080
+a 2 45 15673
+a 2 NULL 3576
+a 3 -72 -11122
+a 3 -12 -9168
+a 3 13 22338.5
+a 3 14 28162
+a 3 17 -22796
+a 3 NULL 4958.833333333333
+a 4 -101 11640
+a 4 -54 -2376
+a 4 -38 20744
+a 4 65 -28462
+a 4 NULL 386.5
+a 5 -101 -12484
+a 5 -31 -12907
+a 5 36 -16974
+a 5 NULL -14121.666666666666
+a NULL NULL 306.04761904761904
+b 1 12 7652
+b 1 29 -18218
+b 1 54 -18410
+b 1 NULL -9658.666666666666
+b 2 -60 -21739
+b 2 31 23127
+b 2 63 21456
+b 2 68 15874
+b 2 NULL 9679.5
+b 3 -101 -13217
+b 3 17 14457
+b 3 NULL 620
+b 4 -117 19316
+b 4 -111 -1967
+b 4 -59 25286
+b 4 17 -28070
+b 4 47 20690
+b 4 NULL 7051
+b 5 -82 22080
+b 5 -44 15788
+b 5 -5 24896
+b 5 62 16337
+b 5 68 21576
+b 5 NULL 20135.4
+b NULL NULL 7732.315789473684
NULL NULL NULL 3833.525
# csv_query_approx_percentile_cont_with_weight
@@ -885,42 +885,42 @@ select c2, sum(c3) sum_c3, avg(c3) avg_c3, max(c3)
max_c3, min(c3) min_c3, count
query TIIRIII
select c1, c2, sum(c3) sum_c3, avg(c3) avg_c3, max(c3) max_c3, min(c3) min_c3,
count(c3) count_c3 from aggregate_test_100 group by CUBE (c1,c2) order by c1, c2
----
-a 1 -88 -17.6 83 -85 5
-a 2 -46 -15.333333333333334 45 -48 3
-a 3 -27 -4.5 17 -72 6
-a 4 -128 -32.0 65 -101 4
-a 5 -96 -32.0 36 -101 3
-a NULL -385 -18.333333333333332 83 -101 21
-b 1 95 31.666666666666668 54 12 3
-b 2 102 25.5 68 -60 4
-b 3 -84 -42.0 17 -101 2
-b 4 -223 -44.6 47 -117 5
-b 5 -1 -0.2 68 -82 5
-b NULL -111 -5.842105263157895 68 -117 19
-c 1 190 47.5 103 -24 4
-c 2 -389 -55.57142857142857 29 -117 7
-c 3 190 47.5 97 -2 4
-c 4 -43 -10.75 123 -90 4
-c 5 24 12.0 118 -94 2
-c NULL -28 -1.3333333333333333 123 -117 21
-d 1 -57 -8.142857142857142 125 -99 7
-d 2 328 109.33333333333333 122 93 3
-d 3 124 41.333333333333336 123 -76 3
-d 4 162 54.0 102 5 3
-d 5 -99 -49.5 -40 -59 2
-d NULL 458 25.444444444444443 125 -99 18
-e 1 227 75.66666666666667 120 36 3
-e 2 189 37.8 97 -61 5
-e 3 192 48.0 112 -95 4
-e 4 261 37.285714285714285 97 -56 7
-e 5 -22 -11.0 64 -86 2
-e NULL 847 40.333333333333336 120 -95 21
-NULL 1 367 16.681818181818183 125 -99 22
-NULL 2 184 8.363636363636363 122 -117 22
-NULL 3 395 20.789473684210527 123 -101 19
-NULL 4 29 1.2608695652173914 123 -117 23
-NULL 5 -194 -13.857142857142858 118 -101 14
-NULL NULL 781 7.81 125 -117 100
+a 1 -88 -17.6 83 -85 5
+a 2 -46 -15.333333333333334 45 -48 3
+a 3 -27 -4.5 17 -72 6
+a 4 -128 -32 65 -101 4
+a 5 -96 -32 36 -101 3
+a NULL -385 -18.333333333333332 83 -101 21
+b 1 95 31.666666666666668 54 12 3
+b 2 102 25.5 68 -60 4
+b 3 -84 -42 17 -101 2
+b 4 -223 -44.6 47 -117 5
+b 5 -1 -0.2 68 -82 5
+b NULL -111 -5.842105263157895 68 -117 19
+c 1 190 47.5 103 -24 4
+c 2 -389 -55.57142857142857 29 -117 7
+c 3 190 47.5 97 -2 4
+c 4 -43 -10.75 123 -90 4
+c 5 24 12 118 -94 2
+c NULL -28 -1.3333333333333333 123 -117 21
+d 1 -57 -8.142857142857142 125 -99 7
+d 2 328 109.33333333333333 122 93 3
+d 3 124 41.333333333333336 123 -76 3
+d 4 162 54 102 5 3
+d 5 -99 -49.5 -40 -59 2
+d NULL 458 25.444444444444443 125 -99 18
+e 1 227 75.66666666666667 120 36 3
+e 2 189 37.8 97 -61 5
+e 3 192 48 112 -95 4
+e 4 261 37.285714285714285 97 -56 7
+e 5 -22 -11 64 -86 2
+e NULL 847 40.333333333333336 120 -95 21
+NULL 1 367 16.681818181818183 125 -99 22
+NULL 2 184 8.363636363636363 122 -117 22
+NULL 3 395 20.789473684210527 123 -101 19
+NULL 4 29 1.2608695652173914 123 -117 23
+NULL 5 -194 -13.857142857142858 118 -101 14
+NULL NULL 781 7.81 125 -117 100
# csv_query_array_agg_distinct
# query T
diff --git a/datafusion/core/tests/sqllogictests/test_files/ddl.slt
b/datafusion/core/tests/sqllogictests/test_files/ddl.slt
index 8af2a9dad..7445157ee 100644
--- a/datafusion/core/tests/sqllogictests/test_files/ddl.slt
+++ b/datafusion/core/tests/sqllogictests/test_files/ddl.slt
@@ -212,7 +212,7 @@ CREATE TABLE my_table AS SELECT * FROM aggregate_simple
query II rowsort
SELECT * FROM my_table order by c1 LIMIT 1
----
-0.00001 1.0e-12 true
+0.00001 0.000000000001 true
statement ok
DROP TABLE my_table;
@@ -237,7 +237,7 @@ CREATE TABLE my_table(c1 float, c2 double, c3 boolean, c4
varchar) AS SELECT *,c
query II rowsort
SELECT * FROM my_table order by c1 LIMIT 1
----
-0.00001 1.0e-12 true 1
+0.00001 0.000000000001 true 1
statement ok
DROP TABLE my_table;
@@ -257,7 +257,7 @@ CREATE TABLE my_table(c1 int, c2 float, c3 varchar) AS
VALUES(1, 2, 'hello')
query II rowsort
SELECT * FROM my_table;
----
-1 2.0 hello
+1 2 hello
statement ok
DROP TABLE my_table;
@@ -343,8 +343,8 @@ ts TIMESTAMP
query CC
SELECT * from csv_with_timestamps
----
-Jorge 2018-12-13T12:12:10.011000000
-Andrew 2018-11-13T17:11:10.011000000
+Jorge 2018-12-13T12:12:10.011
+Andrew 2018-11-13T17:11:10.011
statement ok
DROP TABLE csv_with_timestamps
@@ -369,8 +369,8 @@ LOCATION 'tests/data/partitioned_table';
query CCC
SELECT * from csv_with_timestamps where c_date='2018-11-13'
----
-Jorge 2018-12-13T12:12:10.011000000 2018-11-13
-Andrew 2018-11-13T17:11:10.011000000 2018-11-13
+Jorge 2018-12-13T12:12:10.011 2018-11-13
+Andrew 2018-11-13T17:11:10.011 2018-11-13
statement ok
DROP TABLE csv_with_timestamps
@@ -409,7 +409,7 @@ CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV WITH
HEADER ROW DELIMITER '
query CCC
SELECT * FROM aggregate_simple order by c1 LIMIT 1;
----
-0.00001 1.0e-12 true
+0.00001 0.000000000001 true
statement ok
DROP TABLE aggregate_simple
diff --git a/datafusion/core/tests/sqllogictests/test_files/prepare.slt
b/datafusion/core/tests/sqllogictests/test_files/prepare.slt
index 948a2e3bc..813288e38 100644
--- a/datafusion/core/tests/sqllogictests/test_files/prepare.slt
+++ b/datafusion/core/tests/sqllogictests/test_files/prepare.slt
@@ -25,7 +25,7 @@ create table person (id int, first_name varchar, last_name
varchar, age int, sta
query C rowsort
select * from person;
----
-1 jane smith 20 MA 100000.45 2000-11-12T00:00:00.000000000 99
+1 jane smith 20 MA 100000.45 2000-11-12T00:00:00 99
# Error due to syntax and semantic violation
@@ -80,4 +80,3 @@ PREPARE my_plan(INT, DOUBLE, DOUBLE, DOUBLE) AS SELECT id,
SUM(age) FROM person
statement error
PREPARE my_plan(STRING, STRING) AS SELECT * FROM (VALUES(1, $1), (2, $2)) AS t
(num, letter);
-
diff --git a/datafusion/core/tests/sqllogictests/test_files/timestamps.slt
b/datafusion/core/tests/sqllogictests/test_files/timestamps.slt
index f4d627df4..56c6690be 100644
--- a/datafusion/core/tests/sqllogictests/test_files/timestamps.slt
+++ b/datafusion/core/tests/sqllogictests/test_files/timestamps.slt
@@ -25,42 +25,42 @@ create table foo (val int, ts timestamp) as values (1,
'2000-01-01T00:00:00'::ti
query C rowsort
select * from foo;
----
-1 2000-01-01T00:00:00.000000000
-2 2000-02-01T00:00:00.000000000
-3 2000-03-01T00:00:00.000000000
+1 2000-01-01T00:00:00
+2 2000-02-01T00:00:00
+3 2000-03-01T00:00:00
# Test that we can compare a timestamp to a casted string
query C rowsort
select * from foo where ts > '2000-01-01T00:00:00'::timestamp;
----
-2 2000-02-01T00:00:00.000000000
-3 2000-03-01T00:00:00.000000000
+2 2000-02-01T00:00:00
+3 2000-03-01T00:00:00
# Test that we can compare a timestamp to a string and it will be coerced
query C rowsort
select * from foo where ts > '2000-01-01T00:00:00';
----
-2 2000-02-01T00:00:00.000000000
-3 2000-03-01T00:00:00.000000000
+2 2000-02-01T00:00:00
+3 2000-03-01T00:00:00
query C rowsort
select * from foo where ts < '2000-02-01T00:00:00';
----
-1 2000-01-01T00:00:00.000000000
+1 2000-01-01T00:00:00
query C rowsort
select * from foo where ts <= '2000-02-01T00:00:00';
----
-1 2000-01-01T00:00:00.000000000
-2 2000-02-01T00:00:00.000000000
+1 2000-01-01T00:00:00
+2 2000-02-01T00:00:00
query C rowsort
select * from foo where ts = '2000-02-01T00:00:00';
----
-2 2000-02-01T00:00:00.000000000
+2 2000-02-01T00:00:00
query C rowsort
select * from foo where ts != '2000-02-01T00:00:00';
----
-1 2000-01-01T00:00:00.000000000
-3 2000-03-01T00:00:00.000000000
+1 2000-01-01T00:00:00
+3 2000-03-01T00:00:00