This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new ab88220e90 Allow declaring partition columns in `PARTITION BY` clause, 
backwards compatible (#9599)
ab88220e90 is described below

commit ab88220e900f1ba8ec3c65a1e4ae53b3feb2a2af
Author: Mohamed Abdeen <[email protected]>
AuthorDate: Sat Mar 30 22:14:43 2024 +0200

    Allow declaring partition columns in `PARTITION BY` clause, backwards 
compatible (#9599)
    
    * Draft allow both syntaxes
    
    * suppress unused code error
    
    * prevent constraints in partition clauses
    
    * fix clippy
    
    * More tests
    
    * comment and prevent constraints on partition columns
    
    * trailing whitespaces
    
    * End-to-End test of new Hive syntax
    
    ---------
    
    Co-authored-by: Mohamed Abdeen <[email protected]>
---
 datafusion/sql/src/parser.rs                       | 59 ++++++++++++++++++++--
 .../test_files/create_external_table.slt           | 10 +++-
 .../sqllogictest/test_files/insert_to_external.slt | 36 ++++++++++---
 3 files changed, 94 insertions(+), 11 deletions(-)

diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs
index c585917a1e..67fa1325ee 100644
--- a/datafusion/sql/src/parser.rs
+++ b/datafusion/sql/src/parser.rs
@@ -175,7 +175,7 @@ pub(crate) type LexOrdering = Vec<OrderByExpr>;
 /// [ WITH HEADER ROW ]
 /// [ DELIMITER <char> ]
 /// [ COMPRESSION TYPE <GZIP | BZIP2 | XZ | ZSTD> ]
-/// [ PARTITIONED BY (<column list>) ]
+/// [ PARTITIONED BY (<column_definition list> | <column list>) ]
 /// [ WITH ORDER (<ordered column list>)
 /// [ OPTIONS (<key_value_list>) ]
 /// LOCATION <literal>
@@ -693,7 +693,7 @@ impl<'a> DFParser<'a> {
             self.parser
                 .parse_keywords(&[Keyword::IF, Keyword::NOT, Keyword::EXISTS]);
         let table_name = self.parser.parse_object_name(true)?;
-        let (columns, constraints) = self.parse_columns()?;
+        let (mut columns, constraints) = self.parse_columns()?;
 
         #[derive(Default)]
         struct Builder {
@@ -754,7 +754,30 @@ impl<'a> DFParser<'a> {
                     Keyword::PARTITIONED => {
                         self.parser.expect_keyword(Keyword::BY)?;
                         ensure_not_set(&builder.table_partition_cols, 
"PARTITIONED BY")?;
-                        builder.table_partition_cols = 
Some(self.parse_partitions()?);
+                        // Expects either list of column names (col_name [, 
col_name]*)
+                        // or list of column definitions (col_name datatype [, 
col_name datatype]* )
+                        // use the token after the name to decide which 
parsing rule to use
+                        // Note that mixing both names and definitions is not 
allowed
+                        let peeked = self.parser.peek_nth_token(2);
+                        if peeked == Token::Comma || peeked == Token::RParen {
+                            // list of column names
+                            builder.table_partition_cols = 
Some(self.parse_partitions()?)
+                        } else {
+                            // list of column defs
+                            let (cols, cons) = self.parse_columns()?;
+                            builder.table_partition_cols = Some(
+                                cols.iter().map(|col| 
col.name.to_string()).collect(),
+                            );
+
+                            columns.extend(cols);
+
+                            if !cons.is_empty() {
+                                return Err(ParserError::ParserError(
+                                    "Constraints on Partition Columns are not 
supported"
+                                        .to_string(),
+                                ));
+                            }
+                        }
                     }
                     Keyword::OPTIONS => {
                         ensure_not_set(&builder.options, "OPTIONS")?;
@@ -1167,9 +1190,37 @@ mod tests {
         });
         expect_parse_ok(sql, expected)?;
 
-        // Error cases: partition column does not support type
+        // positive case: column definiton allowed in 'partition by' clause
         let sql =
             "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV PARTITIONED BY (p1 
int) LOCATION 'foo.csv'";
+        let expected = Statement::CreateExternalTable(CreateExternalTable {
+            name: "t".into(),
+            columns: vec![
+                make_column_def("c1", DataType::Int(None)),
+                make_column_def("p1", DataType::Int(None)),
+            ],
+            file_type: "CSV".to_string(),
+            has_header: false,
+            delimiter: ',',
+            location: "foo.csv".into(),
+            table_partition_cols: vec!["p1".to_string()],
+            order_exprs: vec![],
+            if_not_exists: false,
+            file_compression_type: UNCOMPRESSED,
+            unbounded: false,
+            options: HashMap::new(),
+            constraints: vec![],
+        });
+        expect_parse_ok(sql, expected)?;
+
+        // negative case: mixed column defs and column names in `PARTITIONED 
BY` clause
+        let sql =
+            "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV PARTITIONED BY (p1 
int, c1) LOCATION 'foo.csv'";
+        expect_parse_error(sql, "sql parser error: Expected a data type name, 
found: )");
+
+        // negative case: mixed column defs and column names in `PARTITIONED 
BY` clause
+        let sql =
+            "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV PARTITIONED BY (c1, 
p1 int) LOCATION 'foo.csv'";
         expect_parse_error(sql, "sql parser error: Expected ',' or ')' after 
partition definition, found: int");
 
         // positive case: additional options (one entry) can be specified
diff --git a/datafusion/sqllogictest/test_files/create_external_table.slt 
b/datafusion/sqllogictest/test_files/create_external_table.slt
index c4a26a5e22..a200217af6 100644
--- a/datafusion/sqllogictest/test_files/create_external_table.slt
+++ b/datafusion/sqllogictest/test_files/create_external_table.slt
@@ -100,9 +100,17 @@ CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH 
LOCATION 'foo.csv';
 statement error DataFusion error: SQL error: ParserError\("Unexpected token 
FOOBAR"\)
 CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV FOOBAR BARBAR BARFOO LOCATION 
'foo.csv';
 
+# Missing partition column
+statement error DataFusion error: Arrow error: Schema error: Unable to get 
field named "c2". Valid fields: \["c1"\]
+create EXTERNAL TABLE t(c1 int) STORED AS CSV PARTITIONED BY (c2) LOCATION 
'foo.csv'
+
+# Duplicate Column in `PARTITIONED BY` clause
+statement error DataFusion error: Schema error: Schema contains duplicate 
unqualified field name c1
+create EXTERNAL TABLE t(c1 int, c2 int) STORED AS CSV PARTITIONED BY (c1 int) 
LOCATION 'foo.csv'
+
 # Conflicting options
 statement error DataFusion error: Invalid or Unsupported Configuration: Config 
value "column_index_truncate_length" not found on CsvOptions
 CREATE EXTERNAL TABLE csv_table (column1 int)
 STORED AS CSV
 LOCATION 'foo.csv'
-OPTIONS ('format.delimiter' ';', 'format.column_index_truncate_length' '123')
+OPTIONS ('format.delimiter' ';', 'format.column_index_truncate_length' '123')
\ No newline at end of file
diff --git a/datafusion/sqllogictest/test_files/insert_to_external.slt 
b/datafusion/sqllogictest/test_files/insert_to_external.slt
index dc60bafaa8..4b9af3bdea 100644
--- a/datafusion/sqllogictest/test_files/insert_to_external.slt
+++ b/datafusion/sqllogictest/test_files/insert_to_external.slt
@@ -42,7 +42,7 @@ LOCATION '../../testing/data/csv/aggregate_test_100.csv'
 
 
 statement ok
-create table dictionary_encoded_values as values 
+create table dictionary_encoded_values as values
 ('a', arrow_cast('foo', 'Dictionary(Int32, Utf8)')), ('b', arrow_cast('bar', 
'Dictionary(Int32, Utf8)'));
 
 query TTT
@@ -55,13 +55,13 @@ statement ok
 CREATE EXTERNAL TABLE dictionary_encoded_parquet_partitioned(
   a varchar,
   b varchar,
-) 
+)
 STORED AS parquet
 LOCATION 'test_files/scratch/insert_to_external/parquet_types_partitioned/'
 PARTITIONED BY (b);
 
 query TT
-insert into dictionary_encoded_parquet_partitioned 
+insert into dictionary_encoded_parquet_partitioned
 select * from dictionary_encoded_values
 ----
 2
@@ -76,13 +76,13 @@ statement ok
 CREATE EXTERNAL TABLE dictionary_encoded_arrow_partitioned(
   a varchar,
   b varchar,
-) 
+)
 STORED AS arrow
 LOCATION 'test_files/scratch/insert_to_external/arrow_dict_partitioned/'
 PARTITIONED BY (b);
 
 query TT
-insert into dictionary_encoded_arrow_partitioned 
+insert into dictionary_encoded_arrow_partitioned
 select * from dictionary_encoded_values
 ----
 2
@@ -90,7 +90,7 @@ select * from dictionary_encoded_values
 statement ok
 CREATE EXTERNAL TABLE dictionary_encoded_arrow_test_readback(
   a varchar,
-) 
+)
 STORED AS arrow
 LOCATION 'test_files/scratch/insert_to_external/arrow_dict_partitioned/b=bar/';
 
@@ -185,6 +185,30 @@ select * from partitioned_insert_test_verify;
 1
 2
 
+statement ok
+CREATE EXTERNAL TABLE
+partitioned_insert_test_hive(c bigint)
+STORED AS csv
+LOCATION 'test_files/scratch/insert_to_external/insert_to_partitioned'
+PARTITIONED BY (a string, b string);
+
+query ITT
+INSERT INTO partitioned_insert_test_hive VALUES (3,30,300);
+----
+1
+
+query ITT
+SELECT * FROM partitioned_insert_test_hive order by a,b,c;
+----
+1 10 100
+1 10 200
+1 20 100
+2 20 100
+1 20 200
+2 20 200
+3 30 300
+
+
 statement ok
 CREATE EXTERNAL TABLE
 partitioned_insert_test_json(a string, b string)

Reply via email to