iffyio commented on code in PR #1759: URL: https://github.com/apache/datafusion-sqlparser-rs/pull/1759#discussion_r1992171898
########## src/ast/query.rs: ########## @@ -2407,6 +2413,98 @@ impl fmt::Display for OffsetRows { } } +#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))] +pub enum PipeOperator { + Limit { + expr: Expr, + offset: Option<Expr>, + }, + Where { + expr: Expr, + }, + OrderBy { + exprs: Vec<OrderByExpr>, + }, + Select { + exprs: Vec<SelectItem>, + }, + Extend { Review Comment: it would be nice to include a short example doc string for the variants and where possible links to the documentation, like here for extended for example https://cloud.google.com/bigquery/docs/pipe-syntax-guide#extend-pipe-operator ########## src/ast/query.rs: ########## @@ -2407,6 +2413,98 @@ impl fmt::Display for OffsetRows { } } +#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))] +pub enum PipeOperator { Review Comment: Can we add a description and link to the docs https://cloud.google.com/bigquery/docs/pipe-syntax-guide#basic_syntax ########## src/parser/mod.rs: ########## @@ -10319,6 +10321,104 @@ impl<'a> Parser<'a> { None }; + let mut pipe_operators = Vec::new(); + + // Syntax from "SQL Has Problems. We Can Fix Them: Pipe Syntax In SQL" + // https://storage.googleapis.com/gweb-research2023-media/pubtools/1004848.pdf + while self.consume_token(&Token::VerticalBarRightAngleBracket) { Review Comment: can we move the logic to a dedicated function like `parse_pipe_operators()`? ########## tests/sqlparser_common.rs: ########## @@ -14654,3 +14662,71 @@ fn parse_set_names() { dialects.verified_stmt("SET NAMES 'utf8'"); dialects.verified_stmt("SET NAMES UTF8 COLLATE bogus"); } + +#[test] +fn parse_pipeline_operator() { + let dialects = all_dialects_where(|d| d.supports_pipe_operator()); + + // select pipe operator + dialects.verified_stmt("SELECT * FROM users |> SELECT id"); + dialects.verified_stmt("SELECT * FROM users |> SELECT id, name"); + dialects.verified_query_with_canonical( + "SELECT * FROM users |> SELECT id user_id", + "SELECT * FROM users |> SELECT id AS user_id", + ); + dialects.verified_stmt("SELECT * FROM users |> SELECT id AS user_id"); + + // extend pipe operator + dialects.verified_stmt("SELECT * FROM users |> EXTEND id + 1 AS new_id"); + dialects.verified_stmt("SELECT * FROM users |> EXTEND id AS new_id, name AS new_name"); + dialects.verified_query_with_canonical( + "SELECT * FROM users |> EXTEND id user_id", + "SELECT * FROM users |> EXTEND id AS user_id", + ); + + // set pipe operator + dialects.verified_stmt("SELECT * FROM users |> SET id = id + 1"); + dialects.verified_stmt("SELECT * FROM users |> SET id = id + 1, name = name + ' Doe'"); + + // drop pipe operator + dialects.verified_stmt("SELECT * FROM users |> DROP id"); + dialects.verified_stmt("SELECT * FROM users |> DROP id, name"); + + // as pipe operator + dialects.verified_stmt("SELECT * FROM users |> AS new_users"); + + // limit pipe operator + dialects.verified_stmt("SELECT * FROM users |> LIMIT 10"); + dialects.verified_stmt("SELECT * FROM users |> LIMIT 10 OFFSET 5"); + dialects.verified_stmt("SELECT * FROM users |> LIMIT 10 |> LIMIT 5"); + dialects.verified_stmt("SELECT * FROM users |> LIMIT 10 |> WHERE true"); + + // where pipe operator + dialects.verified_stmt("SELECT * FROM users |> WHERE id = 1"); + dialects.verified_stmt("SELECT * FROM users |> WHERE id = 1 AND name = 'John'"); + dialects.verified_stmt("SELECT * FROM users |> WHERE id = 1 OR name = 'John'"); + + // aggregate pipe operator full table + dialects.verified_stmt("SELECT * FROM users |> AGGREGATE COUNT(*)"); + dialects.verified_query_with_canonical( + "SELECT * FROM users |> AGGREGATE COUNT(*) total_users", + "SELECT * FROM users |> AGGREGATE COUNT(*) AS total_users", + ); + dialects.verified_stmt("SELECT * FROM users |> AGGREGATE COUNT(*) AS total_users"); + dialects.verified_stmt("SELECT * FROM users |> AGGREGATE COUNT(*), MIN(id)"); + + // aggregate pipe opeprator with grouping + dialects.verified_stmt( + "SELECT * FROM users |> AGGREGATE SUM(o_totalprice) AS price, COUNT(*) AS cnt GROUP BY EXTRACT(YEAR FROM o_orderdate) AS year", + ); + dialects.verified_stmt( + "SELECT * FROM users |> AGGREGATE GROUP BY EXTRACT(YEAR FROM o_orderdate) AS year", + ); + dialects + .verified_stmt("SELECT * FROM users |> AGGREGATE GROUP BY EXTRACT(YEAR FROM o_orderdate)"); + + // order by pipe operator + dialects.verified_stmt("SELECT * FROM users |> ORDER BY id ASC"); + dialects.verified_stmt("SELECT * FROM users |> ORDER BY id DESC"); + dialects.verified_stmt("SELECT * FROM users |> ORDER BY id DESC, name ASC"); +} Review Comment: can we include a test case with more than one pipe operator? e.g. `SELECT * FROM T |> a |> b |> c` ########## src/parser/mod.rs: ########## @@ -10319,6 +10321,104 @@ impl<'a> Parser<'a> { None }; + let mut pipe_operators = Vec::new(); + + // Syntax from "SQL Has Problems. We Can Fix Them: Pipe Syntax In SQL" + // https://storage.googleapis.com/gweb-research2023-media/pubtools/1004848.pdf Review Comment: ```suggestion ``` ########## src/parser/mod.rs: ########## @@ -10319,6 +10321,104 @@ impl<'a> Parser<'a> { None }; + let mut pipe_operators = Vec::new(); + + // Syntax from "SQL Has Problems. We Can Fix Them: Pipe Syntax In SQL" + // https://storage.googleapis.com/gweb-research2023-media/pubtools/1004848.pdf + while self.consume_token(&Token::VerticalBarRightAngleBracket) { + let kw = self.expect_one_of_keywords(&[ + Keyword::SELECT, + Keyword::EXTEND, + Keyword::SET, + Keyword::DROP, + Keyword::AS, + Keyword::WHERE, + Keyword::LIMIT, + Keyword::AGGREGATE, + Keyword::ORDER, + ])?; + match kw { + // SELECT <expr> [[AS] alias], ... + Keyword::SELECT => { + let exprs = self.parse_comma_separated(Parser::parse_select_item)?; + pipe_operators.push(PipeOperator::Select { exprs }) + } + // EXTEND <expr> [[AS] alias], ... + Keyword::EXTEND => { + let exprs = self.parse_comma_separated(Parser::parse_select_item)?; + pipe_operators.push(PipeOperator::Extend { exprs }) + } + // SET <column> = <expression>, ... + Keyword::SET => { + let assignments = self.parse_comma_separated(Parser::parse_assignment)?; + pipe_operators.push(PipeOperator::Set { assignments }) + } + // DROP <column>, ... + Keyword::DROP => { + let columns = self.parse_identifiers()?; + pipe_operators.push(PipeOperator::Drop { columns }) + } + // AS <alias> + Keyword::AS => { + let alias = self.parse_identifier()?; + pipe_operators.push(PipeOperator::Alias { alias }) + } + // WHERE <condition> + Keyword::WHERE => { + let expr = self.parse_expr()?; + pipe_operators.push(PipeOperator::Where { expr }) + } + // LIMIT <n> [OFFSET <m>] + Keyword::LIMIT => { + let expr = self.parse_expr()?; + let offset = if self.parse_keyword(Keyword::OFFSET) { + Some(self.parse_expr()?) + } else { + None + }; + pipe_operators.push(PipeOperator::Limit { expr, offset }) + } + // AGGREGATE <agg_expr> [[AS] alias], ... + // + // and + // + // AGGREGATE [<agg_expr> [[AS] alias], ...] + // GROUP BY <grouping_expr> [AS alias], ... + Keyword::AGGREGATE => { + let full_table_exprs = self.parse_comma_separated0( + |parser| { + let expr = parser.parse_expr()?; + let alias = parser.maybe_parse_select_item_alias()?; + Ok(ExprWithAlias { expr, alias }) + }, + Token::make_keyword(keywords::GROUP), + )?; + + let group_by_exprs = if self.parse_keywords(&[Keyword::GROUP, Keyword::BY]) + { + self.parse_comma_separated(|parser| { + let expr = parser.parse_expr()?; + let alias = parser.maybe_parse_select_item_alias()?; + Ok(ExprWithAlias { expr, alias }) + })? + } else { + vec![] + }; + pipe_operators.push(PipeOperator::Aggregate { + full_table_exprs, + group_by_exprs, + }) + } + // ORDER BY <expr> [ASC|DESC], ... + Keyword::ORDER => { + self.expect_one_of_keywords(&[Keyword::BY])?; + let exprs = self.parse_comma_separated(Parser::parse_order_by_expr)?; + pipe_operators.push(PipeOperator::OrderBy { exprs }) + } + _ => {} Review Comment: This clause look unhandled, for the loop i imagine instead we can something like? ```rust while self.consume_token(&Token::VerticalBarRightAngleBracket) { operators.push(self.parse_pipe_operator()?); } fn parse_pipe_operator() { if self.parse_keyword(SELECT) { // select operator } // ... else { self.expected("...") } } ``` ########## src/parser/mod.rs: ########## @@ -10319,6 +10321,104 @@ impl<'a> Parser<'a> { None }; + let mut pipe_operators = Vec::new(); + + // Syntax from "SQL Has Problems. We Can Fix Them: Pipe Syntax In SQL" + // https://storage.googleapis.com/gweb-research2023-media/pubtools/1004848.pdf + while self.consume_token(&Token::VerticalBarRightAngleBracket) { + let kw = self.expect_one_of_keywords(&[ + Keyword::SELECT, + Keyword::EXTEND, + Keyword::SET, + Keyword::DROP, + Keyword::AS, + Keyword::WHERE, + Keyword::LIMIT, + Keyword::AGGREGATE, + Keyword::ORDER, + ])?; + match kw { + // SELECT <expr> [[AS] alias], ... + Keyword::SELECT => { + let exprs = self.parse_comma_separated(Parser::parse_select_item)?; + pipe_operators.push(PipeOperator::Select { exprs }) + } + // EXTEND <expr> [[AS] alias], ... + Keyword::EXTEND => { + let exprs = self.parse_comma_separated(Parser::parse_select_item)?; + pipe_operators.push(PipeOperator::Extend { exprs }) + } + // SET <column> = <expression>, ... + Keyword::SET => { + let assignments = self.parse_comma_separated(Parser::parse_assignment)?; + pipe_operators.push(PipeOperator::Set { assignments }) + } + // DROP <column>, ... + Keyword::DROP => { + let columns = self.parse_identifiers()?; + pipe_operators.push(PipeOperator::Drop { columns }) + } + // AS <alias> + Keyword::AS => { + let alias = self.parse_identifier()?; + pipe_operators.push(PipeOperator::Alias { alias }) + } + // WHERE <condition> + Keyword::WHERE => { + let expr = self.parse_expr()?; + pipe_operators.push(PipeOperator::Where { expr }) + } + // LIMIT <n> [OFFSET <m>] + Keyword::LIMIT => { + let expr = self.parse_expr()?; + let offset = if self.parse_keyword(Keyword::OFFSET) { + Some(self.parse_expr()?) + } else { + None + }; + pipe_operators.push(PipeOperator::Limit { expr, offset }) + } + // AGGREGATE <agg_expr> [[AS] alias], ... + // + // and + // + // AGGREGATE [<agg_expr> [[AS] alias], ...] + // GROUP BY <grouping_expr> [AS alias], ... Review Comment: Ah so I think these comments would be better placed on in the doc strings for each variant (left similar comment at that site) ########## src/dialect/mod.rs: ########## @@ -508,6 +508,20 @@ pub trait Dialect: Debug + Any { false } + /// Return true if the dialect supports pipe operator. + /// + /// Example: + /// ```sql + /// SELECT * + /// FROM table + /// |> limit 1 + /// ``` + /// + /// See "SQL Has Problems. We Can Fix Them: Pipe Syntax In SQL" https://research.google/pubs/sql-has-problems-we-can-fix-them-pipe-syntax-in-sql/ Review Comment: I think we can link to a concrete syntax like this instead https://cloud.google.com/bigquery/docs/pipe-syntax-guide#basic_syntax The paper wouldn't really help as much in the context of someone working on the codebase or trying to use the library -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org