This is an automated email from the ASF dual-hosted git repository.

github-merge-queue[bot] pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-sqlparser-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 928783db Parser: fix exponential parse time on compound keyword chains 
(#2350)
928783db is described below

commit 928783db2b1ef0f002dc271f78af59bb34a86888
Author: Luca Cappelletti <[email protected]>
AuthorDate: Sat Jun 6 08:20:40 2026 +0200

    Parser: fix exponential parse time on compound keyword chains (#2350)
---
 sqlparser_bench/benches/sqlparser_bench.rs | 26 +++++++++++++++++-
 src/parser/mod.rs                          | 43 ++++++++++++++++++++----------
 tests/sqlparser_common.rs                  | 25 +++++++++++++++++
 3 files changed, 79 insertions(+), 15 deletions(-)

diff --git a/sqlparser_bench/benches/sqlparser_bench.rs 
b/sqlparser_bench/benches/sqlparser_bench.rs
index 46c20154..7f6bc2ff 100644
--- a/sqlparser_bench/benches/sqlparser_bench.rs
+++ b/sqlparser_bench/benches/sqlparser_bench.rs
@@ -177,11 +177,35 @@ fn parse_compound_chain(c: &mut Criterion) {
     group.finish();
 }
 
+/// Benchmark parsing pathological compound chains with a reserved keyword in
+/// field position, like `SELECT x.not-b.not-b...`. The `.not-b` shape used to
+/// cause 2^N work in `parse_compound_expr` because `parse_prefix` descended
+/// into `parse_not` -> `parse_subexpr`, re-walking the remaining chain at
+/// every segment.
+fn parse_compound_keyword_chain(c: &mut Criterion) {
+    let mut group = c.benchmark_group("parse_compound_keyword_chain");
+    let dialect = GenericDialect {};
+
+    for &n in &[5usize, 10, 15] {
+        let body = std::iter::repeat_n(".not-b", n).collect::<String>();
+        let sql = format!("SELECT x{body}");
+
+        group.bench_function(format!("chain_{n}"), |b| {
+            b.iter(|| {
+                let _ = Parser::parse_sql(&dialect, 
std::hint::black_box(&sql));
+            });
+        });
+    }
+
+    group.finish();
+}
+
 criterion_group!(
     benches,
     basic_queries,
     word_to_ident,
     parse_many_identifiers,
-    parse_compound_chain
+    parse_compound_chain,
+    parse_compound_keyword_chain
 );
 criterion_main!(benches);
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index d7dd2803..c897c767 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -2036,20 +2036,35 @@ impl<'a> Parser<'a> {
                     // recursive `parse_subexpr` would re-walk the rest of the 
chain at
                     // every dot.
                     _ => {
-                        let expr = self.maybe_parse(|parser| {
-                            let expr = parser.parse_prefix()?;
-                            match &expr {
-                                Expr::CompoundFieldAccess { .. }
-                                | Expr::CompoundIdentifier(_)
-                                | Expr::Identifier(_)
-                                | Expr::Value(_)
-                                | Expr::Function(_) => Ok(expr),
-                                _ => parser.expected_ref(
-                                    "an identifier or value",
-                                    parser.peek_token_ref(),
-                                ),
-                            }
-                        })?;
+                        // For a plain `Word` field (not followed by `(`), 
skip the
+                        // speculative `parse_prefix`. The only result the 
validator
+                        // below would accept is `Identifier`, which 
`parse_identifier`
+                        // in the None branch produces directly. This avoids 
2^N work
+                        // on chains like `.not-b.not-b...` where 
`parse_prefix` would
+                        // descend into `parse_not` and re-walk the remaining 
chain at
+                        // every segment.
+                        let word_field_no_lparen =
+                            matches!(self.peek_token_ref().token, 
Token::Word(_))
+                                && self.peek_nth_token_ref(1).token != 
Token::LParen;
+
+                        let expr = if word_field_no_lparen {
+                            None
+                        } else {
+                            self.maybe_parse(|parser| {
+                                let expr = parser.parse_prefix()?;
+                                match &expr {
+                                    Expr::CompoundFieldAccess { .. }
+                                    | Expr::CompoundIdentifier(_)
+                                    | Expr::Identifier(_)
+                                    | Expr::Value(_)
+                                    | Expr::Function(_) => Ok(expr),
+                                    _ => parser.expected_ref(
+                                        "an identifier or value",
+                                        parser.peek_token_ref(),
+                                    ),
+                                }
+                            })?
+                        };
 
                         match expr {
                             // If we get back a compound field access or 
identifier,
diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs
index 37bba85f..c808fd5c 100644
--- a/tests/sqlparser_common.rs
+++ b/tests/sqlparser_common.rs
@@ -19010,3 +19010,28 @@ fn parse_compound_chain_no_exponential_blowup() {
     rx.recv_timeout(Duration::from_secs(5))
         .expect("parser should reject this quickly, not loop exponentially");
 }
+
+/// Regression test for the 2^N parse-time blowup in `parse_compound_expr` on
+/// chains like `x.not-b.not-b...`. The `NOT` keyword in field position drives
+/// `parse_prefix` -> `parse_not` -> `parse_subexpr`, which re-walks the
+/// remaining chain at every segment and doubles the work. Post-fix the parser
+/// handles 25 segments in well under a millisecond, so the timeout is a hang
+/// guard, not a perf threshold.
+#[test]
+fn parse_compound_keyword_chain_no_exponential_blowup() {
+    use std::sync::mpsc;
+    use std::thread;
+    use std::time::Duration;
+
+    let body: String = std::iter::repeat_n(".not-b", 25).collect();
+    let sql = format!("SELECT x{body}");
+
+    let (tx, rx) = mpsc::channel();
+    thread::spawn(move || {
+        let _ = Parser::parse_sql(&GenericDialect {}, &sql);
+        let _ = tx.send(());
+    });
+
+    rx.recv_timeout(Duration::from_secs(5))
+        .expect("parser should handle this quickly, not loop exponentially");
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to