This is an automated email from the ASF dual-hosted git repository.
github-merge-queue[bot] pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-sqlparser-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 928783db Parser: fix exponential parse time on compound keyword chains
(#2350)
928783db is described below
commit 928783db2b1ef0f002dc271f78af59bb34a86888
Author: Luca Cappelletti <[email protected]>
AuthorDate: Sat Jun 6 08:20:40 2026 +0200
Parser: fix exponential parse time on compound keyword chains (#2350)
---
sqlparser_bench/benches/sqlparser_bench.rs | 26 +++++++++++++++++-
src/parser/mod.rs | 43 ++++++++++++++++++++----------
tests/sqlparser_common.rs | 25 +++++++++++++++++
3 files changed, 79 insertions(+), 15 deletions(-)
diff --git a/sqlparser_bench/benches/sqlparser_bench.rs
b/sqlparser_bench/benches/sqlparser_bench.rs
index 46c20154..7f6bc2ff 100644
--- a/sqlparser_bench/benches/sqlparser_bench.rs
+++ b/sqlparser_bench/benches/sqlparser_bench.rs
@@ -177,11 +177,35 @@ fn parse_compound_chain(c: &mut Criterion) {
group.finish();
}
+/// Benchmark parsing pathological compound chains with a reserved keyword in
+/// field position, like `SELECT x.not-b.not-b...`. The `.not-b` shape used to
+/// cause 2^N work in `parse_compound_expr` because `parse_prefix` descended
+/// into `parse_not` -> `parse_subexpr`, re-walking the remaining chain at
+/// every segment.
+fn parse_compound_keyword_chain(c: &mut Criterion) {
+ let mut group = c.benchmark_group("parse_compound_keyword_chain");
+ let dialect = GenericDialect {};
+
+ for &n in &[5usize, 10, 15] {
+ let body = std::iter::repeat_n(".not-b", n).collect::<String>();
+ let sql = format!("SELECT x{body}");
+
+ group.bench_function(format!("chain_{n}"), |b| {
+ b.iter(|| {
+ let _ = Parser::parse_sql(&dialect,
std::hint::black_box(&sql));
+ });
+ });
+ }
+
+ group.finish();
+}
+
criterion_group!(
benches,
basic_queries,
word_to_ident,
parse_many_identifiers,
- parse_compound_chain
+ parse_compound_chain,
+ parse_compound_keyword_chain
);
criterion_main!(benches);
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index d7dd2803..c897c767 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -2036,20 +2036,35 @@ impl<'a> Parser<'a> {
// recursive `parse_subexpr` would re-walk the rest of the
chain at
// every dot.
_ => {
- let expr = self.maybe_parse(|parser| {
- let expr = parser.parse_prefix()?;
- match &expr {
- Expr::CompoundFieldAccess { .. }
- | Expr::CompoundIdentifier(_)
- | Expr::Identifier(_)
- | Expr::Value(_)
- | Expr::Function(_) => Ok(expr),
- _ => parser.expected_ref(
- "an identifier or value",
- parser.peek_token_ref(),
- ),
- }
- })?;
+ // For a plain `Word` field (not followed by `(`),
skip the
+ // speculative `parse_prefix`. The only result the
validator
+ // below would accept is `Identifier`, which
`parse_identifier`
+ // in the None branch produces directly. This avoids
2^N work
+ // on chains like `.not-b.not-b...` where
`parse_prefix` would
+ // descend into `parse_not` and re-walk the remaining
chain at
+ // every segment.
+ let word_field_no_lparen =
+ matches!(self.peek_token_ref().token,
Token::Word(_))
+ && self.peek_nth_token_ref(1).token !=
Token::LParen;
+
+ let expr = if word_field_no_lparen {
+ None
+ } else {
+ self.maybe_parse(|parser| {
+ let expr = parser.parse_prefix()?;
+ match &expr {
+ Expr::CompoundFieldAccess { .. }
+ | Expr::CompoundIdentifier(_)
+ | Expr::Identifier(_)
+ | Expr::Value(_)
+ | Expr::Function(_) => Ok(expr),
+ _ => parser.expected_ref(
+ "an identifier or value",
+ parser.peek_token_ref(),
+ ),
+ }
+ })?
+ };
match expr {
// If we get back a compound field access or
identifier,
diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs
index 37bba85f..c808fd5c 100644
--- a/tests/sqlparser_common.rs
+++ b/tests/sqlparser_common.rs
@@ -19010,3 +19010,28 @@ fn parse_compound_chain_no_exponential_blowup() {
rx.recv_timeout(Duration::from_secs(5))
.expect("parser should reject this quickly, not loop exponentially");
}
+
+/// Regression test for the 2^N parse-time blowup in `parse_compound_expr` on
+/// chains like `x.not-b.not-b...`. The `NOT` keyword in field position drives
+/// `parse_prefix` -> `parse_not` -> `parse_subexpr`, which re-walks the
+/// remaining chain at every segment and doubles the work. Post-fix the parser
+/// handles 25 segments in well under a millisecond, so the timeout is a hang
+/// guard, not a perf threshold.
+#[test]
+fn parse_compound_keyword_chain_no_exponential_blowup() {
+ use std::sync::mpsc;
+ use std::thread;
+ use std::time::Duration;
+
+ let body: String = std::iter::repeat_n(".not-b", 25).collect();
+ let sql = format!("SELECT x{body}");
+
+ let (tx, rx) = mpsc::channel();
+ thread::spawn(move || {
+ let _ = Parser::parse_sql(&GenericDialect {}, &sql);
+ let _ = tx.send(());
+ });
+
+ rx.recv_timeout(Duration::from_secs(5))
+ .expect("parser should handle this quickly, not loop exponentially");
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]