This is an automated email from the ASF dual-hosted git repository.
github-merge-queue[bot] pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-sqlparser-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 20b9849e Parser: fix exponential parse time on speculative prefix
parsing (#2352)
20b9849e is described below
commit 20b9849e71cadb7b83b49133999b6d22f1320328
Author: Luca Cappelletti <[email protected]>
AuthorDate: Sat Jun 6 08:41:35 2026 +0200
Parser: fix exponential parse time on speculative prefix parsing (#2352)
---
sqlparser_bench/benches/sqlparser_bench.rs | 51 ++++++++++++++++++-
src/parser/mod.rs | 78 +++++++++++++++++++++++++++++-
tests/sqlparser_common.rs | 64 +++++++++++++++++++++++-
3 files changed, 189 insertions(+), 4 deletions(-)
diff --git a/sqlparser_bench/benches/sqlparser_bench.rs
b/sqlparser_bench/benches/sqlparser_bench.rs
index 7f6bc2ff..a3c39bc5 100644
--- a/sqlparser_bench/benches/sqlparser_bench.rs
+++ b/sqlparser_bench/benches/sqlparser_bench.rs
@@ -16,7 +16,7 @@
// under the License.
use criterion::{criterion_group, criterion_main, Criterion};
-use sqlparser::dialect::GenericDialect;
+use sqlparser::dialect::{GenericDialect, PostgreSqlDialect, SQLiteDialect};
use sqlparser::keywords::Keyword;
use sqlparser::parser::Parser;
use sqlparser::tokenizer::{Span, Word};
@@ -200,12 +200,59 @@ fn parse_compound_keyword_chain(c: &mut Criterion) {
group.finish();
}
+/// Benchmark parsing pathological `IF(<keyword-fn>(<keyword-fn>(...x` chains
+/// that previously caused 2^N work in `parse_prefix`. Each nested
+/// `current_time(` segment used to be explored twice at every level (once via
+/// the speculative reserved-word arm, once via the unreserved-word fallback),
+/// doubling work per level. Post-fix the cost is linear in chain length.
+fn parse_prefix_keyword_call_chain(c: &mut Criterion) {
+ let mut group = c.benchmark_group("parse_prefix_keyword_call_chain");
+ let dialect = PostgreSqlDialect {};
+
+ for &n in &[10usize, 20, 30] {
+ let sql = String::from("if(") + &"current_time(".repeat(n) + "x";
+
+ group.bench_function(format!("chain_{n}"), |b| {
+ b.iter(|| {
+ let _ = Parser::parse_sql(&dialect,
std::hint::black_box(&sql));
+ });
+ });
+ }
+
+ group.finish();
+}
+
+/// Benchmark parsing pathological `case-case-case-...c` chains that
+/// previously caused 2^N work in `parse_prefix`. Each `case` token used to
+/// trigger a speculative `parse_case_expr` that recursively descends the
+/// chain, but the unreserved-word fallback returns `Identifier(case)` so the
+/// overall `parse_prefix` succeeds and the failure cache never fires.
+/// Post-fix the per-arm cache short-circuits the speculative descent.
+fn parse_prefix_case_chain(c: &mut Criterion) {
+ let mut group = c.benchmark_group("parse_prefix_case_chain");
+ let dialect = SQLiteDialect {};
+
+ for &n in &[10usize, 20, 30] {
+ let sql = "case\t-".repeat(n) + "c";
+
+ group.bench_function(format!("chain_{n}"), |b| {
+ b.iter(|| {
+ let _ = Parser::parse_sql(&dialect,
std::hint::black_box(&sql));
+ });
+ });
+ }
+
+ group.finish();
+}
+
criterion_group!(
benches,
basic_queries,
word_to_ident,
parse_many_identifiers,
parse_compound_chain,
- parse_compound_keyword_chain
+ parse_compound_keyword_chain,
+ parse_prefix_keyword_call_chain,
+ parse_prefix_case_chain
);
criterion_main!(benches);
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index c897c767..006a0c6f 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -15,6 +15,7 @@
#[cfg(not(feature = "std"))]
use alloc::{
boxed::Box,
+ collections::BTreeMap,
format,
string::{String, ToString},
vec,
@@ -24,6 +25,9 @@ use core::{
fmt::{self, Display},
str::FromStr,
};
+#[cfg(feature = "std")]
+use std::collections::BTreeMap;
+
use helpers::attached_token::AttachedToken;
use log::debug;
@@ -359,6 +363,29 @@ pub struct Parser<'a> {
options: ParserOptions,
/// Ensures the stack does not overflow by limiting recursion depth.
recursion_counter: RecursionCounter,
+ /// Cached failures from `parse_prefix` calls that returned `Err`. See
+ /// [`Parser::parse_prefix`] for the 2^N patterns this guards.
+ failed_prefix_positions: BTreeMap<usize, ExprPrefixError>,
+ /// Cached failures from the speculative reserved-word prefix arm. See
+ /// [`Parser::parse_prefix`] for the 2^N patterns this guards.
+ failed_reserved_word_prefix_positions: BTreeMap<usize, ExprPrefixError>,
+}
+
+/// Copy marker for a [`ParserError`] cached by the `parse_prefix` failure
+/// memoization, so the caches hold no strings.
+#[derive(Debug, Clone, Copy)]
+enum ExprPrefixError {
+ RecursionLimitExceeded,
+ Err,
+}
+
+impl From<&ParserError> for ExprPrefixError {
+ fn from(e: &ParserError) -> Self {
+ match e {
+ ParserError::RecursionLimitExceeded =>
Self::RecursionLimitExceeded,
+ _ => Self::Err,
+ }
+ }
}
impl<'a> Parser<'a> {
@@ -385,6 +412,8 @@ impl<'a> Parser<'a> {
dialect,
recursion_counter: RecursionCounter::new(DEFAULT_REMAINING_DEPTH),
options:
ParserOptions::new().with_trailing_commas(dialect.supports_trailing_commas()),
+ failed_prefix_positions: BTreeMap::new(),
+ failed_reserved_word_prefix_positions: BTreeMap::new(),
}
}
@@ -446,6 +475,8 @@ impl<'a> Parser<'a> {
pub fn with_tokens_with_locations(mut self, tokens: Vec<TokenWithSpan>) ->
Self {
self.tokens = tokens;
self.index = 0;
+ self.failed_prefix_positions.clear();
+ self.failed_reserved_word_prefix_positions.clear();
self
}
@@ -1717,6 +1748,35 @@ impl<'a> Parser<'a> {
return prefix;
}
+ // Memoize parse_prefix failures to break 2^N speculation when both
+ // prefix arms fail at every level (e.g. `IF(current_time(...x`).
+ // The per-arm cache in `parse_prefix_inner` complements this for
+ // chains where the reserved arm fails but the unreserved fallback
+ // succeeds (e.g. `case-case-...c`).
+ let start_index = self.index;
+ if let Some(&cached) = self.failed_prefix_positions.get(&start_index) {
+ return self.cached_prefix_error(cached, self.peek_token_ref());
+ }
+ let result = self.parse_prefix_inner();
+ if let Err(ref e) = result {
+ self.failed_prefix_positions.insert(start_index, e.into());
+ }
+ result
+ }
+
+ /// Rebuild the error for a cached prefix failure at the `found` token.
+ fn cached_prefix_error<T>(
+ &self,
+ cached: ExprPrefixError,
+ found: &TokenWithSpan,
+ ) -> Result<T, ParserError> {
+ match cached {
+ ExprPrefixError::RecursionLimitExceeded =>
Err(ParserError::RecursionLimitExceeded),
+ ExprPrefixError::Err => self.expected_ref("an expression", found),
+ }
+ }
+
+ fn parse_prefix_inner(&mut self) -> Result<Expr, ParserError> {
// PostgreSQL allows any string literal to be preceded by a type name,
indicating that the
// string literal represents a literal of that type. Some examples:
//
@@ -1801,7 +1861,21 @@ impl<'a> Parser<'a> {
// We first try to parse the word and following tokens as a
special expression, and if that fails,
// we rollback and try to parse it as an identifier.
let w = w.clone();
- match self.try_parse(|parser|
parser.parse_expr_prefix_by_reserved_word(&w, span)) {
+ // Memoize failed speculative reserved-word parses. When
+ // the reserved arm (CASE, CURRENT_TIME, etc.) does
+ // exponential work but the unreserved fallback ultimately
+ // succeeds, the overall `parse_prefix` returns `Ok` and the
+ // outer cache never fires. Chains like `case-case-...c`
+ // need this per-arm cache to break the doubling.
+ let try_parse_result = if let Some(&cached) = self
+ .failed_reserved_word_prefix_positions
+ .get(&next_token_index)
+ {
+ self.cached_prefix_error(cached, self.get_current_token())
+ } else {
+ self.try_parse(|parser|
parser.parse_expr_prefix_by_reserved_word(&w, span))
+ };
+ match try_parse_result {
// This word indicated an expression prefix and parsing
was successful
Ok(Some(expr)) => Ok(expr),
@@ -1815,6 +1889,8 @@ impl<'a> Parser<'a> {
// we rollback and return the parsing error we got from
trying to parse a
// special expression (to maintain backwards compatibility
of parsing errors).
Err(e) => {
+ self.failed_reserved_word_prefix_positions
+ .insert(next_token_index, (&e).into());
if !self.dialect.is_reserved_for_identifier(w.keyword)
{
if let Ok(Some(expr)) = self.maybe_parse(|parser| {
parser.parse_expr_prefix_by_unreserved_word(&w, span)
diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs
index c808fd5c..0d49d0fa 100644
--- a/tests/sqlparser_common.rs
+++ b/tests/sqlparser_common.rs
@@ -15572,7 +15572,10 @@ fn parse_create_table_select() {
#[test]
fn test_reserved_keywords_for_identifiers() {
- let dialects = all_dialects_where(|d|
d.is_reserved_for_identifier(Keyword::INTERVAL));
+ let dialects = all_dialects_where(|d| {
+ d.is_reserved_for_identifier(Keyword::INTERVAL)
+ && !d.supports_named_fn_args_with_expr_name()
+ });
// Dialects that reserve the word INTERVAL will not allow it as an
unquoted identifier
let sql = "SELECT MAX(interval) FROM tbl";
assert_eq!(
@@ -15582,6 +15585,19 @@ fn test_reserved_keywords_for_identifiers() {
))
);
+ // Dialects with expression-named function arguments parse the argument
+ // expression twice, so the second attempt reports the memoized failure
+ // at the start of the expression
+ let dialects = all_dialects_where(|d| {
+ d.is_reserved_for_identifier(Keyword::INTERVAL) &&
d.supports_named_fn_args_with_expr_name()
+ });
+ assert_eq!(
+ dialects.parse_sql_statements(sql),
+ Err(ParserError::ParserError(
+ "Expected: an expression, found: interval".to_string()
+ ))
+ );
+
// Dialects that do not reserve the word INTERVAL will allow it
let dialects = all_dialects_where(|d|
!d.is_reserved_for_identifier(Keyword::INTERVAL));
let sql = "SELECT MAX(interval) FROM tbl";
@@ -19035,3 +19051,49 @@ fn
parse_compound_keyword_chain_no_exponential_blowup() {
rx.recv_timeout(Duration::from_secs(5))
.expect("parser should handle this quickly, not loop exponentially");
}
+
+/// Regression test for the 2^N parse-time blowup in `parse_prefix` on inputs
+/// like `IF(current_time(current_time(...x`. Each nested `current_time(` used
+/// to be explored twice at every level (once via the speculative reserved-word
+/// arm, once via the unreserved-word fallback), doubling work per level.
+/// Post-fix the failing parse short-circuits via the position-keyed cache.
+#[test]
+fn parse_prefix_keyword_call_chain_no_exponential_blowup() {
+ use std::sync::mpsc;
+ use std::thread;
+ use std::time::Duration;
+
+ let sql = String::from("if(") + &"current_time(".repeat(30) + "x";
+
+ let (tx, rx) = mpsc::channel();
+ thread::spawn(move || {
+ let _ = Parser::parse_sql(&PostgreSqlDialect {}, &sql);
+ let _ = tx.send(());
+ });
+
+ rx.recv_timeout(Duration::from_secs(5))
+ .expect("parser should reject this quickly, not loop exponentially");
+}
+
+/// Regression test for the 2^N parse-time blowup in `parse_prefix` on inputs
+/// like `case-case-case-...c`. Each `case` token triggers a speculative
+/// `parse_case_expr` that fails, but the unreserved-word fallback returns
+/// `Identifier(case)`, so the outer failure cache never fires. Post-fix the
+/// per-arm cache short-circuits the speculative descent.
+#[test]
+fn parse_prefix_case_chain_no_exponential_blowup() {
+ use std::sync::mpsc;
+ use std::thread;
+ use std::time::Duration;
+
+ let sql = "case\t-".repeat(30) + "c";
+
+ let (tx, rx) = mpsc::channel();
+ thread::spawn(move || {
+ let _ = Parser::parse_sql(&SQLiteDialect {}, &sql);
+ let _ = tx.send(());
+ });
+
+ rx.recv_timeout(Duration::from_secs(5))
+ .expect("parser should reject this quickly, not loop exponentially");
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]