This is an automated email from the ASF dual-hosted git repository.

github-merge-queue[bot] pushed a commit to branch 
gh-readonly-queue/main/pr-2352-928783db2b1ef0f002dc271f78af59bb34a86888
in repository https://gitbox.apache.org/repos/asf/datafusion-sqlparser-rs.git

commit 20b9849e71cadb7b83b49133999b6d22f1320328
Author: Luca Cappelletti <[email protected]>
AuthorDate: Sat Jun 6 08:41:35 2026 +0200

    Parser: fix exponential parse time on speculative prefix parsing (#2352)
---
 sqlparser_bench/benches/sqlparser_bench.rs | 51 ++++++++++++++++++-
 src/parser/mod.rs                          | 78 +++++++++++++++++++++++++++++-
 tests/sqlparser_common.rs                  | 64 +++++++++++++++++++++++-
 3 files changed, 189 insertions(+), 4 deletions(-)

diff --git a/sqlparser_bench/benches/sqlparser_bench.rs 
b/sqlparser_bench/benches/sqlparser_bench.rs
index 7f6bc2ff..a3c39bc5 100644
--- a/sqlparser_bench/benches/sqlparser_bench.rs
+++ b/sqlparser_bench/benches/sqlparser_bench.rs
@@ -16,7 +16,7 @@
 // under the License.
 
 use criterion::{criterion_group, criterion_main, Criterion};
-use sqlparser::dialect::GenericDialect;
+use sqlparser::dialect::{GenericDialect, PostgreSqlDialect, SQLiteDialect};
 use sqlparser::keywords::Keyword;
 use sqlparser::parser::Parser;
 use sqlparser::tokenizer::{Span, Word};
@@ -200,12 +200,59 @@ fn parse_compound_keyword_chain(c: &mut Criterion) {
     group.finish();
 }
 
+/// Benchmark parsing pathological `IF(<keyword-fn>(<keyword-fn>(...x` chains
+/// that previously caused 2^N work in `parse_prefix`. Each nested
+/// `current_time(` segment used to be explored twice at every level (once via
+/// the speculative reserved-word arm, once via the unreserved-word fallback),
+/// doubling work per level. Post-fix the cost is linear in chain length.
+fn parse_prefix_keyword_call_chain(c: &mut Criterion) {
+    let mut group = c.benchmark_group("parse_prefix_keyword_call_chain");
+    let dialect = PostgreSqlDialect {};
+
+    for &n in &[10usize, 20, 30] {
+        let sql = String::from("if(") + &"current_time(".repeat(n) + "x";
+
+        group.bench_function(format!("chain_{n}"), |b| {
+            b.iter(|| {
+                let _ = Parser::parse_sql(&dialect, 
std::hint::black_box(&sql));
+            });
+        });
+    }
+
+    group.finish();
+}
+
+/// Benchmark parsing pathological `case-case-case-...c` chains that
+/// previously caused 2^N work in `parse_prefix`. Each `case` token used to
+/// trigger a speculative `parse_case_expr` that recursively descends the
+/// chain, but the unreserved-word fallback returns `Identifier(case)` so the
+/// overall `parse_prefix` succeeds and the failure cache never fires.
+/// Post-fix the per-arm cache short-circuits the speculative descent.
+fn parse_prefix_case_chain(c: &mut Criterion) {
+    let mut group = c.benchmark_group("parse_prefix_case_chain");
+    let dialect = SQLiteDialect {};
+
+    for &n in &[10usize, 20, 30] {
+        let sql = "case\t-".repeat(n) + "c";
+
+        group.bench_function(format!("chain_{n}"), |b| {
+            b.iter(|| {
+                let _ = Parser::parse_sql(&dialect, 
std::hint::black_box(&sql));
+            });
+        });
+    }
+
+    group.finish();
+}
+
 criterion_group!(
     benches,
     basic_queries,
     word_to_ident,
     parse_many_identifiers,
     parse_compound_chain,
-    parse_compound_keyword_chain
+    parse_compound_keyword_chain,
+    parse_prefix_keyword_call_chain,
+    parse_prefix_case_chain
 );
 criterion_main!(benches);
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index c897c767..006a0c6f 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -15,6 +15,7 @@
 #[cfg(not(feature = "std"))]
 use alloc::{
     boxed::Box,
+    collections::BTreeMap,
     format,
     string::{String, ToString},
     vec,
@@ -24,6 +25,9 @@ use core::{
     fmt::{self, Display},
     str::FromStr,
 };
+#[cfg(feature = "std")]
+use std::collections::BTreeMap;
+
 use helpers::attached_token::AttachedToken;
 
 use log::debug;
@@ -359,6 +363,29 @@ pub struct Parser<'a> {
     options: ParserOptions,
     /// Ensures the stack does not overflow by limiting recursion depth.
     recursion_counter: RecursionCounter,
+    /// Cached failures from `parse_prefix` calls that returned `Err`. See
+    /// [`Parser::parse_prefix`] for the 2^N patterns this guards.
+    failed_prefix_positions: BTreeMap<usize, ExprPrefixError>,
+    /// Cached failures from the speculative reserved-word prefix arm. See
+    /// [`Parser::parse_prefix`] for the 2^N patterns this guards.
+    failed_reserved_word_prefix_positions: BTreeMap<usize, ExprPrefixError>,
+}
+
+/// Copy marker for a [`ParserError`] cached by the `parse_prefix` failure
+/// memoization, so the caches hold no strings.
+#[derive(Debug, Clone, Copy)]
+enum ExprPrefixError {
+    RecursionLimitExceeded,
+    Err,
+}
+
+impl From<&ParserError> for ExprPrefixError {
+    fn from(e: &ParserError) -> Self {
+        match e {
+            ParserError::RecursionLimitExceeded => 
Self::RecursionLimitExceeded,
+            _ => Self::Err,
+        }
+    }
 }
 
 impl<'a> Parser<'a> {
@@ -385,6 +412,8 @@ impl<'a> Parser<'a> {
             dialect,
             recursion_counter: RecursionCounter::new(DEFAULT_REMAINING_DEPTH),
             options: 
ParserOptions::new().with_trailing_commas(dialect.supports_trailing_commas()),
+            failed_prefix_positions: BTreeMap::new(),
+            failed_reserved_word_prefix_positions: BTreeMap::new(),
         }
     }
 
@@ -446,6 +475,8 @@ impl<'a> Parser<'a> {
     pub fn with_tokens_with_locations(mut self, tokens: Vec<TokenWithSpan>) -> 
Self {
         self.tokens = tokens;
         self.index = 0;
+        self.failed_prefix_positions.clear();
+        self.failed_reserved_word_prefix_positions.clear();
         self
     }
 
@@ -1717,6 +1748,35 @@ impl<'a> Parser<'a> {
             return prefix;
         }
 
+        // Memoize parse_prefix failures to break 2^N speculation when both
+        // prefix arms fail at every level (e.g. `IF(current_time(...x`).
+        // The per-arm cache in `parse_prefix_inner` complements this for
+        // chains where the reserved arm fails but the unreserved fallback
+        // succeeds (e.g. `case-case-...c`).
+        let start_index = self.index;
+        if let Some(&cached) = self.failed_prefix_positions.get(&start_index) {
+            return self.cached_prefix_error(cached, self.peek_token_ref());
+        }
+        let result = self.parse_prefix_inner();
+        if let Err(ref e) = result {
+            self.failed_prefix_positions.insert(start_index, e.into());
+        }
+        result
+    }
+
+    /// Rebuild the error for a cached prefix failure at the `found` token.
+    fn cached_prefix_error<T>(
+        &self,
+        cached: ExprPrefixError,
+        found: &TokenWithSpan,
+    ) -> Result<T, ParserError> {
+        match cached {
+            ExprPrefixError::RecursionLimitExceeded => 
Err(ParserError::RecursionLimitExceeded),
+            ExprPrefixError::Err => self.expected_ref("an expression", found),
+        }
+    }
+
+    fn parse_prefix_inner(&mut self) -> Result<Expr, ParserError> {
         // PostgreSQL allows any string literal to be preceded by a type name, 
indicating that the
         // string literal represents a literal of that type. Some examples:
         //
@@ -1801,7 +1861,21 @@ impl<'a> Parser<'a> {
                 // We first try to parse the word and following tokens as a 
special expression, and if that fails,
                 // we rollback and try to parse it as an identifier.
                 let w = w.clone();
-                match self.try_parse(|parser| 
parser.parse_expr_prefix_by_reserved_word(&w, span)) {
+                // Memoize failed speculative reserved-word parses. When
+                // the reserved arm (CASE, CURRENT_TIME, etc.) does
+                // exponential work but the unreserved fallback ultimately
+                // succeeds, the overall `parse_prefix` returns `Ok` and the
+                // outer cache never fires. Chains like `case-case-...c`
+                // need this per-arm cache to break the doubling.
+                let try_parse_result = if let Some(&cached) = self
+                    .failed_reserved_word_prefix_positions
+                    .get(&next_token_index)
+                {
+                    self.cached_prefix_error(cached, self.get_current_token())
+                } else {
+                    self.try_parse(|parser| 
parser.parse_expr_prefix_by_reserved_word(&w, span))
+                };
+                match try_parse_result {
                     // This word indicated an expression prefix and parsing 
was successful
                     Ok(Some(expr)) => Ok(expr),
 
@@ -1815,6 +1889,8 @@ impl<'a> Parser<'a> {
                     // we rollback and return the parsing error we got from 
trying to parse a
                     // special expression (to maintain backwards compatibility 
of parsing errors).
                     Err(e) => {
+                        self.failed_reserved_word_prefix_positions
+                            .insert(next_token_index, (&e).into());
                         if !self.dialect.is_reserved_for_identifier(w.keyword) 
{
                             if let Ok(Some(expr)) = self.maybe_parse(|parser| {
                                 
parser.parse_expr_prefix_by_unreserved_word(&w, span)
diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs
index c808fd5c..0d49d0fa 100644
--- a/tests/sqlparser_common.rs
+++ b/tests/sqlparser_common.rs
@@ -15572,7 +15572,10 @@ fn parse_create_table_select() {
 
 #[test]
 fn test_reserved_keywords_for_identifiers() {
-    let dialects = all_dialects_where(|d| 
d.is_reserved_for_identifier(Keyword::INTERVAL));
+    let dialects = all_dialects_where(|d| {
+        d.is_reserved_for_identifier(Keyword::INTERVAL)
+            && !d.supports_named_fn_args_with_expr_name()
+    });
     // Dialects that reserve the word INTERVAL will not allow it as an 
unquoted identifier
     let sql = "SELECT MAX(interval) FROM tbl";
     assert_eq!(
@@ -15582,6 +15585,19 @@ fn test_reserved_keywords_for_identifiers() {
         ))
     );
 
+    // Dialects with expression-named function arguments parse the argument
+    // expression twice, so the second attempt reports the memoized failure
+    // at the start of the expression
+    let dialects = all_dialects_where(|d| {
+        d.is_reserved_for_identifier(Keyword::INTERVAL) && 
d.supports_named_fn_args_with_expr_name()
+    });
+    assert_eq!(
+        dialects.parse_sql_statements(sql),
+        Err(ParserError::ParserError(
+            "Expected: an expression, found: interval".to_string()
+        ))
+    );
+
     // Dialects that do not reserve the word INTERVAL will allow it
     let dialects = all_dialects_where(|d| 
!d.is_reserved_for_identifier(Keyword::INTERVAL));
     let sql = "SELECT MAX(interval) FROM tbl";
@@ -19035,3 +19051,49 @@ fn 
parse_compound_keyword_chain_no_exponential_blowup() {
     rx.recv_timeout(Duration::from_secs(5))
         .expect("parser should handle this quickly, not loop exponentially");
 }
+
+/// Regression test for the 2^N parse-time blowup in `parse_prefix` on inputs
+/// like `IF(current_time(current_time(...x`. Each nested `current_time(` used
+/// to be explored twice at every level (once via the speculative reserved-word
+/// arm, once via the unreserved-word fallback), doubling work per level.
+/// Post-fix the failing parse short-circuits via the position-keyed cache.
+#[test]
+fn parse_prefix_keyword_call_chain_no_exponential_blowup() {
+    use std::sync::mpsc;
+    use std::thread;
+    use std::time::Duration;
+
+    let sql = String::from("if(") + &"current_time(".repeat(30) + "x";
+
+    let (tx, rx) = mpsc::channel();
+    thread::spawn(move || {
+        let _ = Parser::parse_sql(&PostgreSqlDialect {}, &sql);
+        let _ = tx.send(());
+    });
+
+    rx.recv_timeout(Duration::from_secs(5))
+        .expect("parser should reject this quickly, not loop exponentially");
+}
+
+/// Regression test for the 2^N parse-time blowup in `parse_prefix` on inputs
+/// like `case-case-case-...c`. Each `case` token triggers a speculative
+/// `parse_case_expr` that fails, but the unreserved-word fallback returns
+/// `Identifier(case)`, so the outer failure cache never fires. Post-fix the
+/// per-arm cache short-circuits the speculative descent.
+#[test]
+fn parse_prefix_case_chain_no_exponential_blowup() {
+    use std::sync::mpsc;
+    use std::thread;
+    use std::time::Duration;
+
+    let sql = "case\t-".repeat(30) + "c";
+
+    let (tx, rx) = mpsc::channel();
+    thread::spawn(move || {
+        let _ = Parser::parse_sql(&SQLiteDialect {}, &sql);
+        let _ = tx.send(());
+    });
+
+    rx.recv_timeout(Duration::from_secs(5))
+        .expect("parser should reject this quickly, not loop exponentially");
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to