eyalleshem commented on code in PR #2075:
URL: https://github.com/apache/datafusion-sqlparser-rs/pull/2075#discussion_r2577738550


##########
src/tokenizer.rs:
##########
@@ -1783,96 +1786,115 @@ impl<'a> Tokenizer<'a> {
     }
 
     /// Tokenize dollar preceded value (i.e: a string/placeholder)
-    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
-        let mut s = String::new();
-        let mut value = String::new();
+    fn tokenize_dollar_preceded_value(
+        &self,
+        chars: &mut State<'a>,
+    ) -> Result<Token, TokenizerError> {
+        chars.next(); // consume first $
 
-        chars.next();
+        // Case 1: $$text$$ (untagged dollar-quoted string)
+        if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
+            let (value, tag) = self.tokenize_dollar_quoted_string_borrowed(chars, None)?;
+            return Ok(Token::DollarQuotedString(DollarQuotedString {
+                value: value.into_owned(),
+                tag: tag.map(|t| t.into_owned()),
+            }));
+        }
 
-        // If the dialect does not support dollar-quoted strings, then `$$` is rather a placeholder.
+        // If it's not $$, we have two options:
+        //   Case 2: $tag$text$tag$ (tagged dollar-quoted string) if the dialect supports it
+        //   Case 3: $placeholder (e.g., $1, $name)
+        let tag_start = chars.byte_pos;
+        let _tag_slice = peeking_take_while_ref(chars, |ch| {
+            ch.is_alphanumeric()
+                || ch == '_'
+                || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
+        });
+        let tag_end = chars.byte_pos;
+
+        // Case 2: $tag$text$tag$ (tagged dollar-quoted string)
         if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
-            chars.next();
+            let tag_value = &chars.source[tag_start..tag_end];
+            let (value, tag) =
+                self.tokenize_dollar_quoted_string_borrowed(chars, Some(tag_value))?;
+            return Ok(Token::DollarQuotedString(DollarQuotedString {
+                value: value.into_owned(),
+                tag: tag.map(|t| t.into_owned()),
+            }));
+        }
 
-            let mut is_terminated = false;
-            let mut prev: Option<char> = None;
+        // Case 3: $placeholder (e.g., $1, $name)
+        let tag_value = &chars.source[tag_start..tag_end];
+        Ok(Token::Placeholder(format!("${}", tag_value)))
+    }
 
-            while let Some(&ch) = chars.peek() {
-                if prev == Some('$') {
-                    if ch == '$' {
-                        chars.next();
-                        is_terminated = true;
-                        break;
-                    } else {
-                        s.push('$');
-                        s.push(ch);
+    /// Tokenize a dollar-quoted string ($$text$$ or $tag$text$tag$), returning borrowed slices.
+    /// tag_prefix: None for $$, Some("tag") for $tag$
+    /// Returns (value: Cow<'a, str>, tag: Option<Cow<'a, str>>)
+    fn tokenize_dollar_quoted_string_borrowed(
+        &self,
+        chars: &mut State<'a>,
+        tag_prefix: Option<&'a str>,
+    ) -> Result<(Cow<'a, str>, Option<Cow<'a, str>>), TokenizerError> {
+        chars.next(); // consume $ after tag (or second $ for $$)
+        let content_start = chars.byte_pos;
+
+        match tag_prefix {
+            None => {
+                // Case: $$text$$
+                let mut prev: Option<char> = None;
+
+                while let Some(&ch) = chars.peek() {
+                    if prev == Some('$') && ch == '$' {
+                        chars.next(); // consume final $
+                                      // content_end is before the first $ of $$
+                        let content_end = chars.byte_pos - 2;
+                        let value = &chars.source[content_start..content_end];

Review Comment:
   Added a safe_slice method that returns an error when the index is out of bounds, and updated the other query-slicing operations to use it.
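
For readers following along, a minimal standalone sketch of what such a checked-slicing helper can look like; the helper name matches the review comment, but the signature and error type here are illustrative rather than the PR's actual code:

    /// Hypothetical sketch: checked slicing via str::get, so an out-of-range
    /// (or non-char-boundary) index becomes an error instead of a panic.
    fn safe_slice(source: &str, start: usize, end: usize) -> Result<&str, String> {
        source
            .get(start..end)
            .ok_or_else(|| format!("internal tokenizer error: invalid slice {start}..{end}"))
    }

    fn main() {
        assert_eq!(safe_slice("$tag$", 1, 4), Ok("tag"));
        assert!(safe_slice("$tag$", 1, 99).is_err());
    }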

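As a quick orientation for the three cases handled in the hunk above, a hedged usage sketch of the crate's public tokenizer API (the dialect choice and the SQL inputs are assumptions for illustration; the exact tokens produced are whatever the tokenizer emits for that dialect):

    use sqlparser::dialect::PostgreSqlDialect;
    use sqlparser::tokenizer::Tokenizer;

    fn main() {
        let dialect = PostgreSqlDialect {};
        for sql in [
            "SELECT $$hello$$",       // Case 1: untagged dollar-quoted string
            "SELECT $tag$hello$tag$", // Case 2: tagged dollar-quoted string
            "SELECT $1",              // Case 3: dollar placeholder
        ] {
            let mut tokenizer = Tokenizer::new(&dialect, sql);
            println!("{sql} -> {:?}", tokenizer.tokenize().expect("tokenize"));
        }
    }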


##########
src/tokenizer.rs:
##########
@@ -2304,7 +2386,69 @@ fn peeking_next_take_while(
 }
 
 fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
-    Unescape::new(chars).unescape()
+    borrow_or_unescape_single_quoted_string(chars, true).map(|cow| cow.into_owned())
+}
+
+/// Scans a single-quoted string and returns either a borrowed slice or an unescaped owned string.
+///
+/// Strategy: Scan once to find the end and detect escape sequences.
+/// - If no escapes exist (or unescape=false), return Cow::Borrowed

Review Comment:
   done 
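
To make the borrow-or-own strategy from the doc comment above concrete, a simplified sketch (illustrative only; the PR's real function works on the tokenizer State, handles dialect-specific escape rules, and can report errors):

    use std::borrow::Cow;

    // Simplified: only the doubled-quote escape is considered here.
    fn borrow_or_unescape(contents: &str) -> Cow<'_, str> {
        if contents.contains("''") {
            // Slow path: an escape forces an owned, rewritten copy.
            Cow::Owned(contents.replace("''", "'"))
        } else {
            // Fast path: no escapes, so return a zero-copy borrowed slice.
            Cow::Borrowed(contents)
        }
    }

    fn main() {
        assert!(matches!(borrow_or_unescape("plain text"), Cow::Borrowed(_)));
        assert_eq!(borrow_or_unescape("it''s"), "it's");
    }

The dollar-quoted path in the first hunk uses the same Cow-based signature, converting to owned Strings with into_owned() only at the Token boundary.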



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

