iffyio commented on code in PR #2130:
URL: https://github.com/apache/datafusion-sqlparser-rs/pull/2130#discussion_r2616776459


##########
src/tokenizer.rs:
##########
@@ -1994,6 +2023,70 @@ impl<'a> Tokenizer<'a> {
         )
     }
 
+    /// Reads a quote delimited string without "backslash escaping" or a word
+    /// depending on whether `chars.next()` delivers a `'`.
+    ///
+    /// See <https://docs.oracle.com/en/database/oracle/oracle-database/19/sqlrf/Literals.html>
+    fn tokenize_word_or_quote_delimited_string(
+        &self,
+        chars: &mut State,
+        // the prefix that introduced the possible literal or word,
+        // e.g. "Q" or "nq"
+        word_prefix: &[char],
+        // turns an identified quote string literal,
+        // ie. `(start-quote-char, string-literal, end-quote-char)`
+        // into a token
+        as_literal: fn(char, String, char) -> Token,
+    ) -> Result<Token, TokenizerError> {
+        match chars.peek() {
+            Some('\'') => {
+                chars.next();
+                // ~ determine the "quote character(s)"
+                let error_loc = chars.location();
+                let (start_quote_char, end_quote_char) = match chars.next() {
+                    // ~ "newline" is not allowed by Oracle's SQL Reference,
+                    // but works with sql*plus nevertheless
+                    None | Some(' ') | Some('\t') | Some('\r') | Some('\n') => {

Review Comment:
   can we ensure that these cases are covered by the tests?
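
   A minimal sketch of such a test (the dialect, test name, and inputs are the editor's assumptions, not part of the PR):
   
   ```rust
   // assuming this lives in the tokenizer test module with OracleDialect and
   // Tokenizer in scope
   #[test]
   fn tokenize_quote_delimited_string_rejects_space_tab_newline_or_eof() {
       let dialect = OracleDialect {};
       // a space, tab, newline, or end of input directly after `q'` should
       // hit the error arm quoted above
       for sql in ["q' abc'", "q'\tabc'", "q'\nabc'", "q'"] {
           let res = Tokenizer::new(&dialect, sql).tokenize();
           assert!(res.is_err(), "expected a tokenizer error for {sql:?}");
       }
   }
   ```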



##########
src/tokenizer.rs:
##########
@@ -1994,6 +2023,70 @@ impl<'a> Tokenizer<'a> {
         )
     }
 
+    /// Reads a quote delimited string without "backslash escaping" or a word
+    /// depending on whether `chars.next()` delivers a `'`.
+    ///
+    /// See <https://docs.oracle.com/en/database/oracle/oracle-database/19/sqlrf/Literals.html>
+    fn tokenize_word_or_quote_delimited_string(
+        &self,
+        chars: &mut State,
+        // the prefix that introduced the possible literal or word,
+        // e.g. "Q" or "nq"
+        word_prefix: &[char],
+        // turns an identified quote string literal,
+        // ie. `(start-quote-char, string-literal, end-quote-char)`
+        // into a token
+        as_literal: fn(char, String, char) -> Token,

Review Comment:
   since the function is only used twice, I think it would be reasonable to have the caller do something like
   
   ```rust
   chars.next(); // consume 'q'
   if chars.peek() != Some('\'') {
       return 'q' + tokenize_word()
   }
   let QuoteString(quote_delimited_string())
   ```
   i.e. that this function should only care about parsing the string, in order to simplify its implementation



##########
src/tokenizer.rs:
##########
@@ -1032,13 +1042,32 @@ impl<'a> Tokenizer<'a> {
                                self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
                             Ok(Some(Token::NationalStringLiteral(s)))
                         }
+                        Some(&q @ 'q') | Some(&q @ 'Q') if dialect_of!(self is OracleDialect | GenericDialect) =>

Review Comment:
   We can add a dialect method `dialect.supports_quote_delimited_string()` instead of hardcoding the supported dialect in the tokenizer
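
   A rough sketch of the hook, assuming it is added to the existing `Dialect` trait with a `false` default (the doc wording and default are illustrative):
   
   ```rust
   // Sketch only: the existing Dialect methods and supertraits are elided.
   pub trait Dialect {
       /// Returns true if the dialect supports Oracle-style quote-delimited
       /// string literals such as `q'{...}'` and `nq'{...}'`.
       fn supports_quote_delimited_string(&self) -> bool {
           false
       }
   }
   ```
   
   The tokenizer arm above would then be guarded by `self.dialect.supports_quote_delimited_string()`, with `OracleDialect` and `GenericDialect` overriding it to return `true`.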



##########
src/tokenizer.rs:
##########
@@ -1994,6 +2023,70 @@ impl<'a> Tokenizer<'a> {
         )
     }
 
+    /// Reads a quote delimited string without "backslash escaping" or a word
+    /// depending on whether `chars.next()` delivers a `'`.
+    ///
+    /// See <https://docs.oracle.com/en/database/oracle/oracle-database/19/sqlrf/Literals.html>
+    fn tokenize_word_or_quote_delimited_string(
+        &self,
+        chars: &mut State,
+        // the prefix that introduced the possible literal or word,
+        // e.g. "Q" or "nq"
+        word_prefix: &[char],
+        // turns an identified quote string literal,
+        // ie. `(start-quote-char, string-literal, end-quote-char)`
+        // into a token
+        as_literal: fn(char, String, char) -> Token,
+    ) -> Result<Token, TokenizerError> {
+        match chars.peek() {
+            Some('\'') => {
+                chars.next();
+                // ~ determine the "quote character(s)"

Review Comment:
   ```suggestion
   ```



##########
src/tokenizer.rs:
##########
@@ -98,6 +98,12 @@ pub enum Token {
     TripleDoubleQuotedRawStringLiteral(String),
     /// "National" string literal: i.e: N'string'
     NationalStringLiteral(String),
+    /// Quote delimited literal. Examples `Q'{ab'c}'`, `Q'|ab'c|'`, `Q'|ab|c|'`
+    /// [Oracle](https://docs.oracle.com/en/database/oracle/oracle-database/21/sqlrf/Literals.html)

Review Comment:
   for the docs, can we point to this link instead, which takes the user directly to text literals?
   
https://docs.oracle.com/en/database/oracle/oracle-database/19/sqlrf/Literals.html#GUID-1824CBAA-6E16-4921-B2A6-112FB02248DA



##########
src/ast/value.rs:
##########
@@ -167,6 +167,12 @@ pub enum Value {
     TripleDoubleQuotedRawStringLiteral(String),
     /// N'string value'
     NationalStringLiteral(String),
+    /// Quote delimited literal. Examples `Q'{ab'c}'`, `Q'|ab'c|'`, `Q'|ab|c|'`
+    /// [Oracle](https://docs.oracle.com/en/database/oracle/oracle-database/21/sqlrf/Literals.html)
+    QuoteDelimitedStringLiteral(char, String, char),

Review Comment:
   We can add something like 
   ```rust
   struct QuoteDelimitedString {
       start_quote: char,
       value: String,
       end_quote: char
   }
   ```
   that would better clarify the args to the enum variants
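
   i.e. the `Value` variant (and the corresponding `Token` variant) would then become `QuoteDelimitedStringLiteral(QuoteDelimitedString)`, making the start and end quote characters named fields rather than positional tuple elements.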



##########
src/tokenizer.rs:
##########
@@ -1994,6 +2023,70 @@ impl<'a> Tokenizer<'a> {
         )
     }
 
+    /// Reads a quote delimited string without "backslash escaping" or a word
+    /// depending on whether `chars.next()` delivers a `'`.
+    ///
+    /// See <https://docs.oracle.com/en/database/oracle/oracle-database/19/sqlrf/Literals.html>
+    fn tokenize_word_or_quote_delimited_string(
+        &self,
+        chars: &mut State,
+        // the prefix that introduced the possible literal or word,
+        // e.g. "Q" or "nq"
+        word_prefix: &[char],
+        // turns an identified quote string literal,
+        // ie. `(start-quote-char, string-literal, end-quote-char)`
+        // into a token
+        as_literal: fn(char, String, char) -> Token,
+    ) -> Result<Token, TokenizerError> {
+        match chars.peek() {
+            Some('\'') => {
+                chars.next();
+                // ~ determine the "quote character(s)"
+                let error_loc = chars.location();
+                let (start_quote_char, end_quote_char) = match chars.next() {
+                    // ~ "newline" is not allowed by Oracle's SQL Reference,
+                    // but works with sql*plus nevertheless
+                    None | Some(' ') | Some('\t') | Some('\r') | Some('\n') => {
+                        return self.tokenizer_error(
+                            error_loc,
+                            format!(
+                                "Invalid space, tab, newline, or EOF after '{}''.",
+                                String::from_iter(word_prefix)
+                            ),
+                        );
+                    }
+                    Some(c) => (
+                        c,
+                        match c {
+                            '[' => ']',
+                            '{' => '}',
+                            '<' => '>',
+                            '(' => ')',
+                            c => c,
+                        },
+                    ),
+                };
+                // read the string literal until the "quote character" followed by a literal quote
+                let mut s = String::new();
+                while let Some(ch) = chars.next() {
+                    if ch == end_quote_char {
+                        if let Some('\'') = chars.peek() {
+                            chars.next(); // ~ consume the quote
+                            return Ok(as_literal(start_quote_char, s, end_quote_char));
+                        }
+                    }
+                    s.push(ch);
+                }
+                self.tokenizer_error(error_loc, "Unterminated string literal")

Review Comment:
   can we add a test case for this?
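
   For example, something along these lines (editor's sketch; the test name and input are illustrative and assume `OracleDialect` and `Tokenizer` are in scope):
   
   ```rust
   #[test]
   fn tokenize_unterminated_quote_delimited_string() {
       let dialect = OracleDialect {};
       // the closing `}'` never appears, so tokenization should fail with
       // the "Unterminated string literal" error above
       let res = Tokenizer::new(&dialect, "q'{never closed").tokenize();
       assert!(res.is_err(), "expected an unterminated string literal error");
   }
   ```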



##########
src/tokenizer.rs:
##########
@@ -1994,6 +2023,70 @@ impl<'a> Tokenizer<'a> {
         )
     }
 
+    /// Reads a quote delimited string without "backslash escaping" or a word
+    /// depending on whether `chars.next()` delivers a `'`.
+    ///
+    /// See <https://docs.oracle.com/en/database/oracle/oracle-database/19/sqlrf/Literals.html>
+    fn tokenize_word_or_quote_delimited_string(
+        &self,
+        chars: &mut State,
+        // the prefix that introduced the possible literal or word,
+        // e.g. "Q" or "nq"
+        word_prefix: &[char],
+        // turns an identified quote string literal,
+        // ie. `(start-quote-char, string-literal, end-quote-char)`
+        // into a token
+        as_literal: fn(char, String, char) -> Token,
+    ) -> Result<Token, TokenizerError> {
+        match chars.peek() {
+            Some('\'') => {
+                chars.next();
+                // ~ determine the "quote character(s)"
+                let error_loc = chars.location();
+                let (start_quote_char, end_quote_char) = match chars.next() {
+                    // ~ "newline" is not allowed by Oracle's SQL Reference,
+                    // but works with sql*plus nevertheless
+                    None | Some(' ') | Some('\t') | Some('\r') | Some('\n') => {
+                        return self.tokenizer_error(
+                            error_loc,
+                            format!(
+                                "Invalid space, tab, newline, or EOF after '{}''.",
+                                String::from_iter(word_prefix)
+                            ),
+                        );
+                    }
+                    Some(c) => (
+                        c,
+                        match c {
+                            '[' => ']',
+                            '{' => '}',
+                            '<' => '>',
+                            '(' => ')',
+                            c => c,
+                        },
+                    ),
+                };
+                // read the string literal until the "quote character" followed by a literal quote
+                let mut s = String::new();
+                while let Some(ch) = chars.next() {
+                    if ch == end_quote_char {
+                        if let Some('\'') = chars.peek() {
+                            chars.next(); // ~ consume the quote
+                            return Ok(as_literal(start_quote_char, s, end_quote_char));
+                        }
+                    }
+                    s.push(ch);
+                }
+                self.tokenizer_error(error_loc, "Unterminated string literal")
+            }
+            // ~ not a literal introduced with _token_prefix_, assm

Review Comment:
   ```suggestion
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
