alamb commented on code in PR #1590: URL: https://github.com/apache/datafusion-sqlparser-rs/pull/1590#discussion_r1882396934
########## build.rs: ########## @@ -0,0 +1,101 @@ +use std::env; +use std::fs::File; +use std::io::{BufWriter, Write}; +use std::path::Path; + +fn read_keywords() -> Vec<(String, Option<String>)> { + let path = Path::new("src").join("keywords.txt"); Review Comment: this is quite clever ########## src/keywords.rs: ########## @@ -33,845 +33,13 @@ use serde::{Deserialize, Serialize}; #[cfg(feature = "visitor")] use sqlparser_derive::{Visit, VisitMut}; -/// Defines a string constant for a single keyword: `kw_def!(SELECT);` -/// expands to `pub const SELECT = "SELECT";` -macro_rules! kw_def { - ($ident:ident = $string_keyword:expr) => { - pub const $ident: &'static str = $string_keyword; - }; - ($ident:ident) => { - kw_def!($ident = stringify!($ident)); - }; -} - -/// Expands to a list of `kw_def!()` invocations for each keyword -/// and defines an ALL_KEYWORDS array of the defined constants. -macro_rules! define_keywords { - ($( - $ident:ident $(= $string_keyword:expr)? - ),*) => { - #[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord, Hash)] - #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] - #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))] - #[allow(non_camel_case_types)] - pub enum Keyword { - NoKeyword, - $($ident),* - } +include!(concat!(env!("OUT_DIR"), "/keyword_gen.rs")); - pub const ALL_KEYWORDS_INDEX: &[Keyword] = &[ - $(Keyword::$ident),* - ]; - - $(kw_def!($ident $(= $string_keyword)?);)* - pub const ALL_KEYWORDS: &[&str] = &[ - $($ident),* - ]; - }; +pub fn lookup(keyword: &str) -> Keyword { + let keyword = keyword.to_ascii_uppercase(); Review Comment: If we are going to optimize lookup I think we should also avoid this call to `to_ascii_uppercase` as it allocates / copies the string which I believe is significantly more costly than some comparisons https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_uppercase https://doc.rust-lang.org/std/primitive.str.html#method.to_uppercase Since all keywords are 
ASCII, what if we did the binary search character by character? Here is some brain 🤮 : ``` let first_char = string.first().to_upper() // find range of keywords within the let range = binary_search(keywords, first_char, 0); // if not unique location, proceed to second character, etc ... ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org