This is an automated email from the ASF dual-hosted git repository.

twice pushed a commit to branch unstable
in repository https://gitbox.apache.org/repos/asf/kvrocks.git


The following commit(s) were added to refs/heads/unstable by this push:
     new 54d00847 Add SQL SELECT parser via PEGTL for search module (#2184)
54d00847 is described below

commit 54d00847ad6a7a5c3bed0f13f94f298557a36e15
Author: Twice <[email protected]>
AuthorDate: Sat Mar 23 13:52:44 2024 +0900

    Add SQL SELECT parser via PEGTL for search module (#2184)
---
 .github/workflows/kvrocks.yaml   |   5 +-
 CMakeLists.txt                   |   2 +
 NOTICE                           |   1 +
 cmake/pegtl.cmake                |  27 +++++
 cmake/utils.cmake                |   7 ++
 licenses/LICENSE-pegtl.txt       |  21 ++++
 src/common/string_util.cc        |   6 +
 src/common/string_util.h         |  16 +++
 src/search/ir.h                  | 226 ++++++++++++++++++++++++++++++------
 src/search/sql_parser.h          | 115 +++++++++++++++++++
 src/search/sql_transformer.h     | 242 +++++++++++++++++++++++++++++++++++++++
 tests/cppunit/sql_parser_test.cc | 135 ++++++++++++++++++++++
 12 files changed, 766 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/kvrocks.yaml b/.github/workflows/kvrocks.yaml
index f96cf41e..675a23b6 100644
--- a/.github/workflows/kvrocks.yaml
+++ b/.github/workflows/kvrocks.yaml
@@ -440,8 +440,9 @@ jobs:
       - name: Install cmake
         if: ${{ startsWith(matrix.image, 'centos') }}
         run: |
-          wget 
https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.sh
-          bash cmake-3.26.4-linux-x86_64.sh --skip-license --prefix=/usr
+          VERSION=3.26.4
+          wget 
https://github.com/Kitware/CMake/releases/download/v$VERSION/cmake-$VERSION-linux-x86_64.sh
+          bash cmake-$VERSION-linux-x86_64.sh --skip-license --prefix=/usr
 
       - uses: actions/checkout@v3 #v4 use Node 20 and not working at CentOS 7
       - uses: actions/setup-go@v4 #v5 use Node 20 too
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3e384e03..d508d4ea 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -139,6 +139,7 @@ include(cmake/jsoncons.cmake)
 include(cmake/xxhash.cmake)
 include(cmake/span.cmake)
 include(cmake/trie.cmake)
+include(cmake/pegtl.cmake)
 
 if (ENABLE_LUAJIT)
     include(cmake/luajit.cmake)
@@ -171,6 +172,7 @@ list(APPEND EXTERNAL_LIBS ${Backtrace_LIBRARY})
 list(APPEND EXTERNAL_LIBS xxhash)
 list(APPEND EXTERNAL_LIBS span-lite)
 list(APPEND EXTERNAL_LIBS tsl_hat_trie)
+list(APPEND EXTERNAL_LIBS pegtl)
 
 # Add git sha to version.h
 find_package(Git REQUIRED)
diff --git a/NOTICE b/NOTICE
index a25a60d1..86c357d8 100644
--- a/NOTICE
+++ b/NOTICE
@@ -66,6 +66,7 @@ The text of each license is also included in 
licenses/LICENSE-[project].txt
 * LuaJIT(https://github.com/KvrocksLabs/LuaJIT)
 * lua(https://github.com/KvrocksLabs/lua, alternative to LuaJIT)
 * hat-trie(https://github.com/Tessil/hat-trie)
+* pegtl(https://github.com/taocpp/PEGTL, NOTE: changed to BSL-1.0 in main 
branch)
 
 ================================================================
 Boost Software License Version 1.0
diff --git a/cmake/pegtl.cmake b/cmake/pegtl.cmake
new file mode 100644
index 00000000..d1638f36
--- /dev/null
+++ b/cmake/pegtl.cmake
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+include_guard()
+
+include(cmake/utils.cmake)
+
+FetchContent_DeclareGitHubTarWithMirror(pegtl
+  taocpp/PEGTL 3.2.7
+  MD5=31b14660c883bc0489ddcdfbd29199c9
+)
+
+FetchContent_MakeAvailableWithArgs(pegtl)
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index c1a8594b..cd023a72 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -58,3 +58,10 @@ function(FetchContent_DeclareGitHubWithMirror dep repo tag 
hash)
     ${hash}
   )
 endfunction()
+
+function(FetchContent_DeclareGitHubTarWithMirror dep repo tag hash)
+  FetchContent_DeclareWithMirror(${dep}
+    https://github.com/${repo}/archive/${tag}.tar.gz
+    ${hash}
+  )
+endfunction()
diff --git a/licenses/LICENSE-pegtl.txt b/licenses/LICENSE-pegtl.txt
new file mode 100644
index 00000000..97fa8314
--- /dev/null
+++ b/licenses/LICENSE-pegtl.txt
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2007-2022 Dr. Colin Hirsch and Daniel Frey
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/src/common/string_util.cc b/src/common/string_util.cc
index ae41f918..e9088042 100644
--- a/src/common/string_util.cc
+++ b/src/common/string_util.cc
@@ -357,6 +357,12 @@ std::string EscapeString(std::string_view s) {
       case '\b':
         str += "\\b";
         break;
+      case '\v':
+        str += "\\v";
+        break;
+      case '\f':
+        str += "\\f";
+        break;
       default:
         if (isprint(ch)) {
           str += ch;
diff --git a/src/common/string_util.h b/src/common/string_util.h
index d23ebad7..45c4a31d 100644
--- a/src/common/string_util.h
+++ b/src/common/string_util.h
@@ -39,4 +39,20 @@ std::string StringToHex(std::string_view input);
 std::vector<std::string> TokenizeRedisProtocol(const std::string &value);
 std::string EscapeString(std::string_view s);
 
+template <typename T, typename F>
+std::string StringJoin(
+    const T &con, F &&f = [](const auto &v) -> decltype(auto) { return v; }, 
const std::string &sep = ", ") {
+  std::string res;
+  bool is_first = true;
+  for (const auto &v : con) {
+    if (is_first) {
+      is_first = false;
+    } else {
+      res += sep;
+    }
+    res += std::forward<F>(f)(v);
+  }
+  return res;
+}
+
 }  // namespace util
diff --git a/src/search/ir.h b/src/search/ir.h
index d431d1be..87ed6488 100644
--- a/src/search/ir.h
+++ b/src/search/ir.h
@@ -20,87 +20,243 @@
 
 #pragma once
 
+#include <fmt/format.h>
+
 #include <limits>
 #include <memory>
+#include <optional>
 #include <string>
+#include <utility>
 #include <variant>
 #include <vector>
 
+#include "fmt/core.h"
+#include "string_util.h"
+
 // kqir stands for Kvorcks Query Intermediate Representation
 namespace kqir {
 
-struct FieldRef {
+struct Node {
+  virtual std::string Dump() const = 0;
+  virtual ~Node() = default;
+
+  template <typename T, typename U = Node, typename... Args>
+  static std::unique_ptr<U> Create(Args &&...args) {
+    return std::unique_ptr<U>(new T(std::forward<Args>(args)...));
+  }
+
+  template <typename T>
+  static std::unique_ptr<T> As(std::unique_ptr<Node> &&original) {
+    auto casted = dynamic_cast<T *>(original.release());
+    CHECK(casted);
+    return std::unique_ptr<T>(casted);
+  }
+};
+
+struct FieldRef : Node {
   std::string name;
+
+  explicit FieldRef(std::string name) : name(std::move(name)) {}
+
+  std::string Dump() const override { return name; }
 };
 
-struct StringLiteral {
+struct StringLiteral : Node {
   std::string val;
+
+  explicit StringLiteral(std::string val) : val(std::move(val)) {}
+
+  std::string Dump() const override { return fmt::format("\"{}\"", 
util::EscapeString(val)); }
 };
 
-struct TagContainExpr {
-  FieldRef field;
-  StringLiteral tag;
+struct QueryExpr : Node {};
+
+struct BoolAtomExpr : QueryExpr {};
+
+struct TagContainExpr : BoolAtomExpr {
+  std::unique_ptr<FieldRef> field;
+  std::unique_ptr<StringLiteral> tag;
+
+  TagContainExpr(std::unique_ptr<FieldRef> &&field, 
std::unique_ptr<StringLiteral> &&tag)
+      : field(std::move(field)), tag(std::move(tag)) {}
+
+  std::string Dump() const override { return fmt::format("{} hastag {}", 
field->Dump(), tag->Dump()); }
 };
 
-struct NumericLiteral {
+struct NumericLiteral : Node {
   double val;
+
+  explicit NumericLiteral(double val) : val(val) {}
+
+  std::string Dump() const override { return fmt::format("{}", val); }
 };
 
-struct NumericCompareExpr {
-  enum { EQ, NE, LT, LET, GT, GET } op;
-  FieldRef field;
-  NumericLiteral num;
+// NOLINTNEXTLINE
+#define KQIR_NUMERIC_COMPARE_OPS(X) \
+  X(EQ, =, NE, EQ) X(NE, !=, EQ, NE) X(LT, <, GET, GT) X(LET, <=, GT, GET) 
X(GT, >, LET, LT) X(GET, >=, LT, LET)
+
+struct NumericCompareExpr : BoolAtomExpr {
+  enum Op {
+#define X(n, s, o, f) n,  // NOLINT
+    KQIR_NUMERIC_COMPARE_OPS(X)
+#undef X
+  } op;
+  std::unique_ptr<FieldRef> field;
+  std::unique_ptr<NumericLiteral> num;
+
+  NumericCompareExpr(Op op, std::unique_ptr<FieldRef> &&field, 
std::unique_ptr<NumericLiteral> &&num)
+      : op(op), field(std::move(field)), num(std::move(num)) {}
+
+  static constexpr const char *ToOperator(Op op) {
+    switch (op) {
+// NOLINTNEXTLINE
+#define X(n, s, o, f) \
+  case n:             \
+    return #s;
+      KQIR_NUMERIC_COMPARE_OPS(X)
+#undef X
+    }
+
+    return nullptr;
+  }
+
+  static constexpr std::optional<Op> FromOperator(std::string_view op) {
+// NOLINTNEXTLINE
+#define X(n, s, o, f) \
+  if (op == #s) return n;
+    KQIR_NUMERIC_COMPARE_OPS(X)
+#undef X
+
+    return std::nullopt;
+  }
+
+  static constexpr Op Negative(Op op) {
+    switch (op) {
+// NOLINTNEXTLINE
+#define X(n, s, o, f) \
+  case n:             \
+    return o;
+      KQIR_NUMERIC_COMPARE_OPS(X)
+#undef X
+    }
+
+    __builtin_unreachable();
+  }
+
+  static constexpr Op Flip(Op op) {
+    switch (op) {
+// NOLINTNEXTLINE
+#define X(n, s, o, f) \
+  case n:             \
+    return f;
+      KQIR_NUMERIC_COMPARE_OPS(X)
+#undef X
+    }
+
+    __builtin_unreachable();
+  }
+
+  std::string Dump() const override { return fmt::format("{} {} {}", 
field->Dump(), ToOperator(op), num->Dump()); };
 };
 
-struct BoolLiteral {
+struct BoolLiteral : BoolAtomExpr {
   bool val;
-};
 
-struct BooleanExpr {
-  std::variant<TagContainExpr, NumericCompareExpr, BoolLiteral> expr;
+  explicit BoolLiteral(bool val) : val(val) {}
+
+  std::string Dump() const override { return val ? "true" : "false"; }
 };
 
 struct QueryExpr;
 
-struct LogicalUnaryExpr {
-  enum { NOT } op;
+struct NotExpr : QueryExpr {
   std::unique_ptr<QueryExpr> inner;
+
+  explicit NotExpr(std::unique_ptr<QueryExpr> &&inner) : 
inner(std::move(inner)) {}
+
+  std::string Dump() const override { return fmt::format("not {}", 
inner->Dump()); }
 };
 
-struct LogicalBinaryExpr {
-  enum { AND, OR } op;
-  std::unique_ptr<QueryExpr> lhs;
-  std::unique_ptr<QueryExpr> rhs;
+struct AndExpr : QueryExpr {
+  std::vector<std::unique_ptr<QueryExpr>> inners;
+
+  explicit AndExpr(std::vector<std::unique_ptr<QueryExpr>> &&inners) : 
inners(std::move(inners)) {}
+
+  std::string Dump() const override {
+    return fmt::format("(and {})", util::StringJoin(inners, [](const auto &v) 
{ return v->Dump(); }));
+  }
 };
 
-struct QueryExpr {
-  std::variant<LogicalUnaryExpr, LogicalBinaryExpr, BooleanExpr> expr;
+struct OrExpr : QueryExpr {
+  std::vector<std::unique_ptr<QueryExpr>> inners;
+
+  explicit OrExpr(std::vector<std::unique_ptr<QueryExpr>> &&inners) : 
inners(std::move(inners)) {}
+
+  std::string Dump() const override {
+    return fmt::format("(or {})", util::StringJoin(inners, [](const auto &v) { 
return v->Dump(); }));
+  }
 };
 
-struct Limit {
+struct Limit : Node {
   size_t offset = 0;
   size_t count = std::numeric_limits<size_t>::max();
+
+  Limit(size_t offset, size_t count) : offset(offset), count(count) {}
+
+  std::string Dump() const override { return fmt::format("limit {}, {}", 
offset, count); }
 };
 
-struct SortBy {
-  enum { ASC, DESC } order;
-  FieldRef field;
+struct SortBy : Node {
+  enum Order { ASC, DESC } order = ASC;
+  std::unique_ptr<FieldRef> field;
+
+  SortBy(Order order, std::unique_ptr<FieldRef> &&field) : order(order), 
field(std::move(field)) {}
+
+  static constexpr const char *OrderToString(Order order) { return order == 
ASC ? "asc" : "desc"; }
+  std::string Dump() const override { return fmt::format("sortby {}, {}", 
field->Dump(), OrderToString(order)); }
 };
 
-struct SelectExpr {
-  std::vector<FieldRef> fields;
+struct SelectExpr : Node {
+  std::vector<std::unique_ptr<FieldRef>> fields;
+
+  explicit SelectExpr(std::vector<std::unique_ptr<FieldRef>> &&fields) : 
fields(std::move(fields)) {}
+
+  std::string Dump() const override {
+    if (fields.empty()) return "select *";
+    return fmt::format("select {}", util::StringJoin(fields, [](const auto &v) 
{ return v->Dump(); }));
+  }
 };
 
-struct IndexRef {
+struct IndexRef : Node {
   std::string name;
+
+  explicit IndexRef(std::string name) : name(std::move(name)) {}
+
+  std::string Dump() const override { return name; }
 };
 
-struct SearchStmt {
-  IndexRef index_ref;
-  QueryExpr query_expr;
-  Limit limit;
-  SortBy sort_by;
-  SelectExpr select_expr;
+struct SearchStmt : Node {
+  std::unique_ptr<IndexRef> index;
+  std::unique_ptr<QueryExpr> query_expr;  // optional
+  std::unique_ptr<Limit> limit;           // optional
+  std::unique_ptr<SortBy> sort_by;        // optional
+  std::unique_ptr<SelectExpr> select_expr;
+
+  SearchStmt(std::unique_ptr<IndexRef> &&index, std::unique_ptr<QueryExpr> 
&&query_expr, std::unique_ptr<Limit> &&limit,
+             std::unique_ptr<SortBy> &&sort_by, std::unique_ptr<SelectExpr> 
&&select_expr)
+      : index(std::move(index)),
+        query_expr(std::move(query_expr)),
+        limit(std::move(limit)),
+        sort_by(std::move(sort_by)),
+        select_expr(std::move(select_expr)) {}
+
+  std::string Dump() const override {
+    std::string opt;
+    if (query_expr) opt += " where " + query_expr->Dump();
+    if (sort_by) opt += " " + sort_by->Dump();
+    if (limit) opt += " " + limit->Dump();
+    return fmt::format("{} from {}{}", select_expr->Dump(), index->Dump(), 
opt);
+  }
 };
 
 }  // namespace kqir
diff --git a/src/search/sql_parser.h b/src/search/sql_parser.h
new file mode 100644
index 00000000..dffe051d
--- /dev/null
+++ b/src/search/sql_parser.h
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+#include <tao/pegtl.hpp>
+
+namespace kqir {
+
+namespace peg = tao::pegtl;
+
+namespace sql {
+
+using namespace peg;
+
+struct True : string<'t', 'r', 'u', 'e'> {};
+struct False : string<'f', 'a', 'l', 's', 'e'> {};
+struct Boolean : sor<True, False> {};
+
+struct Digits : plus<digit> {};
+struct NumberExp : seq<one<'e', 'E'>, opt<one<'-', '+'>>, Digits> {};
+struct NumberFrac : seq<one<'.'>, Digits> {};
+struct Number : seq<opt<one<'-'>>, Digits, opt<NumberFrac>, opt<NumberExp>> {};
+
+struct UnicodeXDigit : list<seq<one<'u'>, rep<4, xdigit>>, one<'\\'>> {};
+struct EscapedSingleChar : one<'"', '\\', 'b', 'f', 'n', 'r', 't'> {};
+struct EscapedChar : sor<EscapedSingleChar, UnicodeXDigit> {};
+struct UnescapedChar : utf8::range<0x20, 0x10FFFF> {};
+struct Char : if_then_else<one<'\\'>, EscapedChar, UnescapedChar> {};
+
+struct StringContent : until<at<one<'"'>>, Char> {};
+struct String : seq<one<'"'>, StringContent, any> {};
+
+struct Identifier : identifier {};
+
+struct WhiteSpace : one<' ', '\t', '\n', '\r'> {};
+template <typename T>
+struct WSPad : pad<T, WhiteSpace> {};
+
+struct HasTag : string<'h', 'a', 's', 't', 'a', 'g'> {};
+struct HasTagExpr : WSPad<seq<Identifier, WSPad<HasTag>, String>> {};
+
+struct NumericAtomExpr : WSPad<sor<Number, Identifier>> {};
+struct NumericCompareOp : sor<string<'!', '='>, string<'<', '='>, string<'>', 
'='>, one<'=', '<', '>'>> {};
+struct NumericCompareExpr : seq<NumericAtomExpr, NumericCompareOp, 
NumericAtomExpr> {};
+
+struct BooleanAtomExpr : sor<HasTagExpr, NumericCompareExpr, WSPad<Boolean>> 
{};
+
+struct QueryExpr;
+
+struct ParenExpr : WSPad<seq<one<'('>, QueryExpr, one<')'>>> {};
+
+struct NotExpr;
+
+struct BooleanExpr : sor<BooleanAtomExpr, ParenExpr, NotExpr> {};
+
+struct Not : string<'n', 'o', 't'> {};
+struct NotExpr : seq<WSPad<Not>, BooleanExpr> {};
+
+struct And : string<'a', 'n', 'd'> {};
+// left recursion elimination
+// struct AndExpr : sor<seq<AndExpr, And, NotExpr>, NotExpr> {};
+struct AndExpr : seq<BooleanExpr, plus<seq<And, BooleanExpr>>> {};
+struct AndExprP : sor<AndExpr, BooleanExpr> {};
+
+struct Or : string<'o', 'r'> {};
+// left recursion elimination
+// struct OrExpr : sor<seq<OrExpr, Or, AndExpr>, AndExpr> {};
+struct OrExpr : seq<AndExprP, plus<seq<Or, AndExprP>>> {};
+struct OrExprP : sor<OrExpr, AndExprP> {};
+
+struct QueryExpr : seq<OrExprP> {};
+
+struct Select : string<'s', 'e', 'l', 'e', 'c', 't'> {};
+struct From : string<'f', 'r', 'o', 'm'> {};
+
+struct Wildcard : one<'*'> {};
+struct IdentifierList : seq<Identifier, star<WSPad<one<','>>, Identifier>> {};
+struct SelectExpr : WSPad<sor<Wildcard, IdentifierList>> {};
+struct FromExpr : WSPad<Identifier> {};
+
+struct Where : string<'w', 'h', 'e', 'r', 'e'> {};
+struct OrderBy : seq<string<'o', 'r', 'd', 'e', 'r'>, plus<WhiteSpace>, 
string<'b', 'y'>> {};
+struct Asc : string<'a', 's', 'c'> {};
+struct Desc : string<'d', 'e', 's', 'c'> {};
+struct Limit : string<'l', 'i', 'm', 'i', 't'> {};
+
+struct Integer : Digits {};
+
+struct WhereClause : seq<Where, QueryExpr> {};
+struct AscOrDesc : WSPad<sor<Asc, Desc>> {};
+struct OrderByClause : seq<OrderBy, WSPad<Identifier>, opt<AscOrDesc>> {};
+struct LimitClause : seq<Limit, opt<seq<WSPad<Integer>, one<','>>>, 
WSPad<Integer>> {};
+
+struct SearchStmt
+    : WSPad<seq<Select, SelectExpr, From, FromExpr, opt<WhereClause>, 
opt<OrderByClause>, opt<LimitClause>>> {};
+
+}  // namespace sql
+
+}  // namespace kqir
diff --git a/src/search/sql_transformer.h b/src/search/sql_transformer.h
new file mode 100644
index 00000000..76fac35c
--- /dev/null
+++ b/src/search/sql_transformer.h
@@ -0,0 +1,242 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+#include <limits>
+#include <memory>
+#include <tao/pegtl/contrib/parse_tree.hpp>
+#include <tao/pegtl/contrib/unescape.hpp>
+#include <tao/pegtl/demangle.hpp>
+#include <variant>
+
+#include "ir.h"
+#include "parse_util.h"
+#include "sql_parser.h"
+
+namespace kqir {
+
+namespace sql {
+
+namespace ir = kqir;
+
+template <typename Rule>
+using TreeSelector = parse_tree::selector<
+    Rule, parse_tree::store_content::on<Boolean, Number, String, Identifier, 
NumericCompareOp, AscOrDesc, Integer>,
+    parse_tree::remove_content::on<HasTagExpr, NumericCompareExpr, NotExpr, 
AndExpr, OrExpr, Wildcard, SelectExpr,
+                                   FromExpr, WhereClause, OrderByClause, 
LimitClause, SearchStmt>>;
+
+template <typename Input>
+StatusOr<std::unique_ptr<parse_tree::node>> ParseToTree(Input&& in) {
+  if (auto root = parse_tree::parse<seq<SearchStmt, eof>, 
TreeSelector>(std::forward<Input>(in))) {
+    return root;
+  } else {
+    // TODO: improve parse error message, with source location
+    return {Status::NotOK, "invalid syntax"};
+  }
+}
+
+struct Transformer {
+  using TreeNode = std::unique_ptr<parse_tree::node>;
+
+  template <typename T>
+  static bool Is(const TreeNode& node) {
+    return node->type == demangle<T>();
+  }
+
+  static bool IsRoot(const TreeNode& node) { return node->type.empty(); }
+
+  static StatusOr<std::string> UnescapeString(std::string_view str) {
+    str = str.substr(1, str.size() - 2);
+
+    std::string result;
+    while (!str.empty()) {
+      if (str[0] == '\\') {
+        str.remove_prefix(1);
+        switch (str[0]) {
+          case '\\':
+          case '"':
+            result.push_back(str[0]);
+            break;
+          case 'b':
+            result.push_back('\b');
+            break;
+          case 'f':
+            result.push_back('\f');
+            break;
+          case 'n':
+            result.push_back('\n');
+            break;
+          case 'r':
+            result.push_back('\r');
+            break;
+          case 't':
+            result.push_back('\t');
+            break;
+          case 'u':
+            if (!unescape::utf8_append_utf32(result,
+                                             
unescape::unhex_string<unsigned>(str.data() + 1, str.data() + 5))) {
+              return {Status::NotOK,
+                      fmt::format("invalid Unicode code point '{}' in string 
literal", std::string(str.data() + 1, 4))};
+            }
+            str.remove_prefix(4);
+            break;
+          default:
+            __builtin_unreachable();
+        };
+        str.remove_prefix(1);
+      } else {
+        result.push_back(str[0]);
+        str.remove_prefix(1);
+      }
+    }
+
+    return result;
+  }
+
+  static auto Transform(const TreeNode& node) -> 
StatusOr<std::unique_ptr<Node>> {
+    if (Is<Boolean>(node)) {
+      return Node::Create<ir::BoolLiteral>(node->string_view() == "true");
+    } else if (Is<Number>(node)) {
+      return Node::Create<ir::NumericLiteral>(*ParseFloat(node->string()));
+    } else if (Is<String>(node)) {
+      return 
Node::Create<ir::StringLiteral>(GET_OR_RET(UnescapeString(node->string_view())));
+    } else if (Is<HasTagExpr>(node)) {
+      CHECK(node->children.size() == 2);
+
+      return 
Node::Create<ir::TagContainExpr>(std::make_unique<ir::FieldRef>(node->children[0]->string()),
+                                              
Node::As<ir::StringLiteral>(GET_OR_RET(Transform(node->children[1]))));
+    } else if (Is<NumericCompareExpr>(node)) {
+      CHECK(node->children.size() == 3);
+
+      const auto& lhs = node->children[0];
+      const auto& rhs = node->children[2];
+
+      auto op = 
ir::NumericCompareExpr::FromOperator(node->children[1]->string_view()).value();
+      if (Is<Identifier>(lhs) && Is<Number>(rhs)) {
+        return Node::Create<ir::NumericCompareExpr>(op, 
std::make_unique<ir::FieldRef>(lhs->string()),
+                                                    
Node::As<ir::NumericLiteral>(GET_OR_RET(Transform(rhs))));
+      } else if (Is<Number>(lhs) && Is<Identifier>(rhs)) {
+        return 
Node::Create<ir::NumericCompareExpr>(ir::NumericCompareExpr::Flip(op),
+                                                    
std::make_unique<ir::FieldRef>(rhs->string()),
+                                                    
Node::As<ir::NumericLiteral>(GET_OR_RET(Transform(lhs))));
+      } else {
+        return {Status::NotOK, "the left and right side of numeric comparison 
should be an identifier and a number"};
+      }
+    } else if (Is<NotExpr>(node)) {
+      CHECK(node->children.size() == 1);
+
+      return 
Node::Create<ir::NotExpr>(Node::As<ir::QueryExpr>(GET_OR_RET(Transform(node->children[0]))));
+    } else if (Is<AndExpr>(node)) {
+      std::vector<std::unique_ptr<ir::QueryExpr>> exprs;
+
+      for (const auto& child : node->children) {
+        exprs.push_back(Node::As<ir::QueryExpr>(GET_OR_RET(Transform(child))));
+      }
+
+      return Node::Create<ir::AndExpr>(std::move(exprs));
+    } else if (Is<OrExpr>(node)) {
+      std::vector<std::unique_ptr<ir::QueryExpr>> exprs;
+
+      for (const auto& child : node->children) {
+        exprs.push_back(Node::As<ir::QueryExpr>(GET_OR_RET(Transform(child))));
+      }
+
+      return Node::Create<ir::OrExpr>(std::move(exprs));
+    } else if (Is<SelectExpr>(node)) {
+      std::vector<std::unique_ptr<ir::FieldRef>> fields;
+
+      if (node->children.size() == 1 && Is<Wildcard>(node->children[0])) {
+        return Node::Create<ir::SelectExpr>(std::move(fields));
+      }
+
+      for (const auto& child : node->children) {
+        fields.push_back(std::make_unique<ir::FieldRef>(child->string()));
+      }
+
+      return Node::Create<ir::SelectExpr>(std::move(fields));
+    } else if (Is<FromExpr>(node)) {
+      CHECK(node->children.size() == 1);
+      return Node::Create<ir::IndexRef>(node->children[0]->string());
+    } else if (Is<WhereClause>(node)) {
+      CHECK(node->children.size() == 1);
+      return Transform(node->children[0]);
+    } else if (Is<LimitClause>(node)) {
+      CHECK(node->children.size() == 1 || node->children.size() == 2);
+
+      size_t offset = 0, count = std::numeric_limits<size_t>::max();
+      if (node->children.size() == 1) {
+        count = *ParseInt(node->children[0]->string());
+      } else {
+        offset = *ParseInt(node->children[0]->string());
+        count = *ParseInt(node->children[1]->string());
+      }
+
+      return Node::Create<ir::Limit>(offset, count);
+    }
+    if (Is<OrderByClause>(node)) {
+      CHECK(node->children.size() == 1 || node->children.size() == 2);
+
+      auto field = std::make_unique<FieldRef>(node->children[0]->string());
+      auto order = SortBy::Order::ASC;
+      if (node->children.size() == 2 && node->children[1]->string_view() == 
"desc") {
+        order = SortBy::Order::DESC;
+      }
+
+      return Node::Create<SortBy>(order, std::move(field));
+    } else if (Is<SearchStmt>(node)) {  // root node
+      CHECK(node->children.size() >= 2 && node->children.size() <= 5);
+
+      auto index = 
Node::As<ir::IndexRef>(GET_OR_RET(Transform(node->children[1])));
+      auto select = 
Node::As<ir::SelectExpr>(GET_OR_RET(Transform(node->children[0])));
+
+      std::unique_ptr<ir::QueryExpr> query_expr;
+      std::unique_ptr<ir::Limit> limit;
+      std::unique_ptr<ir::SortBy> sort_by;
+
+      for (size_t i = 2; i < node->children.size(); ++i) {
+        if (Is<WhereClause>(node->children[i])) {
+          query_expr = 
Node::As<ir::QueryExpr>(GET_OR_RET(Transform(node->children[i])));
+        } else if (Is<LimitClause>(node->children[i])) {
+          limit = 
Node::As<ir::Limit>(GET_OR_RET(Transform(node->children[i])));
+        } else if (Is<OrderByClause>(node->children[i])) {
+          sort_by = 
Node::As<ir::SortBy>(GET_OR_RET(Transform(node->children[i])));
+        }
+      }
+
+      return Node::Create<ir::SearchStmt>(std::move(index), 
std::move(query_expr), std::move(limit), std::move(sort_by),
+                                          std::move(select));
+    } else if (IsRoot(node)) {
+      CHECK(node->children.size() == 1);
+
+      return Transform(node->children[0]);
+    } else {
+      // UNREACHABLE CODE, just for debugging here
+      return {Status::NotOK, fmt::format("encountered invalid node type: {}", 
node->type)};
+    }
+  }
+};
+
+template <typename Input>
+StatusOr<std::unique_ptr<ir::Node>> ParseToIR(Input&& in) {
+  return 
Transformer::Transform(GET_OR_RET(ParseToTree(std::forward<Input>(in))));
+}
+
+}  // namespace sql
+
+}  // namespace kqir
diff --git a/tests/cppunit/sql_parser_test.cc b/tests/cppunit/sql_parser_test.cc
new file mode 100644
index 00000000..3a99746d
--- /dev/null
+++ b/tests/cppunit/sql_parser_test.cc
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+#include <gtest/gtest.h>
+#include <search/sql_transformer.h>
+
+#include "tao/pegtl/contrib/parse_tree_to_dot.hpp"
+#include "tao/pegtl/string_input.hpp"
+
+using namespace kqir::sql;
+
+static auto Parse(const std::string& in) { return ParseToIR(string_input(in, 
"test")); }
+
+#define AssertSyntaxError(node) ASSERT_EQ(node.Msg(), "invalid syntax");  // 
NOLINT
+
+// NOLINTNEXTLINE
+#define AssertIR(node, val)              \
+  ASSERT_EQ(node.Msg(), Status::ok_msg); \
+  ASSERT_EQ(node.GetValue()->Dump(), val);
+
+TEST(SQLParserTest, Simple) {
+  AssertSyntaxError(Parse("x"));
+  AssertSyntaxError(Parse("1"));
+  AssertSyntaxError(Parse("select"));
+  AssertSyntaxError(Parse("where"));
+  AssertSyntaxError(Parse("limit"));
+  AssertSyntaxError(Parse("from a"));
+  AssertSyntaxError(Parse("select 0 from"));
+  AssertSyntaxError(Parse("select 0 from b"));
+  AssertSyntaxError(Parse("select a from 123"));
+  AssertSyntaxError(Parse("select a from \"b\""));
+  AssertSyntaxError(Parse("select a from b, c"));
+  AssertSyntaxError(Parse("select a from b where"));
+  AssertSyntaxError(Parse("select a from b hello"));
+  AssertSyntaxError(Parse("select a from b where"));
+  AssertSyntaxError(Parse("select a from b where 1"));
+  AssertSyntaxError(Parse("select a from b where 0"));
+  AssertSyntaxError(Parse("select a from b where \"x\""));
+  AssertSyntaxError(Parse("select a from b where limit 10"));
+  AssertSyntaxError(Parse("select a from b where true and"));
+  AssertSyntaxError(Parse("select a from b where (true"));
+  AssertSyntaxError(Parse("select a from b where (true))"));
+  AssertSyntaxError(Parse("select a from b where 1 >"));
+  AssertSyntaxError(Parse("select a from b where x ="));
+  AssertSyntaxError(Parse("select a from b where x hastag"));
+  AssertSyntaxError(Parse("select a from b where ="));
+  AssertSyntaxError(Parse("select a from b where hastag x"));
+  AssertSyntaxError(Parse("select a from b where = 1"));
+  AssertSyntaxError(Parse("select a from b where x hashtag \""));
+  AssertSyntaxError(Parse(R"(select a from b where x hashtag "\p")"));
+  AssertSyntaxError(Parse(R"(select a from b where x hashtag "\u11")"));
+  AssertSyntaxError(Parse(R"(select a from b where x hashtag "\")"));
+  AssertSyntaxError(Parse(R"(select a from b where x hashtag "abc)"));
+  AssertSyntaxError(Parse("select a from b where limit 10"));
+  AssertSyntaxError(Parse("select a from b limit 1, 1, 1"));
+  AssertSyntaxError(Parse("select a from b limit -10"));
+  AssertSyntaxError(Parse("select a from b limit"));
+  AssertSyntaxError(Parse("select a from b order"));
+  AssertSyntaxError(Parse("select a from b order by"));
+  AssertSyntaxError(Parse("select a from b order by a bsc"));
+  AssertSyntaxError(Parse("select a from b order a"));
+  AssertSyntaxError(Parse("select a from b order asc"));
+  AssertSyntaxError(Parse("select a from b order by a limit"));
+
+  AssertIR(Parse("select a from b"), "select a from b");
+  AssertIR(Parse(" select  a  from  b "), "select a from b");
+  AssertIR(Parse("\nselect\n  a\t \tfrom \n\nb "), "select a from b");
+  AssertIR(Parse("select * from b"), "select * from b");
+  AssertIR(Parse("select a, b from c"), "select a, b from c");
+  AssertIR(Parse("select a, b, c from d"), "select a, b, c from d");
+  AssertIR(Parse("select  xY_z12_3 ,  X00  from  b"), "select xY_z12_3, X00 
from b");
+  AssertIR(Parse("select a from b where true"), "select a from b where true");
+  AssertIR(Parse("select a from b where false"), "select a from b where 
false");
+  AssertIR(Parse("select a from b where true and true"), "select a from b 
where (and true, true)");
+  AssertIR(Parse("select a from b where false and true and false"), "select a 
from b where (and false, true, false)");
+  AssertIR(Parse("select a from b where false or true"), "select a from b 
where (or false, true)");
+  AssertIR(Parse("select a from b where true or false or true"), "select a 
from b where (or true, false, true)");
+  AssertIR(Parse("select a from b where false and true or false"),
+           "select a from b where (or (and false, true), false)");
+  AssertIR(Parse("select a from b where false or true and false"),
+           "select a from b where (or false, (and true, false))");
+  AssertIR(Parse("select a from b where false and (true or false)"),
+           "select a from b where (and false, (or true, false))");
+  AssertIR(Parse("select a from b where (false or true) and false"),
+           "select a from b where (and (or false, true), false)");
+  AssertIR(Parse("select a from b where (false)"), "select a from b where 
false");
+  AssertIR(Parse("select a from b where ((false))"), "select a from b where 
false");
+  AssertIR(Parse("select a from b where (((false)))"), "select a from b where 
false");
+  AssertIR(Parse("select a from b where ((false) and (false))"), "select a 
from b where (and false, false)");
+  AssertIR(Parse("select a from b where x=1"), "select a from b where x = 1");
+  AssertIR(Parse("select a from b where x = 1.0"), "select a from b where x = 
1");
+  AssertIR(Parse("select a from b where x = -1.234e5"), "select a from b where 
x = -123400");
+  AssertIR(Parse("select a from b where x = -1.234e-5"), "select a from b 
where x = -1.234e-05");
+  AssertIR(Parse("select a from b where x = 222e+5"), "select a from b where x 
= 22200000");
+  AssertIR(Parse("select a from b where 1 = x"), "select a from b where x = 
1");
+  AssertIR(Parse("select a from b where 2 < y"), "select a from b where y > 
2");
+  AssertIR(Parse("select a from b where y > 2"), "select a from b where y > 
2");
+  AssertIR(Parse("select a from b where 3 >= z"), "select a from b where z <= 
3");
+  AssertIR(Parse("select a from b where x hastag \"hi\""), "select a from b 
where x hastag \"hi\"");
+  AssertIR(Parse(R"(select a from b where x hastag "a\nb")"), R"(select a from 
b where x hastag "a\nb")");
+  AssertIR(Parse(R"(select a from b where x hastag "")"), R"(select a from b 
where x hastag "")");
+  AssertIR(Parse(R"(select a from b where x hastag "hello ,  hi")"), R"(select 
a from b where x hastag "hello ,  hi")");
+  AssertIR(Parse(R"(select a from b where x hastag "a\nb\t\n")"), R"(select a 
from b where x hastag "a\nb\t\n")");
+  AssertIR(Parse(R"(select a from b where x hastag "a\u0000")"), R"(select a 
from b where x hastag "a\x00")");
+  AssertIR(Parse("select a from b where x > 1 and y < 33"), "select a from b 
where (and x > 1, y < 33)");
+  AssertIR(Parse("select a from b where x >= 1 and y hastag \"hi\" or c <= 
233"),
+           "select a from b where (or (and x >= 1, y hastag \"hi\"), c <= 
233)");
+  AssertIR(Parse("select a from b limit 10"), "select a from b limit 0, 10");
+  AssertIR(Parse("select a from b limit 2, 3"), "select a from b limit 2, 3");
+  AssertIR(Parse("select a from b order by a"), "select a from b sortby a, 
asc");
+  AssertIR(Parse("select a from b order by c desc"), "select a from b sortby 
c, desc");
+  AssertIR(Parse("select a from b order by a limit 10"), "select a from b 
sortby a, asc limit 0, 10");
+  AssertIR(Parse("select a from b where c = 1 limit 10"), "select a from b 
where c = 1 limit 0, 10");
+  AssertIR(Parse("select a from b where c = 1 and d hastag \"x\" order by e"),
+           "select a from b where (and c = 1, d hastag \"x\") sortby e, asc");
+  AssertIR(Parse("select a from b where c = 1 or d hastag \"x\" and 2 <= e 
order by e asc limit 0, 10"),
+           "select a from b where (or c = 1, (and d hastag \"x\", e >= 2)) 
sortby e, asc limit 0, 10");
+}


Reply via email to