This is an automated email from the ASF dual-hosted git repository.
twice pushed a commit to branch unstable
in repository https://gitbox.apache.org/repos/asf/kvrocks.git
The following commit(s) were added to refs/heads/unstable by this push:
new 54d00847 Add SQL SELECT parser via PEGTL for search module (#2184)
54d00847 is described below
commit 54d00847ad6a7a5c3bed0f13f94f298557a36e15
Author: Twice <[email protected]>
AuthorDate: Sat Mar 23 13:52:44 2024 +0900
Add SQL SELECT parser via PEGTL for search module (#2184)
---
.github/workflows/kvrocks.yaml | 5 +-
CMakeLists.txt | 2 +
NOTICE | 1 +
cmake/pegtl.cmake | 27 +++++
cmake/utils.cmake | 7 ++
licenses/LICENSE-pegtl.txt | 21 ++++
src/common/string_util.cc | 6 +
src/common/string_util.h | 16 +++
src/search/ir.h | 226 ++++++++++++++++++++++++++++++------
src/search/sql_parser.h | 115 +++++++++++++++++++
src/search/sql_transformer.h | 242 +++++++++++++++++++++++++++++++++++++++
tests/cppunit/sql_parser_test.cc | 135 ++++++++++++++++++++++
12 files changed, 766 insertions(+), 37 deletions(-)
diff --git a/.github/workflows/kvrocks.yaml b/.github/workflows/kvrocks.yaml
index f96cf41e..675a23b6 100644
--- a/.github/workflows/kvrocks.yaml
+++ b/.github/workflows/kvrocks.yaml
@@ -440,8 +440,9 @@ jobs:
- name: Install cmake
if: ${{ startsWith(matrix.image, 'centos') }}
run: |
- wget
https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.sh
- bash cmake-3.26.4-linux-x86_64.sh --skip-license --prefix=/usr
+ VERSION=3.26.4
+ wget
https://github.com/Kitware/CMake/releases/download/v$VERSION/cmake-$VERSION-linux-x86_64.sh
+ bash cmake-$VERSION-linux-x86_64.sh --skip-license --prefix=/usr
- uses: actions/checkout@v3 #v4 use Node 20 and not working at CentOS 7
- uses: actions/setup-go@v4 #v5 use Node 20 too
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3e384e03..d508d4ea 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -139,6 +139,7 @@ include(cmake/jsoncons.cmake)
include(cmake/xxhash.cmake)
include(cmake/span.cmake)
include(cmake/trie.cmake)
+include(cmake/pegtl.cmake)
if (ENABLE_LUAJIT)
include(cmake/luajit.cmake)
@@ -171,6 +172,7 @@ list(APPEND EXTERNAL_LIBS ${Backtrace_LIBRARY})
list(APPEND EXTERNAL_LIBS xxhash)
list(APPEND EXTERNAL_LIBS span-lite)
list(APPEND EXTERNAL_LIBS tsl_hat_trie)
+list(APPEND EXTERNAL_LIBS pegtl)
# Add git sha to version.h
find_package(Git REQUIRED)
diff --git a/NOTICE b/NOTICE
index a25a60d1..86c357d8 100644
--- a/NOTICE
+++ b/NOTICE
@@ -66,6 +66,7 @@ The text of each license is also included in
licenses/LICENSE-[project].txt
* LuaJIT(https://github.com/KvrocksLabs/LuaJIT)
* lua(https://github.com/KvrocksLabs/lua, alternative to LuaJIT)
* hat-trie(https://github.com/Tessil/hat-trie)
+* pegtl(https://github.com/taocpp/PEGTL, NOTE: changed to BSL-1.0 in main
branch)
================================================================
Boost Software License Version 1.0
diff --git a/cmake/pegtl.cmake b/cmake/pegtl.cmake
new file mode 100644
index 00000000..d1638f36
--- /dev/null
+++ b/cmake/pegtl.cmake
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+include_guard()
+
+include(cmake/utils.cmake)
+
+FetchContent_DeclareGitHubTarWithMirror(pegtl
+ taocpp/PEGTL 3.2.7
+ MD5=31b14660c883bc0489ddcdfbd29199c9
+)
+
+FetchContent_MakeAvailableWithArgs(pegtl)
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index c1a8594b..cd023a72 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -58,3 +58,10 @@ function(FetchContent_DeclareGitHubWithMirror dep repo tag
hash)
${hash}
)
endfunction()
+
+function(FetchContent_DeclareGitHubTarWithMirror dep repo tag hash)
+ FetchContent_DeclareWithMirror(${dep}
+ https://github.com/${repo}/archive/${tag}.tar.gz
+ ${hash}
+ )
+endfunction()
diff --git a/licenses/LICENSE-pegtl.txt b/licenses/LICENSE-pegtl.txt
new file mode 100644
index 00000000..97fa8314
--- /dev/null
+++ b/licenses/LICENSE-pegtl.txt
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2007-2022 Dr. Colin Hirsch and Daniel Frey
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/src/common/string_util.cc b/src/common/string_util.cc
index ae41f918..e9088042 100644
--- a/src/common/string_util.cc
+++ b/src/common/string_util.cc
@@ -357,6 +357,12 @@ std::string EscapeString(std::string_view s) {
case '\b':
str += "\\b";
break;
+ case '\v':
+ str += "\\v";
+ break;
+ case '\f':
+ str += "\\f";
+ break;
default:
if (isprint(ch)) {
str += ch;
diff --git a/src/common/string_util.h b/src/common/string_util.h
index d23ebad7..45c4a31d 100644
--- a/src/common/string_util.h
+++ b/src/common/string_util.h
@@ -39,4 +39,20 @@ std::string StringToHex(std::string_view input);
std::vector<std::string> TokenizeRedisProtocol(const std::string &value);
std::string EscapeString(std::string_view s);
+template <typename T, typename F>
+std::string StringJoin(
+ const T &con, F &&f = [](const auto &v) -> decltype(auto) { return v; },
const std::string &sep = ", ") {
+ std::string res;
+ bool is_first = true;
+ for (const auto &v : con) {
+ if (is_first) {
+ is_first = false;
+ } else {
+ res += sep;
+ }
+ res += std::forward<F>(f)(v);
+ }
+ return res;
+}
+
} // namespace util
diff --git a/src/search/ir.h b/src/search/ir.h
index d431d1be..87ed6488 100644
--- a/src/search/ir.h
+++ b/src/search/ir.h
@@ -20,87 +20,243 @@
#pragma once
+#include <fmt/format.h>
+
#include <limits>
#include <memory>
+#include <optional>
#include <string>
+#include <utility>
#include <variant>
#include <vector>
+#include "fmt/core.h"
+#include "string_util.h"
+
// kqir stands for Kvorcks Query Intermediate Representation
namespace kqir {
-struct FieldRef {
+struct Node {
+ virtual std::string Dump() const = 0;
+ virtual ~Node() = default;
+
+ template <typename T, typename U = Node, typename... Args>
+ static std::unique_ptr<U> Create(Args &&...args) {
+ return std::unique_ptr<U>(new T(std::forward<Args>(args)...));
+ }
+
+ template <typename T>
+ static std::unique_ptr<T> As(std::unique_ptr<Node> &&original) {
+ auto casted = dynamic_cast<T *>(original.release());
+ CHECK(casted);
+ return std::unique_ptr<T>(casted);
+ }
+};
+
+struct FieldRef : Node {
std::string name;
+
+ explicit FieldRef(std::string name) : name(std::move(name)) {}
+
+ std::string Dump() const override { return name; }
};
-struct StringLiteral {
+struct StringLiteral : Node {
std::string val;
+
+ explicit StringLiteral(std::string val) : val(std::move(val)) {}
+
+ std::string Dump() const override { return fmt::format("\"{}\"",
util::EscapeString(val)); }
};
-struct TagContainExpr {
- FieldRef field;
- StringLiteral tag;
+struct QueryExpr : Node {};
+
+struct BoolAtomExpr : QueryExpr {};
+
+struct TagContainExpr : BoolAtomExpr {
+ std::unique_ptr<FieldRef> field;
+ std::unique_ptr<StringLiteral> tag;
+
+ TagContainExpr(std::unique_ptr<FieldRef> &&field,
std::unique_ptr<StringLiteral> &&tag)
+ : field(std::move(field)), tag(std::move(tag)) {}
+
+ std::string Dump() const override { return fmt::format("{} hastag {}",
field->Dump(), tag->Dump()); }
};
-struct NumericLiteral {
+struct NumericLiteral : Node {
double val;
+
+ explicit NumericLiteral(double val) : val(val) {}
+
+ std::string Dump() const override { return fmt::format("{}", val); }
};
-struct NumericCompareExpr {
- enum { EQ, NE, LT, LET, GT, GET } op;
- FieldRef field;
- NumericLiteral num;
+// NOLINTNEXTLINE
+#define KQIR_NUMERIC_COMPARE_OPS(X) \
+ X(EQ, =, NE, EQ) X(NE, !=, EQ, NE) X(LT, <, GET, GT) X(LET, <=, GT, GET)
X(GT, >, LET, LT) X(GET, >=, LT, LET)
+
+struct NumericCompareExpr : BoolAtomExpr {
+ enum Op {
+#define X(n, s, o, f) n, // NOLINT
+ KQIR_NUMERIC_COMPARE_OPS(X)
+#undef X
+ } op;
+ std::unique_ptr<FieldRef> field;
+ std::unique_ptr<NumericLiteral> num;
+
+ NumericCompareExpr(Op op, std::unique_ptr<FieldRef> &&field,
std::unique_ptr<NumericLiteral> &&num)
+ : op(op), field(std::move(field)), num(std::move(num)) {}
+
+ static constexpr const char *ToOperator(Op op) {
+ switch (op) {
+// NOLINTNEXTLINE
+#define X(n, s, o, f) \
+ case n: \
+ return #s;
+ KQIR_NUMERIC_COMPARE_OPS(X)
+#undef X
+ }
+
+ return nullptr;
+ }
+
+ static constexpr std::optional<Op> FromOperator(std::string_view op) {
+// NOLINTNEXTLINE
+#define X(n, s, o, f) \
+ if (op == #s) return n;
+ KQIR_NUMERIC_COMPARE_OPS(X)
+#undef X
+
+ return std::nullopt;
+ }
+
+ static constexpr Op Negative(Op op) {
+ switch (op) {
+// NOLINTNEXTLINE
+#define X(n, s, o, f) \
+ case n: \
+ return o;
+ KQIR_NUMERIC_COMPARE_OPS(X)
+#undef X
+ }
+
+ __builtin_unreachable();
+ }
+
+ static constexpr Op Flip(Op op) {
+ switch (op) {
+// NOLINTNEXTLINE
+#define X(n, s, o, f) \
+ case n: \
+ return f;
+ KQIR_NUMERIC_COMPARE_OPS(X)
+#undef X
+ }
+
+ __builtin_unreachable();
+ }
+
+ std::string Dump() const override { return fmt::format("{} {} {}",
field->Dump(), ToOperator(op), num->Dump()); };
};
-struct BoolLiteral {
+struct BoolLiteral : BoolAtomExpr {
bool val;
-};
-struct BooleanExpr {
- std::variant<TagContainExpr, NumericCompareExpr, BoolLiteral> expr;
+ explicit BoolLiteral(bool val) : val(val) {}
+
+ std::string Dump() const override { return val ? "true" : "false"; }
};
struct QueryExpr;
-struct LogicalUnaryExpr {
- enum { NOT } op;
+struct NotExpr : QueryExpr {
std::unique_ptr<QueryExpr> inner;
+
+ explicit NotExpr(std::unique_ptr<QueryExpr> &&inner) :
inner(std::move(inner)) {}
+
+ std::string Dump() const override { return fmt::format("not {}",
inner->Dump()); }
};
-struct LogicalBinaryExpr {
- enum { AND, OR } op;
- std::unique_ptr<QueryExpr> lhs;
- std::unique_ptr<QueryExpr> rhs;
+struct AndExpr : QueryExpr {
+ std::vector<std::unique_ptr<QueryExpr>> inners;
+
+ explicit AndExpr(std::vector<std::unique_ptr<QueryExpr>> &&inners) :
inners(std::move(inners)) {}
+
+ std::string Dump() const override {
+ return fmt::format("(and {})", util::StringJoin(inners, [](const auto &v)
{ return v->Dump(); }));
+ }
};
-struct QueryExpr {
- std::variant<LogicalUnaryExpr, LogicalBinaryExpr, BooleanExpr> expr;
+struct OrExpr : QueryExpr {
+ std::vector<std::unique_ptr<QueryExpr>> inners;
+
+ explicit OrExpr(std::vector<std::unique_ptr<QueryExpr>> &&inners) :
inners(std::move(inners)) {}
+
+ std::string Dump() const override {
+ return fmt::format("(or {})", util::StringJoin(inners, [](const auto &v) {
return v->Dump(); }));
+ }
};
-struct Limit {
+struct Limit : Node {
size_t offset = 0;
size_t count = std::numeric_limits<size_t>::max();
+
+ Limit(size_t offset, size_t count) : offset(offset), count(count) {}
+
+ std::string Dump() const override { return fmt::format("limit {}, {}",
offset, count); }
};
-struct SortBy {
- enum { ASC, DESC } order;
- FieldRef field;
+struct SortBy : Node {
+ enum Order { ASC, DESC } order = ASC;
+ std::unique_ptr<FieldRef> field;
+
+ SortBy(Order order, std::unique_ptr<FieldRef> &&field) : order(order),
field(std::move(field)) {}
+
+ static constexpr const char *OrderToString(Order order) { return order ==
ASC ? "asc" : "desc"; }
+ std::string Dump() const override { return fmt::format("sortby {}, {}",
field->Dump(), OrderToString(order)); }
};
-struct SelectExpr {
- std::vector<FieldRef> fields;
+struct SelectExpr : Node {
+ std::vector<std::unique_ptr<FieldRef>> fields;
+
+ explicit SelectExpr(std::vector<std::unique_ptr<FieldRef>> &&fields) :
fields(std::move(fields)) {}
+
+ std::string Dump() const override {
+ if (fields.empty()) return "select *";
+ return fmt::format("select {}", util::StringJoin(fields, [](const auto &v)
{ return v->Dump(); }));
+ }
};
-struct IndexRef {
+struct IndexRef : Node {
std::string name;
+
+ explicit IndexRef(std::string name) : name(std::move(name)) {}
+
+ std::string Dump() const override { return name; }
};
-struct SearchStmt {
- IndexRef index_ref;
- QueryExpr query_expr;
- Limit limit;
- SortBy sort_by;
- SelectExpr select_expr;
+struct SearchStmt : Node {
+ std::unique_ptr<IndexRef> index;
+ std::unique_ptr<QueryExpr> query_expr; // optional
+ std::unique_ptr<Limit> limit; // optional
+ std::unique_ptr<SortBy> sort_by; // optional
+ std::unique_ptr<SelectExpr> select_expr;
+
+ SearchStmt(std::unique_ptr<IndexRef> &&index, std::unique_ptr<QueryExpr>
&&query_expr, std::unique_ptr<Limit> &&limit,
+ std::unique_ptr<SortBy> &&sort_by, std::unique_ptr<SelectExpr>
&&select_expr)
+ : index(std::move(index)),
+ query_expr(std::move(query_expr)),
+ limit(std::move(limit)),
+ sort_by(std::move(sort_by)),
+ select_expr(std::move(select_expr)) {}
+
+ std::string Dump() const override {
+ std::string opt;
+ if (query_expr) opt += " where " + query_expr->Dump();
+ if (sort_by) opt += " " + sort_by->Dump();
+ if (limit) opt += " " + limit->Dump();
+ return fmt::format("{} from {}{}", select_expr->Dump(), index->Dump(),
opt);
+ }
};
} // namespace kqir
diff --git a/src/search/sql_parser.h b/src/search/sql_parser.h
new file mode 100644
index 00000000..dffe051d
--- /dev/null
+++ b/src/search/sql_parser.h
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+#include <tao/pegtl.hpp>
+
+namespace kqir {
+
+namespace peg = tao::pegtl;
+
+namespace sql {
+
+using namespace peg;
+
+struct True : string<'t', 'r', 'u', 'e'> {};
+struct False : string<'f', 'a', 'l', 's', 'e'> {};
+struct Boolean : sor<True, False> {};
+
+struct Digits : plus<digit> {};
+struct NumberExp : seq<one<'e', 'E'>, opt<one<'-', '+'>>, Digits> {};
+struct NumberFrac : seq<one<'.'>, Digits> {};
+struct Number : seq<opt<one<'-'>>, Digits, opt<NumberFrac>, opt<NumberExp>> {};
+
+struct UnicodeXDigit : list<seq<one<'u'>, rep<4, xdigit>>, one<'\\'>> {};
+struct EscapedSingleChar : one<'"', '\\', 'b', 'f', 'n', 'r', 't'> {};
+struct EscapedChar : sor<EscapedSingleChar, UnicodeXDigit> {};
+struct UnescapedChar : utf8::range<0x20, 0x10FFFF> {};
+struct Char : if_then_else<one<'\\'>, EscapedChar, UnescapedChar> {};
+
+struct StringContent : until<at<one<'"'>>, Char> {};
+struct String : seq<one<'"'>, StringContent, any> {};
+
+struct Identifier : identifier {};
+
+struct WhiteSpace : one<' ', '\t', '\n', '\r'> {};
+template <typename T>
+struct WSPad : pad<T, WhiteSpace> {};
+
+struct HasTag : string<'h', 'a', 's', 't', 'a', 'g'> {};
+struct HasTagExpr : WSPad<seq<Identifier, WSPad<HasTag>, String>> {};
+
+struct NumericAtomExpr : WSPad<sor<Number, Identifier>> {};
+struct NumericCompareOp : sor<string<'!', '='>, string<'<', '='>, string<'>',
'='>, one<'=', '<', '>'>> {};
+struct NumericCompareExpr : seq<NumericAtomExpr, NumericCompareOp,
NumericAtomExpr> {};
+
+struct BooleanAtomExpr : sor<HasTagExpr, NumericCompareExpr, WSPad<Boolean>>
{};
+
+struct QueryExpr;
+
+struct ParenExpr : WSPad<seq<one<'('>, QueryExpr, one<')'>>> {};
+
+struct NotExpr;
+
+struct BooleanExpr : sor<BooleanAtomExpr, ParenExpr, NotExpr> {};
+
+struct Not : string<'n', 'o', 't'> {};
+struct NotExpr : seq<WSPad<Not>, BooleanExpr> {};
+
+struct And : string<'a', 'n', 'd'> {};
+// left recursion elimination
+// struct AndExpr : sor<seq<AndExpr, And, NotExpr>, NotExpr> {};
+struct AndExpr : seq<BooleanExpr, plus<seq<And, BooleanExpr>>> {};
+struct AndExprP : sor<AndExpr, BooleanExpr> {};
+
+struct Or : string<'o', 'r'> {};
+// left recursion elimination
+// struct OrExpr : sor<seq<OrExpr, Or, AndExpr>, AndExpr> {};
+struct OrExpr : seq<AndExprP, plus<seq<Or, AndExprP>>> {};
+struct OrExprP : sor<OrExpr, AndExprP> {};
+
+struct QueryExpr : seq<OrExprP> {};
+
+struct Select : string<'s', 'e', 'l', 'e', 'c', 't'> {};
+struct From : string<'f', 'r', 'o', 'm'> {};
+
+struct Wildcard : one<'*'> {};
+struct IdentifierList : seq<Identifier, star<WSPad<one<','>>, Identifier>> {};
+struct SelectExpr : WSPad<sor<Wildcard, IdentifierList>> {};
+struct FromExpr : WSPad<Identifier> {};
+
+struct Where : string<'w', 'h', 'e', 'r', 'e'> {};
+struct OrderBy : seq<string<'o', 'r', 'd', 'e', 'r'>, plus<WhiteSpace>,
string<'b', 'y'>> {};
+struct Asc : string<'a', 's', 'c'> {};
+struct Desc : string<'d', 'e', 's', 'c'> {};
+struct Limit : string<'l', 'i', 'm', 'i', 't'> {};
+
+struct Integer : Digits {};
+
+struct WhereClause : seq<Where, QueryExpr> {};
+struct AscOrDesc : WSPad<sor<Asc, Desc>> {};
+struct OrderByClause : seq<OrderBy, WSPad<Identifier>, opt<AscOrDesc>> {};
+struct LimitClause : seq<Limit, opt<seq<WSPad<Integer>, one<','>>>,
WSPad<Integer>> {};
+
+struct SearchStmt
+ : WSPad<seq<Select, SelectExpr, From, FromExpr, opt<WhereClause>,
opt<OrderByClause>, opt<LimitClause>>> {};
+
+} // namespace sql
+
+} // namespace kqir
diff --git a/src/search/sql_transformer.h b/src/search/sql_transformer.h
new file mode 100644
index 00000000..76fac35c
--- /dev/null
+++ b/src/search/sql_transformer.h
@@ -0,0 +1,242 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+#include <limits>
+#include <memory>
+#include <tao/pegtl/contrib/parse_tree.hpp>
+#include <tao/pegtl/contrib/unescape.hpp>
+#include <tao/pegtl/demangle.hpp>
+#include <variant>
+
+#include "ir.h"
+#include "parse_util.h"
+#include "sql_parser.h"
+
+namespace kqir {
+
+namespace sql {
+
+namespace ir = kqir;
+
+template <typename Rule>
+using TreeSelector = parse_tree::selector<
+ Rule, parse_tree::store_content::on<Boolean, Number, String, Identifier,
NumericCompareOp, AscOrDesc, Integer>,
+ parse_tree::remove_content::on<HasTagExpr, NumericCompareExpr, NotExpr,
AndExpr, OrExpr, Wildcard, SelectExpr,
+ FromExpr, WhereClause, OrderByClause,
LimitClause, SearchStmt>>;
+
+template <typename Input>
+StatusOr<std::unique_ptr<parse_tree::node>> ParseToTree(Input&& in) {
+ if (auto root = parse_tree::parse<seq<SearchStmt, eof>,
TreeSelector>(std::forward<Input>(in))) {
+ return root;
+ } else {
+ // TODO: improve parse error message, with source location
+ return {Status::NotOK, "invalid syntax"};
+ }
+}
+
+struct Transformer {
+ using TreeNode = std::unique_ptr<parse_tree::node>;
+
+ template <typename T>
+ static bool Is(const TreeNode& node) {
+ return node->type == demangle<T>();
+ }
+
+ static bool IsRoot(const TreeNode& node) { return node->type.empty(); }
+
+ static StatusOr<std::string> UnescapeString(std::string_view str) {
+ str = str.substr(1, str.size() - 2);
+
+ std::string result;
+ while (!str.empty()) {
+ if (str[0] == '\\') {
+ str.remove_prefix(1);
+ switch (str[0]) {
+ case '\\':
+ case '"':
+ result.push_back(str[0]);
+ break;
+ case 'b':
+ result.push_back('\b');
+ break;
+ case 'f':
+ result.push_back('\f');
+ break;
+ case 'n':
+ result.push_back('\n');
+ break;
+ case 'r':
+ result.push_back('\r');
+ break;
+ case 't':
+ result.push_back('\t');
+ break;
+ case 'u':
+ if (!unescape::utf8_append_utf32(result,
+
unescape::unhex_string<unsigned>(str.data() + 1, str.data() + 5))) {
+ return {Status::NotOK,
+ fmt::format("invalid Unicode code point '{}' in string
literal", std::string(str.data() + 1, 4))};
+ }
+ str.remove_prefix(4);
+ break;
+ default:
+ __builtin_unreachable();
+ };
+ str.remove_prefix(1);
+ } else {
+ result.push_back(str[0]);
+ str.remove_prefix(1);
+ }
+ }
+
+ return result;
+ }
+
+ static auto Transform(const TreeNode& node) ->
StatusOr<std::unique_ptr<Node>> {
+ if (Is<Boolean>(node)) {
+ return Node::Create<ir::BoolLiteral>(node->string_view() == "true");
+ } else if (Is<Number>(node)) {
+ return Node::Create<ir::NumericLiteral>(*ParseFloat(node->string()));
+ } else if (Is<String>(node)) {
+ return
Node::Create<ir::StringLiteral>(GET_OR_RET(UnescapeString(node->string_view())));
+ } else if (Is<HasTagExpr>(node)) {
+ CHECK(node->children.size() == 2);
+
+ return
Node::Create<ir::TagContainExpr>(std::make_unique<ir::FieldRef>(node->children[0]->string()),
+
Node::As<ir::StringLiteral>(GET_OR_RET(Transform(node->children[1]))));
+ } else if (Is<NumericCompareExpr>(node)) {
+ CHECK(node->children.size() == 3);
+
+ const auto& lhs = node->children[0];
+ const auto& rhs = node->children[2];
+
+ auto op =
ir::NumericCompareExpr::FromOperator(node->children[1]->string_view()).value();
+ if (Is<Identifier>(lhs) && Is<Number>(rhs)) {
+ return Node::Create<ir::NumericCompareExpr>(op,
std::make_unique<ir::FieldRef>(lhs->string()),
+
Node::As<ir::NumericLiteral>(GET_OR_RET(Transform(rhs))));
+ } else if (Is<Number>(lhs) && Is<Identifier>(rhs)) {
+ return
Node::Create<ir::NumericCompareExpr>(ir::NumericCompareExpr::Flip(op),
+
std::make_unique<ir::FieldRef>(rhs->string()),
+
Node::As<ir::NumericLiteral>(GET_OR_RET(Transform(lhs))));
+ } else {
+ return {Status::NotOK, "the left and right side of numeric comparison
should be an identifier and a number"};
+ }
+ } else if (Is<NotExpr>(node)) {
+ CHECK(node->children.size() == 1);
+
+ return
Node::Create<ir::NotExpr>(Node::As<ir::QueryExpr>(GET_OR_RET(Transform(node->children[0]))));
+ } else if (Is<AndExpr>(node)) {
+ std::vector<std::unique_ptr<ir::QueryExpr>> exprs;
+
+ for (const auto& child : node->children) {
+ exprs.push_back(Node::As<ir::QueryExpr>(GET_OR_RET(Transform(child))));
+ }
+
+ return Node::Create<ir::AndExpr>(std::move(exprs));
+ } else if (Is<OrExpr>(node)) {
+ std::vector<std::unique_ptr<ir::QueryExpr>> exprs;
+
+ for (const auto& child : node->children) {
+ exprs.push_back(Node::As<ir::QueryExpr>(GET_OR_RET(Transform(child))));
+ }
+
+ return Node::Create<ir::OrExpr>(std::move(exprs));
+ } else if (Is<SelectExpr>(node)) {
+ std::vector<std::unique_ptr<ir::FieldRef>> fields;
+
+ if (node->children.size() == 1 && Is<Wildcard>(node->children[0])) {
+ return Node::Create<ir::SelectExpr>(std::move(fields));
+ }
+
+ for (const auto& child : node->children) {
+ fields.push_back(std::make_unique<ir::FieldRef>(child->string()));
+ }
+
+ return Node::Create<ir::SelectExpr>(std::move(fields));
+ } else if (Is<FromExpr>(node)) {
+ CHECK(node->children.size() == 1);
+ return Node::Create<ir::IndexRef>(node->children[0]->string());
+ } else if (Is<WhereClause>(node)) {
+ CHECK(node->children.size() == 1);
+ return Transform(node->children[0]);
+ } else if (Is<LimitClause>(node)) {
+ CHECK(node->children.size() == 1 || node->children.size() == 2);
+
+ size_t offset = 0, count = std::numeric_limits<size_t>::max();
+ if (node->children.size() == 1) {
+ count = *ParseInt(node->children[0]->string());
+ } else {
+ offset = *ParseInt(node->children[0]->string());
+ count = *ParseInt(node->children[1]->string());
+ }
+
+ return Node::Create<ir::Limit>(offset, count);
+ }
+ if (Is<OrderByClause>(node)) {
+ CHECK(node->children.size() == 1 || node->children.size() == 2);
+
+ auto field = std::make_unique<FieldRef>(node->children[0]->string());
+ auto order = SortBy::Order::ASC;
+ if (node->children.size() == 2 && node->children[1]->string_view() ==
"desc") {
+ order = SortBy::Order::DESC;
+ }
+
+ return Node::Create<SortBy>(order, std::move(field));
+ } else if (Is<SearchStmt>(node)) { // root node
+ CHECK(node->children.size() >= 2 && node->children.size() <= 5);
+
+ auto index =
Node::As<ir::IndexRef>(GET_OR_RET(Transform(node->children[1])));
+ auto select =
Node::As<ir::SelectExpr>(GET_OR_RET(Transform(node->children[0])));
+
+ std::unique_ptr<ir::QueryExpr> query_expr;
+ std::unique_ptr<ir::Limit> limit;
+ std::unique_ptr<ir::SortBy> sort_by;
+
+ for (size_t i = 2; i < node->children.size(); ++i) {
+ if (Is<WhereClause>(node->children[i])) {
+ query_expr =
Node::As<ir::QueryExpr>(GET_OR_RET(Transform(node->children[i])));
+ } else if (Is<LimitClause>(node->children[i])) {
+ limit =
Node::As<ir::Limit>(GET_OR_RET(Transform(node->children[i])));
+ } else if (Is<OrderByClause>(node->children[i])) {
+ sort_by =
Node::As<ir::SortBy>(GET_OR_RET(Transform(node->children[i])));
+ }
+ }
+
+ return Node::Create<ir::SearchStmt>(std::move(index),
std::move(query_expr), std::move(limit), std::move(sort_by),
+ std::move(select));
+ } else if (IsRoot(node)) {
+ CHECK(node->children.size() == 1);
+
+ return Transform(node->children[0]);
+ } else {
+ // UNREACHABLE CODE, just for debugging here
+ return {Status::NotOK, fmt::format("encountered invalid node type: {}",
node->type)};
+ }
+ }
+};
+
+template <typename Input>
+StatusOr<std::unique_ptr<ir::Node>> ParseToIR(Input&& in) {
+ return
Transformer::Transform(GET_OR_RET(ParseToTree(std::forward<Input>(in))));
+}
+
+} // namespace sql
+
+} // namespace kqir
diff --git a/tests/cppunit/sql_parser_test.cc b/tests/cppunit/sql_parser_test.cc
new file mode 100644
index 00000000..3a99746d
--- /dev/null
+++ b/tests/cppunit/sql_parser_test.cc
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+#include <gtest/gtest.h>
+#include <search/sql_transformer.h>
+
+#include "tao/pegtl/contrib/parse_tree_to_dot.hpp"
+#include "tao/pegtl/string_input.hpp"
+
+using namespace kqir::sql;
+
+static auto Parse(const std::string& in) { return ParseToIR(string_input(in,
"test")); }
+
+#define AssertSyntaxError(node) ASSERT_EQ(node.Msg(), "invalid syntax"); //
NOLINT
+
+// NOLINTNEXTLINE
+#define AssertIR(node, val) \
+ ASSERT_EQ(node.Msg(), Status::ok_msg); \
+ ASSERT_EQ(node.GetValue()->Dump(), val);
+
+TEST(SQLParserTest, Simple) {
+ AssertSyntaxError(Parse("x"));
+ AssertSyntaxError(Parse("1"));
+ AssertSyntaxError(Parse("select"));
+ AssertSyntaxError(Parse("where"));
+ AssertSyntaxError(Parse("limit"));
+ AssertSyntaxError(Parse("from a"));
+ AssertSyntaxError(Parse("select 0 from"));
+ AssertSyntaxError(Parse("select 0 from b"));
+ AssertSyntaxError(Parse("select a from 123"));
+ AssertSyntaxError(Parse("select a from \"b\""));
+ AssertSyntaxError(Parse("select a from b, c"));
+ AssertSyntaxError(Parse("select a from b where"));
+ AssertSyntaxError(Parse("select a from b hello"));
+ AssertSyntaxError(Parse("select a from b where"));
+ AssertSyntaxError(Parse("select a from b where 1"));
+ AssertSyntaxError(Parse("select a from b where 0"));
+ AssertSyntaxError(Parse("select a from b where \"x\""));
+ AssertSyntaxError(Parse("select a from b where limit 10"));
+ AssertSyntaxError(Parse("select a from b where true and"));
+ AssertSyntaxError(Parse("select a from b where (true"));
+ AssertSyntaxError(Parse("select a from b where (true))"));
+ AssertSyntaxError(Parse("select a from b where 1 >"));
+ AssertSyntaxError(Parse("select a from b where x ="));
+ AssertSyntaxError(Parse("select a from b where x hastag"));
+ AssertSyntaxError(Parse("select a from b where ="));
+ AssertSyntaxError(Parse("select a from b where hastag x"));
+ AssertSyntaxError(Parse("select a from b where = 1"));
+ AssertSyntaxError(Parse("select a from b where x hashtag \""));
+ AssertSyntaxError(Parse(R"(select a from b where x hashtag "\p")"));
+ AssertSyntaxError(Parse(R"(select a from b where x hashtag "\u11")"));
+ AssertSyntaxError(Parse(R"(select a from b where x hashtag "\")"));
+ AssertSyntaxError(Parse(R"(select a from b where x hashtag "abc)"));
+ AssertSyntaxError(Parse("select a from b where limit 10"));
+ AssertSyntaxError(Parse("select a from b limit 1, 1, 1"));
+ AssertSyntaxError(Parse("select a from b limit -10"));
+ AssertSyntaxError(Parse("select a from b limit"));
+ AssertSyntaxError(Parse("select a from b order"));
+ AssertSyntaxError(Parse("select a from b order by"));
+ AssertSyntaxError(Parse("select a from b order by a bsc"));
+ AssertSyntaxError(Parse("select a from b order a"));
+ AssertSyntaxError(Parse("select a from b order asc"));
+ AssertSyntaxError(Parse("select a from b order by a limit"));
+
+ AssertIR(Parse("select a from b"), "select a from b");
+ AssertIR(Parse(" select a from b "), "select a from b");
+ AssertIR(Parse("\nselect\n a\t \tfrom \n\nb "), "select a from b");
+ AssertIR(Parse("select * from b"), "select * from b");
+ AssertIR(Parse("select a, b from c"), "select a, b from c");
+ AssertIR(Parse("select a, b, c from d"), "select a, b, c from d");
+ AssertIR(Parse("select xY_z12_3 , X00 from b"), "select xY_z12_3, X00
from b");
+ AssertIR(Parse("select a from b where true"), "select a from b where true");
+ AssertIR(Parse("select a from b where false"), "select a from b where
false");
+ AssertIR(Parse("select a from b where true and true"), "select a from b
where (and true, true)");
+ AssertIR(Parse("select a from b where false and true and false"), "select a
from b where (and false, true, false)");
+ AssertIR(Parse("select a from b where false or true"), "select a from b
where (or false, true)");
+ AssertIR(Parse("select a from b where true or false or true"), "select a
from b where (or true, false, true)");
+ AssertIR(Parse("select a from b where false and true or false"),
+ "select a from b where (or (and false, true), false)");
+ AssertIR(Parse("select a from b where false or true and false"),
+ "select a from b where (or false, (and true, false))");
+ AssertIR(Parse("select a from b where false and (true or false)"),
+ "select a from b where (and false, (or true, false))");
+ AssertIR(Parse("select a from b where (false or true) and false"),
+ "select a from b where (and (or false, true), false)");
+ AssertIR(Parse("select a from b where (false)"), "select a from b where
false");
+ AssertIR(Parse("select a from b where ((false))"), "select a from b where
false");
+ AssertIR(Parse("select a from b where (((false)))"), "select a from b where
false");
+ AssertIR(Parse("select a from b where ((false) and (false))"), "select a
from b where (and false, false)");
+ AssertIR(Parse("select a from b where x=1"), "select a from b where x = 1");
+ AssertIR(Parse("select a from b where x = 1.0"), "select a from b where x =
1");
+ AssertIR(Parse("select a from b where x = -1.234e5"), "select a from b where
x = -123400");
+ AssertIR(Parse("select a from b where x = -1.234e-5"), "select a from b
where x = -1.234e-05");
+ AssertIR(Parse("select a from b where x = 222e+5"), "select a from b where x
= 22200000");
+ AssertIR(Parse("select a from b where 1 = x"), "select a from b where x =
1");
+ AssertIR(Parse("select a from b where 2 < y"), "select a from b where y >
2");
+ AssertIR(Parse("select a from b where y > 2"), "select a from b where y >
2");
+ AssertIR(Parse("select a from b where 3 >= z"), "select a from b where z <=
3");
+ AssertIR(Parse("select a from b where x hastag \"hi\""), "select a from b
where x hastag \"hi\"");
+ AssertIR(Parse(R"(select a from b where x hastag "a\nb")"), R"(select a from
b where x hastag "a\nb")");
+ AssertIR(Parse(R"(select a from b where x hastag "")"), R"(select a from b
where x hastag "")");
+ AssertIR(Parse(R"(select a from b where x hastag "hello , hi")"), R"(select
a from b where x hastag "hello , hi")");
+ AssertIR(Parse(R"(select a from b where x hastag "a\nb\t\n")"), R"(select a
from b where x hastag "a\nb\t\n")");
+ AssertIR(Parse(R"(select a from b where x hastag "a\u0000")"), R"(select a
from b where x hastag "a\x00")");
+ AssertIR(Parse("select a from b where x > 1 and y < 33"), "select a from b
where (and x > 1, y < 33)");
+ AssertIR(Parse("select a from b where x >= 1 and y hastag \"hi\" or c <=
233"),
+ "select a from b where (or (and x >= 1, y hastag \"hi\"), c <=
233)");
+ AssertIR(Parse("select a from b limit 10"), "select a from b limit 0, 10");
+ AssertIR(Parse("select a from b limit 2, 3"), "select a from b limit 2, 3");
+ AssertIR(Parse("select a from b order by a"), "select a from b sortby a,
asc");
+ AssertIR(Parse("select a from b order by c desc"), "select a from b sortby
c, desc");
+ AssertIR(Parse("select a from b order by a limit 10"), "select a from b
sortby a, asc limit 0, 10");
+ AssertIR(Parse("select a from b where c = 1 limit 10"), "select a from b
where c = 1 limit 0, 10");
+ AssertIR(Parse("select a from b where c = 1 and d hastag \"x\" order by e"),
+ "select a from b where (and c = 1, d hastag \"x\") sortby e, asc");
+ AssertIR(Parse("select a from b where c = 1 or d hastag \"x\" and 2 <= e
order by e asc limit 0, 10"),
+ "select a from b where (or c = 1, (and d hastag \"x\", e >= 2))
sortby e, asc limit 0, 10");
+}