adamdebreceni commented on code in PR #1692:
URL: https://github.com/apache/nifi-minifi-cpp/pull/1692#discussion_r1423921251


##########
extensions/standard-processors/utils/JoltUtils.cpp:
##########
@@ -0,0 +1,1134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "JoltUtils.h"
+#include "rapidjson/error/en.h"
+#include "Exception.h"
+
+namespace org::apache::nifi::minifi::utils::jolt {
+
+
+// Tells whether `ch` belongs to the fixed set '.', '[', ']', '$', '&', '@', '#', '*'.
+static bool isSpecialChar(char ch) {
+  switch (ch) {
+    case '.':
+    case '[':
+    case ']':
+    case '$':
+    case '&':
+    case '@':
+    case '#':
+    case '*':
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Returns true as soon as `str` contains an '&' that is not preceded by an
+// escaping backslash. A backslash hides exactly the one character after it.
+bool Spec::Template::check(std::string_view str) {
+  bool after_backslash = false;
+  for (const char current : str) {
+    if (after_backslash) {
+      // the escaped character is skipped, whatever it is
+      after_backslash = false;
+      continue;
+    }
+    if (current == '&') {
+      return true;
+    }
+    after_backslash = (current == '\\');
+  }
+  return false;
+}
+
+nonstd::expected<std::pair<Spec::Template, Spec::It>, std::string> 
Spec::Template::parse(It begin, It end) {  // parses literal text mixed with '&' references from [begin, end); returns the template plus the iterator where parsing stopped, or an error message
+  enum class State {
+    Plain,
+    Escaped,
+    Template,  // &
+    SimpleIndex,  // &1
+    CanonicalTemplate,  // &(
+    ParentIndex,  // &(1
+    NextIndex,  // &(1,
+    MatchIndex  // &(1,0
+  };
+
+  std::vector<std::string> fragments;  // literal pieces between references; always one more entry than `references`
+  std::vector<std::pair<size_t, size_t>> references;  // index pair of each '&' reference; both stay 0 unless given
+  fragments.push_back({});
+  State state = State::Plain;
+  std::string target;  // digits of the index currently being read
+  // go beyond the last char on purpose
+  auto ch_it = begin;
+  while (ch_it <= end) {
+    std::optional<char> ch;  // stays empty on the one-past-the-end iteration
+    if (ch_it < end) {
+      ch = *ch_it;
+    }
+    bool force_terminate = false;
+    switch (state) {
+      case State::Plain: {
+        if (ch == '\\') {
+          state = State::Escaped;
+        } else if (ch == '&') {
+          references.push_back({});
+          fragments.push_back({});
+          state = State::Template;
+        } else if (ch == ')' || ch == ']' || ch == '.' || ch == '[') {  // an unescaped delimiter ends the template; ch_it is left pointing at it
+          force_terminate = true;
+        } else if (ch) {
+          fragments.back() += ch.value();
+        }
+        break;
+      }
+      case State::Escaped: {
+        if (!ch) {
+          return nonstd::make_unexpected("Unterminated escape sequence");
+        }
+        if (ch != '\\' && !isSpecialChar(ch.value())) {
+          return nonstd::make_unexpected(fmt::format("Unknown escape sequence 
in template '\\{}'", ch.value()));
+        }
+        fragments.back() += ch.value();
+        state = State::Plain;
+        break;
+      }
+      case State::Template: {
+        if (ch == '(') {
+          state = State::CanonicalTemplate;
+        } else if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) 
{
+          target.clear();
+          target += ch.value();
+          state = State::SimpleIndex;
+        } else {  // a bare '&': the reference keeps its default indices
+          state = State::Plain;
+          // reprocess this char in a different state
+          --ch_it;
+        }
+        break;
+      }
+      case State::SimpleIndex: {
+        if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target += ch.value();
+        } else {
+          references.back().first = std::stoi(target);  // NOTE(review): std::stoi throws std::out_of_range for digit runs that do not fit an int
+          state = State::Plain;
+          // reprocess this char in a different state
+          --ch_it;
+        }
+        break;
+      }
+      case State::CanonicalTemplate: {
+        if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target.clear();
+          target += ch.value();
+          state = State::ParentIndex;
+        } else {
+          return nonstd::make_unexpected(fmt::format("Expected an index at 
{}", std::distance(begin, ch_it)));
+        }
+        break;
+      }
+      case State::ParentIndex: {
+        if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target += ch.value();
+        } else if (ch == ',') {
+          references.back().first = std::stoi(target);
+          state = State::NextIndex;
+        } else if (ch == ')') {  // "&(n)": only the first index is given
+          references.back().first = std::stoi(target);
+          state = State::Plain;
+        } else {
+          return nonstd::make_unexpected(fmt::format("Invalid character at {}, 
expected digit, comma or close parenthesis", std::distance(begin, ch_it)));
+        }
+        break;
+      }
+      case State::NextIndex: {
+        if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target.clear();
+          target += ch.value();
+          state = State::MatchIndex;
+        } else {
+          return nonstd::make_unexpected(fmt::format("Expected an index at 
{}", std::distance(begin, ch_it)));
+        }
+        break;
+      }
+      case State::MatchIndex: {
+        if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target += ch.value();
+        } else if (ch == ')') {
+          references.back().second = std::stoi(target);
+          state = State::Plain;
+        } else {
+          return nonstd::make_unexpected(fmt::format("Invalid character at {}, 
expected digit or close parenthesis", std::distance(begin, ch_it)));
+        }
+        break;
+      }
+    }
+    if (force_terminate) {
+      break;
+    }
+    if (ch_it != end) {  // either advances to the next char or cancels a preceding --ch_it so the same char is seen again
+      ++ch_it;
+    } else {
+      break;
+    }
+  }
+
+  gsl_Assert(state == State::Plain);  // every other state either returned an error or fell back to Plain on the end-of-input iteration
+  return std::pair<Template, It>{Template{std::move(fragments), 
std::move(references)}, ch_it};
+}
+
+// Returns true as soon as `str` contains a '*' that is not preceded by an
+// escaping backslash. A backslash hides exactly the one character after it.
+bool Spec::Regex::check(std::string_view str) {
+  bool after_backslash = false;
+  for (const char current : str) {
+    if (after_backslash) {
+      // the escaped character is skipped, whatever it is
+      after_backslash = false;
+      continue;
+    }
+    if (current == '*') {
+      return true;
+    }
+    after_backslash = (current == '\\');
+  }
+  return false;
+}
+
+// Splits `str` at every unescaped '*' into literal fragments and wraps them in a Regex.
+// A backslash may only precede another backslash or a character accepted by isSpecialChar;
+// the escaped character is appended to the current fragment without the backslash.
+// Returns an error message for an unknown or unterminated escape sequence.
+nonstd::expected<Spec::Regex, std::string> Spec::Regex::parse(std::string_view str) {
+  std::vector<std::string> pieces(1);
+  bool after_backslash = false;
+  for (const char current : str) {
+    if (after_backslash) {
+      if (current != '\\' && !isSpecialChar(current)) {
+        return nonstd::make_unexpected(fmt::format("Unknown escape sequence in pattern '\\{}'", current));
+      }
+      pieces.back() += current;
+      after_backslash = false;
+    } else if (current == '\\') {
+      after_backslash = true;
+    } else if (current == '*') {
+      // each wildcard starts a new fragment
+      pieces.emplace_back();
+    } else {
+      pieces.back() += current;
+    }
+  }
+  if (after_backslash) {
+    // the input ended right after a backslash
+    return nonstd::make_unexpected("Unterminated escape sequence");
+  }
+  gsl_Assert(!after_backslash);
+  return Regex{std::move(pieces)};
+}
+
+// Builds the concrete string for this template: the literal fragments are joined, and between
+// fragment i and i+1 the match selected by references[i] is inserted. The first index of a
+// reference is resolved through ctx.find, the second picks an entry of that result's `matches`.
+// Throws Exception(GENERAL_EXCEPTION, ...) when either index cannot be resolved.
+std::string Spec::Template::eval(const Context& ctx) const {
+  std::string output;
+  for (size_t pos = 0; pos + 1 < fragments.size(); ++pos) {
+    output += fragments.at(pos);
+    const auto& reference = references.at(pos);
+    const auto level = reference.first;
+    const auto match_index = reference.second;
+    auto* source = ctx.find(level);
+    if (source == nullptr) {
+      throw Exception(GENERAL_EXCEPTION, fmt::format("Invalid reference to {} at {}", level, ctx.path()));
+    }
+    if (match_index >= source->matches.size()) {
+      throw Exception(GENERAL_EXCEPTION, fmt::format("Could not find match {} in '{}' at {}", match_index, source->matches.at(0), ctx.path()));
+    }
+    output += source->matches.at(match_index);
+  }
+  // the trailing fragment has no reference after it
+  output += fragments.back();
+  return output;
+}
+
+std::optional<std::vector<std::string_view>> 
Spec::Regex::match(std::string_view str) const {  // matches str against the fragments, every gap between two fragments acting as a wildcard; returns str followed by the text each gap consumed, or nullopt on mismatch
+  std::vector<std::string_view> matches;
+  matches.push_back(str);
+  if (fragments.size() == 1) {  // no wildcard at all: only an exact match succeeds
+    if (str == fragments.front()) {
+      return matches;
+    } else {
+      return std::nullopt;
+    }
+  }
+
+  // first fragment is at the beginning of the string
+  if (str.substr(0, fragments.front().size()) != fragments.front()) {
+    return std::nullopt;
+  }
+  auto it = str.begin() + fragments.front().size();
+  for (size_t idx = 1; idx + 1 < fragments.size(); ++idx) {  // inner fragments only; first and last are anchored separately
+    auto& frag = fragments[idx];
+    auto next_it = std::search(it, str.end(), frag.begin(), frag.end());  // leftmost occurrence at or after `it`
+    if (next_it == str.end() && !frag.empty()) {
+      return std::nullopt;
+    }
+    matches.push_back({it, next_it});  // text consumed by the wildcard in front of this fragment
+    it = next_it + frag.size();
+  }
+  // last fragment is at the end of the string
+  if (gsl::narrow<size_t>(std::distance(it, str.end())) < 
fragments.back().size()) {
+    // not enough characters left
+    return std::nullopt;
+  }
+  auto next_it = std::next(str.rbegin(), fragments.back().size()).base();  // start of the suffix that has the last fragment's length
+  if (std::string_view(next_it, str.end()) != fragments.back()) {
+    return std::nullopt;
+  }
+  matches.push_back({it, next_it});  // text consumed by the final wildcard
+  return matches;
+}
+
+namespace {
+
+nonstd::expected<std::pair<Spec::Destination, Spec::It>, std::string> 
parseDestination(const Spec::Context& ctx, Spec::It begin, Spec::It end);  // forward declaration, needed by parsePath
+Spec::Destinations parseDestinations(const Spec::Context& ctx, const 
rapidjson::Value& val);  // forward declaration, needed by parseMember
+
+Spec::Pattern::Value parseValue(const Spec::Context& ctx, const 
rapidjson::Value& val);  // forward declaration, needed by parseMember
+
+std::pair<size_t, size_t> parseKeyAccess(std::string_view str) {
+  enum class State {
+    Begin,
+    BeginRef,
+    PrimaryIndex,
+    BeginFirstIndex,
+    FirstIndex,
+    BeginSecondIndex,
+    SecondIndex,
+    End
+  } state = State::Begin;
+  std::string target;
+  std::pair<size_t, size_t> result{0, 0};
+  for (size_t idx = 0; idx <= str.size(); ++idx) {
+    std::optional<char> ch;
+    if (idx < str.size()) {
+      ch = str[idx];
+    }
+    switch (state) {
+      case State::Begin: {
+        if (ch != '$') {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Expected '$' in key 
access in '{}' at {}", str, idx));
+        }
+        state = State::BeginRef;
+        break;
+      }
+      case State::BeginRef: {
+        if (ch == '(') {
+          state = State::BeginFirstIndex;
+        } else if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) 
{
+          target.clear();
+          target += ch.value();
+          state = State::PrimaryIndex;
+        } else if (ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Expected index in 
key access in '{}' at {}", str, idx));
+        }
+        break;
+      }
+      case State::PrimaryIndex: {
+        if (!ch) {
+          result.first = std::stoull(target);
+        } else if (std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target += ch.value();
+        } else {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Expected digit in 
key access in '{}' at {}", str, idx));
+        }
+        break;
+      }
+      case State::BeginFirstIndex: {
+        if (!ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Unterminated first 
index in key access in '{}'", str));
+        } else if (std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target.clear();
+          target += ch.value();
+          state = State::FirstIndex;
+        } else {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Expected digit in 
key access in '{}' at {}", str, idx));
+        }
+        break;
+      }
+      case State::FirstIndex: {
+        if (!ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Unterminated first 
index in key access in '{}'", str));
+        } else if (std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target += ch.value();
+        } else if (ch == ',') {
+          result.first = std::stoull(target);
+          state = State::BeginSecondIndex;
+        }
+        break;
+      }
+      case State::BeginSecondIndex: {
+        if (!ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Unterminated second 
index in key access in '{}'", str));
+        } else if (std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target.clear();
+          target += ch.value();
+          state = State::SecondIndex;
+        } else {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Expected digit in 
key access in '{}' at {}", str, idx));
+        }
+        break;
+      }
+      case State::SecondIndex: {
+        if (!ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Unterminated second 
index in key access in '{}'", str));
+        } else if (std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target += ch.value();
+        } else if (ch == ')') {
+          result.second = std::stoull(target);
+          state = State::End;
+        }
+        break;
+      }
+      case State::End: {
+        if (ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Expected end of 
string in '{}' at {}", str, idx));
+        }
+        break;
+      }
+    }
+  }
+  return result;
+}
+
+// Removes the escaping backslashes from a literal key and returns the plain text.
+// A backslash may only precede another backslash or a character accepted by isSpecialChar.
+// Throws Exception(GENERAL_EXCEPTION, ...) for an unknown or unterminated escape sequence.
+std::string parseLiteral(std::string_view str) {
+  std::string unescaped;
+  bool after_backslash = false;
+  for (const char current : str) {
+    if (!after_backslash) {
+      if (current == '\\') {
+        after_backslash = true;
+      } else {
+        unescaped += current;
+      }
+      continue;
+    }
+    if (current != '\\' && !isSpecialChar(current)) {
+      throw Exception(GENERAL_EXCEPTION, fmt::format("Unknown escape sequence in literal '\\{}'", current));
+    }
+    unescaped += current;
+    after_backslash = false;
+  }
+  if (after_backslash) {
+    // the input ended right after a backslash
+    throw Exception(GENERAL_EXCEPTION, fmt::format("Unterminated escape sequence in '{}'", str));
+  }
+
+  gsl_Expects(!after_backslash);
+  return unescaped;
+}
+
+// Parses a destination from [begin, end) via parseDestination and converts it into a Spec::Path.
+// Every member of the destination must hold a Spec::Template; otherwise an error message is
+// returned. On success returns the path together with the iterator parseDestination stopped at.
+nonstd::expected<std::pair<Spec::Path, Spec::It>, std::string> parsePath(const Spec::Context& ctx, Spec::It begin, Spec::It end) {
+  auto dst = parseDestination(ctx, begin, end);
+  if (!dst) {
+    return nonstd::make_unexpected(std::move(dst.error()));
+  }
+  Spec::Path result;
+  for (auto&& [member, type] : std::move(dst->first)) {
+    if (!holds_alternative<Spec::Template>(member)) {
+      return nonstd::make_unexpected(fmt::format("Value reference at {} cannot contain nested value reference path", ctx.path()));
+    }
+    result.emplace_back(std::move(std::get<Spec::Template>(member)), type);
+  }
+  // hand the freshly built path over instead of copying it into the returned pair
+  return std::pair<Spec::Path, Spec::It>{std::move(result), dst->second};
+}
+
+nonstd::expected<std::pair<Spec::ValueRef, Spec::It>, std::string> 
parseValueReference(const Spec::Context& ctx, Spec::It begin, Spec::It end, 
bool greedy_path) {  // parses a '@' value reference from [begin, end); returns the reference and the iterator just past it, or an error message
+  using ResultT = std::pair<Spec::ValueRef, Spec::It>;
+  auto it = begin;
+  if (it == end) {
+    return nonstd::make_unexpected("Cannot parse value reference from empty 
string");
+  }
+  if (*it != '@') {
+    return nonstd::make_unexpected("Value reference must start with '@'");
+  }
+  ++it;
+  if (it == end) {
+    return ResultT{{0, {}}, it};  // a lone "@": index 0 and an empty path
+  }
+  if (*it != '(') {
+    if (std::isdigit(static_cast<unsigned char>(*it))) {
+      // format is @123...
+      auto idx_begin = it;
+      while (it != end && std::isdigit(static_cast<unsigned char>(*it))) {
+        ++it;
+      }
+      return ResultT{{std::stoull(std::string{idx_begin, it}), {}}, it};
+    }
+    // format is @field.inner
+    if (greedy_path) {  // greedy: consume a whole path via parsePath; otherwise only a single template segment
+      if (auto path = parsePath(ctx, it, end)) {
+        return ResultT{{0, std::move(path->first)}, path->second};
+      } else {
+        return ResultT {{0, {}}, it};  // path did not parse: the error is dropped and only the '@' counts as consumed
+      }
+    } else {
+      if (auto templ = Spec::Template::parse(it, end)) {
+        return ResultT{{0, Spec::Path{{std::move(templ->first), 
Spec::MemberType::FIELD}}}, templ->second};
+      } else {
+        return ResultT {{0, {}}, it};  // same fallback as above
+      }
+    }
+  }
+  ++it;  // step over '('
+  size_t idx = 0;
+  if (it != end && std::isdigit(static_cast<unsigned char>(*it))) {  // format is @(123) or @(123,path)
+    auto idx_begin = it;
+    while (it != end && std::isdigit(static_cast<unsigned char>(*it))) {
+      ++it;
+    }
+    auto idx_end = it;
+    idx = std::stoull(std::string{idx_begin, idx_end});
+    if (it == end) {
+      return nonstd::make_unexpected("Expected ')' in value reference");
+    }
+    if (*it != ',') {
+      if (*it != ')') {
+        return nonstd::make_unexpected("Expected ')' in value reference");
+      }
+      ++it;
+      return ResultT{{idx, {}}, it};
+    }
+    // *it == ','
+    ++it;
+  }
+  if (it == end) {
+    return nonstd::make_unexpected("Expected member accessor in value 
reference");
+  }
+  auto path = parsePath(ctx, it, end);  // reached for both @(path) and @(123,path)
+  if (!path) {
+    return nonstd::make_unexpected(fmt::format("Invalid path in value 
reference: {}", path.error()));
+  }
+  it = path->second;
+  if (it == end || *it != ')') {
+    return nonstd::make_unexpected("Expected ')' in value reference");
+  }
+  ++it;
+  return ResultT{{idx, std::move(path->first)}, it};
+}
+
+// Returns true when every element of [begin, end) is a decimal digit according to std::isdigit.
+// An empty range yields true.
+template<typename T>
+bool isAllDigits(T begin, T end) {
+  for (auto cursor = begin; cursor != end; ++cursor) {
+    // the cast keeps negative char values from reaching std::isdigit
+    if (!std::isdigit(static_cast<unsigned char>(*cursor))) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Parses one sub-key `name` of a spec object together with its value `member` and stores the
+// outcome in `result`, depending on the shape of the sub-key:
+//  - starts with '@'           : value reference, appended to result->values
+//  - starts with '$'           : key access, inserted into result->keys
+//  - starts with '#'           : inserted into result->defaults under the text after '#'
+//  - contains an unescaped '&' : template, inserted into result->templates
+//  - contains an unescaped '*' : wildcard pattern, inserted into result->regexes
+//  - anything else             : literal, appended to result->literals and indexed in
+//                                result->literal_indices
+// Throws Exception(GENERAL_EXCEPTION, ...) when the sub-key cannot be parsed.
+void parseMember(const Spec::Context& ctx, const std::unique_ptr<Spec::Pattern>& result, std::string_view name, const rapidjson::Value& member) {
+  if (name.starts_with("@")) {
+    if (auto ref = parseValueReference(ctx, name.begin(), name.end(), true)) {
+      if (ref->second != name.end()) {
+        throw Exception(GENERAL_EXCEPTION, "Failed to fully parse value reference");
+      }
+      Spec::Context sub_ctx = ctx.extend(ctx.matches, ctx.node);
+      result->values.push_back({Spec::ValueRef{ref->first}, parseValue(sub_ctx, member)});
+    } else {
+      throw Exception(GENERAL_EXCEPTION, fmt::format("Failed to parse value reference at '{}/{}': {}", ctx.path(), name, ref.error()));
+    }
+  } else if (name.starts_with("$")) {
+    Spec::Context sub_ctx = ctx.extend({name}, nullptr);
+    result->keys.insert({parseKeyAccess(name), parseDestinations(sub_ctx, member)});
+  } else if (name.starts_with("#")) {
+    result->defaults.insert({std::string{name.substr(1)}, parseDestinations(ctx, member)});
+  } else {
+    const bool is_template = Spec::Template::check(name);
+    const bool is_regex = Spec::Regex::check(name);
+    if (is_template && is_regex) {
+      throw Exception(GENERAL_EXCEPTION, "Pattern cannot contain both & and *");
+    }
+    if (is_template) {
+      if (auto templ = Spec::Template::parse(name.begin(), name.end())) {
+        if (templ->second != name.end()) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Failed to parse template at {}, unexpected char at {}", ctx.path(), std::distance(name.begin(), templ->second)));
+        }
+        // dry eval so we can check if the references refer to valid substrings
+        (void)templ->first.eval(ctx);
+        Spec::Context sub_ctx = ctx.extend({name}, nullptr);
+        result->templates.insert({templ->first, parseValue(sub_ctx, member)});
+      } else {
+        throw Exception(GENERAL_EXCEPTION, fmt::format("Error while parsing key template at {}: {}", ctx.path(), templ.error()));
+      }
+    } else if (is_regex) {
+      if (auto reg = Spec::Regex::parse(name)) {
+        Spec::Context sub_ctx = ctx.extend({name}, nullptr);
+        sub_ctx.matches.resize(reg.value().size());
+        result->regexes.insert({reg.value(), parseValue(sub_ctx, member)});
+      } else {
+        throw Exception(GENERAL_EXCEPTION, fmt::format("Error while parsing key regex at {}: {}", ctx.path(), reg.error()));
+      }
+    } else {
+      Spec::Context sub_ctx = ctx.extend({name}, nullptr);
+      std::optional<size_t> numeric_value;
+      auto literal_name = parseLiteral(name);
+      result->literal_indices.insert({literal_name, result->literals.size()});
+      // isAllDigits is vacuously true for an empty key, and std::stoull("") throws
+      // std::invalid_argument, so an empty literal must not be converted
+      if (!literal_name.empty() && isAllDigits(literal_name.begin(), literal_name.end())) {
+        numeric_value = std::stoull(literal_name);
+      }
+      result->literals.push_back({literal_name, numeric_value, parseValue(sub_ctx, member)});
+    }
+  }
+}
+
+std::unique_ptr<Spec::Pattern> parseMap(const Spec::Context& ctx, const 
rapidjson::Value& val) {
+  if (!val.IsObject()) {
+    throw Exception(GENERAL_EXCEPTION, fmt::format("Expected a map at '{}'", 
ctx.path()));
+  }
+  auto map = std::make_unique<Spec::Pattern>();
+
+  enum class State {
+    Plain,
+    Escaped
+  } state = State::Plain;
+
+  for (auto& [name_val, member] : val.GetObject()) {
+    std::string_view name{name_val.GetString(), name_val.GetStringLength()};
+    std::string subkey;
+    for (size_t idx = 0; idx <= name.size(); ++idx) {
+      std::optional<char> ch;
+      if (idx < name.size()) {
+        ch = name[idx];
+      }
+      switch (state) {
+        case State::Plain: {
+          if (ch == '\\') {
+            state = State::Escaped;
+          } else if (!ch || ch == '|') {
+            parseMember(ctx, map, subkey, member);
+            subkey.clear();
+          } else {
+            subkey += ch.value();
+          }
+          break;
+        }
+        case State::Escaped: {
+          if (!ch) {
+            throw Exception(GENERAL_EXCEPTION, "Unterminated escape sequence");
+          }
+          if (ch == '|') {
+            subkey += "|";
+          } else {
+            subkey += "\\";
+            subkey += ch.value();
+          }

Review Comment:
   There is a subsequent processing step that checks for escapes again, so you 
can write `abc\\|def\|gh`, which is first split into `['abc\\', 'def|gh']` and 
then processed further into `['abc\', 'def|gh']`. Note that this does not work 
in the original implementation: there you cannot escape `|` characters, they 
are always treated as separators between subpatterns. Since no test verified 
that behavior, I added the escaping here.
   (I also added some comments on the intentionality of leaving `\\`.)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@nifi.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to