fgerlits commented on code in PR #1692:
URL: https://github.com/apache/nifi-minifi-cpp/pull/1692#discussion_r1403618933


##########
libminifi/test/Utils.h:
##########
@@ -48,23 +48,45 @@ using namespace std::literals::chrono_literals;
 
 namespace org::apache::nifi::minifi::test::utils {
 
+struct JsonContext {
+  const JsonContext* parent{nullptr};
+  std::string_view member;
+
+  std::string path() const {
+    if (!parent) {
+      return "/";
+    }
+    return minifi::utils::StringUtils::join_pack(parent->path(), member, "/");
+  }
+};
+
+#define REQUIRE_WARN(cond, msg) if (!(cond)) {WARN(msg); REQUIRE(cond);}
+
 // carries out a loose match on objects, i.e. it doesn't matter if the
 // actual object has extra fields than expected
-void matchJSON(const rapidjson::Value& actual, const rapidjson::Value& 
expected) {
+void matchJSON(const JsonContext& ctx, const rapidjson::Value& actual, const 
rapidjson::Value& expected, bool strict = false) {
   if (expected.IsObject()) {
-    REQUIRE(actual.IsObject());
+    REQUIRE_WARN(actual.IsObject(), fmt::format("Expected object at {}", 
ctx.path()));
     for (const auto& expected_member : expected.GetObject()) {
-      REQUIRE(actual.HasMember(expected_member.name));
-      matchJSON(actual[expected_member.name], expected_member.value);
+      std::string_view name{expected_member.name.GetString(), 
expected_member.name.GetStringLength()};
+      REQUIRE_WARN(actual.HasMember(expected_member.name), 
fmt::format("Expected member '{}' at {}", name, ctx.path()));
+      matchJSON(JsonContext{.parent = &ctx, .member = name}, 
actual[expected_member.name], expected_member.value, strict);
+    }
+    if (strict) {
+      for (const auto& actual_member : actual.GetObject()) {
+        std::string_view name{actual_member.name.GetString(), 
actual_member.name.GetStringLength()};
+        REQUIRE_WARN(expected.HasMember(actual_member.name), fmt::format("Did 
not expect member '{}' at {}", name, ctx.path()));
+        matchJSON(JsonContext{.parent = &ctx, .member = name}, 
actual_member.value, expected[actual_member.name], strict);

Review Comment:
   I don't think line 79 is needed, since we have already compared all common 
children in the first loop.



##########
libminifi/test/Utils.h:
##########
@@ -48,23 +48,45 @@ using namespace std::literals::chrono_literals;
 
 namespace org::apache::nifi::minifi::test::utils {
 
+struct JsonContext {
+  const JsonContext* parent{nullptr};
+  std::string_view member;
+
+  std::string path() const {
+    if (!parent) {
+      return "/";
+    }
+    return minifi::utils::StringUtils::join_pack(parent->path(), member, "/");
+  }
+};
+
+#define REQUIRE_WARN(cond, msg) if (!(cond)) {WARN(msg); REQUIRE(cond);}
+
 // carries out a loose match on objects, i.e. it doesn't matter if the
 // actual object has extra fields than expected
-void matchJSON(const rapidjson::Value& actual, const rapidjson::Value& 
expected) {
+void matchJSON(const JsonContext& ctx, const rapidjson::Value& actual, const 
rapidjson::Value& expected, bool strict = false) {
   if (expected.IsObject()) {
-    REQUIRE(actual.IsObject());
+    REQUIRE_WARN(actual.IsObject(), fmt::format("Expected object at {}", 
ctx.path()));
     for (const auto& expected_member : expected.GetObject()) {
-      REQUIRE(actual.HasMember(expected_member.name));
-      matchJSON(actual[expected_member.name], expected_member.value);
+      std::string_view name{expected_member.name.GetString(), 
expected_member.name.GetStringLength()};
+      REQUIRE_WARN(actual.HasMember(expected_member.name), 
fmt::format("Expected member '{}' at {}", name, ctx.path()));
+      matchJSON(JsonContext{.parent = &ctx, .member = name}, 
actual[expected_member.name], expected_member.value, strict);
+    }
+    if (strict) {
+      for (const auto& actual_member : actual.GetObject()) {
+        std::string_view name{actual_member.name.GetString(), 
actual_member.name.GetStringLength()};
+        REQUIRE_WARN(expected.HasMember(actual_member.name), fmt::format("Did 
not expect member '{}' at {}", name, ctx.path()));
+        matchJSON(JsonContext{.parent = &ctx, .member = name}, 
actual_member.value, expected[actual_member.name], strict);
+      }
     }
   } else if (expected.IsArray()) {
-    REQUIRE(actual.IsArray());
-    REQUIRE(actual.Size() == expected.Size());
+    REQUIRE_WARN(actual.IsArray(), fmt::format("Expected array at {}", 
ctx.path()));
+    REQUIRE_WARN(actual.Size() == expected.Size(), fmt::format("Expected array 
of length {}, got {} at", expected.Size(), actual.Size(), ctx.path()));

Review Comment:
   The format string is missing a `{}` at the end, so the `ctx.path()` 
argument is never included in the message.



##########
libminifi/test/Utils.h:
##########
@@ -48,23 +48,45 @@ using namespace std::literals::chrono_literals;
 
 namespace org::apache::nifi::minifi::test::utils {
 
+struct JsonContext {
+  const JsonContext* parent{nullptr};
+  std::string_view member;

Review Comment:
   Having a `string_view` member is unsafe, although the uses below look OK.  I 
would either change the type of the member to `string`, or move the class 
inside the `matchJSON` function to hide it.



##########
extensions/standard-processors/processors/JoltTransformJSON.cpp:
##########
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "JoltTransformJSON.h"
+#include "core/Resource.h"
+#include "utils/ProcessorConfigUtils.h"
+#include "rapidjson/document.h"
+#include "rapidjson/error/en.h"
+#include "utils/StringUtils.h"
+
+namespace org::apache::nifi::minifi::processors {
+
+void JoltTransformJSON::initialize() {
+  setSupportedProperties(Properties);
+  setSupportedRelationships(Relationships);
+}
+
+void JoltTransformJSON::onSchedule(core::ProcessContext* context, 
core::ProcessSessionFactory* /*session_factory*/) {
+  gsl_Expects(context);
+  transform_ = 
utils::parseEnumProperty<jolt_transform_json::JoltTransform>(*context, 
JoltTransform);
+  const std::string spec_str = utils::getRequiredPropertyOrThrow(*context, 
JoltSpecification.name);
+  if (auto spec = utils::jolt::Spec::parse(spec_str, logger_)) {
+    spec_ = std::move(spec.value());
+  } else {
+    throw Exception(PROCESS_SCHEDULE_EXCEPTION, fmt::format("The value of '{}' 
is not a valid jolt specification: {}", JoltSpecification.name, spec.error()));
+  }
+}
+
+void JoltTransformJSON::onTrigger(core::ProcessContext* context, 
core::ProcessSession* session) {
+  gsl_Expects(context && session && spec_);
+  auto flowfile = session->get();
+  if (!flowfile) {
+    context->yield();
+    return;
+  }
+
+  auto content = session->readBuffer(flowfile);
+  rapidjson::Document input;
+  rapidjson::ParseResult parse_result = input.Parse(reinterpret_cast<const 
char*>(content.buffer.data()), content.buffer.size());
+  if (!parse_result) {
+    session->transfer(flowfile, Failure);
+    return;
+  }
+
+  if (auto result = spec_->process(input, logger_)) {
+    rapidjson::StringBuffer buffer;
+    rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
+    result.value().Accept(writer);
+    session->writeBuffer(flowfile, std::span<const char>(buffer.GetString(), 
buffer.GetSize()));
+    session->transfer(flowfile, Success);
+  } else {
+    logger_->log_debug("Failed to apply transformation: %s", result.error());

Review Comment:
   I would log this at `info` level at least, too.



##########
extensions/standard-processors/processors/JoltTransformJSON.h:
##########


Review Comment:
   Please add the new processor to `PROCESSORS.md`, as well.  It's easy to do 
using `minifi --docs [docs-dir]`.



##########
extensions/standard-processors/processors/JoltTransformJSON.cpp:
##########
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "JoltTransformJSON.h"
+#include "core/Resource.h"
+#include "utils/ProcessorConfigUtils.h"
+#include "rapidjson/document.h"
+#include "rapidjson/error/en.h"
+#include "utils/StringUtils.h"
+
+namespace org::apache::nifi::minifi::processors {
+
+void JoltTransformJSON::initialize() {
+  setSupportedProperties(Properties);
+  setSupportedRelationships(Relationships);
+}
+
+void JoltTransformJSON::onSchedule(core::ProcessContext* context, 
core::ProcessSessionFactory* /*session_factory*/) {
+  gsl_Expects(context);
+  transform_ = 
utils::parseEnumProperty<jolt_transform_json::JoltTransform>(*context, 
JoltTransform);
+  const std::string spec_str = utils::getRequiredPropertyOrThrow(*context, 
JoltSpecification.name);
+  if (auto spec = utils::jolt::Spec::parse(spec_str, logger_)) {
+    spec_ = std::move(spec.value());
+  } else {
+    throw Exception(PROCESS_SCHEDULE_EXCEPTION, fmt::format("The value of '{}' 
is not a valid jolt specification: {}", JoltSpecification.name, spec.error()));
+  }
+}
+
+void JoltTransformJSON::onTrigger(core::ProcessContext* context, 
core::ProcessSession* session) {
+  gsl_Expects(context && session && spec_);
+  auto flowfile = session->get();
+  if (!flowfile) {
+    context->yield();
+    return;
+  }
+
+  auto content = session->readBuffer(flowfile);
+  rapidjson::Document input;
+  rapidjson::ParseResult parse_result = input.Parse(reinterpret_cast<const 
char*>(content.buffer.data()), content.buffer.size());
+  if (!parse_result) {
+    session->transfer(flowfile, Failure);
+    return;
+  }

Review Comment:
   Some logging would be useful here, at `info` level at least, though `warn` 
or `error` would also be appropriate.



##########
extensions/standard-processors/utils/JoltUtils.cpp:
##########
@@ -0,0 +1,1134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "JoltUtils.h"
+#include "rapidjson/error/en.h"
+#include "Exception.h"
+
+namespace org::apache::nifi::minifi::utils::jolt {
+
+
+static bool isSpecialChar(char ch) {
+  static constexpr std::array SPECIAL_CHARS{'.', '[', ']', '$', '&', '@', '#', 
'*'};
+  return std::find(SPECIAL_CHARS.begin(), SPECIAL_CHARS.end(), ch) != 
SPECIAL_CHARS.end();
+}

Review Comment:
   I don't insist if you like it better this way, but since `isSpecialChar()` 
is always used as
   ```c++
   if (ch != '\\' && !isSpecialChar(ch.value())) { ... }
   ```
   we could include the `\` in the list of special chars so we can simplify 
this to
   ```c++
   if (!isSpecialChar(ch.value())) { ... }
   ```



##########
extensions/standard-processors/utils/JoltUtils.cpp:
##########
@@ -0,0 +1,1134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "JoltUtils.h"
+#include "rapidjson/error/en.h"
+#include "Exception.h"
+
+namespace org::apache::nifi::minifi::utils::jolt {
+
+
+static bool isSpecialChar(char ch) {
+  static constexpr std::array SPECIAL_CHARS{'.', '[', ']', '$', '&', '@', '#', 
'*'};
+  return std::find(SPECIAL_CHARS.begin(), SPECIAL_CHARS.end(), ch) != 
SPECIAL_CHARS.end();
+}
+
+bool Spec::Template::check(std::string_view str) {
+  enum class State {
+    Plain,
+    Escaped
+  } state = State::Plain;
+  for (char ch : str) {
+    switch (state) {
+      case State::Plain: {
+        if (ch == '&') {
+          return true;
+        } else if (ch == '\\') {
+          state = State::Escaped;
+        }
+        break;
+      }
+      case State::Escaped: {
+        state = State::Plain;
+        break;
+      }
+    }
+  }
+  return false;
+}
+
+nonstd::expected<std::pair<Spec::Template, Spec::It>, std::string> 
Spec::Template::parse(It begin, It end) {
+  enum class State {
+    Plain,
+    Escaped,
+    Template,  // &
+    SimpleIndex,  // &1
+    CanonicalTemplate,  // &(
+    ParentIndex,  // &(1
+    NextIndex,  // &(1,
+    MatchIndex  // &(1,0
+  };
+
+  std::vector<std::string> fragments;
+  std::vector<std::pair<size_t, size_t>> references;
+  fragments.push_back({});
+  State state = State::Plain;
+  std::string target;
+  // go beyond the last char on purpose
+  auto ch_it = begin;
+  while (ch_it <= end) {
+    std::optional<char> ch;
+    if (ch_it < end) {
+      ch = *ch_it;
+    }
+    bool force_terminate = false;
+    switch (state) {
+      case State::Plain: {
+        if (ch == '\\') {
+          state = State::Escaped;
+        } else if (ch == '&') {
+          references.push_back({});
+          fragments.push_back({});
+          state = State::Template;
+        } else if (ch == ')' || ch == ']' || ch == '.' || ch == '[') {
+          force_terminate = true;
+        } else if (ch) {
+          fragments.back() += ch.value();
+        }
+        break;
+      }
+      case State::Escaped: {
+        if (!ch) {
+          return nonstd::make_unexpected("Unterminated escape sequence");
+        }
+        if (ch != '\\' && !isSpecialChar(ch.value())) {
+          return nonstd::make_unexpected(fmt::format("Unknown escape sequence 
in template '\\{}'", ch.value()));
+        }
+        fragments.back() += ch.value();
+        state = State::Plain;
+        break;
+      }
+      case State::Template: {
+        if (ch == '(') {
+          state = State::CanonicalTemplate;
+        } else if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) 
{
+          target.clear();
+          target += ch.value();
+          state = State::SimpleIndex;
+        } else {
+          state = State::Plain;
+          // reprocess this char in a different state
+          --ch_it;

Review Comment:
   I would put a `gsl_Expects(ch_it != begin)` before this line (and also 
before line 126), just to guard against bad things happening a few years from 
now, after some edits are made to this code.



##########
extensions/standard-processors/utils/JoltUtils.cpp:
##########
@@ -0,0 +1,1134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "JoltUtils.h"
+#include "rapidjson/error/en.h"
+#include "Exception.h"
+
+namespace org::apache::nifi::minifi::utils::jolt {
+
+
+static bool isSpecialChar(char ch) {
+  static constexpr std::array SPECIAL_CHARS{'.', '[', ']', '$', '&', '@', '#', 
'*'};
+  return std::find(SPECIAL_CHARS.begin(), SPECIAL_CHARS.end(), ch) != 
SPECIAL_CHARS.end();
+}
+
+bool Spec::Template::check(std::string_view str) {
+  enum class State {
+    Plain,
+    Escaped
+  } state = State::Plain;
+  for (char ch : str) {
+    switch (state) {
+      case State::Plain: {
+        if (ch == '&') {
+          return true;
+        } else if (ch == '\\') {
+          state = State::Escaped;
+        }
+        break;
+      }
+      case State::Escaped: {
+        state = State::Plain;
+        break;
+      }
+    }
+  }
+  return false;
+}
+
+nonstd::expected<std::pair<Spec::Template, Spec::It>, std::string> 
Spec::Template::parse(It begin, It end) {
+  enum class State {
+    Plain,
+    Escaped,
+    Template,  // &
+    SimpleIndex,  // &1
+    CanonicalTemplate,  // &(
+    ParentIndex,  // &(1
+    NextIndex,  // &(1,
+    MatchIndex  // &(1,0
+  };
+
+  std::vector<std::string> fragments;
+  std::vector<std::pair<size_t, size_t>> references;
+  fragments.push_back({});
+  State state = State::Plain;
+  std::string target;
+  // go beyond the last char on purpose
+  auto ch_it = begin;
+  while (ch_it <= end) {
+    std::optional<char> ch;
+    if (ch_it < end) {
+      ch = *ch_it;
+    }
+    bool force_terminate = false;
+    switch (state) {
+      case State::Plain: {
+        if (ch == '\\') {
+          state = State::Escaped;
+        } else if (ch == '&') {
+          references.push_back({});
+          fragments.push_back({});
+          state = State::Template;
+        } else if (ch == ')' || ch == ']' || ch == '.' || ch == '[') {
+          force_terminate = true;
+        } else if (ch) {
+          fragments.back() += ch.value();
+        }
+        break;
+      }
+      case State::Escaped: {
+        if (!ch) {
+          return nonstd::make_unexpected("Unterminated escape sequence");
+        }
+        if (ch != '\\' && !isSpecialChar(ch.value())) {
+          return nonstd::make_unexpected(fmt::format("Unknown escape sequence 
in template '\\{}'", ch.value()));
+        }
+        fragments.back() += ch.value();
+        state = State::Plain;
+        break;
+      }
+      case State::Template: {
+        if (ch == '(') {
+          state = State::CanonicalTemplate;
+        } else if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) 
{
+          target.clear();
+          target += ch.value();
+          state = State::SimpleIndex;
+        } else {
+          state = State::Plain;
+          // reprocess this char in a different state
+          --ch_it;
+        }
+        break;
+      }
+      case State::SimpleIndex: {
+        if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target += ch.value();
+        } else {
+          references.back().first = std::stoi(target);
+          state = State::Plain;
+          // reprocess this char in a different state
+          --ch_it;
+        }
+        break;
+      }
+      case State::CanonicalTemplate: {
+        if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target.clear();
+          target += ch.value();
+          state = State::ParentIndex;
+        } else {
+          return nonstd::make_unexpected(fmt::format("Expected an index at 
{}", std::distance(begin, ch_it)));
+        }
+        break;
+      }
+      case State::ParentIndex: {
+        if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target += ch.value();
+        } else if (ch == ',') {
+          references.back().first = std::stoi(target);
+          state = State::NextIndex;
+        } else if (ch == ')') {
+          references.back().first = std::stoi(target);
+          state = State::Plain;
+        } else {
+          return nonstd::make_unexpected(fmt::format("Invalid character at {}, 
expected digit, comma or close parenthesis", std::distance(begin, ch_it)));
+        }
+        break;
+      }
+      case State::NextIndex: {
+        if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target.clear();
+          target += ch.value();
+          state = State::MatchIndex;
+        } else {
+          return nonstd::make_unexpected(fmt::format("Expected an index at 
{}", std::distance(begin, ch_it)));
+        }
+        break;
+      }
+      case State::MatchIndex: {
+        if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target += ch.value();
+        } else if (ch == ')') {
+          references.back().second = std::stoi(target);
+          state = State::Plain;
+        } else {
+          return nonstd::make_unexpected(fmt::format("Invalid character at {}, 
expected digit or close parenthesis", std::distance(begin, ch_it)));
+        }
+        break;
+      }
+    }
+    if (force_terminate) {
+      break;
+    }
+    if (ch_it != end) {
+      ++ch_it;
+    } else {
+      break;
+    }
+  }
+
+  gsl_Assert(state == State::Plain);
+  return std::pair<Template, It>{Template{std::move(fragments), 
std::move(references)}, ch_it};
+}
+
+bool Spec::Regex::check(std::string_view str) {
+  enum class State {
+    Plain,
+    Escaped
+  } state = State::Plain;
+  for (char ch : str) {
+    switch (state) {
+      case State::Plain: {
+        if (ch == '*') {
+          return true;
+        } else if (ch == '\\') {
+          state = State::Escaped;
+        }
+        break;
+      }
+      case State::Escaped: {
+        state = State::Plain;
+        break;
+      }
+    }
+  }
+  return false;
+}
+
+nonstd::expected<Spec::Regex, std::string> Spec::Regex::parse(std::string_view 
str) {
+  enum class State {
+    Plain,
+    Escaped
+  };
+  std::vector<std::string> fragments;
+  fragments.push_back({});
+  State state = State::Plain;
+  for (size_t idx = 0; idx <= str.size(); ++idx) {
+    std::optional<char> ch;
+    if (idx < str.size()) {
+      ch = str[idx];
+    }
+    switch (state) {
+      case State::Plain: {
+        if (ch == '\\') {
+          state = State::Escaped;
+        } else if (ch == '*') {
+          fragments.push_back({});
+        } else if (ch) {
+          fragments.back() += ch.value();
+        }
+        break;
+      }
+      case State::Escaped: {
+        if (!ch) {
+          return nonstd::make_unexpected("Unterminated escape sequence");
+        }
+        if (ch != '\\' && !isSpecialChar(ch.value())) {
+          return nonstd::make_unexpected(fmt::format("Unknown escape sequence 
in pattern '\\{}'", ch.value()));
+        }
+        fragments.back() += ch.value();
+        state = State::Plain;
+        break;
+      }
+    }
+  }
+  gsl_Assert(state == State::Plain);
+  return Regex{std::move(fragments)};
+}
+
+std::string Spec::Template::eval(const Context& ctx) const {
+  std::string res;
+  for (size_t idx = 0; idx + 1 < fragments.size(); ++idx) {
+    res += fragments.at(idx);
+    auto& ref = references.at(idx);
+    auto* target = ctx.find(ref.first);
+    if (!target) {
+      throw Exception(GENERAL_EXCEPTION, fmt::format("Invalid reference to {} 
at {}", ref.first, ctx.path()));
+    }
+    if (target->matches.size() <= ref.second) {
+      throw Exception(GENERAL_EXCEPTION, fmt::format("Could not find match {} 
in '{}' at {}", ref.second, target->matches.at(0), ctx.path()));
+    }
+    res += target->matches.at(ref.second);
+  }
+  res += fragments.back();
+  return res;
+}
+
+std::optional<std::vector<std::string_view>> 
Spec::Regex::match(std::string_view str) const {
+  std::vector<std::string_view> matches;
+  matches.push_back(str);
+  if (fragments.size() == 1) {
+    if (str == fragments.front()) {
+      return matches;
+    } else {
+      return std::nullopt;
+    }
+  }
+
+  // first fragment is at the beginning of the string
+  if (str.substr(0, fragments.front().size()) != fragments.front()) {
+    return std::nullopt;
+  }
+  auto it = str.begin() + fragments.front().size();
+  for (size_t idx = 1; idx + 1 < fragments.size(); ++idx) {
+    auto& frag = fragments[idx];
+    auto next_it = std::search(it, str.end(), frag.begin(), frag.end());
+    if (next_it == str.end() && !frag.empty()) {
+      return std::nullopt;
+    }
+    matches.push_back({it, next_it});
+    it = next_it + frag.size();
+  }
+  // last fragment is at the end of the string
+  if (gsl::narrow<size_t>(std::distance(it, str.end())) < 
fragments.back().size()) {
+    // not enough characters left
+    return std::nullopt;
+  }
+  auto next_it = std::next(str.rbegin(), fragments.back().size()).base();
+  if (std::string_view(next_it, str.end()) != fragments.back()) {
+    return std::nullopt;
+  }
+  matches.push_back({it, next_it});
+  return matches;
+}
+
+namespace {
+
+nonstd::expected<std::pair<Spec::Destination, Spec::It>, std::string> 
parseDestination(const Spec::Context& ctx, Spec::It begin, Spec::It end);
+Spec::Destinations parseDestinations(const Spec::Context& ctx, const 
rapidjson::Value& val);
+
+Spec::Pattern::Value parseValue(const Spec::Context& ctx, const 
rapidjson::Value& val);
+
+std::pair<size_t, size_t> parseKeyAccess(std::string_view str) {
+  enum class State {
+    Begin,
+    BeginRef,
+    PrimaryIndex,
+    BeginFirstIndex,
+    FirstIndex,
+    BeginSecondIndex,
+    SecondIndex,
+    End
+  } state = State::Begin;
+  std::string target;
+  std::pair<size_t, size_t> result{0, 0};
+  for (size_t idx = 0; idx <= str.size(); ++idx) {
+    std::optional<char> ch;
+    if (idx < str.size()) {
+      ch = str[idx];
+    }
+    switch (state) {
+      case State::Begin: {
+        if (ch != '$') {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Expected '$' in key 
access in '{}' at {}", str, idx));
+        }
+        state = State::BeginRef;
+        break;
+      }
+      case State::BeginRef: {
+        if (ch == '(') {
+          state = State::BeginFirstIndex;
+        } else if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) 
{
+          target.clear();
+          target += ch.value();
+          state = State::PrimaryIndex;
+        } else if (ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Expected index in 
key access in '{}' at {}", str, idx));
+        }
+        break;
+      }
+      case State::PrimaryIndex: {
+        if (!ch) {
+          result.first = std::stoull(target);
+        } else if (std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target += ch.value();
+        } else {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Expected digit in 
key access in '{}' at {}", str, idx));
+        }
+        break;
+      }
+      case State::BeginFirstIndex: {
+        if (!ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Unterminated first 
index in key access in '{}'", str));
+        } else if (std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target.clear();
+          target += ch.value();
+          state = State::FirstIndex;
+        } else {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Expected digit in 
key access in '{}' at {}", str, idx));
+        }
+        break;
+      }
+      case State::FirstIndex: {
+        if (!ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Unterminated first 
index in key access in '{}'", str));
+        } else if (std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target += ch.value();
+        } else if (ch == ',') {
+          result.first = std::stoull(target);
+          state = State::BeginSecondIndex;
+        }
+        break;
+      }
+      case State::BeginSecondIndex: {
+        if (!ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Unterminated second 
index in key access in '{}'", str));
+        } else if (std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target.clear();
+          target += ch.value();
+          state = State::SecondIndex;
+        } else {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Expected digit in 
key access in '{}' at {}", str, idx));
+        }
+        break;
+      }
+      case State::SecondIndex: {
+        if (!ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Unterminated second 
index in key access in '{}'", str));
+        } else if (std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target += ch.value();
+        } else if (ch == ')') {
+          result.second = std::stoull(target);
+          state = State::End;
+        }
+        break;
+      }
+      case State::End: {
+        if (ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Expected end of 
string in '{}' at {}", str, idx));
+        }
+        break;
+      }
+    }
+  }
+  return result;
+}
+
+std::string parseLiteral(std::string_view str) {
+  enum class State {
+    Plain,
+    Escaped
+  } state = State::Plain;
+  std::string result;
+  for (size_t idx = 0; idx <= str.size(); ++idx) {
+    std::optional<char> ch;
+    if (idx < str.size()) {
+      ch = str[idx];
+    }
+    switch (state) {
+      case State::Plain: {
+        if (ch == '\\') {
+          state = State::Escaped;
+        } else if (ch) {
+          result += ch.value();
+        }
+        break;
+      }
+      case State::Escaped: {
+        if (!ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Unterminated escape 
sequence in '{}'", str));
+        }
+        if (ch != '\\' && !isSpecialChar(ch.value())) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Unknown escape 
sequence in literal '\\{}'", ch.value()));
+        }
+        result += ch.value();
+        state = State::Plain;
+        break;
+      }
+    }
+  }
+
+  gsl_Expects(state == State::Plain);
+  return result;
+}
+
+nonstd::expected<std::pair<Spec::Path, Spec::It>, std::string> parsePath(const 
Spec::Context& ctx, Spec::It begin, Spec::It end) {
+  auto dst = parseDestination(ctx, begin, end);
+  if (!dst) {
+    return nonstd::make_unexpected(std::move(dst.error()));
+  }
+  Spec::Path result;
+  for (auto&& [member, type] : std::move(dst->first)) {
+    if (!holds_alternative<Spec::Template>(member)) {
+      return nonstd::make_unexpected(fmt::format("Value reference at {} cannot 
contain nested value reference path", ctx.path()));
+    }
+    result.emplace_back(std::move(std::get<Spec::Template>(member)), type);
+  }
+  return std::pair<Spec::Path, Spec::It>{result, dst->second};
+}
+/**
+ * Parses a value reference starting at `begin`; the first char must be an
+ * at sign.
+ *
+ * Forms handled below:
+ *  - a lone at sign: index 0 with an empty path
+ *  - at sign followed by digits: that index with an empty path
+ *  - at sign followed by other text: index 0; the text is parsed with
+ *    parsePath when `greedy_path` is set, otherwise with Spec::Template::parse
+ *    into a single FIELD member. If that parse fails, index 0 with an empty
+ *    path is returned and the iterator stays right after the at sign.
+ *  - at sign followed by an open parenthesis: an optional index, then either
+ *    the closing parenthesis, or (after a comma when an index was given)
+ *    a path followed by the closing parenthesis
+ *
+ * @return the reference and the iterator just past the consumed text, or an
+ *         error message
+ */
+nonstd::expected<std::pair<Spec::ValueRef, Spec::It>, std::string> 
parseValueReference(const Spec::Context& ctx, Spec::It begin, Spec::It end, 
bool greedy_path) {
+  using ResultT = std::pair<Spec::ValueRef, Spec::It>;
+  auto it = begin;
+  if (it == end) {
+    return nonstd::make_unexpected("Cannot parse value reference from empty 
string");
+  }
+  if (*it != '@') {
+    return nonstd::make_unexpected("Value reference must start with '@'");
+  }
+  ++it;
+  if (it == end) {
+    return ResultT{{0, {}}, it};
+  }
+  if (*it != '(') {
+    if (std::isdigit(static_cast<unsigned char>(*it))) {
+      // format is @123...
+      auto idx_begin = it;
+      while (it != end && std::isdigit(static_cast<unsigned char>(*it))) {
+        ++it;
+      }
+      return ResultT{{std::stoull(std::string{idx_begin, it}), {}}, it};
+    }
+    // format is @field.inner
+    if (greedy_path) {
+      if (auto path = parsePath(ctx, it, end)) {
+        return ResultT{{0, std::move(path->first)}, path->second};
+      } else {
+        return ResultT {{0, {}}, it};
+      }
+    } else {
+      if (auto templ = Spec::Template::parse(it, end)) {
+        return ResultT{{0, Spec::Path{{std::move(templ->first), 
Spec::MemberType::FIELD}}}, templ->second};
+      } else {
+        return ResultT {{0, {}}, it};
+      }
+    }
+  }
+  ++it;  // step over the open parenthesis
+  size_t idx = 0;
+  if (it != end && std::isdigit(static_cast<unsigned char>(*it))) {
+    auto idx_begin = it;
+    while (it != end && std::isdigit(static_cast<unsigned char>(*it))) {
+      ++it;
+    }
+    auto idx_end = it;
+    idx = std::stoull(std::string{idx_begin, idx_end});
+    if (it == end) {
+      return nonstd::make_unexpected("Expected ')' in value reference");
+    }
+    if (*it != ',') {
+      if (*it != ')') {
+        return nonstd::make_unexpected("Expected ')' in value reference");
+      }
+      ++it;
+      return ResultT{{idx, {}}, it};
+    }
+    // *it == ','
+    ++it;
+  }
+  if (it == end) {
+    return nonstd::make_unexpected("Expected member accessor in value 
reference");
+  }
+  auto path = parsePath(ctx, it, end);
+  if (!path) {
+    return nonstd::make_unexpected(fmt::format("Invalid path in value 
reference: {}", path.error()));
+  }
+  it = path->second;
+  if (it == end || *it != ')') {
+    return nonstd::make_unexpected("Expected ')' in value reference");
+  }
+  ++it;
+  return ResultT{{idx, std::move(path->first)}, it};
+}
+/**
+ * Returns true when every character in [begin, end) satisfies std::isdigit.
+ * An empty range also yields true, because std::all_of returns true for it.
+ */
+template<typename T>
+bool isAllDigits(T begin, T end) {
+  return std::all_of(begin, end, [] (auto ch) {return 
std::isdigit(static_cast<unsigned char>(ch));});
+}
+
+/**
+ * Parses one key of a spec map and stores the parsed entry in `result`.
+ *
+ * The beginning of `name` selects the kind of entry:
+ *  - `@`: value reference, which must span the whole key; appended to `result->values`
+ *  - `$`: key access, inserted into `result->keys`
+ *  - `#`: default, the text after the hash is inserted into `result->defaults`
+ *  - otherwise a template (unescaped `&`), a regex (unescaped `*`) or a literal
+ *
+ * @param ctx    context of the map that contains the key
+ * @param result pattern that receives the parsed entry
+ * @param name   the key, already split at unescaped `|` by the caller
+ * @param member the JSON value belonging to the key
+ * @throws Exception if the key cannot be parsed or mixes `&` and `*`
+ */
+void parseMember(const Spec::Context& ctx, const std::unique_ptr<Spec::Pattern>& result, std::string_view name, const rapidjson::Value& member) {
+  if (name.starts_with("@")) {
+    if (auto ref = parseValueReference(ctx, name.begin(), name.end(), true)) {
+      if (ref->second != name.end()) {
+        throw Exception(GENERAL_EXCEPTION, "Failed to fully parse value reference");
+      }
+      Spec::Context sub_ctx = ctx.extend(ctx.matches, ctx.node);
+      result->values.push_back({Spec::ValueRef{ref->first}, parseValue(sub_ctx, member)});
+    } else {
+      throw Exception(GENERAL_EXCEPTION, fmt::format("Failed to parse value reference at '{}/{}': {}", ctx.path(), name, ref.error()));
+    }
+  } else if (name.starts_with("$")) {
+    Spec::Context sub_ctx = ctx.extend({name}, nullptr);
+    result->keys.insert({parseKeyAccess(name), parseDestinations(sub_ctx, member)});
+  } else if (name.starts_with("#")) {
+    result->defaults.insert({std::string{name.substr(1)}, parseDestinations(ctx, member)});
+  } else {
+    const bool is_template = Spec::Template::check(name);
+    const bool is_regex = Spec::Regex::check(name);
+    if (is_template && is_regex) {
+      throw Exception(GENERAL_EXCEPTION, "Pattern cannot contain both & and *");
+    }
+    if (is_template) {
+      if (auto templ = Spec::Template::parse(name.begin(), name.end())) {
+        if (templ->second != name.end()) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Failed to parse template at {}, unexpected char at {}", ctx.path(), std::distance(name.begin(), templ->second)));
+        }
+        // dry eval so we can check if the references refer to valid substrings
+        (void)templ->first.eval(ctx);
+        Spec::Context sub_ctx = ctx.extend({name}, nullptr);
+        result->templates.insert({templ->first, parseValue(sub_ctx, member)});
+      } else {
+        throw Exception(GENERAL_EXCEPTION, fmt::format("Error while parsing key template at {}: {}", ctx.path(), templ.error()));
+      }
+    } else if (is_regex) {
+      if (auto reg = Spec::Regex::parse(name)) {
+        Spec::Context sub_ctx = ctx.extend({name}, nullptr);
+        sub_ctx.matches.resize(reg.value().size());
+        result->regexes.insert({reg.value(), parseValue(sub_ctx, member)});
+      } else {
+        throw Exception(GENERAL_EXCEPTION, fmt::format("Error while parsing key regex at {}: {}", ctx.path(), reg.error()));
+      }
+    } else {
+      Spec::Context sub_ctx = ctx.extend({name}, nullptr);
+      std::optional<size_t> numeric_value;
+      auto literal_name = parseLiteral(name);
+      result->literal_indices.insert({literal_name, result->literals.size()});
+      // isAllDigits is vacuously true for an empty key and std::stoull("") throws
+      // std::invalid_argument, so an empty key must not be treated as numeric
+      if (!literal_name.empty() && isAllDigits(literal_name.begin(), literal_name.end())) {
+        numeric_value = std::stoull(literal_name);
+      }
+      result->literals.push_back({literal_name, numeric_value, parseValue(sub_ctx, member)});
+    }
+  }
+}
+
+std::unique_ptr<Spec::Pattern> parseMap(const Spec::Context& ctx, const 
rapidjson::Value& val) {
+  if (!val.IsObject()) {
+    throw Exception(GENERAL_EXCEPTION, fmt::format("Expected a map at '{}'", 
ctx.path()));
+  }
+  auto map = std::make_unique<Spec::Pattern>();
+
+  enum class State {
+    Plain,
+    Escaped
+  } state = State::Plain;
+
+  for (auto& [name_val, member] : val.GetObject()) {
+    std::string_view name{name_val.GetString(), name_val.GetStringLength()};
+    std::string subkey;
+    for (size_t idx = 0; idx <= name.size(); ++idx) {
+      std::optional<char> ch;
+      if (idx < name.size()) {
+        ch = name[idx];
+      }
+      switch (state) {
+        case State::Plain: {
+          if (ch == '\\') {
+            state = State::Escaped;
+          } else if (!ch || ch == '|') {
+            parseMember(ctx, map, subkey, member);
+            subkey.clear();
+          } else {
+            subkey += ch.value();
+          }
+          break;
+        }
+        case State::Escaped: {
+          if (!ch) {
+            throw Exception(GENERAL_EXCEPTION, "Unterminated escape sequence");
+          }
+          if (ch == '|') {
+            subkey += "|";
+          } else {
+            subkey += "\\";
+            subkey += ch.value();
+          }

Review Comment:
   This leaves `\\` as `\\`; is that OK?  How would I put a single literal `\` 
right before a `|` operator?



##########
extensions/standard-processors/utils/JoltUtils.cpp:
##########
@@ -0,0 +1,1134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "JoltUtils.h"
+#include "rapidjson/error/en.h"
+#include "Exception.h"
+
+namespace org::apache::nifi::minifi::utils::jolt {
+
+/**
+ * Tells whether `ch` is one of the characters listed in SPECIAL_CHARS
+ * (dot, square brackets, dollar, ampersand, at sign, hash, asterisk).
+ * Spec::Template::parse, Spec::Regex::parse and parseLiteral accept a
+ * backslash escape only in front of another backslash or one of these.
+ */
+static bool isSpecialChar(char ch) {
+  static constexpr std::array SPECIAL_CHARS{'.', '[', ']', '$', '&', '@', '#', 
'*'};
+  return std::find(SPECIAL_CHARS.begin(), SPECIAL_CHARS.end(), ch) != 
SPECIAL_CHARS.end();
+}
+/**
+ * Reports whether `str` contains an unescaped ampersand.
+ * The character right after a backslash is skipped without being inspected,
+ * so an escaped ampersand does not count.
+ */
+bool Spec::Template::check(std::string_view str) {
+  enum class State {
+    Plain,
+    Escaped
+  } state = State::Plain;
+  for (char ch : str) {
+    switch (state) {
+      case State::Plain: {
+        if (ch == '&') {
+          return true;
+        } else if (ch == '\\') {
+          state = State::Escaped;
+        }
+        break;
+      }
+      case State::Escaped: {
+        state = State::Plain;  // whatever follows the backslash is ignored here
+        break;
+      }
+    }
+  }
+  return false;
+}
+/**
+ * Parses a template from [begin, end).
+ *
+ * The template is collected as literal fragments interleaved with references:
+ * every unescaped ampersand appends a new index pair to `references` and
+ * starts a new fragment. Accepted reference forms are `&` (both indices stay
+ * 0), `&N` and `&(N)` (only the first index is set) and `&(N,M)`.
+ *
+ * Parsing stops without an error at the first unescaped `)`, `]`, `.` or `[`
+ * met in plain text, or at `end`.
+ *
+ * @return the template and the iterator where parsing stopped, or an error
+ *         message for a malformed escape sequence or reference
+ */
+nonstd::expected<std::pair<Spec::Template, Spec::It>, std::string> 
Spec::Template::parse(It begin, It end) {
+  enum class State {
+    Plain,
+    Escaped,
+    Template,  // &
+    SimpleIndex,  // &1
+    CanonicalTemplate,  // &(
+    ParentIndex,  // &(1
+    NextIndex,  // &(1,
+    MatchIndex  // &(1,0
+  };
+
+  std::vector<std::string> fragments;
+  std::vector<std::pair<size_t, size_t>> references;
+  fragments.push_back({});
+  State state = State::Plain;
+  std::string target;  // digits of the index that is currently being read
+  // go beyond the last char on purpose
+  auto ch_it = begin;
+  while (ch_it <= end) {
+    std::optional<char> ch;  // stays empty at the one-past-the-end position
+    if (ch_it < end) {
+      ch = *ch_it;
+    }
+    bool force_terminate = false;
+    switch (state) {
+      case State::Plain: {
+        if (ch == '\\') {
+          state = State::Escaped;
+        } else if (ch == '&') {
+          references.push_back({});
+          fragments.push_back({});
+          state = State::Template;
+        } else if (ch == ')' || ch == ']' || ch == '.' || ch == '[') {
+          force_terminate = true;  // leave ch_it on the terminating char
+        } else if (ch) {
+          fragments.back() += ch.value();
+        }
+        break;
+      }
+      case State::Escaped: {
+        if (!ch) {
+          return nonstd::make_unexpected("Unterminated escape sequence");
+        }
+        if (ch != '\\' && !isSpecialChar(ch.value())) {
+          return nonstd::make_unexpected(fmt::format("Unknown escape sequence 
in template '\\{}'", ch.value()));
+        }
+        fragments.back() += ch.value();
+        state = State::Plain;
+        break;
+      }
+      case State::Template: {
+        if (ch == '(') {
+          state = State::CanonicalTemplate;
+        } else if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) 
{
+          target.clear();
+          target += ch.value();
+          state = State::SimpleIndex;
+        } else {
+          state = State::Plain;
+          // reprocess this char in a different state
+          --ch_it;
+        }
+        break;
+      }
+      case State::SimpleIndex: {
+        if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target += ch.value();
+        } else {
+          references.back().first = std::stoi(target);  // NOTE(review): std::stoi throws std::out_of_range for oversized indices
+          state = State::Plain;
+          // reprocess this char in a different state
+          --ch_it;
+        }
+        break;
+      }
+      case State::CanonicalTemplate: {
+        if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target.clear();
+          target += ch.value();
+          state = State::ParentIndex;
+        } else {
+          return nonstd::make_unexpected(fmt::format("Expected an index at 
{}", std::distance(begin, ch_it)));
+        }
+        break;
+      }
+      case State::ParentIndex: {
+        if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target += ch.value();
+        } else if (ch == ',') {
+          references.back().first = std::stoi(target);
+          state = State::NextIndex;
+        } else if (ch == ')') {
+          references.back().first = std::stoi(target);
+          state = State::Plain;
+        } else {
+          return nonstd::make_unexpected(fmt::format("Invalid character at {}, 
expected digit, comma or close parenthesis", std::distance(begin, ch_it)));
+        }
+        break;
+      }
+      case State::NextIndex: {
+        if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target.clear();
+          target += ch.value();
+          state = State::MatchIndex;
+        } else {
+          return nonstd::make_unexpected(fmt::format("Expected an index at 
{}", std::distance(begin, ch_it)));
+        }
+        break;
+      }
+      case State::MatchIndex: {
+        if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target += ch.value();
+        } else if (ch == ')') {
+          references.back().second = std::stoi(target);
+          state = State::Plain;
+        } else {
+          return nonstd::make_unexpected(fmt::format("Invalid character at {}, 
expected digit or close parenthesis", std::distance(begin, ch_it)));
+        }
+        break;
+      }
+    }
+    if (force_terminate) {
+      break;
+    }
+    if (ch_it != end) {
+      ++ch_it;
+    } else {
+      break;
+    }
+  }
+
+  gsl_Assert(state == State::Plain);
+  return std::pair<Template, It>{Template{std::move(fragments), 
std::move(references)}, ch_it};
+}
+/**
+ * Reports whether `str` contains an unescaped asterisk.
+ * The character right after a backslash is skipped without being inspected,
+ * so an escaped asterisk does not count.
+ */
+bool Spec::Regex::check(std::string_view str) {
+  enum class State {
+    Plain,
+    Escaped
+  } state = State::Plain;
+  for (char ch : str) {
+    switch (state) {
+      case State::Plain: {
+        if (ch == '*') {
+          return true;
+        } else if (ch == '\\') {
+          state = State::Escaped;
+        }
+        break;
+      }
+      case State::Escaped: {
+        state = State::Plain;  // whatever follows the backslash is ignored here
+        break;
+      }
+    }
+  }
+  return false;
+}
+/**
+ * Splits `str` into literal fragments at every unescaped asterisk, so a
+ * pattern with k wildcards produces k + 1 fragments (possibly empty ones).
+ *
+ * A backslash must be followed by another backslash or by a character accepted
+ * by isSpecialChar; the escaped character is appended to the current fragment.
+ *
+ * @return the parsed Regex, or an error message for a malformed escape
+ */
+nonstd::expected<Spec::Regex, std::string> Spec::Regex::parse(std::string_view 
str) {
+  enum class State {
+    Plain,
+    Escaped
+  };
+  std::vector<std::string> fragments;
+  fragments.push_back({});
+  State state = State::Plain;
+  for (size_t idx = 0; idx <= str.size(); ++idx) {  // runs one step past the last char
+    std::optional<char> ch;  // stays empty on the extra end-of-input step
+    if (idx < str.size()) {
+      ch = str[idx];
+    }
+    switch (state) {
+      case State::Plain: {
+        if (ch == '\\') {
+          state = State::Escaped;
+        } else if (ch == '*') {
+          fragments.push_back({});
+        } else if (ch) {
+          fragments.back() += ch.value();
+        }
+        break;
+      }
+      case State::Escaped: {
+        if (!ch) {
+          return nonstd::make_unexpected("Unterminated escape sequence");
+        }
+        if (ch != '\\' && !isSpecialChar(ch.value())) {
+          return nonstd::make_unexpected(fmt::format("Unknown escape sequence 
in pattern '\\{}'", ch.value()));
+        }
+        fragments.back() += ch.value();
+        state = State::Plain;
+        break;
+      }
+    }
+  }
+  gsl_Assert(state == State::Plain);
+  return Regex{std::move(fragments)};
+}
+/**
+ * Evaluates the template against `ctx`.
+ *
+ * The result is fragment 0, the text selected by reference 0, fragment 1, and
+ * so on up to the last fragment. For each reference, `first` is handed to
+ * ctx.find and `second` indexes into the `matches` of the context found.
+ *
+ * @throws Exception if ctx.find returns null or the match index is out of range
+ */
+std::string Spec::Template::eval(const Context& ctx) const {
+  std::string res;
+  for (size_t idx = 0; idx + 1 < fragments.size(); ++idx) {
+    res += fragments.at(idx);
+    auto& ref = references.at(idx);
+    auto* target = ctx.find(ref.first);
+    if (!target) {
+      throw Exception(GENERAL_EXCEPTION, fmt::format("Invalid reference to {} 
at {}", ref.first, ctx.path()));
+    }
+    if (target->matches.size() <= ref.second) {  // NOTE(review): matches.at(0) below throws std::out_of_range if matches is empty; confirm it never is
+      throw Exception(GENERAL_EXCEPTION, fmt::format("Could not find match {} 
in '{}' at {}", ref.second, target->matches.at(0), ctx.path()));
+    }
+    res += target->matches.at(ref.second);
+  }
+  res += fragments.back();
+  return res;
+}
+/**
+ * Matches `str` against the wildcard pattern stored in `fragments`.
+ *
+ * With a single fragment (no wildcard) the string must equal it. Otherwise the
+ * first fragment must be a prefix of `str`, the last one a suffix, and each
+ * middle fragment is located left to right with std::search, which picks its
+ * first occurrence after the previous fragment.
+ *
+ * @return std::nullopt if `str` does not match; otherwise a vector whose
+ *         element 0 is the whole string, followed by one view per wildcard
+ *         holding the text that wildcard covered
+ */
+std::optional<std::vector<std::string_view>> 
Spec::Regex::match(std::string_view str) const {
+  std::vector<std::string_view> matches;
+  matches.push_back(str);
+  if (fragments.size() == 1) {
+    if (str == fragments.front()) {
+      return matches;
+    } else {
+      return std::nullopt;
+    }
+  }
+
+  // first fragment is at the beginning of the string
+  if (str.substr(0, fragments.front().size()) != fragments.front()) {
+    return std::nullopt;
+  }
+  auto it = str.begin() + fragments.front().size();
+  for (size_t idx = 1; idx + 1 < fragments.size(); ++idx) {
+    auto& frag = fragments[idx];
+    auto next_it = std::search(it, str.end(), frag.begin(), frag.end());
+    if (next_it == str.end() && !frag.empty()) {
+      return std::nullopt;
+    }
+    matches.push_back({it, next_it});
+    it = next_it + frag.size();
+  }
+  // last fragment is at the end of the string
+  if (gsl::narrow<size_t>(std::distance(it, str.end())) < 
fragments.back().size()) {
+    // not enough characters left
+    return std::nullopt;
+  }
+  auto next_it = std::next(str.rbegin(), fragments.back().size()).base();  // start of the trailing fragment
+  if (std::string_view(next_it, str.end()) != fragments.back()) {
+    return std::nullopt;
+  }
+  matches.push_back({it, next_it});
+  return matches;
+}
+
+namespace {
+
+nonstd::expected<std::pair<Spec::Destination, Spec::It>, std::string> 
parseDestination(const Spec::Context& ctx, Spec::It begin, Spec::It end);
+Spec::Destinations parseDestinations(const Spec::Context& ctx, const 
rapidjson::Value& val);
+
+Spec::Pattern::Value parseValue(const Spec::Context& ctx, const 
rapidjson::Value& val);
+
+std::pair<size_t, size_t> parseKeyAccess(std::string_view str) {
+  enum class State {
+    Begin,
+    BeginRef,
+    PrimaryIndex,
+    BeginFirstIndex,
+    FirstIndex,
+    BeginSecondIndex,
+    SecondIndex,
+    End
+  } state = State::Begin;
+  std::string target;
+  std::pair<size_t, size_t> result{0, 0};
+  for (size_t idx = 0; idx <= str.size(); ++idx) {
+    std::optional<char> ch;
+    if (idx < str.size()) {
+      ch = str[idx];
+    }
+    switch (state) {
+      case State::Begin: {
+        if (ch != '$') {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Expected '$' in key 
access in '{}' at {}", str, idx));
+        }
+        state = State::BeginRef;
+        break;
+      }
+      case State::BeginRef: {
+        if (ch == '(') {
+          state = State::BeginFirstIndex;
+        } else if (ch && std::isdigit(static_cast<unsigned char>(ch.value()))) 
{
+          target.clear();
+          target += ch.value();
+          state = State::PrimaryIndex;
+        } else if (ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Expected index in 
key access in '{}' at {}", str, idx));
+        }
+        break;
+      }
+      case State::PrimaryIndex: {
+        if (!ch) {
+          result.first = std::stoull(target);
+        } else if (std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target += ch.value();
+        } else {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Expected digit in 
key access in '{}' at {}", str, idx));
+        }
+        break;
+      }
+      case State::BeginFirstIndex: {
+        if (!ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Unterminated first 
index in key access in '{}'", str));
+        } else if (std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target.clear();
+          target += ch.value();
+          state = State::FirstIndex;
+        } else {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Expected digit in 
key access in '{}' at {}", str, idx));
+        }
+        break;
+      }
+      case State::FirstIndex: {
+        if (!ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Unterminated first 
index in key access in '{}'", str));
+        } else if (std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target += ch.value();
+        } else if (ch == ',') {
+          result.first = std::stoull(target);
+          state = State::BeginSecondIndex;
+        }
+        break;
+      }
+      case State::BeginSecondIndex: {
+        if (!ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Unterminated second 
index in key access in '{}'", str));
+        } else if (std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target.clear();
+          target += ch.value();
+          state = State::SecondIndex;
+        } else {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Expected digit in 
key access in '{}' at {}", str, idx));
+        }
+        break;
+      }
+      case State::SecondIndex: {
+        if (!ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Unterminated second 
index in key access in '{}'", str));
+        } else if (std::isdigit(static_cast<unsigned char>(ch.value()))) {
+          target += ch.value();
+        } else if (ch == ')') {
+          result.second = std::stoull(target);
+          state = State::End;
+        }
+        break;
+      }
+      case State::End: {
+        if (ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Expected end of 
string in '{}' at {}", str, idx));
+        }
+        break;
+      }
+    }
+  }
+  return result;
+}
+/**
+ * Removes the backslash escapes from a literal key.
+ *
+ * A backslash must be followed by another backslash or by a character accepted
+ * by isSpecialChar; the escaped character is copied to the output as is.
+ *
+ * @param str the escaped literal
+ * @return the literal without the escaping backslashes
+ * @throws Exception if the input ends right after a backslash, or if the
+ *         escaped character is neither a backslash nor a special character
+ */
+std::string parseLiteral(std::string_view str) {
+  enum class State {
+    Plain,
+    Escaped
+  } state = State::Plain;
+  std::string result;
+  for (size_t idx = 0; idx <= str.size(); ++idx) {  // runs one step past the last char
+    std::optional<char> ch;  // stays empty on the extra end-of-input step
+    if (idx < str.size()) {
+      ch = str[idx];
+    }
+    switch (state) {
+      case State::Plain: {
+        if (ch == '\\') {
+          state = State::Escaped;
+        } else if (ch) {
+          result += ch.value();
+        }
+        break;
+      }
+      case State::Escaped: {
+        if (!ch) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Unterminated escape 
sequence in '{}'", str));
+        }
+        if (ch != '\\' && !isSpecialChar(ch.value())) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Unknown escape 
sequence in literal '\\{}'", ch.value()));
+        }
+        result += ch.value();
+        state = State::Plain;
+        break;
+      }
+    }
+  }
+
+  gsl_Expects(state == State::Plain);
+  return result;
+}
+
+/**
+ * Parses a path from [begin, end) by delegating to parseDestination and
+ * converting the parsed destination into a Spec::Path.
+ *
+ * Every member of the destination must hold a Spec::Template; any other
+ * alternative is reported as an error.
+ *
+ * @return the path together with the iterator returned by parseDestination,
+ *         or an error message
+ */
+nonstd::expected<std::pair<Spec::Path, Spec::It>, std::string> parsePath(const Spec::Context& ctx, Spec::It begin, Spec::It end) {
+  auto dst = parseDestination(ctx, begin, end);
+  if (!dst) {
+    return nonstd::make_unexpected(std::move(dst.error()));
+  }
+  Spec::Path result;
+  for (auto&& [member, type] : std::move(dst->first)) {
+    if (!holds_alternative<Spec::Template>(member)) {
+      return nonstd::make_unexpected(fmt::format("Value reference at {} cannot contain nested value reference path", ctx.path()));
+    }
+    result.emplace_back(std::move(std::get<Spec::Template>(member)), type);
+  }
+  // hand the assembled path over instead of copying every element again
+  return std::pair<Spec::Path, Spec::It>{std::move(result), dst->second};
+}
+/**
+ * Parses a value reference starting at `begin`; the first char must be an
+ * at sign.
+ *
+ * Forms handled below:
+ *  - a lone at sign: index 0 with an empty path
+ *  - at sign followed by digits: that index with an empty path
+ *  - at sign followed by other text: index 0; the text is parsed with
+ *    parsePath when `greedy_path` is set, otherwise with Spec::Template::parse
+ *    into a single FIELD member. If that parse fails, index 0 with an empty
+ *    path is returned and the iterator stays right after the at sign.
+ *  - at sign followed by an open parenthesis: an optional index, then either
+ *    the closing parenthesis, or (after a comma when an index was given)
+ *    a path followed by the closing parenthesis
+ *
+ * @return the reference and the iterator just past the consumed text, or an
+ *         error message
+ */
+nonstd::expected<std::pair<Spec::ValueRef, Spec::It>, std::string> 
parseValueReference(const Spec::Context& ctx, Spec::It begin, Spec::It end, 
bool greedy_path) {
+  using ResultT = std::pair<Spec::ValueRef, Spec::It>;
+  auto it = begin;
+  if (it == end) {
+    return nonstd::make_unexpected("Cannot parse value reference from empty 
string");
+  }
+  if (*it != '@') {
+    return nonstd::make_unexpected("Value reference must start with '@'");
+  }
+  ++it;
+  if (it == end) {
+    return ResultT{{0, {}}, it};
+  }
+  if (*it != '(') {
+    if (std::isdigit(static_cast<unsigned char>(*it))) {
+      // format is @123...
+      auto idx_begin = it;
+      while (it != end && std::isdigit(static_cast<unsigned char>(*it))) {
+        ++it;
+      }
+      return ResultT{{std::stoull(std::string{idx_begin, it}), {}}, it};
+    }
+    // format is @field.inner
+    if (greedy_path) {
+      if (auto path = parsePath(ctx, it, end)) {
+        return ResultT{{0, std::move(path->first)}, path->second};
+      } else {
+        return ResultT {{0, {}}, it};
+      }
+    } else {
+      if (auto templ = Spec::Template::parse(it, end)) {
+        return ResultT{{0, Spec::Path{{std::move(templ->first), 
Spec::MemberType::FIELD}}}, templ->second};
+      } else {
+        return ResultT {{0, {}}, it};
+      }
+    }
+  }
+  ++it;  // step over the open parenthesis
+  size_t idx = 0;
+  if (it != end && std::isdigit(static_cast<unsigned char>(*it))) {
+    auto idx_begin = it;
+    while (it != end && std::isdigit(static_cast<unsigned char>(*it))) {
+      ++it;
+    }
+    auto idx_end = it;
+    idx = std::stoull(std::string{idx_begin, idx_end});
+    if (it == end) {
+      return nonstd::make_unexpected("Expected ')' in value reference");
+    }
+    if (*it != ',') {
+      if (*it != ')') {
+        return nonstd::make_unexpected("Expected ')' in value reference");
+      }
+      ++it;
+      return ResultT{{idx, {}}, it};
+    }
+    // *it == ','
+    ++it;
+  }
+  if (it == end) {
+    return nonstd::make_unexpected("Expected member accessor in value 
reference");
+  }
+  auto path = parsePath(ctx, it, end);
+  if (!path) {
+    return nonstd::make_unexpected(fmt::format("Invalid path in value 
reference: {}", path.error()));
+  }
+  it = path->second;
+  if (it == end || *it != ')') {
+    return nonstd::make_unexpected("Expected ')' in value reference");
+  }
+  ++it;
+  return ResultT{{idx, std::move(path->first)}, it};
+}
+/**
+ * Returns true when every character in [begin, end) satisfies std::isdigit.
+ * An empty range also yields true, because std::all_of returns true for it.
+ */
+template<typename T>
+bool isAllDigits(T begin, T end) {
+  return std::all_of(begin, end, [] (auto ch) {return 
std::isdigit(static_cast<unsigned char>(ch));});
+}
+
+/**
+ * Parses one key of a spec map and stores the parsed entry in `result`.
+ *
+ * The beginning of `name` selects the kind of entry:
+ *  - `@`: value reference, which must span the whole key; appended to `result->values`
+ *  - `$`: key access, inserted into `result->keys`
+ *  - `#`: default, the text after the hash is inserted into `result->defaults`
+ *  - otherwise a template (unescaped `&`), a regex (unescaped `*`) or a literal
+ *
+ * @param ctx    context of the map that contains the key
+ * @param result pattern that receives the parsed entry
+ * @param name   the key, already split at unescaped `|` by the caller
+ * @param member the JSON value belonging to the key
+ * @throws Exception if the key cannot be parsed or mixes `&` and `*`
+ */
+void parseMember(const Spec::Context& ctx, const std::unique_ptr<Spec::Pattern>& result, std::string_view name, const rapidjson::Value& member) {
+  if (name.starts_with("@")) {
+    if (auto ref = parseValueReference(ctx, name.begin(), name.end(), true)) {
+      if (ref->second != name.end()) {
+        throw Exception(GENERAL_EXCEPTION, "Failed to fully parse value reference");
+      }
+      Spec::Context sub_ctx = ctx.extend(ctx.matches, ctx.node);
+      result->values.push_back({Spec::ValueRef{ref->first}, parseValue(sub_ctx, member)});
+    } else {
+      throw Exception(GENERAL_EXCEPTION, fmt::format("Failed to parse value reference at '{}/{}': {}", ctx.path(), name, ref.error()));
+    }
+  } else if (name.starts_with("$")) {
+    Spec::Context sub_ctx = ctx.extend({name}, nullptr);
+    result->keys.insert({parseKeyAccess(name), parseDestinations(sub_ctx, member)});
+  } else if (name.starts_with("#")) {
+    result->defaults.insert({std::string{name.substr(1)}, parseDestinations(ctx, member)});
+  } else {
+    const bool is_template = Spec::Template::check(name);
+    const bool is_regex = Spec::Regex::check(name);
+    if (is_template && is_regex) {
+      throw Exception(GENERAL_EXCEPTION, "Pattern cannot contain both & and *");
+    }
+    if (is_template) {
+      if (auto templ = Spec::Template::parse(name.begin(), name.end())) {
+        if (templ->second != name.end()) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Failed to parse template at {}, unexpected char at {}", ctx.path(), std::distance(name.begin(), templ->second)));
+        }
+        // dry eval so we can check if the references refer to valid substrings
+        (void)templ->first.eval(ctx);
+        Spec::Context sub_ctx = ctx.extend({name}, nullptr);
+        result->templates.insert({templ->first, parseValue(sub_ctx, member)});
+      } else {
+        throw Exception(GENERAL_EXCEPTION, fmt::format("Error while parsing key template at {}: {}", ctx.path(), templ.error()));
+      }
+    } else if (is_regex) {
+      if (auto reg = Spec::Regex::parse(name)) {
+        Spec::Context sub_ctx = ctx.extend({name}, nullptr);
+        sub_ctx.matches.resize(reg.value().size());
+        result->regexes.insert({reg.value(), parseValue(sub_ctx, member)});
+      } else {
+        throw Exception(GENERAL_EXCEPTION, fmt::format("Error while parsing key regex at {}: {}", ctx.path(), reg.error()));
+      }
+    } else {
+      Spec::Context sub_ctx = ctx.extend({name}, nullptr);
+      std::optional<size_t> numeric_value;
+      auto literal_name = parseLiteral(name);
+      result->literal_indices.insert({literal_name, result->literals.size()});
+      // isAllDigits is vacuously true for an empty key and std::stoull("") throws
+      // std::invalid_argument, so an empty key must not be treated as numeric
+      if (!literal_name.empty() && isAllDigits(literal_name.begin(), literal_name.end())) {
+        numeric_value = std::stoull(literal_name);
+      }
+      result->literals.push_back({literal_name, numeric_value, parseValue(sub_ctx, member)});
+    }
+  }
+}
+
+std::unique_ptr<Spec::Pattern> parseMap(const Spec::Context& ctx, const 
rapidjson::Value& val) {
+  if (!val.IsObject()) {
+    throw Exception(GENERAL_EXCEPTION, fmt::format("Expected a map at '{}'", 
ctx.path()));
+  }
+  auto map = std::make_unique<Spec::Pattern>();
+
+  enum class State {
+    Plain,
+    Escaped
+  } state = State::Plain;
+
+  for (auto& [name_val, member] : val.GetObject()) {
+    std::string_view name{name_val.GetString(), name_val.GetStringLength()};
+    std::string subkey;
+    for (size_t idx = 0; idx <= name.size(); ++idx) {
+      std::optional<char> ch;
+      if (idx < name.size()) {
+        ch = name[idx];
+      }
+      switch (state) {
+        case State::Plain: {
+          if (ch == '\\') {
+            state = State::Escaped;
+          } else if (!ch || ch == '|') {
+            parseMember(ctx, map, subkey, member);
+            subkey.clear();
+          } else {
+            subkey += ch.value();
+          }
+          break;
+        }
+        case State::Escaped: {
+          if (!ch) {
+            throw Exception(GENERAL_EXCEPTION, "Unterminated escape sequence");
+          }
+          if (ch == '|') {
+            subkey += "|";
+          } else {
+            subkey += "\\";
+            subkey += ch.value();
+          }
+          state = State::Plain;
+          break;
+        }
+      }
+    }
+  }
+  return map;
+}
+
+nonstd::expected<std::pair<Spec::MatchingIndex, Spec::It>, std::string> 
parseMatchingIndex(Spec::It begin, Spec::It end) {
+  auto it = begin;
+  if (it == end) {
+    return nonstd::make_unexpected("Empty matching index");
+  }
+  if (*it != '#') {
+    return nonstd::make_unexpected("Matching must start with a '#'");
+  }
+  ++it;
+  auto idx_begin = it;
+  while (it != end && std::isdigit(static_cast<unsigned char>(*it))) {
+    ++it;
+  }
+  return std::pair<Spec::MatchingIndex, 
Spec::It>{std::stoull(std::string{idx_begin, it}), it};
+}
+
+// dot-delimited list of templates and value references
+nonstd::expected<std::pair<Spec::Destination, Spec::It>, std::string> 
parseDestination(const Spec::Context& ctx, Spec::It begin, Spec::It end) {
+  Spec::Destination result;
+  Spec::MemberType type = Spec::MemberType::FIELD;
+  auto ch_it = begin;
+  auto isEnd = [&] () {
+    return ch_it == end || *ch_it == ')';
+  };
+  while (!isEnd()) {
+    if (auto match_idx = parseMatchingIndex(ch_it, end)) {
+      if (type != Spec::MemberType::INDEX) {
+        return nonstd::make_unexpected("Matching index can only be used in 
index context, e.g. apple[#2]");
+      }
+      if (!ctx.find(match_idx->first)) {
+        return nonstd::make_unexpected(fmt::format("Invalid matching index at 
{} to ancestor {}", ctx.path(), match_idx->first));
+      }
+      result.push_back({match_idx->first, type});
+      ch_it = match_idx->second;
+    } else if (auto val_ref = parseValueReference(ctx, ch_it, end, false)) {
+      result.push_back({std::move(val_ref->first), type});
+      ch_it = val_ref->second;
+    } else if (auto templ = Spec::Template::parse(ch_it, end)) {
+      // dry eval to verify that references are valid
+      (void)templ->first.eval(ctx);
+      result.push_back({std::move(templ->first), type});
+      ch_it = templ->second;
+    } else {
+      return nonstd::make_unexpected(fmt::format("Could not parse neither 
value reference or template in {} at {}", ctx.path(), std::distance(begin, 
ch_it)));
+    }
+    if (type == Spec::MemberType::INDEX) {
+      if (ch_it == end || *ch_it != ']') {
+        return nonstd::make_unexpected(fmt::format("Expected closing index ']' 
in {} at {}", ctx.path(), std::distance(begin, ch_it)));
+      }
+      ++ch_it;
+    }
+    if (!isEnd()) {
+      if (*ch_it == '.') {
+        type = Spec::MemberType::FIELD;
+      } else if (*ch_it == '[') {
+        type = Spec::MemberType::INDEX;
+      } else {
+        return nonstd::make_unexpected(fmt::format("Unexpected destination 
delimiter '{}' in {} at {}", *ch_it, ctx.path(), std::distance(begin, ch_it)));
+      }
+      ++ch_it;
+      if (ch_it == end) {
+        if (type == Spec::MemberType::FIELD) {
+          return nonstd::make_unexpected(fmt::format("Unterminated member in 
{} at {}", ctx.path(), std::distance(begin, ch_it)));
+        } else {
+          return nonstd::make_unexpected(fmt::format("Unterminated indexed 
member in {} at {}", ctx.path(), std::distance(begin, ch_it)));
+        }
+      }
+    }
+  }
+
+  return std::pair<Spec::Destination, Spec::It>{result, ch_it};
+}
+
+Spec::Destinations parseDestinations(const Spec::Context& ctx, const 
rapidjson::Value& val) {
+  Spec::Destinations res;
+  if (val.IsNull()) {
+    return res;
+  }
+  if (val.IsArray()) {
+    for (rapidjson::SizeType i = 0; i < val.GetArray().Size(); ++i) {
+      auto& item = val.GetArray()[i];
+      if (!item.IsString()) {
+        throw Exception(GENERAL_EXCEPTION, fmt::format("Expected a string or 
array of strings at '{}/{}'", ctx.path(), i));
+      }
+      if (auto dst = parseDestination(ctx, item.GetString(), item.GetString() 
+ item.GetStringLength())) {
+        if (dst->second != item.GetString() + item.GetStringLength()) {
+          throw Exception(GENERAL_EXCEPTION, fmt::format("Failed to fully 
parse destination at '{}/{}'", ctx.path(), i));
+        }
+        res.push_back(std::move(dst->first));
+      } else {
+        throw Exception(GENERAL_EXCEPTION, fmt::format("Failed to parse 
destination at '{}/{}': {}", ctx.path(), i, dst.error()));
+      }
+    }
+  } else {
+    if (!val.IsString()) {
+      throw Exception(GENERAL_EXCEPTION, fmt::format("Expected a string or 
array of strings at '{}'", ctx.path()));
+    }
+    if (auto dst = parseDestination(ctx, val.GetString(), val.GetString() + 
val.GetStringLength())) {
+      if (dst->second != val.GetString() + val.GetStringLength()) {
+        throw Exception(GENERAL_EXCEPTION, fmt::format("Failed to fully parse 
destination at '{}'", ctx.path()));
+      }
+      res.push_back(std::move(dst->first));
+    } else {
+      throw Exception(GENERAL_EXCEPTION, fmt::format("Failed to parse 
destination at '{}': {}", ctx.path(), dst.error()));
+    }
+  }
+  return res;
+}
+
+std::optional<std::string> jsonValueToString(const rapidjson::Value& val) {
+  if (val.IsString()) {
+    return std::string{val.GetString(), val.GetStringLength()};
+  }
+  if (val.IsUint64()) {
+    return std::to_string(val.GetUint64());
+  }
+  if (val.IsInt64()) {
+    return std::to_string(val.GetInt64());
+  }
+  if (val.IsDouble()) {
+    return std::to_string(static_cast<int64_t>(val.GetDouble()));

Review Comment:
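   This looks weird: do we really want to `static_cast` the double to an 
`int64_t`? The cast silently drops the fractional part (e.g. `1.5` is 
rendered as `"1"`), and it is undefined behavior for doubles outside the 
`int64_t` range.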



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to