arpadboda commented on a change in pull request #533: MINIFICPP-726 - Enhance 
ExtractText to have more feature parity with …
URL: https://github.com/apache/nifi-minifi-cpp/pull/533#discussion_r272972983
 
 

 ##########
 File path: libminifi/src/processors/ExtractText.cpp
 ##########
 @@ -104,7 +143,64 @@ int64_t 
ExtractText::ReadCallback::process(std::shared_ptr<io::BaseStream> strea
     }
   }
 
-  flowFile_->setAttribute(attrKey, contentStream.str());
+  if(regex_mode) {
+    std::regex_constants::syntax_option_type regex_mode = 
std::regex_constants::ECMAScript;
+
+    bool insensitive = InsensitiveMatch.getDefaultValue();
+    if(ctx_->getProperty(InsensitiveMatch.getName(), insensitive) && 
insensitive) {
+      regex_mode |= std::regex_constants::icase;
+    }
+
+    bool ignoregroupzero = IgnoreCaptureGroupZero.getDefaultValue();
+    ctx_->getProperty(IgnoreCaptureGroupZero.getName(), ignoregroupzero);
+
+    bool repeatingcapture = EnableRepeatingCaptureGroup.getDefaultValue();
+    ctx_->getProperty(EnableRepeatingCaptureGroup.getName(), repeatingcapture);
+
+    int maxCaptureSize = MaxCaptureGroupLen.getDefaultValue();
+    ctx_->getProperty(MaxCaptureGroupLen.getName(), maxCaptureSize);
+
+    std::string contentStr = contentStream.str();
+
+    std::map<std::string, std::string> regexAttributes;
+
+    for (const auto& k : ctx_->getDynamicPropertyKeys()){
+      std::string value;
+      ctx_->getDynamicProperty(k, value);
+
+      std::regex rgx(value, regex_mode);
+
+      std::smatch matches;
+
+      std::string workStr = contentStr;
+
+      int matchcount = 0;
+
+      while(std::regex_search(workStr, matches, rgx)) {
+        size_t i = ignoregroupzero ? 1 : 0;
+
+        for (; i < matches.size(); ++i, ++matchcount) {
+          std::string value = matches[i].str();
+          if(value.length() > maxCaptureSize) {
+            value = value.substr(0, maxCaptureSize);
+          }
+          if(matchcount == 0) {
+            regexAttributes[k] = value;
+          }
+          regexAttributes[k + '.' + std::to_string(matchcount)] = value;
+        }
+        if(!repeatingcapture) {
+          break;
+        }
+        workStr = matches.suffix();
+      }
+    }
+    for(const auto& kv : regexAttributes) {
+      flowFile_->setAttribute(kv.first, kv.second);
+    }
 
 Review comment:
   The only thing missing here is routing to "unmatched" when there is no 
match.
   However, I think that would be a breaking change, so I would prefer to do 
that in the scope of a follow-up scheduled for 1.0.
   This change can be part of the next minor release. 
   @apiri, do you agree, or do you have a different opinion? 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to