[GitHub] [nifi-minifi-cpp] szaszm commented on a change in pull request #732: MINIFICPP-1013

GitBox Wed, 12 Feb 2020 03:50:21 -0800

szaszm commented on a change in pull request #732: MINIFICPP-1013
URL: https://github.com/apache/nifi-minifi-cpp/pull/732#discussion_r378203621


 ##########
 File path: extensions/sql/processors/QueryDatabaseTable.cpp
 ##########
 @@ -0,0 +1,475 @@
+/**
+ * @file QueryDatabaseTable.cpp
+ * QueryDatabaseTable class implementation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "QueryDatabaseTable.h"
+
+#include <vector>
+#include <queue>
+#include <map>
+#include <set>
+#include <sstream>
+#include <stdio.h>
+#include <string>
+#include <iostream>
+#include <memory>
+#include <codecvt>
+#include <algorithm>
+#include <regex>
+
+#include <soci/soci.h>
+
+#include "io/DataStream.h"
+#include "core/ProcessContext.h"
+#include "core/ProcessSession.h"
+#include "Exception.h"
+#include "utils/OsUtils.h"
+#include "data/DatabaseConnectors.h"
+#include "data/JSONSQLWriter.h"
+#include "data/SQLRowsetProcessor.h"
+#include "data/WriteCallback.h"
+#include "data/MaxCollector.h"
+#include "data/Utils.h"
+#include "utils/file/FileUtils.h"
+
+namespace org {
+namespace apache {
+namespace nifi {
+namespace minifi {
+namespace processors {
+
+const std::string QueryDatabaseTable::ProcessorName("QueryDatabaseTable");
+
+const core::Property QueryDatabaseTable::s_tableName(
+  core::PropertyBuilder::createProperty("Table 
Name")->isRequired(true)->withDescription("The name of the database table to be 
queried.")->supportsExpressionLanguage(true)->build());
+
+const core::Property QueryDatabaseTable::s_columnNames(
+  core::PropertyBuilder::createProperty("Columns to 
Return")->isRequired(false)->withDescription(
+    "A comma-separated list of column names to be used in the query. If your 
database requires special treatment of the names (quoting, e.g.), each name 
should include such treatment. "
+    "If no column names are supplied, all columns in the specified table will 
be returned. "
+    "NOTE: It is important to use consistent column names for a given table 
for incremental fetch to work 
properly.")->supportsExpressionLanguage(true)->build());
+
+const core::Property QueryDatabaseTable::s_maxValueColumnNames(
+  core::PropertyBuilder::createProperty("Maximum-value 
Columns")->isRequired(false)->withDescription(
+    "A comma-separated list of column names. The processor will keep track of 
the maximum value for each column that has been returned since the processor 
started running. "
+    "Using multiple columns implies an order to the column list, and each 
column's values are expected to increase more slowly than the previous columns' 
values. "
+    "Thus, using multiple columns implies a hierarchical structure of columns, 
which is usually used for partitioning tables. "
+    "This processor can be used to retrieve only those rows that have been 
added/updated since the last retrieval. "
+    "Note that some ODBC types such as bit/boolean are not conducive to 
maintaining maximum value, so columns of these types should not be listed in 
this property, and will result in error(s) during processing. "
+    "If no columns are provided, all rows from the table will be considered, 
which could have a performance impact. "
+    "NOTE: It is important to use consistent max-value column names for a 
given table for incremental fetch to work properly. "
+    "NOTE: Because of a limitation of database access library 'soci', which 
doesn't support milliseconds in it's 'dt_date', "
+    "there is a possibility that flowfiles might have duplicated records, if a 
max-value column with 'dt_date' type has value with milliseconds.")->
+    supportsExpressionLanguage(true)->build());
+
+const core::Property QueryDatabaseTable::s_whereClause(
+  
core::PropertyBuilder::createProperty("db-fetch-where-clause")->isRequired(false)->withDescription(
+    "A custom clause to be added in the WHERE condition when building SQL 
queries.")->supportsExpressionLanguage(true)->build());
+
+const core::Property QueryDatabaseTable::s_sqlQuery(
+  
core::PropertyBuilder::createProperty("db-fetch-sql-query")->isRequired(false)->withDescription(
+    "A custom SQL query used to retrieve data. Instead of building a SQL query 
from other properties, this query will be wrapped as a sub-query. "
+    "Query must have no ORDER BY 
statement.")->supportsExpressionLanguage(true)->build());
+
+const core::Property QueryDatabaseTable::s_maxRowsPerFlowFile(
+  
core::PropertyBuilder::createProperty("qdbt-max-rows")->isRequired(true)->withDefaultValue<int>(0)->withDescription(
+    "The maximum number of result rows that will be included in a single 
FlowFile. This will allow you to break up very large result sets into multiple 
FlowFiles. "
+    "If the value specified is zero, then all rows are returned in a single 
FlowFile.")->supportsExpressionLanguage(true)->build());
+
+const core::Property QueryDatabaseTable::s_stateDirectory(
+  core::PropertyBuilder::createProperty("State 
Directory")->isRequired(false)->withDefaultValue("QDTState")->withDescription("Directory
 which contains processor state data.")->build());
+
+const std::string 
QueryDatabaseTable::s_initialMaxValueDynamicPropertyPrefix("initial.maxvalue.");
+
+const core::Relationship QueryDatabaseTable::s_success("success", 
"Successfully created FlowFile from SQL query result set.");
+
+static const std::string ResultTableName = "tablename";
+static const std::string ResultRowCount = "querydbtable.row.count";
+
+// State
+class State {
+ public:
+  State(const std::string& tableName, const std::string& stateDir, const 
std::string& uuid, std::shared_ptr<logging::Logger> logger)
+    :tableName_(tableName), logger_(logger) {
+    if (!createUUIDDir(stateDir, uuid, filePath_))
+      return;
+
+    filePath_ += "State.txt";
+
+    if (!getStateFromFile())
+      return;
+
+    ok_ = true;
+  }
+
+  ~State() {
+    if (file_.is_open()) {
+      file_.close();
+    }
+  }
+
+  operator bool() const {
+    return ok_;
+  }
+
+  std::unordered_map<std::string, std::string> mapState() const {
+    return mapState_;
+  }
+
+  void writeStateToFile(const std::unordered_map<std::string, std::string>& 
mapState) {
+    file_.seekp(std::ios::beg);
+
+    file_ << tableName_ << separator();
+    auto dataSize = tableName_.size() + separator().size();
+
+    for (const auto& el : mapState) {
+      file_ << el.first << '=' << el.second << separator();
+      dataSize += el.first.size() + 1 + el.second.size() + separator().size();
+    }
+
+    // If dataSize_ > dataSize, then clear difference with ' '.
+    if (dataSize_ > dataSize) {
+      for (auto i = dataSize_ - dataSize; i > 0; i--) {
+        file_ << ' ';
+      }
+    }
 
 Review comment:
   Your previous reply could be a great code comment explaining why the loop is 
necessary, but, if I understand correctly, just unwrapping the `if` body and 
leaving only the `for` loop would have equivalent behavior with 2 fewer lines 
and 1 fewer indentation level (i.e. less complexity).
   
   Correct me if I'm wrong, but I suggest this in place of lines 146-151:
   ```
   // If a maxValueColumnName type is varchar then a new max value can be 
shorter than previous max value,
   // and due to ???, we need to pad the data in the state file
   for (auto i = dataSize_ - dataSize; i > 0; i--) {
     file_ << ' ';
   }
   ```
   

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

[GitHub] [nifi-minifi-cpp] szaszm commented on a change in pull request #732: MINIFICPP-1013

Reply via email to