[GitHub] [arrow] rkavanap commented on a change in pull request #9956: ARROW-12211: [C++][Gandiva][WIP] Implement TO_NUMBER function in gandiva

GitBox Thu, 25 Nov 2021 01:56:43 -0800


rkavanap commented on a change in pull request #9956:
URL: https://github.com/apache/arrow/pull/9956#discussion_r756734331




##########
File path: cpp/src/gandiva/to_number_test.cc
##########
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include <cmath>
+
+#include "arrow/memory_pool.h"
+#include "gandiva/arrow.h"
+#include "gandiva/filter.h"
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+namespace gandiva {
+static const double MAX_ERROR = 0.00005;
+class TestToNumber : public ::testing::Test {
+ public:
+  void SetUp() { pool_ = arrow::default_memory_pool(); }
+
+ protected:
+  arrow::MemoryPool* pool_;
+};
+
+void VerifyFuzzyEquals(double actual, double expected, double max_error = 
MAX_ERROR) {
+  EXPECT_TRUE(fabs(actual - expected) < max_error) << actual << " != " << 
expected;
+}
+
+TEST_F(TestToNumber, TestToNumber) {
+  auto field0 = field("f0", arrow::utf8());
+  auto schema = arrow::schema({field0});
+  auto node = TreeExprBuilder::MakeField(field0);
+
+  auto res = field("r0", arrow::float64());
+  auto pattern_literal = TreeExprBuilder::MakeStringLiteral("##,###,###.###");

Review comment:
       should we have more tests with different pattern literals..? Maybe unit 
tests that directly tests DecimalFormatHolder instead of through expression 
tree, so that more cases can be handled.

##########
File path: cpp/src/gandiva/decimal_format_holder.cc
##########
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/decimal_format_holder.h"
+
+namespace gandiva {
+static bool IsArrowStringLiteral(arrow::Type::type type) {
+  return type == arrow::Type::STRING || type == arrow::Type::BINARY;
+}
+inline double get_scale_multiplier(int32_t scale) {
+  static const double values[] = {1.0,
+                                  10.0,
+                                  100.0,
+                                  1000.0,
+                                  10000.0,
+                                  100000.0,
+                                  1000000.0,
+                                  10000000.0,
+                                  100000000.0,
+                                  1000000000.0,
+                                  10000000000.0,
+                                  100000000000.0,
+                                  1000000000000.0,
+                                  10000000000000.0,
+                                  100000000000000.0,
+                                  1000000000000000.0,
+                                  10000000000000000.0,
+                                  100000000000000000.0,
+                                  1000000000000000000.0,
+                                  10000000000000000000.0};
+  if (scale >= 0 && scale < 20) {
+    return values[scale];
+  }
+  return static_cast<double>(powl(10.0, scale));
+}
+
+inline double round_decimal_digits(double to_round, int32_t scale) {
+  double scale_multiplier = get_scale_multiplier(scale);
+  return trunc(to_round * scale_multiplier + ((to_round >= 0) ? 0.5 : -0.5)) /
+         scale_multiplier;
+}
+
+Status DecimalFormatHolder::Make(const FunctionNode& node,
+                                 std::shared_ptr<DecimalFormatHolder>* holder) 
{
+  ARROW_RETURN_IF(node.children().size() != 2,
+                  Status::Invalid("'to_number' function requires two 
parameters"));
+  auto literal = dynamic_cast<LiteralNode*>(node.children().at(1).get());
+
+  ARROW_RETURN_IF(literal == nullptr,
+                  Status::Invalid("'to_number' function requires a"
+                                  " literal as the second parameter"));
+
+  auto literal_type = literal->return_type()->id();
+  ARROW_RETURN_IF(
+      !IsArrowStringLiteral(literal_type),
+      Status::Invalid(
+          "'to_number' function requires a string literal as the second 
parameter"));
+  return 
DecimalFormatHolder::Make(arrow::util::get<std::string>(literal->holder()),
+                                   holder);
+}
+
+Status DecimalFormatHolder::Make(const std::string& decimal_format,
+                                 std::shared_ptr<DecimalFormatHolder>* holder) 
{
+  auto lholder = std::shared_ptr<DecimalFormatHolder>(
+      new DecimalFormatHolder(decimal_format.c_str(), decimal_format.size()));

Review comment:
       isn't this a bit dangerous to have an object have a pointer to a string 
c_str. What if string decimal_format was constructed in the stack by the caller 
of Make?
   Is it always guaranteed that the decimal_format object passed here has a 
lifetime more than the DecimalFormatHolder object?
   As far as I can see the shared_ptr is against the DecimalFormatHolder and 
not against decimal_format. So if decimal_format goes out of scope. 
decimal_format.c_str() will be pointing to a freed memory location.

##########
File path: cpp/src/gandiva/decimal_format_holder.h
##########
@@ -0,0 +1,77 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/vendored/fast_float/fast_float.h"
+
+#include "gandiva/function_holder.h"
+#include "gandiva/node.h"
+
+namespace gandiva {
+
+class GANDIVA_EXPORT DecimalFormatHolder : public FunctionHolder {
+ public:
+  ~DecimalFormatHolder() override = default;
+
+  static Status Make(const FunctionNode& node,
+                     std::shared_ptr<DecimalFormatHolder>* holder);
+
+  static Status Make(const std::string& decimal_format,
+                     std::shared_ptr<DecimalFormatHolder>* holder);
+
+  double Parse(const char* number, int32_t number_size);
+
+ private:
+  explicit DecimalFormatHolder(const char* pattern, size_t pattern_size)
+      : pattern_(pattern), pattern_size_(pattern_size) {
+    maximumFractionDigits_ = Setup();
+  }
+
+  // Sets the format's metadata, such as the maximum number of decimal digits 
and if
+  // the patterns contains a dollar sign.
+  int32_t Setup() {
+    int32_t ret = 0;
+    bool is_decimal_part = false;
+    has_dollar_sign_ = false;
+    for (size_t i = 0; i < pattern_size_; ++i) {
+      if (pattern_[i] == '$') {
+        has_dollar_sign_ = true;
+      }
+
+      if (pattern_[i] == '.') {
+        is_decimal_part = true;
+        continue;
+      }
+
+      if (is_decimal_part) {
+        ret++;
+      }
+    }
+
+    return ret;
+  }
+
+  const char* pattern_;

Review comment:
       so please remove pattern_ and pattern_size_ as class variable and use it 
only in the Setup method.

##########
File path: cpp/src/gandiva/decimal_format_holder.cc
##########
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/decimal_format_holder.h"
+
+namespace gandiva {
+static bool IsArrowStringLiteral(arrow::Type::type type) {
+  return type == arrow::Type::STRING || type == arrow::Type::BINARY;
+}
+inline double get_scale_multiplier(int32_t scale) {
+  static const double values[] = {1.0,
+                                  10.0,
+                                  100.0,
+                                  1000.0,
+                                  10000.0,
+                                  100000.0,
+                                  1000000.0,
+                                  10000000.0,
+                                  100000000.0,
+                                  1000000000.0,
+                                  10000000000.0,
+                                  100000000000.0,
+                                  1000000000000.0,
+                                  10000000000000.0,
+                                  100000000000000.0,
+                                  1000000000000000.0,
+                                  10000000000000000.0,
+                                  100000000000000000.0,
+                                  1000000000000000000.0,
+                                  10000000000000000000.0};
+  if (scale >= 0 && scale < 20) {
+    return values[scale];
+  }
+  return static_cast<double>(powl(10.0, scale));
+}
+
+inline double round_decimal_digits(double to_round, int32_t scale) {
+  double scale_multiplier = get_scale_multiplier(scale);
+  return trunc(to_round * scale_multiplier + ((to_round >= 0) ? 0.5 : -0.5)) /
+         scale_multiplier;
+}
+
+Status DecimalFormatHolder::Make(const FunctionNode& node,
+                                 std::shared_ptr<DecimalFormatHolder>* holder) 
{
+  ARROW_RETURN_IF(node.children().size() != 2,
+                  Status::Invalid("'to_number' function requires two 
parameters"));
+  auto literal = dynamic_cast<LiteralNode*>(node.children().at(1).get());
+
+  ARROW_RETURN_IF(literal == nullptr,
+                  Status::Invalid("'to_number' function requires a"
+                                  " literal as the second parameter"));
+
+  auto literal_type = literal->return_type()->id();
+  ARROW_RETURN_IF(
+      !IsArrowStringLiteral(literal_type),
+      Status::Invalid(
+          "'to_number' function requires a string literal as the second 
parameter"));
+  return 
DecimalFormatHolder::Make(arrow::util::get<std::string>(literal->holder()),
+                                   holder);
+}
+
+Status DecimalFormatHolder::Make(const std::string& decimal_format,
+                                 std::shared_ptr<DecimalFormatHolder>* holder) 
{
+  auto lholder = std::shared_ptr<DecimalFormatHolder>(
+      new DecimalFormatHolder(decimal_format.c_str(), decimal_format.size()));
+  *holder = lholder;
+  return Status::OK();
+}
+
+double DecimalFormatHolder::Parse(const char* number, int32_t number_size) {

Review comment:
       Not sure of this. But do we have to handle NaN (not a number), positive 
infinity and negative infinity.
   
   Atleast the Java versions seems to handle it. But not sure if Dremio 
requires them to be handled and what is the representation for NaN and infinity.

##########
File path: cpp/src/gandiva/decimal_format_holder.h
##########
@@ -0,0 +1,77 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/vendored/fast_float/fast_float.h"
+
+#include "gandiva/function_holder.h"
+#include "gandiva/node.h"
+
+namespace gandiva {
+
+class GANDIVA_EXPORT DecimalFormatHolder : public FunctionHolder {
+ public:
+  ~DecimalFormatHolder() override = default;
+
+  static Status Make(const FunctionNode& node,
+                     std::shared_ptr<DecimalFormatHolder>* holder);
+
+  static Status Make(const std::string& decimal_format,
+                     std::shared_ptr<DecimalFormatHolder>* holder);
+
+  double Parse(const char* number, int32_t number_size);
+
+ private:
+  explicit DecimalFormatHolder(const char* pattern, size_t pattern_size)
+      : pattern_(pattern), pattern_size_(pattern_size) {
+    maximumFractionDigits_ = Setup();
+  }
+
+  // Sets the format's metadata, such as the maximum number of decimal digits 
and if
+  // the patterns contains a dollar sign.
+  int32_t Setup() {
+    int32_t ret = 0;
+    bool is_decimal_part = false;
+    has_dollar_sign_ = false;
+    for (size_t i = 0; i < pattern_size_; ++i) {
+      if (pattern_[i] == '$') {
+        has_dollar_sign_ = true;
+      }
+
+      if (pattern_[i] == '.') {
+        is_decimal_part = true;
+        continue;
+      }
+
+      if (is_decimal_part) {
+        ret++;
+      }
+    }
+
+    return ret;
+  }
+
+  const char* pattern_;

Review comment:
       the pattern_ is not used beyond setup. Also it is a pointer to a 
std::string.c_str() that may go out of scope. Currently it is working because 
the class functions are not using this pointer. So it is advisable not to store 
it as a class variable. Tomorrow if someone tries to use it in another method, 
he/she may inardvertently corrupt memory.

##########
File path: cpp/src/gandiva/decimal_format_holder.h
##########
@@ -0,0 +1,77 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/vendored/fast_float/fast_float.h"
+
+#include "gandiva/function_holder.h"
+#include "gandiva/node.h"
+
+namespace gandiva {
+
+class GANDIVA_EXPORT DecimalFormatHolder : public FunctionHolder {
+ public:
+  ~DecimalFormatHolder() override = default;
+
+  static Status Make(const FunctionNode& node,
+                     std::shared_ptr<DecimalFormatHolder>* holder);
+
+  static Status Make(const std::string& decimal_format,
+                     std::shared_ptr<DecimalFormatHolder>* holder);
+
+  double Parse(const char* number, int32_t number_size);
+
+ private:
+  explicit DecimalFormatHolder(const char* pattern, size_t pattern_size)
+      : pattern_(pattern), pattern_size_(pattern_size) {
+    maximumFractionDigits_ = Setup();
+  }
+
+  // Sets the format's metadata, such as the maximum number of decimal digits 
and if
+  // the patterns contains a dollar sign.
+  int32_t Setup() {
+    int32_t ret = 0;
+    bool is_decimal_part = false;
+    has_dollar_sign_ = false;
+    for (size_t i = 0; i < pattern_size_; ++i) {
+      if (pattern_[i] == '$') {
+        has_dollar_sign_ = true;
+      }
+
+      if (pattern_[i] == '.') {
+        is_decimal_part = true;
+        continue;
+      }
+
+      if (is_decimal_part) {
+        ret++;
+      }
+    }
+
+    return ret;
+  }
+
+  const char* pattern_;

Review comment:
       can't you just pass it as argument to Setup and avoid storing this 
pointer as class variable?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] rkavanap commented on a change in pull request #9956: ARROW-12211: [C++][Gandiva][WIP] Implement TO_NUMBER function in gandiva

Reply via email to