projjal commented on a change in pull request #11193: URL: https://github.com/apache/arrow/pull/11193#discussion_r712991445
########## File path: cpp/src/gandiva/base_cache_key.h ########## @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <arrow/util/hash_util.h> +#include <stddef.h> + +#include "gandiva/expression.h" +#include "gandiva/filter.h" +#include "gandiva/projector.h" + +namespace gandiva { + +class BaseCacheKey { + public: + BaseCacheKey(ProjectorCacheKey& key, std::string type) : type_(type) { + static const int32_t kSeedValue = 4; + size_t key_hash = key.Hash(); + size_t result_hash = kSeedValue; + arrow::internal::hash_combine(result_hash, type); + arrow::internal::hash_combine(result_hash, key_hash); + hash_code_ = result_hash; + schema_ = key.schema(); + } + + BaseCacheKey(FilterCacheKey& key, std::string type) : type_(type) { + static const size_t kSeedValue = 4; + size_t key_hash = key.Hash(); + size_t result_hash = kSeedValue; + arrow::internal::hash_combine(result_hash, type); + arrow::internal::hash_combine(result_hash, key_hash); + hash_code_ = result_hash; + schema_ = key.schema(); + } + + size_t Hash() const { return hash_code_; } + + std::string Type() const { return type_; } + + bool operator==(const BaseCacheKey& other) const { + if (hash_code_ != other.hash_code_) { Review comment: This is incorrect. You can't just only compare the hashcode. ########## File path: cpp/src/gandiva/llvm_generator.h ########## @@ -49,10 +51,48 @@ class GANDIVA_EXPORT LLVMGenerator { static Status Make(std::shared_ptr<Configuration> config, std::unique_ptr<LLVMGenerator>* llvm_generator); + static std::shared_ptr<Cache<BaseCacheKey, std::shared_ptr<llvm::MemoryBuffer>>> + GetCache(); + /// \brief Build the code for the expression trees for default mode. Each /// element in the vector represents an expression tree Status Build(const ExpressionVector& exprs, SelectionVector::Mode mode); + /// \brief Build the code for the expression trees for default mode with a LLVM + /// ObjectCache. Each element in the vector represents an expression tree + template <class KeyType> + Status Build(const ExpressionVector& exprs, SelectionVector::Mode mode, Review comment: Better to add the definition in the cc file. ########## File path: cpp/src/gandiva/engine.cc ########## @@ -167,7 +167,6 @@ Status Engine::Make(const std::shared_ptr<Configuration>& conf, return Status::CodeGenError("Could not instantiate llvm::ExecutionEngine: ", builder_error); } - Review comment: nit: better to remove these extraneous changes ########## File path: cpp/src/gandiva/greedy_dual_size_cache.h ########## @@ -146,9 +159,25 @@ class GreedyDualSizeCache { priority_set_.erase(i); } + void evictObject() { Review comment: This looks duplicate of evict() ########## File path: cpp/src/gandiva/llvm_generator.cc ########## @@ -50,6 +51,23 @@ Status LLVMGenerator::Make(std::shared_ptr<Configuration> config, return Status::OK(); } +std::shared_ptr<Cache<BaseCacheKey, std::shared_ptr<llvm::MemoryBuffer>>> +LLVMGenerator::GetCache() { + static Cache<BaseCacheKey, std::shared_ptr<llvm::MemoryBuffer>> cache; Review comment: this line seems is not required ########## File path: cpp/src/gandiva/greedy_dual_size_cache.h ########## @@ -146,9 +159,25 @@ class GreedyDualSizeCache { priority_set_.erase(i); } + void evictObject() { + // TODO: inflation overflow is unlikely to happen but needs to be handled + // for correctness. + // evict item from the beginning of the set. This set is ordered from the + // lower priority value to the higher priority value. + typename std::set<PriorityItem>::iterator i = priority_set_.begin(); + // update the inflation cost related to the evicted item + inflation_ = (*i).actual_priority; + size_t size_to_decrease = map_.find((*i).cache_key)->second.first.size; + cache_size_ -= size_to_decrease; + map_.erase((*i).cache_key); + priority_set_.erase(i); + } + map_type map_; std::set<PriorityItem> priority_set_; uint64_t inflation_; size_t capacity_; + size_t cache_size_ = 0; + llvm::SmallString<128> cache_dir_; Review comment: looks like this is not getting used. ########## File path: cpp/src/gandiva/llvm_generator.cc ########## @@ -50,6 +51,23 @@ Status LLVMGenerator::Make(std::shared_ptr<Configuration> config, return Status::OK(); } +std::shared_ptr<Cache<BaseCacheKey, std::shared_ptr<llvm::MemoryBuffer>>> +LLVMGenerator::GetCache() { + static Cache<BaseCacheKey, std::shared_ptr<llvm::MemoryBuffer>> cache; + // static std::unique_ptr<Cache<BaseCacheKey, std::shared_ptr<llvm::MemoryBuffer>>> + // cache_unique = std::make_unique<Cache<BaseCacheKey, + // std::shared_ptr<llvm::MemoryBuffer>>>(); + + // static std::shared_ptr<Cache<BaseCacheKey, std::shared_ptr<llvm::MemoryBuffer>>> + // shared_cache = std::move(cache_unique); Review comment: remove the commented out code? ########## File path: cpp/src/gandiva/filter.cc ########## @@ -120,17 +138,27 @@ Status Filter::Make(SchemaPtr schema, ConditionPtr condition, ARROW_RETURN_NOT_OK(expr_validator.Validate(condition)); // Start measuring build time - auto begin = std::chrono::high_resolution_clock::now(); - ARROW_RETURN_NOT_OK(llvm_gen->Build({condition}, SelectionVector::Mode::MODE_NONE)); + // auto begin = std::chrono::high_resolution_clock::now(); Review comment: remove the commented out code if no longer required ########## File path: cpp/src/gandiva/filter.cc ########## @@ -42,7 +43,7 @@ FilterCacheKey::FilterCacheKey(SchemaPtr schema, expression_as_string_ = expression.ToString(); UpdateUniqifier(expression_as_string_); arrow::internal::hash_combine(result, expression_as_string_); - arrow::internal::hash_combine(result, configuration); + arrow::internal::hash_combine(result, configuration->Hash()); Review comment: why this change? ########## File path: cpp/src/gandiva/projector.h ########## @@ -138,6 +167,7 @@ class GANDIVA_EXPORT Projector { SchemaPtr schema_; FieldVector output_fields_; std::shared_ptr<Configuration> configuration_; + bool compiled_from_cache_; Review comment: nit: use a different name like built_from_cache since it is already compiled ########## File path: cpp/src/gandiva/greedy_dual_size_cache.h ########## @@ -92,25 +96,26 @@ class GreedyDualSizeCache { bool contains(const Key& key) { return map_.find(key) != map_.end(); } - void insert(const Key& key, const ValueCacheObject<Value>& value) { + void InsertObjectCode(const Key& key, const ValueCacheObject<Value>& value) { Review comment: nit: original name insert seems better since this is a generic class. ########## File path: cpp/src/gandiva/llvm_generator.h ########## @@ -49,10 +51,48 @@ class GANDIVA_EXPORT LLVMGenerator { static Status Make(std::shared_ptr<Configuration> config, std::unique_ptr<LLVMGenerator>* llvm_generator); + static std::shared_ptr<Cache<BaseCacheKey, std::shared_ptr<llvm::MemoryBuffer>>> + GetCache(); + /// \brief Build the code for the expression trees for default mode. Each /// element in the vector represents an expression tree Status Build(const ExpressionVector& exprs, SelectionVector::Mode mode); + /// \brief Build the code for the expression trees for default mode with a LLVM + /// ObjectCache. Each element in the vector represents an expression tree + template <class KeyType> + Status Build(const ExpressionVector& exprs, SelectionVector::Mode mode, + GandivaObjectCache<KeyType>& obj_cache) { + selection_vector_mode_ = mode; + + // Start measuring code gen time + auto begin = std::chrono::high_resolution_clock::now(); + for (auto& expr : exprs) { + auto output = annotator_.AddOutputFieldDescriptor(expr->result()); + ARROW_RETURN_NOT_OK(Add(expr, output)); + } + + // Stop measuring time, calculate the elapsed time and pass it to object cache + auto end = std::chrono::high_resolution_clock::now(); + size_t elapsed = Review comment: This is wrong. the elapse time was supposed to track the compilation time which is mainly done in FinalizeModule -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
