This is an automated email from the ASF dual-hosted git repository.

taiyangli pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new fc9d273ee [GLUTEN-6879][CH] Fix partition value diff when it contains blank spaces (#6880)
fc9d273ee is described below

commit fc9d273eed7946d255e0d675c6738fa06b45141a
Author: 李扬 <[email protected]>
AuthorDate: Fri Aug 16 18:09:01 2024 +0800

    [GLUTEN-6879][CH] Fix partition value diff when it contains blank spaces (#6880)
    
    * fix partition values diff
    
    * change as request
    
    * change as request
---
 .../execution/GlutenClickHouseHiveTableSuite.scala | 26 ++++++++++++++
 cpp-ch/local-engine/Common/GlutenStringUtils.cpp   | 41 +++++++++++++++++++---
 cpp-ch/local-engine/Common/GlutenStringUtils.h     |  3 ++
 .../Storages/SubstraitSource/FormatFile.cpp        |  8 ++---
 4 files changed, 67 insertions(+), 11 deletions(-)

diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala
index 57fda7714..83bc4e76b 100644
--- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala
+++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala
@@ -1314,4 +1314,30 @@ class GlutenClickHouseHiveTableSuite
     compareResultsAgainstVanillaSpark(select_sql, true, _ => {})
     spark.sql("drop table test_tbl_6506")
   }
+
+  test("GLUTEN-6879: Fix partition value diff when it contains blanks") {
+    val tableName = "test_tbl_6879"
+    sql(s"drop table if exists $tableName")
+
+    val createSql =
+      s"""
+         |CREATE TABLE $tableName (
+         |  id INT,
+         |  name STRING
+         |) PARTITIONED BY (part STRING)
+         |STORED AS PARQUET;
+         |""".stripMargin
+    sql(createSql)
+
+    val insertSql =
+      s"""
+         |INSERT INTO $tableName PARTITION (part='part with spaces')
+         |VALUES (1, 'John Doe');
+         |""".stripMargin
+    sql(insertSql)
+
+    val selectSql = s"SELECT * FROM $tableName"
+    compareResultsAgainstVanillaSpark(selectSql, true, _ => {})
+    sql(s"drop table if exists $tableName")
+  }
 }
diff --git a/cpp-ch/local-engine/Common/GlutenStringUtils.cpp b/cpp-ch/local-engine/Common/GlutenStringUtils.cpp
index b6d11ac1b..4a18f4ced 100644
--- a/cpp-ch/local-engine/Common/GlutenStringUtils.cpp
+++ b/cpp-ch/local-engine/Common/GlutenStringUtils.cpp
@@ -14,10 +14,11 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "GlutenStringUtils.h"
-#include <filesystem>
 #include <boost/algorithm/string.hpp>
 #include <Poco/StringTokenizer.h>
+#include <Poco/URI.h>
+
+#include "GlutenStringUtils.h"
 
 namespace local_engine
 {
@@ -27,16 +28,46 @@ PartitionValues GlutenStringUtils::parsePartitionTablePath(const std::string & f
     Poco::StringTokenizer path(file, "/");
     for (const auto & item : path)
     {
-        auto position = item.find('=');
-        if (position != std::string::npos)
+        auto pos = item.find('=');
+        if (pos != std::string::npos)
         {
-            result.emplace_back(PartitionValue(boost::algorithm::to_lower_copy(item.substr(0, position)), item.substr(position + 1)));
+            auto key = boost::to_lower_copy(item.substr(0, pos));
+            auto value = item.substr(pos + 1);
+
+            std::string unescaped_key;
+            std::string unescaped_value;
+            Poco::URI::decode(key, unescaped_key);
+            Poco::URI::decode(value, unescaped_value);
+            result.emplace_back(std::move(unescaped_key), std::move(unescaped_value));
         }
     }
     return result;
 }
+
 bool GlutenStringUtils::isNullPartitionValue(const std::string & value)
 {
     return value == "__HIVE_DEFAULT_PARTITION__";
 }
+
+std::string GlutenStringUtils::dumpPartitionValue(const PartitionValue & value)
+{
+    return value.first + "=" + value.second;
+}
+
+std::string GlutenStringUtils::dumpPartitionValues(const PartitionValues & values)
+{
+    std::string res;
+    res += "[";
+
+    for (size_t i = 0; i < values.size(); ++i)
+    {
+        if (i)
+            res += ", ";
+        res += dumpPartitionValue(values[i]);
+    }
+
+    res += "]";
+    return res;
+}
+
 }
diff --git a/cpp-ch/local-engine/Common/GlutenStringUtils.h b/cpp-ch/local-engine/Common/GlutenStringUtils.h
index 023cb2b8d..dd0441353 100644
--- a/cpp-ch/local-engine/Common/GlutenStringUtils.h
+++ b/cpp-ch/local-engine/Common/GlutenStringUtils.h
@@ -28,5 +28,8 @@ class GlutenStringUtils
 public:
     static PartitionValues parsePartitionTablePath(const std::string & file);
     static bool isNullPartitionValue(const std::string & value);
+
+    static std::string dumpPartitionValue(const PartitionValue & value);
+    static std::string dumpPartitionValues(const PartitionValues & values);
 };
 }
diff --git a/cpp-ch/local-engine/Storages/SubstraitSource/FormatFile.cpp b/cpp-ch/local-engine/Storages/SubstraitSource/FormatFile.cpp
index e449ede98..4499a9a55 100644
--- a/cpp-ch/local-engine/Storages/SubstraitSource/FormatFile.cpp
+++ b/cpp-ch/local-engine/Storages/SubstraitSource/FormatFile.cpp
@@ -55,17 +55,13 @@ FormatFile::FormatFile(
     : context(context_), file_info(file_info_), read_buffer_builder(read_buffer_builder_)
 {
     PartitionValues part_vals = GlutenStringUtils::parsePartitionTablePath(file_info.uri_file());
-    String partition_values_str = "[";
     for (size_t i = 0; i < part_vals.size(); ++i)
     {
         const auto & part = part_vals[i];
         partition_keys.push_back(part.first);
         partition_values[part.first] = part.second;
-        if (i > 0)
-            partition_values_str += ", ";
-        partition_values_str += part.first + "=" + part.second;
     }
-    partition_values_str += "]";
+
     LOG_INFO(
         &Poco::Logger::get("FormatFile"),
         "Reading File path: {}, format: {}, range: {}, partition_index: {}, partition_values: {}",
@@ -73,7 +69,7 @@ FormatFile::FormatFile(
         file_info.file_format_case(),
         std::to_string(file_info.start()) + "-" + std::to_string(file_info.start() + file_info.length()),
         file_info.partition_index(),
-        partition_values_str);
+        GlutenStringUtils::dumpPartitionValues(part_vals));
 }
 
 FormatFilePtr FormatFileUtil::createFile(


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to