KevinyhZou commented on code in PR #7268:
URL: https://github.com/apache/incubator-gluten/pull/7268#discussion_r1792671696


##########
backends-clickhouse/src/test/scala/org/apache/gluten/execution/hive/GlutenClickHouseHiveTableSuite.scala:
##########
@@ -1416,4 +1416,72 @@ class GlutenClickHouseHiveTableSuite
     spark.sql("DROP TABLE test_tbl_7054")
   }
 
+  test("test hive table scan nested column pruning") {
+    val json_table_name = "test_tbl_7267_json"
+    val pq_table_name = "test_tbl_7267_pq"
+    val create_table_sql =
+      s"""
+         | create table if not exists %s(
+         | id bigint,
+         | d1 STRUCT<c: STRING, d: ARRAY<STRUCT<x: STRING, y: STRING>>>,
+         | d2 STRUCT<c: STRING, d: Map<STRING, STRUCT<x: STRING, y: STRING>>>,
+         | day string,
+         | hour string
+         | ) partitioned by(day, hour)
+         |""".stripMargin
+    val create_table_1 = create_table_sql.format(json_table_name) +
+      s"""
+         | ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
+         | STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
+         | OUTPUTFORMAT 
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+         |""".stripMargin
+    val create_table_2 = create_table_sql.format(pq_table_name) + " STORED AS 
PARQUET"
+    val insert_sql =
+      """
+        | insert into %s values(1,
+        | named_struct('c', 'c123', 'd', array(named_struct('x', 'x123', 'y', 
'y123'))),
+        | named_struct('c', 'c124', 'd', map('m124', named_struct('x', 'x124', 
'y', 'y124'))),
+        | '2024-09-26', '12'
+        | )
+        |""".stripMargin
+    val insert_sql_1 = insert_sql.format(json_table_name)
+    val insert_sql_2 = insert_sql.format(pq_table_name)
+    spark.sql(create_table_1)
+    spark.sql(create_table_2)
+    spark.sql(insert_sql_1)
+    spark.sql(insert_sql_2)
+    val select_sql_1 =
+      "select id, d1.c, d1.d[0].x, d2.d['m124'].y from %s where day = 
'2024-09-26' and hour = '12'"
+        .format(json_table_name)
+    val select_sql_2 =
+      "select id, d1.c, d1.d[0].x, d2.d['m124'].y from %s where day = 
'2024-09-26' and hour = '12'"
+        .format(pq_table_name)
+    withSQLConf(
+      ("spark.sql.hive.convertMetastoreParquet" -> "false"),

Review Comment:
When the table needs to be read through the Hive parquet/orc SerDe rather than Spark's built-in parquet/orc reader, these two configurations must be set to false. @taiyang-li 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to