RussellSpitzer commented on code in PR #76:
URL: https://github.com/apache/parquet-testing/pull/76#discussion_r2072146394


##########
variant/regen.py:
##########
@@ -0,0 +1,173 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This program uses Apache Spark to generate example binary Variant data
+#
+# Requirements
+# pip install pyarrow
+# pip install pyspark
+#
+# Last run with Spark 4.0 preview 2:
+# https://spark.apache.org/news/spark-4.0.0-preview2.html
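+#
+# With a pip-installed pyspark, the script can be run directly, e.g.:
+#   python regen.py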
+
+from pyspark.sql import SparkSession
+import pyarrow.parquet as pq
+import os
+import json
+
+# Initialize Spark session and create variant data via SQL
+spark = SparkSession.builder \
+    .appName("PySpark SQL Example") \
+    .getOrCreate()
+
+# recursively cleanup the spark-warehouse directory
+if os.path.exists('spark-warehouse'):
+    for root, dirs, files in os.walk('spark-warehouse', topdown=False):
+        for name in files:
+            os.remove(os.path.join(root, name))
+        for name in dirs:
+            os.rmdir(os.path.join(root, name))
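+# A similar cleanup could use shutil.rmtree('spark-warehouse', ignore_errors=True),
+# which also removes the top-level directory itself.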
+
+
+# Create a table with variant and insert various types into it
+#
+# This writes data files into spark-warehouse/output
+sql = """
+CREATE TABLE T (name VARCHAR(2000), variant_col VARIANT);
+
+-------------------------------
+-- Primitive type (basic_type=0)
+-------------------------------
+-- One row with a value from each type listed in 
+-- https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types
+--
+-- Spark Types: https://spark.apache.org/docs/latest/sql-ref-datatypes.html
+-- Note: must use explicit typecasts as Spark returns an error for implicit casts
+INSERT INTO T VALUES ('primitive_null', NULL);
+INSERT INTO T VALUES ('primitive_boolean_true', true::Variant);
+INSERT INTO T VALUES ('primitive_boolean_false', false::Variant);
+INSERT INTO T VALUES ('primitive_int8', 42::Byte::Variant);
+INSERT INTO T VALUES ('primitive_int16', 1234::Short::Variant);
+INSERT INTO T VALUES ('primitive_int32', 123456::Integer::Variant);
+INSERT INTO T VALUES ('primitive_int64', 12345678::Long::Variant);
+INSERT INTO T VALUES ('primitive_double', 1234567890.1234::Double::Variant);
+INSERT INTO T VALUES ('primitive_decimal4', 12.34::Decimal(8,2)::Variant);
+INSERT INTO T VALUES ('primitive_decimal8', 12345678.90::Decimal(12,2)::Variant);
+INSERT INTO T VALUES ('primitive_decimal16', 12345678912345678.90::Decimal(30,2)::Variant);
+INSERT INTO T VALUES ('primitive_date', '2025-04-16'::Date::Variant);
+INSERT INTO T VALUES ('primitive_timestamp', '2025-04-16T12:34:56.78'::Timestamp::Variant);
+INSERT INTO T VALUES ('primitive_timestampntz', '2025-04-16T12:34:56.78'::Timestamp_NTZ::Variant);
+INSERT INTO T VALUES ('primitive_float', 1234567890.1234::Float::Variant);
+INSERT INTO T VALUES ('primitive_binary', X'31337deadbeefcafe'::Variant);
+INSERT INTO T VALUES ('primitive_string', 'This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥️, 🎣 and 🤦!!'::Variant);
+
+-- https://github.com/apache/parquet-testing/issues/79
+-- It is not clear how to create the following types using Spark SQL
+-- TODO TimeNTZ                    (Type ID 17)
+-- TODO 'timestamp with timezone'  (Type ID 18)

Review Comment:
   (nanos)


