This is an automated email from the ASF dual-hosted git repository.
ianmcook pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-experiments.git
The following commit(s) were added to refs/heads/main by this push:
new 2b619a2 Add `rand-many-types` data in Parquet and DuckDB formats (#47)
2b619a2 is described below
commit 2b619a2868eb6a6dafd1764f082170180cf49f0e
Author: Ian Cook <[email protected]>
AuthorDate: Thu May 22 22:50:28 2025 -0400
Add `rand-many-types` data in Parquet and DuckDB formats (#47)
* Add Parquet and DuckDB versions
* Add missing newline
* Add .exit
* Add in a Makefile
---------
Co-authored-by: Bryce Mecum <[email protected]>
---
.gitattributes | 2 ++
.gitattributes => data/rand-many-types/Makefile | 14 ++++++++++----
data/rand-many-types/README.md | 7 +++++++
.../rand-many-types/arrows-to-parquet.py | 13 +++++++++----
data/rand-many-types/parquet-to-duckdb.sql | 21 +++++++++++++++++++++
data/rand-many-types/random.duckdb | 3 +++
data/rand-many-types/random.parquet | 3 +++
data/rand-many-types/requirements.txt | 2 ++
8 files changed, 57 insertions(+), 8 deletions(-)
diff --git a/.gitattributes b/.gitattributes
index e732a16..3138b37 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -19,3 +19,5 @@ data/**/*.arrows filter=lfs diff=lfs merge=lfs -text
data/**/*.arrow filter=lfs diff=lfs merge=lfs -text
data/**/*.jsonl filter=lfs diff=lfs merge=lfs -text
data/**/*.parquet filter=lfs diff=lfs merge=lfs -text
+data/**/*.db filter=lfs diff=lfs merge=lfs -text
+data/**/*.duckdb filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitattributes b/data/rand-many-types/Makefile
similarity index 78%
copy from .gitattributes
copy to data/rand-many-types/Makefile
index e732a16..ce42068 100644
--- a/.gitattributes
+++ b/data/rand-many-types/Makefile
@@ -15,7 +15,13 @@
# specific language governing permissions and limitations
# under the License.
-data/**/*.arrows filter=lfs diff=lfs merge=lfs -text
-data/**/*.arrow filter=lfs diff=lfs merge=lfs -text
-data/**/*.jsonl filter=lfs diff=lfs merge=lfs -text
-data/**/*.parquet filter=lfs diff=lfs merge=lfs -text
+all: arrow parquet duckdb
+
+arrow: generate.py
+ python ./generate.py
+
+parquet: arrow
+ python ./arrows-to-parquet.py
+
+duckdb: parquet
+ duckdb -f ./parquet-to-duckdb.sql
diff --git a/data/rand-many-types/README.md b/data/rand-many-types/README.md
index 83c7872..f14b1f8 100644
--- a/data/rand-many-types/README.md
+++ b/data/rand-many-types/README.md
@@ -20,3 +20,10 @@
# rand-many-types
This directory contains a file `random.arrows` in Arrow IPC stream format with
randomly generated values in 20+ columns exercising many different Arrow data
types. The Python script `generate.py` that generated the data file is included.
+
+The same data is also included as a Parquet file (`random.parquet`) and as a
DuckDB database file (`random.duckdb`), where it is stored in a table named
`random`. The Python and SQL used to generate these files are included.
+
+To re-generate the data files (for example, if you change `generate.py`):
+
+1. Make sure the `duckdb` CLI is on your `PATH` and activate a Python
environment with the packages in `./requirements.txt`
+2. Run `make`
diff --git a/.gitattributes b/data/rand-many-types/arrows-to-parquet.py
similarity index 78%
copy from .gitattributes
copy to data/rand-many-types/arrows-to-parquet.py
index e732a16..0ef84d8 100644
--- a/.gitattributes
+++ b/data/rand-many-types/arrows-to-parquet.py
@@ -15,7 +15,12 @@
# specific language governing permissions and limitations
# under the License.
-data/**/*.arrows filter=lfs diff=lfs merge=lfs -text
-data/**/*.arrow filter=lfs diff=lfs merge=lfs -text
-data/**/*.jsonl filter=lfs diff=lfs merge=lfs -text
-data/**/*.parquet filter=lfs diff=lfs merge=lfs -text
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+
+with open("random.arrows", "rb") as f:
+ reader = pa.ipc.open_stream(f)
+ table = reader.read_all()
+
+pq.write_table(table, "random.parquet")
diff --git a/data/rand-many-types/parquet-to-duckdb.sql
b/data/rand-many-types/parquet-to-duckdb.sql
new file mode 100644
index 0000000..3a3b007
--- /dev/null
+++ b/data/rand-many-types/parquet-to-duckdb.sql
@@ -0,0 +1,21 @@
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements. See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership. The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License. You may obtain a copy of the License at
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing,
+-- software distributed under the License is distributed on an
+-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+-- KIND, either express or implied. See the License for the
+-- specific language governing permissions and limitations
+-- under the License.
+
+.open random.duckdb
+DROP TABLE IF EXISTS random;
+CREATE TABLE random AS SELECT * FROM './random.parquet';
+.exit
diff --git a/data/rand-many-types/random.duckdb
b/data/rand-many-types/random.duckdb
new file mode 100644
index 0000000..e3037eb
--- /dev/null
+++ b/data/rand-many-types/random.duckdb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1f00607e3755773432dc81ba4bfe8bcdae8457f3430587a12d632261c4451fe
+size 17575936
diff --git a/data/rand-many-types/random.parquet
b/data/rand-many-types/random.parquet
new file mode 100644
index 0000000..e195979
--- /dev/null
+++ b/data/rand-many-types/random.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1518178747ce49b87f13feca3b125f3dcc0c686d198a205c6c8d38c5c4db1158
+size 11109117
diff --git a/data/rand-many-types/requirements.txt
b/data/rand-many-types/requirements.txt
new file mode 100644
index 0000000..32df5a7
--- /dev/null
+++ b/data/rand-many-types/requirements.txt
@@ -0,0 +1,2 @@
+numpy
+pyarrow