alamb commented on code in PR #8750: URL: https://github.com/apache/arrow-datafusion/pull/8750#discussion_r1442138693
########## datafusion/sqllogictest/test_files/dictionary.slt: ########## @@ -0,0 +1,203 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Tests for querying on dictionary encoded data + +# Note: These tables model data as is common for timeseries, such as in InfluxDB IOx +# There are three types of columns: +# 1. tag columns, which are string dictionaries, often with low cardinality +# 2. field columns, which are typed, +# 3. 
a `time` column, which is a nanosecond timestamp + +# It is common to group and filter on the "tag" columns (and thus on dictionary +# encoded values) + +# Table m1 with a tag column `tag_id`, 4 fields `f1` - `f4`, and `time` + +statement ok +CREATE VIEW m1 AS +SELECT + arrow_cast(column1, 'Dictionary(Int32, Utf8)') as tag_id, + arrow_cast(column2, 'Float64') as f1, + arrow_cast(column3, 'Utf8') as f2, + arrow_cast(column4, 'Utf8') as f3, + arrow_cast(column5, 'Float64') as f4, + arrow_cast(column6, 'Timestamp(Nanosecond, None)') as time +FROM ( + VALUES + -- equivalent to the following line protocol data + -- m1,tag_id=1000 f1=32,f2="foo",f3="True",f4=1.0 1703030400000000000 + -- m1,tag_id=1000 f1=32,f2="foo",f3="True",f4=2.0 1703031000000000000 + -- m1,tag_id=1000 f1=32,f2="foo",f3="True",f4=3.0 1703031600000000000 + -- m1,tag_id=1000 f1=32,f2="foo",f3="True",f4=4.0 1703032200000000000 + -- m1,tag_id=1000 f1=32,f2="foo",f3="True",f4=5.0 1703032800000000000 + -- m1,tag_id=1000 f1=32,f2="foo",f3="True",f4=6.0 1703033400000000000 + -- m1,tag_id=1000 f1=32,f2="foo",f3="True",f4=7.0 1703034000000000000 + -- m1,tag_id=1000 f1=32,f2="foo",f3="True",f4=8.0 1703034600000000000 + -- m1,tag_id=1000 f1=32,f2="foo",f3="True",f4=9.0 1703035200000000000 + -- m1,tag_id=1000 f1=32,f2="foo",f3="True",f4=10.0 1703035800000000000 + ('1000', 32, 'foo', 'True', 1.0, 1703030400000000000), + ('1000', 32, 'foo', 'True', 2.0, 1703031000000000000), + ('1000', 32, 'foo', 'True', 3.0, 1703031600000000000), + ('1000', 32, 'foo', 'True', 4.0, 1703032200000000000), + ('1000', 32, 'foo', 'True', 5.0, 1703032800000000000), + ('1000', 32, 'foo', 'True', 6.0, 1703033400000000000), + ('1000', 32, 'foo', 'True', 7.0, 1703034000000000000), + ('1000', 32, 'foo', 'True', 8.0, 1703034600000000000), + ('1000', 32, 'foo', 'True', 9.0, 1703035200000000000), + ('1000', 32, 'foo', 'True', 10.0, 1703035800000000000) +); + +query ?RTTRP +SELECT * FROM m1; +---- +1000 32 foo True 1 2023-12-20T00:00:00 +1000 32 
foo True 2 2023-12-20T00:10:00 +1000 32 foo True 3 2023-12-20T00:20:00 +1000 32 foo True 4 2023-12-20T00:30:00 +1000 32 foo True 5 2023-12-20T00:40:00 +1000 32 foo True 6 2023-12-20T00:50:00 +1000 32 foo True 7 2023-12-20T01:00:00 +1000 32 foo True 8 2023-12-20T01:10:00 +1000 32 foo True 9 2023-12-20T01:20:00 +1000 32 foo True 10 2023-12-20T01:30:00 + +# Note that the type of the tag column is `Dictionary(Int32, Utf8)` +query TTT +DESCRIBE m1; +---- +tag_id Dictionary(Int32, Utf8) YES +f1 Float64 YES +f2 Utf8 YES +f3 Utf8 YES +f4 Float64 YES +time Timestamp(Nanosecond, None) YES + + +# Table m2 with tag columns `tag_id` and `type`, a field column `f5`, and `time` +statement ok +CREATE VIEW m2 AS +SELECT + arrow_cast(column1, 'Dictionary(Int32, Utf8)') as type, + arrow_cast(column2, 'Dictionary(Int32, Utf8)') as tag_id, + arrow_cast(column3, 'Float64') as f5, + arrow_cast(column4, 'Timestamp(Nanosecond, None)') as time +FROM ( + VALUES + -- equivalent to the following line protocol data + -- m2,type=active,tag_id=1000 f5=100 1701648000000000000 + -- m2,type=active,tag_id=1000 f5=200 1701648600000000000 + -- m2,type=active,tag_id=1000 f5=300 1701649200000000000 + -- m2,type=active,tag_id=1000 f5=400 1701649800000000000 + -- m2,type=active,tag_id=1000 f5=500 1701650400000000000 + -- m2,type=active,tag_id=1000 f5=600 1701651000000000000 + -- m2,type=passive,tag_id=2000 f5=700 1701651600000000000 + -- m2,type=passive,tag_id=1000 f5=800 1701652200000000000 + -- m2,type=passive,tag_id=1000 f5=900 1701652800000000000 + -- m2,type=passive,tag_id=1000 f5=1000 1701653400000000000 + ('active', '1000', 100, 1701648000000000000), + ('active', '1000', 200, 1701648600000000000), + ('active', '1000', 300, 1701649200000000000), + ('active', '1000', 400, 1701649800000000000), + ('active', '1000', 500, 1701650400000000000), + ('active', '1000', 600, 1701651000000000000), + ('passive', '1000', 700, 1701651600000000000), + ('passive', '1000', 800, 1701652200000000000), + ('passive', 
'1000', 900, 1701652800000000000), + ('passive', '1000', 1000, 1701653400000000000) +); + +query ??RP +SELECT * FROM m2; +---- +active 1000 100 2023-12-04T00:00:00 +active 1000 200 2023-12-04T00:10:00 +active 1000 300 2023-12-04T00:20:00 +active 1000 400 2023-12-04T00:30:00 +active 1000 500 2023-12-04T00:40:00 +active 1000 600 2023-12-04T00:50:00 +passive 1000 700 2023-12-04T01:00:00 +passive 1000 800 2023-12-04T01:10:00 +passive 1000 900 2023-12-04T01:20:00 +passive 1000 1000 2023-12-04T01:30:00 + +query TTT +DESCRIBE m2; +---- +type Dictionary(Int32, Utf8) YES +tag_id Dictionary(Int32, Utf8) YES +f5 Float64 YES +time Timestamp(Nanosecond, None) YES + +query I +select count(*) from m1 where tag_id = '1000' and time < '2024-01-03T14:46:35+01:00'; +---- +10 + +query RRR +select min(f5), max(f5), avg(f5) from m2 where tag_id = '1000' and time < '2024-01-03T14:46:35+01:00' group by type; +---- +100 600 350 +700 1000 850 + +query IRRRP +select count(*), min(f5), max(f5), avg(f5), date_bin('30 minutes', time) as "time" +from m2 where tag_id = '1000' and time < '2024-01-03T14:46:35+01:00' +group by date_bin('30 minutes', time) +order by date_bin('30 minutes', time) DESC +---- +1 1000 1000 1000 2023-12-04T01:30:00 +3 700 900 800 2023-12-04T01:00:00 +3 400 600 500 2023-12-04T00:30:00 +3 100 300 200 2023-12-04T00:00:00 + + + +# Reproducer for https://github.com/apache/arrow-datafusion/issues/8738 +# This query should work correctly +query error DataFusion error: External error: Arrow error: Invalid argument error: RowConverter column schema mismatch, expected Utf8 got Dictionary\(Int32, Utf8\) +SELECT Review Comment: I agree it is not necessary, however it is what we tell our customers to do (so they don't have to deal with the upper/lower case distinction) and thus it is very common in our production systems -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: [email protected]. For queries about this service, please contact Infrastructure at: [email protected].
