alamb commented on code in PR #10976:
URL: https://github.com/apache/datafusion/pull/10976#discussion_r1644295073


##########
datafusion/physical-plan/src/aggregates/group_values/row.rs:
##########
@@ -86,15 +102,42 @@ impl GroupValuesRows {
             group_values: None,
             hashes_buffer: Default::default(),
             random_state: Default::default(),
+            var_map: ArrowBytesMap::new(OutputType::Utf8),
+            num_groups: 0,
         })
     }
 }
 
 impl GroupValues for GroupValuesRows {
     fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec<usize>) -> 
Result<()> {
+        let cols_var = &cols[1];
+        groups.clear();
+        self.var_map.insert_if_new(
+            cols_var,
+            // called for each new group
+            |_value| {
+                // assign new group index on each insert
+                let group_idx = self.num_groups;
+                self.num_groups += 1;
+                group_idx
+            },
+            // called for each group
+            |group_idx| {
+                groups.push(group_idx);
+            },
+        );
+
+
+        let u64_vec: Vec<u64> = groups.iter().map(|&x| x as u64).collect();

Review Comment:
   It might be faster to write into `u64_vec` directly rather than write to 
groups and then copy over



##########
benchmarks/queries/clickbench/queries.sql:
##########
@@ -1,43 +1,3 @@
-SELECT COUNT(*) FROM hits;
-SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0;
-SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits;
-SELECT AVG("UserID") FROM hits;
-SELECT COUNT(DISTINCT "UserID") FROM hits;
-SELECT COUNT(DISTINCT "SearchPhrase") FROM hits;
-SELECT MIN("EventDate"::INT::DATE), MAX("EventDate"::INT::DATE) FROM hits;
-SELECT "AdvEngineID", COUNT(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY 
"AdvEngineID" ORDER BY COUNT(*) DESC;
-SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" 
ORDER BY u DESC LIMIT 10;
-SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), 
COUNT(DISTINCT "UserID") FROM hits GROUP BY "RegionID" ORDER BY c DESC LIMIT 10;
-SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE 
"MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
-SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM 
hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" 
ORDER BY u DESC LIMIT 10;
-SELECT "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' 
GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
-SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE 
"SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10;
-SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE 
"SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC 
LIMIT 10;
-SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC 
LIMIT 10;
 SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", 
"SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
 SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", 
"SearchPhrase" LIMIT 10;
-SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, 
"SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER 
BY COUNT(*) DESC LIMIT 10;
-SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449;
-SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%';
-SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE 
'%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC 
LIMIT 10;
-SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT 
"UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE 
'%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC 
LIMIT 10;
-SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY 
to_timestamp_seconds("EventTime") LIMIT 10;
-SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY 
to_timestamp_seconds("EventTime") LIMIT 10;
-SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY 
"SearchPhrase" LIMIT 10;
-SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY 
to_timestamp_seconds("EventTime"), "SearchPhrase" LIMIT 10;
-SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE 
"URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 
25;
-SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS 
k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE 
"Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
-SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), 
SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 
4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), 
SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 
9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), 
SUM("ResolutionWidth" + 12), SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" 
+ 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), 
SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" 
+ 19), SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), 
SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" 
+ 24), SUM("ResolutionWidth" + 25), SUM("ResolutionWidth" + 26), 
SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" 
+ 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), 
SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWid
 th" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), 
SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" 
+ 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), 
SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" 
+ 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), 
SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), SUM("ResolutionWidth" 
+ 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), 
SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" 
+ 54), SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), 
SUM("ResolutionWidth" + 57), SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" 
+ 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), 
SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" 
+ 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), 
SUM("ResolutionWidth" + 67), SUM("ResolutionWidth" + 68), 
 SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), 
SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" 
+ 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), 
SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" 
+ 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), 
SUM("ResolutionWidth" + 81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" 
+ 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), 
SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" 
+ 88), SUM("ResolutionWidth" + 89) FROM hits;
-SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), 
AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY 
"SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10;
-SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), 
AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", 
"ClientIP" ORDER BY c DESC LIMIT 10;
-SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), 
AVG("ResolutionWidth") FROM hits GROUP BY "WatchID", "ClientIP" ORDER BY c DESC 
LIMIT 10;
-SELECT "URL", COUNT(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10;
-SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC 
LIMIT 10;
-SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS 
c FROM hits GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 
ORDER BY c DESC LIMIT 10;
-SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND 
"EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= 
'2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL" <> '' GROUP 
BY "URL" ORDER BY PageViews DESC LIMIT 10;
-SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND 
"EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= 
'2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' 
GROUP BY "Title" ORDER BY PageViews DESC LIMIT 10;
-SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND 
"EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= 
'2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP 
BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
-SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN 
("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, 
"URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND 
"EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= 
'2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", 
"AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
-SELECT "URLHash", "EventDate"::INT::DATE, COUNT(*) AS PageViews FROM hits 
WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND 
"EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" 
IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", 
"EventDate"::INT::DATE ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
-SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM 
hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND 
"EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" 
= 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", 
"WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
-SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) 
AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= 
'2013-07-14' AND "EventDate"::INT::DATE <= '2013-07-15' AND "IsRefresh" = 0 AND 
"DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', 
to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 
OFFSET 1000;
+SELECT "UserID", concat("SearchPhrase", repeat('hello', 100)) as s, COUNT(*) 
FROM hits GROUP BY "UserID", s LIMIT 10;

Review Comment:
   This is a clever way to quickly iterate for benchmark tests 👍 



##########
datafusion/physical-plan/src/aggregates/group_values/row.rs:
##########
@@ -170,15 +213,40 @@ impl GroupValues for GroupValuesRows {
             .take()
             .expect("Can not emit from empty rows");
 
+        let map_contents = self.var_map.take().into_state();
+        let map_contents = map_contents.as_string::<i32>();
+
         let mut output = match emit_to {
             EmitTo::All => {
-                let output = self.row_converter.convert_rows(&group_values)?;
+                let mut output = 
self.row_converter.convert_rows(&group_values)?;
+
+
+                // Index Array: [0, 1, 1, 0]
+                // Data Array: ['a', 'c']

Review Comment:
   This code copies all the strings again, which likely takes significant time
   
   One way to make this faster is likey to  special case the "has no nulls" 
path (so we can avoid `if let Some(.)` check each row
   
   The only other ways I can think to make this faster is to avoid copying the 
strings all together. The only way to do so I can figure are:
   1. Return a DictionaryArray as output 
   4. Return a `StringViewArray` (similar to DictionayArray) 
https://github.com/apache/datafusion/issues/10918
   
   🤔 
   
   Maybe we can try to hack in the ability to have the HashAggregateExec return 
`Dictionary(Int32, String)` for these multi-column groups and see if we can 
show significant performance improvements
   
   If so then we can figure out how to thread that ability through the engine.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to