Github user fmcquillan99 commented on a diff in the pull request:
https://github.com/apache/madlib/pull/239#discussion_r172949439
--- Diff: src/ports/postgres/modules/sample/balance_sample.sql_in ---
@@ -543,6 +544,90 @@ SELECT * FROM output_table ORDER BY mainhue, name;
(25 rows)
</pre>
+-# To perform the balance sampling for independent groups, use the
'grouping_cols'
+parameter. Note below that each group (zone) has a different count of the
classes (mainhue),
+with some groups not containing some class values.
+<pre class="syntax">
+DROP TABLE IF EXISTS output_table;
+SELECT madlib.balance_sample(
+ 'flags', -- Source table
+ 'output_table', -- Output table
+ 'mainhue', -- Class column
+ NULL, -- Uniform
+ NULL, -- Output table size
+ 'zone' -- No grouping
+);
+SELECT * FROM output_table ORDER BY zone, mainhue;
+</pre>
+<pre class="result">
+ __madlib_id__ | id | name | landmass | zone | area | population |
language | colours | mainhue
+---------------+----+-------------+----------+------+------+------------+----------+---------+---------
+ 6 | 8 | Greece | 3 | 1 | 132 | 10 |
6 | 2 | blue
+ 5 | 8 | Greece | 3 | 1 | 132 | 10 |
6 | 2 | blue
+ 8 | 17 | Sweden | 3 | 1 | 450 | 8 |
6 | 2 | blue
+ 7 | 8 | Greece | 3 | 1 | 132 | 10 |
6 | 2 | blue
+ 2 | 7 | Denmark | 3 | 1 | 43 | 5 |
6 | 2 | red
+ 1 | 6 | China | 5 | 1 | 9561 | 1008 |
7 | 2 | red
+ 4 | 12 | Luxembourg | 3 | 1 | 3 | 0 |
4 | 3 | red
+ 3 | 18 | Switzerland | 3 | 1 | 41 | 6 |
4 | 2 | red
+ 1 | 2 | Australia | 6 | 2 | 7690 | 15 |
1 | 3 | blue
+ 1 | 1 | Argentina | 2 | 3 | 2777 | 28 |
2 | 2 | blue
+ 2 | 4 | Brazil | 2 | 3 | 8512 | 119 |
6 | 4 | green
+ 6 | 9 | Guatemala | 1 | 4 | 109 | 8 |
2 | 2 | blue
+ 5 | 9 | Guatemala | 1 | 4 | 109 | 8 |
2 | 2 | blue
+ 4 | 9 | Guatemala | 1 | 4 | 109 | 8 |
2 | 2 | blue
+ 12 | 13 | Mexico | 1 | 4 | 1973 | 77 |
2 | 4 | green
+ 10 | 13 | Mexico | 1 | 4 | 1973 | 77 |
2 | 4 | green
+ 11 | 13 | Mexico | 1 | 4 | 1973 | 77 |
2 | 4 | green
+ 1 | 19 | UK | 3 | 4 | 245 | 56 |
1 | 3 | red
+ 3 | 5 | Canada | 1 | 4 | 9976 | 24 |
1 | 2 | red
+ 2 | 15 | Portugal | 3 | 4 | 92 | 10 |
6 | 5 | red
+ 8 | 20 | USA | 1 | 4 | 9363 | 231 |
1 | 3 | white
+ 7 | 20 | USA | 1 | 4 | 9363 | 231 |
1 | 3 | white
+ 9 | 10 | Ireland | 3 | 4 | 70 | 3 |
1 | 3 | white
+(23 rows)
+</pre>
+
+-# Grouping can be used with class size specification as well. Note below
that
+'blue=<Integer>' is the only valid class value since 'blue' is the only
class
+value that is present in each group.
+<pre class="syntax">
+DROP TABLE IF EXISTS output_table;
+SELECT madlib.balance_sample(
+ 'flags', -- Source table
+ 'output_table', -- Output table
+ 'mainhue', -- Class column
+ 'blue=2', -- Specified class value size. Rest of the values
are sampled as is.
+ NULL, -- Output table size
+ 'zone' -- No grouping
+);
+SELECT * FROM output_table ORDER BY zone, mainhue;
+</pre>
+<pre class="result">
+ __madlib_id__ | id | name | landmass | zone | area | population |
language | colours | mainhue
+---------------+----+-------------+----------+------+------+------------+----------+---------+---------
+ 1 | 17 | Sweden | 3 | 1 | 450 | 8 |
6 | 2 | blue
+ 7 | 18 | Switzerland | 3 | 1 | 41 | 6 |
4 | 2 | red
+ 6 | 14 | Norway | 3 | 1 | 324 | 4 |
6 | 3 | red
+ 5 | 12 | Luxembourg | 3 | 1 | 3 | 0 |
4 | 3 | red
+ 4 | 7 | Denmark | 3 | 1 | 43 | 5 |
6 | 2 | red
+ 3 | 6 | China | 5 | 1 | 9561 | 1008 |
7 | 2 | red
+ 2 | 3 | Austria | 3 | 1 | 84 | 8 |
4 | 2 | red
+ 1 | 2 | Australia | 6 | 2 | 7690 | 15 |
1 | 3 | blue
+ 1 | 1 | Argentina | 2 | 3 | 2777 | 28 |
2 | 2 | blue
+ 2 | 4 | Brazil | 2 | 3 | 8512 | 119 |
6 | 4 | green
+ 1 | 9 | Guatemala | 1 | 4 | 109 | 8 |
2 | 2 | blue
+ 4 | 11 | Jamaica | 1 | 4 | 11 | 2 |
1 | 3 | green
+ 5 | 13 | Mexico | 1 | 4 | 1973 | 77 |
2 | 4 | green
+ 8 | 19 | UK | 3 | 4 | 245 | 56 |
1 | 3 | red
+ 2 | 5 | Canada | 1 | 4 | 9976 | 24 |
1 | 2 | red
+ 7 | 16 | Spain | 3 | 4 | 505 | 38 |
2 | 2 | red
+ 6 | 15 | Portugal | 3 | 4 | 92 | 10 |
6 | 5 | red
+ 9 | 20 | USA | 1 | 4 | 9363 | 231 |
1 | 3 | white
+ 3 | 10 | Ireland | 3 | 4 | 70 | 3 |
1 | 3 | white
+(19 rows)
--- End diff --
I forgot that blue=2 means for the whole table, so my misunderstanding, but
perhaps others may be confused too.
So yes, please put blue=8 in the example and remind the user that this
applies to the whole table so that each group will get 2 blues each.
---