Github user iyerr3 commented on a diff in the pull request: https://github.com/apache/madlib/pull/239#discussion_r172941914 --- Diff: src/ports/postgres/modules/sample/balance_sample.sql_in --- @@ -543,6 +544,90 @@ SELECT * FROM output_table ORDER BY mainhue, name; (25 rows) </pre> +-# To perform the balance sampling for independent groups, use the 'grouping_cols' +parameter. Note below that each group (zone) has a different count of the classes (mainhue), +with some groups not containing some class values. +<pre class="syntax"> +DROP TABLE IF EXISTS output_table; +SELECT madlib.balance_sample( + 'flags', -- Source table + 'output_table', -- Output table + 'mainhue', -- Class column + NULL, -- Uniform + NULL, -- Output table size + 'zone' -- No grouping +); +SELECT * FROM output_table ORDER BY zone, mainhue; +</pre> +<pre class="result"> + __madlib_id__ | id | name | landmass | zone | area | population | language | colours | mainhue +---------------+----+-------------+----------+------+------+------------+----------+---------+--------- + 6 | 8 | Greece | 3 | 1 | 132 | 10 | 6 | 2 | blue + 5 | 8 | Greece | 3 | 1 | 132 | 10 | 6 | 2 | blue + 8 | 17 | Sweden | 3 | 1 | 450 | 8 | 6 | 2 | blue + 7 | 8 | Greece | 3 | 1 | 132 | 10 | 6 | 2 | blue + 2 | 7 | Denmark | 3 | 1 | 43 | 5 | 6 | 2 | red + 1 | 6 | China | 5 | 1 | 9561 | 1008 | 7 | 2 | red + 4 | 12 | Luxembourg | 3 | 1 | 3 | 0 | 4 | 3 | red + 3 | 18 | Switzerland | 3 | 1 | 41 | 6 | 4 | 2 | red + 1 | 2 | Australia | 6 | 2 | 7690 | 15 | 1 | 3 | blue + 1 | 1 | Argentina | 2 | 3 | 2777 | 28 | 2 | 2 | blue + 2 | 4 | Brazil | 2 | 3 | 8512 | 119 | 6 | 4 | green + 6 | 9 | Guatemala | 1 | 4 | 109 | 8 | 2 | 2 | blue + 5 | 9 | Guatemala | 1 | 4 | 109 | 8 | 2 | 2 | blue + 4 | 9 | Guatemala | 1 | 4 | 109 | 8 | 2 | 2 | blue + 12 | 13 | Mexico | 1 | 4 | 1973 | 77 | 2 | 4 | green + 10 | 13 | Mexico | 1 | 4 | 1973 | 77 | 2 | 4 | green + 11 | 13 | Mexico | 1 | 4 | 1973 | 77 | 2 | 4 | green + 1 | 19 | UK | 3 | 4 | 245 | 56 | 1 | 3 | red + 3 | 5 | Canada | 1 | 4 | 9976 | 24 | 1 | 2 | red + 2 | 15 | Portugal | 3 | 4 | 92 | 10 | 6 | 5 | red + 8 | 20 | USA | 1 | 4 | 9363 | 231 | 1 | 3 | white + 7 | 20 | USA | 1 | 4 | 9363 | 231 | 1 | 3 | white + 9 | 10 | Ireland | 3 | 4 | 70 | 3 | 1 | 3 | white +(23 rows) --- End diff -- Right. We're always using ceil instead of round and hence will provide more than the expected rows in cases where division is not exact.
---