#33711: Suggested feature : Faceted queries
-------------------------------------------+------------------------
               Reporter:  Antoine LAURENT  |          Owner:  nobody
                   Type:  Uncategorized    |         Status:  new
              Component:  Uncategorized    |        Version:  4.0
               Severity:  Normal           |       Keywords:
           Triage Stage:  Unreviewed       |      Has patch:  0
    Needs documentation:  0                |    Needs tests:  0
Patch needs improvement:  0                |  Easy pickings:  0
                  UI/UX:  0                |
-------------------------------------------+------------------------
 Hello.

 In my current projet, I create an aggregation function that relies on
 PostgreSQL GROUPING SETS to compute facets on a queryset.

 This is how we use it :
 {{{
 > aggregate_sql(series_qs, author='author__name, kinds='kind')
 {
     'authors': {' Agatha Christie': 34, 'Brandon Sanderson': 18, ...},
     'kinds': {'Fantasy': 25, ...},
     'total': 70,
 }
 }}}

 It's the kind of feature we use a lot on all our lists to have a better
 user experience with our filters.
 Since I think this might be of interest to other people, I would love to
 add this to Django.

 I'm currently unsure on were to add this feature in Django. It would be
 really easy to use on a queryset object (like `values()') but I don't
 think GROUPING exists on all supported databases, so maybe add it in
 `django.contrib.postgres` (without the queryset function) ?

 I pasted the code bellow.
 I relied on `queryset.values_list()` to build a SQL query, and then
 tweaked it to add GROUPING SETS and unnesting ArrayFields.
 I will need to re-write the code to build the SQL query properly.

 {{{
 import re

 from django.contrib.postgres import fields as pg_fields
 from django.core.exceptions import EmptyResultSet
 from django.db import connection


 def aggregate_sql(queryset, **agg_fields):
     """
     Get count for each given agg_fields.
     """

     def _get_first_non_null(data):
         for i, value in enumerate(data[:-1]):
             if value:
                 return i, value
         return None, None

     db_table = queryset.model._meta.db_table
     db_pk = queryset.model._meta.pk.name

     # With pg_fields.ArrayFields, we want to use PostgreSQL unnest
 function to count the values inside the list
     unnest_fields = []
     field_names: list[str] = list(agg_fields.values())
     for field_name in field_names:
         model = queryset.model
         while True:
             first, _delimiter, second = field_name.partition('__')
             if second:
                 # get related model
                 try:
                     model = getattr(model, first).field.related_model
                 except AttributeError:
                     model = getattr(model, first).related.related_model
                 field_name = second
             else:
                 field = model._meta.get_field(first)
                 unnest_fields.append(isinstance(field,
 pg_fields.ArrayField))
                 break

     # RAW SQL explenation :
     # ~~~~~~~~~~~~~~~~~~~~~
     # GROUPING SETS will allow us to make a count on each value of each
 given field.
     # We need COUNT(DISTINCT id) because we might unnest some ArrayFields
 to get seperated values.
     # We use the ORDER_BY clause to ensure that even if some fields have
 None values, we can
     # identify the total count as the first line.
     #
     # SELECT year, unnest(propulsion), brand_id, COUNT(id)
     # FROM device
     # GROUP BY GROUPING SETS (year, unnest(propulsion), brand_id, ())
     # ORDER BY count DESC;
     #
     #  year |     unnest      |              brand_id                |
 count
     #
 ------+-----------------+--------------------------------------+-------
     #       |                 |                                      |
 10128
     #       | electric        |                                      |
 8412
     #       |                 |                                      |
 6818
     #       |                 | a58e4174-22dc-4001-aa78-dce815b137b0 |
 6025
     #       |                 | b4d86cbf-2ed7-4605-b756-cac63ce73e46 |
 3004
     #  2021 |                 |                                      |
 1909
     #       | human           |                                      |
 1687
     #       |                 | b57bd339-042a-4a41-8815-1f9cbfaa8ef9 |
 1099
     #  2018 |                 |                                      |
 920
     #  2019 |                 |                                      |
 407
     #  2020 |                 |                                      |
 74
     #       | electric_assist |                                      |
 29
     #
     # The third line (with 6818 results) counts the devices with year=None
     # There is currently no way to distinguish None values from different
 fields

     # Prepare results
     result_keys = list(agg_fields.keys())
     aggregation = {name: {} for name in result_keys}

     try:
         # Use values_list to build most of the SQL query
         # NB: remove any order that will intefer with the sql parsing
 later on
         django_query, params =
 queryset.values_list(*field_names).order_by().query.sql_with_params()
         query_start, _delimiter, query_end =
 django_query.partition('FROM')
     except EmptyResultSet:
         aggregation['total'] = 0
         return aggregation

     # extract original select items and add unnest function if required
     SQL_REGEX = r'SELECT (DISTINCT )?(.*) '
     select_items = re.search(SQL_REGEX, query_start).groups()[1]
     sql_fields = []
     for field_name, unnest in zip(select_items.split(', '),
 unnest_fields):
         if unnest:
             sql_fields.append(f'unnest({field_name})')
         else:
             sql_fields.append(field_name)
     fields = ', '.join(sql_fields)

     query = f"""
         SELECT {fields}, COUNT(DISTINCT "{db_table}"."{db_pk}")
         FROM {query_end}
         GROUP BY GROUPING SETS ({fields}, ())
         ORDER BY count DESC
     """

     with connection.cursor() as cursor:
         cursor.execute(query, params)
         results = cursor.fetchall()

     # As stated is the previous comment, the first line is the total count
     aggregation['total'] = results[0][-1]

     for data in results:
         indice, value = _get_first_non_null(data)
         # if indice si None, then we have a count on an None value from
 one of the fields
         if indice is not None:
             aggregation_name = result_keys[indice]
             aggregation[aggregation_name][str(value)] = data[-1]

     return aggregation
 }}}

-- 
Ticket URL: <https://code.djangoproject.com/ticket/33711>
Django <https://code.djangoproject.com/>
The Web framework for perfectionists with deadlines.

-- 
You received this message because you are subscribed to the Google Groups 
"Django updates" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msgid/django-updates/01070180cc586203-a68c54b8-c4d4-4290-89b4-809a11252000-000000%40eu-central-1.amazonses.com.

Reply via email to