#30158: Subquery Expressions Incorrectly Added to Group by
-------------------------------------+-------------------------------------
               Reporter:             |          Owner:  nobody
  JonnyWaffles                       |
                   Type:             |         Status:  new
  Uncategorized                      |
              Component:  Database   |        Version:  2.1
  layer (models, ORM)                |
               Severity:  Normal     |       Keywords:  subquery, group_by
           Triage Stage:             |      Has patch:  0
  Unreviewed                         |
    Needs documentation:  0          |    Needs tests:  0
Patch needs improvement:  0          |  Easy pickings:  0
                  UI/UX:  0          |
-------------------------------------+-------------------------------------
 Hi friends,

 My Django/SQL skills are not good enough to properly isolate the problem
 independently of my use case detailed below. I believe the problem is
 Subqueries being forced in to the group_by clause because they are select
 expressions. Per the below if I remove the subqueries manually from sql
 group_by, my query runs perfectly. I tried to manually edit the
 qs.query.group_by, but because it is created by compiler.get_group_by() I
 cannot fix the ORM query group by clause.

 Are Subquery expressions always supposed to be included in group_by? If
 this is desired behavior is it possible to toggle it off so the ORM can
 produce the accurate query?

 {{{
 """
 Problem Statement: The individual annotations work fine when run
 independently, but when chained the query takes 5 minutes. This is due to
 the final group by clause unexpectedly receiving the Subquery as extra
 fields.
 """
 ​
 # relevant models and querysets
 class ClaimQuerySet(models.QuerySet):
     def annotate_all(self):
         """Adds ``results``, ``latest_note_text``, and
 ``latest_assessment_text`` to the queryset."""
         return
 
self.annotate_latest_results().annotate_most_recent_note().annotate_most_recent_assessment()
 ​
     def prefetch_all(self, annotate_sum=True):
         return self.prefetch_notes().prefetch_latest_results(annotate_sum)
 ​
     def prefetch_latest_results(self, annotate_sum: bool=True):
         """Prefetches the most result :class:`RulesEngineResult` object
 and optionally
         preload its :attr:`RulesEngineResult.score`.
 ​
         Args:
             annotate_sum:
         """
         latest_runs = self.latest_runs
         if annotate_sum:
             latest_runs =
 latest_runs.annotate(_score=Sum('results__value'))
         return self.prefetch_related(Prefetch(
             'rules_engine_results', queryset=latest_runs,
 to_attr='_last_run')
         )
 ​
     def prefetch_notes(self):
         """Prefetches all related notes and assessments."""
         return self.prefetch_related('notes', 'assessments')
 ​
     @property
     def latest_runs(self):
         """Shortcut for
 :attr:`RulesEngineResultQuerySet.get_latest_runs`"""
         return RulesEngineResult.objects.get_latest_runs()
 ​
     def annotate_latest_results(self) -> 'ClaimQuerySet':
         """Annotates the queryset with a new field ``results`` whose value
 is the Sum
         of the last :attr:`RulesEngineResult.results` for the claim.
         """
         # Only Sum on runs in the above set.
         filter_q = Q(rules_engine_results__in=self.latest_runs)
         # noinspection PyTypeChecker
         return
 self.annotate(results=Sum('rules_engine_results__results__value',
 filter=filter_q))
 ​
     def annotate_most_recent_note(self) -> 'ClaimQuerySet':
         """Annotates the queryset with a field ``latest_note_text`` whose
 value is the last
         entered :attr:`Note.text` for the claim or ``None`` if there are
 no associated notes.
         """
         return self._annotate_most_recent_basenote(Note,
 'latest_note_text')
 ​
     def annotate_most_recent_assessment(self) -> 'ClaimQuerySet':
         """Annotates the queryset with a field ``latest_assessment_text``
 whose value is the last
         entered :attr:`Assessment.text` for the claim or ``None`` if there
 are no associated assessments.
         """
         return self._annotate_most_recent_basenote(Assessment,
 'latest_assessment_text')
 ​
     def _annotate_most_recent_basenote(self, model: Type['BaseNote'],
 field_name: str) -> 'ClaimQuerySet':
         newest =
 model.objects.filter(claim=OuterRef('id')).order_by('-created')
         annotate_kwargs = {
             field_name: Subquery(newest.values('text')[:1])
         }
         # noinspection PyTypeChecker
         return self.annotate(**annotate_kwargs)
 ​
 ​
 class Claim(BaseClaim):
     """Concrete :class:`~mkl.fraud_django.models.BaseClaim` for
     :mod:`~mkl.fraud_django.workers_comp` claims.
     """
     objects = ClaimQuerySet.as_manager()
     first_rules_engine_run = models.DateField()
 ​
     @property
     def latest_note(self) -> 'Note':
         """Returns the latest :class:`Note`."""
         return self.notes.latest()
 ​
     @property
     def latest_assessment(self) -> 'Assessment':
         """Retrieves the latest :class:`Assessment`."""
         return self.assessments.latest()
 ​
     @property
     def latest_rulesengine_run(self) -> 'RulesEngineResult':
         """Returns the most most recent run.
 ​
         .. note::
 ​
             Use :meth:`ClaimQuerySet.prefetch_latest_results` to
             prefetch the last_run, falls back on querying for the latest
 value.
 ​
             Note, if used in a prefetched queryset the value could be
 stale.
         """
         return self._get_latest(RulesEngineResult, '_last_run')
 ​
     def _get_latest(self, model: Type[models.Model], cache_attr: str):
         """Handler to return None if a latest related object does not
 exist,
         checks the cache first."""
         if hasattr(self, cache_attr):
             try:
                 return getattr(self, cache_attr)[0]
             except IndexError:
                 return None
         try:
             return model.objects.filter(claim=self).latest()
         except model.DoesNotExist:
             return None
 ​
     def __unicode__(self):
         return self.claim_number
 ​
 ​
 class BaseNote(models.Model):
     """Abstract Base Model for both Notes and Assessments.
 ​
     Use this base for any claim related editable field whose
     historical data is important.
 ​
     On the claim we can write functions to retrieve the latest.
 ​
     .. note:: The related name will be the class name lower case with an
 's'.
 ​
     Attributes:
         text (str): The user provided content
         created (datetime.datetime): Created time stamp
         claim (:class:`Claim`): The claim related to the note.
     """
     id = models.AutoField(primary_key=True)
     text = models.TextField(max_length=1000)
     created = models.DateTimeField(auto_now_add=True)
     claim = models.ForeignKey('Claim', on_delete=models.PROTECT,
 related_name='%(class)ss')
 ​
     class Meta:
         abstract = True
         get_latest_by = 'created'
         ordering = ('-created',)
 ​
 ​
 class Note(BaseNote):
     """Concrete class for Notes, related_name will become ``notes``."""
 ​
 ​
 class Assessment(BaseNote):
     """Concrete class for Assessment, related_name will become
 ``assessments``."""
     CHOICES = (
         ('01', 'Will open a case'),
         ('02', 'Will not open a case'),
         ('03', 'Previously opened'),
         ('04', "Appears suspicious but won't open"),
         ('05', 'Not enough info to determine'),
         ('06', 'Existing vendor request'),
     )
     text = models.CharField(max_length=1000, choices=CHOICES)
 ​
     def get_choice_value(self) -> str:
         """Returns the value as the choice human readable text."""
         db_text = self.text
         return dict(self.CHOICES)[db_text]
 ​
 ​
 class RuleResult(models.Model):
     """The result of running the engine for a particular claim against a
 :class:`Rule`.
 ​
     Attributes:
         rule: The rule to be checked
         value: The numeric weight of the result
         result: The rules engine result of all rules run against the claim
     """
     id = models.AutoField(primary_key=True)
     rule = models.ForeignKey('Rule', on_delete=models.PROTECT)
     value = models.IntegerField()
     result = models.ForeignKey('RulesEngineResult',
 on_delete=models.PROTECT, related_name='results')
 ​
 ​
 class RulesEngineResultQuerySet(models.QuerySet):
     def get_latest_runs(self):
         """Filters to only the most recent
 :class:`RulesEngineResult`\s."""
         annotated = self.annotate(
             latest=Max('claim__rules_engine_results__created')
         )
         return annotated.filter(created=F('latest'))
 ​
 ​
 class RulesEngineResult(models.Model):
     """
     RulesEngine run result.
 ​
     Attributes:
         claim (:class:`Claim`): The claim run through the RulesEngine.
         results (List[:class:`RuleResult`]): Collection of results for
 each rule.
     """
     id = models.AutoField(primary_key=True)
     created = models.DateTimeField(auto_now_add=True)
     claim = models.ForeignKey('Claim', on_delete=models.PROTECT,
 related_name='rules_engine_results')
     objects = RulesEngineResultQuerySet.as_manager()
 ​
     class Meta:
         get_latest_by = 'created'
 ​
     @property
     def score(self) -> int:
         """Returns the aggregate score of all related results. Checks
 prefetched cache first."""
         if hasattr(self, '_score'):
             return self._score
         d = self.results.aggregate(score=models.Sum('value'))
         return d['score']
 ​
 ​
 """
 Individual Query rendering
 """
 # Recent Note
 qs = models.Claim.objects.annotate_most_recent_note()
 ​
 SELECT "workers_comp_claim"."id",
        "workers_comp_claim"."claim_number",
        "workers_comp_claim"."first_rules_engine_run",
        (SELECT U0."text"
         FROM "workers_comp_note" U0
         WHERE U0."claim_id" = ("workers_comp_claim"."id")
         ORDER BY U0."created" DESC
         LIMIT 1) AS "latest_note_text"
 FROM "workers_comp_claim"
 ​
 # Recent Assessment
 qs = models.Claim.objects.annotate_most_recent_assessment()
 ​
 SELECT "workers_comp_claim"."id",
        "workers_comp_claim"."claim_number",
        "workers_comp_claim"."first_rules_engine_run",
        (SELECT U0."text"
         FROM "workers_comp_assessment" U0
         WHERE U0."claim_id" = ("workers_comp_claim"."id")
         ORDER BY U0."created" DESC
         LIMIT 1) AS "latest_assessment_text"
 FROM "workers_comp_claim"
 ​
 # Latest Results (Run)
 qs = models.Claim.objects.annotate_latest_results()
 ​
 SELECT "workers_comp_claim"."id",
        "workers_comp_claim"."claim_number",
        "workers_comp_claim"."first_rules_engine_run",
        SUM("workers_comp_ruleresult"."value") FILTER (WHERE
 "workers_comp_rulesengineresult"."id" IN (SELECT U0."id"
 FROM "workers_comp_rulesengineresult" U0
 INNER JOIN "workers_comp_claim" U1 ON (U0."claim_id" = U1."id")
 LEFT OUTER JOIN "workers_comp_rulesengineresult" U2 ON (U1."id" =
 U2."claim_id")
 GROUP BY U0."id"
 HAVING U0."created" = (MAX(U2."created")))) AS "results"
 FROM "workers_comp_claim"
        LEFT OUTER JOIN "workers_comp_rulesengineresult"
                        ON ("workers_comp_claim"."id" =
 "workers_comp_rulesengineresult"."claim_id")
        LEFT OUTER JOIN "workers_comp_ruleresult"
                        ON ("workers_comp_rulesengineresult"."id" =
 "workers_comp_ruleresult"."result_id")
 GROUP BY "workers_comp_claim"."id"
 ​
 ​
 """
 When chained the query renders incorrectly like this
 """
 qs =
 
models.Claim.objects.annotate_latest_results().annotate_most_recent_note().annotate_most_recent_assessment()
 ​
 SELECT "workers_comp_claim"."id",
        "workers_comp_claim"."claim_number",
        "workers_comp_claim"."first_rules_engine_run",
        SUM("workers_comp_ruleresult"."value") FILTER (WHERE
 "workers_comp_rulesengineresult"."id" IN (SELECT U0."id"
 FROM "workers_comp_rulesengineresult" U0
 INNER JOIN "workers_comp_claim" U1 ON (U0."claim_id" = U1."id")
 LEFT OUTER JOIN "workers_comp_rulesengineresult" U2 ON (U1."id" =
 U2."claim_id")
 GROUP BY U0."id"
 HAVING U0."created" = (MAX(U2."created")))) AS "results",
        (SELECT U0."text"
         FROM "workers_comp_note" U0
         WHERE U0."claim_id" = ("workers_comp_claim"."id")
         ORDER BY U0."created" DESC
         LIMIT 1)
 AS "latest_note_text",
        (SELECT U0."text"
         FROM "workers_comp_assessment" U0
         WHERE U0."claim_id" = ("workers_comp_claim"."id")
         ORDER BY U0."created" DESC
         LIMIT 1)
 AS "latest_assessment_text"
 FROM "workers_comp_claim"
        LEFT OUTER JOIN "workers_comp_rulesengineresult"
                        ON ("workers_comp_claim"."id" =
 "workers_comp_rulesengineresult"."claim_id")
        LEFT OUTER JOIN "workers_comp_ruleresult"
                        ON ("workers_comp_rulesengineresult"."id" =
 "workers_comp_ruleresult"."result_id")
 GROUP BY "workers_comp_claim"."id", (SELECT U0."text"
                                      FROM "workers_comp_note" U0
                                      WHERE U0."claim_id" =
 ("workers_comp_claim"."id")
                                      ORDER BY U0."created" DESC
                                      LIMIT 1), (SELECT U0."text"
                                                 FROM
 "workers_comp_assessment" U0
                                                 WHERE U0."claim_id" =
 ("workers_comp_claim"."id")
                                                 ORDER BY U0."created" DESC
                                                 LIMIT 1)
 ​
 ​
 """
 Why is Django performing the group by with the Subqueries? How do I make
 it render correctly like this:
 """
 SELECT "workers_comp_claim"."id",
        "workers_comp_claim"."claim_number",
        "workers_comp_claim"."first_rules_engine_run",
        SUM("workers_comp_ruleresult"."value") FILTER (WHERE
 "workers_comp_rulesengineresult"."id" IN (SELECT U0."id"
 FROM "workers_comp_rulesengineresult" U0
 INNER JOIN "workers_comp_claim" U1 ON (U0."claim_id" = U1."id")
 LEFT OUTER JOIN "workers_comp_rulesengineresult" U2 ON (U1."id" =
 U2."claim_id")
 GROUP BY U0."id"
 HAVING U0."created" = (MAX(U2."created")))) AS "results",
        (SELECT U0."text"
         FROM "workers_comp_note" U0
         WHERE U0."claim_id" = ("workers_comp_claim"."id")
         ORDER BY U0."created" DESC
         LIMIT 1)
 AS "latest_note_text",
        (SELECT U0."text"
         FROM "workers_comp_assessment" U0
         WHERE U0."claim_id" = ("workers_comp_claim"."id")
         ORDER BY U0."created" DESC
         LIMIT 1)
 AS "latest_assessment_text"
 FROM "workers_comp_claim"
        LEFT OUTER JOIN "workers_comp_rulesengineresult"
                        ON ("workers_comp_claim"."id" =
 "workers_comp_rulesengineresult"."claim_id")
        LEFT OUTER JOIN "workers_comp_ruleresult"
                        ON ("workers_comp_rulesengineresult"."id" =
 "workers_comp_ruleresult"."result_id")
 GROUP BY "workers_comp_claim"."id";
 }}}

-- 
Ticket URL: <https://code.djangoproject.com/ticket/30158>
Django <https://code.djangoproject.com/>
The Web framework for perfectionists with deadlines.

-- 
You received this message because you are subscribed to the Google Groups 
"Django updates" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To post to this group, send email to [email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msgid/django-updates/055.66744c40ff4a14b234f369a840a3c5e4%40djangoproject.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to