denimalpaca commented on code in PR #25164:
URL: https://github.com/apache/airflow/pull/25164#discussion_r955273173


##########
airflow/providers/common/sql/operators/sql.py:
##########
@@ -273,38 +303,38 @@ def __init__(
 
         self.table = table
         self.checks = checks
+        self.partition_clause = partition_clause
         # OpenLineage needs a valid SQL query with the input/output table(s) 
to parse
         self.sql = f"SELECT * FROM {self.table};"
 
     def execute(self, context=None):
         hook = self.get_db_hook()
-
-        check_names = [*self.checks]
-        check_mins_sql = ",".join(
-            self.sql_min_template.replace("check_name", check_name) for 
check_name in check_names
-        )
-        checks_sql = ",".join(
+        checks_sql = " UNION ALL ".join(
             [
-                self.sql_check_template.replace("check_statement", 
value["check_statement"]).replace(
-                    "check_name", check_name
-                )
+                self.sql_check_template.replace("check_statement", 
value["check_statement"])
+                .replace("_check_name", check_name)
+                .replace("table", self.table)
                 for check_name, value in self.checks.items()
             ]
         )
+        partition_clause_statement = f"WHERE {self.partition_clause}" if 
self.partition_clause else ""
+        self.sql = f"SELECT check_name, check_result FROM ({checks_sql}) "
+        f"AS check_table {partition_clause_statement};"
 
-        self.sql = f"SELECT {check_mins_sql} FROM (SELECT {checks_sql} FROM 
{self.table});"
-        records = hook.get_first(self.sql)
+        records = hook.get_pandas_df(self.sql)

Review Comment:
   To expand a bit more, a check like `col_a + col_b >= col_c` would not work 
when there were multiple checks in the operator as the previous `SELECT` 
statement would then fail and require a `GROUP BY` clause iirc. So the check 
would either have to be in its own operator, or be amended like so: `SUM(col_a) 
+ SUM(col_b) >= SUM(col_c)` which isn't quite the same check. So the query 
needed to be updated, and the one that I wound up using returns multiple rows. 
So `get_first` is no longer useful, and in the moment of writing it seemed that 
handling things with a pandas dataframe might be easier in the long term, if 
more complicated uses of the pulled in data were implemented. But as of now I 
see how it's unneeded.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to