This is an automated email from the ASF dual-hosted git repository.
vincbeck pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new aa7cb1d7f1b Catch and log pandas import errors (#58744)
aa7cb1d7f1b is described below
commit aa7cb1d7f1bc9d50b6bfab33e18e3d9171f81620
Author: Niko Oliveira <[email protected]>
AuthorDate: Mon Dec 1 10:00:46 2025 -0800
Catch and log pandas import errors (#58744)
Pandas is used if the user optionally selects advanced output
processing by providing `show_results=True` (default is False) to
GlueDataQualityRuleSetEvaluationRunOperator and
GlueDataQualityRuleSetEvaluationRunSensor.
However, the original PR (#39923) adding these operators and sensors did not
include Pandas as a dependency of the Amazon Provider Package. I assume
this is because Pandas is quite a heavy dependency that we don't want
all users to have to install just for this very small use case.
So this commit catches the import error and logs a warning for the user
rather than failing catastrophically as it does now.
---
.../src/airflow/providers/amazon/aws/hooks/glue.py | 8 ++-
.../tests/unit/amazon/aws/hooks/test_glue.py | 84 ++++++++++++++++++++++
2 files changed, 91 insertions(+), 1 deletion(-)
diff --git a/providers/amazon/src/airflow/providers/amazon/aws/hooks/glue.py
b/providers/amazon/src/airflow/providers/amazon/aws/hooks/glue.py
index 002282d5947..da01711ebb1 100644
--- a/providers/amazon/src/airflow/providers/amazon/aws/hooks/glue.py
+++ b/providers/amazon/src/airflow/providers/amazon/aws/hooks/glue.py
@@ -565,7 +565,13 @@ class GlueDataQualityHook(AwsBaseHook):
Rule_3 ColumnLength "marketplace" between 1 and 2 FAIL
{'Column.marketplace.MaximumLength': 9.0, 'Column.marketplace.MinimumLength':
3.0} Value: 9.0 does not meet the constraint requirement!
"""
- import pandas as pd
+ try:
+ import pandas as pd
+ except ImportError:
+ self.log.warning(
+ "Pandas is not installed. Please install pandas to see the
detailed Data Quality results."
+ )
+ return
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
diff --git a/providers/amazon/tests/unit/amazon/aws/hooks/test_glue.py
b/providers/amazon/tests/unit/amazon/aws/hooks/test_glue.py
index df3e98401ae..9043f3cc5c3 100644
--- a/providers/amazon/tests/unit/amazon/aws/hooks/test_glue.py
+++ b/providers/amazon/tests/unit/amazon/aws/hooks/test_glue.py
@@ -684,6 +684,90 @@ class TestGlueDataQualityHook:
"AWS Glue data quality ruleset evaluation run, total number of
rules failed: 0"
]
+ @mock.patch.object(AwsBaseHook, "conn")
+ def test_validate_evaluation_results_show_results_True(self, mock_conn,
caplog):
+ response_evaluation_run = {"RunId": self.RUN_ID, "ResultIds":
["resultId1"]}
+
+ response_batch_result = {
+ "RunId": self.RUN_ID,
+ "ResultIds": ["resultId1"],
+ "Results": [
+ {
+ "ResultId": "resultId1",
+ "RulesetName": "rulesetOne",
+ "RuleResults": [
+ {
+ "Name": "Rule_1",
+ "Description": "RowCount between 150000 and
600000",
+ "EvaluatedMetrics": {"Dataset.*.RowCount":
300000.0},
+ "Result": "PASS",
+ }
+ ],
+ }
+ ],
+ }
+ mock_conn.get_data_quality_ruleset_evaluation_run.return_value =
response_evaluation_run
+
+ mock_conn.batch_get_data_quality_result.return_value =
response_batch_result
+
+ with caplog.at_level(logging.INFO, logger=self.glue.log.name):
+ caplog.clear()
+
self.glue.validate_evaluation_run_results(evaluation_run_id=self.RUN_ID,
show_results=True)
+
+
mock_conn.get_data_quality_ruleset_evaluation_run.assert_called_once_with(RunId=self.RUN_ID)
+ mock_conn.batch_get_data_quality_result.assert_called_once_with(
+ ResultIds=response_evaluation_run["ResultIds"]
+ )
+ # The messages have extra spaces to create spacing in the output, the
number of consecutive spaces
+ # may vary. Remove any sequence of spaces greater than 1 before
asserting.
+ messages = [" ".join(msg.split()) for msg in caplog.messages]
+ assert messages == [
+ "AWS Glue data quality ruleset evaluation result for RulesetName:
rulesetOne RulesetEvaluationRunId: None Score: None",
+ "Name Description EvaluatedMetrics Result 0 Rule_1 RowCount
between 150000 and 600000 {'Dataset.*.RowCount': 300000.0} PASS",
+ "AWS Glue data quality ruleset evaluation run, total number of
rules failed: 0",
+ ]
+
+ @mock.patch.object(AwsBaseHook, "conn")
+ def test_validate_evaluation_results_show_results_True_no_pandas(self,
mock_conn, caplog):
+ response_evaluation_run = {"RunId": self.RUN_ID, "ResultIds":
["resultId1"]}
+
+ response_batch_result = {
+ "RunId": self.RUN_ID,
+ "ResultIds": ["resultId1"],
+ "Results": [
+ {
+ "ResultId": "resultId1",
+ "RulesetName": "rulesetOne",
+ "RuleResults": [
+ {
+ "Name": "Rule_1",
+ "Description": "RowCount between 150000 and
600000",
+ "EvaluatedMetrics": {"Dataset.*.RowCount":
300000.0},
+ "Result": "PASS",
+ }
+ ],
+ }
+ ],
+ }
+ mock_conn.get_data_quality_ruleset_evaluation_run.return_value =
response_evaluation_run
+
+ mock_conn.batch_get_data_quality_result.return_value =
response_batch_result
+
+ # Emulate/mock the import of pandas failing with ModuleNotFoundError
+ with mock.patch.dict("sys.modules", {"pandas": None}):
+ with caplog.at_level(logging.INFO, logger=self.glue.log.name):
+ caplog.clear()
+
self.glue.validate_evaluation_run_results(evaluation_run_id=self.RUN_ID,
show_results=True)
+
+
mock_conn.get_data_quality_ruleset_evaluation_run.assert_called_once_with(RunId=self.RUN_ID)
+ mock_conn.batch_get_data_quality_result.assert_called_once_with(
+ ResultIds=response_evaluation_run["ResultIds"]
+ )
+ assert caplog.messages == [
+ "Pandas is not installed. Please install pandas to see the
detailed Data Quality results.",
+ "AWS Glue data quality ruleset evaluation run, total number of
rules failed: 0",
+ ]
+
@mock.patch.object(AwsBaseHook, "conn")
def
test_validate_evaluation_results_should_fail_when_any_rules_failed(self,
mock_conn, caplog):
response_batch_result = {