(airflow) branch main updated: Catch and log pandas import errors (#58744)

vincbeck Mon, 01 Dec 2025 10:01:02 -0800

This is an automated email from the ASF dual-hosted git repository.

vincbeck pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git



The following commit(s) were added to refs/heads/main by this push:
     new aa7cb1d7f1b Catch and log pandas import errors (#58744)
aa7cb1d7f1b is described below

commit aa7cb1d7f1bc9d50b6bfab33e18e3d9171f81620
Author: Niko Oliveira <[email protected]>
AuthorDate: Mon Dec 1 10:00:46 2025 -0800

    Catch and log pandas import errors (#58744)
    
    Pandas is used if the user optionally selects advanced output
    processing when providing `show_results=True` (default is False) to
    GlueDataQualityRuleSetEvaluationRunOperator and
    GlueDataQualityRuleSetEvaluationRunSensor.
    
    However, the original PR (#39923) adding these operators and sensors did not
    include Pandas as a dependency of the Amazon Provider Package. I assume
    this is because Pandas is quite a heavy dependency that we don't want
    all users to have to install just for this very small use case.
    So this commit catches the exception and logs to the user rather than
    failing catastrophically as it does now.
---
 .../src/airflow/providers/amazon/aws/hooks/glue.py |  8 ++-
 .../tests/unit/amazon/aws/hooks/test_glue.py       | 84 ++++++++++++++++++++++
 2 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/providers/amazon/src/airflow/providers/amazon/aws/hooks/glue.py 
b/providers/amazon/src/airflow/providers/amazon/aws/hooks/glue.py
index 002282d5947..da01711ebb1 100644
--- a/providers/amazon/src/airflow/providers/amazon/aws/hooks/glue.py
+++ b/providers/amazon/src/airflow/providers/amazon/aws/hooks/glue.py
@@ -565,7 +565,13 @@ class GlueDataQualityHook(AwsBaseHook):
         Rule_3    ColumnLength "marketplace" between 1 and 2     FAIL        
{'Column.marketplace.MaximumLength': 9.0, 'Column.marketplace.MinimumLength': 
3.0}     Value: 9.0 does not meet the constraint requirement!
 
         """
-        import pandas as pd
+        try:
+            import pandas as pd
+        except ImportError:
+            self.log.warning(
+                "Pandas is not installed. Please install pandas to see the 
detailed Data Quality results."
+            )
+            return
 
         pd.set_option("display.max_rows", None)
         pd.set_option("display.max_columns", None)
diff --git a/providers/amazon/tests/unit/amazon/aws/hooks/test_glue.py 
b/providers/amazon/tests/unit/amazon/aws/hooks/test_glue.py
index df3e98401ae..9043f3cc5c3 100644
--- a/providers/amazon/tests/unit/amazon/aws/hooks/test_glue.py
+++ b/providers/amazon/tests/unit/amazon/aws/hooks/test_glue.py
@@ -684,6 +684,90 @@ class TestGlueDataQualityHook:
             "AWS Glue data quality ruleset evaluation run, total number of 
rules failed: 0"
         ]
 
+    @mock.patch.object(AwsBaseHook, "conn")
+    def test_validate_evaluation_results_show_results_True(self, mock_conn, 
caplog):
+        response_evaluation_run = {"RunId": self.RUN_ID, "ResultIds": 
["resultId1"]}
+
+        response_batch_result = {
+            "RunId": self.RUN_ID,
+            "ResultIds": ["resultId1"],
+            "Results": [
+                {
+                    "ResultId": "resultId1",
+                    "RulesetName": "rulesetOne",
+                    "RuleResults": [
+                        {
+                            "Name": "Rule_1",
+                            "Description": "RowCount between 150000 and 
600000",
+                            "EvaluatedMetrics": {"Dataset.*.RowCount": 
300000.0},
+                            "Result": "PASS",
+                        }
+                    ],
+                }
+            ],
+        }
+        mock_conn.get_data_quality_ruleset_evaluation_run.return_value = 
response_evaluation_run
+
+        mock_conn.batch_get_data_quality_result.return_value = 
response_batch_result
+
+        with caplog.at_level(logging.INFO, logger=self.glue.log.name):
+            caplog.clear()
+            
self.glue.validate_evaluation_run_results(evaluation_run_id=self.RUN_ID, 
show_results=True)
+
+        
mock_conn.get_data_quality_ruleset_evaluation_run.assert_called_once_with(RunId=self.RUN_ID)
+        mock_conn.batch_get_data_quality_result.assert_called_once_with(
+            ResultIds=response_evaluation_run["ResultIds"]
+        )
+        # The messages have extra spaces to create spacing in the output, the number of consecutive spaces
+        # may vary. Remove any sequence of spaces greater than 1 before asserting.
+        messages = [" ".join(msg.split()) for msg in caplog.messages]
+        assert messages == [
+            "AWS Glue data quality ruleset evaluation result for RulesetName: 
rulesetOne RulesetEvaluationRunId: None Score: None",
+            "Name Description EvaluatedMetrics Result 0 Rule_1 RowCount 
between 150000 and 600000 {'Dataset.*.RowCount': 300000.0} PASS",
+            "AWS Glue data quality ruleset evaluation run, total number of 
rules failed: 0",
+        ]
+
+    @mock.patch.object(AwsBaseHook, "conn")
+    def test_validate_evaluation_results_show_results_True_no_pandas(self, 
mock_conn, caplog):
+        response_evaluation_run = {"RunId": self.RUN_ID, "ResultIds": 
["resultId1"]}
+
+        response_batch_result = {
+            "RunId": self.RUN_ID,
+            "ResultIds": ["resultId1"],
+            "Results": [
+                {
+                    "ResultId": "resultId1",
+                    "RulesetName": "rulesetOne",
+                    "RuleResults": [
+                        {
+                            "Name": "Rule_1",
+                            "Description": "RowCount between 150000 and 
600000",
+                            "EvaluatedMetrics": {"Dataset.*.RowCount": 
300000.0},
+                            "Result": "PASS",
+                        }
+                    ],
+                }
+            ],
+        }
+        mock_conn.get_data_quality_ruleset_evaluation_run.return_value = 
response_evaluation_run
+
+        mock_conn.batch_get_data_quality_result.return_value = 
response_batch_result
+
+        # Emulate/mock the import of pandas failing with ModuleNotFoundError
+        with mock.patch.dict("sys.modules", {"pandas": None}):
+            with caplog.at_level(logging.INFO, logger=self.glue.log.name):
+                caplog.clear()
+                
self.glue.validate_evaluation_run_results(evaluation_run_id=self.RUN_ID, 
show_results=True)
+
+        
mock_conn.get_data_quality_ruleset_evaluation_run.assert_called_once_with(RunId=self.RUN_ID)
+        mock_conn.batch_get_data_quality_result.assert_called_once_with(
+            ResultIds=response_evaluation_run["ResultIds"]
+        )
+        assert caplog.messages == [
+            "Pandas is not installed. Please install pandas to see the 
detailed Data Quality results.",
+            "AWS Glue data quality ruleset evaluation run, total number of 
rules failed: 0",
+        ]
+
     @mock.patch.object(AwsBaseHook, "conn")
     def 
test_validate_evaluation_results_should_fail_when_any_rules_failed(self, 
mock_conn, caplog):
         response_batch_result = {

(airflow) branch main updated: Catch and log pandas import errors (#58744)

Reply via email to