gopidesupavan commented on code in PR #39923:
URL: https://github.com/apache/airflow/pull/39923#discussion_r1621290560


##########
airflow/providers/amazon/aws/operators/glue.py:
##########
@@ -239,3 +243,253 @@ def on_kill(self):
             )
             if not response["SuccessfulSubmissions"]:
                 self.log.error("Failed to stop AWS Glue Job: %s. Run Id: %s", 
self.job_name, self._job_run_id)
+
+
+class GlueDataQualityOperator(AwsBaseOperator[GlueDataQualityHook]):
+    """
+    Creates a data quality ruleset with DQDL rules applied to a specified Glue table.
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:GlueDataQualityOperator`
+
+    :param name: A unique name for the data quality ruleset.
+    :param ruleset: A Data Quality Definition Language (DQDL) ruleset.
+        For more information, see the Glue developer guide.
+    :param description: A description of the data quality ruleset.
+    :param update_rule_set: Set this flag to True to update an existing ruleset. (default: False)
+    :param data_quality_ruleset_kwargs: Extra arguments for RuleSet.
+
+    :param aws_conn_id: The Airflow connection used for AWS credentials.
+        If this is ``None`` or empty then the default boto3 behaviour is used. If
+        running Airflow in a distributed manner and aws_conn_id is None or
+        empty, then default boto3 configuration would be used (and must be
+        maintained on each worker node).
+    :param region_name: AWS region_name. If not specified then the default boto3 behaviour is used.
+    :param verify: Whether or not to verify SSL certificates. See:
+        
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html
+    :param botocore_config: Configuration dictionary (key-values) for botocore client. See:
+        
https://botocore.amazonaws.com/v1/documentation/api/latest/reference/config.html
+    """
+
+    aws_hook_class = GlueDataQualityHook
+    template_fields: Sequence[str] = ("name", "ruleset", 
"data_quality_ruleset_kwargs")
+
+    template_fields_renderers = {
+        "data_quality_ruleset_kwargs": "json",
+    }
+    ui_color = "#ededed"
+
+    def __init__(
+        self,
+        *,
+        name: str,
+        ruleset: str,
+        description: str = "AWS Glue Data Quality Rule Set With Airflow",
+        update_rule_set: bool = False,
+        data_quality_ruleset_kwargs: dict | None = None,
+        aws_conn_id: str | None = "aws_default",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.name = name
+        self.ruleset = ruleset.strip()
+        self.description = description
+        self.update_rule_set = update_rule_set
+        self.data_quality_ruleset_kwargs = data_quality_ruleset_kwargs or {}
+        self.aws_conn_id = aws_conn_id
+
+    def validate_inputs(self) -> None:
+        if not self.ruleset.startswith("Rules") or not 
self.ruleset.endswith("]"):
+            raise AttributeError("RuleSet must starts with Rules = [ and ends 
with ]")
+
+        if self.data_quality_ruleset_kwargs.get("TargetTable"):
+            target_table = self.data_quality_ruleset_kwargs["TargetTable"]
+
+            if not target_table.get("TableName") or not 
target_table.get("DatabaseName"):
+                raise AttributeError("Target table must have DatabaseName and 
TableName")
+
+    def execute(self, context: Context):
+        self.validate_inputs()
+
+        config = {
+            "Name": self.name,
+            "Ruleset": self.ruleset,
+            "Description": self.description,
+            **self.data_quality_ruleset_kwargs,
+        }
+
+        if self.update_rule_set:
+            self.hook.update_glue_data_quality_ruleset(config)
+            self.log.info("AWS Glue data quality ruleset updated successfully")
+        else:
+            self.hook.create_glue_data_quality_ruleset(config)
+            self.log.info("AWS Glue data quality ruleset created successfully")
+
+
+class 
GlueDataQualityRuleSetEvaluationRunOperator(AwsBaseOperator[GlueDataQualityHook]):
+    """
+    Once you have a ruleset definition (either recommended or your own), you call this operation to evaluate the ruleset against a data source (Glue table).

Review Comment:
   Agreed, I have updated the changes.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to