mik-laj commented on a change in pull request #5539: [AIRFLOW-4811] Implement GCP DLP Hook and Operators URL: https://github.com/apache/airflow/pull/5539#discussion_r308456717
########## File path: airflow/contrib/hooks/gcp_dlp_hook.py ########## @@ -0,0 +1,1760 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +This module contains a CloudDLPHook +which allows you to connect to GCP Cloud DLP service. +""" + +import re +import time +from google.cloud.dlp_v2 import DlpServiceClient +from google.cloud.dlp_v2.types import DlpJob + +from airflow import AirflowException +from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook + +DLP_JOB_PATH_PATTERN = '^projects/[^/]+/dlpJobs/(?P<job>.*?)$' +# Time to sleep between active checks of the operation results +TIME_TO_SLEEP_IN_SECONDS = 1 + + +# pylint: disable=R0904, C0302 +class CloudDLPHook(GoogleCloudBaseHook): + """ + Hook for Google Cloud Data Loss Prevention (DLP) APIs. + Cloud DLP allows clients to detect the presence of Personally Identifiable + Information (PII) and other privacy-sensitive data in user-supplied, + unstructured data streams, like text blocks or images. The service also + includes methods for sensitive data redaction and scheduling of data scans + on Google Cloud Platform based data sets. + + :param gcp_conn_id: The connection ID to use when fetching connection info. 
+ :type gcp_conn_id: str + :param delegate_to: The account to impersonate, if any. + For this to work, the service account making the request must have + domain-wide delegation enabled. + :type delegate_to: str + """ + + def __init__(self, + gcp_conn_id="google_cloud_default", + delegate_to=None): + super().__init__(gcp_conn_id, delegate_to) + self._client = None + + def get_conn(self): + """ + Provides a client for interacting with the Cloud DLP API. + + :return: GCP Cloud DLP API Client + :rtype: google.cloud.dlp_v2.DlpServiceClient + """ + if not self._client: + self._client = DlpServiceClient(credentials=self._get_credentials()) + return self._client + + @GoogleCloudBaseHook.catch_http_exception + @GoogleCloudBaseHook.fallback_to_default_project_id + def cancel_dlp_job( + self, dlp_job_id, project_id=None, retry=None, timeout=None, metadata=None + ): + """ + Starts asynchronous cancellation on a long-running DLP job. + + :param dlp_job_id: ID of the DLP job resource to be cancelled. + :type dlp_job_id: str + :param project_id: (Optional) Google Cloud Platform project ID where the + DLP Instance exists. If set to None or missing, the default project_id + from the GCP connection is used. + :type project_id: str + :param retry: (Optional) A retry object used to retry requests. + If None is specified, requests will not be retried. + :type retry: google.api_core.retry.Retry + :param timeout: (Optional) The amount of time, in seconds, to wait for the request + to complete. Note that if retry is specified, the timeout applies to each + individual attempt. + :type timeout: float + :param metadata: (Optional) Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]]] + """ + + client = self.get_conn() + + if not dlp_job_id: + raise AirflowException( + "Please provide the ID of the DLP job resource to be cancelled." 
+ ) + + name = DlpServiceClient.dlp_job_path(project_id, dlp_job_id) + client.cancel_dlp_job( + name=name, retry=retry, timeout=timeout, metadata=metadata + ) + + @GoogleCloudBaseHook.catch_http_exception + def create_deidentify_template( + self, + organization_id=None, + project_id=None, + deidentify_template=None, + template_id=None, + retry=None, + timeout=None, + metadata=None, + ): + """ + Creates a deidentify template for re-using frequently used configuration for + de-identifying content, images, and storage. + + :param organization_id: (Optional) The organization ID. Required to set this + field if parent resource is an organzation. + :type organization_id: str + :param project_id: (Optional) Google Cloud Platform project ID where the + DLP Instance exists. Only set this field if the parent resource is + a project instead of an organzation. + :type project_id: str + :param deidentify_template: (Optional) The deidentify template to create. + :type deidentify_template: dict or google.cloud.dlp_v2.types.DeidentifyTemplate + :param template_id: (Optional) The template ID. + :type template_id: str + :param retry: (Optional) A retry object used to retry requests. + If None is specified, requests will not be retried. + :type retry: google.api_core.retry.Retry + :param timeout: (Optional) The amount of time, in seconds, to wait for the request + to complete. Note that if retry is specified, the timeout applies to each + individual attempt. + :type timeout: float + :param metadata: (Optional) Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]]] + :rtype: google.cloud.dlp_v2.types.DeidentifyTemplate + """ + + client = self.get_conn() + + if organization_id: + parent = DlpServiceClient.organization_path(organization_id) + elif project_id: + parent = DlpServiceClient.project_path(project_id) + else: + raise AirflowException( + "Please provide either organization_id or project_id." 
+ ) + + return client.create_deidentify_template( + parent=parent, + deidentify_template=deidentify_template, + template_id=template_id, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + + @GoogleCloudBaseHook.catch_http_exception + @GoogleCloudBaseHook.fallback_to_default_project_id + def create_dlp_job( + self, + project_id=None, + inspect_job=None, + risk_job=None, + job_id=None, + retry=None, + timeout=None, + metadata=None, + wait_until_finished=True + ): + """ + Creates a new job to inspect storage or calculate risk metrics. + + :param project_id: (Optional) Google Cloud Platform project ID where the + DLP Instance exists. If set to None or missing, the default + project_id from the GCP connection is used. + :type project_id: str + :param inspect_job: (Optional) The configuration for the inspect job. + :type inspect_job: dict or google.cloud.dlp_v2.types.InspectJobConfig + :param risk_job: (Optional) The configuration for the risk job. + :type risk_job: dict or google.cloud.dlp_v2.types.RiskAnalysisJobConfig + :param job_id: (Optional) The job ID. + :type job_id: str + :param retry: (Optional) A retry object used to retry requests. + If None is specified, requests will not be retried. + :type retry: google.api_core.retry.Retry + :param timeout: (Optional) The amount of time, in seconds, to wait for the request + to complete. Note that if retry is specified, the timeout applies to each + individual attempt. + :type timeout: float + :param metadata: (Optional) Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]]] + :param wait_until_finished: (Optional) If true, it will keep polling the job state + until it is set to DONE. 
+ :type wait_until_finished: bool + :rtype: google.cloud.dlp_v2.types.DlpJob + """ + + client = self.get_conn() + + if not project_id: + raise AirflowException("Please provide the project_id.") + + parent = DlpServiceClient.project_path(project_id) + job = client.create_dlp_job( + parent=parent, + inspect_job=inspect_job, + risk_job=risk_job, + job_id=job_id, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + + if wait_until_finished: + pattern = re.compile(DLP_JOB_PATH_PATTERN, re.IGNORECASE) + match = pattern.match(job.name) + job_name = match.groupdict()['job'] + + while wait_until_finished: + job = self.get_dlp_job( + dlp_job_id=job_name, + project_id=project_id) + + self.log.info( + 'DLP job {} state: {}.'.format( + job.name, + DlpJob.JobState.Name(job.state) + ) + ) + + if job.state == DlpJob.JobState.DONE: + return job + elif job.state in [DlpJob.JobState.PENDING, + DlpJob.JobState.RUNNING, + DlpJob.JobState.JOB_STATE_UNSPECIFIED]: + time.sleep(TIME_TO_SLEEP_IN_SECONDS) + else: + raise AirflowException( + 'Stopped polling DLP job state. DLP job {} state: {}.' + .format( + job.name, + DlpJob.JobState.Name(job.state) + ) + ) + return job + + @GoogleCloudBaseHook.catch_http_exception + def create_inspect_template( + self, + organization_id=None, + project_id=None, + inspect_template=None, + template_id=None, + retry=None, + timeout=None, + metadata=None, + ): + """ + Creates an inspect template for re-using frequently used configuration for + inspecting content, images, and storage. + + :param organization_id: (Optional) The organization ID. Required to set this + field if parent resource is an organzation. + :type organization_id: str + :param project_id: (Optional) Google Cloud Platform project ID where the + DLP Instance exists. Only set this field if the parent resource is + a project instead of an organzation. + :type project_id: str + :param inspect_template: (Optional) The inspect template to create. 
+ :type inspect_template: dict or google.cloud.dlp_v2.types.InspectTemplate + :param template_id: (Optional) The template ID. + :type template_id: str + :param retry: (Optional) A retry object used to retry requests. + If None is specified, requests will not be retried. + :type retry: google.api_core.retry.Retry + :param timeout: (Optional) The amount of time, in seconds, to wait for the request + to complete. Note that if retry is specified, the timeout applies to each + individual attempt. + :type timeout: float + :param metadata: (Optional) Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]]] + :rtype: google.cloud.dlp_v2.types.InspectTemplate + """ + + client = self.get_conn() + + if organization_id: + parent = DlpServiceClient.organization_path(organization_id) + elif project_id: + parent = DlpServiceClient.project_path(project_id) + else: + raise AirflowException( + "Please provide either organization_id or project_id." + ) + + return client.create_inspect_template( + parent=parent, + inspect_template=inspect_template, + template_id=template_id, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + + @GoogleCloudBaseHook.catch_http_exception + @GoogleCloudBaseHook.fallback_to_default_project_id + def create_job_trigger( + self, + project_id=None, + job_trigger=None, + trigger_id=None, + retry=None, + timeout=None, + metadata=None, + ): + """ + Creates a job trigger to run DLP actions such as scanning storage for sensitive + information on a set schedule. + + :param project_id: (Optional) Google Cloud Platform project ID where the + DLP Instance exists. If set to None or missing, the default + project_id from the GCP connection is used. + :type project_id: str + :param job_trigger: (Optional) The job trigger to create. + :type job_trigger: dict or google.cloud.dlp_v2.types.JobTrigger + :param trigger_id: (Optional) The job trigger ID. 
+ :type trigger_id: str + :param retry: (Optional) A retry object used to retry requests. + If None is specified, requests will not be retried. + :type retry: google.api_core.retry.Retry + :param timeout: (Optional) The amount of time, in seconds, to wait for the request + to complete. Note that if retry is specified, the timeout applies to each + individual attempt. + :type timeout: float + :param metadata: (Optional) Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]]] + :rtype: google.cloud.dlp_v2.types.JobTrigger + """ + + client = self.get_conn() + + if not project_id: + raise AirflowException("Please provide the project_id.") + + parent = DlpServiceClient.project_path(project_id) + return client.create_job_trigger( + parent=parent, + job_trigger=job_trigger, + trigger_id=trigger_id, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + + @GoogleCloudBaseHook.catch_http_exception + def create_stored_info_type( + self, + organization_id=None, + project_id=None, + config=None, + stored_info_type_id=None, + retry=None, + timeout=None, + metadata=None, + ): + """ + Creates a pre-built stored info type to be used for inspection. + + :param organization_id: (Optional) The organization ID. Required to set this + field if parent resource is an organzation. + :type organization_id: str + :param project_id: (Optional) Google Cloud Platform project ID where the + DLP Instance exists. Only set this field if the parent resource is + a project instead of an organzation. + :type project_id: str + :param config: (Optional) The config for the stored info type. + :type config: dict or google.cloud.dlp_v2.types.StoredInfoTypeConfig + :param stored_info_type_id: (Optional) The stored info type ID. + :type stored_info_type_id: str + :param retry: (Optional) A retry object used to retry requests. + If None is specified, requests will not be retried. 
+ :type retry: google.api_core.retry.Retry + :param timeout: (Optional) The amount of time, in seconds, to wait for the request + to complete. Note that if retry is specified, the timeout applies to each + individual attempt. + :type timeout: float + :param metadata: (Optional) Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]]] + :rtype: google.cloud.dlp_v2.types.StoredInfoType + """ + + client = self.get_conn() + + if organization_id: + parent = DlpServiceClient.organization_path(organization_id) + elif project_id: + parent = DlpServiceClient.project_path(project_id) + else: + raise AirflowException( + "Please provide either organization_id or project_id." + ) + + return client.create_stored_info_type( + parent=parent, + config=config, + stored_info_type_id=stored_info_type_id, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + + @GoogleCloudBaseHook.catch_http_exception + @GoogleCloudBaseHook.fallback_to_default_project_id + def deidentify_content( + self, + project_id=None, + deidentify_config=None, + inspect_config=None, + item=None, + inspect_template_name=None, + deidentify_template_name=None, + retry=None, + timeout=None, + metadata=None, + ): + """ + De-identifies potentially sensitive info from a content item. This method has limits + on input size and output size. + + :param project_id: (Optional) Google Cloud Platform project ID where the + DLP Instance exists. If set to None or missing, the default + project_id from the GCP connection is used. + :type project_id: str + :param deidentify_config: (Optional) Configuration for the de-identification of the + content item. Items specified here will override the template referenced by the + deidentify_template_name argument. + :type deidentify_config: dict or google.cloud.dlp_v2.types.DeidentifyConfig + :param inspect_config: (Optional) Configuration for the inspector. 
Items specified + here will override the template referenced by the inspect_template_name argument. + :type inspect_config: dict or google.cloud.dlp_v2.types.InspectConfig + :param item: (Optional) The item to de-identify. Will be treated as text. + :type item: dict or google.cloud.dlp_v2.types.ContentItem + :param inspect_template_name: (Optional) Optional template to use. Any configuration + directly specified in inspect_config will override those set in the template. + :type inspect_template_name: str + :param deidentify_template_name: (Optional) Optional template to use. Any + configuration directly specified in deidentify_config will override those set + in the template. + :type deidentify_template_name: str + :param retry: (Optional) A retry object used to retry requests. + If None is specified, requests will not be retried. + :type retry: google.api_core.retry.Retry + :param timeout: (Optional) The amount of time, in seconds, to wait for the request + to complete. Note that if retry is specified, the timeout applies to each + individual attempt. + :type timeout: float + :param metadata: (Optional) Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]]] + :rtype: google.cloud.dlp_v2.types.DeidentifyContentResponse + """ + + client = self.get_conn() + + if not project_id: Review comment: This looks like dead code, because the ``fallback_to_default_project_id`` decorator prevents this part of the code from being executed. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected] With regards, Apache Git Services
