[ 
https://issues.apache.org/jira/browse/BEAM-9258?focusedWorklogId=387233&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-387233
 ]

ASF GitHub Bot logged work on BEAM-9258:
----------------------------------------

                Author: ASF GitHub Bot
            Created on: 14/Feb/20 09:18
            Start Date: 14/Feb/20 09:18
    Worklog Time Spent: 10m 
      Work Description: mwalenia commented on pull request #10849: [BEAM-9258] 
Integrate Google Cloud Data loss prevention functionality for Python SDK
URL: https://github.com/apache/beam/pull/10849#discussion_r379323072
 
 

 ##########
 File path: sdks/python/apache_beam/ml/gcp/cloud_dlp.py
 ##########
 @@ -0,0 +1,224 @@
+#  /*
+#   * Licensed to the Apache Software Foundation (ASF) under one
+#   * or more contributor license agreements.  See the NOTICE file
+#   * distributed with this work for additional information
+#   * regarding copyright ownership.  The ASF licenses this file
+#   * to you under the Apache License, Version 2.0 (the
+#   * "License"); you may not use this file except in compliance
+#   * with the License.  You may obtain a copy of the License at
+#   *
+#   *     http://www.apache.org/licenses/LICENSE-2.0
+#   *
+#   * Unless required by applicable law or agreed to in writing, software
+#   * distributed under the License is distributed on an "AS IS" BASIS,
+#   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   * See the License for the specific language governing permissions and
+#   * limitations under the License.
+#   */
+
+"""``PTransforms`` that implement Google Cloud Data Loss Prevention
+    functionality.
+"""
+
+from __future__ import absolute_import
+
+import logging
+
+from google.cloud import dlp_v2
+
+import apache_beam as beam
+from apache_beam.utils import retry
+from apache_beam.utils.annotations import experimental
+
+__all__ = ['MaskDetectedDetails', 'InspectForDetails']
+
+_LOGGER = logging.getLogger(__name__)
+
+
+@experimental()
+class MaskDetectedDetails(beam.PTransform):
+  """Scrubs sensitive information detected in text.
+  The ``PTransform`` returns a ``PCollection`` of ``str``
+  Example usage::
+    pipeline | MaskDetectedDetails(project='example-gcp-project',
+      deidentification_config={
+          'info_type_transformations: {
+              'transformations': [{
+                  'primitive_transformation': {
+                      'character_mask_config': {
+                          'masking_character': '#'
+                      }
+                  }
+              }]
+          }
+      }, inspection_config={'info_types': [{'name': 'EMAIL_ADDRESS'}]})
+  """
+  def __init__(
+      self,
+      project=None,
+      deidentification_template_name=None,
+      deidentification_config=None,
+      inspection_template_name=None,
+      inspection_config=None,
+      timeout=None):
+    """Initializes a :class:`MaskDetectedDetails` transform.
+    Args:
+      project (str): Required. GCP project in which the data processing is
+        to be done
+      deidentification_template_name (str): Either this or
+        `deidentification_config` required. Name of
+        deidentification template to be used on detected sensitive information
+        instances in text.
+      deidentification_config
+        (``Union[dict, google.cloud.dlp_v2.types.DeidentifyConfig]``):
+        Configuration for the de-identification of the content item.
+      inspection_template_name (str): This or `inspection_config` required.
+        Name of inspection template to be used
+        to detect sensitive data in text.
+      inspection_config
+        (``Union[dict, google.cloud.dlp_v2.types.InspectConfig]``):
+        Configuration for the inspector used to detect sensitive data in text.
+      timeout (float): Optional. The amount of time, in seconds, to wait for
+        the request to complete.
+    """
+    self.config = {}
+    self.project = project
+    self.timeout = timeout
+    if project is None:
+      raise ValueError(
+          'GCP project name needs to be specified in "project" property')
+    if deidentification_template_name is not None \
+        and deidentification_config is not None:
+      raise ValueError(
+          'Both deidentification_template_name and '
+          'deidentification_config were specified.'
+          ' Please specify only one of these.')
+    elif deidentification_template_name is None \
+        and deidentification_config is None:
+      raise ValueError(
+          'deidentification_template_name or '
+          'deidentification_config must be specified.')
+    elif deidentification_template_name is not None:
+      self.config['deidentify_template_name'] = deidentification_template_name
+    else:
+      self.config['deidentify_config'] = deidentification_config
+
+    if inspection_template_name is not None and inspection_config is not None:
+      raise ValueError(
+          'Both inspection_template_name and '
+          'inspection_template were specified.'
+          ' Please specify ony one of these.')
+    elif inspection_config is None and inspection_template_name is None:
+      raise ValueError(
+          'inspection_template_name or inspection_config must be specified')
+    elif inspection_template_name is not None:
+      self.config['inspect_template_name'] = inspection_template_name
+    elif inspection_config is not None:
+      self.config['inspect_config'] = inspection_config
+
+  def expand(self, pcoll):
+    return (
+        pcoll
+        | beam.ParDo(_DeidentifyFn(self.config, self.timeout, self.project)))
+
+
+@experimental()
+class InspectForDetails(beam.PTransform):
+  """Inspects input text for sensitive information.
+  the ``PTransform`` returns a ``PCollection`` of
+  ``List[google.cloud.dlp_v2.proto.dlp_pb2.Finding]``
+  Example usage::
+      pipeline | InspectForDetails(project='example-gcp-project',
+                inspection_config={'info_types': [{'name': 'EMAIL_ADDRESS'}]})
+  """
+  def __init__(
+      self,
+      inspection_template_name=None,
+      inspection_config=None,
+      project=None,
+      timeout=None):
+    """Initializes a :class:`InspectForDetails` transform.
+    Args:
+      inspection_template_name (str): This or `inspection_config` required.
+        Name of inspection template to be used
+        to detect sensitive data in text.
+      inspection_config
+        (``Union[dict, google.cloud.dlp_v2.types.InspectConfig]``):
+        Configuration for the inspector used to detect sensitive data in text.
+      project (str): Required. Name of GCP project in which the processing
+        will take place.
+      timeout (float): Optional. The amount of time, in seconds, to wait for
+        the request to complete.
+    """
+    self.project = project
+    self.timeout = timeout
+    self.config = {}
+    if project is None:
+      raise ValueError(
+          'GCP project name needs to be specified in "project" property')
+    if inspection_template_name is not None and inspection_config is not None:
+      raise ValueError(
+          'Both inspection_template_name and '
+          'inspection_template were specified.'
+          ' Please specify ony one of these.')
+    elif inspection_config is None and inspection_template_name is None:
+      raise ValueError(
+          'inspection_template_name or inspection_config must be specified')
+    elif inspection_template_name is not None:
+      self.config['inspect_template_name'] = inspection_template_name
+    elif inspection_config is not None:
+      self.config['inspect_config'] = inspection_config
+
+  def expand(self, pcoll):
+    return pcoll | beam.ParDo(
+        _InspectFn(self.config, self.timeout, self.project))
+
+
+class _DeidentifyFn(beam.DoFn):
+  def __init__(self, config=None, timeout=None, project=None, client=None):
+    self.config = config
+    self.timeout = timeout
+    self.client = client
+    self.project = project
+
+  def start_bundle(self):
+    if self.client is None:
+      self.client = dlp_v2.DlpServiceClient()
+
+  def process(self, element, **kwargs):
+    params = {
 
 Review comment:
   I moved it to `setup()`. Will that be OK?
 
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
-------------------

    Worklog Id:     (was: 387233)
    Time Spent: 1h 50m  (was: 1h 40m)

> [Python] PTransform that connects to Cloud DLP deidentification service
> -----------------------------------------------------------------------
>
>                 Key: BEAM-9258
>                 URL: https://issues.apache.org/jira/browse/BEAM-9258
>             Project: Beam
>          Issue Type: Sub-task
>          Components: io-py-gcp
>            Reporter: Michał Walenia
>            Assignee: Michał Walenia
>            Priority: Major
>          Time Spent: 1h 50m
>  Remaining Estimate: 0h
>




--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to