[ https://issues.apache.org/jira/browse/AIRFLOW-3213?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16695352#comment-16695352 ]

ASF GitHub Bot commented on AIRFLOW-3213:
-----------------------------------------

kaxil closed pull request #4134: [AIRFLOW-3213] Create ADLS to GCS operator
URL: https://github.com/apache/incubator-airflow/pull/4134

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

diff --git a/airflow/contrib/operators/adls_to_gcs.py 
b/airflow/contrib/operators/adls_to_gcs.py
new file mode 100644
index 0000000000..affbd45626
--- /dev/null
+++ b/airflow/contrib/operators/adls_to_gcs.py
@@ -0,0 +1,146 @@
+# -*- coding: utf-8 -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+from tempfile import NamedTemporaryFile
+
+from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
+from airflow.contrib.operators.adls_list_operator import AzureDataLakeStorageListOperator
+from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook, _parse_gcs_url
+from airflow.utils.decorators import apply_defaults
+
+
+class AdlsToGoogleCloudStorageOperator(AzureDataLakeStorageListOperator):
+    """
+    Synchronizes an Azure Data Lake Storage path with a GCS bucket
+
+    :param src_adls: The Azure Data Lake path to find the objects (templated)
+    :type src_adls: str
+    :param dest_gcs: The Google Cloud Storage bucket and prefix to
+        store the objects. (templated)
+    :type dest_gcs: str
+    :param replace: If true, replaces same-named files in GCS
+    :type replace: bool
+    :param azure_data_lake_conn_id: The connection ID to use when
+        connecting to Azure Data Lake Storage.
+    :type azure_data_lake_conn_id: str
+    :param google_cloud_storage_conn_id: The connection ID to use when
+        connecting to Google Cloud Storage.
+    :type google_cloud_storage_conn_id: str
+    :param delegate_to: The account to impersonate, if any.
+        For this to work, the service account making the request must have
+        domain-wide delegation enabled.
+    :type delegate_to: str
+
+    **Examples**:
+        The following Operator would copy a single file named
+        ``hello/world.avro`` from ADLS to the GCS bucket ``mybucket``. Its full
+        resulting GCS path will be ``gs://mybucket/hello/world.avro`` ::
+            copy_single_file = AdlsToGoogleCloudStorageOperator(
+                task_id='copy_single_file',
+                src_adls='hello/world.avro',
+                dest_gcs='gs://mybucket',
+                replace=False,
+                azure_data_lake_conn_id='azure_data_lake_default',
+                google_cloud_storage_conn_id='google_cloud_default'
+            )
+
+        The following Operator would copy all parquet files from ADLS
+        to the GCS bucket ``mybucket``. ::
+            copy_all_files = AdlsToGoogleCloudStorageOperator(
+                task_id='copy_all_files',
+                src_adls='*.parquet',
+                dest_gcs='gs://mybucket',
+                replace=False,
+                azure_data_lake_conn_id='azure_data_lake_default',
+                google_cloud_storage_conn_id='google_cloud_default'
+            )
+
+        The following Operator would copy all parquet files from the ADLS
+        path ``hello/world`` to the GCS bucket ``mybucket``. ::
+            copy_world_files = AdlsToGoogleCloudStorageOperator(
+                task_id='copy_world_files',
+                src_adls='hello/world/*.parquet',
+                dest_gcs='gs://mybucket',
+                replace=False,
+                azure_data_lake_conn_id='azure_data_lake_default',
+                google_cloud_storage_conn_id='google_cloud_default'
+            )
+    """
+    template_fields = ('src_adls', 'dest_gcs')
+    ui_color = '#f0eee4'
+
+    @apply_defaults
+    def __init__(self,
+                 src_adls,
+                 dest_gcs,
+                 azure_data_lake_conn_id,
+                 google_cloud_storage_conn_id,
+                 delegate_to=None,
+                 replace=False,
+                 *args,
+                 **kwargs):
+
+        super(AdlsToGoogleCloudStorageOperator, self).__init__(
+            path=src_adls,
+            azure_data_lake_conn_id=azure_data_lake_conn_id,
+            *args,
+            **kwargs
+        )
+        self.src_adls = src_adls
+        self.dest_gcs = dest_gcs
+        self.replace = replace
+        self.google_cloud_storage_conn_id = google_cloud_storage_conn_id
+        self.delegate_to = delegate_to
+
+    def execute(self, context):
+        # use the super to list all files in an Azure Data Lake path
+        files = super(AdlsToGoogleCloudStorageOperator, self).execute(context)
+        g_hook = GoogleCloudStorageHook(
+            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
+            delegate_to=self.delegate_to)
+
+        if not self.replace:
+            # if we are not replacing -> list all files in the ADLS path
+            # and only keep those files which are present in
+            # ADLS and not in Google Cloud Storage
+            bucket_name, prefix = _parse_gcs_url(self.dest_gcs)
+            existing_files = g_hook.list(bucket=bucket_name, prefix=prefix)
+            files = set(files) - set(existing_files)
+
+        if files:
+            hook = AzureDataLakeHook(
+                azure_data_lake_conn_id=self.azure_data_lake_conn_id
+            )
+
+            for obj in files:
+                with NamedTemporaryFile(mode='wb', delete=True) as f:
+                    hook.download_file(local_path=f.name, remote_path=obj)
+                    f.flush()
+                    dest_gcs_bucket, dest_gcs_prefix = _parse_gcs_url(self.dest_gcs)
+                    dest_path = os.path.join(dest_gcs_prefix, obj)
+                    self.log.info("Saving file to %s", dest_path)
+
+                    g_hook.upload(bucket=dest_gcs_bucket, object=dest_path, filename=f.name)
+
+            self.log.info("All done, uploaded %d files to GCS", len(files))
+        else:
+            self.log.info("In sync, no files needed to be uploaded to GCS")
+
+        return files
diff --git a/docs/code.rst b/docs/code.rst
index bbdbbc8693..288cfbe78f 100644
--- a/docs/code.rst
+++ b/docs/code.rst
@@ -130,6 +130,7 @@ Operators
 .. Alphabetize this list
 
 .. autoclass:: airflow.contrib.operators.adls_list_operator.AzureDataLakeStorageListOperator
+.. autoclass:: airflow.contrib.operators.adls_to_gcs.AdlsToGoogleCloudStorageOperator
 .. autoclass:: airflow.contrib.operators.aws_athena_operator.AWSAthenaOperator
 .. autoclass:: airflow.contrib.operators.awsbatch_operator.AWSBatchOperator
 .. autoclass:: airflow.contrib.operators.bigquery_check_operator.BigQueryCheckOperator
diff --git a/docs/integration.rst b/docs/integration.rst
index 471ce74ed4..00027f1311 100644
--- a/docs/integration.rst
+++ b/docs/integration.rst
@@ -171,6 +171,7 @@ login (=Client ID), password (=Client Secret) and extra fields tenant (Tenant) a
 
 - :ref:`AzureDataLakeHook`: Interface with Azure Data Lake.
 - :ref:`AzureDataLakeStorageListOperator`: Lists the files located in a specified Azure Data Lake path.
+- :ref:`AdlsToGoogleCloudStorageOperator`: Copies files from an Azure Data Lake path to a Google Cloud Storage bucket.
 
 .. _AzureDataLakeHook:
 
@@ -186,6 +187,13 @@ AzureDataLakeStorageListOperator
 
 .. autoclass:: airflow.contrib.operators.adls_list_operator.AzureDataLakeStorageListOperator
 
+.. _AdlsToGoogleCloudStorageOperator:
+
+AdlsToGoogleCloudStorageOperator
+""""""""""""""""""""""""""""""""
+
+.. autoclass:: airflow.contrib.operators.adls_to_gcs.AdlsToGoogleCloudStorageOperator
+
 .. _AWS:
 
 AWS: Amazon Web Services
diff --git a/tests/contrib/operators/test_adls_to_gcs_operator.py b/tests/contrib/operators/test_adls_to_gcs_operator.py
new file mode 100644
index 0000000000..0277f55d55
--- /dev/null
+++ b/tests/contrib/operators/test_adls_to_gcs_operator.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import unittest
+
+from airflow.contrib.hooks.gcs_hook import _parse_gcs_url
+from airflow.contrib.operators.adls_to_gcs import \
+    AdlsToGoogleCloudStorageOperator
+
+try:
+    from unittest import mock
+except ImportError:
+    try:
+        import mock
+    except ImportError:
+        mock = None
+
+TASK_ID = 'test-adls-gcs-operator'
+ADLS_PATH_1 = '*'
+GCS_PATH = 'gs://test/'
+MOCK_FILES = ["test/TEST1.csv", "test/TEST2.csv", "test/path/TEST3.csv",
+              "test/path/PARQUET.parquet", "test/path/PIC.png"]
+AZURE_CONN_ID = 'azure_data_lake_default'
+GCS_CONN_ID = 'google_cloud_default'
+
+
+class AdlsToGoogleCloudStorageOperatorTest(unittest.TestCase):
+    def test_init(self):
+        """Test AdlsToGoogleCloudStorageOperator instance is properly 
initialized."""
+
+        operator = AdlsToGoogleCloudStorageOperator(
+            task_id=TASK_ID,
+            src_adls=ADLS_PATH_1,
+            dest_gcs=GCS_PATH,
+            replace=False,
+            azure_data_lake_conn_id=AZURE_CONN_ID,
+            google_cloud_storage_conn_id=GCS_CONN_ID
+        )
+
+        self.assertEqual(operator.task_id, TASK_ID)
+        self.assertEqual(operator.src_adls, ADLS_PATH_1)
+        self.assertEqual(operator.dest_gcs, GCS_PATH)
+        self.assertEqual(operator.replace, False)
+        self.assertEqual(operator.google_cloud_storage_conn_id, GCS_CONN_ID)
+        self.assertEqual(operator.azure_data_lake_conn_id, AZURE_CONN_ID)
+
+    @mock.patch('airflow.contrib.operators.adls_to_gcs.AzureDataLakeHook')
+    @mock.patch('airflow.contrib.operators.adls_list_operator.AzureDataLakeHook')
+    @mock.patch(
+        'airflow.contrib.operators.adls_to_gcs.GoogleCloudStorageHook')
+    def test_execute(self, gcs_mock_hook, adls_one_mock_hook, adls_two_mock_hook):
+        """Test the execute function when the run is successful."""
+
+        operator = AdlsToGoogleCloudStorageOperator(
+            task_id=TASK_ID,
+            src_adls=ADLS_PATH_1,
+            dest_gcs=GCS_PATH,
+            replace=False,
+            azure_data_lake_conn_id=AZURE_CONN_ID,
+            google_cloud_storage_conn_id=GCS_CONN_ID
+        )
+
+        adls_one_mock_hook.return_value.list.return_value = MOCK_FILES
+        adls_two_mock_hook.return_value.list.return_value = MOCK_FILES
+
+        def _assert_upload(bucket, object, filename):
+            gcs_bucket, gcs_object_path = _parse_gcs_url(GCS_PATH)
+
+            self.assertEqual(gcs_bucket, 'test')
+            self.assertIn(object[len(gcs_object_path):], MOCK_FILES)
+
+        gcs_mock_hook.return_value.upload.side_effect = _assert_upload
+
+        uploaded_files = operator.execute(None)
+
+        adls_one_mock_hook.assert_called_once_with(azure_data_lake_conn_id=AZURE_CONN_ID)
+        adls_two_mock_hook.assert_called_once_with(azure_data_lake_conn_id=AZURE_CONN_ID)
+        gcs_mock_hook.assert_called_once_with(
+            google_cloud_storage_conn_id=GCS_CONN_ID, delegate_to=None)
+
+        # we expect MOCK_FILES to be uploaded
+        self.assertEqual(sorted(MOCK_FILES), sorted(uploaded_files))
+
+
+if __name__ == '__main__':
+    unittest.main()
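
Note: both the operator and the test above rely on the _parse_gcs_url helper from airflow.contrib.hooks.gcs_hook to split the dest_gcs URL into a bucket and an object prefix. A minimal sketch of that assumption (the values shown are what the test's assertions imply, not output captured from this PR):

    from airflow.contrib.hooks.gcs_hook import _parse_gcs_url

    # Assumed split of a gs:// URL into (bucket, prefix); the test's
    # GCS_PATH = 'gs://test/' is expected to yield bucket 'test'.
    bucket, prefix = _parse_gcs_url('gs://mybucket/hello/world')
    # assumed result: bucket == 'mybucket', prefix == 'hello/world'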


 


> Create ADLS to GCS operator 
> ----------------------------
>
>                 Key: AIRFLOW-3213
>                 URL: https://issues.apache.org/jira/browse/AIRFLOW-3213
>             Project: Apache Airflow
>          Issue Type: Improvement
>          Components: gcp, operators
>            Reporter: Brandon Kvarda
>            Assignee: Brandon Kvarda
>            Priority: Minor
>
> Create ADLS to GCS operator that supports copying of files from ADLS to GCS
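
A minimal DAG sketch of how the new operator would be used for that copy (the DAG id, schedule, and start date below are illustrative placeholders; the operator arguments follow the docstring examples in the diff above):

    from datetime import datetime

    from airflow import DAG
    from airflow.contrib.operators.adls_to_gcs import AdlsToGoogleCloudStorageOperator

    with DAG(dag_id='adls_to_gcs_example',        # hypothetical DAG id
             start_date=datetime(2018, 11, 1),
             schedule_interval='@daily') as dag:

        copy_parquet_files = AdlsToGoogleCloudStorageOperator(
            task_id='copy_parquet_files',
            src_adls='hello/world/*.parquet',     # ADLS glob to copy
            dest_gcs='gs://mybucket',             # destination bucket (and optional prefix)
            replace=False,                        # skip objects already present in GCS
            azure_data_lake_conn_id='azure_data_lake_default',
            google_cloud_storage_conn_id='google_cloud_default'
        )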


