Repository: incubator-gobblin
Updated Branches:
  refs/heads/master 1213b06e3 -> 3b64cf70f


Add simple distcp job publishing to S3 as an example

Add simple distcp job publishing to S3 as an
example

S3 data push job with encrypted key setting

Pull data from S3 template

Address comments

Closes #1970 from autumnust/s3copu


Project: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/commit/3b64cf70
Tree: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/tree/3b64cf70
Diff: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/diff/3b64cf70

Branch: refs/heads/master
Commit: 3b64cf70fbf0bcbf3bf46cecb248f35da02e5a03
Parents: 1213b06
Author: Lei Sun <[email protected]>
Authored: Sat Jul 29 01:13:04 2017 -0700
Committer: Abhishek Tiwari <[email protected]>
Committed: Sat Jul 29 01:13:04 2017 -0700

----------------------------------------------------------------------
 .../src/main/resources/distcpFromS3.job         | 59 ++++++++++++++++++++
 .../src/main/resources/distcpToS3.job           | 56 +++++++++++++++++++
 2 files changed, 115 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/3b64cf70/gobblin-example/src/main/resources/distcpFromS3.job
----------------------------------------------------------------------
diff --git a/gobblin-example/src/main/resources/distcpFromS3.job 
b/gobblin-example/src/main/resources/distcpFromS3.job
new file mode 100644
index 0000000..430e28c
--- /dev/null
+++ b/gobblin-example/src/main/resources/distcpFromS3.job
@@ -0,0 +1,59 @@
+# ====================================================================
+# Job configurations (can be changed)
+# ====================================================================
+
+job.name=GobblinDatabaseCopyTest
+job.description=Gobblin job for copy to S3
+
+
+
+# Target publishing location for the copy.
+# The folder containing result files that will show up in S3.
+data.publisher.final.dir=<Full local(target) FS Path>
+
+gobblin.dataset.profile.class=gobblin.data.management.copy.CopyableGlobDatasetFinder
+
+#e.g. s3a://gobblinoutput/
+source.filebased.fs.uri=<Full Remote FS Path>
+gobblin.dataset.pattern=<Remote Dataset Pattern>
+
+# For S3 to work, you also need to add hadoop-aws.jar as a dependency in the classpath.
+fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
+source.filebased.encrypted.fs.s3a.access.key=<Encrypted Access Key>
+source.filebased.encrypted.fs.s3a.secret.key=<Encrypted Secret Key>
+fs.s3a.buffer.dir=<Local Copy Buffer>
+# e.g. file:///
+writer.fs.uri=<Local FS Uri>
+gobblin.copy.recursive.update=true
+
+#You can disable ssl if necessary
+#fs.s3a.connection.ssl.enabled=false
+
+# Location of the master password file
+encrypt.key.loc=<Local master password file location>
+
+
+# ====================================================================
+# Distcp configurations (do not change)
+# ====================================================================
+
+type=hadoopJava
+job.class=gobblin.azkaban.AzkabanJobLauncher
+
+extract.namespace=gobblin.copy
+
+source.class=gobblin.data.management.copy.CopySource
+converter.classes=gobblin.converter.IdentityConverter
+writer.builder.class=gobblin.data.management.copy.writer.FileAwareInputStreamDataWriterBuilder
+data.publisher.type=gobblin.data.management.copy.publisher.CopyDataPublisher
+distcp.persist.dir=/tmp/distcp-persist-dir
+
+task.maxretries=0
+workunit.retry.enabled=false
+
+# Intermediate steps configuration.
+work.dir=/tmp/
+state.store.dir=${work.dir}/state-store
+writer.staging.dir=${work.dir}/taskStaging
+writer.output.dir=${work.dir}/taskOutput
+mr.job.root.dir=${work.dir}/working

http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/3b64cf70/gobblin-example/src/main/resources/distcpToS3.job
----------------------------------------------------------------------
diff --git a/gobblin-example/src/main/resources/distcpToS3.job 
b/gobblin-example/src/main/resources/distcpToS3.job
new file mode 100644
index 0000000..4e391e8
--- /dev/null
+++ b/gobblin-example/src/main/resources/distcpToS3.job
@@ -0,0 +1,56 @@
+# ====================================================================
+# Job configurations (can be changed)
+# ====================================================================
+
+job.name=GobblinPushToExternalTest
+job.description=Gobblin job for pushing data to S3
+
+
+
+# target publishing location for copy
+data.publisher.final.dir=<testfolder>
+
+gobblin.dataset.profile.class=gobblin.data.management.copy.CopyableGlobDatasetFinder
+gobblin.dataset.pattern=< Dataset pattern describing the location of your to-be-copied data >
+
+# For S3 to work, you also need to add hadoop-aws.jar as a dependency in the classpath.
+# Note: do not use any version later than **2.6.3** (which is pre-2015, very old),
+# or you will hit messy dependency problems between the Hadoop version and the Amazon dependencies.
+fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
+writer.encrypted.fs.s3a.access.key=<Encrypted Access Key>
+writer.encrypted.fs.s3a.secret.key=<Encrypted Secret Key>
+fs.s3a.buffer.dir=<Local Buffer folder>
+writer.fs.uri=<Full Bucket Path>
+gobblin.copy.recursive.update=true
+
+#You can disable ssl if necessary
+#fs.s3a.connection.ssl.enabled=false
+
+# Encryption-related setting: local master password file (in this example its content is 'gobblin')
+encrypt.key.loc=<Local File Path for master password>
+
+
+# ====================================================================
+# Distcp configurations (do not change)
+# ====================================================================
+
+type=hadoopJava
+job.class=gobblin.azkaban.AzkabanJobLauncher
+
+extract.namespace=gobblin.copy
+
+source.class=gobblin.data.management.copy.CopySource
+converter.classes=gobblin.converter.IdentityConverter
+writer.builder.class=gobblin.data.management.copy.writer.FileAwareInputStreamDataWriterBuilder
+data.publisher.type=gobblin.data.management.copy.publisher.CopyDataPublisher
+distcp.persist.dir=/tmp/distcp-persist-dir
+
+task.maxretries=0
+workunit.retry.enabled=false
+
+# Intermediate steps configuration.
+work.dir=/tmp/
+state.store.dir=${work.dir}/state-store
+writer.staging.dir=${work.dir}/taskStaging
+writer.output.dir=${work.dir}/taskOutput
+mr.job.root.dir=${work.dir}/working

Reply via email to