Repository: incubator-gobblin Updated Branches: refs/heads/master 1213b06e3 -> 3b64cf70f
Add simple distcp job publishing to S3 as an example Add simple distcp job publishing to S3 as an example S3 data push job with encrypted key setting Pull data from S3 template Address comments Closes #1970 from autumnust/s3copu Project: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/commit/3b64cf70 Tree: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/tree/3b64cf70 Diff: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/diff/3b64cf70 Branch: refs/heads/master Commit: 3b64cf70fbf0bcbf3bf46cecb248f35da02e5a03 Parents: 1213b06 Author: Lei Sun <[email protected]> Authored: Sat Jul 29 01:13:04 2017 -0700 Committer: Abhishek Tiwari <[email protected]> Committed: Sat Jul 29 01:13:04 2017 -0700 ---------------------------------------------------------------------- .../src/main/resources/distcpFromS3.job | 59 ++++++++++++++++++++ .../src/main/resources/distcpToS3.job | 56 +++++++++++++++++++ 2 files changed, 115 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/3b64cf70/gobblin-example/src/main/resources/distcpFromS3.job ---------------------------------------------------------------------- diff --git a/gobblin-example/src/main/resources/distcpFromS3.job b/gobblin-example/src/main/resources/distcpFromS3.job new file mode 100644 index 0000000..430e28c --- /dev/null +++ b/gobblin-example/src/main/resources/distcpFromS3.job @@ -0,0 +1,59 @@ +# ==================================================================== +# Job configurations (can be changed) +# ==================================================================== + +job.name=GobblinDatabaseCopyTest +job.description=Gobblin job for copy to S3 + + + +# target publishing location for copy +# The folder containing result files that will show up in the s3 +data.publisher.final.dir=<Full local(target) FS Path> + 
+gobblin.dataset.profile.class=gobblin.data.management.copy.CopyableGlobDatasetFinder + +#e.g. s3a://gobblinoutput/ +source.filebased.fs.uri=<Full Remote FS Path> +gobblin.dataset.pattern=<Remote Dataset Pattern> + +# For S3 to work, you also need to add hadoop-aws.jar as a dependency in the classpath. +fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem +source.filebased.encrypted.fs.s3a.access.key=<Encrypted Access Key> +source.filebased.encrypted.fs.s3a.secret.key=<Encrypted Secret key> +fs.s3a.buffer.dir=<Local Copy Buffer> +# e.g. file:/// +writer.fs.uri=<Local FS Uri> +gobblin.copy.recursive.update=true + +# You can disable SSL if necessary +#fs.s3a.connection.ssl.enabled=false + +# Location of the master password file +encrypt.key.loc=<Local master password file location> + + +# ==================================================================== +# Distcp configurations (do not change) +# ==================================================================== + +type=hadoopJava +job.class=gobblin.azkaban.AzkabanJobLauncher + +extract.namespace=gobblin.copy + +source.class=gobblin.data.management.copy.CopySource +converter.classes=gobblin.converter.IdentityConverter +writer.builder.class=gobblin.data.management.copy.writer.FileAwareInputStreamDataWriterBuilder +data.publisher.type=gobblin.data.management.copy.publisher.CopyDataPublisher +distcp.persist.dir=/tmp/distcp-persist-dir + +task.maxretries=0 +workunit.retry.enabled=false + +# Intermediate steps configuration. 
+work.dir=/tmp/ +state.store.dir=${work.dir}/state-store +writer.staging.dir=${work.dir}/taskStaging +writer.output.dir=${work.dir}/taskOutput +mr.job.root.dir=${work.dir}/working http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/3b64cf70/gobblin-example/src/main/resources/distcpToS3.job ---------------------------------------------------------------------- diff --git a/gobblin-example/src/main/resources/distcpToS3.job b/gobblin-example/src/main/resources/distcpToS3.job new file mode 100644 index 0000000..4e391e8 --- /dev/null +++ b/gobblin-example/src/main/resources/distcpToS3.job @@ -0,0 +1,56 @@ +# ==================================================================== +# Job configurations (can be changed) +# ==================================================================== + +job.name=GobblinPushToExternalTest +job.description=Gobblin job for pushing data to S3 + + + +# target publishing location for copy +data.publisher.final.dir=<testfolder> + +gobblin.dataset.profile.class=gobblin.data.management.copy.CopyableGlobDatasetFinder +gobblin.dataset.pattern=< Dataset pattern describing your to-be-copied data's location > + +# For S3 to work, you also need to add hadoop-aws.jar as a dependency in the classpath. +# Note: do not use any version later than **2.6.3** (which means pre-2015, very old), +# or you will run into messy dependency problems between the Hadoop version and the Amazon dependencies. 
+fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem +writer.encrypted.fs.s3a.access.key=<Encrypted Access Key> +writer.encrypted.fs.s3a.secret.key=<Encrypted Secret Key> +fs.s3a.buffer.dir=<Local Buffer folder> +writer.fs.uri=<Full Bucket Path> +gobblin.copy.recursive.update=true + +# You can disable SSL if necessary +#fs.s3a.connection.ssl.enabled=false + +# Master password file used to decrypt the encrypted keys above; in this example its content is 'gobblin' +encrypt.key.loc=<Local File Path for master password> + + +# ==================================================================== +# Distcp configurations (do not change) +# ==================================================================== + +type=hadoopJava +job.class=gobblin.azkaban.AzkabanJobLauncher + +extract.namespace=gobblin.copy + +source.class=gobblin.data.management.copy.CopySource +converter.classes=gobblin.converter.IdentityConverter +writer.builder.class=gobblin.data.management.copy.writer.FileAwareInputStreamDataWriterBuilder +data.publisher.type=gobblin.data.management.copy.publisher.CopyDataPublisher +distcp.persist.dir=/tmp/distcp-persist-dir + +task.maxretries=0 +workunit.retry.enabled=false + +# Intermediate steps configuration. +work.dir=/tmp/ +state.store.dir=${work.dir}/state-store +writer.staging.dir=${work.dir}/taskStaging +writer.output.dir=${work.dir}/taskOutput +mr.job.root.dir=${work.dir}/working
