http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/model_base.py ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/model_base.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/model_base.py new file mode 100644 index 0000000..35e52b8 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/model_base.py @@ -0,0 +1,219 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""ResNet model. 
+ +Related papers: +https://arxiv.org/pdf/1603.05027v2.pdf +https://arxiv.org/pdf/1512.03385v1.pdf +https://arxiv.org/pdf/1605.07146v1.pdf +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + + +class ResNet(object): + """ResNet model.""" + + def __init__(self, is_training, data_format, batch_norm_decay, batch_norm_epsilon): + """ResNet constructor. + + Args: + is_training: if build training or inference model. + data_format: the data_format used during computation. + one of 'channels_first' or 'channels_last'. + """ + self._batch_norm_decay = batch_norm_decay + self._batch_norm_epsilon = batch_norm_epsilon + self._is_training = is_training + assert data_format in ('channels_first', 'channels_last') + self._data_format = data_format + + def forward_pass(self, x): + raise NotImplementedError( + 'forward_pass() is implemented in ResNet sub classes') + + def _residual_v1(self, + x, + kernel_size, + in_filter, + out_filter, + stride, + activate_before_residual=False): + """Residual unit with 2 sub layers, using Plan A for shortcut connection.""" + + del activate_before_residual + with tf.name_scope('residual_v1') as name_scope: + orig_x = x + + x = self._conv(x, kernel_size, out_filter, stride) + x = self._batch_norm(x) + x = self._relu(x) + + x = self._conv(x, kernel_size, out_filter, 1) + x = self._batch_norm(x) + + if in_filter != out_filter: + orig_x = self._avg_pool(orig_x, stride, stride) + pad = (out_filter - in_filter) // 2 + if self._data_format == 'channels_first': + orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]]) + else: + orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]]) + + x = self._relu(tf.add(x, orig_x)) + + tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) + return x + + def _residual_v2(self, + x, + in_filter, + out_filter, + stride, + activate_before_residual=False): + """Residual unit with 2 sub layers with 
preactivation, plan A shortcut.""" + + with tf.name_scope('residual_v2') as name_scope: + if activate_before_residual: + x = self._batch_norm(x) + x = self._relu(x) + orig_x = x + else: + orig_x = x + x = self._batch_norm(x) + x = self._relu(x) + + x = self._conv(x, 3, out_filter, stride) + + x = self._batch_norm(x) + x = self._relu(x) + x = self._conv(x, 3, out_filter, [1, 1, 1, 1]) + + if in_filter != out_filter: + pad = (out_filter - in_filter) // 2 + orig_x = self._avg_pool(orig_x, stride, stride) + if self._data_format == 'channels_first': + orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]]) + else: + orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]]) + + x = tf.add(x, orig_x) + + tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) + return x + + def _bottleneck_residual_v2(self, + x, + in_filter, + out_filter, + stride, + activate_before_residual=False): + """Bottleneck residual unit with 3 sub layers, plan B shortcut.""" + + with tf.name_scope('bottle_residual_v2') as name_scope: + if activate_before_residual: + x = self._batch_norm(x) + x = self._relu(x) + orig_x = x + else: + orig_x = x + x = self._batch_norm(x) + x = self._relu(x) + + x = self._conv(x, 1, out_filter // 4, stride, is_atrous=True) + + x = self._batch_norm(x) + x = self._relu(x) + # pad when stride isn't unit + x = self._conv(x, 3, out_filter // 4, 1, is_atrous=True) + + x = self._batch_norm(x) + x = self._relu(x) + x = self._conv(x, 1, out_filter, 1, is_atrous=True) + + if in_filter != out_filter: + orig_x = self._conv(orig_x, 1, out_filter, stride, is_atrous=True) + x = tf.add(x, orig_x) + + tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) + return x + + def _conv(self, x, kernel_size, filters, strides, is_atrous=False): + """Convolution.""" + + padding = 'SAME' + if not is_atrous and strides > 1: + pad = kernel_size - 1 + pad_beg = pad // 2 + pad_end = pad - pad_beg + if self._data_format == 'channels_first': + x = tf.pad(x, 
[[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]]) + else: + x = tf.pad(x, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]) + padding = 'VALID' + return tf.layers.conv2d( + inputs=x, + kernel_size=kernel_size, + filters=filters, + strides=strides, + padding=padding, + use_bias=False, + data_format=self._data_format) + + def _batch_norm(self, x): + if self._data_format == 'channels_first': + data_format = 'NCHW' + else: + data_format = 'NHWC' + return tf.contrib.layers.batch_norm( + x, + decay=self._batch_norm_decay, + center=True, + scale=True, + epsilon=self._batch_norm_epsilon, + is_training=self._is_training, + fused=True, + data_format=data_format) + + def _relu(self, x): + return tf.nn.relu(x) + + def _fully_connected(self, x, out_dim): + with tf.name_scope('fully_connected') as name_scope: + x = tf.layers.dense(x, out_dim) + + tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) + return x + + def _avg_pool(self, x, pool_size, stride): + with tf.name_scope('avg_pool') as name_scope: + x = tf.layers.average_pooling2d( + x, pool_size, stride, 'SAME', data_format=self._data_format) + + tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) + return x + + def _global_avg_pool(self, x): + with tf.name_scope('global_avg_pool') as name_scope: + assert x.get_shape().ndims == 4 + if self._data_format == 'channels_first': + x = tf.reduce_mean(x, [2, 3]) + else: + x = tf.reduce_mean(x, [1, 2]) + tf.logging.info('image after unit %s: %s', name_scope, x.get_shape()) + return x
http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/zeppelin-notebook-example/Dockerfile.gpu ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/zeppelin-notebook-example/Dockerfile.gpu b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/zeppelin-notebook-example/Dockerfile.gpu new file mode 100644 index 0000000..05d5fe7 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/zeppelin-notebook-example/Dockerfile.gpu @@ -0,0 +1,75 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +FROM nvidia/cuda:9.0-base-ubuntu16.04 + +RUN echo "$LOG_TAG update and install basic packages" && \ + apt-get -y update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + libfreetype6-dev \ + libpng12-dev \ + libzmq3-dev \ + pkg-config \ + rsync \ + software-properties-common \ + unzip \ + vim \ + wget \ + && \ + apt-get install -y locales && \ + locale-gen $LANG && \ + apt-get clean && \ + apt -y autoclean && \ + apt -y dist-upgrade && \ + apt-get install -y build-essential && \ + rm -rf /var/lib/apt/lists/* + +ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 +RUN echo "$LOG_TAG Install java8" && \ + apt-get -y update && \ + apt-get install -y openjdk-8-jdk && \ + rm -rf /var/lib/apt/lists/* + +# Install Zeppelin +ENV Z_VERSION="0.7.3" \ + Z_HOME="/zeppelin" + +RUN echo "$LOG_TAG Download Zeppelin binary" && \ + wget -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz http://archive.apache.org/dist/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz && \ + tar -zxvf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \ + rm -rf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \ + mv /zeppelin-${Z_VERSION}-bin-all ${Z_HOME} +ENV PATH="${Z_HOME}/bin:${PATH}" + +RUN echo "$LOG_TAG Set locale" && \ + echo "LC_ALL=en_US.UTF-8" >> /etc/environment && \ + echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen && \ + echo "LANG=en_US.UTF-8" > /etc/locale.conf && \ + locale-gen en_US.UTF-8 + +ENV LANG=en_US.UTF-8 \ + LC_ALL=en_US.UTF-8 + +COPY zeppelin-site.xml $Z_HOME/conf/zeppelin-site.xml +COPY shiro.ini ${Z_HOME}/conf/shiro.ini +RUN chmod 777 -R ${Z_HOME} + +COPY run_container.sh /usr/local/bin/run_container.sh +RUN chmod 755 /usr/local/bin/run_container.sh + +EXPOSE 8080 +CMD ["/usr/local/bin/run_container.sh"] http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/zeppelin-notebook-example/run_container.sh 
---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/zeppelin-notebook-example/run_container.sh b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/zeppelin-notebook-example/run_container.sh new file mode 100644 index 0000000..8b90920 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/zeppelin-notebook-example/run_container.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"${Z_HOME}/bin/zeppelin-daemon.sh" start +while true; do + #perform the test + sleep 5 +done http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/zeppelin-notebook-example/shiro.ini ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/zeppelin-notebook-example/shiro.ini b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/zeppelin-notebook-example/shiro.ini new file mode 100644 index 0000000..89f976a --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/zeppelin-notebook-example/shiro.ini @@ -0,0 +1,120 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +[users] +# List of users with their password allowed to access Zeppelin. +# To use a different strategy (LDAP / Database / ...) check the shiro doc at http://shiro.apache.org/configuration.html#Configuration-INISections +# To enable admin user, uncomment the following line and set an appropriate password. 
+admin = admin, admin +user1 = password2, role1, role2 +user2 = password3, role3 +user3 = password4, role2 + +# Sample LDAP configuration, for user Authentication, currently tested for single Realm +[main] +### A sample for configuring Active Directory Realm +#activeDirectoryRealm = org.apache.zeppelin.realm.ActiveDirectoryGroupRealm +#activeDirectoryRealm.systemUsername = userNameA + +#use either systemPassword or hadoopSecurityCredentialPath, more details in http://zeppelin.apache.org/docs/latest/security/shiroauthentication.html +#activeDirectoryRealm.systemPassword = passwordA +#activeDirectoryRealm.hadoopSecurityCredentialPath = jceks://file/user/zeppelin/zeppelin.jceks +#activeDirectoryRealm.searchBase = CN=Users,DC=SOME_GROUP,DC=COMPANY,DC=COM +#activeDirectoryRealm.url = ldap://ldap.test.com:389 +#activeDirectoryRealm.groupRolesMap = "CN=admin,OU=groups,DC=SOME_GROUP,DC=COMPANY,DC=COM":"admin","CN=finance,OU=groups,DC=SOME_GROUP,DC=COMPANY,DC=COM":"finance","CN=hr,OU=groups,DC=SOME_GROUP,DC=COMPANY,DC=COM":"hr" +#activeDirectoryRealm.authorizationCachingEnabled = false + +### A sample for configuring LDAP Directory Realm +#ldapRealm = org.apache.zeppelin.realm.LdapGroupRealm +## search base for ldap groups (only relevant for LdapGroupRealm): +#ldapRealm.contextFactory.environment[ldap.searchBase] = dc=COMPANY,dc=COM +#ldapRealm.contextFactory.url = ldap://ldap.test.com:389 +#ldapRealm.userDnTemplate = uid={0},ou=Users,dc=COMPANY,dc=COM +#ldapRealm.contextFactory.authenticationMechanism = simple + +### A sample PAM configuration +#pamRealm=org.apache.zeppelin.realm.PamRealm +#pamRealm.service=sshd + +### A sample for configuring ZeppelinHub Realm +#zeppelinHubRealm = org.apache.zeppelin.realm.ZeppelinHubRealm +## Url of ZeppelinHub +#zeppelinHubRealm.zeppelinhubUrl = https://www.zeppelinhub.com +#securityManager.realms = $zeppelinHubRealm + +## A sample for configuring Knox SSO Realm +#knoxJwtRealm = org.apache.zeppelin.realm.jwt.KnoxJwtRealm 
+#knoxJwtRealm.providerUrl = https://domain.example.com/ +#knoxJwtRealm.login = gateway/knoxsso/knoxauth/login.html +#knoxJwtRealm.logout = gateway/knoxssout/api/v1/webssout +#knoxJwtRealm.logoutAPI = true +#knoxJwtRealm.redirectParam = originalUrl +#knoxJwtRealm.cookieName = hadoop-jwt +#knoxJwtRealm.publicKeyPath = /etc/zeppelin/conf/knox-sso.pem +# +#knoxJwtRealm.groupPrincipalMapping = group.principal.mapping +#knoxJwtRealm.principalMapping = principal.mapping +#authc = org.apache.zeppelin.realm.jwt.KnoxAuthenticationFilter + +sessionManager = org.apache.shiro.web.session.mgt.DefaultWebSessionManager + +### If caching of user is required then uncomment below lines +#cacheManager = org.apache.shiro.cache.MemoryConstrainedCacheManager +#securityManager.cacheManager = $cacheManager + +### Enables 'HttpOnly' flag in Zeppelin cookies +cookie = org.apache.shiro.web.servlet.SimpleCookie +cookie.name = JSESSIONID +cookie.httpOnly = true +### Uncomment the below line only when Zeppelin is running over HTTPS +#cookie.secure = true +sessionManager.sessionIdCookie = $cookie + +securityManager.sessionManager = $sessionManager +# 86,400,000 milliseconds = 24 hour +securityManager.sessionManager.globalSessionTimeout = 86400000 +shiro.loginUrl = /api/login + +[roles] +role1 = * +role2 = * +role3 = * +admin = * + +[urls] +# This section is used for url-based security. For details see the shiro.ini documentation. +# +# You can secure interpreter, configuration and credential information by urls. +# Comment or uncomment the below urls that you want to hide: +# anon means the access is anonymous. +# authc means form based auth Security. +# +# IMPORTANT: Order matters: URL path expressions are evaluated against an incoming request +# in the order they are defined and the FIRST MATCH WINS. 
+# +# To allow anonymous access to all but the stated urls, +# uncomment the second last line (/** = anon) and comment the last line (/** = authc) +# +/api/version = anon +# Allow all authenticated users to restart interpreters on a notebook page. +# Comment out the following line if you would like to authorize only admin users to restart interpreters. +/api/interpreter/setting/restart/** = authc +/api/interpreter/** = authc, roles[admin] +/api/configurations/** = authc, roles[admin] +/api/credential/** = authc, roles[admin] +#/** = anon +/** = authc http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/zeppelin-notebook-example/zeppelin-site.xml ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/zeppelin-notebook-example/zeppelin-site.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/zeppelin-notebook-example/zeppelin-site.xml new file mode 100644 index 0000000..2bde161 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/zeppelin-notebook-example/zeppelin-site.xml @@ -0,0 +1,569 @@ +<?xml version="1.0"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<configuration> + + <property> + <name>zeppelin.server.addr</name> + <value>0.0.0.0</value> + <description>Server address</description> + </property> + + <property> + <name>zeppelin.server.port</name> + <value>8080</value> + <description>Server port.</description> + </property> + + <property> + <name>zeppelin.server.ssl.port</name> + <value>8443</value> + <description>Server ssl port. (used when ssl property is set to true)</description> + </property> + + <property> + <name>zeppelin.server.context.path</name> + <value>/</value> + <description>Context Path of the Web Application</description> + </property> + + <property> + <name>zeppelin.war.tempdir</name> + <value>webapps</value> + <description>Location of jetty temporary directory</description> + </property> + + <property> + <name>zeppelin.notebook.dir</name> + <value>notebook</value> + <description>path or URI for notebook persist</description> + </property> + + <property> + <name>zeppelin.notebook.homescreen</name> + <value></value> + <description>id of notebook to be displayed in homescreen. 
ex) 2A94M5J1Z Empty value displays default home screen</description> + </property> + + <property> + <name>zeppelin.notebook.homescreen.hide</name> + <value>false</value> + <description>hide homescreen notebook from list when this value set to true</description> + </property> + + <property> + <name>zeppelin.notebook.collaborative.mode.enable</name> + <value>true</value> + <description>Enable collaborative mode</description> + </property> + + <!-- Google Cloud Storage notebook storage --> + <!-- + <property> + <name>zeppelin.notebook.gcs.dir</name> + <value></value> + <description> + A GCS path in the form gs://bucketname/path/to/dir. + Notes are stored at {zeppelin.notebook.gcs.dir}/{notebook-id}/note.json + </description> + </property> + + <property> + <name>zeppelin.notebook.storage</name> + <value>org.apache.zeppelin.notebook.repo.GCSNotebookRepo</value> + <description>notebook persistence layer implementation</description> + </property> + --> + + <!-- Amazon S3 notebook storage --> + <!-- Creates the following directory structure: s3://{bucket}/{username}/{notebook-id}/note.json --> + <!-- + <property> + <name>zeppelin.notebook.s3.user</name> + <value>user</value> + <description>user name for s3 folder structure</description> + </property> + + <property> + <name>zeppelin.notebook.s3.bucket</name> + <value>zeppelin</value> + <description>bucket name for notebook storage</description> + </property> + + <property> + <name>zeppelin.notebook.s3.endpoint</name> + <value>s3.amazonaws.com</value> + <description>endpoint for s3 bucket</description> + </property> + + <property> + <name>zeppelin.notebook.storage</name> + <value>org.apache.zeppelin.notebook.repo.S3NotebookRepo</value> + <description>notebook persistence layer implementation</description> + </property> + --> + + <!-- Additionally, encryption is supported for notebook data stored in S3 --> + <!-- Use the AWS KMS to encrypt data --> + <!-- If used, the EC2 role assigned to the EMR cluster must have rights to 
use the given key --> + <!-- See https://aws.amazon.com/kms/ and http://docs.aws.amazon.com/kms/latest/developerguide/concepts.html --> + <!-- + <property> + <name>zeppelin.notebook.s3.kmsKeyID</name> + <value>AWS-KMS-Key-UUID</value> + <description>AWS KMS key ID used to encrypt notebook data in S3</description> + </property> + --> + + <!-- provide region of your KMS key --> + <!-- See http://docs.aws.amazon.com/general/latest/gr/rande.html#kms_region for region codes names --> + <!-- + <property> + <name>zeppelin.notebook.s3.kmsKeyRegion</name> + <value>us-east-1</value> + <description>AWS KMS key region in your AWS account</description> + </property> + --> + + <!-- Use a custom encryption materials provider to encrypt data --> + <!-- No configuration is given to the provider, so you must use system properties or another means to configure --> + <!-- See https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/s3/model/EncryptionMaterialsProvider.html --> + <!-- + <property> + <name>zeppelin.notebook.s3.encryptionMaterialsProvider</name> + <value>provider implementation class name</value> + <description>Custom encryption materials provider used to encrypt notebook data in S3</description> + </property> + --> + + <!-- Server-side encryption enabled for notebooks --> + <!-- + <property> + <name>zeppelin.notebook.s3.sse</name> + <value>true</value> + <description>Server-side encryption enabled for notebooks</description> + </property> + --> + + <!-- Optional override to control which signature algorithm should be used to sign AWS requests --> + <!-- Set this property to "S3SignerType" if your AWS S3 compatible APIs support only AWS Signature Version 2 such as Ceph. 
--> + <!-- + <property> + <name>zeppelin.notebook.s3.signerOverride</name> + <value>S3SignerType</value> + <description>optional override to control which signature algorithm should be used to sign AWS requests</description> + </property> + --> + + <!-- If using Azure for storage use the following settings --> + <!-- + <property> + <name>zeppelin.notebook.azure.connectionString</name> + <value>DefaultEndpointsProtocol=https;AccountName=<accountName>;AccountKey=<accountKey></value> + <description>Azure account credentials</description> + </property> + + <property> + <name>zeppelin.notebook.azure.share</name> + <value>zeppelin</value> + <description>share name for notebook storage</description> + </property> + + <property> + <name>zeppelin.notebook.azure.user</name> + <value>user</value> + <description>optional user name for Azure folder structure</description> + </property> + + <property> + <name>zeppelin.notebook.storage</name> + <value>org.apache.zeppelin.notebook.repo.AzureNotebookRepo</value> + <description>notebook persistence layer implementation</description> + </property> + --> + + <!-- Notebook storage layer using local file system + <property> + <name>zeppelin.notebook.storage</name> + <value>org.apache.zeppelin.notebook.repo.VFSNotebookRepo</value> + <description>local notebook persistence layer implementation</description> + </property> + --> + + <!-- Notebook storage layer using hadoop compatible file system + <property> + <name>zeppelin.notebook.storage</name> + <value>org.apache.zeppelin.notebook.repo.FileSystemNotebookRepo</value> + <description>Hadoop compatible file system notebook persistence layer implementation, such as local file system, hdfs, azure wasb, s3 and etc.</description> + </property> + + <property> + <name>zeppelin.server.kerberos.keytab</name> + <value></value> + <description>keytab for accessing kerberized hdfs</description> + </property> + + <property> + <name>zeppelin.server.kerberos.principal</name> + <value></value> + 
<description>principal for accessing kerberized hdfs</description> + </property> + --> + + <!-- For connecting your Zeppelin with ZeppelinHub --> + <!-- + <property> + <name>zeppelin.notebook.storage</name> + <value>org.apache.zeppelin.notebook.repo.GitNotebookRepo, org.apache.zeppelin.notebook.repo.zeppelinhub.ZeppelinHubRepo</value> + <description>two notebook persistence layers (versioned local + ZeppelinHub)</description> + </property> + --> + + <!-- MongoDB notebook storage --> + <!-- + <property> + <name>zeppelin.notebook.storage</name> + <value>org.apache.zeppelin.notebook.repo.MongoNotebookRepo</value> + <description>notebook persistence layer implementation</description> + </property> + + <property> + <name>zeppelin.notebook.mongo.uri</name> + <value>mongodb://localhost</value> + <description>MongoDB connection URI used to connect to a MongoDB database server</description> + </property> + + <property> + <name>zeppelin.notebook.mongo.database</name> + <value>zeppelin</value> + <description>database name for notebook storage</description> + </property> + + <property> + <name>zeppelin.notebook.mongo.collection</name> + <value>notes</value> + <description>collection name for notebook storage</description> + </property> + + <property> + <name>zeppelin.notebook.mongo.autoimport</name> + <value>false</value> + <description>import local notes into MongoDB automatically on startup</description> + </property> + --> + + <property> + <name>zeppelin.notebook.storage</name> + <value>org.apache.zeppelin.notebook.repo.GitNotebookRepo</value> + <description>versioned notebook persistence layer implementation</description> + </property> + + <property> + <name>zeppelin.notebook.one.way.sync</name> + <value>false</value> + <description>If there are multiple notebook storages, should we treat the first one as the only source of truth?</description> + </property> + + <property> + <name>zeppelin.interpreter.dir</name> + <value>interpreter</value> + <description>Interpreter 
implementation base directory</description> + </property> + + <property> + <name>zeppelin.interpreter.localRepo</name> + <value>local-repo</value> + <description>Local repository for interpreter's additional dependency loading</description> + </property> + + <property> + <name>zeppelin.interpreter.dep.mvnRepo</name> + <value>http://repo1.maven.org/maven2/</value> + <description>Remote principal repository for interpreter's additional dependency loading</description> + </property> + + <property> + <name>zeppelin.dep.localrepo</name> + <value>local-repo</value> + <description>Local repository for dependency loader</description> + </property> + + <property> + <name>zeppelin.helium.node.installer.url</name> + <value>https://nodejs.org/dist/</value> + <description>Remote Node installer url for Helium dependency loader</description> + </property> + + <property> + <name>zeppelin.helium.npm.installer.url</name> + <value>http://registry.npmjs.org/</value> + <description>Remote Npm installer url for Helium dependency loader</description> + </property> + + <property> + <name>zeppelin.helium.yarnpkg.installer.url</name> + <value>https://github.com/yarnpkg/yarn/releases/download/</value> + <description>Remote Yarn package installer url for Helium dependency loader</description> + </property> + + <property> + <name>zeppelin.interpreters</name> + 
<value>org.apache.zeppelin.spark.SparkInterpreter,org.apache.zeppelin.spark.PySparkInterpreter,org.apache.zeppelin.rinterpreter.RRepl,org.apache.zeppelin.rinterpreter.KnitR,org.apache.zeppelin.spark.SparkRInterpreter,org.apache.zeppelin.spark.SparkSqlInterpreter,org.apache.zeppelin.spark.DepInterpreter,org.apache.zeppelin.markdown.Markdown,org.apache.zeppelin.angular.AngularInterpreter,org.apache.zeppelin.shell.ShellInterpreter,org.apache.zeppelin.file.HDFSFileInterpreter,org.apache.zeppelin.flink.FlinkInterpreter,,org.apache.zeppelin.python.PythonInterpreter,org.apache.zeppelin.python.PythonInterpreterPandasSql,org.apache.zeppelin.python.PythonCondaInterpreter,org.apache.zeppelin.python.PythonDockerInterpreter,org.apache.zeppelin.lens.LensInterpreter,org.apache.zeppelin.ignite.IgniteInterpreter,org.apache.zeppelin.ignite.IgniteSqlInterpreter,org.apache.zeppelin.cassandra.CassandraInterpreter,org.apache.zeppelin.geode.GeodeOqlInterpreter,org.apache.zeppelin.jdbc.JDBCInterpreter, org.apache.zeppelin.kylin.KylinInterpreter,org.apache.zeppelin.elasticsearch.ElasticsearchInterpreter,org.apache.zeppelin.scalding.ScaldingInterpreter,org.apache.zeppelin.alluxio.AlluxioInterpreter,org.apache.zeppelin.hbase.HbaseInterpreter,org.apache.zeppelin.livy.LivySparkInterpreter,org.apache.zeppelin.livy.LivyPySparkInterpreter,org.apache.zeppelin.livy.LivyPySpark3Interpreter,org.apache.zeppelin.livy.LivySparkRInterpreter,org.apache.zeppelin.livy.LivySparkSQLInterpreter,org.apache.zeppelin.bigquery.BigQueryInterpreter,org.apache.zeppelin.beam.BeamInterpreter,org.apache.zeppelin.pig.PigInterpreter,org.apache.zeppelin.pig.PigQueryInterpreter,org.apache.zeppelin.scio.ScioInterpreter,org.apache.zeppelin.groovy.GroovyInterpreter</value> + <description>Comma separated interpreter configurations. 
First interpreter become a default</description> + </property> + + <property> + <name>zeppelin.interpreter.group.order</name> + <value>spark,md,angular,sh,livy,alluxio,file,psql,flink,python,ignite,lens,cassandra,geode,kylin,elasticsearch,scalding,jdbc,hbase,bigquery,beam,groovy</value> + <description></description> + </property> + + <property> + <name>zeppelin.interpreter.connect.timeout</name> + <value>30000</value> + <description>Interpreter process connect timeout in msec.</description> + </property> + + <property> + <name>zeppelin.interpreter.output.limit</name> + <value>102400</value> + <description>Output message from interpreter exceeding the limit will be truncated</description> + </property> + + <property> + <name>zeppelin.ssl</name> + <value>false</value> + <description>Should SSL be used by the servers?</description> + </property> + + <property> + <name>zeppelin.ssl.client.auth</name> + <value>false</value> + <description>Should client authentication be used for SSL connections?</description> + </property> + + <property> + <name>zeppelin.ssl.keystore.path</name> + <value>keystore</value> + <description>Path to keystore relative to Zeppelin configuration directory</description> + </property> + + <property> + <name>zeppelin.ssl.keystore.type</name> + <value>JKS</value> + <description>The format of the given keystore (e.g. JKS or PKCS12)</description> + </property> + + <property> + <name>zeppelin.ssl.keystore.password</name> + <value>change me</value> + <description>Keystore password. Can be obfuscated by the Jetty Password tool</description> + </property> + + <!-- + <property> + <name>zeppelin.ssl.key.manager.password</name> + <value>change me</value> + <description>Key Manager password. Defaults to keystore password. Can be obfuscated.</description> + </property> + --> + + <property> + <name>zeppelin.ssl.truststore.path</name> + <value>truststore</value> + <description>Path to truststore relative to Zeppelin configuration directory. 
Defaults to the keystore path</description> + </property> + + <property> + <name>zeppelin.ssl.truststore.type</name> + <value>JKS</value> + <description>The format of the given truststore (e.g. JKS or PKCS12). Defaults to the same type as the keystore type</description> + </property> + + <!-- + <property> + <name>zeppelin.ssl.truststore.password</name> + <value>change me</value> + <description>Truststore password. Can be obfuscated by the Jetty Password tool. Defaults to the keystore password</description> + </property> + --> + + <property> + <name>zeppelin.server.allowed.origins</name> + <value>*</value> + <description>Allowed sources for REST and WebSocket requests (i.e. http://onehost:8080,http://otherhost.com). If you leave * you are vulnerable to https://issues.apache.org/jira/browse/ZEPPELIN-173</description> + </property> + + <property> + <name>zeppelin.anonymous.allowed</name> + <value>false</value> + <description>Anonymous user allowed by default</description> + </property> + + <property> + <name>zeppelin.username.force.lowercase</name> + <value>false</value> + <description>Force convert username case to lower case, useful for Active Directory/LDAP. Default is not to change case</description> + </property> + + <property> + <name>zeppelin.notebook.default.owner.username</name> + <value></value> + <description>Set owner role by default</description> + </property> + + <property> + <name>zeppelin.notebook.public</name> + <value>true</value> + <description>Make notebook public by default when created, private otherwise</description> + </property> + + <property> + <name>zeppelin.websocket.max.text.message.size</name> + <value>1024000</value> + <description>Size in characters of the maximum text message to be received by websocket. 
Defaults to 1024000</description> + </property> + + <property> + <name>zeppelin.server.default.dir.allowed</name> + <value>false</value> + <description>Enable directory listings on server.</description> + </property> + + <!-- + <property> + <name>zeppelin.interpreter.lifecyclemanager.class</name> + <value>org.apache.zeppelin.interpreter.lifecycle.TimeoutLifecycleManager</value> + <description>LifecycleManager class for managing the lifecycle of interpreters, by default interpreter will + be closed after timeout</description> + </property> + + <property> + <name>zeppelin.interpreter.lifecyclemanager.timeout.checkinterval</name> + <value>60000</value> + <description>Milliseconds of the interval to checking whether interpreter is time out</description> + </property> + + <property> + <name>zeppelin.interpreter.lifecyclemanager.timeout.threshold</name> + <value>3600000</value> + <description>Milliseconds of the interpreter timeout threshold, by default it is 1 hour</description> + </property> + --> + + <!-- + <property> + <name>zeppelin.server.jetty.name</name> + <value>Jetty(7.6.0.v20120127)</value> + <description>Hardcoding Application Server name to Prevent Fingerprinting</description> + </property> + --> + + <!-- + <property> + <name>zeppelin.server.jetty.request.header.size</name> + <value>8192</value> + <description>Http Request Header Size Limit (to prevent HTTP 413)</description> + </property> + --> + + <!-- + <property> + <name>zeppelin.server.xframe.options</name> + <value>SAMEORIGIN</value> + <description>The X-Frame-Options HTTP response header can be used to indicate whether or not a browser should be allowed to render a page in a frame/iframe/object.</description> + </property> + --> + + <!-- + <property> + <name>zeppelin.server.strict.transport</name> + <value>max-age=631138519</value> + <description>The HTTP Strict-Transport-Security response header is a security feature that lets a web site tell browsers that it should only be communicated with using 
HTTPS, instead of using HTTP. Enable this when Zeppelin is running on HTTPS. Value is in Seconds, the default value is equivalent to 20 years.</description> + </property> + --> + <!-- + + <property> + <name>zeppelin.server.xxss.protection</name> + <value>1</value> + <description>The HTTP X-XSS-Protection response header is a feature of Internet Explorer, Chrome and Safari that stops pages from loading when they detect reflected cross-site scripting (XSS) attacks. When value is set to 1 and a cross-site scripting attack is detected, the browser will sanitize the page (remove the unsafe parts).</description> + </property> + --> + + <!-- + <property> + <name>zeppelin.interpreter.callback.portRange</name> + <value>10000:10010</value> + </property> + --> + + <!-- + <property> + <name>zeppelin.recovery.storage.class</name> + <value>org.apache.zeppelin.interpreter.recovery.FileSystemRecoveryStorage</value> + <description>RecoveryStorage implementation</description> + </property> + --> + + <!-- + <property> + <name>zeppelin.recovery.dir</name> + <value>recovery</value> + <description>Location where recovery metadata is stored</description> + </property> + --> + + <!-- GitHub configurations + <property> + <name>zeppelin.notebook.git.remote.url</name> + <value></value> + <description>remote Git repository URL</description> + </property> + + <property> + <name>zeppelin.notebook.git.remote.username</name> + <value>token</value> + <description>remote Git repository username</description> + </property> + + <property> + <name>zeppelin.notebook.git.remote.access-token</name> + <value></value> + <description>remote Git repository password</description> + </property> + + <property> + <name>zeppelin.notebook.git.remote.origin</name> + <value>origin</value> + <description>Git repository remote</description> + </property> + + <property> + <name>zeppelin.notebook.cron.enable</name> + <value>false</value> + <description>Notebook enable cron scheduler feature</description> + </property> + 
<property> + <name>zeppelin.notebook.cron.folders</name> + <value></value> + <description>Notebook cron folders</description> + </property> + --> +</configuration> http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/DeveloperGuide.md ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/DeveloperGuide.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/DeveloperGuide.md deleted file mode 100644 index ce26ea7..0000000 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/DeveloperGuide.md +++ /dev/null @@ -1,26 +0,0 @@ -<!--- - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. See accompanying LICENSE file. ---> - -# Developper Guide - -(Need add more details) - -By default, submarine uses YARN service framework as runtime. If you want to add your own implementation. You can add a new `RuntimeFactory` implementation and configure following option to `submarine.xml` (which should be placed under same `$HADOOP_CONF_DIR`) - -``` -<property> - <name>submarine.runtime.class</name> - <value>... full qualified class name for your runtime factory ... 
</value> -</property> -``` http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/QuickStart.md ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/QuickStart.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/QuickStart.md deleted file mode 100644 index b720b5a..0000000 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/QuickStart.md +++ /dev/null @@ -1,134 +0,0 @@ -<!--- - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. See accompanying LICENSE file. ---> - -# Quick Start Guide - -## Prerequisite - -Must: -- Apache Hadoop 3.1.0, YARN service enabled. - -Optional: -- Enable YARN DNS. (When distributed training required.) -- Enable GPU on YARN support. (When GPU-based training required.) - -## Run jobs - -### Commandline options - -```$xslt -usage: job run - -checkpoint_path <arg> Training output directory of the job, could - be local or other FS directory. 
This - typically includes checkpoint files and - exported model - -docker_image <arg> Docker image name/tag - -env <arg> Common environment variable of worker/ps - -input_path <arg> Input of the job, could be local or other FS - directory - -name <arg> Name of the job - -num_ps <arg> Number of PS tasks of the job, by default - it's 0 - -num_workers <arg> Numnber of worker tasks of the job, by - default it's 1 - -ps_docker_image <arg> Specify docker image for PS, when this is - not specified, PS uses --docker_image as - default. - -ps_launch_cmd <arg> Commandline of worker, arguments will be - directly used to launch the PS - -ps_resources <arg> Resource of each PS, for example - memory-mb=2048,vcores=2,yarn.io/gpu=2 - -queue <arg> Name of queue to run the job, by default it - uses default queue - -saved_model_path <arg> Model exported path (savedmodel) of the job, - which is needed when exported model is not - placed under ${checkpoint_path}could be - local or other FS directory. This will be - used to serve. - -tensorboard <arg> Should we run TensorBoard for this job? By - default it's true - -verbose Print verbose log for troubleshooting - -wait_job_finish Specified when user want to wait the job - finish - -worker_docker_image <arg> Specify docker image for WORKER, when this - is not specified, WORKER uses --docker_image - as default. 
- -worker_launch_cmd <arg> Commandline of worker, arguments will be - directly used to launch the worker - -worker_resources <arg> Resource of each worker, for example - memory-mb=2048,vcores=2,yarn.io/gpu=2 -``` - -### Launch Standalone Tensorflow Application: - -#### Commandline -``` -yarn jar path-to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job run \ - --env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \ - --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name tf-job-001 \ - --docker_image <your-docker-image> \ - --input_path hdfs://default/dataset/cifar-10-data \ - --checkpoint_path hdfs://default/tmp/cifar-10-jobdir \ - --worker_resources memory=4G,vcores=2,gpu=2 \ - --worker_launch_cmd "python ... (Your training application cmd)" -``` - -#### Notes: - -1) `DOCKER_JAVA_HOME` points to JAVA_HOME inside Docker image. -2) `DOCKER_HADOOP_HDFS_HOME` points to HADOOP_HDFS_HOME inside Docker image. -3) `--worker_resources` can include gpu when you need GPU to train your task. - -### Launch Distributed Tensorflow Application: - -#### Commandline - -``` -yarn jar hadoop-yarn-applications-submarine-<version>.jar job run \ - --name tf-job-001 --docker_image <your docker image> \ - --input_path hdfs://default/dataset/cifar-10-data \ - --checkpoint_path hdfs://default/tmp/cifar-10-jobdir \ - --env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \ - --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \ - --num_workers 2 \ - --worker_resources memory=8G,vcores=2,gpu=1 --worker_launch_cmd "cmd for worker ..." \ - --num_ps 2 \ - --ps_resources memory=4G,vcores=2,gpu=0 --ps_launch_cmd "cmd for ps" \ -``` - -#### Notes: - -1) Very similar to standalone TF application, but you need to specify #worker/#ps -2) Different resources can be specified for worker and PS. -3) `TF_CONFIG` environment will be auto generated and set before executing user's launch command. 
- -## Run jobs - -### Get Job Status - -``` -yarn jar hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job show --name tf-job-001 -``` - -Output looks like: -``` -Job Meta Info: - Application Id: application_1532131617202_0005 - Input Path: hdfs://default/dataset/cifar-10-data - Checkpoint Path: hdfs://default/tmp/cifar-10-jobdir - Run Parameters: --name tf-job-001 --docker_image wtan/tf-1.8.0-gpu:0.0.3 - (... all your commandline before run the job) -``` - -After that, you can run ```tensorboard --logdir=<checkpoint-path>``` to view Tensorboard of the job. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/DeveloperGuide.md ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/DeveloperGuide.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/DeveloperGuide.md new file mode 100644 index 0000000..76e3ae0 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/DeveloperGuide.md @@ -0,0 +1,24 @@ +<!--- + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. See accompanying LICENSE file. +--> + +# Developer Guide + +By default, submarine uses YARN service framework as runtime. If you want to add your own implementation. 
You can add a new `RuntimeFactory` implementation and configure the following option in `submarine.xml` (which should be placed under the same `$HADOOP_CONF_DIR`) + +``` +<property> + <name>submarine.runtime.class</name> + <value>... fully qualified class name for your runtime factory ... </value> +</property> +``` http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/Examples.md ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/Examples.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/Examples.md new file mode 100644 index 0000000..3e7f02f --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/Examples.md @@ -0,0 +1,21 @@ +<!--- + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. See accompanying LICENSE file. +--> + +# Examples + +Here are some examples of Submarine usage. 
+ +[Running Distributed CIFAR 10 Tensorflow Job](RunningDistributedCifar10TFJobs.html) + +[Running Zeppelin Notebook on YARN](RunningZeppelinOnYARN.html) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/Index.md ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/Index.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/Index.md new file mode 100644 index 0000000..0b78a87 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/Index.md @@ -0,0 +1,42 @@ +<!--- + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. See accompanying LICENSE file. +--> + +Submarine is a project which allows infra engineers / data scientists to run *unmodified* Tensorflow programs on YARN. + +Goals of Submarine: + +- It allows jobs easy access to data/models in HDFS and other storage systems. + +- Can launch services to serve Tensorflow/MXNet models. + +- Support running distributed Tensorflow jobs with simple configs. + +- Support running user-specified Docker images. + +- Support specifying GPU and other resources. + +- Support launching tensorboard for training jobs if the user requests it. 
+ +- Support customized DNS name for roles (like tensorboard.$user.$domain:6006) + + +Click below contents if you want to understand more. + +- [QuickStart Guide](QuickStart.html) + +- [Examples](Examples.html) + +- [How to write Dockerfile for Submarine jobs](WriteDockerfile.html) + +- [Developer guide](DeveloperGuide.html) http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/QuickStart.md ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/QuickStart.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/QuickStart.md new file mode 100644 index 0000000..da4fb95 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/QuickStart.md @@ -0,0 +1,174 @@ +<!--- + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. See accompanying LICENSE file. +--> + +# Quick Start Guide + +## Prerequisite + +Must: + +- Apache Hadoop 3.1.x, YARN service enabled. + +Optional: + +- Enable YARN DNS. (When distributed training is required.) +- Enable GPU on YARN support. (When GPU-based training is required.) + +## Run jobs + +### Commandline options + +```$xslt +usage: job run + -checkpoint_path <arg> Training output directory of the job, could + be local or other FS directory. 
This + typically includes checkpoint files and + exported model + -docker_image <arg> Docker image name/tag + -env <arg> Common environment variable of worker/ps + -input_path <arg> Input of the job, could be local or other FS + directory + -name <arg> Name of the job + -num_ps <arg> Number of PS tasks of the job, by default + it's 0 + -num_workers <arg> Numnber of worker tasks of the job, by + default it's 1 + -ps_docker_image <arg> Specify docker image for PS, when this is + not specified, PS uses --docker_image as + default. + -ps_launch_cmd <arg> Commandline of worker, arguments will be + directly used to launch the PS + -ps_resources <arg> Resource of each PS, for example + memory-mb=2048,vcores=2,yarn.io/gpu=2 + -queue <arg> Name of queue to run the job, by default it + uses default queue + -saved_model_path <arg> Model exported path (savedmodel) of the job, + which is needed when exported model is not + placed under ${checkpoint_path}could be + local or other FS directory. This will be + used to serve. + -tensorboard <arg> Should we run TensorBoard for this job? By + default it's true + -verbose Print verbose log for troubleshooting + -wait_job_finish Specified when user want to wait the job + finish + -worker_docker_image <arg> Specify docker image for WORKER, when this + is not specified, WORKER uses --docker_image + as default. 
+ -worker_launch_cmd <arg> Commandline of worker, arguments will be + directly used to launch the worker + -worker_resources <arg> Resource of each worker, for example + memory-mb=2048,vcores=2,yarn.io/gpu=2 +``` + +### Launch Standalone Tensorflow Application: + +#### Commandline +``` +yarn jar path-to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job run \ + --env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \ + --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name tf-job-001 \ + --docker_image <your-docker-image> \ + --input_path hdfs://default/dataset/cifar-10-data \ + --checkpoint_path hdfs://default/tmp/cifar-10-jobdir \ + --worker_resources memory=4G,vcores=2,gpu=2 \ + --worker_launch_cmd "python ... (Your training application cmd)" \ + --tensorboard # this will launch a companion tensorboard container for monitoring +``` + +#### Notes: + +1) `DOCKER_JAVA_HOME` points to JAVA_HOME inside the Docker image. + +2) `DOCKER_HADOOP_HDFS_HOME` points to HADOOP_HDFS_HOME inside the Docker image. + +3) `--worker_resources` can include gpu when you need GPU to train your task. + +4) When `--tensorboard` is specified, you can go to the new YARN UI, go to services -> `<your specified service>` -> click `...` to access Tensorboard. + +This will launch a Tensorboard to monitor *all your jobs*. From the YARN UI (the new UI), go to the services page, open the `tensorboard-service`, and click the quick link (`Tensorboard`) to reach the Tensorboard. 
+ +See the screenshot below: + +![alt text](./images/tensorboard-service.png "Tensorboard service") + +### Launch Distributed Tensorflow Application: + +#### Commandline + +``` +yarn jar hadoop-yarn-applications-submarine-<version>.jar job run \ + --name tf-job-001 --docker_image <your docker image> \ + --input_path hdfs://default/dataset/cifar-10-data \ + --checkpoint_path hdfs://default/tmp/cifar-10-jobdir \ + --env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \ + --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \ + --num_workers 2 \ + --worker_resources memory=8G,vcores=2,gpu=1 --worker_launch_cmd "cmd for worker ..." \ + --num_ps 2 \ + --ps_resources memory=4G,vcores=2,gpu=0 --ps_launch_cmd "cmd for ps" +``` + +#### Notes: + +1) This is very similar to the standalone TF application, but you need to specify the number of workers and parameter servers (`--num_workers` / `--num_ps`). + +2) Different resources can be specified for worker and PS. + +3) The `TF_CONFIG` environment variable will be auto-generated and set before executing the user's launch command. + +## Get job history / logs + +### Get Job Status from CLI + +``` +yarn jar hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job show --name tf-job-001 +``` + +Output looks like: +``` +Job Meta Info: + Application Id: application_1532131617202_0005 + Input Path: hdfs://default/dataset/cifar-10-data + Checkpoint Path: hdfs://default/tmp/cifar-10-jobdir + Run Parameters: --name tf-job-001 --docker_image wtan/tf-1.8.0-gpu:0.0.3 + (... all your commandline before run the job) +``` + +After that, you can run ```tensorboard --logdir=<checkpoint-path>``` to view Tensorboard of the job. 
+ +### Run tensorboard to monitor your jobs + +``` +# Cleanup previous service if needed +yarn app -destroy tensorboard-service; \ +yarn jar /tmp/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \ + job run --name tensorboard-service --verbose --docker_image wtan/tf-1.8.0-cpu:0.0.3 \ + --env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \ + --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \ + --num_workers 0 --tensorboard +``` + +You can view the training history of multiple jobs from the `Tensorboard` link: + +![alt text](./images/multiple-tensorboard-jobs.png "Tensorboard for multiple jobs") + + +### Get component logs from a training job + +There are two ways to get training job logs. One is from the YARN UI (new or old): + +![alt text](./images/job-logs-ui.png "Job logs UI") + +Or you can use `yarn logs -applicationId <applicationId>` to get logs from the CLI http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/RunningDistributedCifar10TFJobs.md ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/RunningDistributedCifar10TFJobs.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/RunningDistributedCifar10TFJobs.md new file mode 100644 index 0000000..127c80f --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/RunningDistributedCifar10TFJobs.md @@ -0,0 +1,162 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +# Tutorial: Running Distributed Cifar10 Tensorflow Estimator Example. + +## Prepare data for training + +CIFAR-10 is a common benchmark in machine learning for image recognition. The example below is based on the CIFAR-10 dataset. + +1) Check out https://github.com/tensorflow/models/: +``` +git clone https://github.com/tensorflow/models/ +``` + +2) Go to `models/tutorials/image/cifar10_estimator` + +3) Generate data by using the following command (requires Tensorflow to be installed): + +``` +python generate_cifar10_tfrecords.py --data-dir=cifar-10-data +``` + +4) Upload data to HDFS + +``` +hadoop fs -put cifar-10-data/ /dataset/cifar-10-data +``` + +**Please note that:** + +YARN service doesn't allow multiple services with the same name, so please run the following command +``` +yarn application -destroy <service-name> +``` +to delete services if you want to reuse the same service name. + +## Prepare Docker images + +Refer to [Write Dockerfile](WriteDockerfile.html) to build a Docker image or use a prebuilt one. 
+ +## Run Tensorflow jobs + +### Run standalone training + +``` +yarn jar path/to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \ + job run --name tf-job-001 --verbose --docker_image hadoopsubmarine/tf-1.8.0-gpu:0.0.1 \ + --input_path hdfs://default/dataset/cifar-10-data \ + --env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \ + --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \ + --num_workers 1 --worker_resources memory=8G,vcores=2,gpu=1 \ + --worker_launch_cmd "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --train-steps=10000 --eval-batch-size=16 --train-batch-size=16 --num-gpus=2 --sync" \ + --tensorboard --tensorboard_docker_image wtan/tf-1.8.0-cpu:0.0.3 +``` + +Explanations: + +- When access to HDFS is required, two environment variables must be set: `DOCKER_JAVA_HOME` and `DOCKER_HADOOP_HDFS_HOME`. They indicate JAVA_HOME and HDFS_HOME *inside the Docker image*, which are needed to access the libhdfs libraries. We will try to eliminate specifying this in the future. +- The Docker image for worker and tensorboard can be specified separately. In this case, Tensorboard doesn't need GPU, so we use a CPU Docker image for Tensorboard. (The same applies to the parameter server in the distributed example below). 
+ +### Run distributed training + +``` +yarn jar path/to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \ + job run --name tf-job-001 --verbose --docker_image hadoopsubmarine/tf-1.8.0-gpu:0.0.1 \ + --input_path hdfs://default/dataset/cifar-10-data \ + --env(s) (same as standalone) \ + --num_workers 2 \ + --worker_resources memory=8G,vcores=2,gpu=1 \ + --worker_launch_cmd "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --train-steps=10000 --eval-batch-size=16 --train-batch-size=16 --num-gpus=2 --sync" \ + --ps_docker_image wtan/tf-1.8.0-cpu:0.0.3 \ + --num_ps 1 --ps_resources memory=4G,vcores=2,gpu=0 \ + --ps_launch_cmd "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --num-gpus=0" \ + --tensorboard --tensorboard_docker_image wtan/tf-1.8.0-cpu:0.0.3 +``` + +Explanations: + +- A `num_workers` value greater than 1 indicates distributed training. +- Parameters / resources / Docker image of the parameter server can be specified separately. In many cases, the parameter server doesn't require GPU. + +*Outputs of distributed training* + +Sample output of master: +``` +... +allow_soft_placement: true +, '_tf_random_seed': None, '_task_type': u'master', '_environment': u'cloud', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fe77cb15050>, '_tf_config': gpu_options { + per_process_gpu_memory_fraction: 1.0 +} +... 
+2018-05-06 22:29:14.656022: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> localhost:8000} +2018-05-06 22:29:14.656097: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> ps-0.distributed-tf.root.tensorflow.site:8000} +2018-05-06 22:29:14.656112: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> worker-0.distributed-tf.root.tensorflow.site:8000} +2018-05-06 22:29:14.659359: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000 +... +INFO:tensorflow:Restoring parameters from hdfs://default/tmp/cifar-10-jobdir/model.ckpt-0 +INFO:tensorflow:Evaluation [1/625] +INFO:tensorflow:Evaluation [2/625] +INFO:tensorflow:Evaluation [3/625] +INFO:tensorflow:Evaluation [4/625] +INFO:tensorflow:Evaluation [5/625] +INFO:tensorflow:Evaluation [6/625] +... +INFO:tensorflow:Validation (step 1): loss = 1220.6445, global_step = 1, accuracy = 0.1 +INFO:tensorflow:loss = 6.3980675, step = 0 +INFO:tensorflow:loss = 6.3980675, learning_rate = 0.1 +INFO:tensorflow:global_step/sec: 2.34092 +INFO:tensorflow:Average examples/sec: 1931.22 (1931.22), step = 100 +INFO:tensorflow:Average examples/sec: 354.236 (38.6479), step = 110 +INFO:tensorflow:Average examples/sec: 211.096 (38.7693), step = 120 +INFO:tensorflow:Average examples/sec: 156.533 (38.1633), step = 130 +INFO:tensorflow:Average examples/sec: 128.6 (38.7372), step = 140 +INFO:tensorflow:Average examples/sec: 111.533 (39.0239), step = 150 +``` + +Sample output of worker: +``` +, '_tf_random_seed': None, '_task_type': u'worker', '_environment': u'cloud', '_is_chief': False, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc2a490b050>, '_tf_config': gpu_options { + per_process_gpu_memory_fraction: 1.0 +} +... 
+2018-05-06 22:28:45.807936: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> master-0.distributed-tf.root.tensorflow.site:8000} +2018-05-06 22:28:45.808040: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> ps-0.distributed-tf.root.tensorflow.site:8000} +2018-05-06 22:28:45.808064: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> localhost:8000} +2018-05-06 22:28:45.809919: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000 +... +INFO:tensorflow:loss = 5.319096, step = 0 +INFO:tensorflow:loss = 5.319096, learning_rate = 0.1 +INFO:tensorflow:Average examples/sec: 49.2338 (49.2338), step = 10 +INFO:tensorflow:Average examples/sec: 52.117 (55.3589), step = 20 +INFO:tensorflow:Average examples/sec: 53.2754 (55.7541), step = 30 +INFO:tensorflow:Average examples/sec: 53.8388 (55.6028), step = 40 +INFO:tensorflow:Average examples/sec: 54.1082 (55.2134), step = 50 +INFO:tensorflow:Average examples/sec: 54.3141 (55.3676), step = 60 +``` + +Sample output of ps: +``` +... +, '_tf_random_seed': None, '_task_type': u'ps', '_environment': u'cloud', '_is_chief': False, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f4be54dff90>, '_tf_config': gpu_options { + per_process_gpu_memory_fraction: 1.0 +} +... 
+2018-05-06 22:28:42.562316: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> master-0.distributed-tf.root.tensorflow.site:8000} +2018-05-06 22:28:42.562408: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> localhost:8000} +2018-05-06 22:28:42.562433: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> worker-0.distributed-tf.root.tensorflow.site:8000} +2018-05-06 22:28:42.564242: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000 +``` http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/RunningZeppelinOnYARN.md ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/RunningZeppelinOnYARN.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/RunningZeppelinOnYARN.md new file mode 100644 index 0000000..e06526c --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/RunningZeppelinOnYARN.md @@ -0,0 +1,37 @@ +<!--- + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. See accompanying LICENSE file. 
+--> + +# Running Zeppelin Notebook On Submarine + +This is a simple example of how to run a Zeppelin notebook using Submarine. + +## Step 1: Build Docker Image + +Go to `src/main/docker/zeppelin-notebook-example`, build the Docker image. Or you can use the prebuilt one: `hadoopsubmarine/zeppelin-on-yarn-gpu:0.0.1` + +## Step 2: Launch the notebook on YARN + +Submit command to YARN: + +`yarn app -destroy zeppelin-notebook; +yarn jar path-to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \ + job run --name zeppelin-notebook \ + --docker_image hadoopsubmarine/zeppelin-on-yarn-gpu:0.0.1 \ + --worker_resources memory=8G,vcores=2,gpu=1 \ + --num_workers 1 \ + --worker_launch_cmd "/usr/local/bin/run_container.sh"` + +Once the container is launched, you can go to the `YARN services` UI page, access the `zeppelin-notebook` job, and go to the quicklink `notebook` by clicking `...`. + +The notebook is secured by admin/admin user name and password. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/WriteDockerfile.md ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/WriteDockerfile.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/WriteDockerfile.md new file mode 100644 index 0000000..79aac8d --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/markdown/WriteDockerfile.md @@ -0,0 +1,117 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +# Creating Docker Images for Running Tensorflow on YARN + +## How to create docker images to run Tensorflow on YARN + +A Dockerfile to run Tensorflow on YARN needs two parts: + +**Base libraries which Tensorflow depends on** + +1) OS base image, for example ```ubuntu:16.04``` + +2) Libraries and packages that Tensorflow depends on, for example ```python``` and ```scipy```. GPU support also needs ```cuda```, ```cudnn```, etc. + +3) Tensorflow package.
+ +**Libraries to access HDFS** + +1) JDK + +2) Hadoop + +Here's an example of a base image (w/o GPU support) to install Tensorflow: +``` +FROM ubuntu:16.04 + +# Pick up some TF dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + libfreetype6-dev \ + libpng12-dev \ + libzmq3-dev \ + pkg-config \ + python \ + python-dev \ + rsync \ + software-properties-common \ + unzip \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +RUN pip --no-cache-dir install \ + Pillow \ + h5py \ + ipykernel \ + jupyter \ + matplotlib \ + numpy \ + pandas \ + scipy \ + sklearn \ + && \ + python -m ipykernel.kernelspec + +RUN pip --no-cache-dir install \ + http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl +``` + +On top of above image, add files, install packages to access HDFS +``` +RUN apt-get update && apt-get install -y openjdk-8-jdk wget +RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz +RUN tar zxf hadoop-3.1.0.tar.gz +``` + +Build and push to your own docker registry: Use ```docker build ... ``` and ```docker push ...``` to finish this step. + +## Use examples to build your own Tensorflow docker images + +We provided following examples for you to build tensorflow docker images. + +For Tensorflow 1.8.0 (Precompiled to CUDA 9.x) + +- *docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0*: Tensorflow 1.8.0 supports CPU only. +- *docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0*: Tensorflow 1.8.0 supports CPU only, and included models +- *docker/base/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0*: Tensorflow 1.8.0 supports GPU, which is prebuilt to CUDA9. +- *docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.8.0*: Tensorflow 1.8.0 supports GPU, which is prebuilt to CUDA9, with models. 
+ +## Build Docker images + +### Manually build Docker image: + +Under the `docker/` directory, run `build-all.sh` to build Docker images. It will build the following images: + +- `tf-1.8.0-gpu-base:0.0.1` for base Docker image which includes Hadoop, Tensorflow, GPU base libraries. +- `tf-1.8.0-cpu-base:0.0.1` for base Docker image which includes Hadoop, Tensorflow (cpu only). +- `tf-1.8.0-gpu:0.0.1` which includes cifar10 model +- `tf-1.8.0-cpu:0.0.1` which includes cifar10 model (cpu only). + +### Use prebuilt images + +(No liability) +You can also use prebuilt images for convenience: + +- hadoopsubmarine/tf-1.8.0-gpu:0.0.1 +- hadoopsubmarine/tf-1.8.0-cpu:0.0.1 http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/resources/css/site.css ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/resources/css/site.css b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/resources/css/site.css new file mode 100644 index 0000000..7315db3 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/resources/css/site.css @@ -0,0 +1,29 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License.
You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +#banner { + height: 93px; + background: none; +} + +#bannerLeft img { + margin-left: 30px; + margin-top: 10px; +} + +#bannerRight img { + margin: 17px; +} http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/resources/images/job-logs-ui.png ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/resources/images/job-logs-ui.png b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/resources/images/job-logs-ui.png new file mode 100644 index 0000000..18b9e17 Binary files /dev/null and b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/resources/images/job-logs-ui.png differ http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/resources/images/multiple-tensorboard-jobs.png ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/resources/images/multiple-tensorboard-jobs.png b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/resources/images/multiple-tensorboard-jobs.png new file mode 100644 index 0000000..8e3db79 Binary files /dev/null and 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/resources/images/multiple-tensorboard-jobs.png differ http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/resources/images/tensorboard-service.png ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/resources/images/tensorboard-service.png b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/resources/images/tensorboard-service.png new file mode 100644 index 0000000..3251d74 Binary files /dev/null and b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/resources/images/tensorboard-service.png differ http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/site.xml ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/site.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/site.xml new file mode 100644 index 0000000..5feae9a --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/site/site.xml @@ -0,0 +1,28 @@ +<!-- + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. See accompanying LICENSE file. +--> +<project name="Apache Hadoop ${project.version}"> + + <skin> + <groupId>org.apache.maven.skins</groupId> + <artifactId>maven-stylus-skin</artifactId> + <version>${maven-stylus-skin.version}</version> + </skin> + + <body> + <links> + <item name="Apache Hadoop" href="http://hadoop.apache.org/"/> + </links> + </body> + +</project> --------------------------------------------------------------------- To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org For additional commands, e-mail: common-commits-h...@hadoop.apache.org