Repository: incubator-singa
Updated Branches:
  refs/heads/master 29de86337 -> c3a248a4b
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
new file mode 100644
index 0000000..7c462d2
--- /dev/null
+++ b/src/proto/job.proto
@@ -0,0 +1,461 @@
+package singa;
+
+message JobProto {
+  required ClusterProto cluster = 1;
+  required ModelProto model = 2;
+}
+
+message ClusterProto {
+  optional int32 nworker_groups = 1;
+  optional int32 nserver_groups = 2;
+  optional int32 nworkers_per_group = 3 [default = 1];
+  optional int32 nservers_per_group = 4 [default = 1];
+  optional int32 nworkers_per_procs = 5 [default = 1];
+  optional int32 nservers_per_procs = 6 [default = 1];
+
+  // servers and workers in different processes?
+  optional bool server_worker_separate = 11 [default = false];
+
+  // port number is used by ZeroMQ
+  optional int32 start_port = 13 [default = 6723];
+  // local workspace, train/val/test shards, checkpoint files
+  optional string workspace = 14 [default = "workspace"];
+
+  // conduct updates at server side; otherwise do it at worker side
+  optional bool server_update = 40 [default = true];
+  // share memory space between worker groups in one process
+  optional bool share_memory = 41 [default = true];
+
+  // bandwidth of ethernet, in bytes per second; the default is ~1 Gbps
+  optional int32 bandwidth = 50 [default = 134217728];
+  // poll time in milliseconds
+  optional int32 poll_time = 51 [default = 100];
+}
+
+enum Phase {
+  kTrain = 0;
+  kValidation = 1;
+  kTest = 2;
+  // positive phase for the contrastive divergence algorithm
+  kPositive = 3;
+  // negative phase for the contrastive divergence algorithm
+  kNegative = 4;
+  kForward = 5;
+  kBackward = 6;
+}
+
+message ModelProto {
+  // model name, e.g., "cifar10-dcnn", "mnist-mlp"
+  required string name = 1;
+  // frequency of displaying training info
+  required int32 display_frequency = 3;
+  // total num of steps for training
+  required int32 train_steps = 5;
+  // configuration of SGD updater, including learning rate, etc.
+  required UpdaterProto updater = 7;
+  enum GradCalcAlg {
+    // BP algorithm for feed-forward models, e.g., CNN, MLP, RNN
+    kBackPropagation = 1;
+    // CD algorithm for RBM, DBM, etc. models
+    kContrastiveDivergence = 2;
+  }
+  // gradient calculation algorithm
+  required GradCalcAlg alg = 8 [default = kBackPropagation];
+  required NetProto neuralnet = 9;
+
+  // total num of steps for validation
+  optional int32 validation_steps = 30 [default = 0];
+  // total num of steps for test
+  optional int32 test_steps = 31 [default = 0];
+  // frequency of validation
+  optional int32 validation_frequency = 32;
+  // frequency of test
+  optional int32 test_frequency = 33 [default = 0];
+  // frequency of checkpoint
+  optional int32 checkpoint_frequency = 34 [default = 0];
+  // send parameters to servers after training for this num of steps
+  optional int32 warmup_steps = 35 [default = 0];
+  // resume training from the last checkpoint if true
+  optional bool resume = 36 [default = false];
+
+  // start display after this num of steps
+  optional int32 display_after = 60 [default = 0];
+  // start checkpoint after this num of steps
+  optional int32 checkpoint_after = 61 [default = 0];
+  // start test after this num of steps
+  optional int32 test_after = 62 [default = 0];
+  // start validation after this num of steps
+  optional int32 validation_after = 63 [default = 0];
+  // last snapshot step
+  optional int32 step = 64 [default = 0];
+  // display debug info
+  optional bool debug = 65 [default = false];
+  // checkpoint files
+  repeated string checkpoint = 66;
+  // reset the version of params loaded from the checkpoint file to step
+  optional bool reset_param_version = 67 [default = false];
+}
+
+message NetProto {
+  repeated LayerProto layer = 1;
+  // partitioning type for parallelism
+  optional int32 partition_dim = 2 [default = 0];
+}
+
+// weight matrix should be defined before bias vector
+message ParamProto {
+  enum InitMethod {
+    // fix the values of all parameters to the constant in the value field
+    kConstant = 0;
+    // sample from a Gaussian with std and mean
+    kGaussian = 1;
+    // uniform sampling between low and high
+    kUniform = 2;
+    // copy the content and history from a previous training run
+    kPretrained = 3;
+    // from Toronto Convnet, let a=1/sqrt(fan_in); w*=a after generating from
+    // the Gaussian distribution
+    kGaussainSqrtFanIn = 4;
+    // from Toronto Convnet, rectified linear activation, let
+    // a=sqrt(3)/sqrt(fan_in); range is [-a, +a]; no need to set value=sqrt(3),
+    // the program will multiply it.
+    kUniformSqrtFanIn = 5;
+    // from the Theano MLP tutorial, let a=sqrt(6/(fan_in+fan_out)); for tanh
+    // activation the range is [-a, +a], for sigmoid activation the range is
+    // [-4a, +4a]; put the scale factor in the value field.
+    // <a href="http://deeplearning.net/tutorial/mlp.html"> Theano MLP</a>
+    kUniformSqrtFanInOut = 6;
+  }
+  optional InitMethod init_method = 1 [default = kGaussian];
+  // constant init
+  optional float value = 5 [default = 1];
+  // for uniform sampling
+  optional float low = 6 [default = -1];
+  optional float high = 7 [default = 1];
+  // for gaussian sampling
+  optional float mean = 8 [default = 0];
+  optional float std = 9 [default = 1];
+  // multiplied on the global learning rate
+  optional float learning_rate_multiplier = 15 [default = 1];
+  // multiplied on the global weight decay
+  optional float weight_decay_multiplier = 16 [default = 1];
+  // partition dimension, -1 for no partition
+  optional int32 partition_dim = 30;
+  // usually, the program will infer the param shape
+  repeated int32 shape = 31;
+  // used for identifying the same params from different models and for
+  // displaying debug info
+  optional string name = 61 [default = ""];
+  // name of the owner param from which this param shares the values
+  optional string share_from = 62;
+  // used internally
+  optional int32 id = 63;
+  // parameter slice limit (Google Protobuf also has a size limit)
+  optional int32 split_threshold = 64 [default = 5000000];
+  // used internally
+  optional int32 owner = 65 [default = -1];
+}
+
+enum PartitionType {
+  kDataPartition = 0;
+  kLayerPartition = 1;
+  kNone = 2;
+}
+
+message LayerProto {
+  // the layer name used for identification
+  required string name = 1;
+  enum LayerType {
+    kBridgeSrc = 15;
+    kBridgeDst = 16;
+    kConvolution = 1;
+    kConcate = 2;
+    kShardData = 3;
+    kDropout = 4;
+    kInnerProduct = 5;
+    kLabel = 18;
+    kLMDBData = 17;
+    kLRN = 6;
+    kMnist = 7;
+    kPooling = 8;
+    kPrefetch = 19;
+    kReLU = 9;
+    kRGBImage = 10;
+    kSoftmaxLoss = 11;
+    kSlice = 12;
+    kSplit = 13;
+    kTanh = 14;
+  }
+  // source layer names
+  repeated string srclayers = 3;
+  // parameters, e.g., weight matrix or bias vector
+  repeated ParamProto param = 12;
+  // all layers are included in the net structure for the training phase by
+  // default; layers that are not used by the training phase, e.g., the data
+  // layer for loading test data, should be removed by setting the exclude field
+  repeated Phase exclude = 15;
+  // the layer type from the enum above
+  required LayerType type = 20;
+  // configuration for convolution layer
+  optional ConvolutionProto convolution_conf = 30;
+  // configuration for concatenation layer
+  optional ConcateProto concate_conf = 31;
+  // configuration for dropout layer
+  optional DropoutProto dropout_conf = 33;
+  // configuration for inner product layer
+  optional InnerProductProto innerproduct_conf = 34;
+  // configuration for LMDB data layer
+  optional DataProto lmdbdata_conf = 35;
+  // configuration for local response normalization layer
+  optional LRNProto lrn_conf = 45;
+  // configuration for mnist parser layer
+  optional MnistProto mnist_conf = 36;
+  // configuration for pooling layer
+  optional PoolingProto pooling_conf = 37;
+  // configuration for prefetch layer
+  optional PrefetchProto prefetch_conf = 44;
+  // configuration for rectified linear unit layer
+  optional ReLUProto relu_conf = 38;
+  // configuration for rgb image parser layer
+  optional RGBImageProto rgbimage_conf = 39;
+  // configuration for data layer
+  optional DataProto sharddata_conf = 32;
+  // configuration for slice layer
+  optional SliceProto slice_conf = 41;
+  // configuration for softmax loss layer
+  optional SoftmaxLossProto softmaxloss_conf = 40;
+  // configuration for split layer
+  optional SplitProto split_conf = 42;
+  // configuration for tanh layer
+  optional TanhProto tanh_conf = 43;
+
+  // overrides the partition dimension for the neural net
+  optional int32 partition_dim = 59 [default = -1];
+  optional string datablob = 58 [default = "unknow"];
+
+  // names of parameters shared from other layers
+  repeated string share_param = 60;
+  optional int32 partition_id = 62 [default = 0];
+}
+
+message RGBImageProto {
+  // scale factor for each pixel
+  optional float scale = 1 [default = 1.0];
+  // size after cropping
+  optional int32 cropsize = 2 [default = 0];
+  // mirror the image
+  optional bool mirror = 3 [default = false];
+  // meanfile path
+  optional string meanfile = 4 [default = ""];
+}
+
+message PrefetchProto {
+  repeated LayerProto sublayers = 1;
+}
+
+message SplitProto {
+  optional int32 num_splits = 1 [default = 1];
+}
+
+// scaled tanh: A*tanh(B*x)
+message TanhProto {
+  // A of A*tanh(B*x)
+  optional float outer_scale = 1 [default = 1.0];
+  // B of A*tanh(B*x)
+  optional float inner_scale = 2 [default = 1.0];
+}
+
+message SoftmaxLossProto {
+  // compute accuracy against the topk results
+  optional int32 topk = 1 [default = 1];
+  // loss scale factor
+  optional float scale = 30 [default = 1];
+}
+
+message ConvolutionProto {
+  // the number of filters (outputs) for the layer
+  required int32 num_filters = 1;
+  // the kernel height/width
+  required int32 kernel = 2;
+
+  // the padding height/width
+  optional int32 pad = 30 [default = 0];
+  // the stride
+  optional int32 stride = 31 [default = 1];
+  // whether to have bias terms
+  optional bool bias_term = 32 [default = true];
+}
+
+message ConcateProto {
+  // on which dimension, starts from 0
+  required int32 concate_dim = 1;
+}
+
+message DataProto {
+  // path to the data file/folder, absolute or relative to the workspace
+  required string path = 2;
+  // batch size
+  required int32 batchsize = 4;
+  // skip [0, random_skip] records
+  optional int32 random_skip = 30 [default = 0];
+}
+
+message MnistProto {
+  // normalization x/norm_a
+  required float norm_a = 1 [default = 1];
+  // normalization x-norm_b
+  required float norm_b = 2 [default = 0];
+
+  // elastic distortion
+  optional int32 kernel = 30 [default = 0];
+  optional float sigma = 31 [default = 0];
+  optional float alpha = 32 [default = 0];
+  // rotation or horizontal shearing
+  optional float beta = 33 [default = 0];
+  // scaling
+  optional float gamma = 34 [default = 0];
+  // scale to this size as input for deformation
+  optional int32 resize = 35 [default = 0];
+  optional int32 elastic_freq = 36 [default = 0];
+}
+
+// Message that stores parameters used by DropoutLayer
+message DropoutProto {
+  // dropout ratio
+  optional float dropout_ratio = 30 [default = 0.5];
+}
+
+// Message that stores parameters used by InnerProductLayer
+message InnerProductProto {
+  // number of outputs for the layer
+  required int32 num_output = 1;
+  // use bias vector or not
+  optional bool bias_term = 30 [default = true];
+}
+
+message LRNProto {
+  // local response size
+  required int32 local_size = 1 [default = 5];
+  // scale factor
+  optional float alpha = 31 [default = 1.0];
+  // exponential number
+  optional float beta = 32 [default = 0.75];
+  enum NormRegion {
+    // across channels, e.g., r,g,b
+    ACROSS_CHANNELS = 0;
+    // within channel, e.g., r, g and b are concatenated into one channel
+    WITHIN_CHANNEL = 1;
+  }
+  // normalization objective
+  optional NormRegion norm_region = 33 [default = ACROSS_CHANNELS];
+  // offset
+  optional float knorm = 34 [default = 1.0];
+}
+
+message PoolingProto {
+  // the kernel size (square)
+  required int32 kernel = 1;
+  enum PoolMethod {
+    MAX = 0;
+    AVE = 1;
+  }
+  // the pooling method
+  optional PoolMethod pool = 30 [default = MAX];
+  // the padding size
+  optional uint32 pad = 31 [default = 0];
+  // the stride
+  optional uint32 stride = 32 [default = 1];
+}
+
+message SliceProto {
+  required int32 slice_dim = 1;
+}
+
+message ReLUProto {
+  // Ref. Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013).
+  // Rectifier nonlinearities improve neural network acoustic models.
+  // In ICML Workshop on Deep Learning for Audio, Speech, and Language Processing.
+  optional float negative_slope = 1 [default = 0];
+}
+
+message UpdaterProto {
+  enum UpdaterType {
+    // normal SGD with momentum and weight decay
+    kSGD = 1;
+    // adaptive subgradient, http://www.magicbroom.info/Papers/DuchiHaSi10.pdf
+    kAdaGrad = 2;
+    // http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
+    kRMSProp = 3;
+    // Nesterov's accelerated gradient method
+    kNesterov = 4;
+  }
+  // updater type
+  required UpdaterType type = 1 [default = kSGD];
+  // configuration for the RMSProp algorithm
+  optional RMSPropProto rmsprop_conf = 50;
+
+  enum ChangeMethod {
+    kFixed = 0;
+    kInverseT = 1;
+    kInverse = 2;
+    kExponential = 3;
+    kLinear = 4;
+    kStep = 5;
+    kFixedStep = 6;
+  }
+  // change method for the learning rate
+  required ChangeMethod lr_change = 2 [default = kFixed];
+
+  optional FixedStepProto fixedstep_conf = 40;
+  optional StepProto step_conf = 41;
+  optional LinearProto linear_conf = 42;
+  optional ExponentialProto exponential_conf = 43;
+  optional InverseProto inverse_conf = 44;
+  optional InverseTProto inverset_conf = 45;
+
+  optional float momentum = 31 [default = 0];
+  optional float weight_decay = 32 [default = 0];
+  // base learning rate
+  optional float base_lr = 34 [default = 0];
+  // used to avoid division by 0, i.e., x/(y+delta)
+  optional float delta = 35 [default = 0.00000001];
+}
+
+message RMSPropProto {
+  // history = history*rho_ + (1-rho_)*(grad*grad_scale)
+  required float rho = 1;
+}
+
+message FixedStepProto {
+  repeated int32 step = 28;
+  // lr = step_lr[i] if current step >= step[i]
+  repeated float step_lr = 29;
+}
+
+message StepProto {
+  // lr = base_lr * gamma^(step/change_freq)
+  required float gamma = 35 [default = 1];
+  // lr = base_lr * gamma^(step/change_freq)
+  required int32 change_freq = 40;
+}
+message LinearProto {
+  // lr = (1 - step / freq) * base_lr + (step / freq) * final_lr
+  required int32 change_freq = 40;
+  // lr = (1 - step / freq) * base_lr + (step / freq) * final_lr
+  required float final_lr = 39;
+}
+message ExponentialProto {
+  // lr = base_lr / 2^(step/change_freq)
+  required int32 change_freq = 40;
+}
+message InverseTProto {
+  // lr = base_lr / (1 + step/final_lr)
+  required float final_lr = 39;
+}
+message InverseProto {
+  // lr = base_lr * (1 + gamma*step)^(-pow)
+  required float gamma = 1 [default = 1];
+  // lr = base_lr * (1 + gamma*step)^(-pow)
+  required float pow = 2 [default = 0];
+}
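For reference, the new JobProto can be exercised from Python with the generated module, in the same style as tool/gen_hosts.py further below. This is a minimal sketch, not part of the commit: the pb2.job_pb2 import path follows gen_hosts.py, and every configuration value is illustrative.

    from google.protobuf import text_format
    from pb2.job_pb2 import JobProto  # generated by protoc from job.proto

    # a minimal, hypothetical job.conf in protobuf text format;
    # it sets every required field of JobProto and its submessages
    conf = """
    cluster {
      nworkers_per_group: 2
      workspace: "examples/mnist"
    }
    model {
      name: "mnist-mlp"
      display_frequency: 50
      train_steps: 1000
      updater { type: kSGD lr_change: kFixed base_lr: 0.01 }
      alg: kBackPropagation
      neuralnet {
        layer {
          name: "data"
          type: kShardData
          sharddata_conf { path: "examples/mnist/train_shard" batchsize: 64 }
        }
      }
    }
    """
    job = JobProto()
    text_format.Merge(conf, job)      # same parsing path as gen_hosts.py
    print job.model.name              # mnist-mlp
    print job.cluster.nworkers_per_group  # 2

As a worked example of the schedules above: with lr_change: kStep, base_lr: 0.01 and step_conf { gamma: 0.5 change_freq: 100 }, the comment's formula lr = base_lr * gamma^(step/change_freq) gives 0.01 for steps 0-99 and 0.005 for steps 100-199.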
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/src/proto/model.proto
----------------------------------------------------------------------
diff --git a/src/proto/model.proto b/src/proto/model.proto
deleted file mode 100644
index f3b8dfe..0000000
--- a/src/proto/model.proto
+++ /dev/null
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/src/proto/singa.proto
----------------------------------------------------------------------
diff --git a/src/proto/singa.proto b/src/proto/singa.proto
new file mode 100644
index 0000000..94af58d
--- /dev/null
+++ b/src/proto/singa.proto
@@ -0,0 +1,8 @@
+package singa;
+
+message SingaProto {
+  // ip/hostname:port[,ip/hostname:port]
+  required string zookeeper_host = 1;
+  // if not set, use the default dir of glog
+  optional string log_dir = 2;
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/src/test/test_cluster.cc
----------------------------------------------------------------------
diff --git a/src/test/test_cluster.cc b/src/test/test_cluster.cc
index c34dd0f..a51126d 100644
--- a/src/test/test_cluster.cc
+++ b/src/test/test_cluster.cc
@@ -1,5 +1,4 @@
 #include "gtest/gtest.h"
-#include "proto/cluster.pb.h"
 #include "utils/cluster.h"
 
 using namespace singa;
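The new SingaProto can be parsed the same way; a short sketch, assuming a singa_pb2 module generated from singa.proto, with illustrative values:

    from google.protobuf import text_format
    from pb2.singa_pb2 import SingaProto  # generated by protoc from singa.proto

    # hypothetical global config, e.g., the file SingaTool reads via FLAGS_global
    conf = 'zookeeper_host: "localhost:2181"\nlog_dir: "/tmp/singa-log"'
    global_conf = SingaProto()
    text_format.Merge(conf, global_conf)
    print global_conf.zookeeper_host  # localhost:2181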
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/src/trainer/trainer.cc
----------------------------------------------------------------------
diff --git a/src/trainer/trainer.cc b/src/trainer/trainer.cc
index 78ec49f..4a0a47a 100644
--- a/src/trainer/trainer.cc
+++ b/src/trainer/trainer.cc
@@ -11,6 +11,7 @@
 #include "trainer/trainer.h"
 #include "mshadow/tensor.h"
+
 namespace singa {
 using std::vector;
 using std::map;
@@ -193,7 +194,7 @@ vector<Worker*> Trainer::CreateWorkers(int nthreads, const ModelProto& mconf){
   return workers;
 }
 
-void Trainer::Resume(ModelProto& mconf) {
+void Trainer::Resume(ModelProto* modelConf) {
   tinydir_dir dir;
   string folder = Cluster::Get()->checkpoint_folder();
   tinydir_open(&dir, folder.c_str());
@@ -223,34 +224,34 @@ void Trainer::Resume(ModelProto& mconf) {
   }
 
   if (latest_step > 0) {
-    mconf.set_step(latest_step);
+    modelConf->set_step(latest_step);
     for (auto ck_file : ck_files)
-      mconf.add_checkpoint(folder + "/" +string(ck_file));
+      modelConf->add_checkpoint(folder + "/" +string(ck_file));
   }
   tinydir_close(&dir);
 }
 
-void Trainer::Start(ModelProto& mconf, const GlobalProto& gconf,
-    const ClusterProto& cconf, int job, bool resume){
+void Trainer::Start(int job, bool resume,
+    const JobProto& jobConf, const SingaProto& singaConf) {
   // register job to zookeeper at the beginning
-  auto cluster=Cluster::Get(gconf, cconf, job);
-
-  RegisterDefaultClasses(mconf);
+  auto cluster = Cluster::Get(job, singaConf, jobConf.cluster());
+  ModelProto model = jobConf.model();
+  RegisterDefaultClasses(model);
   if (resume)
-    Resume(mconf);
+    Resume(&model);
 
   router_ = new Router();
   router_->Bind(kInprocRouterEndpoint);
   const string hostip = cluster->hostip();
   int port = router_->Bind("tcp://" + hostip + ":*");
   // register endpoint to zookeeper
-  cluster->Register(hostip + ":" + std::to_string(port), getpid());
+  cluster->Register(getpid(), hostip + ":" + std::to_string(port));
 
   int nthreads = 1;
-  const vector<Worker*> workers = CreateWorkers(nthreads, mconf);
+  const vector<Worker*> workers = CreateWorkers(nthreads, model);
   nthreads += workers.size();
-  const vector<Server*> servers = CreateServers(nthreads, mconf);
-  SetupWorkerServer(mconf, workers, servers);
+  const vector<Server*> servers = CreateServers(nthreads, model);
+  SetupWorkerServer(model, workers, servers);
 
 #ifdef USE_MPI
   for (int i = 0; i < nthreads; i++)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/src/trainer/worker.cc
----------------------------------------------------------------------
diff --git a/src/trainer/worker.cc b/src/trainer/worker.cc
index 7d779ad..87d251d 100644
--- a/src/trainer/worker.cc
+++ b/src/trainer/worker.cc
@@ -6,7 +6,6 @@
 #include "utils/cluster.h"
 #include "utils/factory.h"
 #include "trainer/worker.h"
-#include "proto/model.pb.h"
 
 namespace singa {
 using std::thread;
@@ -173,6 +172,9 @@ void Worker::Run() {
     step_++;
   }
 
+  // save the model
+  Checkpoint(step_, train_net_);
+
   // clean up
   if(updater_ == nullptr) {
     int svr_grp = grp_id_ / cluster->nworker_groups_per_server_group();
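Trainer::Resume now fills the ModelProto through a pointer instead of a reference. The proto-level effect, mirrored in Python as a sketch (the step value and checkpoint path are purely illustrative, not the commit's naming scheme):

    from pb2.job_pb2 import ModelProto

    model = ModelProto()
    latest_step = 500                        # hypothetical latest snapshot step
    model.step = latest_step                 # modelConf->set_step(latest_step)
    model.checkpoint.append(
        'workspace/checkpoint/step500')      # modelConf->add_checkpoint(...)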
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/src/utils/cluster.cc
----------------------------------------------------------------------
diff --git a/src/utils/cluster.cc b/src/utils/cluster.cc
index 791332d..6dad2a8 100644
--- a/src/utils/cluster.cc
+++ b/src/utils/cluster.cc
@@ -3,18 +3,17 @@
 #include <unistd.h>
 #include <fstream>
 #include "utils/cluster.h"
-#include "proto/cluster.pb.h"
 #include "proto/common.pb.h"
 #include <sys/stat.h>
 #include <sys/types.h>
 
 namespace singa {
 
 std::shared_ptr<Cluster> Cluster::instance_;
-Cluster::Cluster(const GlobalProto & global, const ClusterProto &cluster,
-    int job_id) {
-  cluster_ = cluster;
-  global_ = global;
-  SetupFolders(cluster);
+Cluster::Cluster(
+    int job, const SingaProto& singaConf, const ClusterProto& clusterConf) {
+  cluster_ = clusterConf;
+  singa_ = singaConf;
+  SetupFolders(clusterConf);
   if(server_worker_separate())
     nprocs_=nworker_procs()+nserver_procs();
   else
@@ -38,14 +37,14 @@ Cluster::Cluster(const GlobalProto & global, const ClusterProto &cluster,
     }
   }
 
-  auto rt = new ZKClusterRT(global_.zookeeper_host(), job_id);
+  auto rt = new ZKClusterRT(singa_.zookeeper_host(), job);
   rt->Init();
   cluster_rt_=shared_ptr<ClusterRuntime>(static_cast<ClusterRuntime*>(rt));
 
   hostip_=GetHostIP();
 }
 
-void Cluster::Register(const string& endpoint, int pid) {
+void Cluster::Register(int pid, const string& endpoint) {
   procs_id_=cluster_rt_->RegistProc(endpoint, pid);
   CHECK_GE(procs_id_,0);
   CHECK_LT(procs_id_,nprocs());
@@ -69,9 +68,9 @@ void Cluster::SetupFolders(const ClusterProto &cluster){
   mkdir(checkpoint_folder().c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
 }
 
-shared_ptr<Cluster> Cluster::Get(const GlobalProto& global,
-    const ClusterProto& cluster, int job_id){
-  instance_.reset(new Cluster(global, cluster, job_id));
+shared_ptr<Cluster> Cluster::Get(
+    int job, const SingaProto& singaConf, const ClusterProto& clusterConf) {
+  instance_.reset(new Cluster(job, singaConf, clusterConf));
   return instance_;
 }
 
@@ -82,7 +81,7 @@ shared_ptr<Cluster> Cluster::Get() {
   }
   return instance_;
 }
-int Cluster::Hash(int gid, int id, int flag){
+int Cluster::Hash(int gid, int id, int flag) {
   int ret=-1;
   if(flag==kServer){
     ret=(flag*cluster_.nserver_groups()+gid)*cluster_.nservers_per_group() + id;
@@ -91,7 +90,7 @@
   }
   return ret;
 }
-int Cluster::ProcsIDOf(int group_id, int id, int flag){
+int Cluster::ProcsIDOf(int group_id, int id, int flag) {
   return procs_ids_.at(Hash(group_id, id, flag));
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/src/utils/param.cc
----------------------------------------------------------------------
diff --git a/src/utils/param.cc b/src/utils/param.cc
index 69e3b09..5541acc 100644
--- a/src/utils/param.cc
+++ b/src/utils/param.cc
@@ -3,7 +3,7 @@
 #include <chrono>
 #include <random>
 #include "utils/param.h"
-#include "proto/cluster.pb.h"
+#include "proto/job.pb.h"
 #include "mshadow/tensor.h"
 #include "utils/singleton.h"
 using namespace mshadow;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/src/utils/tool.cc
----------------------------------------------------------------------
diff --git a/src/utils/tool.cc b/src/utils/tool.cc
index 267d266..3ffd0e8 100644
--- a/src/utils/tool.cc
+++ b/src/utils/tool.cc
@@ -2,8 +2,8 @@
 #include <glog/logging.h>
 #include <iostream>
 #include <fstream>
-#include "proto/cluster.pb.h"
 #include "utils/cluster_rt.h"
+#include "proto/singa.pb.h"
 #include "utils/common.h"
 #ifndef GFLAGS_GFLAGS_H_
 namespace gflags = google;
@@ -15,7 +15,7 @@ int main(int argc, char **argv) {
   google::InitGoogleLogging(argv[0]);
   gflags::ParseCommandLineFlags(&argc, &argv, true);
 
-  singa::GlobalProto global;
+  singa::SingaProto global;
   singa::ReadProtoFromTextFile(FLAGS_global.c_str(), &global);
   singa::SetupLog(global.log_dir(), "SingaTool");
"mshadow/tensor.h" #include "mshadow/cxxnet_op.h" -#include "proto/model.pb.h" +#include "proto/job.pb.h" using namespace mshadow; using namespace mshadow::expr; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/tool/gen_hosts.py ---------------------------------------------------------------------- diff --git a/tool/gen_hosts.py b/tool/gen_hosts.py index a3bec47..e38c8bf 100644 --- a/tool/gen_hosts.py +++ b/tool/gen_hosts.py @@ -4,19 +4,20 @@ import argparse import os import sys from google.protobuf import text_format -from pb2.cluster_pb2 import ClusterProto +from pb2.job_pb2 import JobProto # parse command line parser = argparse.ArgumentParser(description='Generate host list from host file for a SINGA job') -parser.add_argument('-conf', dest='conf', metavar='CONF_FILE', required=True, help='cluster.conf file') +parser.add_argument('-conf', dest='conf', metavar='CONF_FILE', required=True, help='job.conf file') parser.add_argument('-hosts', dest='hosts', metavar='HOST_FILE', required=True, help='global host file') parser.add_argument('-output', dest='output', metavar='OUTPUT_FILE', required=True, help='generated list') args = parser.parse_args(); # read from .conf file fd_conf = open(args.conf, 'r') -cluster = ClusterProto() -text_format.Merge(str(fd_conf.read()), cluster) +job = JobProto() +text_format.Merge(str(fd_conf.read()), job) +cluster = job.cluster nworker_procs = cluster.nworker_groups * cluster.nworkers_per_group / cluster.nworkers_per_procs nserver_procs = cluster.nserver_groups * cluster.nservers_per_group / cluster.nservers_per_procs nprocs = 0 @@ -39,10 +40,10 @@ fd_hosts.close() # write to output file num_hosts = len(hosts) if (num_hosts == 0): - print "contains no valid host %s" % args.hosts + print "Contains no valid host %s" % args.hosts sys.exit(1) fd_output = open(args.output, 'w') for i in range(nprocs): fd_output.write(hosts[i % num_hosts] + '\n') fd_output.close() -print 'generate host list at %s' % args.output +print 'Generate host list to %s' % args.output
