[ https://issues.apache.org/jira/browse/COUCHDB-2975?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15211933#comment-15211933 ]
Nick Vatamaniuc commented on COUCHDB-2975: ------------------------------------------ Noticed that transient mode does not clean up child specs after the child is done, even if the exit is normal. The intent behind that is to let users restart children. From the Erlang docs: {{If the child is temporary, the child specification is deleted as soon as the process terminates. This means that delete_child/2 has no meaning, and restart_child/2 can not be used for these children.}} However, in our code we sometimes explicitly delete the child: {code} cancel_replication({BaseId, Extension}) -> ... case supervisor:terminate_child(couch_replicator_job_sup, FullRepId) of ok -> ... case supervisor:delete_child(couch_replicator_job_sup, FullRepId) of ok -> {ok, {cancelled, ?l2b(FullRepId)}}; ... {code} That would make it seem as if the supervisor auto-deleted the child spec in some cases. To test that it doesn't, start a normal replication (not a continuous one) and then, after it has finished, inspect the state of {{couch_replicator_job_sup}}. 
An example of the supervisor state after 10 replications have finished on a cluster: {code} {state, {local,couch_replicator_job_sup}, one_for_one, [{child,undefined,"ac35738f5003c02b6780116fdf04b524", {gen_server,start_link, [couch_replicator, {rep, {"ac35738f5003c02b6780116fdf04b524",[]}, {httpdb,"http://adm:pass@localhost:5984/rdyno_src_0001/", nil, [{"Accept","application/json"}, {"User-Agent","CouchDB-Replicator/5fa9098"}], 200000, [{socket_options,[{keepalive,true},{nodelay,false}]}], 1,250,nil,1}, {httpdb,"http://adm:pass@localhost:5984/rdyno_tgt_0009/", nil, [{"Accept","application/json"}, {"User-Agent","CouchDB-Replicator/5fa9098"}], 200000, [{socket_options,[{keepalive,true},{nodelay,false}]}], 1,250,nil,1}, [{checkpoint_interval,5000}, {connection_timeout,200000}, {continuous,false}, {http_connections,1}, {retries,1}, {socket_options,[{keepalive,true},{nodelay,false}]}, {use_checkpoints,true}, {worker_batch_size,500}, {worker_processes,1}], {user_ctx,null,[],undefined}, db,nil, <<"rdyno_0001"...(15 B)>>, <<"shards/a00"...(47 B)>>}, [{timeout,200000}]]}, transient,250,worker, [couch_replicator]}, {child,undefined,"6c48c1ab7a6e3ed5e3d4415ced912e4a", {gen_server,start_link, [couch_replicator, {rep, {"6c48c1ab7a6e3ed5e3d4415ced912e4a",[]}, {httpdb,"http://adm:pass@localhost:5984/rdyno_src_0001/", nil, [{"Accept","application/json"}, {"User-Agent","CouchDB-Replicator/5fa9098"}], 200000, [{socket_options,[{keepalive,true},{nodelay,false}]}], 1,250,nil,1}, {httpdb,"http://adm:pass@localhost:5984/rdyno_tgt_0002/", nil, [{"Accept","application/json"}, {"User-Agent","CouchDB-Replicator/5fa9098"}], 200000, [{socket_options,[{keepalive,true},{nodelay,false}]}], 1,250,nil,1}, [{checkpoint_interval,5000}, {connection_timeout,200000}, {continuous,false}, {http_connections,1}, {retries,1}, {socket_options,[{keepalive,true},{nodelay,false}]}, {use_checkpoints,true}, {worker_batch_size,500}, {worker_processes,1}], {user_ctx,null,[],undefined}, db,nil, <<"rdyno_0001"...(15 
B)>>, <<"shards/200"...(47 B)>>}, [{timeout,200000}]]}, transient,250,worker, [couch_replicator]}], undefined,100,1,[],couch_replicator_job_sup,[]} {code} > Automatically restart replication jobs if they crash > ---------------------------------------------------- > > Key: COUCHDB-2975 > URL: https://issues.apache.org/jira/browse/COUCHDB-2975 > Project: CouchDB > Issue Type: Improvement > Components: Replication > Reporter: Robert Newson > > We currently use the temporary restart strategy for replication jobs, which > means if they crash they are not restarted. > Instead, let's use the transient restart strategy, ensuring they are > restarted on abnormal termination, while still allowing these tasks to end > successfully on completion or cancellation. -- This message was sent by Atlassian JIRA (v6.3.4#6332)