[ 
https://issues.apache.org/jira/browse/ACCUMULO-4787?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16341318#comment-16341318
 ] 

Adam J Shook commented on ACCUMULO-4787:
----------------------------------------

This thread from a {{jstack}} looks promising.  From a quick scan of the code, 
it looks like the input stream is opened but never closed.

{code}
Thread 43276: (state = IN_JAVA)
 - 
org.apache.hadoop.hdfs.DFSInputStream$ByteArrayStrategy.doRead(org.apache.hadoop.hdfs.BlockReader,
 int, int) @bci=7, line=782 (Compiled frame; information may be imprecise)
 - 
org.apache.hadoop.hdfs.DFSInputStream.readBuffer(org.apache.hadoop.hdfs.DFSInputStream$ReaderStrategy,
 int, int, java.util.Map) @bci=10, line=838 (Compiled frame)
 - 
org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(org.apache.hadoop.hdfs.DFSInputStream$ReaderStrategy,
 int, int) @bci=172, line=898 (Compiled frame)
 - org.apache.hadoop.hdfs.DFSInputStream.read(byte[], int, int) @bci=37, 
line=942 (Compiled frame)
 - org.apache.hadoop.hdfs.DFSInputStream.read() @bci=21, line=742 (Compiled 
frame)
 - java.io.DataInputStream.readByte() @bci=4, line=265 (Compiled frame)
 - org.apache.hadoop.io.WritableUtils.readVLong(java.io.DataInput) @bci=34, 
line=315 (Compiled frame)
 - org.apache.accumulo.server.data.ServerMutation.readFields(java.io.DataInput) 
@bci=17, line=55 (Compiled frame)
 - 
org.apache.accumulo.tserver.logger.LogFileValue.readFields(java.io.DataInput) 
@bci=38, line=45 (Compiled frame)
 - 
org.apache.accumulo.tserver.replication.AccumuloReplicaSystem.getWalEdits(org.apache.accumulo.core.replication.ReplicationTarget,
 java.io.DataInputStream, org.apache.hadoop.fs.Path, 
org.apache.accumulo.server.replication.proto.Replication$Status, long, 
java.util.Set) @bci=65, line=709 (Compiled frame)
 - 
org.apache.accumulo.tserver.replication.AccumuloReplicaSystem$WalClientExecReturn.execute(org.apache.accumulo.core.replication.thrift.ReplicationServicer$Client)
 @bci=28, line=538 (Compiled frame)
 - 
org.apache.accumulo.tserver.replication.AccumuloReplicaSystem$WalClientExecReturn.execute(java.lang.Object)
 @bci=5, line=513 (Compiled frame)
 - 
org.apache.accumulo.core.client.impl.ReplicationClient.executeServicerWithReturn(org.apache.accumulo.core.client.impl.ClientContext,
 com.google.common.net.HostAndPort, 
org.apache.accumulo.core.client.impl.ClientExecReturn, long) @bci=14, line=191 
(Compiled frame)
 - 
org.apache.accumulo.tserver.replication.AccumuloReplicaSystem.replicateLogs(org.apache.accumulo.core.client.impl.ClientContext,
 com.google.common.net.HostAndPort, 
org.apache.accumulo.core.replication.ReplicationTarget, 
org.apache.hadoop.fs.Path, 
org.apache.accumulo.server.replication.proto.Replication$Status, long, 
java.lang.String, org.apache.accumulo.core.security.thrift.TCredentials, 
org.apache.accumulo.server.replication.ReplicaSystemHelper, 
org.apache.hadoop.security.UserGroupInformation, long) @bci=440, line=436 
(Compiled frame)
 - 
org.apache.accumulo.tserver.replication.AccumuloReplicaSystem._replicate(org.apache.hadoop.fs.Path,
 org.apache.accumulo.server.replication.proto.Replication$Status, 
org.apache.accumulo.core.replication.ReplicationTarget, 
org.apache.accumulo.server.replication.ReplicaSystemHelper, 
org.apache.accumulo.core.conf.AccumuloConfiguration, 
org.apache.accumulo.core.client.impl.ClientContext, 
org.apache.hadoop.security.UserGroupInformation) @bci=295, line=297 (Compiled 
frame)
 - 
org.apache.accumulo.tserver.replication.AccumuloReplicaSystem.replicate(org.apache.hadoop.fs.Path,
 org.apache.accumulo.server.replication.proto.Replication$Status, 
org.apache.accumulo.core.replication.ReplicationTarget, 
org.apache.accumulo.server.replication.ReplicaSystemHelper) @bci=232, line=216 
(Compiled frame)
 - 
org.apache.accumulo.tserver.replication.ReplicationProcessor.process(java.lang.String,
 byte[]) @bci=312, line=134 (Compiled frame)
 - org.apache.accumulo.server.zookeeper.DistributedWorkQueue$1.run() @bci=28, 
line=107 (Compiled frame)
 - 
java.util.concurrent.ThreadPoolExecutor.runWorker(java.util.concurrent.ThreadPoolExecutor$Worker)
 @bci=95, line=1142 (Compiled frame)
 - java.util.concurrent.ThreadPoolExecutor$Worker.run() @bci=5, line=617 
(Interpreted frame)
 - org.apache.accumulo.fate.util.LoggingRunnable.run() @bci=4, line=35 
(Interpreted frame)
 - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame)
{code}

> Numerous leaked CLOSE_WAIT threads from TabletServer
> ----------------------------------------------------
>
>                 Key: ACCUMULO-4787
>                 URL: https://issues.apache.org/jira/browse/ACCUMULO-4787
>             Project: Accumulo
>          Issue Type: Bug
>    Affects Versions: 1.8.1
>         Environment: * Ubuntu 14.04
> * HDFS 2.6.0 and 2.5.0 (in the middle of an upgrade cycle)
> * ZooKeeper 3.4.6
> * Accumulo 1.8.1
> * HotSpot 1.8.0_121
>            Reporter: Adam J Shook
>            Assignee: Adam J Shook
>            Priority: Minor
>
> I'm running into an issue across all environments where TabletServers are 
> occupying a large number of ports in a CLOSE_WAIT state writing to a 
> DataNode at port 50010.  I'm seeing numbers from around 12,000 to 20,000 
> ports.  In some instances, there were over 68k and it was preventing other 
> applications from getting a free port, so they would fail to start (which is 
> how we found this in the first place).



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to