Hi,

I messaged about a month back about experiencing issues with Solr when it loses 
connection to the Zookeeper Leader (For example, the zookeeper leader being 
recycled).

I have tested a number of configurations in our Kubernetes environment and have
had no luck. We also tried mimicking the original Bitnami solution, still with
no luck. Furthermore, we reproduced the problem with Docker Compose, using the
out-of-the-box images and configuration for Solr and ZooKeeper, and experienced
the same issue.

The issue is that when Solr (with more than 10 collections) loses its
connection to the ZooKeeper leader, it puts a handful of collections into the
down state, and they do not recover for some time. For example, with 10
collections recovery takes about 5 minutes, whereas with 200 collections it
takes days or requires remediation by an engineer.

We do not currently use the Solr Operator. We deploy both ZooKeeper and Solr as
StatefulSets; ZooKeeper is exposed through a headless service.
Docker Compose to replicate issue:
Docker-compose.yml
services:

  # ─── ZooKeeper ensemble (3 nodes for quorum) ───────────────────────────────

  zookeeper-0:
    image: zookeeper:3.9
    hostname: zookeeper-0
    restart: unless-stopped
    environment:
      ZOO_MY_ID: "1"
      # Three-member server list. `>-` folds the lines below into a single
      # space-separated string, so no line exceeds the wrap width.
      ZOO_SERVERS: >-
        server.1=zookeeper-0:2888:3888;2181
        server.2=zookeeper-1:2888:3888;2181
        server.3=zookeeper-2:2888:3888;2181
    volumes:
      - zk0-data:/data
      - zk0-datalog:/datalog
    healthcheck:
      # AdminServer HTTP endpoint — works regardless of election state;
      # 4LW/zkServer.sh unreliable in 3.9
      test: ["CMD", "wget", "-q", "--spider", "http://localhost:8080/commands/ruok"]
      interval: 5s
      timeout: 5s
      retries: 20
      start_period: 20s

  zookeeper-1:
    image: zookeeper:3.9
    hostname: zookeeper-1
    restart: unless-stopped
    environment:
      ZOO_MY_ID: "2"
      # Three-member server list. `>-` folds the lines below into a single
      # space-separated string, so no line exceeds the wrap width.
      ZOO_SERVERS: >-
        server.1=zookeeper-0:2888:3888;2181
        server.2=zookeeper-1:2888:3888;2181
        server.3=zookeeper-2:2888:3888;2181
    volumes:
      - zk1-data:/data
      - zk1-datalog:/datalog
    healthcheck:
      # Probes the `ruok` command over HTTP on port 8080 inside the container.
      test: ["CMD", "wget", "-q", "--spider", "http://localhost:8080/commands/ruok"]
      interval: 5s
      timeout: 5s
      retries: 20
      start_period: 20s

  zookeeper-2:
    image: zookeeper:3.9
    hostname: zookeeper-2
    restart: unless-stopped
    environment:
      ZOO_MY_ID: "3"
      # Three-member server list. `>-` folds the lines below into a single
      # space-separated string, so no line exceeds the wrap width.
      ZOO_SERVERS: >-
        server.1=zookeeper-0:2888:3888;2181
        server.2=zookeeper-1:2888:3888;2181
        server.3=zookeeper-2:2888:3888;2181
    volumes:
      - zk2-data:/data
      - zk2-datalog:/datalog
    healthcheck:
      # Probes the `ruok` command over HTTP on port 8080 inside the container.
      test: ["CMD", "wget", "-q", "--spider", "http://localhost:8080/commands/ruok"]
      interval: 5s
      timeout: 5s
      retries: 20
      start_period: 20s

  # ─── ZK chroot init (creates /solrcloud-test znode, then exits) ─────────────

  zk-init:
    image: zookeeper:3.9
    restart: "no"
    depends_on:
      zookeeper-0:
        condition: service_healthy
    # One-shot job: create the /solrcloud-test znode, print the last line of
    # zkCli output, then print a marker. Every line of the folded scalar must
    # stay indented under `command`; a column-0 line would end the scalar.
    # NOTE(review): the shell's exit status comes from the final `echo`, so a
    # failed `create` is not reported as a failure — confirm this is intended.
    command: >
      sh -c "zkCli.sh -server zookeeper-0:2181 create /solrcloud-test '' 2>&1 |
      tail -1; echo 'ZK chroot ready'"
    healthcheck:
      test: ["CMD-SHELL", "exit 0"]
      interval: 5s
      retries: 1

  # ─── SolrCloud nodes ────────────────────────────────────────────────────────

  solrcloud-0:
    image: solr:10
    hostname: solrcloud-0
    restart: unless-stopped
    environment:
      SOLR_SKIP_ROOT_CHECK: "true"
      SOLR_PORT: "8983"
      SOLR_JAVA_MEM: "-XX:+UseContainerSupport -XX:MaxRAMPercentage=60.0"
      # /solrcloud-test chroot isolates this cluster within the ZK ensemble
      ZK_HOST: "zookeeper-0:2181,zookeeper-1:2181,zookeeper-2:2181/solrcloud-test"
      SOLR_HOST: "solrcloud-0"
      SOLR_LOG_LEVEL: "WARN"
      LOG4J_FORMAT_MSG_NO_LOOKUPS: "true"
      SOLR_OPTS: "-Dhost=solrcloud-0"
    ports:
      - "8983:8983"
    # Start only after all three ZooKeeper nodes report healthy and the
    # one-shot zk-init job has exited successfully.
    depends_on:
      zookeeper-0:
        condition: service_healthy
      zookeeper-1:
        condition: service_healthy
      zookeeper-2:
        condition: service_healthy
      zk-init:
        condition: service_completed_successfully
    volumes:
      - solr0-data:/var/solr
      # The same logging config file is mounted read-only at both paths.
      - ./config/solr-log.xml:/opt/solr/server/resources/log4j2.xml:ro
      - ./config/solr-log.xml:/var/solr/log4j2.xml:ro
    healthcheck:
      # `curl -f` fails the probe on an HTTP error status.
      test: ["CMD", "curl", "-f", "http://localhost:8983/solr/admin/info/system"]
      interval: 10s
      timeout: 5s
      retries: 10
      start_period: 60s

  solrcloud-1:
    image: solr:10
    hostname: solrcloud-1
    restart: unless-stopped
    environment:
      SOLR_SKIP_ROOT_CHECK: "true"
      SOLR_PORT: "8983"
      SOLR_JAVA_MEM: "-XX:+UseContainerSupport -XX:MaxRAMPercentage=60.0"
      # ZooKeeper connect string with the /solrcloud-test chroot suffix.
      ZK_HOST: "zookeeper-0:2181,zookeeper-1:2181,zookeeper-2:2181/solrcloud-test"
      SOLR_HOST: "solrcloud-1"
      SOLR_LOG_LEVEL: "WARN"
      LOG4J_FORMAT_MSG_NO_LOOKUPS: "true"
      SOLR_OPTS: "-Dhost=solrcloud-1"
    ports:
      # Host port 8984 maps to the container's 8983.
      - "8984:8983"
    # Start only after all three ZooKeeper nodes report healthy and the
    # one-shot zk-init job has exited successfully.
    depends_on:
      zookeeper-0:
        condition: service_healthy
      zookeeper-1:
        condition: service_healthy
      zookeeper-2:
        condition: service_healthy
      zk-init:
        condition: service_completed_successfully
    volumes:
      - solr1-data:/var/solr
      # The same logging config file is mounted read-only at both paths.
      - ./config/solr-log.xml:/opt/solr/server/resources/log4j2.xml:ro
      - ./config/solr-log.xml:/var/solr/log4j2.xml:ro
    healthcheck:
      # `curl -f` fails the probe on an HTTP error status.
      test: ["CMD", "curl", "-f", "http://localhost:8983/solr/admin/info/system"]
      interval: 10s
      timeout: 5s
      retries: 10
      start_period: 60s

# Named volumes referenced by the services; each uses an empty definition
# (no driver or options set here).
volumes:
  zk0-data: {}
  zk0-datalog: {}
  zk1-data: {}
  zk1-datalog: {}
  zk2-data: {}
  zk2-datalog: {}
  solr0-data: {}
  solr1-data: {}

Zk-failover-test.ps1

# Treat PowerShell errors as terminating and work from this script's folder
# (presumably where docker-compose.yml lives — confirm).
$ErrorActionPreference = "Stop"
Set-Location -Path $PSScriptRoot

# Container names the later steps inspect, exec into, and restart.
$ZkContainers = "solrcloud-docker-compose-zookeeper-0-1",
                "solrcloud-docker-compose-zookeeper-1-1",
                "solrcloud-docker-compose-zookeeper-2-1"
$SolrContainers = "solrcloud-docker-compose-solrcloud-0-1",
                  "solrcloud-docker-compose-solrcloud-1-1"

# Every container whose health status is polled in step 2.
$AllHealthChecked = $ZkContainers + $SolrContainers

# ── 1. Start the stack ────────────────────────────────────────────────────────

Write-Host "`n[1/5] Starting Docker Compose stack..." -ForegroundColor Cyan
& docker compose up -d
# Native commands do not raise PowerShell errors; check the exit code by hand.
if ($LASTEXITCODE -ne 0) {
    throw "docker compose up failed"
}

# ── 2. Wait for all containers to be healthy ──────────────────────────────────

# Polls `docker inspect` every 5 seconds until every container in
# $AllHealthChecked reports "healthy", or throws after $timeout seconds.
Write-Host "`n[2/5] Waiting for all containers to be healthy..." -ForegroundColor Cyan
$timeout = 300
$elapsed = 0
$allHealthy = $false
$expected = @($AllHealthChecked).Count

while ($elapsed -lt $timeout) {
    Start-Sleep -Seconds 5
    $elapsed += 5

    # A failed inspect (e.g. container not created yet) emits nothing, so the
    # result may hold fewer entries than there are containers.
    $statuses = @($AllHealthChecked | ForEach-Object {
        docker inspect $_ --format "{{.State.Health.Status}}" 2>$null
    })

    # Count healthy containers against the expected total, so a container
    # with no status at all is never mistaken for a healthy one.
    $healthy = @($statuses | Where-Object { $_ -eq "healthy" }).Count
    Write-Host "  ${elapsed}s — $healthy/$expected healthy"

    if ($healthy -eq $expected) {
        $allHealthy = $true
        break
    }
}

# Decide on the flag rather than on $elapsed: success on the final poll
# (elapsed == timeout) must not be reported as a timeout.
if (-not $allHealthy) { throw "Containers did not become healthy within ${timeout}s" }
Write-Host "  All containers healthy." -ForegroundColor Green

# ── 3. Create 10 Solr collections ─────────────────────────────────────────────

# Issues one Collections API CREATE request per collection against the Solr
# node published on localhost:8983. A failed request is logged as a warning
# and the loop moves on to the next collection.
Write-Host "`n[3/5] Creating 10 Solr collections..." -ForegroundColor Cyan
foreach ($i in 1..10) {
    $name = "test-collection-$i"
    $uri = "http://localhost:8983/solr/admin/collections?action=CREATE&name=$name&numShards=1&replicationFactor=2&wt=json"
    try {
        $resp = Invoke-RestMethod -Uri $uri -Method Get
        Write-Host "  Created $name (status: $($resp.responseHeader.status))"
    }
    catch {
        # The backtick keeps the colon from being parsed as part of $name.
        Write-Warning "  Failed to create $name`: $_"
    }
}

# ── 4. Find the ZooKeeper leader ──────────────────────────────────────────────

# Queries each ZooKeeper container's HTTP `stat` command from inside the
# container and picks the first one whose output reports server_state "leader".
Write-Host "`n[4/5] Finding ZooKeeper leader..." -ForegroundColor Cyan
$leaderContainer = $null

foreach ($container in $ZkContainers) {
    # The URL and the 2>&1 redirection must stay on the same line as the
    # command; split across lines, wget runs without a URL.
    $stat = docker exec $container wget -qO- "http://localhost:8080/commands/stat" 2>&1
    # With multi-line output, -match returns the matching lines, which is
    # truthy when at least one line matches.
    if ($stat -match '"server_state"\s*:\s*"leader"') {
        $leaderContainer = $container
        Write-Host "  Leader: $container" -ForegroundColor Yellow
        break
    }
}

if (-not $leaderContainer) { throw "Could not find ZooKeeper leader" }

# ── 5. Restart the leader ─────────────────────────────────────────────────────

# Restarts the container found in step 4 to force a new leader election.
Write-Host "`n[5/5] Restarting ZK leader ($leaderContainer)..." -ForegroundColor Cyan
docker restart $leaderContainer
# Check the exit code as step 1 does, so a failed restart is not reported
# as a success.
if ($LASTEXITCODE -ne 0) { throw "docker restart $leaderContainer failed" }
Write-Host "  Restarted. New leader election underway." -ForegroundColor Green

Write-Host "`nDone." -ForegroundColor Green





Some examples of Logs:
2026-05-11 10:42:42.419 WARN  
(OverseerCollectionConfigSetProcessor-144122478466367489-solrcloud-0.solrcloud-service:8983_solr-n_0000000801)
 [] i.o.s.m.SdkObservableInstrument 
CallbackRegistration{instrumentDescriptors=[InstrumentDescriptor{name=solr_overseer_collection_work_queue_size,
 description=Size of overseer's collection work queue, unit=, 
type=OBSERVABLE_GAUGE, valueType=LONG, 
advice=Advice{explicitBucketBoundaries=null, attributes=null}}]} has called 
close() multiple times.
2026-05-11 10:42:42.721 WARN  
(main-SendThread(zookeeper-0.zookeeper-headless.solr.svc.cluster.local:2181)) 
[] o.a.z.ClientCnxn Session 0x20006a16d230001 for server 
zookeeper-0.zookeeper-headless.solr.svc.cluster.local/10.101.80.182:2181, 
Closing socket connection. Attempting reconnect except it is a 
SessionExpiredException or SessionTimeoutException. 
=>org.apache.zookeeper.ClientCnxn$EndOfStreamException: Unable to read 
additional data from server sessionid 0x20006a16d230001, likely server has 
closed socket
      at 
org.apache.zookeeper.ClientCnxnSocketNIO.doIO(ClientCnxnSocketNIO.java:77)

2026-05-11 10:42:43.067 WARN  
(main-SendThread(zookeeper-2.zookeeper-headless.solr.svc.cluster.local:2181)) 
[] o.a.z.ClientCnxn Session 0x20006a16d230001 for server 
zookeeper-2.zookeeper-headless.solr.svc.cluster.local/10.101.80.221:2181, 
Closing socket connection. Attempting reconnect except it is a 
SessionExpiredException or SessionTimeoutException. 
=>java.net.NoRouteToHostException: No route to host
      at java.base/sun.nio.ch.Net.pollConnect(Native Method)

2026-05-11 10:42:44.529 WARN  (OverseerExitThread) [] o.a.s.c.Overseer I 
(id=144122478466367489-solrcloud-0.solrcloud-service:8983_solr-n_0000000801) am 
exiting, but I'm still the leader
2026-05-11 10:42:44.839 ERROR 
(qtp1745043985-34-solrcloud-0.solrcloud-service-128) [] 
o.a.s.h.RequestHandlerBase Client exception 
=>org.apache.solr.common.SolrException: We are not the leader
      at 
org.apache.solr.handler.admin.PrepRecoveryOp.lambda$execute$0(PrepRecoveryOp.java:98)

2026-05-11 10:42:44.840 ERROR 
(qtp1745043985-38-solrcloud-0.solrcloud-service-129) [] 
o.a.s.h.RequestHandlerBase Client exception 
=>org.apache.solr.common.SolrException: We are not the leader
      at 
org.apache.solr.handler.admin.PrepRecoveryOp.lambda$execute$0(PrepRecoveryOp.java:98)

2026-05-11 10:42:44.842 ERROR 
(qtp1745043985-35-solrcloud-0.solrcloud-service-130) [] 
o.a.s.h.RequestHandlerBase Client exception 
=>org.apache.solr.common.SolrException: We are not the leader
      at 
org.apache.solr.handler.admin.PrepRecoveryOp.lambda$execute$0(PrepRecoveryOp.java:98)

2026-05-11 10:42:50.440 WARN  
(recoveryExecutor-11-thread-2-processing-solrcloud-0.solrcloud-service:8983_solr
 attemp3_shard1_replica_n2 attemp3 shard1 core_node4) [c:attemp3 s:shard1 
r:core_node4 x:attemp3_shard1_replica_n2] o.a.s.u.PeerSyncWithLeader no frame 
of reference to tell if we've missed updates
2026-05-11 10:42:50.440 WARN  
(recoveryExecutor-11-thread-3-processing-solrcloud-0.solrcloud-service:8983_solr
 weoroo_shard1_replica_n2 weoroo shard1 core_node4) [c:weoroo s:shard1 
r:core_node4 x:weoroo_shard1_replica_n2] o.a.s.u.PeerSyncWithLeader no frame of 
reference to tell if we've missed updates
2026-05-11 10:42:50.445 WARN  
(recoveryExecutor-11-thread-1-processing-solrcloud-0.solrcloud-service:8983_solr
 test2_shard1_replica_n2 test2 shard1 core_node4) [c:test2 s:shard1 
r:core_node4 x:test2_shard1_replica_n2] o.a.s.u.PeerSyncWithLeader no frame of 
reference to tell if we've missed updates

Thanks,


Liam Newton

Mobile: 07500270375

Email: [email protected]<mailto:[email protected]>

Platform Engineer

Disclaimer

The information contained in this communication from the sender is 
confidential. It is intended solely for use by the recipient and others 
authorized to receive it. If you are not the recipient, you are hereby notified 
that any disclosure, copying, distribution or taking action in relation of the 
contents of this information is strictly prohibited and may be unlawful.

This email has been scanned for viruses and malware, and may have been 
automatically archived by Mimecast, a leader in email security and cyber 
resilience. Mimecast integrates email defenses with brand protection, security 
awareness training, web security, compliance and other essential capabilities. 
Mimecast helps protect large and small organizations from malicious activity, 
human error and technology failure; and to lead the movement toward building a 
more resilient world. To find out more, visit our website.

Reply via email to