Zookeeper is from the docker hub zookeeper:3.5.7 image. Below is our nifi.properties (with secrets and hostnames modified).
thanks! - Wyllys nifi.flow.configuration.file=/opt/nifi/nifi-current/latest_flow/nifi-0/flow.xml.gz nifi.flow.configuration.archive.enabled=true nifi.flow.configuration.archive.dir=/opt/nifi/nifi-current/archives nifi.flow.configuration.archive.max.time=30 days nifi.flow.configuration.archive.max.storage=500 MB nifi.flow.configuration.archive.max.count= nifi.flowcontroller.autoResumeState=false nifi.flowcontroller.graceful.shutdown.period=10 sec nifi.flowservice.writedelay.interval=500 ms nifi.administrative.yield.duration=30 sec nifi.bored.yield.duration=10 millis nifi.queue.backpressure.count=10000 nifi.queue.backpressure.size=1 GB nifi.authorizer.configuration.file=./conf/authorizers.xml nifi.login.identity.provider.configuration.file=./conf/login-identity-providers.xml nifi.templates.directory=/opt/nifi/nifi-current/templates nifi.ui.banner.text=KI Nifi Cluster nifi.ui.autorefresh.interval=30 sec nifi.nar.library.directory=./lib nifi.nar.library.autoload.directory=./extensions nifi.nar.working.directory=./work/nar/ nifi.documentation.working.directory=./work/docs/components nifi.state.management.configuration.file=./conf/state-management.xml nifi.state.management.provider.local=local-provider nifi.state.management.provider.cluster=zk-provider nifi.state.management.embedded.zookeeper.start=false nifi.state.management.embedded.zookeeper.properties=./conf/zookeeper.properties nifi.database.directory=./database_repository nifi.h2.url.append=;LOCK_TIMEOUT=25000;WRITE_DELAY=0;AUTO_SERVER=FALSE nifi.flowfile.repository.implementation=org.apache.nifi.controller.repository.WriteAheadFlowFileRepository nifi.flowfile.repository.wal.implementation=org.apache.nifi.wali.SequentialAccessWriteAheadLog nifi.flowfile.repository.directory=./flowfile_repository nifi.flowfile.repository.partitions=256 nifi.flowfile.repository.checkpoint.interval=2 mins nifi.flowfile.repository.always.sync=false nifi.flowfile.repository.encryption.key.provider.implementation= 
nifi.flowfile.repository.encryption.key.provider.location= nifi.flowfile.repository.encryption.key.id= nifi.flowfile.repository.encryption.key= nifi.swap.manager.implementation=org.apache.nifi.controller.FileSystemSwapManager nifi.queue.swap.threshold=20000 nifi.swap.in.period=5 sec nifi.swap.in.threads=1 nifi.swap.out.period=5 sec nifi.swap.out.threads=4 nifi.content.repository.implementation=org.apache.nifi.controller.repository.FileSystemRepository nifi.content.claim.max.appendable.size=1 MB nifi.content.claim.max.flow.files=100 nifi.content.repository.directory.default=./content_repository nifi.content.repository.archive.max.retention.period=12 hours nifi.content.repository.archive.max.usage.percentage=50% nifi.content.repository.archive.enabled=true nifi.content.repository.always.sync=false nifi.content.viewer.url=../nifi-content-viewer/ nifi.content.repository.encryption.key.provider.implementation= nifi.content.repository.encryption.key.provider.location= nifi.content.repository.encryption.key.id= nifi.content.repository.encryption.key= nifi.provenance.repository.implementation=org.apache.nifi.provenance.WriteAheadProvenanceRepository nifi.provenance.repository.debug.frequency=1_000_000 nifi.provenance.repository.encryption.key.provider.implementation= nifi.provenance.repository.encryption.key.provider.location= nifi.provenance.repository.encryption.key.id= nifi.provenance.repository.encryption.key= nifi.provenance.repository.directory.default=./provenance_repository nifi.provenance.repository.max.storage.time=7 days nifi.provenance.repository.max.storage.size=100 GB nifi.provenance.repository.rollover.time=120 secs nifi.provenance.repository.rollover.size=100 MB nifi.provenance.repository.query.threads=2 nifi.provenance.repository.index.threads=2 nifi.provenance.repository.compress.on.rollover=true nifi.provenance.repository.always.sync=false nifi.provenance.repository.indexed.fields=EventType, FlowFileUUID, Filename, ProcessorID, Relationship 
nifi.provenance.repository.indexed.attributes= nifi.provenance.repository.index.shard.size=4 GB nifi.provenance.repository.max.attribute.length=65536 nifi.provenance.repository.concurrent.merge.threads=2 nifi.provenance.repository.buffer.size=100000 nifi.components.status.repository.implementation=org.apache.nifi.controller.status.history.VolatileComponentStatusRepository nifi.components.status.repository.buffer.size=1440 nifi.components.status.snapshot.frequency=1 min nifi.remote.input.host=nifi-0.nifi.ki.svc.cluster.local nifi.remote.input.secure=true nifi.remote.input.socket.port=10000 nifi.remote.input.http.enabled=true nifi.remote.input.http.transaction.ttl=30 sec nifi.remote.contents.cache.expiration=30 secs nifi.web.war.directory=./lib nifi.web.http.host= nifi.web.http.port= nifi.web.http.network.interface.default= nifi.web.https.host=nifi-0.nifi.ki.svc.cluster.local nifi.web.https.port=8080 nifi.web.https.network.interface.default= nifi.web.jetty.working.directory=./work/jetty nifi.web.jetty.threads=200 nifi.web.max.header.size=16 KB nifi.web.proxy.context.path=/nifi-api,/nifi nifi.web.proxy.host=ingress.ourdomain.com nifi.sensitive.props.key= nifi.sensitive.props.key.protected= nifi.sensitive.props.algorithm=PBEWITHMD5AND256BITAES-CBC-OPENSSL nifi.sensitive.props.provider=BC nifi.sensitive.props.additional.keys= nifi.security.keystore=/opt/nifi/nifi-current/security/nifi-0.keystore.jks nifi.security.keystoreType=jks nifi.security.keystorePasswd=XXXXXXXXXXXXXXXX nifi.security.keyPasswd=XXXXXXXXXXXXXXXXX nifi.security.truststore=/opt/nifi/nifi-current/security/nifi-0.truststore.jks nifi.security.truststoreType=jks nifi.security.truststorePasswd=XXXXXXXXXXXXXXXXXXXXXXXXXXX nifi.security.user.authorizer=managed-authorizer nifi.security.user.login.identity.provider= nifi.security.ocsp.responder.url= nifi.security.ocsp.responder.certificate= nifi.security.user.oidc.discovery.url=https://keycloak-server-address/auth/realms/Test/.well-known/openid-configuration 
nifi.security.user.oidc.connect.timeout=15 secs nifi.security.user.oidc.read.timeout=15 secs nifi.security.user.oidc.client.id=nifi nifi.security.user.oidc.client.secret=XXXXXXXXXXXXXXXXXXXXX nifi.security.user.oidc.preferred.jwsalgorithm=RS512 nifi.security.user.oidc.additional.scopes= nifi.security.user.oidc.claim.identifying.user= nifi.security.user.knox.url= nifi.security.user.knox.publicKey= nifi.security.user.knox.cookieName=hadoop-jwt nifi.security.user.knox.audiences= nifi.cluster.protocol.heartbeat.interval=30 secs nifi.cluster.protocol.is.secure=true nifi.cluster.is.node=true nifi.cluster.node.address=nifi-0.nifi.ki.svc.cluster.local nifi.cluster.node.protocol.port=2882 nifi.cluster.node.protocol.threads=40 nifi.cluster.node.protocol.max.threads=50 nifi.cluster.node.event.history.size=25 nifi.cluster.node.connection.timeout=120 secs nifi.cluster.node.read.timeout=120 secs nifi.cluster.node.max.concurrent.requests=100 nifi.cluster.firewall.file= nifi.cluster.flow.election.max.wait.time=5 mins nifi.cluster.flow.election.max.candidates= nifi.cluster.load.balance.host=nifi-0.nifi.ki.svc.cluster.local nifi.cluster.load.balance.port=6342 nifi.cluster.load.balance.connections.per.node=4 nifi.cluster.load.balance.max.thread.count=8 nifi.cluster.load.balance.comms.timeout=30 sec nifi.zookeeper.connect.string=zk-0.zk-hs.ki.svc.cluster.local:2181,zk-1.zk-hs.ki.svc.cluster.local:2181,zk-2.zk-hs.ki.svc.cluster.local:2181 nifi.zookeeper.connect.timeout=30 secs nifi.zookeeper.session.timeout=30 secs nifi.zookeeper.root.node=/nifi nifi.zookeeper.auth.type= nifi.zookeeper.kerberos.removeHostFromPrincipal= nifi.zookeeper.kerberos.removeRealmFromPrincipal= nifi.kerberos.krb5.file= nifi.kerberos.service.principal= nifi.kerberos.service.keytab.location= nifi.kerberos.spnego.principal= nifi.kerberos.spnego.keytab.location= nifi.kerberos.spnego.authentication.expiration=12 hours nifi.variable.registry.properties= nifi.analytics.predict.enabled=false 
nifi.analytics.predict.interval=3 mins nifi.analytics.query.interval=5 mins nifi.analytics.connection.model.implementation=org.apache.nifi.controller.status.analytics.models.OrdinaryLeastSquares nifi.analytics.connection.model.score.name=rSquared nifi.analytics.connection.model.score.threshold=.90 ________________________________ From: Chris Sampson <chris.samp...@naimuri.com> Sent: Tuesday, September 29, 2020 12:41 PM To: users@nifi.apache.org <users@nifi.apache.org> Subject: Re: Clustered nifi issues Also, which version of zookeeper and what image (I've found different versions and images provided better stability)? Cheers, Chris Sampson On Tue, 29 Sep 2020, 17:34 Sushil Kumar, <skm....@gmail.com<mailto:skm....@gmail.com>> wrote: Hello Wyll It may be helpful if you can send nifi.properties. Thanks Sushil Kumar On Tue, Sep 29, 2020 at 7:58 AM Wyll Ingersoll <wyllys.ingers...@keepertech.com<mailto:wyllys.ingers...@keepertech.com>> wrote: I have a 3-node Nifi (1.11.4) cluster in kubernetes environment (as a StatefulSet) using external zookeeper (3 nodes also) to manage state. Whenever even 1 node (pod/container) goes down or is restarted, it can throw the whole cluster into a bad state that forces me to restart ALL of the pods in order to recover. This seems wrong. The problem seems to be that when the primary node goes away, the remaining 2 nodes don't ever try to take over. Instead, I have to restart all of them individually until one of them becomes the primary, then the other 2 eventually join and sync up. When one of the nodes is refusing to sync up, I often see these errors in the log and the only way to get it back into the cluster is to restart it. The node showing the errors below never seems to be able to rejoin or resync with the other 2 nodes. 
2020-09-29 10:18:53,324 ERROR [Reconnect to Cluster] o.a.nifi.controller.StandardFlowService Handling reconnection request failed due to: org.apache.nifi.cluster.ConnectionException: Failed to connect node to cluster due to: java.lang.NullPointerException org.apache.nifi.cluster.ConnectionException: Failed to connect node to cluster due to: java.lang.NullPointerException at org.apache.nifi.controller.StandardFlowService.loadFromConnectionResponse(StandardFlowService.java:1035) at org.apache.nifi.controller.StandardFlowService.handleReconnectionRequest(StandardFlowService.java:668) at org.apache.nifi.controller.StandardFlowService.access$200(StandardFlowService.java:109) at org.apache.nifi.controller.StandardFlowService$1.run(StandardFlowService.java:415) at java.lang.Thread.run(Thread.java:748) Caused by: java.lang.NullPointerException: null at org.apache.nifi.controller.StandardFlowService.loadFromConnectionResponse(StandardFlowService.java:989) ... 4 common frames omitted 2020-09-29 10:18:53,326 INFO [Reconnect to Cluster] o.a.c.f.imps.CuratorFrameworkImpl Starting 2020-09-29 10:18:53,327 INFO [Reconnect to Cluster] org.apache.zookeeper.ClientCnxnSocket jute.maxbuffer value is 4194304 Bytes 2020-09-29 10:18:53,328 INFO [Reconnect to Cluster] o.a.c.f.imps.CuratorFrameworkImpl Default schema 2020-09-29 10:18:53,807 INFO [Reconnect to Cluster-EventThread] o.a.c.f.state.ConnectionStateManager State change: CONNECTED 2020-09-29 10:18:53,809 INFO [Reconnect to Cluster-EventThread] o.a.c.framework.imps.EnsembleTracker New config event received: {server.1=zk-0.zk-hs.ki.svc.cluster.local:2888:3888:participant;0.0.0.0:2181<http://0.0.0.0:2181>, version=0, server.3=zk-2.zk-hs.ki.svc.cluster.local:2888:3888:participant;0.0.0.0:2181<http://0.0.0.0:2181>, server.2=zk-1.zk-hs.ki.svc.cluster.local:2888:3888:participant;0.0.0.0:2181<http://0.0.0.0:2181>} 2020-09-29 10:18:53,810 INFO [Curator-Framework-0] o.a.c.f.imps.CuratorFrameworkImpl backgroundOperationsLoop exiting 
2020-09-29 10:18:53,813 INFO [Reconnect to Cluster-EventThread] o.a.c.framework.imps.EnsembleTracker New config event received: {server.1=zk-0.zk-hs.ki.svc.cluster.local:2888:3888:participant;0.0.0.0:2181<http://0.0.0.0:2181>, version=0, server.3=zk-2.zk-hs.ki.svc.cluster.local:2888:3888:participant;0.0.0.0:2181<http://0.0.0.0:2181>, server.2=zk-1.zk-hs.ki.svc.cluster.local:2888:3888:participant;0.0.0.0:2181<http://0.0.0.0:2181>} 2020-09-29 10:18:54,323 INFO [Reconnect to Cluster] o.a.n.c.l.e.CuratorLeaderElectionManager Cannot unregister Leader Election Role 'Primary Node' becuase that role is not registered 2020-09-29 10:18:54,324 INFO [Reconnect to Cluster] o.a.n.c.l.e.CuratorLeaderElectionManager Cannot unregister Leader Election Role 'Cluster Coordinator' becuase that role is not registered