piby180 opened a new issue, #12888:
URL: https://github.com/apache/pinot/issues/12888
Hey all!
We are facing a serious high-availability issue despite having replication across all components.
# Cluster and Table Setup
Some context about our cluster configuration:
1. Our cluster is deployed on AWS EKS using the official Pinot Helm chart
2. Machine type: c6a.8xlarge (32 cores, 64 GB)
3. Machine count: 6
4. Pinot components: 3 servers, 3 controllers, 3 brokers, 3 minions, 5 ZooKeeper nodes
5. Resources: 3 * Server (4 cores, 14 GB, 1000 GB), 3 * Controller (4 cores, 14 GB, 100 GB), 3 * Broker (4 cores, 14 GB, 100 GB), 5 * ZooKeeper (4 cores, 14 GB, 100 GB)
6. Pinot version: 1.0.0. ZooKeeper version: 3.8.0-5a02a05eddb59aee6ac762f7ea82e92a68eb9c0f, built on 2022-02-25 08:49 UTC (deployed with the Pinot Helm chart)

Here is our values.yaml:
````
image:
  repository: ${CI_REGISTRY_IMAGE}
  tag: "${PINOT_IMAGE_TAG}"
  pullPolicy: Always

imagePullSecrets:
  - name: ${CI_PROJECT_NAME}

cluster:
  name: "${PINOT_CLUSTER_NAME}"

serviceAccount:
  annotations:
    eks.amazonaws.com/role-arn: "${PINOT_IRSA_ROLE_ARN}"
  name: "pinot"

probes:
  initialDelaySeconds: 300
  periodSeconds: 30

pinotAuth:
  enabled: true
  controllerFactoryClass: org.apache.pinot.controller.api.access.ZkBasicAuthAccessControlFactory
  brokerFactoryClass: org.apache.pinot.broker.broker.ZkBasicAuthAccessControlFactory
  configs:
    - access.control.principals=admin
    - access.control.principals.admin.password=${PINOT_ADMIN_PASSWORD}
    - access.control.init.username=admin
    - access.control.init.password=${PINOT_ADMIN_PASSWORD}

# ------------------------------------------------------------------------------
# Pinot Controller:
# ------------------------------------------------------------------------------
controller:
  replicaCount: 3
  probes:
    livenessEnabled: true
    readinessEnabled: true
  persistence:
    size: ${PINOT_CONTROLLER_VOL_SIZE}
    storageClass: ${PINOT_STORAGE_CLASS}
  data:
    dir: "${PINOT_SEGMENT_DIR}"
  podSecurityContext:
    fsGroupChangePolicy: Always
    runAsUser: 1000
    runAsGroup: 3000
    fsGroup: 3000
  securityContext:
    runAsNonRoot: true
    runAsUser: 1000
    runAsGroup: 3000
  jvmOpts: "-XX:+ExitOnOutOfMemoryError -Xms1G -Xmx14G -Djute.maxbuffer=100000000 -XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Xlog:gc*:file=/opt/pinot/gc-pinot-controller.log -javaagent:/opt/pinot/etc/jmx_prometheus_javaagent/jmx_prometheus_javaagent.jar=8008:/opt/pinot/etc/jmx_prometheus_javaagent/configs/pinot.yml"
  service:
    annotations:
      "prometheus.io/scrape": "true"
      "prometheus.io/port": "8008"
  external:
    enabled: false
  resources:
    requests:
      cpu: 4
      memory: "14Gi"
  nodeSelector:
    workload-type: ${PINOT_WORKLOAD_TYPE}
  podAnnotations:
    "prometheus.io/scrape": "true"
    "prometheus.io/port": "8008"
  extraEnv:
    - name: LOG4J_CONSOLE_LEVEL
      value: error
  # Extra configs will be appended to pinot-controller.conf file
  extra:
    configs: |-
      pinot.set.instance.id.to.hostname=true
      controller.task.scheduler.enabled=true
      controller.task.frequencyPeriod=1h
      access.control.init.username=admin
      access.control.init.password=${PINOT_ADMIN_PASSWORD}
      controller.local.temp.dir=/tmp/pinot-tmp-data/
      controller.allow.hlc.tables=false
      controller.enable.split.commit=true
      controller.realtime.segment.deepStoreUploadRetryEnabled=true
      controller.segment.fetcher.auth.token=${PINOT_AUTH_TOKEN}
      pinot.controller.storage.factory.s3.disableAcl=false
      pinot.controller.storage.factory.class.s3=org.apache.pinot.plugin.filesystem.S3PinotFS
      pinot.controller.storage.factory.s3.region=${AWS_S3_REGION}
      pinot.controller.storage.factory.s3.httpclient.maxConnections=100
      pinot.controller.storage.factory.s3.httpclient.socketTimeout=30s
      pinot.controller.storage.factory.s3.httpclient.connectionTimeout=2s
      pinot.controller.storage.factory.s3.httpclient.connectionTimeToLive=0s
      pinot.controller.storage.factory.s3.httpclient.connectionAcquisitionTimeout=10s
      pinot.controller.segment.fetcher.protocols=file,http,s3
      pinot.controller.segment.fetcher.s3.class=org.apache.pinot.common.utils.fetcher.PinotFSSegmentFetcher
      pinot.multistage.engine.enabled=true
      pinot.server.instance.currentDataTableVersion=4
      pinot.query.server.port=8421
      pinot.query.runner.port=8442
      pinot.query.scheduler.accounting.factory.name=org.apache.pinot.core.accounting.PerQueryCPUMemAccountantFactory
      pinot.query.scheduler.accounting.enable.thread.memory.sampling=true
      pinot.query.scheduler.accounting.enable.thread.cpu.sampling=true
      pinot.query.scheduler.accounting.oom.enable.killing.query=true
      pinot.query.scheduler.accounting.publishing.jvm.heap.usage=true

# ------------------------------------------------------------------------------
# Pinot Broker:
# ------------------------------------------------------------------------------
broker:
  replicaCount: 3
  jvmOpts: "-XX:+ExitOnOutOfMemoryError -Xms1G -Xmx14G -Djute.maxbuffer=100000000 -XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Xlog:gc*:file=/opt/pinot/gc-pinot-controller.log -javaagent:/opt/pinot/etc/jmx_prometheus_javaagent/jmx_prometheus_javaagent.jar=8008:/opt/pinot/etc/jmx_prometheus_javaagent/configs/pinot.yml"
  podSecurityContext:
    fsGroupChangePolicy: Always
    runAsUser: 1000
    runAsGroup: 3000
    fsGroup: 3000
  securityContext:
    runAsNonRoot: true
    runAsUser: 1000
    runAsGroup: 3000
  service:
    annotations:
      "prometheus.io/scrape": "true"
      "prometheus.io/port": "8008"
  external:
    enabled: false
  ingress:
    v1:
      enabled: true
      ingressClassName: ""
      annotations:
        kubernetes.io/ingress.class: alb
        alb.ingress.kubernetes.io/target-type: ip
        alb.ingress.kubernetes.io/target-group-attributes: stickiness.enabled=true,stickiness.lb_cookie.duration_seconds=60
        alb.ingress.kubernetes.io/certificate-arn: "${PINOT_BROKER_ALB_ACM_CERTIFICATE_ARN}"
        alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]'
        alb.ingress.kubernetes.io/load-balancer-attributes: access_logs.s3.enabled=false
        alb.ingress.kubernetes.io/scheme: internet-facing
        alb.ingress.kubernetes.io/security-groups: ${PINOT_BROKER_ALB_SECURITY_GROUP}
        alb.ingress.kubernetes.io/ssl-policy: ELBSecurityPolicy-TLS-1-2-Ext-2018-06
        alb.ingress.kubernetes.io/tags: ${PINOT_BROKER_ALB_TAGS}
      tls: []
      path: /
      hosts:
        - ${PINOT_BROKER_ALB_HOST}
  resources:
    requests:
      cpu: 4
      memory: "14Gi"
  nodeSelector:
    workload-type: ${PINOT_WORKLOAD_TYPE}
  podAnnotations:
    "prometheus.io/scrape": "true"
    "prometheus.io/port": "8008"
  extraEnv:
    - name: LOG4J_CONSOLE_LEVEL
      value: debug
  extra:
    configs: |-
      pinot.set.instance.id.to.hostname=true
      pinot.multistage.engine.enabled=true
      pinot.server.instance.currentDataTableVersion=4
      pinot.query.server.port=8421
      pinot.query.runner.port=8442
      pinot.broker.enable.query.cancellation=true
      pinot.query.scheduler.accounting.factory.name=org.apache.pinot.core.accounting.PerQueryCPUMemAccountantFactory
      pinot.query.scheduler.accounting.enable.thread.memory.sampling=true
      pinot.query.scheduler.accounting.enable.thread.cpu.sampling=true
      pinot.query.scheduler.accounting.oom.enable.killing.query=true
      pinot.query.scheduler.accounting.publishing.jvm.heap.usage=true

# ------------------------------------------------------------------------------
# Pinot Server:
# ------------------------------------------------------------------------------
server:
  replicaCount: 3
  probes:
    livenessEnabled: true
    readinessEnabled: true
  podSecurityContext:
    fsGroupChangePolicy: Always
    runAsUser: 1000
    runAsGroup: 3000
    fsGroup: 3000
  securityContext:
    runAsNonRoot: true
    runAsUser: 1000
    runAsGroup: 3000
  persistence:
    size: ${PINOT_SERVER_VOL_SIZE}
    storageClass: ${PINOT_STORAGE_CLASS}
  jvmOpts: "-XX:+ExitOnOutOfMemoryError -Xms1G -Xmx6G -Djute.maxbuffer=100000000 -XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Xlog:gc*:file=/opt/pinot/gc-pinot-controller.log -javaagent:/opt/pinot/etc/jmx_prometheus_javaagent/jmx_prometheus_javaagent.jar=8008:/opt/pinot/etc/jmx_prometheus_javaagent/configs/pinot.yml"
  resources:
    requests:
      cpu: 4
      memory: "14Gi"
  nodeSelector:
    workload-type: ${PINOT_WORKLOAD_TYPE}
  podAnnotations:
    "prometheus.io/scrape": "true"
    "prometheus.io/port": "8008"
  extraEnv:
    - name: LOG4J_CONSOLE_LEVEL
      value: error
  extra:
    configs: |-
      pinot.set.instance.id.to.hostname=true
      pinot.server.instance.realtime.alloc.offheap=true
      pinot.server.instance.enable.split.commit=true
      realtime.segment.serverUploadToDeepStore=true
      pinot.server.instance.segment.store.uri=${PINOT_SEGMENT_DIR}
      pinot.server.storage.factory.s3.disableAcl=false
      pinot.server.storage.factory.class.s3=org.apache.pinot.plugin.filesystem.S3PinotFS
      pinot.server.storage.factory.s3.region=${AWS_S3_REGION}
      pinot.server.segment.fetcher.protocols=file,http,s3
      pinot.server.storage.factory.s3.httpclient.maxConnections=1000
      pinot.server.storage.factory.s3.httpclient.socketTimeout=30s
      pinot.server.storage.factory.s3.httpclient.connectionTimeout=2s
      pinot.server.storage.factory.s3.httpclient.connectionTimeToLive=0s
      pinot.server.storage.factory.s3.httpclient.connectionAcquisitionTimeout=10s
      pinot.server.segment.fetcher.s3.class=org.apache.pinot.common.utils.fetcher.PinotFSSegmentFetcher
      pinot.server.segment.fetcher.auth.token=${PINOT_AUTH_TOKEN}
      pinot.server.segment.uploader.auth.token=${PINOT_AUTH_TOKEN}
      pinot.server.instance.auth.token=${PINOT_AUTH_TOKEN}
      pinot.multistage.engine.enabled=true
      pinot.server.instance.currentDataTableVersion=4
      pinot.query.server.port=8421
      pinot.query.runner.port=8442
      pinot.server.enable.query.cancellation=true
      pinot.query.scheduler.accounting.factory.name=org.apache.pinot.core.accounting.PerQueryCPUMemAccountantFactory
      pinot.query.scheduler.accounting.enable.thread.memory.sampling=true
      pinot.query.scheduler.accounting.enable.thread.cpu.sampling=true
      pinot.query.scheduler.accounting.oom.enable.killing.query=true
      pinot.query.scheduler.accounting.publishing.jvm.heap.usage=true

# ------------------------------------------------------------------------------
# Pinot Minion:
# ------------------------------------------------------------------------------
minionStateless:
  enabled: false

minion:
  enabled: true
  replicaCount: 3
  dataDir: "${PINOT_MINION_DATA_DIR}"
  jvmOpts: "-XX:+ExitOnOutOfMemoryError -Xms1G -Xmx8G -Djute.maxbuffer=100000000 -XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Xlog:gc*:file=/opt/pinot/gc-pinot-controller.log"
  podSecurityContext:
    fsGroupChangePolicy: Always
    runAsUser: 1000
    runAsGroup: 3000
    fsGroup: 3000
  securityContext:
    runAsNonRoot: true
    runAsUser: 1000
    runAsGroup: 3000
  persistence:
    enabled: true
    accessMode: ReadWriteOnce
    size: ${PINOT_MINION_VOL_SIZE}
    storageClass: ${PINOT_STORAGE_CLASS}
  resources:
    requests:
      cpu: 4
      memory: "14Gi"
  nodeSelector:
    workload-type: ${PINOT_WORKLOAD_TYPE}
  podAnnotations:
    "prometheus.io/scrape": "true"
    "prometheus.io/port": "8008"
  extraEnv:
    - name: LOG4J_CONSOLE_LEVEL
      value: error
  extra:
    configs: |-
      pinot.set.instance.id.to.hostname=true
      pinot.minion.storage.factory.class.s3=org.apache.pinot.plugin.filesystem.S3PinotFS
      pinot.minion.storage.factory.s3.region=${AWS_S3_REGION}
      pinot.minion.segment.fetcher.protocols=file,http,s3
      pinot.minion.segment.fetcher.s3.class=org.apache.pinot.common.utils.fetcher.PinotFSSegmentFetcher
      pinot.minion.storage.factory.s3.httpclient.maxConnections=1000
      pinot.minion.storage.factory.s3.httpclient.socketTimeout=30s
      pinot.minion.storage.factory.s3.httpclient.connectionTimeout=2s
      pinot.minion.storage.factory.s3.httpclient.connectionTimeToLive=0s
      pinot.minion.storage.factory.s3.httpclient.connectionAcquisitionTimeout=10s
      segment.fetcher.auth.token=${PINOT_AUTH_TOKEN}
      task.auth.token=${PINOT_AUTH_TOKEN}
      pinot.multistage.engine.enabled=true
      pinot.server.instance.currentDataTableVersion=4
      pinot.query.server.port=8421
      pinot.query.runner.port=8442
      pinot.query.scheduler.accounting.factory.name=org.apache.pinot.core.accounting.PerQueryCPUMemAccountantFactory
      pinot.query.scheduler.accounting.enable.thread.memory.sampling=true
      pinot.query.scheduler.accounting.enable.thread.cpu.sampling=true
      pinot.query.scheduler.accounting.oom.enable.killing.query=true
      pinot.query.scheduler.accounting.publishing.jvm.heap.usage=true

zookeeper:
  enabled: true
  urlOverride: "my-zookeeper:2181/my-pinot"
  port: 2181
  podSecurityContext:
    runAsNonRoot: true
    runAsUser: 1000
    fsGroup: 1001
  containerSecurityContext:
    runAsNonRoot: true
    runAsUser: 1000
  env:
    # https://github.com/mrbobbytables/zookeeper/blob/master/README.md
    ZOO_HEAP_SIZE: "10G"
    ZOOKEEPER_LOG_STDOUT_THRESHOLD: "ERROR"
    JAVA_OPTS: "-XX:+ExitOnOutOfMemoryError -Xms4G -Xmx10G -Djute.maxbuffer=100000000 -XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Xlog:gc*:file=/opt/bitnami/zookeeper/logs/gc-pinot-zookeeper.log"
  resources:
    requests:
      cpu: 4
      memory: "14Gi"
  replicaCount: 3
  persistence:
    enabled: true
    size: ${PINOT_ZOOKEEPER_VOL_SIZE}
    storageClass: ${PINOT_STORAGE_CLASS}
  image:
    PullPolicy: "IfNotPresent"
  nodeSelector:
    workload-type: ${PINOT_WORKLOAD_TYPE}

# References
# https://docs.pinot.apache.org/operators/operating-pinot/oom-protection-using-automatic-query-killing
# https://docs.pinot.apache.org/operators/tutorials/deployment-pinot-on-kubernetes
# https://startree.ai/blog/capacity-planning-in-apache-pinot-part-1
# https://startree.ai/blog/capacity-planning-in-apache-pinot-part-2
````
We query Pinot in three ways:
1. Via Superset, with the URL: pinot://<username>:<password>@<pinot-broker-host>:<pinot-broker-port>/query/sql?controller=http://<pinot-controller-host>:<pinot-controller-port>/verify_ssl=true
2. Via the Pinot broker ingress load balancer (deployed with the Pinot Helm chart)
3. Via the Pinot admin UI, accessible by port-forwarding the pinot-controller service object
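For reference, a request through the broker (directly or via the load balancer) looks roughly like the sketch below. This is an illustration only: the host and credentials are placeholders, and we assume the broker's standard /query/sql endpoint with basic auth.
````python
# Minimal sketch of querying the broker's SQL endpoint.
# Host, port, and credentials below are placeholders, not our real values.
import requests

BROKER = "https://<pinot-broker-host>:443"  # the ALB in front of the brokers

resp = requests.post(
    f"{BROKER}/query/sql",
    json={"sql": "SELECT COUNT(*) FROM pinot_metadata_feeds"},
    auth=("admin", "<PINOT_ADMIN_PASSWORD>"),
    timeout=10,
)
resp.raise_for_status()
print(resp.json()["resultTable"]["rows"])
````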
Our standard offline tables have the following config:
````
{
  "tableName": "pinot_metadata_feeds",
  "tableType": "OFFLINE",
  "segmentsConfig": {
    "retentionTimeUnit": "DAYS",
    "retentionTimeValue": "365",
    "schemaName": "pinot_metadata_feeds",
    "replication": "3",
    "replicasPerPartition": "3",
    "segmentPushType": "APPEND",
    "segmentAssignmentStrategy": "BalanceNumSegmentAssignmentStrategy"
  },
  "ingestionConfig": {},
  "task": {
    "taskTypeConfigsMap": {
      "MergeRollupTask": {
        "1day.mergeType": "concat",
        "1day.bucketTimePeriod": "1d",
        "1day.bufferTimePeriod": "1d"
      }
    }
  },
  "tenants": {},
  "tableIndexConfig": {
    "loadMode": "MMAP",
    "nullHandlingEnabled": "true"
  },
  "metadata": {
    "customConfigs": {}
  }
}
````
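With "replication": "3" we expect every segment to be served by three servers. A quick way we sanity-check this is to read the table's external view from the controller (a sketch only; it assumes the standard /tables/{tableName}/externalview controller endpoint, with placeholder host and credentials):
````python
# Sanity-check sketch: confirm every OFFLINE segment has 3 ONLINE replicas
# in the external view. Controller host and credentials are placeholders.
import requests

CONTROLLER = "http://<pinot-controller-host>:9000"

ev = requests.get(
    f"{CONTROLLER}/tables/pinot_metadata_feeds/externalview",
    auth=("admin", "<PINOT_ADMIN_PASSWORD>"),
    timeout=10,
).json()

for segment, replicas in ev.get("OFFLINE", {}).items():
    online = [srv for srv, state in replicas.items() if state == "ONLINE"]
    if len(online) < 3:
        print(f"{segment}: only {len(online)} ONLINE replicas -> {replicas}")
````
In our cluster this check passes: every segment does have three ONLINE replicas, which is why the query failures below surprised us.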
# Problem
When a broker or server pod gets restarted during a cluster update, or when our ops team makes changes to the Kubernetes cluster, some of our queries fail.
With multistage disabled: the queries appear to be routed in round-robin fashion. If we retry the same query 5 times, it fails 1-2 times, when it reaches the server or broker pod that is restarting, and succeeds the other 3-4 times, when it reaches healthy broker/server pods.
With multistage enabled: the queries almost always fail while one of the broker or server pods is restarting. It seems the queries fan out to all servers.
Disabling multistage is not an option for us, since we use joins in some queries.
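The failure pattern is easy to reproduce by replaying the same query in a loop while a server pod restarts. A minimal sketch (broker URL and credentials are placeholders):
````python
# Rough reproduction sketch: fire the same query 5 times and count failures
# while one server pod is restarting. Broker URL and credentials are placeholders.
import requests

BROKER = "https://<pinot-broker-host>:443"
SQL = "SELECT COUNT(*) FROM pinot_metadata_feeds"

ok, failed = 0, 0
for _ in range(5):
    try:
        r = requests.post(f"{BROKER}/query/sql",
                          json={"sql": SQL},
                          auth=("admin", "<PINOT_ADMIN_PASSWORD>"),
                          timeout=10)
        body = r.json()
        if r.status_code == 200 and not body.get("exceptions"):
            ok += 1
        else:
            failed += 1
    except requests.RequestException:
        failed += 1

# Observed: single-stage typically gives ok=3-4 / failed=1-2 (round-robin
# occasionally hits the restarting pod); multistage gives failed=5 almost
# every time, since each query fans out to all servers.
print(f"ok={ok} failed={failed}")
````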
The error log we get when, for example, server-2 is restarting:
````
Error dispatching query to server=pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local@{8421,8442} stage=1
org.apache.pinot.query.service.dispatch.QueryDispatcher.submit(QueryDispatcher.java:144)
org.apache.pinot.query.service.dispatch.QueryDispatcher.submitAndReduce(QueryDispatcher.java:93)
org.apache.pinot.broker.requesthandler.MultiStageBrokerRequestHandler.handleRequest(MultiStageBrokerRequestHandler.java:179)
org.apache.pinot.broker.requesthandler.BaseBrokerRequestHandler.handleRequest(BaseBrokerRequestHandler.java:263)
UNAVAILABLE: Unable to resolve host pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local
io.grpc.Status.asRuntimeException(Status.java:539)
io.grpc.stub.ClientCalls$StreamObserverToCallListenerAdapter.onClose(ClientCalls.java:487)
io.grpc.internal.ClientCallImpl.closeObserver(ClientCallImpl.java:576)
io.grpc.internal.ClientCallImpl.access$300(ClientCallImpl.java:70)
java.net.UnknownHostException: pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local: Name or service not known
io.grpc.internal.DnsNameResolver.resolveAddresses(DnsNameResolver.java:223)
io.grpc.internal.DnsNameResolver.doResolve(DnsNameResolver.java:282)
io.grpc.internal.DnsNameResolver$Resolve.run(DnsNameResolver.java:318)
java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local: Name or service not known
java.base/java.net.Inet6AddressImpl.lookupAllHostAddr(Native Method)
java.base/java.net.InetAddress$PlatformNameService.lookupAllHostAddr(InetAddress.java:930)
java.base/java.net.InetAddress.getAddressesFromNameService(InetAddress.java:1543)
java.base/java.net.InetAddress$NameServiceAddresses.get(InetAddress.java:848)
````
Full query response in the Pinot UI for a successful query with multistage enabled (here you can see the query being routed to multiple servers):
SELECT COUNT(*) FROM pinot_metadata_feeds;
```
{
  "resultTable": {
    "dataSchema": {
      "columnNames": [
        "EXPR$0"
      ],
      "columnDataTypes": [
        "LONG"
      ]
    },
    "rows": [
      [
        1206170
      ]
    ]
  },
  "requestId": "92319303000002827",
  "stageStats": {
    "1": {
      "numBlocks": 6,
      "numRows": 3,
      "stageExecutionTimeMs": 266,
      "stageExecutionUnit": 3,
      "stageExecWallTimeMs": 89,
      "operatorStats": {
        "aggregateoperator_...@pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local:8442": {
          "numBlocks": "2",
          "numRows": "1",
          "operatorExecutionTimeMs": "88",
          "operatorExecStartTimeMs": "1712833326722",
          "operatorId": "aggregateoperator_...@pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local:8442",
          "operatorExecEndTimeMs": "1712833326811"
        },
        "mailboxsendoperator_...@pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local:8442": {
          "numBlocks": "1",
          "numRows": "1",
          "operatorExecutionTimeMs": "90",
          "operatorExecStartTimeMs": "1712833326722",
          "operatorId": "mailboxsendoperator_...@pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local:8442",
          "operatorExecEndTimeMs": "1712833326811"
        },
        "mailboxreceiveoperator_...@pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local:8442": {
          "numBlocks": "3",
          "numRows": "1",
          "operatorExecutionTimeMs": "88",
          "operatorExecStartTimeMs": "1712833326722",
          "operatorId": "mailboxreceiveoperator_...@pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local:8442",
          "operatorExecEndTimeMs": "1712833326811"
        }
      }
    },
    "2": {
      "numBlocks": 3,
      "numRows": 2,
      "stageExecutionTimeMs": 142,
      "stageExecutionUnit": 2,
      "stageExecWallTimeMs": 72,
      "numSegmentsQueried": 14144,
      "numSegmentsProcessed": 14144,
      "numSegmentsMatched": 14144,
      "numDocsScanned": 1206170,
      "totalDocs": 1206170,
      "traceInfo": {
        "pinot_metadata_feeds": "[{\"0\":[{\"SegmentPrunerService Time\":8},{\"CombinePlanNode Time\":5},{\"AggregationCombineOperator Time\":35},{\"StreamingInstanceResponseOperator Time\":36}]},{\"0_0\":[]},{\"0_1\":[]},{\"0_2\":[{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0},...{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0}]}]",
        "operatorExecutionTimeMs": "71",
        "operatorExecStartTimeMs": "1712833326730",
        "numSegmentsPrunedByServer": "0",
        "numSegmentsProcessed": "14144",
        "operatorId": "leafstagetransferableblockoperator_...@pinot-server-0.pinot-server-headless.pinot-dev.svc.cluster.local:8442",
        "numEntriesScannedInFilter": "0",
        "table": "pinot_metadata_feeds"
      },
      "tableNames": [
        "pinot_metadata_feeds"
      ]
    }
  },
  "exceptions": [],
  "numServersQueried": 0,
  "numServersResponded": 0,
  "numSegmentsQueried": 14144,
  "numSegmentsProcessed": 14144,
  "numSegmentsMatched": 14144,
  "numConsumingSegmentsQueried": 0,
  "numConsumingSegmentsProcessed": 0,
  "numConsumingSegmentsMatched": 0,
  "numDocsScanned": 1206170,
  "numEntriesScannedInFilter": 0,
  "numEntriesScannedPostFilter": 0,
  "numGroupsLimitReached": false,
  "totalDocs": 1206170,
  "timeUsedMs": 172,
  "offlineThreadCpuTimeNs": 0,
  "realtimeThreadCpuTimeNs": 0,
  "offlineSystemActivitiesCpuTimeNs": 0,
  "realtimeSystemActivitiesCpuTimeNs": 0,
  "offlineResponseSerializationCpuTimeNs": 0,
  "realtimeResponseSerializationCpuTimeNs": 0,
  "offlineTotalCpuTimeNs": 0,
  "realtimeTotalCpuTimeNs": 0,
  "segmentStatistics": [],
  "traceInfo": {},
  "minConsumingFreshnessTimeMs": 0,
  "numSegmentsPrunedByBroker": 0,
  "numSegmentsPrunedByServer": 0,
  "numSegmentsPrunedInvalid": 0,
  "numSegmentsPrunedByLimit": 0,
  "numSegmentsPrunedByValue": 0,
  "explainPlanNumEmptyFilterSegments": 0,
  "explainPlanNumMatchAllFilterSegments": 0,
  "brokerId": "Broker_pinot-broker-0.pinot-broker-headless.pinot-dev.svc.cluster.local_8099",
  "brokerReduceTimeMs": 88,
  "numRowsResultSet": 1
}
```
Full query response in the Pinot UI for a successful query with multistage disabled:
SELECT COUNT(*) FROM pinot_metadata_feeds;
````
{
  "resultTable": {
    "dataSchema": {
      "columnNames": [
        "count(*)"
      ],
      "columnDataTypes": [
        "LONG"
      ]
    },
    "rows": [
      [
        1206170
      ]
    ]
  },
  "requestId": "132994053000000022",
  "brokerId": "Broker_pinot-broker-2.pinot-broker-headless.pinot-dev.svc.cluster.local_8099",
  "exceptions": [],
  "numServersQueried": 1,
  "numServersResponded": 1,
  "numSegmentsQueried": 14144,
  "numSegmentsProcessed": 14144,
  "numSegmentsMatched": 14144,
  "numConsumingSegmentsQueried": 0,
  "numConsumingSegmentsProcessed": 0,
  "numConsumingSegmentsMatched": 0,
  "numDocsScanned": 1206170,
  "numEntriesScannedInFilter": 0,
  "numEntriesScannedPostFilter": 0,
  "numGroupsLimitReached": false,
  "totalDocs": 1206170,
  "timeUsedMs": 189,
  "offlineThreadCpuTimeNs": 0,
  "realtimeThreadCpuTimeNs": 0,
  "offlineSystemActivitiesCpuTimeNs": 0,
  "realtimeSystemActivitiesCpuTimeNs": 0,
  "offlineResponseSerializationCpuTimeNs": 0,
  "realtimeResponseSerializationCpuTimeNs": 0,
  "offlineTotalCpuTimeNs": 0,
  "realtimeTotalCpuTimeNs": 0,
  "brokerReduceTimeMs": 0,
  "segmentStatistics": [],
  "traceInfo": {
    "pinot-server-2_O": "[{\"0\":[{\"SegmentPrunerService Time\":6},{\"CombinePlanNode Time\":3},{\"AggregationCombineOperator Time\":49},{\"InstanceResponseOperator Time\":49}]},{\"0_0\":[]},{\"0_1\":[]},{\"0_2\":[{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0},...{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0}]}]"
  },
  "minConsumingFreshnessTimeMs": 0,
  "numSegmentsPrunedByBroker": 0,
  "numSegmentsPrunedByServer": 0,
  "numSegmentsPrunedInvalid": 0,
  "numSegmentsPrunedByLimit": 0,
  "numSegmentsPrunedByValue": 0,
  "explainPlanNumEmptyFilterSegments": 0,
  "explainPlanNumMatchAllFilterSegments": 0,
  "numRowsResultSet": 1
}
````
# Expectation
Since we have 3 replicas of every segment and 3 replicas of every component, Pinot should route queries only to healthy broker/server pods, and a query should not fail when 1 out of 3 server/broker pods is unavailable.
# Solutions tried so far
1. Adaptive server selection: https://docs.pinot.apache.org/operators/operating-pinot/tuning/query-routing-using-adaptive-server-selection
We added the following to the broker config:
````
pinot.broker.adaptive.server.selector.enable.stats.collection=true
pinot.broker.adaptive.server.selector.type=HYBRID
````
2. Enabled replica groups using the default tags:
```
{
  "tableName": "pinot_metadata_feeds",
  "tableType": "OFFLINE",
  "quota": {
    "maxQueriesPerSecond": 300,
    "storage": "140G"
  },
  "routing": {
    "segmentPrunerTypes": ["partition"],
    "instanceSelectorType": "replicaGroup"
  },
  "segmentsConfig": {
    "retentionTimeUnit": "DAYS",
    "retentionTimeValue": "3650",
    "schemaName": "pinot_metadata_feeds",
    "replication": "3",
    "replicasPerPartition": "1",
    "segmentPushType": "APPEND",
    "segmentAssignmentStrategy": "BalanceNumSegmentAssignmentStrategy"
  },
  "ingestionConfig": {},
  "task": {
    "taskTypeConfigsMap": {
      "MergeRollupTask": {
        "1day.mergeType": "concat",
        "1day.bucketTimePeriod": "1d",
        "1day.bufferTimePeriod": "1d"
      }
    }
  },
  "tenants": {},
  "tableIndexConfig": {
    "loadMode": "MMAP",
    "nullHandlingEnabled": "true"
  },
  "instanceAssignmentConfigMap": {
    "OFFLINE": {
      "tagPoolConfig": {
        "tag": "DefaultTenant_OFFLINE"
      },
      "replicaGroupPartitionConfig": {
        "replicaGroupBased": true,
        "numInstances": 3,
        "numReplicaGroups": 3,
        "numInstancesPerReplicaGroup": 1
      }
    }
  },
  "metadata": {
    "customConfigs": {}
  }
}
```
3. Upgrading from 1.0.0 to 1.1.0: we will test this in the evening.
As of now, Pinot is not highly available for us, despite our following all the best practices around replication. This worries us, because we could face downtime any day if Kubernetes randomly restarts a single pod, which is not uncommon. The issue is quite important for us, as our base assumption was that Pinot is highly available.
Any help is much appreciated!
Thanks!
FYI: This issue was first discussed on Slack [here](https://apache-pinot.slack.com/archives/C011C9JHN7R/p1712665057902059)