gianm commented on a change in pull request #8672: Druid Doctor URL: https://github.com/apache/incubator-druid/pull/8672#discussion_r335256239
########## File path: web-console/src/dialogs/doctor-dialog/doctor-checks.tsx ########## @@ -0,0 +1,421 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import axios from 'axios'; + +import { pluralIfNeeded, queryDruidSql } from '../../utils'; +import { deepGet } from '../../utils/object-change'; +import { postToSampler } from '../../utils/sampler'; + +export interface CheckControls { + addSuggestion: (message: string) => void; + addIssue: (message: string) => void; + terminateChecks: () => void; +} + +export interface DoctorCheck { + name: string; + check: (controls: CheckControls) => Promise<void>; +} + +const RUNTIME_PROPERTIES_ALL_NODES_MUST_AGREE_ON: string[] = [ + 'user.timezone', + 'druid.zk.service.host', +]; + +const RUNTIME_PROPERTIES_ALL_NODES_SHOULD_AGREE_ON: string[] = ['java.version']; + +// In the future (when we can query other nodes) it will also be cool to check: +// 'druid.storage.type' <=> historicals, overlords, mm +// 'druid.indexer.logs.type' <=> overlord, mm, + peons + +const RUNTIME_PROPERTIES_MASTER_NODES_SHOULD_AGREE_ON: string[] = [ + 'druid.metadata.storage.type', // overlord + coordinator + 'druid.metadata.storage.connector.connectURI', +]; + +export const DOCTOR_CHECKS: 
DoctorCheck[] = [ + // ------------------------------------- + // Self (router) checks + // ------------------------------------- + { + name: 'Verify own status', + check: async controls => { + // Make sure that the router responds to /status and gives some valid info back + let status: any; + try { + status = (await axios.get(`/status`)).data; + } catch (e) { + controls.addIssue( + `Did not get a /status response, is the cluster running? Got: ${e.message}`, + ); + controls.terminateChecks(); + return; + } + + if (typeof status.version !== 'string') { + controls.addIssue('Could not get a valid /status response.'); + } + }, + }, + { + name: 'Verify own runtime properties', + check: async controls => { + // Make sure that everything in /status/properties is above board + let properties: Record<string, string>; + try { + properties = (await axios.get(`/status/properties`)).data; + } catch (e) { + controls.addIssue('Did not get a /status/properties response, something must be broken.'); + return; + } + + // Check that the management proxy is on, it really should be for someone to access the console in the first place but everything could happen + if (properties['druid.router.managementProxy.enabled'] !== 'true') { + controls.addIssue( + `The router's "druid.router.managementProxy.enabled" is not reported as "true" that is unusual.`, + ); + } + + // Check that the underlying Java is Java 8 the only officially supported Java version at the moment. 
+ if ( + properties['java.runtime.version'] && + !properties['java.runtime.version'].startsWith('1.8') + ) { + controls.addSuggestion( + `It looks like are running Java ${properties['java.runtime.version']}, Druid only officially supports Java 1.8.x`, + ); + } + + // Check that "user.timezone" is set to UTC + if (properties['user.timezone'] && properties['user.timezone'] !== 'UTC') { + controls.addSuggestion( + `It looks like "user.timezone" is set to ${properties['user.timezone']}, it is recommended to set this to "UTC"`, + ); + } + }, + }, + + // ------------------------------------- + // Coordinator and Overlord + // ------------------------------------- + { + name: 'Verify the Coordinator and Overlord status', + check: async controls => { + // Make sure that everything in Coordinator's /status is good + let myStatus: any; + try { + myStatus = (await axios.get(`/status`)).data; + } catch { + return; + } + + let coordinatorStatus: any; + try { + coordinatorStatus = (await axios.get(`/proxy/coordinator/status`)).data; + } catch (e) { + controls.addIssue('Did not get a /status response from the coordinator, is it running?'); + return; + } + + let overlordStatus: any; + try { + overlordStatus = (await axios.get(`/proxy/overlord/status`)).data; + } catch (e) { + controls.addIssue('Did not get a /status response from the overlord, is it running?'); + return; + } + + if (myStatus.version !== coordinatorStatus.version) { + controls.addSuggestion( + `It looks like the Router and Coordinator nodes are on different versions of Druid, are you in the middle of a rolling upgrade?`, Review comment: Putting into the format I suggested above, this would be: It looks like the Router and Coordinator nodes are running different versions of Druid. This may indicate a problem if you are not in the middle of a rolling upgrade. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@druid.apache.org For additional commands, e-mail: commits-h...@druid.apache.org