Skip to content

Commit 6120775

Browse files
authored
feat(network node livecheck): implement a functionality to determine network node activeness with prometheus metrics (#567)
Signed-off-by: instamenta <[email protected]>
1 parent 05c5517 commit 6120775

File tree

3 files changed

+149
-122
lines changed

3 files changed

+149
-122
lines changed

src/commands/node.mjs

Lines changed: 102 additions & 120 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,10 @@ import {
5555
DEFAULT_NETWORK_NODE_NAME,
5656
FREEZE_ADMIN_ACCOUNT,
5757
HEDERA_NODE_DEFAULT_STAKE_AMOUNT,
58-
TREASURY_ACCOUNT_ID
58+
TREASURY_ACCOUNT_ID,
59+
LOCAL_HOST
5960
} from '../core/constants.mjs'
61+
import { NodeStatusCodes, NodeStatusEnums } from '../core/enumerations.mjs'
6062

6163
/**
6264
* Defines the core functionalities of 'node' command
@@ -360,72 +362,116 @@ export class NodeCommand extends BaseCommand {
360362
}
361363

362364
/**
365+
* @param {string} namespace
363366
* @param {string} nodeId
364-
* @param {number} [maxAttempt]
365-
* @param {string} [status]
366-
* @param {string} [logfile]
367-
* @returns {Promise<boolean>}
367+
* @param {TaskWrapper} task
368+
* @param {string} title
369+
* @param {number} index
370+
* @param {number} [status]
371+
* @param {number} [maxAttempts]
372+
* @param {number} [delay]
373+
* @param {number} [timeout]
374+
* @returns {Promise<string>}
368375
*/
369-
async checkNetworkNodeState (nodeId, maxAttempt = 100, status = 'ACTIVE', logfile = 'output/hgcaa.log') {
376+
async checkNetworkNodeActiveness (namespace, nodeId, task, title, index,
377+
status = NodeStatusCodes.ACTIVE, maxAttempts = 120, delay = 1_000, timeout = 1_000) {
370378
nodeId = nodeId.trim()
371379
const podName = Templates.renderNetworkPodName(nodeId)
372-
const logfilePath = `${constants.HEDERA_HAPI_PATH}/${logfile}`
380+
const podPort = 9_999
381+
const localPort = 19_000 + index
382+
task.title = `${title} - status ${chalk.yellow('STARTING')}, attempt ${chalk.blueBright(`0/${maxAttempts}`)}`
383+
384+
const srv = await this.k8.portForward(podName, localPort, podPort)
385+
373386
let attempt = 0
374-
let isActive = false
387+
let success = false
388+
while (attempt < maxAttempts) {
389+
const controller = new AbortController()
390+
391+
const timeoutId = setTimeout(() => {
392+
task.title = `${title} - status ${chalk.yellow('TIMEOUT')}, attempt ${chalk.blueBright(`${attempt}/${maxAttempts}`)}`
393+
controller.abort()
394+
}, timeout)
375395

376-
this.logger.debug(`Checking if node ${nodeId} is ${status}...`)
377-
// check log file is accessible
378-
let logFileAccessible = false
379-
while (attempt++ < maxAttempt) {
380396
try {
381-
if (await this.k8.hasFile(podName, constants.ROOT_CONTAINER, logfilePath)) {
382-
logFileAccessible = true
383-
break
397+
const url = `http://${LOCAL_HOST}:${localPort}/metrics`
398+
const response = await fetch(url, { signal: controller.signal })
399+
400+
if (!response.ok) {
401+
task.title = `${title} - status ${chalk.yellow('UNKNOWN')}, attempt ${chalk.blueBright(`${attempt}/${maxAttempts}`)}`
402+
clearTimeout(timeoutId)
403+
throw new Error() // Guard
384404
}
385-
} catch (e) {
386-
} // ignore errors
387405

388-
await sleep(1000)
389-
}
406+
const text = await response.text()
407+
const statusLine = text
408+
.split('\n')
409+
.find(line => line.startsWith('platform_PlatformStatus'))
390410

391-
if (!logFileAccessible) {
392-
throw new FullstackTestingError(`Logs are not accessible: ${logfilePath}`)
393-
}
411+
if (!statusLine) {
412+
task.title = `${title} - status ${chalk.yellow('STARTING')}, attempt: ${chalk.blueBright(`${attempt}/${maxAttempts}`)}`
413+
clearTimeout(timeoutId)
414+
throw new Error() // Guard
415+
}
394416

395-
attempt = 0
396-
while (attempt < maxAttempt) {
397-
try {
398-
const output = await this.k8.execContainer(podName, constants.ROOT_CONTAINER, ['tail', '-100', logfilePath])
399-
if (output && output.indexOf('Terminating Netty') < 0 && // make sure we are not at the beginning of a restart
400-
(output.indexOf(`Now current platform status = ${status}`) > 0 ||
401-
output.indexOf(`Platform Status Change ${status}`) > 0 ||
402-
output.indexOf(`is ${status}`) > 0 ||
403-
output.indexOf(`"newStatus":"${status}"`) > 0)) {
404-
this.logger.debug(`Node ${nodeId} is ${status} [ attempt: ${attempt}/${maxAttempt}]`)
405-
isActive = true
417+
const statusNumber = parseInt(statusLine.split(' ').pop())
418+
419+
if (statusNumber === status) {
420+
task.title = `${title} - status ${chalk.green(NodeStatusEnums[status])}, attempt: ${chalk.blueBright(`${attempt}/${maxAttempts}`)}`
421+
success = true
422+
clearTimeout(timeoutId)
406423
break
424+
} else if (statusNumber === NodeStatusCodes.CATASTROPHIC_FAILURE) {
425+
task.title = `${title} - status ${chalk.red('CATASTROPHIC_FAILURE')}, attempt: ${chalk.blueBright(`${attempt}/${maxAttempts}`)}`
426+
break
427+
} else if (statusNumber) {
428+
task.title = `${title} - status ${chalk.yellow(NodeStatusEnums[statusNumber])}, attempt: ${chalk.blueBright(`${attempt}/${maxAttempts}`)}`
407429
}
408-
this.logger.debug(`Node ${nodeId} is not ${status} yet. Trying again... [ attempt: ${attempt}/${maxAttempt} ]`)
409-
} catch (e) {
410-
this.logger.warn(`error in checking if node ${nodeId} is ${status}: ${e.message}. Trying again... [ attempt: ${attempt}/${maxAttempt} ]`)
411-
412-
// ls the HAPI path for debugging
413-
await this.k8.execContainer(podName, constants.ROOT_CONTAINER, `ls -la ${constants.HEDERA_HAPI_PATH}`)
430+
clearTimeout(timeoutId)
431+
} catch {} // Catch all guard and fetch errors
414432

415-
// ls the output directory for debugging
416-
await this.k8.execContainer(podName, constants.ROOT_CONTAINER, `ls -la ${constants.HEDERA_HAPI_PATH}/output`)
417-
}
418-
attempt += 1
419-
await sleep(1000)
433+
attempt++
434+
clearTimeout(timeoutId)
435+
await sleep(delay)
420436
}
421437

422-
this.logger.info(`!> -- Node ${nodeId} is ${status} -- <!`)
438+
await this.k8.stopPortForward(srv)
423439

424-
if (!isActive) {
425-
throw new FullstackTestingError(`node '${nodeId}' is not ${status} [ attempt = ${attempt}/${maxAttempt} ]`)
440+
if (!success) {
441+
throw new FullstackTestingError(`node '${nodeId}' is not ${NodeStatusEnums[status]}` +
442+
`[ attempt = ${chalk.blueBright(`${attempt}/${maxAttempts}`)} ]`)
426443
}
427444

428-
return true
445+
return podName
446+
}
447+
448+
/**
449+
* @param {Object} ctx
450+
* @param {TaskWrapper} task
451+
* @param {string[]} nodeIds
452+
* @param {number} [status]
453+
* @returns {Listr<any, any, any>}
454+
*/
455+
checkNodeActivenessTask (ctx, task, nodeIds, status = NodeStatusCodes.ACTIVE) {
456+
const { config: { namespace } } = ctx
457+
458+
const subTasks = nodeIds.map((nodeId, i) => {
459+
const reminder = ('debugNodeId' in ctx.config && ctx.config.debugNodeId === nodeId) ? 'Please attach JVM debugger now.' : ''
460+
const title = `Check network pod: ${chalk.yellow(nodeId)} ${chalk.red(reminder)}`
461+
462+
const subTask = async (ctx, task) => {
463+
ctx.config.podNames[nodeId] = await this.checkNetworkNodeActiveness(namespace, nodeId, task, title, i, status)
464+
}
465+
466+
return { title, task: subTask }
467+
})
468+
469+
return task.newListr(subTasks, {
470+
concurrent: true,
471+
rendererOptions: {
472+
collapseSubtasks: false
473+
}
474+
})
429475
}
430476

431477
/**
@@ -490,67 +536,6 @@ export class NodeCommand extends BaseCommand {
490536
})
491537
}
492538

493-
/**
494-
*
495-
* @param {string} debugNodeId
496-
* @param {TaskWrapper} task
497-
* @param {string[]} nodeIds
498-
* @return {Listr<any, any, any>}
499-
*/
500-
checkNodeActiveTask (debugNodeId, task, nodeIds) {
501-
const subTasks = []
502-
for (const nodeId of nodeIds) {
503-
let reminder = ''
504-
if (debugNodeId === nodeId) {
505-
reminder = ' Please attach JVM debugger now.'
506-
}
507-
if (this.configManager.getFlag(flags.app) !== '' && this.configManager.getFlag(flags.app) !== constants.HEDERA_APP_NAME) {
508-
subTasks.push({
509-
title: `Check node: ${chalk.yellow(nodeId)} ${chalk.red(reminder)}`,
510-
task: async () => await this.checkNetworkNodeState(nodeId, 100, 'ACTIVE', 'output/swirlds.log')
511-
})
512-
} else {
513-
subTasks.push({
514-
title: `Check node: ${chalk.yellow(nodeId)} ${chalk.red(reminder)}`,
515-
task: async () => await this.checkNetworkNodeState(nodeId)
516-
})
517-
}
518-
}
519-
520-
// set up the sub-tasks
521-
return task.newListr(subTasks, {
522-
concurrent: true,
523-
rendererOptions: {
524-
collapseSubtasks: false
525-
}
526-
})
527-
}
528-
529-
/**
530-
* Return task for checking for if node is in freeze state
531-
* @param {any} ctx
532-
* @param {TaskWrapper} task
533-
* @param {string[]} nodeIds
534-
* @returns {*}
535-
*/
536-
checkNodeFreezeTask (ctx, task, nodeIds) {
537-
const subTasks = []
538-
for (const nodeId of nodeIds) {
539-
subTasks.push({
540-
title: `Check node: ${chalk.yellow(nodeId)}`,
541-
task: async () => await this.checkNetworkNodeState(nodeId, 100, 'FREEZE_COMPLETE')
542-
})
543-
}
544-
545-
// set up the sub-tasks
546-
return task.newListr(subTasks, {
547-
concurrent: false,
548-
rendererOptions: {
549-
collapseSubtasks: false
550-
}
551-
})
552-
}
553-
554539
/**
555540
* Return task for setup network nodes
556541
* @param {any} ctx
@@ -1199,7 +1184,7 @@ export class NodeCommand extends BaseCommand {
11991184
{
12001185
title: 'Check nodes are ACTIVE',
12011186
task: (ctx, task) => {
1202-
return this.checkNodeActiveTask(ctx.config.debugNodeId, task, ctx.config.nodeIds)
1187+
return this.checkNodeActivenessTask(ctx, task, ctx.config.nodeIds)
12031188
}
12041189
},
12051190
{
@@ -1533,7 +1518,7 @@ export class NodeCommand extends BaseCommand {
15331518
{
15341519
title: 'Check nodes are ACTIVE',
15351520
task: (ctx, task) => {
1536-
return this.checkNodeActiveTask(ctx.config.debugNodeId, task, ctx.config.nodeIds)
1521+
return this.checkNodeActivenessTask(ctx, task, ctx.config.nodeIds)
15371522
}
15381523
},
15391524
{
@@ -2033,8 +2018,7 @@ export class NodeCommand extends BaseCommand {
20332018
{
20342019
title: 'Check network nodes are frozen',
20352020
task: (ctx, task) => {
2036-
const config = /** @type {NodeAddConfigClass} **/ ctx.config
2037-
return this.checkNodeFreezeTask(ctx, task, config.existingNodeIds)
2021+
return this.checkNodeActivenessTask(ctx, task, ctx.config.existingNodeIds, NodeStatusCodes.FREEZE_COMPLETE)
20382022
}
20392023
},
20402024
{
@@ -2128,7 +2112,7 @@ export class NodeCommand extends BaseCommand {
21282112
{
21292113
title: 'Check all nodes are ACTIVE',
21302114
task: async (ctx, task) => {
2131-
return this.checkNodeActiveTask(ctx.config.debugNodeId, task, ctx.config.allNodeIds)
2115+
return this.checkNodeActivenessTask(ctx, task, ctx.config.allNodeIds)
21322116
}
21332117
},
21342118
{
@@ -2883,8 +2867,7 @@ export class NodeCommand extends BaseCommand {
28832867
{
28842868
title: 'Check network nodes are frozen',
28852869
task: (ctx, task) => {
2886-
const config = /** @type {NodeUpdateConfigClass} **/ ctx.config
2887-
return this.checkNodeFreezeTask(ctx, task, config.existingNodeIds)
2870+
return this.checkNodeActivenessTask(ctx, task, ctx.config.existingNodeIds, NodeStatusCodes.FREEZE_COMPLETE)
28882871
}
28892872
},
28902873
{
@@ -2962,7 +2945,7 @@ export class NodeCommand extends BaseCommand {
29622945
{
29632946
title: 'Check all nodes are ACTIVE',
29642947
task: async (ctx, task) => {
2965-
return this.checkNodeActiveTask(ctx.config.debugNodeId, task, ctx.config.allNodeIds)
2948+
return this.checkNodeActivenessTask(ctx, task, ctx.config.allNodeIds)
29662949
}
29672950
},
29682951
{
@@ -3198,8 +3181,7 @@ export class NodeCommand extends BaseCommand {
31983181
{
31993182
title: 'Check network nodes are frozen',
32003183
task: (ctx, task) => {
3201-
const config = /** @type {NodeDeleteConfigClass} **/ ctx.config
3202-
return this.checkNodeFreezeTask(ctx, task, config.existingNodeIds)
3184+
return this.checkNodeActivenessTask(ctx, task, ctx.config.existingNodeIds, NodeStatusCodes.FREEZE_COMPLETE)
32033185
}
32043186
},
32053187
{
@@ -3269,7 +3251,7 @@ export class NodeCommand extends BaseCommand {
32693251
{
32703252
title: 'Check all nodes are ACTIVE',
32713253
task: async (ctx, task) => {
3272-
return this.checkNodeActiveTask(ctx.config.debugNodeId, task, ctx.config.allNodeIds)
3254+
return this.checkNodeActivenessTask(ctx, task, ctx.config.allNodeIds)
32733255
}
32743256
},
32753257
{

src/core/enumerations.mjs

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/**
2+
* Copyright (C) 2024 Hedera Hashgraph, LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*
16+
*/
17+
'use strict'
18+
19+
export const NodeStatusCodes = {
20+
NO_VALUE: 0,
21+
STARTING_UP: 1,
22+
ACTIVE: 2,
23+
BEHIND: 4,
24+
FREEZING: 5,
25+
FREEZE_COMPLETE: 6,
26+
REPLAYING_EVENTS: 7,
27+
OBSERVING: 8,
28+
CHECKING: 9,
29+
RECONNECT_COMPLETE: 10,
30+
CATASTROPHIC_FAILURE: 11
31+
}
32+
33+
export const NodeStatusEnums = {
34+
0: 'NO_VALUE',
35+
1: 'STARTING_UP',
36+
2: 'ACTIVE',
37+
4: 'BEHIND',
38+
5: 'FREEZING',
39+
6: 'FREEZE_COMPLETE',
40+
7: 'REPLAYING_EVENTS',
41+
8: 'OBSERVING',
42+
9: 'CHECKING',
43+
10: 'RECONNECT_COMPLETE',
44+
11: 'CATASTROPHIC_FAILURE'
45+
}

test/e2e/e2e_node_util.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -165,8 +165,8 @@ export function e2eNodeKeyRefreshTest (testName, mode, releaseTag = HEDERA_PLATF
165165
expect(2)
166166
try {
167167
await expect(
168-
nodeCmd.checkNetworkNodeState(nodeId,
169-
5)).rejects.toThrowError()
168+
nodeCmd.checkNetworkNodeActiveness(namespace, nodeId, { title: '' }, '', 44, undefined, 15)
169+
).rejects.toThrowError()
170170
} catch (e) {
171171
expect(e).not.toBeNull()
172172
} finally {

0 commit comments

Comments
 (0)