From 4367a7e174da7b2994ca9b154b493a591770b650 Mon Sep 17 00:00:00 2001
From: Wei Liu
Date: Tue, 27 Aug 2024 10:31:42 +0800
Subject: [PATCH] enhance: avoid the coexistence of the old coordinator and the new node/proxy

In standalone mode, block the start process until the new coordinator is active to avoid the coexistence of the old coordinator and the new node/proxy
1. In the start/restart process, the new coordinator will become active immediately and will not be blocked
2. In the rolling upgrade process, the new coordinator will not be active until the old coordinator is down, and it will be blocked

Signed-off-by: Wei Liu
---
 cmd/roles/roles.go | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/cmd/roles/roles.go b/cmd/roles/roles.go
index 51e9fd37d43a..ec2dd6c65aef 100644
--- a/cmd/roles/roles.go
+++ b/cmd/roles/roles.go
@@ -421,6 +421,47 @@ func (mr *MilvusRoles) Run() {
 		componentMap[typeutil.QueryCoordRole] = queryCoord
 	}
 
+	// waitCoordBecomeHealthy polls rootCoord, dataCoord, indexCoord and queryCoord once per
+	// second and returns when all four report StateCode_Healthy, or when ctx is cancelled.
+	// NOTE(review): Health is called on all four coordinators unconditionally; this assumes
+	// every coordinator has been started (non-nil) whenever mr.Local is set -- confirm.
+	waitCoordBecomeHealthy := func() {
+		for {
+			select {
+			case <-ctx.Done():
+				log.Info("wait all coord become healthy loop quit")
+				return
+			default:
+			}
+
+			rcState := rootCoord.Health(ctx)
+			dcState := dataCoord.Health(ctx)
+			icState := indexCoord.Health(ctx)
+			qcState := queryCoord.Health(ctx)
+
+			if rcState == commonpb.StateCode_Healthy && dcState == commonpb.StateCode_Healthy && icState == commonpb.StateCode_Healthy && qcState == commonpb.StateCode_Healthy {
+				log.Info("all coord become healthy")
+				return
+			}
+			log.Info("wait all coord become healthy", zap.String("rootCoord", rcState.String()), zap.String("dataCoord", dcState.String()), zap.String("indexCoord", icState.String()), zap.String("queryCoord", qcState.String()))
+
+			// Pause before the next poll, but wake up as soon as ctx is cancelled so that
+			// shutdown is not held up by the sleep; the check at the top of the loop then
+			// logs and returns.
+			select {
+			case <-ctx.Done():
+			case <-time.After(time.Second):
+			}
+		}
+	}
+
+	// In standalone mode, block the start process until the new coordinator is active to avoid the coexistence of the old coordinator and the new node/proxy
+	// 1. In the start/restart process, the new coordinator will become active immediately and will not be blocked
+	// 2. In the rolling upgrade process, the new coordinator will not be active until the old coordinator is down, and it will be blocked
+	if mr.Local {
+		waitCoordBecomeHealthy()
+	}
+
 	if mr.EnableQueryNode {
 		queryNode = mr.runQueryNode(ctx, local, &wg)
 		componentMap[typeutil.QueryNodeRole] = queryNode