Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Zen2] Add lag detector #35685

Merged
merged 17 commits into from
Nov 26, 2018
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
private long maxTermSeen;
private final Reconfigurator reconfigurator;
private final ClusterBootstrapService clusterBootstrapService;
private final LagDetector lagDetector;

private Mode mode;
private Optional<DiscoveryNode> lastKnownLeader;
Expand Down Expand Up @@ -156,6 +157,8 @@ public Coordinator(String nodeName, Settings settings, ClusterSettings clusterSe
masterService.setClusterStateSupplier(this::getStateForMasterService);
this.reconfigurator = new Reconfigurator(settings, clusterSettings);
this.clusterBootstrapService = new ClusterBootstrapService(settings, transportService);
this.lagDetector = new LagDetector(settings, transportService.getThreadPool(), n -> removeNode(n, "lagging"),
transportService::getLocalNode);
}

private Runnable getOnLeaderFailure() {
Expand Down Expand Up @@ -373,6 +376,7 @@ void becomeCandidate(String method) {

followersChecker.clearCurrentNodes();
followersChecker.updateFastResponseState(getCurrentTerm(), mode);
lagDetector.clearTrackedNodes();

if (applierState.nodes().getMasterNodeId() != null) {
applierState = clusterStateWithNoMasterBlock(applierState);
Expand Down Expand Up @@ -427,6 +431,7 @@ void becomeFollower(String method, DiscoveryNode leaderNode) {

followersChecker.clearCurrentNodes();
followersChecker.updateFastResponseState(getCurrentTerm(), mode);
lagDetector.clearTrackedNodes();
}

private PreVoteResponse getPreVoteResponse() {
Expand Down Expand Up @@ -511,6 +516,11 @@ public void invariant() {
assert (applierState.nodes().getMasterNodeId() == null) == applierState.blocks().hasGlobalBlock(NO_MASTER_BLOCK_WRITES.id());
assert preVoteCollector.getPreVoteResponse().equals(getPreVoteResponse())
: preVoteCollector + " vs " + getPreVoteResponse();

final Set<DiscoveryNode> lagDetectorTrackedNodes = new HashSet<>(lagDetector.getTrackedNodes());
ywelsch marked this conversation as resolved.
Show resolved Hide resolved
assert lagDetectorTrackedNodes.contains(getLocalNode()) == false;
assert followersChecker.getKnownFollowers().equals(lagDetectorTrackedNodes);

if (mode == Mode.LEADER) {
final boolean becomingMaster = getStateForMasterService().term() != getCurrentTerm();

Expand Down Expand Up @@ -830,8 +840,10 @@ public String toString() {
}
});

leaderChecker.setCurrentNodes(publishRequest.getAcceptedState().nodes());
followersChecker.setCurrentNodes(publishRequest.getAcceptedState().nodes());
final DiscoveryNodes publishNodes = publishRequest.getAcceptedState().nodes();
leaderChecker.setCurrentNodes(publishNodes);
followersChecker.setCurrentNodes(publishNodes);
lagDetector.setTrackedNodes(publishNodes);
publication.start(followersChecker.getFaultyNodes());
}
} catch (Exception e) {
Expand Down Expand Up @@ -987,7 +999,7 @@ public void onNodeAck(DiscoveryNode node, Exception e) {
}
}
},
transportService.getThreadPool()::relativeTimeInMillis);
transportService.getThreadPool()::relativeTimeInMillis, lagDetector::setAppliedVersion);
this.publishRequest = publishRequest;
this.publicationContext = publicationContext;
this.localNodeAckEvent = localNodeAckEvent;
Expand Down Expand Up @@ -1050,6 +1062,7 @@ public void onSuccess(String source) {
if (mode == Mode.LEADER) {
scheduleReconfigurationIfNeeded();
}
lagDetector.startLagDetector(publishRequest.getAcceptedState().version());
}
ackListener.onNodeAck(getLocalNode(), null);
publishListener.onResponse(null);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.cluster.coordination;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.threadpool.ThreadPool.Names;

import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Consumer;
import java.util.function.Supplier;
import java.util.stream.Collectors;

import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentMap;

/**
* A publication can succeed and complete before all nodes have applied the published state and acknowledged it; however we need every node
* eventually either to apply the published state (or a later state) or be removed from the cluster. This component achieves this by
* removing any lagging nodes from the cluster after a timeout.
*/
public class LagDetector {

private static final Logger logger = LogManager.getLogger(LagDetector.class);

// the timeout for each node to apply a value after the end of publication, before being removed from the cluster
ywelsch marked this conversation as resolved.
Show resolved Hide resolved
public static final Setting<TimeValue> CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING =
Setting.timeSetting("cluster.follower_lag.timeout",
TimeValue.timeValueMillis(90000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);

private final TimeValue clusterStateApplicationTimeout;
private final Consumer<DiscoveryNode> onLagDetected;
private final Supplier<DiscoveryNode> localNodeSupplier;
private final ThreadPool threadPool;
private final Map<DiscoveryNode, NodeAppliedStateTracker> appliedStateTrackersByNode = newConcurrentMap();

public LagDetector(final Settings settings, final ThreadPool threadPool, final Consumer<DiscoveryNode> onLagDetected,
final Supplier<DiscoveryNode> localNodeSupplier) {
this.threadPool = threadPool;
this.clusterStateApplicationTimeout = CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING.get(settings);
this.onLagDetected = onLagDetected;
this.localNodeSupplier = localNodeSupplier;
}

public void setTrackedNodes(final Iterable<DiscoveryNode> discoveryNodes) {
final Set<DiscoveryNode> discoveryNodeSet = new HashSet<>();
discoveryNodes.forEach(discoveryNodeSet::add);
ywelsch marked this conversation as resolved.
Show resolved Hide resolved
discoveryNodeSet.remove(localNodeSupplier.get());
appliedStateTrackersByNode.keySet().retainAll(discoveryNodeSet);
discoveryNodeSet.forEach(node -> appliedStateTrackersByNode.putIfAbsent(node, new NodeAppliedStateTracker(node)));
}

public void clearTrackedNodes() {
appliedStateTrackersByNode.clear();
}

public void setAppliedVersion(final DiscoveryNode discoveryNode, final long appliedVersion) {
final NodeAppliedStateTracker nodeAppliedStateTracker = appliedStateTrackersByNode.get(discoveryNode);
if (nodeAppliedStateTracker == null) {
logger.trace("node {} applied version {} but this node's version is not being tracked", discoveryNode, appliedVersion);
ywelsch marked this conversation as resolved.
Show resolved Hide resolved
} else {
nodeAppliedStateTracker.increaseAppliedVersion(appliedVersion);
}
}

public void startLagDetector(final long version) {
final List<NodeAppliedStateTracker> laggingTrackers
= appliedStateTrackersByNode.values().stream().filter(t -> t.appliedVersionLessThan(version)).collect(Collectors.toList());
ywelsch marked this conversation as resolved.
Show resolved Hide resolved

if (laggingTrackers.isEmpty()) {
logger.trace("lag detection for version {} is unnecessary: {}", version, appliedStateTrackersByNode.values());
} else {
logger.trace("starting lag detector for version {}: {}", version, laggingTrackers);

threadPool.scheduleUnlessShuttingDown(clusterStateApplicationTimeout, Names.GENERIC, new Runnable() {
@Override
public void run() {
laggingTrackers.forEach(t -> t.detectLag(version));
}

@Override
public String toString() {
return "lag detector for version " + version + " on " + laggingTrackers;
}
});
}
}

@Override
public String toString() {
return "LagDetector{" +
"clusterStateApplicationTimeout=" + clusterStateApplicationTimeout +
", appliedStateTrackersByNode=" + appliedStateTrackersByNode.values() +
'}';
}

// for assertions
Set<DiscoveryNode> getTrackedNodes() {
return Collections.unmodifiableSet(appliedStateTrackersByNode.keySet());
}

private class NodeAppliedStateTracker {
private final DiscoveryNode discoveryNode;
private final AtomicLong appliedVersion = new AtomicLong();

NodeAppliedStateTracker(final DiscoveryNode discoveryNode) {
this.discoveryNode = discoveryNode;
}

void increaseAppliedVersion(long appliedVersion) {
long maxAppliedVersion = this.appliedVersion.updateAndGet(v -> Math.max(v, appliedVersion));
logger.trace("{} applied version {}, max now {}", this, appliedVersion, maxAppliedVersion);
}

boolean appliedVersionLessThan(final long version) {
return appliedVersion.get() < version;
}

@Override
public String toString() {
return "NodeAppliedStateTracker{" +
"discoveryNode=" + discoveryNode +
", appliedVersion=" + appliedVersion +
'}';
}

private void detectLag(final long version) {
if (appliedStateTrackersByNode.get(discoveryNode) != NodeAppliedStateTracker.this) {
logger.trace("{}, no longer active", this);
return;
}

long appliedVersion = NodeAppliedStateTracker.this.appliedVersion.get();
ywelsch marked this conversation as resolved.
Show resolved Hide resolved
if (version <= appliedVersion) {
logger.trace("{}, satisfied, node applied version {}", this, appliedVersion);
return;
}

logger.debug("{}, detected lag, node has only applied version {}", this, appliedVersion);
ywelsch marked this conversation as resolved.
Show resolved Hide resolved
onLagDetected.accept(discoveryNode);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import java.util.Optional;
import java.util.Set;
import java.util.function.LongSupplier;
import java.util.function.ObjLongConsumer;

public abstract class Publication {

Expand All @@ -46,16 +47,19 @@ public abstract class Publication {
private final AckListener ackListener;
private final LongSupplier currentTimeSupplier;
private final long startTime;
private final ObjLongConsumer<DiscoveryNode> onNodeApplicationAck;

private Optional<ApplyCommitRequest> applyCommitRequest; // set when state is committed
private boolean isCompleted; // set when publication is completed
private boolean timedOut; // set when publication timed out

public Publication(PublishRequest publishRequest, AckListener ackListener, LongSupplier currentTimeSupplier) {
public Publication(PublishRequest publishRequest, AckListener ackListener, LongSupplier currentTimeSupplier,
ObjLongConsumer<DiscoveryNode> onNodeApplicationAck) {
this.publishRequest = publishRequest;
this.ackListener = ackListener;
this.currentTimeSupplier = currentTimeSupplier;
startTime = currentTimeSupplier.getAsLong();
this.onNodeApplicationAck = onNodeApplicationAck;
applyCommitRequest = Optional.empty();
publicationTargets = new ArrayList<>(publishRequest.getAcceptedState().getNodes().getNodes().size());
publishRequest.getAcceptedState().getNodes().iterator().forEachRemaining(n -> publicationTargets.add(new PublicationTarget(n)));
Expand Down Expand Up @@ -251,6 +255,7 @@ void sendApplyCommit() {
void setAppliedCommit() {
assert state == PublicationTargetState.SENT_APPLY_COMMIT : state + " -> " + PublicationTargetState.APPLIED_COMMIT;
state = PublicationTargetState.APPLIED_COMMIT;
onNodeApplicationAck.accept(discoveryNode, publishRequest.getAcceptedState().version());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we can avoid to hook this in here, and whether there's a way to do this in Coordinator. We call ackOnce(null) here, which in turn calls AckListener.onNodeAck(DiscoveryNode, @Nullable Exception). We already hook into that acklistener in Coordinator, so we could also get those events there. And we also know the version of the cluster state we're publishing in Coordinator.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And we also know the version of the cluster state we're publishing in Coordinator.

How do we know that? We start the lag detector after clearing currentPublication, and another publication could then start. It wouldn't be right to detect lag based on a newer publication.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My suggestion is something like the following (I had to revert the not-null assertion, because I think we don't have that guarantee, and CoordinatorTests were failing):

diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java
index f132998d6ba..64e9310402c 100644
--- a/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java
+++ b/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java
@@ -985,6 +985,9 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
 
                     @Override
                     public void onNodeAck(DiscoveryNode node, Exception e) {
+                        if (e == null) {
+                            lagDetector.setAppliedVersion(node, publishRequest.getAcceptedState().version());
+                        }
                         // acking and cluster state application for local node is handled specially
                         if (node.equals(getLocalNode())) {
                             synchronized (mutex) {
@@ -999,7 +1002,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
                         }
                     }
                 },
-                transportService.getThreadPool()::relativeTimeInMillis, lagDetector::setAppliedVersion);
+                transportService.getThreadPool()::relativeTimeInMillis);
             this.publishRequest = publishRequest;
             this.publicationContext = publicationContext;
             this.localNodeAckEvent = localNodeAckEvent;
diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/LagDetector.java b/server/src/main/java/org/elasticsearch/cluster/coordination/LagDetector.java
index 3180913a012..ea52ec95673 100644
--- a/server/src/main/java/org/elasticsearch/cluster/coordination/LagDetector.java
+++ b/server/src/main/java/org/elasticsearch/cluster/coordination/LagDetector.java
@@ -85,8 +85,11 @@ public class LagDetector {
         }
 
         final NodeAppliedStateTracker nodeAppliedStateTracker = appliedStateTrackersByNode.get(discoveryNode);
-        assert nodeAppliedStateTracker != null : "untracked node " + discoveryNode + " applied version " + appliedVersion;
-        nodeAppliedStateTracker.increaseAppliedVersion(appliedVersion);
+        if (nodeAppliedStateTracker == null) {
+            logger.trace("node {} applied version {} but this node's version is not being tracked", discoveryNode, appliedVersion);
+        } else {
+            nodeAppliedStateTracker.increaseAppliedVersion(appliedVersion);
+        }
     }
 
     public void startLagDetector(final long version) {
diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Publication.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Publication.java
index a602750bba8..9ec8d562b81 100644
--- a/server/src/main/java/org/elasticsearch/cluster/coordination/Publication.java
+++ b/server/src/main/java/org/elasticsearch/cluster/coordination/Publication.java
@@ -36,7 +36,6 @@ import java.util.List;
 import java.util.Optional;
 import java.util.Set;
 import java.util.function.LongSupplier;
-import java.util.function.ObjLongConsumer;
 
 public abstract class Publication {
 
@@ -47,19 +46,16 @@ public abstract class Publication {
     private final AckListener ackListener;
     private final LongSupplier currentTimeSupplier;
     private final long startTime;
-    private final ObjLongConsumer<DiscoveryNode> onNodeApplicationAck;
 
     private Optional<ApplyCommitRequest> applyCommitRequest; // set when state is committed
     private boolean isCompleted; // set when publication is completed
     private boolean timedOut; // set when publication timed out
 
-    public Publication(PublishRequest publishRequest, AckListener ackListener, LongSupplier currentTimeSupplier,
-                       ObjLongConsumer<DiscoveryNode> onNodeApplicationAck) {
+    public Publication(PublishRequest publishRequest, AckListener ackListener, LongSupplier currentTimeSupplier) {
         this.publishRequest = publishRequest;
         this.ackListener = ackListener;
         this.currentTimeSupplier = currentTimeSupplier;
         startTime = currentTimeSupplier.getAsLong();
-        this.onNodeApplicationAck = onNodeApplicationAck;
         applyCommitRequest = Optional.empty();
         publicationTargets = new ArrayList<>(publishRequest.getAcceptedState().getNodes().getNodes().size());
         publishRequest.getAcceptedState().getNodes().iterator().forEachRemaining(n -> publicationTargets.add(new PublicationTarget(n)));
@@ -255,7 +251,6 @@ public abstract class Publication {
         void setAppliedCommit() {
             assert state == PublicationTargetState.SENT_APPLY_COMMIT : state + " -> " + PublicationTargetState.APPLIED_COMMIT;
             state = PublicationTargetState.APPLIED_COMMIT;
-            onNodeApplicationAck.accept(discoveryNode, publishRequest.getAcceptedState().version());
             ackOnce(null);
         }
 
diff --git a/server/src/test/java/org/elasticsearch/cluster/coordination/PublicationTests.java b/server/src/test/java/org/elasticsearch/cluster/coordination/PublicationTests.java
index e17c77ce6dc..914ee1e95f7 100644
--- a/server/src/test/java/org/elasticsearch/cluster/coordination/PublicationTests.java
+++ b/server/src/test/java/org/elasticsearch/cluster/coordination/PublicationTests.java
@@ -103,8 +103,7 @@ public class PublicationTests extends ESTestCase {
         Set<DiscoveryNode> missingJoins = new HashSet<>();
 
         MockPublication(PublishRequest publishRequest, Discovery.AckListener ackListener, LongSupplier currentTimeSupplier) {
-            super(publishRequest, ackListener, currentTimeSupplier, (n, l) -> {
-            });
+            super(publishRequest, ackListener, currentTimeSupplier);
             this.publishRequest = publishRequest;
         }

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, ok, I did that in 63b21ee.

ackOnce(null);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import org.elasticsearch.cluster.NodeConnectionsService;
import org.elasticsearch.cluster.action.index.MappingUpdatedAction;
import org.elasticsearch.cluster.coordination.ClusterBootstrapService;
import org.elasticsearch.cluster.coordination.LagDetector;
import org.elasticsearch.cluster.coordination.Coordinator;
import org.elasticsearch.cluster.coordination.ElectionSchedulerFactory;
import org.elasticsearch.cluster.coordination.FollowersChecker;
Expand Down Expand Up @@ -469,7 +470,8 @@ public void apply(Settings value, Settings current, Settings previous) {
LeaderChecker.LEADER_CHECK_RETRY_COUNT_SETTING,
Reconfigurator.CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION,
TransportAddVotingTombstonesAction.MAXIMUM_VOTING_TOMBSTONES_SETTING,
ClusterBootstrapService.INITIAL_MASTER_NODE_COUNT_SETTING
ClusterBootstrapService.INITIAL_MASTER_NODE_COUNT_SETTING,
LagDetector.CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING
)));

public static List<SettingUpgrader<?>> BUILT_IN_SETTING_UPGRADERS = Collections.unmodifiableList(Arrays.asList(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,7 @@ public void testAckListenerReceivesNoAckFromHangingFollower() {

assertTrue("expected immediate ack from " + follower1, ackCollector.hasAckedSuccessfully(follower1));
assertFalse("expected no ack from " + leader, ackCollector.hasAckedSuccessfully(leader));
cluster.stabilise();
cluster.stabilise(defaultMillis(PUBLISH_TIMEOUT_SETTING));
assertTrue("expected eventual ack from " + leader, ackCollector.hasAckedSuccessfully(leader));
assertFalse("expected no ack from " + follower0, ackCollector.hasAcked(follower0));
}
Expand Down Expand Up @@ -1097,7 +1097,6 @@ void stabilise(long stabilisationDurationMillis) {
}

runFor(stabilisationDurationMillis, "stabilising");
fixLag();

final ClusterNode leader = getAnyLeader();
final long leaderTerm = leader.coordinator.getCurrentTerm();
Expand Down Expand Up @@ -1154,35 +1153,6 @@ void stabilise(long stabilisationDurationMillis) {
leader.improveConfiguration(lastAcceptedState), sameInstance(lastAcceptedState));
}

// TODO remove this when lag detection is implemented
void fixLag() {
DaveCTurner marked this conversation as resolved.
Show resolved Hide resolved
final ClusterNode leader = getAnyLeader();
final long leaderVersion = leader.getLastAppliedClusterState().version();
final long minVersion = clusterNodes.stream()
.filter(n -> isConnectedPair(n, leader))
.map(n -> n.getLastAppliedClusterState().version()).min(Long::compare).orElse(Long.MIN_VALUE);
assert minVersion >= 0;
if (minVersion < leaderVersion) {
logger.info("--> fixLag publishing a value to fix lag, leaderVersion={}, minVersion={}", leaderVersion, minVersion);
onNode(leader.getLocalNode(), () -> {
synchronized (leader.coordinator.mutex) {
leader.submitValue(randomLong());
}
}).run();

runFor(DEFAULT_CLUSTER_STATE_UPDATE_DELAY
// may need to bump terms too
+ DEFAULT_ELECTION_DELAY,
"re-stabilising after lag-fixing publication");

if (clusterNodes.stream().anyMatch(n -> n.getClusterStateApplyResponse().equals(ClusterStateApplyResponse.HANG))) {
runFor(defaultMillis(PUBLISH_TIMEOUT_SETTING), "allowing lag-fixing publication to time out");
}
} else {
logger.info("--> fixLag found no lag, leader={}, leaderVersion={}, minVersion={}", leader, leaderVersion, minVersion);
}
}

void runFor(long runDurationMillis, String description) {
final long endTime = deterministicTaskQueue.getCurrentTimeMillis() + runDurationMillis;
logger.info("--> runFor({}ms) running until [{}ms]: {}", runDurationMillis, endTime, description);
Expand Down
Loading