elastic · DaveCTurner · Nov 26, 2018 · Nov 19, 2018 · Nov 21, 2018 · Nov 21, 2018
diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java
@@ -118,6 +118,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
     private long maxTermSeen;
     private final Reconfigurator reconfigurator;
     private final ClusterBootstrapService clusterBootstrapService;
+    private final LagDetector lagDetector;
 
     private Mode mode;
     private Optional<DiscoveryNode> lastKnownLeader;
@@ -156,6 +157,8 @@ public Coordinator(String nodeName, Settings settings, ClusterSettings clusterSe
         masterService.setClusterStateSupplier(this::getStateForMasterService);
         this.reconfigurator = new Reconfigurator(settings, clusterSettings);
         this.clusterBootstrapService = new ClusterBootstrapService(settings, transportService);
+        this.lagDetector = new LagDetector(settings, transportService.getThreadPool(), n -> removeNode(n, "lagging"),
+            transportService::getLocalNode);
     }
 
     private Runnable getOnLeaderFailure() {
@@ -373,6 +376,7 @@ void becomeCandidate(String method) {
 
             followersChecker.clearCurrentNodes();
             followersChecker.updateFastResponseState(getCurrentTerm(), mode);
+            lagDetector.clearTrackedNodes();
 
             if (applierState.nodes().getMasterNodeId() != null) {
                 applierState = clusterStateWithNoMasterBlock(applierState);
@@ -427,6 +431,7 @@ void becomeFollower(String method, DiscoveryNode leaderNode) {
 
         followersChecker.clearCurrentNodes();
         followersChecker.updateFastResponseState(getCurrentTerm(), mode);
+        lagDetector.clearTrackedNodes();
     }
 
     private PreVoteResponse getPreVoteResponse() {
@@ -511,6 +516,11 @@ public void invariant() {
             assert (applierState.nodes().getMasterNodeId() == null) == applierState.blocks().hasGlobalBlock(NO_MASTER_BLOCK_WRITES.id());
             assert preVoteCollector.getPreVoteResponse().equals(getPreVoteResponse())
                 : preVoteCollector + " vs " + getPreVoteResponse();
+
+            final Set<DiscoveryNode> lagDetectorTrackedNodes = new HashSet<>(lagDetector.getTrackedNodes());
+            assert lagDetectorTrackedNodes.contains(getLocalNode()) == false;
+            assert followersChecker.getKnownFollowers().equals(lagDetectorTrackedNodes);
+
             if (mode == Mode.LEADER) {
                 final boolean becomingMaster = getStateForMasterService().term() != getCurrentTerm();
 
@@ -830,8 +840,10 @@ public String toString() {
                     }
                 });
 
-                leaderChecker.setCurrentNodes(publishRequest.getAcceptedState().nodes());
-                followersChecker.setCurrentNodes(publishRequest.getAcceptedState().nodes());
+                final DiscoveryNodes publishNodes = publishRequest.getAcceptedState().nodes();
+                leaderChecker.setCurrentNodes(publishNodes);
+                followersChecker.setCurrentNodes(publishNodes);
+                lagDetector.setTrackedNodes(publishNodes);
                 publication.start(followersChecker.getFaultyNodes());
             }
         } catch (Exception e) {
@@ -987,7 +999,7 @@ public void onNodeAck(DiscoveryNode node, Exception e) {
                         }
                     }
                 },
-                transportService.getThreadPool()::relativeTimeInMillis);
+                transportService.getThreadPool()::relativeTimeInMillis, lagDetector::setAppliedVersion);
             this.publishRequest = publishRequest;
             this.publicationContext = publicationContext;
             this.localNodeAckEvent = localNodeAckEvent;
@@ -1050,6 +1062,7 @@ public void onSuccess(String source) {
                                     if (mode == Mode.LEADER) {
                                         scheduleReconfigurationIfNeeded();
                                     }
+                                    lagDetector.startLagDetector(publishRequest.getAcceptedState().version());
                                 }
                                 ackListener.onNodeAck(getLocalNode(), null);
                                 publishListener.onResponse(null);

diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/LagDetector.java b/server/src/main/java/org/elasticsearch/cluster/coordination/LagDetector.java
@@ -0,0 +1,168 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.cluster.coordination;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.common.settings.Setting;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.threadpool.ThreadPool;
+import org.elasticsearch.threadpool.ThreadPool.Names;
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.function.Consumer;
+import java.util.function.Supplier;
+import java.util.stream.Collectors;
+
+import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentMap;
+
+/**
+ * A publication can succeed and complete before all nodes have applied the published state and acknowledged it; however we need every node
+ * eventually either to apply the published state (or a later state) or be removed from the cluster. This component achieves this by
+ * removing any lagging nodes from the cluster after a timeout.
+ */
+public class LagDetector {
+
+    private static final Logger logger = LogManager.getLogger(LagDetector.class);
+
+    // the timeout for each node to apply a value after the end of publication, before being removed from the cluster
+    public static final Setting<TimeValue> CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING =
+        Setting.timeSetting("cluster.follower_lag.timeout",
+            TimeValue.timeValueMillis(90000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);
+
+    private final TimeValue clusterStateApplicationTimeout;
+    private final Consumer<DiscoveryNode> onLagDetected;
+    private final Supplier<DiscoveryNode> localNodeSupplier;
+    private final ThreadPool threadPool;
+    private final Map<DiscoveryNode, NodeAppliedStateTracker> appliedStateTrackersByNode = newConcurrentMap();
+
+    public LagDetector(final Settings settings, final ThreadPool threadPool, final Consumer<DiscoveryNode> onLagDetected,
+                       final Supplier<DiscoveryNode> localNodeSupplier) {
+        this.threadPool = threadPool;
+        this.clusterStateApplicationTimeout = CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING.get(settings);
+        this.onLagDetected = onLagDetected;
+        this.localNodeSupplier = localNodeSupplier;
+    }
+
+    public void setTrackedNodes(final Iterable<DiscoveryNode> discoveryNodes) {
+        final Set<DiscoveryNode> discoveryNodeSet = new HashSet<>();
+        discoveryNodes.forEach(discoveryNodeSet::add);
+        discoveryNodeSet.remove(localNodeSupplier.get());
+        appliedStateTrackersByNode.keySet().retainAll(discoveryNodeSet);
+        discoveryNodeSet.forEach(node -> appliedStateTrackersByNode.putIfAbsent(node, new NodeAppliedStateTracker(node)));
+    }
+
+    public void clearTrackedNodes() {
+        appliedStateTrackersByNode.clear();
+    }
+
+    public void setAppliedVersion(final DiscoveryNode discoveryNode, final long appliedVersion) {
+        final NodeAppliedStateTracker nodeAppliedStateTracker = appliedStateTrackersByNode.get(discoveryNode);
+        if (nodeAppliedStateTracker == null) {
+            logger.trace("node {} applied version {} but this node's version is not being tracked", discoveryNode, appliedVersion);
+        } else {
+            nodeAppliedStateTracker.increaseAppliedVersion(appliedVersion);
+        }
+    }
+
+    public void startLagDetector(final long version) {
+        final List<NodeAppliedStateTracker> laggingTrackers
+            = appliedStateTrackersByNode.values().stream().filter(t -> t.appliedVersionLessThan(version)).collect(Collectors.toList());
+
+        if (laggingTrackers.isEmpty()) {
+            logger.trace("lag detection for version {} is unnecessary: {}", version, appliedStateTrackersByNode.values());
+        } else {
+            logger.trace("starting lag detector for version {}: {}", version, laggingTrackers);
+
+            threadPool.scheduleUnlessShuttingDown(clusterStateApplicationTimeout, Names.GENERIC, new Runnable() {
+                @Override
+                public void run() {
+                    laggingTrackers.forEach(t -> t.detectLag(version));
+                }
+
+                @Override
+                public String toString() {
+                    return "lag detector for version " + version + " on " + laggingTrackers;
+                }
+            });
+        }
+    }
+
+    @Override
+    public String toString() {
+        return "LagDetector{" +
+            "clusterStateApplicationTimeout=" + clusterStateApplicationTimeout +
+            ", appliedStateTrackersByNode=" + appliedStateTrackersByNode.values() +
+            '}';
+    }
+
+    // for assertions
+    Set<DiscoveryNode> getTrackedNodes() {
+        return Collections.unmodifiableSet(appliedStateTrackersByNode.keySet());
+    }
+
+    private class NodeAppliedStateTracker {
+        private final DiscoveryNode discoveryNode;
+        private final AtomicLong appliedVersion = new AtomicLong();
+
+        NodeAppliedStateTracker(final DiscoveryNode discoveryNode) {
+            this.discoveryNode = discoveryNode;
+        }
+
+        void increaseAppliedVersion(long appliedVersion) {
+            long maxAppliedVersion = this.appliedVersion.updateAndGet(v -> Math.max(v, appliedVersion));
+            logger.trace("{} applied version {}, max now {}", this, appliedVersion, maxAppliedVersion);
+        }
+
+        boolean appliedVersionLessThan(final long version) {
+            return appliedVersion.get() < version;
+        }
+
+        @Override
+        public String toString() {
+            return "NodeAppliedStateTracker{" +
+                "discoveryNode=" + discoveryNode +
+                ", appliedVersion=" + appliedVersion +
+                '}';
+        }
+
+        private void detectLag(final long version) {
+            if (appliedStateTrackersByNode.get(discoveryNode) != NodeAppliedStateTracker.this) {
+                logger.trace("{}, no longer active", this);
+                return;
+            }
+
+            long appliedVersion = NodeAppliedStateTracker.this.appliedVersion.get();
+            if (version <= appliedVersion) {
+                logger.trace("{}, satisfied, node applied version {}", this, appliedVersion);
+                return;
+            }
+
+            logger.debug("{}, detected lag, node has only applied version {}", this, appliedVersion);
+            onLagDetected.accept(discoveryNode);
+        }
+    }
+}
diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Publication.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Publication.java
@@ -36,6 +36,7 @@
 import java.util.Optional;
 import java.util.Set;
 import java.util.function.LongSupplier;
+import java.util.function.ObjLongConsumer;
 
 public abstract class Publication {
 
@@ -46,16 +47,19 @@ public abstract class Publication {
     private final AckListener ackListener;
     private final LongSupplier currentTimeSupplier;
     private final long startTime;
+    private final ObjLongConsumer<DiscoveryNode> onNodeApplicationAck;
 
     private Optional<ApplyCommitRequest> applyCommitRequest; // set when state is committed
     private boolean isCompleted; // set when publication is completed
     private boolean timedOut; // set when publication timed out
 
-    public Publication(PublishRequest publishRequest, AckListener ackListener, LongSupplier currentTimeSupplier) {
+    public Publication(PublishRequest publishRequest, AckListener ackListener, LongSupplier currentTimeSupplier,
+                       ObjLongConsumer<DiscoveryNode> onNodeApplicationAck) {
         this.publishRequest = publishRequest;
         this.ackListener = ackListener;
         this.currentTimeSupplier = currentTimeSupplier;
         startTime = currentTimeSupplier.getAsLong();
+        this.onNodeApplicationAck = onNodeApplicationAck;
         applyCommitRequest = Optional.empty();
         publicationTargets = new ArrayList<>(publishRequest.getAcceptedState().getNodes().getNodes().size());
         publishRequest.getAcceptedState().getNodes().iterator().forEachRemaining(n -> publicationTargets.add(new PublicationTarget(n)));
@@ -251,6 +255,7 @@ void sendApplyCommit() {
         void setAppliedCommit() {
             assert state == PublicationTargetState.SENT_APPLY_COMMIT : state + " -> " + PublicationTargetState.APPLIED_COMMIT;
             state = PublicationTargetState.APPLIED_COMMIT;
+            onNodeApplicationAck.accept(discoveryNode, publishRequest.getAcceptedState().version());
             ackOnce(null);
         }
 

diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java
@@ -33,6 +33,7 @@
 import org.elasticsearch.cluster.NodeConnectionsService;
 import org.elasticsearch.cluster.action.index.MappingUpdatedAction;
 import org.elasticsearch.cluster.coordination.ClusterBootstrapService;
+import org.elasticsearch.cluster.coordination.LagDetector;
 import org.elasticsearch.cluster.coordination.Coordinator;
 import org.elasticsearch.cluster.coordination.ElectionSchedulerFactory;
 import org.elasticsearch.cluster.coordination.FollowersChecker;
@@ -469,7 +470,8 @@ public void apply(Settings value, Settings current, Settings previous) {
                     LeaderChecker.LEADER_CHECK_RETRY_COUNT_SETTING,
                     Reconfigurator.CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION,
                     TransportAddVotingTombstonesAction.MAXIMUM_VOTING_TOMBSTONES_SETTING,
-                    ClusterBootstrapService.INITIAL_MASTER_NODE_COUNT_SETTING
+                    ClusterBootstrapService.INITIAL_MASTER_NODE_COUNT_SETTING,
+                    LagDetector.CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING
             )));
 
     public static List<SettingUpgrader<?>> BUILT_IN_SETTING_UPGRADERS = Collections.unmodifiableList(Arrays.asList(

diff --git a/server/src/test/java/org/elasticsearch/cluster/coordination/CoordinatorTests.java b/server/src/test/java/org/elasticsearch/cluster/coordination/CoordinatorTests.java
@@ -599,7 +599,7 @@ public void testAckListenerReceivesNoAckFromHangingFollower() {
 
         assertTrue("expected immediate ack from " + follower1, ackCollector.hasAckedSuccessfully(follower1));
         assertFalse("expected no ack from " + leader, ackCollector.hasAckedSuccessfully(leader));
-        cluster.stabilise();
+        cluster.stabilise(defaultMillis(PUBLISH_TIMEOUT_SETTING));
         assertTrue("expected eventual ack from " + leader, ackCollector.hasAckedSuccessfully(leader));
         assertFalse("expected no ack from " + follower0, ackCollector.hasAcked(follower0));
     }
@@ -1097,7 +1097,6 @@ void stabilise(long stabilisationDurationMillis) {
             }
 
             runFor(stabilisationDurationMillis, "stabilising");
-            fixLag();
 
             final ClusterNode leader = getAnyLeader();
             final long leaderTerm = leader.coordinator.getCurrentTerm();
@@ -1154,35 +1153,6 @@ void stabilise(long stabilisationDurationMillis) {
                 leader.improveConfiguration(lastAcceptedState), sameInstance(lastAcceptedState));
         }
 
-        // TODO remove this when lag detection is implemented
-        void fixLag() {
-            final ClusterNode leader = getAnyLeader();
-            final long leaderVersion = leader.getLastAppliedClusterState().version();
-            final long minVersion = clusterNodes.stream()
-                .filter(n -> isConnectedPair(n, leader))
-                .map(n -> n.getLastAppliedClusterState().version()).min(Long::compare).orElse(Long.MIN_VALUE);
-            assert minVersion >= 0;
-            if (minVersion < leaderVersion) {
-                logger.info("--> fixLag publishing a value to fix lag, leaderVersion={}, minVersion={}", leaderVersion, minVersion);
-                onNode(leader.getLocalNode(), () -> {
-                    synchronized (leader.coordinator.mutex) {
-                        leader.submitValue(randomLong());
-                    }
-                }).run();
-
-                runFor(DEFAULT_CLUSTER_STATE_UPDATE_DELAY
-                        // may need to bump terms too
-                        + DEFAULT_ELECTION_DELAY,
-                    "re-stabilising after lag-fixing publication");
-
-                if (clusterNodes.stream().anyMatch(n -> n.getClusterStateApplyResponse().equals(ClusterStateApplyResponse.HANG))) {
-                    runFor(defaultMillis(PUBLISH_TIMEOUT_SETTING), "allowing lag-fixing publication to time out");
-                }
-            } else {
-                logger.info("--> fixLag found no lag, leader={}, leaderVersion={}, minVersion={}", leader, leaderVersion, minVersion);
-            }
-        }
-
         void runFor(long runDurationMillis, String description) {
             final long endTime = deterministicTaskQueue.getCurrentTimeMillis() + runDurationMillis;
             logger.info("--> runFor({}ms) running until [{}ms]: {}", runDurationMillis, endTime, description);