Seq Number based recovery should validate last lucene commit max seq# (#22851)

The seq#-based recovery logic relies on rolling back Lucene to remove any operations above the global checkpoint. This part of the plan is not implemented yet, but the recovery logic still has to have these guarantees. Instead, we should make the seq# logic validate that the last commit point (the only one we have) maintains the invariant and, if it does not, fall back to file-based recovery.

This commit adds a test that creates a situation where a rollback is needed (primary failover with ops in flight) and fixes another issue that the test surfaced: if a primary can't serve a seq#-based recovery request and falls back to a file copy, it still used the incoming `startSeqNo` as a filter.

 Relates to #22484 & #10708
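
The validation described above boils down to a single comparison against the sequence-number stats of the last Lucene commit. A hedged sketch with assumed names (the placeholder constant mirrors `SequenceNumbersService.UNASSIGNED_SEQ_NO` as used in the diff below); this is not the code from this commit:

```java
// Hedged sketch, not the code from this commit: decide whether the last Lucene commit
// can seed a seq#-based recovery, or whether we must fall back to copying files.
final class StartingSeqNoSketch {

    // placeholder mirroring SequenceNumbersService.UNASSIGNED_SEQ_NO in the diff below
    static final long UNASSIGNED_SEQ_NO = -2;

    /** Returns the first seq# to recover from, or UNASSIGNED_SEQ_NO to force a file-based recovery. */
    static long startingSeqNo(long maxSeqNoInCommit, long localCheckpointInCommit, long globalCheckpoint) {
        if (maxSeqNoInCommit <= globalCheckpoint) {
            // the commit contains no operations above the global checkpoint, so no Lucene
            // rollback is needed; replay can start right after the commit's local checkpoint
            return localCheckpointInCommit + 1;
        }
        // the commit may hold operations the new primary never acknowledged -> file-based recovery
        return UNASSIGNED_SEQ_NO;
    }

    public static void main(String[] args) {
        System.out.println(startingSeqNo(10, 8, 12)); // 9  -> ops-based recovery
        System.out.println(startingSeqNo(15, 8, 12)); // -2 -> fall back to file-based recovery
    }
}
```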
bleskes authored Jan 31, 2017
1 parent 29f63c7 commit eb36b82
Showing 14 changed files with 294 additions and 119 deletions.
@@ -139,6 +139,16 @@ long getMaxSeqNo() {
return nextSeqNo - 1;
}


/**
* constructs a {@link SeqNoStats} object, using local state and the supplied global checkpoint
*
* @implNote this is needed to make sure the local checkpoint and max seq no are consistent
*/
synchronized SeqNoStats getStats(final long globalCheckpoint) {
return new SeqNoStats(getMaxSeqNo(), getCheckpoint(), globalCheckpoint);
}

/**
* Waits for all operations up to the provided sequence number to complete.
*
@@ -39,6 +39,10 @@ public class SeqNoStats implements ToXContent, Writeable {
private final long globalCheckpoint;

public SeqNoStats(long maxSeqNo, long localCheckpoint, long globalCheckpoint) {
assert localCheckpoint <= maxSeqNo:
"local checkpoint [" + localCheckpoint + "] is above maximum seq no [" + maxSeqNo + "]";
// note that the global checkpoint can be higher than both maxSeqNo and localCheckpoint,
// as we use this stats object to describe Lucene commits as well as live statistics.
this.maxSeqNo = maxSeqNo;
this.localCheckpoint = localCheckpoint;
this.globalCheckpoint = globalCheckpoint;
@@ -111,7 +111,7 @@ public void markSeqNoAsCompleted(final long seqNo) {
* @return stats encapsulating the maximum sequence number, the local checkpoint and the global checkpoint
*/
public SeqNoStats stats() {
return new SeqNoStats(getMaxSeqNo(), getLocalCheckpoint(), getGlobalCheckpoint());
return localCheckpointTracker.getStats(getGlobalCheckpoint());
}

/**
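
The two hunks above move stats construction into the checkpoint tracker. A hedged reading of why, as a minimal sketch with simplified, assumed names (not the actual Elasticsearch classes): reading the max seq# and the local checkpoint through two separate calls can interleave with concurrent updates and yield a pair that violates `localCheckpoint <= maxSeqNo`, tripping the new assertion in `SeqNoStats`; taking both values under the tracker's lock keeps the snapshot consistent.

```java
// Minimal sketch with assumed names; illustrates only the consistency concern.
final class CheckpointTrackerSketch {

    private long nextSeqNo = 0;   // next seq# to hand out
    private long checkpoint = -1; // highest completed seq# (simplified: real tracker fills gaps)

    synchronized long generateSeqNo() {
        return nextSeqNo++;
    }

    synchronized void markSeqNoAsCompleted(long seqNo) {
        checkpoint = Math.max(checkpoint, seqNo); // simplified
    }

    synchronized long getMaxSeqNo() {
        return nextSeqNo - 1;
    }

    synchronized long getCheckpoint() {
        return checkpoint;
    }

    // Consistent snapshot: both values are read while holding the same lock,
    // so the returned pair always satisfies checkpoint <= maxSeqNo.
    synchronized long[] getStats() {
        return new long[] { getMaxSeqNo(), getCheckpoint() };
    }

    // Racy alternative (two independent reads, as the old stats() effectively did):
    //   long max = tracker.getMaxSeqNo();   // reads, say, 5
    //   // another thread generates seq# 6 and marks it completed
    //   long cp  = tracker.getCheckpoint(); // reads 6 -> cp > max, an impossible pair
}
```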
@@ -199,10 +199,10 @@ private RecoverySourceHandler createRecoverySourceHandler(StartRecoveryRequest r
Supplier<Long> currentClusterStateVersionSupplier = () -> clusterService.state().getVersion();
if (shard.indexSettings().isOnSharedFilesystem()) {
handler = new SharedFSRecoverySourceHandler(shard, recoveryTarget, request, currentClusterStateVersionSupplier,
this::delayNewRecoveries, logger);
this::delayNewRecoveries, settings);
} else {
handler = new RecoverySourceHandler(shard, recoveryTarget, request, currentClusterStateVersionSupplier,
this::delayNewRecoveries, recoverySettings.getChunkSize().bytesAsInt(), logger);
this::delayNewRecoveries, recoverySettings.getChunkSize().bytesAsInt(), settings);
}
return handler;
}
@@ -41,6 +41,7 @@
import org.elasticsearch.index.IndexNotFoundException;
import org.elasticsearch.index.engine.RecoveryEngineException;
import org.elasticsearch.index.mapper.MapperException;
import org.elasticsearch.index.seqno.SeqNoStats;
import org.elasticsearch.index.seqno.SequenceNumbersService;
import org.elasticsearch.index.shard.IllegalIndexShardStateException;
import org.elasticsearch.index.shard.IndexEventListener;
@@ -61,7 +62,6 @@
import org.elasticsearch.transport.TransportService;

import java.io.IOException;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;

@@ -365,7 +365,15 @@ private StartRecoveryRequest getStartRecoveryRequest(final RecoveryTarget recove
public static long getStartingSeqNo(final RecoveryTarget recoveryTarget) {
try {
final long globalCheckpoint = Translog.readGlobalCheckpoint(recoveryTarget.indexShard().shardPath().resolveTranslog());
return recoveryTarget.store().loadSeqNoStats(globalCheckpoint).getLocalCheckpoint() + 1;
final SeqNoStats seqNoStats = recoveryTarget.store().loadSeqNoStats(globalCheckpoint);
if (seqNoStats.getMaxSeqNo() <= seqNoStats.getGlobalCheckpoint()) {
// the commit point is good for seq#-based recovery, as the maximum seq# included in it
// is at or below the global checkpoint (i.e., it excludes any ops that may not be on the primary).
// Recovery will start at the first op after the local checkpoint stored in the commit.
return seqNoStats.getLocalCheckpoint() + 1;
} else {
return SequenceNumbersService.UNASSIGNED_SEQ_NO;
}
} catch (final IOException e) {
// this can happen, for example, if phase one of the recovery completed successfully, a network partition happens before the
// translog on the recovery target is opened, the recovery enters a retry loop seeing now that the index files are on disk and
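
The second fix mentioned in the commit message (not using the incoming `startSeqNo` as a filter when falling back to a file copy) lives in files that are not shown above. A hedged, hypothetical sketch of the idea, with assumed names not taken from this diff:

```java
// Hypothetical sketch; names are assumed and not taken from this diff.
final class SourceStartingSeqNoSketch {

    static final long UNASSIGNED_SEQ_NO = -2; // mirrors SequenceNumbersService.UNASSIGNED_SEQ_NO

    /** Picks the seq# from which the source replays operations to the recovering copy. */
    static long effectiveStartingSeqNo(boolean canServeOpsBasedRecovery, long requestedStartSeqNo) {
        if (canServeOpsBasedRecovery) {
            return requestedStartSeqNo; // ops-based path: replay from the point the target asked for
        }
        // file-copy fallback: the copied files define the baseline, so the target's
        // requested seq# must not be used to filter the operations that are replayed
        return UNASSIGNED_SEQ_NO;
    }

    public static void main(String[] args) {
        System.out.println(effectiveStartingSeqNo(true, 42));  // 42
        System.out.println(effectiveStartingSeqNo(false, 42)); // -2 (no filtering)
    }
}
```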