Seq Number based recovery should validate last lucene commit max seq# (#22851)

The seq#-based recovery logic relies on rolling back Lucene to remove any operations above the global checkpoint. This part of the plan is not implemented yet, but the recovery logic still has to have these guarantees. Instead, we should make the seq# logic validate that the last commit point (the only one we have) maintains the invariant and, if it does not, fall back to file-based recovery.

This commit adds a test that creates a situation where a rollback is needed (primary failover with ops in flight) and fixes another issue that the test surfaced: if a primary can't serve a seq#-based recovery request and falls back to a file copy, it still used the incoming `startSeqNo` as a filter.

 Relates to #22484 & #10708
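
The validation described above boils down to a single comparison against the sequence-number stats of the last Lucene commit. A hedged sketch with assumed names (the placeholder constant mirrors `SequenceNumbersService.UNASSIGNED_SEQ_NO` as used in the diff below); this is not the code from this commit:

```java
// Hedged sketch, not the code from this commit: decide whether the last Lucene commit
// can seed a seq#-based recovery, or whether we must fall back to copying files.
final class StartingSeqNoSketch {

    // placeholder mirroring SequenceNumbersService.UNASSIGNED_SEQ_NO in the diff below
    static final long UNASSIGNED_SEQ_NO = -2;

    /** Returns the first seq# to recover from, or UNASSIGNED_SEQ_NO to force a file-based recovery. */
    static long startingSeqNo(long maxSeqNoInCommit, long localCheckpointInCommit, long globalCheckpoint) {
        if (maxSeqNoInCommit <= globalCheckpoint) {
            // the commit contains no operations above the global checkpoint, so no Lucene
            // rollback is needed; replay can start right after the commit's local checkpoint
            return localCheckpointInCommit + 1;
        }
        // the commit may hold operations the new primary never acknowledged -> file-based recovery
        return UNASSIGNED_SEQ_NO;
    }

    public static void main(String[] args) {
        System.out.println(startingSeqNo(10, 8, 12)); // 9  -> ops-based recovery
        System.out.println(startingSeqNo(15, 8, 12)); // -2 -> fall back to file-based recovery
    }
}
```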
bleskes authored Jan 31, 2017
1 parent 29f63c7 commit eb36b82
Showing 14 changed files with 294 additions and 119 deletions.
@@ -139,6 +139,16 @@ long getMaxSeqNo() {
return nextSeqNo - 1;
}


/**
* constructs a {@link SeqNoStats} object, using local state and the supplied global checkpoint
*
* @implNote this is needed to make sure the local checkpoint and max seq no are consistent
*/
synchronized SeqNoStats getStats(final long globalCheckpoint) {
return new SeqNoStats(getMaxSeqNo(), getCheckpoint(), globalCheckpoint);
}

/**
* Waits for all operations up to the provided sequence number to complete.
*
@@ -39,6 +39,10 @@ public class SeqNoStats implements ToXContent, Writeable {
private final long globalCheckpoint;

public SeqNoStats(long maxSeqNo, long localCheckpoint, long globalCheckpoint) {
assert localCheckpoint <= maxSeqNo:
"local checkpoint [" + localCheckpoint + "] is above maximum seq no [" + maxSeqNo + "]";
// note that the global checkpoint can be higher than both maxSeqNo and localCheckpoint,
// as we use this stats object to describe Lucene commits as well as live statistics.
this.maxSeqNo = maxSeqNo;
this.localCheckpoint = localCheckpoint;
this.globalCheckpoint = globalCheckpoint;
@@ -111,7 +111,7 @@ public void markSeqNoAsCompleted(final long seqNo) {
* @return stats encapsulating the maximum sequence number, the local checkpoint and the global checkpoint
*/
public SeqNoStats stats() {
return new SeqNoStats(getMaxSeqNo(), getLocalCheckpoint(), getGlobalCheckpoint());
return localCheckpointTracker.getStats(getGlobalCheckpoint());
}

/**
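
The two hunks above move stats construction into the checkpoint tracker. A hedged reading of why, as a minimal sketch with simplified, assumed names (not the actual Elasticsearch classes): reading the max seq# and the local checkpoint through two separate calls can interleave with concurrent updates and yield a pair that violates `localCheckpoint <= maxSeqNo`, tripping the new assertion in `SeqNoStats`; taking both values under the tracker's lock keeps the snapshot consistent.

```java
// Minimal sketch with assumed names; illustrates only the consistency concern.
final class CheckpointTrackerSketch {

    private long nextSeqNo = 0;   // next seq# to hand out
    private long checkpoint = -1; // highest completed seq# (simplified: real tracker fills gaps)

    synchronized long generateSeqNo() {
        return nextSeqNo++;
    }

    synchronized void markSeqNoAsCompleted(long seqNo) {
        checkpoint = Math.max(checkpoint, seqNo); // simplified
    }

    synchronized long getMaxSeqNo() {
        return nextSeqNo - 1;
    }

    synchronized long getCheckpoint() {
        return checkpoint;
    }

    // Consistent snapshot: both values are read while holding the same lock,
    // so the returned pair always satisfies checkpoint <= maxSeqNo.
    synchronized long[] getStats() {
        return new long[] { getMaxSeqNo(), getCheckpoint() };
    }

    // Racy alternative (two independent reads, as the old stats() effectively did):
    //   long max = tracker.getMaxSeqNo();   // reads, say, 5
    //   // another thread generates seq# 6 and marks it completed
    //   long cp  = tracker.getCheckpoint(); // reads 6 -> cp > max, an impossible pair
}
```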
@@ -199,10 +199,10 @@ private RecoverySourceHandler createRecoverySourceHandler(StartRecoveryRequest r
Supplier<Long> currentClusterStateVersionSupplier = () -> clusterService.state().getVersion();
if (shard.indexSettings().isOnSharedFilesystem()) {
handler = new SharedFSRecoverySourceHandler(shard, recoveryTarget, request, currentClusterStateVersionSupplier,
this::delayNewRecoveries, logger);
this::delayNewRecoveries, settings);
} else {
handler = new RecoverySourceHandler(shard, recoveryTarget, request, currentClusterStateVersionSupplier,
this::delayNewRecoveries, recoverySettings.getChunkSize().bytesAsInt(), logger);
this::delayNewRecoveries, recoverySettings.getChunkSize().bytesAsInt(), settings);
}
return handler;
}
@@ -41,6 +41,7 @@
import org.elasticsearch.index.IndexNotFoundException;
import org.elasticsearch.index.engine.RecoveryEngineException;
import org.elasticsearch.index.mapper.MapperException;
import org.elasticsearch.index.seqno.SeqNoStats;
import org.elasticsearch.index.seqno.SequenceNumbersService;
import org.elasticsearch.index.shard.IllegalIndexShardStateException;
import org.elasticsearch.index.shard.IndexEventListener;
@@ -61,7 +62,6 @@
import org.elasticsearch.transport.TransportService;

import java.io.IOException;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;

@@ -365,7 +365,15 @@ private StartRecoveryRequest getStartRecoveryRequest(final RecoveryTarget recove
public static long getStartingSeqNo(final RecoveryTarget recoveryTarget) {
try {
final long globalCheckpoint = Translog.readGlobalCheckpoint(recoveryTarget.indexShard().shardPath().resolveTranslog());
return recoveryTarget.store().loadSeqNoStats(globalCheckpoint).getLocalCheckpoint() + 1;
final SeqNoStats seqNoStats = recoveryTarget.store().loadSeqNoStats(globalCheckpoint);
if (seqNoStats.getMaxSeqNo() <= seqNoStats.getGlobalCheckpoint()) {
// the commit point is good for seq#-based recovery, as the maximum seq# included in it
// is at or below the global checkpoint (i.e., it excludes any ops that may not be on the primary).
// Recovery will start at the first op after the local checkpoint stored in the commit.
return seqNoStats.getLocalCheckpoint() + 1;
} else {
return SequenceNumbersService.UNASSIGNED_SEQ_NO;
}
} catch (final IOException e) {
// this can happen, for example, if phase one of the recovery completed successfully, a network partition happens before the
// translog on the recovery target is opened, the recovery enters a retry loop seeing now that the index files are on disk and
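
The second fix mentioned in the commit message (not using the incoming `startSeqNo` as a filter when falling back to a file copy) lives in files that are not shown above. A hedged, hypothetical sketch of the idea, with assumed names not taken from this diff:

```java
// Hypothetical sketch; names are assumed and not taken from this diff.
final class SourceStartingSeqNoSketch {

    static final long UNASSIGNED_SEQ_NO = -2; // mirrors SequenceNumbersService.UNASSIGNED_SEQ_NO

    /** Picks the seq# from which the source replays operations to the recovering copy. */
    static long effectiveStartingSeqNo(boolean canServeOpsBasedRecovery, long requestedStartSeqNo) {
        if (canServeOpsBasedRecovery) {
            return requestedStartSeqNo; // ops-based path: replay from the point the target asked for
        }
        // file-copy fallback: the copied files define the baseline, so the target's
        // requested seq# must not be used to filter the operations that are replayed
        return UNASSIGNED_SEQ_NO;
    }

    public static void main(String[] args) {
        System.out.println(effectiveStartingSeqNo(true, 42));  // 42
        System.out.println(effectiveStartingSeqNo(false, 42)); // -2 (no filtering)
    }
}
```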