Skip to content

Commit

Permalink
Fix flakiness with SegmentReplicationSuiteIT (#11977)
Browse files Browse the repository at this point in the history
* Fix SegmentReplicationSuiteIT

This test fails because of a race during shard/node shutdown with node-to-node replication.
Fixed by properly synchronizing the creation of new replication events with cancellation, and by cancelling
after shards are closed.

Signed-off-by: Marc Handalian <marc.handalian@gmail.com>

* Remove CopyState caching from OngoingSegmentReplications.

This change removes the responsibility for caching CopyState from OngoingSegmentReplications.
1. CopyState was originally cached to prevent frequent disk reads while building segment metadata. This is now
cached lower down in IndexShard and is not required here.
2. Change prepareForReplication method to return SegmentReplicationSourceHandler directly
3. Move responsibility of creating and clearing CopyState to the handler.

Signed-off-by: Marc Handalian <marc.handalian@gmail.com>

* Fix comment for afterIndexShardClosed method.

Signed-off-by: Marc Handalian <marc.handalian@gmail.com>

* Fix comment on beforeIndexShardClosed

Signed-off-by: Marc Handalian <marc.handalian@gmail.com>

* Remove unnecessary method from OngoingSegmentReplications

Signed-off-by: Marc Handalian <marc.handalian@gmail.com>

---------

Signed-off-by: Marc Handalian <marc.handalian@gmail.com>
  • Loading branch information
mch2 committed Apr 14, 2024
1 parent cc22310 commit e828c18
Show file tree
Hide file tree
Showing 12 changed files with 125 additions and 295 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,13 @@

package org.opensearch.indices.replication;

import org.apache.lucene.tests.util.LuceneTestCase;
import org.opensearch.action.admin.indices.delete.DeleteIndexRequest;
import org.opensearch.cluster.metadata.IndexMetadata;
import org.opensearch.common.settings.Settings;
import org.opensearch.indices.replication.common.ReplicationType;
import org.opensearch.test.OpenSearchIntegTestCase;
import org.junit.Before;

@LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/9499")
@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.SUITE, minNumDataNodes = 2)
public class SegmentReplicationSuiteIT extends SegmentReplicationBaseIT {

Expand Down Expand Up @@ -64,6 +62,7 @@ public void testDropRandomNodeDuringReplication() throws Exception {
ensureYellow(INDEX_NAME);
client().prepareIndex(INDEX_NAME).setId(Integer.toString(docCount)).setSource("field", "value" + docCount).execute().get();
internalCluster().startDataOnlyNode();
ensureGreen(INDEX_NAME);
client().admin().indices().delete(new DeleteIndexRequest(INDEX_NAME)).actionGet();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ public CheckpointInfoResponse(
this.infosBytes = infosBytes;
}

/**
 * Builds a response whose metadata map is read from the checkpoint itself.
 *
 * @param checkpoint {@link ReplicationCheckpoint} stored on this response; its
 *                   {@code getMetadataMap()} result becomes this response's metadata map.
 * @param infosBytes bytes stored as this response's {@code infosBytes}.
 */
public CheckpointInfoResponse(final ReplicationCheckpoint checkpoint, final byte[] infosBytes) {
    this.checkpoint = checkpoint;
    // The metadata map is not passed separately; it comes from the checkpoint.
    this.metadataMap = checkpoint.getMetadataMap();
    this.infosBytes = infosBytes;
}

public CheckpointInfoResponse(StreamInput in) throws IOException {
this.checkpoint = new ReplicationCheckpoint(in);
this.metadataMap = in.readMap(StreamInput::readString, StoreFileMetadata::new);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,10 @@
import org.opensearch.indices.IndicesService;
import org.opensearch.indices.recovery.FileChunkWriter;
import org.opensearch.indices.recovery.RecoverySettings;
import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint;
import org.opensearch.indices.replication.common.CopyState;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand All @@ -36,7 +34,6 @@
/**
* Manages references to ongoing segrep events on a node.
* Each replica will have a new {@link SegmentReplicationSourceHandler} created when starting replication.
* CopyStates will be cached for reuse between replicas and only released when all replicas have finished copying segments.
*
* @opensearch.internal
*/
Expand All @@ -45,7 +42,6 @@ class OngoingSegmentReplications {
private static final Logger logger = LogManager.getLogger(OngoingSegmentReplications.class);
private final RecoverySettings recoverySettings;
private final IndicesService indicesService;
private final Map<ReplicationCheckpoint, CopyState> copyStateMap;
private final Map<String, SegmentReplicationSourceHandler> allocationIdToHandlers;

/**
Expand All @@ -57,46 +53,9 @@ class OngoingSegmentReplications {
OngoingSegmentReplications(IndicesService indicesService, RecoverySettings recoverySettings) {
    // Collaborators handed in by the caller.
    this.recoverySettings = recoverySettings;
    this.indicesService = indicesService;
    // Handlers live in a concurrent map; cached CopyStates live in a synchronized HashMap.
    this.allocationIdToHandlers = ConcurrentCollections.newConcurrentMap();
    this.copyStateMap = Collections.synchronizedMap(new HashMap<>());
}

/*
Operations on the {@link #copyStateMap} member.
*/

/**
 * Looks up the {@link CopyState} stored in {@link #copyStateMap} for the given checkpoint and returns it,
 * building and caching a new one when the map has no entry for that key. The method is synchronized.
 *
 * @param checkpoint {@link ReplicationCheckpoint} used as the cache key.
 * @return {@link CopyState} the cached instance (incref'd for this caller) or the newly built one.
 * @throws IOException propagated from the calls made here (presumably CopyState construction - confirm).
 */
synchronized CopyState getCachedCopyState(ReplicationCheckpoint checkpoint) throws IOException {
    if (isInCopyStateMap(checkpoint) == false) {
        // Cache miss: resolve the IndexShard from the checkpoint's shard ID and build a CopyState for it.
        final ShardId shardId = checkpoint.getShardId();
        final IndexShard indexShard = indicesService.indexServiceSafe(shardId.getIndex()).getShard(shardId.id());
        final CopyState created = new CopyState(checkpoint, indexShard);

        /*
         The map key is the checkpoint from the request, not the checkpoint of the created CopyState.
         This maximizes cache hits if replication targets make a request with an older checkpoint.
         Replication targets are expected to fetch the checkpoint in the response
         CopyState to bring themselves up to date.
         */
        addToCopyStateMap(checkpoint, created);
        return created;
    }

    // Cache hit: each replica using this checkpoint takes its own reference;
    // the matching decref happens when its copy completes.
    final CopyState cached = fetchFromCopyStateMap(checkpoint);
    cached.incRef();
    return cached;
}

/**
* Start sending files to the replica.
*
Expand All @@ -114,51 +73,43 @@ void startSegmentCopy(GetSegmentFilesRequest request, ActionListener<GetSegmentF
);
}
// update the given listener to release the CopyState before it resolves.
final ActionListener<GetSegmentFilesResponse> wrappedListener = ActionListener.runBefore(listener, () -> {
final SegmentReplicationSourceHandler sourceHandler = allocationIdToHandlers.remove(request.getTargetAllocationId());
if (sourceHandler != null) {
removeCopyState(sourceHandler.getCopyState());
}
});
final ActionListener<GetSegmentFilesResponse> wrappedListener = ActionListener.runBefore(
listener,
() -> allocationIdToHandlers.remove(request.getTargetAllocationId())
);
handler.sendFiles(request, wrappedListener);
} else {
listener.onResponse(new GetSegmentFilesResponse(Collections.emptyList()));
}
}

/**
* Prepare for a Replication event. This method constructs a {@link CopyState} holding files to be sent off of the current
* node's store. This state is intended to be sent back to Replicas before copy is initiated so the replica can perform a diff against its
* local store. It will then build a handler to orchestrate the segment copy that will be stored locally and started on a subsequent request from replicas
* with the list of required files.
* Prepare for a Replication event. This method constructs a {@link SegmentReplicationSourceHandler} that orchestrates segment copy and
* will internally incref files for copy.
*
* @param request {@link CheckpointInfoRequest}
* @param fileChunkWriter {@link FileChunkWriter} writer to handle sending files over the transport layer.
* @return {@link CopyState} the built CopyState for this replication event.
* @throws IOException - When there is an IO error building CopyState.
* @return {@link SegmentReplicationSourceHandler} the handler registered for this replication event.
*/
CopyState prepareForReplication(CheckpointInfoRequest request, FileChunkWriter fileChunkWriter) throws IOException {
final CopyState copyState = getCachedCopyState(request.getCheckpoint());
final SegmentReplicationSourceHandler newHandler = createTargetHandler(
request.getTargetNode(),
copyState,
request.getTargetAllocationId(),
fileChunkWriter
);
// putIfAbsent registers newHandler only when no handler is mapped to this allocation id yet;
// a non-null result is the handler that was already registered.
final SegmentReplicationSourceHandler existingHandler = allocationIdToHandlers.putIfAbsent(
request.getTargetAllocationId(),
newHandler
);
// If we are already replicating to this allocation Id, cancel the old and replace with a new execution.
// This will clear the old handler & referenced copy state holding an incref'd indexCommit.
if (existingHandler != null) {
logger.warn("Override handler for allocation id {}", request.getTargetAllocationId());
cancelHandlers(handler -> handler.getAllocationId().equals(request.getTargetAllocationId()), "cancel due to retry");
assert allocationIdToHandlers.containsKey(request.getTargetAllocationId()) == false;
allocationIdToHandlers.put(request.getTargetAllocationId(), newHandler);
}
assert allocationIdToHandlers.containsKey(request.getTargetAllocationId());
return copyState;
SegmentReplicationSourceHandler prepareForReplication(CheckpointInfoRequest request, FileChunkWriter fileChunkWriter) {
// computeIfAbsent returns the handler already mapped to this allocation id, or runs the
// lambda to build one and stores it under that id.
return allocationIdToHandlers.computeIfAbsent(request.getTargetAllocationId(), aId -> {
try {
// From the checkpoint's shard ID, fetch the IndexShard
final ShardId shardId = request.getCheckpoint().getShardId();
final IndexService indexService = indicesService.indexServiceSafe(shardId.getIndex());
final IndexShard indexShard = indexService.getShard(shardId.id());
return new SegmentReplicationSourceHandler(
request.getTargetNode(),
fileChunkWriter,
indexShard,
request.getTargetAllocationId(),
Math.toIntExact(recoverySettings.getChunkSize().getBytes()),
recoverySettings.getMaxConcurrentFileChunks()
);
} catch (IOException e) {
// The mapping lambda cannot throw a checked exception, so the IOException is rethrown unchecked.
throw new UncheckedIOException("Error creating replication handler", e);
}
});
}

/**
Expand All @@ -167,8 +118,8 @@ CopyState prepareForReplication(CheckpointInfoRequest request, FileChunkWriter f
* @param shard {@link IndexShard}
* @param reason {@link String} - Reason for the cancel
*/
synchronized void cancel(IndexShard shard, String reason) {
cancelHandlers(handler -> handler.getCopyState().getShard().shardId().equals(shard.shardId()), reason);
void cancel(IndexShard shard, String reason) {
// Select every handler whose shard id equals the given shard's id and cancel it with the supplied reason.
cancelHandlers(handler -> handler.shardId().equals(shard.shardId()), reason);
}

/**
Expand All @@ -177,11 +128,10 @@ synchronized void cancel(IndexShard shard, String reason) {
* @param allocationId {@link String} - Allocation ID.
* @param reason {@link String} - Reason for the cancel
*/
synchronized void cancel(String allocationId, String reason) {
void cancel(String allocationId, String reason) {
// Remove the mapping first; remove() returns null when no handler was registered for this id.
final SegmentReplicationSourceHandler handler = allocationIdToHandlers.remove(allocationId);
if (handler != null) {
handler.cancel(reason);
removeCopyState(handler.getCopyState());
}
}

Expand All @@ -194,14 +144,6 @@ void cancelReplication(DiscoveryNode node) {
cancelHandlers(handler -> handler.getTargetNode().equals(node), "Node left");
}

/**
* Checks if the {@link #copyStateMap} has the input {@link ReplicationCheckpoint}
* as a key by invoking {@link Map#containsKey(Object)}.
*
* @param replicationCheckpoint {@link ReplicationCheckpoint} key to look up.
* @return {@code true} if the map contains the key, {@code false} otherwise.
*/
boolean isInCopyStateMap(ReplicationCheckpoint replicationCheckpoint) {
return copyStateMap.containsKey(replicationCheckpoint);
}

/**
* @return the number of entries currently held in {@link #allocationIdToHandlers}.
*/
int size() {
return allocationIdToHandlers.size();
}
Expand All @@ -211,58 +153,20 @@ Map<String, SegmentReplicationSourceHandler> getHandlers() {
return allocationIdToHandlers;
}

/**
* @return the number of entries currently held in {@link #copyStateMap}.
*/
int cachedCopyStateSize() {
return copyStateMap.size();
}

/**
 * Builds a {@link SegmentReplicationSourceHandler} for one target allocation.
 * The thread pool is taken from the shard referenced by the CopyState; chunk size and the
 * maximum number of concurrent file chunks are read from {@link #recoverySettings}.
 *
 * @param node {@link DiscoveryNode} passed to the handler as its target node.
 * @param copyState {@link CopyState} passed to the handler.
 * @param allocationId {@link String} allocation id passed to the handler.
 * @param fileChunkWriter {@link FileChunkWriter} passed to the handler.
 * @return {@link SegmentReplicationSourceHandler} the newly constructed handler.
 */
private SegmentReplicationSourceHandler createTargetHandler(
    DiscoveryNode node,
    CopyState copyState,
    String allocationId,
    FileChunkWriter fileChunkWriter
) {
    // toIntExact throws ArithmeticException rather than silently truncating an oversized chunk size.
    final int chunkSizeInBytes = Math.toIntExact(recoverySettings.getChunkSize().getBytes());
    return new SegmentReplicationSourceHandler(
        node,
        fileChunkWriter,
        copyState.getShard().getThreadPool(),
        copyState,
        allocationId,
        chunkSizeInBytes,
        recoverySettings.getMaxConcurrentFileChunks()
    );
}

/**
* Adds the input {@link CopyState} object to {@link #copyStateMap} under the supplied
* {@link ReplicationCheckpoint} key. The entry is written with
* {@link Map#putIfAbsent(Object, Object)}, so a value already stored for that key is left untouched.
*
* @param checkpoint {@link ReplicationCheckpoint} used as the map key.
* @param copyState {@link CopyState} to store.
*/
private void addToCopyStateMap(ReplicationCheckpoint checkpoint, CopyState copyState) {
copyStateMap.putIfAbsent(checkpoint, copyState);
}

/**
* Given a {@link ReplicationCheckpoint}, return the corresponding
* {@link CopyState} object, if any, from {@link #copyStateMap}.
*
* @param replicationCheckpoint {@link ReplicationCheckpoint} key to look up.
* @return the mapped {@link CopyState}, or {@code null} when the map has no entry for the key.
*/
private CopyState fetchFromCopyStateMap(ReplicationCheckpoint replicationCheckpoint) {
return copyStateMap.get(replicationCheckpoint);
}

/**
* Remove a CopyState. Intended to be called after a replication event completes.
* This method will remove a copyState from the copyStateMap only if its refCount hits 0.
*
* @param copyState {@link CopyState}
* Clear handlers for any allocationIds not in sync.
* @param shardId {@link ShardId}
* @param inSyncAllocationIds {@link Set} of in-sync allocation Ids.
*/
private synchronized void removeCopyState(CopyState copyState) {
// Drop this caller's reference; the map entry is removed only when decRef() reports true.
if (copyState.decRef() == true) {
copyStateMap.remove(copyState.getRequestedReplicationCheckpoint());
}
void clearOutOfSyncIds(ShardId shardId, Set<String> inSyncAllocationIds) {
// Cancel handlers that belong to this shard and whose allocation id is absent from the in-sync set.
cancelHandlers(
(handler) -> handler.shardId().equals(shardId) && inSyncAllocationIds.contains(handler.getAllocationId()) == false,
"Shard is no longer in-sync with the primary"
);
}

/**
* Remove handlers from allocationIdToHandlers map based on a filter predicate.
* This will also decref the handler's CopyState reference.
*/
private void cancelHandlers(Predicate<? super SegmentReplicationSourceHandler> predicate, String reason) {
final List<String> allocationIds = allocationIdToHandlers.values()
Expand All @@ -278,17 +182,4 @@ private void cancelHandlers(Predicate<? super SegmentReplicationSourceHandler> p
cancel(allocationId, reason);
}
}

/**
 * Clear copystate and target handlers for the given shard whose allocation id is not in the in-sync set.
 *
 * @param shardId {@link ShardId} of the shard whose handlers are examined.
 * @param inSyncAllocationIds {@link Set} of in-sync allocation Ids.
 */
public void clearOutOfSyncIds(ShardId shardId, Set<String> inSyncAllocationIds) {
    final Predicate<SegmentReplicationSourceHandler> outOfSyncForShard = handler -> {
        // Handlers of other shards are never selected.
        if (handler.getCopyState().getShard().shardId().equals(shardId) == false) {
            return false;
        }
        return inSyncAllocationIds.contains(handler.getAllocationId()) == false;
    };
    cancelHandlers(outOfSyncForShard, "Shard is no longer in-sync with the primary");
}
}
Loading

0 comments on commit e828c18

Please sign in to comment.