Fix flakiness with SegmentReplicationSuiteIT (#11977)

* Fix SegmentReplicationSuiteIT This test fails because of a race during shard/node shutdown with node-node replication. Fixed by properly synchronizing creation of new replication events with cancellation and cancelling after shards are closed. Signed-off-by: Marc Handalian <marc.handalian@gmail.com> * Remove CopyState caching from OngoingSegmentReplications. This change removes the responsibility of caching CopyState inside of OngoingSegmentReplications. 1. CopyState was originally cached to prevent frequent disk reads while building segment metadata. This is now cached lower down in IndexShard and is not required here. 2. Change prepareForReplication method to return SegmentReplicationSourceHandler directly 3. Move responsibility of creating and clearing CopyState to the handler. Signed-off-by: Marc Handalian <marc.handalian@gmail.com> * Fix comment for afterIndexShardClosed method. Signed-off-by: Marc Handalian <marc.handalian@gmail.com> * Fix comment on beforeIndexShardClosed Signed-off-by: Marc Handalian <marc.handalian@gmail.com> * Remove unnecessary method from OngoingSegmentReplications Signed-off-by: Marc Handalian <marc.handalian@gmail.com> --------- Signed-off-by: Marc Handalian <marc.handalian@gmail.com>
opensearch-project · Apr 14, 2024 · e828c18 · e828c18
1 parent cc22310
commit e828c18
Show file tree

Hide file tree

Showing 12 changed files with 125 additions and 295 deletions.
diff --git a/...nternalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationSuiteIT.java b/...nternalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationSuiteIT.java
@@ -8,15 +8,13 @@
 
 package org.opensearch.indices.replication;
 
-import org.apache.lucene.tests.util.LuceneTestCase;
 import org.opensearch.action.admin.indices.delete.DeleteIndexRequest;
 import org.opensearch.cluster.metadata.IndexMetadata;
 import org.opensearch.common.settings.Settings;
 import org.opensearch.indices.replication.common.ReplicationType;
 import org.opensearch.test.OpenSearchIntegTestCase;
 import org.junit.Before;
 
-@LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/9499")
 @OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.SUITE, minNumDataNodes = 2)
 public class SegmentReplicationSuiteIT extends SegmentReplicationBaseIT {
 
@@ -64,6 +62,7 @@ public void testDropRandomNodeDuringReplication() throws Exception {
         ensureYellow(INDEX_NAME);
         client().prepareIndex(INDEX_NAME).setId(Integer.toString(docCount)).setSource("field", "value" + docCount).execute().get();
         internalCluster().startDataOnlyNode();
+        ensureGreen(INDEX_NAME);
         client().admin().indices().delete(new DeleteIndexRequest(INDEX_NAME)).actionGet();
     }
 

diff --git a/server/src/main/java/org/opensearch/indices/replication/CheckpointInfoResponse.java b/server/src/main/java/org/opensearch/indices/replication/CheckpointInfoResponse.java
@@ -40,6 +40,12 @@ public CheckpointInfoResponse(
         this.infosBytes = infosBytes;
     }
 
+    public CheckpointInfoResponse(final ReplicationCheckpoint checkpoint, final byte[] infosBytes) {
+        this.checkpoint = checkpoint;
+        this.infosBytes = infosBytes;
+        this.metadataMap = checkpoint.getMetadataMap();
+    }
+
     public CheckpointInfoResponse(StreamInput in) throws IOException {
         this.checkpoint = new ReplicationCheckpoint(in);
         this.metadataMap = in.readMap(StreamInput::readString, StoreFileMetadata::new);

diff --git a/server/src/main/java/org/opensearch/indices/replication/OngoingSegmentReplications.java b/server/src/main/java/org/opensearch/indices/replication/OngoingSegmentReplications.java
@@ -21,12 +21,10 @@
 import org.opensearch.indices.IndicesService;
 import org.opensearch.indices.recovery.FileChunkWriter;
 import org.opensearch.indices.recovery.RecoverySettings;
-import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint;
-import org.opensearch.indices.replication.common.CopyState;
 
 import java.io.IOException;
+import java.io.UncheckedIOException;
 import java.util.Collections;
-import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -36,7 +34,6 @@
 /**
  * Manages references to ongoing segrep events on a node.
  * Each replica will have a new {@link SegmentReplicationSourceHandler} created when starting replication.
- * CopyStates will be cached for reuse between replicas and only released when all replicas have finished copying segments.
  *
  * @opensearch.internal
  */
@@ -45,7 +42,6 @@ class OngoingSegmentReplications {
     private static final Logger logger = LogManager.getLogger(OngoingSegmentReplications.class);
     private final RecoverySettings recoverySettings;
     private final IndicesService indicesService;
-    private final Map<ReplicationCheckpoint, CopyState> copyStateMap;
     private final Map<String, SegmentReplicationSourceHandler> allocationIdToHandlers;
 
     /**
@@ -57,46 +53,9 @@ class OngoingSegmentReplications {
     OngoingSegmentReplications(IndicesService indicesService, RecoverySettings recoverySettings) {
         this.indicesService = indicesService;
         this.recoverySettings = recoverySettings;
-        this.copyStateMap = Collections.synchronizedMap(new HashMap<>());
         this.allocationIdToHandlers = ConcurrentCollections.newConcurrentMap();
     }
 
-    /*
-      Operations on the {@link #copyStateMap} member.
-     */
-
-    /**
-     * A synchronized method that checks {@link #copyStateMap} for the given {@link ReplicationCheckpoint} key
-     * and returns the cached value if one is present. If the key is not present, a {@link CopyState}
-     * object is constructed and stored in the map before being returned.
-     */
-    synchronized CopyState getCachedCopyState(ReplicationCheckpoint checkpoint) throws IOException {
-        if (isInCopyStateMap(checkpoint)) {
-            final CopyState copyState = fetchFromCopyStateMap(checkpoint);
-            // we incref the copyState for every replica that is using this checkpoint.
-            // decref will happen when copy completes.
-            copyState.incRef();
-            return copyState;
-        } else {
-            // From the checkpoint's shard ID, fetch the IndexShard
-            ShardId shardId = checkpoint.getShardId();
-            final IndexService indexService = indicesService.indexServiceSafe(shardId.getIndex());
-            final IndexShard indexShard = indexService.getShard(shardId.id());
-            // build the CopyState object and cache it before returning
-            final CopyState copyState = new CopyState(checkpoint, indexShard);
-
-            /*
-              Use the checkpoint from the request as the key in the map, rather than
-              the checkpoint from the created CopyState. This maximizes cache hits
-              if replication targets make a request with an older checkpoint.
-              Replication targets are expected to fetch the checkpoint in the response
-              CopyState to bring themselves up to date.
-             */
-            addToCopyStateMap(checkpoint, copyState);
-            return copyState;
-        }
-    }
-
     /**
      * Start sending files to the replica.
      *
@@ -114,51 +73,43 @@ void startSegmentCopy(GetSegmentFilesRequest request, ActionListener<GetSegmentF
                 );
             }
             // update the given listener to release the CopyState before it resolves.
-            final ActionListener<GetSegmentFilesResponse> wrappedListener = ActionListener.runBefore(listener, () -> {
-                final SegmentReplicationSourceHandler sourceHandler = allocationIdToHandlers.remove(request.getTargetAllocationId());
-                if (sourceHandler != null) {
-                    removeCopyState(sourceHandler.getCopyState());
-                }
-            });
+            final ActionListener<GetSegmentFilesResponse> wrappedListener = ActionListener.runBefore(
+                listener,
+                () -> allocationIdToHandlers.remove(request.getTargetAllocationId())
+            );
             handler.sendFiles(request, wrappedListener);
         } else {
             listener.onResponse(new GetSegmentFilesResponse(Collections.emptyList()));
         }
     }
 
     /**
-     * Prepare for a Replication event. This method constructs a {@link CopyState} holding files to be sent off of the current
-     * node's store.  This state is intended to be sent back to Replicas before copy is initiated so the replica can perform a diff against its
-     * local store.  It will then build a handler to orchestrate the segment copy that will be stored locally and started on a subsequent request from replicas
-     * with the list of required files.
+     * Prepare for a Replication event. This method constructs a {@link SegmentReplicationSourceHandler} that orchestrates segment copy and
+     * will internally incref files for copy.
      *
      * @param request         {@link CheckpointInfoRequest}
      * @param fileChunkWriter {@link FileChunkWriter} writer to handle sending files over the transport layer.
-     * @return {@link CopyState} the built CopyState for this replication event.
-     * @throws IOException - When there is an IO error building CopyState.
+     * @return {@link SegmentReplicationSourceHandler} the built CopyState for this replication event.
      */
-    CopyState prepareForReplication(CheckpointInfoRequest request, FileChunkWriter fileChunkWriter) throws IOException {
-        final CopyState copyState = getCachedCopyState(request.getCheckpoint());
-        final SegmentReplicationSourceHandler newHandler = createTargetHandler(
-            request.getTargetNode(),
-            copyState,
-            request.getTargetAllocationId(),
-            fileChunkWriter
-        );
-        final SegmentReplicationSourceHandler existingHandler = allocationIdToHandlers.putIfAbsent(
-            request.getTargetAllocationId(),
-            newHandler
-        );
-        // If we are already replicating to this allocation Id, cancel the old and replace with a new execution.
-        // This will clear the old handler & referenced copy state holding an incref'd indexCommit.
-        if (existingHandler != null) {
-            logger.warn("Override handler for allocation id {}", request.getTargetAllocationId());
-            cancelHandlers(handler -> handler.getAllocationId().equals(request.getTargetAllocationId()), "cancel due to retry");
-            assert allocationIdToHandlers.containsKey(request.getTargetAllocationId()) == false;
-            allocationIdToHandlers.put(request.getTargetAllocationId(), newHandler);
-        }
-        assert allocationIdToHandlers.containsKey(request.getTargetAllocationId());
-        return copyState;
+    SegmentReplicationSourceHandler prepareForReplication(CheckpointInfoRequest request, FileChunkWriter fileChunkWriter) {
+        return allocationIdToHandlers.computeIfAbsent(request.getTargetAllocationId(), aId -> {
+            try {
+                // From the checkpoint's shard ID, fetch the IndexShard
+                final ShardId shardId = request.getCheckpoint().getShardId();
+                final IndexService indexService = indicesService.indexServiceSafe(shardId.getIndex());
+                final IndexShard indexShard = indexService.getShard(shardId.id());
+                return new SegmentReplicationSourceHandler(
+                    request.getTargetNode(),
+                    fileChunkWriter,
+                    indexShard,
+                    request.getTargetAllocationId(),
+                    Math.toIntExact(recoverySettings.getChunkSize().getBytes()),
+                    recoverySettings.getMaxConcurrentFileChunks()
+                );
+            } catch (IOException e) {
+                throw new UncheckedIOException("Error creating replication handler", e);
+            }
+        });
     }
 
     /**
@@ -167,8 +118,8 @@ CopyState prepareForReplication(CheckpointInfoRequest request, FileChunkWriter f
      * @param shard  {@link IndexShard}
      * @param reason {@link String} - Reason for the cancel
      */
-    synchronized void cancel(IndexShard shard, String reason) {
-        cancelHandlers(handler -> handler.getCopyState().getShard().shardId().equals(shard.shardId()), reason);
+    void cancel(IndexShard shard, String reason) {
+        cancelHandlers(handler -> handler.shardId().equals(shard.shardId()), reason);
     }
 
     /**
@@ -177,11 +128,10 @@ synchronized void cancel(IndexShard shard, String reason) {
      * @param allocationId {@link String} - Allocation ID.
      * @param reason       {@link String} - Reason for the cancel
      */
-    synchronized void cancel(String allocationId, String reason) {
+    void cancel(String allocationId, String reason) {
         final SegmentReplicationSourceHandler handler = allocationIdToHandlers.remove(allocationId);
         if (handler != null) {
             handler.cancel(reason);
-            removeCopyState(handler.getCopyState());
         }
     }
 
@@ -194,14 +144,6 @@ void cancelReplication(DiscoveryNode node) {
         cancelHandlers(handler -> handler.getTargetNode().equals(node), "Node left");
     }
 
-    /**
-     * Checks if the {@link #copyStateMap} has the input {@link ReplicationCheckpoint}
-     * as a key by invoking {@link Map#containsKey(Object)}.
-     */
-    boolean isInCopyStateMap(ReplicationCheckpoint replicationCheckpoint) {
-        return copyStateMap.containsKey(replicationCheckpoint);
-    }
-
     int size() {
         return allocationIdToHandlers.size();
     }
@@ -211,58 +153,20 @@ Map<String, SegmentReplicationSourceHandler> getHandlers() {
         return allocationIdToHandlers;
     }
 
-    int cachedCopyStateSize() {
-        return copyStateMap.size();
-    }
-
-    private SegmentReplicationSourceHandler createTargetHandler(
-        DiscoveryNode node,
-        CopyState copyState,
-        String allocationId,
-        FileChunkWriter fileChunkWriter
-    ) {
-        return new SegmentReplicationSourceHandler(
-            node,
-            fileChunkWriter,
-            copyState.getShard().getThreadPool(),
-            copyState,
-            allocationId,
-            Math.toIntExact(recoverySettings.getChunkSize().getBytes()),
-            recoverySettings.getMaxConcurrentFileChunks()
-        );
-    }
-
     /**
-     * Adds the input {@link CopyState} object to {@link #copyStateMap}.
-     * The key is the CopyState's {@link ReplicationCheckpoint} object.
-     */
-    private void addToCopyStateMap(ReplicationCheckpoint checkpoint, CopyState copyState) {
-        copyStateMap.putIfAbsent(checkpoint, copyState);
-    }
-
-    /**
-     * Given a {@link ReplicationCheckpoint}, return the corresponding
-     * {@link CopyState} object, if any, from {@link #copyStateMap}.
-     */
-    private CopyState fetchFromCopyStateMap(ReplicationCheckpoint replicationCheckpoint) {
-        return copyStateMap.get(replicationCheckpoint);
-    }
-
-    /**
-     * Remove a CopyState. Intended to be called after a replication event completes.
-     * This method will remove a copyState from the copyStateMap only if its refCount hits 0.
-     *
-     * @param copyState {@link CopyState}
+     * Clear handlers for any allocationIds not in sync.
+     * @param shardId {@link ShardId}
+     * @param inSyncAllocationIds {@link List} of in-sync allocation Ids.
      */
-    private synchronized void removeCopyState(CopyState copyState) {
-        if (copyState.decRef() == true) {
-            copyStateMap.remove(copyState.getRequestedReplicationCheckpoint());
-        }
+    void clearOutOfSyncIds(ShardId shardId, Set<String> inSyncAllocationIds) {
+        cancelHandlers(
+            (handler) -> handler.shardId().equals(shardId) && inSyncAllocationIds.contains(handler.getAllocationId()) == false,
+            "Shard is no longer in-sync with the primary"
+        );
     }
 
     /**
      * Remove handlers from allocationIdToHandlers map based on a filter predicate.
-     * This will also decref the handler's CopyState reference.
      */
     private void cancelHandlers(Predicate<? super SegmentReplicationSourceHandler> predicate, String reason) {
         final List<String> allocationIds = allocationIdToHandlers.values()
@@ -278,17 +182,4 @@ private void cancelHandlers(Predicate<? super SegmentReplicationSourceHandler> p
             cancel(allocationId, reason);
         }
     }
-
-    /**
-     * Clear copystate and target handlers for any non insync allocationIds.
-     * @param shardId {@link ShardId}
-     * @param inSyncAllocationIds {@link List} of in-sync allocation Ids.
-     */
-    public void clearOutOfSyncIds(ShardId shardId, Set<String> inSyncAllocationIds) {
-        cancelHandlers(
-            (handler) -> handler.getCopyState().getShard().shardId().equals(shardId)
-                && inSyncAllocationIds.contains(handler.getAllocationId()) == false,
-            "Shard is no longer in-sync with the primary"
-        );
-    }
 }