From 06a1111e004e09fe8708f7832f12b2cfee49a0e1 Mon Sep 17 00:00:00 2001 From: Basava Date: Tue, 10 Sep 2024 15:41:35 +0000 Subject: [PATCH] [BACKPORT 2024.1][#23335] DocDB: Set field close timestamp for the log segment being copied Summary: Original commit: 622046d812ba4978dec96d7e12f11e94e06f220b / D36956 When a log segment is being copied partially, we weren't setting the footer's `close_timestamp_micros` for the new segment. This could result in log GC that violates time based policy controlled by flag `time_based_wal_gc_clock_delta_usec`, as we don't prevent a GC of a potential log segment if it doesn't have the close timestamp set (note that this does not violate other log GC policies which are required for correctness in any way). `Log::GetSegmentsToGCUnlocked` -> `LogReader::GetSegmentPrefixNotIncluding` -> `LogReader::ViolatesMaxTimePolicy(segment)` wrongly infers that the age violates the max log retention policy. ``` bool LogReader::ViolatesMaxTimePolicy(const scoped_refptr& segment) const { ... int64_t now = GetCurrentTimeMicros(); int64_t age_seconds = (now - segment->footer().close_timestamp_micros()) / 1000000; // this will infer wrong age because close ts is missing if (age_seconds > FLAGS_log_max_seconds_to_retain) { ... return true; } return false; } ``` and then, `Log::GetSegmentsToGCUnlocked` -> `Log::ApplyTimeRetentionPolicy` doesn't prevent this since the footer doesn't have the close ts. ``` void Log::ApplyTimeRetentionPolicy(SegmentSequence* segments_to_gc) const { int64_t now = GetCurrentTimeMicros() + FLAGS_time_based_wal_gc_clock_delta_usec; for (auto iter = segments_to_gc->begin(); iter != segments_to_gc->end(); ++iter) { const auto& segment = *iter; if (!segment->footer().has_close_timestamp_micros()) continue; // We let the segment be GC'ed if it doesn't have the field close_timestamp_micros set. int64_t age_seconds = (now - segment->footer().close_timestamp_micros()) / 1000000; if (age_seconds < wal_retention_secs()) { // Truncate the list of segments to GC here -- if this one is too new, then all later ones are // also too new. segments_to_gc->truncate(iter); break; } } } ``` The above violation seems possible only on the tablet split codepath which invokes the function `ReadableLogSegment::CopyTo`, and that too when the split op isn't the last op in the parent's log segment. This is rare since there are only certain special ops that are allowed to be appended to the log after the split op (`NO_OP`, `SNAPSHOT_OP`, `CLONE_OP`, `CHANGE_CONFIG_OP`). Additional note: The field `close_timestamp_micros` seems only relevant in the log GC codepath. so there shouldn't have been any other issues caused by this bug. This diff addresses the issue by setting the field `close_timestamp_micros` for the log segment being copied. Jira: DB-12262 Test Plan: ./yb_build.sh --cxx-test='TEST_F(LogTest, CopyUpTo) {' The above test fails without the fix. Added a new test where we attempt a leader change on the parent tablet after split, forcing the log copy path to invoke actual copy (duplication) as opposed to hard linking to the paren't log segment. ``` switch (relation) { case SegmentOpIdRelation::kOpIdBeforeSegment: return true; case SegmentOpIdRelation::kOpIdIsLast: stop = true; FALLTHROUGH_INTENDED; case SegmentOpIdRelation::kEmptySegment: FALLTHROUGH_INTENDED; case SegmentOpIdRelation::kOpIdAfterSegment: RETURN_NOT_OK(env->LinkFile(src_path, dest_path)); VLOG_WITH_PREFIX(1) << Format("Hard linked $0 to $1", src_path, dest_path); return stop; case SegmentOpIdRelation::kOpIdIsInsideAndNotLast: // Copy part of the segment up to and including max_included_op_id. RETURN_NOT_OK(segment->CopyTo( env, GetNewSegmentWritableFileOptions(), dest_path, max_included_op_id)); return true; } ``` ./yb_build.sh --cxx-test='TEST_F(TabletSplitITest, TestLogCopySetsCloseTimestampInFooter) {' -n 50 --tp 1 this test fails consistently without the fix. Reviewers: timur Reviewed By: timur Subscribers: ybase Tags: #jenkins-ready Differential Revision: https://phorge.dev.yugabyte.com/D37177 --- src/yb/consensus/log-test.cc | 3 + src/yb/consensus/log_util.cc | 2 +- src/yb/integration-tests/mini_cluster.cc | 17 +++-- src/yb/integration-tests/mini_cluster.h | 13 ++-- .../integration-tests/tablet-split-itest.cc | 73 +++++++++++++++++++ 5 files changed, 96 insertions(+), 12 deletions(-) diff --git a/src/yb/consensus/log-test.cc b/src/yb/consensus/log-test.cc index a19b2130ff80..27d71eb78729 100644 --- a/src/yb/consensus/log-test.cc +++ b/src/yb/consensus/log-test.cc @@ -1659,15 +1659,18 @@ TEST_F(LogTest, CopyUpTo) { ASSERT_OK(CheckReadEntriesResultEq(entries_copy_result, entries_result)); } + bool has_replicated_entries = false; for (size_t entry_idx = 0; entry_idx < num_copied_segment_entries; ++entry_idx) { const auto& copied_entry = entries_copy_result.entries[entry_idx]; if (copied_entry->has_replicate()) { + has_replicated_entries = true; const auto& op_id = OpId::FromPB(copied_entry->replicate().id()); op_id_with_entry_meta_by_idx[op_id.index] = std::make_pair(op_id, entries_copy_result.entry_metadata[entry_idx]); } } num_ops_copied += num_copied_segment_entries; + ASSERT_EQ(has_replicated_entries, segment_copy->footer().has_close_timestamp_micros()); } if (num_ops_copied > copy_num_ops) { diff --git a/src/yb/consensus/log_util.cc b/src/yb/consensus/log_util.cc index 1e05bbd1c2e6..52a803d713de 100644 --- a/src/yb/consensus/log_util.cc +++ b/src/yb/consensus/log_util.cc @@ -388,7 +388,7 @@ Status ReadableLogSegment::CopyTo( } footer.set_num_entries(num_entries); if (latest_ht > 0) { - footer_.set_close_timestamp_micros(HybridTime(latest_ht).GetPhysicalValueMicros()); + footer.set_close_timestamp_micros(HybridTime(latest_ht).GetPhysicalValueMicros()); } // Note: log_index created here might have holes, because specific WAL segment can have holes due // to WAL rewriting. For example, we can have segment with op_ids: { 3.25, 4.26, 5.21 } and footer diff --git a/src/yb/integration-tests/mini_cluster.cc b/src/yb/integration-tests/mini_cluster.cc index 08d9791aeb68..4e08aefdbcbc 100644 --- a/src/yb/integration-tests/mini_cluster.cc +++ b/src/yb/integration-tests/mini_cluster.cc @@ -1160,13 +1160,20 @@ Result> WaitForTableActiveTabletLeadersPeers( } Status WaitUntilTabletHasLeader( - MiniCluster* cluster, const TabletId& tablet_id, CoarseTimePoint deadline) { + MiniCluster* cluster, const TabletId& tablet_id, CoarseTimePoint deadline, + RequireLeaderIsReady require_leader_is_ready) { return Wait( - [cluster, &tablet_id] { - auto tablet_peers = ListTabletPeers(cluster, [&tablet_id](auto peer) { + [cluster, &tablet_id, require_leader_is_ready] { + auto tablet_peers = ListTabletPeers( + cluster, [&tablet_id, require_leader_is_ready](auto peer) { auto consensus_result = peer->GetConsensus(); - return peer->tablet_id() == tablet_id && consensus_result && - consensus_result.get()->GetLeaderStatus() != consensus::LeaderStatus::NOT_LEADER; + if (peer->tablet_id() == tablet_id && consensus_result) { + const auto leader_status = consensus_result.get()->GetLeaderStatus(); + return require_leader_is_ready + ? leader_status == consensus::LeaderStatus::LEADER_AND_READY + : leader_status != consensus::LeaderStatus::NOT_LEADER; + } + return false; }); return tablet_peers.size() == 1; }, diff --git a/src/yb/integration-tests/mini_cluster.h b/src/yb/integration-tests/mini_cluster.h index c864e90ff84e..17cf4407dad4 100644 --- a/src/yb/integration-tests/mini_cluster.h +++ b/src/yb/integration-tests/mini_cluster.h @@ -77,6 +77,11 @@ namespace tserver { class MiniTabletServer; } +YB_DEFINE_ENUM(ListPeersFilter, (kAll)(kLeaders)(kNonLeaders)); +YB_STRONGLY_TYPED_BOOL(ForceStepDown); +YB_STRONGLY_TYPED_BOOL(IncludeTransactionStatusTablets); +YB_STRONGLY_TYPED_BOOL(RequireLeaderIsReady) + struct MiniClusterOptions { // Number of master servers. size_t num_masters = 1; @@ -316,9 +321,6 @@ MUST_USE_RESULT std::vector JumpClocks( void StepDownAllTablets(MiniCluster* cluster); void StepDownRandomTablet(MiniCluster* cluster); -YB_DEFINE_ENUM(ListPeersFilter, (kAll)(kLeaders)(kNonLeaders)); -YB_STRONGLY_TYPED_BOOL(IncludeTransactionStatusTablets); - using TabletPeerFilter = std::function; std::unordered_set ListTabletIdsForTable( @@ -367,7 +369,8 @@ Result> WaitForTableActiveTabletLeadersPeers( MonoDelta timeout = std::chrono::seconds(30) * kTimeMultiplier); Status WaitUntilTabletHasLeader( - MiniCluster* cluster, const TabletId& tablet_id, CoarseTimePoint deadline); + MiniCluster* cluster, const TabletId& tablet_id, CoarseTimePoint deadline, + RequireLeaderIsReady require_leader_is_ready = RequireLeaderIsReady::kFalse); Status WaitForLeaderOfSingleTablet( MiniCluster* cluster, tablet::TabletPeerPtr leader, MonoDelta duration, @@ -381,8 +384,6 @@ Status WaitForTableLeaders( Status WaitUntilMasterHasLeader(MiniCluster* cluster, MonoDelta timeout); -YB_STRONGLY_TYPED_BOOL(ForceStepDown); - Status StepDown( tablet::TabletPeerPtr leader, const std::string& new_leader_uuid, ForceStepDown force_step_down, MonoDelta timeout = 10s); diff --git a/src/yb/integration-tests/tablet-split-itest.cc b/src/yb/integration-tests/tablet-split-itest.cc index 7e85a21171ab..d38d2a88a1ba 100644 --- a/src/yb/integration-tests/tablet-split-itest.cc +++ b/src/yb/integration-tests/tablet-split-itest.cc @@ -31,6 +31,8 @@ #include "yb/consensus/consensus.proxy.h" #include "yb/consensus/consensus_util.h" #include "yb/consensus/log.h" +#include "yb/consensus/log.messages.h" +#include "yb/consensus/log_reader.h" #include "yb/consensus/raft_consensus.h" #include "yb/dockv/doc_key.h" @@ -895,6 +897,77 @@ TEST_F(TabletSplitITest, SplitDuringReplicaOffline) { }, 30s * kTimeMultiplier, "Waiting for TS-1 to catch up ..."), AsString(s)); } +// During apply of tablet split op, each child tablet copies the log segments from the parent. For +// each segment, the copy could either be a hard link to the parent's copy or an actual duplication +// of the segment. The below test asserts that footer of the duplicated segments is rightly formed. +TEST_F(TabletSplitITest, TestLogCopySetsCloseTimestampInFooter) { + ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_skip_deleting_split_tablets) = true; + + SetNumTablets(1); + CreateTable(); + const auto split_hash_code = ASSERT_RESULT(WriteRowsAndGetMiddleHashCode(kDefaultNumRows)); + + const auto catalog_mgr = ASSERT_RESULT(catalog_manager()); + const auto tablet = ASSERT_RESULT(GetSingleTestTabletInfo(catalog_mgr)); + const auto tablet_id = tablet->id(); + LOG(INFO) << "Source tablet id " << tablet_id; + + size_t leader_idx; + CHECK_NOTNULL(GetLeaderForTablet(cluster_.get(), tablet_id, &leader_idx)); + const auto follower1_idx = (leader_idx + 1) % 3; + const auto follower1_id = cluster_->mini_tablet_server(follower1_idx)->server()->permanent_uuid(); + const auto follower2_idx = (leader_idx + 2) % 3; + const auto follower2_id = cluster_->mini_tablet_server(follower2_idx)->server()->permanent_uuid(); + + cluster_->mini_tablet_server(follower1_idx)->Shutdown(); + + ASSERT_OK(catalog_mgr->TEST_SplitTablet(tablet, split_hash_code)); + ASSERT_OK(WaitForTabletSplitCompletion( + /* expected_non_split_tablets = */ 2, /* expected_split_tablets = */ 1, + /* num_replicas_online = */ 2)); + + // Initiate a leader change in order to append NO_OP entry to tablet's log from the new leader. + // We do this hoping that it would trigger the duplication of log segment while creation of the + // child tablets once the offline TS restarts, instead of hard linking to the parent's log + // segment. An actual copy is triggered when the split op is in the segment but not the last op. + const auto leader_peer = ASSERT_RESULT(GetLeaderPeerForTablet(cluster_.get(), tablet_id)); + ASSERT_OK(StepDown(leader_peer, follower2_id, ForceStepDown::kTrue)); + // Wait for the new leader to replicate NO_OP. + ASSERT_OK(WaitUntilTabletHasLeader( + cluster_.get(), tablet_id, CoarseMonoClock::Now() + 5s, RequireLeaderIsReady::kTrue)); + // Now follower1_idx comes back alive, and receives the split op as well as the NO_OP, and applies + // the split op while satisfying the condition that the split op isn't the last op in the segment. + ASSERT_OK(cluster_->mini_tablet_server(follower1_idx)->Start()); + // Wait for the split op to be applied on the restarted TS. + ASSERT_OK(WaitForTabletSplitCompletion( + /* expected_non_split_tablets = */ 2, /* expected_split_tablets = */ 1, + /* num_replicas_online = */ 3)); + + auto peers = ListTabletPeers( + cluster_.get(), ListPeersFilter::kAll, IncludeTransactionStatusTablets::kFalse); + for (const auto& peer : peers) { + log::SegmentSequence segments; + ASSERT_OK(peer->log()->GetLogReader()->GetSegmentsSnapshot(&segments)); + for (const auto& segment : segments) { + if (!segment->HasFooter()) { + continue; + } + auto entries_copy_result = segment->ReadEntries(); + ASSERT_OK(entries_copy_result.status); + bool has_replicated_entries = false; + for (const auto& entry : entries_copy_result.entries) { + if (entry->has_replicate()) { + has_replicated_entries = true; + break; + } + } + ASSERT_EQ(has_replicated_entries, segment->footer().has_close_timestamp_micros()) + << "T " << peer->tablet_id() << " P " << peer->permanent_uuid() + << ": Expected valid close timestamp for segment with replicated entries."; + } + } +} + // Test for https://github.com/yugabyte/yugabyte-db/issues/6890. // Writes data to the tablet, splits it and then tries to do full scan with `select count(*)` // using two different instances of YBTable one after another.