From 26a07a4f4163f7bfdb9c8faa0b1a3bffc5a7320b Mon Sep 17 00:00:00 2001 From: Hemant Bhanawat Date: Mon, 28 Nov 2022 16:25:53 +0530 Subject: [PATCH] [#15027] docdb: Fix to compute the middle of middle key for tablet splitting Summary: ``` create table demo ( id int generated always as identity, data text, primary key(id asc) ); copy demo(data) from program 'base64 -w 16384 /dev/urandom | head -c 629145600'; ``` Creates and splits a single tablet into 412MB/190MB instead of ~2*300MB. This might happen because we take the split key (midpoint) of the largest SST file, and that file contains just a contiguous range of IDs. To resolve such issues partially, we can take each SSTable file's mid-point, and do a weighted mid-point of mid-points. Each mid-point's weight could be based on the size of the SSTable, since we are doing size-based splitting. Test Plan: Added a test in table_test yb_build.sh --gtest_filter TableTest.MiddleOfMiddleKey -n 100 -- -p 1 Reviewers: arybochkin, timur Reviewed By: timur Subscribers: ybase, arybochkin Differential Revision: https://phabricator.dev.yugabyte.com/D21214 --- src/yb/rocksdb/db/version_set.cc | 59 +++++++++++++++++++++++++++++- src/yb/rocksdb/db/version_set.h | 4 +- src/yb/rocksdb/table/table_test.cc | 39 ++++++++++++++++++++ 3 files changed, 99 insertions(+), 3 deletions(-) diff --git a/src/yb/rocksdb/db/version_set.cc b/src/yb/rocksdb/db/version_set.cc index 3796261a0099..cdbc03f06247 100644 --- a/src/yb/rocksdb/db/version_set.cc +++ b/src/yb/rocksdb/db/version_set.cc @@ -2067,6 +2067,62 @@ std::string Version::DebugString(bool hex) const { return r; } +namespace { + +struct MiddleKeyWithSize { + std::string middle_key; + uint64_t size; +}; + +static bool compareKeys(MiddleKeyWithSize f1, + MiddleKeyWithSize f2) { + return f1.middle_key.compare(f2.middle_key) > 0; +} + +} // namespace + +Result Version::GetMiddleOfMiddleKeys() { + const auto level = storage_info_.num_levels_ - 1; + // Largest files are at lowest level. 
+ std::vector sst_files; + sst_files.reserve(storage_info_.files_[level].size()); + uint64_t total_size = 0; + // Get middle key and file size for every file + for (const auto* file : storage_info_.files_[level]) { + TableCache::TableReaderWithHandle trwh = VERIFY_RESULT(table_cache_->GetTableReader( + vset_->env_options_, cfd_->internal_comparator(), file->fd, kDefaultQueryId, + /* no_io = */ false, cfd_->internal_stats()->GetFileReadHist(level), + IsFilterSkipped(level, /* is_file_last_in_level = */ true))); + + const auto result_mkey = trwh.table_reader->GetMiddleKey(); + if (!result_mkey.ok()) { + if (result_mkey.status().IsIncomplete()) { + continue; + } + return result_mkey; + } + + const auto file_size = file->fd.GetTotalFileSize(); + sst_files.push_back({*result_mkey, file_size}); + total_size += file_size; + } + + if (sst_files.size() == 0) { + return STATUS(Incomplete, "Either no SST file or too small SST files."); + } + + std::sort(sst_files.begin(), sst_files.end(), compareKeys); + uint64_t sorted_size = 0; + // Weighted middle of middle based on file size + for (const auto& sst_file : sst_files) { + sorted_size += sst_file.size; + if (sorted_size > total_size/2) { + return sst_file.middle_key; + } + } + return STATUS(InternalError, "Unexpected error state."); +} + Result Version::GetLargestSstTableReader() { // Largest files are at lowest level. 
const auto level = storage_info_.num_levels_ - 1; @@ -2088,8 +2144,7 @@ Result Version::GetLargestSstTableReader() { } Result Version::GetMiddleKey() { - const auto trwh = VERIFY_RESULT(GetLargestSstTableReader()); - return trwh.table_reader->GetMiddleKey(); + return GetMiddleOfMiddleKeys(); } Result Version::TEST_GetLargestSstTableReader() { diff --git a/src/yb/rocksdb/db/version_set.h b/src/yb/rocksdb/db/version_set.h index 5a4c47f8356f..616ab94b4e98 100644 --- a/src/yb/rocksdb/db/version_set.h +++ b/src/yb/rocksdb/db/version_set.h @@ -515,7 +515,8 @@ class Version { size_t GetMemoryUsageByTableReaders(); - // Returns approximate middle key of the largest SST file (see TableReader::GetMiddleKey). + // Returns weighted middle key of the approximate middle keys of the SST files + // (see TableReader::GetMiddleKey). // Returns Status(Incomplete) if there are no SST files for this version. Result GetMiddleKey(); @@ -571,6 +572,7 @@ class Version { void UpdateFilesByCompactionPri(); Result GetLargestSstTableReader(); + Result GetMiddleOfMiddleKeys(); ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs Logger* info_log_; diff --git a/src/yb/rocksdb/table/table_test.cc b/src/yb/rocksdb/table/table_test.cc index 39235f1407db..8eddd96520ae 100644 --- a/src/yb/rocksdb/table/table_test.cc +++ b/src/yb/rocksdb/table/table_test.cc @@ -2796,6 +2796,45 @@ TEST_F(PrefixTest, PrefixAndWholeKeyTest) { } } +namespace { + +void GenerateSSTFile(rocksdb::DB* db, int start_index, int num_records) { + for (int j = start_index; j < start_index + num_records; j++) { + ASSERT_OK(db->Put(rocksdb::WriteOptions(), std::to_string(j), "1")); + } + ASSERT_OK(db->Flush(FlushOptions())); +} + +} // namespace + +TEST_F(TableTest, MiddleOfMiddleKey) { + rocksdb::Options options; + options.compaction_style = rocksdb::kCompactionStyleNone; + options.num_levels = 1; + options.create_if_missing = true; + const std::string kDBPath = test::TmpDir() + "/mid_key"; + 
ASSERT_OK(DestroyDB(kDBPath, options)); + rocksdb::DB* db; + ASSERT_OK(rocksdb::DB::Open(options, kDBPath, &db)); + + // Create two files with 200 and 300 records. + GenerateSSTFile(db, 0, 200); + GenerateSSTFile(db, 200, 300); + + // Same as the midkey of the largest sst which has 300 records. + const auto mkey_first = ASSERT_RESULT(db->GetMiddleKey()); + const auto tw = ASSERT_RESULT(db->TEST_GetLargestSstTableReader()); + const auto mid_key_of_sst = ASSERT_RESULT(tw->GetMiddleKey()); + ASSERT_EQ(mkey_first, mid_key_of_sst); + + // Create a file with 400 records. This is largest sst. + GenerateSSTFile(db, 500, 400); + + const auto mkey_second = ASSERT_RESULT(db->GetMiddleKey()); + // Still the same as the midkey of the previous largest sst. + ASSERT_EQ(mkey_second, mid_key_of_sst); +} + } // namespace rocksdb int main(int argc, char** argv) {