Skip to content

Commit

Permalink
[yugabyte#15027] docdb: Fix to compute the middle of middle key for t…
Browse files Browse the repository at this point in the history
…ablet splitting

Summary:
```

create table demo ( id int generated always as identity, data text, primary key(id asc) );
copy demo(data) from program 'base64 -w 16384 /dev/urandom | head -c 629145600';
```
Creates and splits a single tablet into 412MB/190MB instead of ~2*300MB.
This might happen because we take the split key (midpoint) of the largest SST file, and that file may contain just a contiguous range of IDs.

To resolve such issues partially, we can take each SSTable file's mid-point and compute a weighted mid-point of those mid-points. Each mid-point's weight is based on the size of its SSTable, since we are doing size-based splitting.

Test Plan:
Added a test in table_test.

yb_build.sh --gtest_filter TableTest.MiddleOfMiddleKey -n 100 -- -p 1

Reviewers: arybochkin, timur

Reviewed By: timur

Subscribers: ybase, arybochkin

Differential Revision: https://phabricator.dev.yugabyte.com/D21214
  • Loading branch information
Hemant Bhanawat authored and jayant07-yb committed Dec 7, 2022
1 parent 5120c88 commit 26a07a4
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 3 deletions.
59 changes: 57 additions & 2 deletions src/yb/rocksdb/db/version_set.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2067,6 +2067,62 @@ std::string Version::DebugString(bool hex) const {
return r;
}

namespace {

// An SST file's approximate middle key paired with the file's total size,
// used to compute a size-weighted middle key across all files of a level.
struct MiddleKeyWithSize {
  std::string middle_key;
  uint64_t size;
};

// Orders entries by middle key, descending.
// Fix: take arguments by const reference — the original passed by value,
// copying both key strings on every comparison during std::sort.
// (`static` dropped: it is redundant inside an anonymous namespace.)
bool compareKeys(const MiddleKeyWithSize& f1,
                 const MiddleKeyWithSize& f2) {
  return f1.middle_key.compare(f2.middle_key) > 0;
}

} // namespace

// Returns a size-weighted middle key across the middle keys of all SST files
// at the lowest level, for tablet splitting. Returns Status(Incomplete) when
// no file is large enough to provide a middle key.
Result<std::string> Version::GetMiddleOfMiddleKeys() {
  const auto level = storage_info_.num_levels_ - 1;
  // Largest files are at lowest level.
  std::vector<MiddleKeyWithSize> sst_files;
  sst_files.reserve(storage_info_.files_[level].size());
  uint64_t total_size = 0;
  // Get middle key and file size for every file.
  for (const auto* file : storage_info_.files_[level]) {
    TableCache::TableReaderWithHandle trwh = VERIFY_RESULT(table_cache_->GetTableReader(
        vset_->env_options_, cfd_->internal_comparator(), file->fd, kDefaultQueryId,
        /* no_io = */ false, cfd_->internal_stats()->GetFileReadHist(level),
        IsFilterSkipped(level, /* is_file_last_in_level = */ true)));

    auto result_mkey = trwh.table_reader->GetMiddleKey();
    if (!result_mkey.ok()) {
      // A file that is too small to produce a middle key is simply skipped.
      if (result_mkey.status().IsIncomplete()) {
        continue;
      }
      return result_mkey;
    }

    const auto file_size = file->fd.GetTotalFileSize();
    // Move the key into the vector instead of copying it.
    sst_files.push_back({std::move(*result_mkey), file_size});
    total_size += file_size;
  }

  if (sst_files.empty()) {
    return STATUS(Incomplete, "Either no SST file or too small SST files.");
  }

  // Weighted middle of middles: walk the files in key order, accumulating
  // file sizes; the key at which the accumulated size crosses half of the
  // total is the size-weighted middle key.
  std::sort(sst_files.begin(), sst_files.end(), compareKeys);
  uint64_t sorted_size = 0;
  for (const auto& sst_file : sst_files) {
    sorted_size += sst_file.size;
    if (sorted_size > total_size / 2) {
      return sst_file.middle_key;
    }
  }
  // Unreachable: the accumulated size must cross total_size / 2 above.
  return STATUS(InternalError, "Unexpected error state.");
}

Result<TableCache::TableReaderWithHandle> Version::GetLargestSstTableReader() {
// Largest files are at lowest level.
const auto level = storage_info_.num_levels_ - 1;
Expand All @@ -2088,8 +2144,7 @@ Result<TableCache::TableReaderWithHandle> Version::GetLargestSstTableReader() {
}

Result<std::string> Version::GetMiddleKey() {
const auto trwh = VERIFY_RESULT(GetLargestSstTableReader());
return trwh.table_reader->GetMiddleKey();
return GetMiddleOfMiddleKeys();
}

Result<TableReader*> Version::TEST_GetLargestSstTableReader() {
Expand Down
4 changes: 3 additions & 1 deletion src/yb/rocksdb/db/version_set.h
Original file line number Diff line number Diff line change
Expand Up @@ -515,7 +515,8 @@ class Version {

size_t GetMemoryUsageByTableReaders();

// Returns approximate middle key of the largest SST file (see TableReader::GetMiddleKey).
// Returns weighted middle key of the approximate middle keys of the SST files
// (see TableReader::GetMiddleKey).
// Returns Status(Incomplete) if there are no SST files for this version.
Result<std::string> GetMiddleKey();

Expand Down Expand Up @@ -571,6 +572,7 @@ class Version {
void UpdateFilesByCompactionPri();

Result<TableCache::TableReaderWithHandle> GetLargestSstTableReader();
Result<std::string> GetMiddleOfMiddleKeys();

ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs
Logger* info_log_;
Expand Down
39 changes: 39 additions & 0 deletions src/yb/rocksdb/table/table_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2796,6 +2796,45 @@ TEST_F(PrefixTest, PrefixAndWholeKeyTest) {
}
}

namespace {

// Writes num_records sequential integer keys starting at start_index and
// flushes, so the batch lands in a single SST file.
void GenerateSSTFile(rocksdb::DB* db, int start_index, int num_records) {
  const int end_index = start_index + num_records;
  for (int key = start_index; key < end_index; ++key) {
    ASSERT_OK(db->Put(rocksdb::WriteOptions(), std::to_string(key), "1"));
  }
  ASSERT_OK(db->Flush(FlushOptions()));
}

} // namespace

// Verifies that DB::GetMiddleKey returns the size-weighted middle of the
// per-SST middle keys rather than simply the middle key of the largest file.
TEST_F(TableTest, MiddleOfMiddleKey) {
  rocksdb::Options options;
  options.compaction_style = rocksdb::kCompactionStyleNone;
  options.num_levels = 1;
  options.create_if_missing = true;
  const std::string kDBPath = test::TmpDir() + "/mid_key";
  ASSERT_OK(DestroyDB(kDBPath, options));
  rocksdb::DB* db = nullptr;
  ASSERT_OK(rocksdb::DB::Open(options, kDBPath, &db));

  // Create two files with 200 and 300 records.
  GenerateSSTFile(db, 0, 200);
  GenerateSSTFile(db, 200, 300);

  // With two files, the weighted middle equals the midkey of the largest
  // sst, which has 300 records.
  const auto mkey_first = ASSERT_RESULT(db->GetMiddleKey());
  const auto tw = ASSERT_RESULT(db->TEST_GetLargestSstTableReader());
  const auto mid_key_of_sst = ASSERT_RESULT(tw->GetMiddleKey());
  ASSERT_EQ(mkey_first, mid_key_of_sst);

  // Create a file with 400 records. This is the largest sst now.
  GenerateSSTFile(db, 500, 400);

  const auto mkey_second = ASSERT_RESULT(db->GetMiddleKey());
  // Still the same as the midkey of the previous largest sst.
  ASSERT_EQ(mkey_second, mid_key_of_sst);

  // Fix: the original test leaked the DB handle. Close it on the success
  // path. (An ASSERT failure above still returns early without cleanup —
  // acceptable for a test, but noted.)
  delete db;
}

} // namespace rocksdb

int main(int argc, char** argv) {
Expand Down

0 comments on commit 26a07a4

Please sign in to comment.