From fb1b009595813ae6dabd11628281568c8537f7fd Mon Sep 17 00:00:00 2001
From: Sergey
Date: Fri, 13 Oct 2023 12:44:11 +0500
Subject: [PATCH] optimize allocations by using pre allocated buffer for priorities (#2006)

## Problem

Badger allocates a lot of objects over time. I created a simple reproducer and measured allocations after 10 minutes of running it.

```
(pprof) top
Showing nodes accounting for 267006, 99.54% of 268253 total
Dropped 71 nodes (cum <= 1341)
Showing top 10 nodes out of 14
      flat  flat%   sum%        cum   cum%
    155255 57.88% 57.88%     155255 57.88%  github.com/dgraph-io/badger/v4.(*levelsController).pickCompactLevels.func1 (inline)
     65539 24.43% 82.31%      65539 24.43%  github.com/dgraph-io/badger/v4.(*levelsController).levelTargets
     43691 16.29% 98.60%     264485 98.60%  github.com/dgraph-io/badger/v4.(*levelsController).pickCompactLevels
      2521  0.94% 99.54%       2521  0.94%  os.(*File).Stat
         0     0% 99.54%     264485 98.60%  github.com/dgraph-io/badger/v4.(*levelsController).runCompactor
         0     0% 99.54%     264485 98.60%  github.com/dgraph-io/badger/v4.(*levelsController).runCompactor.func3
         0     0% 99.54%       2521  0.94%  github.com/dgraph-io/badger/v4.(*logFile).open
         0     0% 99.54%       2521  0.94%  github.com/dgraph-io/badger/v4.(*valueLog).open
         0     0% 99.54%       2528  0.94%  github.com/dgraph-io/badger/v4.Open
         0     0% 99.54%       2521  0.94%  github.com/dgraph-io/ristretto/z.OpenMmapFile
(pprof) sample_index=alloc_space
(pprof) top
Showing nodes accounting for 238.72MB, 98.59% of 242.14MB total
Dropped 51 nodes (cum <= 1.21MB)
Showing top 10 nodes out of 34
      flat  flat%   sum%        cum   cum%
  166.41MB 68.72% 68.72%   166.41MB 68.72%  github.com/dgraph-io/badger/v4/skl.newArena (inline)
   59.04MB 24.38% 93.10%    59.04MB 24.38%  github.com/dgraph-io/badger/v4.(*levelsController).pickCompactLevels.func1 (inline)
       4MB  1.65% 94.75%        4MB  1.65%  github.com/dgraph-io/ristretto/z.Calloc (inline)
       4MB  1.65% 96.41%        4MB  1.65%  github.com/dgraph-io/badger/v4.(*levelsController).levelTargets
    3.01MB  1.24% 97.65%     3.01MB  1.24%  github.com/google/flatbuffers/go.NewBuilder (inline)
    1.27MB  0.52% 98.17%     1.27MB  0.52%  github.com/dgraph-io/ristretto.newCmRow
       1MB  0.41% 98.59%    64.04MB 26.45%  github.com/dgraph-io/badger/v4.(*levelsController).pickCompactLevels
         0     0% 98.59%     7.01MB  2.89%  github.com/dgraph-io/badger/v4.(*DB).flushMemtable
         0     0% 98.59%     7.01MB  2.89%  github.com/dgraph-io/badger/v4.(*DB).handleMemTableFlush
         0     0% 98.59%    83.20MB 34.36%  github.com/dgraph-io/badger/v4.(*DB).newMemTable
```

We can see that `pickCompactLevels` makes a fairly high number of allocations because it appends to the `prios` slice over and over again:

```
(pprof) list pickCompactLevels
Total: 268253
ROUTINE ======================== github.com/dgraph-io/badger/v4.(*levelsController).pickCompactLevels in /Users/deff/go/pkg/mod/github.com/dgraph-io/badger/v4@v4.2.0/levels.go
     43691     264485 (flat, cum) 98.60% of Total
         .          .    539:func (s *levelsController) pickCompactLevels() (prios []compactionPriority) {
         .      65539    540:    t := s.levelTargets()
         .          .    541:    addPriority := func(level int, score float64) {
         .          .    542:        pri := compactionPriority{
         .          .    543:            level:    level,
         .          .    544:            score:    score,
         .          .    545:            adjusted: score,
         .          .    546:            t:        t,
         .          .    547:        }
         .          .    548:        prios = append(prios, pri)
         .          .    549:    }
         .          .    550:
         .          .    551:    // Add L0 priority based on the number of tables.
         .      42134    552:    addPriority(0, float64(s.levels[0].numTables())/float64(s.kv.opt.NumLevelZeroTables))
         .          .    553:
         .          .    554:    // All other levels use size to calculate priority.
         .          .    555:    for i := 1; i < len(s.levels); i++ {
         .          .    556:        // Don't consider those tables that are already being compacted right now.
         .          .    557:        delSize := s.cstatus.delSize(i)
         .          .    558:
         .          .    559:        l := s.levels[i]
         .          .    560:        sz := l.getTotalSize() - delSize
         .     113121    561:        addPriority(i, float64(sz)/float64(t.targetSz[i]))
         .          .    562:    }
         .          .    563:    y.AssertTrue(len(prios) == len(s.levels))
         .          .    564:
         .          .    565:    // The following code is borrowed from PebbleDB and results in healthier LSM tree structure.
         .          .    566:    // If Li-1 has score > 1.0, then we'll divide Li-1 score by Li. If Li score is >= 1.0, then Li-1
         .          .    567:    // score is reduced, which means we'll prioritize the compaction of lower levels (L5, L4 and so
         .          .    568:    // on) over the higher levels (L0, L1 and so on). On the other hand, if Li score is < 1.0, then
         .          .    569:    // we'll increase the priority of Li-1.
         .          .    570:    // Overall what this means is, if the bottom level is already overflowing, then de-prioritize
         .          .    571:    // compaction of the above level. If the bottom level is not full, then increase the priority of
         .          .    572:    // above level.
         .          .    573:    var prevLevel int
         .          .    574:    for level := t.baseLevel; level < len(s.levels); level++ {
         .          .    575:        if prios[prevLevel].adjusted >= 1 {
         .          .    576:            // Avoid absurdly large scores by placing a floor on the score that we'll
         .          .    577:            // adjust a level by. The value of 0.01 was chosen somewhat arbitrarily
         .          .    578:            const minScore = 0.01
         .          .    579:            if prios[level].score >= minScore {
         .          .    580:                prios[prevLevel].adjusted /= prios[level].adjusted
         .          .    581:            } else {
         .          .    582:                prios[prevLevel].adjusted /= minScore
         .          .    583:            }
         .          .    584:        }
         .          .    585:        prevLevel = level
         .          .    586:    }
         .          .    587:
         .          .    588:    // Pick all the levels whose original score is >= 1.0, irrespective of their adjusted score.
         .          .    589:    // We'll still sort them by their adjusted score below. Having both these scores allows us to
         .          .    590:    // make better decisions about compacting L0. If we see a score >= 1.0, we can do L0->L0
         .          .    591:    // compactions. If the adjusted score >= 1.0, then we can do L0->Lbase compactions.
         .          .    592:    out := prios[:0]
         .          .    593:    for _, p := range prios[:len(prios)-1] {
         .          .    594:        if p.score >= 1.0 {
         .          .    595:            out = append(out, p)
         .          .    596:        }
         .          .    597:    }
         .          .    598:    prios = out
         .          .    599:
         .          .    600:    // Sort by the adjusted score.
     43691      43691    601:    sort.Slice(prios, func(i, j int) bool {
         .          .    602:        return prios[i].adjusted > prios[j].adjusted
         .          .    603:    })
         .          .    604:    return prios
         .          .    605:}
         .          .    606:
ROUTINE ======================== github.com/dgraph-io/badger/v4.(*levelsController).pickCompactLevels.func1 in /Users/deff/go/pkg/mod/github.com/dgraph-io/badger/v4@v4.2.0/levels.go
    155255     155255 (flat, cum) 57.88% of Total
         .          .    541:    addPriority := func(level int, score float64) {
         .          .    542:        pri := compactionPriority{
         .          .    543:            level:    level,
         .          .    544:            score:    score,
         .          .    545:            adjusted: score,
         .          .    546:            t:        t,
         .          .    547:        }
    155255     155255    548:        prios = append(prios, pri)
         .          .    549:    }
         .          .    550:
         .          .    551:    // Add L0 priority based on the number of tables.
         .          .    552:    addPriority(0, float64(s.levels[0].numTables())/float64(s.kv.opt.NumLevelZeroTables))
         .          .    553:
```

## Solution

I suggest two optimizations:

1. Pre-allocate the capacity of `prios` based on the number of levels in `s.levels`.
2. Reuse the `prios` buffer across compaction runs; this is safe because each compactor goroutine runs its picking loop sequentially and keeps its own buffer (see the sketch below).
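A minimal, self-contained sketch of this reuse pattern follows; the names (`priority`, `pickPriorities`) are illustrative, not Badger's actual API. The callee resets the buffer's length to zero, grows it only when its capacity is too small, and returns it so the caller can hand the same backing array to the next call:

```
package main

import "fmt"

// priority is a stand-in for Badger's compactionPriority.
type priority struct {
	level int
	score float64
}

// pickPriorities fills buf with one entry per level, reusing its backing
// array when it is large enough. Passing nil is fine; a new slice is then
// allocated once and reused afterwards.
func pickPriorities(buf []priority, numLevels int) []priority {
	if cap(buf) < numLevels {
		buf = make([]priority, 0, numLevels)
	}
	prios := buf[:0]
	for i := 0; i < numLevels; i++ {
		prios = append(prios, priority{level: i, score: float64(i)})
	}
	return prios
}

func main() {
	// A single goroutine owns the buffer, so reusing it across iterations
	// is race-free; after the first call no further allocations are needed.
	var buf []priority
	for iter := 0; iter < 3; iter++ {
		prios := pickPriorities(buf, 7)
		fmt.Println(iter, len(prios), cap(prios))
		buf = prios // keep the backing array for the next iteration
	}
}
```

In the patch below, the filtered slice (`out := prios[:0]`) and the sorted result still share the original backing array, so storing the returned slice back into `priosBuffer` in `runCompactor` preserves the grown capacity for the next run.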
Results after optimization (a 10-minute run of the reproducer):

```
(pprof) top
Showing nodes accounting for 165466, 99.84% of 165735 total
Dropped 27 nodes (cum <= 828)
Showing top 10 nodes out of 48
      flat  flat%   sum%        cum   cum%
     40962 24.72% 24.72%      40962 24.72%  github.com/dgraph-io/badger/v4.(*levelsController).levelTargets
     32768 19.77% 44.49%      32768 19.77%  github.com/dgraph-io/badger/v4/skl.(*Arena).putNode
     32768 19.77% 64.26%      32768 19.77%  github.com/dgraph-io/badger/v4/y.KeyWithTs (inline)
     21845 13.18% 77.44%      62807 37.90%  github.com/dgraph-io/badger/v4.(*levelsController).pickCompactLevels
     21845 13.18% 90.62%      21845 13.18%  github.com/dgraph-io/badger/v4.(*logFile).encodeEntry
      8192  4.94% 95.56%       8192  4.94%  github.com/dgraph-io/badger/v4/table.(*Builder).addHelper
      4681  2.82% 98.39%       4681  2.82%  regexp/syntax.(*Regexp).Simplify
      2341  1.41% 99.80%       2341  1.41%  runtime/pprof.allFrames
        64 0.039% 99.84%      32832 19.81%  github.com/dgraph-io/badger/v4.(*Txn).commitAndSend
         0     0% 99.84%      32832 19.81%  github.com/dgraph-io/badger/v4.(*DB).Update
(pprof) sample_index=alloc_space
(pprof) top
Showing nodes accounting for 180.47MB, 97.79% of 184.54MB total
Dropped 22 nodes (cum <= 0.92MB)
Showing top 10 nodes out of 53
      flat  flat%   sum%        cum   cum%
  166.41MB 90.17% 90.17%   166.41MB 90.17%  github.com/dgraph-io/badger/v4/skl.newArena
       4MB  2.17% 92.34%        4MB  2.17%  github.com/dgraph-io/ristretto/z.Calloc
    3.01MB  1.63% 93.97%     3.01MB  1.63%  github.com/google/flatbuffers/go.NewBuilder
    2.50MB  1.35% 95.32%     2.50MB  1.35%  github.com/dgraph-io/badger/v4.(*levelsController).levelTargets
    1.76MB  0.96% 96.28%     2.97MB  1.61%  compress/flate.NewWriter (inline)
    1.16MB  0.63% 96.91%     1.16MB  0.63%  github.com/dgraph-io/ristretto/z.(*Bloom).Size
    0.64MB  0.34% 97.25%     1.20MB  0.65%  compress/flate.(*compressor).init
    0.50MB  0.27% 97.52%        1MB  0.54%  github.com/dgraph-io/badger/v4.(*Txn).commitAndSend
    0.50MB  0.27% 97.79%        3MB  1.63%  github.com/dgraph-io/badger/v4.(*levelsController).pickCompactLevels
         0     0% 97.79%     2.97MB  1.61%  compress/gzip.(*Writer).Write
```

And inside `pickCompactLevels`:

```
ROUTINE ======================== github.com/dgraph-io/badger/v4.(*levelsController).pickCompactLevels in /Users/deff/dev/work/badger/levels.go
     21845      62807 (flat, cum) 37.90% of Total
         .          .    544:func (s *levelsController) pickCompactLevels(prios []compactionPriority) []compactionPriority {
         .      40962    545:    t := s.levelTargets()
         .          .    546:    addPriority := func(level int, score float64) {
         .          .    547:        pri := compactionPriority{
         .          .    548:            level:    level,
         .          .    549:            score:    score,
         .          .    550:            adjusted: score,
         .          .    551:            t:        t,
         .          .    552:        }
         .          .    553:        prios = append(prios, pri)
         .          .    554:    }
         .          .    555:
         .          .    556:    if cap(prios) < len(s.levels) {
         .          .    557:        prios = make([]compactionPriority, 0, len(s.levels))
         .          .    558:    }
         .          .    559:    prios = prios[:0]
         .          .    560:
         .          .    561:    // Add L0 priority based on the number of tables.
         .          .    562:    addPriority(0, float64(s.levels[0].numTables())/float64(s.kv.opt.NumLevelZeroTables))
         .          .    563:
         .          .    564:    // All other levels use size to calculate priority.
         .          .    565:    for i := 1; i < len(s.levels); i++ {
         .          .    566:        // Don't consider those tables that are already being compacted right now.
         .          .    567:        delSize := s.cstatus.delSize(i)
         .          .    568:
         .          .    569:        l := s.levels[i]
         .          .    570:        sz := l.getTotalSize() - delSize
         .          .    571:        addPriority(i, float64(sz)/float64(t.targetSz[i]))
         .          .    572:    }
         .          .    573:    y.AssertTrue(len(prios) == len(s.levels))
         .          .    574:
         .          .    575:    // The following code is borrowed from PebbleDB and results in healthier LSM tree structure.
         .          .    576:    // If Li-1 has score > 1.0, then we'll divide Li-1 score by Li. If Li score is >= 1.0, then Li-1
         .          .    577:    // score is reduced, which means we'll prioritize the compaction of lower levels (L5, L4 and so
         .          .    578:    // on) over the higher levels (L0, L1 and so on). On the other hand, if Li score is < 1.0, then
         .          .    579:    // we'll increase the priority of Li-1.
         .          .    580:    // Overall what this means is, if the bottom level is already overflowing, then de-prioritize
         .          .    581:    // compaction of the above level. If the bottom level is not full, then increase the priority of
         .          .    582:    // above level.
         .          .    583:    var prevLevel int
         .          .    584:    for level := t.baseLevel; level < len(s.levels); level++ {
         .          .    585:        if prios[prevLevel].adjusted >= 1 {
         .          .    586:            // Avoid absurdly large scores by placing a floor on the score that we'll
         .          .    587:            // adjust a level by. The value of 0.01 was chosen somewhat arbitrarily
         .          .    588:            const minScore = 0.01
         .          .    589:            if prios[level].score >= minScore {
         .          .    590:                prios[prevLevel].adjusted /= prios[level].adjusted
         .          .    591:            } else {
         .          .    592:                prios[prevLevel].adjusted /= minScore
         .          .    593:            }
         .          .    594:        }
         .          .    595:        prevLevel = level
         .          .    596:    }
         .          .    597:
         .          .    598:    // Pick all the levels whose original score is >= 1.0, irrespective of their adjusted score.
         .          .    599:    // We'll still sort them by their adjusted score below. Having both these scores allows us to
         .          .    600:    // make better decisions about compacting L0. If we see a score >= 1.0, we can do L0->L0
         .          .    601:    // compactions. If the adjusted score >= 1.0, then we can do L0->Lbase compactions.
         .          .    602:    out := prios[:0]
         .          .    603:    for _, p := range prios[:len(prios)-1] {
         .          .    604:        if p.score >= 1.0 {
         .          .    605:            out = append(out, p)
         .          .    606:        }
         .          .    607:    }
         .          .    608:    prios = out
         .          .    609:
         .          .    610:    // Sort by the adjusted score.
     21845      21845    611:    sort.Slice(prios, func(i, j int) bool {
         .          .    612:        return prios[i].adjusted > prios[j].adjusted
         .          .    613:    })
         .          .    614:    return prios
         .          .    615:}
         .          .    616:
```
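For an isolated sanity check of the pattern itself (outside Badger), a rough micro-benchmark along the following lines could be used; the `priority` type, the helper names, and the level count of 7 are illustrative and not part of this patch:

```
package prioreuse

import "testing"

// priority stands in for Badger's compactionPriority; it is illustrative only.
type priority struct {
	level    int
	score    float64
	adjusted float64
}

// buildFresh appends into a brand-new slice on every call, like the old code.
func buildFresh(numLevels int) []priority {
	var prios []priority
	for i := 0; i < numLevels; i++ {
		prios = append(prios, priority{level: i, score: 1, adjusted: 1})
	}
	return prios
}

// buildReuse resets and reuses buf, growing it only when it is too small,
// like the patched code.
func buildReuse(buf []priority, numLevels int) []priority {
	if cap(buf) < numLevels {
		buf = make([]priority, 0, numLevels)
	}
	prios := buf[:0]
	for i := 0; i < numLevels; i++ {
		prios = append(prios, priority{level: i, score: 1, adjusted: 1})
	}
	return prios
}

func BenchmarkBuildFresh(b *testing.B) {
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_ = buildFresh(7)
	}
}

func BenchmarkBuildReuse(b *testing.B) {
	b.ReportAllocs()
	var buf []priority
	for i := 0; i < b.N; i++ {
		buf = buildReuse(buf, 7)
	}
}
```

Running it with `go test -bench=. -benchmem` should show allocations on every iteration for the fresh-slice variant and roughly zero allocations per iteration for the reuse variant.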
## Profile from real project

Both profiles were taken 30 minutes after application start.

### Before optimization:

```
(pprof) top
Showing nodes accounting for 621.02MB, 85.32% of 727.90MB total
Dropped 550 nodes (cum <= 3.64MB)
Showing top 10 nodes out of 146
      flat  flat%   sum%        cum   cum%
  380.72MB 52.30% 52.30%   380.72MB 52.30%  github.com/dgraph-io/badger/v3.(*levelsController).pickCompactLevels.func1
  104.01MB 14.29% 66.59%   104.01MB 14.29%  github.com/dgraph-io/badger/v3/skl.newArena
   33.27MB  4.57% 71.16%    33.27MB  4.57%  github.com/dgraph-io/ristretto.newCmRow
   27.05MB  3.72% 74.88%    27.05MB  3.72%  github.com/dgraph-io/badger/v3/y.SafeCopy
   23.50MB  3.23% 78.11%    23.50MB  3.23%  github.com/dgraph-io/badger/v3.(*levelsController).levelTargets
   18.31MB  2.52% 80.62%    18.31MB  2.52%  github.com/dgraph-io/ristretto/z.(*Bloom).Size
   18.02MB  2.48% 83.10%    18.02MB  2.48%  github.com/dgraph-io/badger/v3/y.(*Slice).Resize
       8MB  1.10% 84.20%   412.23MB 56.63%  github.com/dgraph-io/badger/v3.(*levelsController).pickCompactLevels
    4.12MB  0.57% 84.77%     8.13MB  1.12%  github.com/blevesearch/vellum.(*FSTIterator).next
       4MB  0.55% 85.32%        4MB  0.55%  github.com/blevesearch/vellum.(*decoderV1).stateAt
```

### After optimization:

```
Type: alloc_space
Time: Sep 11, 2023 at 5:50pm (+05)
Entering interactive mode (type "help" for commands, "o" for options)
(pprof) top
Showing nodes accounting for 262.17MB, 66.88% of 391.99MB total
Dropped 453 nodes (cum <= 1.96MB)
Showing top 10 nodes out of 290
      flat  flat%   sum%        cum   cum%
  104.01MB 26.53% 26.53%   104.01MB 26.53%  github.com/dgraph-io/badger/v3/skl.newArena
   33.91MB  8.65% 35.18%    33.91MB  8.65%  github.com/dgraph-io/ristretto.newCmRow
   28.50MB  7.27% 42.45%    28.50MB  7.27%  github.com/dgraph-io/badger/v3.(*levelsController).levelTargets
   26.52MB  6.77% 49.22%    26.52MB  6.77%  github.com/dgraph-io/badger/v3/y.(*Slice).Resize
   25.03MB  6.38% 55.61%    25.03MB  6.38%  github.com/dgraph-io/badger/v3/y.SafeCopy
   17.16MB  4.38% 59.98%    17.16MB  4.38%  github.com/dgraph-io/ristretto/z.(*Bloom).Size
    7.12MB  1.82% 61.80%     9.12MB  2.33%  github.com/anyproto/go-chash.(*cHash).addMembers
    6.72MB  1.72% 63.51%    12.22MB  3.12%  github.com/blevesearch/vellum.(*FSTIterator).next
    6.71MB  1.71% 65.22%     6.71MB  1.71%  bytes.growSlice
    6.50MB  1.66% 66.88%     6.50MB  1.66%  github.com/blevesearch/vellum.(*builderNodePool).Get
```

## Reproducer

```
package main

import (
	"fmt"
	"log"
	"net/http"
	_ "net/http/pprof" // registers /debug/pprof handlers on the default mux
	"os"

	"github.com/dgraph-io/badger/v4"
)

// generateItems writes n small key-value pairs in a single transaction.
func generateItems(db *badger.DB, n int) error {
	return db.Update(func(txn *badger.Txn) error {
		for i := 0; i < n; i++ {
			err := txn.Set([]byte(fmt.Sprintf("key-%d", i)), []byte(fmt.Sprintf("value-%d", i)))
			if err != nil {
				return err
			}
		}
		return nil
	})
}

func run() error {
	forever := make(chan struct{})

	db, err := badger.Open(badger.DefaultOptions("./tmp"))
	if err != nil {
		return fmt.Errorf("open badger: %w", err)
	}

	err = generateItems(db, 1000)
	if err != nil {
		return fmt.Errorf("generate items: %w", err)
	}

	// Expose pprof endpoints so heap profiles can be collected while the
	// compactors keep running in the background.
	go func() {
		log.Println(http.ListenAndServe("localhost:9000", nil))
	}()

	<-forever
	return nil
}

func main() {
	err := run()
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
```
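The reproducer exposes the standard `net/http/pprof` endpoints on `localhost:9000`. The listings above appear to be heap profiles; assuming that setup, they can be collected with something like `go tool pprof http://localhost:9000/debug/pprof/heap`, switching sample types inside the interactive session with `sample_index=alloc_objects` or `sample_index=alloc_space` (the exact invocation is an assumption, not part of the original report).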
---
 db.go     |  2 +-
 levels.go | 19 ++++++++++++++++---
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/db.go b/db.go
index d30ac6c3d..c79fd0253 100644
--- a/db.go
+++ b/db.go
@@ -1617,7 +1617,7 @@ func (db *DB) Flatten(workers int) error {
 			}
 		}
 		if len(levels) <= 1 {
-			prios := db.lc.pickCompactLevels()
+			prios := db.lc.pickCompactLevels(nil)
 			if len(prios) == 0 || prios[0].score <= 1.0 {
 				db.opt.Infof("All tables consolidated into one level. Flattening done.\n")
 				return nil
diff --git a/levels.go b/levels.go
index 3e397e704..6bbaf55ca 100644
--- a/levels.go
+++ b/levels.go
@@ -473,8 +473,13 @@ func (s *levelsController) runCompactor(id int, lc *z.Closer) {
 		}
 		return false
 	}
+
+	var priosBuffer []compactionPriority
 	runOnce := func() bool {
-		prios := s.pickCompactLevels()
+		prios := s.pickCompactLevels(priosBuffer)
+		defer func() {
+			priosBuffer = prios
+		}()
 		if id == 0 {
 			// Worker ID zero prefers to compact L0 always.
 			prios = moveL0toFront(prios)
@@ -536,7 +541,9 @@ func (s *levelsController) lastLevel() *levelHandler {
 
 // pickCompactLevel determines which level to compact.
 // Based on: https://github.com/facebook/rocksdb/wiki/Leveled-Compaction
-func (s *levelsController) pickCompactLevels() (prios []compactionPriority) {
+// It tries to reuse priosBuffer to reduce memory allocation,
+// passing nil is acceptable, then new memory will be allocated.
+func (s *levelsController) pickCompactLevels(priosBuffer []compactionPriority) (prios []compactionPriority) {
 	t := s.levelTargets()
 	addPriority := func(level int, score float64) {
 		pri := compactionPriority{
@@ -548,6 +555,12 @@ func (s *levelsController) pickCompactLevels() (prios []compactionPriority) {
 		prios = append(prios, pri)
 	}
 
+	// Grow buffer to fit all levels.
+	if cap(priosBuffer) < len(s.levels) {
+		priosBuffer = make([]compactionPriority, 0, len(s.levels))
+	}
+	prios = priosBuffer[:0]
+
 	// Add L0 priority based on the number of tables.
 	addPriority(0, float64(s.levels[0].numTables())/float64(s.kv.opt.NumLevelZeroTables))
 
@@ -1707,7 +1720,7 @@ type LevelInfo struct {
 
 func (s *levelsController) getLevelInfo() []LevelInfo {
 	t := s.levelTargets()
-	prios := s.pickCompactLevels()
+	prios := s.pickCompactLevels(nil)
 	result := make([]LevelInfo, len(s.levels))
 	for i, l := range s.levels {
 		l.RLock()