Skip to content

Commit

Permalink
support dump scheduler cache snapshot to json file
Browse files Browse the repository at this point in the history
Signed-off-by: lowang-bh <lhui_wang@163.com>

fix: json marsh error for unsupport type
add root path for dumped json file

Signed-off-by: lowang-bh <lhui_wang@163.com>
  • Loading branch information
lowang-bh committed Oct 23, 2023
1 parent 0c1e1f2 commit 0d8ef12
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 9 deletions.
3 changes: 3 additions & 0 deletions cmd/scheduler/app/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,11 @@ type ServerOption struct {
PercentageOfNodesToFind int32

NodeSelector []string
CacheDumpFileDir string
EnableCacheDumper bool
}

// DecryptFunc is custom function to parse ca file
type DecryptFunc func(c *ServerOption) error

// ServerOpts server options.
Expand Down Expand Up @@ -128,6 +130,7 @@ func (s *ServerOption) AddFlags(fs *pflag.FlagSet) {
fs.BoolVar(&s.EnableMetrics, "enable-metrics", false, "Enable the metrics function; it is false by default")
fs.StringSliceVar(&s.NodeSelector, "node-selector", nil, "volcano only work with the labeled node, like: --node-selector=volcano.sh/role:train --node-selector=volcano.sh/role:serving")
fs.BoolVar(&s.EnableCacheDumper, "cache-dumper", true, "Enable the cache dumper, it's true by default")
fs.StringVar(&s.CacheDumpFileDir, "dump-dir", "/tmp", "The target dir where the json file put at when dump cache info to json file")
}

// CheckOptionOrDie check lock-object-namespace when LeaderElection is enabled.
Expand Down
1 change: 1 addition & 0 deletions cmd/scheduler/app/options/options_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ func TestAddFlags(t *testing.T) {
PercentageOfNodesToFind: defaultPercentageOfNodesToFind,
EnableLeaderElection: true,
LockObjectNamespace: defaultLockObjectNamespace,
CacheDumpFileDir: "/tmp",
}

if !reflect.DeepEqual(expected, s) {
Expand Down
4 changes: 2 additions & 2 deletions pkg/scheduler/api/cluster_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ type ClusterInfo struct {
Nodes map[string]*NodeInfo
Queues map[QueueID]*QueueInfo
NamespaceInfo map[NamespaceName]*NamespaceInfo
RevocableNodes map[string]*NodeInfo
NodeList []string
RevocableNodes map[string]*NodeInfo `json:"-"`
NodeList []string `json:"-"`
CSINodesStatus map[string]*CSINodeStatusInfo
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/scheduler/api/job_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ type TaskInfo struct {
Pod *v1.Pod

// CustomBindErrHandler is a custom callback func called when task bind err.
CustomBindErrHandler func() error
CustomBindErrHandler func() error `json:"-"`
// CustomBindErrHandlerSucceeded indicates whether CustomBindErrHandler is executed successfully.
CustomBindErrHandlerSucceeded bool
}
Expand Down
40 changes: 35 additions & 5 deletions pkg/scheduler/cache/dumper.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,15 @@
package cache

import (
"encoding/json"
"fmt"
"os"
"os/signal"
"path"
"runtime"
"strings"
"syscall"
"time"

"k8s.io/klog/v2"

Expand All @@ -32,9 +36,31 @@ import (
// for debugging purposes. Usage: run `kill -s USR2 <pid>` in the shell, where <pid>
// is the process id of the scheduler process.
type Dumper struct {
Cache Cache
Cache Cache
RootDir string // target directory for the dumped json file
}

// dumpToJSONFile marsh scheduler cache snapshot to json file
func (d *Dumper) dumpToJSONFile() {
snapshot := d.Cache.Snapshot()
name := fmt.Sprintf("snapshot-%d.json", time.Now().Unix())
fName := path.Join(d.RootDir, name)
file, err := os.OpenFile(fName, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0644)
if err != nil {
klog.Errorf("error creating snapshot because of error creating file: %v", err)
return
}
defer file.Close()
klog.Infoln("Starting to dump info in scheduler cache to file", fName)
if err = json.NewEncoder(file).Encode(snapshot.Nodes); err != nil {
klog.Errorf("Failed to dump info in scheduler cache, json encode error: %v", err)
return
}

klog.Infoln("Successfully dump info in scheduler cache to file", fName)
}

// dumpAll prints all information to log
func (d *Dumper) dumpAll() {
snapshot := d.Cache.Snapshot()
klog.Info("Dump of nodes info in scheduler cache")
Expand Down Expand Up @@ -73,16 +99,20 @@ func (d *Dumper) printJobInfo(jobInfo *api.JobInfo) string {
}

// ListenForSignal starts a goroutine that will respond when process
// receives SIGUSER2 signal.
// receives SIGUSER1/SIGUSER2 signal.
func (d *Dumper) ListenForSignal(stopCh <-chan struct{}) {
ch := make(chan os.Signal, 1)
signal.Notify(ch, syscall.SIGUSR2)
ch1 := make(chan os.Signal, 1)
ch2 := make(chan os.Signal, 1)
signal.Notify(ch1, syscall.SIGUSR1)
signal.Notify(ch2, syscall.SIGUSR2)
go func() {
for {
select {
case <-stopCh:
return
case <-ch:
case <-ch1:
d.dumpToJSONFile()
case <-ch2:
d.dumpAll()
}
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/scheduler/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ func NewScheduler(
fileWatcher: watcher,
cache: cache,
schedulePeriod: period,
dumper: schedcache.Dumper{Cache: cache},
dumper: schedcache.Dumper{Cache: cache, RootDir: options.ServerOpts.CacheDumpFileDir},
}

return scheduler, nil
Expand Down

0 comments on commit 0d8ef12

Please sign in to comment.