Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix corruption checks v3.5 #14282

Merged
merged 29 commits into from
Sep 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
d32de2c
server: Extract triggerCorruptAlarm to function
serathius Jun 13, 2022
35cbdf3
server: Extract corruption detection to dedicated struct
serathius Jun 13, 2022
78a6f38
server: Cover corruptionMonitor with tests
serathius Jun 13, 2022
336fef4
server: Test HashByRev values to make sure they don't change
serathius May 19, 2022
bc592c7
server: Extract unsafeHashByRev function
serathius May 17, 2022
3f26995
server: Move unsafeHashByRev to new hash.go file
serathius May 17, 2022
f5ed371
server: Extract kvHash struct
serathius May 17, 2022
679e327
server: Refactor hasher
serathius May 19, 2022
22d3e4e
server: Return error from scheduleCompaction
serathius May 18, 2022
ea684db
server: Move reading KV index inside scheduleCompaction function
serathius May 19, 2022
f1a759a
server: Fix range in mock not returning same number of keys and values
serathius May 19, 2022
21e5d5d
server: Calculate hash during compaction
serathius May 19, 2022
2b8dd0d
server: Pass revision as int
serathius May 19, 2022
991b429
server: Move adjusting revision to hasher
serathius May 19, 2022
1ff5992
server: Store real rv range in hasher
serathius May 19, 2022
a3f609d
server: Return revision range that hash was calculated for
serathius May 19, 2022
6311072
server: Remove duplicated compaction revision
serathius May 19, 2022
7358362
server: Extract hasher to separate interface
serathius May 19, 2022
1200b10
server: Cache compaction hash for HashByRev API
serathius Jun 7, 2022
d3db3bc
tests: Add integration tests for compact hash
serathius Jun 7, 2022
00bc8da
tests: Add tests for HashByRev HTTP API
serathius Jun 7, 2022
037a898
tests: Unify TestCompactionHash and extend it to also Delete keys and…
serathius Jun 8, 2022
a8020a0
tests: Rename corruptHash to CorruptBBolt
serathius Jul 25, 2022
8d4ca10
tests: Move CorruptBBolt to testutil
serathius Jul 25, 2022
4a75e3d
server: Refactor compaction checker
serathius Jul 25, 2022
a56ec0b
tests: Cover periodic check in tests
serathius Jul 25, 2022
21fb173
server: Implement compaction hash checking
serathius May 20, 2022
5660bf0
server: Make corruption check optional and period configurable
serathius Jul 5, 2022
2ddb9e0
tests: Fix member id in CORRUPT alarm
serathius Jul 25, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions server/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,10 @@ type ServerConfig struct {

// InitialCorruptCheck is true to check data corruption on boot
// before serving any peer/client traffic.
InitialCorruptCheck bool
CorruptCheckTime time.Duration
InitialCorruptCheck bool
CorruptCheckTime time.Duration
CompactHashCheckEnabled bool
CompactHashCheckTime time.Duration

// PreVote is true to enable Raft Pre-Vote.
PreVote bool
Expand Down
13 changes: 11 additions & 2 deletions server/embed/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -314,8 +314,10 @@ type Config struct {
// AuthTokenTTL specifies the TTL in seconds of the simple token
AuthTokenTTL uint `json:"auth-token-ttl"`

ExperimentalInitialCorruptCheck bool `json:"experimental-initial-corrupt-check"`
ExperimentalCorruptCheckTime time.Duration `json:"experimental-corrupt-check-time"`
ExperimentalInitialCorruptCheck bool `json:"experimental-initial-corrupt-check"`
ExperimentalCorruptCheckTime time.Duration `json:"experimental-corrupt-check-time"`
ExperimentalCompactHashCheckEnabled bool `json:"experimental-compact-hash-check-enabled"`
ExperimentalCompactHashCheckTime time.Duration `json:"experimental-compact-hash-check-time"`
// ExperimentalEnableV2V3 configures URLs that expose deprecated V2 API working on V3 store.
// Deprecated in v3.5.
// TODO: Delete in v3.6 (https://github.com/etcd-io/etcd/issues/12913)
Expand Down Expand Up @@ -501,6 +503,9 @@ func NewConfig() *Config {
ExperimentalMemoryMlock: false,
ExperimentalTxnModeWriteWithSharedBuffer: true,

ExperimentalCompactHashCheckEnabled: false,
ExperimentalCompactHashCheckTime: time.Minute,

V2Deprecation: config.V2_DEPR_DEFAULT,
}
cfg.InitialCluster = cfg.InitialClusterFromName(cfg.Name)
Expand Down Expand Up @@ -698,6 +703,10 @@ func (cfg *Config) Validate() error {
return fmt.Errorf("setting experimental-enable-lease-checkpoint-persist requires experimental-enable-lease-checkpoint")
}

if cfg.ExperimentalCompactHashCheckTime <= 0 {
return fmt.Errorf("--experimental-compact-hash-check-time must be >0 (set to %v)", cfg.ExperimentalCompactHashCheckTime)
}

return nil
}

Expand Down
8 changes: 6 additions & 2 deletions server/embed/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,8 @@ func StartEtcd(inCfg *Config) (e *Etcd, err error) {
HostWhitelist: cfg.HostWhitelist,
InitialCorruptCheck: cfg.ExperimentalInitialCorruptCheck,
CorruptCheckTime: cfg.ExperimentalCorruptCheckTime,
CompactHashCheckEnabled: cfg.ExperimentalCompactHashCheckEnabled,
CompactHashCheckTime: cfg.ExperimentalCompactHashCheckTime,
PreVote: cfg.PreVote,
Logger: cfg.logger,
ForceNewCluster: cfg.ForceNewCluster,
Expand Down Expand Up @@ -247,8 +249,8 @@ func StartEtcd(inCfg *Config) (e *Etcd, err error) {

// newly started member ("memberInitialized==false")
// does not need corruption check
if memberInitialized {
if err = e.Server.CheckInitialHashKV(); err != nil {
if memberInitialized && srvcfg.InitialCorruptCheck {
if err = e.Server.CorruptionChecker().InitialCheck(); err != nil {
// set "EtcdServer" to nil, so that it does not block on "EtcdServer.Close()"
// (nothing to close since rafthttp transports have not been started)

Expand Down Expand Up @@ -339,6 +341,8 @@ func print(lg *zap.Logger, ec Config, sc config.ServerConfig, memberInitialized
zap.Bool("pre-vote", sc.PreVote),
zap.Bool("initial-corrupt-check", sc.InitialCorruptCheck),
zap.String("corrupt-check-time-interval", sc.CorruptCheckTime.String()),
zap.Bool("compact-check-time-enabled", sc.CompactHashCheckEnabled),
zap.Duration("compact-check-time-interval", sc.CompactHashCheckTime),
zap.String("auto-compaction-mode", sc.AutoCompactionMode),
zap.Duration("auto-compaction-retention", sc.AutoCompactionRetention),
zap.String("auto-compaction-interval", sc.AutoCompactionRetention.String()),
Expand Down
2 changes: 2 additions & 0 deletions server/etcdmain/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,8 @@ func newConfig() *config {
// experimental
fs.BoolVar(&cfg.ec.ExperimentalInitialCorruptCheck, "experimental-initial-corrupt-check", cfg.ec.ExperimentalInitialCorruptCheck, "Enable to check data corruption before serving any client/peer traffic.")
fs.DurationVar(&cfg.ec.ExperimentalCorruptCheckTime, "experimental-corrupt-check-time", cfg.ec.ExperimentalCorruptCheckTime, "Duration of time between cluster corruption check passes.")
fs.BoolVar(&cfg.ec.ExperimentalCompactHashCheckEnabled, "experimental-compact-hash-check-enabled", cfg.ec.ExperimentalCompactHashCheckEnabled, "Enable leader to periodically check followers compaction hashes.")
fs.DurationVar(&cfg.ec.ExperimentalCompactHashCheckTime, "experimental-compact-hash-check-time", cfg.ec.ExperimentalCompactHashCheckTime, "Duration of time between leader checks followers compaction hashes.")

fs.BoolVar(&cfg.ec.ExperimentalEnableLeaseCheckpoint, "experimental-enable-lease-checkpoint", false, "Enable leader to send regular checkpoints to other members to prevent reset of remaining TTL on leader change.")
// TODO: delete in v3.7
Expand Down
27 changes: 14 additions & 13 deletions server/etcdserver/api/v3rpc/maintenance.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,19 +66,20 @@ type ClusterStatusGetter interface {
}

type maintenanceServer struct {
lg *zap.Logger
rg etcdserver.RaftStatusGetter
kg KVGetter
bg BackendGetter
a Alarmer
lt LeaderTransferrer
hdr header
cs ClusterStatusGetter
d Downgrader
lg *zap.Logger
rg etcdserver.RaftStatusGetter
hasher mvcc.HashStorage
kg KVGetter
bg BackendGetter
a Alarmer
lt LeaderTransferrer
hdr header
cs ClusterStatusGetter
d Downgrader
}

func NewMaintenanceServer(s *etcdserver.EtcdServer) pb.MaintenanceServer {
srv := &maintenanceServer{lg: s.Cfg.Logger, rg: s, kg: s, bg: s, a: s, lt: s, hdr: newHeader(s), cs: s, d: s}
srv := &maintenanceServer{lg: s.Cfg.Logger, rg: s, hasher: s.KV().HashStorage(), kg: s, bg: s, a: s, lt: s, hdr: newHeader(s), cs: s, d: s}
if srv.lg == nil {
srv.lg = zap.NewNop()
}
Expand Down Expand Up @@ -180,7 +181,7 @@ func (ms *maintenanceServer) Snapshot(sr *pb.SnapshotRequest, srv pb.Maintenance
}

func (ms *maintenanceServer) Hash(ctx context.Context, r *pb.HashRequest) (*pb.HashResponse, error) {
h, rev, err := ms.kg.KV().Hash()
h, rev, err := ms.hasher.Hash()
if err != nil {
return nil, togRPCError(err)
}
Expand All @@ -190,12 +191,12 @@ func (ms *maintenanceServer) Hash(ctx context.Context, r *pb.HashRequest) (*pb.H
}

func (ms *maintenanceServer) HashKV(ctx context.Context, r *pb.HashKVRequest) (*pb.HashKVResponse, error) {
h, rev, compactRev, err := ms.kg.KV().HashByRev(r.Revision)
h, rev, err := ms.hasher.HashByRev(r.Revision)
if err != nil {
return nil, togRPCError(err)
}

resp := &pb.HashKVResponse{Header: &pb.ResponseHeader{Revision: rev}, Hash: h, CompactRevision: compactRev}
resp := &pb.HashKVResponse{Header: &pb.ResponseHeader{Revision: rev}, Hash: h.Hash, CompactRevision: h.CompactRevision}
ms.hdr.fill(resp.Header)
return resp, nil
}
Expand Down
Loading