
3.4.18 etcd data inconsistency #14211

Open
chaochn47 opened this issue Jul 11, 2022 · 23 comments
Labels
area/bbolt data/corruption priority/important-soon Must be staffed and worked on either currently, or very soon, ideally in time for the next release. release/v3.4 stage/tracked type/bug

Comments

@chaochn47
Member

chaochn47 commented Jul 11, 2022

What happened?

A 3-member etcd cluster returned inconsistent data.

'bad' Node

{
   "level": "info",
   "ts": "2022-06-09T09:21:10.368Z",
   "caller": "mvcc/kvstore_compaction.go:56",
   "msg": "finished scheduled compaction",
   "compact-revision": 165517829,
   "took": "13.589883ms"
}
{
    "level": "fatal",
    "ts": "2022-06-09T09:22:13.472Z",
    "caller": "mvcc/kvstore_txn.go:150",
    "msg": "range failed to find revision pair",
    "revision-main": 165519108,
    "revision-sub": 0,
    "stacktrace": "
          go.etcd.io/etcd/mvcc.(*storeTxnRead).rangeKeys
              /home/ec2-user/workplace/etcd/src/etcd/mvcc/kvstore_txn.go:150
          go.etcd.io/etcd/mvcc.(*storeTxnRead).Range
              /home/ec2-user/workplace/etcd/src/etcd/mvcc/kvstore_txn.go:51
          go.etcd.io/etcd/mvcc.(*metricsTxnWrite).Range
              /home/ec2-user/workplace/etcd/src/etcd/mvcc/metrics_txn.go:37
          go.etcd.io/etcd/etcdserver.(*applierV3backend).Range
              /home/ec2-user/workplace/etcd/src/etcd/etcdserver/apply.go:286
          go.etcd.io/etcd/etcdserver.(*EtcdServer).Range.func3
              /home/ec2-user/workplace/etcd/src/etcd/etcdserver/v3_server.go:121
          go.etcd.io/etcd/etcdserver.(*EtcdServer).doSerialize
              /home/ec2-user/workplace/etcd/src/etcd/etcdserver/v3_server.go:600
          go.etcd.io/etcd/etcdserver.(*EtcdServer).Range
              /home/ec2-user/workplace/etcd/src/etcd/etcdserver/v3_server.go:122
          go.etcd.io/etcd/etcdserver/api/v3rpc.(*kvServer).Range
              /home/ec2-user/workplace/etcd/src/etcd/etcdserver/api/v3rpc/key.go:52
          go.etcd.io/etcd/etcdserver/etcdserverpb._KV_Range_Handler.func1
              /home/ec2-user/workplace/etcd/src/etcd/etcdserver/etcdserverpb/rpc.pb.go:3554
          github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1
              /home/ec2-user/go/pkg/mod/github.com/grpc-ecosystem/go-grpc-middleware@v1.0.1-0.20190118093823-f849b5445de4/chain.go:31
          github.com/grpc-ecosystem/go-grpc-prometheus.(*ServerMetrics).UnaryServerInterceptor.func1
              /home/ec2-user/go/pkg/mod/github.com/grpc-ecosystem/go-grpc-prometheus@v1.2.0/server_metrics.go:107
          github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1
              /home/ec2-user/go/pkg/mod/github.com/grpc-ecosystem/go-grpc-middleware@v1.0.1-0.20190118093823-f849b5445de4/chain.go:34
          go.etcd.io/etcd/etcdserver/api/v3rpc.newUnaryInterceptor.func1
              /home/ec2-user/workplace/etcd/src/etcd/etcdserver/api/v3rpc/interceptor.go:70
          github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1
              /home/ec2-user/go/pkg/mod/github.com/grpc-ecosystem/go-grpc-middleware@v1.0.1-0.20190118093823-f849b5445de4/chain.go:34
          go.etcd.io/etcd/etcdserver/api/v3rpc.newLogUnaryInterceptor.func1
              /home/ec2-user/workplace/etcd/src/etcd/etcdserver/api/v3rpc/interceptor.go:77
          github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1
              /home/ec2-user/go/pkg/mod/github.com/grpc-ecosystem/go-grpc-middleware@v1.0.1-0.20190118093823-f849b5445de4/chain.go:39
          go.etcd.io/etcd/etcdserver/etcdserverpb._KV_Range_Handler
              /home/ec2-user/workplace/etcd/src/etcd/etcdserver/etcdserverpb/rpc.pb.go:3556
          google.golang.org/grpc.(*Server).processUnaryRPC
              /home/ec2-user/go/pkg/mod/google.golang.org/grpc@v1.26.0/server.go:1024
          google.golang.org/grpc.(*Server).handleStream
              /home/ec2-user/go/pkg/mod/google.golang.org/grpc@v1.26.0/server.go:1313
          google.golang.org/grpc.(*Server).serveStreams.func1.1
              /home/ec2-user/go/pkg/mod/google.golang.org/grpc@v1.26.0/server.go:722"
}

kube-controller-manager continuously failed to update its lock due to partial data in etcd.

W0630 19:46:09.303166      10 watcher.go:199] watch chan error: etcdserver: mvcc: required revision has been compacted

E0630 19:45:38.273530      10 status.go:71] apiserver received an error that is not an metav1.Status: &errors.errorString{s:"converting (v1.ConfigMap) to (core.Endpoints): Subsets not present in src"}

What did you expect to happen?

The error range failed to find revision pair should never happen, because a read transaction reads either from the write buffer or from boltdb.
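For illustration only, here is a minimal toy model (not the actual etcd source; the type and function names are made up) of the invariant behind that statement: every revision the in-memory treeIndex reports for a key must resolve either in the backend's write-back buffer or in the boltdb key bucket, and the fatal above fires when neither has it.

package main

import "fmt"

// Toy model of the mvcc read path: index maps key -> revisions, and every
// revision must be found in the buffer or on disk.
type revision struct{ main, sub int64 }

type store struct {
	index  map[string][]revision // models the in-memory treeIndex
	buffer map[revision][]byte   // models the write-back buffer
	disk   map[revision][]byte   // models the boltdb "key" bucket
}

func (s *store) rangeKey(key string) ([][]byte, error) {
	var vals [][]byte
	for _, rev := range s.index[key] {
		v, ok := s.buffer[rev]
		if !ok {
			v, ok = s.disk[rev]
		}
		if !ok {
			// This is the condition where etcd logs the fatal above.
			return nil, fmt.Errorf("range failed to find revision pair: main=%d sub=%d", rev.main, rev.sub)
		}
		vals = append(vals, v)
	}
	return vals, nil
}

func main() {
	s := &store{
		index:  map[string][]revision{"/registry/pods/p1": {{main: 165519108, sub: 0}}},
		buffer: map[revision][]byte{},
		disk:   map[revision][]byte{}, // revision missing everywhere -> invariant violated
	}
	if _, err := s.rangeKey("/registry/pods/p1"); err != nil {
		fmt.Println(err)
	}
}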

We reported this case as a reference.

How can we reproduce it (as minimally and precisely as possible)?

Unfortunately, I don't have any clue to reproduce it.

Anything else we need to know?

  1. mvcc revision diverged between nodes.

     (screenshot: Screen Shot 2022-07-05 at 12 28 23 PM)

  2. db size diverged.

     (screenshot: Screen Shot 2022-07-05 at 12 28 30 PM)

  3. compact failed to apply because of the revision divergence.
  4. consistency checker was not enabled in this old cluster to help on-call operators retain WAL logs in time (see the sketch after this list for an out-of-band check).
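Since the built-in checker was off, a divergence like this can also be caught out of band. Below is a hedged sketch (not the etcd tooling itself) that compares per-member revision, raft index, db size, and HashKV using the clientv3 Maintenance API; the endpoints are placeholders, and the import path shown is the 3.4 one.

package main

import (
	"context"
	"fmt"
	"time"

	"go.etcd.io/etcd/clientv3"
)

func main() {
	// Placeholder endpoints; replace with the real member client URLs.
	endpoints := []string{"http://m1:2379", "http://m2:2379", "http://m3:2379"}

	cli, err := clientv3.New(clientv3.Config{Endpoints: endpoints, DialTimeout: 5 * time.Second})
	if err != nil {
		panic(err)
	}
	defer cli.Close()

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	for _, ep := range endpoints {
		st, err := cli.Status(ctx, ep)
		if err != nil {
			fmt.Printf("%s: status error: %v\n", ep, err)
			continue
		}
		// rev=0 hashes up to the member's latest revision; to compare members
		// strictly, pass an explicit revision that all members have applied.
		h, err := cli.HashKV(ctx, ep, 0)
		if err != nil {
			fmt.Printf("%s: hashkv error: %v\n", ep, err)
			continue
		}
		fmt.Printf("%s revision=%d raftIndex=%d dbSize=%d hash=%d compactRev=%d\n",
			ep, st.Header.Revision, st.RaftIndex, st.DbSize, h.Hash, h.CompactRevision)
	}
}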

Etcd version (please run commands below)

$ etcd --version
v3.4.18

Etcd configuration (command line flags or environment variables)

paste your configuration here

Etcd debug information (please run commands below, feel free to obfuscate the IP address or FQDN in the output)

$ etcdctl member list -w table
# paste output here

$ etcdctl --endpoints=<member list> endpoint status -w table
# paste output here

Relevant log output

No response

ahrtr added the priority/important-soon label on Jul 12, 2022
@ahrtr
Member

ahrtr commented Jul 13, 2022

Please provide the info below,

etcdctl  endpoint status -w json --cluster
# Run command below on each member
./etcd-dump-db  iterate-bucket  ~/tmp/etcd/infra1.etcd/  meta  --decode

@chaochn47
Member Author

chaochn47 commented Jul 13, 2022

Thanks for the comment!

Unfortunately the point-in-time db state on each member was lost.

Corruption check on the new clusters will raise an alarm and print the consistent index (CI) on disk if this issue happens again.

@ahrtr
Member

ahrtr commented Jul 14, 2022

if this issue happened again.

Is it easy to reproduce this issue in your environment (dev or production)?

Corruption check on the new clusters

Did you just create a new cluster or replace one problematic member with a new one?

@chaochn47
Member Author

Is it easy to reproduce this issue in your environment (dev or production?) ?

It happened in a production cluster created on demand.

I am not even sure where to start reproducing the error range failed to find revision pair. I suspect this corruption may happen in boltdb, the filesystem layer, or the underlying disk.

Did you just create a new cluster or replace one problematic member with a new one?

The oncall operator replaced the problematic member with a new one and everything recovered.

@ahrtr
Member

ahrtr commented Jul 14, 2022

The oncall operator replaced the problematic member with a new one and everything recovered.

Thanks for the feedback. If just one member has the issue, in other words the majority is still working well, then most likely the db file of the problematic member was corrupted. Usually you will see a panic when you try to execute a boltDB command in this case, something like 13406. Has the oncall operator in your company tried any boltDB command and confirmed this?

@chaochn47
Member Author

chaochn47 commented Jul 21, 2022

Yeah, I downloaded the snapshot taken from the impacted node before and after the panic.

I ran some bbolt cmds like bbolt check and some other subcommands (stats) on both db files. The results returned OK and no panic like 13406

I'd like to keep this issue open until a second occurrence with WAL logs present.

@lavacat

lavacat commented Jul 31, 2022

@chaochn47 do you still have data in your metrics system? Can you post other metrics? Seems like something happened on 06/09.
etcd_server_is_leader - wondering if there was leader change on 06/09 or if there were too many leader changes indicating network/load issue.

etcd_server_heartbeat_send_failures_total to make sure there were not hb failures
etcd_server_snapshot_apply_in_progress_total just in case there was some anomaly.

rate(etcd_debugging_mvcc_compact_revision[1m]) - to see if all nodes are compacting at the same rate.
etcd_disk_defrag_inflight - to see if there were any correlated defrags around 06/09

It's interesting why revision divergence happened on 06/09 but size divergence on 06/19.

Was cluster under heavy load/different load pattern?

@chaochn47
Member Author

chaochn47 commented Aug 1, 2022

@lavacat thanks for taking a look!

We do have data in the metrics and logging system.

etcd_server_is_leader - wondering if there was leader change on 06/09 or if there were too many leader changes indicating network/load issue.

There was a leadership change around 5 minutes before this event. However, we have started to see 3 more similar issues on other production clusters. They do not have any correlation with leader changes.

etcd_server_heartbeat_send_failures_total to make sure there were not hb failures
etcd_server_snapshot_apply_in_progress_total just in case there was some anomaly.

No heartbeat failures and no snapshot apply in progress.

rate(etcd_debugging_mvcc_compact_revision[1m]) - to see if all nodes are compacting at the same rate.
etcd_disk_defrag_inflight - to see if there were any correlated defrags around 06/09

Compaction is driven by kube-apiserver every 5 minutes. The compact_revision was consistent until the event.
Defrag is performed by another on-host agent. It did not perform a defrag because db_size - db_size_in_use did not breach the 200 MiB threshold.

Was cluster under heavy load/different load pattern?

I think the cluster was under light load. The mvcc range, txn, and put rates were low and the db size was under 25 MiB.

@chaochn47
Member Author

chaochn47 commented Aug 1, 2022

I suspected that the write txn failed to write its buffer back to the read txn buffer while a new concurrentReadTx created at the same time copied a stale read txn buffer. But from the code perspective, this does not seem to be an issue.
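To make that reasoning concrete, here is a toy model (not the etcd source; the names are illustrative) of why a newly created concurrent read tx should not observe a half-written buffer: the write-back into the shared read buffer and the buffer copy taken by the new read tx both happen under the same lock.

package main

import (
	"fmt"
	"sync"
)

type backend struct {
	mu      sync.RWMutex      // models the read-tx lock in etcd's backend
	readBuf map[string]string // models the shared read buffer
}

// commitWrite models the write txn writing its buffer back into the shared
// read buffer while holding the write lock.
func (b *backend) commitWrite(writeBuf map[string]string) {
	b.mu.Lock()
	defer b.mu.Unlock()
	for k, v := range writeBuf {
		b.readBuf[k] = v
	}
}

// concurrentReadTx models creating a concurrent read tx: the read buffer is
// copied under the read lock, so the copy is either entirely before or
// entirely after a commit, never partial.
func (b *backend) concurrentReadTx() map[string]string {
	b.mu.RLock()
	defer b.mu.RUnlock()
	cp := make(map[string]string, len(b.readBuf))
	for k, v := range b.readBuf {
		cp[k] = v
	}
	return cp
}

func main() {
	b := &backend{readBuf: map[string]string{}}
	var wg sync.WaitGroup
	wg.Add(2)
	go func() { defer wg.Done(); b.commitWrite(map[string]string{"k": "v"}) }()
	go func() { defer wg.Done(); fmt.Println("snapshot:", b.concurrentReadTx()) }()
	wg.Wait()
}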

The only other possibility is that a disk write failed and the mmap'd data was somehow not reflected. But a disk write failure should have been propagated up to etcd at the same time.

We are enabling the corrupt check, alarming faster, retaining more WAL files, building better tooling to investigate, etc. So hopefully next time there will be more information.

@miancheng7

miancheng7 commented Sep 19, 2023

Here is a similar case we observed on September 12, 2023.

What happened?

A 3-member etcd cluster returned inconsistent data.

These are the last 3 relevant logs before the first revision error occurred on the bad node. There is no panic log such as range failed to find revision pair.

{
  "level": "info",
  "ts": "2023-09-12T13:30:55.970Z",
  "caller": "mvcc/index.go:189",
  "msg": "compact tree index",
  "revision": 540040293
}
{
 "level": "info",
 "ts": "2023-09-12T13:30:56.278Z",
 "msg": "finished scheduled compaction",
 "compact-revision": 540040293,
 "took": "300.432453ms"
}
{
 "level": "info",
 "ts": "2023-09-12T13:33:06.694Z",
 "msg": "purged",
 "path": "/mnt/<subpath>/member/snap/0000000000000025-00000000216876b7.snap"
}

kube-controller-manager continuously fails with mvcc revision errors.

W0912 13:53:03.296333   11 watcher.go:220] watch chan error: etcdserver: mvcc: required revision has been compacted
E0912 13:49:48.143489   11 status.go:71] apiserver received an error that is not an metav1.Status: rpctypes.EtcdError{code:0xb, desc:"etcdserver: mvcc: required revision is a future revision"}: etcdserver: mvcc: required revision is a future revision

Anything else we need to know?

  1. MVCC revision diverged between nodes. DB size in use diverged.

     (screenshot)

Etcd version (please run commands below)

$ etcd --version
3.4.20

@ahrtr
Member

ahrtr commented Sep 19, 2023

It seems the names "revision divergency" and "db size total divergency" were mixed up; m1-m2 actually seems to be the db size total divergency.

Please

  • execute etcdctl endpoint status -w json --cluster, and provide the output.
  • Is it possible to provide the db files?

@miancheng7

Thanks, updated.

FYI,

  • execute etcdctl endpoint status -w json --cluster, and provide the output.
{
  "Endpoint": "http://xxx:2379/",
  "Status": {
    "header": {
      "cluster_id": 11216169436736817000,
      "member_id": 3380614885785619000,
      "revision": 540470663,
      "raft_term": 37
    },
    "version": "3.4.20",
    "dbSize": 401346560,
    "leader": 3587062029378754000,
    "raftIndex": 560998036,
    "raftTerm": 37,
    "raftAppliedIndex": 560998036,
    "dbSizeInUse": 207654912
  }
}
{
  "Endpoint": "http://xxx:2379/",
  "Status": {
    "header": {
      "cluster_id": 11216169436736817000,
      "member_id": 3587062029378754000,
      "revision": 540470663,
      "raft_term": 37
    },
    "version": "3.4.20",
    "dbSize": 401272832,
    "leader": 3587062029378754000,
    "raftIndex": 560998036,
    "raftTerm": 37,
    "raftAppliedIndex": 560998036,
    "dbSizeInUse": 207732736
  }
}
{
  "Endpoint": "http://xxx:2379/",
  "Status": {
    "header": {
      "cluster_id": 11216169436736817000,
      "member_id": 15037885980601823000,
      "revision": 540057523,
      "raft_term": 37
    },
    "version": "3.4.20",
    "dbSize": 401489920,
    "leader": 3587062029378754000,
    "raftIndex": 560998036,
    "raftTerm": 37,
    "raftAppliedIndex": 560998036,
    "dbSizeInUse": 192278528
  }
}
  • Is it possible to provide the db files?

We’re not allowed to share customer data.

@ahrtr
Member

ahrtr commented Sep 19, 2023

  • Please compare the consistent_index of the three members using etcd-dump-db (see the sketch after this list for an alternative):
$ ./etcd-dump-db iterate-bucket path-2-db meta --decode
key="scheduledCompactRev", value={2804621 0}
key="finishedCompactRev", value={2804621 0}
key="consistent_index", value=3050985
  • Could you share the complete logs of the three members?
  • To double check, did the revision start to diverge from Sept. 11 13:30?
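If it helps, consistent_index can also be read straight from a copied db file with the bbolt Go API instead of etcd-dump-db. A minimal sketch, assuming the 8-byte big-endian encoding etcd uses for this key; the db path is a placeholder.

package main

import (
	"encoding/binary"
	"fmt"

	bolt "go.etcd.io/bbolt"
)

func main() {
	db, err := bolt.Open("/path/to/member/snap/db", 0600, &bolt.Options{ReadOnly: true})
	if err != nil {
		panic(err)
	}
	defer db.Close()

	if err := db.View(func(tx *bolt.Tx) error {
		meta := tx.Bucket([]byte("meta"))
		if meta == nil {
			return fmt.Errorf("meta bucket not found")
		}
		v := meta.Get([]byte("consistent_index"))
		if len(v) != 8 {
			return fmt.Errorf("unexpected consistent_index value: %x", v)
		}
		fmt.Println("consistent_index:", binary.BigEndian.Uint64(v))
		return nil
	}); err != nil {
		panic(err)
	}
}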

@Tachone
Contributor

Tachone commented Nov 7, 2023

Provide a case we observed on Nov 1, 2023.

What happened?

A 3 etcd cluster returned inconsistent data.

etcd version: 3.4.18

Running etcdctl get returns 3 netmaster pods on the 2 normal nodes, but 5 netmaster pods on the 1 bad node (we had deleted 2 of the netmaster pods).

(screenshot)

A panic occurred during operation; it seems the data became inconsistent after etcd restarted, and no errors were detected at startup. (I don't think it's related to #266.)

(screenshot)

Anything else we need to know?

  1. The RaftIndex and AppliedIndex of the 3 nodes are consistent.
  2. Running bbolt check returns OK.
  3. Running etcd-dump-db shows there are indeed 5 netmaster pods in bbolt. The revision gap relative to the normal node seems relatively large, but I am not completely sure the pods were not deleted and rebuilt.
  4. experimental-initial-corrupt-check is disabled.

Bad node (screenshot)

Normal node (screenshot)

@Tachone
Contributor

Tachone commented Nov 29, 2023

Do you have any suggestions on the direction of investigation? I haven't been able to reproduce it yet. @ahrtr @serathius

@fuweid
Member

fuweid commented Nov 30, 2023

I am working on reproducing the bbolt data inconsistency. I will post an update if I find something. @Tachone

@fuweid
Member

fuweid commented Nov 30, 2023

@Tachone Would you mind sharing what experimental-backend-bbolt-freelist-type value you are using? thanks

@Tachone
Contributor

Tachone commented Dec 4, 2023

@Tachone Would you mind sharing what experimental-backend-bbolt-freelist-type value you are using? thanks

--experimental-backend-bbolt-freelist-type 'array', default value in 3.4.18 @fuweid

@Tachone
Contributor

Tachone commented Dec 4, 2023

@fuweid

I think there is some important information that may help reproduce this:

  1. The machine may have been under heavy load when the problem occurred.
  2. bbolt panicked twice on different pages within 2 days; etcd restarted normally each time.
  3. The bad etcd node's <rev,KeyValue> data in bbolt has been inconsistent with the other nodes since rev={main:1054916837, sub:0} (see the sketch after this list for how a revision maps to a bbolt key).
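For anyone comparing member dumps around that revision: to the best of my understanding this mirrors etcd's revToBytes encoding, i.e. how an mvcc revision {main, sub} maps to the bbolt key in the "key" bucket (8-byte big-endian main revision, a '_' separator, 8-byte big-endian sub revision, with a trailing 't' appended for tombstones).

package main

import (
	"encoding/binary"
	"fmt"
)

// revToBytes encodes a revision the way etcd's mvcc store keys the "key"
// bucket: main|'_'|sub, all big-endian, 17 bytes total.
func revToBytes(main, sub int64) []byte {
	b := make([]byte, 17)
	binary.BigEndian.PutUint64(b[0:8], uint64(main))
	b[8] = '_'
	binary.BigEndian.PutUint64(b[9:17], uint64(sub))
	return b
}

func main() {
	// The first revision reported as inconsistent in this thread.
	fmt.Printf("%x\n", revToBytes(1054916837, 0))
}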

@fuweid
Member

fuweid commented Dec 4, 2023

@Tachone Thanks!

  1. The machine may have been under heavy load when the problem occurred.

The machine wasn't restarted, is that correct? Are there any useful kernel messages in dmesg?

  2. bbolt panicked twice on different pages within 2 days; etcd restarted normally each time.

After the bbolt panics, could etcd still restart and serve requests?

  3. The bad etcd node's <rev,KeyValue> data in bbolt has been inconsistent with the other nodes since rev={main:1054916837, sub:0}.

For the bad etcd node, after the leader replicated the log, could it catch up with the other two nodes?

@Tachone
Contributor

Tachone commented Dec 4, 2023

@fuweid

  1. Yes. But I did not find any obvious abnormal information in dmesg.
  2. Yes.
  3. Yes. The RaftIndex and AppliedIndex of this node are consistent with the leader according to 'endpoint status'.

@serathius
Member

I have seen a couple of corruptions, but nothing following a single pattern that would suggest an application issue. I would attribute them mostly to memory stamping (a page being written under the wrong page id). Still, investigating them would be useful to propose additional debug information and safeguards that would give us more insight into the problem.

My suggestion would be to separate the investigation and handle each report as an independent issue, as long as there is no strong suggestion that they have exactly the same root cause.

@ahrtr ahrtr self-assigned this Dec 4, 2023
@jmhbnz
Member

jmhbnz commented Jun 20, 2024

Discussed during the sig-etcd triage meeting: @ahrtr, is this issue still valid? There is a suggestion above that we handle each report of this data corruption as its own issue, as the root cause may not be the same.
