We have a 3-node Kubernetes cluster running Vault in HA mode with Raft integrated storage. A separate single-node Vault, installed on a standalone VM, auto-unseals the in-cluster Vault via the transit seal. The following is the Helm values file used to install HashiCorp Vault on Kubernetes.
# Helm values for the hashicorp/vault chart: 3-replica HA cluster with Raft
# integrated storage, TLS enabled end-to-end, auto-unsealed by an external
# transit Vault. All pods are pinned to control-plane nodes.
csi:
  enabled: true
  pod:
    nodeSelector:
      node-role.kubernetes.io/control-plane: ""
    tolerations:
      - effect: NoSchedule
        key: node-role.kubernetes.io/control-plane
        operator: Exists
  # Mount the cluster TLS secret so the CSI provider can talk to Vault over HTTPS.
  volumeMounts:
    - mountPath: /vault/tls
      name: tls
      readOnly: true
  volumes:
    - name: tls
      secret:
        secretName: vault-cluster-tls
global:
  # Keep TLS on for all chart-managed endpoints.
  tlsDisable: false
injector:
  enabled: true
  logLevel: trace
  nodeSelector:
    node-role.kubernetes.io/control-plane: ""
  tolerations:
    - effect: NoSchedule
      key: node-role.kubernetes.io/control-plane
      operator: Exists
server:
  auditStorage:
    enabled: true
  ha:
    enabled: true
    raft:
      # Raw Vault server HCL. Paths under /vault/userconfig/vault-cluster-tls
      # come from the extra volume mounted below.
      config: |
        ui = true
        cluster_name = "vault-integrated-storage"
        listener "tcp" {
          tls_disable = 0
          address = "[::]:8200"
          cluster_address = "[::]:8201"
          tls_cert_file = "/vault/userconfig/vault-cluster-tls/tls.crt"
          tls_key_file = "/vault/userconfig/vault-cluster-tls/tls.key"
        }
        storage "raft" {
          path = "/vault/data"
          retry_join {
            leader_api_addr = "https://vault-0.vault-internal:8200"
            leader_ca_cert_file = "/vault/userconfig/vault-cluster-tls/ca.crt"
            leader_client_cert_file = "/vault/userconfig/vault-cluster-tls/tls.crt"
            leader_client_key_file = "/vault/userconfig/vault-cluster-tls/tls.key"
          }
          retry_join {
            leader_api_addr = "https://vault-1.vault-internal:8200"
            leader_ca_cert_file = "/vault/userconfig/vault-cluster-tls/ca.crt"
            leader_client_cert_file = "/vault/userconfig/vault-cluster-tls/tls.crt"
            leader_client_key_file = "/vault/userconfig/vault-cluster-tls/tls.key"
          }
          retry_join {
            leader_api_addr = "https://vault-2.vault-internal:8200"
            leader_ca_cert_file = "/vault/userconfig/vault-cluster-tls/ca.crt"
            leader_client_cert_file = "/vault/userconfig/vault-cluster-tls/tls.crt"
            leader_client_key_file = "/vault/userconfig/vault-cluster-tls/tls.key"
          }
        }
        # Auto-unseal via the external standalone transit Vault.
        seal "transit" {
          address = "<omit>"
          token = "<omit>"
          disable_renewal = "false"
          key_name = "autounseal"
          mount_path = "transit/"
        }
        service_registration "kubernetes" {}
      enabled: true
      setNodeId: false
    replicas: 3
  ingress:
    annotations:
      # Vault listener serves HTTPS, so ingress must re-encrypt to the backend.
      nginx.ingress.kubernetes.io/backend-protocol: HTTPS
    enabled: true
    hosts:
      - host: vault.k8s.local
    tls:
      - hosts:
          - vault.k8s.local
        secretName: vault-ingress-tls
  logLevel: trace
  nodeSelector:
    node-role.kubernetes.io/control-plane: ""
  standalone:
    enabled: false
  tolerations:
    - effect: NoSchedule
      key: node-role.kubernetes.io/control-plane
      operator: Exists
  volumeMounts:
    # Trust the cluster CA system-wide inside the server container.
    - mountPath: /etc/ssl/certs/serena-ca.crt
      name: cert-store
      subPath: serena-ca.crt
    # TLS material referenced by the listener/retry_join paths in the HCL above.
    - mountPath: /vault/userconfig/vault-cluster-tls/
      name: vault-cert
  volumes:
    - name: cert-store
      secret:
        items:
          - key: ca.crt
            path: serena-ca.crt
        secretName: vault-cluster-tls
    - name: vault-cert
      secret:
        secretName: vault-cluster-tls
serverTelemetry:
  serviceMonitor:
    enabled: true
    selectors:
      release: monitoring
ui:
  enabled: true
We noticed that after shutting down all the Kubernetes nodes and powering them back on the next day, there is a chance that the following logs appear on all of vault-0, vault-1, and vault-2. Once this happens, there seems to be no way to recover.
- Is it that Vault cannot handle all Kubernetes nodes being shut down abnormally at the same time?
- Does the following error mean the Raft database (bbolt file) is corrupted?
2024-08-20T02:28:35.356Z [INFO] proxy environment: http_proxy="" https_proxy="" no_proxy=""
2024-08-20T02:28:35.356Z [WARN] storage.raft.fsm: raft FSM db file has wider permissions than needed: needed=-rw------- existing=-rw-rw----
2024-08-20T02:28:35.357Z [DEBUG] storage.raft.fsm: time to open database: elapsed=1.461221ms path=/vault/data/vault.db
panic: assertion failed: Page expected to be: 7128, but self identifies as 0
goroutine 1 [running]:
go.etcd.io/bbolt._assert(...)
/home/runner/go/pkg/mod/go.etcd.io/bbolt@v1.3.7/db.go:1359
go.etcd.io/bbolt.(*page).fastCheck(0x7f8e5341f000, 0x1bd8)
/home/runner/go/pkg/mod/go.etcd.io/bbolt@v1.3.7/page.go:57 +0x1d9
go.etcd.io/bbolt.(*Tx).page(0x7f8e5340d000?, 0x88b2f20?)
/home/runner/go/pkg/mod/go.etcd.io/bbolt@v1.3.7/tx.go:534 +0x7b
go.etcd.io/bbolt.(*Tx).forEachPageInternal(0xc002c78620, {0xc00352d950, 0x3, 0xa}, 0xc00397e480)
/home/runner/go/pkg/mod/go.etcd.io/bbolt@v1.3.7/tx.go:546 +0x5d
go.etcd.io/bbolt.(*Tx).forEachPageInternal(0xc002c78620, {0xc00352d950, 0x2, 0xa}, 0xc00397e480)
/home/runner/go/pkg/mod/go.etcd.io/bbolt@v1.3.7/tx.go:555 +0xc8
go.etcd.io/bbolt.(*Tx).forEachPageInternal(0xc002c78620, {0xc00352d950, 0x1, 0xa}, 0xc00397e480)
/home/runner/go/pkg/mod/go.etcd.io/bbolt@v1.3.7/tx.go:555 +0xc8
go.etcd.io/bbolt.(*Tx).forEachPage(...)
/home/runner/go/pkg/mod/go.etcd.io/bbolt@v1.3.7/tx.go:542
go.etcd.io/bbolt.(*Tx).checkBucket(0xc002c78620, 0xc002a8e680, 0xc00397e888, 0xc00397e7c8, {0xcfe3680, 0x13338200}, 0xc0001943c0)
/home/runner/go/pkg/mod/go.etcd.io/bbolt@v1.3.7/tx_check.go:83 +0x114
go.etcd.io/bbolt.(*Tx).checkBucket.func2({0x7f8e5341c0d2?, 0xc00397e5b8?, 0xc0003d6b10?})
/home/runner/go/pkg/mod/go.etcd.io/bbolt@v1.3.7/tx_check.go:110 +0x90
go.etcd.io/bbolt.(*Bucket).ForEachBucket(0x0?, 0xc00397e680)
/home/runner/go/pkg/mod/go.etcd.io/bbolt@v1.3.7/bucket.go:403 +0x96
go.etcd.io/bbolt.(*Tx).checkBucket(0xc002c78620, 0xc002c78638, 0xc00397e888, 0xc00397e7c8, {0xcfe3680, 0x13338200}, 0xc0001943c0)
/home/runner/go/pkg/mod/go.etcd.io/bbolt@v1.3.7/tx_check.go:108 +0x255
go.etcd.io/bbolt.(*DB).freepages(0xc0036c58c8)
/home/runner/go/pkg/mod/go.etcd.io/bbolt@v1.3.7/db.go:1181 +0x225
go.etcd.io/bbolt.(*DB).loadFreelist.func1()
/home/runner/go/pkg/mod/go.etcd.io/bbolt@v1.3.7/db.go:412 +0xbb
sync.(*Once).doSlow(0xc0036c5a90?, 0x10?)
/opt/hostedtoolcache/go/1.22.5/x64/src/sync/once.go:74 +0xc2
sync.(*Once).Do(...)
/opt/hostedtoolcache/go/1.22.5/x64/src/sync/once.go:65
go.etcd.io/bbolt.(*DB).loadFreelist(0xc0036c58c8?)
/home/runner/go/pkg/mod/go.etcd.io/bbolt@v1.3.7/db.go:408 +0x45
go.etcd.io/bbolt.Open({0xc0036b1968, 0x18}, 0x180, 0xc00282ed80)
/home/runner/go/pkg/mod/go.etcd.io/bbolt@v1.3.7/db.go:290 +0x3ee
github.com/hashicorp/raft-boltdb/v2.New({{0xc0036b1968, 0x18}, 0xc00282ed80, 0x0, 0x1})
/home/runner/go/pkg/mod/github.com/hashicorp/raft-boltdb/v2@v2.3.0/bolt_store.go:79 +0x45
github.com/hashicorp/vault/physical/raft.NewRaftBackend(0xc00280b200, {0xd084e88, 0xc00280b320})
/home/runner/work/vault/vault/physical/raft/raft.go:507 +0x613
github.com/hashicorp/vault/command.(*ServerCommand).setupStorage(0xc0036c5208, 0xc003718f08)
/home/runner/work/vault/vault/command/server.go:811 +0x319
github.com/hashicorp/vault/command.(*ServerCommand).Run(0xc0036c5208, {0xc000196a10, 0x1, 0x1})
/home/runner/work/vault/vault/command/server.go:1188 +0x10a6
github.com/hashicorp/cli.(*CLI).Run(0xc003365a40)
/home/runner/go/pkg/mod/github.com/hashicorp/cli@v1.1.6/cli.go:265 +0x5b8
github.com/hashicorp/vault/command.RunCustom({0xc000196a00?, 0x2?, 0x2?}, 0xc0000061c0?)
/home/runner/work/vault/vault/command/main.go:243 +0x9a6
github.com/hashicorp/vault/command.Run(...)
/home/runner/work/vault/vault/command/main.go:147
main.main()
/home/runner/work/vault/vault/main.go:13 +0x47