Skip to content

Commit

Permalink
enhance csi-doctor.sh collect (#729)
Browse files Browse the repository at this point in the history
* fix collect bug & add top pod & top node & collect doctor output as well
  • Loading branch information
timfeirg committed Sep 1, 2023
1 parent c6d521b commit 4dedd3f
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 42 deletions.
8 changes: 7 additions & 1 deletion docs/en/administration/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,13 @@ wget https://raw.githubusercontent.com/juicedata/juicefs-csi-driver/master/scrip
chmod a+x csi-doctor.sh
```

If kubectl is renamed in your environment (folks use this method to manage multiple Kubernetes clusters), or put elsewhere other than `PATH`, you can easily match this by editing the script, and change the `$kbctl` variable.
If kubectl is renamed in your environment (folks use this method to manage multiple Kubernetes clusters), or put elsewhere other than `PATH`, you can easily match this by editing the script, and change the `$kbctl` variable:

```shell
# Faced with two different clusters, their corresponding kubectl aliased to kubectl_1 / kubectl_2
KBCTL=kubectl_1 ./csi-doctor.sh debug my-app-pod -n default
KBCTL=kubectl_2 ./csi-doctor.sh debug my-app-pod -n default
```

A commonly used feature is obtaining mount pod information. Assuming application pod being `my-app-pod` in namespace `default`, to obtain the mount pod that accompanies the application pod:

Expand Down
8 changes: 7 additions & 1 deletion docs/zh_cn/administration/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,13 @@ wget https://raw.githubusercontent.com/juicedata/juicefs-csi-driver/master/scrip
chmod a+x csi-doctor.sh
```

如果在你的运行环境中,kubectl 被重命名(比方说需要管理多个 Kubernetes 集群时,常常用不同 alias 来调用不同集群的 kubectl),或者并未放在 `PATH` 下,你也可以方便地修改脚步,将 `$kbctl` 变量替换为实际需要运行的 kubectl。
如果在你的运行环境中,kubectl 被重命名(比方说需要管理多个 Kubernetes 集群时,常常用不同 alias 来调用不同集群的 kubectl),或者并未放在 `PATH` 下,你也可以方便地修改脚步,将 `$kbctl` 变量替换为实际需要运行的 kubectl:

```shell
# 假设面对两个不同集群,kubectl 分别被别名为 kubectl_1 / kubectl_2
KBCTL=kubectl_1 ./csi-doctor.sh debug my-app-pod -n default
KBCTL=kubectl_2 ./csi-doctor.sh debug my-app-pod -n default
```

诊断脚本中最为常用的功能,就是方便地获取 Mount Pod 相关信息。假设应用 Pod 为 `default` 命名空间下的 `my-app-pod`

Expand Down
128 changes: 88 additions & 40 deletions scripts/csi-doctor.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,28 +41,9 @@ run() {

DEFAULT_APP_NS="${APP_NS:-default}"
ORIGINAL_ARGS=( "$@" )
kbctl=kubectl

juicefs_resources() {
local namespace="${namespace:-$DEFAULT_APP_NS}"

mkdir -p "$diagnose_dir/app"
$kbctl describe po "$app" -n $namespace &>"$diagnose_dir/app/$app-describe.log" 2>&1
$kbctl get po "$app" -oyaml -n $namespace &>"$diagnose_dir/app/$app.yaml" 2>&1

NODE_NAME=$(${kbctl} -n ${namespace} get po ${app} -o jsonpath='{.spec.nodeName}')
PVC_NAME=$(${kbctl} -n ${namespace} get po ${app} -o jsonpath='{..persistentVolumeClaim.claimName}' | awk '{print $1}')
PV_NAME=$(${kbctl} -n ${namespace} get pvc $PVC_NAME -o jsonpath='{.spec.volumeName}')

mkdir -p "$diagnose_dir/pv"
$kbctl get pv "$PV_NAME" -oyaml &>"$diagnose_dir/pv/pv-$PV_NAME.yaml" 2>&1
$kbctl describe pv "$PV_NAME" &>"$diagnose_dir/pv/pv-$PV_NAME-describe.log" 2>&1

mkdir -p "$diagnose_dir/pvc"
$kbctl get pvc "$PVC_NAME" -n $namespace -oyaml &>"$diagnose_dir/pvc/$PVC_NAME.yaml" 2>&1
$kbctl describe pvc "$PVC_NAME" -n $namespace &>"$diagnose_dir/pvc/$PVC_NAME-describe.log" 2>&1
}
kbctl=${KBCTL:-"kubectl"}

PROVISIONER=csi.juicefs.com
SHOULD_CHECK_CSI_CONRTROLLER=''

debug_app_pod() {
Expand All @@ -82,8 +63,21 @@ debug_app_pod() {
app_pod_uid=$(${kbctl} -n $namespace get po $app -o jsonpath='{.metadata.uid}')
for pvc_name in $PVC_NAMES
do
pv_name=$(${kbctl} -n ${namespace} get pvc $pvc_name -o jsonpath='{.spec.volumeName}' 2>/dev/null)

if [ "$pv_name" == "" ]; then
echo "Ignore $pvc_name due to empty volumeName"
continue
fi

provisioner=$(${kbctl} get pv ${pv_name} -ojsonpath='{.spec.csi.driver}' 2>/dev/null)
if [ "$provisioner" != "$PROVISIONER" ]; then
echo "Ignore $pvc_name since it's not JuiceFS"
continue
fi

debug_pvc $pvc_name
pv_name=$(${kbctl} -n ${namespace} get pvc $pvc_name -o jsonpath='{.spec.volumeName}')

pv_id=$(${kbctl} get pv $pv_name -o jsonpath='{.spec.csi.volumeHandle}')
if [ "$NODE_NAME" != "" ]; then
mount_pod_names=$(${kbctl} -n $juicefs_namespace get po --field-selector spec.nodeName=$NODE_NAME -l app.kubernetes.io/name=juicefs-mount -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep $pv_id)
Expand Down Expand Up @@ -257,22 +251,55 @@ mount_exec() {
set +x
}

collect_mount_pod_msg() {
collect_pv() {
local namespace="${namespace:-$DEFAULT_APP_NS}"
juicefs_namespace=${JFS_NS:-"kube-system"}
mkdir -p "$diagnose_dir/juicefs-${juicefs_namespace}"

NODE_NAME=$(${kbctl} -n ${namespace} get po ${app} -o jsonpath='{.spec.nodeName}')
PVC_NAME=$(${kbctl} -n ${namespace} get po ${app} -o jsonpath='{..persistentVolumeClaim.claimName}' | awk '{print $1}')
PV_NAME=$(${kbctl} -n ${namespace} get pvc $PVC_NAME -o jsonpath='{.spec.volumeName}')
PV_ID=$(${kbctl} get pv $PV_NAME -o jsonpath='{.spec.csi.volumeHandle}')
MOUNT_POD_NAME=$(${kbctl} -n $juicefs_namespace get po --field-selector spec.nodeName=$NODE_NAME -l app.kubernetes.io/name=juicefs-mount -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep $PV_ID)
$kbctl top node "$NODE_NAME" &>"$diagnose_dir/juicefs-$juicefs_namespace/node-top-$NODE_NAME.txt" 2>&1

PVC_NAMES=$(${kbctl} -n ${namespace} get po ${app} -o jsonpath='{..persistentVolumeClaim.claimName}')
mkdir -p "$diagnose_dir/pv"
mkdir -p "$diagnose_dir/pvc"
for pvc_name in $PVC_NAMES
do

$kbctl logs "$MOUNT_POD_NAME" -n $juicefs_namespace -c jfs-format &>"$diagnose_dir/juicefs-$juicefs_namespace/mount-pod-init.log" 2>&1
$kbctl logs "$MOUNT_POD_NAME" -n $juicefs_namespace &>"$diagnose_dir/juicefs-$juicefs_namespace/mount-pod.log" 2>&1
$kbctl get po "$MOUNT_POD_NAME" -oyaml -n $juicefs_namespace &>"$diagnose_dir/juicefs-$juicefs_namespace/mount-pod.yaml" 2>&1
$kbctl describe po "$MOUNT_POD_NAME" -n $juicefs_namespace &>"$diagnose_dir/juicefs-$juicefs_namespace/mount-pod-describe.log" 2>&1
$kbctl get pvc "$pvc_name" -n $namespace -oyaml &>"$diagnose_dir/pvc/$pvc_name.yaml" 2>&1
$kbctl describe pvc "$pvc_name" -n $namespace &>"$diagnose_dir/pvc/$pvc_name-describe.log" 2>&1

pv_name=$(${kbctl} -n ${namespace} get pvc $pvc_name -o jsonpath='{.spec.volumeName}')

if [ "$pv_name" == "" ]; then
echo "Ignore $pvc_name due to empty volumeName"
continue
fi

provisioner=$(${kbctl} get pv ${pv_name} -ojsonpath='{.spec.csi.driver}' 2>/dev/null)
if [ "$provisioner" != "$PROVISIONER" ]; then
echo "Ignore $pv_name since it's not JuiceFS"
continue
fi

$kbctl get pv "$pv_name" -oyaml &>"$diagnose_dir/pv/pv-$pv_name.yaml" 2>&1
$kbctl describe pv "$pv_name" &>"$diagnose_dir/pv/pv-$pv_name-describe.log" 2>&1

pv_id=$(${kbctl} get pv $pv_name -o jsonpath='{.spec.csi.volumeHandle}')
if [ "$NODE_NAME" != "" ]; then
mount_pod_names=$(${kbctl} -n $juicefs_namespace get po --field-selector spec.nodeName=$NODE_NAME -l app.kubernetes.io/name=juicefs-mount -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep $pv_id)
for mount_pod_name in $mount_pod_names
do
$kbctl top po "$mount_pod_name" -n $juicefs_namespace &>"$diagnose_dir/juicefs-$juicefs_namespace/mount-pod-top-$mount_pod_name.txt" 2>&1
$kbctl get po "$mount_pod_name" -oyaml -n $juicefs_namespace &>"$diagnose_dir/juicefs-$juicefs_namespace/mount-pod-$mount_pod_name.yaml" 2>&1
$kbctl describe po "$mount_pod_name" -n $juicefs_namespace &>"$diagnose_dir/juicefs-$juicefs_namespace/mount-pod-$mount_pod_name.log" 2>&1
$kbctl logs "$mount_pod_name" -n $juicefs_namespace --all-containers=true >>"$diagnose_dir/juicefs-$juicefs_namespace/mount-pod-$mount_pod_name.log" 2>&1
done
fi
done

kubectl -n $juicefs_namespace cp ${BASH_SOURCE[0]} $mount_pod_name:/tmp
kubectl -n $juicefs_namespace exec $mount_pod_name -- /tmp/csi-doctor.sh doctor
kubectl -n $juicefs_namespace cp $mount_pod_name:/tmp/juicefs-doctor $diagnose_dir/doctor
}

collect_juicefs_csi_msg() {
Expand All @@ -294,21 +321,25 @@ collect_juicefs_csi_msg() {
$kbctl describe po "$CSI_CTRL_POD_NAME" -n $juicefs_namespace &>"$diagnose_dir/juicefs-csi-controller/$CSI_CTRL_POD_NAME-describe.log" 2>&1
}

archive() {
pd_collect() {
mkdir -p "$diagnose_dir/app"
$kbctl get po "$app" -oyaml -n $namespace &>"$diagnose_dir/app/$app.yaml" 2>&1
code=$?
if [ "${code}" != "0" ]; then
echo "$namespace/$app not found, abort"
rm -rf $diagnose_dir
exit $code
fi
echo "Start collecting logs for ${namespace}/${app}"
$kbctl describe po "$app" -n $namespace &>"$diagnose_dir/app/$app-describe.log" 2>&1
collect_pv
collect_juicefs_csi_msg
set -e
archive_file_path="${current_dir}/diagnose_juicefs_${timestamp}.tar.gz"
tar -zcf "${archive_file_path}" -C /tmp $(basename $diagnose_dir)
echo "Results have been saved to ${archive_file_path}"
}

pd_collect() {
echo "Start collecting logs for ${namespace}/${app}"
collect_mount_pod_msg
collect_juicefs_csi_msg
juicefs_resources
archive
}

collect() {
if [ "${ORIGINAL_ARGS[1]}" == "" ]; then
echo "EXAMPLES:"
Expand All @@ -328,6 +359,17 @@ collect() {
rm -rf $diagnose_dir
}

doctor() {
mkdir -p /tmp/juicefs-doctor
cd /tmp/juicefs-doctor
mps=$(df -t fuse.juicefs --output=target | tail -n +2)
for mp in $mps
do
fname=$(juicefs doctor $mp | tail -n 1 | grep -oP '\S+.tar.gz')
mv $fname $(echo $mp | tr '/' '_').tar.gz
done
}

main() {
if [[ $# -eq 0 ]]; then
print_usage
Expand All @@ -348,6 +390,9 @@ main() {
collect|help)
action=$1
;;
doctor|help)
action=$1
;;
get-mount|help)
action=$1
;;
Expand Down Expand Up @@ -375,6 +420,9 @@ main() {
collect)
collect
;;
doctor)
doctor
;;
get-mount)
get_mount_pod
;;
Expand Down

0 comments on commit 4dedd3f

Please sign in to comment.