diff --git a/docs/en/administration/troubleshooting.md b/docs/en/administration/troubleshooting.md index 8c9bb30218..9a75af5f5d 100644 --- a/docs/en/administration/troubleshooting.md +++ b/docs/en/administration/troubleshooting.md @@ -17,7 +17,13 @@ wget https://raw.githubusercontent.com/juicedata/juicefs-csi-driver/master/scrip chmod a+x csi-doctor.sh ``` -If kubectl is renamed in your environment (folks use this method to manage multiple Kubernetes clusters), or put elsewhere other than `PATH`, you can easily match this by editing the script, and change the `$kbctl` variable. +If kubectl is renamed in your environment (folks use this method to manage multiple Kubernetes clusters), or put elsewhere other than `PATH`, you can easily match this by editing the script, and change the `$kbctl` variable: + +```shell +# Faced with two different clusters, their corresponding kubectl aliased to kubectl_1 / kubectl_2 +KBCTL=kubectl_1 ./csi-doctor.sh debug my-app-pod -n default +KBCTL=kubectl_2 ./csi-doctor.sh debug my-app-pod -n default +``` A commonly used feature is obtaining mount pod information. Assuming application pod being `my-app-pod` in namespace `default`, to obtain the mount pod that accompanies the application pod: diff --git a/docs/zh_cn/administration/troubleshooting.md b/docs/zh_cn/administration/troubleshooting.md index ca27df5545..cff44e485b 100644 --- a/docs/zh_cn/administration/troubleshooting.md +++ b/docs/zh_cn/administration/troubleshooting.md @@ -17,7 +17,13 @@ wget https://raw.githubusercontent.com/juicedata/juicefs-csi-driver/master/scrip chmod a+x csi-doctor.sh ``` -如果在你的运行环境中,kubectl 被重命名(比方说需要管理多个 Kubernetes 集群时,常常用不同 alias 来调用不同集群的 kubectl),或者并未放在 `PATH` 下,你也可以方便地修改脚步,将 `$kbctl` 变量替换为实际需要运行的 kubectl。 +如果在你的运行环境中,kubectl 被重命名(比方说需要管理多个 Kubernetes 集群时,常常用不同 alias 来调用不同集群的 kubectl),或者并未放在 `PATH` 下,你也可以方便地修改脚步,将 `$kbctl` 变量替换为实际需要运行的 kubectl: + +```shell +# 假设面对两个不同集群,kubectl 分别被别名为 kubectl_1 / kubectl_2 +KBCTL=kubectl_1 ./csi-doctor.sh debug my-app-pod -n default +KBCTL=kubectl_2 ./csi-doctor.sh debug my-app-pod -n default +``` 诊断脚本中最为常用的功能,就是方便地获取 Mount Pod 相关信息。假设应用 Pod 为 `default` 命名空间下的 `my-app-pod`: diff --git a/scripts/csi-doctor.sh b/scripts/csi-doctor.sh index e31a4f76db..d5eb25f0c1 100755 --- a/scripts/csi-doctor.sh +++ b/scripts/csi-doctor.sh @@ -41,28 +41,9 @@ run() { DEFAULT_APP_NS="${APP_NS:-default}" ORIGINAL_ARGS=( "$@" ) -kbctl=kubectl - -juicefs_resources() { - local namespace="${namespace:-$DEFAULT_APP_NS}" - - mkdir -p "$diagnose_dir/app" - $kbctl describe po "$app" -n $namespace &>"$diagnose_dir/app/$app-describe.log" 2>&1 - $kbctl get po "$app" -oyaml -n $namespace &>"$diagnose_dir/app/$app.yaml" 2>&1 - - NODE_NAME=$(${kbctl} -n ${namespace} get po ${app} -o jsonpath='{.spec.nodeName}') - PVC_NAME=$(${kbctl} -n ${namespace} get po ${app} -o jsonpath='{..persistentVolumeClaim.claimName}' | awk '{print $1}') - PV_NAME=$(${kbctl} -n ${namespace} get pvc $PVC_NAME -o jsonpath='{.spec.volumeName}') - - mkdir -p "$diagnose_dir/pv" - $kbctl get pv "$PV_NAME" -oyaml &>"$diagnose_dir/pv/pv-$PV_NAME.yaml" 2>&1 - $kbctl describe pv "$PV_NAME" &>"$diagnose_dir/pv/pv-$PV_NAME-describe.log" 2>&1 - - mkdir -p "$diagnose_dir/pvc" - $kbctl get pvc "$PVC_NAME" -n $namespace -oyaml &>"$diagnose_dir/pvc/$PVC_NAME.yaml" 2>&1 - $kbctl describe pvc "$PVC_NAME" -n $namespace &>"$diagnose_dir/pvc/$PVC_NAME-describe.log" 2>&1 -} +kbctl=${KBCTL:-"kubectl"} +PROVISIONER=csi.juicefs.com SHOULD_CHECK_CSI_CONRTROLLER='' debug_app_pod() { @@ -82,8 +63,21 @@ debug_app_pod() { app_pod_uid=$(${kbctl} -n $namespace get po $app -o jsonpath='{.metadata.uid}') for pvc_name in $PVC_NAMES do + pv_name=$(${kbctl} -n ${namespace} get pvc $pvc_name -o jsonpath='{.spec.volumeName}' 2>/dev/null) + + if [ "$pv_name" == "" ]; then + echo "Ignore $pvc_name due to empty volumeName" + continue + fi + + provisioner=$(${kbctl} get pv ${pv_name} -ojsonpath='{.spec.csi.driver}' 2>/dev/null) + if [ "$provisioner" != "$PROVISIONER" ]; then + echo "Ignore $pvc_name since it's not JuiceFS" + continue + fi + debug_pvc $pvc_name - pv_name=$(${kbctl} -n ${namespace} get pvc $pvc_name -o jsonpath='{.spec.volumeName}') + pv_id=$(${kbctl} get pv $pv_name -o jsonpath='{.spec.csi.volumeHandle}') if [ "$NODE_NAME" != "" ]; then mount_pod_names=$(${kbctl} -n $juicefs_namespace get po --field-selector spec.nodeName=$NODE_NAME -l app.kubernetes.io/name=juicefs-mount -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep $pv_id) @@ -257,22 +251,55 @@ mount_exec() { set +x } -collect_mount_pod_msg() { +collect_pv() { local namespace="${namespace:-$DEFAULT_APP_NS}" juicefs_namespace=${JFS_NS:-"kube-system"} mkdir -p "$diagnose_dir/juicefs-${juicefs_namespace}" NODE_NAME=$(${kbctl} -n ${namespace} get po ${app} -o jsonpath='{.spec.nodeName}') - PVC_NAME=$(${kbctl} -n ${namespace} get po ${app} -o jsonpath='{..persistentVolumeClaim.claimName}' | awk '{print $1}') - PV_NAME=$(${kbctl} -n ${namespace} get pvc $PVC_NAME -o jsonpath='{.spec.volumeName}') - PV_ID=$(${kbctl} get pv $PV_NAME -o jsonpath='{.spec.csi.volumeHandle}') - MOUNT_POD_NAME=$(${kbctl} -n $juicefs_namespace get po --field-selector spec.nodeName=$NODE_NAME -l app.kubernetes.io/name=juicefs-mount -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep $PV_ID) + $kbctl top node "$NODE_NAME" &>"$diagnose_dir/juicefs-$juicefs_namespace/node-top-$NODE_NAME.txt" 2>&1 + + PVC_NAMES=$(${kbctl} -n ${namespace} get po ${app} -o jsonpath='{..persistentVolumeClaim.claimName}') + mkdir -p "$diagnose_dir/pv" + mkdir -p "$diagnose_dir/pvc" + for pvc_name in $PVC_NAMES + do - $kbctl logs "$MOUNT_POD_NAME" -n $juicefs_namespace -c jfs-format &>"$diagnose_dir/juicefs-$juicefs_namespace/mount-pod-init.log" 2>&1 - $kbctl logs "$MOUNT_POD_NAME" -n $juicefs_namespace &>"$diagnose_dir/juicefs-$juicefs_namespace/mount-pod.log" 2>&1 - $kbctl get po "$MOUNT_POD_NAME" -oyaml -n $juicefs_namespace &>"$diagnose_dir/juicefs-$juicefs_namespace/mount-pod.yaml" 2>&1 - $kbctl describe po "$MOUNT_POD_NAME" -n $juicefs_namespace &>"$diagnose_dir/juicefs-$juicefs_namespace/mount-pod-describe.log" 2>&1 + $kbctl get pvc "$pvc_name" -n $namespace -oyaml &>"$diagnose_dir/pvc/$pvc_name.yaml" 2>&1 + $kbctl describe pvc "$pvc_name" -n $namespace &>"$diagnose_dir/pvc/$pvc_name-describe.log" 2>&1 + + pv_name=$(${kbctl} -n ${namespace} get pvc $pvc_name -o jsonpath='{.spec.volumeName}') + + if [ "$pv_name" == "" ]; then + echo "Ignore $pvc_name due to empty volumeName" + continue + fi + provisioner=$(${kbctl} get pv ${pv_name} -ojsonpath='{.spec.csi.driver}' 2>/dev/null) + if [ "$provisioner" != "$PROVISIONER" ]; then + echo "Ignore $pv_name since it's not JuiceFS" + continue + fi + + $kbctl get pv "$pv_name" -oyaml &>"$diagnose_dir/pv/pv-$pv_name.yaml" 2>&1 + $kbctl describe pv "$pv_name" &>"$diagnose_dir/pv/pv-$pv_name-describe.log" 2>&1 + + pv_id=$(${kbctl} get pv $pv_name -o jsonpath='{.spec.csi.volumeHandle}') + if [ "$NODE_NAME" != "" ]; then + mount_pod_names=$(${kbctl} -n $juicefs_namespace get po --field-selector spec.nodeName=$NODE_NAME -l app.kubernetes.io/name=juicefs-mount -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep $pv_id) + for mount_pod_name in $mount_pod_names + do + $kbctl top po "$mount_pod_name" -n $juicefs_namespace &>"$diagnose_dir/juicefs-$juicefs_namespace/mount-pod-top-$mount_pod_name.txt" 2>&1 + $kbctl get po "$mount_pod_name" -oyaml -n $juicefs_namespace &>"$diagnose_dir/juicefs-$juicefs_namespace/mount-pod-$mount_pod_name.yaml" 2>&1 + $kbctl describe po "$mount_pod_name" -n $juicefs_namespace &>"$diagnose_dir/juicefs-$juicefs_namespace/mount-pod-$mount_pod_name.log" 2>&1 + $kbctl logs "$mount_pod_name" -n $juicefs_namespace --all-containers=true >>"$diagnose_dir/juicefs-$juicefs_namespace/mount-pod-$mount_pod_name.log" 2>&1 + done + fi + done + + kubectl -n $juicefs_namespace cp ${BASH_SOURCE[0]} $mount_pod_name:/tmp + kubectl -n $juicefs_namespace exec $mount_pod_name -- /tmp/csi-doctor.sh doctor + kubectl -n $juicefs_namespace cp $mount_pod_name:/tmp/juicefs-doctor $diagnose_dir/doctor } collect_juicefs_csi_msg() { @@ -294,21 +321,25 @@ collect_juicefs_csi_msg() { $kbctl describe po "$CSI_CTRL_POD_NAME" -n $juicefs_namespace &>"$diagnose_dir/juicefs-csi-controller/$CSI_CTRL_POD_NAME-describe.log" 2>&1 } -archive() { +pd_collect() { + mkdir -p "$diagnose_dir/app" + $kbctl get po "$app" -oyaml -n $namespace &>"$diagnose_dir/app/$app.yaml" 2>&1 + code=$? + if [ "${code}" != "0" ]; then + echo "$namespace/$app not found, abort" + rm -rf $diagnose_dir + exit $code + fi + echo "Start collecting logs for ${namespace}/${app}" + $kbctl describe po "$app" -n $namespace &>"$diagnose_dir/app/$app-describe.log" 2>&1 + collect_pv + collect_juicefs_csi_msg set -e archive_file_path="${current_dir}/diagnose_juicefs_${timestamp}.tar.gz" tar -zcf "${archive_file_path}" -C /tmp $(basename $diagnose_dir) echo "Results have been saved to ${archive_file_path}" } -pd_collect() { - echo "Start collecting logs for ${namespace}/${app}" - collect_mount_pod_msg - collect_juicefs_csi_msg - juicefs_resources - archive -} - collect() { if [ "${ORIGINAL_ARGS[1]}" == "" ]; then echo "EXAMPLES:" @@ -328,6 +359,17 @@ collect() { rm -rf $diagnose_dir } +doctor() { + mkdir -p /tmp/juicefs-doctor + cd /tmp/juicefs-doctor + mps=$(df -t fuse.juicefs --output=target | tail -n +2) + for mp in $mps + do + fname=$(juicefs doctor $mp | tail -n 1 | grep -oP '\S+.tar.gz') + mv $fname $(echo $mp | tr '/' '_').tar.gz + done +} + main() { if [[ $# -eq 0 ]]; then print_usage @@ -348,6 +390,9 @@ main() { collect|help) action=$1 ;; + doctor|help) + action=$1 + ;; get-mount|help) action=$1 ;; @@ -375,6 +420,9 @@ main() { collect) collect ;; + doctor) + doctor + ;; get-mount) get_mount_pod ;;