簡潔實用的版本查詢、系統診斷與故障排查命令集。
更新日期: 2026-01-30
# Kubernetes client and server versions.
# Plain `kubectl version` already prints the short form; the old `--short`
# flag was removed in kubectl 1.28 and is rejected as an unknown flag there.
kubectl version
# Airflow version, read from the first pod labelled component=webserver.
# The inner lookup is quoted so an empty result is passed as one (empty) argument.
kubectl exec -n airflow -it "$(kubectl get pods -n airflow -l component=webserver -o jsonpath='{.items[0].metadata.name}')" -- airflow version
# Node status
kubectl get nodes -o wide
# Airflow pods
kubectl get pods -n airflow -o wide
# Resource usage (pod usage is only shown when the node query succeeds)
kubectl top nodes && kubectl top pods -n airflow
# Client and server versions in the default (short) form.
# `--short` was removed in kubectl 1.28, so it is no longer passed.
kubectl version
# Same information as structured YAML
kubectl version --output yaml
# kubelet version reported by each node, one "name version" pair per line
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name} {.status.nodeInfo.kubeletVersion}{"\n"}{end}'
# kubeadm and kubelet versions on host 10.10.0.85, over SSH
ssh -i ~/.ssh/id_rsa root@10.10.0.85 'kubeadm version -o short && kubelet --version'
# Airflow version inside the first pod labelled component=webserver
kubectl exec -n airflow -it "$(kubectl get pods -n airflow -l component=webserver -o jsonpath='{.items[0].metadata.name}')" -- airflow version
# Airflow version from the virtualenv in ./.venv
source .venv/bin/activate && airflow version
# Airflow version as seen by whichever `python` is first on PATH
python -c "import airflow; print(f'Airflow: {airflow.__version__}')"
# Helm client version
helm version
# Helm releases in the airflow namespace
helm list -n airflow
# Status of the release named "airflow"
helm status airflow -n airflow
# De-duplicated list of container images used by pods in the airflow namespace
kubectl get pods -n airflow -o jsonpath='{.items[*].spec.containers[*].image}' | tr ' ' '\n' | sort -u
# --- Cluster and pod inspection ---
# Tokens in angle brackets (<pod-name>, <scheduler-pod>, <dag_id>) are
# placeholders; replace them with real names before running the command.

# Nodes and Airflow pods, wide output
kubectl get nodes -o wide
kubectl get pods -n airflow -o wide
# Full description of every node
kubectl describe nodes
# Last 30 lines of the events from all namespaces, sorted by lastTimestamp
kubectl get events -A --sort-by='.lastTimestamp' | tail -30
# Pod name and phase, tab-separated, one pod per line
kubectl get pods -n airflow -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}'
# Pod name and restart count of each pod's first container only
kubectl get pods -n airflow -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.containerStatuses[0].restartCount}{"\n"}{end}'
# Full description of a single pod
kubectl describe pod <pod-name> -n airflow
# Pods in the airflow namespace, then filtered by component label
kubectl get pods -n airflow -o wide
kubectl get pods -n airflow -l component=webserver
kubectl get pods -n airflow -l component=scheduler
kubectl get pods -n airflow -l component=worker
# Follow (-f) the scheduler and webserver deployment logs
kubectl logs -f deployment/airflow-scheduler -n airflow
kubectl logs -f deployment/airflow-webserver -n airflow
# Last 100 log lines of one pod
kubectl logs <pod-name> -n airflow --tail=100
# Scheduler log lines containing "error", case-insensitive
kubectl logs deployment/airflow-scheduler -n airflow | grep -i error
# DAG inspection from inside a scheduler pod: list DAGs, show one DAG,
# and list the files under /opt/airflow/dags
kubectl exec -it <scheduler-pod> -n airflow -- airflow dags list
kubectl exec -it <scheduler-pod> -n airflow -- airflow dags info <dag_id>
kubectl exec -it <scheduler-pod> -n airflow -- ls -la /opt/airflow/dags
# Values of the "airflow" Helm release
helm get values airflow -n airflow
# ConfigMaps and Secrets in the airflow namespace
kubectl get configmap -n airflow
kubectl get secrets -n airflow
# Environment variables of a pod whose line contains "AIRFLOW"
kubectl exec -it <pod-name> -n airflow -- env | grep AIRFLOW
# Connect directly to the host and check versions
# NOTE(review): the commands after `ssh` are presumably meant to be typed
# inside the SSH session it opens, not run locally - confirm.
ssh -i ~/.ssh/id_rsa root@10.10.0.85
psql --version
psql -U postgres -c "SELECT version();"
# Check PostgreSQL / Patroni service status
sudo systemctl status postgresql
sudo systemctl status patroni
# Show the last 50 journal lines for PostgreSQL and Patroni
sudo journalctl -u postgresql -n 50
sudo journalctl -u patroni -n 50
# Show Patroni cluster state
sudo patronictl -c /etc/patroni.yml list
# Show the Patroni configuration file
sudo cat /etc/patroni.yml
# Patroni service control: status, start, restart (pick the one you need;
# these are alternatives, not a sequence)
sudo systemctl status patroni
sudo systemctl start patroni
sudo systemctl restart patroni
# Patroni REST API status on 10.10.0.85:8008
curl http://10.10.0.85:8008/health
curl http://10.10.0.85:8008/leader
curl http://10.10.0.85:8008/cluster
# Follow the Patroni journal
sudo journalctl -u patroni -f
# Check etcd server and client versions
etcd --version
etcdctl version
# List etcd members (v3 API, endpoint 127.0.0.1:12379)
sudo ETCDCTL_API=3 etcdctl --endpoints=http://127.0.0.1:12379 member list
# Check etcd endpoint health and status
sudo ETCDCTL_API=3 etcdctl --endpoints=http://127.0.0.1:12379 endpoint health
sudo ETCDCTL_API=3 etcdctl --endpoints=http://127.0.0.1:12379 endpoint status
# Show Patroni data in etcd: fetches everything under the empty prefix
# and keeps only the output lines that contain "patroni"
sudo ETCDCTL_API=3 etcdctl --endpoints=http://127.0.0.1:12379 get "" --prefix | grep patroni
# Follow the journal of the etcd-patroni unit
sudo journalctl -u etcd-patroni -f
# etcd-patroni service control: status, restart
sudo systemctl status etcd-patroni
sudo systemctl restart etcd-patroni
# etcd performance check
sudo ETCDCTL_API=3 etcdctl --endpoints=http://127.0.0.1:12379 check perf
# Connect to the primary database (interactive psql session)
psql -h 10.10.0.85 -U postgres -d postgres
# Show replication status: client address, state and write lag per connection
psql -U postgres -c "SELECT client_addr, state, write_lag FROM pg_stat_replication;"
# Show WAL receiver status
psql -U postgres -c "SELECT * FROM pg_stat_wal_receiver;"
# List databases
psql -U postgres -c "\l"
# List tables in airflow_db
psql -U postgres -d airflow_db -c "\dt"
# List users that have the replication attribute
psql -U postgres -c "SELECT usename, usesuper, usereplication FROM pg_user WHERE usereplication = true;"
# Show connections: distinct (database, user, state) combinations with a non-NULL state
psql -U postgres -c "SELECT datname, usename, state FROM pg_stat_activity WHERE state IS NOT NULL GROUP BY datname, usename, state;"
# Simulate a primary failure to exercise failover
# 1. Show the current primary
sudo patronictl -c /etc/patroni.yml list
# 2. Stop the Patroni service on the primary
sudo systemctl stop patroni
# 3. Watch the cluster elect a new primary
#    (fixed 5-second wait; re-run `list` if no new leader is shown yet)
sleep 5
sudo patronictl -c /etc/patroni.yml list
# 4. Start the former primary again
sudo systemctl start patroni
# List all members
sudo patronictl -c /etc/patroni.yml list
# Reinitialise a specific member (it resyncs its data from the primary);
# here: member "node2" of cluster "pgcluster"
sudo patronictl -c /etc/patroni.yml reinit pgcluster node2
# Wipe the member's data directory, then reinitialise it
# NOTE(review): `rm -rf` is irreversible and acts on whichever host it is
# typed on - presumably intended only for the replica being rebuilt, never
# the current primary. Confirm the target host with `patronictl list` first,
# and confirm whether Patroni should be stopped before removing the directory.
sudo rm -rf /var/lib/postgresql/18/main
sudo patronictl -c /etc/patroni.yml reinit pgcluster node2
# Connect to PostgreSQL from inside a Kubernetes pod
# NOTE(review): the psql command below is presumably meant to be typed in
# the shell opened by `kubectl exec ... -- bash` - confirm.
kubectl exec -it <airflow-pod> -n airflow -- bash
psql -h 10.10.0.85 -p 5432 -U postgres -d airflow_db -c "SELECT version();"
# Test a connection as the replication user (stderr merged into stdout)
psql -h 10.10.0.87 -U replicator -c "SELECT 1;" 2>&1
# Check the Airflow database connection by counting rows in the dag table
psql -h 10.10.0.85 -p 5432 -U airflow_user -d airflow_db -c "SELECT COUNT(*) FROM dag;"
# List RabbitMQ pods (label app=rabbitmq)
kubectl get pods -n airflow -l app=rabbitmq
# Show RabbitMQ status
kubectl exec -it <rabbitmq-pod> -n airflow -- rabbitmqctl status
# List queues
kubectl exec -it <rabbitmq-pod> -n airflow -- rabbitmqctl list_queues
# List users
kubectl exec -it <rabbitmq-pod> -n airflow -- rabbitmqctl list_users
# List permissions
kubectl exec -it <rabbitmq-pod> -n airflow -- rabbitmqctl list_permissions
# Follow RabbitMQ logs
kubectl logs -f <rabbitmq-pod> -n airflow
# Port-forward to the RabbitMQ management UI
kubectl port-forward svc/airflow-rabbitmq 15672:15672 -n airflow
# Then open in a browser: http://localhost:15672
# Default user: user
# Default password: bitnami (or check the Helm values)
# List all ConfigMaps
kubectl get configmap -n airflow
# Describe one ConfigMap
kubectl describe configmap <configmap-name> -n airflow
# List all Secrets
kubectl get secrets -n airflow
# Describe one Secret
kubectl describe secret <secret-name> -n airflow
# Decode a Secret value: reads the "password" key and base64-decodes it
# (prints the plaintext to the terminal)
kubectl get secret <secret-name> -n airflow -o jsonpath='{.data.password}' | base64 -d
# Inspect the Airflow database connection Secret as YAML
kubectl get secret airflow-postgresql -n airflow -o yaml
# Show PVC status
kubectl get pvc -n airflow
# Show PV status
kubectl get pv
# Describe one PVC
kubectl describe pvc <pvc-name> -n airflow
# List storage classes
kubectl get storageclass
# Test the NFS mount from inside a pod: filesystem usage, then the
# contents of the dags and logs directories
kubectl exec -it <pod-name> -n airflow -- df -h
kubectl exec -it <pod-name> -n airflow -- ls -la /opt/airflow/dags
kubectl exec -it <pod-name> -n airflow -- ls -la /opt/airflow/logs
# Check NFS mount points on the current host
mount | grep nfs
# Check the exports offered by the NFS server
showmount -e <nfs-server>
# Manual mount test (replace <nfs-server> and /path with real values)
sudo mount -t nfs <nfs-server>:/path /mnt/test
# Check the NFS server journal
sudo journalctl -u nfs-server
# --- Network checks ---
# Tokens in angle brackets are placeholders; replace them before running.

# IP address of a pod
kubectl get pod <pod-name> -n airflow -o jsonpath='{.status.podIP}'
# Ping that pod from a throwaway busybox pod (--rm deletes it on exit)
kubectl run -it --rm debug --image=busybox:1.28 --restart=Never -- ping <pod-ip>
# TCP port check against the pod; nc takes host and port as two separate
# arguments, not as a single "host:port" string
kubectl run -it --rm debug --image=busybox:1.28 --restart=Never -- nc -zv <pod-ip> <port>
# Cluster DNS check: resolve the kubernetes.default service
kubectl run -it --rm debug --image=busybox:1.28 --restart=Never -- nslookup kubernetes.default
# Services in the airflow namespace, and details of one service
kubectl get svc -n airflow
kubectl describe svc <service-name> -n airflow
# TCP check of the webserver service on port 8080
kubectl run -it --rm debug --image=busybox:1.28 --restart=Never -- nc -zv airflow-webserver.airflow 8080
# Host-level checks.
# NOTE(review): the commands after `ssh` are presumably meant to be typed
# inside the SSH session it opens - confirm.
ssh -i ~/.ssh/id_rsa root@10.10.0.85
# Interface addresses and routing table
ip addr show
ip route show
# Detailed link info, filtered to lines mentioning flannel
ip -d link show | grep flannel
# External connectivity (runs until interrupted with Ctrl-C)
ping 8.8.8.8
# --- Resource usage ---
# Current usage per node and per Airflow pod
kubectl top nodes
kubectl top pods -n airflow
# Pods in all namespaces, sorted by memory and by CPU
kubectl top pods -A --sort-by=memory
kubectl top pods -A --sort-by=cpu
# Totals for the airflow namespace.
# awk adds up columns 2 and 3 using the leading number of each value; the
# output labels assume every row is reported in "m" (CPU) and "Mi" (memory).
kubectl top pods -n airflow --no-headers | awk '{cpu+=$2; mem+=$3} END {print "CPU: " cpu "m\nMem: " mem "Mi"}'
# Allocatable CPU and memory per node, tab-separated
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.allocatable.cpu}{"\t"}{.status.allocatable.memory}{"\n"}{end}'
# Total capacity (CPU and memory) per node, tab-separated
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.capacity.cpu}{"\t"}{.status.capacity.memory}{"\n"}{end}'
# --- Pod troubleshooting ---
# Tokens in angle brackets are placeholders; replace them before running.

# Description, current logs, and logs of the previous container instance
kubectl describe pod <pod-name> -n airflow
kubectl logs <pod-name> -n airflow
kubectl logs <pod-name> -n airflow --previous
# Namespace events sorted by lastTimestamp
kubectl get events -n airflow --sort-by='.lastTimestamp'
# Rolling restart of the scheduler / webserver deployments
kubectl rollout restart deployment/airflow-scheduler -n airflow
kubectl rollout restart deployment/airflow-webserver -n airflow
# Delete every pod in phase Failed in the airflow namespace
kubectl delete pod --field-selector status.phase=Failed -n airflow
# Scheduler rollout history, and rollback to the previous revision
kubectl rollout history deployment/airflow-scheduler -n airflow
kubectl rollout undo deployment/airflow-scheduler -n airflow

# --- Disk space ---
# Filesystem usage, then per-entry sizes of the Docker and containerd data dirs
df -h
du -sh /var/lib/docker/*
du -sh /var/lib/containerd/*
# Destructive: prunes unused Docker data, including all unused images (-a)
docker system prune -a
# Shrink archived journal files to about 500M in total.
# The option is --vacuum-size; plain --vacuum is not a journalctl option.
sudo journalctl --vacuum-size=500M
#!/bin/bash
# Quick cluster health overview: Kubernetes version, node status, Airflow
# pods, the latest cluster events, and node resource usage.
#
# Usage: run with no arguments. Requires kubectl on PATH.
#
# `set -e` is deliberately not used: every section is best-effort, so one
# failing query must not hide the sections after it.

#######################################
# Print all overview sections to stdout.
# Arguments: none
# Outputs:   section headers and kubectl output on stdout
# Returns:   status of the last section (0 when the metrics fallback runs)
#######################################
cluster_overview() {
  echo "=== K8s 版本 ==="
  # Plain `kubectl version` is already short; the `--short` flag was
  # removed in kubectl 1.28 and would fail there.
  kubectl version
  echo "=== 節點狀態 ==="
  kubectl get nodes -o wide
  echo "=== Airflow Pods ==="
  kubectl get pods -n airflow
  echo "=== 叢集事件 ==="
  kubectl get events -A --sort-by='.lastTimestamp' | tail -10
  echo "=== 資源使用 ==="
  # stderr is silenced on purpose: a failure is reported by the fallback
  # message instead of the raw kubectl error.
  kubectl top nodes 2>/dev/null || echo "Metrics Server 未部署"
}

cluster_overview
#!/bin/bash
# Collect basic diagnostics for one pod: description, recent logs, the first
# environment variables, and the namespace events that mention the pod.
#
# Usage: <script> <pod-name> [namespace]
#   pod-name   required; name of the pod to inspect
#   namespace  optional; defaults to "airflow"
#
# `set -e` is deliberately not used: every section is best-effort, so one
# failing query must not hide the sections after it.

#######################################
# Print diagnostics for a single pod.
# Arguments: $1 - pod name (required)
#            $2 - namespace (optional, default "airflow")
# Outputs:   section headers and kubectl output on stdout; usage on stderr
# Returns:   2 when the pod name is missing; otherwise the status of the
#            final events filter (non-zero when no event mentions the pod)
#######################################
diagnose_pod() {
  local pod_name=${1:-}
  local namespace=${2:-airflow}

  # Without a pod name, `kubectl describe pod` would describe every pod in
  # the namespace and the grep below would have no pattern, so stop early.
  if [[ -z "$pod_name" ]]; then
    printf 'Usage: %s <pod-name> [namespace]\n' "${0##*/}" >&2
    return 2
  fi

  echo "=== Pod 狀態 ==="
  kubectl describe pod "$pod_name" -n "$namespace"
  echo "=== 最近日誌 ==="
  kubectl logs "$pod_name" -n "$namespace" --tail=50
  echo "=== 環境變數 ==="
  kubectl exec "$pod_name" -n "$namespace" -- env | head -20
  echo "=== Events ==="
  # -F matches the pod name literally (dots in a name are not wildcards);
  # `--` keeps the name from ever being parsed as a grep option.
  kubectl get events -n "$namespace" | grep -F -- "$pod_name"
}

diagnose_pod "$@"
| 用途 | 命令 |
|---|---|
| K8s 版本 | kubectl version |
| Airflow 版本 | kubectl exec -n airflow ... airflow version |
| 所有節點 | kubectl get nodes -o wide |
| 所有 Pod | kubectl get pods -n airflow |
| Pod 日誌 | kubectl logs <pod-name> -n airflow |
| 進入 Pod | kubectl exec -it <pod-name> -n airflow -- bash |
| 資源使用 | kubectl top nodes && kubectl top pods -n airflow |
| 重啟 Scheduler | kubectl rollout restart deployment/airflow-scheduler -n airflow |
| Helm 配置 | helm get values airflow -n airflow |
| 叢集事件 | kubectl get events -A --sort-by='.lastTimestamp' |
Last Updated: 2026-01-30