Não pode escolher mais do que 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 
 
 

12 KiB

K8s Airflow HA - 快速命令參考

簡潔實用的版本查詢、系統診斷與故障排查命令集。

更新日期: 2026-01-30


📋 目錄

  1. 快速查詢
  2. 版本查詢
  3. 基本診斷
  4. Airflow 診斷
  5. PostgreSQL + Patroni + Etcd 診斷
  6. RabbitMQ 診斷
  7. Kubernetes 密鑰與配置
  8. NFS 與存儲
  9. 網路連通性
  10. 效能監控
  11. 故障排查
  12. 快速診斷腳本
  13. 速查表

📋 快速查詢 (一句話查詢)

# Kubernetes 版本
kubectl version

# Airflow 版本  
kubectl exec -n airflow -it $(kubectl get pods -n airflow -l component=webserver -o jsonpath='{.items[0].metadata.name}') -- airflow version

# 節點狀態
kubectl get nodes -o wide

# Airflow Pods
kubectl get pods -n airflow -o wide

# 資源使用
kubectl top nodes && kubectl top pods -n airflow

📦 版本查詢

Kubernetes 版本

kubectl version
kubectl version --output yaml
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name} {.status.nodeInfo.kubeletVersion}{"\n"}{end}'
ssh -i ~/.ssh/id_rsa root@10.10.0.85 'kubeadm version -o short && kubelet --version'

Airflow 版本

kubectl exec -n airflow -it $(kubectl get pods -n airflow -l component=webserver -o jsonpath='{.items[0].metadata.name}') -- airflow version
source .venv/bin/activate && airflow version
python -c "import airflow; print(f'Airflow: {airflow.__version__}')"

Helm & 容器映像

helm version
helm list -n airflow
helm status airflow -n airflow
kubectl get pods -n airflow -o jsonpath='{.items[*].spec.containers[*].image}' | tr ' ' '\n' | sort -u

🔍 基本診斷

節點與 Pod

kubectl get nodes -o wide
kubectl get pods -n airflow -o wide
kubectl describe nodes
kubectl get events -A --sort-by='.lastTimestamp' | tail -30

Pod 狀態檢查

kubectl get pods -n airflow -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}'
kubectl get pods -n airflow -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.containerStatuses[0].restartCount}{"\n"}{end}'
kubectl describe pod <pod-name> -n airflow

🔧 Airflow 診斷

Pod 檢查

kubectl get pods -n airflow -o wide
kubectl get pods -n airflow -l component=webserver
kubectl get pods -n airflow -l component=scheduler
kubectl get pods -n airflow -l component=worker

日誌查看

kubectl logs -f deployment/airflow-scheduler -n airflow
kubectl logs -f deployment/airflow-webserver -n airflow
kubectl logs <pod-name> -n airflow --tail=100
kubectl logs deployment/airflow-scheduler -n airflow | grep -i error

DAG 檢查

kubectl exec -it <scheduler-pod> -n airflow -- airflow dags list
kubectl exec -it <scheduler-pod> -n airflow -- airflow dags info <dag_id>
kubectl exec -it <scheduler-pod> -n airflow -- ls -la /opt/airflow/dags

配置檢查

helm get values airflow -n airflow
kubectl get configmap -n airflow
kubectl get secrets -n airflow
kubectl exec -it <pod-name> -n airflow -- env | grep AIRFLOW

🗄️ PostgreSQL + Patroni + Etcd 診斷

PostgreSQL 版本與狀態

# 直接連線檢查版本
ssh -i ~/.ssh/id_rsa root@10.10.0.85
psql --version
psql -U postgres -c "SELECT version();"

# 檢查 PostgreSQL 服務狀態
sudo systemctl status postgresql
sudo systemctl status patroni

# 查看 PostgreSQL 日誌
sudo journalctl -u postgresql -n 50
sudo journalctl -u patroni -n 50

Patroni 狀態與檢查

# 查看 Patroni 叢集狀態
sudo patronictl -c /etc/patroni.yml list

# 查看 Patroni 配置
sudo cat /etc/patroni.yml

# 檢查 / 啟動 / 重啟 Patroni 服務 (start/restart 會改變服務狀態,僅在需要時執行)
sudo systemctl status patroni
sudo systemctl start patroni
sudo systemctl restart patroni

# Patroni API 狀態
curl http://10.10.0.85:8008/health
curl http://10.10.0.85:8008/leader
curl http://10.10.0.85:8008/cluster

# 查看 Patroni 日誌
sudo journalctl -u patroni -f

Etcd 叢集診斷

# 檢查 Etcd 版本
etcd --version
etcdctl version

# 查看 Etcd 成員
sudo ETCDCTL_API=3 etcdctl --endpoints=http://127.0.0.1:12379 member list

# 查看 Etcd 健康狀態
sudo ETCDCTL_API=3 etcdctl --endpoints=http://127.0.0.1:12379 endpoint health
sudo ETCDCTL_API=3 etcdctl --endpoints=http://127.0.0.1:12379 endpoint status

# 查看 Etcd 中的 Patroni 鍵
sudo ETCDCTL_API=3 etcdctl --endpoints=http://127.0.0.1:12379 get "" --prefix | grep patroni

# 查看 Etcd 日誌
sudo journalctl -u etcd-patroni -f

# 檢查 / 重啟 Etcd 服務 (restart 會短暫中斷該 Etcd 成員,僅在需要時執行)
sudo systemctl status etcd-patroni
sudo systemctl restart etcd-patroni

# Etcd 性能測試
sudo ETCDCTL_API=3 etcdctl --endpoints=http://127.0.0.1:12379 check perf

數據庫操作

# 連線到主資料庫
psql -h 10.10.0.85 -U postgres -d postgres

# 查看複製狀態
psql -U postgres -c "SELECT client_addr, state, write_lag FROM pg_stat_replication;"

# 查看 WAL 接收器狀態
psql -U postgres -c "SELECT * FROM pg_stat_wal_receiver;"

# 查看資料庫列表
psql -U postgres -c "\l"

# 查看資料表
psql -U postgres -d airflow_db -c "\dt"

# 查看複製使用者
psql -U postgres -c "SELECT usename, usesuper, usereplication FROM pg_user WHERE usereplication = true;"

# 查看連接
psql -U postgres -c "SELECT datname, usename, state, count(*) FROM pg_stat_activity WHERE state IS NOT NULL GROUP BY datname, usename, state;"

自動容錯轉移測試

# 模擬主節點故障進行容錯轉移
# 1. 查看當前主節點
sudo patronictl -c /etc/patroni.yml list

# 2. 停止主節點 Patroni 服務
sudo systemctl stop patroni

# 3. 觀察叢集自動選舉新主節點
sleep 5
sudo patronictl -c /etc/patroni.yml list

# 4. 重新啟動原主節點
sudo systemctl start patroni

Patroni 節點重新初始化

# 列出所有節點
sudo patronictl -c /etc/patroni.yml list

# 對特定節點進行重新初始化 (會從主節點同步資料)
sudo patronictl -c /etc/patroni.yml reinit pgcluster node2

# 清除節點數據重新初始化 (⚠️ 破壞性操作:僅可在要重建的副本節點上執行,切勿在主節點執行)
sudo rm -rf /var/lib/postgresql/18/main
sudo patronictl -c /etc/patroni.yml reinit pgcluster node2

PostgreSQL 連線測試

# 從 Kubernetes Pod 連線到 PostgreSQL
kubectl exec -it <airflow-pod> -n airflow -- bash
psql -h 10.10.0.85 -p 5432 -U postgres -d airflow_db -c "SELECT version();"

# 測試複製連接
psql -h 10.10.0.87 -U replicator -c "SELECT 1;" 2>&1

# 檢查 Airflow 資料庫連線
psql -h 10.10.0.85 -p 5432 -U airflow_user -d airflow_db -c "SELECT COUNT(*) FROM dag;"

🔌 RabbitMQ 診斷 (若使用 CeleryExecutor)

RabbitMQ 狀態檢查

# 查看 RabbitMQ Pod
kubectl get pods -n airflow -l app=rabbitmq

# 查看 RabbitMQ 狀態
kubectl exec -it <rabbitmq-pod> -n airflow -- rabbitmqctl status

# 查看隊列
kubectl exec -it <rabbitmq-pod> -n airflow -- rabbitmqctl list_queues

# 查看使用者
kubectl exec -it <rabbitmq-pod> -n airflow -- rabbitmqctl list_users

# 查看權限
kubectl exec -it <rabbitmq-pod> -n airflow -- rabbitmqctl list_permissions

# RabbitMQ 日誌
kubectl logs -f <rabbitmq-pod> -n airflow

RabbitMQ 管理界面

# Port-forward to RabbitMQ management UI
kubectl port-forward svc/airflow-rabbitmq 15672:15672 -n airflow

# 在瀏覽器中打開: http://localhost:15672
# 預設使用者: user
# 預設密碼: bitnami (或檢查 Helm values)

🔐 Kubernetes 密鑰與配置檢查

ConfigMap 與 Secrets

# 查看所有 ConfigMap
kubectl get configmap -n airflow

# 查看特定 ConfigMap
kubectl describe configmap <configmap-name> -n airflow

# 查看所有 Secrets
kubectl get secrets -n airflow

# 查看特定 Secret
kubectl describe secret <secret-name> -n airflow

# 解碼 Secret 內容
kubectl get secret <secret-name> -n airflow -o jsonpath='{.data.password}' | base64 -d

# 檢查 Airflow 數據庫連線 Secret
kubectl get secret airflow-postgresql -n airflow -o yaml

📝 NFS 與存儲診斷

NFS 存儲檢查

# 查看 PVC 狀態
kubectl get pvc -n airflow

# 查看 PV 狀態
kubectl get pv

# 查看 PVC 詳情
kubectl describe pvc <pvc-name> -n airflow

# 檢查存儲類別
kubectl get storageclass

# 測試 NFS 掛載
kubectl exec -it <pod-name> -n airflow -- df -h
kubectl exec -it <pod-name> -n airflow -- ls -la /opt/airflow/dags
kubectl exec -it <pod-name> -n airflow -- ls -la /opt/airflow/logs

主機 NFS 操作

# 檢查 NFS 掛載點
mount | grep nfs

# 檢查 NFS 服務
showmount -e <nfs-server>

# 手動掛載測試
sudo mount -t nfs <nfs-server>:/path /mnt/test

# 檢查 NFS 日誌
sudo journalctl -u nfs-server

🌐 網路連通性

Pod 與 Service 測試

kubectl get pod <pod-name> -n airflow -o jsonpath='{.status.podIP}'
kubectl run -it --rm debug --image=busybox:1.28 --restart=Never -- ping <pod-ip>
kubectl run -it --rm debug --image=busybox:1.28 --restart=Never -- nc -zv <pod-ip> <port>
kubectl run -it --rm debug --image=busybox:1.28 --restart=Never -- nslookup kubernetes.default

kubectl get svc -n airflow
kubectl describe svc <service-name> -n airflow
kubectl run -it --rm debug --image=busybox:1.28 --restart=Never -- nc -zv airflow-webserver.airflow 8080

Node 網路檢查

ssh -i ~/.ssh/id_rsa root@10.10.0.85
ip addr show
ip route show
ip -d link show | grep flannel
ping 8.8.8.8

📊 效能監控

資源使用

kubectl top nodes
kubectl top pods -n airflow
kubectl top pods -A --sort-by=memory
kubectl top pods -A --sort-by=cpu

# 資源總和
kubectl top pods -n airflow --no-headers | awk '{cpu+=$2; mem+=$3} END {print "CPU: " cpu "m\nMem: " mem "Mi"}'

節點容量

kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.allocatable.cpu}{"\t"}{.status.allocatable.memory}{"\n"}{end}'
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.capacity.cpu}{"\t"}{.status.capacity.memory}{"\n"}{end}'

⚠️ 故障排查

常見診斷

kubectl describe pod <pod-name> -n airflow
kubectl logs <pod-name> -n airflow
kubectl logs <pod-name> -n airflow --previous
kubectl get events -n airflow --sort-by='.lastTimestamp'

重啟與清理

kubectl rollout restart deployment/airflow-scheduler -n airflow
kubectl rollout restart deployment/airflow-webserver -n airflow
kubectl delete pod --field-selector status.phase=Failed -n airflow
kubectl rollout history deployment/airflow-scheduler -n airflow
kubectl rollout undo deployment/airflow-scheduler -n airflow

磁碟與日誌

df -h
du -sh /var/lib/docker/*
du -sh /var/lib/containerd/*
docker system prune -a
sudo journalctl --vacuum-size=500M

🤖 快速診斷腳本

健康檢查

#!/bin/bash
# Cluster health snapshot for the Airflow-on-Kubernetes deployment.
#
# Prints, in order: the kubectl version report, node status, the pods in the
# "airflow" namespace, the last 10 cluster events (sorted by lastTimestamp)
# and node resource usage.
#
# Every step is best-effort: a failing kubectl call must not prevent the
# remaining checks from running, so this script deliberately does NOT use
# `set -e`.

health_check() {
  echo "=== K8s 版本 ==="
  # The `--short` flag has been removed from `kubectl version` in current
  # kubectl releases (short output is now the default), so passing it makes
  # the command fail with "unknown flag". Call it without flags instead.
  kubectl version

  echo "=== 節點狀態 ==="
  kubectl get nodes -o wide

  echo "=== Airflow Pods ==="
  kubectl get pods -n airflow

  echo "=== 叢集事件 ==="
  kubectl get events -A --sort-by='.lastTimestamp' | tail -10

  echo "=== 資源使用 ==="
  # stderr is intentionally discarded: any failure of `kubectl top` is
  # reported with the single hint below instead of the raw error.
  kubectl top nodes 2>/dev/null || echo "Metrics Server 未部署"
}

health_check

故障排查

#!/bin/bash
# Collects troubleshooting data for a single pod.
#
# Usage: <script> <pod-name> [namespace]
#   pod-name   (required) name of the pod to inspect
#   namespace  (optional) namespace of the pod; defaults to "airflow"
#
# Prints the pod description, the last 50 log lines, the first 20 environment
# variables of the container, and the namespace events that mention the pod.
# Steps are best-effort (no `set -e`): one failing kubectl call does not stop
# the remaining ones.

#######################################
# Run all diagnostics for one pod.
# Arguments: $1 - pod name (required)
#            $2 - namespace (optional, default "airflow")
# Outputs:   diagnostics on stdout; usage message on stderr
# Returns:   2 when the pod name is missing, otherwise the status of the
#            final events pipeline
#######################################
diagnose_pod() {
  local pod_name=${1:-}
  local namespace=${2:-airflow}

  # Without a pod name, `kubectl describe pod -n <ns>` would dump every pod
  # in the namespace and the trailing grep would have no pattern, so bail
  # out early with a usage hint.
  if [[ -z "$pod_name" ]]; then
    printf '用法: %s <pod-name> [namespace]\n' "${0##*/}" >&2
    return 2
  fi

  echo "=== Pod 狀態 ==="
  kubectl describe pod "$pod_name" -n "$namespace"

  echo "=== 最近日誌 ==="
  kubectl logs "$pod_name" -n "$namespace" --tail=50

  echo "=== 環境變數 ==="
  kubectl exec "$pod_name" -n "$namespace" -- env | head -20

  echo "=== Events ==="
  # -F: match the pod name literally (dots/dashes are not regex);
  # --: a name starting with '-' is not parsed as a grep option.
  kubectl get events -n "$namespace" | grep -F -- "$pod_name"
}

diagnose_pod "$@"

📚 速查表

用途 命令
K8s 版本 kubectl version
Airflow 版本 kubectl exec -n airflow ... airflow version
所有節點 kubectl get nodes -o wide
所有 Pod kubectl get pods -n airflow
Pod 日誌 kubectl logs <pod-name> -n airflow
進入 Pod kubectl exec -it <pod-name> -n airflow -- bash
資源使用 kubectl top nodes && kubectl top pods -n airflow
重啟 Scheduler kubectl rollout restart deployment/airflow-scheduler -n airflow
Helm 配置 helm get values airflow -n airflow
叢集事件 kubectl get events -A --sort-by='.lastTimestamp'

Last Updated: 2026-01-30