You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

161 lines
4.8 KiB

  1. from __future__ import annotations
  2. from ftplib import print_line
  3. from airflow import DAG
  4. from airflow.decorators import task
  5. from airflow.providers.mysql.hooks.mysql import MySqlHook
  6. from airflow.utils.trigger_rule import TriggerRule
  7. from datetime import datetime, timedelta
  8. import subprocess
  9. import re
# Number of target indices split into batches for each DAG run.
TOTAL_IPS = 100000
# Number of target indices handled by each mapped ping task.
BATCH_SIZE = 5000
# Timeout in seconds for one fping subprocess; the ping task's
# execution_timeout is set to twice this value.
FPING_TIMEOUT_SEC = 60
# Maximum number of rows passed to a single cursor.executemany() call.
DB_EXEC_STEP = 2000
# Airflow pool the ping tasks run in, and how many slots each task takes.
PING_POOL = "ping_pool"
PING_POOL_SLOTS_PER_TASK = 1

# Defaults applied to every task of the DAG.
default_args = {
    "owner": "admin",
    "retries": 1,
    "retry_delay": timedelta(minutes=1),
}

# Fix: match the latency in the "1.06 ms" format.
LATENCY_RE = re.compile(r"(\d+\.?\d*)\s*ms")
  23. def _chunk_ranges(total: int, size: int) -> list[dict]:
  24. return [{"start": s, "end": min(s + size, total)} for s in range(0, total, size)]
  25. def _gen_ips_by_range(start: int, end: int) -> list[str]:
  26. ips = []
  27. for i in range(start, end):
  28. subnet = i // 255
  29. host = (i % 255) + 1
  30. ips.append(f"10.10.{subnet}.{host}")
  31. return ips
# DAG definition; the tasks and their wiring are declared inside this block.
# No schedule argument is passed here, so the installed Airflow version's
# default schedule applies.
with DAG(
    dag_id="05_ping_to_doris_celery",
    default_args=default_args,
    start_date=datetime(2023, 1, 1),
    # Do not backfill runs for the dates between start_date and today.
    catchup=False,
    tags=["monitor", "doris", "celery", "latest"],
    # One DAG run at a time, with at most 8 of its task instances running
    # concurrently.
    max_active_runs=1,
    max_active_tasks=8,
) as dag:
  41. @task
  42. def make_batches() -> list[dict]:
  43. return _chunk_ranges(TOTAL_IPS, BATCH_SIZE)
  44. @task(
  45. pool=PING_POOL,
  46. pool_slots=PING_POOL_SLOTS_PER_TASK,
  47. execution_timeout=timedelta(seconds=FPING_TIMEOUT_SEC * 2),
  48. )
  49. def ping_and_load_batch(batch: dict) -> dict:
  50. start, end = int(batch["start"]), int(batch["end"])
  51. ip_batch = _gen_ips_by_range(start, end)
  52. cmd = ["/opt/tools/bin/fping", "-C", "1", "-A"] + ip_batch
  53. now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
  54. rows: list[tuple] = []
  55. alive_cnt = 0
  56. dead_cnt = 0
  57. try:
  58. proc = subprocess.run(
  59. cmd,
  60. stdout=subprocess.PIPE,
  61. stderr=subprocess.STDOUT,
  62. text=True,
  63. timeout=FPING_TIMEOUT_SEC,
  64. )
  65. output = proc.stdout.splitlines()
  66. for line in output:
  67. m = LATENCY_RE.search(line)
  68. m = LATENCY_RE.search(line)
  69. # 範例輸出:
  70. # 活的: "10.10.0.33 : [0], 64 bytes, 1.06 ms (1.06 avg, 0% loss)"
  71. # 死的: "10.10.0.1 : [0], timed out"
  72. # 或: "10.10.0.2 : -"
  73. # 提取 IP
  74. parts = line.split(":")
  75. if len(parts) < 2:
  76. continue
  77. ip = parts[0].strip()
  78. rest = ":".join(parts[1:]) # 剩餘部分
  79. # 檢查是否有延遲資訊
  80. m = LATENCY_RE.search(rest)
  81. if m and "timed out" not in rest:
  82. # 活著且有延遲資訊
  83. latency = float(m.group(1))
  84. alive_cnt += 1
  85. rows.append((now, ip, 1, latency, 0))
  86. # print(f"✓ {ip}: {latency} ms")
  87. else:
  88. # 死掉或 timeout
  89. dead_cnt += 1
  90. rows.append((now, ip, 0, -1, 100))
  91. # print(f"✗ {ip}: dead")
  92. except Exception as e:
  93. print(f"[ping_and_load_batch] exception={repr(e)} range=({start},{end}) size={len(ip_batch)}")
  94. dead_cnt = len(ip_batch)
  95. for ip in ip_batch:
  96. rows.append((now, ip, 0, -1, 100))
  97. # 寫入 Doris
  98. if rows:
  99. mysql_hook = MySqlHook(mysql_conn_id="doris_db")
  100. conn = mysql_hook.get_conn()
  101. cur = conn.cursor()
  102. sql = """
  103. INSERT INTO ping_results
  104. (monitor_time, target_ip, is_alive, latency_ms, packet_loss_rate)
  105. VALUES (%s, %s, %s, %s, %s) \
  106. """
  107. for i in range(0, len(rows), DB_EXEC_STEP):
  108. cur.executemany(sql, rows[i: i + DB_EXEC_STEP])
  109. conn.commit()
  110. cur.close()
  111. conn.close()
  112. print(f"[Batch {start}-{end}] Written {len(rows)} records to Doris")
  113. return {
  114. "start": start,
  115. "end": end,
  116. "count": end - start,
  117. "alive": alive_cnt,
  118. "dead": dead_cnt,
  119. }
  120. @task(trigger_rule=TriggerRule.ALL_DONE)
  121. def summarize(stats: list[dict]) -> None:
  122. total = sum(x.get("count", 0) for x in stats)
  123. alive = sum(x.get("alive", 0) for x in stats)
  124. dead = sum(x.get("dead", 0) for x in stats)
  125. alive_pct = (alive * 100 // total) if total > 0 else 0
  126. print(f"[SUMMARY] Total: {total} | Alive: {alive} ({alive_pct}%) | Dead: {dead} | Batches: {len(stats)}")
    # Task wiring: build the index ranges, map one ping_and_load_batch task
    # instance onto each range, then pass the collected per-batch results
    # to summarize.
    batches = make_batches()
    stats = ping_and_load_batch.expand(batch=batches)
    summarize(stats)