monitoring: refresh overview dashboards

This commit is contained in:
Brad Stein 2025-11-18 14:08:33 -03:00
parent 8e6c0a3cfe
commit ff056551c7
7 changed files with 1511 additions and 1218 deletions

View File

@ -165,22 +165,22 @@ def node_io_expr(scope=""):
return scoped_node_expr(base, scope)
def namespace_cpu_share_expr():
selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
total = f"clamp_min(sum( {NAMESPACE_CPU_RAW} ), 1)"
def namespace_share_expr(resource_expr):
    """Return a PromQL expression for per-namespace percentage share of a resource.

    ``resource_expr`` is a PromQL expression string that yields one series per
    namespace. The numerator keeps only the series whose ``namespace`` label
    also appears in ``NAMESPACE_TOP_FILTER``; the denominator is the sum of
    ``resource_expr`` over *all* series (not just the filtered ones), floored
    at 1 via ``clamp_min`` so the division never sees a zero total.
    """
    # Restrict the per-namespace series to the namespaces selected by the filter.
    numerator = "( " + resource_expr + " ) and on(namespace) ( " + NAMESPACE_TOP_FILTER + " )"
    # Cluster-wide total, clamped so an idle cluster does not divide by zero.
    denominator = "clamp_min(sum( " + resource_expr + " ), 1)"
    return "100 * ( " + numerator + " ) / " + denominator
def namespace_cpu_share_expr():
    """Return the namespace share expression built from ``NAMESPACE_CPU_RAW``."""
    cpu_share = namespace_share_expr(NAMESPACE_CPU_RAW)
    return cpu_share
def namespace_ram_share_expr():
selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
total = f"clamp_min(sum( {NAMESPACE_RAM_RAW} ), 1)"
return f"100 * ( {selected} ) / {total}"
return namespace_share_expr(NAMESPACE_RAM_RAW)
def namespace_gpu_share_expr():
selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
return f"100 * ( {selected} ) / {total}"
return namespace_share_expr(NAMESPACE_GPU_RAW)
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
@ -228,35 +228,47 @@ NAMESPACE_GPU_ALLOC = (
'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
)
NAMESPACE_GPU_USAGE = (
'sum(rate(container_accelerator_duty_cycle{namespace!="",accelerator="nvidia.com/gpu"}[5m])) by (namespace)'
)
NAMESPACE_GPU_USAGE = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
NAMESPACE_GPU_RAW = (
"("
+ NAMESPACE_GPU_USAGE
+ ") or on(namespace) ("
+ NAMESPACE_GPU_ALLOC
+ NAMESPACE_CPU_RAW
+ " * 0)"
)
NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_ALLOC
NAMESPACE_COMBINED_FILTER = (
'topk(10, ('
NAMESPACE_GPU_WEIGHT = (
"("
+ NAMESPACE_GPU_ALLOC
+ ") or on(namespace) ("
+ NAMESPACE_CPU_RAW
+ ") + ("
+ NAMESPACE_RAM_RAW
+ ' / 1e9) + ('
+ NAMESPACE_GPU_WEIGHT
+ " * 10))"
+ " * 0)"
)
NAMESPACE_ACTIVITY_SCORE = (
"( "
+ NAMESPACE_CPU_RAW
+ " ) + ("
+ NAMESPACE_RAM_RAW
+ " / 1e9) + ("
+ NAMESPACE_GPU_WEIGHT
+ " * 100)"
)
NAMESPACE_TOP_FILTER = "(topk(10, " + NAMESPACE_ACTIVITY_SCORE + ") >= bool 0)"
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
NET_INGRESS_EXPR = (
'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
TRAEFIK_NET_INGRESS = (
'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
" or on() vector(0)"
)
NET_EGRESS_EXPR = (
TRAEFIK_NET_EGRESS = (
'sum(rate(container_network_transmit_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
" or on() vector(0)"
)
NET_TOTAL_EXPR = (
'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
" or on() vector(0)"
)
NET_INGRESS_EXPR = TRAEFIK_NET_INGRESS
NET_EGRESS_EXPR = TRAEFIK_NET_EGRESS
NET_INTERNAL_EXPR = f"clamp_min(({NET_TOTAL_EXPR}) - ({TRAEFIK_NET_EGRESS}), 0)"
# ---------------------------------------------------------------------------
# Panel factories
@ -438,10 +450,20 @@ def pie_panel(panel_id, title, expr, grid):
"datasource": PROM_DS,
"gridPos": grid,
"targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}],
"fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []},
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {"mode": "palette-classic"},
},
"overrides": [],
},
"options": {
"legend": {"displayMode": "list", "placement": "right"},
"pieType": "pie",
"displayLabels": ["percent"],
"tooltip": {"mode": "single"},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
},
}
@ -511,7 +533,6 @@ def build_overview():
1,
link_to("atlas-pods"),
),
(6, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None),
]
for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
thresholds = None
@ -591,12 +612,31 @@ def build_overview():
)
)
storage_panels = [
(23, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
(24, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
(25, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
(26, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
]
for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
panels.append(
stat_panel(
panel_id,
title,
expr,
{"h": 6, "w": 6, "x": 6 * idx, "y": 10},
unit=unit,
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
links=link_to("atlas-storage"),
)
)
panels.append(
pie_panel(
11,
"Namespace CPU share",
namespace_cpu_share_expr(),
{"h": 9, "w": 8, "x": 0, "y": 10},
{"h": 9, "w": 8, "x": 0, "y": 16},
)
)
panels.append(
@ -604,7 +644,7 @@ def build_overview():
12,
"Namespace GPU share",
namespace_gpu_share_expr(),
{"h": 9, "w": 8, "x": 8, "y": 10},
{"h": 9, "w": 8, "x": 8, "y": 16},
)
)
panels.append(
@ -612,7 +652,7 @@ def build_overview():
13,
"Namespace RAM share",
namespace_ram_share_expr(),
{"h": 9, "w": 8, "x": 16, "y": 10},
{"h": 9, "w": 8, "x": 16, "y": 16},
)
)
@ -622,7 +662,7 @@ def build_overview():
14,
"Worker node CPU",
node_cpu_expr(worker_filter),
{"h": 8, "w": 12, "x": 0, "y": 19},
{"h": 8, "w": 12, "x": 0, "y": 25},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -636,7 +676,7 @@ def build_overview():
15,
"Worker node RAM",
node_mem_expr(worker_filter),
{"h": 8, "w": 12, "x": 12, "y": 19},
{"h": 8, "w": 12, "x": 12, "y": 25},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -651,7 +691,7 @@ def build_overview():
16,
"Control plane CPU",
node_cpu_expr(CONTROL_REGEX),
{"h": 7, "w": 12, "x": 0, "y": 27},
{"h": 7, "w": 12, "x": 0, "y": 33},
unit="percent",
legend="{{node}}",
legend_display="table",
@ -663,7 +703,7 @@ def build_overview():
17,
"Control plane RAM",
node_mem_expr(CONTROL_REGEX),
{"h": 7, "w": 12, "x": 12, "y": 27},
{"h": 7, "w": 12, "x": 12, "y": 33},
unit="percent",
legend="{{node}}",
legend_display="table",
@ -676,9 +716,9 @@ def build_overview():
18,
"Cluster ingress throughput",
NET_INGRESS_EXPR,
{"h": 7, "w": 12, "x": 0, "y": 34},
{"h": 7, "w": 8, "x": 0, "y": 40},
unit="Bps",
legend="Ingress",
legend="Ingress (Traefik)",
legend_display="list",
legend_placement="bottom",
links=link_to("atlas-network"),
@ -689,9 +729,22 @@ def build_overview():
19,
"Cluster egress throughput",
NET_EGRESS_EXPR,
{"h": 7, "w": 12, "x": 12, "y": 34},
{"h": 7, "w": 8, "x": 8, "y": 40},
unit="Bps",
legend="Egress",
legend="Egress (Traefik)",
legend_display="list",
legend_placement="bottom",
links=link_to("atlas-network"),
)
)
panels.append(
timeseries_panel(
20,
"Intra-cluster throughput",
NET_INTERNAL_EXPR,
{"h": 7, "w": 8, "x": 16, "y": 40},
unit="Bps",
legend="Internal traffic",
legend_display="list",
legend_placement="bottom",
links=link_to("atlas-network"),
@ -700,10 +753,10 @@ def build_overview():
panels.append(
timeseries_panel(
20,
21,
"Root filesystem usage",
root_usage_expr(),
{"h": 8, "w": 12, "x": 0, "y": 41},
{"h": 8, "w": 12, "x": 0, "y": 47},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -715,11 +768,11 @@ def build_overview():
)
panels.append(
{
"id": 21,
"id": 22,
"type": "bargauge",
"title": "Nodes closest to full root disks",
"datasource": PROM_DS,
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 41},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 47},
"targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}],
"fieldConfig": {
"defaults": {
@ -744,28 +797,10 @@ def build_overview():
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
},
"links": link_to("atlas-storage"),
"transformations": [{"id": "labelsToFields", "options": {}}],
}
)
storage_panels = [
(21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
(22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
(23, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
(24, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
]
for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
panels.append(
stat_panel(
panel_id,
title,
expr,
{"h": 6, "w": 6, "x": 6 * idx, "y": 49},
unit=unit,
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
links=link_to("atlas-storage"),
)
)
return {
"uid": "atlas-overview",
"title": "Atlas Overview",
@ -1110,12 +1145,15 @@ def build_network_dashboard():
panels.append(
stat_panel(2, "Egress traffic", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="Bps")
)
panels.append(
stat_panel(3, "Intra-cluster traffic", NET_INTERNAL_EXPR, {"h": 4, "w": 8, "x": 16, "y": 0}, unit="Bps")
)
panels.append(
stat_panel(
3,
4,
"Top router req/s",
f"topk(1, {TRAEFIK_ROUTER_EXPR})",
{"h": 4, "w": 8, "x": 16, "y": 0},
{"h": 4, "w": 8, "x": 0, "y": 4},
unit="req/s",
legend="{{router}}",
instant=True,
@ -1123,10 +1161,10 @@ def build_network_dashboard():
)
panels.append(
timeseries_panel(
4,
5,
"Per-node throughput",
node_net_expr(),
{"h": 8, "w": 24, "x": 0, "y": 4},
{"h": 8, "w": 24, "x": 0, "y": 8},
unit="Bps",
legend="{{node}}",
legend_display="table",
@ -1135,32 +1173,32 @@ def build_network_dashboard():
)
panels.append(
table_panel(
5,
6,
"Top namespaces",
'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
'+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
{"h": 9, "w": 12, "x": 0, "y": 12},
{"h": 9, "w": 12, "x": 0, "y": 16},
unit="Bps",
transformations=[{"id": "labelsToFields", "options": {}}],
)
)
panels.append(
table_panel(
6,
7,
"Top pods",
'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
'+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
{"h": 9, "w": 12, "x": 12, "y": 12},
{"h": 9, "w": 12, "x": 12, "y": 16},
unit="Bps",
transformations=[{"id": "labelsToFields", "options": {}}],
)
)
panels.append(
timeseries_panel(
7,
8,
"Traefik routers (req/s)",
f"topk(10, {TRAEFIK_ROUTER_EXPR})",
{"h": 9, "w": 12, "x": 0, "y": 21},
{"h": 9, "w": 12, "x": 0, "y": 25},
unit="req/s",
legend="{{router}}",
legend_display="table",
@ -1169,10 +1207,10 @@ def build_network_dashboard():
)
panels.append(
timeseries_panel(
8,
9,
"Traefik entrypoints (req/s)",
'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
{"h": 9, "w": 12, "x": 12, "y": 21},
{"h": 9, "w": 12, "x": 12, "y": 25},
unit="req/s",
legend="{{entrypoint}}",
legend_display="table",

View File

@ -20,7 +20,7 @@
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
"expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
@ -80,7 +80,7 @@
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
"expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
@ -127,7 +127,7 @@
{
"id": 3,
"type": "stat",
"title": "Top router req/s",
"title": "Intra-cluster traffic",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -138,6 +138,66 @@
"x": 16,
"y": 0
},
"targets": [
{
"expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "Bps",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Top router req/s",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
@ -187,7 +247,7 @@
}
},
{
"id": 4,
"id": 5,
"type": "timeseries",
"title": "Per-node throughput",
"datasource": {
@ -198,7 +258,7 @@
"h": 8,
"w": 24,
"x": 0,
"y": 4
"y": 8
},
"targets": [
{
@ -224,7 +284,7 @@
}
},
{
"id": 5,
"id": 6,
"type": "table",
"title": "Top namespaces",
"datasource": {
@ -235,7 +295,7 @@
"h": 9,
"w": 12,
"x": 0,
"y": 12
"y": 16
},
"targets": [
{
@ -260,7 +320,7 @@
]
},
{
"id": 6,
"id": 7,
"type": "table",
"title": "Top pods",
"datasource": {
@ -271,7 +331,7 @@
"h": 9,
"w": 12,
"x": 12,
"y": 12
"y": 16
},
"targets": [
{
@ -296,7 +356,7 @@
]
},
{
"id": 7,
"id": 8,
"type": "timeseries",
"title": "Traefik routers (req/s)",
"datasource": {
@ -307,7 +367,7 @@
"h": 9,
"w": 12,
"x": 0,
"y": 21
"y": 25
},
"targets": [
{
@ -333,7 +393,7 @@
}
},
{
"id": 8,
"id": 9,
"type": "timeseries",
"title": "Traefik entrypoints (req/s)",
"datasource": {
@ -344,7 +404,7 @@
"h": 9,
"w": 12,
"x": 12,
"y": 21
"y": 25
},
"targets": [
{

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,74 @@
# services/monitoring/dcgm-exporter.yaml
# DaemonSet that runs NVIDIA's dcgm-exporter and exposes its metrics on port
# 9400. The pod annotations below mark it for annotation-based Prometheus
# scraping.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: dcgm-exporter
  namespace: monitoring
  labels:
    app: dcgm-exporter
spec:
  selector:
    matchLabels:
      app: dcgm-exporter
  template:
    metadata:
      labels:
        app: dcgm-exporter
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "9400"
    spec:
      serviceAccountName: default
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  # Pinned to an explicit host list.
                  # NOTE(review): presumably these are the GPU nodes; confirm
                  # and keep in sync when GPU hosts are added or removed.
                  - key: kubernetes.io/hostname
                    operator: In
                    values:
                      - titan-20
                      - titan-21
                      - titan-22
                      - titan-24
      tolerations:
        # Tolerate every taint so the exporter still lands on tainted nodes.
        - operator: Exists
      containers:
        - name: dcgm-exporter
          image: nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5-1
          imagePullPolicy: IfNotPresent
          ports:
            - name: metrics
              containerPort: 9400
          env:
            # Enables the exporter's Kubernetes mode; the dashboards filter
            # DCGM series on namespace/pod labels.
            - name: DCGM_EXPORTER_KUBERNETES
              value: "true"
          securityContext:
            # NOTE(review): the upstream manifest only adds the SYS_ADMIN
            # capability; confirm whether full privileged mode is required.
            privileged: true
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
          volumeMounts:
            - name: pod-resources
              mountPath: /var/lib/kubelet/pod-resources
              # The exporter only reads the kubelet pod-resources socket, so
              # do not give a privileged container write access to this path.
              readOnly: true
      volumes:
        - name: pod-resources
          hostPath:
            path: /var/lib/kubelet/pod-resources
            type: Directory
---
# Service in front of the dcgm-exporter pods; forwards port 9400 to the
# container port named "metrics".
apiVersion: v1
kind: Service
metadata:
  name: dcgm-exporter
  namespace: monitoring
  labels:
    app: dcgm-exporter
spec:
  # Selects the pods created by the dcgm-exporter DaemonSet via their app label.
  selector:
    app: dcgm-exporter
  ports:
    - name: metrics
      port: 9400
      # Refers to the named container port rather than a number.
      targetPort: metrics

View File

@ -29,7 +29,7 @@ data:
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
"expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
@ -89,7 +89,7 @@ data:
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
"expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
@ -136,7 +136,7 @@ data:
{
"id": 3,
"type": "stat",
"title": "Top router req/s",
"title": "Intra-cluster traffic",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -147,6 +147,66 @@ data:
"x": 16,
"y": 0
},
"targets": [
{
"expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "Bps",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Top router req/s",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
@ -196,7 +256,7 @@ data:
}
},
{
"id": 4,
"id": 5,
"type": "timeseries",
"title": "Per-node throughput",
"datasource": {
@ -207,7 +267,7 @@ data:
"h": 8,
"w": 24,
"x": 0,
"y": 4
"y": 8
},
"targets": [
{
@ -233,7 +293,7 @@ data:
}
},
{
"id": 5,
"id": 6,
"type": "table",
"title": "Top namespaces",
"datasource": {
@ -244,7 +304,7 @@ data:
"h": 9,
"w": 12,
"x": 0,
"y": 12
"y": 16
},
"targets": [
{
@ -269,7 +329,7 @@ data:
]
},
{
"id": 6,
"id": 7,
"type": "table",
"title": "Top pods",
"datasource": {
@ -280,7 +340,7 @@ data:
"h": 9,
"w": 12,
"x": 12,
"y": 12
"y": 16
},
"targets": [
{
@ -305,7 +365,7 @@ data:
]
},
{
"id": 7,
"id": 8,
"type": "timeseries",
"title": "Traefik routers (req/s)",
"datasource": {
@ -316,7 +376,7 @@ data:
"h": 9,
"w": 12,
"x": 0,
"y": 21
"y": 25
},
"targets": [
{
@ -342,7 +402,7 @@ data:
}
},
{
"id": 8,
"id": 9,
"type": "timeseries",
"title": "Traefik entrypoints (req/s)",
"datasource": {
@ -353,7 +413,7 @@ data:
"h": 9,
"w": 12,
"x": 12,
"y": 21
"y": 25
},
"targets": [
{

File diff suppressed because it is too large Load Diff

View File

@ -10,5 +10,6 @@ resources:
- grafana-dashboard-nodes.yaml
- grafana-dashboard-storage.yaml
- grafana-dashboard-network.yaml
- dcgm-exporter.yaml
- grafana-folders.yaml
- helmrelease.yaml