def dashboard_row_zfs(panel_id, node):
    """Build the Grafana dashboard row with the ZFS graphs of one node.

    Args:
        panel_id: Iterator yielding dashboard-wide unique panel ids. Exactly
            one id is consumed per panel, in the order the panels appear in
            the row.
        node: Object whose ``name`` attribute is matched against the ``host``
            tag of the stored metrics.

    Returns:
        A dict describing the row. It holds five graph panels: ARC usage,
        L2ARC usage, ARC hits/misses, usage per dataset and usage per pool.
    """

    def flux_target(measurement, field, rename=None, rate=False):
        """Return one query target reading ``field`` of ``measurement``.

        ``rename`` replaces the ``_field`` value of every record, ``rate``
        appends a non-negative per-second derivative. The result is yielded
        under the name the series ends up with (``rename`` or ``field``), so
        every target of a panel gets its own, matching result name.
        """
        query = [
            'from(bucket: "telegraf")',
            '  |> range(start: v.timeRangeStart, stop: v.timeRangeStop)',
            '  |> filter(fn: (r) =>',
            f'    r["_measurement"] == "{measurement}" and',
            f'    r["_field"] == "{field}" and',
            f'    r["host"] == "{node.name}"',
            '  )',
        ]
        if rename is not None:
            query += [
                '  |> map(fn: (r) => ({',
                '    r with',
                f'    _field: "{rename}"',
                '  })',
                '  )',
            ]
        query.append(
            '  |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)'
        )
        if rate:
            query.append('  |> derivative(unit: 1s, nonNegative: true)')
        query.append(f'  |> yield(name: "{rename or field}")')

        return {
            'groupBy': [
                {'type': 'time', 'params': ['$__interval']},
                {'type': 'fill', 'params': ['linear']},
            ],
            'orderByTime': "ASC",
            'policy': "default",
            'query': '\n'.join(query),
            'resultFormat': 'time_series',
            'select': [[
                {'type': 'field', 'params': ['value']},
                {'type': 'mean', 'params': []},
            ]],
            "tags": [],
        }

    def graph_panel(
        title,
        targets,
        display_name='${__field.name}',
        span=4,
        stack=False,
        unit='bytes',
        decimals=None,
    ):
        """Return one graph panel; only the arguments differ between panels."""
        left_axis = {
            'format': unit,
            'label': None,
            'logBase': 1,
            'max': None,
            'min': 0,
            'show': True,
        }
        if decimals is not None:
            left_axis['decimals'] = decimals

        return {
            'aliasColors': {},
            'bars': False,
            'dashLength': 10,
            'dashes': False,
            'datasource': None,
            'fieldConfig': {
                'defaults': {
                    'displayName': display_name,
                },
                'overrides': [],
            },
            'fill': 1,
            'fillGradient': 0,
            'hiddenSeries': False,
            'id': next(panel_id),
            'legend': {
                'alignAsTable': False,
                'avg': False,
                'current': False,
                'max': False,
                'min': False,
                'rightSide': False,
                'show': True,
                'total': False,
                'values': False,
            },
            'lines': True,
            'linewidth': 1,
            # This is a literal Grafana option name and value, not Python's
            # None; it used to be emitted as 'NonePointMode': 'None'.
            'nullPointMode': 'null',
            'options': {
                'alertThreshold': True,
            },
            'percentage': False,
            'pluginVersion': '7.5.5',
            'pointradius': 2,
            'points': False,
            'renderer': 'flot',
            'seriesOverrides': [],
            'spaceLength': 10,
            'span': span,
            'stack': stack,
            'steppedLine': False,
            'targets': targets,
            'thresholds': [],
            'timeRegions': [],
            'title': title,
            'tooltip': {
                'shared': True,
                'sort': 0,
                'value_type': 'individual',
            },
            'type': 'graph',
            'xaxis': {
                'buckets': None,
                'mode': 'time',
                'name': None,
                'show': True,
                'values': [],
            },
            'yaxes': [
                left_axis,
                {
                    'format': 'short',
                    'label': None,
                    'logBase': 1,
                    'max': None,
                    'min': None,
                    'show': False,
                },
            ],
            'yaxis': {
                'align': False,
                'alignLevel': None,
            },
        }

    return {
        'title': 'zfs',
        'collapse': False,
        'editable': False,
        'height': '250px',
        # Panels are built in list order, so ids are handed out top to bottom.
        'panels': [
            graph_panel(
                'zfs arc usage',
                [
                    flux_target('zfs', 'arcstats_c', rename='target'),
                    flux_target('zfs', 'arcstats_size', rename='used'),
                ],
            ),
            graph_panel(
                'zfs l2arc usage',
                [
                    flux_target('zfs', 'arcstats_l2_size', rename='used'),
                ],
                decimals=0,
            ),
            graph_panel(
                'zfs arc hits/misses',
                [
                    flux_target('zfs', 'arcstats_hits', rename='hits', rate=True),
                    flux_target('zfs', 'arcstats_misses', rename='misses', rate=True),
                    flux_target('zfs', 'arcstats_l2_hits', rename='l2hits', rate=True),
                    flux_target('zfs', 'arcstats_l2_misses', rename='l2misses', rate=True),
                ],
                unit='short',
                decimals=0,
            ),
            graph_panel(
                'zfs usage per dataset',
                [
                    flux_target('zfs_dataset', 'used'),
                    flux_target('zfs_dataset', 'usedsnap'),
                ],
                display_name='${__field.labels.dataset} ${__field.name}',
                span=6,
                stack=True,
            ),
            graph_panel(
                'zfs usage per pool',
                [
                    flux_target('zfs_pool', 'free'),
                    flux_target('zfs_pool', 'size'),
                ],
                display_name='${__field.labels.pool} ${__field.name}',
                span=6,
                decimals=0,
            ),
        ],
    }
#!/usr/bin/env python3
"""Print per-pool and per-dataset ZFS metrics in InfluxDB line protocol.

Calls ``zpool list`` and ``zfs list`` and writes one ``zfs_pool`` line per
pool and one ``zfs_dataset`` line per non-root dataset to stdout.
"""

from subprocess import check_output

# Characters that have to be backslash-escaped inside line protocol tag values.
TAG_ESCAPES = str.maketrans({',': '\\,', '=': '\\=', ' ': '\\ '})


def escape_tag(value):
    """Return ``value`` escaped for use as a line protocol tag value."""
    return value.translate(TAG_ESCAPES)


def run_zfs_tool(command):
    """Run ``command`` with a fixed C locale and return its decoded stdout."""
    return check_output(
        command,
        env={
            'LC_ALL': 'C',
        },
    ).decode('UTF-8')


def pool_metrics(output):
    """Turn ``zpool list -Hpo name,free,size`` output into metric lines.

    Returns a tuple ``(pool_names, lines)``: the set of pool names seen and
    the list of ``zfs_pool`` line protocol lines, in input order.
    """
    pool_names = set()
    lines = []
    for row in output.splitlines():
        if not row:
            continue

        # -H separates columns with a single tab.
        name, free, total = row.split('\t')

        pool_names.add(name)
        lines.append('zfs_pool,pool={} size={}i,free={}i'.format(
            escape_tag(name),
            total,
            free,
        ))
    return pool_names, lines


def dataset_metrics(output, pool_names):
    """Yield ``zfs_dataset`` lines for ``zfs list`` output.

    ``output`` is expected to have the columns
    ``name,usedbydataset,usedsnap,compressratio`` separated by tabs.
    Pool root datasets are skipped, they are covered by the pool metrics.

    Raises:
        Exception: if a dataset belongs to a pool not in ``pool_names``.
    """
    for row in output.splitlines():
        if not row:
            continue

        # Split on tabs only: dataset names may contain spaces.
        name, used, usedsnap, compressratio = row.split('\t')

        if '/' not in name:
            # covered by pool metrics
            continue

        pool = name.split('/')[0]
        if pool not in pool_names:
            raise Exception('BUG: {} in datasets, but {} not in pools'.format(name, pool))

        if compressratio.endswith('x'):
            compressratio = compressratio[:-1]

        yield 'zfs_dataset,pool={},dataset={} used={}i,usedsnap={}i,compressratio={}'.format(
            escape_tag(pool),
            escape_tag(name),
            used,
            usedsnap,
            compressratio,
        )


def main():
    """Query both tools first, then print pool lines followed by dataset lines."""
    pools = run_zfs_tool(
        ['/usr/sbin/zpool', 'list', '-Hpo', 'name,free,size'],
    )
    datasets = run_zfs_tool(
        ['/usr/sbin/zfs', 'list', '-Hpo', 'name,usedbydataset,usedsnap,compressratio'],
    )

    pool_names, pool_lines = pool_metrics(pools)
    for line in pool_lines:
        print(line)

    for line in dataset_metrics(datasets, pool_names):
        print(line)


if __name__ == '__main__':
    main()
node.has_bundle('telegraf'): 'builtin': { 'zfs': [{ 'poolMetrics': True, - 'datasetMetrics': True, }], }, + 'exec': { + 'zfs-dataset': { + 'commands': ['sudo /usr/local/sbin/telegraf-per-dataset'], + 'data_format': 'influx', + 'timeout': '5s', + }, + }, + }, + 'sudo_commands': { + '/usr/local/sbin/telegraf-per-dataset', }, }