0
1
0
0
专栏/.../

查看 DB 和 RG 级别的监控指标--新特性 record-db-label

 pepezzzz  发表于  2024-02-17

背景

TiDB 在 7.x 版本引入了新特性 record-db-label(https://github.com/pingcap/tidb/pull/41477):为指标项添加 db 和 resource group 的 label,实现对 db 和资源组的指标监控,并取代了存在已知 bug 的 record-db-qps。

操作过程

  1. 配置和启用

tiup cluster edit-config {cluster-name},添加 record-db-label 配置项:

...
server_configs:
  tidb:
    status.record-db-label: true
...

tiup cluster reload {cluster-name} -R tidb

mysql> show config where name like "%record-db-label%";
+------+------------------+------------------------+-------+
| Type | Instance         | Name                   | Value |
+------+------------------+------------------------+-------+
| tidb | 10.2.103.64:4101 | status.record-db-label | true  |
+------+------------------+------------------------+-------+

  2. 查看数据

在 Prometheus 的 Web UI 上,可以查看 tidb_executor_statement_total、tidb_server_handle_query_duration_seconds、tidb_server_execute_error_total 是否已经有数据,以及这些指标是否已经带有 db 和 resource group 的 label。

image.png

  3. 配置 Grafana 指标项

新特性 record-db-label 已在 TiDB Cloud Serverless 上使用,因此可以从任一 Serverless 集群的监控项上拷贝其公式。

image.png

简单修改后得到以下三条 Grafana 公式:

QPS Per DB
sum(rate(tidb_executor_statement_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance", db!=""}[1m])) by (db,type)
Average Query Duration Per DB
sum(rate(tidb_server_handle_query_duration_seconds_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance", db!=""}[1m])) by (db) / sum(rate(tidb_server_handle_query_duration_seconds_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance", db!=""}[1m])) by (db) 
Failed Query Per DB
sum(rate(tidb_server_execute_error_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance", db!=""}[1m])) by (db)

编辑 Grafana 面板,vi /tidb-deploy/grafana-3000/bin/tidb.json

插入位置:第 57 行,即下面片段中 "panels": [ 一行之后:

...
"id": null,
"iteration": 1655990780337,
"links": [],
"panels": [

插入内容

		{
			"collapsed": true,
			"datasource": null,
			"gridPos": {
				"h": 1,
				"w": 24,
				"x": 0,
				"y": 0
			},
			"id": 316,
			"panels": [
				{
					"aliasColors": {},
					"bars": false,
					"dashLength": 10,
					"dashes": false,
					"datasource": "${DS_TEST-CLUSTER}",
					"decimals": null,
					"description": "The number of SQL statements executed per second on every Database, which are collected by SQL types.",
					"editable": true,
					"error": false,
					"fieldConfig": {
						"defaults": {},
						"overrides": []
					},
					"fill": 1,
					"fillGradient": 0,
					"grid": {},
					"gridPos": {
						"h": 6,
						"w": 12,
						"x": 0,
						"y": 7
					},
					"hiddenSeries": false,
					"id": 310,
					"legend": {
						"alignAsTable": true,
						"avg": true,
						"current": true,
						"hideEmpty": true,
						"hideZero": true,
						"max": true,
						"min": false,
						"rightSide": true,
						"show": true,
						"sort": null,
						"sortDesc": null,
						"total": false,
						"values": true
					},
					"lines": true,
					"linewidth": 1,
					"links": [],
					"nullPointMode": "null as zero",
					"options": {
						"alertThreshold": true
					},
					"percentage": false,
					"pluginVersion": "7.5.11",
					"pointradius": 5,
					"points": false,
					"renderer": "flot",
					"seriesOverrides": [],
					"spaceLength": 10,
					"stack": false,
					"steppedLine": false,
					"targets": [
						{
							"expr": "sum(rate(tidb_executor_statement_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db!=\"\"}[1m])) by (db,type)",
							"format": "time_series",
							"intervalFactor": 2,
							"legendFormat": "{{db}}-{{type}}",
							"refId": "A",
							"step": 30
						}
					],
					"thresholds": [],
					"timeFrom": null,
					"timeRegions": [],
					"timeShift": null,
					"title": "QPS Per DB",
					"tooltip": {
						"msResolution": false,
						"shared": true,
						"sort": 0,
						"value_type": "individual"
					},
					"type": "graph",
					"xaxis": {
						"buckets": null,
						"mode": "time",
						"name": null,
						"show": true,
						"values": []
					},
					"yaxes": [
						{
							"format": "short",
							"label": null,
							"logBase": 2,
							"max": null,
							"min": "0",
							"show": true
						},
						{
							"format": "short",
							"label": null,
							"logBase": 1,
							"max": null,
							"min": null,
							"show": true
						}
					],
					"yaxis": {
						"align": false,
						"alignLevel": null
					}
				},
				{
					"aliasColors": {},
					"bars": false,
					"dashLength": 10,
					"dashes": false,
					"datasource": "${DS_TEST-CLUSTER}",
					"decimals": null,
					"description": "The duration from receiving a request from the client to a database until the database executes the request and returns the result to the client.",
					"editable": true,
					"error": false,
					"fieldConfig": {
						"defaults": {},
						"overrides": []
					},
					"fill": 1,
					"fillGradient": 0,
					"grid": {},
					"gridPos": {
						"h": 6,
						"w": 12,
						"x": 12,
						"y": 7
					},
					"hiddenSeries": false,
					"id": 312,
					"legend": {
						"alignAsTable": true,
						"avg": true,
						"current": true,
						"hideEmpty": true,
						"hideZero": true,
						"max": true,
						"min": false,
						"rightSide": true,
						"show": true,
						"sort": null,
						"sortDesc": null,
						"total": false,
						"values": true
					},
					"lines": true,
					"linewidth": 1,
					"links": [],
					"nullPointMode": "null as zero",
					"options": {
						"alertThreshold": true
					},
					"percentage": false,
					"pluginVersion": "7.5.11",
					"pointradius": 5,
					"points": false,
					"renderer": "flot",
					"seriesOverrides": [],
					"spaceLength": 10,
					"stack": false,
					"steppedLine": false,
					"targets": [
						{
							"expr": "sum(rate(tidb_server_handle_query_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db!=\"\"}[1m])) by (db) / sum(rate(tidb_server_handle_query_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db!=\"\"}[1m])) by (db)",
							"format": "time_series",
							"intervalFactor": 2,
							"legendFormat": "avg-{{db}}",
							"refId": "A",
							"step": 30
						}
					],
					"thresholds": [],
					"timeFrom": null,
					"timeRegions": [],
					"timeShift": null,
					"title": "Average Query Duration Per DB",
					"tooltip": {
						"msResolution": false,
						"shared": true,
						"sort": 0,
						"value_type": "individual"
					},
					"type": "graph",
					"xaxis": {
						"buckets": null,
						"mode": "time",
						"name": null,
						"show": true,
						"values": []
					},
					"yaxes": [
						{
							"format": "s",
							"label": null,
							"logBase": 2,
							"max": null,
							"min": "0",
							"show": true
						},
						{
							"format": "short",
							"label": null,
							"logBase": 1,
							"max": null,
							"min": null,
							"show": true
						}
					],
					"yaxis": {
						"align": false,
						"alignLevel": null
					}
				},
				{
					"aliasColors": {},
					"bars": false,
					"dashLength": 10,
					"dashes": false,
					"datasource": "${DS_TEST-CLUSTER}",
					"decimals": null,
					"description": "The statistics of error types according to the SQL statement execution errors per second on every database.",
					"editable": true,
					"error": false,
					"fieldConfig": {
						"defaults": {},
						"overrides": []
					},
					"fill": 1,
					"fillGradient": 0,
					"grid": {},
					"gridPos": {
						"h": 6,
						"w": 12,
						"x": 0,
						"y": 7
					},
					"hiddenSeries": false,
					"id": 314,
					"legend": {
						"alignAsTable": true,
						"avg": true,
						"current": true,
						"hideEmpty": true,
						"hideZero": true,
						"max": true,
						"min": false,
						"rightSide": true,
						"show": true,
						"sort": null,
						"sortDesc": null,
						"total": false,
						"values": true
					},
					"lines": true,
					"linewidth": 1,
					"links": [],
					"nullPointMode": "null as zero",
					"options": {
						"alertThreshold": true
					},
					"percentage": false,
					"pluginVersion": "7.5.11",
					"pointradius": 5,
					"points": false,
					"renderer": "flot",
					"seriesOverrides": [],
					"spaceLength": 10,
					"stack": false,
					"steppedLine": false,
					"targets": [
						{
							"expr": "sum(rate(tidb_server_execute_error_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db!=\"\"}[1m])) by (db)",
							"format": "time_series",
							"intervalFactor": 2,
							"legendFormat": "{{db}}",
							"refId": "A",
							"step": 30
						}
					],
					"thresholds": [],
					"timeFrom": null,
					"timeRegions": [],
					"timeShift": null,
					"title": "Failed Query Per DB",
					"tooltip": {
						"msResolution": false,
						"shared": true,
						"sort": 0,
						"value_type": "individual"
					},
					"type": "graph",
					"xaxis": {
						"buckets": null,
						"mode": "time",
						"name": null,
						"show": true,
						"values": []
					},
					"yaxes": [
						{
							"format": "short",
							"label": null,
							"logBase": 2,
							"max": null,
							"min": "0",
							"show": true
						},
						{
							"format": "short",
							"label": null,
							"logBase": 1,
							"max": null,
							"min": null,
							"show": true
						}
					],
					"yaxis": {
						"align": false,
						"alignLevel": null
					}
				}
			],
			"repeat": null,
			"title": "Per DB Detail",
			"type": "row"
		},

tiup cluster reload {cluster-name} -R grafana

  4. 实现效果

image.png

  5. 手工编辑公式可查看 resource group 指标

image.png

小结

新特性 record-db-label 提供了更加丰富的指标项和维度,在 resource control 场景中,可以实现 db 和资源组级别的细粒度指标监控。

0
1
0
0

版权声明:本文为 TiDB 社区用户原创文章,遵循 CC BY-NC-SA 4.0 版权协议,转载请附上原文出处链接和本声明。

评论
暂无评论