背景
在 TiDB 的 7.x 引入 record-db-label 新特性 https://github.com/pingcap/tidb/pull/41477,指标项添加 db 和 resource group 的 label,实现对 db 和资源组的指标监控,取代了 record-db-qps(存在已知的 bug)。
操作过程
- 配置和启用
tiup cluster edit-config {cluster-name}
,添加 record-db-label
配置项:
...
server_configs:
tidb:
status.record-db-label: true
...
tiup cluster reload {cluster-name} -R tidb
mysql> show config where name like "%record-db-label%";
+------+------------------+------------------------+-------+
| Type | Instance | Name | Value |
+------+------------------+------------------------+-------+
| tidb | 10.2.103.64:4101 | status.record-db-label | true |
- 查看数据
在 prometheus 的 web UI 上,可以查看 tidb_executor_statement_total、tidb_server_handle_query_duration_seconds、tidb_server_execute_error_total 是否已经有数据,并且有 db 和 resource group 的分类。
- 配置 grafana 指标项
新特性 record-db-label 在 tidb cloud serverless 上使用,从任一 serverless 集群的监控项上拷贝其公式。
简单修改后得到以下三条 grafana 公式
QPS Per DB
sum(rate(tidb_executor_statement_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance", db!=""}[1m])) by (db,type)
Average Query Duration Per DB
sum(rate(tidb_server_handle_query_duration_seconds_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance", db!=""}[1m])) by (db) / sum(rate(tidb_server_handle_query_duration_seconds_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance", db!=""}[1m])) by (db)
Failed Query Per DB
sum(rate(tidb_server_execute_error_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance", db!=""}[1m])) by (db)
编辑 Grafana 面板,vi /tidb-deploy/grafana-3000/bin/tidb.json
插入位置第 57 行
...
"id": null,
"iteration": 1655990780337,
"links": [],
"panels": [
插入内容
{
"collapsed": true,
"datasource": null,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 316,
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_TEST-CLUSTER}",
"decimals": null,
"description": "The number of SQL statements executed per second on every Database, which are collected by SQL types.",
"editable": true,
"error": false,
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 6,
"w": 12,
"x": 0,
"y": 7
},
"hiddenSeries": false,
"id": 310,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"hideEmpty": true,
"hideZero": true,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"sort": null,
"sortDesc": null,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null as zero",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.11",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(tidb_executor_statement_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db!=\"\"}[1m])) by (db,type)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{db}}-{{type}}",
"refId": "A",
"step": 30
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "QPS",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 2,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_TEST-CLUSTER}",
"decimals": null,
"description": "The duration from receiving a request from the client to a database until the database executes the request and returns the result to the client.",
"editable": true,
"error": false,
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 6,
"w": 12,
"x": 12,
"y": 7
},
"hiddenSeries": false,
"id": 312,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"hideEmpty": true,
"hideZero": true,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"sort": null,
"sortDesc": null,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null as zero",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.11",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(tidb_server_handle_query_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db!=\"\"}[1m])) by (db) / sum(rate(tidb_server_handle_query_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db!=\"\"}[1m])) by (db)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "avg-{{db}}",
"refId": "A",
"step": 30
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Average Query Duration Per DB",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "s",
"label": null,
"logBase": 2,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_TEST-CLUSTER}",
"decimals": null,
"description": "The statistics of error types according to the SQL statement execution errors per second on every database.",
"editable": true,
"error": false,
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 6,
"w": 12,
"x": 0,
"y": 7
},
"hiddenSeries": false,
"id": 314,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"hideEmpty": true,
"hideZero": true,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"sort": null,
"sortDesc": null,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null as zero",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.11",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(tidb_server_execute_error_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db!=\"\"}[1m])) by (db)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{db}}",
"refId": "A",
"step": 30
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Failed Query Per DB",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 2,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"repeat": null,
"title": "Per DB Detail",
"type": "row"
},
tiup cluster reload {cluster-name} -R grafana
- 实现效果
- 手工编辑公式可查看 resource group 指标
小结
新特性 record-db-label 具有更加丰富的指标项和维度,在 resource control 场景中,可以提供细粒度的监控指标功能。