1. 问题现象

TIDB日志中频繁的刷以下日志：

[2024/03/13 09:09:40.542 +08:00] [WARN] [cache.go:205] ["lock cached table for read"] [error="previous statement: update mysql.table_cache_meta set lock_type = 'READ', lease = 448342830925742080 where tid = 265617: [kv:9007]Write conflict, txnStartTS=448342833363681305, conflictStartTS=448342833363681293, conflictCommitTS=448342833363681328, key={tableID=57, handle=265617} primary={tableID=57, handle=265617} [try again later]"] [2024/03/13 09:09:40.555 +08:00] [WARN] [session.go:1966] ["run statement failed"] [schemaVersion=313410] [error="previous statement: update mysql.table_cache_meta set lock_type = 'READ', lease = 448342830938849280 where tid = 265617: [kv:9007]Write conflict, txnStartTS=448342833363681354, conflictStartTS=448342833363681331, conflictCommitTS=448342833363681368, key={tableID=57, handle=265617} primary={tableID=57, handle=265617} [try again later]"] [session="{\n "currDBName": "",\n "id": 0,\n "status": 2,\n "strictMode": true,\n "user": null\n}"]

通过监控发现，出现大量的2PC阶段的失败和PrePareWrite 失败的情况。同时整个集群的QPS断崖式下降。

2. 问题分析

从监控上来看大量的请求是处于两阶段提交失败的情况，且失败的表是mysql.table_cache_meta。从6.0.0开始，TIDB新增了cache表功能，对于热点的小表可以缓存在内存中，增加访问的速度。但是为了保证各个TIDB节点之间的缓存一致性，TIDB实现了一个lease机制(来自官网文档):

缓存表的写入延时高是受到实现的限制。存在多个 TiDB 实例时，一个 TiDB 实例并不知道其它的 TiDB 实例是否缓存了数据，如果该实例直接修改了表数据，而其它 TiDB 实例依然读取旧的缓存数据，就会读到错误的结果。为了保证数据正确性，缓存表的实现使用了一套基于 lease 的复杂机制：读操作在缓存数据同时，还会对于缓存设置一个有效期，也就是 lease。在 lease 过期之前，无法对数据执行修改操作。因为修改操作必须等待 lease 过期，所以会出现写入延迟。

这里可以猜测引起问题的是在更新lease的阶段。在SQL Build的阶段会调用getCacheTable ：

func (b *executorBuilder) getCacheTable(tblInfo *model.TableInfo, startTS uint64) kv.MemBuffer {
	tbl, ok := b.is.TableByID(tblInfo.ID)
	if !ok {
		b.err = errors.Trace(infoschema.ErrTableNotExists.GenWithStackByArgs(b.ctx.GetSessionVars().CurrentDB, tblInfo.Name))
		return nil
	}
	sessVars := b.ctx.GetSessionVars()
	leaseDuration := time.Duration(variable.TableCacheLease.Load()) * time.Second
	cacheData, loading := tbl.(table.CachedTable).TryReadFromCache(startTS, leaseDuration)
	if cacheData != nil {
		sessVars.StmtCtx.ReadFromTableCache = true
		return cacheData
	} else if loading {
		// continue
	} else {
		if !b.ctx.GetSessionVars().StmtCtx.InExplainStmt && !b.inDeleteStmt && !b.inUpdateStmt {
			tbl.(table.CachedTable).UpdateLockForRead(context.Background(), b.ctx.GetStore(), startTS, leaseDuration)
		}
	}
	return nil
}

这里有2个地方会去更新lease :

TryReadFromCache
UpdateLockForRead

在TryReadFromCache 中，当剩余过期时间小于1/2的lease的时候，会去尝试更新lease:

func (c *cachedTable) TryReadFromCache(ts uint64, leaseDuration time.Duration) (kv.MemBuffer, bool /*loading*/) {
	tmp := c.cacheData.Load()
	if tmp == nil {
		return nil, false
	}
	data := tmp.(*cacheData)
	if ts >= data.Start && ts < data.Lease {
		... ... 
		if distance >= 0 && distance <= leaseDuration/2 || triggerFailpoint {
			if h := c.TakeStateRemoteHandleNoWait(); h != nil {
				go c.renewLease(h, ts, data, leaseDuration)
			}
		}
		// If data is not nil, but data.MemBuffer is nil, it means the data is being
		// loading by a background goroutine.
		return data.MemBuffer, data.MemBuffer == nil
	}
	return nil, false
}

func (c *cachedTable) renewLease(handle StateRemote, ts uint64, data *cacheData, leaseDuration time.Duration) {
	... ...
	tid := c.Meta().ID
	lease := leaseFromTS(ts, leaseDuration)
	newLease, err := handle.RenewReadLease(context.Background(), tid, data.Lease, lease)
	... ...
}

func (h *stateRemoteHandle) RenewReadLease(ctx context.Context, tid int64, oldLocalLease, newValue uint64) (uint64, error) {
	var newLease uint64
	err := h.runInTxn(ctx, false, func(ctx context.Context, now uint64) error {
		lockType, remoteLease, _, err := h.loadRow(ctx, tid, false)
		if err != nil {
			return errors.Trace(err)
		}

		if now >= remoteLease {
			// read lock had already expired, fail to renew
			return nil
		}
		if lockType != CachedTableLockRead {
			// Not read lock, fail to renew
			return nil
		}

		// It means that the lease had already been changed by some other TiDB instances.
		if oldLocalLease != remoteLease {
			// 1. Data in [cacheDataTS -------- oldLocalLease) time range is also immutable.
			// 2. Data in [              now ------------------- remoteLease) time range is immutable.
			//
			// If now < oldLocalLease, it means data in all the time range is immutable,
			// so the old cache data is still available.
			if now < oldLocalLease {
				newLease = remoteLease
			}
			// Otherwise, there might be write operation during the oldLocalLease and the new remoteLease
			// Make renew lease operation fail.
			return nil
		}

		if newValue > remoteLease { // lease should never decrease!
			err = h.updateRow(ctx, tid, "READ", newValue)
			if err != nil {
				return errors.Trace(err)
			}
			newLease = newValue
		} else {
			newLease = remoteLease
		}
		return nil
	})

	return newLease, err
}

func (h *stateRemoteHandle) updateRow(ctx context.Context, tid int64, lockType string, lease uint64) error {
	_, err := h.execSQL(ctx, "update mysql.table_cache_meta set lock_type = %?, lease = %? where tid = %?", lockType, lease, tid)
	return err
}

如果在Build阶段发现lease过期了，则会调用UpdateLockForRead进行更新:

func (c *cachedTable) UpdateLockForRead(ctx context.Context, store kv.Storage, ts uint64, leaseDuration time.Duration) {
	if h := c.TakeStateRemoteHandle(); h != nil {
		go c.updateLockForRead(ctx, h, store, ts, leaseDuration)
	}
}

func (c *cachedTable) updateLockForRead(ctx context.Context, handle StateRemote, store kv.Storage, ts uint64, leaseDuration time.Duration) {
	... ... 

	// Load data from original table and the update lock information.
	tid := c.Meta().ID
	lease := leaseFromTS(ts, leaseDuration)
	succ, err := handle.LockForRead(ctx, tid, lease)
	if err != nil {
		log.Warn("lock cached table for read", zap.Error(err))
		return
	}
	... ... 
	// Current status is not suitable to cache.
}

func (h *stateRemoteHandle) LockForRead(ctx context.Context, tid int64, newLease uint64) ( /*succ*/ bool, error) {
	succ := false
	... ...

	err := h.runInTxn(ctx, false, func(ctx context.Context, now uint64) error {
		lockType, lease, _, err := h.loadRow(ctx, tid, false)
		if err != nil {
			return errors.Trace(err)
		}
		// The old lock is outdated, clear orphan lock.
		if now > lease {
			succ = true
			if err := h.updateRow(ctx, tid, "READ", newLease); err != nil {
				return errors.Trace(err)
			}
			return nil
		}

		switch lockType {
		case CachedTableLockNone:
		case CachedTableLockRead:
		case CachedTableLockWrite, CachedTableLockIntend:
			return nil
		}
		succ = true
		if newLease > lease { // Note the check, don't decrease lease value!
			if err := h.updateRow(ctx, tid, "READ", newLease); err != nil {
				return errors.Trace(err)
			}
		}

		return nil
	})
	return succ, err
}

func (h *stateRemoteHandle) updateRow(ctx context.Context, tid int64, lockType string, lease uint64) error {
	_, err := h.execSQL(ctx, "update mysql.table_cache_meta set lock_type = %?, lease = %? where tid = %?", lockType, lease, tid)
	return err
}

可以看到两个对lease的更新点，最后都会统一调用updateRow，并且在txn(事务中进行)。因为系统的默认tidb_table_cache_lease是3s，也就是说在1.5s之后，当select 语句使用到缓存表的时候，就会开始更新table_cache_meta表。

正常情况下往数据库系统的业务QPS可以达到40K-80K，因此每隔1.5s左右就需要更新一次lease，更新期间其他相同的请求会因为Write conflict而失败，因此导致整个系统的QPS断崖式下降。

问题处理

从分析来看有2个处理办法:

提高tidb_table_cache_lease的值: 不过根据分析来看，当剩余1/2 Lease的时候，在高QPS下，还是可能导致QPS抖动。
针对cache表，去掉cache，直接中TIKV读取数据。

当日在线上的处理操作是紧急去掉了cache ,系统即恢复正常。

参考文献

一篇文章说透缓存表 : https://tidb.net/blog/f663f0f5

TiDB v6.0.0 DMR 源码阅读——缓存表: https://tidb.net/book/book-rush/features/new-features/new-cache-tables

Tidb 6.1.4 中 table cache 引发集群 QPS 异常问题解析

1. 问题现象

2. 问题分析

问题处理

参考文献