From 61d13b561c46839b589858c02399c95c891a08df Mon Sep 17 00:00:00 2001 From: tangenta Date: Tue, 28 Dec 2021 17:51:49 +0800 Subject: [PATCH] encoding: skip utf8 charset validation in some cases (#31061) close pingcap/tidb#31014 --- expression/collation.go | 2 +- parser/charset/encoding.go | 15 +++++++++++++-- parser/lexer.go | 6 ------ parser/yy_parser.go | 4 ++-- server/util.go | 6 +++--- 5 files changed, 19 insertions(+), 14 deletions(-) diff --git a/expression/collation.go b/expression/collation.go index 813560775e2b4..edfba16197895 100644 --- a/expression/collation.go +++ b/expression/collation.go @@ -297,7 +297,7 @@ func CheckAndDeriveCollationFromExprs(ctx sessionctx.Context, funcName string, e } func safeConvert(ctx sessionctx.Context, ec *ExprCollation, args ...Expression) bool { - enc := charset.FindEncoding(ec.Charset) + enc := charset.FindEncodingTakeUTF8AsNoop(ec.Charset) for _, arg := range args { if arg.GetType().Charset == ec.Charset { continue diff --git a/parser/charset/encoding.go b/parser/charset/encoding.go index bf3d6b8ff269c..eb9cef46ecc24 100644 --- a/parser/charset/encoding.go +++ b/parser/charset/encoding.go @@ -29,6 +29,17 @@ func IsSupportedEncoding(charset string) bool { return ok } +// FindEncodingTakeUTF8AsNoop finds the encoding according to the charset +// except that utf-8 is treated as no-operation encoding. This is used to +// reduce the overhead of utf-8 validation in some cases. +func FindEncodingTakeUTF8AsNoop(charset string) Encoding { + enc := FindEncoding(charset) + if enc.Tp() == EncodingTpUTF8 { + return EncodingBinImpl + } + return enc +} + // FindEncoding finds the encoding according to charset. func FindEncoding(charset string) Encoding { if len(charset) == 0 { @@ -104,7 +115,7 @@ const ( ) // CountValidBytes counts the first valid bytes in src that -// can be encode to the current encoding. +// can be encoded to the current encoding. func CountValidBytes(e Encoding, src []byte) int { nSrc := 0 e.Foreach(src, opFromUTF8, func(from, to []byte, ok bool) bool { @@ -117,7 +128,7 @@ func CountValidBytes(e Encoding, src []byte) int { } // CountValidBytesDecode counts the first valid bytes in src that -// can be decode to utf-8. +// can be decoded to utf-8. func CountValidBytesDecode(e Encoding, src []byte) int { nSrc := 0 e.Foreach(src, opToUTF8, func(from, to []byte, ok bool) bool { diff --git a/parser/lexer.go b/parser/lexer.go index c274a53f9f049..543707627dbe1 100644 --- a/parser/lexer.go +++ b/parser/lexer.go @@ -146,12 +146,6 @@ func (s *Scanner) AppendWarn(err error) { } func (s *Scanner) tryDecodeToUTF8String(sql string) string { - if mysql.IsUTF8Charset(s.encoding.Name()) { - // Skip utf8 encoding because `ToUTF8` validates the whole SQL. - // This can cause failure when the SQL contains BLOB values. - // TODO: Convert charset on every token and use 'binary' encoding to decode token. - return sql - } utf8Lit, err := s.encoding.Transform(nil, charset.Slice(sql), charset.OpDecodeReplace) if err != nil { s.AppendError(err) diff --git a/parser/yy_parser.go b/parser/yy_parser.go index 58e18083b28cb..5fedd0a1afb48 100644 --- a/parser/yy_parser.go +++ b/parser/yy_parser.go @@ -396,7 +396,7 @@ var ( func resetParams(p *Parser) { p.charset = mysql.DefaultCharset p.collation = mysql.DefaultCollationName - p.lexer.encoding = charset.EncodingUTF8Impl + p.lexer.encoding = charset.EncodingBinImpl } // ParseParam represents the parameter of parsing. @@ -436,6 +436,6 @@ type CharsetClient string // ApplyOn implements ParseParam interface. func (c CharsetClient) ApplyOn(p *Parser) error { - p.lexer.encoding = charset.FindEncoding(string(c)) + p.lexer.encoding = charset.FindEncodingTakeUTF8AsNoop(string(c)) return nil } diff --git a/server/util.go b/server/util.go index 3bf27665e4d5c..8314f7d6cfd05 100644 --- a/server/util.go +++ b/server/util.go @@ -298,7 +298,7 @@ type inputDecoder struct { func newInputDecoder(chs string) *inputDecoder { return &inputDecoder{ - encoding: charset.FindEncoding(chs), + encoding: charset.FindEncodingTakeUTF8AsNoop(chs), buffer: nil, } } @@ -336,7 +336,7 @@ type resultEncoder struct { func newResultEncoder(chs string) *resultEncoder { return &resultEncoder{ chsName: chs, - encoding: charset.FindEncoding(chs), + encoding: charset.FindEncodingTakeUTF8AsNoop(chs), buffer: nil, isBinary: chs == charset.CharsetBinary, isNull: len(chs) == 0, @@ -353,7 +353,7 @@ func (d *resultEncoder) updateDataEncoding(chsID uint16) { if err != nil { logutil.BgLogger().Warn("unknown charset ID", zap.Error(err)) } - d.dataEncoding = charset.FindEncoding(chs) + d.dataEncoding = charset.FindEncodingTakeUTF8AsNoop(chs) d.dataIsBinary = chsID == mysql.BinaryDefaultCollationID }