encoding: skip utf8 charset validation in some cases (#31061)

close #31014
pingcap · Dec 28, 2021 · 61d13b5
1 parent 6d58e8e
commit 61d13b5

Showing 5 changed files with 19 additions and 14 deletions.
diff --git a/expression/collation.go b/expression/collation.go
@@ -297,7 +297,7 @@ func CheckAndDeriveCollationFromExprs(ctx sessionctx.Context, funcName string, e
 }
 
 func safeConvert(ctx sessionctx.Context, ec *ExprCollation, args ...Expression) bool {
-	enc := charset.FindEncoding(ec.Charset)
+	enc := charset.FindEncodingTakeUTF8AsNoop(ec.Charset)
 	for _, arg := range args {
 		if arg.GetType().Charset == ec.Charset {
 			continue

diff --git a/parser/charset/encoding.go b/parser/charset/encoding.go
@@ -29,6 +29,17 @@ func IsSupportedEncoding(charset string) bool {
 	return ok
 }
 
+// FindEncodingTakeUTF8AsNoop finds the encoding according to the charset
+// except that utf-8 is treated as no-operation encoding. This is used to
+// reduce the overhead of utf-8 validation in some cases.
+func FindEncodingTakeUTF8AsNoop(charset string) Encoding {
+	enc := FindEncoding(charset)
+	if enc.Tp() == EncodingTpUTF8 {
+		return EncodingBinImpl
+	}
+	return enc
+}
+
 // FindEncoding finds the encoding according to charset.
 func FindEncoding(charset string) Encoding {
 	if len(charset) == 0 {
@@ -104,7 +115,7 @@ const (
 )
 
 // CountValidBytes counts the first valid bytes in src that
-// can be encode to the current encoding.
+// can be encoded to the current encoding.
 func CountValidBytes(e Encoding, src []byte) int {
 	nSrc := 0
 	e.Foreach(src, opFromUTF8, func(from, to []byte, ok bool) bool {
@@ -117,7 +128,7 @@ func CountValidBytes(e Encoding, src []byte) int {
 }
 
 // CountValidBytesDecode counts the first valid bytes in src that
-// can be decode to utf-8.
+// can be decoded to utf-8.
 func CountValidBytesDecode(e Encoding, src []byte) int {
 	nSrc := 0
 	e.Foreach(src, opToUTF8, func(from, to []byte, ok bool) bool {

diff --git a/parser/lexer.go b/parser/lexer.go
@@ -146,12 +146,6 @@ func (s *Scanner) AppendWarn(err error) {
 }
 
 func (s *Scanner) tryDecodeToUTF8String(sql string) string {
-	if mysql.IsUTF8Charset(s.encoding.Name()) {
-		// Skip utf8 encoding because `ToUTF8` validates the whole SQL.
-		// This can cause failure when the SQL contains BLOB values.
-		// TODO: Convert charset on every token and use 'binary' encoding to decode token.
-		return sql
-	}
 	utf8Lit, err := s.encoding.Transform(nil, charset.Slice(sql), charset.OpDecodeReplace)
 	if err != nil {
 		s.AppendError(err)

diff --git a/parser/yy_parser.go b/parser/yy_parser.go
@@ -396,7 +396,7 @@ var (
 func resetParams(p *Parser) {
 	p.charset = mysql.DefaultCharset
 	p.collation = mysql.DefaultCollationName
-	p.lexer.encoding = charset.EncodingUTF8Impl
+	p.lexer.encoding = charset.EncodingBinImpl
 }
 
 // ParseParam represents the parameter of parsing.
@@ -436,6 +436,6 @@ type CharsetClient string
 
 // ApplyOn implements ParseParam interface.
 func (c CharsetClient) ApplyOn(p *Parser) error {
-	p.lexer.encoding = charset.FindEncoding(string(c))
+	p.lexer.encoding = charset.FindEncodingTakeUTF8AsNoop(string(c))
 	return nil
 }
diff --git a/server/util.go b/server/util.go
@@ -298,7 +298,7 @@ type inputDecoder struct {
 
 func newInputDecoder(chs string) *inputDecoder {
 	return &inputDecoder{
-		encoding: charset.FindEncoding(chs),
+		encoding: charset.FindEncodingTakeUTF8AsNoop(chs),
 		buffer:   nil,
 	}
 }
@@ -336,7 +336,7 @@ type resultEncoder struct {
 func newResultEncoder(chs string) *resultEncoder {
 	return &resultEncoder{
 		chsName:  chs,
-		encoding: charset.FindEncoding(chs),
+		encoding: charset.FindEncodingTakeUTF8AsNoop(chs),
 		buffer:   nil,
 		isBinary: chs == charset.CharsetBinary,
 		isNull:   len(chs) == 0,
@@ -353,7 +353,7 @@ func (d *resultEncoder) updateDataEncoding(chsID uint16) {
 	if err != nil {
 		logutil.BgLogger().Warn("unknown charset ID", zap.Error(err))
 	}
-	d.dataEncoding = charset.FindEncoding(chs)
+	d.dataEncoding = charset.FindEncodingTakeUTF8AsNoop(chs)
 	d.dataIsBinary = chsID == mysql.BinaryDefaultCollationID
 }