Skip to content

Commit

Permalink
encoding: skip utf8 charset validation in some cases (#31061)
Browse files Browse the repository at this point in the history
close #31014
  • Loading branch information
tangenta committed Dec 28, 2021
1 parent 6d58e8e commit 61d13b5
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 14 deletions.
2 changes: 1 addition & 1 deletion expression/collation.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ func CheckAndDeriveCollationFromExprs(ctx sessionctx.Context, funcName string, e
}

func safeConvert(ctx sessionctx.Context, ec *ExprCollation, args ...Expression) bool {
enc := charset.FindEncoding(ec.Charset)
enc := charset.FindEncodingTakeUTF8AsNoop(ec.Charset)
for _, arg := range args {
if arg.GetType().Charset == ec.Charset {
continue
Expand Down
15 changes: 13 additions & 2 deletions parser/charset/encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,17 @@ func IsSupportedEncoding(charset string) bool {
return ok
}

// FindEncodingTakeUTF8AsNoop resolves charset through FindEncoding, but
// substitutes EncodingBinImpl whenever the resolved encoding reports the
// type EncodingTpUTF8. Every other encoding is returned unchanged. This
// variant exists to reduce the overhead of utf-8 validation in some cases.
func FindEncodingTakeUTF8AsNoop(charset string) Encoding {
	switch resolved := FindEncoding(charset); resolved.Tp() {
	case EncodingTpUTF8:
		// utf-8 is deliberately treated as a no-operation encoding here.
		return EncodingBinImpl
	default:
		return resolved
	}
}

// FindEncoding finds the encoding according to charset.
func FindEncoding(charset string) Encoding {
if len(charset) == 0 {
Expand Down Expand Up @@ -104,7 +115,7 @@ const (
)

// CountValidBytes counts the first valid bytes in src that
// can be encode to the current encoding.
// can be encoded to the current encoding.
func CountValidBytes(e Encoding, src []byte) int {
nSrc := 0
e.Foreach(src, opFromUTF8, func(from, to []byte, ok bool) bool {
Expand All @@ -117,7 +128,7 @@ func CountValidBytes(e Encoding, src []byte) int {
}

// CountValidBytesDecode counts the first valid bytes in src that
// can be decode to utf-8.
// can be decoded to utf-8.
func CountValidBytesDecode(e Encoding, src []byte) int {
nSrc := 0
e.Foreach(src, opToUTF8, func(from, to []byte, ok bool) bool {
Expand Down
6 changes: 0 additions & 6 deletions parser/lexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,12 +146,6 @@ func (s *Scanner) AppendWarn(err error) {
}

func (s *Scanner) tryDecodeToUTF8String(sql string) string {
if mysql.IsUTF8Charset(s.encoding.Name()) {
// Skip utf8 encoding because `ToUTF8` validates the whole SQL.
// This can cause failure when the SQL contains BLOB values.
// TODO: Convert charset on every token and use 'binary' encoding to decode token.
return sql
}
utf8Lit, err := s.encoding.Transform(nil, charset.Slice(sql), charset.OpDecodeReplace)
if err != nil {
s.AppendError(err)
Expand Down
4 changes: 2 additions & 2 deletions parser/yy_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ var (
// resetParams sets the parser's charset and collation to
// mysql.DefaultCharset and mysql.DefaultCollationName, and assigns the
// lexer's encoding.
//
// NOTE(review): this listing is a rendered diff without +/- markers (the
// file header reports "2 additions & 2 deletions"), so the two consecutive
// p.lexer.encoding assignments are presumably the removed line
// (EncodingUTF8Impl) followed by its replacement (EncodingBinImpl) —
// confirm against the actual file.
func resetParams(p *Parser) {
p.charset = mysql.DefaultCharset
p.collation = mysql.DefaultCollationName
p.lexer.encoding = charset.EncodingUTF8Impl
p.lexer.encoding = charset.EncodingBinImpl
}

// ParseParam represents the parameter of parsing.
Expand Down Expand Up @@ -436,6 +436,6 @@ type CharsetClient string

// ApplyOn implements ParseParam interface.
// It converts c to a string, looks up the matching encoding in the charset
// package, and stores it as the lexer's encoding. It always returns nil.
//
// NOTE(review): this listing is a rendered diff without +/- markers, so the
// two assignments below are presumably the removed call (FindEncoding) and
// its replacement (FindEncodingTakeUTF8AsNoop) — confirm against the file.
func (c CharsetClient) ApplyOn(p *Parser) error {
p.lexer.encoding = charset.FindEncoding(string(c))
p.lexer.encoding = charset.FindEncodingTakeUTF8AsNoop(string(c))
return nil
}
6 changes: 3 additions & 3 deletions server/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ type inputDecoder struct {

// newInputDecoder returns a new inputDecoder whose encoding is looked up
// from the charset name chs and whose buffer starts out nil.
//
// NOTE(review): this listing is a rendered diff without +/- markers, so the
// two `encoding:` entries are presumably the removed line (FindEncoding)
// and its replacement (FindEncodingTakeUTF8AsNoop); only one of them can
// exist in the real struct literal — confirm against the actual file.
func newInputDecoder(chs string) *inputDecoder {
return &inputDecoder{
encoding: charset.FindEncoding(chs),
encoding: charset.FindEncodingTakeUTF8AsNoop(chs),
buffer: nil,
}
}
Expand Down Expand Up @@ -336,7 +336,7 @@ type resultEncoder struct {
func newResultEncoder(chs string) *resultEncoder {
return &resultEncoder{
chsName: chs,
encoding: charset.FindEncoding(chs),
encoding: charset.FindEncodingTakeUTF8AsNoop(chs),
buffer: nil,
isBinary: chs == charset.CharsetBinary,
isNull: len(chs) == 0,
Expand All @@ -353,7 +353,7 @@ func (d *resultEncoder) updateDataEncoding(chsID uint16) {
if err != nil {
logutil.BgLogger().Warn("unknown charset ID", zap.Error(err))
}
d.dataEncoding = charset.FindEncoding(chs)
d.dataEncoding = charset.FindEncodingTakeUTF8AsNoop(chs)
d.dataIsBinary = chsID == mysql.BinaryDefaultCollationID
}

Expand Down

0 comments on commit 61d13b5

Please sign in to comment.