pingcap · ti-chi-bot · May 6, 2023 · May 5, 2023 · May 5, 2023 · May 5, 2023
diff --git a/expression/scalar_function.go b/expression/scalar_function.go
@@ -17,6 +17,7 @@ package expression
 import (
 	"bytes"
 	"fmt"
+	"sort"
 	"unsafe"
 
 	"github.com/pingcap/errors"
@@ -38,9 +39,10 @@ type ScalarFunction struct {
 	FuncName model.CIStr
 	// RetType is the type that ScalarFunction returns.
 	// TODO: Implement type inference here, now we use ast's return type temporarily.
-	RetType  *types.FieldType
-	Function builtinFunc
-	hashcode []byte
+	RetType           *types.FieldType
+	Function          builtinFunc
+	hashcode          []byte
+	canonicalhashcode []byte
 }
 
 // VecEvalInt evaluates this expression in a vectorized manner.
@@ -438,13 +440,129 @@ func (sf *ScalarFunction) EvalJSON(ctx sessionctx.Context, row chunk.Row) (types
 
 // HashCode implements Expression interface.
 func (sf *ScalarFunction) HashCode(sc *stmtctx.StatementContext) []byte {
+	// Guard against a nil sc so that reading the flag cannot panic.
+	if sc != nil && sc.CanonicalHashCode {
+		if len(sf.canonicalhashcode) == 0 {
+			simpleCanonicalizedHashCode(sf, sc)
+		}
+		return sf.canonicalhashcode
+	}
 	if len(sf.hashcode) > 0 {
 		return sf.hashcode
 	}
 	ReHashCode(sf, sc)
 	return sf.hashcode
 }
 
+// ExpressionsSemanticEqual reports whether two expression trees are semantically
+// equivalent, by comparing their canonical hash codes.
+func ExpressionsSemanticEqual(ctx sessionctx.Context, expr1, expr2 Expression) bool {
+	sc := ctx.GetSessionVars().StmtCtx
+	// Restore the flag to the value it had on entry instead of forcing it off.
+	defer func(prev bool) { sc.CanonicalHashCode = prev }(sc.CanonicalHashCode)
+	sc.CanonicalHashCode = true
+	return bytes.Equal(expr1.HashCode(sc), expr2.HashCode(sc))
+}
+
+// simpleCanonicalizedHashCode fills sf.canonicalhashcode with a hash code that is
+// the same for expressions differing only by one of the rewrites below, so that
+// two expressions can be judged semantically equal by comparing their hash codes:
+//   - Plus, Mul, EQ, LogicOr and LogicAnd ignore the order of their arguments;
+//   - In ignores the order of its value list, but not which argument is probed;
+//   - LE is encoded as GE, and LT as GT, with the arguments swapped;
+//   - UnaryNot over GT/LT/GE/LE is encoded as the opposite comparison.
+//
+// Every other function is encoded by its name followed by its arguments' hash codes.
+func simpleCanonicalizedHashCode(sf *ScalarFunction, sc *stmtctx.StatementContext) {
+	// Reuse the previous buffer if there is one; re-slicing a nil slice is fine.
+	sf.canonicalhashcode = append(sf.canonicalhashcode[:0], scalarFunctionFlag)
+
+	args := sf.GetArgs()
+	argsHashCode := make([][]byte, 0, len(args))
+	for _, arg := range args {
+		argsHashCode = append(argsHashCode, arg.HashCode(sc))
+	}
+	// encode appends the function name, then the hash codes in the given order,
+	// or back to front when reversed is set.
+	encode := func(name string, codes [][]byte, reversed bool) {
+		sf.canonicalhashcode = codec.EncodeCompactBytes(sf.canonicalhashcode, hack.Slice(name))
+		for i := range codes {
+			idx := i
+			if reversed {
+				idx = len(codes) - 1 - i
+			}
+			sf.canonicalhashcode = append(sf.canonicalhashcode, codes[idx]...)
+		}
+	}
+	// sortCodes orders hash codes bytewise. The comparison is strict (`< 0`) so
+	// that two equal codes do not each claim to sort before the other.
+	sortCodes := func(codes [][]byte) {
+		sort.Slice(codes, func(i, j int) bool {
+			return bytes.Compare(codes[i], codes[j]) < 0
+		})
+	}
+
+	switch sf.FuncName.L {
+	case ast.Plus, ast.Mul, ast.EQ, ast.LogicOr, ast.LogicAnd:
+		// reorder parameters hashcode, eg: a+b and b+a should have the same hashcode here.
+		sortCodes(argsHashCode)
+		encode(sf.FuncName.L, argsHashCode, false)
+
+	case ast.In:
+		// Only the value list is order insensitive: `a in (b, c)` equals `a in (c, b)`
+		// but not `b in (a, c)`, so the first argument has to stay in front.
+		if len(argsHashCode) > 1 {
+			sortCodes(argsHashCode[1:])
+		}
+		encode(sf.FuncName.L, argsHashCode, false)
+
+	case ast.GE, ast.LE:
+		// directed binary OP: a >= b and b <= a should have the same hashcode,
+		// so always encode the GE function name and switch the args order for LE.
+		encode(ast.GE, argsHashCode, sf.FuncName.L == ast.LE)
+
+	case ast.GT, ast.LT:
+		// likewise a > b and b < a: encode GT and switch the args order for LT.
+		encode(ast.GT, argsHashCode, sf.FuncName.L == ast.LT)
+
+	case ast.UnaryNot:
+		// childName stays empty when the argument is not a scalar function, which
+		// sends it to the default branch of the switch below.
+		var childName string
+		var childArgsHashCode [][]byte
+		if child, ok := args[0].(*ScalarFunction); ok {
+			childName = child.FuncName.L
+			for _, arg := range child.GetArgs() {
+				childArgsHashCode = append(childArgsHashCode, arg.HashCode(sc))
+			}
+		}
+		switch childName {
+		case ast.GT: // not GT  ==> LE  ==> use GE and switch args
+			encode(ast.GE, childArgsHashCode, true)
+		case ast.LT: // not LT  ==> GE
+			encode(ast.GE, childArgsHashCode, false)
+		case ast.GE: // not GE  ==> LT  ==> use GT and switch args
+			encode(ast.GT, childArgsHashCode, true)
+		case ast.LE: // not LE  ==> GT
+			encode(ast.GT, childArgsHashCode, false)
+		default:
+			// Nothing to fold: encode the original function name with the origin
+			// arg hash code. Without this branch, NOT over any other function
+			// would be left as the bare flag byte and so compare equal to every
+			// other such expression.
+			encode(sf.FuncName.L, argsHashCode, false)
+		}
+
+	default:
+		encode(sf.FuncName.L, argsHashCode, false)
+		// Cast is a special case. The RetType should also be considered as an argument.
+		// Please see `newFunctionImpl()` for detail.
+		if sf.FuncName.L == ast.Cast {
+			sf.canonicalhashcode = append(sf.canonicalhashcode, byte(sf.RetType.EvalType()))
+		}
+	}
+}
+
 // ReHashCode is used after we change the argument in place.
 func ReHashCode(sf *ScalarFunction, sc *stmtctx.StatementContext) {
 	sf.hashcode = sf.hashcode[:0]

diff --git a/expression/scalar_function_test.go b/expression/scalar_function_test.go
@@ -27,6 +27,72 @@ import (
 	"github.com/stretchr/testify/require"
 )
 
+// TestExpressionSemanticEqual checks ExpressionsSemanticEqual on expressions
+// built from two columns: pairs it should treat as equal, and pairs it should not.
+func TestExpressionSemanticEqual(t *testing.T) {
+	ctx := mock.NewContext()
+	a := &Column{
+		UniqueID: 1,
+		RetType:  types.NewFieldType(mysql.TypeDouble),
+	}
+	b := &Column{
+		UniqueID: 2,
+		RetType:  types.NewFieldType(mysql.TypeLong),
+	}
+	ltAB, gtBA := newFunction(ast.LT, a, b), newFunction(ast.GT, b, a)
+	gtAB, ltBA := newFunction(ast.GT, a, b), newFunction(ast.LT, b, a)
+	leAB, geBA := newFunction(ast.LE, a, b), newFunction(ast.GE, b, a)
+	geAB, leBA := newFunction(ast.GE, a, b), newFunction(ast.LE, b, a)
+	plusAB, plusBA := newFunction(ast.Plus, a, b), newFunction(ast.Plus, b, a)
+	mulAB, mulBA := newFunction(ast.Mul, a, b), newFunction(ast.Mul, b, a)
+	eqAB, eqBA := newFunction(ast.EQ, a, b), newFunction(ast.EQ, b, a)
+
+	cases := []struct {
+		desc     string
+		lhs, rhs Expression
+		equal    bool
+	}{
+		// order sensitive cases
+		{"a < b; b > a", ltAB, gtBA, true},
+		{"a > b; b < a", gtAB, ltBA, true},
+		{"a <= b; b >= a", leAB, geBA, true},
+		{"a >= b; b <= a", geAB, leBA, true},
+		{"not(a < b); a >= b", newFunction(ast.UnaryNot, ltAB), geAB, true},
+		{"a < b; not(a >= b)", ltAB, newFunction(ast.UnaryNot, geAB), true},
+
+		// order insensitive cases
+		{"a + b; b + a", plusAB, plusBA, true},
+		{"a * b; b * a", mulAB, mulBA, true},
+		{"a = b; b = a", eqAB, eqBA, true},
+		{
+			"a = b AND b + a; a + b AND b = a",
+			newFunction(ast.LogicAnd, eqAB, plusBA),
+			newFunction(ast.LogicAnd, plusAB, eqBA),
+			true,
+		},
+		{
+			"a * b OR a + b; b + a OR b * a",
+			newFunction(ast.LogicOr, mulAB, plusAB),
+			newFunction(ast.LogicOr, plusBA, mulBA),
+			true,
+		},
+
+		// expressions that differ and must not be reported as equal
+		{"a < b; a > b", ltAB, gtAB, false},
+		{"a <= b; a >= b", leAB, geAB, false},
+		{"a < b; a <= b", ltAB, leAB, false},
+		{"a > b; a >= b", gtAB, geAB, false},
+		{"a + b; a * b", plusAB, mulAB, false},
+		{"a + b; a = b", plusAB, eqAB, false},
+		{"not(a < b); a < b", newFunction(ast.UnaryNot, ltAB), ltAB, false},
+		{"not(a < b); not(a > b)", newFunction(ast.UnaryNot, ltAB), newFunction(ast.UnaryNot, gtAB), false},
+	}
+	for _, tc := range cases {
+		got := ExpressionsSemanticEqual(ctx, tc.lhs, tc.rhs)
+		require.Equal(t, tc.equal, got, tc.desc)
+	}
+}
+
 func TestScalarFunction(t *testing.T) {
 	ctx := mock.NewContext()
 	a := &Column{

diff --git a/sessionctx/stmtctx/stmtctx.go b/sessionctx/stmtctx/stmtctx.go
@@ -403,6 +403,8 @@ type StatementContext struct {
 	useChunkAlloc bool
 	// Check if TiFlash read engine is removed due to strict sql mode.
 	TiFlashEngineRemovedDueToStrictSQLMode bool
+	// CanonicalHashCode makes ScalarFunction.HashCode return the canonical hash code of the expression.
+	CanonicalHashCode bool
 }
 
 // StmtHints are SessionVars related sql hints.