Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

load data: fix bug if load data with long content #29222

Merged
merged 5 commits into from
Dec 1, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 6 additions & 9 deletions executor/load_data.go
Original file line number Diff line number Diff line change
Expand Up @@ -419,9 +419,9 @@ func (e *LoadDataInfo) isInQuoter(bs []byte) bool {
return inQuoter
}

// indexOfTerminator return index of terminator, if not, return -1.
// IndexOfTerminator return index of terminator, if not, return -1.
// normally, the field terminator and line terminator is short, so we just use brute force algorithm.
func (e *LoadDataInfo) indexOfTerminator(bs []byte, isInQuoter bool) int {
func (e *LoadDataInfo) IndexOfTerminator(bs []byte, inQuoter bool) int {
fieldTerm := []byte(e.FieldsInfo.Terminated)
fieldTermLen := len(fieldTerm)
lineTerm := []byte(e.LinesInfo.Terminated)
Expand Down Expand Up @@ -459,13 +459,10 @@ func (e *LoadDataInfo) indexOfTerminator(bs []byte, isInQuoter bool) int {
}
}
atFieldStart := true
inQuoter := false
loop:
for i := 0; i < len(bs); i++ {
if atFieldStart && bs[i] == e.FieldsInfo.Enclosed {
if !isInQuoter {
inQuoter = true
}
inQuoter = !inQuoter
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So all cases where one string crosses multiple lines will cause an error, right?
Is it possible to verify that in the test case?

Copy link
Contributor Author

@xiongjiwei xiongjiwei Nov 30, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not all cases, this error will only happen if one string is split into 2 parts and crosses multiple lines. e.g. first part "123 second part 456\n789" the \n in the second part should not interpret as new lines.

the test case No.4 test this scenario

atFieldStart = false
continue
}
Expand Down Expand Up @@ -525,7 +522,7 @@ func (e *LoadDataInfo) getLine(prevData, curData []byte, ignore bool) ([]byte, [
if ignore {
endIdx = strings.Index(string(hack.String(curData[curStartIdx:])), e.LinesInfo.Terminated)
} else {
endIdx = e.indexOfTerminator(curData[curStartIdx:], inquotor)
endIdx = e.IndexOfTerminator(curData[curStartIdx:], inquotor)
}
}
if endIdx == -1 {
Expand All @@ -539,7 +536,7 @@ func (e *LoadDataInfo) getLine(prevData, curData []byte, ignore bool) ([]byte, [
if ignore {
endIdx = strings.Index(string(hack.String(curData[startingLen:])), e.LinesInfo.Terminated)
} else {
endIdx = e.indexOfTerminator(curData[startingLen:], inquotor)
endIdx = e.IndexOfTerminator(curData[startingLen:], inquotor)
}
if endIdx != -1 {
nextDataIdx := startingLen + endIdx + terminatedLen
Expand All @@ -560,7 +557,7 @@ func (e *LoadDataInfo) getLine(prevData, curData []byte, ignore bool) ([]byte, [
if ignore {
endIdx = strings.Index(string(hack.String(prevData[startingLen:])), e.LinesInfo.Terminated)
} else {
endIdx = e.indexOfTerminator(prevData[startingLen:], inquotor)
endIdx = e.IndexOfTerminator(prevData[startingLen:], inquotor)
}
if endIdx >= prevLen {
return prevData[startingLen : startingLen+endIdx], curData[nextDataIdx:], true
Expand Down
22 changes: 22 additions & 0 deletions executor/write_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (

"github.com/pingcap/tidb/executor"
"github.com/pingcap/tidb/kv"
"github.com/pingcap/tidb/parser/ast"
"github.com/pingcap/tidb/parser/model"
"github.com/pingcap/tidb/parser/mysql"
"github.com/pingcap/tidb/planner/core"
Expand Down Expand Up @@ -2129,6 +2130,27 @@ func TestLoadDataEscape(t *testing.T) {
checkCases(tests, ld, t, tk, ctx, selectSQL, deleteSQL)
}

func TestLoadDataWithLongContent(t *testing.T) {
e := &executor.LoadDataInfo{
FieldsInfo: &ast.FieldsClause{Terminated: ",", Escaped: '\\', Enclosed: '"'},
LinesInfo: &ast.LinesClause{Terminated: "\n"},
}
tests := []struct {
content string
inQuoter bool
expectedIndex int
}{
{"123,123\n123,123", false, 7},
{"123123\\n123123", false, -1},
{"123123\n123123", true, -1},
{"123123\n123123\"\n", true, 14},
}

for _, tt := range tests {
require.Equal(t, tt.expectedIndex, e.IndexOfTerminator([]byte(tt.content), tt.inQuoter))
}
}

// TestLoadDataSpecifiedColumns reuse TestLoadDataEscape's test case :-)
func TestLoadDataSpecifiedColumns(t *testing.T) {
trivialMsg := "Records: 1 Deleted: 0 Skipped: 0 Warnings: 0"
Expand Down