Skip to content

Commit

Permalink
tests: fix race when pausing rename swap
Browse files Browse the repository at this point in the history
The pauseCh was only synchronised one way, which resulted in a race
window here the test thread (after requesting the pause) would get the
real path before the rename thread swapped the file back. This is easily
fixed by doing the swap twice each loop iteration so that we only
receive pause requests when we are in an okay-to-be-paused state.

Removing the retry logic lets us do far more test runs for the racing
tests, removing the need for the -short suite. The MkdirAll tests are
still a bit slow, but 2k runs should be fine.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
  • Loading branch information
cyphar committed Jul 16, 2024
1 parent 927d002 commit a604eb6
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 70 deletions.
23 changes: 6 additions & 17 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ jobs:
name: coverage-${{ runner.os }}-${{ github.job }}-${{ strategy.job-index }}
path: ${{ env.GOCOVERDIR }}

test-unix-short:
test-unix:
strategy:
fail-fast: false
matrix:
Expand All @@ -74,32 +74,21 @@ jobs:
run: |
GOCOVERDIR="$(mktemp --tmpdir -d gocoverdir.XXXXXXXX)"
echo "GOCOVERDIR=$GOCOVERDIR" >>"$GITHUB_ENV"
- name: unit tests
run: |
go test -short -v -cover -test.gocoverdir="$GOCOVERDIR" ./...
sudo go test -short -v -cover -test.gocoverdir="$GOCOVERDIR" ./...
- name: go test
run: go test -v -cover -test.gocoverdir="$GOCOVERDIR" ./...
- name: sudo go test
run: sudo go test -v -cover -test.gocoverdir="$GOCOVERDIR" ./...
- name: upload coverage
uses: actions/upload-artifact@v4
with:
name: coverage-${{ runner.os }}-${{ github.job }}-${{ strategy.job-index }}
path: ${{ env.GOCOVERDIR }}

test-linux-stress:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version: "^1"
- name: stress test race protections
run: |
sudo go test -timeout 1h -v -cover -run 'Test.*Racing' ./...
coverage:
runs-on: ubuntu-latest
needs:
- test-windows
- test-unix-short
- test-unix
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
Expand Down
36 changes: 4 additions & 32 deletions lookup_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ func TestPartialLookupInRoot_BadInode(t *testing.T) {
type racingLookupMeta struct {
pauseCh chan struct{}
passOkCount, passErrCount, skipCount, failCount, badErrCount int // test state counts
badNameCount, fixRemainingPathCount, unstableProcSelfFdCount int // workaround counts
badNameCount, fixRemainingPathCount int // workaround counts
skipErrCounts map[error]int
}

Expand Down Expand Up @@ -351,32 +351,13 @@ func (m *racingLookupMeta) checkPartialLookup(t *testing.T, rootDir *os.File, un
handleName string
realPath string
unixStat unix.Stat_t
realPaths []string
)
if handle != nil {
handleName = handle.Name()

// Get the "proper" name from procSelfFdReadlink.
m.pauseCh <- struct{}{}
// For some reason, it seems that (at a rate of 0.01% or so) even
// though we pause the rename thread is a nonsensical path from
// procSelfFdReadlink that looks like the path is still swapped. The
// key thing to note is that adding sleeps doesn't seem to help much,
// but retrying several times usually ends up with us getting the
// "right" result eventually.
//
// This seems like a kernel bug (or a weird issue with Go channels),
// but for now let's just retry a few times and keep track of how many
// transitions we saw. If we only see one change, then we can use the
// last one and just track the statistics.
const readlinkRetryMax = 500
realPath, err = procSelfFdReadlink(handle)
for i := 0; err == nil && i < readlinkRetryMax; i++ {
if last := len(realPaths) - 1; last < 0 || realPath != realPaths[last] {
realPaths = append(realPaths, realPath)
}
realPath, err = procSelfFdReadlink(handle)
}
<-m.pauseCh
require.NoError(t, err, "get real path of returned handle")

Expand All @@ -386,12 +367,6 @@ func (m *racingLookupMeta) checkPartialLookup(t *testing.T, rootDir *os.File, un
_ = handle.Close()
}

assert.LessOrEqualf(t, len(realPaths), 2, "saw more than one transition in procSelfFdReadlink results: %v", realPaths)
if len(realPaths) > 1 {
m.unstableProcSelfFdCount++
assert.Contains(t, realPaths, handleName, "at least one real path should match the handle name if we got more than one real path")
}

if realPath != handleName {
// It's possible for handle.Name() to be wrong because while it was
// correct when it was set, it might not match if the path was swapped
Expand Down Expand Up @@ -539,10 +514,7 @@ func TestPartialLookup_RacingRename(t *testing.T) {
go doRenameExchangeLoop(pauseCh, exitCh, rootDir, test.subPathA, test.subPathB)

// Do several runs to try to catch bugs.
var testRuns = 10000
if testing.Short() {
testRuns = 300
}
const testRuns = 50000
m := newRacingLookupMeta(pauseCh)
for i := 0; i < testRuns; i++ {
m.checkPartialLookup(t, rootDir, test.unsafePath, test.skipErrs, test.allowedResults)
Expand All @@ -558,9 +530,9 @@ func TestPartialLookup_RacingRename(t *testing.T) {
testRuns, pct(m.passOkCount), pct(m.passErrCount), pct(m.skipCount), pct(m.failCount),
// failures due to incorrect errors (rather than bad paths)
pct(m.badErrCount))
t.Logf(" badHandleName=%s fixRemainingPath=%s unstableProcSelfFdCount=%s",
t.Logf(" badHandleName=%s fixRemainingPath=%s",
// stats for how many test runs had to have some "workarounds"
pct(m.badNameCount), pct(m.fixRemainingPathCount), pct(m.unstableProcSelfFdCount))
pct(m.badNameCount), pct(m.fixRemainingPathCount))
if len(m.skipErrCounts) > 0 {
t.Logf(" skipErr breakdown:")
for err, count := range m.skipErrCounts {
Expand Down
18 changes: 8 additions & 10 deletions mkdir_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"fmt"
"os"
"path/filepath"
"runtime"
"testing"

"github.com/stretchr/testify/assert"
Expand Down Expand Up @@ -358,16 +359,15 @@ func TestMkdirAllHandle_RacingRename(t *testing.T) {
}(rootCh)

// Do several runs to try to catch bugs.
var testRuns = 10000
if testing.Short() {
testRuns = 300
}
const testRuns = 2000
m := newRacingMkdirMeta()
for i := 0; i < testRuns; i++ {
root := createTree(t, treeSpec...)
rootCh <- root

rootCh <- root
runtime.Gosched() // give the thread some time to do a rename
m.checkMkdirAllHandle_Racing(t, root, test.unsafePath, 0o711, test.allowedErrs)
rootCh <- ""

// Clean up the root after each run so we don't exhaust all
// space in the tmpfs.
Expand Down Expand Up @@ -430,16 +430,14 @@ func TestMkdirAllHandle_RacingDelete(t *testing.T) {
}(rootCh)

// Do several runs to try to catch bugs.
var testRuns = 10000
if testing.Short() {
testRuns = 300
}
const testRuns = 2000
m := newRacingMkdirMeta()
for i := 0; i < testRuns; i++ {
root := createTree(t, treeSpec...)
rootCh <- root

rootCh <- root
m.checkMkdirAllHandle_Racing(t, root, test.unsafePath, 0o711, test.allowedErrs)
rootCh <- ""

// Clean up the root after each run so we don't exhaust all
// space in the tmpfs.
Expand Down
16 changes: 5 additions & 11 deletions util_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,29 +89,23 @@ func hasRenameExchange() bool {
}

func doRenameExchangeLoop(pauseCh chan struct{}, exitCh <-chan struct{}, dir *os.File, pathA, pathB string) {
var swapped bool
swap := func() {
_ = unix.Renameat2(int(dir.Fd()), pathA, int(dir.Fd()), pathB, unix.RENAME_EXCHANGE)
//unix.Sync()
swapped = !swapped
}

for {
select {
case <-exitCh:
return
case <-pauseCh:
if swapped {
swap()
}
// Wait for caller to unpause us.
select {
case pauseCh <- struct{}{}:
case <-exitCh:
return
}
default:
swap()
// Do the swap twice so that we only pause when we are in a
// "correct" state.
for i := 0; i < 2; i++ {
_ = unix.Renameat2(int(dir.Fd()), pathA, int(dir.Fd()), pathB, unix.RENAME_EXCHANGE)
}
}
}
}
Expand Down

0 comments on commit a604eb6

Please sign in to comment.