From 1cfd0d1ff7e3c79202c745b11299cf5ace3c34e4 Mon Sep 17 00:00:00 2001 From: Cherry Mui Date: Fri, 26 Jul 2024 16:59:30 -0400 Subject: [PATCH] cmd/watchflakes: report consistent failures at top Currently, watchflakes skips consistent failures, including ones at the top, i.e. failing on the latest few commits. Consistently failing at tip means builds were recently broken on this builder. Currently we rely on a human watching the dashboard to report such failures. This CL lets watchflakes do so. If a new issue is opened for such failures, the title includes "[consistent failure]" to increase awareness. Fixes golang/go#58819. Change-Id: I2ea89a5d8edee0b9087f03b58e4ff834c03e826d Reviewed-on: https://go-review.googlesource.com/c/build/+/601439 Reviewed-by: Carlos Amedee LUCI-TryBot-Result: Go LUCI --- cmd/watchflakes/luci.go | 1 + cmd/watchflakes/main.go | 40 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/cmd/watchflakes/luci.go b/cmd/watchflakes/luci.go index d56ed481d3..02eba015fc 100644 --- a/cmd/watchflakes/luci.go +++ b/cmd/watchflakes/luci.go @@ -108,6 +108,7 @@ type BuildResult struct { StepLogURL string // textual log of the (last) failed step, if any StepLogText string Failures []*Failure + Top bool // whether this is a consistent failure at the top (tip) } type Commit struct { diff --git a/cmd/watchflakes/main.go b/cmd/watchflakes/main.go index 7257db4542..70558346b7 100644 --- a/cmd/watchflakes/main.go +++ b/cmd/watchflakes/main.go @@ -36,6 +36,8 @@ const timeLimit = 45 * 24 * time.Hour const maxFailPerBuild = 3 +const tooManyToBeFlakes = 4 + var ( build = flag.String("build", "", "a particular build ID or URL to analyze (mainly for debugging)") md = flag.Bool("md", false, "print Markdown output suitable for GitHub issues") @@ -228,6 +230,20 @@ Repeat: } } } + for _, issue := range issues { + if issue.Number == 0 && len(issue.Post) >= tooManyToBeFlakes && issue.Post[0].Top { + // New issue. 
Check if it is failing consistently at top. + top := 0 + for _, fp := range issue.Post { + if fp.Top { + top++ + } + } + if top >= tooManyToBeFlakes { + issue.Title += " [consistent failure]" + } + } + } if query != nil { format := (*FailurePost).Text @@ -324,13 +340,15 @@ func skipBrokenCommits(boards []*Dashboard) { // skipBrokenBuilders identifies builders that were consistently broken // (at least tooManyToBeFlakes failures in a row) and then turned ok. // It changes those consistent failures to SKIP. +// +// It does not skip consistent failures at the top (latest few commits). +// Instead, it sets Top to true on them. func skipBrokenBuilders(boards []*Dashboard) { - const tooManyToBeFlakes = 4 - for _, dash := range boards { for _, rs := range dash.Results { - bad := 100 // squash failures at the top of the dashboard, which may turn out to be consistent + bad := 0 badStart := 0 + top := true skip := func(i int) { // skip the i-th result if rs[i] != nil { fmt.Printf("skip: builder %s was broken at %s (%s %s)\n", rs[i].Builder, shortHash(rs[i].Commit), dash.Repo, dash.GoBranch) @@ -343,10 +361,24 @@ func skipBrokenBuilders(boards []*Dashboard) { } switch r.Status { case bbpb.Status_SUCCESS: + if top && bad < tooManyToBeFlakes { + // Skip the run at the top. + // Too few to tell if it is flaky or consistent. + // It may also get fixed soon. + for j := 0; j < i; j++ { + skip(j) + } + } + top = false bad = 0 continue case bbpb.Status_FAILURE: bad++ + if top { + // Set Top to true, but don't skip. + r.Top = true + continue + } default: // ignore other status continue } @@ -368,7 +400,7 @@ func skipBrokenBuilders(boards []*Dashboard) { // even if there are just a few of them. Otherwise we get // spurious flakes when there's one bad entry before the // cutoff and lots after the cutoff. - if bad > 0 && badStart > 0 { + if bad > 0 { for j := badStart; j < len(rs); j++ { skip(j) }