diff --git a/flate/deflate.go b/flate/deflate.go
index 07265ddede..82882961a0 100644
--- a/flate/deflate.go
+++ b/flate/deflate.go
@@ -294,7 +294,6 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, of
 	}
 	offset = 0
 
-	cGain := 0
 	if d.chain < 100 {
 		for i := prevHead; tries > 0; tries-- {
 			if wEnd == win[i+length] {
@@ -322,10 +321,14 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, of
 		return
 	}
 
+	// Minimum gain to accept a match.
+	cGain := 4
+
 	// Some like it higher (CSV), some like it lower (JSON)
-	const baseCost = 6
+	const baseCost = 3
 	// Base is 4 bytes with an additional cost.
 	// Matches must be better than this.
+
 	for i := prevHead; tries > 0; tries-- {
 		if wEnd == win[i+length] {
 			n := matchLen(win[i:i+minMatchLook], wPos)
@@ -333,7 +336,7 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, of
 			}
 			// Calculate gain. Estimate
 			newGain := d.h.bitLengthRaw(wPos[:n]) - int(offsetExtraBits[offsetCode(uint32(pos-i))]) - baseCost - int(lengthExtraBits[lengthCodes[(n-3)&255]])
-			//fmt.Println(n, "gain:", newGain, "prev:", cGain, "raw:", d.h.bitLengthRaw(wPos[:n]))
+			//fmt.Println("gain:", newGain, "prev:", cGain, "raw:", d.h.bitLengthRaw(wPos[:n]), "this-len:", n, "prev-len:", length)
 			if newGain > cGain {
 				length = n
 				offset = pos - i
@@ -490,27 +493,103 @@ func (d *compressor) deflateLazy() {
 			}
 
 			if prevLength >= minMatchLength && s.length <= prevLength {
-				// Check for better match at end...
+				// No better match, but check for better match at end...
 				//
-				// checkOff must be >=2 since we otherwise risk checking s.index
-				// Offset of 2 seems to yield best results.
+				// Skip forward a number of bytes.
+				// Offset of 2 seems to yield best results. 3 is sometimes better.
 				const checkOff = 2
-				prevIndex := s.index - 1
-				if prevIndex+prevLength+checkOff < s.maxInsertIndex {
-					end := lookahead
-					if lookahead > maxMatchLength {
-						end = maxMatchLength
-					}
-					end += prevIndex
-					idx := prevIndex + prevLength - (4 - checkOff)
-					h := hash4(d.window[idx:])
-					ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength + (4 - checkOff)
-					if ch2 > minIndex {
-						length := matchLen(d.window[prevIndex:end], d.window[ch2:])
-						// It seems like a pure length metric is best.
-						if length > prevLength {
-							prevLength = length
-							prevOffset = prevIndex - ch2
+
+				// Check all, except full length
+				if prevLength < maxMatchLength-checkOff {
+					prevIndex := s.index - 1
+					if prevIndex+prevLength < s.maxInsertIndex {
+						end := lookahead
+						if lookahead > maxMatchLength+checkOff {
+							end = maxMatchLength + checkOff
+						}
+						end += prevIndex
+
+						// Hash at match end.
+						h := hash4(d.window[prevIndex+prevLength:])
+						ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength
+						if prevIndex-ch2 != prevOffset && ch2 > minIndex+checkOff {
+							length := matchLen(d.window[prevIndex+checkOff:end], d.window[ch2+checkOff:])
+							// It seems like a pure length metric is best.
+							if length > prevLength {
+								prevLength = length
+								prevOffset = prevIndex - ch2
+
+								// Extend back...
+								for i := checkOff - 1; i >= 0; i-- {
+									if prevLength >= maxMatchLength || d.window[prevIndex+i] != d.window[ch2+i] {
+										// Emit tokens we "owe"
+										for j := 0; j <= i; j++ {
+											d.tokens.AddLiteral(d.window[prevIndex+j])
+											if d.tokens.n == maxFlateBlockTokens {
+												// The block includes the current character
+												if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
+													return
+												}
+												d.tokens.Reset()
+											}
+											s.index++
+											if s.index < s.maxInsertIndex {
+												h := hash4(d.window[s.index:])
+												ch := s.hashHead[h]
+												s.chainHead = int(ch)
+												s.hashPrev[s.index&windowMask] = ch
+												s.hashHead[h] = uint32(s.index + s.hashOffset)
+											}
+										}
+										break
+									} else {
+										prevLength++
+									}
+								}
+							} else if false {
+								// Check one further ahead.
+								// Only rarely better, disabled for now.
+								prevIndex++
+								h := hash4(d.window[prevIndex+prevLength:])
+								ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength
+								if prevIndex-ch2 != prevOffset && ch2 > minIndex+checkOff {
+									length := matchLen(d.window[prevIndex+checkOff:end], d.window[ch2+checkOff:])
+									// It seems like a pure length metric is best.
+									if length > prevLength+checkOff {
+										prevLength = length
+										prevOffset = prevIndex - ch2
+										prevIndex--
+
+										// Extend back...
+										for i := checkOff; i >= 0; i-- {
+											if prevLength >= maxMatchLength || d.window[prevIndex+i] != d.window[ch2+i-1] {
+												// Emit tokens we "owe"
+												for j := 0; j <= i; j++ {
+													d.tokens.AddLiteral(d.window[prevIndex+j])
+													if d.tokens.n == maxFlateBlockTokens {
+														// The block includes the current character
+														if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
+															return
+														}
+														d.tokens.Reset()
+													}
+													s.index++
+													if s.index < s.maxInsertIndex {
+														h := hash4(d.window[s.index:])
+														ch := s.hashHead[h]
+														s.chainHead = int(ch)
+														s.hashPrev[s.index&windowMask] = ch
+														s.hashHead[h] = uint32(s.index + s.hashOffset)
+													}
+												}
+												break
+											} else {
+												prevLength++
+											}
+										}
+									}
+								}
+							}
+						}
+					}
+				}
 			}
 		}
 	}
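
The findMatch change above replaces the old fixed threshold with an explicit bit-gain estimate: a candidate match is accepted only when the bits it saves over emitting the covered bytes as literals (newGain) exceed cGain. Below is a minimal, self-contained sketch of that cost model. It is an illustration, not the package's code: the flat 5-bits-per-literal estimate stands in for the histogram-based d.h.bitLengthRaw, and the two helpers approximate the offsetExtraBits/lengthExtraBits tables.

package main

import "fmt"

// offsetExtraBitsFor approximates DEFLATE's offset extra-bit table:
// offsets 1-4 need no extra bits, and one more bit is needed each time
// the offset range doubles (offset 30000 needs 13).
func offsetExtraBitsFor(offset int) int {
	extra := 0
	for v := offset - 1; v >= 4; v >>= 1 {
		extra++
	}
	return extra
}

// lengthExtraBitsFor approximates the length extra-bit table:
// lengths 3-10 need no extra bits, 11-18 need one, and so on.
func lengthExtraBitsFor(n int) int {
	extra := 0
	for v := n - 3; v >= 8; v >>= 1 {
		extra++
	}
	return extra
}

// matchGain mirrors the shape of the newGain expression in findMatch:
// raw literal cost minus offset extra bits, baseCost, and length extra
// bits. The real encoder measures raw cost with d.h.bitLengthRaw; here
// we assume ~5 bits per literal, typical for compressible text.
func matchGain(n, offset int) int {
	const baseCost = 3 // the constant the patch settles on
	rawBits := 5 * n
	return rawBits - offsetExtraBitsFor(offset) - baseCost - lengthExtraBitsFor(n)
}

func main() {
	const cGain = 4 // minimum gain to accept a match, as in the patch
	for _, c := range []struct{ n, offset int }{
		{4, 16},    // short match, close by: accepted
		{4, 30000}, // short match, far away: extra bits eat the gain
		{64, 30000},
	} {
		g := matchGain(c.n, c.offset)
		fmt.Printf("len=%-3d offset=%-6d gain=%-3d accept=%v\n", c.n, c.offset, g, g > cGain)
	}
}

Run as-is, the sketch shows why a short match far back in the window is now rejected: thirteen offset extra bits wipe out the saving on four literals.
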
diff --git a/flate/deflate_test.go b/flate/deflate_test.go
index 852012ed18..f9584ceb3a 100644
--- a/flate/deflate_test.go
+++ b/flate/deflate_test.go
@@ -33,24 +33,24 @@ type reverseBitsTest struct {
 }
 
 var deflateTests = []*deflateTest{
-	{[]byte{}, 0, []byte{0x3, 0x0}},
-	{[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}},
-	{[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}},
-	{[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}},
-
-	{[]byte{0x11}, 0, []byte{0x0, 0x1, 0x0, 0xfe, 0xff, 0x11, 0x3, 0x0}},
-	{[]byte{0x11, 0x12}, 0, []byte{0x0, 0x2, 0x0, 0xfd, 0xff, 0x11, 0x12, 0x3, 0x0}},
-	{[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 0,
+	0: {[]byte{}, 0, []byte{0x3, 0x0}},
+	1: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}},
+	2: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}},
+	3: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}},
+
+	4: {[]byte{0x11}, 0, []byte{0x0, 0x1, 0x0, 0xfe, 0xff, 0x11, 0x3, 0x0}},
+	5: {[]byte{0x11, 0x12}, 0, []byte{0x0, 0x2, 0x0, 0xfd, 0xff, 0x11, 0x12, 0x3, 0x0}},
+	6: {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 0,
 		[]byte{0x0, 0x8, 0x0, 0xf7, 0xff, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x3, 0x0},
 	},
-	{[]byte{}, 1, []byte{0x3, 0x0}},
-	{[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}},
-	{[]byte{0x11, 0x12}, BestCompression, []byte{0x12, 0x14, 0x2, 0xc, 0x0}},
-	{[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, BestCompression, []byte{0x12, 0x84, 0x2, 0xc0, 0x0}},
-	{[]byte{}, 9, []byte{0x3, 0x0}},
-	{[]byte{0x11}, 9, []byte{0x12, 0x4, 0xc, 0x0}},
-	{[]byte{0x11, 0x12}, 9, []byte{0x12, 0x14, 0x2, 0xc, 0x0}},
-	{[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 9, []byte{0x12, 0x84, 0x2, 0xc0, 0x0}},
+	7: {[]byte{}, 1, []byte{0x3, 0x0}},
+	8: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}},
+	9: {[]byte{0x11, 0x12}, BestCompression, []byte{0x12, 0x14, 0x2, 0xc, 0x0}},
+	10: {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, BestCompression, []byte{0x12, 0x84, 0x1, 0xc0, 0x0}},
+	11: {[]byte{}, 9, []byte{0x3, 0x0}},
+	12: {[]byte{0x11}, 9, []byte{0x12, 0x4, 0xc, 0x0}},
+	13: {[]byte{0x11, 0x12}, 9, []byte{0x12, 0x14, 0x2, 0xc, 0x0}},
+	14: {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 9, []byte{0x12, 0x84, 0x1, 0xc0, 0x0}},
 }
 
 var deflateInflateTests = []*deflateInflateTest{
@@ -120,7 +120,7 @@ func TestDeflate(t *testing.T) {
 		w.Write(h.in)
 		w.Close()
 		if !bytes.Equal(buf.Bytes(), h.out) {
-			t.Errorf("%d: Deflate(%d, %x) = \n%#v, want \n%#v", i, h.level, h.in, buf.Bytes(), h.out)
+			t.Errorf("%d: Deflate(%d, %x) got \n%#v, want \n%#v", i, h.level, h.in, buf.Bytes(), h.out)
 		}
 	}
 }
@@ -331,6 +331,8 @@ func testToFromWithLevelAndLimit(t *testing.T, level int, input []byte, name str
 	}
 	r.Close()
 	if !bytes.Equal(input, out) {
+		os.WriteFile("testdata/fails/"+t.Name()+".got", out, os.ModePerm)
+		os.WriteFile("testdata/fails/"+t.Name()+".want", input, os.ModePerm)
 		t.Errorf("decompress(compress(data)) != data: level=%d input=%s", level, name)
 		return
 	}
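
The rewritten deflateTests table pins every case to its slice position with indexed composite-literal keys, so the index printed by TestDeflate's failure message always names the literal that produced it, even when entries are added or removed. A tiny standalone illustration of the mechanism (the case names here are made up for the example):

package main

import "fmt"

func main() {
	// Indexed keys fix each entry's position; the compiler rejects a
	// duplicate index, so cases cannot silently shadow each other.
	cases := []string{
		0: "empty input",
		1: "single byte",
		2: "repeated bytes",
	}
	for i, c := range cases {
		fmt.Printf("case %d: %s\n", i, c)
	}
}
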
diff --git a/flate/fuzz_test.go b/flate/fuzz_test.go
index 0bd95ff8a5..fdd41cfb93 100644
--- a/flate/fuzz_test.go
+++ b/flate/fuzz_test.go
@@ -5,26 +5,36 @@ package flate
 import (
 	"bytes"
+	"flag"
 	"io"
+	"os"
 	"strconv"
 	"testing"
 
 	"github.com/klauspost/compress/internal/fuzz"
 )
 
+// Fuzzing tweaks:
+var fuzzStartF = flag.Int("start", HuffmanOnly, "Start fuzzing at this level")
+var fuzzEndF = flag.Int("end", BestCompression, "End fuzzing at this level (inclusive)")
+var fuzzMaxF = flag.Int("max", 1<<20, "Maximum input size")
+var fuzzSLF = flag.Bool("sl", true, "Include stateless encodes")
+
+func TestMain(m *testing.M) {
+	flag.Parse()
+	os.Exit(m.Run())
+}
+
 func FuzzEncoding(f *testing.F) {
 	fuzz.AddFromZip(f, "testdata/regression.zip", true, false)
 	fuzz.AddFromZip(f, "testdata/fuzz/encode-raw-corpus.zip", true, testing.Short())
 	fuzz.AddFromZip(f, "testdata/fuzz/FuzzEncoding.zip", false, testing.Short())
-	// Fuzzing tweaks:
-	const (
-		// Test a subset of encoders.
-		startFuzz = HuffmanOnly
-		endFuzz   = BestCompression
-
-		// Max input size:
-		maxSize = 1 << 20
-	)
+	startFuzz := *fuzzStartF
+	endFuzz := *fuzzEndF
+	maxSize := *fuzzMaxF
+	stateless := *fuzzSLF
+
 	decoder := NewReader(nil)
 	buf := new(bytes.Buffer)
 	encs := make([]*Writer, endFuzz-startFuzz+1)
@@ -88,7 +98,9 @@ func FuzzEncoding(f *testing.F) {
 			t.Fatal(msg + "not equal")
 		}
 	}
-
+	if !stateless {
+		return
+	}
 	// Split into two and use history...
 	buf.Reset()
 	err := StatelessDeflate(buf, data[:len(data)/2], false, nil)
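
With the fuzzing constants promoted to flags and parsed in the new TestMain, a run can be narrowed without editing the test. One plausible invocation (values are illustrative; -args hands the remaining flags to the test binary):

go test -run=none -fuzz=FuzzEncoding ./flate -args -start=5 -end=6 -max=65536 -sl=false

Setting -sl=false takes the new early return and skips the StatelessDeflate half of each iteration.
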