Move modules/gzip to gitea.com/macaron/gzip (#9058)

* Move modules/gzip to gitea.com/macaron/gzip * Fix vendor
2019-11-18 13:18:33 +08:00 · 2019-11-18 13:18:33 +08:00 · 9ff6312627
commit 9ff6312627
parent ba4e8f221b
54 changed files with 2972 additions and 5163 deletions
--- a/vendor/github.com/klauspost/compress/LICENSE
+++ b/vendor/github.com/klauspost/compress/LICENSE
@ -1,4 +1,5 @@
 Copyright (c) 2012 The Go Authors. All rights reserved.
+Copyright (c) 2019 Klaus Post. All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
--- a/vendor/github.com/klauspost/compress/flate/copy.go
+++ b/vendor/github.com/klauspost/compress/flate/copy.go
@ -1,32 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package flate
-
-// forwardCopy is like the built-in copy function except that it always goes
-// forward from the start, even if the dst and src overlap.
-// It is equivalent to:
-//   for i := 0; i < n; i++ {
-//     mem[dst+i] = mem[src+i]
-//   }
-func forwardCopy(mem []byte, dst, src, n int) {
-	if dst <= src {
-		copy(mem[dst:dst+n], mem[src:src+n])
-		return
-	}
-	for {
-		if dst >= src+n {
-			copy(mem[dst:dst+n], mem[src:src+n])
-			return
-		}
-		// There is some forward overlap.  The destination
-		// will be filled with a repeated pattern of mem[src:src+k].
-		// We copy one instance of the pattern here, then repeat.
-		// Each time around this loop k will double.
-		k := dst - src
-		copy(mem[dst:dst+k], mem[src:src+k])
-		n -= k
-		dst += k
-	}
-}
--- a/vendor/github.com/klauspost/compress/flate/crc32_amd64.go
+++ b/vendor/github.com/klauspost/compress/flate/crc32_amd64.go
@ -1,41 +0,0 @@
-//+build !noasm
-//+build !appengine
-
-// Copyright 2015, Klaus Post, see LICENSE for details.
-
-package flate
-
-import (
-	"github.com/klauspost/cpuid"
-)
-
-// crc32sse returns a hash for the first 4 bytes of the slice
-// len(a) must be >= 4.
-//go:noescape
-func crc32sse(a []byte) uint32
-
-// crc32sseAll calculates hashes for each 4-byte set in a.
-// dst must be east len(a) - 4 in size.
-// The size is not checked by the assembly.
-//go:noescape
-func crc32sseAll(a []byte, dst []uint32)
-
-// matchLenSSE4 returns the number of matching bytes in a and b
-// up to length 'max'. Both slices must be at least 'max'
-// bytes in size.
-//
-// TODO: drop the "SSE4" name, since it doesn't use any SSE instructions.
-//
-//go:noescape
-func matchLenSSE4(a, b []byte, max int) int
-
-// histogram accumulates a histogram of b in h.
-// h must be at least 256 entries in length,
-// and must be cleared before calling this function.
-//go:noescape
-func histogram(b []byte, h []int32)
-
-// Detect SSE 4.2 feature.
-func init() {
-	useSSE42 = cpuid.CPU.SSE42()
-}
--- a/vendor/github.com/klauspost/compress/flate/crc32_amd64.s
+++ b/vendor/github.com/klauspost/compress/flate/crc32_amd64.s
@ -1,213 +0,0 @@
-//+build !noasm
-//+build !appengine
-
-// Copyright 2015, Klaus Post, see LICENSE for details.
-
-// func crc32sse(a []byte) uint32
-TEXT ·crc32sse(SB), 4, $0
-	MOVQ a+0(FP), R10
-	XORQ BX, BX
-
-	// CRC32   dword (R10), EBX
-	BYTE $0xF2; BYTE $0x41; BYTE $0x0f
-	BYTE $0x38; BYTE $0xf1; BYTE $0x1a
-
-	MOVL BX, ret+24(FP)
-	RET
-
-// func crc32sseAll(a []byte, dst []uint32)
-TEXT ·crc32sseAll(SB), 4, $0
-	MOVQ  a+0(FP), R8      // R8: src
-	MOVQ  a_len+8(FP), R10 // input length
-	MOVQ  dst+24(FP), R9   // R9: dst
-	SUBQ  $4, R10
-	JS    end
-	JZ    one_crc
-	MOVQ  R10, R13
-	SHRQ  $2, R10          // len/4
-	ANDQ  $3, R13          // len&3
-	XORQ  BX, BX
-	ADDQ  $1, R13
-	TESTQ R10, R10
-	JZ    rem_loop
-
-crc_loop:
-	MOVQ (R8), R11
-	XORQ BX, BX
-	XORQ DX, DX
-	XORQ DI, DI
-	MOVQ R11, R12
-	SHRQ $8, R11
-	MOVQ R12, AX
-	MOVQ R11, CX
-	SHRQ $16, R12
-	SHRQ $16, R11
-	MOVQ R12, SI
-
-	// CRC32   EAX, EBX
-	BYTE $0xF2; BYTE $0x0f
-	BYTE $0x38; BYTE $0xf1; BYTE $0xd8
-
-	// CRC32   ECX, EDX
-	BYTE $0xF2; BYTE $0x0f
-	BYTE $0x38; BYTE $0xf1; BYTE $0xd1
-
-	// CRC32   ESI, EDI
-	BYTE $0xF2; BYTE $0x0f
-	BYTE $0x38; BYTE $0xf1; BYTE $0xfe
-	MOVL BX, (R9)
-	MOVL DX, 4(R9)
-	MOVL DI, 8(R9)
-
-	XORQ BX, BX
-	MOVL R11, AX
-
-	// CRC32   EAX, EBX
-	BYTE $0xF2; BYTE $0x0f
-	BYTE $0x38; BYTE $0xf1; BYTE $0xd8
-	MOVL BX, 12(R9)
-
-	ADDQ $16, R9
-	ADDQ $4, R8
-	XORQ BX, BX
-	SUBQ $1, R10
-	JNZ  crc_loop
-
-rem_loop:
-	MOVL (R8), AX
-
-	// CRC32   EAX, EBX
-	BYTE $0xF2; BYTE $0x0f
-	BYTE $0x38; BYTE $0xf1; BYTE $0xd8
-
-	MOVL BX, (R9)
-	ADDQ $4, R9
-	ADDQ $1, R8
-	XORQ BX, BX
-	SUBQ $1, R13
-	JNZ  rem_loop
-
-end:
-	RET
-
-one_crc:
-	MOVQ $1, R13
-	XORQ BX, BX
-	JMP  rem_loop
-
-// func matchLenSSE4(a, b []byte, max int) int
-TEXT ·matchLenSSE4(SB), 4, $0
-	MOVQ a_base+0(FP), SI
-	MOVQ b_base+24(FP), DI
-	MOVQ DI, DX
-	MOVQ max+48(FP), CX
-
-cmp8:
-	// As long as we are 8 or more bytes before the end of max, we can load and
-	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
-	CMPQ CX, $8
-	JLT  cmp1
-	MOVQ (SI), AX
-	MOVQ (DI), BX
-	CMPQ AX, BX
-	JNE  bsf
-	ADDQ $8, SI
-	ADDQ $8, DI
-	SUBQ $8, CX
-	JMP  cmp8
-
-bsf:
-	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
-	// the index of the first byte that differs. The BSF instruction finds the
-	// least significant 1 bit, the amd64 architecture is little-endian, and
-	// the shift by 3 converts a bit index to a byte index.
-	XORQ AX, BX
-	BSFQ BX, BX
-	SHRQ $3, BX
-	ADDQ BX, DI
-
-	// Subtract off &b[0] to convert from &b[ret] to ret, and return.
-	SUBQ DX, DI
-	MOVQ DI, ret+56(FP)
-	RET
-
-cmp1:
-	// In the slices' tail, compare 1 byte at a time.
-	CMPQ CX, $0
-	JEQ  matchLenEnd
-	MOVB (SI), AX
-	MOVB (DI), BX
-	CMPB AX, BX
-	JNE  matchLenEnd
-	ADDQ $1, SI
-	ADDQ $1, DI
-	SUBQ $1, CX
-	JMP  cmp1
-
-matchLenEnd:
-	// Subtract off &b[0] to convert from &b[ret] to ret, and return.
-	SUBQ DX, DI
-	MOVQ DI, ret+56(FP)
-	RET
-
-// func histogram(b []byte, h []int32)
-TEXT ·histogram(SB), 4, $0
-	MOVQ b+0(FP), SI     // SI: &b
-	MOVQ b_len+8(FP), R9 // R9: len(b)
-	MOVQ h+24(FP), DI    // DI: Histogram
-	MOVQ R9, R8
-	SHRQ $3, R8
-	JZ   hist1
-	XORQ R11, R11
-
-loop_hist8:
-	MOVQ (SI), R10
-
-	MOVB R10, R11
-	INCL (DI)(R11*4)
-	SHRQ $8, R10
-
-	MOVB R10, R11
-	INCL (DI)(R11*4)
-	SHRQ $8, R10
-
-	MOVB R10, R11
-	INCL (DI)(R11*4)
-	SHRQ $8, R10
-
-	MOVB R10, R11
-	INCL (DI)(R11*4)
-	SHRQ $8, R10
-
-	MOVB R10, R11
-	INCL (DI)(R11*4)
-	SHRQ $8, R10
-
-	MOVB R10, R11
-	INCL (DI)(R11*4)
-	SHRQ $8, R10
-
-	MOVB R10, R11
-	INCL (DI)(R11*4)
-	SHRQ $8, R10
-
-	INCL (DI)(R10*4)
-
-	ADDQ $8, SI
-	DECQ R8
-	JNZ  loop_hist8
-
-hist1:
-	ANDQ $7, R9
-	JZ   end_hist
-	XORQ R10, R10
-
-loop_hist1:
-	MOVB (SI), R10
-	INCL (DI)(R10*4)
-	INCQ SI
-	DECQ R9
-	JNZ  loop_hist1
-
-end_hist:
-	RET
--- a/vendor/github.com/klauspost/compress/flate/crc32_noasm.go
+++ b/vendor/github.com/klauspost/compress/flate/crc32_noasm.go
@ -1,35 +0,0 @@
-//+build !amd64 noasm appengine
-
-// Copyright 2015, Klaus Post, see LICENSE for details.
-
-package flate
-
-func init() {
-	useSSE42 = false
-}
-
-// crc32sse should never be called.
-func crc32sse(a []byte) uint32 {
-	panic("no assembler")
-}
-
-// crc32sseAll should never be called.
-func crc32sseAll(a []byte, dst []uint32) {
-	panic("no assembler")
-}
-
-// matchLenSSE4 should never be called.
-func matchLenSSE4(a, b []byte, max int) int {
-	panic("no assembler")
-	return 0
-}
-
-// histogram accumulates a histogram of b in h.
-//
-// len(h) must be >= 256, and h's elements must be all zeroes.
-func histogram(b []byte, h []int32) {
-	h = h[:256]
-	for _, t := range b {
-		h[t]++
-	}
-}
--- a/vendor/github.com/klauspost/compress/flate/deflate.go
+++ b/vendor/github.com/klauspost/compress/flate/deflate.go
--- a/vendor/github.com/klauspost/compress/flate/fast_encoder.go
+++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go
@ -0,0 +1,257 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Modified for deflate by Klaus Post (c) 2015.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import (
+	"fmt"
+	"math/bits"
+)
+
+type fastEnc interface {
+	Encode(dst *tokens, src []byte)
+	Reset()
+}
+
+func newFastEnc(level int) fastEnc {
+	switch level {
+	case 1:
+		return &fastEncL1{fastGen: fastGen{cur: maxStoreBlockSize}}
+	case 2:
+		return &fastEncL2{fastGen: fastGen{cur: maxStoreBlockSize}}
+	case 3:
+		return &fastEncL3{fastGen: fastGen{cur: maxStoreBlockSize}}
+	case 4:
+		return &fastEncL4{fastGen: fastGen{cur: maxStoreBlockSize}}
+	case 5:
+		return &fastEncL5{fastGen: fastGen{cur: maxStoreBlockSize}}
+	case 6:
+		return &fastEncL6{fastGen: fastGen{cur: maxStoreBlockSize}}
+	default:
+		panic("invalid level specified")
+	}
+}
+
+const (
+	tableBits       = 16             // Bits used in the table
+	tableSize       = 1 << tableBits // Size of the table
+	tableShift      = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32.
+	baseMatchOffset = 1              // The smallest match offset
+	baseMatchLength = 3              // The smallest match length per the RFC section 3.2.5
+	maxMatchOffset  = 1 << 15        // The largest match offset
+
+	bTableBits   = 18                                           // Bits used in the big tables
+	bTableSize   = 1 << bTableBits                              // Size of the table
+	allocHistory = maxMatchOffset * 10                          // Size to preallocate for history.
+	bufferReset  = (1 << 31) - allocHistory - maxStoreBlockSize // Reset the buffer offset when reaching this.
+)
+
+const (
+	prime3bytes = 506832829
+	prime4bytes = 2654435761
+	prime5bytes = 889523592379
+	prime6bytes = 227718039650203
+	prime7bytes = 58295818150454627
+	prime8bytes = 0xcf1bbcdcb7a56463
+)
+
+func load32(b []byte, i int) uint32 {
+	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
+	b = b[i:]
+	b = b[:4]
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load64(b []byte, i int) uint64 {
+	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
+	b = b[i:]
+	b = b[:8]
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+func load3232(b []byte, i int32) uint32 {
+	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
+	b = b[i:]
+	b = b[:4]
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load6432(b []byte, i int32) uint64 {
+	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
+	b = b[i:]
+	b = b[:8]
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+func hash(u uint32) uint32 {
+	return (u * 0x1e35a7bd) >> tableShift
+}
+
+type tableEntry struct {
+	val    uint32
+	offset int32
+}
+
+// fastGen maintains the table for matches,
+// and the previous byte block for level 2.
+// This is the generic implementation.
+type fastGen struct {
+	hist []byte
+	cur  int32
+}
+
+func (e *fastGen) addBlock(src []byte) int32 {
+	// check if we have space already
+	if len(e.hist)+len(src) > cap(e.hist) {
+		if cap(e.hist) == 0 {
+			e.hist = make([]byte, 0, allocHistory)
+		} else {
+			if cap(e.hist) < maxMatchOffset*2 {
+				panic("unexpected buffer size")
+			}
+			// Move down
+			offset := int32(len(e.hist)) - maxMatchOffset
+			copy(e.hist[0:maxMatchOffset], e.hist[offset:])
+			e.cur += offset
+			e.hist = e.hist[:maxMatchOffset]
+		}
+	}
+	s := int32(len(e.hist))
+	e.hist = append(e.hist, src...)
+	return s
+}
+
+// hash4 returns the hash of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <32.
+func hash4u(u uint32, h uint8) uint32 {
+	return (u * prime4bytes) >> ((32 - h) & 31)
+}
+
+type tableEntryPrev struct {
+	Cur  tableEntry
+	Prev tableEntry
+}
+
+// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <32.
+func hash4x64(u uint64, h uint8) uint32 {
+	return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
+}
+
+// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <64.
+func hash7(u uint64, h uint8) uint32 {
+	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
+}
+
+// hash8 returns the hash of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <64.
+func hash8(u uint64, h uint8) uint32 {
+	return uint32((u * prime8bytes) >> ((64 - h) & 63))
+}
+
+// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <64.
+func hash6(u uint64, h uint8) uint32 {
+	return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63))
+}
+
+// matchlen will return the match length between offsets and t in src.
+// The maximum length returned is maxMatchLength - 4.
+// It is assumed that s > t, that t >=0 and s < len(src).
+func (e *fastGen) matchlen(s, t int32, src []byte) int32 {
+	if debugDecode {
+		if t >= s {
+			panic(fmt.Sprint("t >=s:", t, s))
+		}
+		if int(s) >= len(src) {
+			panic(fmt.Sprint("s >= len(src):", s, len(src)))
+		}
+		if t < 0 {
+			panic(fmt.Sprint("t < 0:", t))
+		}
+		if s-t > maxMatchOffset {
+			panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")"))
+		}
+	}
+	s1 := int(s) + maxMatchLength - 4
+	if s1 > len(src) {
+		s1 = len(src)
+	}
+
+	// Extend the match to be as long as possible.
+	return int32(matchLen(src[s:s1], src[t:]))
+}
+
+// matchlenLong will return the match length between offsets and t in src.
+// It is assumed that s > t, that t >=0 and s < len(src).
+func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 {
+	if debugDecode {
+		if t >= s {
+			panic(fmt.Sprint("t >=s:", t, s))
+		}
+		if int(s) >= len(src) {
+			panic(fmt.Sprint("s >= len(src):", s, len(src)))
+		}
+		if t < 0 {
+			panic(fmt.Sprint("t < 0:", t))
+		}
+		if s-t > maxMatchOffset {
+			panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")"))
+		}
+	}
+	// Extend the match to be as long as possible.
+	return int32(matchLen(src[s:], src[t:]))
+}
+
+// Reset the encoding table.
+func (e *fastGen) Reset() {
+	if cap(e.hist) < int(maxMatchOffset*8) {
+		l := maxMatchOffset * 8
+		// Make it at least 1MB.
+		if l < 1<<20 {
+			l = 1 << 20
+		}
+		e.hist = make([]byte, 0, l)
+	}
+	// We offset current position so everything will be out of reach
+	e.cur += maxMatchOffset + int32(len(e.hist))
+	e.hist = e.hist[:0]
+}
+
+// matchLen returns the maximum length.
+// 'a' must be the shortest of the two.
+func matchLen(a, b []byte) int {
+	b = b[:len(a)]
+	var checked int
+	if len(a) > 4 {
+		// Try 4 bytes first
+		if diff := load32(a, 0) ^ load32(b, 0); diff != 0 {
+			return bits.TrailingZeros32(diff) >> 3
+		}
+		// Switch to 8 byte matching.
+		checked = 4
+		a = a[4:]
+		b = b[4:]
+		for len(a) >= 8 {
+			b = b[:len(a)]
+			if diff := load64(a, 0) ^ load64(b, 0); diff != 0 {
+				return checked + (bits.TrailingZeros64(diff) >> 3)
+			}
+			checked += 8
+			a = a[8:]
+			b = b[8:]
+		}
+	}
+	b = b[:len(a)]
+	for i := range a {
+		if a[i] != b[i] {
+			return int(i) + checked
+		}
+	}
+	return len(a) + checked
+}
--- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
@ -35,7 +35,7 @@ const (
 )

 // The number of extra bits needed by length code X - LENGTH_CODES_START.
-var lengthExtraBits = []int8{
+var lengthExtraBits = [32]int8{
 	/* 257 */ 0, 0, 0,
 	/* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2,
 	/* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
@ -43,14 +43,14 @@ var lengthExtraBits = []int8{
 }

 // The length indicated by length code X - LENGTH_CODES_START.
-var lengthBase = []uint32{
+var lengthBase = [32]uint8{
 	0, 1, 2, 3, 4, 5, 6, 7, 8, 10,
 	12, 14, 16, 20, 24, 28, 32, 40, 48, 56,
 	64, 80, 96, 112, 128, 160, 192, 224, 255,
 }

 // offset code word extra bits.
-var offsetExtraBits = []int8{
+var offsetExtraBits = [64]int8{
 	0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
 	4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
 	9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
@ -58,7 +58,7 @@ var offsetExtraBits = []int8{
 	14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20,
 }

-var offsetBase = []uint32{
+var offsetBase = [64]uint32{
 	/* normal deflate */
 	0x000000, 0x000001, 0x000002, 0x000003, 0x000004,
 	0x000006, 0x000008, 0x00000c, 0x000010, 0x000018,
@ -85,26 +85,48 @@ type huffmanBitWriter struct {
 	// Data waiting to be written is bytes[0:nbytes]
 	// and then the low nbits of bits.
 	bits            uint64
-	nbits           uint
-	bytes           [bufferSize]byte
-	codegenFreq     [codegenCodeCount]int32
-	nbytes          int
-	literalFreq     []int32
-	offsetFreq      []int32
-	codegen         []uint8
+	nbits           uint16
+	nbytes          uint8
 	literalEncoding *huffmanEncoder
 	offsetEncoding  *huffmanEncoder
 	codegenEncoding *huffmanEncoder
 	err             error
+	lastHeader      int
+	// Set between 0 (reused block can be up to 2x the size)
+	logReusePenalty uint
+	lastHuffMan     bool
+	bytes           [256]byte
+	literalFreq     [lengthCodesStart + 32]uint16
+	offsetFreq      [32]uint16
+	codegenFreq     [codegenCodeCount]uint16
+
+	// codegen must have an extra space for the final symbol.
+	codegen [literalCount + offsetCodeCount + 1]uint8
 }

+// Huffman reuse.
+//
+// The huffmanBitWriter supports reusing huffman tables and thereby combining block sections.
+//
+// This is controlled by several variables:
+//
+// If lastHeader is non-zero the Huffman table can be reused.
+// This also indicates that a Huffman table has been generated that can output all
+// possible symbols.
+// It also indicates that an EOB has not yet been emitted, so if a new tabel is generated
+// an EOB with the previous table must be written.
+//
+// If lastHuffMan is set, a table for outputting literals has been generated and offsets are invalid.
+//
+// An incoming block estimates the output size of a new table using a 'fresh' by calculating the
+// optimal size and adding a penalty in 'logReusePenalty'.
+// A Huffman table is not optimal, which is why we add a penalty, and generating a new table
+// is slower both for compression and decompression.
+
 func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
 	return &huffmanBitWriter{
 		writer:          w,
-		literalFreq:     make([]int32, maxNumLit),
-		offsetFreq:      make([]int32, offsetCodeCount),
-		codegen:         make([]uint8, maxNumLit+offsetCodeCount+1),
-		literalEncoding: newHuffmanEncoder(maxNumLit),
+		literalEncoding: newHuffmanEncoder(literalCount),
 		codegenEncoding: newHuffmanEncoder(codegenCodeCount),
 		offsetEncoding:  newHuffmanEncoder(offsetCodeCount),
 	}
@ -113,7 +135,42 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
 func (w *huffmanBitWriter) reset(writer io.Writer) {
 	w.writer = writer
 	w.bits, w.nbits, w.nbytes, w.err = 0, 0, 0, nil
-	w.bytes = [bufferSize]byte{}
+	w.bytes = [256]byte{}
+	w.lastHeader = 0
+	w.lastHuffMan = false
+}
+
+func (w *huffmanBitWriter) canReuse(t *tokens) (offsets, lits bool) {
+	offsets, lits = true, true
+	a := t.offHist[:offsetCodeCount]
+	b := w.offsetFreq[:len(a)]
+	for i := range a {
+		if b[i] == 0 && a[i] != 0 {
+			offsets = false
+			break
+		}
+	}
+
+	a = t.extraHist[:literalCount-256]
+	b = w.literalFreq[256:literalCount]
+	b = b[:len(a)]
+	for i := range a {
+		if b[i] == 0 && a[i] != 0 {
+			lits = false
+			break
+		}
+	}
+	if lits {
+		a = t.litHist[:]
+		b = w.literalFreq[:len(a)]
+		for i := range a {
+			if b[i] == 0 && a[i] != 0 {
+				lits = false
+				break
+			}
+		}
+	}
+	return
 }

 func (w *huffmanBitWriter) flush() {
@ -144,30 +201,11 @@ func (w *huffmanBitWriter) write(b []byte) {
 	_, w.err = w.writer.Write(b)
 }

-func (w *huffmanBitWriter) writeBits(b int32, nb uint) {
-	if w.err != nil {
-		return
-	}
-	w.bits |= uint64(b) << w.nbits
+func (w *huffmanBitWriter) writeBits(b int32, nb uint16) {
+	w.bits |= uint64(b) << (w.nbits & 63)
 	w.nbits += nb
 	if w.nbits >= 48 {
-		bits := w.bits
-		w.bits >>= 48
-		w.nbits -= 48
-		n := w.nbytes
-		bytes := w.bytes[n : n+6]
-		bytes[0] = byte(bits)
-		bytes[1] = byte(bits >> 8)
-		bytes[2] = byte(bits >> 16)
-		bytes[3] = byte(bits >> 24)
-		bytes[4] = byte(bits >> 32)
-		bytes[5] = byte(bits >> 40)
-		n += 6
-		if n >= bufferFlushSize {
-			w.write(w.bytes[:n])
-			n = 0
-		}
-		w.nbytes = n
+		w.writeOutBits()
 	}
 }

@ -213,7 +251,7 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE
 	// a copy of the frequencies, and as the place where we put the result.
 	// This is fine because the output is always shorter than the input used
 	// so far.
-	codegen := w.codegen // cache
+	codegen := w.codegen[:] // cache
 	// Copy the concatenated code sizes to codegen. Put a marker at the end.
 	cgnl := codegen[:numLiterals]
 	for i := range cgnl {
@ -292,30 +330,54 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE
 	codegen[outIndex] = badCode
 }

-// dynamicSize returns the size of dynamically encoded data in bits.
-func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) {
+func (w *huffmanBitWriter) codegens() int {
+	numCodegens := len(w.codegenFreq)
+	for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
+		numCodegens--
+	}
+	return numCodegens
+}
+
+func (w *huffmanBitWriter) headerSize() (size, numCodegens int) {
 	numCodegens = len(w.codegenFreq)
 	for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
 		numCodegens--
 	}
-	header := 3 + 5 + 5 + 4 + (3 * numCodegens) +
+	return 3 + 5 + 5 + 4 + (3 * numCodegens) +
 		w.codegenEncoding.bitLength(w.codegenFreq[:]) +
 		int(w.codegenFreq[16])*2 +
 		int(w.codegenFreq[17])*3 +
-		int(w.codegenFreq[18])*7
-	size = header +
-		litEnc.bitLength(w.literalFreq) +
-		offEnc.bitLength(w.offsetFreq) +
-		extraBits
+		int(w.codegenFreq[18])*7, numCodegens
+}

+// dynamicSize returns the size of dynamically encoded data in bits.
+func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) {
+	header, numCodegens := w.headerSize()
+	size = header +
+		litEnc.bitLength(w.literalFreq[:]) +
+		offEnc.bitLength(w.offsetFreq[:]) +
+		extraBits
 	return size, numCodegens
 }

+// extraBitSize will return the number of bits that will be written
+// as "extra" bits on matches.
+func (w *huffmanBitWriter) extraBitSize() int {
+	total := 0
+	for i, n := range w.literalFreq[257:literalCount] {
+		total += int(n) * int(lengthExtraBits[i&31])
+	}
+	for i, n := range w.offsetFreq[:offsetCodeCount] {
+		total += int(n) * int(offsetExtraBits[i&31])
+	}
+	return total
+}
+
 // fixedSize returns the size of dynamically encoded data in bits.
 func (w *huffmanBitWriter) fixedSize(extraBits int) int {
 	return 3 +
-		fixedLiteralEncoding.bitLength(w.literalFreq) +
-		fixedOffsetEncoding.bitLength(w.offsetFreq) +
+		fixedLiteralEncoding.bitLength(w.literalFreq[:]) +
+		fixedOffsetEncoding.bitLength(w.offsetFreq[:]) +
 		extraBits
 }

@ -333,32 +395,38 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) {
 }

 func (w *huffmanBitWriter) writeCode(c hcode) {
-	if w.err != nil {
-		return
-	}
+	// The function does not get inlined if we "& 63" the shift.
 	w.bits |= uint64(c.code) << w.nbits
-	w.nbits += uint(c.len)
+	w.nbits += c.len
 	if w.nbits >= 48 {
-		bits := w.bits
-		w.bits >>= 48
-		w.nbits -= 48
-		n := w.nbytes
-		bytes := w.bytes[n : n+6]
-		bytes[0] = byte(bits)
-		bytes[1] = byte(bits >> 8)
-		bytes[2] = byte(bits >> 16)
-		bytes[3] = byte(bits >> 24)
-		bytes[4] = byte(bits >> 32)
-		bytes[5] = byte(bits >> 40)
-		n += 6
-		if n >= bufferFlushSize {
-			w.write(w.bytes[:n])
-			n = 0
-		}
-		w.nbytes = n
+		w.writeOutBits()
 	}
 }

+// writeOutBits will write bits to the buffer.
+func (w *huffmanBitWriter) writeOutBits() {
+	bits := w.bits
+	w.bits >>= 48
+	w.nbits -= 48
+	n := w.nbytes
+	w.bytes[n] = byte(bits)
+	w.bytes[n+1] = byte(bits >> 8)
+	w.bytes[n+2] = byte(bits >> 16)
+	w.bytes[n+3] = byte(bits >> 24)
+	w.bytes[n+4] = byte(bits >> 32)
+	w.bytes[n+5] = byte(bits >> 40)
+	n += 6
+	if n >= bufferFlushSize {
+		if w.err != nil {
+			n = 0
+			return
+		}
+		w.write(w.bytes[:n])
+		n = 0
+	}
+	w.nbytes = n
+}
+
 // Write the header of a dynamic Huffman block to the output stream.
 //
 //  numLiterals  The number of literals specified in codegen
@ -412,6 +480,11 @@ func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) {
 	if w.err != nil {
 		return
 	}
+	if w.lastHeader > 0 {
+		// We owe an EOB
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
+	}
 	var flag int32
 	if isEof {
 		flag = 1
@ -426,6 +499,12 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) {
 	if w.err != nil {
 		return
 	}
+	if w.lastHeader > 0 {
+		// We owe an EOB
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
+	}
+
 	// Indicate that we are a fixed Huffman block
 	var value int32 = 2
 	if isEof {
@ -439,29 +518,23 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) {
 // is larger than the original bytes, the data will be written as a
 // stored block.
 // If the input is nil, the tokens will always be Huffman encoded.
-func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
+func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) {
 	if w.err != nil {
 		return
 	}

-	tokens = append(tokens, endBlockMarker)
-	numLiterals, numOffsets := w.indexTokens(tokens)
-
+	tokens.AddEOB()
+	if w.lastHeader > 0 {
+		// We owe an EOB
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
+	}
+	numLiterals, numOffsets := w.indexTokens(tokens, false)
+	w.generate(tokens)
 	var extraBits int
 	storedSize, storable := w.storedSize(input)
 	if storable {
-		// We only bother calculating the costs of the extra bits required by
-		// the length of offset fields (which will be the same for both fixed
-		// and dynamic encoding), if we need to compare those two encodings
-		// against stored encoding.
-		for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ {
-			// First eight length codes have extra size = 0.
-			extraBits += int(w.literalFreq[lengthCode]) * int(lengthExtraBits[lengthCode-lengthCodesStart])
-		}
-		for offsetCode := 4; offsetCode < numOffsets; offsetCode++ {
-			// First four offset codes have extra size = 0.
-			extraBits += int(w.offsetFreq[offsetCode]) * int(offsetExtraBits[offsetCode])
-		}
+		extraBits = w.extraBitSize()
 	}

 	// Figure out smallest code.
@ -500,7 +573,7 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
 	}

 	// Write the tokens.
-	w.writeTokens(tokens, literalEncoding.codes, offsetEncoding.codes)
+	w.writeTokens(tokens.Slice(), literalEncoding.codes, offsetEncoding.codes)
 }

 // writeBlockDynamic encodes a block using a dynamic Huffman table.
@ -508,57 +581,103 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
 // histogram distribution.
 // If input is supplied and the compression savings are below 1/16th of the
 // input size the block is stored.
-func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []byte) {
+func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []byte, sync bool) {
 	if w.err != nil {
 		return
 	}

-	tokens = append(tokens, endBlockMarker)
-	numLiterals, numOffsets := w.indexTokens(tokens)
-
-	// Generate codegen and codegenFrequencies, which indicates how to encode
-	// the literalEncoding and the offsetEncoding.
-	w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
-	w.codegenEncoding.generate(w.codegenFreq[:], 7)
-	size, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, 0)
-
-	// Store bytes, if we don't get a reasonable improvement.
-	if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
-		w.writeStoredHeader(len(input), eof)
-		w.writeBytes(input)
-		return
+	sync = sync || eof
+	if sync {
+		tokens.AddEOB()
 	}

-	// Write Huffman table.
-	w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
+	// We cannot reuse pure huffman table.
+	if w.lastHuffMan && w.lastHeader > 0 {
+		// We will not try to reuse.
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
+		w.lastHuffMan = false
+	}
+	if !sync {
+		tokens.Fill()
+	}
+	numLiterals, numOffsets := w.indexTokens(tokens, !sync)

+	var size int
+	// Check if we should reuse.
+	if w.lastHeader > 0 {
+		// Estimate size for using a new table
+		newSize := w.lastHeader + tokens.EstimatedBits()
+
+		// The estimated size is calculated as an optimal table.
+		// We add a penalty to make it more realistic and re-use a bit more.
+		newSize += newSize >> (w.logReusePenalty & 31)
+		extra := w.extraBitSize()
+		reuseSize, _ := w.dynamicSize(w.literalEncoding, w.offsetEncoding, extra)
+
+		// Check if a new table is better.
+		if newSize < reuseSize {
+			// Write the EOB we owe.
+			w.writeCode(w.literalEncoding.codes[endBlockMarker])
+			size = newSize
+			w.lastHeader = 0
+		} else {
+			size = reuseSize
+		}
+		// Check if we get a reasonable size decrease.
+		if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+			w.writeStoredHeader(len(input), eof)
+			w.writeBytes(input)
+			w.lastHeader = 0
+			return
+		}
+	}
+
+	// We want a new block/table
+	if w.lastHeader == 0 {
+		w.generate(tokens)
+		// Generate codegen and codegenFrequencies, which indicates how to encode
+		// the literalEncoding and the offsetEncoding.
+		w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
+		w.codegenEncoding.generate(w.codegenFreq[:], 7)
+		var numCodegens int
+		size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, w.extraBitSize())
+		// Store bytes, if we don't get a reasonable improvement.
+		if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+			w.writeStoredHeader(len(input), eof)
+			w.writeBytes(input)
+			w.lastHeader = 0
+			return
+		}
+
+		// Write Huffman table.
+		w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
+		w.lastHeader, _ = w.headerSize()
+		w.lastHuffMan = false
+	}
+
+	if sync {
+		w.lastHeader = 0
+	}
 	// Write the tokens.
-	w.writeTokens(tokens, w.literalEncoding.codes, w.offsetEncoding.codes)
+	w.writeTokens(tokens.Slice(), w.literalEncoding.codes, w.offsetEncoding.codes)
 }

 // indexTokens indexes a slice of tokens, and updates
 // literalFreq and offsetFreq, and generates literalEncoding
 // and offsetEncoding.
 // The number of literal and offset tokens is returned.
-func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals, numOffsets int) {
-	for i := range w.literalFreq {
-		w.literalFreq[i] = 0
-	}
-	for i := range w.offsetFreq {
-		w.offsetFreq[i] = 0
-	}
+func (w *huffmanBitWriter) indexTokens(t *tokens, filled bool) (numLiterals, numOffsets int) {
+	copy(w.literalFreq[:], t.litHist[:])
+	copy(w.literalFreq[256:], t.extraHist[:])
+	copy(w.offsetFreq[:], t.offHist[:offsetCodeCount])

-	for _, t := range tokens {
-		if t < matchType {
-			w.literalFreq[t.literal()]++
-			continue
-		}
-		length := t.length()
-		offset := t.offset()
-		w.literalFreq[lengthCodesStart+lengthCode(length)]++
-		w.offsetFreq[offsetCode(offset)]++
+	if t.n == 0 {
+		return
+	}
+	if filled {
+		return maxNumLit, maxNumDist
 	}
-
 	// get the number of literals
 	numLiterals = len(w.literalFreq)
 	for w.literalFreq[numLiterals-1] == 0 {
@ -575,41 +694,85 @@ func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals, numOffsets
 		w.offsetFreq[0] = 1
 		numOffsets = 1
 	}
-	w.literalEncoding.generate(w.literalFreq, 15)
-	w.offsetEncoding.generate(w.offsetFreq, 15)
 	return
 }

+func (w *huffmanBitWriter) generate(t *tokens) {
+	w.literalEncoding.generate(w.literalFreq[:literalCount], 15)
+	w.offsetEncoding.generate(w.offsetFreq[:offsetCodeCount], 15)
+}
+
 // writeTokens writes a slice of tokens to the output.
 // codes for literal and offset encoding must be supplied.
 func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) {
 	if w.err != nil {
 		return
 	}
+	if len(tokens) == 0 {
+		return
+	}
+
+	// Only last token should be endBlockMarker.
+	var deferEOB bool
+	if tokens[len(tokens)-1] == endBlockMarker {
+		tokens = tokens[:len(tokens)-1]
+		deferEOB = true
+	}
+
+	// Create slices up to the next power of two to avoid bounds checks.
+	lits := leCodes[:256]
+	offs := oeCodes[:32]
+	lengths := leCodes[lengthCodesStart:]
+	lengths = lengths[:32]
 	for _, t := range tokens {
 		if t < matchType {
-			w.writeCode(leCodes[t.literal()])
+			w.writeCode(lits[t.literal()])
 			continue
 		}
+
 		// Write the length
 		length := t.length()
 		lengthCode := lengthCode(length)
-		w.writeCode(leCodes[lengthCode+lengthCodesStart])
-		extraLengthBits := uint(lengthExtraBits[lengthCode])
+		if false {
+			w.writeCode(lengths[lengthCode&31])
+		} else {
+			// inlined
+			c := lengths[lengthCode&31]
+			w.bits |= uint64(c.code) << (w.nbits & 63)
+			w.nbits += c.len
+			if w.nbits >= 48 {
+				w.writeOutBits()
+			}
+		}
+
+		extraLengthBits := uint16(lengthExtraBits[lengthCode&31])
 		if extraLengthBits > 0 {
-			extraLength := int32(length - lengthBase[lengthCode])
+			extraLength := int32(length - lengthBase[lengthCode&31])
 			w.writeBits(extraLength, extraLengthBits)
 		}
 		// Write the offset
 		offset := t.offset()
 		offsetCode := offsetCode(offset)
-		w.writeCode(oeCodes[offsetCode])
-		extraOffsetBits := uint(offsetExtraBits[offsetCode])
+		if false {
+			w.writeCode(offs[offsetCode&31])
+		} else {
+			// inlined
+			c := offs[offsetCode&31]
+			w.bits |= uint64(c.code) << (w.nbits & 63)
+			w.nbits += c.len
+			if w.nbits >= 48 {
+				w.writeOutBits()
+			}
+		}
+		extraOffsetBits := uint16(offsetExtraBits[offsetCode&63])
 		if extraOffsetBits > 0 {
-			extraOffset := int32(offset - offsetBase[offsetCode])
+			extraOffset := int32(offset - offsetBase[offsetCode&63])
 			w.writeBits(extraOffset, extraOffsetBits)
 		}
 	}
+	if deferEOB {
+		w.writeCode(leCodes[endBlockMarker])
+	}
 }

 // huffOffset is a static offset encoder used for huffman only encoding.
@ -620,82 +783,99 @@ func init() {
 	w := newHuffmanBitWriter(nil)
 	w.offsetFreq[0] = 1
 	huffOffset = newHuffmanEncoder(offsetCodeCount)
-	huffOffset.generate(w.offsetFreq, 15)
+	huffOffset.generate(w.offsetFreq[:offsetCodeCount], 15)
 }

 // writeBlockHuff encodes a block of bytes as either
 // Huffman encoded literals or uncompressed bytes if the
 // results only gains very little from compression.
-func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
+func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 	if w.err != nil {
 		return
 	}

 	// Clear histogram
-	for i := range w.literalFreq {
+	for i := range w.literalFreq[:] {
 		w.literalFreq[i] = 0
 	}
+	if !w.lastHuffMan {
+		for i := range w.offsetFreq[:] {
+			w.offsetFreq[i] = 0
+		}
+	}

 	// Add everything as literals
-	histogram(input, w.literalFreq)
-
-	w.literalFreq[endBlockMarker] = 1
-
-	const numLiterals = endBlockMarker + 1
-	const numOffsets = 1
-
-	w.literalEncoding.generate(w.literalFreq, 15)
-
-	// Figure out smallest code.
-	// Always use dynamic Huffman or Store
-	var numCodegens int
-
-	// Generate codegen and codegenFrequencies, which indicates how to encode
-	// the literalEncoding and the offsetEncoding.
-	w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset)
-	w.codegenEncoding.generate(w.codegenFreq[:], 7)
-	size, numCodegens := w.dynamicSize(w.literalEncoding, huffOffset, 0)
+	estBits := histogramSize(input, w.literalFreq[:], !eof && !sync) + 15

 	// Store bytes, if we don't get a reasonable improvement.
-	if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+	ssize, storable := w.storedSize(input)
+	if storable && ssize < (estBits+estBits>>4) {
 		w.writeStoredHeader(len(input), eof)
 		w.writeBytes(input)
 		return
 	}

-	// Huffman.
-	w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
+	if w.lastHeader > 0 {
+		size, _ := w.dynamicSize(w.literalEncoding, huffOffset, w.lastHeader)
+		estBits += estBits >> (w.logReusePenalty)
+
+		if estBits < size {
+			// We owe an EOB
+			w.writeCode(w.literalEncoding.codes[endBlockMarker])
+			w.lastHeader = 0
+		}
+	}
+
+	const numLiterals = endBlockMarker + 1
+	const numOffsets = 1
+	if w.lastHeader == 0 {
+		w.literalFreq[endBlockMarker] = 1
+		w.literalEncoding.generate(w.literalFreq[:numLiterals], 15)
+
+		// Generate codegen and codegenFrequencies, which indicates how to encode
+		// the literalEncoding and the offsetEncoding.
+		w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset)
+		w.codegenEncoding.generate(w.codegenFreq[:], 7)
+		numCodegens := w.codegens()
+
+		// Huffman.
+		w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
+		w.lastHuffMan = true
+		w.lastHeader, _ = w.headerSize()
+	}
+
 	encoding := w.literalEncoding.codes[:257]
-	n := w.nbytes
 	for _, t := range input {
 		// Bitwriting inlined, ~30% speedup
 		c := encoding[t]
-		w.bits |= uint64(c.code) << w.nbits
-		w.nbits += uint(c.len)
-		if w.nbits < 48 {
-			continue
+		w.bits |= uint64(c.code) << ((w.nbits) & 63)
+		w.nbits += c.len
+		if w.nbits >= 48 {
+			bits := w.bits
+			w.bits >>= 48
+			w.nbits -= 48
+			n := w.nbytes
+			w.bytes[n] = byte(bits)
+			w.bytes[n+1] = byte(bits >> 8)
+			w.bytes[n+2] = byte(bits >> 16)
+			w.bytes[n+3] = byte(bits >> 24)
+			w.bytes[n+4] = byte(bits >> 32)
+			w.bytes[n+5] = byte(bits >> 40)
+			n += 6
+			if n >= bufferFlushSize {
+				if w.err != nil {
+					n = 0
+					return
+				}
+				w.write(w.bytes[:n])
+				n = 0
+			}
+			w.nbytes = n
 		}
-		// Store 6 bytes
-		bits := w.bits
-		w.bits >>= 48
-		w.nbits -= 48
-		bytes := w.bytes[n : n+6]
-		bytes[0] = byte(bits)
-		bytes[1] = byte(bits >> 8)
-		bytes[2] = byte(bits >> 16)
-		bytes[3] = byte(bits >> 24)
-		bytes[4] = byte(bits >> 32)
-		bytes[5] = byte(bits >> 40)
-		n += 6
-		if n < bufferFlushSize {
-			continue
-		}
-		w.write(w.bytes[:n])
-		if w.err != nil {
-			return // Return early in the event of write failures
-		}
-		n = 0
 	}
-	w.nbytes = n
-	w.writeCode(encoding[endBlockMarker])
+	if eof || sync {
+		w.writeCode(encoding[endBlockMarker])
+		w.lastHeader = 0
+		w.lastHuffMan = false
+	}
 }
--- a/vendor/github.com/klauspost/compress/flate/huffman_code.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go
@ -6,9 +6,16 @@ package flate

 import (
 	"math"
+	"math/bits"
 	"sort"
 )

+const (
+	maxBitsLimit = 16
+	// number of valid literals
+	literalCount = 286
+)
+
 // hcode is a huffman code with a bit code and bit length.
 type hcode struct {
 	code, len uint16
@ -24,7 +31,7 @@ type huffmanEncoder struct {

 type literalNode struct {
 	literal uint16
-	freq    int32
+	freq    uint16
 }

 // A levelInfo describes the state of the constructed tree for a given depth.
@ -53,18 +60,24 @@ func (h *hcode) set(code uint16, length uint16) {
 	h.code = code
 }

-func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxInt32} }
+func reverseBits(number uint16, bitLength byte) uint16 {
+	return bits.Reverse16(number << ((16 - bitLength) & 15))
+}
+
+func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxUint16} }

 func newHuffmanEncoder(size int) *huffmanEncoder {
-	return &huffmanEncoder{codes: make([]hcode, size)}
+	// Make capacity to next power of two.
+	c := uint(bits.Len32(uint32(size - 1)))
+	return &huffmanEncoder{codes: make([]hcode, size, 1<<c)}
 }

 // Generates a HuffmanCode corresponding to the fixed literal table
 func generateFixedLiteralEncoding() *huffmanEncoder {
-	h := newHuffmanEncoder(maxNumLit)
+	h := newHuffmanEncoder(literalCount)
 	codes := h.codes
 	var ch uint16
-	for ch = 0; ch < maxNumLit; ch++ {
+	for ch = 0; ch < literalCount; ch++ {
 		var bits uint16
 		var size uint16
 		switch {
@ -105,7 +118,7 @@ func generateFixedOffsetEncoding() *huffmanEncoder {
 var fixedLiteralEncoding *huffmanEncoder = generateFixedLiteralEncoding()
 var fixedOffsetEncoding *huffmanEncoder = generateFixedOffsetEncoding()

-func (h *huffmanEncoder) bitLength(freq []int32) int {
+func (h *huffmanEncoder) bitLength(freq []uint16) int {
 	var total int
 	for i, f := range freq {
 		if f != 0 {
@ -115,8 +128,6 @@ func (h *huffmanEncoder) bitLength(freq []int32) int {
 	return total
 }

-const maxBitsLimit = 16
-
 // Return the number of literals assigned to each bit size in the Huffman encoding
 //
 // This method is only called when list.length >= 3
@ -160,9 +171,9 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
 		// We initialize the levels as if we had already figured this out.
 		levels[level] = levelInfo{
 			level:        level,
-			lastFreq:     list[1].freq,
-			nextCharFreq: list[2].freq,
-			nextPairFreq: list[0].freq + list[1].freq,
+			lastFreq:     int32(list[1].freq),
+			nextCharFreq: int32(list[2].freq),
+			nextPairFreq: int32(list[0].freq) + int32(list[1].freq),
 		}
 		leafCounts[level][level] = 2
 		if level == 1 {
@ -194,7 +205,12 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
 			l.lastFreq = l.nextCharFreq
 			// Lower leafCounts are the same of the previous node.
 			leafCounts[level][level] = n
-			l.nextCharFreq = list[n].freq
+			e := list[n]
+			if e.literal < math.MaxUint16 {
+				l.nextCharFreq = int32(e.freq)
+			} else {
+				l.nextCharFreq = math.MaxInt32
+			}
 		} else {
 			// The next item on this row is a pair from the previous row.
 			// nextPairFreq isn't valid until we generate two
@ -270,12 +286,12 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN
 //
 // freq  An array of frequencies, in which frequency[i] gives the frequency of literal i.
 // maxBits  The maximum number of bits to use for any literal.
-func (h *huffmanEncoder) generate(freq []int32, maxBits int32) {
+func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
 	if h.freqcache == nil {
 		// Allocate a reusable buffer with the longest possible frequency table.
-		// Possible lengths are codegenCodeCount, offsetCodeCount and maxNumLit.
-		// The largest of these is maxNumLit, so we allocate for that case.
-		h.freqcache = make([]literalNode, maxNumLit+1)
+		// Possible lengths are codegenCodeCount, offsetCodeCount and literalCount.
+		// The largest of these is literalCount, so we allocate for that case.
+		h.freqcache = make([]literalNode, literalCount+1)
 	}
 	list := h.freqcache[:len(freq)+1]
 	// Number of non-zero literals
@ -342,3 +358,27 @@ func (s byFreq) Less(i, j int) bool {
 }

 func (s byFreq) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
+
+// histogramSize accumulates a histogram of b in h.
+// An estimated size in bits is returned.
+// Unassigned values are assigned '1' in the histogram.
+// len(h) must be >= 256, and h's elements must be all zeroes.
+func histogramSize(b []byte, h []uint16, fill bool) int {
+	h = h[:256]
+	for _, t := range b {
+		h[t]++
+	}
+	invTotal := 1.0 / float64(len(b))
+	shannon := 0.0
+	single := math.Ceil(-math.Log2(invTotal))
+	for i, v := range h[:] {
+		if v > 0 {
+			n := float64(v)
+			shannon += math.Ceil(-math.Log2(n*invTotal) * n)
+		} else if fill {
+			shannon += single
+			h[i] = 1
+		}
+	}
+	return int(shannon + 0.99)
+}
--- a/vendor/github.com/klauspost/compress/flate/inflate.go
+++ b/vendor/github.com/klauspost/compress/flate/inflate.go
@ -9,19 +9,24 @@ package flate

 import (
 	"bufio"
+	"fmt"
 	"io"
+	"math/bits"
 	"strconv"
 	"sync"
 )

 const (
-	maxCodeLen = 16 // max length of Huffman code
+	maxCodeLen     = 16 // max length of Huffman code
+	maxCodeLenMask = 15 // mask for max length of Huffman code
 	// The next three numbers come from the RFC section 3.2.7, with the
 	// additional proviso in section 3.2.5 which implies that distance codes
 	// 30 and 31 should never occur in compressed data.
 	maxNumLit  = 286
 	maxNumDist = 30
 	numCodes   = 19 // number of codes in Huffman meta-code
+
+	debugDecode = false
 )

 // Initialize the fixedHuffmanDecoder only once upon first use.
@ -101,10 +106,10 @@ const (
 )

 type huffmanDecoder struct {
-	min      int                      // the minimum code length
-	chunks   [huffmanNumChunks]uint32 // chunks as described above
-	links    [][]uint32               // overflow links
-	linkMask uint32                   // mask the width of the link table
+	min      int                       // the minimum code length
+	chunks   *[huffmanNumChunks]uint16 // chunks as described above
+	links    [][]uint16                // overflow links
+	linkMask uint32                    // mask the width of the link table
 }

 // Initialize Huffman decoding tables from array of code lengths.
@ -112,21 +117,24 @@ type huffmanDecoder struct {
 // tree (i.e., neither over-subscribed nor under-subscribed). The exception is a
 // degenerate case where the tree has only a single symbol with length 1. Empty
 // trees are permitted.
-func (h *huffmanDecoder) init(bits []int) bool {
+func (h *huffmanDecoder) init(lengths []int) bool {
 	// Sanity enables additional runtime tests during Huffman
 	// table construction. It's intended to be used during
 	// development to supplement the currently ad-hoc unit tests.
 	const sanity = false

+	if h.chunks == nil {
+		h.chunks = &[huffmanNumChunks]uint16{}
+	}
 	if h.min != 0 {
-		*h = huffmanDecoder{}
+		*h = huffmanDecoder{chunks: h.chunks, links: h.links}
 	}

 	// Count number of codes of each length,
 	// compute min and max length.
 	var count [maxCodeLen]int
 	var min, max int
-	for _, n := range bits {
+	for _, n := range lengths {
 		if n == 0 {
 			continue
 		}
@ -136,7 +144,7 @@ func (h *huffmanDecoder) init(bits []int) bool {
 		if n > max {
 			max = n
 		}
-		count[n]++
+		count[n&maxCodeLenMask]++
 	}

 	// Empty tree. The decompressor.huffSym function will fail later if the tree
@ -154,8 +162,8 @@ func (h *huffmanDecoder) init(bits []int) bool {
 	var nextcode [maxCodeLen]int
 	for i := min; i <= max; i++ {
 		code <<= 1
-		nextcode[i] = code
-		code += count[i]
+		nextcode[i&maxCodeLenMask] = code
+		code += count[i&maxCodeLenMask]
 	}

 	// Check that the coding is complete (i.e., that we've
@ -164,37 +172,56 @@ func (h *huffmanDecoder) init(bits []int) bool {
 	// accept degenerate single-code codings. See also
 	// TestDegenerateHuffmanCoding.
 	if code != 1<<uint(max) && !(code == 1 && max == 1) {
+		if debugDecode {
+			fmt.Println("coding failed, code, max:", code, max, code == 1<<uint(max), code == 1 && max == 1, "(one should be true)")
+		}
 		return false
 	}

 	h.min = min
+	chunks := h.chunks[:]
+	for i := range chunks {
+		chunks[i] = 0
+	}
+
 	if max > huffmanChunkBits {
 		numLinks := 1 << (uint(max) - huffmanChunkBits)
 		h.linkMask = uint32(numLinks - 1)

 		// create link tables
 		link := nextcode[huffmanChunkBits+1] >> 1
-		h.links = make([][]uint32, huffmanNumChunks-link)
+		if cap(h.links) < huffmanNumChunks-link {
+			h.links = make([][]uint16, huffmanNumChunks-link)
+		} else {
+			h.links = h.links[:huffmanNumChunks-link]
+		}
 		for j := uint(link); j < huffmanNumChunks; j++ {
-			reverse := int(reverseByte[j>>8]) | int(reverseByte[j&0xff])<<8
+			reverse := int(bits.Reverse16(uint16(j)))
 			reverse >>= uint(16 - huffmanChunkBits)
 			off := j - uint(link)
 			if sanity && h.chunks[reverse] != 0 {
 				panic("impossible: overwriting existing chunk")
 			}
-			h.chunks[reverse] = uint32(off<<huffmanValueShift | (huffmanChunkBits + 1))
-			h.links[off] = make([]uint32, numLinks)
+			h.chunks[reverse] = uint16(off<<huffmanValueShift | (huffmanChunkBits + 1))
+			if cap(h.links[off]) < numLinks {
+				h.links[off] = make([]uint16, numLinks)
+			} else {
+				links := h.links[off][:0]
+				h.links[off] = links[:numLinks]
+			}
 		}
+	} else {
+		h.links = h.links[:0]
 	}

-	for i, n := range bits {
+	for i, n := range lengths {
 		if n == 0 {
 			continue
 		}
 		code := nextcode[n]
 		nextcode[n]++
-		chunk := uint32(i<<huffmanValueShift | n)
-		reverse := int(reverseByte[code>>8]) | int(reverseByte[code&0xff])<<8
+		chunk := uint16(i<<huffmanValueShift | n)
+		reverse := int(bits.Reverse16(uint16(code)))
 		reverse >>= uint(16 - n)
 		if n <= huffmanChunkBits {
 			for off := reverse; off < len(h.chunks); off += 1 << uint(n) {
@ -326,6 +353,9 @@ func (f *decompressor) nextBlock() {
 		f.huffmanBlock()
 	default:
 		// 3 is reserved.
+		if debugDecode {
+			fmt.Println("reserved data block encountered")
+		}
 		f.err = CorruptInputError(f.roffset)
 	}
 }
@ -404,11 +434,17 @@ func (f *decompressor) readHuffman() error {
 	}
 	nlit := int(f.b&0x1F) + 257
 	if nlit > maxNumLit {
+		if debugDecode {
+			fmt.Println("nlit > maxNumLit", nlit)
+		}
 		return CorruptInputError(f.roffset)
 	}
 	f.b >>= 5
 	ndist := int(f.b&0x1F) + 1
 	if ndist > maxNumDist {
+		if debugDecode {
+			fmt.Println("ndist > maxNumDist", ndist)
+		}
 		return CorruptInputError(f.roffset)
 	}
 	f.b >>= 5
@ -432,6 +468,9 @@ func (f *decompressor) readHuffman() error {
 		f.codebits[codeOrder[i]] = 0
 	}
 	if !f.h1.init(f.codebits[0:]) {
+		if debugDecode {
+			fmt.Println("init codebits failed")
+		}
 		return CorruptInputError(f.roffset)
 	}

@ -459,6 +498,9 @@ func (f *decompressor) readHuffman() error {
 			rep = 3
 			nb = 2
 			if i == 0 {
+				if debugDecode {
+					fmt.Println("i==0")
+				}
 				return CorruptInputError(f.roffset)
 			}
 			b = f.bits[i-1]
@ -473,6 +515,9 @@ func (f *decompressor) readHuffman() error {
 		}
 		for f.nb < nb {
 			if err := f.moreBits(); err != nil {
+				if debugDecode {
+					fmt.Println("morebits:", err)
+				}
 				return err
 			}
 		}
@ -480,6 +525,9 @@ func (f *decompressor) readHuffman() error {
 		f.b >>= nb
 		f.nb -= nb
 		if i+rep > n {
+			if debugDecode {
+				fmt.Println("i+rep > n", i, rep, n)
+			}
 			return CorruptInputError(f.roffset)
 		}
 		for j := 0; j < rep; j++ {
@ -489,6 +537,9 @@ func (f *decompressor) readHuffman() error {
 	}

 	if !f.h1.init(f.bits[0:nlit]) || !f.h2.init(f.bits[nlit:nlit+ndist]) {
+		if debugDecode {
+			fmt.Println("init2 failed")
+		}
 		return CorruptInputError(f.roffset)
 	}

@ -566,12 +617,18 @@ readLiteral:
 			length = 258
 			n = 0
 		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
 			f.err = CorruptInputError(f.roffset)
 			return
 		}
 		if n > 0 {
 			for f.nb < n {
 				if err = f.moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
 					f.err = err
 					return
 				}
@ -585,15 +642,21 @@ readLiteral:
 		if f.hd == nil {
 			for f.nb < 5 {
 				if err = f.moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
 					f.err = err
 					return
 				}
 			}
-			dist = int(reverseByte[(f.b&0x1F)<<3])
+			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
 			f.b >>= 5
 			f.nb -= 5
 		} else {
 			if dist, err = f.huffSym(f.hd); err != nil {
+				if debugDecode {
+					fmt.Println("huffsym:", err)
+				}
 				f.err = err
 				return
 			}
@ -608,6 +671,9 @@ readLiteral:
 			extra := (dist & 1) << nb
 			for f.nb < nb {
 				if err = f.moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
 					f.err = err
 					return
 				}
@ -617,12 +683,18 @@ readLiteral:
 			f.nb -= nb
 			dist = 1<<(nb+1) + 1 + extra
 		default:
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
 			f.err = CorruptInputError(f.roffset)
 			return
 		}

 		// No check on length; encoding can be prescient.
 		if dist > f.dict.histSize() {
+			if debugDecode {
+				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+			}
 			f.err = CorruptInputError(f.roffset)
 			return
 		}
@ -661,15 +733,15 @@ func (f *decompressor) dataBlock() {
 	nr, err := io.ReadFull(f.r, f.buf[0:4])
 	f.roffset += int64(nr)
 	if err != nil {
-		if err == io.EOF {
-			err = io.ErrUnexpectedEOF
-		}
-		f.err = err
+		f.err = noEOF(err)
 		return
 	}
 	n := int(f.buf[0]) | int(f.buf[1])<<8
 	nn := int(f.buf[2]) | int(f.buf[3])<<8
 	if uint16(nn) != uint16(^n) {
+		if debugDecode {
+			fmt.Println("uint16(nn) != uint16(^n)", nn, ^n)
+		}
 		f.err = CorruptInputError(f.roffset)
 		return
 	}
@ -697,10 +769,7 @@ func (f *decompressor) copyData() {
 	f.copyLen -= cnt
 	f.dict.writeMark(cnt)
 	if err != nil {
-		if err == io.EOF {
-			err = io.ErrUnexpectedEOF
-		}
-		f.err = err
+		f.err = noEOF(err)
 		return
 	}

@ -722,13 +791,18 @@ func (f *decompressor) finishBlock() {
 	f.step = (*decompressor).nextBlock
 }

+// noEOF returns err, unless err == io.EOF, in which case it returns io.ErrUnexpectedEOF.
+func noEOF(e error) error {
+	if e == io.EOF {
+		return io.ErrUnexpectedEOF
+	}
+	return e
+}
+
 func (f *decompressor) moreBits() error {
 	c, err := f.r.ReadByte()
 	if err != nil {
-		if err == io.EOF {
-			err = io.ErrUnexpectedEOF
-		}
-		return err
+		return noEOF(err)
 	}
 	f.roffset++
 	f.b |= uint32(c) << f.nb
@ -743,25 +817,40 @@ func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) {
 	// cases, the chunks slice will be 0 for the invalid sequence, leading it
 	// satisfy the n == 0 check below.
 	n := uint(h.min)
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	nb, b := f.nb, f.b
 	for {
-		for f.nb < n {
-			if err := f.moreBits(); err != nil {
-				return 0, err
+		for nb < n {
+			c, err := f.r.ReadByte()
+			if err != nil {
+				f.b = b
+				f.nb = nb
+				return 0, noEOF(err)
 			}
+			f.roffset++
+			b |= uint32(c) << (nb & 31)
+			nb += 8
 		}
-		chunk := h.chunks[f.b&(huffmanNumChunks-1)]
+		chunk := h.chunks[b&(huffmanNumChunks-1)]
 		n = uint(chunk & huffmanCountMask)
 		if n > huffmanChunkBits {
-			chunk = h.links[chunk>>huffmanValueShift][(f.b>>huffmanChunkBits)&h.linkMask]
+			chunk = h.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&h.linkMask]
 			n = uint(chunk & huffmanCountMask)
 		}
-		if n <= f.nb {
+		if n <= nb {
 			if n == 0 {
+				f.b = b
+				f.nb = nb
+				if debugDecode {
+					fmt.Println("huffsym: n==0")
+				}
 				f.err = CorruptInputError(f.roffset)
 				return 0, f.err
 			}
-			f.b >>= n
-			f.nb -= n
+			f.b = b >> (n & 31)
+			f.nb = nb - n
 			return int(chunk >> huffmanValueShift), nil
 		}
 	}
@ -799,6 +888,8 @@ func (f *decompressor) Reset(r io.Reader, dict []byte) error {
 		r:        makeReader(r),
 		bits:     f.bits,
 		codebits: f.codebits,
+		h1:       f.h1,
+		h2:       f.h2,
 		dict:     f.dict,
 		step:     (*decompressor).nextBlock,
 	}
--- a/vendor/github.com/klauspost/compress/flate/level1.go
+++ b/vendor/github.com/klauspost/compress/flate/level1.go
@ -0,0 +1,174 @@
+package flate
+
+// fastGen maintains the table for matches,
+// and the previous byte block for level 2.
+// This is the generic implementation.
+type fastEncL1 struct {
+	fastGen
+	table [tableSize]tableEntry
+}
+
+// EncodeL1 uses a similar algorithm to level 1
+func (e *fastEncL1) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.table[i].offset = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	cv := load3232(src, s)
+
+	for {
+		const skipLog = 5
+		const doEvery = 2
+
+		nextS := s
+		var candidate tableEntry
+		for {
+			nextHash := hash(cv)
+			candidate = e.table[nextHash]
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+
+			now := load6432(src, nextS)
+			e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv}
+			nextHash = hash(uint32(now))
+
+			offset := s - (candidate.offset - e.cur)
+			if offset < maxMatchOffset && cv == candidate.val {
+				e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)}
+				break
+			}
+
+			// Do one right away...
+			cv = uint32(now)
+			s = nextS
+			nextS++
+			candidate = e.table[nextHash]
+			now >>= 8
+			e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv}
+
+			offset = s - (candidate.offset - e.cur)
+			if offset < maxMatchOffset && cv == candidate.val {
+				e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)}
+				break
+			}
+			cv = uint32(now)
+			s = nextS
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+
+			// Extend the 4-byte match as long as possible.
+			t := candidate.offset - e.cur
+			l := e.matchlenLong(s+4, t+4, src) + 4
+
+			// Extend backwards
+			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+				s--
+				t--
+				l++
+			}
+			if nextEmit < s {
+				emitLiteral(dst, src[nextEmit:s])
+			}
+
+			// Save the match found
+			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			s += l
+			nextEmit = s
+			if nextS >= s {
+				s = nextS + 1
+			}
+			if s >= sLimit {
+				// Index first pair after match end.
+				if int(s+l+4) < len(src) {
+					cv := load3232(src, s)
+					e.table[hash(cv)] = tableEntry{offset: s + e.cur, val: cv}
+				}
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-2 and at s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load6432(src, s-2)
+			o := e.cur + s - 2
+			prevHash := hash(uint32(x))
+			e.table[prevHash] = tableEntry{offset: o, val: uint32(x)}
+			x >>= 16
+			currHash := hash(uint32(x))
+			candidate = e.table[currHash]
+			e.table[currHash] = tableEntry{offset: o + 2, val: uint32(x)}
+
+			offset := s - (candidate.offset - e.cur)
+			if offset > maxMatchOffset || uint32(x) != candidate.val {
+				cv = uint32(x >> 8)
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
--- a/vendor/github.com/klauspost/compress/flate/level2.go
+++ b/vendor/github.com/klauspost/compress/flate/level2.go
@ -0,0 +1,199 @@
+package flate
+
+// fastGen maintains the table for matches,
+// and the previous byte block for level 2.
+// This is the generic implementation.
+type fastEncL2 struct {
+	fastGen
+	table [bTableSize]tableEntry
+}
+
+// EncodeL2 uses a similar algorithm to level 1, but is capable
+// of matching across blocks giving better compression at a small slowdown.
+func (e *fastEncL2) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.table[i].offset = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	cv := load3232(src, s)
+	for {
+		// When should we start skipping if we haven't found matches in a long while.
+		const skipLog = 5
+		const doEvery = 2
+
+		nextS := s
+		var candidate tableEntry
+		for {
+			nextHash := hash4u(cv, bTableBits)
+			s = nextS
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			candidate = e.table[nextHash]
+			now := load6432(src, nextS)
+			e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv}
+			nextHash = hash4u(uint32(now), bTableBits)
+
+			offset := s - (candidate.offset - e.cur)
+			if offset < maxMatchOffset && cv == candidate.val {
+				e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)}
+				break
+			}
+
+			// Do one right away...
+			cv = uint32(now)
+			s = nextS
+			nextS++
+			candidate = e.table[nextHash]
+			now >>= 8
+			e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv}
+
+			offset = s - (candidate.offset - e.cur)
+			if offset < maxMatchOffset && cv == candidate.val {
+				break
+			}
+			cv = uint32(now)
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+
+			// Extend the 4-byte match as long as possible.
+			t := candidate.offset - e.cur
+			l := e.matchlenLong(s+4, t+4, src) + 4
+
+			// Extend backwards
+			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+				s--
+				t--
+				l++
+			}
+			if nextEmit < s {
+				emitLiteral(dst, src[nextEmit:s])
+			}
+
+			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			s += l
+			nextEmit = s
+			if nextS >= s {
+				s = nextS + 1
+			}
+
+			if s >= sLimit {
+				// Index first pair after match end.
+				if int(s+l+4) < len(src) {
+					cv := load3232(src, s)
+					e.table[hash4u(cv, bTableBits)] = tableEntry{offset: s + e.cur, val: cv}
+				}
+				goto emitRemainder
+			}
+
+			// Store every second hash in-between, but offset by 1.
+			for i := s - l + 2; i < s-5; i += 7 {
+				x := load6432(src, int32(i))
+				nextHash := hash4u(uint32(x), bTableBits)
+				e.table[nextHash] = tableEntry{offset: e.cur + i, val: uint32(x)}
+				// Skip one
+				x >>= 16
+				nextHash = hash4u(uint32(x), bTableBits)
+				e.table[nextHash] = tableEntry{offset: e.cur + i + 2, val: uint32(x)}
+				// Skip one
+				x >>= 16
+				nextHash = hash4u(uint32(x), bTableBits)
+				e.table[nextHash] = tableEntry{offset: e.cur + i + 4, val: uint32(x)}
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-2 to s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load6432(src, s-2)
+			o := e.cur + s - 2
+			prevHash := hash4u(uint32(x), bTableBits)
+			prevHash2 := hash4u(uint32(x>>8), bTableBits)
+			e.table[prevHash] = tableEntry{offset: o, val: uint32(x)}
+			e.table[prevHash2] = tableEntry{offset: o + 1, val: uint32(x >> 8)}
+			currHash := hash4u(uint32(x>>16), bTableBits)
+			candidate = e.table[currHash]
+			e.table[currHash] = tableEntry{offset: o + 2, val: uint32(x >> 16)}
+
+			offset := s - (candidate.offset - e.cur)
+			if offset > maxMatchOffset || uint32(x>>16) != candidate.val {
+				cv = uint32(x >> 24)
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
--- a/vendor/github.com/klauspost/compress/flate/level3.go
+++ b/vendor/github.com/klauspost/compress/flate/level3.go
@ -0,0 +1,225 @@
+package flate
+
+// fastEncL3
+type fastEncL3 struct {
+	fastGen
+	table [tableSize]tableEntryPrev
+}
+
+// Encode uses a similar algorithm to level 2, will check up to two candidates.
+func (e *fastEncL3) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 8 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntryPrev{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i]
+			if v.Cur.offset <= minOff {
+				v.Cur.offset = 0
+			} else {
+				v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset
+			}
+			if v.Prev.offset <= minOff {
+				v.Prev.offset = 0
+			} else {
+				v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset
+			}
+			e.table[i] = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// Skip if too small.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	cv := load3232(src, s)
+	for {
+		const skipLog = 6
+		nextS := s
+		var candidate tableEntry
+		for {
+			nextHash := hash(cv)
+			s = nextS
+			nextS = s + 1 + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			candidates := e.table[nextHash]
+			now := load3232(src, nextS)
+			e.table[nextHash] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}}
+
+			// Check both candidates
+			candidate = candidates.Cur
+			offset := s - (candidate.offset - e.cur)
+			if cv == candidate.val {
+				if offset > maxMatchOffset {
+					cv = now
+					// Previous will also be invalid, we have nothing.
+					continue
+				}
+				o2 := s - (candidates.Prev.offset - e.cur)
+				if cv != candidates.Prev.val || o2 > maxMatchOffset {
+					break
+				}
+				// Both match and are valid, pick longest.
+				l1, l2 := matchLen(src[s+4:], src[s-offset+4:]), matchLen(src[s+4:], src[s-o2+4:])
+				if l2 > l1 {
+					candidate = candidates.Prev
+				}
+				break
+			} else {
+				// We only check if value mismatches.
+				// Offset will always be invalid in other cases.
+				candidate = candidates.Prev
+				if cv == candidate.val {
+					offset := s - (candidate.offset - e.cur)
+					if offset <= maxMatchOffset {
+						break
+					}
+				}
+			}
+			cv = now
+		}
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+
+			// Extend the 4-byte match as long as possible.
+			//
+			t := candidate.offset - e.cur
+			l := e.matchlenLong(s+4, t+4, src) + 4
+
+			// Extend backwards
+			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+				s--
+				t--
+				l++
+			}
+			if nextEmit < s {
+				emitLiteral(dst, src[nextEmit:s])
+			}
+
+			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			s += l
+			nextEmit = s
+			if nextS >= s {
+				s = nextS + 1
+			}
+
+			if s >= sLimit {
+				t += l
+				// Index first pair after match end.
+				if int(t+4) < len(src) && t > 0 {
+					cv := load3232(src, t)
+					nextHash := hash(cv)
+					e.table[nextHash] = tableEntryPrev{
+						Prev: e.table[nextHash].Cur,
+						Cur:  tableEntry{offset: e.cur + t, val: cv},
+					}
+				}
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-3 to s.
+			x := load6432(src, s-3)
+			prevHash := hash(uint32(x))
+			e.table[prevHash] = tableEntryPrev{
+				Prev: e.table[prevHash].Cur,
+				Cur:  tableEntry{offset: e.cur + s - 3, val: uint32(x)},
+			}
+			x >>= 8
+			prevHash = hash(uint32(x))
+
+			e.table[prevHash] = tableEntryPrev{
+				Prev: e.table[prevHash].Cur,
+				Cur:  tableEntry{offset: e.cur + s - 2, val: uint32(x)},
+			}
+			x >>= 8
+			prevHash = hash(uint32(x))
+
+			e.table[prevHash] = tableEntryPrev{
+				Prev: e.table[prevHash].Cur,
+				Cur:  tableEntry{offset: e.cur + s - 1, val: uint32(x)},
+			}
+			x >>= 8
+			currHash := hash(uint32(x))
+			candidates := e.table[currHash]
+			cv = uint32(x)
+			e.table[currHash] = tableEntryPrev{
+				Prev: candidates.Cur,
+				Cur:  tableEntry{offset: s + e.cur, val: cv},
+			}
+
+			// Check both candidates
+			candidate = candidates.Cur
+			if cv == candidate.val {
+				offset := s - (candidate.offset - e.cur)
+				if offset <= maxMatchOffset {
+					continue
+				}
+			} else {
+				// We only check if value mismatches.
+				// Offset will always be invalid in other cases.
+				candidate = candidates.Prev
+				if cv == candidate.val {
+					offset := s - (candidate.offset - e.cur)
+					if offset <= maxMatchOffset {
+						continue
+					}
+				}
+			}
+			cv = uint32(x >> 8)
+			s++
+			break
+		}
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
--- a/vendor/github.com/klauspost/compress/flate/level4.go
+++ b/vendor/github.com/klauspost/compress/flate/level4.go
@ -0,0 +1,210 @@
+package flate
+
+import "fmt"
+
+type fastEncL4 struct {
+	fastGen
+	table  [tableSize]tableEntry
+	bTable [tableSize]tableEntry
+}
+
+func (e *fastEncL4) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.bTable[:] {
+				e.bTable[i] = tableEntry{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.bTable[:] {
+			v := e.bTable[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.bTable[i].offset = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	cv := load6432(src, s)
+	for {
+		const skipLog = 6
+		const doEvery = 1
+
+		nextS := s
+		var t int32
+		for {
+			nextHashS := hash4x64(cv, tableBits)
+			nextHashL := hash7(cv, tableBits)
+
+			s = nextS
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			// Fetch a short+long candidate
+			sCandidate := e.table[nextHashS]
+			lCandidate := e.bTable[nextHashL]
+			next := load6432(src, nextS)
+			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.table[nextHashS] = entry
+			e.bTable[nextHashL] = entry
+
+			t = lCandidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == lCandidate.val {
+				// We got a long match. Use that.
+				break
+			}
+
+			t = sCandidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == sCandidate.val {
+				// Found a 4 match...
+				lCandidate = e.bTable[hash7(next, tableBits)]
+
+				// If the next long is a candidate, check if we should use that instead...
+				lOff := nextS - (lCandidate.offset - e.cur)
+				if lOff < maxMatchOffset && lCandidate.val == uint32(next) {
+					l1, l2 := matchLen(src[s+4:], src[t+4:]), matchLen(src[nextS+4:], src[nextS-lOff+4:])
+					if l2 > l1 {
+						s = nextS
+						t = lCandidate.offset - e.cur
+					}
+				}
+				break
+			}
+			cv = next
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+
+		// Extend the 4-byte match as long as possible.
+		l := e.matchlenLong(s+4, t+4, src) + 4
+
+		// Extend backwards
+		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+			s--
+			t--
+			l++
+		}
+		if nextEmit < s {
+			emitLiteral(dst, src[nextEmit:s])
+		}
+		if false {
+			if t >= s {
+				panic("s-t")
+			}
+			if (s - t) > maxMatchOffset {
+				panic(fmt.Sprintln("mmo", t))
+			}
+			if l < baseMatchLength {
+				panic("bml")
+			}
+		}
+
+		dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+		s += l
+		nextEmit = s
+		if nextS >= s {
+			s = nextS + 1
+		}
+
+		if s >= sLimit {
+			// Index first pair after match end.
+			if int(s+8) < len(src) {
+				cv := load6432(src, s)
+				e.table[hash4x64(cv, tableBits)] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+				e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			}
+			goto emitRemainder
+		}
+
+		// Store every 3rd hash in-between
+		if true {
+			i := nextS
+			if i < s-1 {
+				cv := load6432(src, i)
+				t := tableEntry{offset: i + e.cur, val: uint32(cv)}
+				t2 := tableEntry{val: uint32(cv >> 8), offset: t.offset + 1}
+				e.bTable[hash7(cv, tableBits)] = t
+				e.bTable[hash7(cv>>8, tableBits)] = t2
+				e.table[hash4u(t2.val, tableBits)] = t2
+
+				i += 3
+				for ; i < s-1; i += 3 {
+					cv := load6432(src, i)
+					t := tableEntry{offset: i + e.cur, val: uint32(cv)}
+					t2 := tableEntry{val: uint32(cv >> 8), offset: t.offset + 1}
+					e.bTable[hash7(cv, tableBits)] = t
+					e.bTable[hash7(cv>>8, tableBits)] = t2
+					e.table[hash4u(t2.val, tableBits)] = t2
+				}
+			}
+		}
+
+		// We could immediately start working at s now, but to improve
+		// compression we first update the hash table at s-1 and at s.
+		x := load6432(src, s-1)
+		o := e.cur + s - 1
+		prevHashS := hash4x64(x, tableBits)
+		prevHashL := hash7(x, tableBits)
+		e.table[prevHashS] = tableEntry{offset: o, val: uint32(x)}
+		e.bTable[prevHashL] = tableEntry{offset: o, val: uint32(x)}
+		cv = x >> 8
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
--- a/vendor/github.com/klauspost/compress/flate/level5.go
+++ b/vendor/github.com/klauspost/compress/flate/level5.go
@ -0,0 +1,276 @@
+package flate
+
+import "fmt"
+
+type fastEncL5 struct {
+	fastGen
+	table  [tableSize]tableEntry
+	bTable [tableSize]tableEntryPrev
+}
+
+func (e *fastEncL5) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.bTable[:] {
+				e.bTable[i] = tableEntryPrev{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.bTable[:] {
+			v := e.bTable[i]
+			if v.Cur.offset <= minOff {
+				v.Cur.offset = 0
+				v.Prev.offset = 0
+			} else {
+				v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset
+				if v.Prev.offset <= minOff {
+					v.Prev.offset = 0
+				} else {
+					v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset
+				}
+			}
+			e.bTable[i] = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	cv := load6432(src, s)
+	for {
+		const skipLog = 6
+		const doEvery = 1
+
+		nextS := s
+		var l int32
+		var t int32
+		for {
+			nextHashS := hash4x64(cv, tableBits)
+			nextHashL := hash7(cv, tableBits)
+
+			s = nextS
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			// Fetch a short+long candidate
+			sCandidate := e.table[nextHashS]
+			lCandidate := e.bTable[nextHashL]
+			next := load6432(src, nextS)
+			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.table[nextHashS] = entry
+			eLong := &e.bTable[nextHashL]
+			eLong.Cur, eLong.Prev = entry, eLong.Cur
+
+			nextHashS = hash4x64(next, tableBits)
+			nextHashL = hash7(next, tableBits)
+
+			t = lCandidate.Cur.offset - e.cur
+			if s-t < maxMatchOffset {
+				if uint32(cv) == lCandidate.Cur.val {
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
+
+					t2 := lCandidate.Prev.offset - e.cur
+					if s-t2 < maxMatchOffset && uint32(cv) == lCandidate.Prev.val {
+						l = e.matchlen(s+4, t+4, src) + 4
+						ml1 := e.matchlen(s+4, t2+4, src) + 4
+						if ml1 > l {
+							t = t2
+							l = ml1
+							break
+						}
+					}
+					break
+				}
+				t = lCandidate.Prev.offset - e.cur
+				if s-t < maxMatchOffset && uint32(cv) == lCandidate.Prev.val {
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
+					break
+				}
+			}
+
+			t = sCandidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == sCandidate.val {
+				// Found a 4 match...
+				l = e.matchlen(s+4, t+4, src) + 4
+				lCandidate = e.bTable[nextHashL]
+				// Store the next match
+
+				e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
+				eLong := &e.bTable[nextHashL]
+				eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
+
+				// If the next long is a candidate, use that...
+				t2 := lCandidate.Cur.offset - e.cur
+				if nextS-t2 < maxMatchOffset {
+					if lCandidate.Cur.val == uint32(next) {
+						ml := e.matchlen(nextS+4, t2+4, src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							break
+						}
+					}
+					// If the previous long is a candidate, use that...
+					t2 = lCandidate.Prev.offset - e.cur
+					if nextS-t2 < maxMatchOffset && lCandidate.Prev.val == uint32(next) {
+						ml := e.matchlen(nextS+4, t2+4, src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							break
+						}
+					}
+				}
+				break
+			}
+			cv = next
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+
+		// Extend the 4-byte match as long as possible.
+		if l == 0 {
+			l = e.matchlenLong(s+4, t+4, src) + 4
+		} else if l == maxMatchLength {
+			l += e.matchlenLong(s+l, t+l, src)
+		}
+		// Extend backwards
+		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+			s--
+			t--
+			l++
+		}
+		if nextEmit < s {
+			emitLiteral(dst, src[nextEmit:s])
+		}
+		if false {
+			if t >= s {
+				panic(fmt.Sprintln("s-t", s, t))
+			}
+			if (s - t) > maxMatchOffset {
+				panic(fmt.Sprintln("mmo", s-t))
+			}
+			if l < baseMatchLength {
+				panic("bml")
+			}
+		}
+
+		dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+		s += l
+		nextEmit = s
+		if nextS >= s {
+			s = nextS + 1
+		}
+
+		if s >= sLimit {
+			goto emitRemainder
+		}
+
+		// Store every 3rd hash in-between.
+		if true {
+			const hashEvery = 3
+			i := s - l + 1
+			if i < s-1 {
+				cv := load6432(src, i)
+				t := tableEntry{offset: i + e.cur, val: uint32(cv)}
+				e.table[hash4x64(cv, tableBits)] = t
+				eLong := &e.bTable[hash7(cv, tableBits)]
+				eLong.Cur, eLong.Prev = t, eLong.Cur
+
+				// Do an long at i+1
+				cv >>= 8
+				t = tableEntry{offset: t.offset + 1, val: uint32(cv)}
+				eLong = &e.bTable[hash7(cv, tableBits)]
+				eLong.Cur, eLong.Prev = t, eLong.Cur
+
+				// We only have enough bits for a short entry at i+2
+				cv >>= 8
+				t = tableEntry{offset: t.offset + 1, val: uint32(cv)}
+				e.table[hash4x64(cv, tableBits)] = t
+
+				// Skip one - otherwise we risk hitting 's'
+				i += 4
+				for ; i < s-1; i += hashEvery {
+					cv := load6432(src, i)
+					t := tableEntry{offset: i + e.cur, val: uint32(cv)}
+					t2 := tableEntry{offset: t.offset + 1, val: uint32(cv >> 8)}
+					eLong := &e.bTable[hash7(cv, tableBits)]
+					eLong.Cur, eLong.Prev = t, eLong.Cur
+					e.table[hash4u(t2.val, tableBits)] = t2
+				}
+			}
+		}
+
+		// We could immediately start working at s now, but to improve
+		// compression we first update the hash table at s-1 and at s.
+		x := load6432(src, s-1)
+		o := e.cur + s - 1
+		prevHashS := hash4x64(x, tableBits)
+		prevHashL := hash7(x, tableBits)
+		e.table[prevHashS] = tableEntry{offset: o, val: uint32(x)}
+		eLong := &e.bTable[prevHashL]
+		eLong.Cur, eLong.Prev = tableEntry{offset: o, val: uint32(x)}, eLong.Cur
+		cv = x >> 8
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
--- a/vendor/github.com/klauspost/compress/flate/level6.go
+++ b/vendor/github.com/klauspost/compress/flate/level6.go
@ -0,0 +1,279 @@
+package flate
+
+import "fmt"
+
+type fastEncL6 struct {
+	fastGen
+	table  [tableSize]tableEntry
+	bTable [tableSize]tableEntryPrev
+}
+
+func (e *fastEncL6) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.bTable[:] {
+				e.bTable[i] = tableEntryPrev{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.bTable[:] {
+			v := e.bTable[i]
+			if v.Cur.offset <= minOff {
+				v.Cur.offset = 0
+				v.Prev.offset = 0
+			} else {
+				v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset
+				if v.Prev.offset <= minOff {
+					v.Prev.offset = 0
+				} else {
+					v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset
+				}
+			}
+			e.bTable[i] = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	cv := load6432(src, s)
+	// Repeat MUST be > 1 and within range
+	repeat := int32(1)
+	for {
+		const skipLog = 7
+		const doEvery = 1
+
+		nextS := s
+		var l int32
+		var t int32
+		for {
+			nextHashS := hash4x64(cv, tableBits)
+			nextHashL := hash7(cv, tableBits)
+			s = nextS
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			// Fetch a short+long candidate
+			sCandidate := e.table[nextHashS]
+			lCandidate := e.bTable[nextHashL]
+			next := load6432(src, nextS)
+			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.table[nextHashS] = entry
+			eLong := &e.bTable[nextHashL]
+			eLong.Cur, eLong.Prev = entry, eLong.Cur
+
+			// Calculate hashes of 'next'
+			nextHashS = hash4x64(next, tableBits)
+			nextHashL = hash7(next, tableBits)
+
+			t = lCandidate.Cur.offset - e.cur
+			if s-t < maxMatchOffset {
+				if uint32(cv) == lCandidate.Cur.val {
+					// Long candidate matches at least 4 bytes.
+
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
+
+					// Check the previous long candidate as well.
+					t2 := lCandidate.Prev.offset - e.cur
+					if s-t2 < maxMatchOffset && uint32(cv) == lCandidate.Prev.val {
+						l = e.matchlen(s+4, t+4, src) + 4
+						ml1 := e.matchlen(s+4, t2+4, src) + 4
+						if ml1 > l {
+							t = t2
+							l = ml1
+							break
+						}
+					}
+					break
+				}
+				// Current value did not match, but check if previous long value does.
+				t = lCandidate.Prev.offset - e.cur
+				if s-t < maxMatchOffset && uint32(cv) == lCandidate.Prev.val {
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
+					break
+				}
+			}
+
+			t = sCandidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == sCandidate.val {
+				// Found a 4 match...
+				l = e.matchlen(s+4, t+4, src) + 4
+
+				// Look up next long candidate (at nextS)
+				lCandidate = e.bTable[nextHashL]
+
+				// Store the next match
+				e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
+				eLong := &e.bTable[nextHashL]
+				eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
+
+				// Check repeat at s + repOff
+				const repOff = 1
+				t2 := s - repeat + repOff
+				if load3232(src, t2) == uint32(cv>>(8*repOff)) {
+					ml := e.matchlen(s+4+repOff, t2+4, src) + 4
+					if ml > l {
+						t = t2
+						l = ml
+						s += repOff
+						// Not worth checking more.
+						break
+					}
+				}
+
+				// If the next long is a candidate, use that...
+				t2 = lCandidate.Cur.offset - e.cur
+				if nextS-t2 < maxMatchOffset {
+					if lCandidate.Cur.val == uint32(next) {
+						ml := e.matchlen(nextS+4, t2+4, src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							// This is ok, but check previous as well.
+						}
+					}
+					// If the previous long is a candidate, use that...
+					t2 = lCandidate.Prev.offset - e.cur
+					if nextS-t2 < maxMatchOffset && lCandidate.Prev.val == uint32(next) {
+						ml := e.matchlen(nextS+4, t2+4, src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							break
+						}
+					}
+				}
+				break
+			}
+			cv = next
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+
+		// Extend the 4-byte match as long as possible.
+		if l == 0 {
+			l = e.matchlenLong(s+4, t+4, src) + 4
+		} else if l == maxMatchLength {
+			l += e.matchlenLong(s+l, t+l, src)
+		}
+
+		// Extend backwards
+		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+			s--
+			t--
+			l++
+		}
+		if nextEmit < s {
+			emitLiteral(dst, src[nextEmit:s])
+		}
+		if false {
+			if t >= s {
+				panic(fmt.Sprintln("s-t", s, t))
+			}
+			if (s - t) > maxMatchOffset {
+				panic(fmt.Sprintln("mmo", s-t))
+			}
+			if l < baseMatchLength {
+				panic("bml")
+			}
+		}
+
+		dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+		repeat = s - t
+		s += l
+		nextEmit = s
+		if nextS >= s {
+			s = nextS + 1
+		}
+
+		if s >= sLimit {
+			// Index after match end.
+			for i := nextS + 1; i < int32(len(src))-8; i += 2 {
+				cv := load6432(src, i)
+				e.table[hash4x64(cv, tableBits)] = tableEntry{offset: i + e.cur, val: uint32(cv)}
+				eLong := &e.bTable[hash7(cv, tableBits)]
+				eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur, val: uint32(cv)}, eLong.Cur
+			}
+			goto emitRemainder
+		}
+
+		// Store every long hash in-between and every second short.
+		if true {
+			for i := nextS + 1; i < s-1; i += 2 {
+				cv := load6432(src, i)
+				t := tableEntry{offset: i + e.cur, val: uint32(cv)}
+				t2 := tableEntry{offset: t.offset + 1, val: uint32(cv >> 8)}
+				eLong := &e.bTable[hash7(cv, tableBits)]
+				eLong2 := &e.bTable[hash7(cv>>8, tableBits)]
+				e.table[hash4x64(cv, tableBits)] = t
+				eLong.Cur, eLong.Prev = t, eLong.Cur
+				eLong2.Cur, eLong2.Prev = t2, eLong2.Cur
+			}
+		}
+
+		// We could immediately start working at s now, but to improve
+		// compression we first update the hash table at s-1 and at s.
+		cv = load6432(src, s)
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
--- a/vendor/github.com/klauspost/compress/flate/reverse_bits.go
+++ b/vendor/github.com/klauspost/compress/flate/reverse_bits.go
@ -1,48 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package flate
-
-var reverseByte = [256]byte{
-	0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
-	0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
-	0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
-	0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
-	0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
-	0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
-	0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
-	0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
-	0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
-	0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
-	0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
-	0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
-	0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
-	0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
-	0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
-	0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
-	0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1,
-	0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
-	0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9,
-	0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
-	0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5,
-	0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
-	0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed,
-	0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
-	0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3,
-	0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
-	0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb,
-	0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
-	0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7,
-	0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
-	0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef,
-	0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff,
-}
-
-func reverseUint16(v uint16) uint16 {
-	return uint16(reverseByte[v>>8]) | uint16(reverseByte[v&0xFF])<<8
-}
-
-func reverseBits(number uint16, bitLength byte) uint16 {
-	return reverseUint16(number << uint8(16-bitLength))
-}
--- a/vendor/github.com/klauspost/compress/flate/snappy.go
+++ b/vendor/github.com/klauspost/compress/flate/snappy.go
@ -1,856 +0,0 @@
-// Copyright 2011 The Snappy-Go Authors. All rights reserved.
-// Modified for deflate by Klaus Post (c) 2015.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package flate
-
-// emitLiteral writes a literal chunk and returns the number of bytes written.
-func emitLiteral(dst *tokens, lit []byte) {
-	ol := int(dst.n)
-	for i, v := range lit {
-		dst.tokens[(i+ol)&maxStoreBlockSize] = token(v)
-	}
-	dst.n += uint16(len(lit))
-}
-
-// emitCopy writes a copy chunk and returns the number of bytes written.
-func emitCopy(dst *tokens, offset, length int) {
-	dst.tokens[dst.n] = matchToken(uint32(length-3), uint32(offset-minOffsetSize))
-	dst.n++
-}
-
-type snappyEnc interface {
-	Encode(dst *tokens, src []byte)
-	Reset()
-}
-
-func newSnappy(level int) snappyEnc {
-	switch level {
-	case 1:
-		return &snappyL1{}
-	case 2:
-		return &snappyL2{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}}
-	case 3:
-		return &snappyL3{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}}
-	case 4:
-		return &snappyL4{snappyL3{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}}}
-	default:
-		panic("invalid level specified")
-	}
-}
-
-const (
-	tableBits       = 14             // Bits used in the table
-	tableSize       = 1 << tableBits // Size of the table
-	tableMask       = tableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
-	tableShift      = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32.
-	baseMatchOffset = 1              // The smallest match offset
-	baseMatchLength = 3              // The smallest match length per the RFC section 3.2.5
-	maxMatchOffset  = 1 << 15        // The largest match offset
-)
-
-func load32(b []byte, i int) uint32 {
-	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
-	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
-}
-
-func load64(b []byte, i int) uint64 {
-	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
-}
-
-func hash(u uint32) uint32 {
-	return (u * 0x1e35a7bd) >> tableShift
-}
-
-// snappyL1 encapsulates level 1 compression
-type snappyL1 struct{}
-
-func (e *snappyL1) Reset() {}
-
-func (e *snappyL1) Encode(dst *tokens, src []byte) {
-	const (
-		inputMargin            = 16 - 1
-		minNonLiteralBlockSize = 1 + 1 + inputMargin
-	)
-
-	// This check isn't in the Snappy implementation, but there, the caller
-	// instead of the callee handles this case.
-	if len(src) < minNonLiteralBlockSize {
-		// We do not fill the token table.
-		// This will be picked up by caller.
-		dst.n = uint16(len(src))
-		return
-	}
-
-	// Initialize the hash table.
-	//
-	// The table element type is uint16, as s < sLimit and sLimit < len(src)
-	// and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535.
-	var table [tableSize]uint16
-
-	// sLimit is when to stop looking for offset/length copies. The inputMargin
-	// lets us use a fast path for emitLiteral in the main loop, while we are
-	// looking for copies.
-	sLimit := len(src) - inputMargin
-
-	// nextEmit is where in src the next emitLiteral should start from.
-	nextEmit := 0
-
-	// The encoded form must start with a literal, as there are no previous
-	// bytes to copy, so we start looking for hash matches at s == 1.
-	s := 1
-	nextHash := hash(load32(src, s))
-
-	for {
-		// Copied from the C++ snappy implementation:
-		//
-		// Heuristic match skipping: If 32 bytes are scanned with no matches
-		// found, start looking only at every other byte. If 32 more bytes are
-		// scanned (or skipped), look at every third byte, etc.. When a match
-		// is found, immediately go back to looking at every byte. This is a
-		// small loss (~5% performance, ~0.1% density) for compressible data
-		// due to more bookkeeping, but for non-compressible data (such as
-		// JPEG) it's a huge win since the compressor quickly "realizes" the
-		// data is incompressible and doesn't bother looking for matches
-		// everywhere.
-		//
-		// The "skip" variable keeps track of how many bytes there are since
-		// the last match; dividing it by 32 (ie. right-shifting by five) gives
-		// the number of bytes to move ahead for each iteration.
-		skip := 32
-
-		nextS := s
-		candidate := 0
-		for {
-			s = nextS
-			bytesBetweenHashLookups := skip >> 5
-			nextS = s + bytesBetweenHashLookups
-			skip += bytesBetweenHashLookups
-			if nextS > sLimit {
-				goto emitRemainder
-			}
-			candidate = int(table[nextHash&tableMask])
-			table[nextHash&tableMask] = uint16(s)
-			nextHash = hash(load32(src, nextS))
-			// TODO: < should be <=, and add a test for that.
-			if s-candidate < maxMatchOffset && load32(src, s) == load32(src, candidate) {
-				break
-			}
-		}
-
-		// A 4-byte match has been found. We'll later see if more than 4 bytes
-		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
-		// them as literal bytes.
-		emitLiteral(dst, src[nextEmit:s])
-
-		// Call emitCopy, and then see if another emitCopy could be our next
-		// move. Repeat until we find no match for the input immediately after
-		// what was consumed by the last emitCopy call.
-		//
-		// If we exit this loop normally then we need to call emitLiteral next,
-		// though we don't yet know how big the literal will be. We handle that
-		// by proceeding to the next iteration of the main loop. We also can
-		// exit this loop via goto if we get close to exhausting the input.
-		for {
-			// Invariant: we have a 4-byte match at s, and no need to emit any
-			// literal bytes prior to s.
-			base := s
-
-			// Extend the 4-byte match as long as possible.
-			//
-			// This is an inlined version of Snappy's:
-			//	s = extendMatch(src, candidate+4, s+4)
-			s += 4
-			s1 := base + maxMatchLength
-			if s1 > len(src) {
-				s1 = len(src)
-			}
-			a := src[s:s1]
-			b := src[candidate+4:]
-			b = b[:len(a)]
-			l := len(a)
-			for i := range a {
-				if a[i] != b[i] {
-					l = i
-					break
-				}
-			}
-			s += l
-
-			// matchToken is flate's equivalent of Snappy's emitCopy.
-			dst.tokens[dst.n] = matchToken(uint32(s-base-baseMatchLength), uint32(base-candidate-baseMatchOffset))
-			dst.n++
-			nextEmit = s
-			if s >= sLimit {
-				goto emitRemainder
-			}
-
-			// We could immediately start working at s now, but to improve
-			// compression we first update the hash table at s-1 and at s. If
-			// another emitCopy is not our next move, also calculate nextHash
-			// at s+1. At least on GOARCH=amd64, these three hash calculations
-			// are faster as one load64 call (with some shifts) instead of
-			// three load32 calls.
-			x := load64(src, s-1)
-			prevHash := hash(uint32(x >> 0))
-			table[prevHash&tableMask] = uint16(s - 1)
-			currHash := hash(uint32(x >> 8))
-			candidate = int(table[currHash&tableMask])
-			table[currHash&tableMask] = uint16(s)
-			// TODO: >= should be >, and add a test for that.
-			if s-candidate >= maxMatchOffset || uint32(x>>8) != load32(src, candidate) {
-				nextHash = hash(uint32(x >> 16))
-				s++
-				break
-			}
-		}
-	}
-
-emitRemainder:
-	if nextEmit < len(src) {
-		emitLiteral(dst, src[nextEmit:])
-	}
-}
-
-type tableEntry struct {
-	val    uint32
-	offset int32
-}
-
-func load3232(b []byte, i int32) uint32 {
-	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
-	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
-}
-
-func load6432(b []byte, i int32) uint64 {
-	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
-}
-
-// snappyGen maintains the table for matches,
-// and the previous byte block for level 2.
-// This is the generic implementation.
-type snappyGen struct {
-	prev []byte
-	cur  int32
-}
-
-// snappyGen maintains the table for matches,
-// and the previous byte block for level 2.
-// This is the generic implementation.
-type snappyL2 struct {
-	snappyGen
-	table [tableSize]tableEntry
-}
-
-// EncodeL2 uses a similar algorithm to level 1, but is capable
-// of matching across blocks giving better compression at a small slowdown.
-func (e *snappyL2) Encode(dst *tokens, src []byte) {
-	const (
-		inputMargin            = 16 - 1
-		minNonLiteralBlockSize = 1 + 1 + inputMargin
-	)
-
-	// Ensure that e.cur doesn't wrap, mainly an issue on 32 bits.
-	if e.cur > 1<<30 {
-		for i := range e.table {
-			e.table[i] = tableEntry{}
-		}
-		e.cur = maxStoreBlockSize
-	}
-
-	// This check isn't in the Snappy implementation, but there, the caller
-	// instead of the callee handles this case.
-	if len(src) < minNonLiteralBlockSize {
-		// We do not fill the token table.
-		// This will be picked up by caller.
-		dst.n = uint16(len(src))
-		e.cur += maxStoreBlockSize
-		e.prev = e.prev[:0]
-		return
-	}
-
-	// sLimit is when to stop looking for offset/length copies. The inputMargin
-	// lets us use a fast path for emitLiteral in the main loop, while we are
-	// looking for copies.
-	sLimit := int32(len(src) - inputMargin)
-
-	// nextEmit is where in src the next emitLiteral should start from.
-	nextEmit := int32(0)
-	s := int32(0)
-	cv := load3232(src, s)
-	nextHash := hash(cv)
-
-	for {
-		// Copied from the C++ snappy implementation:
-		//
-		// Heuristic match skipping: If 32 bytes are scanned with no matches
-		// found, start looking only at every other byte. If 32 more bytes are
-		// scanned (or skipped), look at every third byte, etc.. When a match
-		// is found, immediately go back to looking at every byte. This is a
-		// small loss (~5% performance, ~0.1% density) for compressible data
-		// due to more bookkeeping, but for non-compressible data (such as
-		// JPEG) it's a huge win since the compressor quickly "realizes" the
-		// data is incompressible and doesn't bother looking for matches
-		// everywhere.
-		//
-		// The "skip" variable keeps track of how many bytes there are since
-		// the last match; dividing it by 32 (ie. right-shifting by five) gives
-		// the number of bytes to move ahead for each iteration.
-		skip := int32(32)
-
-		nextS := s
-		var candidate tableEntry
-		for {
-			s = nextS
-			bytesBetweenHashLookups := skip >> 5
-			nextS = s + bytesBetweenHashLookups
-			skip += bytesBetweenHashLookups
-			if nextS > sLimit {
-				goto emitRemainder
-			}
-			candidate = e.table[nextHash&tableMask]
-			now := load3232(src, nextS)
-			e.table[nextHash&tableMask] = tableEntry{offset: s + e.cur, val: cv}
-			nextHash = hash(now)
-
-			offset := s - (candidate.offset - e.cur)
-			if offset >= maxMatchOffset || cv != candidate.val {
-				// Out of range or not matched.
-				cv = now
-				continue
-			}
-			break
-		}
-
-		// A 4-byte match has been found. We'll later see if more than 4 bytes
-		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
-		// them as literal bytes.
-		emitLiteral(dst, src[nextEmit:s])
-
-		// Call emitCopy, and then see if another emitCopy could be our next
-		// move. Repeat until we find no match for the input immediately after
-		// what was consumed by the last emitCopy call.
-		//
-		// If we exit this loop normally then we need to call emitLiteral next,
-		// though we don't yet know how big the literal will be. We handle that
-		// by proceeding to the next iteration of the main loop. We also can
-		// exit this loop via goto if we get close to exhausting the input.
-		for {
-			// Invariant: we have a 4-byte match at s, and no need to emit any
-			// literal bytes prior to s.
-
-			// Extend the 4-byte match as long as possible.
-			//
-			s += 4
-			t := candidate.offset - e.cur + 4
-			l := e.matchlen(s, t, src)
-
-			// matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
-			dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset))
-			dst.n++
-			s += l
-			nextEmit = s
-			if s >= sLimit {
-				goto emitRemainder
-			}
-
-			// We could immediately start working at s now, but to improve
-			// compression we first update the hash table at s-1 and at s. If
-			// another emitCopy is not our next move, also calculate nextHash
-			// at s+1. At least on GOARCH=amd64, these three hash calculations
-			// are faster as one load64 call (with some shifts) instead of
-			// three load32 calls.
-			x := load6432(src, s-1)
-			prevHash := hash(uint32(x))
-			e.table[prevHash&tableMask] = tableEntry{offset: e.cur + s - 1, val: uint32(x)}
-			x >>= 8
-			currHash := hash(uint32(x))
-			candidate = e.table[currHash&tableMask]
-			e.table[currHash&tableMask] = tableEntry{offset: e.cur + s, val: uint32(x)}
-
-			offset := s - (candidate.offset - e.cur)
-			if offset >= maxMatchOffset || uint32(x) != candidate.val {
-				cv = uint32(x >> 8)
-				nextHash = hash(cv)
-				s++
-				break
-			}
-		}
-	}
-
-emitRemainder:
-	if int(nextEmit) < len(src) {
-		emitLiteral(dst, src[nextEmit:])
-	}
-	e.cur += int32(len(src))
-	e.prev = e.prev[:len(src)]
-	copy(e.prev, src)
-}
-
-type tableEntryPrev struct {
-	Cur  tableEntry
-	Prev tableEntry
-}
-
-// snappyL3
-type snappyL3 struct {
-	snappyGen
-	table [tableSize]tableEntryPrev
-}
-
-// Encode uses a similar algorithm to level 2, will check up to two candidates.
-func (e *snappyL3) Encode(dst *tokens, src []byte) {
-	const (
-		inputMargin            = 16 - 1
-		minNonLiteralBlockSize = 1 + 1 + inputMargin
-	)
-
-	// Ensure that e.cur doesn't wrap, mainly an issue on 32 bits.
-	if e.cur > 1<<30 {
-		for i := range e.table {
-			e.table[i] = tableEntryPrev{}
-		}
-		e.cur = maxStoreBlockSize
-	}
-
-	// This check isn't in the Snappy implementation, but there, the caller
-	// instead of the callee handles this case.
-	if len(src) < minNonLiteralBlockSize {
-		// We do not fill the token table.
-		// This will be picked up by caller.
-		dst.n = uint16(len(src))
-		e.cur += maxStoreBlockSize
-		e.prev = e.prev[:0]
-		return
-	}
-
-	// sLimit is when to stop looking for offset/length copies. The inputMargin
-	// lets us use a fast path for emitLiteral in the main loop, while we are
-	// looking for copies.
-	sLimit := int32(len(src) - inputMargin)
-
-	// nextEmit is where in src the next emitLiteral should start from.
-	nextEmit := int32(0)
-	s := int32(0)
-	cv := load3232(src, s)
-	nextHash := hash(cv)
-
-	for {
-		// Copied from the C++ snappy implementation:
-		//
-		// Heuristic match skipping: If 32 bytes are scanned with no matches
-		// found, start looking only at every other byte. If 32 more bytes are
-		// scanned (or skipped), look at every third byte, etc.. When a match
-		// is found, immediately go back to looking at every byte. This is a
-		// small loss (~5% performance, ~0.1% density) for compressible data
-		// due to more bookkeeping, but for non-compressible data (such as
-		// JPEG) it's a huge win since the compressor quickly "realizes" the
-		// data is incompressible and doesn't bother looking for matches
-		// everywhere.
-		//
-		// The "skip" variable keeps track of how many bytes there are since
-		// the last match; dividing it by 32 (ie. right-shifting by five) gives
-		// the number of bytes to move ahead for each iteration.
-		skip := int32(32)
-
-		nextS := s
-		var candidate tableEntry
-		for {
-			s = nextS
-			bytesBetweenHashLookups := skip >> 5
-			nextS = s + bytesBetweenHashLookups
-			skip += bytesBetweenHashLookups
-			if nextS > sLimit {
-				goto emitRemainder
-			}
-			candidates := e.table[nextHash&tableMask]
-			now := load3232(src, nextS)
-			e.table[nextHash&tableMask] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}}
-			nextHash = hash(now)
-
-			// Check both candidates
-			candidate = candidates.Cur
-			if cv == candidate.val {
-				offset := s - (candidate.offset - e.cur)
-				if offset < maxMatchOffset {
-					break
-				}
-			} else {
-				// We only check if value mismatches.
-				// Offset will always be invalid in other cases.
-				candidate = candidates.Prev
-				if cv == candidate.val {
-					offset := s - (candidate.offset - e.cur)
-					if offset < maxMatchOffset {
-						break
-					}
-				}
-			}
-			cv = now
-		}
-
-		// A 4-byte match has been found. We'll later see if more than 4 bytes
-		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
-		// them as literal bytes.
-		emitLiteral(dst, src[nextEmit:s])
-
-		// Call emitCopy, and then see if another emitCopy could be our next
-		// move. Repeat until we find no match for the input immediately after
-		// what was consumed by the last emitCopy call.
-		//
-		// If we exit this loop normally then we need to call emitLiteral next,
-		// though we don't yet know how big the literal will be. We handle that
-		// by proceeding to the next iteration of the main loop. We also can
-		// exit this loop via goto if we get close to exhausting the input.
-		for {
-			// Invariant: we have a 4-byte match at s, and no need to emit any
-			// literal bytes prior to s.
-
-			// Extend the 4-byte match as long as possible.
-			//
-			s += 4
-			t := candidate.offset - e.cur + 4
-			l := e.matchlen(s, t, src)
-
-			// matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
-			dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset))
-			dst.n++
-			s += l
-			nextEmit = s
-			if s >= sLimit {
-				goto emitRemainder
-			}
-
-			// We could immediately start working at s now, but to improve
-			// compression we first update the hash table at s-2, s-1 and at s. If
-			// another emitCopy is not our next move, also calculate nextHash
-			// at s+1. At least on GOARCH=amd64, these three hash calculations
-			// are faster as one load64 call (with some shifts) instead of
-			// three load32 calls.
-			x := load6432(src, s-2)
-			prevHash := hash(uint32(x))
-
-			e.table[prevHash&tableMask] = tableEntryPrev{
-				Prev: e.table[prevHash&tableMask].Cur,
-				Cur:  tableEntry{offset: e.cur + s - 2, val: uint32(x)},
-			}
-			x >>= 8
-			prevHash = hash(uint32(x))
-
-			e.table[prevHash&tableMask] = tableEntryPrev{
-				Prev: e.table[prevHash&tableMask].Cur,
-				Cur:  tableEntry{offset: e.cur + s - 1, val: uint32(x)},
-			}
-			x >>= 8
-			currHash := hash(uint32(x))
-			candidates := e.table[currHash&tableMask]
-			cv = uint32(x)
-			e.table[currHash&tableMask] = tableEntryPrev{
-				Prev: candidates.Cur,
-				Cur:  tableEntry{offset: s + e.cur, val: cv},
-			}
-
-			// Check both candidates
-			candidate = candidates.Cur
-			if cv == candidate.val {
-				offset := s - (candidate.offset - e.cur)
-				if offset < maxMatchOffset {
-					continue
-				}
-			} else {
-				// We only check if value mismatches.
-				// Offset will always be invalid in other cases.
-				candidate = candidates.Prev
-				if cv == candidate.val {
-					offset := s - (candidate.offset - e.cur)
-					if offset < maxMatchOffset {
-						continue
-					}
-				}
-			}
-			cv = uint32(x >> 8)
-			nextHash = hash(cv)
-			s++
-			break
-		}
-	}
-
-emitRemainder:
-	if int(nextEmit) < len(src) {
-		emitLiteral(dst, src[nextEmit:])
-	}
-	e.cur += int32(len(src))
-	e.prev = e.prev[:len(src)]
-	copy(e.prev, src)
-}
-
-// snappyL4
-type snappyL4 struct {
-	snappyL3
-}
-
-// Encode uses a similar algorithm to level 3,
-// but will check up to two candidates if first isn't long enough.
-func (e *snappyL4) Encode(dst *tokens, src []byte) {
-	const (
-		inputMargin            = 16 - 1
-		minNonLiteralBlockSize = 1 + 1 + inputMargin
-		matchLenGood           = 12
-	)
-
-	// Ensure that e.cur doesn't wrap, mainly an issue on 32 bits.
-	if e.cur > 1<<30 {
-		for i := range e.table {
-			e.table[i] = tableEntryPrev{}
-		}
-		e.cur = maxStoreBlockSize
-	}
-
-	// This check isn't in the Snappy implementation, but there, the caller
-	// instead of the callee handles this case.
-	if len(src) < minNonLiteralBlockSize {
-		// We do not fill the token table.
-		// This will be picked up by caller.
-		dst.n = uint16(len(src))
-		e.cur += maxStoreBlockSize
-		e.prev = e.prev[:0]
-		return
-	}
-
-	// sLimit is when to stop looking for offset/length copies. The inputMargin
-	// lets us use a fast path for emitLiteral in the main loop, while we are
-	// looking for copies.
-	sLimit := int32(len(src) - inputMargin)
-
-	// nextEmit is where in src the next emitLiteral should start from.
-	nextEmit := int32(0)
-	s := int32(0)
-	cv := load3232(src, s)
-	nextHash := hash(cv)
-
-	for {
-		// Copied from the C++ snappy implementation:
-		//
-		// Heuristic match skipping: If 32 bytes are scanned with no matches
-		// found, start looking only at every other byte. If 32 more bytes are
-		// scanned (or skipped), look at every third byte, etc.. When a match
-		// is found, immediately go back to looking at every byte. This is a
-		// small loss (~5% performance, ~0.1% density) for compressible data
-		// due to more bookkeeping, but for non-compressible data (such as
-		// JPEG) it's a huge win since the compressor quickly "realizes" the
-		// data is incompressible and doesn't bother looking for matches
-		// everywhere.
-		//
-		// The "skip" variable keeps track of how many bytes there are since
-		// the last match; dividing it by 32 (ie. right-shifting by five) gives
-		// the number of bytes to move ahead for each iteration.
-		skip := int32(32)
-
-		nextS := s
-		var candidate tableEntry
-		var candidateAlt tableEntry
-		for {
-			s = nextS
-			bytesBetweenHashLookups := skip >> 5
-			nextS = s + bytesBetweenHashLookups
-			skip += bytesBetweenHashLookups
-			if nextS > sLimit {
-				goto emitRemainder
-			}
-			candidates := e.table[nextHash&tableMask]
-			now := load3232(src, nextS)
-			e.table[nextHash&tableMask] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}}
-			nextHash = hash(now)
-
-			// Check both candidates
-			candidate = candidates.Cur
-			if cv == candidate.val {
-				offset := s - (candidate.offset - e.cur)
-				if offset < maxMatchOffset {
-					offset = s - (candidates.Prev.offset - e.cur)
-					if cv == candidates.Prev.val && offset < maxMatchOffset {
-						candidateAlt = candidates.Prev
-					}
-					break
-				}
-			} else {
-				// We only check if value mismatches.
-				// Offset will always be invalid in other cases.
-				candidate = candidates.Prev
-				if cv == candidate.val {
-					offset := s - (candidate.offset - e.cur)
-					if offset < maxMatchOffset {
-						break
-					}
-				}
-			}
-			cv = now
-		}
-
-		// A 4-byte match has been found. We'll later see if more than 4 bytes
-		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
-		// them as literal bytes.
-		emitLiteral(dst, src[nextEmit:s])
-
-		// Call emitCopy, and then see if another emitCopy could be our next
-		// move. Repeat until we find no match for the input immediately after
-		// what was consumed by the last emitCopy call.
-		//
-		// If we exit this loop normally then we need to call emitLiteral next,
-		// though we don't yet know how big the literal will be. We handle that
-		// by proceeding to the next iteration of the main loop. We also can
-		// exit this loop via goto if we get close to exhausting the input.
-		for {
-			// Invariant: we have a 4-byte match at s, and no need to emit any
-			// literal bytes prior to s.
-
-			// Extend the 4-byte match as long as possible.
-			//
-			s += 4
-			t := candidate.offset - e.cur + 4
-			l := e.matchlen(s, t, src)
-			// Try alternative candidate if match length < matchLenGood.
-			if l < matchLenGood-4 && candidateAlt.offset != 0 {
-				t2 := candidateAlt.offset - e.cur + 4
-				l2 := e.matchlen(s, t2, src)
-				if l2 > l {
-					l = l2
-					t = t2
-				}
-			}
-			// matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
-			dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset))
-			dst.n++
-			s += l
-			nextEmit = s
-			if s >= sLimit {
-				goto emitRemainder
-			}
-
-			// We could immediately start working at s now, but to improve
-			// compression we first update the hash table at s-2, s-1 and at s. If
-			// another emitCopy is not our next move, also calculate nextHash
-			// at s+1. At least on GOARCH=amd64, these three hash calculations
-			// are faster as one load64 call (with some shifts) instead of
-			// three load32 calls.
-			x := load6432(src, s-2)
-			prevHash := hash(uint32(x))
-
-			e.table[prevHash&tableMask] = tableEntryPrev{
-				Prev: e.table[prevHash&tableMask].Cur,
-				Cur:  tableEntry{offset: e.cur + s - 2, val: uint32(x)},
-			}
-			x >>= 8
-			prevHash = hash(uint32(x))
-
-			e.table[prevHash&tableMask] = tableEntryPrev{
-				Prev: e.table[prevHash&tableMask].Cur,
-				Cur:  tableEntry{offset: e.cur + s - 1, val: uint32(x)},
-			}
-			x >>= 8
-			currHash := hash(uint32(x))
-			candidates := e.table[currHash&tableMask]
-			cv = uint32(x)
-			e.table[currHash&tableMask] = tableEntryPrev{
-				Prev: candidates.Cur,
-				Cur:  tableEntry{offset: s + e.cur, val: cv},
-			}
-
-			// Check both candidates
-			candidate = candidates.Cur
-			candidateAlt = tableEntry{}
-			if cv == candidate.val {
-				offset := s - (candidate.offset - e.cur)
-				if offset < maxMatchOffset {
-					offset = s - (candidates.Prev.offset - e.cur)
-					if cv == candidates.Prev.val && offset < maxMatchOffset {
-						candidateAlt = candidates.Prev
-					}
-					continue
-				}
-			} else {
-				// We only check if value mismatches.
-				// Offset will always be invalid in other cases.
-				candidate = candidates.Prev
-				if cv == candidate.val {
-					offset := s - (candidate.offset - e.cur)
-					if offset < maxMatchOffset {
-						continue
-					}
-				}
-			}
-			cv = uint32(x >> 8)
-			nextHash = hash(cv)
-			s++
-			break
-		}
-	}
-
-emitRemainder:
-	if int(nextEmit) < len(src) {
-		emitLiteral(dst, src[nextEmit:])
-	}
-	e.cur += int32(len(src))
-	e.prev = e.prev[:len(src)]
-	copy(e.prev, src)
-}
-
-func (e *snappyGen) matchlen(s, t int32, src []byte) int32 {
-	s1 := int(s) + maxMatchLength - 4
-	if s1 > len(src) {
-		s1 = len(src)
-	}
-
-	// If we are inside the current block
-	if t >= 0 {
-		b := src[t:]
-		a := src[s:s1]
-		b = b[:len(a)]
-		// Extend the match to be as long as possible.
-		for i := range a {
-			if a[i] != b[i] {
-				return int32(i)
-			}
-		}
-		return int32(len(a))
-	}
-
-	// We found a match in the previous block.
-	tp := int32(len(e.prev)) + t
-	if tp < 0 {
-		return 0
-	}
-
-	// Extend the match to be as long as possible.
-	a := src[s:s1]
-	b := e.prev[tp:]
-	if len(b) > len(a) {
-		b = b[:len(a)]
-	}
-	a = a[:len(b)]
-	for i := range b {
-		if a[i] != b[i] {
-			return int32(i)
-		}
-	}
-	n := int32(len(b))
-	a = src[s+n : s1]
-	b = src[:len(a)]
-	for i := range a {
-		if a[i] != b[i] {
-			return int32(i) + n
-		}
-	}
-	return int32(len(a)) + n
-}
-
-// Reset the encoding table.
-func (e *snappyGen) Reset() {
-	e.prev = e.prev[:0]
-	e.cur += maxMatchOffset + 1
-}
--- a/vendor/github.com/klauspost/compress/flate/stateless.go
+++ b/vendor/github.com/klauspost/compress/flate/stateless.go
@ -0,0 +1,252 @@
+package flate
+
+import (
+	"io"
+	"math"
+)
+
+const (
+	maxStatelessBlock = math.MaxInt16
+
+	slTableBits  = 13
+	slTableSize  = 1 << slTableBits
+	slTableShift = 32 - slTableBits
+)
+
+type statelessWriter struct {
+	dst    io.Writer
+	closed bool
+}
+
+func (s *statelessWriter) Close() error {
+	if s.closed {
+		return nil
+	}
+	s.closed = true
+	// Emit EOF block
+	return StatelessDeflate(s.dst, nil, true)
+}
+
+func (s *statelessWriter) Write(p []byte) (n int, err error) {
+	err = StatelessDeflate(s.dst, p, false)
+	if err != nil {
+		return 0, err
+	}
+	return len(p), nil
+}
+
+func (s *statelessWriter) Reset(w io.Writer) {
+	s.dst = w
+	s.closed = false
+}
+
+// NewStatelessWriter will do compression but without maintaining any state
+// between Write calls.
+// There will be no memory kept between Write calls,
+// but compression and speed will be suboptimal.
+// Because of this, the size of actual Write calls will affect output size.
+func NewStatelessWriter(dst io.Writer) io.WriteCloser {
+	return &statelessWriter{dst: dst}
+}
+
+// StatelessDeflate allows to compress directly to a Writer without retaining state.
+// When returning everything will be flushed.
+func StatelessDeflate(out io.Writer, in []byte, eof bool) error {
+	var dst tokens
+	bw := newHuffmanBitWriter(out)
+	if eof && len(in) == 0 {
+		// Just write an EOF block.
+		// Could be faster...
+		bw.writeStoredHeader(0, true)
+		bw.flush()
+		return bw.err
+	}
+
+	for len(in) > 0 {
+		todo := in
+		if len(todo) > maxStatelessBlock {
+			todo = todo[:maxStatelessBlock]
+		}
+		in = in[len(todo):]
+		// Compress
+		statelessEnc(&dst, todo)
+		isEof := eof && len(in) == 0
+
+		if dst.n == 0 {
+			bw.writeStoredHeader(len(todo), isEof)
+			if bw.err != nil {
+				return bw.err
+			}
+			bw.writeBytes(todo)
+		} else if int(dst.n) > len(todo)-len(todo)>>4 {
+			// If we removed less than 1/16th, huffman compress the block.
+			bw.writeBlockHuff(isEof, todo, false)
+		} else {
+			bw.writeBlockDynamic(&dst, isEof, todo, false)
+		}
+		if bw.err != nil {
+			return bw.err
+		}
+		dst.Reset()
+	}
+	if !eof {
+		// Align.
+		bw.writeStoredHeader(0, false)
+	}
+	bw.flush()
+	return bw.err
+}
+
+func hashSL(u uint32) uint32 {
+	return (u * 0x1e35a7bd) >> slTableShift
+}
+
+func load3216(b []byte, i int16) uint32 {
+	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
+	b = b[i:]
+	b = b[:4]
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load6416(b []byte, i int16) uint64 {
+	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
+	b = b[i:]
+	b = b[:8]
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+func statelessEnc(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+
+	type tableEntry struct {
+		offset int16
+	}
+
+	var table [slTableSize]tableEntry
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	s := int16(1)
+	nextEmit := int16(0)
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int16(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	cv := load3216(src, s)
+
+	for {
+		const skipLog = 5
+		const doEvery = 2
+
+		nextS := s
+		var candidate tableEntry
+		for {
+			nextHash := hashSL(cv)
+			candidate = table[nextHash]
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit || nextS <= 0 {
+				goto emitRemainder
+			}
+
+			now := load6416(src, nextS)
+			table[nextHash] = tableEntry{offset: s}
+			nextHash = hashSL(uint32(now))
+
+			if cv == load3216(src, candidate.offset) {
+				table[nextHash] = tableEntry{offset: nextS}
+				break
+			}
+
+			// Do one right away...
+			cv = uint32(now)
+			s = nextS
+			nextS++
+			candidate = table[nextHash]
+			now >>= 8
+			table[nextHash] = tableEntry{offset: s}
+
+			if cv == load3216(src, candidate.offset) {
+				table[nextHash] = tableEntry{offset: nextS}
+				break
+			}
+			cv = uint32(now)
+			s = nextS
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+
+			// Extend the 4-byte match as long as possible.
+			t := candidate.offset
+			l := int16(matchLen(src[s+4:], src[t+4:]) + 4)
+
+			// Extend backwards
+			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+				s--
+				t--
+				l++
+			}
+			if nextEmit < s {
+				emitLiteral(dst, src[nextEmit:s])
+			}
+
+			// Save the match found
+			dst.AddMatchLong(int32(l), uint32(s-t-baseMatchOffset))
+			s += l
+			nextEmit = s
+			if nextS >= s {
+				s = nextS + 1
+			}
+			if s >= sLimit {
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-2 and at s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load6416(src, s-2)
+			o := s - 2
+			prevHash := hashSL(uint32(x))
+			table[prevHash] = tableEntry{offset: o}
+			x >>= 16
+			currHash := hashSL(uint32(x))
+			candidate = table[currHash]
+			table[currHash] = tableEntry{offset: o + 2}
+
+			if uint32(x) != load3216(src, candidate.offset) {
+				cv = uint32(x >> 8)
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
--- a/vendor/github.com/klauspost/compress/flate/token.go
+++ b/vendor/github.com/klauspost/compress/flate/token.go
@ -4,7 +4,13 @@

 package flate

-import "fmt"
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"io"
+	"math"
+)

 const (
 	// 2 bits:   type   0 = literal  1=EOF  2=Match   3=Unused
@ -19,7 +25,7 @@ const (

 // The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH)
 // is lengthCodes[length - MIN_MATCH_LENGTH]
-var lengthCodes = [...]uint32{
+var lengthCodes = [256]uint8{
 	0, 1, 2, 3, 4, 5, 6, 7, 8, 8,
 	9, 9, 10, 10, 11, 11, 12, 12, 12, 12,
 	13, 13, 13, 13, 14, 14, 14, 14, 15, 15,
@ -48,7 +54,37 @@ var lengthCodes = [...]uint32{
 	27, 27, 27, 27, 27, 28,
 }

-var offsetCodes = [...]uint32{
+// lengthCodes1 is length codes, but starting at 1.
+var lengthCodes1 = [256]uint8{
+	1, 2, 3, 4, 5, 6, 7, 8, 9, 9,
+	10, 10, 11, 11, 12, 12, 13, 13, 13, 13,
+	14, 14, 14, 14, 15, 15, 15, 15, 16, 16,
+	16, 16, 17, 17, 17, 17, 17, 17, 17, 17,
+	18, 18, 18, 18, 18, 18, 18, 18, 19, 19,
+	19, 19, 19, 19, 19, 19, 20, 20, 20, 20,
+	20, 20, 20, 20, 21, 21, 21, 21, 21, 21,
+	21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+	22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+	22, 22, 22, 22, 22, 22, 23, 23, 23, 23,
+	23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+	23, 23, 24, 24, 24, 24, 24, 24, 24, 24,
+	24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 29,
+}
+
+var offsetCodes = [256]uint32{
 	0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
 	8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
 	10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
@ -67,49 +103,265 @@ var offsetCodes = [...]uint32{
 	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
 }

+// offsetCodes14 are offsetCodes, but with 14 added.
+var offsetCodes14 = [256]uint32{
+	14, 15, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
+	22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+}
+
 type token uint32

 type tokens struct {
-	tokens [maxStoreBlockSize + 1]token
-	n      uint16 // Must be able to contain maxStoreBlockSize
+	nLits     int
+	extraHist [32]uint16  // codes 256->maxnumlit
+	offHist   [32]uint16  // offset codes
+	litHist   [256]uint16 // codes 0->255
+	n         uint16      // Must be able to contain maxStoreBlockSize
+	tokens    [maxStoreBlockSize + 1]token
 }

-// Convert a literal into a literal token.
-func literalToken(literal uint32) token { return token(literalType + literal) }
-
-// Convert a < xlength, xoffset > pair into a match token.
-func matchToken(xlength uint32, xoffset uint32) token {
-	return token(matchType + xlength<<lengthShift + xoffset)
-}
-
-func matchTokend(xlength uint32, xoffset uint32) token {
-	if xlength > maxMatchLength || xoffset > maxMatchOffset {
-		panic(fmt.Sprintf("Invalid match: len: %d, offset: %d\n", xlength, xoffset))
-		return token(matchType)
+func (t *tokens) Reset() {
+	if t.n == 0 {
+		return
 	}
-	return token(matchType + xlength<<lengthShift + xoffset)
+	t.n = 0
+	t.nLits = 0
+	for i := range t.litHist[:] {
+		t.litHist[i] = 0
+	}
+	for i := range t.extraHist[:] {
+		t.extraHist[i] = 0
+	}
+	for i := range t.offHist[:] {
+		t.offHist[i] = 0
+	}
+}
+
+func (t *tokens) Fill() {
+	if t.n == 0 {
+		return
+	}
+	for i, v := range t.litHist[:] {
+		if v == 0 {
+			t.litHist[i] = 1
+			t.nLits++
+		}
+	}
+	for i, v := range t.extraHist[:literalCount-256] {
+		if v == 0 {
+			t.nLits++
+			t.extraHist[i] = 1
+		}
+	}
+	for i, v := range t.offHist[:offsetCodeCount] {
+		if v == 0 {
+			t.offHist[i] = 1
+		}
+	}
+}
+
+func indexTokens(in []token) tokens {
+	var t tokens
+	t.indexTokens(in)
+	return t
+}
+
+func (t *tokens) indexTokens(in []token) {
+	t.Reset()
+	for _, tok := range in {
+		if tok < matchType {
+			t.tokens[t.n] = tok
+			t.litHist[tok]++
+			t.n++
+			continue
+		}
+		t.AddMatch(uint32(tok.length()), tok.offset())
+	}
+}
+
+// emitLiteral writes a literal chunk and returns the number of bytes written.
+func emitLiteral(dst *tokens, lit []byte) {
+	ol := int(dst.n)
+	for i, v := range lit {
+		dst.tokens[(i+ol)&maxStoreBlockSize] = token(v)
+		dst.litHist[v]++
+	}
+	dst.n += uint16(len(lit))
+	dst.nLits += len(lit)
+}
+
+func (t *tokens) AddLiteral(lit byte) {
+	t.tokens[t.n] = token(lit)
+	t.litHist[lit]++
+	t.n++
+	t.nLits++
+}
+
+// EstimatedBits will return an minimum size estimated by an *optimal*
+// compression of the block.
+// The size of the block
+func (t *tokens) EstimatedBits() int {
+	shannon := float64(0)
+	bits := int(0)
+	nMatches := 0
+	if t.nLits > 0 {
+		invTotal := 1.0 / float64(t.nLits)
+		for _, v := range t.litHist[:] {
+			if v > 0 {
+				n := float64(v)
+				shannon += math.Ceil(-math.Log2(n*invTotal) * n)
+			}
+		}
+		// Just add 15 for EOB
+		shannon += 15
+		for _, v := range t.extraHist[1 : literalCount-256] {
+			if v > 0 {
+				n := float64(v)
+				shannon += math.Ceil(-math.Log2(n*invTotal) * n)
+				bits += int(lengthExtraBits[v&31]) * int(v)
+				nMatches += int(v)
+			}
+		}
+	}
+	if nMatches > 0 {
+		invTotal := 1.0 / float64(nMatches)
+		for _, v := range t.offHist[:offsetCodeCount] {
+			if v > 0 {
+				n := float64(v)
+				shannon += math.Ceil(-math.Log2(n*invTotal) * n)
+				bits += int(offsetExtraBits[v&31]) * int(n)
+			}
+		}
+	}
+
+	return int(shannon) + bits
+}
+
+// AddMatch adds a match to the tokens.
+// This function is very sensitive to inlining and right on the border.
+func (t *tokens) AddMatch(xlength uint32, xoffset uint32) {
+	if debugDecode {
+		if xlength >= maxMatchLength+baseMatchLength {
+			panic(fmt.Errorf("invalid length: %v", xlength))
+		}
+		if xoffset >= maxMatchOffset+baseMatchOffset {
+			panic(fmt.Errorf("invalid offset: %v", xoffset))
+		}
+	}
+	t.nLits++
+	lengthCode := lengthCodes1[uint8(xlength)] & 31
+	t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset)
+	t.extraHist[lengthCode]++
+	t.offHist[offsetCode(xoffset)&31]++
+	t.n++
+}
+
+// AddMatchLong adds a match to the tokens, potentially longer than max match length.
+// Length should NOT have the base subtracted, only offset should.
+func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) {
+	if debugDecode {
+		if xoffset >= maxMatchOffset+baseMatchOffset {
+			panic(fmt.Errorf("invalid offset: %v", xoffset))
+		}
+	}
+	oc := offsetCode(xoffset) & 31
+	for xlength > 0 {
+		xl := xlength
+		if xl > 258 {
+			// We need to have at least baseMatchLength left over for next loop.
+			xl = 258 - baseMatchLength
+		}
+		xlength -= xl
+		xl -= 3
+		t.nLits++
+		lengthCode := lengthCodes1[uint8(xl)] & 31
+		t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
+		t.extraHist[lengthCode]++
+		t.offHist[oc]++
+		t.n++
+	}
+}
+
+func (t *tokens) AddEOB() {
+	t.tokens[t.n] = token(endBlockMarker)
+	t.extraHist[0]++
+	t.n++
+}
+
+func (t *tokens) Slice() []token {
+	return t.tokens[:t.n]
+}
+
+// VarInt returns the tokens as varint encoded bytes.
+func (t *tokens) VarInt() []byte {
+	var b = make([]byte, binary.MaxVarintLen32*int(t.n))
+	var off int
+	for _, v := range t.tokens[:t.n] {
+		off += binary.PutUvarint(b[off:], uint64(v))
+	}
+	return b[:off]
+}
+
+// FromVarInt restores t to the varint encoded tokens provided.
+// Any data in t is removed.
+func (t *tokens) FromVarInt(b []byte) error {
+	var buf = bytes.NewReader(b)
+	var toks []token
+	for {
+		r, err := binary.ReadUvarint(buf)
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			return err
+		}
+		toks = append(toks, token(r))
+	}
+	t.indexTokens(toks)
+	return nil
 }

 // Returns the type of a token
 func (t token) typ() uint32 { return uint32(t) & typeMask }

 // Returns the literal of a literal token
-func (t token) literal() uint32 { return uint32(t - literalType) }
+func (t token) literal() uint8 { return uint8(t) }

 // Returns the extra offset of a match token
 func (t token) offset() uint32 { return uint32(t) & offsetMask }

-func (t token) length() uint32 { return uint32((t - matchType) >> lengthShift) }
+func (t token) length() uint8 { return uint8(t >> lengthShift) }

-func lengthCode(len uint32) uint32 { return lengthCodes[len] }
+// The code is never more than 8 bits, but is returned as uint32 for convenience.
+func lengthCode(len uint8) uint32 { return uint32(lengthCodes[len]) }

 // Returns the offset code corresponding to a specific offset
 func offsetCode(off uint32) uint32 {
-	if off < uint32(len(offsetCodes)) {
-		return offsetCodes[off]
-	} else if off>>7 < uint32(len(offsetCodes)) {
-		return offsetCodes[off>>7] + 14
-	} else {
-		return offsetCodes[off>>14] + 28
+	if false {
+		if off < uint32(len(offsetCodes)) {
+			return offsetCodes[off&255]
+		} else if off>>7 < uint32(len(offsetCodes)) {
+			return offsetCodes[(off>>7)&255] + 14
+		} else {
+			return offsetCodes[(off>>14)&255] + 28
+		}
 	}
+	if off < uint32(len(offsetCodes)) {
+		return offsetCodes[uint8(off)]
+	}
+	return offsetCodes14[uint8(off>>7)]
 }
--- a/vendor/github.com/klauspost/compress/gzip/gunzip.go
+++ b/vendor/github.com/klauspost/compress/gzip/gunzip.go
@ -10,11 +10,11 @@ import (
 	"bufio"
 	"encoding/binary"
 	"errors"
+	"hash/crc32"
 	"io"
 	"time"

 	"github.com/klauspost/compress/flate"
-	"github.com/klauspost/crc32"
 )

 const (
--- a/vendor/github.com/klauspost/compress/gzip/gzip.go
+++ b/vendor/github.com/klauspost/compress/gzip/gzip.go
@ -7,10 +7,10 @@ package gzip
 import (
 	"errors"
 	"fmt"
+	"hash/crc32"
 	"io"

 	"github.com/klauspost/compress/flate"
-	"github.com/klauspost/crc32"
 )

 // These constants are copied from the flate package, so that code that imports
@ -22,6 +22,13 @@ const (
 	DefaultCompression  = flate.DefaultCompression
 	ConstantCompression = flate.ConstantCompression
 	HuffmanOnly         = flate.HuffmanOnly
+
+	// StatelessCompression will do compression but without maintaining any state
+	// between Write calls.
+	// There will be no memory kept between Write calls,
+	// but compression and speed will be suboptimal.
+	// Because of this, the size of actual Write calls will affect output size.
+	StatelessCompression = -3
 )

 // A Writer is an io.WriteCloser.
@ -59,7 +66,7 @@ func NewWriter(w io.Writer) *Writer {
 // integer value between BestSpeed and BestCompression inclusive. The error
 // returned will be nil if the level is valid.
 func NewWriterLevel(w io.Writer, level int) (*Writer, error) {
-	if level < HuffmanOnly || level > BestCompression {
+	if level < StatelessCompression || level > BestCompression {
 		return nil, fmt.Errorf("gzip: invalid compression level: %d", level)
 	}
 	z := new(Writer)
@ -69,9 +76,12 @@ func NewWriterLevel(w io.Writer, level int) (*Writer, error) {

 func (z *Writer) init(w io.Writer, level int) {
 	compressor := z.compressor
-	if compressor != nil {
-		compressor.Reset(w)
+	if level != StatelessCompression {
+		if compressor != nil {
+			compressor.Reset(w)
+		}
 	}
+
 	*z = Writer{
 		Header: Header{
 			OS: 255, // unknown
@ -189,12 +199,16 @@ func (z *Writer) Write(p []byte) (int, error) {
 				return n, z.err
 			}
 		}
-		if z.compressor == nil {
+
+		if z.compressor == nil && z.level != StatelessCompression {
 			z.compressor, _ = flate.NewWriter(z.w, z.level)
 		}
 	}
 	z.size += uint32(len(p))
 	z.digest = crc32.Update(z.digest, crc32.IEEETable, p)
+	if z.level == StatelessCompression {
+		return len(p), flate.StatelessDeflate(z.w, p, false)
+	}
 	n, z.err = z.compressor.Write(p)
 	return n, z.err
 }
@ -211,7 +225,7 @@ func (z *Writer) Flush() error {
 	if z.err != nil {
 		return z.err
 	}
-	if z.closed {
+	if z.closed || z.level == StatelessCompression {
 		return nil
 	}
 	if !z.wroteHeader {
@ -240,7 +254,11 @@ func (z *Writer) Close() error {
 			return z.err
 		}
 	}
-	z.err = z.compressor.Close()
+	if z.level == StatelessCompression {
+		z.err = flate.StatelessDeflate(z.w, nil, true)
+	} else {
+		z.err = z.compressor.Close()
+	}
 	if z.err != nil {
 		return z.err
 	}