1
0
Fork 0
forked from forgejo/forgejo
This commit is contained in:
techknowlogick 2021-02-28 18:08:33 -05:00 committed by GitHub
parent 030646eea4
commit 47f6a4ec3f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
947 changed files with 26119 additions and 7062 deletions

View file

@ -116,6 +116,8 @@ BenchmarkParallel/8MB-4 2182.48 17252.88 7.91x
These measurements were performed on AWS EC2 instance of type `c5.xlarge` equipped with a Xeon Platinum 8124M CPU at 3.0 GHz.
If only one or two inputs are available the scalar calculation method will be used for the
optimal speed in these cases.
## Operation

View file

@ -1,132 +0,0 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE.Golang file.
// Code generated by go run gen.go -output md5block.go; DO NOT EDIT.
package md5simd
import (
"encoding/binary"
"math/bits"
)
// digest holds the running MD5 state for a single (scalar) lane.
type digest struct {
s [4]uint32 // current hash state words A, B, C, D (see blockGeneric below)
x [BlockSize]byte // buffer for a trailing partial input block — presumably filled by the caller; confirm against Write
nx int // NOTE(review): looks like the number of bytes currently buffered in x — confirm against caller
len uint64 // NOTE(review): looks like the total number of bytes hashed so far — confirm against caller
}
// blockGeneric is the portable pure-Go MD5 block function. It consumes every
// complete BlockSize-byte (64-byte) block of p, updating dig.s in place; any
// trailing partial block is excluded by the loop condition and is not touched
// here. Generated code ("DO NOT EDIT" per the file header) — kept verbatim.
func blockGeneric(dig *digest, p []byte) {
// load state
a, b, c, d := dig.s[0], dig.s[1], dig.s[2], dig.s[3]
for i := 0; i <= len(p)-BlockSize; i += BlockSize {
// eliminate bounds checks on p
q := p[i:]
q = q[:BlockSize:BlockSize]
// save current state
aa, bb, cc, dd := a, b, c, d
// load input block: 16 little-endian 32-bit words
x0 := binary.LittleEndian.Uint32(q[4*0x0:])
x1 := binary.LittleEndian.Uint32(q[4*0x1:])
x2 := binary.LittleEndian.Uint32(q[4*0x2:])
x3 := binary.LittleEndian.Uint32(q[4*0x3:])
x4 := binary.LittleEndian.Uint32(q[4*0x4:])
x5 := binary.LittleEndian.Uint32(q[4*0x5:])
x6 := binary.LittleEndian.Uint32(q[4*0x6:])
x7 := binary.LittleEndian.Uint32(q[4*0x7:])
x8 := binary.LittleEndian.Uint32(q[4*0x8:])
x9 := binary.LittleEndian.Uint32(q[4*0x9:])
xa := binary.LittleEndian.Uint32(q[4*0xa:])
xb := binary.LittleEndian.Uint32(q[4*0xb:])
xc := binary.LittleEndian.Uint32(q[4*0xc:])
xd := binary.LittleEndian.Uint32(q[4*0xd:])
xe := binary.LittleEndian.Uint32(q[4*0xe:])
xf := binary.LittleEndian.Uint32(q[4*0xf:])
// round 1: F(b,c,d) computed branch-free as ((c^d)&b)^d (selects c when b=1, d when b=0)
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x0+0xd76aa478, 7)
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x1+0xe8c7b756, 12)
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+x2+0x242070db, 17)
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+x3+0xc1bdceee, 22)
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x4+0xf57c0faf, 7)
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x5+0x4787c62a, 12)
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+x6+0xa8304613, 17)
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+x7+0xfd469501, 22)
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x8+0x698098d8, 7)
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x9+0x8b44f7af, 12)
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+xa+0xffff5bb1, 17)
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+xb+0x895cd7be, 22)
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+xc+0x6b901122, 7)
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+xd+0xfd987193, 12)
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+xe+0xa679438e, 17)
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+xf+0x49b40821, 22)
// round 2: G(b,c,d) computed as ((b^c)&d)^c (selects b when d=1, c when d=0)
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x1+0xf61e2562, 5)
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+x6+0xc040b340, 9)
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+xb+0x265e5a51, 14)
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x0+0xe9b6c7aa, 20)
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x5+0xd62f105d, 5)
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+xa+0x02441453, 9)
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+xf+0xd8a1e681, 14)
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x4+0xe7d3fbc8, 20)
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x9+0x21e1cde6, 5)
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+xe+0xc33707d6, 9)
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+x3+0xf4d50d87, 14)
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x8+0x455a14ed, 20)
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+xd+0xa9e3e905, 5)
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+x2+0xfcefa3f8, 9)
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+x7+0x676f02d9, 14)
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+xc+0x8d2a4c8a, 20)
// round 3: H(b,c,d) = b^c^d
a = b + bits.RotateLeft32((b^c^d)+a+x5+0xfffa3942, 4)
d = a + bits.RotateLeft32((a^b^c)+d+x8+0x8771f681, 11)
c = d + bits.RotateLeft32((d^a^b)+c+xb+0x6d9d6122, 16)
b = c + bits.RotateLeft32((c^d^a)+b+xe+0xfde5380c, 23)
a = b + bits.RotateLeft32((b^c^d)+a+x1+0xa4beea44, 4)
d = a + bits.RotateLeft32((a^b^c)+d+x4+0x4bdecfa9, 11)
c = d + bits.RotateLeft32((d^a^b)+c+x7+0xf6bb4b60, 16)
b = c + bits.RotateLeft32((c^d^a)+b+xa+0xbebfbc70, 23)
a = b + bits.RotateLeft32((b^c^d)+a+xd+0x289b7ec6, 4)
d = a + bits.RotateLeft32((a^b^c)+d+x0+0xeaa127fa, 11)
c = d + bits.RotateLeft32((d^a^b)+c+x3+0xd4ef3085, 16)
b = c + bits.RotateLeft32((c^d^a)+b+x6+0x04881d05, 23)
a = b + bits.RotateLeft32((b^c^d)+a+x9+0xd9d4d039, 4)
d = a + bits.RotateLeft32((a^b^c)+d+xc+0xe6db99e5, 11)
c = d + bits.RotateLeft32((d^a^b)+c+xf+0x1fa27cf8, 16)
b = c + bits.RotateLeft32((c^d^a)+b+x2+0xc4ac5665, 23)
// round 4: I(b,c,d) = c^(b|^d)  (^d is bitwise NOT of d in Go)
a = b + bits.RotateLeft32((c^(b|^d))+a+x0+0xf4292244, 6)
d = a + bits.RotateLeft32((b^(a|^c))+d+x7+0x432aff97, 10)
c = d + bits.RotateLeft32((a^(d|^b))+c+xe+0xab9423a7, 15)
b = c + bits.RotateLeft32((d^(c|^a))+b+x5+0xfc93a039, 21)
a = b + bits.RotateLeft32((c^(b|^d))+a+xc+0x655b59c3, 6)
d = a + bits.RotateLeft32((b^(a|^c))+d+x3+0x8f0ccc92, 10)
c = d + bits.RotateLeft32((a^(d|^b))+c+xa+0xffeff47d, 15)
b = c + bits.RotateLeft32((d^(c|^a))+b+x1+0x85845dd1, 21)
a = b + bits.RotateLeft32((c^(b|^d))+a+x8+0x6fa87e4f, 6)
d = a + bits.RotateLeft32((b^(a|^c))+d+xf+0xfe2ce6e0, 10)
c = d + bits.RotateLeft32((a^(d|^b))+c+x6+0xa3014314, 15)
b = c + bits.RotateLeft32((d^(c|^a))+b+xd+0x4e0811a1, 21)
a = b + bits.RotateLeft32((c^(b|^d))+a+x4+0xf7537e82, 6)
d = a + bits.RotateLeft32((b^(a|^c))+d+xb+0xbd3af235, 10)
c = d + bits.RotateLeft32((a^(d|^b))+c+x2+0x2ad7d2bb, 15)
b = c + bits.RotateLeft32((d^(c|^a))+b+x9+0xeb86d391, 21)
// add saved state (Davies–Meyer feed-forward per block)
a += aa
b += bb
c += cc
d += dd
}
// save state
dig.s[0], dig.s[1], dig.s[2], dig.s[3] = a, b, c, d
}

View file

@ -2,70 +2,72 @@
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
//+build !noasm,!appengine,gc
// This is the AVX512 implementation of the MD5 block function (16-way parallel)
#define prep(index) \
KMOVQ kmask, ktmp \
KMOVQ kmask, ktmp \
VPGATHERDD index*4(base)(ptrs*1), ktmp, mem
#define ROUND1(a, b, c, d, index, const, shift) \
VXORPS c, tmp, tmp \
VPADDD 64*const(consts), a, a \
VPADDD mem, a, a \
VPTERNLOGD $0x6C, b, d, tmp \
prep(index) \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VMOVAPD c, tmp \
VPADDD b, a, a
VPXORQ c, tmp, tmp \
VPADDD 64*const(consts), a, a \
VPADDD mem, a, a \
VPTERNLOGD $0x6C, b, d, tmp \
prep(index) \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VMOVAPD c, tmp \
VPADDD b, a, a
#define ROUND1noload(a, b, c, d, const, shift) \
VXORPS c, tmp, tmp \
VPADDD 64*const(consts), a, a \
VPADDD mem, a, a \
VPTERNLOGD $0x6C, b, d, tmp \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VMOVAPD c, tmp \
VPADDD b, a, a
VPXORQ c, tmp, tmp \
VPADDD 64*const(consts), a, a \
VPADDD mem, a, a \
VPTERNLOGD $0x6C, b, d, tmp \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VMOVAPD c, tmp \
VPADDD b, a, a
#define ROUND2(a, b, c, d, zreg, const, shift) \
VPADDD 64*const(consts), a, a \
VPADDD zreg, a, a \
VANDNPS c, tmp, tmp \
VPTERNLOGD $0xEC, b, tmp, tmp2 \
VMOVAPD c, tmp \
VPADDD tmp2, a, a \
VMOVAPD c, tmp2 \
VPROLD $shift, a, a \
VPADDD b, a, a
VPADDD 64*const(consts), a, a \
VPADDD zreg, a, a \
VANDNPD c, tmp, tmp \
VPTERNLOGD $0xEC, b, tmp, tmp2 \
VMOVAPD c, tmp \
VPADDD tmp2, a, a \
VMOVAPD c, tmp2 \
VPROLD $shift, a, a \
VPADDD b, a, a
#define ROUND3(a, b, c, d, zreg, const, shift) \
VPADDD 64*const(consts), a, a \
VPADDD zreg, a, a \
VPTERNLOGD $0x96, b, d, tmp \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VMOVAPD b, tmp \
VPADDD b, a, a
VPADDD 64*const(consts), a, a \
VPADDD zreg, a, a \
VPTERNLOGD $0x96, b, d, tmp \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VMOVAPD b, tmp \
VPADDD b, a, a
#define ROUND4(a, b, c, d, zreg, const, shift) \
VPADDD 64*const(consts), a, a \
VPADDD zreg, a, a \
VPTERNLOGD $0x36, b, c, tmp \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VXORPS c, ones, tmp \
VPADDD b, a, a
VPADDD 64*const(consts), a, a \
VPADDD zreg, a, a \
VPTERNLOGD $0x36, b, c, tmp \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VPXORQ c, ones, tmp \
VPADDD b, a, a
TEXT ·block16(SB),4,$0-40
TEXT ·block16(SB), 4, $0-40
MOVQ state+0(FP), BX
MOVQ base+8(FP), SI
MOVQ ptrs+16(FP), AX
KMOVQ mask+24(FP), K1
MOVQ n+32(FP), DX
MOVQ ·avx512md5consts+0(SB), DI
MOVQ state+0(FP), BX
MOVQ base+8(FP), SI
MOVQ ptrs+16(FP), AX
KMOVQ mask+24(FP), K1
MOVQ n+32(FP), DX
MOVQ ·avx512md5consts+0(SB), DI
#define a Z0
#define b Z1
@ -90,7 +92,6 @@ TEXT ·block16(SB),4,$0-40
// Registers Z16 through to Z31 are used for caching purposes
// ----------------------------------------------------------
#define dig BX
#define count DX
#define base SI
@ -105,7 +106,7 @@ TEXT ·block16(SB),4,$0-40
// load source pointers
VMOVUPD 0x00(AX), ptrs
MOVQ $-1, AX
MOVQ $-1, AX
VPBROADCASTQ AX, ones
loop:
@ -190,7 +191,7 @@ loop:
ROUND3(c,d,a,b, Z31,0x2e,16)
ROUND3(b,c,d,a, Z18,0x2f,23)
VXORPS d, ones, tmp
VPXORQ d, ones, tmp
ROUND4(a,b,c,d, Z16,0x30, 6)
ROUND4(d,a,b,c, Z23,0x31,10)

View file

@ -1,3 +1,5 @@
//+build !noasm,!appengine,gc
// Copyright (c) 2018 Igneous Systems
// MIT License
//
@ -70,7 +72,7 @@ TEXT ·block8(SB), 4, $0-40
#define consts DI
#define prepmask \
VXORPS mask, mask, mask \
VPXOR mask, mask, mask \
VPCMPGTD mask, off, mask
#define prep(index) \
@ -86,14 +88,14 @@ TEXT ·block8(SB), 4, $0-40
#define roll(shift, a) \
VPSLLD $shift, a, rtmp1 \
VPSRLD $32-shift, a, a \
VORPS rtmp1, a, a
VPOR rtmp1, a, a
#define ROUND1(a, b, c, d, index, const, shift) \
VXORPS c, tmp, tmp \
VPXOR c, tmp, tmp \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
VANDPS b, tmp, tmp \
VXORPS d, tmp, tmp \
VPAND b, tmp, tmp \
VPXOR d, tmp, tmp \
prep(index) \
VPADDD tmp, a, a \
roll(shift,a) \
@ -101,11 +103,11 @@ TEXT ·block8(SB), 4, $0-40
VPADDD b, a, a
#define ROUND1load(a, b, c, d, index, const, shift) \
VXORPS c, tmp, tmp \
VXORPD c, tmp, tmp \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
VANDPS b, tmp, tmp \
VXORPS d, tmp, tmp \
VPAND b, tmp, tmp \
VPXOR d, tmp, tmp \
load(index) \
VPADDD tmp, a, a \
roll(shift,a) \
@ -115,10 +117,10 @@ TEXT ·block8(SB), 4, $0-40
#define ROUND2(a, b, c, d, index, const, shift) \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
VANDPS b, tmp2, tmp2 \
VANDNPS c, tmp, tmp \
VPAND b, tmp2, tmp2 \
VANDNPD c, tmp, tmp \
load(index) \
VORPS tmp, tmp2, tmp2 \
VPOR tmp, tmp2, tmp2 \
VMOVAPD c, tmp \
VPADDD tmp2, a, a \
VMOVAPD c, tmp2 \
@ -129,8 +131,8 @@ TEXT ·block8(SB), 4, $0-40
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
load(index) \
VXORPS d, tmp, tmp \
VXORPS b, tmp, tmp \
VPXOR d, tmp, tmp \
VPXOR b, tmp, tmp \
VPADDD tmp, a, a \
roll(shift,a) \
VMOVAPD b, tmp \
@ -139,12 +141,12 @@ TEXT ·block8(SB), 4, $0-40
#define ROUND4(a, b, c, d, index, const, shift) \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
VORPS b, tmp, tmp \
VXORPS c, tmp, tmp \
VPOR b, tmp, tmp \
VPXOR c, tmp, tmp \
VPADDD tmp, a, a \
load(index) \
roll(shift,a) \
VXORPS c, ones, tmp \
VPXOR c, ones, tmp \
VPADDD b, a, a
// load digest into state registers
@ -242,7 +244,7 @@ loop:
ROUND3(b,c,d,a, 0,0x2f,23)
load(0)
VXORPS d, ones, tmp
VPXOR d, ones, tmp
ROUND4(a,b,c,d, 7,0x30, 6)
ROUND4(d,a,b,c,14,0x31,10)

View file

@ -9,14 +9,18 @@ package md5simd
import (
"fmt"
"math"
"sync"
"unsafe"
"github.com/klauspost/cpuid"
"github.com/klauspost/cpuid/v2"
)
// hasAVX512 gates the 16-lane AVX512 code path; set once at startup below.
var hasAVX512 bool
// Detect AVX512 support once at program start. Both features are required:
// the assembly uses VANDNPD, which requires AVX512DQ. Technically it could be
// VPTERNLOGQ, which is AVX512F only.
func init() {
hasAVX512 = cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512DQ)
}
//go:noescape
func block8(state *uint32, base uintptr, bufs *int32, cache *byte, n int)
@ -82,45 +86,52 @@ var avx512md5consts = func(c []uint32) []uint32 {
return inf
}(md5consts[:])
func init() {
hasAVX512 = cpuid.CPU.AVX512F()
}
// Interface function to assembly code
func (s *md5Server) blockMd5_x16(d *digest16, input [16][]byte, half bool) {
if hasAVX512 {
blockMd5_avx512(d, input, s.allBufs, &s.maskRounds16)
} else {
d8a, d8b := digest8{}, digest8{}
for i := range d8a.v0 {
j := i + 8
d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
if !half {
d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
}
}
return
}
i8 := [2][8][]byte{}
for i := range i8[0] {
i8[0][i], i8[1][i] = input[i], input[8+i]
}
if half {
blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a)
} else {
wg := sync.WaitGroup{}
wg.Add(2)
go func() { blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a); wg.Done() }()
go func() { blockMd5_avx2(&d8b, i8[1], s.allBufs, &s.maskRounds8b); wg.Done() }()
wg.Wait()
}
// Preparing data using copy is slower since copies aren't inlined.
for i := range d8a.v0 {
j := i + 8
d.v0[i], d.v1[i], d.v2[i], d.v3[i] = d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i]
if !half {
d.v0[j], d.v1[j], d.v2[j], d.v3[j] = d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i]
}
// Calculate on this goroutine
if half {
for i := range s.i8[0][:] {
s.i8[0][i] = input[i]
}
for i := range s.d8a.v0[:] {
s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
}
blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a)
for i := range s.d8a.v0[:] {
d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
}
return
}
for i := range s.i8[0][:] {
s.i8[0][i], s.i8[1][i] = input[i], input[8+i]
}
for i := range s.d8a.v0[:] {
j := (i + 8) & 15
s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
}
// Benchmarks appears to be slightly faster when spinning up 2 goroutines instead
// of using the current for one of the blocks.
s.wg.Add(2)
go func() { blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a); s.wg.Done() }()
go func() { blockMd5_avx2(&s.d8b, s.i8[1], s.allBufs, &s.maskRounds8b); s.wg.Done() }()
s.wg.Wait()
for i := range s.d8a.v0[:] {
d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
}
for i := range s.d8b.v0[:] {
j := (i + 8) & 15
d.v0[j], d.v1[j], d.v2[j], d.v3[j] = s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i]
}
}

View file

@ -2,6 +2,4 @@ module github.com/minio/md5-simd
go 1.14
require (
github.com/klauspost/cpuid v1.2.3
)
require github.com/klauspost/cpuid/v2 v2.0.1

View file

@ -1,2 +1,2 @@
github.com/klauspost/cpuid v1.2.3 h1:CCtW0xUnWGVINKvE/WWOYKdsPV6mawAtvQuSl8guwQs=
github.com/klauspost/cpuid v1.2.3/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
github.com/klauspost/cpuid/v2 v2.0.1 h1:lb04bBEJoAoV48eHs4Eq0UyhmJCkRSdIjQ3uS8WJRM4=
github.com/klauspost/cpuid/v2 v2.0.1/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=

View file

@ -10,6 +10,7 @@ import (
"encoding/binary"
"errors"
"fmt"
"sync"
"sync/atomic"
)
@ -121,6 +122,14 @@ func (d *md5Digest) Close() {
}
}
// sumChPool recycles the buffered one-element result channels used by Sum,
// so repeated Sum calls do not allocate a fresh channel each time.
// Declaring New inline in the composite literal (rather than assigning it in
// a separate init func) avoids package-level init side effects and keeps the
// pool fully constructed before any use.
var sumChPool = sync.Pool{
	New: func() interface{} {
		return make(chan sumResult, 1)
	},
}
// Sum - Return MD5 sum in bytes
func (d *md5Digest) Sum(in []byte) (result []byte) {
if d.blocksCh == nil {
@ -148,10 +157,11 @@ func (d *md5Digest) Sum(in []byte) (result []byte) {
if len(trail)%BlockSize != 0 {
panic(fmt.Errorf("internal error: sum block was not aligned. len=%d, nx=%d", len(trail), d.nx))
}
sumCh := make(chan sumResult, 1)
sumCh := sumChPool.Get().(chan sumResult)
d.sendBlock(blockInput{uid: d.uid, msg: trail, sumCh: sumCh}, true)
sum := <-sumCh
sumChPool.Put(sumCh)
return append(in, sum.digest[:]...)
}

View file

@ -10,8 +10,9 @@ import (
"encoding/binary"
"fmt"
"runtime"
"sync"
"github.com/klauspost/cpuid"
"github.com/klauspost/cpuid/v2"
)
// MD5 initialization constants
@ -23,6 +24,9 @@ const (
init1 = 0xefcdab89
init2 = 0x98badcfe
init3 = 0x10325476
// Use scalar routine when below this many lanes
useScalarBelow = 3
)
// md5ServerUID - Does not start at 0 but next multiple of 16 so as to be able to
@ -56,11 +60,15 @@ type md5Server struct {
maskRounds8b [8]maskRounds // Pre-allocated static array for max 8 rounds (2nd AVX2 core)
allBufs []byte // Preallocated buffer.
buffers chan []byte // Preallocated buffers, sliced from allBufs.
i8 [2][8][]byte // avx2 temporary vars
d8a, d8b digest8
wg sync.WaitGroup
}
// NewServer - Create new object for parallel processing handling
func NewServer() Server {
if !cpuid.CPU.AVX2() {
if !cpuid.CPU.Supports(cpuid.AVX2) {
return &fallbackServer{}
}
md5srv := &md5Server{}
@ -152,7 +160,7 @@ func (s *md5Server) process(newClients chan newClient) {
sum := sumResult{}
// Add end block to current digest.
blockGeneric(&dig, block.msg)
blockScalar(&dig.s, block.msg)
binary.LittleEndian.PutUint32(sum.digest[0:], dig.s[0])
binary.LittleEndian.PutUint32(sum.digest[4:], dig.s[1])
@ -262,6 +270,88 @@ func (s *md5Server) Close() {
// Invoke assembly and send results back
func (s *md5Server) blocks(lanes []blockInput) {
if len(lanes) < useScalarBelow {
// Use scalar routine when below this many lanes
switch len(lanes) {
case 0:
case 1:
lane := lanes[0]
var d digest
a, ok := s.digests[lane.uid]
if ok {
d.s[0] = binary.LittleEndian.Uint32(a[0:4])
d.s[1] = binary.LittleEndian.Uint32(a[4:8])
d.s[2] = binary.LittleEndian.Uint32(a[8:12])
d.s[3] = binary.LittleEndian.Uint32(a[12:16])
} else {
d.s[0] = init0
d.s[1] = init1
d.s[2] = init2
d.s[3] = init3
}
if len(lane.msg) > 0 {
// Update...
blockScalar(&d.s, lane.msg)
}
dig := [Size]byte{}
binary.LittleEndian.PutUint32(dig[0:], d.s[0])
binary.LittleEndian.PutUint32(dig[4:], d.s[1])
binary.LittleEndian.PutUint32(dig[8:], d.s[2])
binary.LittleEndian.PutUint32(dig[12:], d.s[3])
s.digests[lane.uid] = dig
if lane.msg != nil {
s.buffers <- lane.msg
}
lanes[0] = blockInput{}
default:
s.wg.Add(len(lanes))
var results [useScalarBelow]digest
for i := range lanes {
lane := lanes[i]
go func(i int) {
var d digest
defer s.wg.Done()
a, ok := s.digests[lane.uid]
if ok {
d.s[0] = binary.LittleEndian.Uint32(a[0:4])
d.s[1] = binary.LittleEndian.Uint32(a[4:8])
d.s[2] = binary.LittleEndian.Uint32(a[8:12])
d.s[3] = binary.LittleEndian.Uint32(a[12:16])
} else {
d.s[0] = init0
d.s[1] = init1
d.s[2] = init2
d.s[3] = init3
}
if len(lane.msg) == 0 {
results[i] = d
return
}
// Update...
blockScalar(&d.s, lane.msg)
results[i] = d
}(i)
}
s.wg.Wait()
for i, lane := range lanes {
dig := [Size]byte{}
binary.LittleEndian.PutUint32(dig[0:], results[i].s[0])
binary.LittleEndian.PutUint32(dig[4:], results[i].s[1])
binary.LittleEndian.PutUint32(dig[8:], results[i].s[2])
binary.LittleEndian.PutUint32(dig[12:], results[i].s[3])
s.digests[lane.uid] = dig
if lane.msg != nil {
s.buffers <- lane.msg
}
lanes[i] = blockInput{}
}
}
return
}
inputs := [16][]byte{}
for i := range lanes {
inputs[i] = lanes[i].msg

View file

@ -1,19 +1,21 @@
//+build !noasm,!appengine,gc
// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
package md5simd
import (
"sort"
)
// Helper struct for sorting blocks based on length
type lane struct {
len uint
pos uint
}
type digest struct {
s [4]uint32
}
// Helper struct for generating number of rounds in combination with mask for valid lanes
type maskRounds struct {
mask uint64
@ -23,15 +25,22 @@ type maskRounds struct {
func generateMaskAndRounds8(input [8][]byte, mr *[8]maskRounds) (rounds int) {
// Sort on blocks length small to large
var sorted [8]lane
for c, inpt := range input {
for c, inpt := range input[:] {
sorted[c] = lane{uint(len(inpt)), uint(c)}
for i := c - 1; i >= 0; i-- {
// swap so largest is at the end...
if sorted[i].len > sorted[i+1].len {
sorted[i], sorted[i+1] = sorted[i+1], sorted[i]
continue
}
break
}
}
sort.Slice(sorted[:], func(i, j int) bool { return sorted[i].len < sorted[j].len })
// Create mask array including 'rounds' (of processing blocks of 64 bytes) between masks
m, round := uint64(0xff), uint64(0)
for _, s := range sorted {
for _, s := range sorted[:] {
if s.len > 0 {
if uint64(s.len)>>6 > round {
mr[rounds] = maskRounds{m, (uint64(s.len) >> 6) - round}
@ -45,18 +54,24 @@ func generateMaskAndRounds8(input [8][]byte, mr *[8]maskRounds) (rounds int) {
}
func generateMaskAndRounds16(input [16][]byte, mr *[16]maskRounds) (rounds int) {
// Sort on blocks length small to large
var sorted [16]lane
for c, inpt := range input {
for c, inpt := range input[:] {
sorted[c] = lane{uint(len(inpt)), uint(c)}
for i := c - 1; i >= 0; i-- {
// swap so largest is at the end...
if sorted[i].len > sorted[i+1].len {
sorted[i], sorted[i+1] = sorted[i+1], sorted[i]
continue
}
break
}
}
sort.Slice(sorted[:], func(i, j int) bool { return sorted[i].len < sorted[j].len })
// Create mask array including 'rounds' (of processing blocks of 64 bytes) between masks
m, round := uint64(0xffff), uint64(0)
for _, s := range sorted {
for _, s := range sorted[:] {
if s.len > 0 {
if uint64(s.len)>>6 > round {
mr[rounds] = maskRounds{m, (uint64(s.len) >> 6) - round}

View file

@ -27,6 +27,12 @@ type Hasher interface {
Close()
}
// StdlibHasher returns a Hasher that uses the stdlib for hashing.
// Used hashers are stored in a pool for fast reuse.
// NOTE(review): md5Pool is defined elsewhere in this file; its New func is
// assumed to return a hash.Hash (presumably crypto/md5) — confirm against the
// pool's definition.
func StdlibHasher() Hasher {
return &md5Wrapper{Hash: md5Pool.New().(hash.Hash)}
}
// md5Wrapper is a wrapper around the builtin hasher.
type md5Wrapper struct {
hash.Hash

11
vendor/github.com/minio/md5-simd/md5block_amd64.go generated vendored Normal file
View file

@ -0,0 +1,11 @@
// Code generated by command: go run gen.go -out ../md5block_amd64.s -stubs ../md5block_amd64.go -pkg=md5simd. DO NOT EDIT.
// +build !appengine
// +build !noasm
// +build gc
package md5simd
// blockScalar updates dig in place with every complete 64-byte block of p;
// the assembly rounds the length down to a multiple of 64, so any trailing
// partial block is ignored. Implemented in md5block_amd64.s (generated).
//go:noescape
func blockScalar(dig *[4]uint32, p []byte)

714
vendor/github.com/minio/md5-simd/md5block_amd64.s generated vendored Normal file
View file

@ -0,0 +1,714 @@
// Code generated by command: go run gen.go -out ../md5block_amd64.s -stubs ../md5block_amd64.go -pkg=md5simd. DO NOT EDIT.
// +build !appengine
// +build !noasm
// +build gc
// func blockScalar(dig *[4]uint32, p []byte)
TEXT ·blockScalar(SB), $0-32
MOVQ p_len+16(FP), AX
MOVQ dig+0(FP), CX
MOVQ p_base+8(FP), DX
SHRQ $0x06, AX
SHLQ $0x06, AX
LEAQ (DX)(AX*1), AX
CMPQ DX, AX
JEQ end
MOVL (CX), BX
MOVL 4(CX), BP
MOVL 8(CX), SI
MOVL 12(CX), CX
MOVL $0xffffffff, DI
loop:
MOVL (DX), R8
MOVL CX, R9
MOVL BX, R10
MOVL BP, R11
MOVL SI, R12
MOVL CX, R13
// ROUND1
XORL SI, R9
ADDL $0xd76aa478, BX
ADDL R8, BX
ANDL BP, R9
XORL CX, R9
MOVL 4(DX), R8
ADDL R9, BX
ROLL $0x07, BX
MOVL SI, R9
ADDL BP, BX
XORL BP, R9
ADDL $0xe8c7b756, CX
ADDL R8, CX
ANDL BX, R9
XORL SI, R9
MOVL 8(DX), R8
ADDL R9, CX
ROLL $0x0c, CX
MOVL BP, R9
ADDL BX, CX
XORL BX, R9
ADDL $0x242070db, SI
ADDL R8, SI
ANDL CX, R9
XORL BP, R9
MOVL 12(DX), R8
ADDL R9, SI
ROLL $0x11, SI
MOVL BX, R9
ADDL CX, SI
XORL CX, R9
ADDL $0xc1bdceee, BP
ADDL R8, BP
ANDL SI, R9
XORL BX, R9
MOVL 16(DX), R8
ADDL R9, BP
ROLL $0x16, BP
MOVL CX, R9
ADDL SI, BP
XORL SI, R9
ADDL $0xf57c0faf, BX
ADDL R8, BX
ANDL BP, R9
XORL CX, R9
MOVL 20(DX), R8
ADDL R9, BX
ROLL $0x07, BX
MOVL SI, R9
ADDL BP, BX
XORL BP, R9
ADDL $0x4787c62a, CX
ADDL R8, CX
ANDL BX, R9
XORL SI, R9
MOVL 24(DX), R8
ADDL R9, CX
ROLL $0x0c, CX
MOVL BP, R9
ADDL BX, CX
XORL BX, R9
ADDL $0xa8304613, SI
ADDL R8, SI
ANDL CX, R9
XORL BP, R9
MOVL 28(DX), R8
ADDL R9, SI
ROLL $0x11, SI
MOVL BX, R9
ADDL CX, SI
XORL CX, R9
ADDL $0xfd469501, BP
ADDL R8, BP
ANDL SI, R9
XORL BX, R9
MOVL 32(DX), R8
ADDL R9, BP
ROLL $0x16, BP
MOVL CX, R9
ADDL SI, BP
XORL SI, R9
ADDL $0x698098d8, BX
ADDL R8, BX
ANDL BP, R9
XORL CX, R9
MOVL 36(DX), R8
ADDL R9, BX
ROLL $0x07, BX
MOVL SI, R9
ADDL BP, BX
XORL BP, R9
ADDL $0x8b44f7af, CX
ADDL R8, CX
ANDL BX, R9
XORL SI, R9
MOVL 40(DX), R8
ADDL R9, CX
ROLL $0x0c, CX
MOVL BP, R9
ADDL BX, CX
XORL BX, R9
ADDL $0xffff5bb1, SI
ADDL R8, SI
ANDL CX, R9
XORL BP, R9
MOVL 44(DX), R8
ADDL R9, SI
ROLL $0x11, SI
MOVL BX, R9
ADDL CX, SI
XORL CX, R9
ADDL $0x895cd7be, BP
ADDL R8, BP
ANDL SI, R9
XORL BX, R9
MOVL 48(DX), R8
ADDL R9, BP
ROLL $0x16, BP
MOVL CX, R9
ADDL SI, BP
XORL SI, R9
ADDL $0x6b901122, BX
ADDL R8, BX
ANDL BP, R9
XORL CX, R9
MOVL 52(DX), R8
ADDL R9, BX
ROLL $0x07, BX
MOVL SI, R9
ADDL BP, BX
XORL BP, R9
ADDL $0xfd987193, CX
ADDL R8, CX
ANDL BX, R9
XORL SI, R9
MOVL 56(DX), R8
ADDL R9, CX
ROLL $0x0c, CX
MOVL BP, R9
ADDL BX, CX
XORL BX, R9
ADDL $0xa679438e, SI
ADDL R8, SI
ANDL CX, R9
XORL BP, R9
MOVL 60(DX), R8
ADDL R9, SI
ROLL $0x11, SI
MOVL BX, R9
ADDL CX, SI
XORL CX, R9
ADDL $0x49b40821, BP
ADDL R8, BP
ANDL SI, R9
XORL BX, R9
MOVL 4(DX), R8
ADDL R9, BP
ROLL $0x16, BP
MOVL CX, R9
ADDL SI, BP
// ROUND2
MOVL CX, R9
MOVL CX, R14
XORL DI, R9
ADDL $0xf61e2562, BX
ADDL R8, BX
ANDL BP, R14
ANDL SI, R9
MOVL 24(DX), R8
ORL R9, R14
MOVL SI, R9
ADDL R14, BX
MOVL SI, R14
ROLL $0x05, BX
ADDL BP, BX
XORL DI, R9
ADDL $0xc040b340, CX
ADDL R8, CX
ANDL BX, R14
ANDL BP, R9
MOVL 44(DX), R8
ORL R9, R14
MOVL BP, R9
ADDL R14, CX
MOVL BP, R14
ROLL $0x09, CX
ADDL BX, CX
XORL DI, R9
ADDL $0x265e5a51, SI
ADDL R8, SI
ANDL CX, R14
ANDL BX, R9
MOVL (DX), R8
ORL R9, R14
MOVL BX, R9
ADDL R14, SI
MOVL BX, R14
ROLL $0x0e, SI
ADDL CX, SI
XORL DI, R9
ADDL $0xe9b6c7aa, BP
ADDL R8, BP
ANDL SI, R14
ANDL CX, R9
MOVL 20(DX), R8
ORL R9, R14
MOVL CX, R9
ADDL R14, BP
MOVL CX, R14
ROLL $0x14, BP
ADDL SI, BP
XORL DI, R9
ADDL $0xd62f105d, BX
ADDL R8, BX
ANDL BP, R14
ANDL SI, R9
MOVL 40(DX), R8
ORL R9, R14
MOVL SI, R9
ADDL R14, BX
MOVL SI, R14
ROLL $0x05, BX
ADDL BP, BX
XORL DI, R9
ADDL $0x02441453, CX
ADDL R8, CX
ANDL BX, R14
ANDL BP, R9
MOVL 60(DX), R8
ORL R9, R14
MOVL BP, R9
ADDL R14, CX
MOVL BP, R14
ROLL $0x09, CX
ADDL BX, CX
XORL DI, R9
ADDL $0xd8a1e681, SI
ADDL R8, SI
ANDL CX, R14
ANDL BX, R9
MOVL 16(DX), R8
ORL R9, R14
MOVL BX, R9
ADDL R14, SI
MOVL BX, R14
ROLL $0x0e, SI
ADDL CX, SI
XORL DI, R9
ADDL $0xe7d3fbc8, BP
ADDL R8, BP
ANDL SI, R14
ANDL CX, R9
MOVL 36(DX), R8
ORL R9, R14
MOVL CX, R9
ADDL R14, BP
MOVL CX, R14
ROLL $0x14, BP
ADDL SI, BP
XORL DI, R9
ADDL $0x21e1cde6, BX
ADDL R8, BX
ANDL BP, R14
ANDL SI, R9
MOVL 56(DX), R8
ORL R9, R14
MOVL SI, R9
ADDL R14, BX
MOVL SI, R14
ROLL $0x05, BX
ADDL BP, BX
XORL DI, R9
ADDL $0xc33707d6, CX
ADDL R8, CX
ANDL BX, R14
ANDL BP, R9
MOVL 12(DX), R8
ORL R9, R14
MOVL BP, R9
ADDL R14, CX
MOVL BP, R14
ROLL $0x09, CX
ADDL BX, CX
XORL DI, R9
ADDL $0xf4d50d87, SI
ADDL R8, SI
ANDL CX, R14
ANDL BX, R9
MOVL 32(DX), R8
ORL R9, R14
MOVL BX, R9
ADDL R14, SI
MOVL BX, R14
ROLL $0x0e, SI
ADDL CX, SI
XORL DI, R9
ADDL $0x455a14ed, BP
ADDL R8, BP
ANDL SI, R14
ANDL CX, R9
MOVL 52(DX), R8
ORL R9, R14
MOVL CX, R9
ADDL R14, BP
MOVL CX, R14
ROLL $0x14, BP
ADDL SI, BP
XORL DI, R9
ADDL $0xa9e3e905, BX
ADDL R8, BX
ANDL BP, R14
ANDL SI, R9
MOVL 8(DX), R8
ORL R9, R14
MOVL SI, R9
ADDL R14, BX
MOVL SI, R14
ROLL $0x05, BX
ADDL BP, BX
XORL DI, R9
ADDL $0xfcefa3f8, CX
ADDL R8, CX
ANDL BX, R14
ANDL BP, R9
MOVL 28(DX), R8
ORL R9, R14
MOVL BP, R9
ADDL R14, CX
MOVL BP, R14
ROLL $0x09, CX
ADDL BX, CX
XORL DI, R9
ADDL $0x676f02d9, SI
ADDL R8, SI
ANDL CX, R14
ANDL BX, R9
MOVL 48(DX), R8
ORL R9, R14
MOVL BX, R9
ADDL R14, SI
MOVL BX, R14
ROLL $0x0e, SI
ADDL CX, SI
XORL DI, R9
ADDL $0x8d2a4c8a, BP
ADDL R8, BP
ANDL SI, R14
ANDL CX, R9
MOVL 20(DX), R8
ORL R9, R14
MOVL CX, R9
ADDL R14, BP
MOVL CX, R14
ROLL $0x14, BP
ADDL SI, BP
// ROUND3
MOVL SI, R9
ADDL $0xfffa3942, BX
ADDL R8, BX
MOVL 32(DX), R8
XORL CX, R9
XORL BP, R9
ADDL R9, BX
ROLL $0x04, BX
MOVL BP, R9
ADDL BP, BX
ADDL $0x8771f681, CX
ADDL R8, CX
MOVL 44(DX), R8
XORL SI, R9
XORL BX, R9
ADDL R9, CX
ROLL $0x0b, CX
MOVL BX, R9
ADDL BX, CX
ADDL $0x6d9d6122, SI
ADDL R8, SI
MOVL 56(DX), R8
XORL BP, R9
XORL CX, R9
ADDL R9, SI
ROLL $0x10, SI
MOVL CX, R9
ADDL CX, SI
ADDL $0xfde5380c, BP
ADDL R8, BP
MOVL 4(DX), R8
XORL BX, R9
XORL SI, R9
ADDL R9, BP
ROLL $0x17, BP
MOVL SI, R9
ADDL SI, BP
ADDL $0xa4beea44, BX
ADDL R8, BX
MOVL 16(DX), R8
XORL CX, R9
XORL BP, R9
ADDL R9, BX
ROLL $0x04, BX
MOVL BP, R9
ADDL BP, BX
ADDL $0x4bdecfa9, CX
ADDL R8, CX
MOVL 28(DX), R8
XORL SI, R9
XORL BX, R9
ADDL R9, CX
ROLL $0x0b, CX
MOVL BX, R9
ADDL BX, CX
ADDL $0xf6bb4b60, SI
ADDL R8, SI
MOVL 40(DX), R8
XORL BP, R9
XORL CX, R9
ADDL R9, SI
ROLL $0x10, SI
MOVL CX, R9
ADDL CX, SI
ADDL $0xbebfbc70, BP
ADDL R8, BP
MOVL 52(DX), R8
XORL BX, R9
XORL SI, R9
ADDL R9, BP
ROLL $0x17, BP
MOVL SI, R9
ADDL SI, BP
ADDL $0x289b7ec6, BX
ADDL R8, BX
MOVL (DX), R8
XORL CX, R9
XORL BP, R9
ADDL R9, BX
ROLL $0x04, BX
MOVL BP, R9
ADDL BP, BX
ADDL $0xeaa127fa, CX
ADDL R8, CX
MOVL 12(DX), R8
XORL SI, R9
XORL BX, R9
ADDL R9, CX
ROLL $0x0b, CX
MOVL BX, R9
ADDL BX, CX
ADDL $0xd4ef3085, SI
ADDL R8, SI
MOVL 24(DX), R8
XORL BP, R9
XORL CX, R9
ADDL R9, SI
ROLL $0x10, SI
MOVL CX, R9
ADDL CX, SI
ADDL $0x04881d05, BP
ADDL R8, BP
MOVL 36(DX), R8
XORL BX, R9
XORL SI, R9
ADDL R9, BP
ROLL $0x17, BP
MOVL SI, R9
ADDL SI, BP
ADDL $0xd9d4d039, BX
ADDL R8, BX
MOVL 48(DX), R8
XORL CX, R9
XORL BP, R9
ADDL R9, BX
ROLL $0x04, BX
MOVL BP, R9
ADDL BP, BX
ADDL $0xe6db99e5, CX
ADDL R8, CX
MOVL 60(DX), R8
XORL SI, R9
XORL BX, R9
ADDL R9, CX
ROLL $0x0b, CX
MOVL BX, R9
ADDL BX, CX
ADDL $0x1fa27cf8, SI
ADDL R8, SI
MOVL 8(DX), R8
XORL BP, R9
XORL CX, R9
ADDL R9, SI
ROLL $0x10, SI
MOVL CX, R9
ADDL CX, SI
ADDL $0xc4ac5665, BP
ADDL R8, BP
MOVL (DX), R8
XORL BX, R9
XORL SI, R9
ADDL R9, BP
ROLL $0x17, BP
MOVL SI, R9
ADDL SI, BP
// ROUND4
MOVL DI, R9
XORL CX, R9
ADDL $0xf4292244, BX
ADDL R8, BX
ORL BP, R9
XORL SI, R9
ADDL R9, BX
MOVL 28(DX), R8
MOVL DI, R9
ROLL $0x06, BX
XORL SI, R9
ADDL BP, BX
ADDL $0x432aff97, CX
ADDL R8, CX
ORL BX, R9
XORL BP, R9
ADDL R9, CX
MOVL 56(DX), R8
MOVL DI, R9
ROLL $0x0a, CX
XORL BP, R9
ADDL BX, CX
ADDL $0xab9423a7, SI
ADDL R8, SI
ORL CX, R9
XORL BX, R9
ADDL R9, SI
MOVL 20(DX), R8
MOVL DI, R9
ROLL $0x0f, SI
XORL BX, R9
ADDL CX, SI
ADDL $0xfc93a039, BP
ADDL R8, BP
ORL SI, R9
XORL CX, R9
ADDL R9, BP
MOVL 48(DX), R8
MOVL DI, R9
ROLL $0x15, BP
XORL CX, R9
ADDL SI, BP
ADDL $0x655b59c3, BX
ADDL R8, BX
ORL BP, R9
XORL SI, R9
ADDL R9, BX
MOVL 12(DX), R8
MOVL DI, R9
ROLL $0x06, BX
XORL SI, R9
ADDL BP, BX
ADDL $0x8f0ccc92, CX
ADDL R8, CX
ORL BX, R9
XORL BP, R9
ADDL R9, CX
MOVL 40(DX), R8
MOVL DI, R9
ROLL $0x0a, CX
XORL BP, R9
ADDL BX, CX
ADDL $0xffeff47d, SI
ADDL R8, SI
ORL CX, R9
XORL BX, R9
ADDL R9, SI
MOVL 4(DX), R8
MOVL DI, R9
ROLL $0x0f, SI
XORL BX, R9
ADDL CX, SI
ADDL $0x85845dd1, BP
ADDL R8, BP
ORL SI, R9
XORL CX, R9
ADDL R9, BP
MOVL 32(DX), R8
MOVL DI, R9
ROLL $0x15, BP
XORL CX, R9
ADDL SI, BP
ADDL $0x6fa87e4f, BX
ADDL R8, BX
ORL BP, R9
XORL SI, R9
ADDL R9, BX
MOVL 60(DX), R8
MOVL DI, R9
ROLL $0x06, BX
XORL SI, R9
ADDL BP, BX
ADDL $0xfe2ce6e0, CX
ADDL R8, CX
ORL BX, R9
XORL BP, R9
ADDL R9, CX
MOVL 24(DX), R8
MOVL DI, R9
ROLL $0x0a, CX
XORL BP, R9
ADDL BX, CX
ADDL $0xa3014314, SI
ADDL R8, SI
ORL CX, R9
XORL BX, R9
ADDL R9, SI
MOVL 52(DX), R8
MOVL DI, R9
ROLL $0x0f, SI
XORL BX, R9
ADDL CX, SI
ADDL $0x4e0811a1, BP
ADDL R8, BP
ORL SI, R9
XORL CX, R9
ADDL R9, BP
MOVL 16(DX), R8
MOVL DI, R9
ROLL $0x15, BP
XORL CX, R9
ADDL SI, BP
ADDL $0xf7537e82, BX
ADDL R8, BX
ORL BP, R9
XORL SI, R9
ADDL R9, BX
MOVL 44(DX), R8
MOVL DI, R9
ROLL $0x06, BX
XORL SI, R9
ADDL BP, BX
ADDL $0xbd3af235, CX
ADDL R8, CX
ORL BX, R9
XORL BP, R9
ADDL R9, CX
MOVL 8(DX), R8
MOVL DI, R9
ROLL $0x0a, CX
XORL BP, R9
ADDL BX, CX
ADDL $0x2ad7d2bb, SI
ADDL R8, SI
ORL CX, R9
XORL BX, R9
ADDL R9, SI
MOVL 36(DX), R8
MOVL DI, R9
ROLL $0x0f, SI
XORL BX, R9
ADDL CX, SI
ADDL $0xeb86d391, BP
ADDL R8, BP
ORL SI, R9
XORL CX, R9
ADDL R9, BP
ROLL $0x15, BP
ADDL SI, BP
ADDL R10, BX
ADDL R11, BP
ADDL R12, SI
ADDL R13, CX
// Prepare next loop
ADDQ $0x40, DX
CMPQ DX, AX
JB loop
// Write output
MOVQ dig+0(FP), AX
MOVL BX, (AX)
MOVL BP, 4(AX)
MOVL SI, 8(AX)
MOVL CX, 12(AX)
end:
RET