forked from forgejo/forgejo
go1.16 (#14783)
This commit is contained in:
parent
030646eea4
commit
47f6a4ec3f
947 changed files with 26119 additions and 7062 deletions
2
vendor/github.com/minio/md5-simd/README.md
generated
vendored
2
vendor/github.com/minio/md5-simd/README.md
generated
vendored
|
@ -116,6 +116,8 @@ BenchmarkParallel/8MB-4 2182.48 17252.88 7.91x
|
|||
|
||||
These measurements were performed on an AWS EC2 instance of type `c5.xlarge` equipped with a Xeon Platinum 8124M CPU at 3.0 GHz.
|
||||
|
||||
If only one or two inputs are available, the scalar calculation method will be used for the
|
||||
optimal speed in these cases.
|
||||
|
||||
## Operation
|
||||
|
||||
|
|
132
vendor/github.com/minio/md5-simd/block-generic.go
generated
vendored
132
vendor/github.com/minio/md5-simd/block-generic.go
generated
vendored
|
@ -1,132 +0,0 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE.Golang file.
|
||||
|
||||
// Code generated by go run gen.go -output md5block.go; DO NOT EDIT.
|
||||
|
||||
package md5simd
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"math/bits"
|
||||
)
|
||||
|
||||
type digest struct {
|
||||
s [4]uint32
|
||||
x [BlockSize]byte
|
||||
nx int
|
||||
len uint64
|
||||
}
|
||||
|
||||
func blockGeneric(dig *digest, p []byte) {
|
||||
// load state
|
||||
a, b, c, d := dig.s[0], dig.s[1], dig.s[2], dig.s[3]
|
||||
|
||||
for i := 0; i <= len(p)-BlockSize; i += BlockSize {
|
||||
// eliminate bounds checks on p
|
||||
q := p[i:]
|
||||
q = q[:BlockSize:BlockSize]
|
||||
|
||||
// save current state
|
||||
aa, bb, cc, dd := a, b, c, d
|
||||
|
||||
// load input block
|
||||
x0 := binary.LittleEndian.Uint32(q[4*0x0:])
|
||||
x1 := binary.LittleEndian.Uint32(q[4*0x1:])
|
||||
x2 := binary.LittleEndian.Uint32(q[4*0x2:])
|
||||
x3 := binary.LittleEndian.Uint32(q[4*0x3:])
|
||||
x4 := binary.LittleEndian.Uint32(q[4*0x4:])
|
||||
x5 := binary.LittleEndian.Uint32(q[4*0x5:])
|
||||
x6 := binary.LittleEndian.Uint32(q[4*0x6:])
|
||||
x7 := binary.LittleEndian.Uint32(q[4*0x7:])
|
||||
x8 := binary.LittleEndian.Uint32(q[4*0x8:])
|
||||
x9 := binary.LittleEndian.Uint32(q[4*0x9:])
|
||||
xa := binary.LittleEndian.Uint32(q[4*0xa:])
|
||||
xb := binary.LittleEndian.Uint32(q[4*0xb:])
|
||||
xc := binary.LittleEndian.Uint32(q[4*0xc:])
|
||||
xd := binary.LittleEndian.Uint32(q[4*0xd:])
|
||||
xe := binary.LittleEndian.Uint32(q[4*0xe:])
|
||||
xf := binary.LittleEndian.Uint32(q[4*0xf:])
|
||||
|
||||
// round 1
|
||||
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x0+0xd76aa478, 7)
|
||||
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x1+0xe8c7b756, 12)
|
||||
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+x2+0x242070db, 17)
|
||||
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+x3+0xc1bdceee, 22)
|
||||
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x4+0xf57c0faf, 7)
|
||||
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x5+0x4787c62a, 12)
|
||||
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+x6+0xa8304613, 17)
|
||||
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+x7+0xfd469501, 22)
|
||||
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x8+0x698098d8, 7)
|
||||
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x9+0x8b44f7af, 12)
|
||||
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+xa+0xffff5bb1, 17)
|
||||
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+xb+0x895cd7be, 22)
|
||||
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+xc+0x6b901122, 7)
|
||||
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+xd+0xfd987193, 12)
|
||||
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+xe+0xa679438e, 17)
|
||||
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+xf+0x49b40821, 22)
|
||||
|
||||
// round 2
|
||||
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x1+0xf61e2562, 5)
|
||||
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+x6+0xc040b340, 9)
|
||||
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+xb+0x265e5a51, 14)
|
||||
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x0+0xe9b6c7aa, 20)
|
||||
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x5+0xd62f105d, 5)
|
||||
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+xa+0x02441453, 9)
|
||||
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+xf+0xd8a1e681, 14)
|
||||
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x4+0xe7d3fbc8, 20)
|
||||
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x9+0x21e1cde6, 5)
|
||||
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+xe+0xc33707d6, 9)
|
||||
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+x3+0xf4d50d87, 14)
|
||||
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x8+0x455a14ed, 20)
|
||||
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+xd+0xa9e3e905, 5)
|
||||
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+x2+0xfcefa3f8, 9)
|
||||
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+x7+0x676f02d9, 14)
|
||||
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+xc+0x8d2a4c8a, 20)
|
||||
|
||||
// round 3
|
||||
a = b + bits.RotateLeft32((b^c^d)+a+x5+0xfffa3942, 4)
|
||||
d = a + bits.RotateLeft32((a^b^c)+d+x8+0x8771f681, 11)
|
||||
c = d + bits.RotateLeft32((d^a^b)+c+xb+0x6d9d6122, 16)
|
||||
b = c + bits.RotateLeft32((c^d^a)+b+xe+0xfde5380c, 23)
|
||||
a = b + bits.RotateLeft32((b^c^d)+a+x1+0xa4beea44, 4)
|
||||
d = a + bits.RotateLeft32((a^b^c)+d+x4+0x4bdecfa9, 11)
|
||||
c = d + bits.RotateLeft32((d^a^b)+c+x7+0xf6bb4b60, 16)
|
||||
b = c + bits.RotateLeft32((c^d^a)+b+xa+0xbebfbc70, 23)
|
||||
a = b + bits.RotateLeft32((b^c^d)+a+xd+0x289b7ec6, 4)
|
||||
d = a + bits.RotateLeft32((a^b^c)+d+x0+0xeaa127fa, 11)
|
||||
c = d + bits.RotateLeft32((d^a^b)+c+x3+0xd4ef3085, 16)
|
||||
b = c + bits.RotateLeft32((c^d^a)+b+x6+0x04881d05, 23)
|
||||
a = b + bits.RotateLeft32((b^c^d)+a+x9+0xd9d4d039, 4)
|
||||
d = a + bits.RotateLeft32((a^b^c)+d+xc+0xe6db99e5, 11)
|
||||
c = d + bits.RotateLeft32((d^a^b)+c+xf+0x1fa27cf8, 16)
|
||||
b = c + bits.RotateLeft32((c^d^a)+b+x2+0xc4ac5665, 23)
|
||||
|
||||
// round 4
|
||||
a = b + bits.RotateLeft32((c^(b|^d))+a+x0+0xf4292244, 6)
|
||||
d = a + bits.RotateLeft32((b^(a|^c))+d+x7+0x432aff97, 10)
|
||||
c = d + bits.RotateLeft32((a^(d|^b))+c+xe+0xab9423a7, 15)
|
||||
b = c + bits.RotateLeft32((d^(c|^a))+b+x5+0xfc93a039, 21)
|
||||
a = b + bits.RotateLeft32((c^(b|^d))+a+xc+0x655b59c3, 6)
|
||||
d = a + bits.RotateLeft32((b^(a|^c))+d+x3+0x8f0ccc92, 10)
|
||||
c = d + bits.RotateLeft32((a^(d|^b))+c+xa+0xffeff47d, 15)
|
||||
b = c + bits.RotateLeft32((d^(c|^a))+b+x1+0x85845dd1, 21)
|
||||
a = b + bits.RotateLeft32((c^(b|^d))+a+x8+0x6fa87e4f, 6)
|
||||
d = a + bits.RotateLeft32((b^(a|^c))+d+xf+0xfe2ce6e0, 10)
|
||||
c = d + bits.RotateLeft32((a^(d|^b))+c+x6+0xa3014314, 15)
|
||||
b = c + bits.RotateLeft32((d^(c|^a))+b+xd+0x4e0811a1, 21)
|
||||
a = b + bits.RotateLeft32((c^(b|^d))+a+x4+0xf7537e82, 6)
|
||||
d = a + bits.RotateLeft32((b^(a|^c))+d+xb+0xbd3af235, 10)
|
||||
c = d + bits.RotateLeft32((a^(d|^b))+c+x2+0x2ad7d2bb, 15)
|
||||
b = c + bits.RotateLeft32((d^(c|^a))+b+x9+0xeb86d391, 21)
|
||||
|
||||
// add saved state
|
||||
a += aa
|
||||
b += bb
|
||||
c += cc
|
||||
d += dd
|
||||
}
|
||||
|
||||
// save state
|
||||
dig.s[0], dig.s[1], dig.s[2], dig.s[3] = a, b, c, d
|
||||
}
|
103
vendor/github.com/minio/md5-simd/block16_amd64.s
generated
vendored
103
vendor/github.com/minio/md5-simd/block16_amd64.s
generated
vendored
|
@ -2,70 +2,72 @@
|
|||
// Use of this source code is governed by a license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
//+build !noasm,!appengine,gc
|
||||
|
||||
// This is the AVX512 implementation of the MD5 block function (16-way parallel)
|
||||
|
||||
#define prep(index) \
|
||||
KMOVQ kmask, ktmp \
|
||||
KMOVQ kmask, ktmp \
|
||||
VPGATHERDD index*4(base)(ptrs*1), ktmp, mem
|
||||
|
||||
#define ROUND1(a, b, c, d, index, const, shift) \
|
||||
VXORPS c, tmp, tmp \
|
||||
VPADDD 64*const(consts), a, a \
|
||||
VPADDD mem, a, a \
|
||||
VPTERNLOGD $0x6C, b, d, tmp \
|
||||
prep(index) \
|
||||
VPADDD tmp, a, a \
|
||||
VPROLD $shift, a, a \
|
||||
VMOVAPD c, tmp \
|
||||
VPADDD b, a, a
|
||||
VPXORQ c, tmp, tmp \
|
||||
VPADDD 64*const(consts), a, a \
|
||||
VPADDD mem, a, a \
|
||||
VPTERNLOGD $0x6C, b, d, tmp \
|
||||
prep(index) \
|
||||
VPADDD tmp, a, a \
|
||||
VPROLD $shift, a, a \
|
||||
VMOVAPD c, tmp \
|
||||
VPADDD b, a, a
|
||||
|
||||
#define ROUND1noload(a, b, c, d, const, shift) \
|
||||
VXORPS c, tmp, tmp \
|
||||
VPADDD 64*const(consts), a, a \
|
||||
VPADDD mem, a, a \
|
||||
VPTERNLOGD $0x6C, b, d, tmp \
|
||||
VPADDD tmp, a, a \
|
||||
VPROLD $shift, a, a \
|
||||
VMOVAPD c, tmp \
|
||||
VPADDD b, a, a
|
||||
VPXORQ c, tmp, tmp \
|
||||
VPADDD 64*const(consts), a, a \
|
||||
VPADDD mem, a, a \
|
||||
VPTERNLOGD $0x6C, b, d, tmp \
|
||||
VPADDD tmp, a, a \
|
||||
VPROLD $shift, a, a \
|
||||
VMOVAPD c, tmp \
|
||||
VPADDD b, a, a
|
||||
|
||||
#define ROUND2(a, b, c, d, zreg, const, shift) \
|
||||
VPADDD 64*const(consts), a, a \
|
||||
VPADDD zreg, a, a \
|
||||
VANDNPS c, tmp, tmp \
|
||||
VPTERNLOGD $0xEC, b, tmp, tmp2 \
|
||||
VMOVAPD c, tmp \
|
||||
VPADDD tmp2, a, a \
|
||||
VMOVAPD c, tmp2 \
|
||||
VPROLD $shift, a, a \
|
||||
VPADDD b, a, a
|
||||
VPADDD 64*const(consts), a, a \
|
||||
VPADDD zreg, a, a \
|
||||
VANDNPD c, tmp, tmp \
|
||||
VPTERNLOGD $0xEC, b, tmp, tmp2 \
|
||||
VMOVAPD c, tmp \
|
||||
VPADDD tmp2, a, a \
|
||||
VMOVAPD c, tmp2 \
|
||||
VPROLD $shift, a, a \
|
||||
VPADDD b, a, a
|
||||
|
||||
#define ROUND3(a, b, c, d, zreg, const, shift) \
|
||||
VPADDD 64*const(consts), a, a \
|
||||
VPADDD zreg, a, a \
|
||||
VPTERNLOGD $0x96, b, d, tmp \
|
||||
VPADDD tmp, a, a \
|
||||
VPROLD $shift, a, a \
|
||||
VMOVAPD b, tmp \
|
||||
VPADDD b, a, a
|
||||
VPADDD 64*const(consts), a, a \
|
||||
VPADDD zreg, a, a \
|
||||
VPTERNLOGD $0x96, b, d, tmp \
|
||||
VPADDD tmp, a, a \
|
||||
VPROLD $shift, a, a \
|
||||
VMOVAPD b, tmp \
|
||||
VPADDD b, a, a
|
||||
|
||||
#define ROUND4(a, b, c, d, zreg, const, shift) \
|
||||
VPADDD 64*const(consts), a, a \
|
||||
VPADDD zreg, a, a \
|
||||
VPTERNLOGD $0x36, b, c, tmp \
|
||||
VPADDD tmp, a, a \
|
||||
VPROLD $shift, a, a \
|
||||
VXORPS c, ones, tmp \
|
||||
VPADDD b, a, a
|
||||
VPADDD 64*const(consts), a, a \
|
||||
VPADDD zreg, a, a \
|
||||
VPTERNLOGD $0x36, b, c, tmp \
|
||||
VPADDD tmp, a, a \
|
||||
VPROLD $shift, a, a \
|
||||
VPXORQ c, ones, tmp \
|
||||
VPADDD b, a, a
|
||||
|
||||
TEXT ·block16(SB),4,$0-40
|
||||
TEXT ·block16(SB), 4, $0-40
|
||||
|
||||
MOVQ state+0(FP), BX
|
||||
MOVQ base+8(FP), SI
|
||||
MOVQ ptrs+16(FP), AX
|
||||
KMOVQ mask+24(FP), K1
|
||||
MOVQ n+32(FP), DX
|
||||
MOVQ ·avx512md5consts+0(SB), DI
|
||||
MOVQ state+0(FP), BX
|
||||
MOVQ base+8(FP), SI
|
||||
MOVQ ptrs+16(FP), AX
|
||||
KMOVQ mask+24(FP), K1
|
||||
MOVQ n+32(FP), DX
|
||||
MOVQ ·avx512md5consts+0(SB), DI
|
||||
|
||||
#define a Z0
|
||||
#define b Z1
|
||||
|
@ -90,7 +92,6 @@ TEXT ·block16(SB),4,$0-40
|
|||
// Registers Z16 through to Z31 are used for caching purposes
|
||||
// ----------------------------------------------------------
|
||||
|
||||
|
||||
#define dig BX
|
||||
#define count DX
|
||||
#define base SI
|
||||
|
@ -105,7 +106,7 @@ TEXT ·block16(SB),4,$0-40
|
|||
// load source pointers
|
||||
VMOVUPD 0x00(AX), ptrs
|
||||
|
||||
MOVQ $-1, AX
|
||||
MOVQ $-1, AX
|
||||
VPBROADCASTQ AX, ones
|
||||
|
||||
loop:
|
||||
|
@ -190,7 +191,7 @@ loop:
|
|||
ROUND3(c,d,a,b, Z31,0x2e,16)
|
||||
ROUND3(b,c,d,a, Z18,0x2f,23)
|
||||
|
||||
VXORPS d, ones, tmp
|
||||
VPXORQ d, ones, tmp
|
||||
|
||||
ROUND4(a,b,c,d, Z16,0x30, 6)
|
||||
ROUND4(d,a,b,c, Z23,0x31,10)
|
||||
|
|
36
vendor/github.com/minio/md5-simd/block8_amd64.s
generated
vendored
36
vendor/github.com/minio/md5-simd/block8_amd64.s
generated
vendored
|
@ -1,3 +1,5 @@
|
|||
//+build !noasm,!appengine,gc
|
||||
|
||||
// Copyright (c) 2018 Igneous Systems
|
||||
// MIT License
|
||||
//
|
||||
|
@ -70,7 +72,7 @@ TEXT ·block8(SB), 4, $0-40
|
|||
#define consts DI
|
||||
|
||||
#define prepmask \
|
||||
VXORPS mask, mask, mask \
|
||||
VPXOR mask, mask, mask \
|
||||
VPCMPGTD mask, off, mask
|
||||
|
||||
#define prep(index) \
|
||||
|
@ -86,14 +88,14 @@ TEXT ·block8(SB), 4, $0-40
|
|||
#define roll(shift, a) \
|
||||
VPSLLD $shift, a, rtmp1 \
|
||||
VPSRLD $32-shift, a, a \
|
||||
VORPS rtmp1, a, a
|
||||
VPOR rtmp1, a, a
|
||||
|
||||
#define ROUND1(a, b, c, d, index, const, shift) \
|
||||
VXORPS c, tmp, tmp \
|
||||
VPXOR c, tmp, tmp \
|
||||
VPADDD 32*const(consts), a, a \
|
||||
VPADDD mem, a, a \
|
||||
VANDPS b, tmp, tmp \
|
||||
VXORPS d, tmp, tmp \
|
||||
VPAND b, tmp, tmp \
|
||||
VPXOR d, tmp, tmp \
|
||||
prep(index) \
|
||||
VPADDD tmp, a, a \
|
||||
roll(shift,a) \
|
||||
|
@ -101,11 +103,11 @@ TEXT ·block8(SB), 4, $0-40
|
|||
VPADDD b, a, a
|
||||
|
||||
#define ROUND1load(a, b, c, d, index, const, shift) \
|
||||
VXORPS c, tmp, tmp \
|
||||
VXORPD c, tmp, tmp \
|
||||
VPADDD 32*const(consts), a, a \
|
||||
VPADDD mem, a, a \
|
||||
VANDPS b, tmp, tmp \
|
||||
VXORPS d, tmp, tmp \
|
||||
VPAND b, tmp, tmp \
|
||||
VPXOR d, tmp, tmp \
|
||||
load(index) \
|
||||
VPADDD tmp, a, a \
|
||||
roll(shift,a) \
|
||||
|
@ -115,10 +117,10 @@ TEXT ·block8(SB), 4, $0-40
|
|||
#define ROUND2(a, b, c, d, index, const, shift) \
|
||||
VPADDD 32*const(consts), a, a \
|
||||
VPADDD mem, a, a \
|
||||
VANDPS b, tmp2, tmp2 \
|
||||
VANDNPS c, tmp, tmp \
|
||||
VPAND b, tmp2, tmp2 \
|
||||
VANDNPD c, tmp, tmp \
|
||||
load(index) \
|
||||
VORPS tmp, tmp2, tmp2 \
|
||||
VPOR tmp, tmp2, tmp2 \
|
||||
VMOVAPD c, tmp \
|
||||
VPADDD tmp2, a, a \
|
||||
VMOVAPD c, tmp2 \
|
||||
|
@ -129,8 +131,8 @@ TEXT ·block8(SB), 4, $0-40
|
|||
VPADDD 32*const(consts), a, a \
|
||||
VPADDD mem, a, a \
|
||||
load(index) \
|
||||
VXORPS d, tmp, tmp \
|
||||
VXORPS b, tmp, tmp \
|
||||
VPXOR d, tmp, tmp \
|
||||
VPXOR b, tmp, tmp \
|
||||
VPADDD tmp, a, a \
|
||||
roll(shift,a) \
|
||||
VMOVAPD b, tmp \
|
||||
|
@ -139,12 +141,12 @@ TEXT ·block8(SB), 4, $0-40
|
|||
#define ROUND4(a, b, c, d, index, const, shift) \
|
||||
VPADDD 32*const(consts), a, a \
|
||||
VPADDD mem, a, a \
|
||||
VORPS b, tmp, tmp \
|
||||
VXORPS c, tmp, tmp \
|
||||
VPOR b, tmp, tmp \
|
||||
VPXOR c, tmp, tmp \
|
||||
VPADDD tmp, a, a \
|
||||
load(index) \
|
||||
roll(shift,a) \
|
||||
VXORPS c, ones, tmp \
|
||||
VPXOR c, ones, tmp \
|
||||
VPADDD b, a, a
|
||||
|
||||
// load digest into state registers
|
||||
|
@ -242,7 +244,7 @@ loop:
|
|||
ROUND3(b,c,d,a, 0,0x2f,23)
|
||||
|
||||
load(0)
|
||||
VXORPS d, ones, tmp
|
||||
VPXOR d, ones, tmp
|
||||
|
||||
ROUND4(a,b,c,d, 7,0x30, 6)
|
||||
ROUND4(d,a,b,c,14,0x31,10)
|
||||
|
|
79
vendor/github.com/minio/md5-simd/block_amd64.go
generated
vendored
79
vendor/github.com/minio/md5-simd/block_amd64.go
generated
vendored
|
@ -9,14 +9,18 @@ package md5simd
|
|||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"sync"
|
||||
"unsafe"
|
||||
|
||||
"github.com/klauspost/cpuid"
|
||||
"github.com/klauspost/cpuid/v2"
|
||||
)
|
||||
|
||||
var hasAVX512 bool
|
||||
|
||||
func init() {
|
||||
// VANDNPD requires AVX512DQ. Technically it could be VPTERNLOGQ which is AVX512F.
|
||||
hasAVX512 = cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512DQ)
|
||||
}
|
||||
|
||||
//go:noescape
|
||||
func block8(state *uint32, base uintptr, bufs *int32, cache *byte, n int)
|
||||
|
||||
|
@ -82,45 +86,52 @@ var avx512md5consts = func(c []uint32) []uint32 {
|
|||
return inf
|
||||
}(md5consts[:])
|
||||
|
||||
func init() {
|
||||
hasAVX512 = cpuid.CPU.AVX512F()
|
||||
}
|
||||
|
||||
// Interface function to assembly code
|
||||
func (s *md5Server) blockMd5_x16(d *digest16, input [16][]byte, half bool) {
|
||||
if hasAVX512 {
|
||||
blockMd5_avx512(d, input, s.allBufs, &s.maskRounds16)
|
||||
} else {
|
||||
d8a, d8b := digest8{}, digest8{}
|
||||
for i := range d8a.v0 {
|
||||
j := i + 8
|
||||
d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
|
||||
if !half {
|
||||
d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
i8 := [2][8][]byte{}
|
||||
for i := range i8[0] {
|
||||
i8[0][i], i8[1][i] = input[i], input[8+i]
|
||||
}
|
||||
if half {
|
||||
blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a)
|
||||
} else {
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(2)
|
||||
go func() { blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a); wg.Done() }()
|
||||
go func() { blockMd5_avx2(&d8b, i8[1], s.allBufs, &s.maskRounds8b); wg.Done() }()
|
||||
wg.Wait()
|
||||
}
|
||||
// Preparing data using copy is slower since copies aren't inlined.
|
||||
|
||||
for i := range d8a.v0 {
|
||||
j := i + 8
|
||||
d.v0[i], d.v1[i], d.v2[i], d.v3[i] = d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i]
|
||||
if !half {
|
||||
d.v0[j], d.v1[j], d.v2[j], d.v3[j] = d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i]
|
||||
}
|
||||
// Calculate on this goroutine
|
||||
if half {
|
||||
for i := range s.i8[0][:] {
|
||||
s.i8[0][i] = input[i]
|
||||
}
|
||||
for i := range s.d8a.v0[:] {
|
||||
s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
|
||||
}
|
||||
blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a)
|
||||
for i := range s.d8a.v0[:] {
|
||||
d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
for i := range s.i8[0][:] {
|
||||
s.i8[0][i], s.i8[1][i] = input[i], input[8+i]
|
||||
}
|
||||
|
||||
for i := range s.d8a.v0[:] {
|
||||
j := (i + 8) & 15
|
||||
s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
|
||||
s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
|
||||
}
|
||||
|
||||
// Benchmarks appear to be slightly faster when spinning up 2 goroutines instead
|
||||
// of using the current for one of the blocks.
|
||||
s.wg.Add(2)
|
||||
go func() { blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a); s.wg.Done() }()
|
||||
go func() { blockMd5_avx2(&s.d8b, s.i8[1], s.allBufs, &s.maskRounds8b); s.wg.Done() }()
|
||||
s.wg.Wait()
|
||||
for i := range s.d8a.v0[:] {
|
||||
d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
|
||||
}
|
||||
for i := range s.d8b.v0[:] {
|
||||
j := (i + 8) & 15
|
||||
d.v0[j], d.v1[j], d.v2[j], d.v3[j] = s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i]
|
||||
}
|
||||
}
|
||||
|
||||
|
|
4
vendor/github.com/minio/md5-simd/go.mod
generated
vendored
4
vendor/github.com/minio/md5-simd/go.mod
generated
vendored
|
@ -2,6 +2,4 @@ module github.com/minio/md5-simd
|
|||
|
||||
go 1.14
|
||||
|
||||
require (
|
||||
github.com/klauspost/cpuid v1.2.3
|
||||
)
|
||||
require github.com/klauspost/cpuid/v2 v2.0.1
|
||||
|
|
4
vendor/github.com/minio/md5-simd/go.sum
generated
vendored
4
vendor/github.com/minio/md5-simd/go.sum
generated
vendored
|
@ -1,2 +1,2 @@
|
|||
github.com/klauspost/cpuid v1.2.3 h1:CCtW0xUnWGVINKvE/WWOYKdsPV6mawAtvQuSl8guwQs=
|
||||
github.com/klauspost/cpuid v1.2.3/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
|
||||
github.com/klauspost/cpuid/v2 v2.0.1 h1:lb04bBEJoAoV48eHs4Eq0UyhmJCkRSdIjQ3uS8WJRM4=
|
||||
github.com/klauspost/cpuid/v2 v2.0.1/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
|
||||
|
|
12
vendor/github.com/minio/md5-simd/md5-digest_amd64.go
generated
vendored
12
vendor/github.com/minio/md5-simd/md5-digest_amd64.go
generated
vendored
|
@ -10,6 +10,7 @@ import (
|
|||
"encoding/binary"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
)
|
||||
|
||||
|
@ -121,6 +122,14 @@ func (d *md5Digest) Close() {
|
|||
}
|
||||
}
|
||||
|
||||
var sumChPool sync.Pool
|
||||
|
||||
func init() {
|
||||
sumChPool.New = func() interface{} {
|
||||
return make(chan sumResult, 1)
|
||||
}
|
||||
}
|
||||
|
||||
// Sum - Return MD5 sum in bytes
|
||||
func (d *md5Digest) Sum(in []byte) (result []byte) {
|
||||
if d.blocksCh == nil {
|
||||
|
@ -148,10 +157,11 @@ func (d *md5Digest) Sum(in []byte) (result []byte) {
|
|||
if len(trail)%BlockSize != 0 {
|
||||
panic(fmt.Errorf("internal error: sum block was not aligned. len=%d, nx=%d", len(trail), d.nx))
|
||||
}
|
||||
sumCh := make(chan sumResult, 1)
|
||||
sumCh := sumChPool.Get().(chan sumResult)
|
||||
d.sendBlock(blockInput{uid: d.uid, msg: trail, sumCh: sumCh}, true)
|
||||
|
||||
sum := <-sumCh
|
||||
sumChPool.Put(sumCh)
|
||||
|
||||
return append(in, sum.digest[:]...)
|
||||
}
|
||||
|
|
96
vendor/github.com/minio/md5-simd/md5-server_amd64.go
generated
vendored
96
vendor/github.com/minio/md5-simd/md5-server_amd64.go
generated
vendored
|
@ -10,8 +10,9 @@ import (
|
|||
"encoding/binary"
|
||||
"fmt"
|
||||
"runtime"
|
||||
"sync"
|
||||
|
||||
"github.com/klauspost/cpuid"
|
||||
"github.com/klauspost/cpuid/v2"
|
||||
)
|
||||
|
||||
// MD5 initialization constants
|
||||
|
@ -23,6 +24,9 @@ const (
|
|||
init1 = 0xefcdab89
|
||||
init2 = 0x98badcfe
|
||||
init3 = 0x10325476
|
||||
|
||||
// Use scalar routine when below this many lanes
|
||||
useScalarBelow = 3
|
||||
)
|
||||
|
||||
// md5ServerUID - Does not start at 0 but next multiple of 16 so as to be able to
|
||||
|
@ -56,11 +60,15 @@ type md5Server struct {
|
|||
maskRounds8b [8]maskRounds // Pre-allocated static array for max 8 rounds (2nd AVX2 core)
|
||||
allBufs []byte // Preallocated buffer.
|
||||
buffers chan []byte // Preallocated buffers, sliced from allBufs.
|
||||
|
||||
i8 [2][8][]byte // avx2 temporary vars
|
||||
d8a, d8b digest8
|
||||
wg sync.WaitGroup
|
||||
}
|
||||
|
||||
// NewServer - Create new object for parallel processing handling
|
||||
func NewServer() Server {
|
||||
if !cpuid.CPU.AVX2() {
|
||||
if !cpuid.CPU.Supports(cpuid.AVX2) {
|
||||
return &fallbackServer{}
|
||||
}
|
||||
md5srv := &md5Server{}
|
||||
|
@ -152,7 +160,7 @@ func (s *md5Server) process(newClients chan newClient) {
|
|||
|
||||
sum := sumResult{}
|
||||
// Add end block to current digest.
|
||||
blockGeneric(&dig, block.msg)
|
||||
blockScalar(&dig.s, block.msg)
|
||||
|
||||
binary.LittleEndian.PutUint32(sum.digest[0:], dig.s[0])
|
||||
binary.LittleEndian.PutUint32(sum.digest[4:], dig.s[1])
|
||||
|
@ -262,6 +270,88 @@ func (s *md5Server) Close() {
|
|||
|
||||
// Invoke assembly and send results back
|
||||
func (s *md5Server) blocks(lanes []blockInput) {
|
||||
if len(lanes) < useScalarBelow {
|
||||
// Use scalar routine when below this many lanes
|
||||
switch len(lanes) {
|
||||
case 0:
|
||||
case 1:
|
||||
lane := lanes[0]
|
||||
var d digest
|
||||
a, ok := s.digests[lane.uid]
|
||||
if ok {
|
||||
d.s[0] = binary.LittleEndian.Uint32(a[0:4])
|
||||
d.s[1] = binary.LittleEndian.Uint32(a[4:8])
|
||||
d.s[2] = binary.LittleEndian.Uint32(a[8:12])
|
||||
d.s[3] = binary.LittleEndian.Uint32(a[12:16])
|
||||
} else {
|
||||
d.s[0] = init0
|
||||
d.s[1] = init1
|
||||
d.s[2] = init2
|
||||
d.s[3] = init3
|
||||
}
|
||||
if len(lane.msg) > 0 {
|
||||
// Update...
|
||||
blockScalar(&d.s, lane.msg)
|
||||
}
|
||||
dig := [Size]byte{}
|
||||
binary.LittleEndian.PutUint32(dig[0:], d.s[0])
|
||||
binary.LittleEndian.PutUint32(dig[4:], d.s[1])
|
||||
binary.LittleEndian.PutUint32(dig[8:], d.s[2])
|
||||
binary.LittleEndian.PutUint32(dig[12:], d.s[3])
|
||||
s.digests[lane.uid] = dig
|
||||
|
||||
if lane.msg != nil {
|
||||
s.buffers <- lane.msg
|
||||
}
|
||||
lanes[0] = blockInput{}
|
||||
|
||||
default:
|
||||
s.wg.Add(len(lanes))
|
||||
var results [useScalarBelow]digest
|
||||
for i := range lanes {
|
||||
lane := lanes[i]
|
||||
go func(i int) {
|
||||
var d digest
|
||||
defer s.wg.Done()
|
||||
a, ok := s.digests[lane.uid]
|
||||
if ok {
|
||||
d.s[0] = binary.LittleEndian.Uint32(a[0:4])
|
||||
d.s[1] = binary.LittleEndian.Uint32(a[4:8])
|
||||
d.s[2] = binary.LittleEndian.Uint32(a[8:12])
|
||||
d.s[3] = binary.LittleEndian.Uint32(a[12:16])
|
||||
} else {
|
||||
d.s[0] = init0
|
||||
d.s[1] = init1
|
||||
d.s[2] = init2
|
||||
d.s[3] = init3
|
||||
}
|
||||
if len(lane.msg) == 0 {
|
||||
results[i] = d
|
||||
return
|
||||
}
|
||||
// Update...
|
||||
blockScalar(&d.s, lane.msg)
|
||||
results[i] = d
|
||||
}(i)
|
||||
}
|
||||
s.wg.Wait()
|
||||
for i, lane := range lanes {
|
||||
dig := [Size]byte{}
|
||||
binary.LittleEndian.PutUint32(dig[0:], results[i].s[0])
|
||||
binary.LittleEndian.PutUint32(dig[4:], results[i].s[1])
|
||||
binary.LittleEndian.PutUint32(dig[8:], results[i].s[2])
|
||||
binary.LittleEndian.PutUint32(dig[12:], results[i].s[3])
|
||||
s.digests[lane.uid] = dig
|
||||
|
||||
if lane.msg != nil {
|
||||
s.buffers <- lane.msg
|
||||
}
|
||||
lanes[i] = blockInput{}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
inputs := [16][]byte{}
|
||||
for i := range lanes {
|
||||
inputs[i] = lanes[i].msg
|
||||
|
|
37
vendor/github.com/minio/md5-simd/md5-util_amd64.go
generated
vendored
37
vendor/github.com/minio/md5-simd/md5-util_amd64.go
generated
vendored
|
@ -1,19 +1,21 @@
|
|||
//+build !noasm,!appengine,gc
|
||||
|
||||
// Copyright (c) 2020 MinIO Inc. All rights reserved.
|
||||
// Use of this source code is governed by a license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
package md5simd
|
||||
|
||||
import (
|
||||
"sort"
|
||||
)
|
||||
|
||||
// Helper struct for sorting blocks based on length
|
||||
type lane struct {
|
||||
len uint
|
||||
pos uint
|
||||
}
|
||||
|
||||
type digest struct {
|
||||
s [4]uint32
|
||||
}
|
||||
|
||||
// Helper struct for generating number of rounds in combination with mask for valid lanes
|
||||
type maskRounds struct {
|
||||
mask uint64
|
||||
|
@ -23,15 +25,22 @@ type maskRounds struct {
|
|||
func generateMaskAndRounds8(input [8][]byte, mr *[8]maskRounds) (rounds int) {
|
||||
// Sort on block length, small to large
|
||||
var sorted [8]lane
|
||||
for c, inpt := range input {
|
||||
for c, inpt := range input[:] {
|
||||
sorted[c] = lane{uint(len(inpt)), uint(c)}
|
||||
for i := c - 1; i >= 0; i-- {
|
||||
// swap so largest is at the end...
|
||||
if sorted[i].len > sorted[i+1].len {
|
||||
sorted[i], sorted[i+1] = sorted[i+1], sorted[i]
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
sort.Slice(sorted[:], func(i, j int) bool { return sorted[i].len < sorted[j].len })
|
||||
|
||||
// Create mask array including 'rounds' (of processing blocks of 64 bytes) between masks
|
||||
m, round := uint64(0xff), uint64(0)
|
||||
|
||||
for _, s := range sorted {
|
||||
for _, s := range sorted[:] {
|
||||
if s.len > 0 {
|
||||
if uint64(s.len)>>6 > round {
|
||||
mr[rounds] = maskRounds{m, (uint64(s.len) >> 6) - round}
|
||||
|
@ -45,18 +54,24 @@ func generateMaskAndRounds8(input [8][]byte, mr *[8]maskRounds) (rounds int) {
|
|||
}
|
||||
|
||||
func generateMaskAndRounds16(input [16][]byte, mr *[16]maskRounds) (rounds int) {
|
||||
|
||||
// Sort on block length, small to large
|
||||
var sorted [16]lane
|
||||
for c, inpt := range input {
|
||||
for c, inpt := range input[:] {
|
||||
sorted[c] = lane{uint(len(inpt)), uint(c)}
|
||||
for i := c - 1; i >= 0; i-- {
|
||||
// swap so largest is at the end...
|
||||
if sorted[i].len > sorted[i+1].len {
|
||||
sorted[i], sorted[i+1] = sorted[i+1], sorted[i]
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
sort.Slice(sorted[:], func(i, j int) bool { return sorted[i].len < sorted[j].len })
|
||||
|
||||
// Create mask array including 'rounds' (of processing blocks of 64 bytes) between masks
|
||||
m, round := uint64(0xffff), uint64(0)
|
||||
|
||||
for _, s := range sorted {
|
||||
for _, s := range sorted[:] {
|
||||
if s.len > 0 {
|
||||
if uint64(s.len)>>6 > round {
|
||||
mr[rounds] = maskRounds{m, (uint64(s.len) >> 6) - round}
|
||||
|
|
6
vendor/github.com/minio/md5-simd/md5.go
generated
vendored
6
vendor/github.com/minio/md5-simd/md5.go
generated
vendored
|
@ -27,6 +27,12 @@ type Hasher interface {
|
|||
Close()
|
||||
}
|
||||
|
||||
// StdlibHasher returns a Hasher that uses the stdlib for hashing.
|
||||
// Used hashers are stored in a pool for fast reuse.
|
||||
func StdlibHasher() Hasher {
|
||||
return &md5Wrapper{Hash: md5Pool.New().(hash.Hash)}
|
||||
}
|
||||
|
||||
// md5Wrapper is a wrapper around the builtin hasher.
|
||||
type md5Wrapper struct {
|
||||
hash.Hash
|
||||
|
|
11
vendor/github.com/minio/md5-simd/md5block_amd64.go
generated
vendored
Normal file
11
vendor/github.com/minio/md5-simd/md5block_amd64.go
generated
vendored
Normal file
|
@ -0,0 +1,11 @@
|
|||
// Code generated by command: go run gen.go -out ../md5block_amd64.s -stubs ../md5block_amd64.go -pkg=md5simd. DO NOT EDIT.
|
||||
|
||||
// +build !appengine
|
||||
// +build !noasm
|
||||
// +build gc
|
||||
|
||||
package md5simd
|
||||
|
||||
// Encode p to digest
|
||||
//go:noescape
|
||||
func blockScalar(dig *[4]uint32, p []byte)
|
714
vendor/github.com/minio/md5-simd/md5block_amd64.s
generated
vendored
Normal file
714
vendor/github.com/minio/md5-simd/md5block_amd64.s
generated
vendored
Normal file
|
@ -0,0 +1,714 @@
|
|||
// Code generated by command: go run gen.go -out ../md5block_amd64.s -stubs ../md5block_amd64.go -pkg=md5simd. DO NOT EDIT.
|
||||
|
||||
// +build !appengine
|
||||
// +build !noasm
|
||||
// +build gc
|
||||
|
||||
// func blockScalar(dig *[4]uint32, p []byte)
|
||||
TEXT ·blockScalar(SB), $0-32
|
||||
MOVQ p_len+16(FP), AX
|
||||
MOVQ dig+0(FP), CX
|
||||
MOVQ p_base+8(FP), DX
|
||||
SHRQ $0x06, AX
|
||||
SHLQ $0x06, AX
|
||||
LEAQ (DX)(AX*1), AX
|
||||
CMPQ DX, AX
|
||||
JEQ end
|
||||
MOVL (CX), BX
|
||||
MOVL 4(CX), BP
|
||||
MOVL 8(CX), SI
|
||||
MOVL 12(CX), CX
|
||||
MOVL $0xffffffff, DI
|
||||
|
||||
loop:
|
||||
MOVL (DX), R8
|
||||
MOVL CX, R9
|
||||
MOVL BX, R10
|
||||
MOVL BP, R11
|
||||
MOVL SI, R12
|
||||
MOVL CX, R13
|
||||
|
||||
// ROUND1
|
||||
XORL SI, R9
|
||||
ADDL $0xd76aa478, BX
|
||||
ADDL R8, BX
|
||||
ANDL BP, R9
|
||||
XORL CX, R9
|
||||
MOVL 4(DX), R8
|
||||
ADDL R9, BX
|
||||
ROLL $0x07, BX
|
||||
MOVL SI, R9
|
||||
ADDL BP, BX
|
||||
XORL BP, R9
|
||||
ADDL $0xe8c7b756, CX
|
||||
ADDL R8, CX
|
||||
ANDL BX, R9
|
||||
XORL SI, R9
|
||||
MOVL 8(DX), R8
|
||||
ADDL R9, CX
|
||||
ROLL $0x0c, CX
|
||||
MOVL BP, R9
|
||||
ADDL BX, CX
|
||||
XORL BX, R9
|
||||
ADDL $0x242070db, SI
|
||||
ADDL R8, SI
|
||||
ANDL CX, R9
|
||||
XORL BP, R9
|
||||
MOVL 12(DX), R8
|
||||
ADDL R9, SI
|
||||
ROLL $0x11, SI
|
||||
MOVL BX, R9
|
||||
ADDL CX, SI
|
||||
XORL CX, R9
|
||||
ADDL $0xc1bdceee, BP
|
||||
ADDL R8, BP
|
||||
ANDL SI, R9
|
||||
XORL BX, R9
|
||||
MOVL 16(DX), R8
|
||||
ADDL R9, BP
|
||||
ROLL $0x16, BP
|
||||
MOVL CX, R9
|
||||
ADDL SI, BP
|
||||
XORL SI, R9
|
||||
ADDL $0xf57c0faf, BX
|
||||
ADDL R8, BX
|
||||
ANDL BP, R9
|
||||
XORL CX, R9
|
||||
MOVL 20(DX), R8
|
||||
ADDL R9, BX
|
||||
ROLL $0x07, BX
|
||||
MOVL SI, R9
|
||||
ADDL BP, BX
|
||||
XORL BP, R9
|
||||
ADDL $0x4787c62a, CX
|
||||
ADDL R8, CX
|
||||
ANDL BX, R9
|
||||
XORL SI, R9
|
||||
MOVL 24(DX), R8
|
||||
ADDL R9, CX
|
||||
ROLL $0x0c, CX
|
||||
MOVL BP, R9
|
||||
ADDL BX, CX
|
||||
XORL BX, R9
|
||||
ADDL $0xa8304613, SI
|
||||
ADDL R8, SI
|
||||
ANDL CX, R9
|
||||
XORL BP, R9
|
||||
MOVL 28(DX), R8
|
||||
ADDL R9, SI
|
||||
ROLL $0x11, SI
|
||||
MOVL BX, R9
|
||||
ADDL CX, SI
|
||||
XORL CX, R9
|
||||
ADDL $0xfd469501, BP
|
||||
ADDL R8, BP
|
||||
ANDL SI, R9
|
||||
XORL BX, R9
|
||||
MOVL 32(DX), R8
|
||||
ADDL R9, BP
|
||||
ROLL $0x16, BP
|
||||
MOVL CX, R9
|
||||
ADDL SI, BP
|
||||
XORL SI, R9
|
||||
ADDL $0x698098d8, BX
|
||||
ADDL R8, BX
|
||||
ANDL BP, R9
|
||||
XORL CX, R9
|
||||
MOVL 36(DX), R8
|
||||
ADDL R9, BX
|
||||
ROLL $0x07, BX
|
||||
MOVL SI, R9
|
||||
ADDL BP, BX
|
||||
XORL BP, R9
|
||||
ADDL $0x8b44f7af, CX
|
||||
ADDL R8, CX
|
||||
ANDL BX, R9
|
||||
XORL SI, R9
|
||||
MOVL 40(DX), R8
|
||||
ADDL R9, CX
|
||||
ROLL $0x0c, CX
|
||||
MOVL BP, R9
|
||||
ADDL BX, CX
|
||||
XORL BX, R9
|
||||
ADDL $0xffff5bb1, SI
|
||||
ADDL R8, SI
|
||||
ANDL CX, R9
|
||||
XORL BP, R9
|
||||
MOVL 44(DX), R8
|
||||
ADDL R9, SI
|
||||
ROLL $0x11, SI
|
||||
MOVL BX, R9
|
||||
ADDL CX, SI
|
||||
XORL CX, R9
|
||||
ADDL $0x895cd7be, BP
|
||||
ADDL R8, BP
|
||||
ANDL SI, R9
|
||||
XORL BX, R9
|
||||
MOVL 48(DX), R8
|
||||
ADDL R9, BP
|
||||
ROLL $0x16, BP
|
||||
MOVL CX, R9
|
||||
ADDL SI, BP
|
||||
XORL SI, R9
|
||||
ADDL $0x6b901122, BX
|
||||
ADDL R8, BX
|
||||
ANDL BP, R9
|
||||
XORL CX, R9
|
||||
MOVL 52(DX), R8
|
||||
ADDL R9, BX
|
||||
ROLL $0x07, BX
|
||||
MOVL SI, R9
|
||||
ADDL BP, BX
|
||||
XORL BP, R9
|
||||
ADDL $0xfd987193, CX
|
||||
ADDL R8, CX
|
||||
ANDL BX, R9
|
||||
XORL SI, R9
|
||||
MOVL 56(DX), R8
|
||||
ADDL R9, CX
|
||||
ROLL $0x0c, CX
|
||||
MOVL BP, R9
|
||||
ADDL BX, CX
|
||||
XORL BX, R9
|
||||
ADDL $0xa679438e, SI
|
||||
ADDL R8, SI
|
||||
ANDL CX, R9
|
||||
XORL BP, R9
|
||||
MOVL 60(DX), R8
|
||||
ADDL R9, SI
|
||||
ROLL $0x11, SI
|
||||
MOVL BX, R9
|
||||
ADDL CX, SI
|
||||
XORL CX, R9
|
||||
ADDL $0x49b40821, BP
|
||||
ADDL R8, BP
|
||||
ANDL SI, R9
|
||||
XORL BX, R9
|
||||
MOVL 4(DX), R8
|
||||
ADDL R9, BP
|
||||
ROLL $0x16, BP
|
||||
MOVL CX, R9
|
||||
ADDL SI, BP
|
||||
|
||||
// ROUND2
|
||||
MOVL CX, R9
|
||||
MOVL CX, R14
|
||||
XORL DI, R9
|
||||
ADDL $0xf61e2562, BX
|
||||
ADDL R8, BX
|
||||
ANDL BP, R14
|
||||
ANDL SI, R9
|
||||
MOVL 24(DX), R8
|
||||
ORL R9, R14
|
||||
MOVL SI, R9
|
||||
ADDL R14, BX
|
||||
MOVL SI, R14
|
||||
ROLL $0x05, BX
|
||||
ADDL BP, BX
|
||||
XORL DI, R9
|
||||
ADDL $0xc040b340, CX
|
||||
ADDL R8, CX
|
||||
ANDL BX, R14
|
||||
ANDL BP, R9
|
||||
MOVL 44(DX), R8
|
||||
ORL R9, R14
|
||||
MOVL BP, R9
|
||||
ADDL R14, CX
|
||||
MOVL BP, R14
|
||||
ROLL $0x09, CX
|
||||
ADDL BX, CX
|
||||
XORL DI, R9
|
||||
ADDL $0x265e5a51, SI
|
||||
ADDL R8, SI
|
||||
ANDL CX, R14
|
||||
ANDL BX, R9
|
||||
MOVL (DX), R8
|
||||
ORL R9, R14
|
||||
MOVL BX, R9
|
||||
ADDL R14, SI
|
||||
MOVL BX, R14
|
||||
ROLL $0x0e, SI
|
||||
ADDL CX, SI
|
||||
XORL DI, R9
|
||||
ADDL $0xe9b6c7aa, BP
|
||||
ADDL R8, BP
|
||||
ANDL SI, R14
|
||||
ANDL CX, R9
|
||||
MOVL 20(DX), R8
|
||||
ORL R9, R14
|
||||
MOVL CX, R9
|
||||
ADDL R14, BP
|
||||
MOVL CX, R14
|
||||
ROLL $0x14, BP
|
||||
ADDL SI, BP
|
||||
XORL DI, R9
|
||||
ADDL $0xd62f105d, BX
|
||||
ADDL R8, BX
|
||||
ANDL BP, R14
|
||||
ANDL SI, R9
|
||||
MOVL 40(DX), R8
|
||||
ORL R9, R14
|
||||
MOVL SI, R9
|
||||
ADDL R14, BX
|
||||
MOVL SI, R14
|
||||
ROLL $0x05, BX
|
||||
ADDL BP, BX
|
||||
XORL DI, R9
|
||||
ADDL $0x02441453, CX
|
||||
ADDL R8, CX
|
||||
ANDL BX, R14
|
||||
ANDL BP, R9
|
||||
MOVL 60(DX), R8
|
||||
ORL R9, R14
|
||||
MOVL BP, R9
|
||||
ADDL R14, CX
|
||||
MOVL BP, R14
|
||||
ROLL $0x09, CX
|
||||
ADDL BX, CX
|
||||
XORL DI, R9
|
||||
ADDL $0xd8a1e681, SI
|
||||
ADDL R8, SI
|
||||
ANDL CX, R14
|
||||
ANDL BX, R9
|
||||
MOVL 16(DX), R8
|
||||
ORL R9, R14
|
||||
MOVL BX, R9
|
||||
ADDL R14, SI
|
||||
MOVL BX, R14
|
||||
ROLL $0x0e, SI
|
||||
ADDL CX, SI
|
||||
XORL DI, R9
|
||||
ADDL $0xe7d3fbc8, BP
|
||||
ADDL R8, BP
|
||||
ANDL SI, R14
|
||||
ANDL CX, R9
|
||||
MOVL 36(DX), R8
|
||||
ORL R9, R14
|
||||
MOVL CX, R9
|
||||
ADDL R14, BP
|
||||
MOVL CX, R14
|
||||
ROLL $0x14, BP
|
||||
ADDL SI, BP
|
||||
XORL DI, R9
|
||||
ADDL $0x21e1cde6, BX
|
||||
ADDL R8, BX
|
||||
ANDL BP, R14
|
||||
ANDL SI, R9
|
||||
MOVL 56(DX), R8
|
||||
ORL R9, R14
|
||||
MOVL SI, R9
|
||||
ADDL R14, BX
|
||||
MOVL SI, R14
|
||||
ROLL $0x05, BX
|
||||
ADDL BP, BX
|
||||
XORL DI, R9
|
||||
ADDL $0xc33707d6, CX
|
||||
ADDL R8, CX
|
||||
ANDL BX, R14
|
||||
ANDL BP, R9
|
||||
MOVL 12(DX), R8
|
||||
ORL R9, R14
|
||||
MOVL BP, R9
|
||||
ADDL R14, CX
|
||||
MOVL BP, R14
|
||||
ROLL $0x09, CX
|
||||
ADDL BX, CX
|
||||
XORL DI, R9
|
||||
ADDL $0xf4d50d87, SI
|
||||
ADDL R8, SI
|
||||
ANDL CX, R14
|
||||
ANDL BX, R9
|
||||
MOVL 32(DX), R8
|
||||
ORL R9, R14
|
||||
MOVL BX, R9
|
||||
ADDL R14, SI
|
||||
MOVL BX, R14
|
||||
ROLL $0x0e, SI
|
||||
ADDL CX, SI
|
||||
XORL DI, R9
|
||||
ADDL $0x455a14ed, BP
|
||||
ADDL R8, BP
|
||||
ANDL SI, R14
|
||||
ANDL CX, R9
|
||||
MOVL 52(DX), R8
|
||||
ORL R9, R14
|
||||
MOVL CX, R9
|
||||
ADDL R14, BP
|
||||
MOVL CX, R14
|
||||
ROLL $0x14, BP
|
||||
ADDL SI, BP
|
||||
XORL DI, R9
|
||||
ADDL $0xa9e3e905, BX
|
||||
ADDL R8, BX
|
||||
ANDL BP, R14
|
||||
ANDL SI, R9
|
||||
MOVL 8(DX), R8
|
||||
ORL R9, R14
|
||||
MOVL SI, R9
|
||||
ADDL R14, BX
|
||||
MOVL SI, R14
|
||||
ROLL $0x05, BX
|
||||
ADDL BP, BX
|
||||
XORL DI, R9
|
||||
ADDL $0xfcefa3f8, CX
|
||||
ADDL R8, CX
|
||||
ANDL BX, R14
|
||||
ANDL BP, R9
|
||||
MOVL 28(DX), R8
|
||||
ORL R9, R14
|
||||
MOVL BP, R9
|
||||
ADDL R14, CX
|
||||
MOVL BP, R14
|
||||
ROLL $0x09, CX
|
||||
ADDL BX, CX
|
||||
XORL DI, R9
|
||||
ADDL $0x676f02d9, SI
|
||||
ADDL R8, SI
|
||||
ANDL CX, R14
|
||||
ANDL BX, R9
|
||||
MOVL 48(DX), R8
|
||||
ORL R9, R14
|
||||
MOVL BX, R9
|
||||
ADDL R14, SI
|
||||
MOVL BX, R14
|
||||
ROLL $0x0e, SI
|
||||
ADDL CX, SI
|
||||
XORL DI, R9
|
||||
ADDL $0x8d2a4c8a, BP
|
||||
ADDL R8, BP
|
||||
ANDL SI, R14
|
||||
ANDL CX, R9
|
||||
MOVL 20(DX), R8
|
||||
ORL R9, R14
|
||||
MOVL CX, R9
|
||||
ADDL R14, BP
|
||||
MOVL CX, R14
|
||||
ROLL $0x14, BP
|
||||
ADDL SI, BP
|
||||
|
||||
// ROUND3
|
||||
MOVL SI, R9
|
||||
ADDL $0xfffa3942, BX
|
||||
ADDL R8, BX
|
||||
MOVL 32(DX), R8
|
||||
XORL CX, R9
|
||||
XORL BP, R9
|
||||
ADDL R9, BX
|
||||
ROLL $0x04, BX
|
||||
MOVL BP, R9
|
||||
ADDL BP, BX
|
||||
ADDL $0x8771f681, CX
|
||||
ADDL R8, CX
|
||||
MOVL 44(DX), R8
|
||||
XORL SI, R9
|
||||
XORL BX, R9
|
||||
ADDL R9, CX
|
||||
ROLL $0x0b, CX
|
||||
MOVL BX, R9
|
||||
ADDL BX, CX
|
||||
ADDL $0x6d9d6122, SI
|
||||
ADDL R8, SI
|
||||
MOVL 56(DX), R8
|
||||
XORL BP, R9
|
||||
XORL CX, R9
|
||||
ADDL R9, SI
|
||||
ROLL $0x10, SI
|
||||
MOVL CX, R9
|
||||
ADDL CX, SI
|
||||
ADDL $0xfde5380c, BP
|
||||
ADDL R8, BP
|
||||
MOVL 4(DX), R8
|
||||
XORL BX, R9
|
||||
XORL SI, R9
|
||||
ADDL R9, BP
|
||||
ROLL $0x17, BP
|
||||
MOVL SI, R9
|
||||
ADDL SI, BP
|
||||
ADDL $0xa4beea44, BX
|
||||
ADDL R8, BX
|
||||
MOVL 16(DX), R8
|
||||
XORL CX, R9
|
||||
XORL BP, R9
|
||||
ADDL R9, BX
|
||||
ROLL $0x04, BX
|
||||
MOVL BP, R9
|
||||
ADDL BP, BX
|
||||
ADDL $0x4bdecfa9, CX
|
||||
ADDL R8, CX
|
||||
MOVL 28(DX), R8
|
||||
XORL SI, R9
|
||||
XORL BX, R9
|
||||
ADDL R9, CX
|
||||
ROLL $0x0b, CX
|
||||
MOVL BX, R9
|
||||
ADDL BX, CX
|
||||
ADDL $0xf6bb4b60, SI
|
||||
ADDL R8, SI
|
||||
MOVL 40(DX), R8
|
||||
XORL BP, R9
|
||||
XORL CX, R9
|
||||
ADDL R9, SI
|
||||
ROLL $0x10, SI
|
||||
MOVL CX, R9
|
||||
ADDL CX, SI
|
||||
ADDL $0xbebfbc70, BP
|
||||
ADDL R8, BP
|
||||
MOVL 52(DX), R8
|
||||
XORL BX, R9
|
||||
XORL SI, R9
|
||||
ADDL R9, BP
|
||||
ROLL $0x17, BP
|
||||
MOVL SI, R9
|
||||
ADDL SI, BP
|
||||
ADDL $0x289b7ec6, BX
|
||||
ADDL R8, BX
|
||||
MOVL (DX), R8
|
||||
XORL CX, R9
|
||||
XORL BP, R9
|
||||
ADDL R9, BX
|
||||
ROLL $0x04, BX
|
||||
MOVL BP, R9
|
||||
ADDL BP, BX
|
||||
ADDL $0xeaa127fa, CX
|
||||
ADDL R8, CX
|
||||
MOVL 12(DX), R8
|
||||
XORL SI, R9
|
||||
XORL BX, R9
|
||||
ADDL R9, CX
|
||||
ROLL $0x0b, CX
|
||||
MOVL BX, R9
|
||||
ADDL BX, CX
|
||||
ADDL $0xd4ef3085, SI
|
||||
ADDL R8, SI
|
||||
MOVL 24(DX), R8
|
||||
XORL BP, R9
|
||||
XORL CX, R9
|
||||
ADDL R9, SI
|
||||
ROLL $0x10, SI
|
||||
MOVL CX, R9
|
||||
ADDL CX, SI
|
||||
ADDL $0x04881d05, BP
|
||||
ADDL R8, BP
|
||||
MOVL 36(DX), R8
|
||||
XORL BX, R9
|
||||
XORL SI, R9
|
||||
ADDL R9, BP
|
||||
ROLL $0x17, BP
|
||||
MOVL SI, R9
|
||||
ADDL SI, BP
|
||||
ADDL $0xd9d4d039, BX
|
||||
ADDL R8, BX
|
||||
MOVL 48(DX), R8
|
||||
XORL CX, R9
|
||||
XORL BP, R9
|
||||
ADDL R9, BX
|
||||
ROLL $0x04, BX
|
||||
MOVL BP, R9
|
||||
ADDL BP, BX
|
||||
ADDL $0xe6db99e5, CX
|
||||
ADDL R8, CX
|
||||
MOVL 60(DX), R8
|
||||
XORL SI, R9
|
||||
XORL BX, R9
|
||||
ADDL R9, CX
|
||||
ROLL $0x0b, CX
|
||||
MOVL BX, R9
|
||||
ADDL BX, CX
|
||||
ADDL $0x1fa27cf8, SI
|
||||
ADDL R8, SI
|
||||
MOVL 8(DX), R8
|
||||
XORL BP, R9
|
||||
XORL CX, R9
|
||||
ADDL R9, SI
|
||||
ROLL $0x10, SI
|
||||
MOVL CX, R9
|
||||
ADDL CX, SI
|
||||
ADDL $0xc4ac5665, BP
|
||||
ADDL R8, BP
|
||||
MOVL (DX), R8
|
||||
XORL BX, R9
|
||||
XORL SI, R9
|
||||
ADDL R9, BP
|
||||
ROLL $0x17, BP
|
||||
MOVL SI, R9
|
||||
ADDL SI, BP
|
||||
|
||||
// ROUND4
|
||||
MOVL DI, R9
|
||||
XORL CX, R9
|
||||
ADDL $0xf4292244, BX
|
||||
ADDL R8, BX
|
||||
ORL BP, R9
|
||||
XORL SI, R9
|
||||
ADDL R9, BX
|
||||
MOVL 28(DX), R8
|
||||
MOVL DI, R9
|
||||
ROLL $0x06, BX
|
||||
XORL SI, R9
|
||||
ADDL BP, BX
|
||||
ADDL $0x432aff97, CX
|
||||
ADDL R8, CX
|
||||
ORL BX, R9
|
||||
XORL BP, R9
|
||||
ADDL R9, CX
|
||||
MOVL 56(DX), R8
|
||||
MOVL DI, R9
|
||||
ROLL $0x0a, CX
|
||||
XORL BP, R9
|
||||
ADDL BX, CX
|
||||
ADDL $0xab9423a7, SI
|
||||
ADDL R8, SI
|
||||
ORL CX, R9
|
||||
XORL BX, R9
|
||||
ADDL R9, SI
|
||||
MOVL 20(DX), R8
|
||||
MOVL DI, R9
|
||||
ROLL $0x0f, SI
|
||||
XORL BX, R9
|
||||
ADDL CX, SI
|
||||
ADDL $0xfc93a039, BP
|
||||
ADDL R8, BP
|
||||
ORL SI, R9
|
||||
XORL CX, R9
|
||||
ADDL R9, BP
|
||||
MOVL 48(DX), R8
|
||||
MOVL DI, R9
|
||||
ROLL $0x15, BP
|
||||
XORL CX, R9
|
||||
ADDL SI, BP
|
||||
ADDL $0x655b59c3, BX
|
||||
ADDL R8, BX
|
||||
ORL BP, R9
|
||||
XORL SI, R9
|
||||
ADDL R9, BX
|
||||
MOVL 12(DX), R8
|
||||
MOVL DI, R9
|
||||
ROLL $0x06, BX
|
||||
XORL SI, R9
|
||||
ADDL BP, BX
|
||||
ADDL $0x8f0ccc92, CX
|
||||
ADDL R8, CX
|
||||
ORL BX, R9
|
||||
XORL BP, R9
|
||||
ADDL R9, CX
|
||||
MOVL 40(DX), R8
|
||||
MOVL DI, R9
|
||||
ROLL $0x0a, CX
|
||||
XORL BP, R9
|
||||
ADDL BX, CX
|
||||
ADDL $0xffeff47d, SI
|
||||
ADDL R8, SI
|
||||
ORL CX, R9
|
||||
XORL BX, R9
|
||||
ADDL R9, SI
|
||||
MOVL 4(DX), R8
|
||||
MOVL DI, R9
|
||||
ROLL $0x0f, SI
|
||||
XORL BX, R9
|
||||
ADDL CX, SI
|
||||
ADDL $0x85845dd1, BP
|
||||
ADDL R8, BP
|
||||
ORL SI, R9
|
||||
XORL CX, R9
|
||||
ADDL R9, BP
|
||||
MOVL 32(DX), R8
|
||||
MOVL DI, R9
|
||||
ROLL $0x15, BP
|
||||
XORL CX, R9
|
||||
ADDL SI, BP
|
||||
ADDL $0x6fa87e4f, BX
|
||||
ADDL R8, BX
|
||||
ORL BP, R9
|
||||
XORL SI, R9
|
||||
ADDL R9, BX
|
||||
MOVL 60(DX), R8
|
||||
MOVL DI, R9
|
||||
ROLL $0x06, BX
|
||||
XORL SI, R9
|
||||
ADDL BP, BX
|
||||
ADDL $0xfe2ce6e0, CX
|
||||
ADDL R8, CX
|
||||
ORL BX, R9
|
||||
XORL BP, R9
|
||||
ADDL R9, CX
|
||||
MOVL 24(DX), R8
|
||||
MOVL DI, R9
|
||||
ROLL $0x0a, CX
|
||||
XORL BP, R9
|
||||
ADDL BX, CX
|
||||
ADDL $0xa3014314, SI
|
||||
ADDL R8, SI
|
||||
ORL CX, R9
|
||||
XORL BX, R9
|
||||
ADDL R9, SI
|
||||
MOVL 52(DX), R8
|
||||
MOVL DI, R9
|
||||
ROLL $0x0f, SI
|
||||
XORL BX, R9
|
||||
ADDL CX, SI
|
||||
ADDL $0x4e0811a1, BP
|
||||
ADDL R8, BP
|
||||
ORL SI, R9
|
||||
XORL CX, R9
|
||||
ADDL R9, BP
|
||||
MOVL 16(DX), R8
|
||||
MOVL DI, R9
|
||||
ROLL $0x15, BP
|
||||
XORL CX, R9
|
||||
ADDL SI, BP
|
||||
ADDL $0xf7537e82, BX
|
||||
ADDL R8, BX
|
||||
ORL BP, R9
|
||||
XORL SI, R9
|
||||
ADDL R9, BX
|
||||
MOVL 44(DX), R8
|
||||
MOVL DI, R9
|
||||
ROLL $0x06, BX
|
||||
XORL SI, R9
|
||||
ADDL BP, BX
|
||||
ADDL $0xbd3af235, CX
|
||||
ADDL R8, CX
|
||||
ORL BX, R9
|
||||
XORL BP, R9
|
||||
ADDL R9, CX
|
||||
MOVL 8(DX), R8
|
||||
MOVL DI, R9
|
||||
ROLL $0x0a, CX
|
||||
XORL BP, R9
|
||||
ADDL BX, CX
|
||||
ADDL $0x2ad7d2bb, SI
|
||||
ADDL R8, SI
|
||||
ORL CX, R9
|
||||
XORL BX, R9
|
||||
ADDL R9, SI
|
||||
MOVL 36(DX), R8
|
||||
MOVL DI, R9
|
||||
ROLL $0x0f, SI
|
||||
XORL BX, R9
|
||||
ADDL CX, SI
|
||||
ADDL $0xeb86d391, BP
|
||||
ADDL R8, BP
|
||||
ORL SI, R9
|
||||
XORL CX, R9
|
||||
ADDL R9, BP
|
||||
ROLL $0x15, BP
|
||||
ADDL SI, BP
|
||||
ADDL R10, BX
|
||||
ADDL R11, BP
|
||||
ADDL R12, SI
|
||||
ADDL R13, CX
|
||||
|
||||
// Prepare next loop
|
||||
ADDQ $0x40, DX
|
||||
CMPQ DX, AX
|
||||
JB loop
|
||||
|
||||
// Write output
|
||||
MOVQ dig+0(FP), AX
|
||||
MOVL BX, (AX)
|
||||
MOVL BP, 4(AX)
|
||||
MOVL SI, 8(AX)
|
||||
MOVL CX, 12(AX)
|
||||
|
||||
end:
|
||||
RET
|
Loading…
Add table
Add a link
Reference in a new issue