--- /sys/src/ape/lib/sec/amd64/mkfile +++ /sys/src/ape/lib/sec/amd64/mkfile @@ -3,10 +3,21 @@ APE=/sys/src/ape LIB=/$objtype/lib/ape/libsec.a -OFILES= \ +FILES=\ + md5block\ + sha1block\ HFILES=/sys/include/ape/libsec.h -UPDATE=mkfile +SFILES=${FILES:%=%.s} + +OFILES=${SFILES:%.s=%.$O} + +UPDATE=mkfile\ + $HFILES\ + $SFILES\ > (32 - S11)); + * a += b; + */ + +#define BODY1(off,V,FN,SH,A,B,C,D)\ + FN(B,C,D)\ + LEAL V(A)(DI*1),A;\ + ADDL (off)(BP),A;\ + ROLL $SH,A;\ + ADDL B,A;\ + +#define BODY(off,V,FN,SH,A,B,C,D)\ + FN(B,C,D)\ + LEAL V(A)(DI*1),A;\ + ADDL (off)(BP),A;\ + ROLL $SH,A;\ + ADDL B,A;\ + +/* + * fn1 = ((c ^ d) & b) ^ d + */ +#define FN1(B,C,D)\ + MOVL C,DI;\ + XORL D,DI;\ + ANDL B,DI;\ + XORL D,DI;\ + +/* + * fn2 = ((b ^ c) & d) ^ c; + */ +#define FN2(B,C,D)\ + MOVL B,DI;\ + XORL C,DI;\ + ANDL D,DI;\ + XORL C,DI;\ + +/* + * fn3 = b ^ c ^ d; + */ +#define FN3(B,C,D)\ + MOVL B,DI;\ + XORL C,DI;\ + XORL D,DI;\ + +/* + * fn4 = c ^ (b | ~d); + */ +#define FN4(B,C,D)\ + MOVL D,DI;\ + XORL $-1,DI;\ + ORL B,DI;\ + XORL C,DI;\ + +#define LEN 8 +#define STATE 16 + +TEXT _md5block+0(SB), $0 + + MOVQ RARG,R8 + MOVLQZX len+LEN(FP),BX + ADDQ BX,R8 + +mainloop: + MOVQ state+STATE(FP),SI + MOVL (SI),AX + MOVL 4(SI),BX + MOVL 8(SI),CX + MOVL 12(SI),DX + + BODY1( 0*4,0xd76aa478,FN1,S11,AX,BX,CX,DX) + BODY1( 1*4,0xe8c7b756,FN1,S12,DX,AX,BX,CX) + BODY1( 2*4,0x242070db,FN1,S13,CX,DX,AX,BX) + BODY1( 3*4,0xc1bdceee,FN1,S14,BX,CX,DX,AX) + + BODY1( 4*4,0xf57c0faf,FN1,S11,AX,BX,CX,DX) + BODY1( 5*4,0x4787c62a,FN1,S12,DX,AX,BX,CX) + BODY1( 6*4,0xa8304613,FN1,S13,CX,DX,AX,BX) + BODY1( 7*4,0xfd469501,FN1,S14,BX,CX,DX,AX) + + BODY1( 8*4,0x698098d8,FN1,S11,AX,BX,CX,DX) + BODY1( 9*4,0x8b44f7af,FN1,S12,DX,AX,BX,CX) + BODY1(10*4,0xffff5bb1,FN1,S13,CX,DX,AX,BX) + BODY1(11*4,0x895cd7be,FN1,S14,BX,CX,DX,AX) + + BODY1(12*4,0x6b901122,FN1,S11,AX,BX,CX,DX) + BODY1(13*4,0xfd987193,FN1,S12,DX,AX,BX,CX) + BODY1(14*4,0xa679438e,FN1,S13,CX,DX,AX,BX) + BODY1(15*4,0x49b40821,FN1,S14,BX,CX,DX,AX) + + + BODY( 1*4,0xf61e2562,FN2,S21,AX,BX,CX,DX) + BODY( 6*4,0xc040b340,FN2,S22,DX,AX,BX,CX) + BODY(11*4,0x265e5a51,FN2,S23,CX,DX,AX,BX) + BODY( 0*4,0xe9b6c7aa,FN2,S24,BX,CX,DX,AX) + + BODY( 5*4,0xd62f105d,FN2,S21,AX,BX,CX,DX) + BODY(10*4,0x02441453,FN2,S22,DX,AX,BX,CX) + BODY(15*4,0xd8a1e681,FN2,S23,CX,DX,AX,BX) + BODY( 4*4,0xe7d3fbc8,FN2,S24,BX,CX,DX,AX) + + BODY( 9*4,0x21e1cde6,FN2,S21,AX,BX,CX,DX) + BODY(14*4,0xc33707d6,FN2,S22,DX,AX,BX,CX) + BODY( 3*4,0xf4d50d87,FN2,S23,CX,DX,AX,BX) + BODY( 8*4,0x455a14ed,FN2,S24,BX,CX,DX,AX) + + BODY(13*4,0xa9e3e905,FN2,S21,AX,BX,CX,DX) + BODY( 2*4,0xfcefa3f8,FN2,S22,DX,AX,BX,CX) + BODY( 7*4,0x676f02d9,FN2,S23,CX,DX,AX,BX) + BODY(12*4,0x8d2a4c8a,FN2,S24,BX,CX,DX,AX) + + + BODY( 5*4,0xfffa3942,FN3,S31,AX,BX,CX,DX) + BODY( 8*4,0x8771f681,FN3,S32,DX,AX,BX,CX) + BODY(11*4,0x6d9d6122,FN3,S33,CX,DX,AX,BX) + BODY(14*4,0xfde5380c,FN3,S34,BX,CX,DX,AX) + + BODY( 1*4,0xa4beea44,FN3,S31,AX,BX,CX,DX) + BODY( 4*4,0x4bdecfa9,FN3,S32,DX,AX,BX,CX) + BODY( 7*4,0xf6bb4b60,FN3,S33,CX,DX,AX,BX) + BODY(10*4,0xbebfbc70,FN3,S34,BX,CX,DX,AX) + + BODY(13*4,0x289b7ec6,FN3,S31,AX,BX,CX,DX) + BODY( 0*4,0xeaa127fa,FN3,S32,DX,AX,BX,CX) + BODY( 3*4,0xd4ef3085,FN3,S33,CX,DX,AX,BX) + BODY( 6*4,0x04881d05,FN3,S34,BX,CX,DX,AX) + + BODY( 9*4,0xd9d4d039,FN3,S31,AX,BX,CX,DX) + BODY(12*4,0xe6db99e5,FN3,S32,DX,AX,BX,CX) + BODY(15*4,0x1fa27cf8,FN3,S33,CX,DX,AX,BX) + BODY( 2*4,0xc4ac5665,FN3,S34,BX,CX,DX,AX) + + + BODY( 0*4,0xf4292244,FN4,S41,AX,BX,CX,DX) + BODY( 7*4,0x432aff97,FN4,S42,DX,AX,BX,CX) + BODY(14*4,0xab9423a7,FN4,S43,CX,DX,AX,BX) + BODY( 5*4,0xfc93a039,FN4,S44,BX,CX,DX,AX) + + BODY(12*4,0x655b59c3,FN4,S41,AX,BX,CX,DX) + BODY( 3*4,0x8f0ccc92,FN4,S42,DX,AX,BX,CX) + BODY(10*4,0xffeff47d,FN4,S43,CX,DX,AX,BX) + BODY( 1*4,0x85845dd1,FN4,S44,BX,CX,DX,AX) + + BODY( 8*4,0x6fa87e4f,FN4,S41,AX,BX,CX,DX) + BODY(15*4,0xfe2ce6e0,FN4,S42,DX,AX,BX,CX) + BODY( 6*4,0xa3014314,FN4,S43,CX,DX,AX,BX) + BODY(13*4,0x4e0811a1,FN4,S44,BX,CX,DX,AX) + + BODY( 4*4,0xf7537e82,FN4,S41,AX,BX,CX,DX) + BODY(11*4,0xbd3af235,FN4,S42,DX,AX,BX,CX) + BODY( 2*4,0x2ad7d2bb,FN4,S43,CX,DX,AX,BX) + BODY( 9*4,0xeb86d391,FN4,S44,BX,CX,DX,AX) + + ADDQ $(16*4),BP + MOVQ state+STATE(FP),DI + ADDL AX,0(DI) + ADDL BX,4(DI) + ADDL CX,8(DI) + ADDL DX,12(DI) + + CMPQ BP,R8 + JCS mainloop + + RET + + END --- /sys/src/libsec/amd64/mkfile +++ /sys/src/libsec/amd64/mkfile @@ -2,10 +2,18 @@ objtype=amd64 >> 2 + */ +#define BSWAPDI BYTE $0x0f; BYTE $0xcf; + +#define BODY(off,FN,V,A,B,C,D,E)\ + MOVL (off-64)(BP),DI;\ + XORL (off-56)(BP),DI;\ + XORL (off-32)(BP),DI;\ + XORL (off-12)(BP),DI;\ + ROLL $1,DI;\ + MOVL DI,off(BP);\ + LEAL V(DI)(E*1),E;\ + MOVL A,DI;\ + ROLL $5,DI;\ + ADDL DI,E;\ + FN(B,C,D)\ + ADDL DI,E;\ + RORL $2,B;\ + +#define BODY0(off,FN,V,A,B,C,D,E)\ + MOVLQZX off(BX),DI;\ + BSWAPDI;\ + MOVL DI,off(BP);\ + LEAL V(DI)(E*1),E;\ + MOVL A,DI;\ + ROLL $5,DI;\ + ADDL DI,E;\ + FN(B,C,D)\ + ADDL DI,E;\ + RORL $2,B;\ + +/* + * fn1 = (((C^D)&B)^D); + */ +#define FN1(B,C,D)\ + MOVL C,DI;\ + XORL D,DI;\ + ANDL B,DI;\ + XORL D,DI;\ + +/* + * fn24 = B ^ C ^ D + */ +#define FN24(B,C,D)\ + MOVL B,DI;\ + XORL C,DI;\ + XORL D,DI;\ + +/* + * fn3 = ((B ^ C) & (D ^= B)) ^ B + * D ^= B to restore D + */ +#define FN3(B,C,D)\ + MOVL B,DI;\ + XORL C,DI;\ + XORL B,D;\ + ANDL D,DI;\ + XORL B,DI;\ + XORL B,D;\ + +/* + * stack offsets + * void sha1block(uchar *DATA, int LEN, ulong *STATE) + */ +#define DATA 0 +#define LEN 8 +#define STATE 16 + +/* + * stack offsets for locals + * ulong w[80]; + * uchar *edata; + * ulong *w15, *w40, *w60, *w80; + * register local + * ulong *wp = BP + * ulong a = eax, b = ebx, c = ecx, d = edx, e = esi + * ulong tmp = edi + */ +#define Rpdata R8 +#define WARRAY (-8-(80*4)) +#define TMP1 (-16-(80*4)) +#define TMP2 (-24-(80*4)) +#define W15 (-32-(80*4)) +#define W40 (-40-(80*4)) +#define W60 (-48-(80*4)) +#define W80 (-56-(80*4)) +#define EDATA (-64-(80*4)) + +TEXT _sha1block+0(SB),$384 + + MOVQ RARG, Rpdata + MOVLQZX len+LEN(FP),BX + ADDQ BX, RARG + MOVQ RARG,edata+EDATA(SP) + + LEAQ aw15+(WARRAY+15*4)(SP),DI + MOVQ DI,w15+W15(SP) + LEAQ aw40+(WARRAY+40*4)(SP),DX + MOVQ DX,w40+W40(SP) + LEAQ aw60+(WARRAY+60*4)(SP),CX + MOVQ CX,w60+W60(SP) + LEAQ aw80+(WARRAY+80*4)(SP),DI + MOVQ DI,w80+W80(SP) + +mainloop: + LEAQ warray+WARRAY(SP),BP + + MOVQ state+STATE(FP),DI + MOVL (DI),AX + MOVL 4(DI),BX + MOVL BX,tmp1+TMP1(SP) + MOVL 8(DI),CX + MOVL 12(DI),DX + MOVL 16(DI),SI + + MOVQ Rpdata,BX + +loop1: + BODY0(0,FN1,0x5a827999,AX,tmp1+TMP1(SP),CX,DX,SI) + MOVL SI,tmp2+TMP2(SP) + BODY0(4,FN1,0x5a827999,SI,AX,tmp1+TMP1(SP),CX,DX) + MOVL tmp1+TMP1(SP),SI + BODY0(8,FN1,0x5a827999,DX,tmp2+TMP2(SP),AX,SI,CX) + BODY0(12,FN1,0x5a827999,CX,DX,tmp2+TMP2(SP),AX,SI) + MOVL SI,tmp1+TMP1(SP) + BODY0(16,FN1,0x5a827999,SI,CX,DX,tmp2+TMP2(SP),AX) + MOVL tmp2+TMP2(SP),SI + + ADDQ $20,BX + ADDQ $20,BP + CMPQ BP,w15+W15(SP) + JCS loop1 + + BODY0(0,FN1,0x5a827999,AX,tmp1+TMP1(SP),CX,DX,SI) + ADDQ $4,BX + MOVQ BX,R8 + MOVQ tmp1+TMP1(SP),BX + + BODY(4,FN1,0x5a827999,SI,AX,BX,CX,DX) + BODY(8,FN1,0x5a827999,DX,SI,AX,BX,CX) + BODY(12,FN1,0x5a827999,CX,DX,SI,AX,BX) + BODY(16,FN1,0x5a827999,BX,CX,DX,SI,AX) + + ADDQ $20,BP + +loop2: + BODY(0,FN24,0x6ed9eba1,AX,BX,CX,DX,SI) + BODY(4,FN24,0x6ed9eba1,SI,AX,BX,CX,DX) + BODY(8,FN24,0x6ed9eba1,DX,SI,AX,BX,CX) + BODY(12,FN24,0x6ed9eba1,CX,DX,SI,AX,BX) + BODY(16,FN24,0x6ed9eba1,BX,CX,DX,SI,AX) + + ADDQ $20,BP + CMPQ BP,w40+W40(SP) + JCS loop2 + +loop3: + BODY(0,FN3,0x8f1bbcdc,AX,BX,CX,DX,SI) + BODY(4,FN3,0x8f1bbcdc,SI,AX,BX,CX,DX) + BODY(8,FN3,0x8f1bbcdc,DX,SI,AX,BX,CX) + BODY(12,FN3,0x8f1bbcdc,CX,DX,SI,AX,BX) + BODY(16,FN3,0x8f1bbcdc,BX,CX,DX,SI,AX) + + ADDQ $20,BP + CMPQ BP,w60+W60(SP) + JCS loop3 + +loop4: + BODY(0,FN24,0xca62c1d6,AX,BX,CX,DX,SI) + BODY(4,FN24,0xca62c1d6,SI,AX,BX,CX,DX) + BODY(8,FN24,0xca62c1d6,DX,SI,AX,BX,CX) + BODY(12,FN24,0xca62c1d6,CX,DX,SI,AX,BX) + BODY(16,FN24,0xca62c1d6,BX,CX,DX,SI,AX) + + ADDQ $20,BP + CMPQ BP,w80+W80(SP) + JCS loop4 + + MOVQ state+STATE(FP),DI + ADDL AX,0(DI) + ADDL BX,4(DI) + ADDL CX,8(DI) + ADDL DX,12(DI) + ADDL SI,16(DI) + + MOVQ edata+EDATA(SP),DI + CMPQ Rpdata,DI + JCS mainloop + + RET + END