diff -r 22c01d2b1186 -r 0094f9be672e SHA1Stream.st --- a/SHA1Stream.st Sat Mar 23 15:30:15 2019 +0100 +++ b/SHA1Stream.st Sat Mar 23 20:24:47 2019 +0100 @@ -51,8 +51,8 @@ #if defined(__GNUC__) || defined(__CLANG__) # define STATIC_INLINE static inline -# define __SSE4_1__ -# define __SHA__ +# define __SSE4_1__ 1 +# define __SHA__ 1 # include static void SHA1Transform(unsigned int32 state[5], unsigned char buffer[64]); // static void __attribute__ ((__target__ ("sha,sse4.1"))) SHA1Transform_x86(unsigned int32 state[5], unsigned char buffer[64]); @@ -99,15 +99,33 @@ /* blk0() and blk() perform the initial expand. */ /* I got the idea of expanding during the round function from SSLeay */ +// 876543210 -> rol,24 -> 108765432 -> & -> 10xx65xx +// 876543210 -> rol,8 -> 654321087 -> & -> xx43xx87 +// oring: 10436587 #ifdef LITTLE_ENDIAN -# define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \ - |(rol(block->l[i],8)&0x00FF00FF)) +// cg: does not make any difference (actually, slightly slower, as it seems)... +# if 0 && (defined(__i386__) || defined(__x86__) || defined(__x86_64__)) && (defined(__GNUC__) || defined(__CLANG__)) + static inline u_int32_t __bswap(u_int32_t v) { + register u_int32_t l = v; + __asm__ __volatile__("bswap %0" : "=r" (l) : "0" (l)); + return l; + } +# define blk0(i) \ + (block->l[i] = __bswap(block->l[i])) +# else +# define blk0(i) \ + (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \ + |(rol(block->l[i],8)&0x00FF00FF)) +# endif #else # define blk0(i) block->l[i] #endif -#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \ - ^block->l[(i+2)&15]^block->l[i&15],1)) +#define blk(i) (block->l[i&15] = \ + rol(block->l[(i+13)&15] \ + ^ block->l[(i+8)&15] \ + ^ block->l[(i+2)&15] \ + ^ block->l[i&15],1)) /* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */ #define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30); @@ -180,6 +198,9 @@ #if defined(__CLANG__) +// +// a specially tuned version +// static void SHA1Transform_x86 (unsigned int32 state[5], unsigned char buffer[64]) { @@ -388,7 +409,13 @@ unsigned char buffer[64]; #endif { - SHA1Transform_generic(state, buffer); + extern unsigned char __cpu_hasSSE4_1_and_SHA; + + if (__cpu_hasSSE4_1_and_SHA) { + SHA1Transform_x86(state, buffer); + } else { + SHA1Transform_generic(state, buffer); + } } #endif @@ -669,31 +696,31 @@ CPU cc algo mb/sec MAC (2010 macbook; 2.7Ghz Duo) clang -O2 slow 128.5 - 132 + 132 MAC (2012 macbook; 2.6Ghz I7) clang -O2 190 - chunk size 10: 86.70 Mb/s - chunk size 50: 227.07 Mb/s - chunk size 1000: 405.82 Mb/s - chunk size 50000: 421.98 Mb/s + chunk size 10: 86.70 Mb/s 90.83 + chunk size 50: 227.07 Mb/s 238.42 + chunk size 1000: 405.82 Mb/s 414.64 + chunk size 50000: 421.98 Mb/s 447.73 timing throughput: - [exBegin] + [exBegin] |hashStream n t| hashStream := SHA1Stream new. n := 1000000. t := Time millisecondsToRun:[ - n timesRepeat:[ - hashStream nextPutAll:'12345678901234567890123456789012345678901234567890'. - ]. - ]. + n timesRepeat:[ + hashStream nextPutAll:'12345678901234567890123456789012345678901234567890'. + ]. + ]. t := (t / 1000) asFloat. Transcript show:t; show:' seconds for '; show:(50*n/1024) asFloat; showCR:' Kb'. Transcript show:(n*50/1024 / t); showCR:' Kb/s' - [exEnd] + [exEnd] " ! ! @@ -840,17 +867,17 @@ %{ unsigned char value; OBJ _hashContext = __INST(hashContext); - + // fetch first; check below value = __intVal(anInteger); if (__isSmallInteger(anInteger) && value <= 255 && __isByteArray(_hashContext) && __byteArraySize(_hashContext) == sizeof(SHA1_CTX) ) { - SHA1_CTX *ctx = (SHA1_CTX *)__byteArrayVal(_hashContext); + SHA1_CTX *ctx = (SHA1_CTX *)__byteArrayVal(_hashContext); - SHA1Update(ctx, &value, 1); - RETURN(self); + SHA1Update(ctx, &value, 1); + RETURN(self); } bad: ; %}. @@ -870,7 +897,7 @@ INT len, offs; INT objSize; int nInstBytes; - char *extPtr; + unsigned char *extPtr; OBJ _hashContext = __INST(hashContext); // convert here; check later @@ -881,61 +908,61 @@ && __byteArraySize(_hashContext) == sizeof(SHA1_CTX) && __bothSmallInteger(count, start) ) { - SHA1_CTX *ctx = (SHA1_CTX *)__byteArrayVal(_hashContext); + SHA1_CTX *ctx = (SHA1_CTX *)__byteArrayVal(_hashContext); - if (__isByteArrayLike(anObject)) { - extPtr = (char *)__byteArrayVal(anObject); - objSize = __byteArraySize(anObject); - } else if (__isStringLike(anObject)) { - extPtr = (char *)__stringVal(anObject); - objSize = __stringSize(anObject); - } else if (__isExternalBytesLike(anObject)) { - OBJ sz; + if (__isByteArrayLike(anObject)) { + extPtr = (unsigned char *)__byteArrayVal(anObject); + objSize = __byteArraySize(anObject); + } else if (__isStringLike(anObject)) { + extPtr = (unsigned char *)__stringVal(anObject); + objSize = __stringSize(anObject); + } else if (__isExternalBytesLike(anObject)) { + OBJ sz; - nInstBytes = 0; - extPtr = (char *)__externalBytesAddress(anObject); - if (extPtr == NULL) goto bad; - sz = __externalBytesSize(anObject); - objSize = __intVal(sz); - if (!__isSmallInteger(sz)) { - objSize = 0; /* unknown */ - } - } else { - OBJ oClass = __Class(anObject); - int nInstVars = __intVal(__ClassInstPtr(oClass)->c_ninstvars); + nInstBytes = 0; + extPtr = (unsigned char *)__externalBytesAddress(anObject); + if (extPtr == NULL) goto bad; + sz = __externalBytesSize(anObject); + objSize = __intVal(sz); + if (!__isSmallInteger(sz)) { + objSize = 0; /* unknown */ + } + } else { + OBJ oClass = __Class(anObject); + int nInstVars = __intVal(__ClassInstPtr(oClass)->c_ninstvars); - nInstBytes = OHDR_SIZE + __OBJS2BYTES__(nInstVars); - switch (__intVal(__ClassInstPtr(oClass)->c_flags) & ARRAYMASK) { - case BYTEARRAY: - case WORDARRAY: - case LONGARRAY: - case SWORDARRAY: - case SLONGARRAY: - case FLOATARRAY: - break; - case DOUBLEARRAY: + nInstBytes = OHDR_SIZE + __OBJS2BYTES__(nInstVars); + switch (__intVal(__ClassInstPtr(oClass)->c_flags) & ARRAYMASK) { + case BYTEARRAY: + case WORDARRAY: + case LONGARRAY: + case SWORDARRAY: + case SLONGARRAY: + case FLOATARRAY: + break; + case DOUBLEARRAY: #ifdef __NEED_DOUBLE_ALIGN - nInstBytes = (nInstBytes-1+__DOUBLE_ALIGN) &~ (__DOUBLE_ALIGN-1); + nInstBytes = (nInstBytes-1+__DOUBLE_ALIGN) &~ (__DOUBLE_ALIGN-1); #endif - break; - case LONGLONGARRAY: - case SLONGLONGARRAY: + break; + case LONGLONGARRAY: + case SLONGLONGARRAY: #ifdef __NEED_LONGLONG_ALIGN - nInstBytes = (nInstBytes-1+__LONGLONG_ALIGN) &~ (__LONGLONG_ALIGN-1); + nInstBytes = (nInstBytes-1+__LONGLONG_ALIGN) &~ (__LONGLONG_ALIGN-1); #endif - break; - default: - goto bad; - } - // nInstBytes is the number of bytes occupied by pointer instance variables - // subtract from size and add to byte-pointer - objSize = __Size(anObject) - nInstBytes; - extPtr = (char *)anObject + nInstBytes; - } - if ((offs >= 0) && (len >= 0) && (objSize >= (len + offs))) { - SHA1Update(ctx, extPtr+offs, (unsigned int)len); - RETURN (count); - } + break; + default: + goto bad; + } + // nInstBytes is the number of bytes occupied by pointer instance variables + // subtract from size and add to byte-pointer + objSize = __Size(anObject) - nInstBytes; + extPtr = (unsigned char *)anObject + nInstBytes; + } + if ((offs >= 0) && (len >= 0) && (objSize >= (len + offs))) { + SHA1Update(ctx, extPtr+offs, (unsigned int)len); + RETURN (count); + } } bad: ; %}.