--- a/SHA1Stream.st Sat Mar 23 15:30:15 2019 +0100
+++ b/SHA1Stream.st Sat Mar 23 20:24:47 2019 +0100
@@ -51,8 +51,8 @@
#if defined(__GNUC__) || defined(__CLANG__)
# define STATIC_INLINE static inline
-# define __SSE4_1__
-# define __SHA__
+# define __SSE4_1__ 1
+# define __SHA__ 1
# include <immintrin.h>
static void SHA1Transform(unsigned int32 state[5], unsigned char buffer[64]);
// static void __attribute__ ((__target__ ("sha,sse4.1"))) SHA1Transform_x86(unsigned int32 state[5], unsigned char buffer[64]);
@@ -99,15 +99,33 @@
/* blk0() and blk() perform the initial expand. */
/* I got the idea of expanding during the round function from SSLeay */
+// nibbles 87654321 -> rol,24 -> 21876543 -> & 0xFF00FF00 -> 21xx65xx
+// nibbles 87654321 -> rol,8 -> 65432187 -> & 0x00FF00FF -> xx43xx87
+// oring: 21436587 (i.e. the four bytes in reversed order)
#ifdef LITTLE_ENDIAN
-# define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
- |(rol(block->l[i],8)&0x00FF00FF))
+// cg: does not make any difference (actually, slightly slower, as it seems)...
+# if 0 && (defined(__i386__) || defined(__x86__) || defined(__x86_64__)) && (defined(__GNUC__) || defined(__CLANG__))
+ static inline u_int32_t __bswap(u_int32_t v) {
+ register u_int32_t l = v;
+ __asm__ __volatile__("bswap %0" : "=r" (l) : "0" (l));
+ return l;
+ }
+# define blk0(i) \
+ (block->l[i] = __bswap(block->l[i]))
+# else
+# define blk0(i) \
+ (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
+ |(rol(block->l[i],8)&0x00FF00FF))
+# endif
#else
# define blk0(i) block->l[i]
#endif
-#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \
- ^block->l[(i+2)&15]^block->l[i&15],1))
+#define blk(i) (block->l[i&15] = \
+ rol(block->l[(i+13)&15] \
+ ^ block->l[(i+8)&15] \
+ ^ block->l[(i+2)&15] \
+ ^ block->l[i&15],1))
/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30);
@@ -180,6 +198,9 @@
#if defined(__CLANG__)
+//
+// a specially tuned version
+//
static void
SHA1Transform_x86 (unsigned int32 state[5], unsigned char buffer[64])
{
@@ -388,7 +409,13 @@
unsigned char buffer[64];
#endif
{
- SHA1Transform_generic(state, buffer);
+ extern unsigned char __cpu_hasSSE4_1_and_SHA;
+
+ if (__cpu_hasSSE4_1_and_SHA) {
+ SHA1Transform_x86(state, buffer);
+ } else {
+ SHA1Transform_generic(state, buffer);
+ }
}
#endif
@@ -669,31 +696,31 @@
CPU cc algo mb/sec
MAC (2010 macbook; 2.7Ghz Duo) clang -O2 slow 128.5
- 132
+ 132
MAC (2012 macbook; 2.6Ghz I7) clang -O2 190
- chunk size 10: 86.70 Mb/s
- chunk size 50: 227.07 Mb/s
- chunk size 1000: 405.82 Mb/s
- chunk size 50000: 421.98 Mb/s
+ chunk size 10: 86.70 Mb/s 90.83
+ chunk size 50: 227.07 Mb/s 238.42
+ chunk size 1000: 405.82 Mb/s 414.64
+ chunk size 50000: 421.98 Mb/s 447.73
timing throughput:
- [exBegin]
+ [exBegin]
|hashStream n t|
hashStream := SHA1Stream new.
n := 1000000.
t := Time millisecondsToRun:[
- n timesRepeat:[
- hashStream nextPutAll:'12345678901234567890123456789012345678901234567890'.
- ].
- ].
+ n timesRepeat:[
+ hashStream nextPutAll:'12345678901234567890123456789012345678901234567890'.
+ ].
+ ].
t := (t / 1000) asFloat.
Transcript show:t; show:' seconds for '; show:(50*n/1024) asFloat; showCR:' Kb'.
Transcript show:(n*50/1024 / t); showCR:' Kb/s'
- [exEnd]
+ [exEnd]
"
! !
@@ -840,17 +867,17 @@
%{
unsigned char value;
OBJ _hashContext = __INST(hashContext);
-
+
-// fetch first; check below
+// fetch first (truncated to 8 bits here); the range check below uses the untruncated integer
value = __intVal(anInteger);
-if (__isSmallInteger(anInteger) && value <= 255
+if (__isSmallInteger(anInteger) && __intVal(anInteger) >= 0 && __intVal(anInteger) <= 255
&& __isByteArray(_hashContext)
&& __byteArraySize(_hashContext) == sizeof(SHA1_CTX)
) {
-SHA1_CTX *ctx = (SHA1_CTX *)__byteArrayVal(_hashContext);
+SHA1_CTX *ctx = (SHA1_CTX *)__byteArrayVal(_hashContext);
-SHA1Update(ctx, &value, 1);
-RETURN(self);
+SHA1Update(ctx, &value, 1);
+RETURN(self);
}
bad: ;
%}.
@@ -870,7 +897,7 @@
INT len, offs;
INT objSize;
int nInstBytes;
- char *extPtr;
+ unsigned char *extPtr;
OBJ _hashContext = __INST(hashContext);
// convert here; check later
@@ -881,61 +908,61 @@
&& __byteArraySize(_hashContext) == sizeof(SHA1_CTX)
&& __bothSmallInteger(count, start)
) {
- SHA1_CTX *ctx = (SHA1_CTX *)__byteArrayVal(_hashContext);
+ SHA1_CTX *ctx = (SHA1_CTX *)__byteArrayVal(_hashContext);
- if (__isByteArrayLike(anObject)) {
- extPtr = (char *)__byteArrayVal(anObject);
- objSize = __byteArraySize(anObject);
- } else if (__isStringLike(anObject)) {
- extPtr = (char *)__stringVal(anObject);
- objSize = __stringSize(anObject);
- } else if (__isExternalBytesLike(anObject)) {
- OBJ sz;
+ if (__isByteArrayLike(anObject)) {
+ extPtr = (unsigned char *)__byteArrayVal(anObject);
+ objSize = __byteArraySize(anObject);
+ } else if (__isStringLike(anObject)) {
+ extPtr = (unsigned char *)__stringVal(anObject);
+ objSize = __stringSize(anObject);
+ } else if (__isExternalBytesLike(anObject)) {
+ OBJ sz;
- nInstBytes = 0;
- extPtr = (char *)__externalBytesAddress(anObject);
- if (extPtr == NULL) goto bad;
- sz = __externalBytesSize(anObject);
- objSize = __intVal(sz);
- if (!__isSmallInteger(sz)) {
- objSize = 0; /* unknown */
- }
- } else {
- OBJ oClass = __Class(anObject);
- int nInstVars = __intVal(__ClassInstPtr(oClass)->c_ninstvars);
+ nInstBytes = 0;
+ extPtr = (unsigned char *)__externalBytesAddress(anObject);
+ if (extPtr == NULL) goto bad;
+ sz = __externalBytesSize(anObject);
+ objSize = __intVal(sz);
+ if (!__isSmallInteger(sz)) {
+ objSize = 0; /* unknown */
+ }
+ } else {
+ OBJ oClass = __Class(anObject);
+ int nInstVars = __intVal(__ClassInstPtr(oClass)->c_ninstvars);
- nInstBytes = OHDR_SIZE + __OBJS2BYTES__(nInstVars);
- switch (__intVal(__ClassInstPtr(oClass)->c_flags) & ARRAYMASK) {
- case BYTEARRAY:
- case WORDARRAY:
- case LONGARRAY:
- case SWORDARRAY:
- case SLONGARRAY:
- case FLOATARRAY:
- break;
- case DOUBLEARRAY:
+ nInstBytes = OHDR_SIZE + __OBJS2BYTES__(nInstVars);
+ switch (__intVal(__ClassInstPtr(oClass)->c_flags) & ARRAYMASK) {
+ case BYTEARRAY:
+ case WORDARRAY:
+ case LONGARRAY:
+ case SWORDARRAY:
+ case SLONGARRAY:
+ case FLOATARRAY:
+ break;
+ case DOUBLEARRAY:
#ifdef __NEED_DOUBLE_ALIGN
- nInstBytes = (nInstBytes-1+__DOUBLE_ALIGN) &~ (__DOUBLE_ALIGN-1);
+ nInstBytes = (nInstBytes-1+__DOUBLE_ALIGN) &~ (__DOUBLE_ALIGN-1);
#endif
- break;
- case LONGLONGARRAY:
- case SLONGLONGARRAY:
+ break;
+ case LONGLONGARRAY:
+ case SLONGLONGARRAY:
#ifdef __NEED_LONGLONG_ALIGN
- nInstBytes = (nInstBytes-1+__LONGLONG_ALIGN) &~ (__LONGLONG_ALIGN-1);
+ nInstBytes = (nInstBytes-1+__LONGLONG_ALIGN) &~ (__LONGLONG_ALIGN-1);
#endif
- break;
- default:
- goto bad;
- }
- // nInstBytes is the number of bytes occupied by pointer instance variables
- // subtract from size and add to byte-pointer
- objSize = __Size(anObject) - nInstBytes;
- extPtr = (char *)anObject + nInstBytes;
- }
- if ((offs >= 0) && (len >= 0) && (objSize >= (len + offs))) {
- SHA1Update(ctx, extPtr+offs, (unsigned int)len);
- RETURN (count);
- }
+ break;
+ default:
+ goto bad;
+ }
+ // nInstBytes is the number of bytes occupied by pointer instance variables
+ // subtract from size and add to byte-pointer
+ objSize = __Size(anObject) - nInstBytes;
+ extPtr = (unsigned char *)anObject + nInstBytes;
+ }
+ if ((offs >= 0) && (len >= 0) && (objSize >= (len + offs))) {
+ SHA1Update(ctx, extPtr+offs, (unsigned int)len);
+ RETURN (count);
+ }
}
bad: ;
%}.