SHA1Stream.st
changeset 23957 0094f9be672e
parent 23956 22c01d2b1186
child 23958 1c903aa4a100
--- a/SHA1Stream.st	Sat Mar 23 15:30:15 2019 +0100
+++ b/SHA1Stream.st	Sat Mar 23 20:24:47 2019 +0100
@@ -51,8 +51,8 @@
 
 #if defined(__GNUC__) || defined(__CLANG__)
 # define STATIC_INLINE static inline
-# define __SSE4_1__
-# define __SHA__
+# define __SSE4_1__ 1
+# define __SHA__    1
 # include <immintrin.h>
  static void SHA1Transform(unsigned int32 state[5], unsigned char buffer[64]);
  // static void __attribute__ ((__target__ ("sha,sse4.1"))) SHA1Transform_x86(unsigned int32 state[5], unsigned char buffer[64]);
@@ -99,15 +99,33 @@
 
 /* blk0() and blk() perform the initial expand. */
 /* I got the idea of expanding during the round function from SSLeay */
+// nibbles 76543210 -> rol,24 -> 10765432 -> & 0xFF00FF00 -> 10xx54xx
+// nibbles 76543210 -> rol,8  -> 54321076 -> & 0x00FF00FF -> xx32xx76
+// oring: 10325476 (i.e. the four bytes in reversed order)
 #ifdef LITTLE_ENDIAN
-# define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
-    |(rol(block->l[i],8)&0x00FF00FF))
+// cg: does not make any difference (actually, slightly slower, as it seems)...
+# if 0 && (defined(__i386__) || defined(__x86__) || defined(__x86_64__)) && (defined(__GNUC__) || defined(__CLANG__))
+   static inline u_int32_t __bswap(u_int32_t v) {
+	register u_int32_t l = v;
+	__asm__ __volatile__("bswap %0" : "=r" (l) : "0" (l));
+	return l;
+   }
+#  define blk0(i) \
+    (block->l[i] = __bswap(block->l[i]))
+# else
+#  define blk0(i) \
+    (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
+		  |(rol(block->l[i],8)&0x00FF00FF))
+# endif
 #else
 # define blk0(i) block->l[i]
 #endif
 
-#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \
-    ^block->l[(i+2)&15]^block->l[i&15],1))
+#define blk(i) (block->l[i&15] = \
+    rol(block->l[(i+13)&15] \
+	^ block->l[(i+8)&15] \
+	^ block->l[(i+2)&15] \
+	^ block->l[i&15],1))
 
 /* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
 #define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30);
@@ -180,6 +198,9 @@
 
 #if defined(__CLANG__)
 
+//
+// a specially tuned version
+//
 static void
 SHA1Transform_x86 (unsigned int32 state[5], unsigned char buffer[64])
 {
@@ -388,7 +409,13 @@
     unsigned char buffer[64];
 #endif
 {
-    SHA1Transform_generic(state, buffer);
+    extern unsigned char __cpu_hasSSE4_1_and_SHA;
+
+    if (__cpu_hasSSE4_1_and_SHA) {
+	SHA1Transform_x86(state, buffer);
+    } else {
+	SHA1Transform_generic(state, buffer);
+    }
 }
 
 #endif
@@ -669,31 +696,31 @@
     CPU                             cc          algo        mb/sec
 
     MAC (2010 macbook; 2.7Ghz Duo)  clang -O2   slow        128.5
-                                                            132
+							    132
     MAC (2012 macbook; 2.6Ghz I7)   clang -O2               190
 
 
-    chunk size 10: 86.70 Mb/s
-    chunk size 50: 227.07 Mb/s
-    chunk size 1000: 405.82 Mb/s
-    chunk size 50000: 421.98 Mb/s
+    chunk size 10:     86.70 Mb/s   90.83
+    chunk size 50:    227.07 Mb/s  238.42
+    chunk size 1000:  405.82 Mb/s  414.64
+    chunk size 50000: 421.98 Mb/s  447.73
 
 
   timing throughput:
-                                                                [exBegin]
+								[exBegin]
     |hashStream n t|
 
     hashStream := SHA1Stream new.
     n := 1000000.
     t := Time millisecondsToRun:[
-            n timesRepeat:[
-                hashStream nextPutAll:'12345678901234567890123456789012345678901234567890'.
-            ].
-         ].
+	    n timesRepeat:[
+		hashStream nextPutAll:'12345678901234567890123456789012345678901234567890'.
+	    ].
+	 ].
     t := (t / 1000) asFloat.
     Transcript show:t; show:' seconds for '; show:(50*n/1024) asFloat; showCR:' Kb'.
     Transcript show:(n*50/1024 / t); showCR:' Kb/s'
-                                                                [exEnd]
+								[exEnd]
 "
 ! !
 
@@ -840,17 +867,17 @@
 %{
    unsigned char value;
    OBJ _hashContext = __INST(hashContext);
-   
+
    // fetch first; check below
    value = __intVal(anInteger);
    if (__isSmallInteger(anInteger) && value <= 255
        && __isByteArray(_hashContext)
        && __byteArraySize(_hashContext) == sizeof(SHA1_CTX)
    ) {
-        SHA1_CTX *ctx = (SHA1_CTX *)__byteArrayVal(_hashContext);
+	SHA1_CTX *ctx = (SHA1_CTX *)__byteArrayVal(_hashContext);
 
-        SHA1Update(ctx, &value, 1);
-        RETURN(self);
+	SHA1Update(ctx, &value, 1);
+	RETURN(self);
     }
 bad: ;
 %}.
@@ -870,7 +897,7 @@
     INT len, offs;
     INT objSize;
     int nInstBytes;
-    char *extPtr;
+    unsigned char *extPtr;
     OBJ _hashContext = __INST(hashContext);
 
     // convert here; check later
@@ -881,61 +908,61 @@
        && __byteArraySize(_hashContext) == sizeof(SHA1_CTX)
        && __bothSmallInteger(count, start)
     ) {
-        SHA1_CTX *ctx = (SHA1_CTX *)__byteArrayVal(_hashContext);
+	SHA1_CTX *ctx = (SHA1_CTX *)__byteArrayVal(_hashContext);
 
-        if (__isByteArrayLike(anObject)) {
-            extPtr = (char *)__byteArrayVal(anObject);
-            objSize = __byteArraySize(anObject);
-        } else if (__isStringLike(anObject)) { 
-            extPtr = (char *)__stringVal(anObject);
-            objSize = __stringSize(anObject);
-        } else if (__isExternalBytesLike(anObject)) {
-            OBJ sz;
+	if (__isByteArrayLike(anObject)) {
+	    extPtr = (unsigned char *)__byteArrayVal(anObject);
+	    objSize = __byteArraySize(anObject);
+	} else if (__isStringLike(anObject)) {
+	    extPtr = (unsigned char *)__stringVal(anObject);
+	    objSize = __stringSize(anObject);
+	} else if (__isExternalBytesLike(anObject)) {
+	    OBJ sz;
 
-            nInstBytes = 0;
-            extPtr = (char *)__externalBytesAddress(anObject);
-            if (extPtr == NULL) goto bad;
-            sz = __externalBytesSize(anObject);
-            objSize = __intVal(sz);
-            if (!__isSmallInteger(sz)) {
-                objSize = 0; /* unknown */
-            }
-        } else {
-            OBJ oClass = __Class(anObject);
-            int nInstVars = __intVal(__ClassInstPtr(oClass)->c_ninstvars);
+	    nInstBytes = 0;
+	    extPtr = (unsigned char *)__externalBytesAddress(anObject);
+	    if (extPtr == NULL) goto bad;
+	    sz = __externalBytesSize(anObject);
+	    objSize = __intVal(sz);
+	    if (!__isSmallInteger(sz)) {
+		objSize = 0; /* unknown */
+	    }
+	} else {
+	    OBJ oClass = __Class(anObject);
+	    int nInstVars = __intVal(__ClassInstPtr(oClass)->c_ninstvars);
 
-            nInstBytes = OHDR_SIZE + __OBJS2BYTES__(nInstVars);
-            switch (__intVal(__ClassInstPtr(oClass)->c_flags) & ARRAYMASK) {
-                case BYTEARRAY:
-                case WORDARRAY:
-                case LONGARRAY:
-                case SWORDARRAY:
-                case SLONGARRAY:
-                case FLOATARRAY:
-                    break;
-                case DOUBLEARRAY:
+	    nInstBytes = OHDR_SIZE + __OBJS2BYTES__(nInstVars);
+	    switch (__intVal(__ClassInstPtr(oClass)->c_flags) & ARRAYMASK) {
+		case BYTEARRAY:
+		case WORDARRAY:
+		case LONGARRAY:
+		case SWORDARRAY:
+		case SLONGARRAY:
+		case FLOATARRAY:
+		    break;
+		case DOUBLEARRAY:
 #ifdef __NEED_DOUBLE_ALIGN
-                    nInstBytes = (nInstBytes-1+__DOUBLE_ALIGN) &~ (__DOUBLE_ALIGN-1);
+		    nInstBytes = (nInstBytes-1+__DOUBLE_ALIGN) &~ (__DOUBLE_ALIGN-1);
 #endif
-                    break;
-                case LONGLONGARRAY:
-                case SLONGLONGARRAY:
+		    break;
+		case LONGLONGARRAY:
+		case SLONGLONGARRAY:
 #ifdef __NEED_LONGLONG_ALIGN
-                    nInstBytes = (nInstBytes-1+__LONGLONG_ALIGN) &~ (__LONGLONG_ALIGN-1);
+		    nInstBytes = (nInstBytes-1+__LONGLONG_ALIGN) &~ (__LONGLONG_ALIGN-1);
 #endif
-                    break;
-                default:
-                    goto bad;
-            }
-            // nInstBytes is the number of bytes occupied by pointer instance variables
-            // subtract from size and add to byte-pointer
-            objSize = __Size(anObject) - nInstBytes;
-            extPtr = (char *)anObject + nInstBytes;
-        }
-        if ((offs >= 0) && (len >= 0) && (objSize >= (len + offs))) {
-            SHA1Update(ctx, extPtr+offs, (unsigned int)len);
-            RETURN (count);
-        }
+		    break;
+		default:
+		    goto bad;
+	    }
+	    // nInstBytes is the number of bytes occupied by the object header and the pointer instance variables;
+	    // subtract it from the object's size and add it to the byte pointer
+	    objSize = __Size(anObject) - nInstBytes;
+	    extPtr = (unsigned char *)anObject + nInstBytes;
+	}
+	if ((offs >= 0) && (len >= 0) && (objSize >= (len + offs))) {
+	    SHA1Update(ctx, extPtr+offs, (unsigned int)len);
+	    RETURN (count);
+	}
     }
 bad: ;
 %}.