reverse - 50% speedup on i386; 30% on all others
author Claus Gittinger <cg@exept.de>
Tue, 19 Oct 1999 22:01:44 +0200
changeset 4924 9bec9689f646
parent 4923 1cacdcfc82c9
child 4925 e6373ec0ff8a
reverse - 50% speedup on i386; 30% on all others
ByteArray.st
--- a/ByteArray.st	Tue Oct 19 19:15:33 1999 +0200
+++ b/ByteArray.st	Tue Oct 19 22:01:44 1999 +0200
@@ -1558,22 +1558,22 @@
 
 #define OP_LOOP(OP) \
     while (count > 16) {                                             \
-        ((unsigned int *)dstP)[0] OP ((unsigned int *)srcP)[0];      \
-        ((unsigned int *)dstP)[1] OP ((unsigned int *)srcP)[1];      \
-        ((unsigned int *)dstP)[2] OP ((unsigned int *)srcP)[2];      \
-        ((unsigned int *)dstP)[3] OP ((unsigned int *)srcP)[3];      \
+        ((unsigned int *)dstP)[0] OP (((unsigned int *)srcP)[0]);    \
+        ((unsigned int *)dstP)[1] OP (((unsigned int *)srcP)[1]);    \
+        ((unsigned int *)dstP)[2] OP (((unsigned int *)srcP)[2]);    \
+        ((unsigned int *)dstP)[3] OP (((unsigned int *)srcP)[3]);    \
         srcP += 16;                                                  \
         dstP += 16;                                                  \
         count -= 16;                                                 \
     }                                                                \
     while (count > 4) {                                              \
-        ((unsigned int *)dstP)[0] OP ((unsigned int *)srcP)[0];      \
+        ((unsigned int *)dstP)[0] OP (((unsigned int *)srcP)[0]);    \
         srcP += 4;                                                   \
         dstP += 4;                                                   \
         count -= 4;                                                  \
     }                                                                \
     while (count > 0) {                                              \
-        *dstP OP *srcP;                                              \
+        *dstP OP (*srcP);                                            \
         srcP++;                                                      \
         dstP++;                                                      \
         count--;                                                     \
@@ -1584,18 +1584,34 @@
                 OP_LOOP( ^= )
                 RETURN (self);
             }
+            if (ruleSymbol == @symbol(bitXorNot:)) {
+                OP_LOOP( ^=~ )
+                RETURN (self);
+            }
             if (ruleSymbol == @symbol(bitAnd:)) {
                 OP_LOOP( &= )
                 RETURN (self);
             }
+            if (ruleSymbol == @symbol(bitAndNot:)) {
+                OP_LOOP( &=~ )
+                RETURN (self);
+            }
             if (ruleSymbol == @symbol(bitOr:)) {
                 OP_LOOP( |= )
                 RETURN (self);
             }
+            if (ruleSymbol == @symbol(bitOrNot:)) {
+                OP_LOOP( |=~ )
+                RETURN (self);
+            }
             if (ruleSymbol == @symbol(copy)) {
                 OP_LOOP( = )
                 RETURN (self);
             }
+            if (ruleSymbol == @symbol(copyNot)) {
+                OP_LOOP( =~ )
+                RETURN (self);
+            }
             if (ruleSymbol == @symbol(+)) {
                 OP_LOOP( += )
                 RETURN (self);
@@ -1631,11 +1647,12 @@
         bitBlitBytesFrom:1 to:4 with:#[1 1 1 1 1 1 1 1] startingAt:1 rule:#+
      #[1 2 3 4 5 6 7 8]
         bitBlitBytesFrom:1 to:4 with:#[1 1 1 1 2 2 2 2] startingAt:5 rule:#+
+     #[1 2 3 4 5 6 7 8]
+        bitBlitBytesFrom:1 to:4 with:#[1 1 1 1 2 2 2 2] startingAt:5 rule:#copyNot
 
      #[1 2 3 4 5 6 7 8]
         bitBlitBytesFrom:1 to:8 with:(1 to:8) startingAt:1 rule:#+
     "
-
 !
 
 bitOrBytesFrom:dstStart to:dstEnd with:sourceBytes startingAt:sourceStart 
@@ -2090,22 +2107,146 @@
     OBJ cls;
 
     if (__qClass(self) == @global(ByteArray)) {
-	cnt = __byteArraySize(self);
-	p1 = __ByteArrayInstPtr(self)->ba_element;
-	p2 = p1 + cnt - 1;
-	while (p1 < p2) {
-	    t = *p1;
-	    *p1++ = *p2;
-	    *p2-- = t;
-	}
-	RETURN ( self );
+        cnt = __byteArraySize(self);
+        p1 = __ByteArrayInstPtr(self)->ba_element;
+        p2 = p1 + cnt - 1;
+#if defined(i386) && defined(__GNUC__)
+        /*
+         * can we use the bswap instruction ?
+         * (only if the bytes can be handled as whole 4-byte words)
+         */
+        if (__cpuType__ >= 586) {
+            if ((cnt & 3) == 0) {
+                unsigned int *ip1, *ip2;
+
+                ip1 = (unsigned int *)p1;           /* first word */
+                ip2 = (unsigned int *)(p2 - 3);     /* last word */
+   /* BSWAP(v): the 4 bytes of v in reversed order */
+#  define BSWAP(__val) \
+   ({                              \
+       int __rslt;                 \
+       asm("bswap %0   \n          \
+           "  : "=r" (__rslt)      \
+              : "0" ((int)(__val)) \
+          );                       \
+       __rslt;                     \
+    })
+                /* unrolled: 4 exchanges per round; needs >= 8 words left
+                 * (ip1..ip2 inclusive) so the two groups never overlap */
+                while ((ip2 - ip1) >= 7) {
+                    int t1, t2;
+
+                    t1 = ip1[0];
+                    t2 = ip2[0];
+                    ip2[0] = BSWAP(t1);
+                    ip1[0] = BSWAP(t2);
+
+                    t1 = ip1[1];
+                    t2 = ip2[-1];
+                    ip2[-1] = BSWAP(t1);
+                    ip1[1] = BSWAP(t2);
+
+                    t1 = ip1[2];
+                    t2 = ip2[-2];
+                    ip2[-2] = BSWAP(t1);
+                    ip1[2] = BSWAP(t2);
+
+                    t1 = ip1[3];
+                    t2 = ip2[-3];
+                    ip2[-3] = BSWAP(t1);
+                    ip1[3] = BSWAP(t2);
+
+                    ip1 += 4;
+                    ip2 -= 4;
+                }
+                /* remaining word pairs, one exchange per round */
+                while (ip1 < ip2) {
+                    int t;
+
+                    t = BSWAP(*ip1);
+                    *ip1++ = BSWAP(*ip2);
+                    *ip2-- = t;
+                }
+                /* odd number of words: the middle one is byte-swapped in place */
+                if (ip1 == ip2) {
+                    int t;
+                    t = *ip1;
+                    t = BSWAP(t);
+                    *ip1 = t;
+                }
+                RETURN ( self );
+            }
+        }
+#endif /* i386 && __GNUC__ */
+        /* unrolled: 4 exchanges per round; needs >= 8 bytes left
+         * (p1..p2 inclusive) so the two groups never overlap */
+        while ((p2 - p1) >= 7) {
+            t = p1[0];
+            p1[0] = p2[0];
+            p2[0] = t;
+
+            t = p1[1];
+            p1[1] = p2[-1];
+            p2[-1] = t;
+
+            t = p1[2];
+            p1[2] = p2[-2];
+            p2[-2] = t;
+
+            t = p1[3];
+            p1[3] = p2[-3];
+            p2[-3] = t;
+
+            p1 += 4;
+            p2 -= 4;
+        }
+        /* remaining bytes, one exchange per round */
+        while (p1 < p2) {
+            t = *p1;
+            *p1++ = *p2;
+            *p2-- = t;
+        }
+        RETURN ( self );
     }
 %}.
     ^ super reverse
 
     "
-     #[1 2 3 4 5] reverse
-     #[1 2 3 4] reverse
+     #[1 2 3 4 5] reverse      
+     #[1 2 3 4] reverse       
+     #[1 2 3 4 5 6] reverse    
+     #[1 2 3 4 5 6 7] reverse    
+     #[1 2 3 4 5 6 7 8] reverse        
+     #[1 2 3 4 5 6 7 8 9] reverse    
+     #[1 2 3 4 5 6 7 8 9 10] reverse    
+     #[1 2 3 4 5 6 7 8 9 10 11 12] reverse    
+     #[1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] reverse    
+     #[1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20] reverse    
+     (1 to:255) asByteArray reverse   
+
+     1 to:1024 do:[:i|
+        |bytes rBytes|
+
+        bytes := ((1 to:i) asArray collect:[:i | i bitAnd:255]) asByteArray.
+        rBytes := bytes copy.
+        rBytes reverse reverse ~= bytes ifTrue:[
+            self halt
+        ]
+     ].
+
+     Time millisecondsToRun:[
+        10000000 timesRepeat:[
+            #[1 2 3 4 5 6 7 8] reverse 
+        ]
+     ]  
+
+     |b|
+     b := (0 to:255) asByteArray.
+     Time millisecondsToRun:[
+        10000000 timesRepeat:[
+            b reverse 
+        ]
+     ]  
     "
 !
 
@@ -2155,7 +2296,6 @@
      written as a primitive for speed on image grabbing 
      (if display order is different)"
 
-"/    self swapLongsFrom:1 to:(self size + 1).
 %{  /* NOCONTEXT */
 
     REGISTER unsigned char *p;
@@ -2163,20 +2303,20 @@
     REGISTER unsigned t;
 
     if (__qClass(self) == @global(ByteArray)) {
-	cnt = __byteArraySize(self);
-	cnt = cnt & ~3; /* make it even */
-	p = __ByteArrayInstPtr(self)->ba_element;
-	while (cnt > 0) {
-	    t = p[0];
-	    p[0] = p[3];
-	    p[3] = t;
-	    t = p[1];
-	    p[1] = p[2];
-	    p[2] = t;
-	    p += 4;
-	    cnt -= 4;
-	}
-	RETURN ( self );
+        cnt = __byteArraySize(self);
+        cnt = cnt & ~3; /* round down to a multiple of 4; 1..3 trailing bytes stay untouched */
+        p = __ByteArrayInstPtr(self)->ba_element;
+        while (cnt > 0) {
+            t = p[0];           /* exchange the outer bytes of the group: 0 <-> 3 */
+            p[0] = p[3];
+            p[3] = t;
+            t = p[1];           /* exchange the inner bytes of the group: 1 <-> 2 */
+            p[1] = p[2];
+            p[2] = t;
+            p += 4;             /* next 4-byte group */
+            cnt -= 4;
+        }
+        RETURN ( self );
     }
 %}.
     ^ super swapLongs "/ rubbish - there is no one currenly
@@ -2439,15 +2579,30 @@
     int len;
 
     if (__qClass(self) == @global(ByteArray)) {
-	max = 0;
-	index = 0;
-	len = __qSize(self) - OHDR_SIZE;
-	cp = &(__ByteArrayInstPtr(self)->ba_element[0]);
-	while (++index <= len) {
-	    if (*cp > max) max = *cp;
-	    cp++;
-	}
-	RETURN ( __MKSMALLINT(max) );
+        max = 0;        /* an empty receiver answers 0 */
+        index = 0;
+        len = __qSize(self) - OHDR_SIZE;
+        cp = &(__ByteArrayInstPtr(self)->ba_element[0]);
+
+#if defined(i386) && defined(__GNUC__)
+        {
+            extern int __cpu_hasMMX;
+            /*
+             * can we use mmx instructions ?
+             */
+            if ((__cpuType__ >= 586) && __cpu_hasMMX) {
+                /* no mmx variant yet - the byte loop below does the work */
+            }
+        }
+#endif
+        while (++index <= len) {
+            unsigned int byte;
+
+            byte = *cp;
+            cp++;
+            if (byte > max) max = byte;     /* keep the largest byte seen so far */
+        }
+        RETURN ( __MKSMALLINT(max) );
     }
 %}.
     ^ super max
@@ -2589,5 +2744,5 @@
 !ByteArray class methodsFor:'documentation'!
 
 version
-    ^ '$Header: /cvs/stx/stx/libbasic/ByteArray.st,v 1.123 1999-10-19 12:51:36 cg Exp $'
+    ^ '$Header: /cvs/stx/stx/libbasic/ByteArray.st,v 1.124 1999-10-19 20:01:44 cg Exp $'
 ! !