String.st
changeset 19715 58d5215c623e
parent 19579 9a62e43aedf5
child 19733 34ff21b5c668
--- a/String.st	Fri May 06 04:22:40 2016 +0200
+++ b/String.st	Fri May 06 04:52:00 2016 +0200
@@ -3978,116 +3978,283 @@
      && (__isSmallInteger(startIndex))
      && (__intVal(startIndex) > 0)
     ) {
-	unsigned char *y = __stringVal(self);
-	unsigned char *x = __stringVal(aSubString);
-	int m = __stringSize(aSubString);
-	int n = __stringSize(self);
+        unsigned char *y = __stringVal(self);
+        unsigned char *x = __stringVal(aSubString);
+        int m = __stringSize(aSubString);
+        int n = __stringSize(self);
 #       define MAX_PATTERN_SIZE 128
 #       define XSIZE 256
 #       define ASIZE 256
 #       define MAX(a,b) (a>b ? a : b)
 
-	if (m == 0) {
+        if (m == 0) {
 #if 1
-	    /* empty string does not match */
-	    RETURN(__mkSmallInteger(0));
+            /* empty string does not match */
+            RETURN(__mkSmallInteger(0));
 #else
-	    /* empty string matches */
-	    RETURN(startIndex);
+            /* empty string matches */
+            RETURN(startIndex);
 #endif
-	}
-	if (m <= XSIZE) {
-	    int i, j;
-	    static int lastPatternSize = 0;
-	    static char lastPattern[MAX_PATTERN_SIZE+1] = { 0 };
-	    static int bmGs[XSIZE+1], bmBc[ASIZE];
+        }
+        if (m <= XSIZE) {
+            int i, j;
+            static int lastPatternSize = 0;
+            static char lastPattern[MAX_PATTERN_SIZE+1] = { 0 };
+            static int bmGs[XSIZE+1], bmBc[ASIZE];
 
 #           define preBmBc(x, m, bmBc) {          \
-	       int i;                             \
-						  \
-	       for (i = 0; i < ASIZE; ++i)        \
-		  bmBc[i] = m;                    \
-	       for (i = 0; i < m - 1; ++i)        \
-		  bmBc[x[i]] = m - i - 1;         \
-	    }
+               int i;                             \
+                                                  \
+               for (i = 0; i < ASIZE; ++i)        \
+                  bmBc[i] = m;                    \
+               for (i = 0; i < m - 1; ++i)        \
+                  bmBc[x[i]] = m - i - 1;         \
+            }
 
 #           define suffixes(x, m, suff) {                       \
-	       int f, g, i;                                     \
-								\
-	       suff[m - 1] = m;                                 \
-	       g = m - 1;                                       \
-	       for (i = m - 2; i >= 0; --i) {                   \
-		  if (i > g && suff[i + m - 1 - f] < i - g)     \
-		     suff[i] = suff[i + m - 1 - f];             \
-		  else {                                        \
-		     if (i < g)                                 \
-			g = i;                                  \
-		     f = i;                                     \
-		     while (g >= 0 && x[g] == x[g + m - 1 - f]) \
-			--g;                                    \
-		     suff[i] = f - g;                           \
-		  }                                             \
-	       }                                                \
-	    }
+               int f, g, i;                                     \
+                                                                \
+               suff[m - 1] = m;                                 \
+               g = m - 1;                                       \
+               for (i = m - 2; i >= 0; --i) {                   \
+                  if (i > g && suff[i + m - 1 - f] < i - g)     \
+                     suff[i] = suff[i + m - 1 - f];             \
+                  else {                                        \
+                     if (i < g)                                 \
+                        g = i;                                  \
+                     f = i;                                     \
+                     while (g >= 0 && x[g] == x[g + m - 1 - f]) \
+                        --g;                                    \
+                     suff[i] = f - g;                           \
+                  }                                             \
+               }                                                \
+            }
 
 #           define preBmGs(x, m, bmGs) {                        \
-	       int i, j, suff[XSIZE];                           \
-								\
-	       suffixes(x, m, suff);                            \
-								\
-	       for (i = 0; i < m; ++i)                          \
-		  bmGs[i] = m;                                  \
-	       j = 0;                                           \
-	       for (i = m - 1; i >= 0; --i)                     \
-		  if (suff[i] == i + 1)                         \
-		     for (; j < m - 1 - i; ++j)                 \
-			if (bmGs[j] == m)                       \
-			   bmGs[j] = m - 1 - i;                 \
-	       for (i = 0; i <= m - 2; ++i)                     \
-		  bmGs[m - 1 - suff[i]] = m - 1 - i;            \
-	    }
-
-	    /* tables only depend on pattern; so we can cache them in case the same string is searched again */
-	    if ((m == lastPatternSize)
-	     && (strcmp(lastPattern, x) == 0)) {
-		/* tables are still valid */
-		// printf("valid: \"%s\"\n", lastPattern);
-	    } else {
-		/* Preprocessing */
-		// printf("compute: \"%s\"\n", lastPattern);
-		preBmGs(x, m, bmGs);
-		preBmBc(x, m, bmBc);
-		if (m <= MAX_PATTERN_SIZE) {
-		    // printf("cache for: \"%s\"\n", lastPattern);
-		    strcpy(lastPattern, x);
-		    lastPatternSize = m;
-		}
-	    }
-
-	    /* Searching */
-	    j = __intVal(startIndex) - 1;
-	    while (j <= n - m) {
-	       for (i = m - 1; i >= 0 && x[i] == y[i + j]; --i);
-	       if (i < 0) {
-		  RETURN (__mkSmallInteger(j+1));
-		  j += bmGs[0];
-	       } else {
-		  int s1 = bmGs[i];
-		  int s2 = bmBc[y[i + j]] - m + 1 + i;
-		  j += MAX(s1, s2);
-	       }
-	    }
-	    notFound = true;
-	}
+               int i, j, suff[XSIZE];                           \
+                                                                \
+               suffixes(x, m, suff);                            \
+                                                                \
+               for (i = 0; i < m; ++i)                          \
+                  bmGs[i] = m;                                  \
+               j = 0;                                           \
+               for (i = m - 1; i >= 0; --i)                     \
+                  if (suff[i] == i + 1)                         \
+                     for (; j < m - 1 - i; ++j)                 \
+                        if (bmGs[j] == m)                       \
+                           bmGs[j] = m - 1 - i;                 \
+               for (i = 0; i <= m - 2; ++i)                     \
+                  bmGs[m - 1 - suff[i]] = m - 1 - i;            \
+            }
+
+            /* tables only depend on pattern; so we can cache them in case the same string is searched again */
+            if ((m == lastPatternSize)
+             && (strcmp(lastPattern, x) == 0)) {
+                /* tables are still valid */
+                // printf("valid: \"%s\"\n", lastPattern);
+            } else {
+                /* Preprocessing */
+                // printf("compute: \"%s\"\n", lastPattern);
+                preBmGs(x, m, bmGs);
+                preBmBc(x, m, bmBc);
+                if (m <= MAX_PATTERN_SIZE) {
+                    // printf("cache for: \"%s\"\n", lastPattern);
+                    strcpy(lastPattern, x);
+                    lastPatternSize = m;
+                }
+            }
+
+            /* Searching */
+            j = __intVal(startIndex) - 1;
+            while (j <= n - m) {
+               for (i = m - 1; i >= 0 && x[i] == y[i + j]; --i);
+               if (i < 0) {
+                  RETURN (__mkSmallInteger(j+1));
+                  j += bmGs[0];
+               } else {
+                  int s1 = bmGs[i];
+                  int s2 = bmBc[y[i + j]] - m + 1 + i;
+                  j += MAX(s1, s2);
+               }
+            }
+            notFound = true;
+        }
     }
 #endif /* ! __SCHTEAM__ */
 %}.
     notFound == true ifTrue:[
-	^ exceptionValue value.
+        ^ exceptionValue value.
+    ].
+    ^ self slowIndexOfSubCollection:aSubString startingAt:startIndex ifAbsent:exceptionValue caseSensitive:caseSensitive
+
+    "Modified: / 05-08-2012 / 12:27:31 / cg"
+!
+
+slowIndexOfSubCollection:aSubString startingAt:startIndex ifAbsent:exceptionValue caseSensitive:caseSensitive
+    "naive search fallback (non-BM).
+     Use this for short searchStrings (<= 2)
+     or for caseSensitive searches"
+
+    |notFound|
+
+%{
+#ifndef __SCHTEAM__
+    if (__qIsStringLike(self)
+     && __isStringLike(aSubString)
+     && (__isSmallInteger(startIndex))
+     && (__intVal(startIndex) > 0)
+    ) {
+        unsigned char *_pSelf = __stringVal(self);
+        unsigned char *_substring = __stringVal(aSubString);
+        unsigned char *_pSelfI, *_pSelfMax;
+        int _lenSelf = __stringSize(self);
+        int _lenSubstring = __stringSize(aSubString);
+        int _idx0Max = _lenSelf - _lenSubstring;
+        unsigned char _first;
+        unsigned char _ucFirst;
+        unsigned char _lcFirst;
+        unsigned char _oppositeCaseFirst;
+        int i;
+        
+        if (_lenSubstring == 0) {
+#if 1
+            /* empty string does not match */
+            notFound = true;
+            goto getOutOfHere;
+#else
+            /* empty string matches */
+            RETURN(startIndex);
+#endif
+        }
+        
+        // searched string's length > string
+        if (_idx0Max < 0) {
+            notFound = true;
+            goto getOutOfHere;
+        }
+        
+        _first = _lcFirst = _substring[0];
+        if (((_first >= 'A') && (_first <= 'Z'))
+         || ((_first >= 0xC0) && (_first <= 0xDE))) {
+            _ucFirst = _first;
+            _lcFirst = _oppositeCaseFirst = _first - 'A' + 'a';
+        } else {
+            if (((_first >= 'a') && (_first <= 'z'))
+             || ((_first >= 0xE0) && (_first <= 0xFE))) {
+                _lcFirst = _first;
+                _ucFirst = _oppositeCaseFirst = _first - 'a' + 'A';
+            }
+        }
+        
+        // idx:
+        // 0123456789
+        
+        // 1234567890 - lenSelf = 10
+        // abc        - lenSubstring = 3
+        //            - idx0Max = 7 (last legal startIndex)
+        
+        i = __intVal(startIndex) - 1;
+        _pSelfI = _pSelf + i;
+        _pSelfMax = _pSelf + _idx0Max;
+        
+        for (; _pSelfI <= _pSelfMax; _pSelfI++) {
+            int j;
+            unsigned char _selfChar;
+            
+            // find the first char
+            _selfChar = _pSelfI[0];
+            if (_selfChar != _first) {
+                if (caseSensitive == true) continue;
+                if (_selfChar != _oppositeCaseFirst) {
+searchNext: ;
+                    continue;
+                }    
+            }
+            
+            // first char matches
+            // compare rest
+            for (j=1; j<_lenSubstring; j++) {
+                char _selfChar, _subChar, _lcSubChar, _ucSubChar;
+                
+                _subChar = _substring[j];
+                _selfChar = _pSelfI[j];
+                
+                if (_selfChar == _subChar) continue;
+                if (caseSensitive == true) goto searchNext;
+
+                _lcSubChar = _subChar;
+                if (((_lcSubChar >= 'A') && (_lcSubChar <= 'Z'))
+                 || ((_lcSubChar >= 0xC0) && (_lcSubChar <= 0xDE))) {
+                    _lcSubChar = _subChar - 'A' + 'a';
+                    if (_selfChar != _lcSubChar) goto searchNext;
+                } else {
+                    if (((_lcSubChar >= 'a') && (_lcSubChar <= 'z'))
+                     || ((_lcSubChar >= 0xE0) && (_lcSubChar <= 0xFE))) {
+                        _ucSubChar = _subChar - 'a' + 'A';
+                        if (_selfChar != _ucSubChar) goto searchNext;
+                    } else {
+                        goto searchNext;
+                    }
+                }
+            }
+            // if we arrive here, we have a match at i
+            RETURN( __mkSmallInteger( _pSelfI - _pSelf + 1 ) );
+        }
+        notFound = true;
+    }
+    
+    getOutOfHere: ;
+#endif /* ! __SCHTEAM__ */
+%}.
+    "/ arrive here if either not found, or invalid arguments
+    notFound == true ifTrue:[
+        ^ exceptionValue value.
     ].
     ^ super indexOfSubCollection:aSubString startingAt:startIndex ifAbsent:exceptionValue caseSensitive:caseSensitive
 
-    "Modified: / 05-08-2012 / 12:27:31 / cg"
+    "
+     'abcdefg' slowIndexOfSubCollection:'abc' startingAt:1 ifAbsent:nil caseSensitive:false
+     'abcdefg' slowIndexOfSubCollection:'bcd' startingAt:1 ifAbsent:nil caseSensitive:false
+     'abcdefg' slowIndexOfSubCollection:'cde' startingAt:1 ifAbsent:nil caseSensitive:false
+     'abcabcg' slowIndexOfSubCollection:'abc' startingAt:2 ifAbsent:nil caseSensitive:false
+
+     'ABCDEFG' slowIndexOfSubCollection:'abc' startingAt:1 ifAbsent:nil caseSensitive:false
+     'ABCDEFG' slowIndexOfSubCollection:'Abc' startingAt:1 ifAbsent:nil caseSensitive:false
+     'ABCDEFG' slowIndexOfSubCollection:'aBC' startingAt:1 ifAbsent:nil caseSensitive:false
+     'ABCDEFG' slowIndexOfSubCollection:'ABC' startingAt:1 ifAbsent:nil caseSensitive:false
+
+     'ABCDEFG' slowIndexOfSubCollection:'a' startingAt:1 ifAbsent:nil caseSensitive:false
+     'ABCDEFG' slowIndexOfSubCollection:'A' startingAt:1 ifAbsent:nil caseSensitive:false
+
+     'ABCDEFG' slowIndexOfSubCollection:'bcd' startingAt:1 ifAbsent:nil caseSensitive:false
+     'ABCDEFG' slowIndexOfSubCollection:'cde' startingAt:1 ifAbsent:nil caseSensitive:false
+     'ABCABCG' slowIndexOfSubCollection:'abc' startingAt:2 ifAbsent:nil caseSensitive:false
+     
+     '1234567890' slowIndexOfSubCollection:'abc' startingAt:1 ifAbsent:nil caseSensitive:false
+     '1234567890' slowIndexOfSubCollection:'123' startingAt:1 ifAbsent:nil caseSensitive:false
+     '1234567890' slowIndexOfSubCollection:'123' startingAt:1 ifAbsent:nil caseSensitive:true
+     '1234567890' slowIndexOfSubCollection:'234' startingAt:1 ifAbsent:nil caseSensitive:true
+     '1234567890' slowIndexOfSubCollection:'345' startingAt:1 ifAbsent:nil caseSensitive:true
+     '1231231231' slowIndexOfSubCollection:'123' startingAt:1 ifAbsent:nil caseSensitive:true
+     '1231231231' slowIndexOfSubCollection:'123' startingAt:2 ifAbsent:nil caseSensitive:true
+     '1231231231' slowIndexOfSubCollection:'123' startingAt:3 ifAbsent:nil caseSensitive:true
+     '1231231231' slowIndexOfSubCollection:'123' startingAt:4 ifAbsent:nil caseSensitive:true
+     '1231231231' slowIndexOfSubCollection:'123' startingAt:5 ifAbsent:nil caseSensitive:true
+     '1231231231' slowIndexOfSubCollection:'123' startingAt:6 ifAbsent:nil caseSensitive:true
+     '1231231231' slowIndexOfSubCollection:'123' startingAt:7 ifAbsent:nil caseSensitive:true
+     '1231231231' slowIndexOfSubCollection:'123' startingAt:8 ifAbsent:nil caseSensitive:true
+     '1231231231' slowIndexOfSubCollection:'123' startingAt:9 ifAbsent:nil caseSensitive:true
+     '1231231231' slowIndexOfSubCollection:'123' startingAt:10 ifAbsent:nil caseSensitive:true
+     '1231231231' slowIndexOfSubCollection:'123' startingAt:11 ifAbsent:nil caseSensitive:true
+     '1231231231' slowIndexOfSubCollection:'231' startingAt:1 ifAbsent:nil caseSensitive:true
+     '1231231231' slowIndexOfSubCollection:'231' startingAt:3 ifAbsent:nil caseSensitive:true
+     '1231231231' slowIndexOfSubCollection:'231' startingAt:5 ifAbsent:nil caseSensitive:true
+     '1231231231' slowIndexOfSubCollection:'231' startingAt:6 ifAbsent:nil caseSensitive:true
+     '1231231231' slowIndexOfSubCollection:'231' startingAt:8 ifAbsent:nil caseSensitive:true
+     '1231231231' slowIndexOfSubCollection:'231' startingAt:9 ifAbsent:nil caseSensitive:true
+     '1231231231' slowIndexOfSubCollection:'2310' startingAt:6 ifAbsent:nil caseSensitive:true
+     '1231231231' slowIndexOfSubCollection:'2310' startingAt:8 ifAbsent:nil caseSensitive:true
+    " 
 ! !
 
 !String methodsFor:'testing'!