--- a/String.st Sat Aug 10 10:20:31 2019 +0200
+++ b/String.st Sat Aug 10 13:32:14 2019 +0200
@@ -5246,130 +5246,222 @@
indexOfSubCollection:aSubString startingAt:startIndex ifAbsent:exceptionValue caseSensitive:caseSensitive
"redefined as primitive for maximum speed (BM).
Compared to the strstr libc function, on my machine,
- BM is faster for caseSensitive compares above around 8.5 searched characters;
- for caseInsensitive compares, strstr is slower than caseInsensitiveIndex.
- (for much longer searched strings, BM is much faster; 5times as fast for 20chars)"
+ BM is faster for caseSensitive compares above around 8.5 searched characters.
+ For much longer searched strings, BM is much faster; 5times as fast for 20chars.
+ For caseInsensitive compares, strstr was found to be slower than caseInsensitiveIndexOf."
|notFound|
- caseSensitive ifFalse:[
- ^ self caseInsensitiveIndexOfSubCollection:aSubString startingAt:startIndex ifAbsent:exceptionValue.
- ].
-
%{ /* STACK:4000 */
#ifndef __SCHTEAM__
if (__qIsStringLike(self)
&& __isStringLike(aSubString)
&& (__isSmallInteger(startIndex))
&& (__intVal(startIndex) > 0)
+ && (caseSensitive==true)
) {
- unsigned char *y = __stringVal(self);
- unsigned char *x = __stringVal(aSubString);
- int m = __stringSize(aSubString);
- int n = __stringSize(self);
-# define MAX_PATTERN_SIZE 128
-# define XSIZE 256
-# define ASIZE 256
-# define MAX(a,b) (a>b ? a : b)
-
- if (m == 0) {
-#if 1
- /* empty string does not match */
- RETURN(__mkSmallInteger(0));
-#else
- /* empty string matches */
- RETURN(startIndex);
-#endif
- }
- if (m <= XSIZE) {
- int i, j;
- static int lastPatternSize = 0;
- static char lastPattern[MAX_PATTERN_SIZE+1] = { 0 };
- static int bmGs[XSIZE+1], bmBc[ASIZE];
-
-# define preBmBc(x, m, bmBc) { \
- int i; \
- \
- for (i = 0; i < ASIZE; ++i) \
- bmBc[i] = m; \
- for (i = 0; i < m - 1; ++i) \
- bmBc[x[i]] = m - i - 1; \
+ unsigned char *haystack = __stringVal(self);
+ unsigned char *needle = __stringVal(aSubString);
+ int srchLen = __stringSize(aSubString);
+ int myLen = __stringSize(self);
+ int srchIdx = __intVal(startIndex) - 1;
+
+ if (srchLen == 0) {
+ #if 1
+ /* empty string does not match */
+ RETURN(__mkSmallInteger(0));
+ #else
+ /* empty string matches */
+ RETURN(startIndex);
+ #endif
+ }
+ // console_fprintf(stderr, "myLen:%d, srchLen:%d, srchIdx:%d\n", myLen,srchLen,srchIdx);
+ if (srchIdx > (myLen - srchLen)) {
+ // console_fprintf(stderr, "srchIdx too large\n");
+ notFound = true;
+ } else {
+ if ( (srchLen > 4) /* && (myLen >= 30) */ ) {
+ #define MAX_PATTERN_SIZE 128
+ #define XSIZE 256
+ #define ASIZE 256
+ #define MAX(a,b) (a>b ? a : b)
+
+ // console_fprintf(stderr, "BM srchLen:%d\n", srchLen);
+ if (srchLen <= MAX_PATTERN_SIZE) {
+ int i;
+ static short lastPatternSize1 = 0;
+ static char lastPattern1[MAX_PATTERN_SIZE+1] = { 0 };
+ static short bmGs1[XSIZE+1], bmBc1[ASIZE];
+ static short lastPatternSize2 = 0;
+ static char lastPattern2[MAX_PATTERN_SIZE+1] = { 0 };
+ static short bmGs2[XSIZE+1], bmBc2[ASIZE];
+ short *bmGsX, *bmBcX;
+ static short flipFlop = 0;
+
+ #define preBmBc(x, m, bmBc) { \
+ int ii; \
+ \
+ for (ii = 0; ii < ASIZE; ++ii) \
+ bmBc[ii] = m; \
+ for (ii = 0; ii < m - 1; ++ii) \
+ bmBc[x[ii]] = m - ii - 1; \
+ }
+
+ #define suffixes(x, m, suff) { \
+ int f, g, i; \
+ \
+ suff[m - 1] = m; \
+ g = m - 1; \
+ for (i = m - 2; i >= 0; --i) { \
+ if (i > g && suff[i + m - 1 - f] < i - g) \
+ suff[i] = suff[i + m - 1 - f]; \
+ else { \
+ if (i < g) \
+ g = i; \
+ f = i; \
+ while (g >= 0 && x[g] == x[g + m - 1 - f]) \
+ --g; \
+ suff[i] = f - g; \
+ } \
+ } \
+ }
+
+ #define preBmGs(x, m, bmGs) { \
+ int i, j, suff[XSIZE]; \
+ \
+ suffixes(x, m, suff); \
+ \
+ for (i = 0; i < m; ++i) \
+ bmGs[i] = m; \
+ j = 0; \
+ for (i = m - 1; i >= 0; --i) \
+ if (suff[i] == i + 1) \
+ for (; j < m - 1 - i; ++j) \
+ if (bmGs[j] == m) \
+ bmGs[j] = m - 1 - i; \
+ for (i = 0; i <= m - 2; ++i) \
+ bmGs[m - 1 - suff[i]] = m - 1 - i; \
+ }
+
+ /* tables only depend on pattern;
+ * so we can cache them in case the same string is searched again
+ * Here, two such tables are used alternatively
+ * to remember the setup for the last two searches.
+ */
+ if ((srchLen == lastPatternSize1) && (strcmp(lastPattern1, needle) == 0)) {
+ /* tables1 still valid */
+ // console_fprintf(stderr, "valid1: \"%s\"\n", lastPattern1);
+ bmGsX = bmGs1; bmBcX = bmBc1;
+ } else {
+ if ((srchLen == lastPatternSize2) && (strcmp(lastPattern2, needle) == 0)) {
+ /* tables1 still valid */
+ // console_fprintf(stderr, "valid2: \"%s\"\n", lastPattern2);
+ bmGsX = bmGs2; bmBcX = bmBc2;
+ } else {
+ if (flipFlop) {
+ // console_fprintf(stderr, "cache in2 for: \"%s\"\n", needle);
+ strcpy(lastPattern2, needle);
+ lastPatternSize2 = srchLen;
+ bmGsX = bmGs2; bmBcX = bmBc2;
+ } else {
+ // console_fprintf(stderr, "cache in1 for: \"%s\"\n", needle);
+ strcpy(lastPattern1, needle);
+ lastPatternSize1 = srchLen;
+ bmGsX = bmGs1; bmBcX = bmBc1;
+ }
+
+ /* Preprocessing */
+ // console_fprintf(stderr, "compute: \"%s\"\n", needle);
+ preBmGs(needle, srchLen, bmGsX);
+ preBmBc(needle, srchLen, bmBcX);
+
+ flipFlop = 1-flipFlop;
+ }
+ }
+
+ /* Searching */
+ {
+ int i;
+
+ // console_fprintf(stderr, "srchIdx:%d; lRest:%d\n", srchIdx, (myLen - srchLen));
+ while (srchIdx <= (myLen - srchLen)) {
+ // console_fprintf(stderr, "srchIdx: %d\n", srchIdx);
+ for (i = srchLen - 1; i >= 0 && needle[i] == haystack[i + srchIdx]; --i);
+ if (i < 0) {
+ RETURN (__mkSmallInteger(srchIdx+1));
+ } else {
+ short s1 = bmGsX[i];
+ short s2 = bmBcX[haystack[i + srchIdx]] - srchLen + 1 + i;
+ srchIdx += MAX(s1, s2);
+ }
+ }
+ }
+ notFound = true;
+ }
+ } else {
+ unsigned char *where;
+
+ switch (srchLen) {
+ case 1:
+ // console_fprintf(stderr, "strchr\n");
+ where = (unsigned char *)strchr(haystack+srchIdx, needle[0]);
+ break;
+ case 2:
+ // console_fprintf(stderr, "strstr2\n");
+ {
+ const unsigned char *h = haystack+srchIdx;
+
+ uint16_t nw = needle[0]<<8 | needle[1], hw = h[0]<<8 | h[1];
+ for (h++; *h && hw != nw; hw = hw<<8 | *++h);
+ where = *h ? (unsigned char *)(h-1) : NULL;
+ }
+ break;
+ case 3:
+ // console_fprintf(stderr, "strstr3\n");
+ {
+ const unsigned char *h = haystack+srchIdx;
+
+ uint32_t nw = needle[0]<<24 | needle[1]<<16 | needle[2]<<8;
+ uint32_t hw = h[0]<<24 | h[1]<<16 | h[2]<<8;
+ for (h+=2; *h && hw != nw; hw = (hw|*++h)<<8);
+ where = *h ? (unsigned char *)(h-2) : NULL;
+ }
+ break;
+ case 4:
+ // console_fprintf(stderr, "strstr4\n");
+ {
+ const unsigned char *h = haystack+srchIdx;
+
+ uint32_t nw = needle[0]<<24 | needle[1]<<16 | needle[2]<<8 | needle[3];
+ uint32_t hw = h[0]<<24 | h[1]<<16 | h[2]<<8 | h[3];
+ for (h+=3; *h && hw != nw; hw = hw<<8 | *++h);
+ where = *h ? (unsigned char *)(h-3) : NULL;
+ }
+ break;
+ default:
+ // console_fprintf(stderr, "strstr\n");
+ where = strstr((char *)(haystack+srchIdx), (char *)needle);
+ break;
+ }
+ if (where != NULL) {
+ RETURN (__mkSmallInteger(where-haystack + 1));
+ }
+ notFound = true;
}
-
-# define suffixes(x, m, suff) { \
- int f, g, i; \
- \
- suff[m - 1] = m; \
- g = m - 1; \
- for (i = m - 2; i >= 0; --i) { \
- if (i > g && suff[i + m - 1 - f] < i - g) \
- suff[i] = suff[i + m - 1 - f]; \
- else { \
- if (i < g) \
- g = i; \
- f = i; \
- while (g >= 0 && x[g] == x[g + m - 1 - f]) \
- --g; \
- suff[i] = f - g; \
- } \
- } \
- }
-
-# define preBmGs(x, m, bmGs) { \
- int i, j, suff[XSIZE]; \
- \
- suffixes(x, m, suff); \
- \
- for (i = 0; i < m; ++i) \
- bmGs[i] = m; \
- j = 0; \
- for (i = m - 1; i >= 0; --i) \
- if (suff[i] == i + 1) \
- for (; j < m - 1 - i; ++j) \
- if (bmGs[j] == m) \
- bmGs[j] = m - 1 - i; \
- for (i = 0; i <= m - 2; ++i) \
- bmGs[m - 1 - suff[i]] = m - 1 - i; \
- }
-
- /* tables only depend on pattern; so we can cache them in case the same string is searched again */
- if ((m == lastPatternSize) && (strcmp(lastPattern, x) == 0)) {
- /* tables are still valid */
- // printf("valid: \"%s\"\n", lastPattern);
- } else {
- /* Preprocessing */
- // printf("compute: \"%s\"\n", lastPattern);
- preBmGs(x, m, bmGs);
- preBmBc(x, m, bmBc);
- if (m <= MAX_PATTERN_SIZE) {
- // printf("cache for: \"%s\"\n", lastPattern);
- strcpy(lastPattern, x);
- lastPatternSize = m;
- }
- }
-
- /* Searching */
- j = __intVal(startIndex) - 1;
- while (j <= n - m) {
- for (i = m - 1; i >= 0 && x[i] == y[i + j]; --i);
- if (i < 0) {
- RETURN (__mkSmallInteger(j+1));
- j += bmGs[0];
- } else {
- int s1 = bmGs[i];
- int s2 = bmBc[y[i + j]] - m + 1 + i;
- j += MAX(s1, s2);
- }
- }
- notFound = true;
}
}
#endif /* ! __SCHTEAM__ */
%}.
notFound == true ifTrue:[
+ "/ Stderr showCR:'notFound.'.
^ exceptionValue value.
].
+ caseSensitive ifFalse:[
+ Stderr showCR:'caseInsensitiveSearch...'.
+ ^ self caseInsensitiveIndexOfSubCollection:aSubString startingAt:startIndex ifAbsent:exceptionValue.
+ ].
+
"/ arrive here if aSubstring is a UnicodeString or arguments are invalid
^ super indexOfSubCollection:aSubString startingAt:startIndex ifAbsent:exceptionValue caseSensitive:true