CharacterArray.st
branchjv
changeset 18011 deb0c3355881
parent 17993 956342c369a2
parent 14660 78d3fa5b75e8
child 18017 7fef9e17913f
--- a/CharacterArray.st	Thu Dec 20 11:48:59 2012 +0000
+++ b/CharacterArray.st	Sat Jan 19 01:30:00 2013 +0000
@@ -11,7 +11,7 @@
 "
 "{ Package: 'stx:libbasic' }"
 
-ByteArray subclass:#CharacterArray
+UninterpretedBytes variableByteSubclass:#CharacterArray
 	instanceVariableNames:''
 	classVariableNames:'PreviousMatch DecoderTables EncoderTables DecodingFailedSignal
 		EncodingFailedSignal LastString LastShiftTable'
@@ -326,6 +326,7 @@
     "
 ! !
 
+
 !CharacterArray class methodsFor:'pattern matching'!
 
 matchEscapeCharacter
@@ -677,6 +678,7 @@
     ^ self == CharacterArray
 ! !
 
+
 !CharacterArray methodsFor:'Compatibility-ANSI'!
 
 addLineDelimiters
@@ -1012,12 +1014,6 @@
     "
 !
 
-isWideString
-    ^ self bitsPerCharacter > 8
-
-    "Created: / 05-09-2011 / 23:13:16 / cg"
-!
-
 lastSpacePosition
     "return the index of the last space character; 0 if there is none"
 
@@ -1559,6 +1555,7 @@
 ! !
 
 
+
 !CharacterArray methodsFor:'character searching'!
 
 includesMatchCharacters
@@ -1818,18 +1815,16 @@
      therefore 'foo' = 'Foo' will return false.
      Since this is incompatible to ST-80 (at least, V2.x) , this may change."
 
-    |mySize    "{ Class: SmallInteger }"
-     otherSize |
+    |mySize    "{ Class: SmallInteger }"|
 
     (aString isString or:[aString species == self species]) ifFalse:[
-	^ false
+        ^ false
     ].
     mySize := self size.
-    otherSize := aString size.
-    mySize == otherSize ifFalse:[^ false].
+    mySize ~~ (aString size) ifTrue:[^ false].
 
     1 to:mySize do:[:index |
-	(self at:index) = (aString at:index) ifFalse:[^ false].
+        (self at:index) = (aString at:index) ifFalse:[^ false].
     ].
     ^ true
 
@@ -1870,6 +1865,21 @@
     "Modified: 22.4.1996 / 15:55:00 / cg"
 !
 
+after:aString
+    "Compare the receiver with the argument and return true if the
+     receiver should come after the argument in a sorted list.
+     Otherwise return false.
+     NOTE: The comparison should be language specific, depending on the value of
+            LC_COLLATE, which is initialized from the environment.
+
+            Currently it is for Strings, but not for UnicodeStrings...
+
+     STUPID:
+        #after has a completely different meaning in SeqColl ..."
+
+    ^ (self compareCollatingWith:aString) > 0
+!
+
 compareCaselessWith:aString
     "Compare the receiver against the argument, ignoreing case.
      Return 1 if the receiver is greater, 0 if equal and -1 if less than the argument.
@@ -1900,6 +1910,23 @@
     "Modified: 22.4.1996 / 15:56:07 / cg"
 !
 
+compareCollatingWith:aString
+    "Compare the receiver with the argument and return 1 if the receiver is
+     greater, 0 if equal and -1 if less than the argument in a sorted list.
+     The comparison is language specific, depending on the value of
+     LC_COLLATE, which is in the shell environment."
+
+    "TODO not yet defined for unicode"
+    <resource: #todo>
+
+    |s|
+
+    (s := self string) ~~ self ifTrue:[
+        ^ s compareCollatingWith:aString
+    ].
+    ^ self compareWith:aString
+!
+
 compareWith:aString
     "Compare the receiver with the argument and return 1 if the receiver is
      greater, 0 if equal and -1 if less than the argument.
@@ -1971,8 +1998,18 @@
 
     "/ immediately after any change, execute (maybe in a debugger):
     "/      Set allSubInstancesDo:[:s | s rehash]
-    ^ self hash_dragonBook
-    "/ ^ self hash_sdbm.
+    "/ ^ self hash_dragonBook
+
+    |h|
+
+    "/
+    "/ this is the sdbm algorithm
+    "/
+    h := 0.
+    self do:[:char |
+        h := (65599 times:h) plus:char codePoint.
+    ].
+    ^ h 
 
     "
      'a' hash
@@ -1985,6 +2022,44 @@
      'ab' asArray hash
     "
 
+    "
+        |syms ms|
+
+        syms := Symbol allInstances.
+        Transcript show:'syms: '; showCR:syms size.
+        Transcript show:'sdbm hashes: '; showCR:(syms collect:[:s| s hash]) asSet size.
+        Transcript show:'dragonBook hashes: '; showCR:(syms collect:[:s| s hash_dragonBook]) asSet size.
+
+        ms := Time millisecondsToRun:[
+            10 timesRepeat:[
+                syms do:[:each| each hash].
+            ].
+        ].
+        Transcript show:'sdbm hash: '; showCR:ms.
+
+        ms := Time millisecondsToRun:[
+            10 timesRepeat:[
+                syms do:[:each| each hash_dragonBook].
+            ].
+        ].
+        Transcript show:'dragonBook: '; showCR:ms.
+
+        syms := syms collect:[:each| each asUnicode16String].
+        ms := Time millisecondsToRun:[
+            10 timesRepeat:[
+                syms do:[:each| each hash].
+            ].
+        ].
+        Transcript show:'unicode sdbm hash: '; showCR:ms.
+
+        ms := Time millisecondsToRun:[
+            10 timesRepeat:[
+                syms do:[:each| each hash_dragonBook].
+            ].
+        ].
+        Transcript show:'unicode dragonBook:'; showCR:ms.
+    "
+
     "Modified: / 26-12-2011 / 14:09:07 / cg"
 !
 
@@ -2000,21 +2075,20 @@
     self reverseDo:[:char |
 "/ Sorry, stc cannot compile this (as of 10.9.2007)
 "/        h := (h bitShift:4) + char asciiValue.
-	h := (h bitShift:4).
-	h:= h + char codePoint.
-	h := h bitAnd:16rFFFFFFFF.
-	g := (h bitAnd: 16rF0000000).
-	g ~~ 0 ifTrue:[
-	    h := h bitXor:(g bitShift:-24).
-	    h := h bitXor:g.
-	].
+        h := (h bitShift:4).
+        h := h + char codePoint.
+        h := h bitAnd:16rFFFFFFFF.
+        g := (h bitAnd: 16rF0000000).
+        g ~~ 0 ifTrue:[
+            h := h bitXor:(g bitShift:-24).
+            h := h bitXor:g.
+        ].
     ].
     "/
     "/ multiply by large prime to spread values
     "/ This speeds up Set and Dictionary by a factor of 10!!
     "/
-    h := h * 31415821.
-    h := h bitAnd:16r3fffffff.
+    h := h times:31415821.
     ^ h
 
     "
@@ -2031,30 +2105,6 @@
     "Created: / 26-12-2011 / 13:46:06 / cg"
 !
 
-hash_sdbm
-    "return an integer useful as a hash-key"
-
-    |h|
-
-    "/
-    "/ this is the sdbm algorithm
-    "/
-    h := 0.
-    self do:[:char |
-	h := (h * 65599) + char codePoint.
-	h := h bitAnd:16r3FFFFFFF.
-    ].
-    ^ h
-
-    "
-     'a' hash_sdbm
-     'aa' hash_sdbm
-     'ab' hash_sdbm
-    "
-
-    "Created: / 26-12-2011 / 13:48:13 / cg"
-!
-
 levenshteinTo:aString
     "return the levenshtein distance to the argument, aString;
      this value corresponds to the number of replacements that have to be
@@ -2361,11 +2411,12 @@
 !
 
 asByteArray
-    |bytes sz bytesPerCharacter idx|
-
-    self string ~~ self ifTrue:[
-	"/ for text and other wrappers
-	^ self string asByteArray
+    |bytes sz bytesPerCharacter idx str|
+
+    str := self string.
+    str ~~ self ifTrue:[
+        "/ for text and other wrappers
+        ^ str asByteArray
     ].
 
     "/ for real strings, a fallback
@@ -2374,19 +2425,19 @@
     bytes := ByteArray new:(sz * bytesPerCharacter).
     idx := 1.
     self do:[:char |
-	|code|
-
-	code := char codePoint.
-	bytesPerCharacter == 2 ifTrue:[
-	    bytes unsignedShortAt:idx put:code
-	] ifFalse:[
-	    bytesPerCharacter == 4 ifTrue:[
-		bytes unsignedLongAt:idx put:code
-	    ] ifFalse:[
-		bytes at:idx put:code
-	    ].
-	].
-	idx := idx + bytesPerCharacter.
+        |code|
+
+        code := char codePoint.
+        bytesPerCharacter == 2 ifTrue:[
+            bytes unsignedShortAt:idx put:code
+        ] ifFalse:[
+            bytesPerCharacter == 4 ifTrue:[
+                bytes unsignedLongAt:idx put:code
+            ] ifFalse:[
+                bytes at:idx put:code
+            ].
+        ].
+        idx := idx + bytesPerCharacter.
     ].
     ^ bytes
 
@@ -2758,11 +2809,12 @@
     bitsPerCharacter := newStr bitsPerCharacter.
 
     1 to:mySize do:[:i |
-	c := (self at:i) asLowercase.
-	c bitsPerCharacter > bitsPerCharacter ifTrue:[
-	    newStr := c stringSpecies fromString:newStr.
-	].
-	newStr at:i put:c
+        c := (self at:i) asLowercase.
+        (c bitsPerCharacter > bitsPerCharacter 
+         and:[c stringSpecies ~= newStr stringSpecies]) ifTrue:[
+            newStr := c stringSpecies fromString:newStr.
+        ].
+        newStr at:i put:c
     ].
     ^ newStr
 
@@ -2883,7 +2935,7 @@
     "if possible, return the receiver converted to a 'normal' string.
      It is only possible, if there are no characters with codePoints above 255 in the receiver."
 
-    self bitsPerCharacter == 8 ifTrue:[^ self].
+    self isWideString ifFalse:[^ self].
     (self contains:[:char | char codePoint > 255]) ifFalse:[^ self asSingleByteString].
     ^ self
 
@@ -3192,14 +3244,6 @@
     ^ self
 
     "Modified: 27.4.1996 / 13:29:30 / cg"
-!
-
-writeStream
-    "return a stream for writing onto the receiver.
-     Redefined to return a CharacterWriteStream which automatically checks for the bytesPerCharacter
-     and replaces the stream-buffer as required."
-
-    ^ CharacterWriteStream on:self
 ! !
 
 !CharacterArray methodsFor:'copying'!
@@ -3796,14 +3840,16 @@
 utf8Decoded
     "Interpreting myself as an UTF-8 representation, decode and return the decoded string."
 
-    |in out is16Bit c|
-
-    is16Bit := false.
+    |in out|
+
+    (self isWideString or:[self contains8BitCharacters]) ifFalse:[
+        "speed up common case"
+        ^ self.
+    ].
     out := CharacterWriteStream on:(String uninitializedNew:self size).
     in := self readStream.
     [in atEnd] whileFalse:[
-	c := Character utf8DecodeFrom:in.
-	out nextPut:c.
+        out nextPut:(Character utf8DecodeFrom:in).
     ].
     ^ out contents
 
@@ -3821,12 +3867,12 @@
       |utf8Encoding original readBack|
 
       1 to:16rFFFF do:[:ascii |
-	original := (Character value:ascii) asString.
-	utf8Encoding := original utf8Encoded.
-	readBack := utf8Encoding utf8Decoded.
-	readBack = original ifFalse:[
-	    self halt
-	]
+        original := (Character value:ascii) asString.
+        utf8Encoding := original utf8Encoded.
+        readBack := utf8Encoding utf8Decoded.
+        readBack = original ifFalse:[
+            self halt
+        ]
       ]
     "
 !
@@ -3858,19 +3904,27 @@
 utf8Encoded
     "Return my UTF-8 representation as a new String"
 
-    |s|
-
-    s := WriteStream on:(String uninitializedNew:self size).
+    |s sz|
+
+    (self isWideString or:[self contains8BitCharacters]) ifFalse:[
+        "speed up common case"
+        ^ self.
+    ].
+    "We already know, that we need more space, just a rough estimation"
+    sz := self size.
+    s := WriteStream on:(String uninitializedNew:(sz+2+(sz//3))).
     s nextPutAllUtf8:self.
     ^ s contents
 
     "
-     'abcde' utf8Encoded
+     'abcde1234' utf8Encoded
+     'abcdeäöüß' utf8Encoded
     "
 
     "Modified: / 11-05-2010 / 19:12:37 / cg"
 ! !
 
+
 !CharacterArray methodsFor:'matching - glob expressions'!
 
 compoundMatch:aString
@@ -4314,6 +4368,8 @@
     "Created: / 08-03-2012 / 03:11:11 / cg"
 ! !
 
+
+
 !CharacterArray methodsFor:'padded copying'!
 
 centerPaddedTo:newSize
@@ -4567,6 +4623,7 @@
     "
 ! !
 
+
 !CharacterArray methodsFor:'queries'!
 
 bitsPerCharacter
@@ -4576,12 +4633,12 @@
     |string max|
 
     (string := self string) ~~ self ifTrue:[
-	^ string bitsPerCharacter
+        ^ string bitsPerCharacter
     ].
 
     max := 8.
     self do:[:eachCharacter |
-	max := max max:(eachCharacter bitsPerCharacter)
+        max := max max:(eachCharacter bitsPerCharacter)
     ].
     ^ max
 
@@ -4837,6 +4894,7 @@
     "Modified: 17.4.1997 / 12:50:23 / cg"
 ! !
 
+
 !CharacterArray methodsFor:'special string converting'!
 
 expandPlaceholdersWith:argArrayOrDictionary
@@ -5641,6 +5699,7 @@
     "
 ! !
 
+
 !CharacterArray methodsFor:'substring searching'!
 
 findString:subString
@@ -6044,6 +6103,10 @@
     "
 !
 
+isWideString
+    ^ self bitsPerCharacter > 8
+!
+
 numArgs
     "treating the receiver as a message selector, return how many arguments would it take"
 
@@ -6076,7 +6139,11 @@
 !
 
 partsIfSelector
-    "treat the receiver as a message selector, return a collection of parts."
+    "treat the receiver as a message selector, return a collection of parts.
+     Notice: this is more tolerant than Smalltalk's syntax would suggest;
+     especially it allows for empty keyword parts between colons.
+     This is not (and should not be checked here), to allow parsing of
+     degenerate selectors as appearing with objectiveC."
 
     |idx1 "{ Class: SmallInteger }"
      coll idx2 sz|
@@ -6085,19 +6152,21 @@
     idx1 := 1.
     sz := self size.
     [true] whileTrue:[
-	idx2 := self indexOf:$: startingAt:idx1 + 1.
-	(idx2 == 0 or:[idx2 == sz]) ifTrue:[
-	    coll add:(self copyFrom:idx1).
-	    ^ coll
-	].
-	coll add:(self copyFrom:idx1 to:idx2).
-	idx1 := idx2 + 1
+        idx2 := self indexOf:$: startingAt:idx1.
+        (idx2 == 0 or:[idx2 == sz]) ifTrue:[
+            coll add:(self copyFrom:idx1).
+            ^ coll
+        ].
+        coll add:(self copyFrom:idx1 to:idx2).
+        idx1 := idx2 + 1
     ].
 
     "
-     'foo:bar:' partsIfSelector
-     #foo:bar: partsIfSelector
-     'hello' partsIfSelector
+     'foo:' partsIfSelector            
+     'foo:bar:' partsIfSelector    
+     'foo::::' partsIfSelector     
+     #foo:bar: partsIfSelector     
+     'hello' partsIfSelector       
      '+' partsIfSelector
     "
 ! !
@@ -6122,15 +6191,12 @@
 !CharacterArray class methodsFor:'documentation'!
 
 version
-    ^ '$Header: /cvs/stx/stx/libbasic/CharacterArray.st,v 1.481 2012/10/31 19:10:50 cg Exp $'
+    ^ '$Header: /cvs/stx/stx/libbasic/CharacterArray.st,v 1.491 2013-01-16 12:31:17 stefan Exp $'
 !
 
 version_CVS
-    ^ '§Header: /cvs/stx/stx/libbasic/CharacterArray.st,v 1.481 2012/10/31 19:10:50 cg Exp §'
-!
-
-version_SVN
-    ^ '$Id: CharacterArray.st 10876 2012-11-30 17:19:23Z vranyj1 $'
+    ^ '$Header: /cvs/stx/stx/libbasic/CharacterArray.st,v 1.491 2013-01-16 12:31:17 stefan Exp $'
 ! !
 
+
 CharacterArray initialize!