diff -r 6c07323b8fa2 -r 99bf865f7b78 CharacterArray.st --- a/CharacterArray.st Sun Aug 16 12:51:54 2009 +0200 +++ b/CharacterArray.st Sun Aug 16 23:30:49 2009 +0200 @@ -1817,7 +1817,7 @@ compareWith:aString "Compare the receiver with the argument and return 1 if the receiver is greater, 0 if equal and -1 if less than the argument. - This comparison is based on the elements ascii code - + This comparison is based on the elements' codepoints - i.e. upper/lowercase & national characters are NOT treated specially. 'foo' compareWith: 'Foo' will return 1. while 'foo' sameAs:'Foo' will return true" @@ -1832,10 +1832,10 @@ n := mySize min:otherSize. 1 to:n do:[:index | - c1 := self at:index. - c2 := aString at:index. - c1 > c2 ifTrue:[^ 1]. - c1 < c2 ifTrue:[^ -1]. + c1 := self at:index. + c2 := aString at:index. + c1 > c2 ifTrue:[^ 1]. + c1 < c2 ifTrue:[^ -1]. ]. mySize > otherSize ifTrue:[^ 1]. mySize < otherSize ifTrue:[^ -1]. @@ -1844,6 +1844,27 @@ "Modified: 22.4.1996 / 15:56:07 / cg" ! +endsWith:aStringOrCharacter + "return true, if the receiver ends with aStringOrCharacter (a string or a single character); an empty receiver never ends with a character." + + |s| + + (s := self string) ~~ self ifTrue:[ + ^ s endsWith:aStringOrCharacter + ]. + aStringOrCharacter isCharacter ifTrue:[ + ^ self size > 0 and:[self last = aStringOrCharacter] + ]. + ^ super endsWith:aStringOrCharacter + + " + 'hello world' endsWith:'world' + 'hello world' asText allBold endsWith:'world' + " + + "Modified: 12.5.1996 / 15:49:18 / cg" +! + hammingDistanceTo:aString "return the hamming distance (the number of characters which are different). In information theory, the Hamming distance between two strings of equal length @@ -1900,6 +1921,73 @@ " ! +levenshteinTo:aString + "return the levenshtein distance to the argument, aString; + this value corresponds to the number of replacements that have to be + made to get aString from the receiver. + See IEEE transactions on Computers 1976 Pg 172 ff."
+ + " + in the following, we assume that omitting a character + is less of an error than inserting an extra character. + Therefore the different insertion (i) and deletion (d) values. + s: substitution weight + k: keyboard weight (typing a nearby key) - or nil (then use s) + c: case weight - or nil (then use s) + e: exchange weight - or nil (then use s*2) + i: insertion of extra character weight + d: delete of a character weight + " + + ^ StringUtilities + levenshteinDistanceFrom:self + to:aString + s:4 k:4 c:4 e:nil i:2 d:6 + + " + 'computer' levenshteinTo:'computer' + 'cOmputer' levenshteinTo:'computer' + 'cOmpuTer' levenshteinTo:'computer' + 'cimputer' levenshteinTo:'computer' + 'cumputer' levenshteinTo:'computer' + + 'cmputer' levenshteinTo:'computer' + 'coomputer' levenshteinTo:'computer' + + 'ocmprt' levenshteinTo:'computer' + 'computer' levenshteinTo:'computer' + 'ocmputer' levenshteinTo:'computer' + 'cmputer' levenshteinTo:'computer' + 'computer' levenshteinTo:'cmputer' + 'Computer' levenshteinTo:'computer' + + 'compiter' levenshteinTo:'computer' + 'compoter' levenshteinTo:'computer' + + 'comptuer' levenshteinTo:'computer' + " +! + +levenshteinTo:aString s:substWeight k:kbdTypoWeight c:caseWeight i:insrtWeight d:deleteWeight + "parametrized levenshtein. + return the levenshtein distance to the argument, aString; + this value corresponds to the number of replacements that have to be + made to get aString from the receiver. + The arguments are the costs for + s:substitution, + k:keyboard typo (substitution), + c:case-change, + i:insertion + d:deletion + of a character. + See IEEE transactions on Computers 1976 Pg 172 ff" + + ^ StringUtilities + levenshteinDistanceFrom:self + to:aString + s:substWeight k:kbdTypoWeight c:caseWeight e:nil i:insrtWeight d:deleteWeight +! + sameAs:aString "Compare the receiver with the argument like =, but ignore case differences. Return true or false."
@@ -2027,6 +2115,81 @@ 'hello' sameStringAndEmphasisAs: 'fooba' asText allBold 'hello' sameStringAndEmphasisAs: 'fooba' asText allItalic " +! + +spellAgainst: aString + "return an integer between 0 and 100 indicating how similar + the argument is to the receiver. No case conversion is done. + This algorithm is much simpler (but also less exact) than the + levenshtein distance. Characters are compared by value (=), not identity. + Experiment to see which is better for your application." + + | i1 "{ Class: SmallInteger }" + i2 "{ Class: SmallInteger }" + next1 "{ Class: SmallInteger }" + next2 "{ Class: SmallInteger }" + size1 "{ Class: SmallInteger }" + size2 "{ Class: SmallInteger }" + score "{ Class: SmallInteger }" + maxLen "{ Class: SmallInteger }" | + + size1 := self size. + size2 := aString size. + maxLen := size1 max:size2. + score := 0. + i1 := i2 := 1. + [i1 <= size1 and: [i2 <= size2]] whileTrue:[ + next1 := i1 + 1. + next2 := i2 + 1. + (self at:i1) = (aString at:i2) ifTrue: [ + score := score+1. + i1 := next1. + i2 := next2 + ] ifFalse: [ + (i2 < size2 and: [(self at:i1) = (aString at:next2)]) ifTrue: [ + i2 := next2 + ] ifFalse: [ + (i1 < size1 and: [(self at:next1) = (aString at:i2)]) ifTrue: [ + i1 := next1 + ] ifFalse: [ + i1 := next1. + i2 := next2 + ] + ] + ] + ]. + + score == maxLen ifTrue: [^ 100]. + ^ 100 * score // maxLen + + " + 'Smalltalk' spellAgainst: 'Smalltlak' + 'Smalltalk' spellAgainst: 'smalltlak' + 'Smalltalk' spellAgainst: 'smalltalk' + 'Smalltalk' spellAgainst: 'smalltlk' + 'Smalltalk' spellAgainst: 'Smalltolk' + " +! + +startsWith:aString + "return true, if the receiver starts with something, aString. + If the argument is empty, true is returned." + + |s| + + (s := self string) ~~ self ifTrue:[ + ^ s startsWith:aString + ]. + ^ super startsWith:aString + + " + 'hello world' startsWith:'hello' + 'hello world' asText allBold startsWith:'hello' + 'hello world' asText allBold startsWith:'' + " + + "Created: 12.5.1996 / 15:46:40 / cg" + "Modified: 12.5.1996 / 15:49:24 / cg" ! !
!CharacterArray methodsFor:'converting'! @@ -5160,27 +5323,6 @@ !CharacterArray methodsFor:'testing'! -endsWith:aStringOrCharacter - "return true, if the receiver ends with something, aStringOrCharacter." - - |s| - - (s := self string) ~~ self ifTrue:[ - ^ s endsWith:aStringOrCharacter - ]. - aStringOrCharacter isCharacter ifTrue:[ - ^ self last = aStringOrCharacter - ]. - ^ super endsWith:aStringOrCharacter - - " - 'hello world' endsWith:'world' - 'hello world' asText allBold endsWith:'world' - " - - "Modified: 12.5.1996 / 15:49:18 / cg" -! - isAlphaNumeric "return true, if the receiver is some alphanumeric word; i.e. consists of a letter followed by letters or digits." @@ -5375,122 +5517,6 @@ " ! -levenshteinTo:aString - "return the levenshtein distance to the argument, aString; - this value corresponds to the number of replacements that have to be - made to get aString from the receiver. - See IEEE transactions on Computers 1976 Pg 172 ff." - - " - in the following, we assume that ommiting a character - is less of an error than inserting an extra character. - Therefore the different insertion (i) and deletion (d) values. - s: substitution weight - k: keyboard weight (typing a nearby key) - c: case weight - i: insertion of extra character weight - d: delete of a character weight - " - - ^ self levenshteinTo:aString s:4 k:2 c:1 i:2 d:6 - - " - 'computer' levenshteinTo:'computer' - 'cOmputer' levenshteinTo:'computer' - 'cOmpuTer' levenshteinTo:'computer' - 'cimputer' levenshteinTo:'computer' - 'cumputer' levenshteinTo:'computer' - - 'cmputer' levenshteinTo:'computer' - 'coomputer' levenshteinTo:'computer' - - 'ocmprt' levenshteinTo:'computer' - 'computer' levenshteinTo:'computer' - 'ocmputer' levenshteinTo:'computer' - 'cmputer' levenshteinTo:'computer' - 'computer' levenshteinTo:'cmputer' - 'Computer' levenshteinTo:'computer' - " -! - -levenshteinTo:aString s:substWeight k:kbdTypoWeight c:caseWeight i:insrtWeight d:deleteWeight - "parametrized levenshtein. 
- return the levenshtein distance to the argument, aString; - this value corrensponds to the number of replacements that have to be - made to get aString from the receiver. - The arguments are the costs for - s:substitution, - k:keyboard type (substitution), - c:case-change, - i:insertion - d:deletion - of a character. - See IEEE transactions on Computers 1976 Pg 172 ff" - - |d "delta matrix" - len1 "{ Class: SmallInteger }" - len2 "{ Class: SmallInteger }" - dim "{ Class: SmallInteger }" - prevRow row col - dimPlus1 "{ Class: SmallInteger }" - min pp c1 c2| - - len1 := self size. - len2 := aString size. - - "create the help-matrix" - - dim := len1 max:len2. - dimPlus1 := dim + 1. - - d := Array new:dimPlus1. - 1 to:dimPlus1 do:[:i | - d at:i put:(Array new:dimPlus1) - ]. - - "init help-matrix" - - (d at:1) at:1 put:0. - row := d at:1. - 1 to:dim do:[:j | - row at:(j + 1) put:( (row at:j) + insrtWeight ) - ]. - - 1 to:dim do:[:i | - (d at:(i + 1)) at:1 put:( ((d at:i) at:1) + deleteWeight ) - ]. - - 1 to:len1 do:[:i | - c1 := self at:i. - 1 to:len2 do:[:j | - c2 := aString at:j. - (c1 == c2) ifTrue:[ - pp := 0 - ] ifFalse:[ - (c1 asLowercase == c2 asLowercase) ifTrue:[ - pp := caseWeight - ] ifFalse:[ - pp := substWeight. - substWeight ~~ kbdTypoWeight ifTrue:[ - (DoWhatIMeanSupport isKey:c1 asLowercase nextTo:c2 asLowercase) ifTrue:[ - pp := kbdTypoWeight. - ]. - ]. - ] - ]. - prevRow := d at:i. - row := d at:(i + 1). - col := j + 1. - min := (prevRow at:j) + pp. - min := min min:( (row at:j) + insrtWeight). - min := min min:( (prevRow at:col) + deleteWeight). - row at:col put: min - ] - ]. - - ^ (d at:(len1 + 1)) at:(len2 + 1) -! - numArgs "treating the receiver as a message selector, return how many arguments would it take" @@ -5547,81 +5573,6 @@ 'hello' partsIfSelector '+' partsIfSelector " -! - -spellAgainst: aString - "return an integer between 0 and 100 indicating how similar - the argument is to the receiver. No case conversion is done. 
- This algorithm is much simpler (but also less exact) than the - levenshtein distance. Experiment which is better for your - application." - - | i1 "{ Class: SmallInteger }" - i2 "{ Class: SmallInteger }" - next1 "{ Class: SmallInteger }" - next2 "{ Class: SmallInteger }" - size1 "{ Class: SmallInteger }" - size2 "{ Class: SmallInteger }" - score "{ Class: SmallInteger }" - maxLen "{ Class: SmallInteger }" | - - size1 := self size. - size2 := aString size. - maxLen := size1 max:size2. - score := 0. - i1 := i2 := 1. - [i1 <= size1 and: [i2 <= size2]] whileTrue:[ - next1 := i1 + 1. - next2 := i2 + 1. - (self at:i1) == (aString at:i2) ifTrue: [ - score := score+1. - i1 := next1. - i2 := next2 - ] ifFalse: [ - (i2 < size2 and: [(self at:i1) == (aString at:next2)]) ifTrue: [ - i2 := next2 - ] ifFalse: [ - (i1 < size1 and: [(self at:next1) == (aString at:i2)]) ifTrue: [ - i1 := next1 - ] ifFalse: [ - i1 := next1. - i2 := next2 - ] - ] - ] - ]. - - score == maxLen ifTrue: [^ 100]. - ^ 100 * score // maxLen - - " - 'Smalltalk' spellAgainst: 'Smalltlak' - 'Smalltalk' spellAgainst: 'smalltlak' - 'Smalltalk' spellAgainst: 'smalltalk' - 'Smalltalk' spellAgainst: 'smalltlk' - 'Smalltalk' spellAgainst: 'Smalltolk' - " -! - -startsWith:aString - "return true, if the receiver starts with something, aString. - If the argument is empty, true is returned." - - |s| - - (s := self string) ~~ self ifTrue:[ - ^ s startsWith:aString - ]. - ^ super startsWith:aString - - " - 'hello world' startsWith:'hello' - 'hello world' asText allBold startsWith:'hello' - 'hello world' asText allBold startsWith:'' - " - - "Created: 12.5.1996 / 15:46:40 / cg" - "Modified: 12.5.1996 / 15:49:24 / cg" ! ! !CharacterArray methodsFor:'tracing'! @@ -5644,7 +5595,7 @@ !CharacterArray class methodsFor:'documentation'! version - ^ '$Header: /cvs/stx/stx/libbasic/CharacterArray.st,v 1.401 2009-08-10 13:39:08 cg Exp $' + ^ '$Header: /cvs/stx/stx/libbasic/CharacterArray.st,v 1.402 2009-08-16 21:30:49 cg Exp $' ! ! 
CharacterArray initialize!