--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/PhoneticStringUtilities.st Mon Aug 10 15:35:04 2009 +0200
@@ -0,0 +1,320 @@
+"
+ COPYRIGHT (c) 1994 by Claus Gittinger
+ COPYRIGHT (c) 2009 by eXept Software AG
+ All Rights Reserved
+
+ This software is furnished under a license and may be used
+ only in accordance with the terms of that license and with the
+ inclusion of the above copyright notice. This software may not
+ be provided or otherwise made available to, or used by, any
+ other person. No title to or ownership of the software is
+ hereby transferred.
+"
+"{ Package: 'stx:libbasic2' }"
+
+Object subclass:#PhoneticStringUtilities
+ instanceVariableNames:''
+ classVariableNames:''
+ poolDictionaries:''
+ category:'Collections-Text-Support'
+!
+
+!PhoneticStringUtilities class methodsFor:'documentation'!
+
+copyright
+"
+ COPYRIGHT (c) 1994 by Claus Gittinger
+ COPYRIGHT (c) 2009 by eXept Software AG
+ All Rights Reserved
+
+ This software is furnished under a license and may be used
+ only in accordance with the terms of that license and with the
+ inclusion of the above copyright notice. This software may not
+ be provided or otherwise made available to, or used by, any
+ other person. No title to or ownership of the software is
+ hereby transferred.
+"
+!
+
+documentation
+"
+ the soundexCode algorithm was originally contained in the CharacterArray class;
+ the koelnerPhoneticCode provides a similar functionality, which is more tuned towards the
+ German language. Doublemetaphone works with most european languages.
+"
+! !
+
+!PhoneticStringUtilities class methodsFor:'classification'!
+
+isSlavoGermanic:aString
+ ^ #('w' 'k' 'cz' 'witz') contains:[:sub | aString includesString:sub]
+
+ "
+ self isSlavoGermanic:'walter'
+ "
+! !
+
+!PhoneticStringUtilities class methodsFor:'phonetic codes'!
+
+koelnerPhoneticCodeOf:aString
+ "return a koelner phonetic code.
+ The koelnerPhonetic code is for the german language what the soundex code is for english;
+ it returns simular strings for similar sounding words.
+ There are some differences to soundex, though:
+ its length is not limited to 4, but depends on the length of the original string;
+ it does not start with the first character of the input."
+
+ |in ret val convFirst convRest|
+
+ convFirst := [:chars |
+ |pair|
+
+ pair := #(
+ ('#a#' '0')
+ ('#e#' '0')
+ ('#i#' '0')
+ ('#j#' '0')
+ ('#y#' '0')
+ ('#o#' '0')
+ ('#u#' '0')
+
+ ('#ca' '4')
+ ('#ch' '4')
+ ('#ck' '4')
+ ('#cl' '4')
+ ('#co' '4')
+ ('#cq' '4')
+ ('#cr' '4')
+ ('#cu' '4')
+ ('#cx' '4')
+
+ ('#c#' '8')
+ ) detect:[:pair | (pair first match:chars) ] ifNone:nil.
+
+ pair notNil ifTrue:[ pair second ] ifFalse:[ convRest value:chars ]
+ ].
+
+ convRest := [:chars |
+ |pair|
+
+ pair := #(
+ ('#ds' '8')
+ ('#dc' '8')
+ ('#dz' '8')
+ ('#ts' '8')
+ ('#tc' '8')
+ ('#tz' '8')
+ ('#d#' '2')
+ ('#t#' '2')
+ ('cx#' '8')
+ ('kx#' '8')
+ ('qx#' '8')
+ ('#x#' '48')
+ ('sc#' '8')
+ ('sz#' '8')
+ ('#ca' '4')
+ ('#co' '4')
+ ('#cu' '4')
+ ('#ch' '4')
+ ('#ck' '4')
+ ('#cx' '4')
+ ('#cq' '4')
+ ('#c#' '8')
+ ('#a#' '0')
+ ('#e#' '0')
+ ('#i#' '0')
+ ('#j#' '0')
+ ('#y#' '0')
+ ('#o#' '0')
+ ('#u#' '0')
+ ('#h#' '-')
+ ('#l#' '5')
+ ('#r#' '7')
+ ('#m#' '6')
+ ('#n#' '6')
+ ('#s#' '8')
+ ('#z#' '8')
+ ('#b#' '1')
+ ('#p#' '1')
+ ('#f#' '3')
+ ('#v#' '3')
+ ('#w#' '3')
+ ('#g#' '4')
+ ('#k#' '4')
+ ('#q#' '4')
+ ('###' '?')
+ ) detect:[:pair | (pair first match:chars) ].
+
+ pair second
+ ].
+
+ in := aString withoutSeparators asLowercase.
+ in := in copyReplaceString:'ph' withString:'f'.
+ in := in copyReplaceAll:$ü withAll:'u'.
+ in := in copyReplaceAll:$ä withAll:'a'.
+ in := in copyReplaceAll:$ö withAll:'o'.
+ in := in copyReplaceAll:$ß withAll:'ss'.
+ in := '#',in,'#'.
+
+ ret := ''.
+ 1 to:in size-2 do:[:i |
+ |sub|
+
+ sub := in copyFrom:i to:i+2.
+ val := (i==1)
+ ifTrue:[ convFirst value:sub ]
+ ifFalse:[ convRest value:sub ].
+ ret := ret,val
+ ].
+
+ ret := ret select:[:ch | ch ~= $-].
+
+ (ret startsWith:'0') ifTrue:[
+ ret := '0',(ret select:[:ch | ch ~= $0]).
+ ] ifFalse:[
+ ret := ret select:[:ch | ch ~= $0].
+ ].
+
+ ^ String streamContents:[:s |
+ |prev|
+
+ ret do:[:ch |
+ ch ~= prev ifTrue:[
+ s nextPut:ch
+ ].
+ prev := ch.
+ ].
+ ].
+
+ "
+ #(
+ 'Müller'
+ 'Miller'
+ 'Mueller'
+ 'Mühler'
+ 'Mühlherr'
+ 'Mülherr'
+ 'Myler'
+ 'Millar'
+ 'Myller'
+ 'Müllar'
+ 'Müler'
+ 'Muehler'
+ 'Mülller'
+ 'Müllerr'
+ 'Muehlherr'
+ 'Muellar'
+ 'Mueler'
+ 'Mülleer'
+ 'Mueller'
+ 'Nüller'
+ 'Nyller'
+ 'Niler'
+ 'Czerny'
+ 'Tscherny'
+ 'Czernie'
+ 'Tschernie'
+ 'Schernie'
+ 'Scherny'
+ 'Scherno'
+ 'Czerne'
+ 'Zerny'
+ 'Tzernie'
+ 'Breschnew'
+ ) do:[:w |
+ Transcript show:w; show:'->'; showCR:(PhoneticStringUtilities koelnerPhoneticCodeOf:w)
+ ].
+ "
+
+ "
+ PhoneticStringUtilities koelnerPhoneticCodeOf:'Breschnew' -> '17863'
+ PhoneticStringUtilities koelnerPhoneticCodeOf:'Breschneff' -> '17863'
+ PhoneticStringUtilities koelnerPhoneticCodeOf:'Braeschneff' -> '17863'
+ PhoneticStringUtilities koelnerPhoneticCodeOf:'Braessneff' -> '17863'
+ PhoneticStringUtilities koelnerPhoneticCodeOf:'Pressneff' -> '17863'
+ PhoneticStringUtilities koelnerPhoneticCodeOf:'Presznäph' -> '17863'
+ "
+!
+
+soundexCodeOf:aString
+ "return a soundex phonetic code or nil.
+ Soundex returns similar codes for similar sounding words, making it a useful
+ tool when searching for words where the correct spelling is unknown.
+ (read Knuth or search the web if you dont know what a soundex code is).
+ Caveat: 'similar sounding words' means: 'similar sounding in english'."
+
+ |inStream codeStream ch last lch codeLength codes sc|
+
+ inStream := aString readStream.
+ inStream skipSeparators.
+ inStream atEnd ifTrue:[
+ ^ nil
+ ].
+
+ ch := inStream next.
+ ch isLetter ifFalse:[
+ ^ nil
+ ].
+ codeLength := 0.
+
+ codes := Dictionary new.
+ codes atAll:'bpfv' put:$1.
+ codes atAll:'cskgjqxz' put:$2.
+ codes atAll:'dt' put:$3.
+ codes atAll:'l' put:$4.
+ codes atAll:'nm' put:$5.
+ codes atAll:'r' put:$6.
+
+ codeStream := WriteStream on:(String new:4).
+ codeStream nextPut:(ch asUppercase).
+
+ [inStream atEnd] whileFalse:[
+ ch := inStream next.
+ lch := ch asLowercase.
+ lch = last ifFalse:[
+ last := lch.
+
+ sc := codes at:ch ifAbsent:nil.
+ sc notNil ifTrue:[
+ codeLength < 3 ifTrue:[
+ codeStream nextPut:sc.
+ codeLength := codeLength + 1.
+ ]
+ ] ifFalse:[
+"/ ch isLetter ifFalse:[
+"/ "/ something else - ignore it
+"/ ] ifTrue:[
+"/ "/ else its a vowel and we ignore it
+"/ ]
+ ].
+ ]
+ ].
+ [ codeLength < 3 ] whileTrue:[
+ codeStream nextPut:$0.
+ codeLength := codeLength + 1.
+ ].
+
+ ^ codeStream contents
+
+ "
+ PhoneticStringUtilities soundexCodeOf:'claus'
+ PhoneticStringUtilities soundexCodeOf:'clause'
+ PhoneticStringUtilities soundexCodeOf:'close'
+ PhoneticStringUtilities soundexCodeOf:'smalltalk'
+ PhoneticStringUtilities soundexCodeOf:'smaltalk'
+ PhoneticStringUtilities soundexCodeOf:'smaltak'
+ PhoneticStringUtilities soundexCodeOf:'smaltok'
+ PhoneticStringUtilities soundexCodeOf:'smoltok'
+ PhoneticStringUtilities soundexCodeOf:'aa'
+ PhoneticStringUtilities soundexCodeOf:'by'
+ PhoneticStringUtilities soundexCodeOf:'bab'
+ PhoneticStringUtilities soundexCodeOf:'bob'
+ PhoneticStringUtilities soundexCodeOf:'bop'
+ "
+! !
+
+!PhoneticStringUtilities class methodsFor:'documentation'!
+
+version
+ ^ '$Header: /cvs/stx/stx/libbasic2/PhoneticStringUtilities.st,v 1.1 2009-08-10 13:35:04 cg Exp $'
+! !