initial checkin
authorClaus Gittinger <cg@exept.de>
Mon, 10 Aug 2009 15:35:04 +0200
changeset 2197 33e71ed6cf32
parent 2196 9eecfefd08aa
child 2198 c2363f48b40b
initial checkin
PhoneticStringUtilities.st
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PhoneticStringUtilities.st	Mon Aug 10 15:35:04 2009 +0200
@@ -0,0 +1,320 @@
+"
+ COPYRIGHT (c) 1994 by Claus Gittinger
+ COPYRIGHT (c) 2009 by eXept Software AG
+              All Rights Reserved
+
+ This software is furnished under a license and may be used
+ only in accordance with the terms of that license and with the
+ inclusion of the above copyright notice.   This software may not
+ be provided or otherwise made available to, or used by, any
+ other person.  No title to or ownership of the software is
+ hereby transferred.
+"
+"{ Package: 'stx:libbasic2' }"
+
+Object subclass:#PhoneticStringUtilities
+	instanceVariableNames:''
+	classVariableNames:''
+	poolDictionaries:''
+	category:'Collections-Text-Support'
+!
+
+!PhoneticStringUtilities class methodsFor:'documentation'!
+
+copyright
+"
+ COPYRIGHT (c) 1994 by Claus Gittinger
+ COPYRIGHT (c) 2009 by eXept Software AG
+              All Rights Reserved
+
+ This software is furnished under a license and may be used
+ only in accordance with the terms of that license and with the
+ inclusion of the above copyright notice.   This software may not
+ be provided or otherwise made available to, or used by, any
+ other person.  No title to or ownership of the software is
+ hereby transferred.
+"
+!
+
+documentation
+"
+    the soundexCode algorithm was originally contained in the CharacterArray class;
+    the koelnerPhoneticCode provides a similar functionality, which is more tuned towards the
+    German language. Doublemetaphone works with most european languages.
+"
+! !
+
+!PhoneticStringUtilities class methodsFor:'classification'!
+
+isSlavoGermanic:aString
+    ^ #('w' 'k' 'cz' 'witz') contains:[:sub | aString includesString:sub]
+
+    "
+     self isSlavoGermanic:'walter'
+    "
+! !
+
+!PhoneticStringUtilities class methodsFor:'phonetic codes'!
+
+koelnerPhoneticCodeOf:aString
+    "return a koelner phonetic code.
+     The koelnerPhonetic code is for the german language what the soundex code is for english;
+     it returns simular strings for similar sounding words. 
+     There are some differences to soundex, though: 
+        its length is not limited to 4, but depends on the length of the original string;
+        it does not start with the first character of the input."
+
+    |in ret val convFirst convRest|
+
+    convFirst := [:chars |
+        |pair|
+
+        pair := #(
+                    ('#a#' '0')
+                    ('#e#' '0')
+                    ('#i#' '0')
+                    ('#j#' '0')
+                    ('#y#' '0')
+                    ('#o#' '0')
+                    ('#u#' '0')
+
+                    ('#ca' '4')
+                    ('#ch' '4')
+                    ('#ck' '4')
+                    ('#cl' '4')
+                    ('#co' '4')
+                    ('#cq' '4')
+                    ('#cr' '4')
+                    ('#cu' '4')
+                    ('#cx' '4')
+
+                    ('#c#' '8')
+                ) detect:[:pair | (pair first match:chars) ] ifNone:nil.
+
+        pair notNil ifTrue:[ pair second ] ifFalse:[ convRest value:chars ]
+    ].
+
+    convRest := [:chars |
+        |pair|
+
+        pair := #(
+                    ('#ds' '8')
+                    ('#dc' '8')
+                    ('#dz' '8')
+                    ('#ts' '8')
+                    ('#tc' '8')
+                    ('#tz' '8')
+                    ('#d#' '2')
+                    ('#t#' '2')
+                    ('cx#' '8')
+                    ('kx#' '8')
+                    ('qx#' '8')
+                    ('#x#' '48')
+                    ('sc#' '8')
+                    ('sz#' '8')
+                    ('#ca' '4')
+                    ('#co' '4')
+                    ('#cu' '4')
+                    ('#ch' '4')
+                    ('#ck' '4')
+                    ('#cx' '4')
+                    ('#cq' '4')
+                    ('#c#' '8')
+                    ('#a#' '0')
+                    ('#e#' '0')
+                    ('#i#' '0')
+                    ('#j#' '0')
+                    ('#y#' '0')
+                    ('#o#' '0')
+                    ('#u#' '0')
+                    ('#h#' '-')
+                    ('#l#' '5')
+                    ('#r#' '7')
+                    ('#m#' '6')
+                    ('#n#' '6')
+                    ('#s#' '8')
+                    ('#z#' '8')
+                    ('#b#' '1')
+                    ('#p#' '1')
+                    ('#f#' '3')
+                    ('#v#' '3')
+                    ('#w#' '3')
+                    ('#g#' '4')
+                    ('#k#' '4')
+                    ('#q#' '4')
+                    ('###' '?')
+                ) detect:[:pair | (pair first match:chars) ].
+
+        pair second
+    ].
+
+    in := aString withoutSeparators asLowercase.
+    in := in copyReplaceString:'ph' withString:'f'.
+    in := in copyReplaceAll:$ü withAll:'u'.
+    in := in copyReplaceAll:$ä withAll:'a'.
+    in := in copyReplaceAll:$ö withAll:'o'.
+    in := in copyReplaceAll:$ß withAll:'ss'.
+    in := '#',in,'#'.
+
+    ret := ''.
+    1 to:in size-2 do:[:i |
+        |sub|
+
+        sub := in copyFrom:i to:i+2.
+        val := (i==1) 
+                    ifTrue:[ convFirst value:sub ] 
+                    ifFalse:[ convRest value:sub ].
+        ret := ret,val
+    ].
+
+    ret := ret select:[:ch | ch ~= $-].
+
+    (ret startsWith:'0') ifTrue:[
+        ret := '0',(ret select:[:ch | ch ~= $0]).
+    ] ifFalse:[
+        ret := ret select:[:ch | ch ~= $0].
+    ].
+
+    ^ String streamContents:[:s |
+        |prev|
+
+        ret do:[:ch |
+            ch ~= prev ifTrue:[
+                s nextPut:ch
+            ].
+            prev := ch.
+        ].
+      ].
+
+    "
+     #(
+        'Müller'
+        'Miller'
+        'Mueller'
+        'Mühler'
+        'Mühlherr'
+        'Mülherr'
+        'Myler'
+        'Millar'
+        'Myller'
+        'Müllar'
+        'Müler'
+        'Muehler'
+        'Mülller'
+        'Müllerr'
+        'Muehlherr'
+        'Muellar'
+        'Mueler'
+        'Mülleer'
+        'Mueller'
+        'Nüller'
+        'Nyller'
+        'Niler'
+        'Czerny'
+        'Tscherny'
+        'Czernie'
+        'Tschernie'
+        'Schernie'
+        'Scherny'
+        'Scherno'
+        'Czerne'
+        'Zerny'
+        'Tzernie'
+        'Breschnew'
+     ) do:[:w |
+         Transcript show:w; show:'->'; showCR:(PhoneticStringUtilities koelnerPhoneticCodeOf:w)
+     ].
+    "
+
+    "
+     PhoneticStringUtilities koelnerPhoneticCodeOf:'Breschnew' -> '17863'
+     PhoneticStringUtilities koelnerPhoneticCodeOf:'Breschneff' -> '17863'
+     PhoneticStringUtilities koelnerPhoneticCodeOf:'Braeschneff' -> '17863'
+     PhoneticStringUtilities koelnerPhoneticCodeOf:'Braessneff' -> '17863'
+     PhoneticStringUtilities koelnerPhoneticCodeOf:'Pressneff' -> '17863'
+     PhoneticStringUtilities koelnerPhoneticCodeOf:'Presznäph' -> '17863'
+    "
+!
+
+soundexCodeOf:aString
+    "return a soundex phonetic code or nil.
+     Soundex returns similar codes for similar sounding words, making it a useful
+     tool when searching for words where the correct spelling is unknown.
+     (read Knuth or search the web if you dont know what a soundex code is).
+     Caveat: 'similar sounding words' means: 'similar sounding in english'."
+
+    |inStream codeStream ch last lch codeLength codes sc|
+
+    inStream := aString readStream.
+    inStream skipSeparators.
+    inStream atEnd ifTrue:[
+        ^ nil
+    ].
+
+    ch := inStream next.
+    ch isLetter ifFalse:[
+        ^ nil
+    ].
+    codeLength := 0.
+
+    codes := Dictionary new.
+    codes atAll:'bpfv'     put:$1.
+    codes atAll:'cskgjqxz' put:$2.
+    codes atAll:'dt'       put:$3.
+    codes atAll:'l'        put:$4.
+    codes atAll:'nm'       put:$5.
+    codes atAll:'r'        put:$6.
+
+    codeStream := WriteStream on:(String new:4).
+    codeStream nextPut:(ch asUppercase).
+
+    [inStream atEnd] whileFalse:[
+        ch := inStream next.
+        lch := ch asLowercase.
+        lch = last ifFalse:[
+            last := lch.
+
+            sc := codes at:ch ifAbsent:nil.
+            sc notNil ifTrue:[
+                codeLength < 3 ifTrue:[
+                    codeStream nextPut:sc.
+                    codeLength := codeLength + 1.
+                ]
+            ] ifFalse:[
+"/                ch isLetter ifFalse:[
+"/                    "/ something else - ignore it
+"/                ] ifTrue:[
+"/                    "/ else its a vowel and we ignore it
+"/                ]
+            ].
+        ]
+    ].
+    [ codeLength < 3 ] whileTrue:[
+        codeStream nextPut:$0.
+        codeLength := codeLength + 1.
+    ].
+
+    ^ codeStream contents
+
+    "
+     PhoneticStringUtilities soundexCodeOf:'claus'   
+     PhoneticStringUtilities soundexCodeOf:'clause'   
+     PhoneticStringUtilities soundexCodeOf:'close'   
+     PhoneticStringUtilities soundexCodeOf:'smalltalk' 
+     PhoneticStringUtilities soundexCodeOf:'smaltalk'  
+     PhoneticStringUtilities soundexCodeOf:'smaltak'   
+     PhoneticStringUtilities soundexCodeOf:'smaltok'   
+     PhoneticStringUtilities soundexCodeOf:'smoltok'   
+     PhoneticStringUtilities soundexCodeOf:'aa'        
+     PhoneticStringUtilities soundexCodeOf:'by'        
+     PhoneticStringUtilities soundexCodeOf:'bab'       
+     PhoneticStringUtilities soundexCodeOf:'bob'       
+     PhoneticStringUtilities soundexCodeOf:'bop'       
+    "
+! !
+
+!PhoneticStringUtilities class methodsFor:'documentation'!
+
+version
+    ^ '$Header: /cvs/stx/stx/libbasic2/PhoneticStringUtilities.st,v 1.1 2009-08-10 13:35:04 cg Exp $'
+! !