# HG changeset patch # User Claus Gittinger # Date 1249911304 -7200 # Node ID 33e71ed6cf32dc23441aa3714310b1a16d4eb056 # Parent 9eecfefd08aa570115237c09710b770cfb802dff initial checkin diff -r 9eecfefd08aa -r 33e71ed6cf32 PhoneticStringUtilities.st --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PhoneticStringUtilities.st Mon Aug 10 15:35:04 2009 +0200 @@ -0,0 +1,320 @@ +" + COPYRIGHT (c) 1994 by Claus Gittinger + COPYRIGHT (c) 2009 by eXept Software AG + All Rights Reserved + + This software is furnished under a license and may be used + only in accordance with the terms of that license and with the + inclusion of the above copyright notice. This software may not + be provided or otherwise made available to, or used by, any + other person. No title to or ownership of the software is + hereby transferred. +" +"{ Package: 'stx:libbasic2' }" + +Object subclass:#PhoneticStringUtilities + instanceVariableNames:'' + classVariableNames:'' + poolDictionaries:'' + category:'Collections-Text-Support' +! + +!PhoneticStringUtilities class methodsFor:'documentation'! + +copyright +" + COPYRIGHT (c) 1994 by Claus Gittinger + COPYRIGHT (c) 2009 by eXept Software AG + All Rights Reserved + + This software is furnished under a license and may be used + only in accordance with the terms of that license and with the + inclusion of the above copyright notice. This software may not + be provided or otherwise made available to, or used by, any + other person. No title to or ownership of the software is + hereby transferred. +" +! + +documentation +" + the soundexCode algorithm was originally contained in the CharacterArray class; + the koelnerPhoneticCode provides a similar functionality, which is more tuned towards the + German language. Doublemetaphone works with most european languages. +" +! ! + +!PhoneticStringUtilities class methodsFor:'classification'! + +isSlavoGermanic:aString + ^ #('w' 'k' 'cz' 'witz') contains:[:sub | aString includesString:sub] + + " + self isSlavoGermanic:'walter' + " +! ! + +!PhoneticStringUtilities class methodsFor:'phonetic codes'! + +koelnerPhoneticCodeOf:aString + "return a koelner phonetic code. + The koelnerPhonetic code is for the german language what the soundex code is for english; + it returns simular strings for similar sounding words. + There are some differences to soundex, though: + its length is not limited to 4, but depends on the length of the original string; + it does not start with the first character of the input." + + |in ret val convFirst convRest| + + convFirst := [:chars | + |pair| + + pair := #( + ('#a#' '0') + ('#e#' '0') + ('#i#' '0') + ('#j#' '0') + ('#y#' '0') + ('#o#' '0') + ('#u#' '0') + + ('#ca' '4') + ('#ch' '4') + ('#ck' '4') + ('#cl' '4') + ('#co' '4') + ('#cq' '4') + ('#cr' '4') + ('#cu' '4') + ('#cx' '4') + + ('#c#' '8') + ) detect:[:pair | (pair first match:chars) ] ifNone:nil. + + pair notNil ifTrue:[ pair second ] ifFalse:[ convRest value:chars ] + ]. + + convRest := [:chars | + |pair| + + pair := #( + ('#ds' '8') + ('#dc' '8') + ('#dz' '8') + ('#ts' '8') + ('#tc' '8') + ('#tz' '8') + ('#d#' '2') + ('#t#' '2') + ('cx#' '8') + ('kx#' '8') + ('qx#' '8') + ('#x#' '48') + ('sc#' '8') + ('sz#' '8') + ('#ca' '4') + ('#co' '4') + ('#cu' '4') + ('#ch' '4') + ('#ck' '4') + ('#cx' '4') + ('#cq' '4') + ('#c#' '8') + ('#a#' '0') + ('#e#' '0') + ('#i#' '0') + ('#j#' '0') + ('#y#' '0') + ('#o#' '0') + ('#u#' '0') + ('#h#' '-') + ('#l#' '5') + ('#r#' '7') + ('#m#' '6') + ('#n#' '6') + ('#s#' '8') + ('#z#' '8') + ('#b#' '1') + ('#p#' '1') + ('#f#' '3') + ('#v#' '3') + ('#w#' '3') + ('#g#' '4') + ('#k#' '4') + ('#q#' '4') + ('###' '?') + ) detect:[:pair | (pair first match:chars) ]. + + pair second + ]. + + in := aString withoutSeparators asLowercase. + in := in copyReplaceString:'ph' withString:'f'. + in := in copyReplaceAll:$ü withAll:'u'. + in := in copyReplaceAll:$ä withAll:'a'. + in := in copyReplaceAll:$ö withAll:'o'. + in := in copyReplaceAll:$ß withAll:'ss'. + in := '#',in,'#'. + + ret := ''. + 1 to:in size-2 do:[:i | + |sub| + + sub := in copyFrom:i to:i+2. + val := (i==1) + ifTrue:[ convFirst value:sub ] + ifFalse:[ convRest value:sub ]. + ret := ret,val + ]. + + ret := ret select:[:ch | ch ~= $-]. + + (ret startsWith:'0') ifTrue:[ + ret := '0',(ret select:[:ch | ch ~= $0]). + ] ifFalse:[ + ret := ret select:[:ch | ch ~= $0]. + ]. + + ^ String streamContents:[:s | + |prev| + + ret do:[:ch | + ch ~= prev ifTrue:[ + s nextPut:ch + ]. + prev := ch. + ]. + ]. + + " + #( + 'Müller' + 'Miller' + 'Mueller' + 'Mühler' + 'Mühlherr' + 'Mülherr' + 'Myler' + 'Millar' + 'Myller' + 'Müllar' + 'Müler' + 'Muehler' + 'Mülller' + 'Müllerr' + 'Muehlherr' + 'Muellar' + 'Mueler' + 'Mülleer' + 'Mueller' + 'Nüller' + 'Nyller' + 'Niler' + 'Czerny' + 'Tscherny' + 'Czernie' + 'Tschernie' + 'Schernie' + 'Scherny' + 'Scherno' + 'Czerne' + 'Zerny' + 'Tzernie' + 'Breschnew' + ) do:[:w | + Transcript show:w; show:'->'; showCR:(PhoneticStringUtilities koelnerPhoneticCodeOf:w) + ]. + " + + " + PhoneticStringUtilities koelnerPhoneticCodeOf:'Breschnew' -> '17863' + PhoneticStringUtilities koelnerPhoneticCodeOf:'Breschneff' -> '17863' + PhoneticStringUtilities koelnerPhoneticCodeOf:'Braeschneff' -> '17863' + PhoneticStringUtilities koelnerPhoneticCodeOf:'Braessneff' -> '17863' + PhoneticStringUtilities koelnerPhoneticCodeOf:'Pressneff' -> '17863' + PhoneticStringUtilities koelnerPhoneticCodeOf:'Presznäph' -> '17863' + " +! + +soundexCodeOf:aString + "return a soundex phonetic code or nil. + Soundex returns similar codes for similar sounding words, making it a useful + tool when searching for words where the correct spelling is unknown. + (read Knuth or search the web if you dont know what a soundex code is). + Caveat: 'similar sounding words' means: 'similar sounding in english'." + + |inStream codeStream ch last lch codeLength codes sc| + + inStream := aString readStream. + inStream skipSeparators. + inStream atEnd ifTrue:[ + ^ nil + ]. + + ch := inStream next. + ch isLetter ifFalse:[ + ^ nil + ]. + codeLength := 0. + + codes := Dictionary new. + codes atAll:'bpfv' put:$1. + codes atAll:'cskgjqxz' put:$2. + codes atAll:'dt' put:$3. + codes atAll:'l' put:$4. + codes atAll:'nm' put:$5. + codes atAll:'r' put:$6. + + codeStream := WriteStream on:(String new:4). + codeStream nextPut:(ch asUppercase). + + [inStream atEnd] whileFalse:[ + ch := inStream next. + lch := ch asLowercase. + lch = last ifFalse:[ + last := lch. + + sc := codes at:ch ifAbsent:nil. + sc notNil ifTrue:[ + codeLength < 3 ifTrue:[ + codeStream nextPut:sc. + codeLength := codeLength + 1. + ] + ] ifFalse:[ +"/ ch isLetter ifFalse:[ +"/ "/ something else - ignore it +"/ ] ifTrue:[ +"/ "/ else its a vowel and we ignore it +"/ ] + ]. + ] + ]. + [ codeLength < 3 ] whileTrue:[ + codeStream nextPut:$0. + codeLength := codeLength + 1. + ]. + + ^ codeStream contents + + " + PhoneticStringUtilities soundexCodeOf:'claus' + PhoneticStringUtilities soundexCodeOf:'clause' + PhoneticStringUtilities soundexCodeOf:'close' + PhoneticStringUtilities soundexCodeOf:'smalltalk' + PhoneticStringUtilities soundexCodeOf:'smaltalk' + PhoneticStringUtilities soundexCodeOf:'smaltak' + PhoneticStringUtilities soundexCodeOf:'smaltok' + PhoneticStringUtilities soundexCodeOf:'smoltok' + PhoneticStringUtilities soundexCodeOf:'aa' + PhoneticStringUtilities soundexCodeOf:'by' + PhoneticStringUtilities soundexCodeOf:'bab' + PhoneticStringUtilities soundexCodeOf:'bob' + PhoneticStringUtilities soundexCodeOf:'bop' + " +! ! + +!PhoneticStringUtilities class methodsFor:'documentation'! + +version + ^ '$Header: /cvs/stx/stx/libbasic2/PhoneticStringUtilities.st,v 1.1 2009-08-10 13:35:04 cg Exp $' +! !