PhoneticStringUtilities.st
changeset 2211 42fe8fe39e9c
parent 2210 9c428fe51c78
child 2213 d465fa29df0e
equal deleted inserted replaced
2210:9c428fe51c78 2211:42fe8fe39e9c
    24 	classVariableNames:''
    24 	classVariableNames:''
    25 	poolDictionaries:''
    25 	poolDictionaries:''
    26 	privateIn:PhoneticStringUtilities
    26 	privateIn:PhoneticStringUtilities
    27 !
    27 !
    28 
    28 
       
    29 PhoneticStringUtilities::PhoneticStringComparator subclass:#ExtendedSoundexStringComparator
       
    30 	instanceVariableNames:''
       
    31 	classVariableNames:'CharacterTranslationDict'
       
    32 	poolDictionaries:''
       
    33 	privateIn:PhoneticStringUtilities
       
    34 !
       
    35 
    29 PhoneticStringUtilities::PhoneticStringComparator subclass:#KoelnerPhoneticCodeStringComparator
    36 PhoneticStringUtilities::PhoneticStringComparator subclass:#KoelnerPhoneticCodeStringComparator
    30 	instanceVariableNames:''
    37 	instanceVariableNames:''
    31 	classVariableNames:'CharacterTranslationDict'
    38 	classVariableNames:'CharacterTranslationDict'
    32 	poolDictionaries:''
    39 	poolDictionaries:''
    33 	privateIn:PhoneticStringUtilities
    40 	privateIn:PhoneticStringUtilities
    48 !
    55 !
    49 
    56 
    50 Object subclass:#NYSIISStringComparator
    57 Object subclass:#NYSIISStringComparator
    51 	instanceVariableNames:''
    58 	instanceVariableNames:''
    52 	classVariableNames:''
    59 	classVariableNames:''
       
    60 	poolDictionaries:''
       
    61 	privateIn:PhoneticStringUtilities
       
    62 !
       
    63 
       
    64 PhoneticStringUtilities::PhoneticStringComparator subclass:#PhonemStringComparator
       
    65 	instanceVariableNames:''
       
    66 	classVariableNames:'CharacterTranslationDict'
    53 	poolDictionaries:''
    67 	poolDictionaries:''
    54 	privateIn:PhoneticStringUtilities
    68 	privateIn:PhoneticStringUtilities
    55 !
    69 !
    56 
    70 
    57 PhoneticStringUtilities::PhoneticStringComparator subclass:#DoubleMetaphoneStringComparator
    71 PhoneticStringUtilities::PhoneticStringComparator subclass:#DoubleMetaphoneStringComparator
   103     koelner phoneticCode 
   117     koelner phoneticCode 
   104         provides a functionality similar to soundex, but much more tuned towards the German language
   118         provides a functionality similar to soundex, but much more tuned towards the German language
   105 
   119 
   106     Double metaphone 
   120     Double metaphone 
   107         works with most european languages.
   121         works with most european languages.
       
   122 
       
   123     phonem
       
   124         described in Georg Wilde and Carsten Meyer, 'Doppelgaenger gesucht - Ein Programm fuer kontextsensitive phonetische Textumwandlung'
       
   125         from 'ct Magazin fuer Computer & Technik 25/1999'.
       
   126 
       
   127     More info for german readers is found in:
       
   128         http://www.uni-koeln.de/phil-fak/phonetik/Lehre/MA-Arbeiten/magister_wilz.pdf
       
   129 "
       
   130 !
       
   131 
       
   132 sampleData
       
   133 "
       
   134     for the 50 most common german names, we get:
       
   135 
       
   136                             ext. 
       
   137     name        soundex   soundex   metaphone   phonet  phonet2     phonix      daitsch phonem      koeln
       
   138 
       
   139     müller      M460    54600000    MLR         MÜLA    NILA        M4000000    689000  MYLR        657
       
   140     schmidt     S253    25300000    SKMTT       SHMIT   ZNIT        S5300000    463000  CMYD        8628
       
   141     schneider   S253    25360000    SKNTR       SHNEIDA ZNEITA      S5300000    463900  CNAYDR      8627
       
   142     fischer     F260    12600000    FSKR        FISHA   FIZA        F8000000    749000  VYCR        387
       
   143     weber       W160    16000000    WBR         WEBA    FEBA        $1000000    779000  VBR         317
       
   144     meyer       M600    56000000    MYR         MEIA    NEIA        M0000000    619000  MAYR        67
       
   145     wagner      W256    25600000    WKNR        WAKNA   FAKNA       $2500000    756900  VACNR       367
       
   146     schulz      S242    24200000    SKLS        SHULS   ZULZ        S4800000    484000  CULC        85
       
   147     becker      B260    12600000    BKR         BEKA    BEKA        B2000000    759000  BCR         147
       
   148     hoffmann    H155    15500000    HFMN        HOFMAN  UFNAN       $7550000    576600  OVMAN       036
       
   149     schäfer     S216    21600000    SKFR        SHEFA   ZEFA        S7000000    479000  CVR         837
   108 "
   150 "
   109 ! !
   151 ! !
   110 
   152 
   111 !PhoneticStringUtilities class methodsFor:'classification'!
   153 !PhoneticStringUtilities class methodsFor:'classification'!
   112 
   154 
   367     "/ please change as required (and remove this comment)
   409     "/ please change as required (and remove this comment)
   368 
   410 
   369     "/ super initialize.   -- commented since inherited method does nothing
   411     "/ super initialize.   -- commented since inherited method does nothing
   370 ! !
   412 ! !
   371 
   413 
       
   414 !PhoneticStringUtilities::ExtendedSoundexStringComparator class methodsFor:'documentation'!
       
   415 
       
   416 documentation
       
   417 "
       
   418     There are many extended and enhanced soundex variants around;
       
   419     here is one, called 'extended soundex'. It is destribed for example in
       
   420     http://www.epidata.dk/documentation.php.
       
   421     An author or origin is unknown.
       
   422 
       
   423     The number of digits is increased to 5 or 8;
       
   424     The first character is not used literally; instead it is encoded like the rest.
       
   425     This might have a negative effect on names starting with a vovel, though.
       
   426 
       
   427     Overall, it can be doubted if this is really an enhancement after all.
       
   428 "
       
   429 ! !
       
   430 
       
   431 !PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'api'!
       
   432 
       
   433 phoneticStringsFor:aString
       
   434     "generates both an extended soundex of length 5 and one of length 8"
       
   435 
       
   436     |first second u t prevCode|
       
   437 
       
   438     u := aString asUppercase.
       
   439     first := second := ''.
       
   440     u do:[:c | 
       
   441         t := self translate:c.
       
   442         (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
       
   443             first := first , t.
       
   444             second := second , t.
       
   445             second size == 8 ifTrue:[
       
   446                 ^ Array with:(first copyTo:5) with:second 
       
   447             ].
       
   448         ].
       
   449         prevCode := t
       
   450     ].
       
   451     [ first size < 5 ] whileTrue:[
       
   452         first := first , '0'.
       
   453         second := second , '0'.
       
   454     ].
       
   455     [ second size < 8 ] whileTrue:[
       
   456         second := second , '0'
       
   457     ].
       
   458     ^ Array with:first with:second
       
   459 
       
   460     "
       
   461      self basicNew phoneticStringsFor:'müller'  #('87900' '87900000')  
       
   462      self basicNew phoneticStringsFor:'miller'  #('87900' '87900000')   
       
   463      self basicNew phoneticStringsFor:'muller'  #('87900' '87900000')    
       
   464      self basicNew phoneticStringsFor:'muler'   #('87900' '87900000')
       
   465      self basicNew phoneticStringsFor:'schmidt'    #('38600' '38600000')
       
   466      self basicNew phoneticStringsFor:'schneider'  #('38690' '38690000')
       
   467      self basicNew phoneticStringsFor:'fischer'    #('23900' '23900000')
       
   468      self basicNew phoneticStringsFor:'weber'      #('19000' '19000000')
       
   469      self basicNew phoneticStringsFor:'meyer'      #('89000' '89000000')
       
   470      self basicNew phoneticStringsFor:'wagner'     #('48900' '48900000')
       
   471      self basicNew phoneticStringsFor:'schulz'     #('37500' '37500000')
       
   472      self basicNew phoneticStringsFor:'becker'     #('13900' '13900000')
       
   473      self basicNew phoneticStringsFor:'hoffmann'   #('28800' '28800000')
       
   474      self basicNew phoneticStringsFor:'schäfer'    #('32900' '32900000')
       
   475     "
       
   476 ! !
       
   477 
       
   478 !PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'private'!
       
   479 
       
   480 translate:aCharacter
       
   481     "use simple if's for more speed when compiled"
       
   482 
       
   483     "vowels serve as separators"
       
   484     aCharacter == $A ifTrue:[^ '0' ].         
       
   485     aCharacter == $E ifTrue:[^ '0' ].
       
   486     aCharacter == $I ifTrue:[^ '0' ].
       
   487     aCharacter == $O ifTrue:[^ '0' ].
       
   488     aCharacter == $U ifTrue:[^ '0' ].
       
   489     aCharacter == $Y ifTrue:[^ '0' ].
       
   490 
       
   491     aCharacter == $B ifTrue:[^ '1' ]. 
       
   492     aCharacter == $P ifTrue:[^ '1' ].
       
   493 
       
   494     aCharacter == $F ifTrue:[^ '2' ]. 
       
   495     aCharacter == $V ifTrue:[^ '2' ]. 
       
   496 
       
   497     aCharacter == $C ifTrue:[^ '3' ]. 
       
   498     aCharacter == $S ifTrue:[^ '3' ]. 
       
   499     aCharacter == $K ifTrue:[^ '3' ].
       
   500 
       
   501     aCharacter == $G ifTrue:[^ '4' ]. 
       
   502     aCharacter == $J ifTrue:[^ '4' ].
       
   503 
       
   504     aCharacter == $Q ifTrue:[^ '5' ]. 
       
   505     aCharacter == $X ifTrue:[^ '5' ]. 
       
   506     aCharacter == $Z ifTrue:[^ '5' ]. 
       
   507 
       
   508     aCharacter == $D ifTrue:[^ '6' ]. 
       
   509     aCharacter == $G ifTrue:[^ '6' ]. 
       
   510     aCharacter == $T ifTrue:[^ '6' ]. 
       
   511 
       
   512     aCharacter == $L ifTrue:[^ '7' ]. 
       
   513 
       
   514     aCharacter == $M ifTrue:[^ '8' ]. 
       
   515     aCharacter == $N ifTrue:[^ '8' ]. 
       
   516 
       
   517     aCharacter == $R ifTrue:[^ '9' ]. 
       
   518     ^ nil
       
   519 ! !
       
   520 
   372 !PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator class methodsFor:'documentation'!
   521 !PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator class methodsFor:'documentation'!
   373 
   522 
   374 documentation
   523 documentation
   375 "
   524 "
   376      The koelnerPhonetic code is for the german language what the soundex code is for english.
   525      The koelnerPhonetic code is for the german language what the soundex code is for english.
   482      PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Braeschneff' -> '17863'
   631      PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Braeschneff' -> '17863'
   483      PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Braessneff' -> '17863'
   632      PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Braessneff' -> '17863'
   484      PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Pressneff' -> '17863'
   633      PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Pressneff' -> '17863'
   485      PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Presznäph' -> '17863'
   634      PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Presznäph' -> '17863'
   486      PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Präschnäf' -> '17863'
   635      PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Präschnäf' -> '17863'
       
   636     "
       
   637     "
       
   638      self basicNew phoneticStringsFor:'müller'      #('657')    
       
   639      self basicNew phoneticStringsFor:'möller'      #('657')
       
   640      self basicNew phoneticStringsFor:'miller'      #('657')     
       
   641      self basicNew phoneticStringsFor:'muller'      #('657')
       
   642      self basicNew phoneticStringsFor:'muler'       #('657')
       
   643      self basicNew phoneticStringsFor:'schmidt'     #('862')   
       
   644      self basicNew phoneticStringsFor:'schneider'   #('8627') 
       
   645      self basicNew phoneticStringsFor:'fischer'     #('387') 
       
   646      self basicNew phoneticStringsFor:'weber'       #('317') 
       
   647      self basicNew phoneticStringsFor:'meyer'       #('67') 
       
   648      self basicNew phoneticStringsFor:'wagner'      #('3467') 
       
   649      self basicNew phoneticStringsFor:'schulz'      #('858')
       
   650      self basicNew phoneticStringsFor:'becker'      #('147')
       
   651      self basicNew phoneticStringsFor:'hoffmann'    #('036')
       
   652      self basicNew phoneticStringsFor:'schäfer'     #('837') 
   487     "
   653     "
   488 ! !
   654 ! !
   489 
   655 
   490 !PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator methodsFor:'private'!
   656 !PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator methodsFor:'private'!
   491 
   657 
  1166                     of:answer
  1332                     of:answer
  1167                     to:replacementString
  1333                     to:replacementString
  1168                     startingAt:(answer size - aString size) + 1
  1334                     startingAt:(answer size - aString size) + 1
  1169     ].
  1335     ].
  1170     ^ answer
  1336     ^ answer
       
  1337 ! !
       
  1338 
       
  1339 !PhoneticStringUtilities::PhonemStringComparator class methodsFor:'documentation'!
       
  1340 
       
  1341 documentation
       
  1342 "
       
  1343 Implementation of the PHONEM algorithm, as described in
       
  1344 'Georg Wilde and Carsten Meyer, Doppelgaenger gesucht -
       
  1345 Ein Programm fuer kontextsensitive phonetische Textumwandlung
       
  1346 ct Magazin fuer Computer & Technik 25/1998'
       
  1347 "
       
  1348 ! !
       
  1349 
       
  1350 !PhoneticStringUtilities::PhonemStringComparator methodsFor:'api'!
       
  1351 
       
  1352 phoneticStringsFor:aString 
       
  1353     |s idx t t2|
       
  1354 
       
  1355     s := aString asUppercase.
       
  1356 
       
  1357     idx := 1.
       
  1358     [idx < (s size-1)] whileTrue:[
       
  1359         t2 := nil.
       
  1360         t := s copyFrom:idx to:idx+1.
       
  1361         t = 'SC' ifTrue:[ t2 := 'C' ]
       
  1362         ifFalse:[ t = 'SZ' ifTrue:[ t2 := 'C' ]
       
  1363         ifFalse:[ t = 'CZ' ifTrue:[ t2 := 'C' ]
       
  1364         ifFalse:[ t = 'TZ' ifTrue:[ t2 := 'C' ]
       
  1365         ifFalse:[ t = 'TS' ifTrue:[ t2 := 'C' ]
       
  1366         ifFalse:[ t = 'KS' ifTrue:[ t2 := 'X' ]
       
  1367         ifFalse:[ t = 'PF' ifTrue:[ t2 := 'V' ]
       
  1368         ifFalse:[ t = 'QU' ifTrue:[ t2 := 'KW' ]
       
  1369         ifFalse:[ t = 'PH' ifTrue:[ t2 := 'V' ]
       
  1370         ifFalse:[ t = 'UE' ifTrue:[ t2 := 'Y' ]
       
  1371         ifFalse:[ t = 'AE' ifTrue:[ t2 := 'E' ]
       
  1372         ifFalse:[ t = 'OE' ifTrue:[ t2 := 'Ö' ]
       
  1373         ifFalse:[ t = 'EI' ifTrue:[ t2 := 'AY' ]
       
  1374         ifFalse:[ t = 'EY' ifTrue:[ t2 := 'AY' ]
       
  1375         ifFalse:[ t = 'EU' ifTrue:[ t2 := 'OY' ]
       
  1376         ifFalse:[ t = 'AU' ifTrue:[ t2 := 'A§' ]
       
  1377         ifFalse:[ t = 'OU' ifTrue:[ t2 := '§ ' ]]]]]]]]]]]]]]]]].
       
  1378         t2 notNil ifTrue:[
       
  1379             s := (s copyTo:idx-1),t2,(s copyFrom:idx+2)
       
  1380         ] ifFalse:[
       
  1381             idx := idx + 1.
       
  1382         ].
       
  1383     ].
       
  1384 
       
  1385     "/ single character substitutions via tr
       
  1386     s := s copyTransliterating:'ÖÄZKGQÜIJFWPT§' to:'YECCCCYYYVVDDUA'.
       
  1387     s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'' complement:true squashDuplicates:false.
       
  1388     s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'ABCDLMNORSUVWXY' complement:false squashDuplicates:true.
       
  1389     ^ Array with:s
       
  1390 
       
  1391     "
       
  1392      self basicNew phoneticStringsFor:'müller'  #('MYLR')    
       
  1393      self basicNew phoneticStringsFor:'möller'  #('MYLR')
       
  1394      self basicNew phoneticStringsFor:'miller'  #('MYLR')     
       
  1395      self basicNew phoneticStringsFor:'muller'  #('MULR') 
       
  1396      self basicNew phoneticStringsFor:'muler'   #('MULR') 
       
  1397      self basicNew phoneticStringsFor:'schmidt'     #('CMYD')
       
  1398      self basicNew phoneticStringsFor:'schneider'   #('CNAYDR')
       
  1399      self basicNew phoneticStringsFor:'fischer'     #('VYCR')
       
  1400      self basicNew phoneticStringsFor:'weber'       #('VBR')
       
  1401      self basicNew phoneticStringsFor:'meyer'       #('MAYR')
       
  1402      self basicNew phoneticStringsFor:'wagner'      #('VACNR')
       
  1403      self basicNew phoneticStringsFor:'schulz'      #('CULC')
       
  1404      self basicNew phoneticStringsFor:'becker'      #('BCR')
       
  1405      self basicNew phoneticStringsFor:'hoffmann'    #('OVMAN')
       
  1406      self basicNew phoneticStringsFor:'schäfer'     #('CVR')
       
  1407     "
  1171 ! !
  1408 ! !
  1172 
  1409 
  1173 !PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'LICENSE'!
  1410 !PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'LICENSE'!
  1174 
  1411 
  1175 copyright
  1412 copyright
  2663 ! !
  2900 ! !
  2664 
  2901 
  2665 !PhoneticStringUtilities class methodsFor:'documentation'!
  2902 !PhoneticStringUtilities class methodsFor:'documentation'!
  2666 
  2903 
  2667 version
  2904 version
  2668     ^ '$Header: /cvs/stx/stx/libbasic2/PhoneticStringUtilities.st,v 1.5 2009-08-11 16:50:07 cg Exp $'
  2905     ^ '$Header: /cvs/stx/stx/libbasic2/PhoneticStringUtilities.st,v 1.6 2009-08-12 18:26:40 cg Exp $'
  2669 ! !
  2906 ! !