103 koelner phoneticCode |
117 koelner phoneticCode |
104 provides a functionality similar to soundex, but much more tuned towards the German language |
118 provides a functionality similar to soundex, but much more tuned towards the German language |
105 |
119 |
106 Double metaphone |
120 Double metaphone |
107 works with most european languages. |
121 works with most european languages. |
|
122 |
|
123 phonem |
|
124 described in Georg Wilde and Carsten Meyer, 'Doppelgaenger gesucht - Ein Programm fuer kontextsensitive phonetische Textumwandlung' |
|
125 from 'ct Magazin fuer Computer & Technik 25/1999'. |
|
126 |
|
127 More info for german readers is found in: |
|
128 http://www.uni-koeln.de/phil-fak/phonetik/Lehre/MA-Arbeiten/magister_wilz.pdf |
|
129 " |
|
130 ! |
|
131 |
|
132 sampleData |
|
133 " |
|
134 for the 50 most common german names, we get: |
|
135 |
|
136 ext. |
|
137 name soundex soundex metaphone phonet phonet2 phonix daitsch phonem koeln |
|
138 |
|
139 müller M460 54600000 MLR MÜLA NILA M4000000 689000 MYLR 657 |
|
140 schmidt S253 25300000 SKMTT SHMIT ZNIT S5300000 463000 CMYD 8628 |
|
141 schneider S253 25360000 SKNTR SHNEIDA ZNEITA S5300000 463900 CNAYDR 8627 |
|
142 fischer F260 12600000 FSKR FISHA FIZA F8000000 749000 VYCR 387 |
|
143 weber W160 16000000 WBR WEBA FEBA $1000000 779000 VBR 317 |
|
144 meyer M600 56000000 MYR MEIA NEIA M0000000 619000 MAYR 67 |
|
145 wagner W256 25600000 WKNR WAKNA FAKNA $2500000 756900 VACNR 367 |
|
146 schulz S242 24200000 SKLS SHULS ZULZ S4800000 484000 CULC 85 |
|
147 becker B260 12600000 BKR BEKA BEKA B2000000 759000 BCR 147 |
|
148 hoffmann H155 15500000 HFMN HOFMAN UFNAN $7550000 576600 OVMAN 036 |
|
149 schäfer S216 21600000 SKFR SHEFA ZEFA S7000000 479000 CVR 837 |
108 " |
150 " |
109 ! ! |
151 ! ! |
110 |
152 |
111 !PhoneticStringUtilities class methodsFor:'classification'! |
153 !PhoneticStringUtilities class methodsFor:'classification'! |
112 |
154 |
367 "/ please change as required (and remove this comment) |
409 "/ please change as required (and remove this comment) |
368 |
410 |
369 "/ super initialize. -- commented since inherited method does nothing |
411 "/ super initialize. -- commented since inherited method does nothing |
370 ! ! |
412 ! ! |
371 |
413 |
|
414 !PhoneticStringUtilities::ExtendedSoundexStringComparator class methodsFor:'documentation'! |
|
415 |
|
416 documentation |
|
417 " |
|
418 There are many extended and enhanced soundex variants around; |
|
419 here is one, called 'extended soundex'. It is destribed for example in |
|
420 http://www.epidata.dk/documentation.php. |
|
421 An author or origin is unknown. |
|
422 |
|
423 The number of digits is increased to 5 or 8; |
|
424 The first character is not used literally; instead it is encoded like the rest. |
|
425 This might have a negative effect on names starting with a vovel, though. |
|
426 |
|
427 Overall, it can be doubted if this is really an enhancement after all. |
|
428 " |
|
429 ! ! |
|
430 |
|
431 !PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'api'! |
|
432 |
|
433 phoneticStringsFor:aString |
|
434 "generates both an extended soundex of length 5 and one of length 8" |
|
435 |
|
436 |first second u t prevCode| |
|
437 |
|
438 u := aString asUppercase. |
|
439 first := second := ''. |
|
440 u do:[:c | |
|
441 t := self translate:c. |
|
442 (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[ |
|
443 first := first , t. |
|
444 second := second , t. |
|
445 second size == 8 ifTrue:[ |
|
446 ^ Array with:(first copyTo:5) with:second |
|
447 ]. |
|
448 ]. |
|
449 prevCode := t |
|
450 ]. |
|
451 [ first size < 5 ] whileTrue:[ |
|
452 first := first , '0'. |
|
453 second := second , '0'. |
|
454 ]. |
|
455 [ second size < 8 ] whileTrue:[ |
|
456 second := second , '0' |
|
457 ]. |
|
458 ^ Array with:first with:second |
|
459 |
|
460 " |
|
461 self basicNew phoneticStringsFor:'müller' #('87900' '87900000') |
|
462 self basicNew phoneticStringsFor:'miller' #('87900' '87900000') |
|
463 self basicNew phoneticStringsFor:'muller' #('87900' '87900000') |
|
464 self basicNew phoneticStringsFor:'muler' #('87900' '87900000') |
|
465 self basicNew phoneticStringsFor:'schmidt' #('38600' '38600000') |
|
466 self basicNew phoneticStringsFor:'schneider' #('38690' '38690000') |
|
467 self basicNew phoneticStringsFor:'fischer' #('23900' '23900000') |
|
468 self basicNew phoneticStringsFor:'weber' #('19000' '19000000') |
|
469 self basicNew phoneticStringsFor:'meyer' #('89000' '89000000') |
|
470 self basicNew phoneticStringsFor:'wagner' #('48900' '48900000') |
|
471 self basicNew phoneticStringsFor:'schulz' #('37500' '37500000') |
|
472 self basicNew phoneticStringsFor:'becker' #('13900' '13900000') |
|
473 self basicNew phoneticStringsFor:'hoffmann' #('28800' '28800000') |
|
474 self basicNew phoneticStringsFor:'schäfer' #('32900' '32900000') |
|
475 " |
|
476 ! ! |
|
477 |
|
478 !PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'private'! |
|
479 |
|
480 translate:aCharacter |
|
481 "use simple if's for more speed when compiled" |
|
482 |
|
483 "vowels serve as separators" |
|
484 aCharacter == $A ifTrue:[^ '0' ]. |
|
485 aCharacter == $E ifTrue:[^ '0' ]. |
|
486 aCharacter == $I ifTrue:[^ '0' ]. |
|
487 aCharacter == $O ifTrue:[^ '0' ]. |
|
488 aCharacter == $U ifTrue:[^ '0' ]. |
|
489 aCharacter == $Y ifTrue:[^ '0' ]. |
|
490 |
|
491 aCharacter == $B ifTrue:[^ '1' ]. |
|
492 aCharacter == $P ifTrue:[^ '1' ]. |
|
493 |
|
494 aCharacter == $F ifTrue:[^ '2' ]. |
|
495 aCharacter == $V ifTrue:[^ '2' ]. |
|
496 |
|
497 aCharacter == $C ifTrue:[^ '3' ]. |
|
498 aCharacter == $S ifTrue:[^ '3' ]. |
|
499 aCharacter == $K ifTrue:[^ '3' ]. |
|
500 |
|
501 aCharacter == $G ifTrue:[^ '4' ]. |
|
502 aCharacter == $J ifTrue:[^ '4' ]. |
|
503 |
|
504 aCharacter == $Q ifTrue:[^ '5' ]. |
|
505 aCharacter == $X ifTrue:[^ '5' ]. |
|
506 aCharacter == $Z ifTrue:[^ '5' ]. |
|
507 |
|
508 aCharacter == $D ifTrue:[^ '6' ]. |
|
509 aCharacter == $G ifTrue:[^ '6' ]. |
|
510 aCharacter == $T ifTrue:[^ '6' ]. |
|
511 |
|
512 aCharacter == $L ifTrue:[^ '7' ]. |
|
513 |
|
514 aCharacter == $M ifTrue:[^ '8' ]. |
|
515 aCharacter == $N ifTrue:[^ '8' ]. |
|
516 |
|
517 aCharacter == $R ifTrue:[^ '9' ]. |
|
518 ^ nil |
|
519 ! ! |
|
520 |
372 !PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator class methodsFor:'documentation'! |
521 !PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator class methodsFor:'documentation'! |
373 |
522 |
374 documentation |
523 documentation |
375 " |
524 " |
376 The koelnerPhonetic code is for the german language what the soundex code is for english. |
525 The koelnerPhonetic code is for the german language what the soundex code is for english. |
482 PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Braeschneff' -> '17863' |
631 PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Braeschneff' -> '17863' |
483 PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Braessneff' -> '17863' |
632 PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Braessneff' -> '17863' |
484 PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Pressneff' -> '17863' |
633 PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Pressneff' -> '17863' |
485 PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Presznäph' -> '17863' |
634 PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Presznäph' -> '17863' |
486 PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Präschnäf' -> '17863' |
635 PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new phoneticStringsFor:'Präschnäf' -> '17863' |
|
636 " |
|
637 " |
|
638 self basicNew phoneticStringsFor:'müller' #('657') |
|
639 self basicNew phoneticStringsFor:'möller' #('657') |
|
640 self basicNew phoneticStringsFor:'miller' #('657') |
|
641 self basicNew phoneticStringsFor:'muller' #('657') |
|
642 self basicNew phoneticStringsFor:'muler' #('657') |
|
643 self basicNew phoneticStringsFor:'schmidt' #('862') |
|
644 self basicNew phoneticStringsFor:'schneider' #('8627') |
|
645 self basicNew phoneticStringsFor:'fischer' #('387') |
|
646 self basicNew phoneticStringsFor:'weber' #('317') |
|
647 self basicNew phoneticStringsFor:'meyer' #('67') |
|
648 self basicNew phoneticStringsFor:'wagner' #('3467') |
|
649 self basicNew phoneticStringsFor:'schulz' #('858') |
|
650 self basicNew phoneticStringsFor:'becker' #('147') |
|
651 self basicNew phoneticStringsFor:'hoffmann' #('036') |
|
652 self basicNew phoneticStringsFor:'schäfer' #('837') |
487 " |
653 " |
488 ! ! |
654 ! ! |
489 |
655 |
490 !PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator methodsFor:'private'! |
656 !PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator methodsFor:'private'! |
491 |
657 |
1166 of:answer |
1332 of:answer |
1167 to:replacementString |
1333 to:replacementString |
1168 startingAt:(answer size - aString size) + 1 |
1334 startingAt:(answer size - aString size) + 1 |
1169 ]. |
1335 ]. |
1170 ^ answer |
1336 ^ answer |
|
1337 ! ! |
|
1338 |
|
1339 !PhoneticStringUtilities::PhonemStringComparator class methodsFor:'documentation'! |
|
1340 |
|
1341 documentation |
|
1342 " |
|
1343 Implementation of the PHONEM algorithm, as described in |
|
1344 'Georg Wilde and Carsten Meyer, Doppelgaenger gesucht - |
|
1345 Ein Programm fuer kontextsensitive phonetische Textumwandlung |
|
1346 ct Magazin fuer Computer & Technik 25/1998' |
|
1347 " |
|
1348 ! ! |
|
1349 |
|
1350 !PhoneticStringUtilities::PhonemStringComparator methodsFor:'api'! |
|
1351 |
|
1352 phoneticStringsFor:aString |
|
1353 |s idx t t2| |
|
1354 |
|
1355 s := aString asUppercase. |
|
1356 |
|
1357 idx := 1. |
|
1358 [idx < (s size-1)] whileTrue:[ |
|
1359 t2 := nil. |
|
1360 t := s copyFrom:idx to:idx+1. |
|
1361 t = 'SC' ifTrue:[ t2 := 'C' ] |
|
1362 ifFalse:[ t = 'SZ' ifTrue:[ t2 := 'C' ] |
|
1363 ifFalse:[ t = 'CZ' ifTrue:[ t2 := 'C' ] |
|
1364 ifFalse:[ t = 'TZ' ifTrue:[ t2 := 'C' ] |
|
1365 ifFalse:[ t = 'TS' ifTrue:[ t2 := 'C' ] |
|
1366 ifFalse:[ t = 'KS' ifTrue:[ t2 := 'X' ] |
|
1367 ifFalse:[ t = 'PF' ifTrue:[ t2 := 'V' ] |
|
1368 ifFalse:[ t = 'QU' ifTrue:[ t2 := 'KW' ] |
|
1369 ifFalse:[ t = 'PH' ifTrue:[ t2 := 'V' ] |
|
1370 ifFalse:[ t = 'UE' ifTrue:[ t2 := 'Y' ] |
|
1371 ifFalse:[ t = 'AE' ifTrue:[ t2 := 'E' ] |
|
1372 ifFalse:[ t = 'OE' ifTrue:[ t2 := 'Ö' ] |
|
1373 ifFalse:[ t = 'EI' ifTrue:[ t2 := 'AY' ] |
|
1374 ifFalse:[ t = 'EY' ifTrue:[ t2 := 'AY' ] |
|
1375 ifFalse:[ t = 'EU' ifTrue:[ t2 := 'OY' ] |
|
1376 ifFalse:[ t = 'AU' ifTrue:[ t2 := 'A§' ] |
|
1377 ifFalse:[ t = 'OU' ifTrue:[ t2 := '§ ' ]]]]]]]]]]]]]]]]]. |
|
1378 t2 notNil ifTrue:[ |
|
1379 s := (s copyTo:idx-1),t2,(s copyFrom:idx+2) |
|
1380 ] ifFalse:[ |
|
1381 idx := idx + 1. |
|
1382 ]. |
|
1383 ]. |
|
1384 |
|
1385 "/ single character substitutions via tr |
|
1386 s := s copyTransliterating:'ÖÄZKGQÜIJFWPT§' to:'YECCCCYYYVVDDUA'. |
|
1387 s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'' complement:true squashDuplicates:false. |
|
1388 s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'ABCDLMNORSUVWXY' complement:false squashDuplicates:true. |
|
1389 ^ Array with:s |
|
1390 |
|
1391 " |
|
1392 self basicNew phoneticStringsFor:'müller' #('MYLR') |
|
1393 self basicNew phoneticStringsFor:'möller' #('MYLR') |
|
1394 self basicNew phoneticStringsFor:'miller' #('MYLR') |
|
1395 self basicNew phoneticStringsFor:'muller' #('MULR') |
|
1396 self basicNew phoneticStringsFor:'muler' #('MULR') |
|
1397 self basicNew phoneticStringsFor:'schmidt' #('CMYD') |
|
1398 self basicNew phoneticStringsFor:'schneider' #('CNAYDR') |
|
1399 self basicNew phoneticStringsFor:'fischer' #('VYCR') |
|
1400 self basicNew phoneticStringsFor:'weber' #('VBR') |
|
1401 self basicNew phoneticStringsFor:'meyer' #('MAYR') |
|
1402 self basicNew phoneticStringsFor:'wagner' #('VACNR') |
|
1403 self basicNew phoneticStringsFor:'schulz' #('CULC') |
|
1404 self basicNew phoneticStringsFor:'becker' #('BCR') |
|
1405 self basicNew phoneticStringsFor:'hoffmann' #('OVMAN') |
|
1406 self basicNew phoneticStringsFor:'schäfer' #('CVR') |
|
1407 " |
1171 ! ! |
1408 ! ! |
1172 |
1409 |
1173 !PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'LICENSE'! |
1410 !PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'LICENSE'! |
1174 |
1411 |
1175 copyright |
1412 copyright |