PhoneticStringUtilities.st
changeset 4491 d6c31bb1e928
parent 4490 33b5fbfc4b5d
child 4495 5d2da4bddbda
equal deleted inserted replaced
4490:33b5fbfc4b5d 4491:d6c31bb1e928
    28 	classVariableNames:''
    28 	classVariableNames:''
    29 	poolDictionaries:''
    29 	poolDictionaries:''
    30 	privateIn:PhoneticStringUtilities
    30 	privateIn:PhoneticStringUtilities
    31 !
    31 !
    32 
    32 
       
    33 PhoneticStringUtilities::PhoneticStringComparator subclass:#DaitchMokotoffStringComparator
       
    34 	instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex
       
    35 		currentIndex skipCount'
       
    36 	classVariableNames:''
       
    37 	poolDictionaries:''
       
    38 	privateIn:PhoneticStringUtilities
       
    39 !
       
    40 
       
    41 PhoneticStringUtilities::PhoneticStringComparator subclass:#DoubleMetaphoneStringComparator
       
    42 	instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex
       
    43 		currentIndex skipCount'
       
    44 	classVariableNames:''
       
    45 	poolDictionaries:''
       
    46 	privateIn:PhoneticStringUtilities
       
    47 !
       
    48 
    33 PhoneticStringUtilities::PhoneticStringComparator subclass:#ExtendedSoundexStringComparator
    49 PhoneticStringUtilities::PhoneticStringComparator subclass:#ExtendedSoundexStringComparator
    34 	instanceVariableNames:''
    50 	instanceVariableNames:''
    35 	classVariableNames:'CharacterTranslationDict'
    51 	classVariableNames:'CharacterTranslationDict'
    36 	poolDictionaries:''
    52 	poolDictionaries:''
    37 	privateIn:PhoneticStringUtilities
    53 	privateIn:PhoneticStringUtilities
    49 	classVariableNames:'CharacterTranslationDict'
    65 	classVariableNames:'CharacterTranslationDict'
    50 	poolDictionaries:''
    66 	poolDictionaries:''
    51 	privateIn:PhoneticStringUtilities
    67 	privateIn:PhoneticStringUtilities
    52 !
    68 !
    53 
    69 
       
    70 PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#MetaphoneStringComparator
       
    71 	instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex
       
    72 		currentIndex skipCount'
       
    73 	classVariableNames:''
       
    74 	poolDictionaries:''
       
    75 	privateIn:PhoneticStringUtilities
       
    76 !
       
    77 
    54 PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#SoundexStringComparator
    78 PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#SoundexStringComparator
    55 	instanceVariableNames:''
    79 	instanceVariableNames:''
    56 	classVariableNames:'CharacterTranslationDict'
    80 	classVariableNames:'CharacterTranslationDict'
    57 	poolDictionaries:''
    81 	poolDictionaries:''
    58 	privateIn:PhoneticStringUtilities
    82 	privateIn:PhoneticStringUtilities
    77 	classVariableNames:'CharacterTranslationDict'
   101 	classVariableNames:'CharacterTranslationDict'
    78 	poolDictionaries:''
   102 	poolDictionaries:''
    79 	privateIn:PhoneticStringUtilities
   103 	privateIn:PhoneticStringUtilities
    80 !
   104 !
    81 
   105 
    82 PhoneticStringUtilities::PhoneticStringComparator subclass:#DoubleMetaphoneStringComparator
   106 PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#Caverphone2StringComparator
    83 	instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex
   107 	instanceVariableNames:''
    84 		currentIndex skipCount'
   108 	classVariableNames:'CharacterTranslationDict'
    85 	classVariableNames:''
       
    86 	poolDictionaries:''
   109 	poolDictionaries:''
    87 	privateIn:PhoneticStringUtilities
   110 	privateIn:PhoneticStringUtilities
    88 !
   111 !
    89 
   112 
    90 PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#KoelnerPhoneticCodeStringComparator
   113 PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#KoelnerPhoneticCodeStringComparator
   151 
   174 
   152     phonem
   175     phonem
   153         described in Georg Wilde and Carsten Meyer, 'Doppelgaenger gesucht - Ein Programm fuer kontextsensitive phonetische Textumwandlung'
   176         described in Georg Wilde and Carsten Meyer, 'Doppelgaenger gesucht - Ein Programm fuer kontextsensitive phonetische Textumwandlung'
   154         from 'ct Magazin fuer Computer & Technik 25/1999'.
   177         from 'ct Magazin fuer Computer & Technik 25/1999'.
   155 
   178 
       
   179     mra
       
   180         Match Rating Approach Phonetic Algorithm Developed by Western Airlines in 1977.
       
   181 
       
   182     caverphone2
       
   183         better than soundex
       
   184 
       
   185     spanish phonetic code
       
   186         an algorithm slightly adjusted to spanish names
       
   187 
   156     More info for german readers is found in:
   188     More info for german readers is found in:
   157         http://www.uni-koeln.de/phil-fak/phonetik/Lehre/MA-Arbeiten/magister_wilz.pdf
   189         http://www.uni-koeln.de/phil-fak/phonetik/Lehre/MA-Arbeiten/magister_wilz.pdf
   158 "
   190 "
   159 !
   191 !
   160 
   192 
   161 sampleData
   193 sampleData
   162 "
   194 "
   163     for the 50 most common german names, we get:
   195     for the 50 most common german names, we get:
   164 
   196 
   165                             ext. 
   197                             ext. 
   166     name        soundex   soundex   metaphone   phonet  phonet2     phonix      daitsch phonem      koeln
   198     name        soundex   soundex   metaphone   phonet  phonet2     phonix      daitsch phonem      koeln  caverphone2  mra
   167 
   199 
   168     müller      M460    54600000    MLR         MÜLA    NILA        M4000000    689000  MYLR        657
   200     müller      M460    54600000    MLR         MÜLA    NILA        M4000000    689000  MYLR        657    MLA1111111   MLR
   169     schmidt     S253    25300000    SKMTT       SHMIT   ZNIT        S5300000    463000  CMYD        8628
   201     schmidt     S530    25300000    SKMTT       SHMIT   ZNIT        S5300000    463000  CMYD        862    SKMT111111   SCHMDT
   170     schneider   S253    25360000    SKNTR       SHNEIDA ZNEITA      S5300000    463900  CNAYDR      8627
   202     schneider   S536    25360000    SKNTR       SHNEIDA ZNEITA      S5300000    463900  CNAYDR      8627   SKNTA11111   SCHNDR
   171     fischer     F260    12600000    FSKR        FISHA   FIZA        F8000000    749000  VYCR        387
   203     fischer     F260    12600000    FSKR        FISHA   FIZA        F8000000    749000  VYCR        387    FSKA111111   FSCHR
   172     weber       W160    16000000    WBR         WEBA    FEBA        $1000000    779000  VBR         317
   204     weber       W160    16000000    WBR         WEBA    FEBA        $1000000    779000  VBR         317    WPA1111111   WBR
   173     meyer       M600    56000000    MYR         MEIA    NEIA        M0000000    619000  MAYR        67
   205     meyer       M600    56000000    MYR         MEIA    NEIA        M0000000    619000  MAYR        67     MA11111111   MYR
   174     wagner      W256    25600000    WKNR        WAKNA   FAKNA       $2500000    756900  VACNR       367
   206     wagner      W256    25600000    WKNR        WAKNA   FAKNA       $2500000    756900  VACNR       3467   WKNA111111   WGNR
   175     schulz      S242    24200000    SKLS        SHULS   ZULZ        S4800000    484000  CULC        85
   207     schulz      S420    24200000    SKLS        SHULS   ZULZ        S4800000    484000  CULC        858    SKS1111111   SCHLZ
   176     becker      B260    12600000    BKR         BEKA    BEKA        B2000000    759000  BCR         147
   208     becker      B260    12600000    BKR         BEKA    BEKA        B2000000    759000  BCR         147    PKA1111111   BCKR
   177     hoffmann    H155    15500000    HFMN        HOFMAN  UFNAN       $7550000    576600  OVMAN       036
   209     hoffmann    H155    15500000    HFMN        HOFMAN  UFNAN       $7550000    576600  OVMAN       036    AFMN111111   HFMN
   178     schäfer     S216    21600000    SKFR        SHEFA   ZEFA        S7000000    479000  CVR         837
   210     schäfer     S16ß    21600000    SKFR        SHEFA   ZEFA        S7000000    479000  CVR         837    SKFA111111   SCHFR
       
   211 
       
   212     |cls|
       
   213     
       
   214     cls := MRAStringComparator.
       
   215     cls := SoundexStringComparator.
       
   216     cls := KoelnerPhoneticCodeStringComparator.
       
   217     cls := Caverphone2StringComparator.
       
   218     #('müller' 'schmidt' 'schneider' 'fischer' 'weber' 'meyer' 
       
   219       'wagner' 'schulz'  'becker'    'hoffmann' 'schäfer')
       
   220     do:[:name |
       
   221         Transcript show:''''; show:name; show:''' -> '''; show:(cls encode:name); showCR:''''.
       
   222     ].
       
   223 
       
   224     KoelnerPhoneticCodeStringComparator encode:'Müller-Lüdenscheidt'  -> '65752682'
   179 "
   225 "
   180 ! !
   226 ! !
   181 
   227 
   182 !PhoneticStringUtilities class methodsFor:'phonetic codes'!
   228 !PhoneticStringUtilities class methodsFor:'phonetic codes'!
   183 
   229 
   461 
   507 
   462 isAbstract
   508 isAbstract
   463     ^ self == PhoneticStringUtilities::PhoneticStringComparator
   509     ^ self == PhoneticStringUtilities::PhoneticStringComparator
   464 ! !
   510 ! !
   465 
   511 
       
   512 !PhoneticStringUtilities::PhoneticStringComparator class methodsFor:'utilities'!
       
   513 
       
   514 encode:word
       
   515     ^ (self new phoneticStringsFor:word) first
       
   516 
       
   517     "
       
   518      SoundexStringComparator encode:'Fischer'             -> 'F260'
       
   519      Caverphone2StringComparator encode:'Fischer'         -> 'FSKA111111'
       
   520      KoelnerPhoneticCodeStringComparator encode:'Fischer' -> '387'
       
   521      MRAStringComparator encode:'Fischer'                 -> 'FSCHR'
       
   522      SpanishPhoneticCodeStringComparator encode:'Fischer' -> '24429'
       
   523     "
       
   524 
       
   525     "Created: / 02-08-2017 / 01:15:50 / cg"
       
   526 ! !
       
   527 
   466 !PhoneticStringUtilities::PhoneticStringComparator methodsFor:'api'!
   528 !PhoneticStringUtilities::PhoneticStringComparator methodsFor:'api'!
   467 
   529 
   468 does:aString soundLike:anotherString 
   530 does:aString soundLike:anotherString 
   469     |translations1 translations2|
   531     |translations1 translations2|
   470 
   532 
   514     "/ please change as required (and remove this comment)
   576     "/ please change as required (and remove this comment)
   515 
   577 
   516     "/ super initialize.   -- commented since inherited method does nothing
   578     "/ super initialize.   -- commented since inherited method does nothing
   517 ! !
   579 ! !
   518 
   580 
   519 !PhoneticStringUtilities::ExtendedSoundexStringComparator class methodsFor:'documentation'!
   581 !PhoneticStringUtilities::DaitchMokotoffStringComparator class methodsFor:'documentation'!
   520 
   582 
   521 documentation
   583 documentation
   522 "
   584 "
   523     There are many extended and enhanced soundex variants around;
   585     self encode:'AUERBACH' -> 097400, 097500
   524     here is one, called 'extended soundex'. It is destribed for example in
   586 
   525     http://www.epidata.dk/documentation.php.
   587     Encodes a string into a Daitch-Mokotoff Soundex value.
   526     An author or origin is unknown.
   588     The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, 
   527 
   589     yielding greater accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation 
   528     The number of digits is increased to 5 or 8;
   590     but differences in spelling.
   529     The first character is not used literally; instead it is encoded like the rest.
   591 
   530     This might have a negative effect on names starting with a vovel, though.
   592     The main differences compared to the other soundex variants are:
   531 
   593         - coded names are 6 digits long
   532     Overall, it can be doubted if this is really an enhancement after all.
   594         - the initial character of the name is coded
       
   595         - rules to encoded multi-character n-grams
       
   596         - multiple possible encodings for the same name (branching)
       
   597 
       
   598     This implementation supports branching, depending on the used method:
       
   599         encode:aString            - branching disabled, only the first code will be returned
       
   600         phoneticStringsFor:String - branching enabled, all codes will be returned, separated by '|'
       
   601 
       
   602     [see also:]
       
   603         'Wikipedia - Daitch-Mokotoff Soundex'
       
   604             http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex 
       
   605 
       
   606         'Avotaynu - Soundexing and Genealogy'    
       
   607             http://www.avotaynu.com/soundex.htm
   533 "
   608 "
   534 ! !
   609 !
   535 
   610 
   536 !PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'api'!
   611 javaCode
   537 
       
   538 phoneticStringsFor:aString
       
   539     "generates both an extended soundex of length 5 and one of length 8"
       
   540 
       
   541     |first second u t prevCode|
       
   542 
       
   543     u := aString asUppercase.
       
   544     first := second := ''.
       
   545     u do:[:c | 
       
   546         t := self translate:c.
       
   547         (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
       
   548             first := first , t.
       
   549             second := second , t.
       
   550             second size == 8 ifTrue:[
       
   551                 ^ Array with:(first copyTo:5) with:second 
       
   552             ].
       
   553         ].
       
   554         prevCode := t
       
   555     ].
       
   556     [ first size < 5 ] whileTrue:[
       
   557         first := first , '0'.
       
   558         second := second , '0'.
       
   559     ].
       
   560     [ second size < 8 ] whileTrue:[
       
   561         second := second , '0'
       
   562     ].
       
   563     ^ Array with:first with:second
       
   564 
       
   565     "
       
   566      self basicNew phoneticStringsFor:'müller'  #('87900' '87900000')  
       
   567      self basicNew phoneticStringsFor:'miller'  #('87900' '87900000')   
       
   568      self basicNew phoneticStringsFor:'muller'  #('87900' '87900000')    
       
   569      self basicNew phoneticStringsFor:'muler'   #('87900' '87900000')
       
   570      self basicNew phoneticStringsFor:'schmidt'    #('38600' '38600000')
       
   571      self basicNew phoneticStringsFor:'schneider'  #('38690' '38690000')
       
   572      self basicNew phoneticStringsFor:'fischer'    #('23900' '23900000')
       
   573      self basicNew phoneticStringsFor:'weber'      #('19000' '19000000')
       
   574      self basicNew phoneticStringsFor:'meyer'      #('89000' '89000000')
       
   575      self basicNew phoneticStringsFor:'wagner'     #('48900' '48900000')
       
   576      self basicNew phoneticStringsFor:'schulz'     #('37500' '37500000')
       
   577      self basicNew phoneticStringsFor:'becker'     #('13900' '13900000')
       
   578      self basicNew phoneticStringsFor:'hoffmann'   #('28800' '28800000')
       
   579      self basicNew phoneticStringsFor:'schäfer'    #('32900' '32900000')
       
   580     "
       
   581 ! !
       
   582 
       
   583 !PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'private'!
       
   584 
       
   585 translate:aCharacter
       
   586     "use simple if's for more speed when compiled"
       
   587 
       
   588     "vowels serve as separators"
       
   589     aCharacter == $A ifTrue:[^ '0' ].         
       
   590     aCharacter == $E ifTrue:[^ '0' ].
       
   591     aCharacter == $I ifTrue:[^ '0' ].
       
   592     aCharacter == $O ifTrue:[^ '0' ].
       
   593     aCharacter == $U ifTrue:[^ '0' ].
       
   594     aCharacter == $Y ifTrue:[^ '0' ].
       
   595 
       
   596     aCharacter == $B ifTrue:[^ '1' ]. 
       
   597     aCharacter == $P ifTrue:[^ '1' ].
       
   598 
       
   599     aCharacter == $F ifTrue:[^ '2' ]. 
       
   600     aCharacter == $V ifTrue:[^ '2' ]. 
       
   601 
       
   602     aCharacter == $C ifTrue:[^ '3' ]. 
       
   603     aCharacter == $S ifTrue:[^ '3' ]. 
       
   604     aCharacter == $K ifTrue:[^ '3' ].
       
   605 
       
   606     aCharacter == $G ifTrue:[^ '4' ]. 
       
   607     aCharacter == $J ifTrue:[^ '4' ].
       
   608 
       
   609     aCharacter == $Q ifTrue:[^ '5' ]. 
       
   610     aCharacter == $X ifTrue:[^ '5' ]. 
       
   611     aCharacter == $Z ifTrue:[^ '5' ]. 
       
   612 
       
   613     aCharacter == $D ifTrue:[^ '6' ]. 
       
   614     aCharacter == $G ifTrue:[^ '6' ]. 
       
   615     aCharacter == $T ifTrue:[^ '6' ]. 
       
   616 
       
   617     aCharacter == $L ifTrue:[^ '7' ]. 
       
   618 
       
   619     aCharacter == $M ifTrue:[^ '8' ]. 
       
   620     aCharacter == $N ifTrue:[^ '8' ]. 
       
   621 
       
   622     aCharacter == $R ifTrue:[^ '9' ]. 
       
   623     ^ nil
       
   624 ! !
       
   625 
       
   626 !PhoneticStringUtilities::SingleResultPhoneticStringComparator class methodsFor:'documentation'!
       
   627 
       
   628 documentation
       
   629 "
       
   630     documentation to be added.
       
   631 
       
   632     [author:]
       
   633         cg
       
   634 
       
   635     [instance variables:]
       
   636 
       
   637     [class variables:]
       
   638 
       
   639     [see also:]
       
   640 
       
   641 "
       
   642 ! !
       
   643 
       
   644 !PhoneticStringUtilities::SingleResultPhoneticStringComparator methodsFor:'api'!
       
   645 
       
   646 encode:word
       
   647     ^ self subclassResponsibility
       
   648 
       
   649     "Created: / 28-07-2017 / 15:20:49 / cg"
       
   650 !
       
   651 
       
   652 phoneticStringsFor:word 
       
   653     ^ Array with:(self encode:word)
       
   654 
       
   655     "Created: / 28-07-2017 / 15:20:38 / cg"
       
   656 ! !
       
   657 
       
   658 !PhoneticStringUtilities::MRAStringComparator class methodsFor:'documentation'!
       
   659 
       
   660 documentation
       
   661 "
       
   662     Match Rating Approach Encoder
       
   663 
       
   664     The Western Airlines matching rating approach name encoder
       
   665 
       
   666     [see also:]
       
   667         https://en.wikipedia.org/wiki/Match_Rating_Approach
       
   668         
       
   669         G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery,
       
   670             ''Accessing Individual Records from Personal Data Files Using Nonunique Identifiers'' 
       
   671             US National Institute of Standards and Technology, SP-500-2 (1977), p. 17.
       
   672 "
       
   673 !
       
   674 
       
   675 rCode
       
   676 "<<END
   612 "<<END
   677 ## Copyright (c) 2015, James P. Howard, II <jh@jameshoward.us>
   613 /*
   678 ##
   614  * Licensed to the Apache Software Foundation (ASF) under one or more
   679 ## Redistribution and use in source and binary forms, with or without
   615  * contributor license agreements.  See the NOTICE file distributed with
   680 ## modification, are permitted provided that the following conditions are
   616  * this work for additional information regarding copyright ownership.
   681 ## met:
   617  * The ASF licenses this file to You under the Apache License, Version 2.0
   682 ##
   618  * (the "License"); you may not use this file except in compliance with
   683 ##     Redistributions of source code must retain the above copyright
   619  * the License.  You may obtain a copy of the License at
   684 ##     notice, this list of conditions and the following disclaimer.
   620  *
   685 ##
   621  *      http://www.apache.org/licenses/LICENSE-2.0
   686 ##     Redistributions in binary form must reproduce the above copyright
   622  *
   687 ##     notice, this list of conditions and the following disclaimer in
   623  * Unless required by applicable law or agreed to in writing, software
   688 ##     the documentation and/or other materials provided with the
   624  * distributed under the License is distributed on an "AS IS" BASIS,
   689 ##     distribution.
   625  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   690 ##
   626  * See the License for the specific language governing permissions and
   691 ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   627  * limitations under the License.
   692 ## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   628  */
   693 ## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   629 package org.apache.commons.codec.language;
   694 ## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   630 
   695 ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   631 import org.apache.commons.codec.CharEncoding;
   696 ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   632 import org.apache.commons.codec.EncoderException;
   697 ## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   633 import org.apache.commons.codec.StringEncoder;
   698 ## DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   634 
   699 ## THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   635 import java.io.InputStream;
   700 ## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   636 import java.util.*;
   701 ## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   637 
   702 
   638 /**
   703 #' @rdname mra
   639  * Encodes a string into a Daitch-Mokotoff Soundex value.
   704 #' @title Match Rating Approach Encoder
   640  * <p>
   705 #'
   641  * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
   706 #' @description
   642  * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
   707 #' The Western Airlines matching rating approach name encoder
   643  * </p>
   708 #'
   644  * <p>
   709 #' @param word string or vector of strings to encode
   645  * The main differences compared to the other soundex variants are:
   710 #' @param x MRA-encoded character vector
   646  * </p>
   711 #' @param y MRA-encoded character vector
   647  * <ul>
   712 #'
   648  * <li>coded names are 6 digits long
   713 #' @details
   649  * <li>the initial character of the name is coded
   714 #'
   650  * <li>rules to encoded multi-character n-grams
   715 #' The variable \code{word} is the name to be encoded.  The variable
   651  * <li>multiple possible encodings for the same name (branching)
   716 #' \code{maxCodeLen} is \emph{not} supported in this algorithm encoder
   652  * </ul>
   717 #' because the algorithm itself is dependent upon its six-character
   653  * <p>
   718 #' length.  The variables \code{x} and \code{y} are MRA-encoded and are
   654  * This implementation supports branching, depending on the used method:
   719 #' compared to each other using the MRA comparison specification.
   655  * <ul>
   720 #'
   656  * <li>{@link #encode(String)} - branching disabled, only the first code will be returned
   721 #' @return The \code{mra_encode} function returns match rating approach
   657  * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
   722 #' encoded character vector.  The \code{mra_compare} returns a boolean
   658  * </ul>
   723 #' vector which is \code{TRUE} if \code{x} and \code{y} pass the MRA
   659  * <p>
   724 #' comparison test.
   660  * Note: this implementation has additional branching rules compared to the original description of the algorithm. The
   725 #'
   661  * rules can be customized by overriding the default rules contained in the resource file
   726 #' @references
   662  * {@code org/apache/commons/codec/language/dmrules.txt}.
   727 #'
   663  * </p>
   728 #' G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery,
   664  * <p>
   729 #' \emph{Accessing Individual Records from Personal Data Files Using
   665  * This class is thread-safe.
   730 #' Nonunique Identifiers,} US National Institute of Standards and
   666  * </p>
   731 #' Technology, SP-500-2 (1977), p. 17.
   667  *
   732 #'
   668  * @see Soundex
   733 #' @family phonics
   669  * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
   734 #'
   670  * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
   735 #' @examples
   671  *
   736 #' mra_encode("William")
   672  * @version $Id$
   737 #' mra_encode(c("Peter", "Peady"))
   673  * @since 1.10
   738 #' mra_encode("Stevenson")
   674  */
   739 
   675 public class DaitchMokotoffSoundex implements StringEncoder {
   740 #' @rdname mra
   676 
   741 #' @name mra_encode
   677     /**
   742 #' @export
   678      * Inner class representing a branch during DM soundex encoding.
   743 mra_encode <- function(word) {
   679      */
   744 
   680     private static final class Branch {
   745     ## First, remove any nonalphabetical characters and uppercase it
   681         private final StringBuilder builder;
   746     word <- gsub("[^[:alpha:]]*", "", word)
   682         private String cachedString;
   747     word <- toupper(word)
   683         private String lastReplacement;
   748 
   684 
   749     ## First character of key = first character of name
   685         private Branch() {
   750     first <- substr(word, 1, 1)
   686             builder = new StringBuilder();
   751     word <- substr(word, 2, nchar(word))
   687             lastReplacement = null;
   752 
   688             cachedString = null;
   753     ## Delete vowels not at the start of the word
   689         }
   754     word <- gsub("[AEIOU]", "", word)
   690 
   755     word <- paste(first, word, sep = "")
   691         /**
   756 
   692          * Creates a new branch, identical to this branch.
   757     ## Remove duplicate consecutive characters
   693          *
   758     word <- gsub("([A-Z])\\1+", "\\1", word)
   694          * @return a new, identical branch
   759 
   695          */
   760     ## If longer than 6 characters, take first and last 3...and we have
   696         public Branch createBranch() {
   761     ## to vectorize it
   697             final Branch branch = new Branch();
   762     for(i in 1:length(word)) {
   698             branch.builder.append(toString());
   763         if((l = nchar(word[i])) > 6) {
   699             branch.lastReplacement = this.lastReplacement;
   764             first <- substr(word[i], 1, 3)
   700             return branch;
   765             last <- substr(word[i], l - 2, l)
   701         }
   766             word[i] <- paste(first, last, sep = "");
   702 
       
   703         @Override
       
   704         public boolean equals(final Object other) {
       
   705             if (this == other) {
       
   706                 return true;
       
   707             }
       
   708             if (!!(other instanceof Branch)) {
       
   709                 return false;
       
   710             }
       
   711 
       
   712             return toString().equals(((Branch) other).toString());
       
   713         }
       
   714 
       
   715         /**
       
   716          * Finish this branch by appending '0's until the maximum code length has been reached.
       
   717          */
       
   718         public void finish() {
       
   719             while (builder.length() < MAX_LENGTH) {
       
   720                 builder.append('0');
       
   721                 cachedString = null;
       
   722             }
       
   723         }
       
   724 
       
   725         @Override
       
   726         public int hashCode() {
       
   727             return toString().hashCode();
       
   728         }
       
   729 
       
   730         /**
       
   731          * Process the next replacement to be added to this branch.
       
   732          *
       
   733          * @param replacement
       
   734          *            the next replacement to append
       
   735          * @param forceAppend
       
   736          *            indicates if the default processing shall be overridden
       
   737          */
       
   738         public void processNextReplacement(final String replacement, final boolean forceAppend) {
       
   739             final boolean append = lastReplacement == null || !!lastReplacement.endsWith(replacement) || forceAppend;
       
   740 
       
   741             if (append && builder.length() < MAX_LENGTH) {
       
   742                 builder.append(replacement);
       
   743                 // remove all characters after the maximum length
       
   744                 if (builder.length() > MAX_LENGTH) {
       
   745                     builder.delete(MAX_LENGTH, builder.length());
       
   746                 }
       
   747                 cachedString = null;
       
   748             }
       
   749 
       
   750             lastReplacement = replacement;
       
   751         }
       
   752 
       
   753         @Override
       
   754         public String toString() {
       
   755             if (cachedString == null) {
       
   756                 cachedString = builder.toString();
       
   757             }
       
   758             return cachedString;
   767         }
   759         }
   768     }
   760     }
   769 
   761 
   770     return(word)
   762     /**
       
   763      * Inner class for storing rules.
       
   764      */
       
   765     private static final class Rule {
       
   766         private final String pattern;
       
   767         private final String[] replacementAtStart;
       
   768         private final String[] replacementBeforeVowel;
       
   769         private final String[] replacementDefault;
       
   770 
       
   771         protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
       
   772                 final String replacementDefault) {
       
   773             this.pattern = pattern;
       
   774             this.replacementAtStart = replacementAtStart.split("\\|");
       
   775             this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
       
   776             this.replacementDefault = replacementDefault.split("\\|");
       
   777         }
       
   778 
       
   779         public int getPatternLength() {
       
   780             return pattern.length();
       
   781         }
       
   782 
       
   783         public String[] getReplacements(final String context, final boolean atStart) {
       
   784             if (atStart) {
       
   785                 return replacementAtStart;
       
   786             }
       
   787 
       
   788             final int nextIndex = getPatternLength();
       
   789             final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false;
       
   790             if (nextCharIsVowel) {
       
   791                 return replacementBeforeVowel;
       
   792             }
       
   793 
       
   794             return replacementDefault;
       
   795         }
       
   796 
       
   797         private boolean isVowel(final char ch) {
       
   798             return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
       
   799         }
       
   800 
       
   801         public boolean matches(final String context) {
       
   802             return context.startsWith(pattern);
       
   803         }
       
   804 
       
   805         @Override
       
   806         public String toString() {
       
   807             return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
       
   808                     Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
       
   809         }
       
   810     }
       
   811 
       
   812     private static final String COMMENT = "//";
       
   813     private static final String DOUBLE_QUOTE = "\"";
       
   814 
       
   815     private static final String MULTILINE_COMMENT_END = "*/";
       
   816 
       
   817     private static final String MULTILINE_COMMENT_START = "/*";
       
   818 
       
   819     /** The resource file containing the replacement and folding rules */
       
   820     private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";
       
   821 
       
   822     /** The code length of a DM soundex value. */
       
   823     private static final int MAX_LENGTH = 6;
       
   824 
       
   825     /** Transformation rules indexed by the first character of their pattern. */
       
   826     private static final Map<Character, List<Rule>> RULES = new HashMap<Character, List<Rule>>();
       
   827 
       
   828     /** Folding rules. */
       
   829     private static final Map<Character, Character> FOLDINGS = new HashMap<Character, Character>();
       
   830 
       
   831     static {
       
   832         final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE);
       
   833         if (rulesIS == null) {
       
   834             throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE);
       
   835         }
       
   836 
       
   837         final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8);
       
   838         parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
       
   839         scanner.close();
       
   840 
       
   841         // sort RULES by pattern length in descending order
       
   842         for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) {
       
   843             final List<Rule> ruleList = rule.getValue();
       
   844             Collections.sort(ruleList, new Comparator<Rule>() {
       
   845                 @Override
       
   846                 public int compare(final Rule rule1, final Rule rule2) {
       
   847                     return rule2.getPatternLength() - rule1.getPatternLength();
       
   848                 }
       
   849             });
       
   850         }
       
   851     }
       
   852 
       
   853     private static void parseRules(final Scanner scanner, final String location,
       
   854             final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
       
   855         int currentLine = 0;
       
   856         boolean inMultilineComment = false;
       
   857 
       
   858         while (scanner.hasNextLine()) {
       
   859             currentLine++;
       
   860             final String rawLine = scanner.nextLine();
       
   861             String line = rawLine;
       
   862 
       
   863             if (inMultilineComment) {
       
   864                 if (line.endsWith(MULTILINE_COMMENT_END)) {
       
   865                     inMultilineComment = false;
       
   866                 }
       
   867                 continue;
       
   868             }
       
   869 
       
   870             if (line.startsWith(MULTILINE_COMMENT_START)) {
       
   871                 inMultilineComment = true;
       
   872             } else {
       
   873                 // discard comments
       
   874                 final int cmtI = line.indexOf(COMMENT);
       
   875                 if (cmtI >= 0) {
       
   876                     line = line.substring(0, cmtI);
       
   877                 }
       
   878 
       
   879                 // trim leading-trailing whitespace
       
   880                 line = line.trim();
       
   881 
       
   882                 if (line.length() == 0) {
       
   883                     continue; // empty lines can be safely skipped
       
   884                 }
       
   885 
       
   886                 if (line.contains("=")) {
       
   887                     // folding
       
   888                     final String[] parts = line.split("=");
       
   889                     if (parts.length !!= 2) {
       
   890                         throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
       
   891                                 " parts: " + rawLine + " in " + location);
       
   892                     } else {
       
   893                         final String leftCharacter = parts[0];
       
   894                         final String rightCharacter = parts[1];
       
   895 
       
   896                         if (leftCharacter.length() !!= 1 || rightCharacter.length() !!= 1) {
       
   897                             throw new IllegalArgumentException("Malformed folding statement - " +
       
   898                                     "patterns are not single characters: " + rawLine + " in " + location);
       
   899                         }
       
   900 
       
   901                         asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
       
   902                     }
       
   903                 } else {
       
   904                     // rule
       
   905                     final String[] parts = line.split("\\s+");
       
   906                     if (parts.length !!= 4) {
       
   907                         throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
       
   908                                 " parts: " + rawLine + " in " + location);
       
   909                     } else {
       
   910                         try {
       
   911                             final String pattern = stripQuotes(parts[0]);
       
   912                             final String replacement1 = stripQuotes(parts[1]);
       
   913                             final String replacement2 = stripQuotes(parts[2]);
       
   914                             final String replacement3 = stripQuotes(parts[3]);
       
   915 
       
   916                             final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
       
   917                             final char patternKey = r.pattern.charAt(0);
       
   918                             List<Rule> rules = ruleMapping.get(patternKey);
       
   919                             if (rules == null) {
       
   920                                 rules = new ArrayList<Rule>();
       
   921                                 ruleMapping.put(patternKey, rules);
       
   922                             }
       
   923                             rules.add(r);
       
   924                         } catch (final IllegalArgumentException e) {
       
   925                             throw new IllegalStateException(
       
   926                                     "Problem parsing line '" + currentLine + "' in " + location, e);
       
   927                         }
       
   928                     }
       
   929                 }
       
   930             }
       
   931         }
       
   932     }
       
   933 
       
   934     private static String stripQuotes(String str) {
       
   935         if (str.startsWith(DOUBLE_QUOTE)) {
       
   936             str = str.substring(1);
       
   937         }
       
   938 
       
   939         if (str.endsWith(DOUBLE_QUOTE)) {
       
   940             str = str.substring(0, str.length() - 1);
       
   941         }
       
   942 
       
   943         return str;
       
   944     }
       
   945 
       
   946     /** Whether to use ASCII folding prior to encoding. */
       
   947     private final boolean folding;
       
   948 
       
   949     /**
       
   950      * Creates a new instance with ASCII-folding enabled.
       
   951      */
       
   952     public DaitchMokotoffSoundex() {
       
   953         this(true);
       
   954     }
       
   955 
       
   956     /**
       
   957      * Creates a new instance.
       
   958      * <p>
       
   959      * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g.
       
   960      * è -&gt; e.
       
   961      * </p>
       
   962      *
       
   963      * @param folding
       
   964      *            if ASCII-folding shall be performed before encoding
       
   965      */
       
   966     public DaitchMokotoffSoundex(final boolean folding) {
       
   967         this.folding = folding;
       
   968     }
       
   969 
       
   970     /**
       
   971      * Performs a cleanup of the input string before the actual soundex transformation.
       
   972      * <p>
       
   973      * Removes all whitespace characters and performs ASCII folding if enabled.
       
   974      * </p>
       
   975      *
       
   976      * @param input
       
   977      *            the input string to cleanup
       
   978      * @return a cleaned up string
       
   979      */
       
   980     private String cleanup(final String input) {
       
   981         final StringBuilder sb = new StringBuilder();
       
   982         for (char ch : input.toCharArray()) {
       
   983             if (Character.isWhitespace(ch)) {
       
   984                 continue;
       
   985             }
       
   986 
       
   987             ch = Character.toLowerCase(ch);
       
   988             if (folding && FOLDINGS.containsKey(ch)) {
       
   989                 ch = FOLDINGS.get(ch);
       
   990             }
       
   991             sb.append(ch);
       
   992         }
       
   993         return sb.toString();
       
   994     }
       
   995 
       
   996     /**
       
   997      * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
       
   998      * <p>
       
   999      * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
       
  1000      * EncoderException if the supplied object is not of type java.lang.String.
       
  1001      * </p>
       
  1002      *
       
  1003      * @see #soundex(String)
       
  1004      *
       
  1005      * @param obj
       
  1006      *            Object to encode
       
  1007      * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String
       
  1008      *         supplied.
       
  1009      * @throws EncoderException
       
  1010      *             if the parameter supplied is not of type java.lang.String
       
  1011      * @throws IllegalArgumentException
       
  1012      *             if a character is not mapped
       
  1013      */
       
  1014     @Override
       
  1015     public Object encode(final Object obj) throws EncoderException {
       
  1016         if (!!(obj instanceof String)) {
       
  1017             throw new EncoderException(
       
  1018                     "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
       
  1019         }
       
  1020         return encode((String) obj);
       
  1021     }
       
  1022 
       
  1023     /**
       
  1024      * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
       
  1025      *
       
  1026      * @see #soundex(String)
       
  1027      *
       
  1028      * @param source
       
  1029      *            A String object to encode
       
  1030      * @return A DM Soundex code corresponding to the String supplied
       
  1031      * @throws IllegalArgumentException
       
  1032      *             if a character is not mapped
       
  1033      */
       
  1034     @Override
       
  1035     public String encode(final String source) {
       
  1036         if (source == null) {
       
  1037             return null;
       
  1038         }
       
  1039         return soundex(source, false)[0];
       
  1040     }
       
  1041 
       
  1042     /**
       
  1043      * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
       
  1044      * <p>
       
  1045      * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
       
  1046      * separated by '|'.
       
  1047      * </p>
       
  1048      * <p>
       
  1049      * Example: the name "AUERBACH" is encoded as both
       
  1050      * </p>
       
  1051      * <ul>
       
  1052      * <li>097400</li>
       
  1053      * <li>097500</li>
       
  1054      * </ul>
       
  1055      * <p>
       
  1056      * Thus the result will be "097400|097500".
       
  1057      * </p>
       
  1058      *
       
  1059      * @param source
       
  1060      *            A String object to encode
       
  1061      * @return A string containing a set of DM Soundex codes corresponding to the String supplied
       
  1062      * @throws IllegalArgumentException
       
  1063      *             if a character is not mapped
       
  1064      */
       
  1065     public String soundex(final String source) {
       
  1066         final String[] branches = soundex(source, true);
       
  1067         final StringBuilder sb = new StringBuilder();
       
  1068         int index = 0;
       
  1069         for (final String branch : branches) {
       
  1070             sb.append(branch);
       
  1071             if (++index < branches.length) {
       
  1072                 sb.append('|');
       
  1073             }
       
  1074         }
       
  1075         return sb.toString();
       
  1076     }
       
  1077 
       
  1078     /**
       
  1079      * Perform the actual DM Soundex algorithm on the input string.
       
  1080      *
       
  1081      * @param source
       
  1082      *            A String object to encode
       
  1083      * @param branching
       
  1084      *            If branching shall be performed
       
  1085      * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
       
  1086      *         selected branching mode
       
  1087      */
       
  1088     private String[] soundex(final String source, final boolean branching) {
       
  1089         if (source == null) {
       
  1090             return null;
       
  1091         }
       
  1092 
       
  1093         final String input = cleanup(source);
       
  1094 
       
  1095         final Set<Branch> currentBranches = new LinkedHashSet<Branch>();
       
  1096         currentBranches.add(new Branch());
       
  1097 
       
  1098         char lastChar = '\0';
       
  1099         for (int index = 0; index < input.length(); index++) {
       
  1100             final char ch = input.charAt(index);
       
  1101 
       
  1102             // ignore whitespace inside a name
       
  1103             if (Character.isWhitespace(ch)) {
       
  1104                 continue;
       
  1105             }
       
  1106 
       
  1107             final String inputContext = input.substring(index);
       
  1108             final List<Rule> rules = RULES.get(ch);
       
  1109             if (rules == null) {
       
  1110                 continue;
       
  1111             }
       
  1112 
       
  1113             // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
       
  1114             @SuppressWarnings("unchecked")
       
  1115             final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.EMPTY_LIST;
       
  1116 
       
  1117             for (final Rule rule : rules) {
       
  1118                 if (rule.matches(inputContext)) {
       
  1119                     if (branching) {
       
  1120                         nextBranches.clear();
       
  1121                     }
       
  1122                     final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
       
  1123                     final boolean branchingRequired = replacements.length > 1 && branching;
       
  1124 
       
  1125                     for (final Branch branch : currentBranches) {
       
  1126                         for (final String nextReplacement : replacements) {
       
  1127                             // if we have multiple replacements, always create a new branch
       
  1128                             final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
       
  1129 
       
  1130                             // special rule: occurrences of mn or nm are treated differently
       
  1131                             final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');
       
  1132 
       
  1133                             nextBranch.processNextReplacement(nextReplacement, force);
       
  1134 
       
  1135                             if (branching) {
       
  1136                                 nextBranches.add(nextBranch);
       
  1137                             } else {
       
  1138                                 break;
       
  1139                             }
       
  1140                         }
       
  1141                     }
       
  1142 
       
  1143                     if (branching) {
       
  1144                         currentBranches.clear();
       
  1145                         currentBranches.addAll(nextBranches);
       
  1146                     }
       
  1147                     index += rule.getPatternLength() - 1;
       
  1148                     break;
       
  1149                 }
       
  1150             }
       
  1151 
       
  1152             lastChar = ch;
       
  1153         }
       
  1154 
       
  1155         final String[] result = new String[currentBranches.size()];
       
  1156         int index = 0;
       
  1157         for (final Branch branch : currentBranches) {
       
  1158             branch.finish();
       
  1159             result[index++] = branch.toString();
       
  1160         }
       
  1161 
       
  1162         return result;
       
  1163     }
   771 }
  1164 }
   772 
  1165 END>>"
   773 #' @rdname mra
       
   774 #' @name mra_compare
       
   775 #' @export
       
   776 mra_compare <- function(x, y) {
       
   777     mra <- data.frame(x = x, y = y, sim = 0, min = 100, stringsAsFactors = FALSE)
       
   778 
       
   779     ## Obtain the minimum rating value by calculating the length sum of
       
   780     ## the encoded strings and using table A (from Wikipedia).  We start
       
   781     ## by setting the minimum to be the sum and move from there.
       
   782     mra$lensum <- nchar(mra$x) + nchar(mra$y)
       
   783     mra$min[mra$lensum == 12] <- 2
       
   784     mra$min[mra$lensum > 7 && mra$lensum <= 11] <- 3
       
   785     mra$min[mra$lensum > 4 && mra$lensum <= 7] <- 4
       
   786     mra$min[mra$lensum <= 4] <- 5
       
   787 
       
   788     ## If the length difference between the encoded strings is 3 or
       
   789     ## greater, then no similarity comparison is done.  For us, we
       
   790     ## continue the similarity comparison out of laziness and ensure the
       
   791     ## minimum is impossibly high to meet.
       
   792     mra$min[abs(nchar(mra$x) - nchar(mra$y)) >= 3] <- 100
       
   793 
       
   794     ## Start the comparison.
       
   795     x <- strsplit(mra$x, split = "")
       
   796     y <- strsplit(mra$y, split = "")
       
   797     rows <- nrow(mra)
       
   798     for(i in 1:rows) {
       
   799         ## Process the encoded strings from left to right and remove any
       
   800         ## identical characters found from both strings respectively.
       
   801         j <- 1
       
   802         while(j < min(length(x[[i]]), length(y[[i]]))) {
       
   803             if(x[[i]][j] == y[[i]][j]) {
       
   804                 x[[i]] <- x[[i]][-j]
       
   805                 y[[i]] <- y[[i]][-j]
       
   806             } else
       
   807                 j <- j + 1
       
   808         }
       
   809 
       
   810         ## Process the unmatched characters from right to left and
       
   811         ## remove any identical characters found from both names
       
   812         ## respectively.
       
   813         x[[i]] <- rev(x[[i]])
       
   814         y[[i]] <- rev(y[[i]])
       
   815         j <- 1
       
   816         while(j < min(length(x[[i]]), length(y[[i]]))) {
       
   817             if(x[[i]][j] == y[[i]][j]) {
       
   818                 x[[i]] <- x[[i]][-j]
       
   819                 y[[i]] <- y[[i]][-j]
       
   820             } else
       
   821                 j <- j + 1
       
   822         }
       
   823         ## Subtract the number of unmatched characters from 6 in the
       
   824         ## longer string. This is the similarity rating.
       
   825         len <- min(length(x[[i]]), length(y[[i]]))
       
   826         mra$sim[i] <- 6 - len
       
   827     }
       
   828 
       
   829     ## If the similarity is greater than or equal to the minimum
       
   830     ## required, it is a successful match.
       
   831     mra$match <- (mra$sim >= mra$min)
       
   832     return(mra$match)
       
   833 }
       
   834 
       
   835 END>>
       
   836 ! !
       
   837 
       
   838 !PhoneticStringUtilities::MRAStringComparator methodsFor:'api'!
       
   839 
       
   840 encode:wordIn 
       
   841     "see https://en.wikipedia.org/wiki/Match_Rating_Approach"
       
   842     
       
   843     |word prev|
       
   844 
       
   845     word := wordIn.
       
   846     
       
   847     "/ First, remove any nonalphabetical characters and uppercase it
       
   848 
       
   849     word := word select:#isLetter thenCollect:#asUppercase.
       
   850 
       
   851     "/ Delete vowels not at the start of the word
       
   852 
       
   853     word := word first asString , ((word from:2) reject:#isVowel).
       
   854 
       
   855     "/ Remove duplicate consecutive characters
       
   856 
       
   857     prev := nil.
       
   858     word := word 
       
   859                 collect:[:char |
       
   860                     char == prev ifTrue:[
       
   861                         $*
       
   862                     ] ifFalse:[
       
   863                         prev := char.
       
   864                         char.
       
   865                     ].    
       
   866                 ]
       
   867                 thenSelect:[:char | char ~~ $*].
       
   868 
       
   869     "/ If longer than 6 characters, take first and last 3            
       
   870     word size > 6 ifTrue:[
       
   871         word := (word copyFirst:3),(word copyLast:3)
       
   872     ].
       
   873     ^ word.
       
   874 
       
   875     "
       
   876      self new encode:'Catherine'            -> 'CTHRN'
       
   877      self new encode:'CatherineCatherine'   -> 'CTHHRN'
       
   878      self new encode:'Butter'               -> 'BTR'
       
   879      self new encode:'Byrne'                -> 'BYRN'
       
   880      self new encode:'Boern'                -> 'BRN'
       
   881      self new encode:'Smith'                -> 'SMTH'
       
   882      self new encode:'Smyth'                -> 'SMYTH'
       
   883      self new encode:'Kathryn'              -> 'KTHRYN'
       
   884     "
       
   885 
       
   886     "Created: / 28-07-2017 / 15:19:22 / cg"
       
   887     "Modified (comment): / 31-07-2017 / 15:14:31 / cg"
       
   888 ! !
       
   889 
       
   890 !PhoneticStringUtilities::SoundexStringComparator class methodsFor:'documentation'!
       
   891 
       
   892 documentation
       
   893 "
       
   894     WARNING: this is the so called 'simplified soundex' algorithm;
       
   895       there are more variants like miracode (american soundex) or
       
   896       mysqlSoundex around.
       
   897       
       
   898       Be sure to use the correct algorithm, if the generated strings must be compatible
       
   899       (otherwise, the differences are probably too small to be noticed as effect, but
       
   900       your search will be different)
       
   901 
       
   902     The following was copied from http://www.civilsolutions.com.au/publications/dedup.htm
       
   903 
       
   904     SOUNDEX is a phonetic coding algorithm that ignores many of the unreliable
       
   905     components of names, but by doing so reports more matches. 
       
   906 
       
   907     There are some variations around in the literature; 
       
   908     the following is called 'simplified soundex', and the rules for coding a name are:
       
   909 
       
   910     1. The first letter of the name is used in its un-coded form to serve as the prefix
       
   911        character of the code. (The rest of the code is numerical).
       
   912 
       
   913     2. Thereafter, W and H are ignored entirely.
       
   914 
       
   915     3. A, E, I, 0, U, Y are not assigned a code number, but do serve as 'separators' (see Step 5).
       
   916 
       
   917     4. Other letters of the name are converted to a numerical equivalent:
       
   918                  B, P, F, V              1 
       
   919                  C, G, J, K, Q, S, X, Z  2 
       
   920                  D, T                    3 
       
   921                  L                       4 
       
   922                  M, N                    5 
       
   923                  R                       6 
       
   924 
       
   925     5. There are two exceptions: 
       
   926         1. Letters that follow prefix letters which would, if coded, have the same
       
   927            numerical code, are ignored in all cases unless a ''separator'' (see Step 3) precedes them.
       
   928 
       
   929         2. The second letter of any pair of consonants having the same code number is likewise ignored, 
       
   930            i.e. unless there is a ''separator'' between them in the name.
       
   931 
       
   932     6. The final SOUNDEX code consists of the prefix letter plus three numerical characters.
       
   933        Longer codes are truncated to this length, and shorter codes are extended to it by adding zeros.
       
   934 
       
   935     Notice, that in another variant, w and h are treated slightly differently.
       
   936     This is only of relevance, if you need to reconstruct original soundex codes of other programs
       
   937     or for the original 1880 us census data.
       
   938     
       
   939     Also notice, that soundex deals better with english. 
       
   940     For german and other languages, other algorithms may provide better results.
       
   941 "
       
   942 ! !
       
   943 
       
   944 !PhoneticStringUtilities::SoundexStringComparator methodsFor:'api'!
       
   945 
       
   946 encode:word 
       
   947     |u p t prevCode|
       
   948 
       
   949     u := word asUppercase.
       
   950     p := u first asString.
       
   951     prevCode := self translate:u first.
       
   952     u from:2 to:u size do:[:c | 
       
   953         t := self translate:c.
       
   954         (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
       
   955             p := p , t.
       
   956             p size == 4 ifTrue:[^ p ].
       
   957         ].
       
   958         prevCode := t
       
   959     ].
       
   960     [ p size < 4 ] whileTrue:[
       
   961         p := p , '0'
       
   962     ].
       
   963     ^ (p copyFrom:1 to:4)
       
   964 
       
   965     "
       
   966      self new encode:'washington' -> 'W252'
       
   967      self new encode:'lee'        -> 'L000'
       
   968      self new encode:'Gutierrez'  -> 'G362'
       
   969      self new encode:'Pfister'    -> 'P236'
       
   970      self new encode:'Jackson'    -> 'J250'
       
   971      self new encode:'Tymczak'    -> 'T522'
       
   972     "
       
   973     
       
   974     "notice:
       
   975      MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
       
   976      self new encode:'Ashcraft'   -> 'A226'
       
   977     "
       
   978 
       
   979     "Created: / 28-07-2017 / 15:21:23 / cg"
       
   980     "Modified (comment): / 01-08-2017 / 19:01:43 / cg"
       
   981 ! !
       
   982 
       
   983 !PhoneticStringUtilities::SoundexStringComparator methodsFor:'private'!
       
   984 
       
   985 translate:aCharacter
       
   986     "use simple if's for more speed when compiled"
       
   987 
       
   988     "vowels serve as separators"
       
   989     aCharacter == $A ifTrue:[^ '0' ].         
       
   990     aCharacter == $E ifTrue:[^ '0' ].
       
   991     aCharacter == $I ifTrue:[^ '0' ].
       
   992     aCharacter == $O ifTrue:[^ '0' ].
       
   993     aCharacter == $U ifTrue:[^ '0' ].
       
   994     aCharacter == $Y ifTrue:[^ '0' ].
       
   995 
       
   996     aCharacter == $B ifTrue:[^ '1' ]. 
       
   997     aCharacter == $P ifTrue:[^ '1' ]. 
       
   998     aCharacter == $F ifTrue:[^ '1' ]. 
       
   999     aCharacter == $V ifTrue:[^ '1' ]. 
       
  1000 
       
  1001     aCharacter == $C ifTrue:[^ '2' ]. 
       
  1002     aCharacter == $S ifTrue:[^ '2' ]. 
       
  1003     aCharacter == $K ifTrue:[^ '2' ]. 
       
  1004     aCharacter == $G ifTrue:[^ '2' ]. 
       
  1005     aCharacter == $J ifTrue:[^ '2' ]. 
       
  1006     aCharacter == $Q ifTrue:[^ '2' ]. 
       
  1007     aCharacter == $X ifTrue:[^ '2' ]. 
       
  1008     aCharacter == $Z ifTrue:[^ '2' ]. 
       
  1009 
       
  1010     aCharacter == $D ifTrue:[^ '3' ]. 
       
  1011     aCharacter == $T ifTrue:[^ '3' ]. 
       
  1012 
       
  1013     aCharacter == $L ifTrue:[^ '4' ]. 
       
  1014 
       
  1015     aCharacter == $M ifTrue:[^ '5' ]. 
       
  1016     aCharacter == $N ifTrue:[^ '5' ]. 
       
  1017 
       
  1018     aCharacter == $R ifTrue:[^ '6' ]. 
       
  1019     ^ nil
       
  1020 ! !
       
  1021 
       
  1022 !PhoneticStringUtilities::MySQLSoundexStringComparator class methodsFor:'documentation'!
       
  1023 
       
  1024 documentation
       
  1025 "
       
  1026     MySQL soundex is like american Soundex (i.e. miracode) without the 4 character limitation,
       
  1027     and also removing vokals first, then removing duplicate codes
       
  1028     (whereas the soundex code does this in reverse order).
       
  1029 
       
  1030     These variations are important, if you need the miracode soundex codes to be generated.
       
  1031 "
       
  1032 ! !
       
  1033 
       
  1034 !PhoneticStringUtilities::MySQLSoundexStringComparator methodsFor:'api'!
       
  1035 
       
  1036 encode:word 
       
  1037     |u p t prevCode|
       
  1038 
       
  1039     u := word asUppercase.
       
  1040     p := u first asString.
       
  1041     prevCode := self translate:u first.
       
  1042     u from:2 to:u size do:[:c |
       
  1043         t := self translate:c.
       
  1044         (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
       
  1045             p := p , t.
       
  1046         ].
       
  1047         (t ~= '0' and:[ c ~= $W and:[c ~= $H]]) ifTrue:[
       
  1048             prevCode := t.
       
  1049         ].
       
  1050     ].
       
  1051     [ p size < 4 ] whileTrue:[
       
  1052         p := p , '0'
       
  1053     ].
       
  1054     ^ p
       
  1055 
       
  1056     "Created: / 28-07-2017 / 15:23:41 / cg"
       
  1057     "Modified: / 31-07-2017 / 17:53:51 / cg"
       
  1058 ! !
       
  1059 
       
  1060 !PhoneticStringUtilities::NYSIISStringComparator class methodsFor:'documentation'!
       
  1061 
       
  1062 documentation
       
  1063 "
       
  1064     NYSIIS Algorithm:
       
  1065 
       
  1066     1.
       
  1067         remove all ''S'' and ''Z'' chars from the end of the surname 
       
  1068 
       
  1069     2.
       
  1070         transcode initial strings
       
  1071             MAC => MC
       
  1072             PF => F
       
  1073 
       
  1074     3.
       
  1075         Transcode trailing strings as follows,
       
  1076         
       
  1077             IX => IC
       
  1078             EX => EC
       
  1079             YE,EE,IE => Y
       
  1080             NT,ND => D 
       
  1081 
       
  1082     4.
       
  1083         transcode ''EV'' to ''EF'' if not at start of name
       
  1084 
       
  1085     5.
       
  1086         use first character of name as first character of key 
       
  1087 
       
  1088     6.
       
  1089         remove any ''W'' that follows a vowel 
       
  1090 
       
  1091     7.
       
  1092         replace all vowels with ''A'' 
       
  1093 
       
  1094     8.
       
  1095         transcode ''GHT'' to ''GT'' 
       
  1096 
       
  1097     9.
       
  1098         transcode ''DG'' to ''G'' 
       
  1099 
       
  1100     10.
       
  1101         transcode ''PH'' to ''F'' 
       
  1102 
       
  1103     11.
       
  1104         if not first character, eliminate all ''H'' preceded or followed by a vowel 
       
  1105 
       
  1106     12.
       
  1107         change ''KN'' to ''N'', else ''K'' to ''C'' 
       
  1108 
       
  1109     13.
       
  1110         if not first character, change ''M'' to ''N'' 
       
  1111 
       
  1112     14.
       
  1113         if not first character, change ''Q'' to ''G'' 
       
  1114 
       
  1115     15.
       
  1116         transcode ''SH'' to ''S'' 
       
  1117 
       
  1118     16.
       
  1119         transcode ''SCH'' to ''S'' 
       
  1120 
       
  1121     17.
       
  1122         transcode ''YW'' to ''Y'' 
       
  1123 
       
  1124     18.
       
  1125         if not first or last character, change ''Y'' to ''A'' 
       
  1126 
       
  1127     19.
       
  1128         transcode ''WR'' to ''R'' 
       
  1129 
       
  1130     20.
       
  1131         if not first character, change ''Z'' to ''S'' 
       
  1132 
       
  1133     21.
       
  1134         transcode terminal ''AY'' to ''Y'' 
       
  1135 
       
  1136     22.
       
  1137         remove traling vowels 
       
  1138 
       
  1139     23.
       
  1140         collapse all strings of repeated characters 
       
  1141 
       
  1142     24.
       
  1143         if first char of original surname was a vowel, append it to the code
       
  1144 "
       
  1145 ! !
       
  1146 
       
  1147 !PhoneticStringUtilities::NYSIISStringComparator methodsFor:'api'!
       
  1148 
       
  1149 encode:aString 
       
  1150     |k|
       
  1151 
       
  1152     k := self rule1:(aString asUppercase).
       
  1153     k := self rule2:k.
       
  1154     k := self rule3:k.
       
  1155     k := self rule4:k.
       
  1156     k := self rule5:k.
       
  1157     k := self rule6:k.
       
  1158     k := self rule7:k.
       
  1159     k := self rule8:k.
       
  1160     k := self rule9:k.
       
  1161     k := self rule10:k.
       
  1162     k := self rule11:k.
       
  1163     k := self rule12:k.
       
  1164     k := self rule13:k.
       
  1165     k := self rule14:k.
       
  1166     k := self rule15:k.
       
  1167     k := self rule16:k.
       
  1168     k := self rule17:k.
       
  1169     k := self rule18:k.
       
  1170     k := self rule19:k.
       
  1171     k := self rule20:k.
       
  1172     k := self rule21:k.
       
  1173     k := self rule22:k.
       
  1174     k := self rule23:k.
       
  1175     k := self rule24:k originalKey:aString.
       
  1176     ^ k
       
  1177 
       
  1178     "
       
  1179      self new encode:'hello'
       
  1180      self new encode:'bliss'
       
  1181     "
       
  1182     "
       
  1183      self new phoneticStringsFor:'hello'
       
  1184      self new phoneticStringsFor:'bliss'
       
  1185     "
       
  1186 
       
  1187     "Created: / 28-07-2017 / 15:34:52 / cg"
       
  1188 ! !
       
  1189 
       
  1190 !PhoneticStringUtilities::NYSIISStringComparator methodsFor:'private'!
       
  1191 
       
  1192 rule10:key 
       
  1193     "10. transcode 'PH' to 'F' "
       
  1194     
       
  1195     ^ self 
       
  1196         transcodeAll:'PH'
       
  1197         of:key
       
  1198         to:'F'
       
  1199         startingAt:1
       
  1200 !
       
  1201 
       
  1202 rule11:key 
       
  1203     |k c|
       
  1204 
       
  1205     "11. if not first character, eliminate all 'H' preceded or followed by a vowel "
       
  1206     k := key copy.
       
  1207     c := SortedCollection sortBlock:[:a :b | b < a ].
       
  1208     2 to:key size do:[:i | 
       
  1209         (key at:i) = $H ifTrue:[
       
  1210             ((key at:i - 1) isVowel 
       
  1211                 or:[ (i < key size) and:[ (key at:i + 1) isVowel ] ]) ifTrue:[ c add:i ]
       
  1212         ]
       
  1213     ].
       
  1214     c do:[:n | 
       
  1215         k := (k copyFrom:1 to:n - 1) , (k copyFrom:n + 1 to:k size)
       
  1216     ].
       
  1217     ^ k
       
  1218 !
       
  1219 
       
  1220 rule12:key 
       
  1221     |k|
       
  1222 
       
  1223     "12. change 'KN' to 'N', else 'K' to 'C' "
       
  1224     k := self 
       
  1225                 transcodeAll:'KN'
       
  1226                 of:key
       
  1227                 to:'K'
       
  1228                 startingAt:1.
       
  1229     k := self 
       
  1230                 transcodeAll:'K'
       
  1231                 of:k
       
  1232                 to:'C'
       
  1233                 startingAt:1.
       
  1234     ^ k
       
  1235 !
       
  1236 
       
  1237 rule13:key 
       
  1238     "13. if not first character, change 'M' to 'N' "
       
  1239     
       
  1240     ^ self 
       
  1241         transcodeAll:'M'
       
  1242         of:key
       
  1243         to:'N'
       
  1244         startingAt:2
       
  1245 !
       
  1246 
       
  1247 rule14:key 
       
  1248     "14. if not first character, change 'Q' to 'G' "
       
  1249     
       
  1250     ^ self 
       
  1251         transcodeAll:'Q'
       
  1252         of:key
       
  1253         to:'G'
       
  1254         startingAt:2
       
  1255 !
       
  1256 
       
  1257 rule15:key 
       
  1258     "15. transcode 'SH' to 'S' "
       
  1259     
       
  1260     ^ self 
       
  1261         transcodeAll:'SH'
       
  1262         of:key
       
  1263         to:'S'
       
  1264         startingAt:1
       
  1265 !
       
  1266 
       
  1267 rule16:key 
       
  1268     "16. transcode 'SCH' to 'S' "
       
  1269     
       
  1270     ^ self 
       
  1271         transcodeAll:'SCH'
       
  1272         of:key
       
  1273         to:'S'
       
  1274         startingAt:1
       
  1275 !
       
  1276 
       
  1277 rule17:key 
       
  1278     "17. transcode 'YW' to 'Y' "
       
  1279     
       
  1280     ^ self 
       
  1281         transcodeAll:'YW'
       
  1282         of:key
       
  1283         to:'Y'
       
  1284         startingAt:1
       
  1285 !
       
  1286 
       
  1287 rule18:key 
       
  1288     |k|
       
  1289 
       
  1290     "18. if not first or last character, change 'Y' to 'A' "
       
  1291     k := self 
       
  1292                 transcodeAll:'Y'
       
  1293                 of:key
       
  1294                 to:'A'
       
  1295                 startingAt:2.
       
  1296     key last = $Y ifTrue:[
       
  1297         k at:k size put:$Y
       
  1298     ].
       
  1299     ^ k
       
  1300 !
       
  1301 
       
  1302 rule19:key 
       
  1303     "19. transcode 'WR' to 'R' "
       
  1304     
       
  1305     ^ self 
       
  1306         transcodeAll:'WR'
       
  1307         of:key
       
  1308         to:'R'
       
  1309         startingAt:1
       
  1310 !
       
  1311 
       
  1312 rule1:key 
       
  1313     |k|
       
  1314 
       
  1315     k := key copy.
       
  1316      "1. Remove all 'S' and 'Z' chars from the end of the name"
       
  1317     [
       
  1318         'SZ' includes:k last
       
  1319     ] whileTrue:[ k := k copyFrom:1 to:(k size - 1) ].
       
  1320     ^ k
       
  1321 !
       
  1322 
       
  1323 rule20:key 
       
  1324     "20. if not first character, change 'Z' to 'S' "
       
  1325     
       
  1326     ^ self 
       
  1327         transcodeAll:'Z'
       
  1328         of:key
       
  1329         to:'S'
       
  1330         startingAt:2
       
  1331 !
       
  1332 
       
  1333 rule21:key 
       
  1334     "21. transcode terminal 'AY' to 'Y' "
       
  1335     
       
  1336     ^ self 
       
  1337         transcodeAll:'AY'
       
  1338         of:key
       
  1339         to:'Y'
       
  1340         startingAt:key size - 1
       
  1341 !
       
  1342 
       
  1343 rule22:key 
       
  1344     |k|
       
  1345 
       
  1346     "22. remove trailing vowels "
       
  1347     k := key copy.
       
  1348     [ k last isVowel ] whileTrue:[
       
  1349         k := k copyFrom:1 to:k size - 1
       
  1350     ].
       
  1351     ^ k
       
  1352 !
       
  1353 
       
  1354 rule23:key 
       
  1355     |k c|
       
  1356 
       
  1357     "23. collapse all strings of repeated characters "
       
  1358     k := key copy.
       
  1359     c := SortedCollection sortBlock:[:a :b | b < a ].
       
  1360     k size to:2 do:[:i | 
       
  1361         (k at:i) = (k at:i - 1) ifTrue:[
       
  1362             c add:i
       
  1363         ]
       
  1364     ].
       
  1365     c do:[:n | 
       
  1366         k := (k copyFrom:1 to:n - 1) , (k copyFrom:n + 1 to:k size)
       
  1367     ].
       
  1368     ^ k
       
  1369 !
       
  1370 
       
  1371 rule24:key originalKey:originalKey 
       
  1372     |k|
       
  1373 
       
  1374     "24. if first char of original surname was a vowel, append it to the code"
       
  1375     k := key copy.
       
  1376     originalKey first isVowel ifTrue:[
       
  1377         k := k , originalKey first asString asUppercase
       
  1378     ].
       
  1379     ^ k
       
  1380 !
       
  1381 
       
  1382 rule2:key 
       
  1383     |k|
       
  1384 
       
  1385     k := key copy.
       
  1386      "2. Transcode initial strings:  MAC => MC   PF => F"
       
  1387     (k startsWith:'MAC') ifTrue:[
       
  1388         k := 'MC' , (k copyFrom:4)
       
  1389     ].
       
  1390     (k startsWith:'PF') ifTrue:[
       
  1391         k := 'F' , (k copyFrom:3)
       
  1392     ].
       
  1393     ^ k
       
  1394 !
       
  1395 
       
  1396 rule3:key 
       
  1397     |k|
       
  1398 
       
  1399     "3. Transcode trailing strings as follows:
       
  1400         IX => IC
       
  1401           EX => EC
       
  1402           YE, EE, IE => Y
       
  1403            NT, ND => D"
       
  1404     k := key copy.
       
  1405     k := self 
       
  1406                 transcodeTrailing:#( 'IX' )
       
  1407                 of:k
       
  1408                 to:'IC'.
       
  1409     k := self 
       
  1410                 transcodeTrailing:#( 'EX' )
       
  1411                 of:k
       
  1412                 to:'EC'.
       
  1413     k := self 
       
  1414                 transcodeTrailing:#( 'YE' 'EE' 'IE' )
       
  1415                 of:k
       
  1416                 to:'Y'.
       
  1417     k := self 
       
  1418                 transcodeTrailing:#( 'NT' 'ND' )
       
  1419                 of:k
       
  1420                 to:'D'.
       
  1421     ^ k
       
  1422 !
       
  1423 
       
  1424 rule4:key 
       
  1425     "4. Transcode 'EV' to 'EF' if not at start of name"
       
  1426     
       
  1427     ^ self 
       
  1428         transcodeAll:'EV'
       
  1429         of:key
       
  1430         to:'EF'
       
  1431         startingAt:2
       
  1432 !
       
  1433 
       
  1434 rule5:key 
       
  1435     "5. Use first character of name as first character of key.  Ignored because we're doing an in-place conversion"
       
  1436     
       
  1437     ^ key
       
  1438 !
       
  1439 
       
  1440 rule6:key 
       
  1441     |k i|
       
  1442 
       
  1443     "6. Remove any 'W' that follows a vowel"
       
  1444     k := key copy.
       
  1445     i := 2.
       
  1446     [
       
  1447         (i := k indexOf:$W startingAt:i) > 0
       
  1448     ] whileTrue:[
       
  1449         (k at:i - 1) isVowel ifTrue:[
       
  1450             k := (k copyFrom:1 to:i - 1) , (k copyFrom:i + 1 to:k size).
       
  1451             i := i - 1
       
  1452         ]
       
  1453     ].
       
  1454     ^ k
       
  1455 !
       
  1456 
       
  1457 rule7:key 
       
  1458     |k|
       
  1459 
       
  1460     "7. replace all vowels with 'A' "
       
  1461     k := key copy.
       
  1462     1 to:key size do:[:i | 
       
  1463         (key at:i) isVowel ifTrue:[
       
  1464             k at:i put:$A
       
  1465         ]
       
  1466     ].
       
  1467     ^ k
       
  1468 !
       
  1469 
       
  1470 rule8:key 
       
  1471     "8. transcode 'GHT' to 'GT' "
       
  1472     
       
  1473     ^ self 
       
  1474         transcodeAll:'GHT'
       
  1475         of:key
       
  1476         to:'GT'
       
  1477         startingAt:1
       
  1478 !
       
  1479 
       
  1480 rule9:key 
       
  1481     "9. transcode 'DG' to 'G' "
       
  1482     
       
  1483     ^ self 
       
  1484         transcodeAll:'DG'
       
  1485         of:key
       
  1486         to:'G'
       
  1487         startingAt:1
       
  1488 !
       
  1489 
       
  1490 transcodeAll:aString of:key to:replacementString startingAt:start 
       
  1491     |k i|
       
  1492 
       
  1493     k := key copy.
       
  1494     [
       
  1495         (i := k indexOfSubCollection:aString startingAt:start) > 0
       
  1496     ] whileTrue:[
       
  1497         k := (k copyFrom:1 to:i - 1) , replacementString 
       
  1498                     , (k copyFrom:i + aString size to:k size)
       
  1499     ].
       
  1500     ^ k
       
  1501 !
       
  1502 
       
  1503 transcodeTrailing:anArrayOfStrings of:key to:replacementString 
       
  1504     |answer|
       
  1505 
       
  1506     answer := key copy.
       
  1507     anArrayOfStrings do:[:aString | 
       
  1508         answer := self 
       
  1509                     transcodeAll:aString
       
  1510                     of:answer
       
  1511                     to:replacementString
       
  1512                     startingAt:(answer size - aString size) + 1
       
  1513     ].
       
  1514     ^ answer
       
  1515 ! !
       
  1516 
       
  1517 !PhoneticStringUtilities::PhonemStringComparator class methodsFor:'documentation'!
       
  1518 
       
  1519 documentation
       
  1520 "
       
  1521     Implementation of the PHONEM algorithm, as described in
       
  1522     'Georg Wilde and Carsten Meyer, Doppelgaenger gesucht -
       
  1523     Ein Programm fuer kontextsensitive phonetische Textumwandlung
       
  1524     ct Magazin fuer Computer & Technik 25/1998'
       
  1525     
       
  1526     This algorithm deals better with the german language (it cares for umlauts)
       
  1527 "
       
  1528 ! !
       
  1529 
       
  1530 !PhoneticStringUtilities::PhonemStringComparator methodsFor:'api'!
       
  1531 
       
  1532 encode:aString 
       
  1533     |s idx t t2|
       
  1534 
       
  1535     s := aString asUppercase.
       
  1536 
       
  1537     idx := 1.
       
  1538     [idx < (s size-1)] whileTrue:[
       
  1539         t2 := nil.
       
  1540         t := s copyFrom:idx to:idx+1.
       
  1541         t = 'SC' ifTrue:[ t2 := 'C' ]
       
  1542         ifFalse:[ t = 'SZ' ifTrue:[ t2 := 'C' ]
       
  1543         ifFalse:[ t = 'CZ' ifTrue:[ t2 := 'C' ]
       
  1544         ifFalse:[ t = 'TZ' ifTrue:[ t2 := 'C' ]
       
  1545         ifFalse:[ t = 'TS' ifTrue:[ t2 := 'C' ]
       
  1546         ifFalse:[ t = 'KS' ifTrue:[ t2 := 'X' ]
       
  1547         ifFalse:[ t = 'PF' ifTrue:[ t2 := 'V' ]
       
  1548         ifFalse:[ t = 'QU' ifTrue:[ t2 := 'KW' ]
       
  1549         ifFalse:[ t = 'PH' ifTrue:[ t2 := 'V' ]
       
  1550         ifFalse:[ t = 'UE' ifTrue:[ t2 := 'Y' ]
       
  1551         ifFalse:[ t = 'AE' ifTrue:[ t2 := 'E' ]
       
  1552         ifFalse:[ t = 'OE' ifTrue:[ t2 := 'Ö' ]
       
  1553         ifFalse:[ t = 'EI' ifTrue:[ t2 := 'AY' ]
       
  1554         ifFalse:[ t = 'EY' ifTrue:[ t2 := 'AY' ]
       
  1555         ifFalse:[ t = 'EU' ifTrue:[ t2 := 'OY' ]
       
  1556         ifFalse:[ t = 'AU' ifTrue:[ t2 := 'A§' ]
       
  1557         ifFalse:[ t = 'OU' ifTrue:[ t2 := '§ ' ]]]]]]]]]]]]]]]]].
       
  1558         t2 notNil ifTrue:[
       
  1559             s := (s copyTo:idx-1),t2,(s copyFrom:idx+2)
       
  1560         ] ifFalse:[
       
  1561             idx := idx + 1.
       
  1562         ].
       
  1563     ].
       
  1564 
       
  1565     "/ single character substitutions via tr
       
  1566     s := s copyTransliterating:'ÖÄZKGQÜIJFWPT§' to:'YECCCCYYYVVDDUA'.
       
  1567     s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'' complement:true squashDuplicates:false.
       
  1568     s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'ABCDLMNORSUVWXY' complement:false squashDuplicates:true.
       
  1569     ^ s
       
  1570 
       
  1571     "
       
  1572      self basicNew encode:'müller'  -> 'MYLR'    
       
  1573      self basicNew encode:'mueller' -> 'MYLR'    
       
  1574      self basicNew encode:'möller'  -> 'MYLR'
       
  1575      self basicNew encode:'miller'  -> 'MYLR'     
       
  1576      self basicNew encode:'muller'  -> 'MULR' 
       
  1577      self basicNew encode:'muler'   -> 'MULR' 
       
  1578 
       
  1579      self basicNew phoneticStringsFor:'müller'  #('MYLR')    
       
  1580      self basicNew phoneticStringsFor:'mueller' #('MYLR')    
       
  1581      self basicNew phoneticStringsFor:'möller'  #('MYLR')
       
  1582      self basicNew phoneticStringsFor:'miller'  #('MYLR')     
       
  1583      self basicNew phoneticStringsFor:'muller'  #('MULR') 
       
  1584      self basicNew phoneticStringsFor:'muler'   #('MULR') 
       
  1585      
       
  1586      self basicNew phoneticStringsFor:'schmidt'     #('CMYD')
       
  1587      self basicNew phoneticStringsFor:'schneider'   #('CNAYDR')
       
  1588      self basicNew phoneticStringsFor:'fischer'     #('VYCR')
       
  1589      self basicNew phoneticStringsFor:'weber'       #('VBR')
       
  1590      self basicNew phoneticStringsFor:'weeber'      #('VBR')
       
  1591      self basicNew phoneticStringsFor:'webber'      #('VBR')
       
  1592      self basicNew phoneticStringsFor:'wepper'      #('VBR')
       
  1593      
       
  1594      self basicNew phoneticStringsFor:'meyer'       #('MAYR')
       
  1595      self basicNew phoneticStringsFor:'maier'       #('MAYR')
       
  1596      self basicNew phoneticStringsFor:'mayer'       #('MAYR')
       
  1597      self basicNew phoneticStringsFor:'mayr'        #('MAYR')
       
  1598      self basicNew phoneticStringsFor:'meir'        #('MAYR')
       
  1599      
       
  1600      self basicNew phoneticStringsFor:'wagner'      #('VACNR')
       
  1601      self basicNew phoneticStringsFor:'schulz'      #('CULC')
       
  1602      self basicNew phoneticStringsFor:'becker'      #('BCR')
       
  1603      self basicNew phoneticStringsFor:'hoffmann'    #('OVMAN')
       
  1604      self basicNew phoneticStringsFor:'haus'        #('AUS')
       
  1605      
       
  1606      self basicNew phoneticStringsFor:'schäfer'     #('CVR')
       
  1607      self basicNew phoneticStringsFor:'scheffer'    #('CVR')
       
  1608      self basicNew phoneticStringsFor:'schaeffer'   #('CVR')
       
  1609      self basicNew phoneticStringsFor:'schaefer'    #('CVR')
       
  1610     "
       
  1611 
       
  1612     "Created: / 28-07-2017 / 15:38:08 / cg"
       
  1613 ! !
  1166 ! !
  1614 
  1167 
  1615 !PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'LICENSE'!
  1168 !PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'LICENSE'!
  1616 
  1169 
  1617 copyright
  1170 copyright
  3138         ]
  2691         ]
  3139 
  2692 
  3140     "Modified: / 28-07-2017 / 11:35:12 / cg"
  2693     "Modified: / 28-07-2017 / 11:35:12 / cg"
  3141 ! !
  2694 ! !
  3142 
  2695 
       
  2696 !PhoneticStringUtilities::ExtendedSoundexStringComparator class methodsFor:'documentation'!
       
  2697 
       
  2698 documentation
       
  2699 "
       
  2700     There are many extended and enhanced soundex variants around;
       
  2701     here is one, called 'extended soundex'. It is destribed for example in
       
  2702     http://www.epidata.dk/documentation.php.
       
  2703     An author or origin is unknown.
       
  2704 
       
  2705     The number of digits is increased to 5 or 8;
       
  2706     The first character is not used literally; instead it is encoded like the rest.
       
  2707     This might have a negative effect on names starting with a vovel, though.
       
  2708 
       
  2709     Overall, it can be doubted if this is really an enhancement after all.
       
  2710 "
       
  2711 ! !
       
  2712 
       
  2713 !PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'api'!
       
  2714 
       
  2715 phoneticStringsFor:aString
       
  2716     "generates both an extended soundex of length 5 and one of length 8"
       
  2717 
       
  2718     |first second u t prevCode|
       
  2719 
       
  2720     u := aString asUppercase.
       
  2721     first := second := ''.
       
  2722     u do:[:c | 
       
  2723         t := self translate:c.
       
  2724         (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
       
  2725             first := first , t.
       
  2726             second := second , t.
       
  2727             second size == 8 ifTrue:[
       
  2728                 ^ Array with:(first copyTo:5) with:second 
       
  2729             ].
       
  2730         ].
       
  2731         prevCode := t
       
  2732     ].
       
  2733     [ first size < 5 ] whileTrue:[
       
  2734         first := first , '0'.
       
  2735         second := second , '0'.
       
  2736     ].
       
  2737     [ second size < 8 ] whileTrue:[
       
  2738         second := second , '0'
       
  2739     ].
       
  2740     ^ Array with:first with:second
       
  2741 
       
  2742     "
       
  2743      self basicNew phoneticStringsFor:'müller'  #('87900' '87900000')  
       
  2744      self basicNew phoneticStringsFor:'miller'  #('87900' '87900000')   
       
  2745      self basicNew phoneticStringsFor:'muller'  #('87900' '87900000')    
       
  2746      self basicNew phoneticStringsFor:'muler'   #('87900' '87900000')
       
  2747      self basicNew phoneticStringsFor:'schmidt'    #('38600' '38600000')
       
  2748      self basicNew phoneticStringsFor:'schneider'  #('38690' '38690000')
       
  2749      self basicNew phoneticStringsFor:'fischer'    #('23900' '23900000')
       
  2750      self basicNew phoneticStringsFor:'weber'      #('19000' '19000000')
       
  2751      self basicNew phoneticStringsFor:'meyer'      #('89000' '89000000')
       
  2752      self basicNew phoneticStringsFor:'wagner'     #('48900' '48900000')
       
  2753      self basicNew phoneticStringsFor:'schulz'     #('37500' '37500000')
       
  2754      self basicNew phoneticStringsFor:'becker'     #('13900' '13900000')
       
  2755      self basicNew phoneticStringsFor:'hoffmann'   #('28800' '28800000')
       
  2756      self basicNew phoneticStringsFor:'schäfer'    #('32900' '32900000')
       
  2757     "
       
  2758 ! !
       
  2759 
       
  2760 !PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'private'!
       
  2761 
       
  2762 translate:aCharacter
       
  2763     "use simple if's for more speed when compiled"
       
  2764 
       
  2765     "vowels serve as separators"
       
  2766     aCharacter == $A ifTrue:[^ '0' ].         
       
  2767     aCharacter == $E ifTrue:[^ '0' ].
       
  2768     aCharacter == $I ifTrue:[^ '0' ].
       
  2769     aCharacter == $O ifTrue:[^ '0' ].
       
  2770     aCharacter == $U ifTrue:[^ '0' ].
       
  2771     aCharacter == $Y ifTrue:[^ '0' ].
       
  2772 
       
  2773     aCharacter == $B ifTrue:[^ '1' ]. 
       
  2774     aCharacter == $P ifTrue:[^ '1' ].
       
  2775 
       
  2776     aCharacter == $F ifTrue:[^ '2' ]. 
       
  2777     aCharacter == $V ifTrue:[^ '2' ]. 
       
  2778 
       
  2779     aCharacter == $C ifTrue:[^ '3' ]. 
       
  2780     aCharacter == $S ifTrue:[^ '3' ]. 
       
  2781     aCharacter == $K ifTrue:[^ '3' ].
       
  2782 
       
  2783     aCharacter == $G ifTrue:[^ '4' ]. 
       
  2784     aCharacter == $J ifTrue:[^ '4' ].
       
  2785 
       
  2786     aCharacter == $Q ifTrue:[^ '5' ]. 
       
  2787     aCharacter == $X ifTrue:[^ '5' ]. 
       
  2788     aCharacter == $Z ifTrue:[^ '5' ]. 
       
  2789 
       
  2790     aCharacter == $D ifTrue:[^ '6' ]. 
       
  2791     aCharacter == $G ifTrue:[^ '6' ]. 
       
  2792     aCharacter == $T ifTrue:[^ '6' ]. 
       
  2793 
       
  2794     aCharacter == $L ifTrue:[^ '7' ]. 
       
  2795 
       
  2796     aCharacter == $M ifTrue:[^ '8' ]. 
       
  2797     aCharacter == $N ifTrue:[^ '8' ]. 
       
  2798 
       
  2799     aCharacter == $R ifTrue:[^ '9' ]. 
       
  2800     ^ nil
       
  2801 ! !
       
  2802 
       
  2803 !PhoneticStringUtilities::SingleResultPhoneticStringComparator class methodsFor:'documentation'!
       
  2804 
       
  2805 documentation
       
  2806 "
       
  2807     documentation to be added.
       
  2808 
       
  2809     [author:]
       
  2810         cg
       
  2811 
       
  2812     [instance variables:]
       
  2813 
       
  2814     [class variables:]
       
  2815 
       
  2816     [see also:]
       
  2817 
       
  2818 "
       
  2819 ! !
       
  2820 
       
  2821 !PhoneticStringUtilities::SingleResultPhoneticStringComparator methodsFor:'api'!
       
  2822 
       
  2823 encode:word
       
  2824     ^ self subclassResponsibility
       
  2825 
       
  2826     "Created: / 28-07-2017 / 15:20:49 / cg"
       
  2827 !
       
  2828 
       
  2829 phoneticStringsFor:word 
       
  2830     ^ Array with:(self encode:word)
       
  2831 
       
  2832     "Created: / 28-07-2017 / 15:20:38 / cg"
       
  2833 ! !
       
  2834 
       
  2835 !PhoneticStringUtilities::MRAStringComparator class methodsFor:'documentation'!
       
  2836 
       
  2837 documentation
       
  2838 "
       
  2839     Match Rating Approach Encoder
       
  2840 
       
  2841     The Western Airlines matching rating approach name encoder
       
  2842 
       
  2843     [see also:]
       
  2844         https://en.wikipedia.org/wiki/Match_Rating_Approach
       
  2845         
       
  2846         G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery,
       
  2847             ''Accessing Individual Records from Personal Data Files Using Nonunique Identifiers'' 
       
  2848             US National Institute of Standards and Technology, SP-500-2 (1977), p. 17.
       
  2849 "
       
  2850 !
       
  2851 
       
  2852 rCode
       
  2853 "<<END
       
  2854 ## Copyright (c) 2015, James P. Howard, II <jh@jameshoward.us>
       
  2855 ##
       
  2856 ## Redistribution and use in source and binary forms, with or without
       
  2857 ## modification, are permitted provided that the following conditions are
       
  2858 ## met:
       
  2859 ##
       
  2860 ##     Redistributions of source code must retain the above copyright
       
  2861 ##     notice, this list of conditions and the following disclaimer.
       
  2862 ##
       
  2863 ##     Redistributions in binary form must reproduce the above copyright
       
  2864 ##     notice, this list of conditions and the following disclaimer in
       
  2865 ##     the documentation and/or other materials provided with the
       
  2866 ##     distribution.
       
  2867 ##
       
  2868 ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
       
  2869 ## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
       
  2870 ## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
       
  2871 ## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
       
  2872 ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
       
  2873 ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
       
  2874 ## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
       
  2875 ## DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
       
  2876 ## THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
       
  2877 ## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
       
  2878 ## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
       
  2879 
       
  2880 #' @rdname mra
       
  2881 #' @title Match Rating Approach Encoder
       
  2882 #'
       
  2883 #' @description
       
  2884 #' The Western Airlines matching rating approach name encoder
       
  2885 #'
       
  2886 #' @param word string or vector of strings to encode
       
  2887 #' @param x MRA-encoded character vector
       
  2888 #' @param y MRA-encoded character vector
       
  2889 #'
       
  2890 #' @details
       
  2891 #'
       
  2892 #' The variable \code{word} is the name to be encoded.  The variable
       
  2893 #' \code{maxCodeLen} is \emph{not} supported in this algorithm encoder
       
  2894 #' because the algorithm itself is dependent upon its six-character
       
  2895 #' length.  The variables \code{x} and \code{y} are MRA-encoded and are
       
  2896 #' compared to each other using the MRA comparison specification.
       
  2897 #'
       
  2898 #' @return The \code{mra_encode} function returns match rating approach
       
  2899 #' encoded character vector.  The \code{mra_compare} returns a boolean
       
  2900 #' vector which is \code{TRUE} if \code{x} and \code{y} pass the MRA
       
  2901 #' comparison test.
       
  2902 #'
       
  2903 #' @references
       
  2904 #'
       
  2905 #' G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery,
       
  2906 #' \emph{Accessing Individual Records from Personal Data Files Using
       
  2907 #' Nonunique Identifiers,} US National Institute of Standards and
       
  2908 #' Technology, SP-500-2 (1977), p. 17.
       
  2909 #'
       
  2910 #' @family phonics
       
  2911 #'
       
  2912 #' @examples
       
  2913 #' mra_encode("William")
       
  2914 #' mra_encode(c("Peter", "Peady"))
       
  2915 #' mra_encode("Stevenson")
       
  2916 
       
  2917 #' @rdname mra
       
  2918 #' @name mra_encode
       
  2919 #' @export
       
  2920 mra_encode <- function(word) {
       
  2921 
       
  2922     ## First, remove any nonalphabetical characters and uppercase it
       
  2923     word <- gsub("[^[:alpha:]]*", "", word)
       
  2924     word <- toupper(word)
       
  2925 
       
  2926     ## First character of key = first character of name
       
  2927     first <- substr(word, 1, 1)
       
  2928     word <- substr(word, 2, nchar(word))
       
  2929 
       
  2930     ## Delete vowels not at the start of the word
       
  2931     word <- gsub("[AEIOU]", "", word)
       
  2932     word <- paste(first, word, sep = "")
       
  2933 
       
  2934     ## Remove duplicate consecutive characters
       
  2935     word <- gsub("([A-Z])\\1+", "\\1", word)
       
  2936 
       
  2937     ## If longer than 6 characters, take first and last 3...and we have
       
  2938     ## to vectorize it
       
  2939     for(i in 1:length(word)) {
       
  2940         if((l = nchar(word[i])) > 6) {
       
  2941             first <- substr(word[i], 1, 3)
       
  2942             last <- substr(word[i], l - 2, l)
       
  2943             word[i] <- paste(first, last, sep = "");
       
  2944         }
       
  2945     }
       
  2946 
       
  2947     return(word)
       
  2948 }
       
  2949 
       
  2950 #' @rdname mra
       
  2951 #' @name mra_compare
       
  2952 #' @export
       
  2953 mra_compare <- function(x, y) {
       
  2954     mra <- data.frame(x = x, y = y, sim = 0, min = 100, stringsAsFactors = FALSE)
       
  2955 
       
  2956     ## Obtain the minimum rating value by calculating the length sum of
       
  2957     ## the encoded strings and using table A (from Wikipedia).  We start
       
  2958     ## by setting the minimum to be the sum and move from there.
       
  2959     mra$lensum <- nchar(mra$x) + nchar(mra$y)
       
  2960     mra$min[mra$lensum == 12] <- 2
       
  2961     mra$min[mra$lensum > 7 && mra$lensum <= 11] <- 3
       
  2962     mra$min[mra$lensum > 4 && mra$lensum <= 7] <- 4
       
  2963     mra$min[mra$lensum <= 4] <- 5
       
  2964 
       
  2965     ## If the length difference between the encoded strings is 3 or
       
  2966     ## greater, then no similarity comparison is done.  For us, we
       
  2967     ## continue the similarity comparison out of laziness and ensure the
       
  2968     ## minimum is impossibly high to meet.
       
  2969     mra$min[abs(nchar(mra$x) - nchar(mra$y)) >= 3] <- 100
       
  2970 
       
  2971     ## Start the comparison.
       
  2972     x <- strsplit(mra$x, split = "")
       
  2973     y <- strsplit(mra$y, split = "")
       
  2974     rows <- nrow(mra)
       
  2975     for(i in 1:rows) {
       
  2976         ## Process the encoded strings from left to right and remove any
       
  2977         ## identical characters found from both strings respectively.
       
  2978         j <- 1
       
  2979         while(j < min(length(x[[i]]), length(y[[i]]))) {
       
  2980             if(x[[i]][j] == y[[i]][j]) {
       
  2981                 x[[i]] <- x[[i]][-j]
       
  2982                 y[[i]] <- y[[i]][-j]
       
  2983             } else
       
  2984                 j <- j + 1
       
  2985         }
       
  2986 
       
  2987         ## Process the unmatched characters from right to left and
       
  2988         ## remove any identical characters found from both names
       
  2989         ## respectively.
       
  2990         x[[i]] <- rev(x[[i]])
       
  2991         y[[i]] <- rev(y[[i]])
       
  2992         j <- 1
       
  2993         while(j < min(length(x[[i]]), length(y[[i]]))) {
       
  2994             if(x[[i]][j] == y[[i]][j]) {
       
  2995                 x[[i]] <- x[[i]][-j]
       
  2996                 y[[i]] <- y[[i]][-j]
       
  2997             } else
       
  2998                 j <- j + 1
       
  2999         }
       
  3000         ## Subtract the number of unmatched characters from 6 in the
       
  3001         ## longer string. This is the similarity rating.
       
  3002         len <- min(length(x[[i]]), length(y[[i]]))
       
  3003         mra$sim[i] <- 6 - len
       
  3004     }
       
  3005 
       
  3006     ## If the similarity is greater than or equal to the minimum
       
  3007     ## required, it is a successful match.
       
  3008     mra$match <- (mra$sim >= mra$min)
       
  3009     return(mra$match)
       
  3010 }
       
  3011 
       
  3012 END>>
       
  3013 ! !
       
  3014 
       
  3015 !PhoneticStringUtilities::MRAStringComparator methodsFor:'api'!
       
  3016 
       
  3017 encode:wordIn 
       
  3018     "see https://en.wikipedia.org/wiki/Match_Rating_Approach"
       
  3019     
       
  3020     |word prev|
       
  3021 
       
  3022     word := wordIn.
       
  3023     
       
  3024     "/ First, remove any nonalphabetical characters and uppercase it
       
  3025 
       
  3026     word := word select:#isLetter thenCollect:#asUppercase.
       
  3027 
       
  3028     "/ Delete vowels not at the start of the word
       
  3029 
       
  3030     word := word first asString , ((word from:2) reject:#isVowel).
       
  3031 
       
  3032     "/ Remove duplicate consecutive characters
       
  3033 
       
  3034     prev := nil.
       
  3035     word := word 
       
  3036                 collect:[:char |
       
  3037                     char == prev ifTrue:[
       
  3038                         $*
       
  3039                     ] ifFalse:[
       
  3040                         prev := char.
       
  3041                         char.
       
  3042                     ].    
       
  3043                 ]
       
  3044                 thenSelect:[:char | char ~~ $*].
       
  3045 
       
  3046     "/ If longer than 6 characters, take first and last 3            
       
  3047     word size > 6 ifTrue:[
       
  3048         word := (word copyFirst:3),(word copyLast:3)
       
  3049     ].
       
  3050     ^ word.
       
  3051 
       
  3052     "
       
  3053      self new encode:'Catherine'            -> 'CTHRN'
       
  3054      self new encode:'CatherineCatherine'   -> 'CTHHRN'
       
  3055      self new encode:'Butter'               -> 'BTR'
       
  3056      self new encode:'Byrne'                -> 'BYRN'
       
  3057      self new encode:'Boern'                -> 'BRN'
       
  3058      self new encode:'Smith'                -> 'SMTH'
       
  3059      self new encode:'Smyth'                -> 'SMYTH'
       
  3060      self new encode:'Kathryn'              -> 'KTHRYN'
       
  3061     "
       
  3062 
       
  3063     "Created: / 28-07-2017 / 15:19:22 / cg"
       
  3064     "Modified (comment): / 31-07-2017 / 15:14:31 / cg"
       
  3065 ! !
       
  3066 
       
  3067 !PhoneticStringUtilities::MetaphoneStringComparator class methodsFor:'documentation'!
       
  3068 
       
  3069 documentation
       
  3070 "
       
  3071    Encodes a string into a Metaphone value.
       
  3072 
       
  3073    Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
       
  3074    Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
       
  3075 
       
  3076     Hanging on the Metaphone by Lawrence Philips in Computer Language of Dec. 1990, p 39.
       
  3077     Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
       
  3078     https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm6
       
  3079 
       
  3080   They have had undocumented changes from the originally published algorithm.
       
  3081   For more information, see https://issues.apache.org/jira/browse/CODEC-57
       
  3082 
       
  3083   Metaphone uses the following rules:
       
  3084 
       
  3085     Doubled letters except 'c' -> drop 2nd letter.
       
  3086     Vowels are only kept when they are the first letter.
       
  3087     B -> B unless at the end of a word after 'm' as in 'dumb'
       
  3088     C -> X (sh) if -cia- or -ch-
       
  3089     S if -ci-, -ce- or -cy-
       
  3090     K otherwise, including -sch-
       
  3091     D -> J if in -dge-, -dgy- or -dgi-; T otherwise
       
  3092     F -> F
       
  3093     G -> silent if in -gh- and not at end or before a vowel in -gn- or -gned- (also see dge etc. above)
       
  3094     J if before i or e or y if not double gg; K otherwise
       
  3095     H -> silent if after vowel and no vowel follows; H otherwise
       
  3096     J -> J
       
  3097     K -> silent if after 'c'; K otherwise
       
  3098     L -> L
       
  3099     M -> M
       
  3100     N -> N
       
  3101     P -> F if before 'h'; P otherwise
       
  3102     Q -> K
       
  3103     R -> R
       
  3104     S -> X (sh) if before 'h' or in -sio- or -sia-; S otherwise
       
  3105     T -> X (sh) if -tia- or -tio- 0 (th) if before 'h' silent if in -tch-; T otherwise
       
  3106     V -> F
       
  3107     W -> silent if not followed by a vowel W if followed by a vowel
       
  3108     X -> KS
       
  3109     Y -> silent if not followed by a vowel Y if followed by a vowel
       
  3110     Z -> S
       
  3111 
       
  3112     Initial Letter Exceptions
       
  3113 
       
  3114     Initial kn-, gn- pn, ae- or wr- -> drop first letter
       
  3115     Initial x- -> change to 's'
       
  3116     Initial wh- -> change to 'w'
       
  3117 
       
  3118 
       
  3119      self new encode:'a'
       
  3120      self new encode:'dumb'
       
  3121      self new encode:'MILLER'
       
  3122      self new encode:'schmidt'
       
  3123      self new encode:'schneider'
       
  3124      self new encode:'FISCHER'
       
  3125      self new encode:'HEDGY'
       
  3126      self new encode:'weber'
       
  3127      self new encode:'wagner'
       
  3128      self new encode:'van gogh'
       
  3129 "
       
  3130 !
       
  3131 
       
  3132 javaCode
       
  3133 "<<END
       
  3134 /*
       
  3135  * Licensed to the Apache Software Foundation (ASF) under one or more
       
  3136  * contributor license agreements.  See the NOTICE file distributed with
       
  3137  * this work for additional information regarding copyright ownership.
       
  3138  * The ASF licenses this file to You under the Apache License, Version 2.0
       
  3139  * (the "License"); you may not use this file except in compliance with
       
  3140  * the License.  You may obtain a copy of the License at
       
  3141  *
       
  3142  *      http://www.apache.org/licenses/LICENSE-2.0
       
  3143  *
       
  3144  * Unless required by applicable law or agreed to in writing, software
       
  3145  * distributed under the License is distributed on an "AS IS" BASIS,
       
  3146  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
       
  3147  * See the License for the specific language governing permissions and
       
  3148  * limitations under the License.
       
  3149  */
       
  3150 
       
  3151 package org.apache.commons.codec.language;
       
  3152 
       
  3153 import org.apache.commons.codec.EncoderException;
       
  3154 import org.apache.commons.codec.StringEncoder;
       
  3155 
       
  3156 /**
       
  3157  * Encodes a string into a Metaphone value.
       
  3158  * <p>
       
  3159  * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
       
  3160  * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
       
  3161  * <p>
       
  3162  * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
       
  3163  * p 39.</CITE>
       
  3164  * <p>
       
  3165  * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
       
  3166  * </p>
       
  3167  * <ul>
       
  3168  * <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
       
  3169  *  (broken link 4/30/2013) </li>
       
  3170  * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
       
  3171  *  (link checked 4/30/2013) </li>
       
  3172  * </ul>
       
  3173  * <p>
       
  3174  * They have had undocumented changes from the originally published algorithm.
       
  3175  * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
       
  3176  * <p>
       
  3177  * This class is conditionally thread-safe.
       
  3178  * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)}
       
  3179  * but is not volatile, and accesses are not synchronized.
       
  3180  * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
       
  3181  * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
       
  3182  * after initial setup.
       
  3183  *
       
  3184  * @version $Id$
       
  3185  */
       
  3186 public class Metaphone implements StringEncoder {
       
  3187 
       
  3188     /**
       
  3189      * Five values in the English language
       
  3190      */
       
  3191     private static final String VOWELS = "AEIOU";
       
  3192 
       
  3193     /**
       
  3194      * Variable used in Metaphone algorithm
       
  3195      */
       
  3196     private static final String FRONTV = "EIY";
       
  3197 
       
  3198     /**
       
  3199      * Variable used in Metaphone algorithm
       
  3200      */
       
  3201     private static final String VARSON = "CSPTG";
       
  3202 
       
  3203     /**
       
  3204      * The max code length for metaphone is 4
       
  3205      */
       
  3206     private int maxCodeLen = 4;
       
  3207 
       
  3208     /**
       
  3209      * Creates an instance of the Metaphone encoder
       
  3210      */
       
  3211     public Metaphone() {
       
  3212         super();
       
  3213     }
       
  3214 
       
  3215     /**
       
  3216      * Find the metaphone value of a String. This is similar to the
       
  3217      * soundex algorithm, but better at finding similar sounding words.
       
  3218      * All input is converted to upper case.
       
  3219      * Limitations: Input format is expected to be a single ASCII word
       
  3220      * with only characters in the A - Z range, no punctuation or numbers.
       
  3221      *
       
  3222      * @param txt String to find the metaphone code for
       
  3223      * @return A metaphone code corresponding to the String supplied
       
  3224      */
       
  3225     public String metaphone(final String txt) {
       
  3226         boolean hard = false;
       
  3227         int txtLength;
       
  3228         if (txt == null || (txtLength = txt.length()) == 0) {
       
  3229             return "";
       
  3230         }
       
  3231         // single character is itself
       
  3232         if (txtLength == 1) {
       
  3233             return txt.toUpperCase(java.util.Locale.ENGLISH);
       
  3234         }
       
  3235 
       
  3236         final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
       
  3237 
       
  3238         final StringBuilder local = new StringBuilder(40); // manipulate
       
  3239         final StringBuilder code = new StringBuilder(10); //   output
       
  3240         // handle initial 2 characters exceptions
       
  3241         switch(inwd[0]) {
       
  3242         case 'K':
       
  3243         case 'G':
       
  3244         case 'P': /* looking for KN, etc*/
       
  3245             if (inwd[1] == 'N') {
       
  3246                 local.append(inwd, 1, inwd.length - 1);
       
  3247             } else {
       
  3248                 local.append(inwd);
       
  3249             }
       
  3250             break;
       
  3251         case 'A': /* looking for AE */
       
  3252             if (inwd[1] == 'E') {
       
  3253                 local.append(inwd, 1, inwd.length - 1);
       
  3254             } else {
       
  3255                 local.append(inwd);
       
  3256             }
       
  3257             break;
       
  3258         case 'W': /* looking for WR or WH */
       
  3259             if (inwd[1] == 'R') {   // WR -> R
       
  3260                 local.append(inwd, 1, inwd.length - 1);
       
  3261                 break;
       
  3262             }
       
  3263             if (inwd[1] == 'H') {
       
  3264                 local.append(inwd, 1, inwd.length - 1);
       
  3265                 local.setCharAt(0, 'W'); // WH -> W
       
  3266             } else {
       
  3267                 local.append(inwd);
       
  3268             }
       
  3269             break;
       
  3270         case 'X': /* initial X becomes S */
       
  3271             inwd[0] = 'S';
       
  3272             local.append(inwd);
       
  3273             break;
       
  3274         default:
       
  3275             local.append(inwd);
       
  3276         } // now local has working string with initials fixed
       
  3277 
       
  3278         final int wdsz = local.length();
       
  3279         int n = 0;
       
  3280 
       
  3281         while (code.length() < this.getMaxCodeLen() &&
       
  3282                n < wdsz ) { // max code size of 4 works well
       
  3283             final char symb = local.charAt(n);
       
  3284             // remove duplicate letters except C
       
  3285             if (symb !!= 'C' && isPreviousChar( local, n, symb ) ) {
       
  3286                 n++;
       
  3287             } else { // not dup
       
  3288                 switch(symb) {
       
  3289                 case 'A':
       
  3290                 case 'E':
       
  3291                 case 'I':
       
  3292                 case 'O':
       
  3293                 case 'U':
       
  3294                     if (n == 0) {
       
  3295                         code.append(symb);
       
  3296                     }
       
  3297                     break; // only use vowel if leading char
       
  3298                 case 'B':
       
  3299                     if ( isPreviousChar(local, n, 'M') &&
       
  3300                          isLastChar(wdsz, n) ) { // B is silent if word ends in MB
       
  3301                         break;
       
  3302                     }
       
  3303                     code.append(symb);
       
  3304                     break;
       
  3305                 case 'C': // lots of C special cases
       
  3306                     /* discard if SCI, SCE or SCY */
       
  3307                     if ( isPreviousChar(local, n, 'S') &&
       
  3308                          !!isLastChar(wdsz, n) &&
       
  3309                          FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
       
  3310                         break;
       
  3311                     }
       
  3312                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
       
  3313                         code.append('X');
       
  3314                         break;
       
  3315                     }
       
  3316                     if (!!isLastChar(wdsz, n) &&
       
  3317                         FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
       
  3318                         code.append('S');
       
  3319                         break; // CI,CE,CY -> S
       
  3320                     }
       
  3321                     if (isPreviousChar(local, n, 'S') &&
       
  3322                         isNextChar(local, n, 'H') ) { // SCH->sk
       
  3323                         code.append('K');
       
  3324                         break;
       
  3325                     }
       
  3326                     if (isNextChar(local, n, 'H')) { // detect CH
       
  3327                         if (n == 0 &&
       
  3328                             wdsz >= 3 &&
       
  3329                             isVowel(local,2) ) { // CH consonant -> K consonant
       
  3330                             code.append('K');
       
  3331                         } else {
       
  3332                             code.append('X'); // CHvowel -> X
       
  3333                         }
       
  3334                     } else {
       
  3335                         code.append('K');
       
  3336                     }
       
  3337                     break;
       
  3338                 case 'D':
       
  3339                     if (!!isLastChar(wdsz, n + 1) &&
       
  3340                         isNextChar(local, n, 'G') &&
       
  3341                         FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
       
  3342                         code.append('J'); n += 2;
       
  3343                     } else {
       
  3344                         code.append('T');
       
  3345                     }
       
  3346                     break;
       
  3347                 case 'G': // GH silent at end or before consonant
       
  3348                     if (isLastChar(wdsz, n + 1) &&
       
  3349                         isNextChar(local, n, 'H')) {
       
  3350                         break;
       
  3351                     }
       
  3352                     if (!!isLastChar(wdsz, n + 1) &&
       
  3353                         isNextChar(local,n,'H') &&
       
  3354                         !!isVowel(local,n+2)) {
       
  3355                         break;
       
  3356                     }
       
  3357                     if (n > 0 &&
       
  3358                         ( regionMatch(local, n, "GN") ||
       
  3359                           regionMatch(local, n, "GNED") ) ) {
       
  3360                         break; // silent G
       
  3361                     }
       
  3362                     if (isPreviousChar(local, n, 'G')) {
       
  3363                         // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
       
  3364                         hard = true;
       
  3365                     } else {
       
  3366                         hard = false;
       
  3367                     }
       
  3368                     if (!!isLastChar(wdsz, n) &&
       
  3369                         FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
       
  3370                         !!hard) {
       
  3371                         code.append('J');
       
  3372                     } else {
       
  3373                         code.append('K');
       
  3374                     }
       
  3375                     break;
       
  3376                 case 'H':
       
  3377                     if (isLastChar(wdsz, n)) {
       
  3378                         break; // terminal H
       
  3379                     }
       
  3380                     if (n > 0 &&
       
  3381                         VARSON.indexOf(local.charAt(n - 1)) >= 0) {
       
  3382                         break;
       
  3383                     }
       
  3384                     if (isVowel(local,n+1)) {
       
  3385                         code.append('H'); // Hvowel
       
  3386                     }
       
  3387                     break;
       
  3388                 case 'F':
       
  3389                 case 'J':
       
  3390                 case 'L':
       
  3391                 case 'M':
       
  3392                 case 'N':
       
  3393                 case 'R':
       
  3394                     code.append(symb);
       
  3395                     break;
       
  3396                 case 'K':
       
  3397                     if (n > 0) { // not initial
       
  3398                         if (!!isPreviousChar(local, n, 'C')) {
       
  3399                             code.append(symb);
       
  3400                         }
       
  3401                     } else {
       
  3402                         code.append(symb); // initial K
       
  3403                     }
       
  3404                     break;
       
  3405                 case 'P':
       
  3406                     if (isNextChar(local,n,'H')) {
       
  3407                         // PH -> F
       
  3408                         code.append('F');
       
  3409                     } else {
       
  3410                         code.append(symb);
       
  3411                     }
       
  3412                     break;
       
  3413                 case 'Q':
       
  3414                     code.append('K');
       
  3415                     break;
       
  3416                 case 'S':
       
  3417                     if (regionMatch(local,n,"SH") ||
       
  3418                         regionMatch(local,n,"SIO") ||
       
  3419                         regionMatch(local,n,"SIA")) {
       
  3420                         code.append('X');
       
  3421                     } else {
       
  3422                         code.append('S');
       
  3423                     }
       
  3424                     break;
       
  3425                 case 'T':
       
  3426                     if (regionMatch(local,n,"TIA") ||
       
  3427                         regionMatch(local,n,"TIO")) {
       
  3428                         code.append('X');
       
  3429                         break;
       
  3430                     }
       
  3431                     if (regionMatch(local,n,"TCH")) {
       
  3432                         // Silent if in "TCH"
       
  3433                         break;
       
  3434                     }
       
  3435                     // substitute numeral 0 for TH (resembles theta after all)
       
  3436                     if (regionMatch(local,n,"TH")) {
       
  3437                         code.append('0');
       
  3438                     } else {
       
  3439                         code.append('T');
       
  3440                     }
       
  3441                     break;
       
  3442                 case 'V':
       
  3443                     code.append('F'); break;
       
  3444                 case 'W':
       
  3445                 case 'Y': // silent if not followed by vowel
       
  3446                     if (!!isLastChar(wdsz,n) &&
       
  3447                         isVowel(local,n+1)) {
       
  3448                         code.append(symb);
       
  3449                     }
       
  3450                     break;
       
  3451                 case 'X':
       
  3452                     code.append('K');
       
  3453                     code.append('S');
       
  3454                     break;
       
  3455                 case 'Z':
       
  3456                     code.append('S');
       
  3457                     break;
       
  3458                 default:
       
  3459                     // do nothing
       
  3460                     break;
       
  3461                 } // end switch
       
  3462                 n++;
       
  3463             } // end else from symb !!= 'C'
       
  3464             if (code.length() > this.getMaxCodeLen()) {
       
  3465                 code.setLength(this.getMaxCodeLen());
       
  3466             }
       
  3467         }
       
  3468         return code.toString();
       
  3469     }
       
  3470 
       
  3471     private boolean isVowel(final StringBuilder string, final int index) {
       
  3472         return VOWELS.indexOf(string.charAt(index)) >= 0;
       
  3473     }
       
  3474 
       
  3475     private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
       
  3476         boolean matches = false;
       
  3477         if( index > 0 &&
       
  3478             index < string.length() ) {
       
  3479             matches = string.charAt(index - 1) == c;
       
  3480         }
       
  3481         return matches;
       
  3482     }
       
  3483 
       
  3484     private boolean isNextChar(final StringBuilder string, final int index, final char c) {
       
  3485         boolean matches = false;
       
  3486         if( index >= 0 &&
       
  3487             index < string.length() - 1 ) {
       
  3488             matches = string.charAt(index + 1) == c;
       
  3489         }
       
  3490         return matches;
       
  3491     }
       
  3492 
       
  3493     private boolean regionMatch(final StringBuilder string, final int index, final String test) {
       
  3494         boolean matches = false;
       
  3495         if( index >= 0 &&
       
  3496             index + test.length() - 1 < string.length() ) {
       
  3497             final String substring = string.substring( index, index + test.length());
       
  3498             matches = substring.equals( test );
       
  3499         }
       
  3500         return matches;
       
  3501     }
       
  3502 
       
  3503     private boolean isLastChar(final int wdsz, final int n) {
       
  3504         return n + 1 == wdsz;
       
  3505     }
       
  3506 
       
  3507 
       
  3508     /**
       
  3509      * Encodes an Object using the metaphone algorithm.  This method
       
  3510      * is provided in order to satisfy the requirements of the
       
  3511      * Encoder interface, and will throw an EncoderException if the
       
  3512      * supplied object is not of type java.lang.String.
       
  3513      *
       
  3514      * @param obj Object to encode
       
  3515      * @return An object (or type java.lang.String) containing the
       
  3516      *         metaphone code which corresponds to the String supplied.
       
  3517      * @throws EncoderException if the parameter supplied is not
       
  3518      *                          of type java.lang.String
       
  3519      */
       
  3520     @Override
       
  3521     public Object encode(final Object obj) throws EncoderException {
       
  3522         if (!!(obj instanceof String)) {
       
  3523             throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
       
  3524         }
       
  3525         return metaphone((String) obj);
       
  3526     }
       
  3527 
       
  3528     /**
       
  3529      * Encodes a String using the Metaphone algorithm.
       
  3530      *
       
  3531      * @param str String object to encode
       
  3532      * @return The metaphone code corresponding to the String supplied
       
  3533      */
       
  3534     @Override
       
  3535     public String encode(final String str) {
       
  3536         return metaphone(str);
       
  3537     }
       
  3538 
       
  3539     /**
       
  3540      * Tests is the metaphones of two strings are identical.
       
  3541      *
       
  3542      * @param str1 First of two strings to compare
       
  3543      * @param str2 Second of two strings to compare
       
  3544      * @return <code>true</code> if the metaphones of these strings are identical,
       
  3545      *        <code>false</code> otherwise.
       
  3546      */
       
  3547     public boolean isMetaphoneEqual(final String str1, final String str2) {
       
  3548         return metaphone(str1).equals(metaphone(str2));
       
  3549     }
       
  3550 
       
  3551     /**
       
  3552      * Returns the maxCodeLen.
       
  3553      * @return int
       
  3554      */
       
  3555     public int getMaxCodeLen() { return this.maxCodeLen; }
       
  3556 
       
  3557     /**
       
  3558      * Sets the maxCodeLen.
       
  3559      * @param maxCodeLen The maxCodeLen to set
       
  3560      */
       
  3561     public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
       
  3562 
       
  3563 }
       
  3564 END>>"
       
  3565 ! !
       
  3566 
       
  3567 !PhoneticStringUtilities::MetaphoneStringComparator methodsFor:'api'!
       
  3568 
       
  3569 encode:txt
       
  3570     "
       
  3571      self new encode:'a'
       
  3572      self new encode:'MILLER'
       
  3573      self new encode:'schmidt'
       
  3574      self new encode:'schneider'
       
  3575      self new encode:'FISCHER'
       
  3576      self new encode:'HEDGY'
       
  3577      self new encode:'weber'
       
  3578      self new encode:'wagner'
       
  3579      self new encode:'van gogh'
       
  3580      self new encode:'dumb'
       
  3581     "
       
  3582     
       
  3583     |hard txtLength local code inwd ch ch2 wdsz n maxCodeLen|
       
  3584 
       
  3585     inwd := txt.
       
  3586     hard := false.
       
  3587     txtLength := 0.
       
  3588     maxCodeLen := self maxCodeLen.
       
  3589     
       
  3590     (txtLength := txt size) == 0 ifTrue:[^ ''].
       
  3591 
       
  3592     inwd := txt asUppercase.
       
  3593     "/ single character is itself
       
  3594     (txtLength == 1) ifTrue:[
       
  3595         ^ inwd        
       
  3596     ].
       
  3597 
       
  3598     code := '' writeStream.
       
  3599     local := inwd.
       
  3600     
       
  3601     "/ handle initial 2 characters exceptions
       
  3602     ch := inwd at:(0+1).
       
  3603     ch2 := inwd at:(1+1).
       
  3604     ('KGP' includes:ch) ifTrue:[  
       
  3605         "/ looking for KN, etc
       
  3606         "/ KNx -> Nx 
       
  3607         "/ GNx -> Nx 
       
  3608         "/ PNx -> Nx 
       
  3609         (ch2 == $N) ifTrue:[
       
  3610             local := (inwd from:1+1)
       
  3611         ].
       
  3612     ] ifFalse:[
       
  3613     ('A' includes:ch) ifTrue:[  
       
  3614         "/ looking for AE
       
  3615         "/ AEx -> Ex 
       
  3616         (ch2 == $E) ifTrue:[
       
  3617             local := (inwd from:1+1)
       
  3618         ].
       
  3619     ] ifFalse:[
       
  3620     ('W' includes:ch) ifTrue:[  
       
  3621         "/ looking for WR or WH 
       
  3622         (ch2 == $R) ifTrue:[
       
  3623             "/ WRx -> Wx 
       
  3624             local := (inwd from:1+1)
       
  3625         ] ifFalse:[
       
  3626             (ch2 == $H) ifTrue:[
       
  3627                 "/ // WH -> W 
       
  3628                 local := 'W',(inwd from:2+1).
       
  3629             ]
       
  3630         ]
       
  3631     ] ifFalse:[
       
  3632     ('X' includes:ch) ifTrue:[  
       
  3633         "/ initial X becomes S */
       
  3634         "/ Xx -> Sx 
       
  3635         local := 'S',(inwd from:1+1).
       
  3636     ]]]].
       
  3637     
       
  3638     "/ now local has working string with initials fixed
       
  3639     
       
  3640     wdsz := local size.
       
  3641     n := 1.
       
  3642 
       
  3643     [ (code size < maxCodeLen) and:[ n <= wdsz ] ] whileTrue:[
       
  3644         "/ max code size of 4 works well
       
  3645 
       
  3646         |symb prevChar nextChar nextNextChar isLastChar isPrevToLastChar|
       
  3647 
       
  3648         symb := local at:n.
       
  3649         (n > 1) ifTrue:[ prevChar := local at:(n-1) ]. 
       
  3650         (isLastChar := (n == wdsz)) ifFalse:[
       
  3651             nextChar := local at:(n+1) 
       
  3652         ].    
       
  3653         isPrevToLastChar := (n == (wdsz-1)).
       
  3654         (n+2) <= wdsz ifTrue:[
       
  3655             nextNextChar := local at:(n+2)
       
  3656         ].
       
  3657         
       
  3658         "/ remove duplicate letters except C
       
  3659         (symb ~~ $C and:[ nextChar == symb ]) ifFalse:[
       
  3660             "/ not dup
       
  3661             ('AEIOU' includes:symb) ifTrue:[
       
  3662                 "/ only use vowel if leading char
       
  3663                 (n == 1) ifTrue:[
       
  3664                     code nextPut:symb
       
  3665                 ]
       
  3666             ] ifFalse:[
       
  3667             ('B' includes:symb) ifTrue:[
       
  3668                 "/    if ( isPreviousChar(local, n, 'M') &&
       
  3669                 "/         isLastChar(wdsz, n) ) { // B is silent if word ends in MB
       
  3670                 "/        break;
       
  3671                 "/    }
       
  3672                 "/    code.append(symb);
       
  3673                 "/    break;
       
  3674                 ((prevChar == $M) and:[isLastChar]) ifTrue:[
       
  3675                     "/ B is silent if word ends in MB 
       
  3676                 ] ifFalse:[
       
  3677                     code nextPut:symb.
       
  3678                 ].    
       
  3679             ] ifFalse:[
       
  3680             ('C' includes:symb) ifTrue:[
       
  3681                 "/ lots of C special cases    
       
  3682                 "/    /* discard if SCI, SCE or SCY */
       
  3683                 "/    if ( isPreviousChar(local, n, 'S') &&
       
  3684                 "/         !!isLastChar(wdsz, n) &&
       
  3685                 "/         FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
       
  3686                 "/        break;
       
  3687                 "/    }
       
  3688                 "/    if (regionMatch(local, n, "CIA")) { // "CIA" -> X
       
  3689                 "/        code.append('X');
       
  3690                 "/        break;
       
  3691                 "/    }
       
  3692                 "/    if (!!isLastChar(wdsz, n) &&
       
  3693                 "/        FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
       
  3694                 "/        code.append('S');
       
  3695                 "/        break; // CI,CE,CY -> S
       
  3696                 "/    }
       
  3697                 "/    if (isPreviousChar(local, n, 'S') &&
       
  3698                 "/        isNextChar(local, n, 'H') ) { // SCH->sk
       
  3699                 "/        code.append('K');
       
  3700                 "/        break;
       
  3701                 "/    }
       
  3702                 "/    if (isNextChar(local, n, 'H')) { // detect CH
       
  3703                 "/        if (n == 0 &&
       
  3704                 "/            wdsz >= 3 &&
       
  3705                 "/            isVowel(local,2) ) { // CH consonant -> K consonant
       
  3706                 "/            code.append('K');
       
  3707                 "/        } else {
       
  3708                 "/            code.append('X'); // CHvowel -> X
       
  3709                 "/        }
       
  3710                 "/    } else {
       
  3711                 "/        code.append('K');
       
  3712                 "/    }
       
  3713                 "/    break;
       
  3714                 (prevChar == $S and:[ 'EIY' includes:nextChar ]) ifTrue:[
       
  3715                     "/ discard if SCI, SCE or SCY
       
  3716                 ] ifFalse:[
       
  3717                     ((nextChar == $I) and:[ nextNextChar == $A ]) ifTrue:[
       
  3718                         "/  "CIA" -> X 
       
  3719                         code nextPut:$X
       
  3720                     ] ifFalse:[
       
  3721                         ('IEY' includes:nextChar) ifTrue:[
       
  3722                             "/ CI,CE,CY -> S
       
  3723                             code nextPut:$S
       
  3724                         ] ifFalse:[ 
       
  3725                            ((prevChar == $S) and:[ nextChar == $H ]) ifTrue:[
       
  3726                                "/ SCH->sk
       
  3727                                 code nextPut:$K
       
  3728                             ] ifFalse:[ 
       
  3729                                 nextChar == $H ifTrue:[
       
  3730                                     "/ CH
       
  3731                                     ('AEIOU' includes:nextNextChar) ifTrue:[
       
  3732                                         code nextPut:$K "/ CH consonant -> K consonant 
       
  3733                                     ] ifFalse:[    
       
  3734                                         code nextPut:$X "/ CHvowel -> X
       
  3735                                     ]    
       
  3736                                 ] ifFalse:[
       
  3737                                     code nextPut:$K
       
  3738                                 ].    
       
  3739                             ]
       
  3740                         ]
       
  3741                     ]
       
  3742                 ].    
       
  3743                 
       
  3744             ] ifFalse:[
       
  3745             ('D' includes:symb) ifTrue:[
       
  3746                 "/    if (!!isLastChar(wdsz, n + 1) &&
       
  3747                 "/        isNextChar(local, n, 'G') &&
       
  3748                 "/        FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
       
  3749                 "/        code.append('J'); n += 2;
       
  3750                 "/    } else {
       
  3751                 "/        code.append('T');
       
  3752                 "/    }
       
  3753                 "/    break;
       
  3754                 ((nextChar == $G)
       
  3755                 and:[ (local from:n) startsWithAnyOf:#('DGE' 'DGI' 'DGY') ])
       
  3756                 ifTrue:[
       
  3757                     code nextPut:$J.
       
  3758                     n := n + 2.
       
  3759                 ] ifFalse:[    
       
  3760                     code nextPut:$T.
       
  3761                 ].    
       
  3762             ] ifFalse:[
       
  3763             ('G' includes:symb) ifTrue:[
       
  3764                 "/    GH silent at end or before consonant
       
  3765                 "/    if (isLastChar(wdsz, n + 1) &&
       
  3766                 "/        isNextChar(local, n, 'H')) {
       
  3767                 "/        break;
       
  3768                 "/    }
       
  3769                 "/    if (!!isLastChar(wdsz, n + 1) &&
       
  3770                 "/        isNextChar(local,n,'H') &&
       
  3771                 "/        !!isVowel(local,n+2)) {
       
  3772                 "/        break;
       
  3773                 "/    }
       
  3774                 "/    if (n > 0 &&
       
  3775                 "/        ( regionMatch(local, n, "GN") ||
       
  3776                 "/          regionMatch(local, n, "GNED") ) ) {
       
  3777                 "/        break; // silent G
       
  3778                 "/    }
       
  3779                 "/    if (isPreviousChar(local, n, 'G')) {
       
  3780                 "/        // NOTE: Given that duplicated chars are removed, I dont see how this can ever be true
       
  3781                 "/        hard = true;
       
  3782                 "/    } else {
       
  3783                 "/        hard = false;
       
  3784                 "/    }
       
  3785                 "/    if (!!isLastChar(wdsz, n) &&
       
  3786                 "/        FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
       
  3787                 "/        !!hard) {
       
  3788                 "/        code.append('J');
       
  3789                 "/    } else {
       
  3790                 "/        code.append('K');
       
  3791                 "/    }
       
  3792                 "/    break;
       
  3793                 (isPrevToLastChar and:[ nextChar == $H ]) ifTrue:[
       
  3794                     "/ GH silent at end
       
  3795                 ] ifFalse:[
       
  3796                     (isPrevToLastChar not and:[ nextChar == $H 
       
  3797                       and:[ ('AEIOU' includes:nextNextChar) not ]]) ifTrue:[
       
  3798                         "/ GH silent before consonant
       
  3799                     ] ifFalse:[
       
  3800                         (n > 1 and:[ nextChar == $N ]) ifTrue:[
       
  3801                             "/ GN -> silent G
       
  3802                         ] ifFalse:[
       
  3803                             hard := (prevChar == $G).
       
  3804                             (isLastChar not and:[ hard not and:[ ('EIY' includes:nextChar) ]]) ifTrue:[
       
  3805                                 code nextPut:$J
       
  3806                             ] ifFalse:[
       
  3807                                 code nextPut:$K
       
  3808                             ].    
       
  3809                         ].    
       
  3810                     ].    
       
  3811                 ].    
       
  3812             ] ifFalse:[
       
  3813             ('H' includes:symb) ifTrue:[
       
  3814                 "/    case 'H':
       
  3815                 "/        if (isLastChar(wdsz, n)) {
       
  3816                 "/            break; // terminal H
       
  3817                 "/        }
       
  3818                 "/        if (n > 0 &&
       
  3819                 "/            VARSON.indexOf(local.charAt(n - 1)) >= 0) {
       
  3820                 "/            break;
       
  3821                 "/        }
       
  3822                 "/        if (isVowel(local,n+1)) {
       
  3823                 "/            code.append('H'); // Hvowel
       
  3824                 "/        }
       
  3825                 "/        break;
       
  3826                 isLastChar ifTrue:[
       
  3827                     "/ ignore terminal H
       
  3828                 ] ifFalse:[
       
  3829                     ('CSPTG' includes:prevChar) ifTrue:[
       
  3830                         "/ ignore CH, SH, PH, TH, GH (H treated there)
       
  3831                     ] ifFalse:[
       
  3832                         ('AEIOU' includes:nextChar) ifTrue:[
       
  3833                             "/ Hvowel
       
  3834                             code nextPut:$H
       
  3835                         ].    
       
  3836                     ].    
       
  3837                 ].    
       
  3838             ] ifFalse:[
       
  3839             ('FJLMNR' includes:symb) ifTrue:[
       
  3840                 "/    case 'F':
       
  3841                 "/    case 'J':
       
  3842                 "/    case 'L':
       
  3843                 "/    case 'M':
       
  3844                 "/    case 'N':
       
  3845                 "/    case 'R':
       
  3846                 "/        code.append(symb);
       
  3847                 "/        break;
       
  3848                 code nextPut:symb.
       
  3849             ] ifFalse:[
       
  3850             ('K' includes:symb) ifTrue:[
       
  3851                 "/    case 'K':
       
  3852                 "/        if (n > 0) { // not initial
       
  3853                 "/            if (!!isPreviousChar(local, n, 'C')) {
       
  3854                 "/                code.append(symb);
       
  3855                 "/            }
       
  3856                 "/        } else {
       
  3857                 "/            code.append(symb); // initial K
       
  3858                 "/        }
       
  3859                 "/        break;
       
  3860                 n > 1 ifTrue:[
       
  3861                     "/ not initial
       
  3862                     prevChar ~~ $C ifTrue:[
       
  3863                         code nextPut:$K. "/ initial K
       
  3864                     ].    
       
  3865                 ] ifFalse:[
       
  3866                     code nextPut:$K. "/ initial K
       
  3867                 ].
       
  3868             ] ifFalse:[
       
  3869             ('P' includes:symb) ifTrue:[
       
  3870                 "/    case 'P':
       
  3871                 "/        if (isNextChar(local,n,'H')) {
       
  3872                 "/            // PH -> F
       
  3873                 "/            code.append('F');
       
  3874                 "/        } else {
       
  3875                 "/            code.append(symb);
       
  3876                 "/        }
       
  3877                 "/        break;
       
  3878                 nextChar == $H ifTrue:[
       
  3879                     "/ PH -> F
       
  3880                     code nextPut:$F.
       
  3881                 ] ifFalse:[
       
  3882                     code nextPut:symb.
       
  3883                 ].    
       
  3884             ] ifFalse:[
       
  3885             ('Q' includes:symb) ifTrue:[
       
  3886                 "/    case 'Q':
       
  3887                 "/        code.append('K');
       
  3888                 "/        break;
       
  3889                 code nextPut:$K
       
  3890 
       
  3891             ] ifFalse:[
       
  3892             ('S' includes:symb) ifTrue:[
       
  3893 "/                case 'S':
       
  3894 "/                    if (regionMatch(local,n,"SH") ||
       
  3895 "/                        regionMatch(local,n,"SIO") ||
       
  3896 "/                        regionMatch(local,n,"SIA")) {
       
  3897 "/                        code.append('X');
       
  3898 "/                    } else {
       
  3899 "/                        code.append('S');
       
  3900 "/                    }
       
  3901 "/                    break;
       
  3902                 "/ SH -> X  (as in shave or ashton)
       
  3903                 "/ SIO -> X 
       
  3904                 "/ SIA -> X (as in ASIA)
       
  3905                 ((nextChar == $H) 
       
  3906                 or:[
       
  3907                     ((nextChar == $I) 
       
  3908                       and:[
       
  3909                         (((local from:n) startsWith:'SIO')
       
  3910                           or:[ ((local from:n) startsWith:'SIA') ]) 
       
  3911                       ]
       
  3912                     )
       
  3913                 ]) ifTrue:[
       
  3914                    code nextPut:$X
       
  3915                 ] ifFalse:[
       
  3916                    code nextPut:$S
       
  3917                 ]
       
  3918             ] ifFalse:[
       
  3919             ('T' includes:symb) ifTrue:[
       
  3920 "/                case 'T':
       
  3921 "/                    if (regionMatch(local,n,"TIA") ||
       
  3922 "/                        regionMatch(local,n,"TIO")) {
       
  3923 "/                        code.append('X');
       
  3924 "/                        break;
       
  3925 "/                    }
       
  3926 "/                    if (regionMatch(local,n,"TCH")) {
       
  3927 "/                        // Silent if in "TCH"
       
  3928 "/                        break;
       
  3929 "/                    }
       
  3930 "/                    // substitute numeral 0 for TH (resembles theta after all)
       
  3931 "/                    if (regionMatch(local,n,"TH")) {
       
  3932 "/                        code.append('0');
       
  3933 "/                    } else {
       
  3934 "/                        code.append('T');
       
  3935 "/                    }
       
  3936 "/                    break;
       
  3937                 self halt.
       
  3938             ] ifFalse:[
       
  3939             ('V' includes:symb) ifTrue:[
       
  3940                 "/    case 'V':
       
  3941                 "/        code.append('F'); break;
       
  3942                 code nextPut:$F
       
  3943 
       
  3944             ] ifFalse:[
       
  3945             ('WY' includes:symb) ifTrue:[
       
  3946                 "/    case 'W':
       
  3947                 "/    case 'Y': // silent if not followed by vowel
       
  3948                 "/        if (!!isLastChar(wdsz,n) &&
       
  3949                 "/            isVowel(local,n+1)) {
       
  3950                 "/            code.append(symb);
       
  3951                 "/        }
       
  3952                 "/        break;
       
  3953 
       
  3954                 "/ silent if not followed by vowel 
       
  3955                 (isLastChar not and:[ 'AEIOU' includes:nextChar ]) ifTrue:[
       
  3956                     code nextPut:symb
       
  3957                 ].    
       
  3958             ] ifFalse:[
       
  3959             ('X' includes:symb) ifTrue:[
       
  3960                 "/    case 'X':
       
  3961                 "/        code.append('K');
       
  3962                 "/        code.append('S');
       
  3963                 "/        break;
       
  3964                 code nextPutAll:'KS'
       
  3965             ] ifFalse:[
       
  3966             ('Z' includes:symb) ifTrue:[
       
  3967                 "/    case 'Z':
       
  3968                 "/        code.append('S');
       
  3969                 "/        break;
       
  3970                 code nextPut:$S
       
  3971             ] ifFalse:[
       
  3972 "/                default:
       
  3973 "/                    // do nothing
       
  3974 "/                    break;
       
  3975             ]]]]]]]]]]]]]]]]. "/ end switch
       
  3976         ]. "/ end else from symb !!= 'C'
       
  3977         n := n + 1.
       
  3978         (code size > maxCodeLen) ifTrue:[
       
  3979             code := code truncateTo:maxCodeLen
       
  3980         ]
       
  3981     ].
       
  3982     ^ code contents
       
  3983 
       
  3984     "Created: / 02-08-2017 / 09:51:31 / cg"
       
  3985     "Modified: / 02-08-2017 / 12:00:38 / cg"
       
  3986 !
       
  3987 
       
  3988 maxCodeLen
       
  3989     ^ 4
       
  3990 
       
  3991     "Created: / 02-08-2017 / 09:51:59 / cg"
       
  3992 ! !
       
  3993 
       
  3994 !PhoneticStringUtilities::SoundexStringComparator class methodsFor:'documentation'!
       
  3995 
       
  3996 documentation
       
  3997 "
       
  3998     WARNING: this is the so called 'simplified soundex' algorithm;
       
  3999       there are more variants like miracode (american soundex) or
       
  4000       mysqlSoundex around.
       
  4001       
       
  4002       Be sure to use the correct algorithm, if the generated strings must be compatible
       
  4003       (otherwise, the differences are probably too small to be noticed as effect, but
       
  4004       your search will be different)
       
  4005 
       
  4006     The following was copied from http://www.civilsolutions.com.au/publications/dedup.htm
       
  4007 
       
  4008     SOUNDEX is a phonetic coding algorithm that ignores many of the unreliable
       
  4009     components of names, but by doing so reports more matches. 
       
  4010 
       
  4011     There are some variations around in the literature; 
       
  4012     the following is called 'simplified soundex', and the rules for coding a name are:
       
  4013 
       
  4014     1. The first letter of the name is used in its un-coded form to serve as the prefix
       
  4015        character of the code. (The rest of the code is numerical).
       
  4016 
       
  4017     2. Thereafter, W and H are ignored entirely.
       
  4018 
       
  4019     3. A, E, I, 0, U, Y are not assigned a code number, but do serve as 'separators' (see Step 5).
       
  4020 
       
  4021     4. Other letters of the name are converted to a numerical equivalent:
       
  4022                  B, P, F, V              1 
       
  4023                  C, G, J, K, Q, S, X, Z  2 
       
  4024                  D, T                    3 
       
  4025                  L                       4 
       
  4026                  M, N                    5 
       
  4027                  R                       6 
       
  4028 
       
  4029     5. There are two exceptions: 
       
  4030         1. Letters that follow prefix letters which would, if coded, have the same
       
  4031            numerical code, are ignored in all cases unless a ''separator'' (see Step 3) precedes them.
       
  4032 
       
  4033         2. The second letter of any pair of consonants having the same code number is likewise ignored, 
       
  4034            i.e. unless there is a ''separator'' between them in the name.
       
  4035 
       
  4036     6. The final SOUNDEX code consists of the prefix letter plus three numerical characters.
       
  4037        Longer codes are truncated to this length, and shorter codes are extended to it by adding zeros.
       
  4038 
       
  4039     Notice, that in another variant, w and h are treated slightly differently.
       
  4040     This is only of relevance, if you need to reconstruct original soundex codes of other programs
       
  4041     or for the original 1880 us census data.
       
  4042      SoundexStringComparator  new encode:'Ashcraft' -> 'A226'
       
  4043     vs.
       
  4044      MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
       
  4045     
       
  4046     Also notice, that soundex deals better with english. 
       
  4047     For german and other languages, other algorithms may provide better results.
       
  4048 "
       
  4049 ! !
       
  4050 
       
  4051 !PhoneticStringUtilities::SoundexStringComparator methodsFor:'api'!
       
  4052 
       
  4053 encode:word 
       
  4054     |u p t prevCode|
       
  4055 
       
  4056     u := word asUppercase.
       
  4057     p := u first asString.
       
  4058     prevCode := self translate:u first.
       
  4059     u from:2 to:u size do:[:c | 
       
  4060         t := self translate:c.
       
  4061         (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
       
  4062             p := p , t.
       
  4063             p size == 4 ifTrue:[^ p ].
       
  4064         ].
       
  4065         prevCode := t
       
  4066     ].
       
  4067     [ p size < 4 ] whileTrue:[
       
  4068         p := p , '0'
       
  4069     ].
       
  4070     ^ (p copyFrom:1 to:4)
       
  4071 
       
  4072     "
       
  4073      self new encode:'washington' -> 'W252'
       
  4074      self new encode:'lee'        -> 'L000'
       
  4075      self new encode:'Gutierrez'  -> 'G362'
       
  4076      self new encode:'Pfister'    -> 'P236'
       
  4077      self new encode:'Jackson'    -> 'J250'
       
  4078      self new encode:'Tymczak'    -> 'T522'
       
  4079     "
       
  4080     
       
  4081     "notice:
       
  4082      MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
       
  4083      self new encode:'Ashcraft'   -> 'A226'
       
  4084     "
       
  4085 
       
  4086     "Created: / 28-07-2017 / 15:21:23 / cg"
       
  4087     "Modified (comment): / 01-08-2017 / 19:01:43 / cg"
       
  4088 ! !
       
  4089 
       
  4090 !PhoneticStringUtilities::SoundexStringComparator methodsFor:'private'!
       
  4091 
       
  4092 translate:aCharacter
       
  4093     "use simple if's for more speed when compiled"
       
  4094 
       
  4095     "vowels serve as separators"
       
  4096     aCharacter == $A ifTrue:[^ '0' ].         
       
  4097     aCharacter == $E ifTrue:[^ '0' ].
       
  4098     aCharacter == $I ifTrue:[^ '0' ].
       
  4099     aCharacter == $O ifTrue:[^ '0' ].
       
  4100     aCharacter == $U ifTrue:[^ '0' ].
       
  4101     aCharacter == $Y ifTrue:[^ '0' ].
       
  4102 
       
  4103     aCharacter == $B ifTrue:[^ '1' ]. 
       
  4104     aCharacter == $P ifTrue:[^ '1' ]. 
       
  4105     aCharacter == $F ifTrue:[^ '1' ]. 
       
  4106     aCharacter == $V ifTrue:[^ '1' ]. 
       
  4107 
       
  4108     aCharacter == $C ifTrue:[^ '2' ]. 
       
  4109     aCharacter == $S ifTrue:[^ '2' ]. 
       
  4110     aCharacter == $K ifTrue:[^ '2' ]. 
       
  4111     aCharacter == $G ifTrue:[^ '2' ]. 
       
  4112     aCharacter == $J ifTrue:[^ '2' ]. 
       
  4113     aCharacter == $Q ifTrue:[^ '2' ]. 
       
  4114     aCharacter == $X ifTrue:[^ '2' ]. 
       
  4115     aCharacter == $Z ifTrue:[^ '2' ]. 
       
  4116 
       
  4117     aCharacter == $D ifTrue:[^ '3' ]. 
       
  4118     aCharacter == $T ifTrue:[^ '3' ]. 
       
  4119 
       
  4120     aCharacter == $L ifTrue:[^ '4' ]. 
       
  4121 
       
  4122     aCharacter == $M ifTrue:[^ '5' ]. 
       
  4123     aCharacter == $N ifTrue:[^ '5' ]. 
       
  4124 
       
  4125     aCharacter == $R ifTrue:[^ '6' ]. 
       
  4126     ^ nil
       
  4127 
       
  4128     "Modified: / 02-08-2017 / 01:35:40 / cg"
       
  4129     "Modified (comment): / 02-08-2017 / 14:30:11 / cg"
       
  4130 ! !
       
  4131 
       
  4132 !PhoneticStringUtilities::MySQLSoundexStringComparator class methodsFor:'documentation'!
       
  4133 
       
  4134 documentation
       
  4135 "
       
  4136     MySQL soundex is like american Soundex (i.e. miracode) without the 4 character limitation,
       
  4137     and also removing vokals first, then removing duplicate codes
       
  4138     (whereas the soundex code does this in reverse order).
       
  4139 
       
  4140     These variations are important, if you need the miracode soundex codes to be generated.
       
  4141 "
       
  4142 ! !
       
  4143 
       
  4144 !PhoneticStringUtilities::MySQLSoundexStringComparator methodsFor:'api'!
       
  4145 
       
  4146 encode:word 
       
  4147     "same as inherited, but cares for 0, W and H"
       
  4148 
       
  4149     |u p t prevCode|
       
  4150 
       
  4151     u := word asUppercase.
       
  4152     p := u first asString.
       
  4153     prevCode := self translate:u first.
       
  4154     u from:2 to:u size do:[:c |
       
  4155         t := self translate:c.
       
  4156         (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
       
  4157             p := p , t.
       
  4158         ].
       
  4159         (t ~= '0' and:[ c ~= $W and:[c ~= $H]]) ifTrue:[
       
  4160             prevCode := t.
       
  4161         ].
       
  4162     ].
       
  4163     [ p size < 4 ] whileTrue:[
       
  4164         p := p , '0'
       
  4165     ].
       
  4166     ^ p
       
  4167 
       
  4168     "Created: / 28-07-2017 / 15:23:41 / cg"
       
  4169     "Modified: / 31-07-2017 / 17:53:51 / cg"
       
  4170     "Modified (comment): / 02-08-2017 / 14:31:15 / cg"
       
  4171 ! !
       
  4172 
       
  4173 !PhoneticStringUtilities::NYSIISStringComparator class methodsFor:'documentation'!
       
  4174 
       
  4175 documentation
       
  4176 "
       
  4177     NYSIIS Algorithm:
       
  4178 
       
  4179     1.
       
  4180         remove all ''S'' and ''Z'' chars from the end of the surname 
       
  4181 
       
  4182     2.
       
  4183         transcode initial strings
       
  4184             MAC => MC
       
  4185             PF => F
       
  4186 
       
  4187     3.
       
  4188         Transcode trailing strings as follows,
       
  4189         
       
  4190             IX => IC
       
  4191             EX => EC
       
  4192             YE,EE,IE => Y
       
  4193             NT,ND => D 
       
  4194 
       
  4195     4.
       
  4196         transcode ''EV'' to ''EF'' if not at start of name
       
  4197 
       
  4198     5.
       
  4199         use first character of name as first character of key 
       
  4200 
       
  4201     6.
       
  4202         remove any ''W'' that follows a vowel 
       
  4203 
       
  4204     7.
       
  4205         replace all vowels with ''A'' 
       
  4206 
       
  4207     8.
       
  4208         transcode ''GHT'' to ''GT'' 
       
  4209 
       
  4210     9.
       
  4211         transcode ''DG'' to ''G'' 
       
  4212 
       
  4213     10.
       
  4214         transcode ''PH'' to ''F'' 
       
  4215 
       
  4216     11.
       
  4217         if not first character, eliminate all ''H'' preceded or followed by a vowel 
       
  4218 
       
  4219     12.
       
  4220         change ''KN'' to ''N'', else ''K'' to ''C'' 
       
  4221 
       
  4222     13.
       
  4223         if not first character, change ''M'' to ''N'' 
       
  4224 
       
  4225     14.
       
  4226         if not first character, change ''Q'' to ''G'' 
       
  4227 
       
  4228     15.
       
  4229         transcode ''SH'' to ''S'' 
       
  4230 
       
  4231     16.
       
  4232         transcode ''SCH'' to ''S'' 
       
  4233 
       
  4234     17.
       
  4235         transcode ''YW'' to ''Y'' 
       
  4236 
       
  4237     18.
       
  4238         if not first or last character, change ''Y'' to ''A'' 
       
  4239 
       
  4240     19.
       
  4241         transcode ''WR'' to ''R'' 
       
  4242 
       
  4243     20.
       
  4244         if not first character, change ''Z'' to ''S'' 
       
  4245 
       
  4246     21.
       
  4247         transcode terminal ''AY'' to ''Y'' 
       
  4248 
       
  4249     22.
       
  4250         remove traling vowels 
       
  4251 
       
  4252     23.
       
  4253         collapse all strings of repeated characters 
       
  4254 
       
  4255     24.
       
  4256         if first char of original surname was a vowel, append it to the code
       
  4257 "
       
  4258 ! !
       
  4259 
       
  4260 !PhoneticStringUtilities::NYSIISStringComparator methodsFor:'api'!
       
  4261 
       
  4262 encode:aString 
       
  4263     |k|
       
  4264 
       
  4265     k := self rule1:(aString asUppercase).
       
  4266     "2. Transcode initial strings:  MAC => MC   PF => F"
       
  4267     k := self rule2:k.
       
  4268     k := self rule3:k.
       
  4269     k := self rule4:k.
       
  4270     k := self rule5:k.
       
  4271     k := self rule6:k.
       
  4272     k := self rule7:k.
       
  4273     k := self rule8:k.
       
  4274     k := self rule9:k.
       
  4275     k := self rule10:k.
       
  4276     k := self rule11:k.
       
  4277     k := self rule12:k.
       
  4278     k := self rule13:k.
       
  4279     k := self rule14:k.
       
  4280     k := self rule15:k.
       
  4281     k := self rule16:k.
       
  4282     k := self rule17:k.
       
  4283     k := self rule18:k.
       
  4284     k := self rule19:k.
       
  4285     k := self rule20:k.
       
  4286     k := self rule21:k.
       
  4287     k := self rule22:k.
       
  4288     k := self rule23:k.
       
  4289     k := self rule24:k originalKey:aString.
       
  4290     ^ k
       
  4291 
       
  4292     "
       
  4293      self new encode:'hello'
       
  4294      self new encode:'bliss'
       
  4295     "
       
  4296     "
       
  4297      self new phoneticStringsFor:'hello'
       
  4298      self new phoneticStringsFor:'bliss'
       
  4299     "
       
  4300 
       
  4301     "Created: / 28-07-2017 / 15:34:52 / cg"
       
  4302     "Modified (comment): / 02-08-2017 / 14:31:47 / cg"
       
  4303 ! !
       
  4304 
       
  4305 !PhoneticStringUtilities::NYSIISStringComparator methodsFor:'private'!
       
  4306 
       
  4307 rule10:key 
       
  4308     "10. transcode 'PH' to 'F' "
       
  4309     
       
  4310     ^ self transcodeAll:'PH' of:key to:'F' startingAt:1
       
  4311 
       
  4312     "Modified (format): / 02-08-2017 / 14:34:27 / cg"
       
  4313 !
       
  4314 
       
  4315 rule11:key 
       
  4316     |k c|
       
  4317 
       
  4318     "11. if not first character, eliminate all 'H' preceded or followed by a vowel "
       
  4319     k := key copy.
       
  4320     c := SortedCollection sortBlock:[:a :b | b < a ].
       
  4321     2 to:key size do:[:i | 
       
  4322         (key at:i) = $H ifTrue:[
       
  4323             ((key at:i - 1) isVowel 
       
  4324                 or:[ (i < key size) and:[ (key at:i + 1) isVowel ] ]) ifTrue:[ c add:i ]
       
  4325         ]
       
  4326     ].
       
  4327     c do:[:n | 
       
  4328         k := (k copyFrom:1 to:n - 1) , (k copyFrom:n + 1 to:k size)
       
  4329     ].
       
  4330     ^ k
       
  4331 !
       
  4332 
       
  4333 rule12:key 
       
  4334     |k|
       
  4335 
       
  4336     "12. change 'KN' to 'N', else 'K' to 'C' "
       
  4337     k := self transcodeAll:'KN' of:key to:'K' startingAt:1.
       
  4338     k := self transcodeAll:'K' of:k to:'C' startingAt:1.
       
  4339     ^ k
       
  4340 
       
  4341     "Modified (format): / 02-08-2017 / 14:34:48 / cg"
       
  4342 !
       
  4343 
       
  4344 rule13:key 
       
  4345     "13. if not first character, change 'M' to 'N' "
       
  4346     
       
  4347     ^ self transcodeAll:'M' of:key to:'N' startingAt:2
       
  4348 
       
  4349     "Modified (format): / 02-08-2017 / 14:35:00 / cg"
       
  4350 !
       
  4351 
       
  4352 rule14:key 
       
  4353     "14. if not first character, change 'Q' to 'G' "
       
  4354     
       
  4355     ^ self transcodeAll:'Q' of:key to:'G' startingAt:2
       
  4356 
       
  4357     "Modified (format): / 02-08-2017 / 14:35:08 / cg"
       
  4358 !
       
  4359 
       
  4360 rule15:key 
       
  4361     "15. transcode 'SH' to 'S' "
       
  4362     
       
  4363     ^ self transcodeAll:'SH' of:key to:'S' startingAt:1
       
  4364 
       
  4365     "Modified (format): / 02-08-2017 / 14:35:18 / cg"
       
  4366 !
       
  4367 
       
  4368 rule16:key 
       
  4369     "16. transcode 'SCH' to 'S' "
       
  4370     
       
  4371     ^ self transcodeAll:'SCH' of:key to:'S' startingAt:1
       
  4372 
       
  4373     "Modified (format): / 02-08-2017 / 14:35:25 / cg"
       
  4374 !
       
  4375 
       
  4376 rule17:key 
       
  4377     "17. transcode 'YW' to 'Y' "
       
  4378     
       
  4379     ^ self transcodeAll:'YW' of:key to:'Y' startingAt:1
       
  4380 
       
  4381     "Modified (format): / 02-08-2017 / 14:35:33 / cg"
       
  4382 !
       
  4383 
       
  4384 rule18:key 
       
  4385     |k|
       
  4386 
       
  4387     "18. if not first or last character, change 'Y' to 'A' "
       
  4388     k := self transcodeAll:'Y' of:key to:'A' startingAt:2.
       
  4389     key last = $Y ifTrue:[
       
  4390         k at:k size put:$Y
       
  4391     ].
       
  4392     ^ k
       
  4393 
       
  4394     "Modified (format): / 02-08-2017 / 14:35:44 / cg"
       
  4395 !
       
  4396 
       
  4397 rule19:key 
       
  4398     "19. transcode 'WR' to 'R' "
       
  4399     
       
  4400     ^ self transcodeAll:'WR' of:key to:'R' startingAt:1
       
  4401 
       
  4402     "Modified (format): / 02-08-2017 / 14:35:52 / cg"
       
  4403 !
       
  4404 
       
  4405 rule1:key 
       
  4406     |k|
       
  4407 
       
  4408     k := key copy.
       
  4409      "1. Remove all 'S' and 'Z' chars from the end of the name"
       
  4410     [
       
  4411         'SZ' includes:k last
       
  4412     ] whileTrue:[ k := k copyFrom:1 to:(k size - 1) ].
       
  4413     ^ k
       
  4414 !
       
  4415 
       
  4416 rule20:key 
       
  4417     "20. if not first character, change 'Z' to 'S' "
       
  4418     
       
  4419     ^ self transcodeAll:'Z' of:key to:'S' startingAt:2
       
  4420 
       
  4421     "Modified (format): / 02-08-2017 / 14:36:00 / cg"
       
  4422 !
       
  4423 
       
  4424 rule21:key 
       
  4425     "21. transcode terminal 'AY' to 'Y' "
       
  4426     
       
  4427     ^ self transcodeAll:'AY' of:key to:'Y' startingAt:key size - 1
       
  4428 
       
  4429     "Modified (format): / 02-08-2017 / 14:36:08 / cg"
       
  4430 !
       
  4431 
       
  4432 rule22:key 
       
  4433     |k|
       
  4434 
       
  4435     "22. remove trailing vowels "
       
  4436     k := key copy.
       
  4437     [ k last isVowel ] whileTrue:[
       
  4438         k := k copyButLast
       
  4439     ].
       
  4440     ^ k
       
  4441 
       
  4442     "Modified: / 02-08-2017 / 14:36:42 / cg"
       
  4443 !
       
  4444 
       
  4445 rule23:key 
       
  4446     |k c|
       
  4447 
       
  4448     "23. collapse all strings of repeated characters "
       
  4449     k := key copy.
       
  4450     c := SortedCollection sortBlock:[:a :b | b < a ].
       
  4451     k size to:2 do:[:i | 
       
  4452         (k at:i) = (k at:i - 1) ifTrue:[
       
  4453             c add:i
       
  4454         ]
       
  4455     ].
       
  4456     c do:[:n | 
       
  4457         k := (k copyFrom:1 to:n - 1) , (k copyFrom:n + 1 to:k size)
       
  4458     ].
       
  4459     ^ k
       
  4460 !
       
  4461 
       
  4462 rule24:key originalKey:originalKey 
       
  4463     |k|
       
  4464 
       
  4465     "24. if first char of original surname was a vowel, append it to the code"
       
  4466     k := key copy.
       
  4467     originalKey first isVowel ifTrue:[
       
  4468         k := k , originalKey first asString asUppercase
       
  4469     ].
       
  4470     ^ k
       
  4471 !
       
  4472 
       
  4473 rule2:key 
       
  4474      "2. Transcode initial strings:  MAC => MC   PF => F"
       
  4475 
       
  4476     |k|
       
  4477 
       
  4478     k := key copy.
       
  4479     (k startsWith:'MAC') ifTrue:[
       
  4480         k := 'MC' , (k copyFrom:4)
       
  4481     ].
       
  4482     (k startsWith:'PF') ifTrue:[
       
  4483         k := 'F' , (k copyFrom:3)
       
  4484     ].
       
  4485     ^ k
       
  4486 
       
  4487     "Modified (format): / 02-08-2017 / 14:31:40 / cg"
       
  4488 !
       
  4489 
       
  4490 rule3:key 
       
  4491     |k|
       
  4492 
       
  4493     "3. Transcode trailing strings as follows:
       
  4494         IX => IC
       
  4495           EX => EC
       
  4496           YE, EE, IE => Y
       
  4497            NT, ND => D"
       
  4498            
       
  4499     k := key copy.
       
  4500     k := self transcodeTrailing:#( 'IX' ) of:k to:'IC'.
       
  4501     k := self transcodeTrailing:#( 'EX' ) of:k to:'EC'.
       
  4502     k := self transcodeTrailing:#( 'YE' 'EE' 'IE' ) of:k to:'Y'.
       
  4503     k := self transcodeTrailing:#( 'NT' 'ND' ) of:k to:'D'.
       
  4504     ^ k
       
  4505 
       
  4506     "Modified (format): / 02-08-2017 / 14:32:24 / cg"
       
  4507 !
       
  4508 
       
  4509 rule4:key 
       
  4510     "4. Transcode 'EV' to 'EF' if not at start of name"
       
  4511     
       
  4512     ^ self transcodeAll:'EV' of:key to:'EF' startingAt:2
       
  4513 
       
  4514     "Modified (format): / 02-08-2017 / 14:32:35 / cg"
       
  4515 !
       
  4516 
       
  4517 rule5:key 
       
  4518     "5. Use first character of name as first character of key.  
       
  4519         Ignored because we're doing an in-place conversion"
       
  4520     
       
  4521     ^ key
       
  4522 
       
  4523     "Modified (comment): / 02-08-2017 / 14:32:45 / cg"
       
  4524 !
       
  4525 
       
  4526 rule6:key 
       
  4527     |k i|
       
  4528 
       
  4529     "6. Remove any 'W' that follows a vowel"
       
  4530     k := key copy.
       
  4531     i := 2.
       
  4532     [
       
  4533         (i := k indexOf:$W startingAt:i) > 0
       
  4534     ] whileTrue:[
       
  4535         (k at:i - 1) isVowel ifTrue:[
       
  4536             k := (k copyFrom:1 to:i - 1) , (k copyFrom:i + 1 to:k size).
       
  4537             i := i - 1
       
  4538         ]
       
  4539     ].
       
  4540     ^ k
       
  4541 !
       
  4542 
       
  4543 rule7:key 
       
  4544     "7. replace all vowels with 'A' "
       
  4545     ^ key collect:[:ch | ch isVowel ifTrue:[$A] ifFalse:[ch]].
       
  4546 
       
  4547     "Modified: / 02-08-2017 / 14:33:56 / cg"
       
  4548 !
       
  4549 
       
  4550 rule8:key 
       
  4551     "8. transcode 'GHT' to 'GT' "
       
  4552     
       
  4553     ^ self transcodeAll:'GHT' of:key to:'GT' startingAt:1
       
  4554 
       
  4555     "Modified (format): / 02-08-2017 / 14:34:05 / cg"
       
  4556 !
       
  4557 
       
  4558 rule9:key 
       
  4559     "9. transcode 'DG' to 'G' "
       
  4560     
       
  4561     ^ self transcodeAll:'DG' of:key to:'G' startingAt:1
       
  4562 
       
  4563     "Modified (format): / 02-08-2017 / 14:34:15 / cg"
       
  4564 !
       
  4565 
       
  4566 transcodeAll:aString of:key to:replacementString startingAt:start 
       
  4567     |k i|
       
  4568 
       
  4569     k := key copy.
       
  4570     [
       
  4571         (i := k indexOfSubCollection:aString startingAt:start) > 0
       
  4572     ] whileTrue:[
       
  4573         k := (k copyFrom:1 to:i - 1) , replacementString 
       
  4574                     , (k copyFrom:i + aString size to:k size)
       
  4575     ].
       
  4576     ^ k
       
  4577 !
       
  4578 
       
  4579 transcodeTrailing:anArrayOfStrings of:key to:replacementString 
       
  4580     |answer|
       
  4581 
       
  4582     answer := key copy.
       
  4583     anArrayOfStrings do:[:aString | 
       
  4584         answer := self 
       
  4585                     transcodeAll:aString
       
  4586                     of:answer
       
  4587                     to:replacementString
       
  4588                     startingAt:(answer size - aString size) + 1
       
  4589     ].
       
  4590     ^ answer
       
  4591 ! !
       
  4592 
       
  4593 !PhoneticStringUtilities::PhonemStringComparator class methodsFor:'documentation'!
       
  4594 
       
  4595 documentation
       
  4596 "
       
  4597     Implementation of the PHONEM algorithm, as described in
       
  4598     'Georg Wilde and Carsten Meyer, Doppelgaenger gesucht -
       
  4599     Ein Programm fuer kontextsensitive phonetische Textumwandlung
       
  4600     ct Magazin fuer Computer & Technik 25/1998'
       
  4601     
       
  4602     This algorithm deals better with the german language (it cares for umlauts)
       
  4603 "
       
  4604 ! !
       
  4605 
       
  4606 !PhoneticStringUtilities::PhonemStringComparator methodsFor:'api'!
       
  4607 
       
  4608 encode:aString 
       
  4609     |s idx t t2|
       
  4610 
       
  4611     s := aString asUppercase.
       
  4612 
       
  4613     idx := 1.
       
  4614     [idx < (s size-1)] whileTrue:[
       
  4615         t2 := nil.
       
  4616         t := s copyFrom:idx to:idx+1.
       
  4617         t = 'SC' ifTrue:[ t2 := 'C' ]
       
  4618         ifFalse:[ t = 'SZ' ifTrue:[ t2 := 'C' ]
       
  4619         ifFalse:[ t = 'CZ' ifTrue:[ t2 := 'C' ]
       
  4620         ifFalse:[ t = 'TZ' ifTrue:[ t2 := 'C' ]
       
  4621         ifFalse:[ t = 'TS' ifTrue:[ t2 := 'C' ]
       
  4622         ifFalse:[ t = 'KS' ifTrue:[ t2 := 'X' ]
       
  4623         ifFalse:[ t = 'PF' ifTrue:[ t2 := 'V' ]
       
  4624         ifFalse:[ t = 'QU' ifTrue:[ t2 := 'KW' ]
       
  4625         ifFalse:[ t = 'PH' ifTrue:[ t2 := 'V' ]
       
  4626         ifFalse:[ t = 'UE' ifTrue:[ t2 := 'Y' ]
       
  4627         ifFalse:[ t = 'AE' ifTrue:[ t2 := 'E' ]
       
  4628         ifFalse:[ t = 'OE' ifTrue:[ t2 := 'Ö' ]
       
  4629         ifFalse:[ t = 'EI' ifTrue:[ t2 := 'AY' ]
       
  4630         ifFalse:[ t = 'EY' ifTrue:[ t2 := 'AY' ]
       
  4631         ifFalse:[ t = 'EU' ifTrue:[ t2 := 'OY' ]
       
  4632         ifFalse:[ t = 'AU' ifTrue:[ t2 := 'A§' ]
       
  4633         ifFalse:[ t = 'OU' ifTrue:[ t2 := '§ ' ]]]]]]]]]]]]]]]]].
       
  4634         t2 notNil ifTrue:[
       
  4635             s := (s copyTo:idx-1),t2,(s copyFrom:idx+2)
       
  4636         ] ifFalse:[
       
  4637             idx := idx + 1.
       
  4638         ].
       
  4639     ].
       
  4640 
       
  4641     "/ single character substitutions via tr
       
  4642     s := s copyTransliterating:'ÖÄZKGQÜIJFWPT§' to:'YECCCCYYYVVDDUA'.
       
  4643     s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'' complement:true squashDuplicates:false.
       
  4644     s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'ABCDLMNORSUVWXY' complement:false squashDuplicates:true.
       
  4645     ^ s
       
  4646 
       
  4647     "
       
  4648      self basicNew encode:'müller'  -> 'MYLR'    
       
  4649      self basicNew encode:'mueller' -> 'MYLR'    
       
  4650      self basicNew encode:'möller'  -> 'MYLR'
       
  4651      self basicNew encode:'miller'  -> 'MYLR'     
       
  4652      self basicNew encode:'muller'  -> 'MULR' 
       
  4653      self basicNew encode:'muler'   -> 'MULR' 
       
  4654 
       
  4655      self basicNew phoneticStringsFor:'müller'  #('MYLR')    
       
  4656      self basicNew phoneticStringsFor:'mueller' #('MYLR')    
       
  4657      self basicNew phoneticStringsFor:'möller'  #('MYLR')
       
  4658      self basicNew phoneticStringsFor:'miller'  #('MYLR')     
       
  4659      self basicNew phoneticStringsFor:'muller'  #('MULR') 
       
  4660      self basicNew phoneticStringsFor:'muler'   #('MULR') 
       
  4661      
       
  4662      self basicNew phoneticStringsFor:'schmidt'     #('CMYD')
       
  4663      self basicNew phoneticStringsFor:'schneider'   #('CNAYDR')
       
  4664      self basicNew phoneticStringsFor:'fischer'     #('VYCR')
       
  4665      self basicNew phoneticStringsFor:'weber'       #('VBR')
       
  4666      self basicNew phoneticStringsFor:'weeber'      #('VBR')
       
  4667      self basicNew phoneticStringsFor:'webber'      #('VBR')
       
  4668      self basicNew phoneticStringsFor:'wepper'      #('VBR')
       
  4669      
       
  4670      self basicNew phoneticStringsFor:'meyer'       #('MAYR')
       
  4671      self basicNew phoneticStringsFor:'maier'       #('MAYR')
       
  4672      self basicNew phoneticStringsFor:'mayer'       #('MAYR')
       
  4673      self basicNew phoneticStringsFor:'mayr'        #('MAYR')
       
  4674      self basicNew phoneticStringsFor:'meir'        #('MAYR')
       
  4675      
       
  4676      self basicNew phoneticStringsFor:'wagner'      #('VACNR')
       
  4677      self basicNew phoneticStringsFor:'schulz'      #('CULC')
       
  4678      self basicNew phoneticStringsFor:'becker'      #('BCR')
       
  4679      self basicNew phoneticStringsFor:'hoffmann'    #('OVMAN')
       
  4680      self basicNew phoneticStringsFor:'haus'        #('AUS')
       
  4681      
       
  4682      self basicNew phoneticStringsFor:'schäfer'     #('CVR')
       
  4683      self basicNew phoneticStringsFor:'scheffer'    #('CVR')
       
  4684      self basicNew phoneticStringsFor:'schaeffer'   #('CVR')
       
  4685      self basicNew phoneticStringsFor:'schaefer'    #('CVR')
       
  4686     "
       
  4687 
       
  4688     "Created: / 28-07-2017 / 15:38:08 / cg"
       
  4689 ! !
       
  4690 
       
  4691 !PhoneticStringUtilities::Caverphone2StringComparator class methodsFor:'documentation'!
       
  4692 
       
  4693 documentation
       
  4694 "
       
  4695     Caverphone (2) Algorithm:
       
  4696 
       
  4697     see http://caversham.otago.ac.nz/files/working/ctp150804.pdf
       
  4698     
       
  4699     Caverphone 2.0 is being made available for free use for the benefit of anyone who has a use for it,
       
  4700     with the proviso that the Caversham Project at the University of Otago should be acknowledged as the
       
  4701     original source (which is hereby done ;-).
       
  4702 
       
  4703     •  Start with a Surname or Firstname
       
  4704     •  Convert to lowercase
       
  4705         This coding system is case sensitive, implementations should acknowledge that a is not the same as A
       
  4706     •  Remove anything not A-Z
       
  4707         The main intention of this is to remove spaces, hyphens, and apostrophes.
       
  4708         example:  o'brian becomes obrian
       
  4709     •  If the name starts with cough make it cou2f
       
  4710         2 is being used as a temporary placeholder to indicate a consonant which we are no longer interested in.
       
  4711     •  If the name starts with rough make it rou2f
       
  4712     •  If the name starts with tough make it tou2f
       
  4713     •  If the name starts with enough make it enou2f
       
  4714     •  If the name starts with gn make it 2n
       
  4715     •  If the name ends with mb make it m2
       
  4716     •  replace cq with 2q
       
  4717     •  replace ci with si
       
  4718     •  replace ce with se
       
  4719     •  replace cy with sy
       
  4720     •  replace tch with 2ch
       
  4721     •  replace c with k
       
  4722     •  replace q with k
       
  4723     •  replace x with k
       
  4724     •  replace v with f
       
  4725     •  replace dg with 2g
       
  4726     •  replace tio with sio
       
  4727     •  replace tia with sia
       
  4728     •  replace d with t
       
  4729     •  replace ph with fh
       
  4730     •  replace b with p
       
  4731     •  replace sh with s2
       
  4732     •  replace z with s
       
  4733     •  replace and initial vowel with an A
       
  4734     •  replace all other vowels with a 3
       
  4735         3 is a temporary placeholder marking a vowel
       
  4736     •  replace 3gh3 with 3kh3
       
  4737         Exceptions are dealt with before the general case. gh between vowels is an except of the more general gh rule.
       
  4738     •  replace gh with 22
       
  4739     •  replace g with k
       
  4740     •  replace groups of the letter s with a S
       
  4741         Continuous strings of s are replace by a single S
       
  4742     •  replace groups of the letter t with a T
       
  4743     •  replace groups of the letter p with a P
       
  4744     •  replace groups of the letter k with a K
       
  4745     •  replace groups of the letter f with a F
       
  4746     •  replace groups of the letter m with a M
       
  4747     •  replace groups of the letter n with a N
       
  4748     •  replace w3 with W3
       
  4749     •  replace wy with Wy
       
  4750     •  replace wh3 with Wh3
       
  4751     •  replace why with Why
       
  4752     •  replace w with 2
       
  4753     •  replace and initial h with an A
       
  4754     •  replace all other occurrences of h with a 2
       
  4755     •  replace r3 with R3
       
  4756     •  replace ry with Ry
       
  4757     •  replace r with 2
       
  4758     •  replace l3 with L3
       
  4759     •  replace ly with Ly
       
  4760     •  replace l with 2
       
  4761     •  replace j with y
       
  4762     •  replace y3 with Y3
       
  4763     •  replace y with 2
       
  4764     •  remove all 2s
       
  4765     •  remove all 3s
       
  4766     •  put six (v1) / ten (v2) 1s on the end
       
  4767     •  take the first six characters as the code (caverphone 1);
       
  4768        / take the first ten characters as the code (caverphone 2);
       
  4769 
       
  4770      self new encode:'david'      -> 'TFT1111111'
       
  4771      self new encode:'whittle'    -> 'WTA1111111'
       
  4772 
       
  4773      self new encode:'Stevenson'  -> 'STFNSN1111'
       
  4774      self new encode:'Peter'      -> 'PTA1111111'
       
  4775 
       
  4776      self new encode:'washington' -> 'WSNKTN1111'
       
  4777      self new encode:'lee'        -> 'LA11111111'
       
  4778      self new encode:'Gutierrez'  -> 'KTRS111111'
       
  4779      self new encode:'Pfister'    -> 'PFSTA11111'
       
  4780      self new encode:'Jackson'    -> 'YKSN111111'
       
  4781      self new encode:'Tymczak'    -> 'TMKSK11111'
       
  4782 
       
  4783      self new encode:'add'        -> 'AT11111111'
       
  4784      self new encode:'aid'        -> 'AT11111111'
       
  4785      self new encode:'at'         -> 'AT11111111'
       
  4786      self new encode:'art'        -> 'AT11111111'
       
  4787      self new encode:'earth'      -> 'AT11111111'
       
  4788      self new encode:'head'       -> 'AT11111111'
       
  4789      self new encode:'old'        -> 'AT11111111'
       
  4790 
       
  4791      self new encode:'ready'      -> 'RTA1111111'
       
  4792      self new encode:'rather'     -> 'RTA1111111'
       
  4793      self new encode:'able'       -> 'APA1111111'
       
  4794      self new encode:'appear'     -> 'APA1111111'
       
  4795 
       
  4796      self new encode:'Deedee'     -> 'TTA1111111'
       
  4797 "
       
  4798 ! !
       
  4799 
       
  4800 !PhoneticStringUtilities::Caverphone2StringComparator methodsFor:'api'!
       
  4801 
       
  4802 encode:word 
       
  4803     |txt|
       
  4804 
       
  4805     word size == 0 ifTrue:[^ '1111111111' ].
       
  4806     
       
  4807     "/ 1. Convert to lowercase
       
  4808     txt := word asLowercase.
       
  4809 
       
  4810     "/ 2. Remove anything not A-Z
       
  4811     txt := txt select:#isLetter.
       
  4812 
       
  4813     #(
       
  4814     "/  oldSeq newSeq repeat
       
  4815 
       
  4816     "/ 2.5. Remove final e
       
  4817         'e$' '' false
       
  4818     "/ 3. Handle various start options
       
  4819         '^cough' 'cou2f' false
       
  4820         '^rough' 'rou2f' false
       
  4821         '^tough' 'tou2f' false
       
  4822         '^enough' 'enou2f' false
       
  4823         '^trough' 'trou2f' false
       
  4824 
       
  4825         '^gn' '2n' false
       
  4826         'mb$' 'm2' false
       
  4827         
       
  4828     "/ 4. Handle replacements
       
  4829         'cq' '2q' true
       
  4830         'ci' 'si' true
       
  4831         'ce' 'se' true
       
  4832         'cy' 'sy' true
       
  4833         'tch' '2ch' true
       
  4834         'c' 'k' true
       
  4835         'q' 'k' true
       
  4836         'x' 'k' true
       
  4837         'v' 'f' true
       
  4838         'dg' '2g' true
       
  4839         'tio' 'sio' true
       
  4840         'tia' 'sia' true
       
  4841         'd' 't' true
       
  4842         'ph' 'fh' true
       
  4843         'b' 'p' true
       
  4844         'sh' 's2' true
       
  4845         'z' 's' true
       
  4846         
       
  4847         '^a' 'A' false
       
  4848         '^e' 'A' false
       
  4849         '^i' 'A' false
       
  4850         '^o' 'A' false
       
  4851         '^u' 'A' false
       
  4852         
       
  4853         'a' '3' true
       
  4854         'e' '3' true
       
  4855         'i' '3' true
       
  4856         'o' '3' true
       
  4857         'u' '3' true
       
  4858         'j' 'y' true 
       
  4859         
       
  4860         '^y3' 'Y3' false 
       
  4861         '^y' 'A' false
       
  4862 
       
  4863         'y' '3'  true
       
  4864         '3gh3' '3kh3' true
       
  4865         'gh' '22' true
       
  4866         'g' 'k' true
       
  4867         's'  'S' true
       
  4868         'SS' 'S' true
       
  4869         't'  'T' true
       
  4870         'TT' 'T' true
       
  4871         'p'  'P' true
       
  4872         'PP' 'P' true
       
  4873         'k'  'K' true
       
  4874         'KK' 'K' true
       
  4875         'f'  'F' true
       
  4876         'FF' 'F' true
       
  4877         'm'  'M' true
       
  4878         'MM' 'M' true
       
  4879         'n'  'N' true
       
  4880         'NN' 'N' true
       
  4881         'w3' 'W3' true
       
  4882         'wh3' 'Wh3' true
       
  4883         'w$' '3'  false
       
  4884         'w' '2' true
       
  4885         '^h' 'A' false
       
  4886         'h' '2' true
       
  4887         'r3' 'R3' true
       
  4888         'r$' '3'  false
       
  4889         'r' '2' true
       
  4890         'l3' 'L3' true
       
  4891         'l$' '3' false
       
  4892         'l' '2' true
       
  4893 
       
  4894     "/ 5. removals
       
  4895 
       
  4896         '2' '' true
       
  4897         '3$' 'A' true
       
  4898         '3' '' true
       
  4899     ) inGroupsOf:3 do:[:pat :repl :repeat|
       
  4900         |s txtBefore|
       
  4901 
       
  4902         txtBefore := txt.
       
  4903         (pat startsWith:$^) ifTrue:[
       
  4904             s := pat copyButFirst.
       
  4905             repeat ifTrue:[
       
  4906                 [txt startsWith:s] whileTrue:[ txt := repl,(txt copyButFirst:s size) ]
       
  4907             ] ifFalse:[
       
  4908                 (txt startsWith:s) ifTrue:[ txt := repl,(txt copyButFirst:s size) ]
       
  4909             ].    
       
  4910         ] ifFalse:[
       
  4911             (pat endsWith:$$) ifTrue:[
       
  4912                 s := pat copyButLast.
       
  4913                 repeat ifTrue:[
       
  4914                     [txt endsWith:s] whileTrue:[ txt := (txt copyButLast:s size),repl ]
       
  4915                 ] ifFalse:[
       
  4916                     (txt endsWith:s) ifTrue:[ txt := (txt copyButLast:s size),repl ]
       
  4917                 ]
       
  4918             ] ifFalse:[
       
  4919                 repeat ifTrue:[
       
  4920                     txt := txt copyReplaceAllSubcollections:pat with:repl
       
  4921                 ] ifFalse:[
       
  4922                     txt := txt copyReplaceSubcollection:pat with:repl
       
  4923                 ]    
       
  4924             ]    
       
  4925         ].
       
  4926         "/ txt ~= txtBefore ifTrue:[
       
  4927         "/     Transcript showCR:(pat,' | ',repl,' -> ',txt).
       
  4928         "/ ].    
       
  4929     ].    
       
  4930 
       
  4931     "/ 6. put ten 1s on the end
       
  4932     txt := txt,'1111111111'.
       
  4933     
       
  4934     "/ 7. take the first ten characters as the code
       
  4935     ^ txt copyTo:10
       
  4936 
       
  4937     "
       
  4938      self new encode:'david'      -> 'TFT1111111'
       
  4939      self new encode:'whittle'    -> 'WTA1111111'
       
  4940 
       
  4941      self new encode:'Stevenson'  -> 'STFNSN1111'
       
  4942      self new encode:'Peter'      -> 'PTA1111111'
       
  4943 
       
  4944      self new encode:'washington' -> 'WSNKTN1111'
       
  4945      self new encode:'lee'        -> 'LA11111111'
       
  4946      self new encode:'Gutierrez'  -> 'KTRS111111'
       
  4947      self new encode:'Pfister'    -> 'PFSTA11111'
       
  4948      self new encode:'Jackson'    -> 'YKSN111111'
       
  4949      self new encode:'Tymczak'    -> 'TMKSK11111'
       
  4950 
       
  4951      self new encode:'add'        -> 'AT11111111'
       
  4952      self new encode:'aid'        -> 'AT11111111'
       
  4953      self new encode:'at'         -> 'AT11111111'
       
  4954      self new encode:'art'        -> 'AT11111111'
       
  4955      self new encode:'earth'      -> 'AT11111111'
       
  4956      self new encode:'head'       -> 'AT11111111'
       
  4957      self new encode:'old'        -> 'AT11111111'
       
  4958 
       
  4959      self new encode:'ready'      -> 'RTA1111111'
       
  4960      self new encode:'rather'     -> 'RTA1111111'
       
  4961      self new encode:'able'       -> 'APA1111111'
       
  4962      self new encode:'appear'     -> 'APA1111111'
       
  4963 
       
  4964      self new encode:'Deedee'     -> 'TTA1111111'
       
  4965     "
       
  4966 
       
  4967     "Created: / 28-07-2017 / 15:21:23 / cg"
       
  4968     "Modified: / 02-08-2017 / 01:42:35 / cg"
       
  4969 ! !
       
  4970 
  3143 !PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator class methodsFor:'documentation'!
  4971 !PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator class methodsFor:'documentation'!
  3144 
  4972 
  3145 documentation
  4973 documentation
  3146 "
  4974 "
  3147      The 'Kölner Phonetik' (cologne phonetic) code is for the german language 
  4975      The 'Kölner Phonetik' (cologne phonetic) code is for the german language 
  3529      self new encode:'Pfister'    -> 'P236'
  5357      self new encode:'Pfister'    -> 'P236'
  3530      self new encode:'Jackson'    -> 'J250'
  5358      self new encode:'Jackson'    -> 'J250'
  3531      self new encode:'Tymczak'    -> 'T522'
  5359      self new encode:'Tymczak'    -> 'T522'
  3532 
  5360 
  3533     notice:
  5361     notice:
  3534      MiracodeStringComparator new 
  5362      MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
  3535                     encode:'Ashcraft' -> 'A261'
  5363      SoundexStringComparator new encode:'Ashcraft'  -> 'A226'
  3536      SoundexStringComparator 
       
  3537                 new encode:'Ashcraft' -> 'A226'
       
  3538 
  5364 
  3539     see also:            
  5365     see also:            
  3540         https://www.archives.gov/research/census/soundex.html
  5366         https://www.archives.gov/research/census/soundex.html
  3541 "
  5367 "
  3542 ! !
  5368 ! !
  3543 
  5369 
  3544 !PhoneticStringUtilities::MiracodeStringComparator methodsFor:'api'!
  5370 !PhoneticStringUtilities::MiracodeStringComparator methodsFor:'private'!
  3545 
  5371 
  3546 encode:word 
  5372 encode:word 
       
  5373     "same as inherited, but cares for W and H"
       
  5374     
  3547     |u p t prevCode|
  5375     |u p t prevCode|
  3548 
  5376 
  3549     u := word asUppercase.
  5377     u := word asUppercase.
  3550     p := u first asString.
  5378     p := u first asString.
  3551     prevCode := self translate:u first.
  5379     prevCode := self translate:u first.
  3564     [ p size < 4 ] whileTrue:[
  5392     [ p size < 4 ] whileTrue:[
  3565         p := p , '0'
  5393         p := p , '0'
  3566     ].
  5394     ].
  3567     ^ (p copyFrom:1 to:4)
  5395     ^ (p copyFrom:1 to:4)
  3568 
  5396 
  3569     "
  5397     "Created: / 02-08-2017 / 00:19:47 / cg"
  3570      self new encode:'washington' -> 'W252'
  5398     "Modified (comment): / 02-08-2017 / 14:30:47 / cg"
  3571      self new encode:'lee'        -> 'L000'
       
  3572      self new encode:'Gutierrez'  -> 'G362'
       
  3573      self new encode:'Pfister'    -> 'P236'
       
  3574      self new encode:'Jackson'    -> 'J250'
       
  3575      self new encode:'Tymczak'    -> 'T522'
       
  3576     "
       
  3577 
       
  3578     "notice:
       
  3579      MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
       
  3580      self new encode:'Ashcraft'   -> 'A226'
       
  3581     "
       
  3582 
       
  3583     "Created: / 28-07-2017 / 15:23:16 / cg"
       
  3584     "Modified (comment): / 01-08-2017 / 19:01:51 / cg"
       
  3585 ! !
  5399 ! !
  3586 
  5400 
  3587 !PhoneticStringUtilities::SpanishPhoneticCodeStringComparator class methodsFor:'documentation'!
  5401 !PhoneticStringUtilities::SpanishPhoneticCodeStringComparator class methodsFor:'documentation'!
  3588 
  5402 
  3589 documentation
  5403 documentation