hg/stx-libbasic2: changeset 4491:d6c31bb1e928

--- a/PhoneticStringUtilities.st	Tue Aug 01 19:32:27 2017 +0200
+++ b/PhoneticStringUtilities.st	Wed Aug 02 14:37:29 2017 +0200
@@ -30,6 +30,22 @@
 	privateIn:PhoneticStringUtilities
 !
 
+PhoneticStringUtilities::PhoneticStringComparator subclass:#DaitchMokotoffStringComparator
+	instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex
+		currentIndex skipCount'
+	classVariableNames:''
+	poolDictionaries:''
+	privateIn:PhoneticStringUtilities
+!
+
+PhoneticStringUtilities::PhoneticStringComparator subclass:#DoubleMetaphoneStringComparator
+	instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex
+		currentIndex skipCount'
+	classVariableNames:''
+	poolDictionaries:''
+	privateIn:PhoneticStringUtilities
+!
+
 PhoneticStringUtilities::PhoneticStringComparator subclass:#ExtendedSoundexStringComparator
 	instanceVariableNames:''
 	classVariableNames:'CharacterTranslationDict'
@@ -51,6 +67,14 @@
 	privateIn:PhoneticStringUtilities
 !
 
+PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#MetaphoneStringComparator
+	instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex
+		currentIndex skipCount'
+	classVariableNames:''
+	poolDictionaries:''
+	privateIn:PhoneticStringUtilities
+!
+
 PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#SoundexStringComparator
 	instanceVariableNames:''
 	classVariableNames:'CharacterTranslationDict'
@@ -79,10 +103,9 @@
 	privateIn:PhoneticStringUtilities
 !
 
-PhoneticStringUtilities::PhoneticStringComparator subclass:#DoubleMetaphoneStringComparator
-	instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex
-		currentIndex skipCount'
-	classVariableNames:''
+PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#Caverphone2StringComparator
+	instanceVariableNames:''
+	classVariableNames:'CharacterTranslationDict'
 	poolDictionaries:''
 	privateIn:PhoneticStringUtilities
 !
@@ -153,6 +176,15 @@
         described in Georg Wilde and Carsten Meyer, 'Doppelgaenger gesucht - Ein Programm fuer kontextsensitive phonetische Textumwandlung'
         from 'ct Magazin fuer Computer & Technik 25/1999'.
 
+    mra
+        Match Rating Approach; a phonetic algorithm developed by Western Airlines in 1977.
+
+    caverphone2
+        Caverphone 2.0; generates a fixed-length code of 10 characters, padded with '1' (claimed to be better than soundex)
+
+    spanish phonetic code
+        an algorithm slightly adjusted to spanish names
+
     More info for german readers is found in:
         http://www.uni-koeln.de/phil-fak/phonetik/Lehre/MA-Arbeiten/magister_wilz.pdf
 "
@@ -163,19 +195,33 @@
     for the 50 most common german names, we get:
 
                             ext. 
-    name        soundex   soundex   metaphone   phonet  phonet2     phonix      daitsch phonem      koeln
-
-    müller      M460    54600000    MLR         MÜLA    NILA        M4000000    689000  MYLR        657
-    schmidt     S253    25300000    SKMTT       SHMIT   ZNIT        S5300000    463000  CMYD        8628
-    schneider   S253    25360000    SKNTR       SHNEIDA ZNEITA      S5300000    463900  CNAYDR      8627
-    fischer     F260    12600000    FSKR        FISHA   FIZA        F8000000    749000  VYCR        387
-    weber       W160    16000000    WBR         WEBA    FEBA        $1000000    779000  VBR         317
-    meyer       M600    56000000    MYR         MEIA    NEIA        M0000000    619000  MAYR        67
-    wagner      W256    25600000    WKNR        WAKNA   FAKNA       $2500000    756900  VACNR       367
-    schulz      S242    24200000    SKLS        SHULS   ZULZ        S4800000    484000  CULC        85
-    becker      B260    12600000    BKR         BEKA    BEKA        B2000000    759000  BCR         147
-    hoffmann    H155    15500000    HFMN        HOFMAN  UFNAN       $7550000    576600  OVMAN       036
-    schäfer     S216    21600000    SKFR        SHEFA   ZEFA        S7000000    479000  CVR         837
+    name        soundex   soundex   metaphone   phonet  phonet2     phonix      daitch  phonem      koeln  caverphone2  mra
+
+    müller      M460    54600000    MLR         MÜLA    NILA        M4000000    689000  MYLR        657    MLA1111111   MLR
+    schmidt     S530    25300000    SKMTT       SHMIT   ZNIT        S5300000    463000  CMYD        862    SKMT111111   SCHMDT
+    schneider   S536    25360000    SKNTR       SHNEIDA ZNEITA      S5300000    463900  CNAYDR      8627   SKNTA11111   SCHNDR
+    fischer     F260    12600000    FSKR        FISHA   FIZA        F8000000    749000  VYCR        387    FSKA111111   FSCHR
+    weber       W160    16000000    WBR         WEBA    FEBA        $1000000    779000  VBR         317    WPA1111111   WBR
+    meyer       M600    56000000    MYR         MEIA    NEIA        M0000000    619000  MAYR        67     MA11111111   MYR
+    wagner      W256    25600000    WKNR        WAKNA   FAKNA       $2500000    756900  VACNR       3467   WKNA111111   WGNR
+    schulz      S420    24200000    SKLS        SHULS   ZULZ        S4800000    484000  CULC        858    SKS1111111   SCHLZ
+    becker      B260    12600000    BKR         BEKA    BEKA        B2000000    759000  BCR         147    PKA1111111   BCKR
+    hoffmann    H155    15500000    HFMN        HOFMAN  UFNAN       $7550000    576600  OVMAN       036    AFMN111111   HFMN
+    schäfer     S160    21600000    SKFR        SHEFA   ZEFA        S7000000    479000  CVR         837    SKFA111111   SCHFR
+
+    |cls|
+    
+    cls := MRAStringComparator.
+    cls := SoundexStringComparator.
+    cls := KoelnerPhoneticCodeStringComparator.
+    cls := Caverphone2StringComparator.
+    #('müller' 'schmidt' 'schneider' 'fischer' 'weber' 'meyer' 
+      'wagner' 'schulz'  'becker'    'hoffmann' 'schäfer')
+    do:[:name |
+        Transcript show:''''; show:name; show:''' -> '''; show:(cls encode:name); showCR:''''.
+    ].
+
+    KoelnerPhoneticCodeStringComparator encode:'Müller-Lüdenscheidt'  -> '65752682'
 "
 ! !
 
@@ -463,6 +509,22 @@
     ^ self == PhoneticStringUtilities::PhoneticStringComparator
 ! !
 
+!PhoneticStringUtilities::PhoneticStringComparator class methodsFor:'utilities'!
+
+encode:word
+    "convenience: create a new instance, ask it for #phoneticStringsFor: word and answer the first of the resulting codes"
+    ^ (self new phoneticStringsFor:word) first
+    "
+     SoundexStringComparator encode:'Fischer'             -> 'F260'
+     Caverphone2StringComparator encode:'Fischer'         -> 'FSKA111111'
+     KoelnerPhoneticCodeStringComparator encode:'Fischer' -> '387'
+     MRAStringComparator encode:'Fischer'                 -> 'FSCHR'
+     SpanishPhoneticCodeStringComparator encode:'Fischer' -> '24429'
+    "
+
+    "Created: / 02-08-2017 / 01:15:50 / cg"
+! !
+
 !PhoneticStringUtilities::PhoneticStringComparator methodsFor:'api'!
 
 does:aString soundLike:anotherString 
@@ -516,1100 +578,591 @@
     "/ super initialize.   -- commented since inherited method does nothing
 ! !
 
-!PhoneticStringUtilities::ExtendedSoundexStringComparator class methodsFor:'documentation'!
-
-documentation
-"
-    There are many extended and enhanced soundex variants around;
-    here is one, called 'extended soundex'. It is destribed for example in
-    http://www.epidata.dk/documentation.php.
-    An author or origin is unknown.
-
-    The number of digits is increased to 5 or 8;
-    The first character is not used literally; instead it is encoded like the rest.
-    This might have a negative effect on names starting with a vovel, though.
-
-    Overall, it can be doubted if this is really an enhancement after all.
-"
-! !
-
-!PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'api'!
-
-phoneticStringsFor:aString
-    "generates both an extended soundex of length 5 and one of length 8"
-
-    |first second u t prevCode|
-
-    u := aString asUppercase.
-    first := second := ''.
-    u do:[:c | 
-        t := self translate:c.
-        (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
-            first := first , t.
-            second := second , t.
-            second size == 8 ifTrue:[
-                ^ Array with:(first copyTo:5) with:second 
-            ].
-        ].
-        prevCode := t
-    ].
-    [ first size < 5 ] whileTrue:[
-        first := first , '0'.
-        second := second , '0'.
-    ].
-    [ second size < 8 ] whileTrue:[
-        second := second , '0'
-    ].
-    ^ Array with:first with:second
-
-    "
-     self basicNew phoneticStringsFor:'müller'  #('87900' '87900000')  
-     self basicNew phoneticStringsFor:'miller'  #('87900' '87900000')   
-     self basicNew phoneticStringsFor:'muller'  #('87900' '87900000')    
-     self basicNew phoneticStringsFor:'muler'   #('87900' '87900000')
-     self basicNew phoneticStringsFor:'schmidt'    #('38600' '38600000')
-     self basicNew phoneticStringsFor:'schneider'  #('38690' '38690000')
-     self basicNew phoneticStringsFor:'fischer'    #('23900' '23900000')
-     self basicNew phoneticStringsFor:'weber'      #('19000' '19000000')
-     self basicNew phoneticStringsFor:'meyer'      #('89000' '89000000')
-     self basicNew phoneticStringsFor:'wagner'     #('48900' '48900000')
-     self basicNew phoneticStringsFor:'schulz'     #('37500' '37500000')
-     self basicNew phoneticStringsFor:'becker'     #('13900' '13900000')
-     self basicNew phoneticStringsFor:'hoffmann'   #('28800' '28800000')
-     self basicNew phoneticStringsFor:'schäfer'    #('32900' '32900000')
-    "
-! !
-
-!PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'private'!
-
-translate:aCharacter
-    "use simple if's for more speed when compiled"
-
-    "vowels serve as separators"
-    aCharacter == $A ifTrue:[^ '0' ].         
-    aCharacter == $E ifTrue:[^ '0' ].
-    aCharacter == $I ifTrue:[^ '0' ].
-    aCharacter == $O ifTrue:[^ '0' ].
-    aCharacter == $U ifTrue:[^ '0' ].
-    aCharacter == $Y ifTrue:[^ '0' ].
-
-    aCharacter == $B ifTrue:[^ '1' ]. 
-    aCharacter == $P ifTrue:[^ '1' ].
-
-    aCharacter == $F ifTrue:[^ '2' ]. 
-    aCharacter == $V ifTrue:[^ '2' ]. 
-
-    aCharacter == $C ifTrue:[^ '3' ]. 
-    aCharacter == $S ifTrue:[^ '3' ]. 
-    aCharacter == $K ifTrue:[^ '3' ].
-
-    aCharacter == $G ifTrue:[^ '4' ]. 
-    aCharacter == $J ifTrue:[^ '4' ].
-
-    aCharacter == $Q ifTrue:[^ '5' ]. 
-    aCharacter == $X ifTrue:[^ '5' ]. 
-    aCharacter == $Z ifTrue:[^ '5' ]. 
-
-    aCharacter == $D ifTrue:[^ '6' ]. 
-    aCharacter == $G ifTrue:[^ '6' ]. 
-    aCharacter == $T ifTrue:[^ '6' ]. 
-
-    aCharacter == $L ifTrue:[^ '7' ]. 
-
-    aCharacter == $M ifTrue:[^ '8' ]. 
-    aCharacter == $N ifTrue:[^ '8' ]. 
-
-    aCharacter == $R ifTrue:[^ '9' ]. 
-    ^ nil
-! !
-
-!PhoneticStringUtilities::SingleResultPhoneticStringComparator class methodsFor:'documentation'!
-
-documentation
-"
-    documentation to be added.
-
-    [author:]
-        cg
-
-    [instance variables:]
-
-    [class variables:]
-
-    [see also:]
-
-"
-! !
-
-!PhoneticStringUtilities::SingleResultPhoneticStringComparator methodsFor:'api'!
-
-encode:word
-    ^ self subclassResponsibility
-
-    "Created: / 28-07-2017 / 15:20:49 / cg"
-!
-
-phoneticStringsFor:word 
-    ^ Array with:(self encode:word)
-
-    "Created: / 28-07-2017 / 15:20:38 / cg"
-! !
-
-!PhoneticStringUtilities::MRAStringComparator class methodsFor:'documentation'!
-
-documentation
-"
-    Match Rating Approach Encoder
-
-    The Western Airlines matching rating approach name encoder
-
-    [see also:]
-        https://en.wikipedia.org/wiki/Match_Rating_Approach
-        
-        G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery,
-            ''Accessing Individual Records from Personal Data Files Using Nonunique Identifiers'' 
-            US National Institute of Standards and Technology, SP-500-2 (1977), p. 17.
-"
-!
-
-rCode
-"<<END
-## Copyright (c) 2015, James P. Howard, II <jh@jameshoward.us>
-##
-## Redistribution and use in source and binary forms, with or without
-## modification, are permitted provided that the following conditions are
-## met:
-##
-##     Redistributions of source code must retain the above copyright
-##     notice, this list of conditions and the following disclaimer.
-##
-##     Redistributions in binary form must reproduce the above copyright
-##     notice, this list of conditions and the following disclaimer in
-##     the documentation and/or other materials provided with the
-##     distribution.
-##
-## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-## DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-## THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#' @rdname mra
-#' @title Match Rating Approach Encoder
-#'
-#' @description
-#' The Western Airlines matching rating approach name encoder
-#'
-#' @param word string or vector of strings to encode
-#' @param x MRA-encoded character vector
-#' @param y MRA-encoded character vector
-#'
-#' @details
-#'
-#' The variable \code{word} is the name to be encoded.  The variable
-#' \code{maxCodeLen} is \emph{not} supported in this algorithm encoder
-#' because the algorithm itself is dependent upon its six-character
-#' length.  The variables \code{x} and \code{y} are MRA-encoded and are
-#' compared to each other using the MRA comparison specification.
-#'
-#' @return The \code{mra_encode} function returns match rating approach
-#' encoded character vector.  The \code{mra_compare} returns a boolean
-#' vector which is \code{TRUE} if \code{x} and \code{y} pass the MRA
-#' comparison test.
-#'
-#' @references
-#'
-#' G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery,
-#' \emph{Accessing Individual Records from Personal Data Files Using
-#' Nonunique Identifiers,} US National Institute of Standards and
-#' Technology, SP-500-2 (1977), p. 17.
-#'
-#' @family phonics
-#'
-#' @examples
-#' mra_encode("William")
-#' mra_encode(c("Peter", "Peady"))
-#' mra_encode("Stevenson")
-
-#' @rdname mra
-#' @name mra_encode
-#' @export
-mra_encode <- function(word) {
-
-    ## First, remove any nonalphabetical characters and uppercase it
-    word <- gsub("[^[:alpha:]]*", "", word)
-    word <- toupper(word)
-
-    ## First character of key = first character of name
-    first <- substr(word, 1, 1)
-    word <- substr(word, 2, nchar(word))
-
-    ## Delete vowels not at the start of the word
-    word <- gsub("[AEIOU]", "", word)
-    word <- paste(first, word, sep = "")
-
-    ## Remove duplicate consecutive characters
-    word <- gsub("([A-Z])\\1+", "\\1", word)
-
-    ## If longer than 6 characters, take first and last 3...and we have
-    ## to vectorize it
-    for(i in 1:length(word)) {
-        if((l = nchar(word[i])) > 6) {
-            first <- substr(word[i], 1, 3)
-            last <- substr(word[i], l - 2, l)
-            word[i] <- paste(first, last, sep = "");
-        }
-    }
-
-    return(word)
-}
-
-#' @rdname mra
-#' @name mra_compare
-#' @export
-mra_compare <- function(x, y) {
-    mra <- data.frame(x = x, y = y, sim = 0, min = 100, stringsAsFactors = FALSE)
-
-    ## Obtain the minimum rating value by calculating the length sum of
-    ## the encoded strings and using table A (from Wikipedia).  We start
-    ## by setting the minimum to be the sum and move from there.
-    mra$lensum <- nchar(mra$x) + nchar(mra$y)
-    mra$min[mra$lensum == 12] <- 2
-    mra$min[mra$lensum > 7 && mra$lensum <= 11] <- 3
-    mra$min[mra$lensum > 4 && mra$lensum <= 7] <- 4
-    mra$min[mra$lensum <= 4] <- 5
-
-    ## If the length difference between the encoded strings is 3 or
-    ## greater, then no similarity comparison is done.  For us, we
-    ## continue the similarity comparison out of laziness and ensure the
-    ## minimum is impossibly high to meet.
-    mra$min[abs(nchar(mra$x) - nchar(mra$y)) >= 3] <- 100
-
-    ## Start the comparison.
-    x <- strsplit(mra$x, split = "")
-    y <- strsplit(mra$y, split = "")
-    rows <- nrow(mra)
-    for(i in 1:rows) {
-        ## Process the encoded strings from left to right and remove any
-        ## identical characters found from both strings respectively.
-        j <- 1
-        while(j < min(length(x[[i]]), length(y[[i]]))) {
-            if(x[[i]][j] == y[[i]][j]) {
-                x[[i]] <- x[[i]][-j]
-                y[[i]] <- y[[i]][-j]
-            } else
-                j <- j + 1
-        }
-
-        ## Process the unmatched characters from right to left and
-        ## remove any identical characters found from both names
-        ## respectively.
-        x[[i]] <- rev(x[[i]])
-        y[[i]] <- rev(y[[i]])
-        j <- 1
-        while(j < min(length(x[[i]]), length(y[[i]]))) {
-            if(x[[i]][j] == y[[i]][j]) {
-                x[[i]] <- x[[i]][-j]
-                y[[i]] <- y[[i]][-j]
-            } else
-                j <- j + 1
-        }
-        ## Subtract the number of unmatched characters from 6 in the
-        ## longer string. This is the similarity rating.
-        len <- min(length(x[[i]]), length(y[[i]]))
-        mra$sim[i] <- 6 - len
-    }
-
-    ## If the similarity is greater than or equal to the minimum
-    ## required, it is a successful match.
-    mra$match <- (mra$sim >= mra$min)
-    return(mra$match)
-}
-
-END>>
-! !
-
-!PhoneticStringUtilities::MRAStringComparator methodsFor:'api'!
-
-encode:wordIn 
-    "see https://en.wikipedia.org/wiki/Match_Rating_Approach"
-    
-    |word prev|
-
-    word := wordIn.
-    
-    "/ First, remove any nonalphabetical characters and uppercase it
-
-    word := word select:#isLetter thenCollect:#asUppercase.
-
-    "/ Delete vowels not at the start of the word
-
-    word := word first asString , ((word from:2) reject:#isVowel).
-
-    "/ Remove duplicate consecutive characters
-
-    prev := nil.
-    word := word 
-                collect:[:char |
-                    char == prev ifTrue:[
-                        $*
-                    ] ifFalse:[
-                        prev := char.
-                        char.
-                    ].    
-                ]
-                thenSelect:[:char | char ~~ $*].
-
-    "/ If longer than 6 characters, take first and last 3            
-    word size > 6 ifTrue:[
-        word := (word copyFirst:3),(word copyLast:3)
-    ].
-    ^ word.
-
-    "
-     self new encode:'Catherine'            -> 'CTHRN'
-     self new encode:'CatherineCatherine'   -> 'CTHHRN'
-     self new encode:'Butter'               -> 'BTR'
-     self new encode:'Byrne'                -> 'BYRN'
-     self new encode:'Boern'                -> 'BRN'
-     self new encode:'Smith'                -> 'SMTH'
-     self new encode:'Smyth'                -> 'SMYTH'
-     self new encode:'Kathryn'              -> 'KTHRYN'
-    "
-
-    "Created: / 28-07-2017 / 15:19:22 / cg"
-    "Modified (comment): / 31-07-2017 / 15:14:31 / cg"
-! !
-
-!PhoneticStringUtilities::SoundexStringComparator class methodsFor:'documentation'!
-
-documentation
-"
-    WARNING: this is the so called 'simplified soundex' algorithm;
-      there are more variants like miracode (american soundex) or
-      mysqlSoundex around.
-      
-      Be sure to use the correct algorithm, if the generated strings must be compatible
-      (otherwise, the differences are probably too small to be noticed as effect, but
-      your search will be different)
-
-    The following was copied from http://www.civilsolutions.com.au/publications/dedup.htm
-
-    SOUNDEX is a phonetic coding algorithm that ignores many of the unreliable
-    components of names, but by doing so reports more matches. 
-
-    There are some variations around in the literature; 
-    the following is called 'simplified soundex', and the rules for coding a name are:
-
-    1. The first letter of the name is used in its un-coded form to serve as the prefix
-       character of the code. (The rest of the code is numerical).
-
-    2. Thereafter, W and H are ignored entirely.
-
-    3. A, E, I, 0, U, Y are not assigned a code number, but do serve as 'separators' (see Step 5).
-
-    4. Other letters of the name are converted to a numerical equivalent:
-                 B, P, F, V              1 
-                 C, G, J, K, Q, S, X, Z  2 
-                 D, T                    3 
-                 L                       4 
-                 M, N                    5 
-                 R                       6 
-
-    5. There are two exceptions: 
-        1. Letters that follow prefix letters which would, if coded, have the same
-           numerical code, are ignored in all cases unless a ''separator'' (see Step 3) precedes them.
-
-        2. The second letter of any pair of consonants having the same code number is likewise ignored, 
-           i.e. unless there is a ''separator'' between them in the name.
-
-    6. The final SOUNDEX code consists of the prefix letter plus three numerical characters.
-       Longer codes are truncated to this length, and shorter codes are extended to it by adding zeros.
-
-    Notice, that in another variant, w and h are treated slightly differently.
-    This is only of relevance, if you need to reconstruct original soundex codes of other programs
-    or for the original 1880 us census data.
-    
-    Also notice, that soundex deals better with english. 
-    For german and other languages, other algorithms may provide better results.
-"
-! !
-
-!PhoneticStringUtilities::SoundexStringComparator methodsFor:'api'!
-
-encode:word 
-    |u p t prevCode|
-
-    u := word asUppercase.
-    p := u first asString.
-    prevCode := self translate:u first.
-    u from:2 to:u size do:[:c | 
-        t := self translate:c.
-        (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
-            p := p , t.
-            p size == 4 ifTrue:[^ p ].
-        ].
-        prevCode := t
-    ].
-    [ p size < 4 ] whileTrue:[
-        p := p , '0'
-    ].
-    ^ (p copyFrom:1 to:4)
-
-    "
-     self new encode:'washington' -> 'W252'
-     self new encode:'lee'        -> 'L000'
-     self new encode:'Gutierrez'  -> 'G362'
-     self new encode:'Pfister'    -> 'P236'
-     self new encode:'Jackson'    -> 'J250'
-     self new encode:'Tymczak'    -> 'T522'
-    "
-    
-    "notice:
-     MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
-     self new encode:'Ashcraft'   -> 'A226'
-    "
-
-    "Created: / 28-07-2017 / 15:21:23 / cg"
-    "Modified (comment): / 01-08-2017 / 19:01:43 / cg"
-! !
-
-!PhoneticStringUtilities::SoundexStringComparator methodsFor:'private'!
-
-translate:aCharacter
-    "use simple if's for more speed when compiled"
-
-    "vowels serve as separators"
-    aCharacter == $A ifTrue:[^ '0' ].         
-    aCharacter == $E ifTrue:[^ '0' ].
-    aCharacter == $I ifTrue:[^ '0' ].
-    aCharacter == $O ifTrue:[^ '0' ].
-    aCharacter == $U ifTrue:[^ '0' ].
-    aCharacter == $Y ifTrue:[^ '0' ].
-
-    aCharacter == $B ifTrue:[^ '1' ]. 
-    aCharacter == $P ifTrue:[^ '1' ]. 
-    aCharacter == $F ifTrue:[^ '1' ]. 
-    aCharacter == $V ifTrue:[^ '1' ]. 
-
-    aCharacter == $C ifTrue:[^ '2' ]. 
-    aCharacter == $S ifTrue:[^ '2' ]. 
-    aCharacter == $K ifTrue:[^ '2' ]. 
-    aCharacter == $G ifTrue:[^ '2' ]. 
-    aCharacter == $J ifTrue:[^ '2' ]. 
-    aCharacter == $Q ifTrue:[^ '2' ]. 
-    aCharacter == $X ifTrue:[^ '2' ]. 
-    aCharacter == $Z ifTrue:[^ '2' ]. 
-
-    aCharacter == $D ifTrue:[^ '3' ]. 
-    aCharacter == $T ifTrue:[^ '3' ]. 
-
-    aCharacter == $L ifTrue:[^ '4' ]. 
-
-    aCharacter == $M ifTrue:[^ '5' ]. 
-    aCharacter == $N ifTrue:[^ '5' ]. 
-
-    aCharacter == $R ifTrue:[^ '6' ]. 
-    ^ nil
-! !
-
-!PhoneticStringUtilities::MySQLSoundexStringComparator class methodsFor:'documentation'!
-
-documentation
-"
-    MySQL soundex is like american Soundex (i.e. miracode) without the 4 character limitation,
-    and also removing vokals first, then removing duplicate codes
-    (whereas the soundex code does this in reverse order).
-
-    These variations are important, if you need the miracode soundex codes to be generated.
-"
-! !
-
-!PhoneticStringUtilities::MySQLSoundexStringComparator methodsFor:'api'!
-
-encode:word 
-    |u p t prevCode|
-
-    u := word asUppercase.
-    p := u first asString.
-    prevCode := self translate:u first.
-    u from:2 to:u size do:[:c |
-        t := self translate:c.
-        (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
-            p := p , t.
-        ].
-        (t ~= '0' and:[ c ~= $W and:[c ~= $H]]) ifTrue:[
-            prevCode := t.
-        ].
-    ].
-    [ p size < 4 ] whileTrue:[
-        p := p , '0'
-    ].
-    ^ p
-
-    "Created: / 28-07-2017 / 15:23:41 / cg"
-    "Modified: / 31-07-2017 / 17:53:51 / cg"
-! !
-
-!PhoneticStringUtilities::NYSIISStringComparator class methodsFor:'documentation'!
+!PhoneticStringUtilities::DaitchMokotoffStringComparator class methodsFor:'documentation'!
 
 documentation
 "
-    NYSIIS Algorithm:
-
-    1.
-        remove all ''S'' and ''Z'' chars from the end of the surname 
-
-    2.
-        transcode initial strings
-            MAC => MC
-            PF => F
-
-    3.
-        Transcode trailing strings as follows,
-        
-            IX => IC
-            EX => EC
-            YE,EE,IE => Y
-            NT,ND => D 
-
-    4.
-        transcode ''EV'' to ''EF'' if not at start of name
-
-    5.
-        use first character of name as first character of key 
-
-    6.
-        remove any ''W'' that follows a vowel 
-
-    7.
-        replace all vowels with ''A'' 
-
-    8.
-        transcode ''GHT'' to ''GT'' 
-
-    9.
-        transcode ''DG'' to ''G'' 
-
-    10.
-        transcode ''PH'' to ''F'' 
-
-    11.
-        if not first character, eliminate all ''H'' preceded or followed by a vowel 
-
-    12.
-        change ''KN'' to ''N'', else ''K'' to ''C'' 
-
-    13.
-        if not first character, change ''M'' to ''N'' 
-
-    14.
-        if not first character, change ''Q'' to ''G'' 
-
-    15.
-        transcode ''SH'' to ''S'' 
-
-    16.
-        transcode ''SCH'' to ''S'' 
-
-    17.
-        transcode ''YW'' to ''Y'' 
-
-    18.
-        if not first or last character, change ''Y'' to ''A'' 
-
-    19.
-        transcode ''WR'' to ''R'' 
-
-    20.
-        if not first character, change ''Z'' to ''S'' 
-
-    21.
-        transcode terminal ''AY'' to ''Y'' 
-
-    22.
-        remove traling vowels 
-
-    23.
-        collapse all strings of repeated characters 
-
-    24.
-        if first char of original surname was a vowel, append it to the code
+    self encode:'AUERBACH' -> 097400, 097500
+
+    Encodes a string into a Daitch-Mokotoff Soundex value.
+    The Daitch-Mokotoff Soundex algorithm is a refinement of the Russell and American Soundex algorithms, 
+    yielding greater accuracy in matching especially Slavic and Yiddish surnames with similar pronunciation 
+    but differences in spelling.
+
+    The main differences compared to the other soundex variants are:
+        - coded names are 6 digits long
+        - the initial character of the name is coded
+        - rules to encode multi-character n-grams
+        - multiple possible encodings for the same name (branching)
+
+    This implementation supports branching, depending on the used method:
+        encode:aString             - branching disabled, only the first code will be returned
+        phoneticStringsFor:aString - branching enabled, all codes will be returned, separated by '|'
+
+    [see also:]
+        'Wikipedia - Daitch-Mokotoff Soundex'
+            http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex 
+
+        'Avotaynu - Soundexing and Genealogy'    
+            http://www.avotaynu.com/soundex.htm
 "
-! !
-
-!PhoneticStringUtilities::NYSIISStringComparator methodsFor:'api'!
-
-encode:aString 
-    |k|
-
-    k := self rule1:(aString asUppercase).
-    k := self rule2:k.
-    k := self rule3:k.
-    k := self rule4:k.
-    k := self rule5:k.
-    k := self rule6:k.
-    k := self rule7:k.
-    k := self rule8:k.
-    k := self rule9:k.
-    k := self rule10:k.
-    k := self rule11:k.
-    k := self rule12:k.
-    k := self rule13:k.
-    k := self rule14:k.
-    k := self rule15:k.
-    k := self rule16:k.
-    k := self rule17:k.
-    k := self rule18:k.
-    k := self rule19:k.
-    k := self rule20:k.
-    k := self rule21:k.
-    k := self rule22:k.
-    k := self rule23:k.
-    k := self rule24:k originalKey:aString.
-    ^ k
-
-    "
-     self new encode:'hello'
-     self new encode:'bliss'
-    "
-    "
-     self new phoneticStringsFor:'hello'
-     self new phoneticStringsFor:'bliss'
-    "
-
-    "Created: / 28-07-2017 / 15:34:52 / cg"
-! !
-
-!PhoneticStringUtilities::NYSIISStringComparator methodsFor:'private'!
-
-rule10:key 
-    "10. transcode 'PH' to 'F' "
-    
-    ^ self 
-        transcodeAll:'PH'
-        of:key
-        to:'F'
-        startingAt:1
-!
-
-rule11:key 
-    |k c|
-
-    "11. if not first character, eliminate all 'H' preceded or followed by a vowel "
-    k := key copy.
-    c := SortedCollection sortBlock:[:a :b | b < a ].
-    2 to:key size do:[:i | 
-        (key at:i) = $H ifTrue:[
-            ((key at:i - 1) isVowel 
-                or:[ (i < key size) and:[ (key at:i + 1) isVowel ] ]) ifTrue:[ c add:i ]
-        ]
-    ].
-    c do:[:n | 
-        k := (k copyFrom:1 to:n - 1) , (k copyFrom:n + 1 to:k size)
-    ].
-    ^ k
-!
-
-rule12:key 
-    |k|
-
-    "12. change 'KN' to 'N', else 'K' to 'C' "
-    k := self 
-                transcodeAll:'KN'
-                of:key
-                to:'K'
-                startingAt:1.
-    k := self 
-                transcodeAll:'K'
-                of:k
-                to:'C'
-                startingAt:1.
-    ^ k
-!
-
-rule13:key 
-    "13. if not first character, change 'M' to 'N' "
-    
-    ^ self 
-        transcodeAll:'M'
-        of:key
-        to:'N'
-        startingAt:2
-!
-
-rule14:key 
-    "14. if not first character, change 'Q' to 'G' "
-    
-    ^ self 
-        transcodeAll:'Q'
-        of:key
-        to:'G'
-        startingAt:2
-!
-
-rule15:key 
-    "15. transcode 'SH' to 'S' "
-    
-    ^ self 
-        transcodeAll:'SH'
-        of:key
-        to:'S'
-        startingAt:1
-!
-
-rule16:key 
-    "16. transcode 'SCH' to 'S' "
-    
-    ^ self 
-        transcodeAll:'SCH'
-        of:key
-        to:'S'
-        startingAt:1
-!
-
-rule17:key 
-    "17. transcode 'YW' to 'Y' "
-    
-    ^ self 
-        transcodeAll:'YW'
-        of:key
-        to:'Y'
-        startingAt:1
-!
-
-rule18:key 
-    |k|
-
-    "18. if not first or last character, change 'Y' to 'A' "
-    k := self 
-                transcodeAll:'Y'
-                of:key
-                to:'A'
-                startingAt:2.
-    key last = $Y ifTrue:[
-        k at:k size put:$Y
-    ].
-    ^ k
-!
-
-rule19:key 
-    "19. transcode 'WR' to 'R' "
-    
-    ^ self 
-        transcodeAll:'WR'
-        of:key
-        to:'R'
-        startingAt:1
-!
-
-rule1:key 
-    |k|
-
-    k := key copy.
-     "1. Remove all 'S' and 'Z' chars from the end of the name"
-    [
-        'SZ' includes:k last
-    ] whileTrue:[ k := k copyFrom:1 to:(k size - 1) ].
-    ^ k
-!
-
-rule20:key 
-    "20. if not first character, change 'Z' to 'S' "
-    
-    ^ self 
-        transcodeAll:'Z'
-        of:key
-        to:'S'
-        startingAt:2
 !
 
-rule21:key 
-    "21. transcode terminal 'AY' to 'Y' "
-    
-    ^ self 
-        transcodeAll:'AY'
-        of:key
-        to:'Y'
-        startingAt:key size - 1
-!
-
-rule22:key 
-    |k|
-
-    "22. remove trailing vowels "
-    k := key copy.
-    [ k last isVowel ] whileTrue:[
-        k := k copyFrom:1 to:k size - 1
-    ].
-    ^ k
-!
-
-rule23:key 
-    |k c|
-
-    "23. collapse all strings of repeated characters "
-    k := key copy.
-    c := SortedCollection sortBlock:[:a :b | b < a ].
-    k size to:2 do:[:i | 
-        (k at:i) = (k at:i - 1) ifTrue:[
-            c add:i
-        ]
-    ].
-    c do:[:n | 
-        k := (k copyFrom:1 to:n - 1) , (k copyFrom:n + 1 to:k size)
-    ].
-    ^ k
-!
-
-rule24:key originalKey:originalKey 
-    |k|
-
-    "24. if first char of original surname was a vowel, append it to the code"
-    k := key copy.
-    originalKey first isVowel ifTrue:[
-        k := k , originalKey first asString asUppercase
-    ].
-    ^ k
-!
-
-rule2:key 
-    |k|
-
-    k := key copy.
-     "2. Transcode initial strings:  MAC => MC   PF => F"
-    (k startsWith:'MAC') ifTrue:[
-        k := 'MC' , (k copyFrom:4)
-    ].
-    (k startsWith:'PF') ifTrue:[
-        k := 'F' , (k copyFrom:3)
-    ].
-    ^ k
-!
-
-rule3:key 
-    |k|
-
-    "3. Transcode trailing strings as follows:
-        IX => IC
-          EX => EC
-          YE, EE, IE => Y
-           NT, ND => D"
-    k := key copy.
-    k := self 
-                transcodeTrailing:#( 'IX' )
-                of:k
-                to:'IC'.
-    k := self 
-                transcodeTrailing:#( 'EX' )
-                of:k
-                to:'EC'.
-    k := self 
-                transcodeTrailing:#( 'YE' 'EE' 'IE' )
-                of:k
-                to:'Y'.
-    k := self 
-                transcodeTrailing:#( 'NT' 'ND' )
-                of:k
-                to:'D'.
-    ^ k
-!
-
-rule4:key 
-    "4. Transcode 'EV' to 'EF' if not at start of name"
-    
-    ^ self 
-        transcodeAll:'EV'
-        of:key
-        to:'EF'
-        startingAt:2
-!
-
-rule5:key 
-    "5. Use first character of name as first character of key.  Ignored because we're doing an in-place conversion"
-    
-    ^ key
-!
-
-rule6:key 
-    |k i|
-
-    "6. Remove any 'W' that follows a vowel"
-    k := key copy.
-    i := 2.
-    [
-        (i := k indexOf:$W startingAt:i) > 0
-    ] whileTrue:[
-        (k at:i - 1) isVowel ifTrue:[
-            k := (k copyFrom:1 to:i - 1) , (k copyFrom:i + 1 to:k size).
-            i := i - 1
-        ]
-    ].
-    ^ k
-!
-
-rule7:key 
-    |k|
-
-    "7. replace all vowels with 'A' "
-    k := key copy.
-    1 to:key size do:[:i | 
-        (key at:i) isVowel ifTrue:[
-            k at:i put:$A
-        ]
-    ].
-    ^ k
-!
-
-rule8:key 
-    "8. transcode 'GHT' to 'GT' "
-    
-    ^ self 
-        transcodeAll:'GHT'
-        of:key
-        to:'GT'
-        startingAt:1
-!
-
-rule9:key 
-    "9. transcode 'DG' to 'G' "
-    
-    ^ self 
-        transcodeAll:'DG'
-        of:key
-        to:'G'
-        startingAt:1
-!
-
-transcodeAll:aString of:key to:replacementString startingAt:start 
-    |k i|
-
-    k := key copy.
-    [
-        (i := k indexOfSubCollection:aString startingAt:start) > 0
-    ] whileTrue:[
-        k := (k copyFrom:1 to:i - 1) , replacementString 
-                    , (k copyFrom:i + aString size to:k size)
-    ].
-    ^ k
-!
-
-transcodeTrailing:anArrayOfStrings of:key to:replacementString 
-    |answer|
-
-    answer := key copy.
-    anArrayOfStrings do:[:aString | 
-        answer := self 
-                    transcodeAll:aString
-                    of:answer
-                    to:replacementString
-                    startingAt:(answer size - aString size) + 1
-    ].
-    ^ answer
-! !
-
-!PhoneticStringUtilities::PhonemStringComparator class methodsFor:'documentation'!
-
-documentation
-"
-    Implementation of the PHONEM algorithm, as described in
-    'Georg Wilde and Carsten Meyer, Doppelgaenger gesucht -
-    Ein Programm fuer kontextsensitive phonetische Textumwandlung
-    ct Magazin fuer Computer & Technik 25/1998'
-    
-    This algorithm deals better with the german language (it cares for umlauts)
-"
-! !
-
-!PhoneticStringUtilities::PhonemStringComparator methodsFor:'api'!
-
-encode:aString 
-    |s idx t t2|
-
-    s := aString asUppercase.
-
-    idx := 1.
-    [idx < (s size-1)] whileTrue:[
-        t2 := nil.
-        t := s copyFrom:idx to:idx+1.
-        t = 'SC' ifTrue:[ t2 := 'C' ]
-        ifFalse:[ t = 'SZ' ifTrue:[ t2 := 'C' ]
-        ifFalse:[ t = 'CZ' ifTrue:[ t2 := 'C' ]
-        ifFalse:[ t = 'TZ' ifTrue:[ t2 := 'C' ]
-        ifFalse:[ t = 'TS' ifTrue:[ t2 := 'C' ]
-        ifFalse:[ t = 'KS' ifTrue:[ t2 := 'X' ]
-        ifFalse:[ t = 'PF' ifTrue:[ t2 := 'V' ]
-        ifFalse:[ t = 'QU' ifTrue:[ t2 := 'KW' ]
-        ifFalse:[ t = 'PH' ifTrue:[ t2 := 'V' ]
-        ifFalse:[ t = 'UE' ifTrue:[ t2 := 'Y' ]
-        ifFalse:[ t = 'AE' ifTrue:[ t2 := 'E' ]
-        ifFalse:[ t = 'OE' ifTrue:[ t2 := 'Ö' ]
-        ifFalse:[ t = 'EI' ifTrue:[ t2 := 'AY' ]
-        ifFalse:[ t = 'EY' ifTrue:[ t2 := 'AY' ]
-        ifFalse:[ t = 'EU' ifTrue:[ t2 := 'OY' ]
-        ifFalse:[ t = 'AU' ifTrue:[ t2 := 'A§' ]
-        ifFalse:[ t = 'OU' ifTrue:[ t2 := '§ ' ]]]]]]]]]]]]]]]]].
-        t2 notNil ifTrue:[
-            s := (s copyTo:idx-1),t2,(s copyFrom:idx+2)
-        ] ifFalse:[
-            idx := idx + 1.
-        ].
-    ].
-
-    "/ single character substitutions via tr
-    s := s copyTransliterating:'ÖÄZKGQÜIJFWPT§' to:'YECCCCYYYVVDDUA'.
-    s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'' complement:true squashDuplicates:false.
-    s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'ABCDLMNORSUVWXY' complement:false squashDuplicates:true.
-    ^ s
-
-    "
-     self basicNew encode:'müller'  -> 'MYLR'    
-     self basicNew encode:'mueller' -> 'MYLR'    
-     self basicNew encode:'möller'  -> 'MYLR'
-     self basicNew encode:'miller'  -> 'MYLR'     
-     self basicNew encode:'muller'  -> 'MULR' 
-     self basicNew encode:'muler'   -> 'MULR' 
-
-     self basicNew phoneticStringsFor:'müller'  #('MYLR')    
-     self basicNew phoneticStringsFor:'mueller' #('MYLR')    
-     self basicNew phoneticStringsFor:'möller'  #('MYLR')
-     self basicNew phoneticStringsFor:'miller'  #('MYLR')     
-     self basicNew phoneticStringsFor:'muller'  #('MULR') 
-     self basicNew phoneticStringsFor:'muler'   #('MULR') 
-     
-     self basicNew phoneticStringsFor:'schmidt'     #('CMYD')
-     self basicNew phoneticStringsFor:'schneider'   #('CNAYDR')
-     self basicNew phoneticStringsFor:'fischer'     #('VYCR')
-     self basicNew phoneticStringsFor:'weber'       #('VBR')
-     self basicNew phoneticStringsFor:'weeber'      #('VBR')
-     self basicNew phoneticStringsFor:'webber'      #('VBR')
-     self basicNew phoneticStringsFor:'wepper'      #('VBR')
-     
-     self basicNew phoneticStringsFor:'meyer'       #('MAYR')
-     self basicNew phoneticStringsFor:'maier'       #('MAYR')
-     self basicNew phoneticStringsFor:'mayer'       #('MAYR')
-     self basicNew phoneticStringsFor:'mayr'        #('MAYR')
-     self basicNew phoneticStringsFor:'meir'        #('MAYR')
-     
-     self basicNew phoneticStringsFor:'wagner'      #('VACNR')
-     self basicNew phoneticStringsFor:'schulz'      #('CULC')
-     self basicNew phoneticStringsFor:'becker'      #('BCR')
-     self basicNew phoneticStringsFor:'hoffmann'    #('OVMAN')
-     self basicNew phoneticStringsFor:'haus'        #('AUS')
-     
-     self basicNew phoneticStringsFor:'schäfer'     #('CVR')
-     self basicNew phoneticStringsFor:'scheffer'    #('CVR')
-     self basicNew phoneticStringsFor:'schaeffer'   #('CVR')
-     self basicNew phoneticStringsFor:'schaefer'    #('CVR')
-    "
-
-    "Created: / 28-07-2017 / 15:38:08 / cg"
+javaCode
+"<<END
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.codec.language;
+
+import org.apache.commons.codec.CharEncoding;
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoder;
+
+import java.io.InputStream;
+import java.util.*;
+
+/**
+ * Encodes a string into a Daitch-Mokotoff Soundex value.
+ * <p>
+ * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
+ * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
+ * </p>
+ * <p>
+ * The main differences compared to the other soundex variants are:
+ * </p>
+ * <ul>
+ * <li>coded names are 6 digits long
+ * <li>the initial character of the name is coded
+ * <li>rules to encoded multi-character n-grams
+ * <li>multiple possible encodings for the same name (branching)
+ * </ul>
+ * <p>
+ * This implementation supports branching, depending on the used method:
+ * <ul>
+ * <li>{@link #encode(String)} - branching disabled, only the first code will be returned
+ * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
+ * </ul>
+ * <p>
+ * Note: this implementation has additional branching rules compared to the original description of the algorithm. The
+ * rules can be customized by overriding the default rules contained in the resource file
+ * {@code org/apache/commons/codec/language/dmrules.txt}.
+ * </p>
+ * <p>
+ * This class is thread-safe.
+ * </p>
+ *
+ * @see Soundex
+ * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
+ * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
+ *
+ * @version $Id$
+ * @since 1.10
+ */
+public class DaitchMokotoffSoundex implements StringEncoder {
+
+    /**
+     * Inner class representing a branch during DM soundex encoding.
+     */
+    private static final class Branch {
+        private final StringBuilder builder;
+        private String cachedString;
+        private String lastReplacement;
+
+        private Branch() {
+            builder = new StringBuilder();
+            lastReplacement = null;
+            cachedString = null;
+        }
+
+        /**
+         * Creates a new branch, identical to this branch.
+         *
+         * @return a new, identical branch
+         */
+        public Branch createBranch() {
+            final Branch branch = new Branch();
+            branch.builder.append(toString());
+            branch.lastReplacement = this.lastReplacement;
+            return branch;
+        }
+
+        @Override
+        public boolean equals(final Object other) {
+            if (this == other) {
+                return true;
+            }
+            if (!!(other instanceof Branch)) {
+                return false;
+            }
+
+            return toString().equals(((Branch) other).toString());
+        }
+
+        /**
+         * Finish this branch by appending '0's until the maximum code length has been reached.
+         */
+        public void finish() {
+            while (builder.length() < MAX_LENGTH) {
+                builder.append('0');
+                cachedString = null;
+            }
+        }
+
+        @Override
+        public int hashCode() {
+            return toString().hashCode();
+        }
+
+        /**
+         * Process the next replacement to be added to this branch.
+         *
+         * @param replacement
+         *            the next replacement to append
+         * @param forceAppend
+         *            indicates if the default processing shall be overridden
+         */
+        public void processNextReplacement(final String replacement, final boolean forceAppend) {
+            final boolean append = lastReplacement == null || !!lastReplacement.endsWith(replacement) || forceAppend;
+
+            if (append && builder.length() < MAX_LENGTH) {
+                builder.append(replacement);
+                // remove all characters after the maximum length
+                if (builder.length() > MAX_LENGTH) {
+                    builder.delete(MAX_LENGTH, builder.length());
+                }
+                cachedString = null;
+            }
+
+            lastReplacement = replacement;
+        }
+
+        @Override
+        public String toString() {
+            if (cachedString == null) {
+                cachedString = builder.toString();
+            }
+            return cachedString;
+        }
+    }
+
+    /**
+     * Inner class for storing rules.
+     */
+    private static final class Rule {
+        private final String pattern;
+        private final String[] replacementAtStart;
+        private final String[] replacementBeforeVowel;
+        private final String[] replacementDefault;
+
+        protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
+                final String replacementDefault) {
+            this.pattern = pattern;
+            this.replacementAtStart = replacementAtStart.split("\\|");
+            this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
+            this.replacementDefault = replacementDefault.split("\\|");
+        }
+
+        public int getPatternLength() {
+            return pattern.length();
+        }
+
+        public String[] getReplacements(final String context, final boolean atStart) {
+            if (atStart) {
+                return replacementAtStart;
+            }
+
+            final int nextIndex = getPatternLength();
+            final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false;
+            if (nextCharIsVowel) {
+                return replacementBeforeVowel;
+            }
+
+            return replacementDefault;
+        }
+
+        private boolean isVowel(final char ch) {
+            return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
+        }
+
+        public boolean matches(final String context) {
+            return context.startsWith(pattern);
+        }
+
+        @Override
+        public String toString() {
+            return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
+                    Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
+        }
+    }
+
+    private static final String COMMENT = "//";
+    private static final String DOUBLE_QUOTE = "\"";
+
+    private static final String MULTILINE_COMMENT_END = "*/";
+
+    private static final String MULTILINE_COMMENT_START = "/*";
+
+    /** The resource file containing the replacement and folding rules */
+    private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";
+
+    /** The code length of a DM soundex value. */
+    private static final int MAX_LENGTH = 6;
+
+    /** Transformation rules indexed by the first character of their pattern. */
+    private static final Map<Character, List<Rule>> RULES = new HashMap<Character, List<Rule>>();
+
+    /** Folding rules. */
+    private static final Map<Character, Character> FOLDINGS = new HashMap<Character, Character>();
+
+    static {
+        final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE);
+        if (rulesIS == null) {
+            throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE);
+        }
+
+        final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8);
+        parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
+        scanner.close();
+
+        // sort RULES by pattern length in descending order
+        for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) {
+            final List<Rule> ruleList = rule.getValue();
+            Collections.sort(ruleList, new Comparator<Rule>() {
+                @Override
+                public int compare(final Rule rule1, final Rule rule2) {
+                    return rule2.getPatternLength() - rule1.getPatternLength();
+                }
+            });
+        }
+    }
+
+    private static void parseRules(final Scanner scanner, final String location,
+            final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
+        int currentLine = 0;
+        boolean inMultilineComment = false;
+
+        while (scanner.hasNextLine()) {
+            currentLine++;
+            final String rawLine = scanner.nextLine();
+            String line = rawLine;
+
+            if (inMultilineComment) {
+                if (line.endsWith(MULTILINE_COMMENT_END)) {
+                    inMultilineComment = false;
+                }
+                continue;
+            }
+
+            if (line.startsWith(MULTILINE_COMMENT_START)) {
+                inMultilineComment = true;
+            } else {
+                // discard comments
+                final int cmtI = line.indexOf(COMMENT);
+                if (cmtI >= 0) {
+                    line = line.substring(0, cmtI);
+                }
+
+                // trim leading-trailing whitespace
+                line = line.trim();
+
+                if (line.length() == 0) {
+                    continue; // empty lines can be safely skipped
+                }
+
+                if (line.contains("=")) {
+                    // folding
+                    final String[] parts = line.split("=");
+                    if (parts.length !!= 2) {
+                        throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
+                                " parts: " + rawLine + " in " + location);
+                    } else {
+                        final String leftCharacter = parts[0];
+                        final String rightCharacter = parts[1];
+
+                        if (leftCharacter.length() !!= 1 || rightCharacter.length() !!= 1) {
+                            throw new IllegalArgumentException("Malformed folding statement - " +
+                                    "patterns are not single characters: " + rawLine + " in " + location);
+                        }
+
+                        asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
+                    }
+                } else {
+                    // rule
+                    final String[] parts = line.split("\\s+");
+                    if (parts.length !!= 4) {
+                        throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
+                                " parts: " + rawLine + " in " + location);
+                    } else {
+                        try {
+                            final String pattern = stripQuotes(parts[0]);
+                            final String replacement1 = stripQuotes(parts[1]);
+                            final String replacement2 = stripQuotes(parts[2]);
+                            final String replacement3 = stripQuotes(parts[3]);
+
+                            final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
+                            final char patternKey = r.pattern.charAt(0);
+                            List<Rule> rules = ruleMapping.get(patternKey);
+                            if (rules == null) {
+                                rules = new ArrayList<Rule>();
+                                ruleMapping.put(patternKey, rules);
+                            }
+                            rules.add(r);
+                        } catch (final IllegalArgumentException e) {
+                            throw new IllegalStateException(
+                                    "Problem parsing line '" + currentLine + "' in " + location, e);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    private static String stripQuotes(String str) {
+        if (str.startsWith(DOUBLE_QUOTE)) {
+            str = str.substring(1);
+        }
+
+        if (str.endsWith(DOUBLE_QUOTE)) {
+            str = str.substring(0, str.length() - 1);
+        }
+
+        return str;
+    }
+
+    /** Whether to use ASCII folding prior to encoding. */
+    private final boolean folding;
+
+    /**
+     * Creates a new instance with ASCII-folding enabled.
+     */
+    public DaitchMokotoffSoundex() {
+        this(true);
+    }
+
+    /**
+     * Creates a new instance.
+     * <p>
+     * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g.
+     * è -&gt; e.
+     * </p>
+     *
+     * @param folding
+     *            if ASCII-folding shall be performed before encoding
+     */
+    public DaitchMokotoffSoundex(final boolean folding) {
+        this.folding = folding;
+    }
+
+    /**
+     * Performs a cleanup of the input string before the actual soundex transformation.
+     * <p>
+     * Removes all whitespace characters and performs ASCII folding if enabled.
+     * </p>
+     *
+     * @param input
+     *            the input string to cleanup
+     * @return a cleaned up string
+     */
+    private String cleanup(final String input) {
+        final StringBuilder sb = new StringBuilder();
+        for (char ch : input.toCharArray()) {
+            if (Character.isWhitespace(ch)) {
+                continue;
+            }
+
+            ch = Character.toLowerCase(ch);
+            if (folding && FOLDINGS.containsKey(ch)) {
+                ch = FOLDINGS.get(ch);
+            }
+            sb.append(ch);
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
+     * <p>
+     * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
+     * EncoderException if the supplied object is not of type java.lang.String.
+     * </p>
+     *
+     * @see #soundex(String)
+     *
+     * @param obj
+     *            Object to encode
+     * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String
+     *         supplied.
+     * @throws EncoderException
+     *             if the parameter supplied is not of type java.lang.String
+     * @throws IllegalArgumentException
+     *             if a character is not mapped
+     */
+    @Override
+    public Object encode(final Object obj) throws EncoderException {
+        if (!!(obj instanceof String)) {
+            throw new EncoderException(
+                    "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
+        }
+        return encode((String) obj);
+    }
+
+    /**
+     * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
+     *
+     * @see #soundex(String)
+     *
+     * @param source
+     *            A String object to encode
+     * @return A DM Soundex code corresponding to the String supplied
+     * @throws IllegalArgumentException
+     *             if a character is not mapped
+     */
+    @Override
+    public String encode(final String source) {
+        if (source == null) {
+            return null;
+        }
+        return soundex(source, false)[0];
+    }
+
+    /**
+     * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
+     * <p>
+     * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
+     * separated by '|'.
+     * </p>
+     * <p>
+     * Example: the name "AUERBACH" is encoded as both
+     * </p>
+     * <ul>
+     * <li>097400</li>
+     * <li>097500</li>
+     * </ul>
+     * <p>
+     * Thus the result will be "097400|097500".
+     * </p>
+     *
+     * @param source
+     *            A String object to encode
+     * @return A string containing a set of DM Soundex codes corresponding to the String supplied
+     * @throws IllegalArgumentException
+     *             if a character is not mapped
+     */
+    public String soundex(final String source) {
+        final String[] branches = soundex(source, true);
+        final StringBuilder sb = new StringBuilder();
+        int index = 0;
+        for (final String branch : branches) {
+            sb.append(branch);
+            if (++index < branches.length) {
+                sb.append('|');
+            }
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Perform the actual DM Soundex algorithm on the input string.
+     *
+     * @param source
+     *            A String object to encode
+     * @param branching
+     *            If branching shall be performed
+     * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
+     *         selected branching mode
+     */
+    private String[] soundex(final String source, final boolean branching) {
+        if (source == null) {
+            return null;
+        }
+
+        final String input = cleanup(source);
+
+        final Set<Branch> currentBranches = new LinkedHashSet<Branch>();
+        currentBranches.add(new Branch());
+
+        char lastChar = '\0';
+        for (int index = 0; index < input.length(); index++) {
+            final char ch = input.charAt(index);
+
+            // ignore whitespace inside a name
+            if (Character.isWhitespace(ch)) {
+                continue;
+            }
+
+            final String inputContext = input.substring(index);
+            final List<Rule> rules = RULES.get(ch);
+            if (rules == null) {
+                continue;
+            }
+
+            // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
+            @SuppressWarnings("unchecked")
+            final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.EMPTY_LIST;
+
+            for (final Rule rule : rules) {
+                if (rule.matches(inputContext)) {
+                    if (branching) {
+                        nextBranches.clear();
+                    }
+                    final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
+                    final boolean branchingRequired = replacements.length > 1 && branching;
+
+                    for (final Branch branch : currentBranches) {
+                        for (final String nextReplacement : replacements) {
+                            // if we have multiple replacements, always create a new branch
+                            final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
+
+                            // special rule: occurrences of mn or nm are treated differently
+                            final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');
+
+                            nextBranch.processNextReplacement(nextReplacement, force);
+
+                            if (branching) {
+                                nextBranches.add(nextBranch);
+                            } else {
+                                break;
+                            }
+                        }
+                    }
+
+                    if (branching) {
+                        currentBranches.clear();
+                        currentBranches.addAll(nextBranches);
+                    }
+                    index += rule.getPatternLength() - 1;
+                    break;
+                }
+            }
+
+            lastChar = ch;
+        }
+
+        final String[] result = new String[currentBranches.size()];
+        int index = 0;
+        for (final Branch branch : currentBranches) {
+            branch.finish();
+            result[index++] = branch.toString();
+        }
+
+        return result;
+    }
+}
+END>>"
 ! !
 
 !PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'LICENSE'!
@@ -3140,6 +2693,2281 @@
     "Modified: / 28-07-2017 / 11:35:12 / cg"
 ! !
 
+!PhoneticStringUtilities::ExtendedSoundexStringComparator class methodsFor:'documentation'!
+
+documentation
+"
+    There are many extended and enhanced soundex variants around;
+    here is one, called 'extended soundex'. It is described for example in
+    http://www.epidata.dk/documentation.php.
+    An author or origin is unknown.
+
+    The number of digits is increased to 5 or 8;
+    The first character is not used literally; instead it is encoded like the rest.
+    This might have a negative effect on names starting with a vowel, though
+    (vowels only act as separators and never show up in the generated code).
+
+    Overall, it can be doubted if this is really an enhancement after all.
+"
+! !
+
+!PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'api'!
+
+phoneticStringsFor:aString
+    "Answer a two-element Array with the extended soundex codes of aString:
+     first the code of length 5, then the code of length 8.
+     Every character of the uppercased input is mapped via #translate:.
+     A character contributes a digit only if its code is neither nil nor the
+     separator '0' and differs from the code of the character right before it.
+     Both codes share the same digits; the short one is simply the first 5
+     digits of the long one. Missing digits are filled up with '0'."
+
+    |digits t prevCode|
+
+    digits := ''.
+    aString asUppercase do:[:c |
+        t := self translate:c.
+        (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
+            digits := digits , t.
+            "/ 8 digits collected - the rest of the input cannot change the result
+            digits size == 8 ifTrue:[
+                ^ Array with:(digits copyTo:5) with:digits
+            ].
+        ].
+        "/ remember nil and '0' codes as well, so that a separator between two
+        "/ equal codes lets the second one through
+        prevCode := t
+    ].
+    [ digits size < 8 ] whileTrue:[
+        digits := digits , '0'
+    ].
+    "/ always cut the short code to 5 digits; an input yielding 6 or 7 digits
+    "/ would otherwise produce a first element which is too long
+    ^ Array with:(digits copyTo:5) with:digits
+
+    "
+     self basicNew phoneticStringsFor:'müller'  #('87900' '87900000')
+     self basicNew phoneticStringsFor:'miller'  #('87900' '87900000')
+     self basicNew phoneticStringsFor:'muller'  #('87900' '87900000')
+     self basicNew phoneticStringsFor:'muler'   #('87900' '87900000')
+     self basicNew phoneticStringsFor:'schmidt'    #('38600' '38600000')
+     self basicNew phoneticStringsFor:'schneider'  #('38690' '38690000')
+     self basicNew phoneticStringsFor:'fischer'    #('23900' '23900000')
+     self basicNew phoneticStringsFor:'weber'      #('19000' '19000000')
+     self basicNew phoneticStringsFor:'meyer'      #('89000' '89000000')
+     self basicNew phoneticStringsFor:'wagner'     #('48900' '48900000')
+     self basicNew phoneticStringsFor:'schulz'     #('37500' '37500000')
+     self basicNew phoneticStringsFor:'becker'     #('13900' '13900000')
+     self basicNew phoneticStringsFor:'hoffmann'   #('28800' '28800000')
+     self basicNew phoneticStringsFor:'schäfer'    #('32900' '32900000')
+     self basicNew phoneticStringsFor:'lindenberg' #('78681' '78681940')
+    "
+! !
+
+!PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'private'!
+
+translate:aCharacter
+    "Answer the one-digit extended soundex code (a String) for aCharacter:
+     '0' for the vowels and Y, '1' .. '9' for the consonant groups below,
+     and nil for every other character. Only uppercase letters are tested.
+     Uses simple ifs for more speed when compiled."
+
+    "vowels serve as separators"
+    aCharacter == $A ifTrue:[^ '0' ].
+    aCharacter == $E ifTrue:[^ '0' ].
+    aCharacter == $I ifTrue:[^ '0' ].
+    aCharacter == $O ifTrue:[^ '0' ].
+    aCharacter == $U ifTrue:[^ '0' ].
+    aCharacter == $Y ifTrue:[^ '0' ].
+
+    aCharacter == $B ifTrue:[^ '1' ].
+    aCharacter == $P ifTrue:[^ '1' ].
+
+    aCharacter == $F ifTrue:[^ '2' ].
+    aCharacter == $V ifTrue:[^ '2' ].
+
+    aCharacter == $C ifTrue:[^ '3' ].
+    aCharacter == $S ifTrue:[^ '3' ].
+    aCharacter == $K ifTrue:[^ '3' ].
+
+    aCharacter == $G ifTrue:[^ '4' ].
+    aCharacter == $J ifTrue:[^ '4' ].
+
+    aCharacter == $Q ifTrue:[^ '5' ].
+    aCharacter == $X ifTrue:[^ '5' ].
+    aCharacter == $Z ifTrue:[^ '5' ].
+
+    "/ G is not tested again in this group: it has already been answered
+    "/ as '4' above, so a second test here could never be reached
+    aCharacter == $D ifTrue:[^ '6' ].
+    aCharacter == $T ifTrue:[^ '6' ].
+
+    aCharacter == $L ifTrue:[^ '7' ].
+
+    aCharacter == $M ifTrue:[^ '8' ].
+    aCharacter == $N ifTrue:[^ '8' ].
+
+    aCharacter == $R ifTrue:[^ '9' ].
+    ^ nil
+! !
+
+!PhoneticStringUtilities::SingleResultPhoneticStringComparator class methodsFor:'documentation'!
+
+documentation
+"
+    Abstract superclass for phonetic comparators which produce exactly one
+    code per word.
+    Concrete subclasses implement #encode:, which answers that single code;
+    #phoneticStringsFor: is implemented here and wraps the result of
+    #encode: into a one-element Array.
+
+    [author:]
+        cg
+
+    [instance variables:]
+
+    [class variables:]
+
+    [see also:]
+
+"
+! !
+
+!PhoneticStringUtilities::SingleResultPhoneticStringComparator methodsFor:'api'!
+
+encode:word
+    "answer the single phonetic code for word;
+     must be redefined by concrete subclasses"
+
+    ^ self subclassResponsibility
+
+    "Created: / 28-07-2017 / 15:20:49 / cg"
+!
+
+phoneticStringsFor:word 
+    "answer a one-element Array holding the code which #encode: generates for word"
+
+    |code|
+
+    code := self encode:word.
+    ^ Array with:code
+
+    "Created: / 28-07-2017 / 15:20:38 / cg"
+! !
+
+!PhoneticStringUtilities::MRAStringComparator class methodsFor:'documentation'!
+
+documentation
+"
+    Match Rating Approach Encoder
+
+    The Western Airlines matching rating approach name encoder
+
+    [see also:]
+        https://en.wikipedia.org/wiki/Match_Rating_Approach
+        
+        G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery,
+            ''Accessing Individual Records from Personal Data Files Using Nonunique Identifiers'' 
+            US National Institute of Standards and Technology, SP-500-2 (1977), p. 17.
+"
+!
+
+rCode
+"<<END
+## Copyright (c) 2015, James P. Howard, II <jh@jameshoward.us>
+##
+## Redistribution and use in source and binary forms, with or without
+## modification, are permitted provided that the following conditions are
+## met:
+##
+##     Redistributions of source code must retain the above copyright
+##     notice, this list of conditions and the following disclaimer.
+##
+##     Redistributions in binary form must reproduce the above copyright
+##     notice, this list of conditions and the following disclaimer in
+##     the documentation and/or other materials provided with the
+##     distribution.
+##
+## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+## DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+## THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#' @rdname mra
+#' @title Match Rating Approach Encoder
+#'
+#' @description
+#' The Western Airlines matching rating approach name encoder
+#'
+#' @param word string or vector of strings to encode
+#' @param x MRA-encoded character vector
+#' @param y MRA-encoded character vector
+#'
+#' @details
+#'
+#' The variable \code{word} is the name to be encoded.  The variable
+#' \code{maxCodeLen} is \emph{not} supported in this algorithm encoder
+#' because the algorithm itself is dependent upon its six-character
+#' length.  The variables \code{x} and \code{y} are MRA-encoded and are
+#' compared to each other using the MRA comparison specification.
+#'
+#' @return The \code{mra_encode} function returns match rating approach
+#' encoded character vector.  The \code{mra_compare} returns a boolean
+#' vector which is \code{TRUE} if \code{x} and \code{y} pass the MRA
+#' comparison test.
+#'
+#' @references
+#'
+#' G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery,
+#' \emph{Accessing Individual Records from Personal Data Files Using
+#' Nonunique Identifiers,} US National Institute of Standards and
+#' Technology, SP-500-2 (1977), p. 17.
+#'
+#' @family phonics
+#'
+#' @examples
+#' mra_encode("William")
+#' mra_encode(c("Peter", "Peady"))
+#' mra_encode("Stevenson")
+
+#' @rdname mra
+#' @name mra_encode
+#' @export
+mra_encode <- function(word) {
+
+    ## First, remove any nonalphabetical characters and uppercase it
+    word <- gsub("[^[:alpha:]]*", "", word)
+    word <- toupper(word)
+
+    ## First character of key = first character of name
+    first <- substr(word, 1, 1)
+    word <- substr(word, 2, nchar(word))
+
+    ## Delete vowels not at the start of the word
+    word <- gsub("[AEIOU]", "", word)
+    word <- paste(first, word, sep = "")
+
+    ## Remove duplicate consecutive characters
+    word <- gsub("([A-Z])\\1+", "\\1", word)
+
+    ## If longer than 6 characters, take first and last 3...and we have
+    ## to vectorize it
+    for(i in 1:length(word)) {
+        if((l = nchar(word[i])) > 6) {
+            first <- substr(word[i], 1, 3)
+            last <- substr(word[i], l - 2, l)
+            word[i] <- paste(first, last, sep = "");
+        }
+    }
+
+    return(word)
+}
+
+#' @rdname mra
+#' @name mra_compare
+#' @export
+mra_compare <- function(x, y) {
+    mra <- data.frame(x = x, y = y, sim = 0, min = 100, stringsAsFactors = FALSE)
+
+    ## Obtain the minimum rating value by calculating the length sum of
+    ## the encoded strings and using table A (from Wikipedia).  We start
+    ## by setting the minimum to be the sum and move from there.
+    mra$lensum <- nchar(mra$x) + nchar(mra$y)
+    mra$min[mra$lensum == 12] <- 2
+    mra$min[mra$lensum > 7 && mra$lensum <= 11] <- 3
+    mra$min[mra$lensum > 4 && mra$lensum <= 7] <- 4
+    mra$min[mra$lensum <= 4] <- 5
+
+    ## If the length difference between the encoded strings is 3 or
+    ## greater, then no similarity comparison is done.  For us, we
+    ## continue the similarity comparison out of laziness and ensure the
+    ## minimum is impossibly high to meet.
+    mra$min[abs(nchar(mra$x) - nchar(mra$y)) >= 3] <- 100
+
+    ## Start the comparison.
+    x <- strsplit(mra$x, split = "")
+    y <- strsplit(mra$y, split = "")
+    rows <- nrow(mra)
+    for(i in 1:rows) {
+        ## Process the encoded strings from left to right and remove any
+        ## identical characters found from both strings respectively.
+        j <- 1
+        while(j < min(length(x[[i]]), length(y[[i]]))) {
+            if(x[[i]][j] == y[[i]][j]) {
+                x[[i]] <- x[[i]][-j]
+                y[[i]] <- y[[i]][-j]
+            } else
+                j <- j + 1
+        }
+
+        ## Process the unmatched characters from right to left and
+        ## remove any identical characters found from both names
+        ## respectively.
+        x[[i]] <- rev(x[[i]])
+        y[[i]] <- rev(y[[i]])
+        j <- 1
+        while(j < min(length(x[[i]]), length(y[[i]]))) {
+            if(x[[i]][j] == y[[i]][j]) {
+                x[[i]] <- x[[i]][-j]
+                y[[i]] <- y[[i]][-j]
+            } else
+                j <- j + 1
+        }
+        ## Subtract the number of unmatched characters from 6 in the
+        ## longer string. This is the similarity rating.
+        len <- min(length(x[[i]]), length(y[[i]]))
+        mra$sim[i] <- 6 - len
+    }
+
+    ## If the similarity is greater than or equal to the minimum
+    ## required, it is a successful match.
+    mra$match <- (mra$sim >= mra$min)
+    return(mra$match)
+}
+
+END>>
+! !
+
+!PhoneticStringUtilities::MRAStringComparator methodsFor:'api'!
+
+encode:wordIn 
+    "Answer the Match Rating Approach code of wordIn
+     (see https://en.wikipedia.org/wiki/Match_Rating_Approach):
+     non-letters are dropped and the remaining letters are uppercased,
+     vowels after the first character are removed,
+     runs of equal consecutive characters are collapsed into one,
+     and a result longer than 6 characters is cut to its first 3 plus its last 3.
+     Answers an empty string if wordIn contains no letter at all."
+    
+    |word prev|
+
+    "/ First, remove any nonalphabetical characters and uppercase it
+
+    word := wordIn select:#isLetter thenCollect:#asUppercase.
+
+    "/ nothing left to encode; without this guard, the access to the
+    "/ first character below would fail for an empty or letter-free input
+
+    word isEmpty ifTrue:[^ ''].
+
+    "/ Delete vowels not at the start of the word
+
+    word := word first asString , ((word from:2) reject:#isVowel).
+
+    "/ Remove duplicate consecutive characters.
+    "/ A repeated character is first replaced by the marker $* and then filtered out;
+    "/ the marker cannot clash with the input, which consists of letters only here.
+    "/ Equality (=) instead of identity is used, so that the test does not depend
+    "/ on equal characters being the same object.
+
+    prev := nil.
+    word := word 
+                collect:[:char |
+                    char = prev ifTrue:[
+                        $*
+                    ] ifFalse:[
+                        prev := char.
+                        char
+                    ]
+                ]
+                thenSelect:[:char | char ~= $*].
+
+    "/ If longer than 6 characters, take first and last 3            
+    word size > 6 ifTrue:[
+        word := (word copyFirst:3),(word copyLast:3)
+    ].
+    ^ word.
+
+    "
+     self new encode:'Catherine'            -> 'CTHRN'
+     self new encode:'CatherineCatherine'   -> 'CTHHRN'
+     self new encode:'Butter'               -> 'BTR'
+     self new encode:'Byrne'                -> 'BYRN'
+     self new encode:'Boern'                -> 'BRN'
+     self new encode:'Smith'                -> 'SMTH'
+     self new encode:'Smyth'                -> 'SMYTH'
+     self new encode:'Kathryn'              -> 'KTHRYN'
+     self new encode:''                     -> ''
+     self new encode:'123'                  -> ''
+    "
+
+    "Created: / 28-07-2017 / 15:19:22 / cg"
+    "Modified (comment): / 31-07-2017 / 15:14:31 / cg"
+! !
+
+!PhoneticStringUtilities::MetaphoneStringComparator class methodsFor:'documentation'!
+
+documentation
+"
+   Encodes a string into a Metaphone value.
+
+   Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
+   Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
+
+    Hanging on the Metaphone by Lawrence Philips in Computer Language of Dec. 1990, p 39.
+    Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
+    https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm
+
+  They have had undocumented changes from the originally published algorithm.
+  For more information, see https://issues.apache.org/jira/browse/CODEC-57
+
+  Metaphone uses the following rules:
+
+    Doubled letters except 'c' -> drop 2nd letter.
+    Vowels are only kept when they are the first letter.
+    B -> B unless at the end of a word after 'm' as in 'dumb'
+    C -> X (sh) if -cia- or -ch-
+    S if -ci-, -ce- or -cy-
+    K otherwise, including -sch-
+    D -> J if in -dge-, -dgy- or -dgi-; T otherwise
+    F -> F
+    G -> silent if in -gh- and not at end or before a vowel in -gn- or -gned- (also see dge etc. above)
+    J if before i or e or y if not double gg; K otherwise
+    H -> silent if after vowel and no vowel follows; H otherwise
+    J -> J
+    K -> silent if after 'c'; K otherwise
+    L -> L
+    M -> M
+    N -> N
+    P -> F if before 'h'; P otherwise
+    Q -> K
+    R -> R
+    S -> X (sh) if before 'h' or in -sio- or -sia-; S otherwise
+    T -> X (sh) if -tia- or -tio- 0 (th) if before 'h' silent if in -tch-; T otherwise
+    V -> F
+    W -> silent if not followed by a vowel W if followed by a vowel
+    X -> KS
+    Y -> silent if not followed by a vowel Y if followed by a vowel
+    Z -> S
+
+    Initial Letter Exceptions
+
+    Initial kn-, gn- pn, ae- or wr- -> drop first letter
+    Initial x- -> change to 's'
+    Initial wh- -> change to 'w'
+
+
+     self new encode:'a'
+     self new encode:'dumb'
+     self new encode:'MILLER'
+     self new encode:'schmidt'
+     self new encode:'schneider'
+     self new encode:'FISCHER'
+     self new encode:'HEDGY'
+     self new encode:'weber'
+     self new encode:'wagner'
+     self new encode:'van gogh'
+"
+!
+
+javaCode
+"<<END
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.codec.language;
+
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoder;
+
+/**
+ * Encodes a string into a Metaphone value.
+ * <p>
+ * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
+ * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
+ * <p>
+ * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
+ * p 39.</CITE>
+ * <p>
+ * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
+ * </p>
+ * <ul>
+ * <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
+ *  (broken link 4/30/2013) </li>
+ * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
+ *  (link checked 4/30/2013) </li>
+ * </ul>
+ * <p>
+ * They have had undocumented changes from the originally published algorithm.
+ * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
+ * <p>
+ * This class is conditionally thread-safe.
+ * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)}
+ * but is not volatile, and accesses are not synchronized.
+ * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
+ * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
+ * after initial setup.
+ *
+ * @version $Id$
+ */
+public class Metaphone implements StringEncoder {
+
+    /**
+     * Five values in the English language
+     */
+    private static final String VOWELS = "AEIOU";
+
+    /**
+     * Variable used in Metaphone algorithm
+     */
+    private static final String FRONTV = "EIY";
+
+    /**
+     * Variable used in Metaphone algorithm
+     */
+    private static final String VARSON = "CSPTG";
+
+    /**
+     * The max code length for metaphone is 4
+     */
+    private int maxCodeLen = 4;
+
+    /**
+     * Creates an instance of the Metaphone encoder
+     */
+    public Metaphone() {
+        super();
+    }
+
+    /**
+     * Find the metaphone value of a String. This is similar to the
+     * soundex algorithm, but better at finding similar sounding words.
+     * All input is converted to upper case.
+     * Limitations: Input format is expected to be a single ASCII word
+     * with only characters in the A - Z range, no punctuation or numbers.
+     *
+     * @param txt String to find the metaphone code for
+     * @return A metaphone code corresponding to the String supplied
+     */
+    public String metaphone(final String txt) {
+        boolean hard = false;
+        int txtLength;
+        if (txt == null || (txtLength = txt.length()) == 0) {
+            return "";
+        }
+        // single character is itself
+        if (txtLength == 1) {
+            return txt.toUpperCase(java.util.Locale.ENGLISH);
+        }
+
+        final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
+
+        final StringBuilder local = new StringBuilder(40); // manipulate
+        final StringBuilder code = new StringBuilder(10); //   output
+        // handle initial 2 characters exceptions
+        switch(inwd[0]) {
+        case 'K':
+        case 'G':
+        case 'P': /* looking for KN, etc*/
+            if (inwd[1] == 'N') {
+                local.append(inwd, 1, inwd.length - 1);
+            } else {
+                local.append(inwd);
+            }
+            break;
+        case 'A': /* looking for AE */
+            if (inwd[1] == 'E') {
+                local.append(inwd, 1, inwd.length - 1);
+            } else {
+                local.append(inwd);
+            }
+            break;
+        case 'W': /* looking for WR or WH */
+            if (inwd[1] == 'R') {   // WR -> R
+                local.append(inwd, 1, inwd.length - 1);
+                break;
+            }
+            if (inwd[1] == 'H') {
+                local.append(inwd, 1, inwd.length - 1);
+                local.setCharAt(0, 'W'); // WH -> W
+            } else {
+                local.append(inwd);
+            }
+            break;
+        case 'X': /* initial X becomes S */
+            inwd[0] = 'S';
+            local.append(inwd);
+            break;
+        default:
+            local.append(inwd);
+        } // now local has working string with initials fixed
+
+        final int wdsz = local.length();
+        int n = 0;
+
+        while (code.length() < this.getMaxCodeLen() &&
+               n < wdsz ) { // max code size of 4 works well
+            final char symb = local.charAt(n);
+            // remove duplicate letters except C
+            if (symb !!= 'C' && isPreviousChar( local, n, symb ) ) {
+                n++;
+            } else { // not dup
+                switch(symb) {
+                case 'A':
+                case 'E':
+                case 'I':
+                case 'O':
+                case 'U':
+                    if (n == 0) {
+                        code.append(symb);
+                    }
+                    break; // only use vowel if leading char
+                case 'B':
+                    if ( isPreviousChar(local, n, 'M') &&
+                         isLastChar(wdsz, n) ) { // B is silent if word ends in MB
+                        break;
+                    }
+                    code.append(symb);
+                    break;
+                case 'C': // lots of C special cases
+                    /* discard if SCI, SCE or SCY */
+                    if ( isPreviousChar(local, n, 'S') &&
+                         !!isLastChar(wdsz, n) &&
+                         FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
+                        break;
+                    }
+                    if (regionMatch(local, n, "CIA")) { // "CIA" -> X
+                        code.append('X');
+                        break;
+                    }
+                    if (!!isLastChar(wdsz, n) &&
+                        FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
+                        code.append('S');
+                        break; // CI,CE,CY -> S
+                    }
+                    if (isPreviousChar(local, n, 'S') &&
+                        isNextChar(local, n, 'H') ) { // SCH->sk
+                        code.append('K');
+                        break;
+                    }
+                    if (isNextChar(local, n, 'H')) { // detect CH
+                        if (n == 0 &&
+                            wdsz >= 3 &&
+                            isVowel(local,2) ) { // CH consonant -> K consonant
+                            code.append('K');
+                        } else {
+                            code.append('X'); // CHvowel -> X
+                        }
+                    } else {
+                        code.append('K');
+                    }
+                    break;
+                case 'D':
+                    if (!!isLastChar(wdsz, n + 1) &&
+                        isNextChar(local, n, 'G') &&
+                        FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
+                        code.append('J'); n += 2;
+                    } else {
+                        code.append('T');
+                    }
+                    break;
+                case 'G': // GH silent at end or before consonant
+                    if (isLastChar(wdsz, n + 1) &&
+                        isNextChar(local, n, 'H')) {
+                        break;
+                    }
+                    if (!!isLastChar(wdsz, n + 1) &&
+                        isNextChar(local,n,'H') &&
+                        !!isVowel(local,n+2)) {
+                        break;
+                    }
+                    if (n > 0 &&
+                        ( regionMatch(local, n, "GN") ||
+                          regionMatch(local, n, "GNED") ) ) {
+                        break; // silent G
+                    }
+                    if (isPreviousChar(local, n, 'G')) {
+                        // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
+                        hard = true;
+                    } else {
+                        hard = false;
+                    }
+                    if (!!isLastChar(wdsz, n) &&
+                        FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
+                        !!hard) {
+                        code.append('J');
+                    } else {
+                        code.append('K');
+                    }
+                    break;
+                case 'H':
+                    if (isLastChar(wdsz, n)) {
+                        break; // terminal H
+                    }
+                    if (n > 0 &&
+                        VARSON.indexOf(local.charAt(n - 1)) >= 0) {
+                        break;
+                    }
+                    if (isVowel(local,n+1)) {
+                        code.append('H'); // Hvowel
+                    }
+                    break;
+                case 'F':
+                case 'J':
+                case 'L':
+                case 'M':
+                case 'N':
+                case 'R':
+                    code.append(symb);
+                    break;
+                case 'K':
+                    if (n > 0) { // not initial
+                        if (!!isPreviousChar(local, n, 'C')) {
+                            code.append(symb);
+                        }
+                    } else {
+                        code.append(symb); // initial K
+                    }
+                    break;
+                case 'P':
+                    if (isNextChar(local,n,'H')) {
+                        // PH -> F
+                        code.append('F');
+                    } else {
+                        code.append(symb);
+                    }
+                    break;
+                case 'Q':
+                    code.append('K');
+                    break;
+                case 'S':
+                    if (regionMatch(local,n,"SH") ||
+                        regionMatch(local,n,"SIO") ||
+                        regionMatch(local,n,"SIA")) {
+                        code.append('X');
+                    } else {
+                        code.append('S');
+                    }
+                    break;
+                case 'T':
+                    if (regionMatch(local,n,"TIA") ||
+                        regionMatch(local,n,"TIO")) {
+                        code.append('X');
+                        break;
+                    }
+                    if (regionMatch(local,n,"TCH")) {
+                        // Silent if in "TCH"
+                        break;
+                    }
+                    // substitute numeral 0 for TH (resembles theta after all)
+                    if (regionMatch(local,n,"TH")) {
+                        code.append('0');
+                    } else {
+                        code.append('T');
+                    }
+                    break;
+                case 'V':
+                    code.append('F'); break;
+                case 'W':
+                case 'Y': // silent if not followed by vowel
+                    if (!!isLastChar(wdsz,n) &&
+                        isVowel(local,n+1)) {
+                        code.append(symb);
+                    }
+                    break;
+                case 'X':
+                    code.append('K');
+                    code.append('S');
+                    break;
+                case 'Z':
+                    code.append('S');
+                    break;
+                default:
+                    // do nothing
+                    break;
+                } // end switch
+                n++;
+            } // end else from symb !!= 'C'
+            if (code.length() > this.getMaxCodeLen()) {
+                code.setLength(this.getMaxCodeLen());
+            }
+        }
+        return code.toString();
+    }
+
+    private boolean isVowel(final StringBuilder string, final int index) {
+        return VOWELS.indexOf(string.charAt(index)) >= 0;
+    }
+
+    private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
+        boolean matches = false;
+        if( index > 0 &&
+            index < string.length() ) {
+            matches = string.charAt(index - 1) == c;
+        }
+        return matches;
+    }
+
+    private boolean isNextChar(final StringBuilder string, final int index, final char c) {
+        boolean matches = false;
+        if( index >= 0 &&
+            index < string.length() - 1 ) {
+            matches = string.charAt(index + 1) == c;
+        }
+        return matches;
+    }
+
+    private boolean regionMatch(final StringBuilder string, final int index, final String test) {
+        boolean matches = false;
+        if( index >= 0 &&
+            index + test.length() - 1 < string.length() ) {
+            final String substring = string.substring( index, index + test.length());
+            matches = substring.equals( test );
+        }
+        return matches;
+    }
+
+    private boolean isLastChar(final int wdsz, final int n) {
+        return n + 1 == wdsz;
+    }
+
+
+    /**
+     * Encodes an Object using the metaphone algorithm.  This method
+     * is provided in order to satisfy the requirements of the
+     * Encoder interface, and will throw an EncoderException if the
+     * supplied object is not of type java.lang.String.
+     *
+     * @param obj Object to encode
+     * @return An object (or type java.lang.String) containing the
+     *         metaphone code which corresponds to the String supplied.
+     * @throws EncoderException if the parameter supplied is not
+     *                          of type java.lang.String
+     */
+    @Override
+    public Object encode(final Object obj) throws EncoderException {
+        if (!!(obj instanceof String)) {
+            throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
+        }
+        return metaphone((String) obj);
+    }
+
+    /**
+     * Encodes a String using the Metaphone algorithm.
+     *
+     * @param str String object to encode
+     * @return The metaphone code corresponding to the String supplied
+     */
+    @Override
+    public String encode(final String str) {
+        return metaphone(str);
+    }
+
+    /**
+     * Tests is the metaphones of two strings are identical.
+     *
+     * @param str1 First of two strings to compare
+     * @param str2 Second of two strings to compare
+     * @return <code>true</code> if the metaphones of these strings are identical,
+     *        <code>false</code> otherwise.
+     */
+    public boolean isMetaphoneEqual(final String str1, final String str2) {
+        return metaphone(str1).equals(metaphone(str2));
+    }
+
+    /**
+     * Returns the maxCodeLen.
+     * @return int
+     */
+    public int getMaxCodeLen() { return this.maxCodeLen; }
+
+    /**
+     * Sets the maxCodeLen.
+     * @param maxCodeLen The maxCodeLen to set
+     */
+    public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
+
+}
+END>>"
+! !
+
+!PhoneticStringUtilities::MetaphoneStringComparator methodsFor:'api'!
+
+encode:txt
+    "Answer the Metaphone code of txt, limited to 'self maxCodeLen' characters.
+     The rules follow the Metaphone encoder of Apache commons-codec:
+       - the word is uppercased; an empty word answers '', a single character answers itself
+       - initial KN, GN, PN, AE and WR drop their first letter; initial WH -> W; initial X -> S
+       - a letter equal to its predecessor is skipped (except C)
+       - vowels are only coded when they are the first letter
+       - TH is coded as the digit 0 (it resembles a theta)
+     Characters which are not handled by any rule (digits, blanks, ...) are ignored.
+
+     self new encode:'a'          -> 'A'
+     self new encode:'MILLER'     -> 'MLR'
+     self new encode:'schmidt'    -> 'SKMT'
+     self new encode:'schneider'  -> 'SKNT'
+     self new encode:'FISCHER'    -> 'FSKR'
+     self new encode:'HEDGY'      -> 'HJ'
+     self new encode:'weber'      -> 'WBR'
+     self new encode:'wagner'     -> 'WNR'
+     self new encode:'van gogh'   -> 'FNK'
+     self new encode:'dumb'       -> 'TM'
+     self new encode:'Thomas'     -> '0MS'
+     self new encode:'nation'     -> 'NXN'
+     self new encode:'Aaron'      -> 'ARN'
+    "
+
+    |txtLength maxLen word first second local wdsz code emit n vowels frontVowels|
+
+    (txtLength := txt size) == 0 ifTrue:[^ ''].
+    word := txt asUppercase.
+    "/ a single character is its own code
+    (txtLength == 1) ifTrue:[^ word].
+
+    maxLen := self maxCodeLen.
+    vowels := 'AEIOU'.
+    frontVowels := 'EIY'.
+
+    "/ handle the exceptions for the initial two characters
+    first := word at:1.
+    second := word at:2.
+    local := word.
+    ((('KGP' includes:first) and:[ second == $N ])          "/ KNx, GNx, PNx -> Nx
+     or:[ (first == $A and:[ second == $E ])                "/ AEx -> Ex
+     or:[ first == $W and:[ second == $R ] ]]) ifTrue:[     "/ WRx -> Rx
+        local := word copyFrom:2
+    ].
+    (first == $W and:[ second == $H ]) ifTrue:[
+        "/ WHx -> Wx
+        local := 'W' , (word copyFrom:3)
+    ].
+    first == $X ifTrue:[
+        "/ Xx -> Sx
+        local := 'S' , (word copyFrom:2)
+    ].
+
+    "/ now local holds the working string with the initials fixed
+    wdsz := local size.
+    "/ the code is short (maxLen plus at most one character), so plain string
+    "/ concatenation is good enough and keeps 'code size' unambiguous
+    code := ''.
+    emit := [:aCharacter | code := code copyWith:aCharacter ].
+    n := 1.
+
+    [ (code size < maxLen) and:[ n <= wdsz ] ] whileTrue:[
+        |symb prevChar nextChar nextNextChar|
+
+        symb := local at:n.
+        "/ the neighbours are explicitly nil when they are outside of the word
+        prevChar := n > 1 ifTrue:[ local at:n - 1 ] ifFalse:[ nil ].
+        nextChar := n < wdsz ifTrue:[ local at:n + 1 ] ifFalse:[ nil ].
+        nextNextChar := n + 1 < wdsz ifTrue:[ local at:n + 2 ] ifFalse:[ nil ].
+
+        "/ skip the second of two equal letters (except C).
+        "/ The tests below are mutually exclusive, because each one checks symb.
+        (symb ~~ $C and:[ prevChar == symb ]) ifFalse:[
+            (vowels includes:symb) ifTrue:[
+                "/ vowels are only used as leading character
+                n == 1 ifTrue:[ emit value:symb ]
+            ].
+            symb == $B ifTrue:[
+                "/ B is silent if the word ends in MB
+                (prevChar == $M and:[ nextChar isNil ]) ifFalse:[ emit value:$B ]
+            ].
+            symb == $C ifTrue:[
+                "/ C is discarded in SCE, SCI and SCY
+                (prevChar == $S and:[ frontVowels includes:nextChar ]) ifFalse:[
+                    (nextChar == $I and:[ nextNextChar == $A ]) ifTrue:[
+                        "/ CIA -> X
+                        emit value:$X
+                    ] ifFalse:[
+                        (frontVowels includes:nextChar) ifTrue:[
+                            "/ CE, CI, CY -> S
+                            emit value:$S
+                        ] ifFalse:[
+                            "/ CH -> X, except for SCH and for a word starting
+                            "/ with CH followed by a vowel, which give K;
+                            "/ every other C -> K
+                            (nextChar == $H
+                             and:[ prevChar ~~ $S
+                             and:[ (n == 1 and:[ vowels includes:nextNextChar ]) not ]])
+                            ifTrue:[
+                                emit value:$X
+                            ] ifFalse:[
+                                emit value:$K
+                            ]
+                        ]
+                    ]
+                ]
+            ].
+            symb == $D ifTrue:[
+                (nextChar == $G and:[ frontVowels includes:nextNextChar ]) ifTrue:[
+                    "/ DGE, DGI, DGY -> J; the G and the vowel are consumed as well
+                    emit value:$J.
+                    n := n + 2
+                ] ifFalse:[
+                    emit value:$T
+                ]
+            ].
+            symb == $G ifTrue:[
+                "/ G is silent in GH at the end or before a consonant,
+                "/ and in a non-initial GN.
+                "/ (a 'hard' G after another G cannot occur here,
+                "/  because duplicate letters have already been skipped)
+                ((nextChar == $H and:[ (vowels includes:nextNextChar) not ])
+                 or:[ n > 1 and:[ nextChar == $N ] ]) ifFalse:[
+                    (frontVowels includes:nextChar) ifTrue:[
+                        emit value:$J
+                    ] ifFalse:[
+                        emit value:$K
+                    ]
+                ]
+            ].
+            symb == $H ifTrue:[
+                "/ H is only coded before a vowel,
+                "/ and not after C, S, P, T, G (handled with those letters)
+                ((vowels includes:nextChar) and:[ ('CSPTG' includes:prevChar) not ]) ifTrue:[
+                    emit value:$H
+                ]
+            ].
+            ('FJLMNR' includes:symb) ifTrue:[
+                emit value:symb
+            ].
+            symb == $K ifTrue:[
+                "/ K is silent after C
+                prevChar ~~ $C ifTrue:[ emit value:$K ]
+            ].
+            symb == $P ifTrue:[
+                "/ PH -> F
+                nextChar == $H ifTrue:[ emit value:$F ] ifFalse:[ emit value:$P ]
+            ].
+            symb == $Q ifTrue:[
+                emit value:$K
+            ].
+            symb == $S ifTrue:[
+                "/ SH, SIO, SIA -> X (as in shave, ashton or asia)
+                (nextChar == $H
+                 or:[ nextChar == $I and:[ 'OA' includes:nextNextChar ] ]) ifTrue:[
+                    emit value:$X
+                ] ifFalse:[
+                    emit value:$S
+                ]
+            ].
+            symb == $T ifTrue:[
+                (nextChar == $I and:[ 'AO' includes:nextNextChar ]) ifTrue:[
+                    "/ TIA, TIO -> X
+                    emit value:$X
+                ] ifFalse:[
+                    "/ T is silent in TCH
+                    (nextChar == $C and:[ nextNextChar == $H ]) ifFalse:[
+                        nextChar == $H ifTrue:[
+                            "/ TH -> 0 (the digit zero)
+                            emit value:$0
+                        ] ifFalse:[
+                            emit value:$T
+                        ]
+                    ]
+                ]
+            ].
+            symb == $V ifTrue:[
+                emit value:$F
+            ].
+            ('WY' includes:symb) ifTrue:[
+                "/ silent if not followed by a vowel
+                (vowels includes:nextChar) ifTrue:[ emit value:symb ]
+            ].
+            symb == $X ifTrue:[
+                emit value:$K.
+                emit value:$S
+            ].
+            symb == $Z ifTrue:[
+                emit value:$S
+            ].
+        ].
+        n := n + 1.
+    ].
+
+    "/ X adds two characters and may exceed the limit by one
+    code size > maxLen ifTrue:[
+        code := code copyTo:maxLen
+    ].
+    ^ code
+
+    "Created: / 02-08-2017 / 09:51:31 / cg"
+    "Modified: / 02-08-2017 / 12:00:38 / cg"
+!
+
+maxCodeLen
+    "Answer the maximum number of characters in a generated metaphone code.
+     encode: stops scanning as soon as the code has reached this length."
+
+    ^ 4
+
+    "Created: / 02-08-2017 / 09:51:59 / cg"
+! !
+
+!PhoneticStringUtilities::SoundexStringComparator class methodsFor:'documentation'!
+
+documentation
+"
+    WARNING: this is the so called 'simplified soundex' algorithm;
+      there are more variants like miracode (american soundex) or
+      mysqlSoundex around.
+      
+      Be sure to use the correct algorithm, if the generated strings must be compatible
+      (otherwise, the differences are probably too small to be noticed as effect, but
+      your search will be different)
+
+    The following was copied from http://www.civilsolutions.com.au/publications/dedup.htm
+
+    SOUNDEX is a phonetic coding algorithm that ignores many of the unreliable
+    components of names, but by doing so reports more matches. 
+
+    There are some variations around in the literature; 
+    the following is called 'simplified soundex', and the rules for coding a name are:
+
+    1. The first letter of the name is used in its un-coded form to serve as the prefix
+       character of the code. (The rest of the code is numerical).
+
+    2. Thereafter, W and H are ignored entirely.
+
+    3. A, E, I, O, U, Y are not assigned a code number, but do serve as 'separators' (see Step 5).
+
+    4. Other letters of the name are converted to a numerical equivalent:
+                 B, P, F, V              1 
+                 C, G, J, K, Q, S, X, Z  2 
+                 D, T                    3 
+                 L                       4 
+                 M, N                    5 
+                 R                       6 
+
+    5. There are two exceptions: 
+        1. Letters that follow prefix letters which would, if coded, have the same
+           numerical code, are ignored in all cases unless a ''separator'' (see Step 3) precedes them.
+
+        2. The second letter of any pair of consonants having the same code number is likewise ignored, 
+           i.e. unless there is a ''separator'' between them in the name.
+
+    6. The final SOUNDEX code consists of the prefix letter plus three numerical characters.
+       Longer codes are truncated to this length, and shorter codes are extended to it by adding zeros.
+
+    Notice, that in another variant, w and h are treated slightly differently.
+    This is only of relevance, if you need to reconstruct original soundex codes of other programs
+    or for the original 1880 us census data.
+     SoundexStringComparator  new encode:'Ashcraft' -> 'A226'
+    vs.
+     MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
+    
+    Also notice that soundex deals better with English. 
+    For German and other languages, other algorithms may provide better results.
+"
+! !
+
+!PhoneticStringUtilities::SoundexStringComparator methodsFor:'api'!
+
+encode:word 
+    "Answer the 4-character simplified soundex code of word:
+     the uppercased first letter, followed by up to three digits, padded with zeros.
+     A letter contributes its digit unless translate: answers nil or '0' for it,
+     or the digit equals the one of the immediately preceding letter."
+
+    |upper code lastDigit digit|
+
+    upper := word asUppercase.
+    code := upper first asString.
+    lastDigit := self translate:upper first.
+    2 to:upper size do:[:index | 
+        digit := self translate:(upper at:index).
+        (digit isNil or:[ digit = '0' or:[ digit = lastDigit ]]) ifFalse:[
+            code := code , digit.
+            "/ the code is complete as soon as it has 4 characters
+            code size == 4 ifTrue:[^ code ].
+        ].
+        "/ every letter (also one without a digit) separates equal digits
+        lastDigit := digit
+    ].
+    "/ here, the code has 1..3 characters; fill it up with zeros
+    ^ code , (String new:(4 - code size) withAll:$0)
+
+    "
+     self new encode:'washington' -> 'W252'
+     self new encode:'lee'        -> 'L000'
+     self new encode:'Gutierrez'  -> 'G362'
+     self new encode:'Pfister'    -> 'P236'
+     self new encode:'Jackson'    -> 'J250'
+     self new encode:'Tymczak'    -> 'T522'
+    "
+    
+    "notice:
+     MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
+     self new encode:'Ashcraft'   -> 'A226'
+    "
+
+    "Created: / 28-07-2017 / 15:21:23 / cg"
+    "Modified (comment): / 01-08-2017 / 19:01:43 / cg"
+! !
+
+!PhoneticStringUtilities::SoundexStringComparator methodsFor:'private'!
+
+translate:aCharacter
+    "Answer the soundex digit (as a one-character string) for an uppercase letter:
+     '0' for the separators A, E, I, O, U, Y; '1' .. '6' for the consonant groups;
+     nil for everything else (W, H and any other character).
+     This variant trades the speed of the explicit identity tests for compactness."
+
+    "vowels serve as separators"
+    ('AEIOUY' includes:aCharacter) ifTrue:[^ '0' ].
+
+    ('BPFV' includes:aCharacter) ifTrue:[^ '1' ].
+    ('CSKGJQXZ' includes:aCharacter) ifTrue:[^ '2' ].
+    ('DT' includes:aCharacter) ifTrue:[^ '3' ].
+    aCharacter == $L ifTrue:[^ '4' ].
+    ('MN' includes:aCharacter) ifTrue:[^ '5' ].
+    aCharacter == $R ifTrue:[^ '6' ].
+    ^ nil
+
+    "Modified: / 02-08-2017 / 01:35:40 / cg"
+    "Modified (comment): / 02-08-2017 / 14:30:11 / cg"
+! !
+
+!PhoneticStringUtilities::MySQLSoundexStringComparator class methodsFor:'documentation'!
+
+documentation
+"
+    MySQL soundex is like american Soundex (i.e. miracode) without the 4 character limitation,
+    and also removing vowels first, then removing duplicate codes
+    (whereas the soundex code does this in reverse order).
+
+    These variations are important, if you need the miracode soundex codes to be generated.
+"
+! !
+
+!PhoneticStringUtilities::MySQLSoundexStringComparator methodsFor:'api'!
+
+encode:word 
+    "Answer the soundex code of word without the 4 character limit:
+     the uppercased first letter followed by the digits answered by translate:.
+     In contrast to the plain soundex loop, letters translated to '0' and the
+     letters W and H do not separate equal digits. 
+     Codes shorter than 4 characters are padded with zeros."
+
+    |upper code lastDigit digit ch|
+
+    upper := word asUppercase.
+    code := upper first asString.
+    lastDigit := self translate:upper first.
+    2 to:upper size do:[:index |
+        ch := upper at:index.
+        digit := self translate:ch.
+        (digit isNil or:[ digit = '0' or:[ digit = lastDigit ]]) ifFalse:[
+            code := code , digit.
+        ].
+        "/ only a letter with a real digit replaces the remembered digit
+        (digit = '0' or:[ ch = $W or:[ ch = $H ]]) ifFalse:[
+            lastDigit := digit.
+        ].
+    ].
+    code size < 4 ifTrue:[
+        code := code , (String new:(4 - code size) withAll:$0)
+    ].
+    ^ code
+
+    "Created: / 28-07-2017 / 15:23:41 / cg"
+    "Modified: / 31-07-2017 / 17:53:51 / cg"
+    "Modified (comment): / 02-08-2017 / 14:31:15 / cg"
+! !
+
+!PhoneticStringUtilities::NYSIISStringComparator class methodsFor:'documentation'!
+
+documentation
+"
+    NYSIIS Algorithm:
+
+    1.
+        remove all ''S'' and ''Z'' chars from the end of the surname 
+
+    2.
+        transcode initial strings
+            MAC => MC
+            PF => F
+
+    3.
+        Transcode trailing strings as follows,
+        
+            IX => IC
+            EX => EC
+            YE,EE,IE => Y
+            NT,ND => D 
+
+    4.
+        transcode ''EV'' to ''EF'' if not at start of name
+
+    5.
+        use first character of name as first character of key 
+
+    6.
+        remove any ''W'' that follows a vowel 
+
+    7.
+        replace all vowels with ''A'' 
+
+    8.
+        transcode ''GHT'' to ''GT'' 
+
+    9.
+        transcode ''DG'' to ''G'' 
+
+    10.
+        transcode ''PH'' to ''F'' 
+
+    11.
+        if not first character, eliminate all ''H'' preceded or followed by a vowel 
+
+    12.
+        change ''KN'' to ''N'', else ''K'' to ''C'' 
+
+    13.
+        if not first character, change ''M'' to ''N'' 
+
+    14.
+        if not first character, change ''Q'' to ''G'' 
+
+    15.
+        transcode ''SH'' to ''S'' 
+
+    16.
+        transcode ''SCH'' to ''S'' 
+
+    17.
+        transcode ''YW'' to ''Y'' 
+
+    18.
+        if not first or last character, change ''Y'' to ''A'' 
+
+    19.
+        transcode ''WR'' to ''R'' 
+
+    20.
+        if not first character, change ''Z'' to ''S'' 
+
+    21.
+        transcode terminal ''AY'' to ''Y'' 
+
+    22.
+        remove trailing vowels 
+
+    23.
+        collapse all strings of repeated characters 
+
+    24.
+        if first char of original surname was a vowel, append it to the code
+"
+! !
+
+!PhoneticStringUtilities::NYSIISStringComparator methodsFor:'api'!
+
+encode:aString 
+    "Answer the NYSIIS key of aString.
+     The uppercased name is passed through rule1: .. rule23: in this order
+     (each rule answers the transformed key); 
+     rule24:originalKey: finally gets the key and the unmodified argument."
+
+    |key|
+
+    key := aString asUppercase.
+    #( 
+        #rule1:  #rule2:  #rule3:  #rule4:  #rule5:  #rule6:  #rule7:  #rule8:
+        #rule9:  #rule10: #rule11: #rule12: #rule13: #rule14: #rule15: #rule16:
+        #rule17: #rule18: #rule19: #rule20: #rule21: #rule22: #rule23:
+    ) do:[:eachRule |
+        key := self perform:eachRule with:key
+    ].
+    ^ self rule24:key originalKey:aString
+
+    "
+     self new encode:'hello'
+     self new encode:'bliss'
+    "
+    "
+     self new phoneticStringsFor:'hello'
+     self new phoneticStringsFor:'bliss'
+    "
+
+    "Created: / 28-07-2017 / 15:34:52 / cg"
+    "Modified (comment): / 02-08-2017 / 14:31:47 / cg"
+! !
+
+!PhoneticStringUtilities::NYSIISStringComparator methodsFor:'private'!
+
+rule10:key 
+    "NYSIIS step 10: answer a copy of key in which every 'PH' is replaced by 'F'"
+
+    |transcoded|
+
+    transcoded := self transcodeAll:'PH' of:key to:'F' startingAt:1.
+    ^ transcoded
+
+    "Modified (format): / 02-08-2017 / 14:34:27 / cg"
+!
+
+rule11:key 
+    "NYSIIS step 11: answer a copy of key without any 'H' which is not the first
+     character and which is preceded or followed by a vowel.
+     The neighbours are always taken from the unmodified key."
+
+    |out keySize ch dropIt|
+
+    out := WriteStream on:(String new).
+    keySize := key size.
+    1 to:keySize do:[:index |
+        ch := key at:index.
+        dropIt := index >= 2 
+                  and:[ ch = $H 
+                  and:[ (key at:index - 1) isVowel
+                        or:[ index < keySize and:[ (key at:index + 1) isVowel ] ] ]].
+        dropIt ifFalse:[ out nextPut:ch ]
+    ].
+    ^ out contents
+!
+
+rule12:key 
+    "NYSIIS step 12: change 'KN' to 'N', else 'K' to 'C'.
+     The 'KN' pairs must be reduced to 'N' first (not to 'K'); 
+     otherwise the following K -> C pass would turn every 'KN' into 'C'
+     and lose the N."
+
+    |k|
+
+    k := self transcodeAll:'KN' of:key to:'N' startingAt:1.
+    k := self transcodeAll:'K' of:k to:'C' startingAt:1.
+    ^ k
+
+    "
+     self new rule12:'KNAK'  -> 'NAC'
+    "
+
+    "Modified (format): / 02-08-2017 / 14:34:48 / cg"
+!
+
+rule13:key 
+    "NYSIIS step 13: answer a copy of key in which every 'M' 
+     except a leading one is replaced by 'N'"
+
+    |transcoded|
+
+    transcoded := self transcodeAll:'M' of:key to:'N' startingAt:2.
+    ^ transcoded
+
+    "Modified (format): / 02-08-2017 / 14:35:00 / cg"
+!
+
+rule14:key 
+    "NYSIIS step 14: answer a copy of key in which every 'Q' 
+     except a leading one is replaced by 'G'"
+
+    |transcoded|
+
+    transcoded := self transcodeAll:'Q' of:key to:'G' startingAt:2.
+    ^ transcoded
+
+    "Modified (format): / 02-08-2017 / 14:35:08 / cg"
+!
+
+rule15:key 
+    "NYSIIS step 15: answer a copy of key in which every 'SH' is replaced by 'S'"
+
+    |transcoded|
+
+    transcoded := self transcodeAll:'SH' of:key to:'S' startingAt:1.
+    ^ transcoded
+
+    "Modified (format): / 02-08-2017 / 14:35:18 / cg"
+!
+
+rule16:key 
+    "NYSIIS step 16: answer a copy of key in which every 'SCH' is replaced by 'S'"
+
+    |transcoded|
+
+    transcoded := self transcodeAll:'SCH' of:key to:'S' startingAt:1.
+    ^ transcoded
+
+    "Modified (format): / 02-08-2017 / 14:35:25 / cg"
+!
+
+rule17:key 
+    "NYSIIS step 17: answer a copy of key in which every 'YW' is replaced by 'Y'"
+
+    |transcoded|
+
+    transcoded := self transcodeAll:'YW' of:key to:'Y' startingAt:1.
+    ^ transcoded
+
+    "Modified (format): / 02-08-2017 / 14:35:33 / cg"
+!
+
+rule18:key 
+    "NYSIIS step 18: if not first or last character, change 'Y' to 'A'.
+     Answers a modified copy; the first and the last character are never touched,
+     so empty and one-character keys are answered unchanged
+     (instead of raising an error in 'key last' for an empty key)."
+
+    |k|
+
+    k := key copy.
+    2 to:key size - 1 do:[:index |
+        (key at:index) = $Y ifTrue:[
+            k at:index put:$A
+        ]
+    ].
+    ^ k
+
+    "
+     self new rule18:'YAYAY'  -> 'YAAAY'
+    "
+
+    "Modified (format): / 02-08-2017 / 14:35:44 / cg"
+!
+
+rule19:key 
+    "NYSIIS step 19: answer a copy of key in which every 'WR' is replaced by 'R'"
+
+    |transcoded|
+
+    transcoded := self transcodeAll:'WR' of:key to:'R' startingAt:1.
+    ^ transcoded
+
+    "Modified (format): / 02-08-2017 / 14:35:52 / cg"
+!
+
+rule1:key 
+    "NYSIIS step 1: remove all 'S' and 'Z' chars from the end of the name.
+     Answers a copy of key. 
+     A name which consists of 'S' and 'Z' only (or an empty name) answers 
+     an empty string, instead of raising an error when asking the empty 
+     string for its last character."
+
+    |end|
+
+    end := key size.
+    [ end > 0 and:[ 'SZ' includes:(key at:end) ] ] whileTrue:[ 
+        end := end - 1 
+    ].
+    ^ key copyFrom:1 to:end
+
+    "
+     self new rule1:'BLISS'  -> 'BLI'
+     self new rule1:'SZ'     -> ''
+    "
+!
+
+rule20:key 
+    "NYSIIS step 20: answer a copy of key in which every 'Z' 
+     except a leading one is replaced by 'S'"
+
+    |transcoded|
+
+    transcoded := self transcodeAll:'Z' of:key to:'S' startingAt:2.
+    ^ transcoded
+
+    "Modified (format): / 02-08-2017 / 14:36:00 / cg"
+!
+
+rule21:key 
+    "NYSIIS step 21: transcode terminal 'AY' to 'Y'.
+     Implemented as a suffix test, so that keys with less than two characters
+     do not lead to a search starting at index 0 or below.
+     Answers a copy of key."
+
+    (key endsWith:'AY') ifTrue:[
+        ^ (key copyFrom:1 to:key size - 2) , 'Y'
+    ].
+    ^ key copy
+
+    "
+     self new rule21:'MAY'  -> 'MY'
+     self new rule21:'Y'    -> 'Y'
+    "
+
+    "Modified (format): / 02-08-2017 / 14:36:08 / cg"
+!
+
+rule22:key 
+    "NYSIIS step 22: remove trailing vowels.
+     Answers a copy of key. 
+     A key which consists of vowels only (or an empty key) answers an empty
+     string, instead of raising an error when asking the empty string 
+     for its last character."
+
+    |end|
+
+    end := key size.
+    [ end > 0 and:[ (key at:end) isVowel ] ] whileTrue:[
+        end := end - 1
+    ].
+    ^ key copyFrom:1 to:end
+
+    "
+     self new rule22:'HALLA'  -> 'HALL'
+     self new rule22:'AA'     -> ''
+    "
+
+    "Modified: / 02-08-2017 / 14:36:42 / cg"
+!
+
+rule23:key 
+    "NYSIIS step 23: collapse all strings of repeated characters.
+     Answers a new string in which every character that equals its
+     predecessor in key has been dropped.
+     (the previous 'k size to:2 do:' loop was an empty ascending range for
+      every key longer than two characters, so nothing was ever collapsed)"
+
+    |out|
+
+    out := WriteStream on:(String new).
+    1 to:key size do:[:index |
+        (index == 1 or:[ (key at:index) ~= (key at:index - 1) ]) ifTrue:[
+            out nextPut:(key at:index)
+        ]
+    ].
+    ^ out contents
+
+    "
+     self new rule23:'HALL'   -> 'HAL'
+     self new rule23:'AABBB'  -> 'AB'
+    "
+!
+
+rule24:key originalKey:originalKey 
+    "NYSIIS step 24: if the first char of the original surname was a vowel, 
+     append it (uppercased) to the code.
+     Answers a copy of key; an empty originalKey leaves the code unchanged
+     (instead of raising an error in 'originalKey first')."
+
+    (originalKey size > 0 and:[ originalKey first isVowel ]) ifTrue:[
+        ^ key , originalKey first asString asUppercase
+    ].
+    ^ key copy
+
+    "
+     self new rule24:'ALBRT' originalKey:'albert'  -> 'ALBRTA'
+     self new rule24:'HAL' originalKey:'hello'     -> 'HAL'
+    "
+!
+
+rule2:key 
+    "NYSIIS step 2: transcode the initial strings MAC => MC and PF => F.
+     The two prefixes exclude each other, so at most one of them applies.
+     Answers a copy of key."
+
+    (key startsWith:'MAC') ifTrue:[
+        ^ 'MC' , (key copyFrom:4)
+    ].
+    (key startsWith:'PF') ifTrue:[
+        "/ dropping the P leaves the F in front
+        ^ key copyFrom:2
+    ].
+    ^ key copy
+
+    "Modified (format): / 02-08-2017 / 14:31:40 / cg"
+!
+
+rule3:key 
+    "NYSIIS step 3: transcode trailing strings as follows:
+        IX => IC
+        EX => EC
+        YE, EE, IE => Y
+        NT, ND => D
+     The groups are applied in this order, each one to the result of the previous."
+
+    |result|
+
+    result := key copy.
+    #( 
+        #( #('IX')           'IC' )
+        #( #('EX')           'EC' )
+        #( #('YE' 'EE' 'IE') 'Y'  )
+        #( #('NT' 'ND')      'D'  )
+    ) do:[:suffixesAndReplacement |
+        result := self 
+                    transcodeTrailing:(suffixesAndReplacement at:1) 
+                    of:result 
+                    to:(suffixesAndReplacement at:2)
+    ].
+    ^ result
+
+    "Modified (format): / 02-08-2017 / 14:32:24 / cg"
+!
+
+rule4:key 
+    "NYSIIS step 4: answer a copy of key in which every 'EV' is replaced by 'EF',
+     except for an 'EV' at the very start of the name"
+
+    |transcoded|
+
+    transcoded := self transcodeAll:'EV' of:key to:'EF' startingAt:2.
+    ^ transcoded
+
+    "Modified (format): / 02-08-2017 / 14:32:35 / cg"
+!
+
+rule5:key 
+    "NYSIIS step 5: use the first character of the name as first character of the key.
+     Nothing to do here: the key is converted in place, so its first character
+     already is the first character of the name. Answers key itself (no copy)."
+
+    ^ key
+
+    "Modified (comment): / 02-08-2017 / 14:32:45 / cg"
+!
+
+rule6:key 
+    "NYSIIS step 6: remove any 'W' that follows a vowel.
+     Answers a new string. A 'W' is dropped if the last character which has been
+     kept is a vowel, so a run of W's after a vowel is removed completely 
+     ('AWW' -> 'A'); a leading 'W' is always kept.
+     Done in a single pass: the previous search loop never advanced behind a 'W'
+     which did not follow a vowel, and therefore did not terminate 
+     for keys like 'SCHWARZ'."
+
+    |out lastKept|
+
+    out := WriteStream on:(String new).
+    key do:[:ch |
+        (ch = $W and:[ lastKept notNil and:[ lastKept isVowel ] ]) ifFalse:[
+            out nextPut:ch.
+            lastKept := ch
+        ]
+    ].
+    ^ out contents
+
+    "
+     self new rule6:'LAWSON'   -> 'LASON'
+     self new rule6:'SCHWARZ'  -> 'SCHWARZ'
+    "
+!
+
+rule7:key 
+    "NYSIIS step 7: answer a copy of key in which every vowel is replaced by 'A'"
+
+    |k|
+
+    k := key copy.
+    1 to:k size do:[:index |
+        (k at:index) isVowel ifTrue:[
+            k at:index put:$A
+        ]
+    ].
+    ^ k
+
+    "Modified: / 02-08-2017 / 14:33:56 / cg"
+!
+
+rule8:key 
+    "NYSIIS step 8: answer a copy of key in which every 'GHT' is replaced by 'GT'"
+
+    |transcoded|
+
+    transcoded := self transcodeAll:'GHT' of:key to:'GT' startingAt:1.
+    ^ transcoded
+
+    "Modified (format): / 02-08-2017 / 14:34:05 / cg"
+!
+
+rule9:key 
+    "NYSIIS step 9: answer a copy of key in which every 'DG' is replaced by 'G'"
+
+    |transcoded|
+
+    transcoded := self transcodeAll:'DG' of:key to:'G' startingAt:1.
+    ^ transcoded
+
+    "Modified (format): / 02-08-2017 / 14:34:15 / cg"
+!
+
+transcodeAll:aString of:key to:replacementString startingAt:start 
+    "Answer a copy of key in which the occurrences of aString at or after 
+     index start are replaced by replacementString.
+     After each replacement the search restarts at start, so a match which
+     is created by a replacement gets replaced as well ('YWW' -> 'YW' -> 'Y').
+     As a consequence, replacementString must not bring aString back into 
+     the searched range, or the loop will not terminate.
+     A start index below 1 (as computed by callers for very short keys)
+     is treated as 1; a start index behind the end finds nothing."
+
+    |k i from|
+
+    k := key copy.
+    from := start max:1.
+    [
+        from <= k size
+        and:[ (i := k indexOfSubCollection:aString startingAt:from) > 0 ]
+    ] whileTrue:[
+        k := (k copyFrom:1 to:i - 1) , replacementString 
+                    , (k copyFrom:i + aString size to:k size)
+    ].
+    ^ k
+!
+
+transcodeTrailing:anArrayOfStrings of:key to:replacementString 
+    "Answer a copy of key in which a trailing occurrence of any of the strings
+     in anArrayOfStrings is replaced by replacementString.
+     The strings are tried in the given order, each against the result of the
+     previous replacement.
+     Implemented as a suffix test, so that keys which are shorter than a suffix
+     do not lead to a search starting at index 0 or below."
+
+    |answer|
+
+    answer := key copy.
+    anArrayOfStrings do:[:eachSuffix | 
+        (answer endsWith:eachSuffix) ifTrue:[
+            answer := (answer copyFrom:1 to:answer size - eachSuffix size) 
+                      , replacementString
+        ]
+    ].
+    ^ answer
+! !
+
+!PhoneticStringUtilities::PhonemStringComparator class methodsFor:'documentation'!
+
+documentation
+"
+    Implementation of the PHONEM algorithm, as described in
+    'Georg Wilde and Carsten Meyer, Doppelgaenger gesucht -
+    Ein Programm fuer kontextsensitive phonetische Textumwandlung
+    ct Magazin fuer Computer & Technik 25/1998'
+    
+    This algorithm deals better with the German language (it cares for umlauts)
+"
+! !
+
+!PhoneticStringUtilities::PhonemStringComparator methodsFor:'api'!
+
+encode:aString 
+    "Answer the PHONEM code of aString (an uppercase string).
+     The code is computed in three steps:
+        1. two-character sequences are replaced, scanning from left to right,
+        2. single characters are mapped to their phonetic representative,
+        3. every character outside 'ABCDLMNORSUVWXY' is dropped and runs
+           of the same character are squashed into a single one."
+
+    |s idx t t2 digraphs|
+
+    "/ pairs of: two-character sequence, its replacement
+    digraphs := #(
+        'SC' 'C'
+        'SZ' 'C'
+        'CZ' 'C'
+        'TZ' 'C'
+        'TS' 'C'
+        'KS' 'X'
+        'PF' 'V'
+        'QU' 'KW'
+        'PH' 'V'
+        'UE' 'Y'
+        'AE' 'E'
+        'OE' 'Ö'
+        'EI' 'AY'
+        'EY' 'AY'
+        'EU' 'OY'
+        'AU' 'A§'
+        'OU' '§ '
+    ).
+
+    s := aString asUppercase.
+
+    idx := 1.
+    "/ idx has to reach (s size - 1); otherwise the last two characters are never looked at
+    [idx < s size] whileTrue:[
+        t := s copyFrom:idx to:idx+1.
+        t2 := nil.
+        1 to:digraphs size by:2 do:[:i |
+            (t2 isNil and:[(digraphs at:i) = t]) ifTrue:[
+                t2 := digraphs at:i+1
+            ]
+        ].
+        t2 notNil ifTrue:[
+            "/ idx is not advanced here: the replacement itself is looked at again
+            s := (s copyTo:idx-1),t2,(s copyFrom:idx+2 to:s size)
+        ] ifFalse:[
+            idx := idx + 1.
+        ].
+    ].
+
+    "/ single character substitutions via tr
+    "/ (both strings have the same size; P is mapped to B, T to D and the placeholder § back to U)
+    s := s copyTransliterating:'ÖÄZKGQÜIJFWPT§' to:'YECCCCYYYVVBDU'.
+    "/ drop everything which is not in the code alphabet
+    s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'' complement:true squashDuplicates:false.
+    "/ squash runs of the same character
+    s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'ABCDLMNORSUVWXY' complement:false squashDuplicates:true.
+    ^ s
+
+    "
+     self basicNew encode:'müller'  -> 'MYLR'    
+     self basicNew encode:'mueller' -> 'MYLR'    
+     self basicNew encode:'möller'  -> 'MYLR'
+     self basicNew encode:'miller'  -> 'MYLR'     
+     self basicNew encode:'muller'  -> 'MULR' 
+     self basicNew encode:'muler'   -> 'MULR' 
+     self basicNew encode:'mei'     -> 'MAY' 
+     self basicNew encode:'mai'     -> 'MAY' 
+
+     self basicNew phoneticStringsFor:'müller'  #('MYLR')    
+     self basicNew phoneticStringsFor:'mueller' #('MYLR')    
+     self basicNew phoneticStringsFor:'möller'  #('MYLR')
+     self basicNew phoneticStringsFor:'miller'  #('MYLR')     
+     self basicNew phoneticStringsFor:'muller'  #('MULR') 
+     self basicNew phoneticStringsFor:'muler'   #('MULR') 
+     
+     self basicNew phoneticStringsFor:'schmidt'     #('CMYD')
+     self basicNew phoneticStringsFor:'schneider'   #('CNAYDR')
+     self basicNew phoneticStringsFor:'fischer'     #('VYCR')
+     self basicNew phoneticStringsFor:'weber'       #('VBR')
+     self basicNew phoneticStringsFor:'weeber'      #('VBR')
+     self basicNew phoneticStringsFor:'webber'      #('VBR')
+     self basicNew phoneticStringsFor:'wepper'      #('VBR')
+     
+     self basicNew phoneticStringsFor:'meyer'       #('MAYR')
+     self basicNew phoneticStringsFor:'maier'       #('MAYR')
+     self basicNew phoneticStringsFor:'mayer'       #('MAYR')
+     self basicNew phoneticStringsFor:'mayr'        #('MAYR')
+     self basicNew phoneticStringsFor:'meir'        #('MAYR')
+     
+     self basicNew phoneticStringsFor:'wagner'      #('VACNR')
+     self basicNew phoneticStringsFor:'schulz'      #('CULC')
+     self basicNew phoneticStringsFor:'becker'      #('BCR')
+     self basicNew phoneticStringsFor:'hoffmann'    #('OVMAN')
+     self basicNew phoneticStringsFor:'haus'        #('AUS')
+     
+     self basicNew phoneticStringsFor:'schäfer'     #('CVR')
+     self basicNew phoneticStringsFor:'scheffer'    #('CVR')
+     self basicNew phoneticStringsFor:'schaeffer'   #('CVR')
+     self basicNew phoneticStringsFor:'schaefer'    #('CVR')
+    "
+
+    "Created: / 28-07-2017 / 15:38:08 / cg"
+! !
+
+!PhoneticStringUtilities::Caverphone2StringComparator class methodsFor:'documentation'!
+
+documentation
+"
+    Caverphone (2) Algorithm:
+
+    see http://caversham.otago.ac.nz/files/working/ctp150804.pdf
+    
+    Caverphone 2.0 is being made available for free use for the benefit of anyone who has a use for it,
+    with the proviso that the Caversham Project at the University of Otago should be acknowledged as the
+    original source (which is hereby done ;-).
+
+    •  Start with a Surname or Firstname
+    •  Convert to lowercase
+        This coding system is case sensitive; implementations should acknowledge that a is not the same as A
+    •  Remove anything not A-Z
+        The main intention of this is to remove spaces, hyphens, and apostrophes.
+        example:  o'brian becomes obrian
+    •  If the name starts with cough make it cou2f
+        2 is being used as a temporary placeholder to indicate a consonant which we are no longer interested in.
+    •  If the name starts with rough make it rou2f
+    •  If the name starts with tough make it tou2f
+    •  If the name starts with enough make it enou2f
+    •  If the name starts with gn make it 2n
+    •  If the name ends with mb make it m2
+    •  replace cq with 2q
+    •  replace ci with si
+    •  replace ce with se
+    •  replace cy with sy
+    •  replace tch with 2ch
+    •  replace c with k
+    •  replace q with k
+    •  replace x with k
+    •  replace v with f
+    •  replace dg with 2g
+    •  replace tio with sio
+    •  replace tia with sia
+    •  replace d with t
+    •  replace ph with fh
+    •  replace b with p
+    •  replace sh with s2
+    •  replace z with s
+    •  replace an initial vowel with an A
+    •  replace all other vowels with a 3
+        3 is a temporary placeholder marking a vowel
+    •  replace 3gh3 with 3kh3
+        Exceptions are dealt with before the general case. gh between vowels is an exception to the more general gh rule.
+    •  replace gh with 22
+    •  replace g with k
+    •  replace groups of the letter s with a S
+        Continuous strings of s are replaced by a single S
+    •  replace groups of the letter t with a T
+    •  replace groups of the letter p with a P
+    •  replace groups of the letter k with a K
+    •  replace groups of the letter f with a F
+    •  replace groups of the letter m with a M
+    •  replace groups of the letter n with a N
+    •  replace w3 with W3
+    •  replace wy with Wy
+    •  replace wh3 with Wh3
+    •  replace why with Why
+    •  replace w with 2
+    •  replace an initial h with an A
+    •  replace all other occurrences of h with a 2
+    •  replace r3 with R3
+    •  replace ry with Ry
+    •  replace r with 2
+    •  replace l3 with L3
+    •  replace ly with Ly
+    •  replace l with 2
+    •  replace j with y
+    •  replace y3 with Y3
+    •  replace y with 2
+    •  remove all 2s
+    •  remove all 3s
+    •  put six (v1) / ten (v2) 1s on the end
+    •  take the first six characters as the code (caverphone 1);
+       / take the first ten characters as the code (caverphone 2);
+
+     self new encode:'david'      -> 'TFT1111111'
+     self new encode:'whittle'    -> 'WTA1111111'
+
+     self new encode:'Stevenson'  -> 'STFNSN1111'
+     self new encode:'Peter'      -> 'PTA1111111'
+
+     self new encode:'washington' -> 'WSNKTN1111'
+     self new encode:'lee'        -> 'LA11111111'
+     self new encode:'Gutierrez'  -> 'KTRS111111'
+     self new encode:'Pfister'    -> 'PFSTA11111'
+     self new encode:'Jackson'    -> 'YKSN111111'
+     self new encode:'Tymczak'    -> 'TMKSK11111'
+
+     self new encode:'add'        -> 'AT11111111'
+     self new encode:'aid'        -> 'AT11111111'
+     self new encode:'at'         -> 'AT11111111'
+     self new encode:'art'        -> 'AT11111111'
+     self new encode:'earth'      -> 'AT11111111'
+     self new encode:'head'       -> 'AT11111111'
+     self new encode:'old'        -> 'AT11111111'
+
+     self new encode:'ready'      -> 'RTA1111111'
+     self new encode:'rather'     -> 'RTA1111111'
+     self new encode:'able'       -> 'APA1111111'
+     self new encode:'appear'     -> 'APA1111111'
+
+     self new encode:'Deedee'     -> 'TTA1111111'
+"
+! !
+
+!PhoneticStringUtilities::Caverphone2StringComparator methodsFor:'api'!
+
+encode:word 
+    "Answer the Caverphone 2 code of word; this is always a string of ten characters,
+     padded with the character 1 at the end ('1111111111' for an empty word).
+
+     The word is converted to lowercase and reduced to the letters a-z.
+     Then the rules of the table below are applied in the given order.
+     Each rule consists of three elements:
+        oldSeq  - the sequence to look for; a leading ^ anchors it at the beginning,
+                  a trailing $ anchors it at the end of the text
+        newSeq  - its replacement
+        repeat  - true, if every occurrence is to be replaced; false for the first one only
+     In the intermediate text, 2 marks a consonant which is no longer of interest
+     and 3 marks a vowel; uppercase letters are the characters of the final code."
+
+    |txt|
+
+    word size == 0 ifTrue:[^ '1111111111' ].
+    
+    "/ 1. Convert to lowercase
+    txt := word asLowercase.
+
+    "/ 2. Remove anything not a-z
+    "/    (a test for isLetter would also keep letters outside a-z, which no rule handles)
+    txt := txt select:[:ch | ch between:$a and:$z].
+
+    #(
+    "/  oldSeq newSeq repeat
+
+    "/ 2.5. Remove final e
+        'e$' '' false
+    "/ 3. Handle various start options
+        '^cough' 'cou2f' false
+        '^rough' 'rou2f' false
+        '^tough' 'tou2f' false
+        '^enough' 'enou2f' false
+        '^trough' 'trou2f' false
+
+        '^gn' '2n' false
+        'mb$' 'm2' false
+        
+    "/ 4. Handle replacements
+        'cq' '2q' true
+        'ci' 'si' true
+        'ce' 'se' true
+        'cy' 'sy' true
+        'tch' '2ch' true
+        'c' 'k' true
+        'q' 'k' true
+        'x' 'k' true
+        'v' 'f' true
+        'dg' '2g' true
+        'tio' 'sio' true
+        'tia' 'sia' true
+        'd' 't' true
+        'ph' 'fh' true
+        'b' 'p' true
+        'sh' 's2' true
+        'z' 's' true
+        
+        '^a' 'A' false
+        '^e' 'A' false
+        '^i' 'A' false
+        '^o' 'A' false
+        '^u' 'A' false
+        
+        'a' '3' true
+        'e' '3' true
+        'i' '3' true
+        'o' '3' true
+        'u' '3' true
+        'j' 'y' true 
+        
+        '^y3' 'Y3' false 
+        '^y' 'A' false
+
+        'y' '3'  true
+        '3gh3' '3kh3' true
+        'gh' '22' true
+        'g' 'k' true
+        's'  'S' true
+        'SS' 'S' true
+        't'  'T' true
+        'TT' 'T' true
+        'p'  'P' true
+        'PP' 'P' true
+        'k'  'K' true
+        'KK' 'K' true
+        'f'  'F' true
+        'FF' 'F' true
+        'm'  'M' true
+        'MM' 'M' true
+        'n'  'N' true
+        'NN' 'N' true
+        'w3' 'W3' true
+        'wh3' 'Wh3' true
+        'w$' '3'  false
+        'w' '2' true
+        '^h' 'A' false
+        'h' '2' true
+        'r3' 'R3' true
+        'r$' '3'  false
+        'r' '2' true
+        'l3' 'L3' true
+        'l$' '3' false
+        'l' '2' true
+
+    "/ 5. removals
+
+        '2' '' true
+        '3$' 'A' true
+        '3' '' true
+    ) inGroupsOf:3 do:[:pat :repl :repeat|
+        |s sizeBefore|
+
+        (pat startsWith:$^) ifTrue:[
+            "/ anchored at the beginning
+            s := pat copyButFirst.
+            repeat ifTrue:[
+                [txt startsWith:s] whileTrue:[ txt := repl,(txt copyButFirst:s size) ]
+            ] ifFalse:[
+                (txt startsWith:s) ifTrue:[ txt := repl,(txt copyButFirst:s size) ]
+            ].    
+        ] ifFalse:[
+            (pat endsWith:$$) ifTrue:[
+                "/ anchored at the end
+                s := pat copyButLast.
+                repeat ifTrue:[
+                    [txt endsWith:s] whileTrue:[ txt := (txt copyButLast:s size),repl ]
+                ] ifFalse:[
+                    (txt endsWith:s) ifTrue:[ txt := (txt copyButLast:s size),repl ]
+                ]
+            ] ifFalse:[
+                repeat ifTrue:[
+                    "/ one pass replaces non-overlapping occurrences only, which would
+                    "/ leave 'SS' behind for 'SSS'; therefore, a pass which made the text
+                    "/ shorter is followed by another one, until nothing changes any more.
+                    "/ Rules which keep the size are applied exactly once.
+                    [
+                        sizeBefore := txt size.
+                        txt := txt copyReplaceAllSubcollections:pat with:repl.
+                        txt size < sizeBefore
+                    ] whileTrue
+                ] ifFalse:[
+                    txt := txt copyReplaceSubcollection:pat with:repl
+                ]    
+            ]    
+        ].
+    ].    
+
+    "/ 6. put ten 1s on the end
+    txt := txt,'1111111111'.
+    
+    "/ 7. take the first ten characters as the code
+    ^ txt copyTo:10
+
+    "
+     self new encode:'david'      -> 'TFT1111111'
+     self new encode:'whittle'    -> 'WTA1111111'
+
+     self new encode:'Stevenson'  -> 'STFNSN1111'
+     self new encode:'Peter'      -> 'PTA1111111'
+
+     self new encode:'washington' -> 'WSNKTN1111'
+     self new encode:'lee'        -> 'LA11111111'
+     self new encode:'Gutierrez'  -> 'KTRS111111'
+     self new encode:'Pfister'    -> 'PFSTA11111'
+     self new encode:'Jackson'    -> 'YKSN111111'
+     self new encode:'Tymczak'    -> 'TMKSK11111'
+
+     self new encode:'add'        -> 'AT11111111'
+     self new encode:'aid'        -> 'AT11111111'
+     self new encode:'at'         -> 'AT11111111'
+     self new encode:'art'        -> 'AT11111111'
+     self new encode:'earth'      -> 'AT11111111'
+     self new encode:'head'       -> 'AT11111111'
+     self new encode:'old'        -> 'AT11111111'
+
+     self new encode:'ready'      -> 'RTA1111111'
+     self new encode:'rather'     -> 'RTA1111111'
+     self new encode:'able'       -> 'APA1111111'
+     self new encode:'appear'     -> 'APA1111111'
+
+     self new encode:'Deedee'     -> 'TTA1111111'
+
+     self new encode:'mattt'      -> 'MT11111111'
+    "
+
+    "Created: / 28-07-2017 / 15:21:23 / cg"
+    "Modified: / 02-08-2017 / 01:42:35 / cg"
+! !
+
 !PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator class methodsFor:'documentation'!
 
 documentation
@@ -3531,19 +5359,19 @@
      self new encode:'Tymczak'    -> 'T522'
 
     notice:
-     MiracodeStringComparator new 
-                    encode:'Ashcraft' -> 'A261'
-     SoundexStringComparator 
-                new encode:'Ashcraft' -> 'A226'
+     MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
+     SoundexStringComparator new encode:'Ashcraft'  -> 'A226'
 
     see also:            
         https://www.archives.gov/research/census/soundex.html
 "
 ! !
 
-!PhoneticStringUtilities::MiracodeStringComparator methodsFor:'api'!
+!PhoneticStringUtilities::MiracodeStringComparator methodsFor:'private'!
 
 encode:word 
+    "same as inherited, but cares for W and H"
+    
     |u p t prevCode|
 
     u := word asUppercase.
@@ -3566,22 +5394,8 @@
     ].
     ^ (p copyFrom:1 to:4)
 
-    "
-     self new encode:'washington' -> 'W252'
-     self new encode:'lee'        -> 'L000'
-     self new encode:'Gutierrez'  -> 'G362'
-     self new encode:'Pfister'    -> 'P236'
-     self new encode:'Jackson'    -> 'J250'
-     self new encode:'Tymczak'    -> 'T522'
-    "
-
-    "notice:
-     MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
-     self new encode:'Ashcraft'   -> 'A226'
-    "
-
-    "Created: / 28-07-2017 / 15:23:16 / cg"
-    "Modified (comment): / 01-08-2017 / 19:01:51 / cg"
+    "Created: / 02-08-2017 / 00:19:47 / cg"
+    "Modified (comment): / 02-08-2017 / 14:30:47 / cg"
 ! !
 
 !PhoneticStringUtilities::SpanishPhoneticCodeStringComparator class methodsFor:'documentation'!
author	Claus Gittinger <cg@exept.de>
	Wed, 02 Aug 2017 14:37:29 +0200
changeset 4491	d6c31bb1e928
parent 4490	33b5fbfc4b5d
child 4492	05def04efc34