hg/stx-libbasic2: PhoneticStringUtilities.st@7355a4b11cb6


"{ Encoding: utf8 }"

"
 COPYRIGHT (c) 1994 by Claus Gittinger
 COPYRIGHT (c) 2009 by eXept Software AG
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
"{ Package: 'stx:libbasic2' }"

"{ NameSpace: Smalltalk }"

Object subclass:#PhoneticStringUtilities
	instanceVariableNames:''
	classVariableNames:''
	poolDictionaries:''
	category:'Collections-Text-Support'
!

Object subclass:#PhoneticStringComparator
	instanceVariableNames:''
	classVariableNames:''
	poolDictionaries:''
	privateIn:PhoneticStringUtilities
!

PhoneticStringUtilities::PhoneticStringComparator subclass:#DaitchMokotoffStringComparator
	instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex
		currentIndex skipCount'
	classVariableNames:''
	poolDictionaries:''
	privateIn:PhoneticStringUtilities
!

PhoneticStringUtilities::PhoneticStringComparator subclass:#DoubleMetaphoneStringComparator
	instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex
		currentIndex skipCount'
	classVariableNames:''
	poolDictionaries:''
	privateIn:PhoneticStringUtilities
!

PhoneticStringUtilities::PhoneticStringComparator subclass:#ExtendedSoundexStringComparator
	instanceVariableNames:''
	classVariableNames:'CharacterTranslationDict'
	poolDictionaries:''
	privateIn:PhoneticStringUtilities
!

PhoneticStringUtilities::PhoneticStringComparator subclass:#SingleResultPhoneticStringComparator
	instanceVariableNames:''
	classVariableNames:''
	poolDictionaries:''
	privateIn:PhoneticStringUtilities
!

PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#MRAStringComparator
	instanceVariableNames:''
	classVariableNames:'CharacterTranslationDict'
	poolDictionaries:''
	privateIn:PhoneticStringUtilities
!

PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#MetaphoneStringComparator
	instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex
		currentIndex skipCount'
	classVariableNames:''
	poolDictionaries:''
	privateIn:PhoneticStringUtilities
!

PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#SoundexStringComparator
	instanceVariableNames:''
	classVariableNames:'CharacterTranslationDict'
	poolDictionaries:''
	privateIn:PhoneticStringUtilities
!

PhoneticStringUtilities::SoundexStringComparator subclass:#MySQLSoundexStringComparator
	instanceVariableNames:''
	classVariableNames:''
	poolDictionaries:''
	privateIn:PhoneticStringUtilities
!

PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#NYSIISStringComparator
	instanceVariableNames:''
	classVariableNames:''
	poolDictionaries:''
	privateIn:PhoneticStringUtilities
!

PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#PhonemStringComparator
	instanceVariableNames:''
	classVariableNames:'CharacterTranslationDict'
	poolDictionaries:''
	privateIn:PhoneticStringUtilities
!

PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#Caverphone2StringComparator
	instanceVariableNames:''
	classVariableNames:'CharacterTranslationDict'
	poolDictionaries:''
	privateIn:PhoneticStringUtilities
!

PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#KoelnerPhoneticCodeStringComparator
	instanceVariableNames:''
	classVariableNames:'CharacterTranslationDict'
	poolDictionaries:''
	privateIn:PhoneticStringUtilities
!

PhoneticStringUtilities::SoundexStringComparator subclass:#MiracodeStringComparator
	instanceVariableNames:''
	classVariableNames:''
	poolDictionaries:''
	privateIn:PhoneticStringUtilities
!

PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#SpanishPhoneticCodeStringComparator
	instanceVariableNames:''
	classVariableNames:'CharacterTranslationDict'
	poolDictionaries:''
	privateIn:PhoneticStringUtilities
!

!PhoneticStringUtilities class methodsFor:'documentation'!

copyright
"
 COPYRIGHT (c) 1994 by Claus Gittinger
 COPYRIGHT (c) 2009 by eXept Software AG
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
!

documentation
"
    Utilities which are helpful to perform phonetic string searches or comparisons.
    These are all variations or improvements of the soundex algorithm, which usually fails
    to provide good results for non-english languages.
    
    soundexCode
        this algorithm was originally contained in the CharacterArray class;

    nysiis
        a modified soundex algorithm

    miracode
        another modified soundex algorithm ('american soundex') used in the 1880 census.

    mySQLSoundex
        another modified soundex algorithm used in mySQL.

    koelner phoneticCode 
        provides a functionality similar to soundex, but much more tuned towards the German language

    Double metaphone 
        works with most european languages.

    phonem
        described in Georg Wilde and Carsten Meyer, 'Doppelgaenger gesucht - Ein Programm fuer kontextsensitive phonetische Textumwandlung'
        from 'ct Magazin fuer Computer & Technik 25/1999'.

    mra
        Match Rating Approach Phonetic Algorithm Developed by Western Airlines in 1977.

    caverphone2
        better than soundex

    spanish phonetic code
        an algorithm slightly adjusted to spanish names

    More info for german readers is found in:
        http://www.uni-koeln.de/phil-fak/phonetik/Lehre/MA-Arbeiten/magister_wilz.pdf
"
!

sampleData
"
    for the 50 most common german names, we get:

                            ext. 
    name        soundex   soundex   metaphone   phonet  phonet2     phonix      daitsch phonem      koeln  caverphone2  mra

    müller      M460    54600000    MLR         MÜLA    NILA        M4000000    689000  MYLR        657    MLA1111111   MLR
    schmidt     S530    25300000    SKMTT       SHMIT   ZNIT        S5300000    463000  CMYD        862    SKMT111111   SCHMDT
    schneider   S536    25360000    SKNTR       SHNEIDA ZNEITA      S5300000    463900  CNAYDR      8627   SKNTA11111   SCHNDR
    fischer     F260    12600000    FSKR        FISHA   FIZA        F8000000    749000  VYCR        387    FSKA111111   FSCHR
    weber       W160    16000000    WBR         WEBA    FEBA        $1000000    779000  VBR         317    WPA1111111   WBR
    meyer       M600    56000000    MYR         MEIA    NEIA        M0000000    619000  MAYR        67     MA11111111   MYR
    wagner      W256    25600000    WKNR        WAKNA   FAKNA       $2500000    756900  VACNR       3467   WKNA111111   WGNR
    schulz      S420    24200000    SKLS        SHULS   ZULZ        S4800000    484000  CULC        858    SKS1111111   SCHLZ
    becker      B260    12600000    BKR         BEKA    BEKA        B2000000    759000  BCR         147    PKA1111111   BCKR
    hoffmann    H155    15500000    HFMN        HOFMAN  UFNAN       $7550000    576600  OVMAN       036    AFMN111111   HFMN
    schäfer     S16ß    21600000    SKFR        SHEFA   ZEFA        S7000000    479000  CVR         837    SKFA111111   SCHFR

    |cls|
    
    cls := MRAStringComparator.
    cls := SoundexStringComparator.
    cls := KoelnerPhoneticCodeStringComparator.
    cls := Caverphone2StringComparator.
    #('müller' 'schmidt' 'schneider' 'fischer' 'weber' 'meyer' 
      'wagner' 'schulz'  'becker'    'hoffmann' 'schäfer')
    do:[:name |
        Transcript show:''''; show:name; show:''' -> '''; show:(cls encode:name); showCR:''''.
    ].

    KoelnerPhoneticCodeStringComparator encode:'Müller-Lüdenscheidt'  -> '65752682'
"
! !

!PhoneticStringUtilities class methodsFor:'phonetic codes'!

koelnerPhoneticCodeOf:aString
    "return a koelner phonetic code.
     The koelnerPhonetic code is for the german language what the soundex code is for english;
     it returns simular strings for similar sounding words. 
     There are some differences to soundex, though: 
        its length is not limited to 4, but depends on the length of the original string;
        it does not start with the first character of the input.
     This algorithm is described by Postel 1969"

    ^ (KoelnerPhoneticCodeStringComparator new phoneticStringsFor:aString) first

    "
     #(
        'Müller'
        'Miller'
        'Mueller'
        'Mühler'
        'Mühlherr'
        'Mülherr'
        'Myler'
        'Millar'
        'Myller'
        'Müllar'
        'Müler'
        'Muehler'
        'Mülller'
        'Müllerr'
        'Muehlherr'
        'Muellar'
        'Mueler'
        'Mülleer'
        'Mueller'
        'Nüller'
        'Nyller'
        'Niler'
        'Czerny'
        'Tscherny'
        'Czernie'
        'Tschernie'
        'Schernie'
        'Scherny'
        'Scherno'
        'Czerne'
        'Zerny'
        'Tzernie'
        'Breschnew'
     ) do:[:w |
         Transcript show:w; show:'->'; showCR:(PhoneticStringUtilities koelnerPhoneticCodeOf:w)
     ].
    "

    "
     PhoneticStringUtilities koelnerPhoneticCodeOf:'Breschnew'. '17863'.
     PhoneticStringUtilities koelnerPhoneticCodeOf:'Breschneff'. '17863'.
     PhoneticStringUtilities koelnerPhoneticCodeOf:'Braeschneff'. '17863'.
     PhoneticStringUtilities koelnerPhoneticCodeOf:'Braessneff'. '17863'.
     PhoneticStringUtilities koelnerPhoneticCodeOf:'Pressneff'. '17863'.
     PhoneticStringUtilities koelnerPhoneticCodeOf:'Presznäph'. '17863'.
     PhoneticStringUtilities koelnerPhoneticCodeOf:'Preschnjiev'. '17863'.
    "
!

miracodeCodeOf:aString
    "return a miracode soundex phonetic code or nil.
     Miracode is a slightly modified soundex algorithm.
     Notice that there are better algorithms around (doubleMetaphone) "

    ^ (MiracodeStringComparator new phoneticStringsFor:aString) first

    "
     PhoneticStringUtilities miracodeCodeOf:'claus'   
     PhoneticStringUtilities miracodeCodeOf:'clause'   
     PhoneticStringUtilities miracodeCodeOf:'close'   
     PhoneticStringUtilities miracodeCodeOf:'smalltalk' 
     PhoneticStringUtilities miracodeCodeOf:'smaltalk'  
     PhoneticStringUtilities miracodeCodeOf:'smaltak'   
     PhoneticStringUtilities miracodeCodeOf:'smaltok'   
     PhoneticStringUtilities miracodeCodeOf:'smoltok'   
     PhoneticStringUtilities miracodeCodeOf:'aa'        
     PhoneticStringUtilities miracodeCodeOf:'by'        
     PhoneticStringUtilities miracodeCodeOf:'bab'       
     PhoneticStringUtilities miracodeCodeOf:'bob'       
     PhoneticStringUtilities miracodeCodeOf:'bop'       
     PhoneticStringUtilities miracodeCodeOf:'pub'       
    "

    "Created: / 28-07-2017 / 15:32:41 / cg"
!

mySQLSoundexCodeOf:aString
    "return the mySQL soundex code. The mysql soundex coed is different from the miracode 'american' soundex
     (no 4char limitation; different order of duplicate vowel vs. duplicate code elimination).
     Notice that there are better algorithms around (doubleMetaphone) "

    ^ (MySQLSoundexStringComparator new phoneticStringsFor:aString) first

    "
     #(
        'Müller'
        'Miller'
        'Mueller'
        'Mühler'
        'Mühlherr'
        'Mülherr'
        'Myler'
        'Millar'
        'Myller'
        'Müllar'
        'Müler'
        'Muehler'
        'Mülller'
        'Müllerr'
        'Muehlherr'
        'Muellar'
        'Mueler'
        'Mülleer'
        'Mueller'
        'Nüller'
        'Nyller'
        'Niler'
        'Czerny'
        'Tscherny'
        'Czernie'
        'Tschernie'
        'Schernie'
        'Scherny'
        'Scherno'
        'Czerne'
        'Zerny'
        'Tzernie'
        'Breschnew'
     ) do:[:w |
         Transcript show:w; show:'->'; showCR:(PhoneticStringUtilities mySQLSoundexCodeOf:w)
     ].
    "

    "
     PhoneticStringUtilities mySQLSoundexCodeOf:'Breschnew'. 
     PhoneticStringUtilities mySQLSoundexCodeOf:'Breschneff'. 
     PhoneticStringUtilities mySQLSoundexCodeOf:'Braeschneff'. 
     PhoneticStringUtilities mySQLSoundexCodeOf:'Braessneff'.
     PhoneticStringUtilities mySQLSoundexCodeOf:'Pressneff'. 
     PhoneticStringUtilities mySQLSoundexCodeOf:'Presznäph'. 
     PhoneticStringUtilities mySQLSoundexCodeOf:'Preschnjiev'.
    "

    "Modified (comment): / 28-07-2017 / 15:34:03 / cg"
!

soundexCodeOf:aString
    "return a soundex phonetic code or nil.
     Soundex (1918, 1922) returns similar codes for similar sounding words, making it a useful
     tool when searching for words where the correct spelling is unknown.
     (read Knuth or search the web if you don't know what a soundex code is).
     Caveat: 'similar sounding words' means: 'similar sounding in english'.
     Notice that there are better algorithms around (doubleMetaphone) "

    ^ (SoundexStringComparator new phoneticStringsFor:aString) first

"/ old code - now use code in private class...
"/    |inStream codeStream ch last lch codeLength codes code lastCode|
"/
"/    inStream := aString readStream.
"/    inStream skipSeparators.
"/    inStream atEnd ifTrue:[
"/        ^ nil
"/    ].
"/
"/    ch := inStream next.
"/    ch isLetter ifFalse:[
"/        ^ nil
"/    ].
"/    codeLength := 0.
"/
"/    codes := Dictionary new.
"/    codes atAll:'bpfv'     put:$1.
"/    codes atAll:'cskgjqxz' put:$2.
"/    codes atAll:'dt'       put:$3.
"/    codes atAll:'l'        put:$4.
"/    codes atAll:'nm'       put:$5.
"/    codes atAll:'r'        put:$6.
"/
"/    codeStream := WriteStream on:(String new:4).
"/    codeStream nextPut:(ch asUppercase).
"/    last := ch asLowercase.
"/    lastCode := codes at:last ifAbsent:nil.
"/
"/    [inStream atEnd] whileFalse:[
"/        ch := inStream next.
"/        lch := ch asLowercase.
"/        lch = last ifFalse:[
"/            last := lch.
"/
"/            code := codes at:lch ifAbsent:nil.
"/            (code notNil and:[ code ~= lastCode]) ifTrue:[
"/                codeLength < 3 ifTrue:[
"/                    codeStream nextPut:code.
"/                    codeLength := codeLength + 1.
"/                    codeLength > 3 ifTrue:[^ codeStream contents].
"/                ].
"/            ].
"/            lastCode := code.
"/        ]
"/    ].
"/    [ codeLength < 3 ] whileTrue:[
"/        codeStream nextPut:$0.
"/        codeLength := codeLength + 1.
"/    ].
"/
"/    ^ codeStream contents

    "
     PhoneticStringUtilities soundexCodeOf:'claus'   
     PhoneticStringUtilities soundexCodeOf:'clause'   
     PhoneticStringUtilities soundexCodeOf:'close'   
     PhoneticStringUtilities soundexCodeOf:'smalltalk' 
     PhoneticStringUtilities soundexCodeOf:'smaltalk'  
     PhoneticStringUtilities soundexCodeOf:'smaltak'   
     PhoneticStringUtilities soundexCodeOf:'smaltok'   
     PhoneticStringUtilities soundexCodeOf:'smoltok'   
     PhoneticStringUtilities soundexCodeOf:'aa'        
     PhoneticStringUtilities soundexCodeOf:'by'        
     PhoneticStringUtilities soundexCodeOf:'bab'       
     PhoneticStringUtilities soundexCodeOf:'bob'       
     PhoneticStringUtilities soundexCodeOf:'bop'       
    "

    "Modified (comment): / 28-07-2017 / 15:33:53 / cg"
! !

!PhoneticStringUtilities class methodsFor:'queries'!

isUtilityClass
    ^ self == PhoneticStringUtilities
! !

!PhoneticStringUtilities::PhoneticStringComparator class methodsFor:'constant'!

defaultClass
	^SoundexStringComparator
! !

!PhoneticStringUtilities::PhoneticStringComparator class methodsFor:'documentation'!

documentation
"
    abstract superclass for various phonetic comparators.
    They returns similar strings for similar sounding words, which can be used
    to find similar sounding words in a search list.
    
    Notice, that some comparators are better for particular languages.
"
!

examples
"
     PhoneticStringUtilities::SoundexStringComparator new
            does:'miller' soundLike:'miler'.   

     PhoneticStringUtilities::SoundexStringComparator new
            does:'miller' soundLike:'milner'.   

     PhoneticStringUtilities::SoundexStringComparator new
            does:'müller' soundLike:'mueller'.   

     PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new
            does:'müller' soundLike:'mueller'.   
"
! !

!PhoneticStringUtilities::PhoneticStringComparator class methodsFor:'instance creation'!

new
    ^ self basicNew initialize.
! !

!PhoneticStringUtilities::PhoneticStringComparator class methodsFor:'queries'!

isAbstract
    ^ self == PhoneticStringUtilities::PhoneticStringComparator
! !

!PhoneticStringUtilities::PhoneticStringComparator class methodsFor:'utilities'!

does:aString soundLike:anotherString 
    "return true, if aString sounds similar to anotherString"

    ^ self new does:aString soundLike:anotherString.

    "
     PhoneticStringUtilities::SoundexStringComparator does:'miller' soundLike:'miler'.   

     PhoneticStringUtilities::SoundexStringComparator does:'miller' soundLike:'milner'.   

     PhoneticStringUtilities::SoundexStringComparator does:'müller' soundLike:'mueller'.   

     PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator does:'müller' soundLike:'mueller'.   
     PhoneticStringUtilities::DoubleMetaphoneStringComparator does:'müller' soundLike:'mueller'.   
    "
!

encode:word
    "return a phonetic encoding for a word.
     This can eg. be used as key to map/hash similar sounding words"

    ^ (self new phoneticStringsFor:word) first

    "
     SoundexStringComparator encode:'Fischer'             -> 'F260'
     SoundexStringComparator encode:'Fiescher'            -> 'F260'
     Caverphone2StringComparator encode:'Fischer'         -> 'FSKA111111'
     Caverphone2StringComparator encode:'Fiescher'        -> 'FSKA111111'
     MRAStringComparator encode:'Fischer'                 -> 'FSCHR'
     MRAStringComparator encode:'Fiescher'                -> 'FSCHR'
     SpanishPhoneticCodeStringComparator encode:'Fischer'  -> '24429'
     SpanishPhoneticCodeStringComparator encode:'Fiescher' -> '24429'
     DoubleMetaphoneStringComparator encode:'Fischer'      -> 'FXR'
     DoubleMetaphoneStringComparator encode:'Fiescher'     -> 'FXR'
    "

    "Created: / 02-08-2017 / 01:15:50 / cg"
! !

!PhoneticStringUtilities::PhoneticStringComparator methodsFor:'api'!

does:aString soundLike:anotherString 
    "return true, if aString sounds similar to anotherString"

    |translations1 translations2|

    translations1 := self phoneticStringsFor:aString.    
    translations2 := self phoneticStringsFor:anotherString.  

    ^ translations1 contains:[:t1 | 
        translations2 contains:[:t2 | t1 = t2]
    ]

    "
     PhoneticStringUtilities::SoundexStringComparator new does:'miller' soundLike:'miler'.   
            
     PhoneticStringUtilities::SoundexStringComparator new
            does:'miller' soundLike:'milner'.   

     PhoneticStringUtilities::SoundexStringComparator new
            does:'müller' soundLike:'mueller'.   

     PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new
            does:'müller' soundLike:'mueller'.   
    "

    "Modified (comment): / 13-07-2017 / 17:51:43 / cg"
!

phoneticStringsFor: aString
    "Should answer an array of alternate phonetic strings for the given input string."

    self subclassResponsibility

    "
     (PhoneticStringUtilities::SoundexStringComparator new
            phoneticStringsFor:'miller') first 
            
     'miller' asSoundexCode 
    "

    "Modified (comment): / 27-07-2017 / 15:07:59 / cg"
! !

!PhoneticStringUtilities::PhoneticStringComparator methodsFor:'initialization'!

initialize
    "Invoked when a new instance is created."

    "/ please change as required (and remove this comment)

    "/ super initialize.   -- commented since inherited method does nothing
! !

!PhoneticStringUtilities::DaitchMokotoffStringComparator class methodsFor:'documentation'!

documentation
"
    self encode:'AUERBACH' -> 097400, 097500

    Encodes a string into a Daitch-Mokotoff Soundex value.
    The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, 
    yielding greater accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation 
    but differences in spelling.

    The main differences compared to the other soundex variants are:
        - coded names are 6 digits long
        - the initial character of the name is coded
        - rules to encoded multi-character n-grams
        - multiple possible encodings for the same name (branching)

    This implementation supports branching, depending on the used method:
        encode:aString            - branching disabled, only the first code will be returned
        phoneticStringsFor:String - branching enabled, all codes will be returned, separated by '|'

    [see also:]
        'Wikipedia - Daitch-Mokotoff Soundex'
            http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex 

        'Avotaynu - Soundexing and Genealogy'    
            http://www.avotaynu.com/soundex.htm
"
!

javaCode
"<<END
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.codec.language;

import org.apache.commons.codec.CharEncoding;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;

import java.io.InputStream;
import java.util.*;

/**
 * Encodes a string into a Daitch-Mokotoff Soundex value.
 * <p>
 * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
 * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
 * </p>
 * <p>
 * The main differences compared to the other soundex variants are:
 * </p>
 * <ul>
 * <li>coded names are 6 digits long
 * <li>the initial character of the name is coded
 * <li>rules to encoded multi-character n-grams
 * <li>multiple possible encodings for the same name (branching)
 * </ul>
 * <p>
 * This implementation supports branching, depending on the used method:
 * <ul>
 * <li>{@link #encode(String)} - branching disabled, only the first code will be returned
 * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
 * </ul>
 * <p>
 * Note: this implementation has additional branching rules compared to the original description of the algorithm. The
 * rules can be customized by overriding the default rules contained in the resource file
 * {@code org/apache/commons/codec/language/dmrules.txt}.
 * </p>
 * <p>
 * This class is thread-safe.
 * </p>
 *
 * @see Soundex
 * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
 * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
 *
 * @version $Id$
 * @since 1.10
 */
public class DaitchMokotoffSoundex implements StringEncoder {

    /**
     * Inner class representing a branch during DM soundex encoding.
     */
    private static final class Branch {
        private final StringBuilder builder;
        private String cachedString;
        private String lastReplacement;

        private Branch() {
            builder = new StringBuilder();
            lastReplacement = null;
            cachedString = null;
        }

        /**
         * Creates a new branch, identical to this branch.
         *
         * @return a new, identical branch
         */
        public Branch createBranch() {
            final Branch branch = new Branch();
            branch.builder.append(toString());
            branch.lastReplacement = this.lastReplacement;
            return branch;
        }

        @Override
        public boolean equals(final Object other) {
            if (this == other) {
                return true;
            }
            if (!!(other instanceof Branch)) {
                return false;
            }

            return toString().equals(((Branch) other).toString());
        }

        /**
         * Finish this branch by appending '0's until the maximum code length has been reached.
         */
        public void finish() {
            while (builder.length() < MAX_LENGTH) {
                builder.append('0');
                cachedString = null;
            }
        }

        @Override
        public int hashCode() {
            return toString().hashCode();
        }

        /**
         * Process the next replacement to be added to this branch.
         *
         * @param replacement
         *            the next replacement to append
         * @param forceAppend
         *            indicates if the default processing shall be overridden
         */
        public void processNextReplacement(final String replacement, final boolean forceAppend) {
            final boolean append = lastReplacement == null || !!lastReplacement.endsWith(replacement) || forceAppend;

            if (append && builder.length() < MAX_LENGTH) {
                builder.append(replacement);
                // remove all characters after the maximum length
                if (builder.length() > MAX_LENGTH) {
                    builder.delete(MAX_LENGTH, builder.length());
                }
                cachedString = null;
            }

            lastReplacement = replacement;
        }

        @Override
        public String toString() {
            if (cachedString == null) {
                cachedString = builder.toString();
            }
            return cachedString;
        }
    }

    /**
     * Inner class for storing rules.
     */
    private static final class Rule {
        private final String pattern;
        private final String[] replacementAtStart;
        private final String[] replacementBeforeVowel;
        private final String[] replacementDefault;

        protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
                final String replacementDefault) {
            this.pattern = pattern;
            this.replacementAtStart = replacementAtStart.split("\\|");
            this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
            this.replacementDefault = replacementDefault.split("\\|");
        }

        public int getPatternLength() {
            return pattern.length();
        }

        public String[] getReplacements(final String context, final boolean atStart) {
            if (atStart) {
                return replacementAtStart;
            }

            final int nextIndex = getPatternLength();
            final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false;
            if (nextCharIsVowel) {
                return replacementBeforeVowel;
            }

            return replacementDefault;
        }

        private boolean isVowel(final char ch) {
            return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
        }

        public boolean matches(final String context) {
            return context.startsWith(pattern);
        }

        @Override
        public String toString() {
            return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
                    Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
        }
    }

    private static final String COMMENT = "//";
    private static final String DOUBLE_QUOTE = "\"";

    private static final String MULTILINE_COMMENT_END = "*/";

    private static final String MULTILINE_COMMENT_START = "/*";

    /** The resource file containing the replacement and folding rules */
    private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";

    /** The code length of a DM soundex value. */
    private static final int MAX_LENGTH = 6;

    /** Transformation rules indexed by the first character of their pattern. */
    private static final Map<Character, List<Rule>> RULES = new HashMap<Character, List<Rule>>();

    /** Folding rules. */
    private static final Map<Character, Character> FOLDINGS = new HashMap<Character, Character>();

    static {
        final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE);
        if (rulesIS == null) {
            throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE);
        }

        final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8);
        parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
        scanner.close();

        // sort RULES by pattern length in descending order
        for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) {
            final List<Rule> ruleList = rule.getValue();
            Collections.sort(ruleList, new Comparator<Rule>() {
                @Override
                public int compare(final Rule rule1, final Rule rule2) {
                    return rule2.getPatternLength() - rule1.getPatternLength();
                }
            });
        }
    }

    private static void parseRules(final Scanner scanner, final String location,
            final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
        int currentLine = 0;
        boolean inMultilineComment = false;

        while (scanner.hasNextLine()) {
            currentLine++;
            final String rawLine = scanner.nextLine();
            String line = rawLine;

            if (inMultilineComment) {
                if (line.endsWith(MULTILINE_COMMENT_END)) {
                    inMultilineComment = false;
                }
                continue;
            }

            if (line.startsWith(MULTILINE_COMMENT_START)) {
                inMultilineComment = true;
            } else {
                // discard comments
                final int cmtI = line.indexOf(COMMENT);
                if (cmtI >= 0) {
                    line = line.substring(0, cmtI);
                }

                // trim leading-trailing whitespace
                line = line.trim();

                if (line.length() == 0) {
                    continue; // empty lines can be safely skipped
                }

                if (line.contains("=")) {
                    // folding
                    final String[] parts = line.split("=");
                    if (parts.length !!= 2) {
                        throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
                                " parts: " + rawLine + " in " + location);
                    } else {
                        final String leftCharacter = parts[0];
                        final String rightCharacter = parts[1];

                        if (leftCharacter.length() !!= 1 || rightCharacter.length() !!= 1) {
                            throw new IllegalArgumentException("Malformed folding statement - " +
                                    "patterns are not single characters: " + rawLine + " in " + location);
                        }

                        asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
                    }
                } else {
                    // rule
                    final String[] parts = line.split("\\s+");
                    if (parts.length !!= 4) {
                        throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
                                " parts: " + rawLine + " in " + location);
                    } else {
                        try {
                            final String pattern = stripQuotes(parts[0]);
                            final String replacement1 = stripQuotes(parts[1]);
                            final String replacement2 = stripQuotes(parts[2]);
                            final String replacement3 = stripQuotes(parts[3]);

                            final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
                            final char patternKey = r.pattern.charAt(0);
                            List<Rule> rules = ruleMapping.get(patternKey);
                            if (rules == null) {
                                rules = new ArrayList<Rule>();
                                ruleMapping.put(patternKey, rules);
                            }
                            rules.add(r);
                        } catch (final IllegalArgumentException e) {
                            throw new IllegalStateException(
                                    "Problem parsing line '" + currentLine + "' in " + location, e);
                        }
                    }
                }
            }
        }
    }

    private static String stripQuotes(String str) {
        if (str.startsWith(DOUBLE_QUOTE)) {
            str = str.substring(1);
        }

        if (str.endsWith(DOUBLE_QUOTE)) {
            str = str.substring(0, str.length() - 1);
        }

        return str;
    }

    /** Whether to use ASCII folding prior to encoding. */
    private final boolean folding;

    /**
     * Creates a new instance with ASCII-folding enabled.
     */
    public DaitchMokotoffSoundex() {
        this(true);
    }

    /**
     * Creates a new instance.
     * <p>
     * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g.
     * è -&gt; e.
     * </p>
     *
     * @param folding
     *            if ASCII-folding shall be performed before encoding
     */
    public DaitchMokotoffSoundex(final boolean folding) {
        this.folding = folding;
    }

    /**
     * Performs a cleanup of the input string before the actual soundex transformation.
     * <p>
     * Removes all whitespace characters and performs ASCII folding if enabled.
     * </p>
     *
     * @param input
     *            the input string to cleanup
     * @return a cleaned up string
     */
    private String cleanup(final String input) {
        final StringBuilder sb = new StringBuilder();
        for (char ch : input.toCharArray()) {
            if (Character.isWhitespace(ch)) {
                continue;
            }

            ch = Character.toLowerCase(ch);
            if (folding && FOLDINGS.containsKey(ch)) {
                ch = FOLDINGS.get(ch);
            }
            sb.append(ch);
        }
        return sb.toString();
    }

    /**
     * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
     * <p>
     * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
     * EncoderException if the supplied object is not of type java.lang.String.
     * </p>
     *
     * @see #soundex(String)
     *
     * @param obj
     *            Object to encode
     * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String
     *         supplied.
     * @throws EncoderException
     *             if the parameter supplied is not of type java.lang.String
     * @throws IllegalArgumentException
     *             if a character is not mapped
     */
    @Override
    public Object encode(final Object obj) throws EncoderException {
        if (!!(obj instanceof String)) {
            throw new EncoderException(
                    "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
        }
        return encode((String) obj);
    }

    /**
     * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
     *
     * @see #soundex(String)
     *
     * @param source
     *            A String object to encode
     * @return A DM Soundex code corresponding to the String supplied
     * @throws IllegalArgumentException
     *             if a character is not mapped
     */
    @Override
    public String encode(final String source) {
        if (source == null) {
            return null;
        }
        return soundex(source, false)[0];
    }

    /**
     * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
     * <p>
     * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
     * separated by '|'.
     * </p>
     * <p>
     * Example: the name "AUERBACH" is encoded as both
     * </p>
     * <ul>
     * <li>097400</li>
     * <li>097500</li>
     * </ul>
     * <p>
     * Thus the result will be "097400|097500".
     * </p>
     *
     * @param source
     *            A String object to encode
     * @return A string containing a set of DM Soundex codes corresponding to the String supplied
     * @throws IllegalArgumentException
     *             if a character is not mapped
     */
    public String soundex(final String source) {
        final String[] branches = soundex(source, true);
        final StringBuilder sb = new StringBuilder();
        int index = 0;
        for (final String branch : branches) {
            sb.append(branch);
            if (++index < branches.length) {
                sb.append('|');
            }
        }
        return sb.toString();
    }

    /**
     * Perform the actual DM Soundex algorithm on the input string.
     *
     * @param source
     *            A String object to encode
     * @param branching
     *            If branching shall be performed
     * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
     *         selected branching mode
     */
    private String[] soundex(final String source, final boolean branching) {
        if (source == null) {
            return null;
        }

        final String input = cleanup(source);

        final Set<Branch> currentBranches = new LinkedHashSet<Branch>();
        currentBranches.add(new Branch());

        char lastChar = '\0';
        for (int index = 0; index < input.length(); index++) {
            final char ch = input.charAt(index);

            // ignore whitespace inside a name
            if (Character.isWhitespace(ch)) {
                continue;
            }

            final String inputContext = input.substring(index);
            final List<Rule> rules = RULES.get(ch);
            if (rules == null) {
                continue;
            }

            // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
            @SuppressWarnings("unchecked")
            final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.EMPTY_LIST;

            for (final Rule rule : rules) {
                if (rule.matches(inputContext)) {
                    if (branching) {
                        nextBranches.clear();
                    }
                    final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
                    final boolean branchingRequired = replacements.length > 1 && branching;

                    for (final Branch branch : currentBranches) {
                        for (final String nextReplacement : replacements) {
                            // if we have multiple replacements, always create a new branch
                            final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;

                            // special rule: occurrences of mn or nm are treated differently
                            final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');

                            nextBranch.processNextReplacement(nextReplacement, force);

                            if (branching) {
                                nextBranches.add(nextBranch);
                            } else {
                                break;
                            }
                        }
                    }

                    if (branching) {
                        currentBranches.clear();
                        currentBranches.addAll(nextBranches);
                    }
                    index += rule.getPatternLength() - 1;
                    break;
                }
            }

            lastChar = ch;
        }

        final String[] result = new String[currentBranches.size()];
        int index = 0;
        for (final Branch branch : currentBranches) {
            branch.finish();
            result[index++] = branch.toString();
        }

        return result;
    }
}
END>>"
! !

!PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'LICENSE'!

copyright
"
Copyright (c) 2002-2004 Robert Jarvis

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation 
files (the 'Software'), to deal in the Software without restriction, including without limitation the rights to use, 
copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom 
the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial 
portions of the Software.

THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
USE OR OTHER DEALINGS IN THE SOFTWARE.'
"
! !

!PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'classification'!

isSlavoGermanic:aString
    ^ #('w' 'k' 'cz' 'witz' 'ä' 'ö' 'ü' 'ß') contains:[:sub | aString includesString:sub]

    "
     self isSlavoGermanic:'walter'
     self isSlavoGermanic:'horowitz'
     self isSlavoGermanic:'müller'
     self isSlavoGermanic:'miller'
    "

    "Modified: / 28-07-2017 / 10:14:38 / cg"
! !

!PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'documentation'!

documentation
"
    The Double Metaphone algorithm
    
    see internet: https://en.wikipedia.org/wiki/Metaphone
"
! !

!PhoneticStringUtilities::DoubleMetaphoneStringComparator methodsFor:'accessing'!

currentIndex
	^currentIndex
!

currentIndex: anInteger
	currentIndex := anInteger
!

inputKey
	^inputKey
!

inputKey: aString
    inputKey := aString asUppercase.
    "/ care for diareses
    (inputKey includesAny:'ÄÖÜ') ifTrue:[
        inputKey := inputKey copyReplaceString:'Ä' withString:'AE'.
        inputKey := inputKey copyReplaceString:'Ö' withString:'OE'.
        inputKey := inputKey copyReplaceString:'Ü' withString:'UE'.
    ].
!

primaryTranslation
	^primaryTranslation
!

primaryTranslation: anObject
	primaryTranslation := anObject
!

secondaryTranslation
	^secondaryTranslation
!

secondaryTranslation: anObject
	secondaryTranslation := anObject
!

skipCount
	^skipCount
!

skipCount: anInteger
	skipCount := anInteger
!

startIndex
	^startIndex
!

startIndex: anObject
	startIndex := anObject
! !

!PhoneticStringUtilities::DoubleMetaphoneStringComparator methodsFor:'api'!

phoneticStringsFor:aString 
    "Private - Answers an array of alternate phonetic strings for the given input string."

    self initialize.
    self inputKey:aString.
    self performInitialProcessing.
    self processRemainingCharacters.
    ^ Array with:primaryTranslation with:secondaryTranslation

    "Modified (format): / 28-07-2017 / 11:25:02 / cg"

    "
     PhoneticStringUtilities::DoubleMetaphoneStringComparator new phoneticStringsFor:'muller'
     PhoneticStringUtilities::DoubleMetaphoneStringComparator new phoneticStringsFor:'mueller' 
     PhoneticStringUtilities::DoubleMetaphoneStringComparator new phoneticStringsFor:'müller' 
    "
! !

!PhoneticStringUtilities::DoubleMetaphoneStringComparator methodsFor:'initialization'!

initialize
    super initialize.

    startIndex := 1.
    primaryTranslation := ''.
    secondaryTranslation := ''.
    skipCount := 0.
    currentIndex := 1.

    "Modified: / 28-07-2017 / 11:18:44 / cg"
! !

!PhoneticStringUtilities::DoubleMetaphoneStringComparator methodsFor:'private'!

addPrimaryTranslation:aString 
    primaryTranslation := (primaryTranslation , aString)

    "Modified: / 28-07-2017 / 11:19:09 / cg"
!

addSecondaryTranslation:aString 
    secondaryTranslation := secondaryTranslation , aString

    "Modified: / 28-07-2017 / 11:17:11 / cg"
!

isSlavoGermanic: aString
        ^((aString includesAny: 'WK') or:
                [ (aString indexOfSubCollection: 'CZ' startingAt: 1) > 0 ]) or:
                [ (aString indexOfSubCollection: 'WITZ' startingAt: 1) > 0 ]

    "Modified: / 09-10-2017 / 17:10:46 / stefan"
!

keyAt: anInteger
    (anInteger between:1 and:inputKey size) ifTrue: [ 
        ^ inputKey at: anInteger 
    ].
    ^ Character space

    "Modified: / 28-07-2017 / 11:38:30 / cg"
!

keyLeftString: lengthInteger
	^self keyMidString: lengthInteger from: 1
!

keyMidString: lengthInteger from: fromInteger
        | result from len additionalSpaces |

        result := ''.
        from := fromInteger.
        len := lengthInteger.

        "Prepend spaces if caller is requesting characters from before the start of the string"

        [ from < 1 ] whileTrue:
                [ result := result, ' '.
                from := from + 1.
                len := len - 1 ].

        from + len - 1 > inputKey size
                ifTrue:
                        [ additionalSpaces := from + len - 1 - inputKey size.
                        len := inputKey size - from + 1 ]
                ifFalse: [ additionalSpaces := 0 ].

        result := result, (inputKey copyFrom: from to: (from+len-1 min: inputKey size)).

        [ additionalSpaces > 0 ] whileTrue:
                [ result := result, ' '.
                additionalSpaces := additionalSpaces - 1 ].

        ^result

    "Modified: / 28-07-2017 / 11:20:43 / cg"
!

keyRightString: lengthInteger
        ^self keyMidString: lengthInteger from: inputKey size - lengthInteger + 1

    "Modified: / 28-07-2017 / 11:20:51 / cg"
!

performInitialProcessing
    |ch1|

    inputKey size > 1 ifTrue:[
        (inputKey startsWithAnyOf:#( 'GN' 'KN' 'PN' 'WR' 'PS' )) ifTrue:[
            startIndex := startIndex + 1
        ].
    ].
    
    ch1 := self keyAt:1.
    ch1 = $X ifTrue:[
        self
            addPrimaryTranslation:'S';
            addSecondaryTranslation:'S'.
        startIndex := startIndex + 1
    ].
    ch1 isVowel ifTrue:[
        self
            addPrimaryTranslation:'A';
            addSecondaryTranslation:'A'.
        startIndex := startIndex + 1
    ]

    "Modified: / 01-08-2017 / 19:29:19 / cg"
!

processB
    self
        addPrimaryTranslation: 'P';
        addSecondaryTranslation: 'P'.
        
    (self keyAt: (currentIndex + 1)) == $B ifTrue: [ 
        skipCount := skipCount + 1 
    ].

    "Modified: / 28-07-2017 / 11:26:03 / cg"
!

processC
        "i"
        ((((currentIndex >= 3
                and: [ (self keyAt: currentIndex-2) isVowel not ])
                and: [ (self keyMidString: 3 from: currentIndex-1) = 'ACH' ])
                and: [ (self keyAt: currentIndex+2) ~= $I ])
                and: [ ((self keyAt: currentIndex+2) ~= $E)
                                or: [ (self keyMidString: 6 from: currentIndex-2) ~= 'BACHER'
                                                and: [ (self keyMidString: 6 from: currentIndex-2) ~= 'MACHER' ] ] ])
                        ifTrue:
                                [ self addPrimaryTranslation: 'K'.
                                self addSecondaryTranslation: 'K'.
                                skipCount := skipCount + 2.
                                ^self ].

        "ii"
        (inputKey beginsWith: 'CAESAR')
                ifTrue:
                        [ self addPrimaryTranslation: 'S'.
                        self addSecondaryTranslation: 'S'.
                        skipCount := skipCount + 1.
                        ^self ].

        "iii"
        (self keyMidString: 4 from: currentIndex) = 'CHIA'
                ifTrue:
                        [ self addPrimaryTranslation: 'K'.
                        self addSecondaryTranslation: 'K'.
                        skipCount := skipCount + 1.
                        ^self ].

        "iv"
        (self keyMidString: 2 from: currentIndex) = 'CH'
                ifTrue:
                        [ (currentIndex > 1                "a"
                                        and: [ (self keyMidString: 4 from: currentIndex) = 'CHAE' ])
                                ifTrue: [ self
                                                addPrimaryTranslation: 'K';
                                                addSecondaryTranslation: 'X'.
                                          skipCount := skipCount + 1.
                                          ^self ].

                        (currentIndex = 1          "b"
                                        and: [ (inputKey size > 5 and: [(inputKey startsWith: 'CHARAC')
                                                        or: [ (inputKey startsWith: 'CHARIS') ]] )
                                                or: [inputKey size > 4 and: [ ((((inputKey startsWith: 'CHOR')
                                                        or: [ (inputKey startsWith: 'CHYM') ])
                                                        or: [ (inputKey startsWith: 'CHIA') ])
                                                        or: [ (inputKey startsWith: 'CHEM') ])
                                                        and: [ (inputKey startsWith: 'CHORE') not ] ] ] ])
                                ifTrue: [ self
                                                addPrimaryTranslation: 'K';
                                                addSecondaryTranslation: 'K'.
                                          skipCount := skipCount + 1.
                                          ^self ].

                        (((((#('VAN ' 'VON ') includes: (inputKey copyFrom: 1 to: 4))              "c"
                                        or: [ (inputKey startsWith: 'SCH') ])
                                        or: [ #('ORCHES' 'ARCHIT' 'ORCHID')
                                                        includes: (self keyMidString: 6 from: currentIndex-2) ])
                                        or: [ #($T $S) includes: (self keyAt: currentIndex+2) ])
                                        or: [ ((currentIndex = 1)
                                                        or: [ #($A $O $U $E) includes: (self keyAt: currentIndex-1) ])
                                                and: [ #($L $R $N $M $B $H $F $V $W $ ) includes: (self keyAt: currentIndex+2) ] ] )
                                ifTrue:
                                        [ self
                                                addPrimaryTranslation: 'K';
                                                addSecondaryTranslation: 'K'.
                                          skipCount := skipCount + 1.
                                          ^self ]
                                ifFalse:
                                        [ currentIndex > 1
                                                ifTrue:
                                                        [ (inputKey startsWith: 'MC')
                                                                ifTrue:
                                                                                [ self
                                                                                                addPrimaryTranslation: 'K';
                                                                                                addSecondaryTranslation: 'K' ]
                                                                ifFalse:
                                                                                [ self
                                                                                                addPrimaryTranslation: 'X';
                                                                                                addSecondaryTranslation: 'K' ] ]
                                                ifFalse:
                                                        [ self
                                                                addPrimaryTranslation: 'X';
                                                                addSecondaryTranslation: 'X' ].
                                        skipCount := skipCount + 1.
                                        ^self ] ].

        "v"
        (self keyAt: currentIndex+1) = $Z
                ifTrue:
                        [ self
                                addPrimaryTranslation: 'S';
                                addSecondaryTranslation: 'X'.
                          skipCount := skipCount + 1.
                          ^self ].

        "vi"
        (self keyMidString: 3 from: currentIndex+1) = 'CIA'
                ifTrue:
                        [ self
                                addPrimaryTranslation: 'X';
                                addSecondaryTranslation: 'X'.
                          skipCount := skipCount + 2.
                          ^self ].

        "vii"
        ((self keyAt: currentIndex+1) = $C
                        and: [ ((currentIndex = 2)
                                and: [ (self keyAt: 1) = $M ]) not ])
                ifTrue:
                        [ ((#($I $E $H) includes: (self keyAt: currentIndex+2))
                                        and: [ (self keyMidString: 2 from: currentIndex+2) ~= 'HU' ])
                                ifTrue:
                                        [ ((currentIndex = 2 and: [ (self keyAt: 1) = $A ])
                                                        or: [ #('UCCEE' 'UCCES') includes: (self keyMidString: 5 from: currentIndex-1)])
                                                ifTrue:
                                                        [self
                                                                addPrimaryTranslation: 'KS';
                                                                addSecondaryTranslation: 'KS'.
                                                         skipCount := skipCount + 2.
                                                         ^self ]
                                                ifFalse:
                                                        [self
                                                                addPrimaryTranslation: 'X';
                                                                addSecondaryTranslation: 'X'.
                                                         skipCount := skipCount + 2.
                                                         ^self ] ]
                                ifFalse:
                                        [ self
                                                addPrimaryTranslation: 'K';
                                                addSecondaryTranslation: 'K'.
                                          skipCount := skipCount + 2.
                                          ^self ] ].

        "viii"
        (#($K $G $Q) includes: (self keyAt: currentIndex+1))
                ifTrue:
                        [ self
                                addPrimaryTranslation: 'K';
                                addSecondaryTranslation: 'K'.
                          skipCount := skipCount + 1.
                          ^self ].

        "ix"
        (#($I $E $Y) includes: (self keyAt: currentIndex+1))
                ifTrue:
                        [ (#('CIO' 'CIE' 'CIA') includes: (self keyMidString: 3 from: currentIndex))
                                ifTrue:
                                        [self
                                                addPrimaryTranslation: 'S';
                                                addSecondaryTranslation: 'X' ]
                                ifFalse:
                                        [self
                                                addPrimaryTranslation: 'S';
                                                addSecondaryTranslation: 'S'].
                        skipCount := skipCount + 1.
                        ^self ].

        "x"
        self
                addPrimaryTranslation: 'K';
                addSecondaryTranslation: 'K'.

        "xi"
        (#(' C' ' Q' ' G') includes: (self keyMidString: 2 from: currentIndex+1))
                ifTrue:
                        [ skipCount := skipCount + 2 ]
                ifFalse:
                        [ ((#($C $K $Q) includes: (self keyAt: currentIndex+1))
                                        and: [ (#('CE' 'CI') includes: (self keyMidString: 2 from: currentIndex+1)) not ])
                                ifTrue: [ skipCount := skipCount + 1] ]

    "Modified: / 28-07-2017 / 11:29:11 / cg"
!

processCedille 
	self
		addPrimaryTranslation: 'S';
		addSecondaryTranslation: 'S'
!

processD
        "i"
        (self keyAt: currentIndex+1) = $G
                ifTrue:
                        [ (#($I $E $Y) includes: (self keyAt: currentIndex+2))
                                ifTrue:
                                        [ self
                                                addPrimaryTranslation: 'J';
                                                addSecondaryTranslation: 'J'.
                                         skipCount := skipCount + 2.
                                        ^self ]
                                ifFalse:
                                        [ self
                                                addPrimaryTranslation: 'TK';
                                                addSecondaryTranslation: 'TK'.
                                        skipCount := skipCount + 1.
                                        ^self ] ].

        "ii"
        (#($T $D) includes: (self keyAt: currentIndex+1))
                ifTrue:
                        [ self
                                addPrimaryTranslation: 'T';
                                addSecondaryTranslation: 'T'.
                          skipCount := skipCount + 1.
                          ^self ].

        "iii"
        self
                addPrimaryTranslation: 'T';
                addSecondaryTranslation: 'T'

    "Modified: / 28-07-2017 / 11:27:39 / cg"
!

processF
        self
                addPrimaryTranslation: 'F';
                addSecondaryTranslation: 'F'.
                
        (self keyAt: currentIndex+1) = $F
                ifTrue: [ skipCount := skipCount + 1 ]

    "Modified (format): / 28-07-2017 / 11:29:21 / cg"
!

processG
        "http://aspell.sourceforge.net/metaphone/dmetaph.cpp
        case 'G':
                if(GetAt(current + 1) == 'H')
          {"
        | word |
        (self keyAt: currentIndex + 1) = $H
        ifTrue: [
                "if((current > 0) AND !!IsVowel(current - 1))"

                (currentIndex > 1 and: [(self keyAt: currentIndex - 1) isVowel not])
                ifTrue: [
              " {
                   MetaphAdd(K);
                   current += 2;
                   break;
                }"

                        self 
                            addPrimaryTranslation: 'K';
                            addSecondaryTranslation: 'K'.
                        skipCount := skipCount + 1.
                        ^self 
                ].

                "if(current < 3)
          {"

                currentIndex < 4 
                ifTrue: [

                        " //'ghislane', ghiradelli
               if(current == 0)
               { "
                        currentIndex = 1 
                        ifTrue: [
                                "if(GetAt(current + 2) == 'I')"

                                (self keyAt: currentIndex + 2) = $I
                                ifTrue: [
                                        "MetaphAdd(J);"
                                        self addPrimaryTranslation: 'J';
                                        addSecondaryTranslation: 'J'.
                                ] ifFalse: [
                                        "MetaphAdd(K);"
                                        self addPrimaryTranslation: 'K';
                                        addSecondaryTranslation: 'K'.
                                ].
                                "  current += 2;
                                break;"
                                skipCount := skipCount + 1.
                                ^self 
                        ]
                ].

                " //Parker's rule (with some further refinements) - e.g., 'hugh'
                if(((current > 1) AND StringAt((current - 2), 1, B, H, D, ) )
                //e.g., 'bough'
                OR ((current > 2) AND StringAt((current - 3), 1, B, H, D, ) )
                //e.g., 'broughton'
                OR ((current > 3) AND StringAt((current - 4), 1, B, H, ) ) )
         "
                (((currentIndex > 2 and: [#($B $H $D) includes: (self keyAt: currentIndex - 2)]) 
                or: [currentIndex > 3 and: [#($B $H $D) includes: (self keyAt: currentIndex - 3)]])  
                or: [currentIndex > 4 and: [#($B $H) includes: (self keyAt: currentIndex - 4)]])   
                ifTrue: [                         
                        "current += 2;
                        break;"
                        skipCount := skipCount + 1.
                        ^self 
                ] ifFalse: [
                        " //e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
               if((current > 2) 
               AND (GetAt(current - 1) == 'U') 
               AND StringAt((current - 3), 1, C, G, L, R, T, ) )"
                        (currentIndex > 3 and: [
                                ((self keyAt: currentIndex - 1) = $U) and: [
                                        #($C $G $L $R $T) includes: (self keyAt: currentIndex - 3)
                                ]
                        ]) ifTrue: [
                                "MetaphAdd(F);"
                                self addPrimaryTranslation: 'F';
                                addSecondaryTranslation: 'F'.
                        ] ifFalse: [
                                " if((current > 0) AND GetAt(current - 1) !!= 'I')
                    MetaphAdd(K);"
                                (currentIndex > 1 and: [(self keyAt: currentIndex - 1) ~= $I])
                                ifTrue: [
                                        self addPrimaryTranslation: 'K';
                                        addSecondaryTranslation: 'K'.
                                ].
                        ].
                        skipCount := skipCount + 1.
                        ^self 
                ].
        ].
                "if(GetAt(current + 1) == 'N')"
          (self keyAt: currentIndex + 1) = $N
                ifTrue: [
                        "if((current == 1) AND IsVowel(0) AND !!SlavoGermanic())"
                        (currentIndex = 2 and: [(inputKey at: 1) isVowel and: [(self isSlavoGermanic: inputKey) not]])
               ifTrue: [
                                "MetaphAdd(KN, N);"
                                self addPrimaryTranslation: 'KN';
                                addSecondaryTranslation: 'N'.
                        ] ifFalse: [
                                " //not e.g. 'cagney'
                                if(!!StringAt((current + 2), 2, EY, ) 
                                AND (GetAt(current + 1) !!= 'Y') 
                                AND !!SlavoGermanic())"
                                ((inputKey size >= (currentIndex + 2)) and: [
                                        (inputKey copyFrom: currentIndex + 2 to: (currentIndex + 4 min: inputKey size)) ~= 'EY' and: [
                                                (self keyAt: currentIndex + 1) ~= $Y and: [
                                                        (self isSlavoGermanic: inputKey) not
                                                ]
                                        ]
                                ]) ifTrue: [
                                        self addPrimaryTranslation: 'N';
                                        addSecondaryTranslation: 'KN'.
                                ] ifFalse: [
                                        self addPrimaryTranslation: 'KN';
                                        addSecondaryTranslation: 'KN'.
                                ].
                        ].
                        skipCount := skipCount + 1.
                        ^self 
                ].
                " //'tagliaro'
                if(StringAt((current + 1), 2, LI, ) AND !!SlavoGermanic())"
                ((inputKey size >= (currentIndex + 3)) and: [
                        (inputKey copyFrom: currentIndex + 1 to: currentIndex + 2) = 'LI' and: [
                                (self isSlavoGermanic: inputKey) not]])
                ifTrue: [
                        self addPrimaryTranslation: 'KL';
                        addSecondaryTranslation: 'L'.
                        skipCount := skipCount + 1.
                        ^self.
                ].
                " //-ges-,-gep-,-gel-, -gie- at beginning
                if((current == 0)
                AND ((GetAt(current + 1) == 'Y') 
                OR StringAt((current + 1), 2, ES, EP, EB, EL, EY, IB, IL, IN, IE, EI, ER, )) )"
                (currentIndex = 1 and: [
                        ((self keyAt: currentIndex + 1) = $Y) or: [
                        (#('ES' 'EP' 'EB' 'EL' 'EY' 'IB' 'IL' 'IN' 'IE' 'EI' 'ER') includes: 
                                (inputKey copyFrom: currentIndex + 1 to: currentIndex + 2))
                ]]) ifTrue: [
                        self addPrimaryTranslation: 'K';
                        addSecondaryTranslation: 'J'.
                        skipCount := skipCount + 1.
                        ^self.
                ].
                " // -ger-,  -gy-
                if((StringAt((current + 1), 2, ER, ) OR (GetAt(current + 1) == 'Y'))
                AND !!StringAt(0, 6, DANGER, RANGER, MANGER, )
                AND !!StringAt((current - 1), 1, E, I, ) 
                AND !!StringAt((current - 1), 3, RGY, OGY, ) )
                "
          (((inputKey copyFrom: currentIndex + 1 to: (currentIndex + 3 min: inputKey size)) = 'ER' or: [
                                ((self keyAt: currentIndex + 1) = $Y)]) 
                        and: [((#('DANGER' 'RANGER' 'MANGER') includes: (word := inputKey copyFrom: 1 to: (6 min: inputKey size))) not)
                                and: [(self keyAt: currentIndex - 1) ~= $E
                                        and: [(#('RGY' 'OGY') includes: (inputKey copyFrom: currentIndex - 1 to: currentIndex + 1)) not]]])
                 ifTrue: [
                        self addPrimaryTranslation: 'K';
                        addSecondaryTranslation: 'J'.
                        skipCount := skipCount + 1.
                        ^self.
                ].

          " // italian e.g, 'biaggi'
           if(StringAt((current + 1), 1, E, I, Y, ) OR StringAt((current - 1), 4, AGGI, OGGI, ))
           "
                ((#($E $I $Y) includes: (self keyAt: (currentIndex + 1))) or: [(#('AGGI' 'OGGI') includes: (inputKey copyFrom: currentIndex - 1 to: (currentIndex + 2 min: inputKey size)))])
                ifTrue: [
                        " //obvious germanic
                                        if((StringAt(0, 4, VAN , VON , ) OR StringAt(0, 3, SCH, ))
                                                OR StringAt((current + 1), 2, ET, ))                                                MetaphAdd(K);"
                        word := (inputKey copyFrom: 1 to: 4).
                        ((#('VAN ' 'VON ') includes: word) or: [(word startsWith: 'SCH') or: [(word startsWith: 'ET')]]) 
                        ifTrue: [
                                self addPrimaryTranslation: 'K';
                                addSecondaryTranslation: 'K'.
                        ] ifFalse: [
                            " //always soft if french ending
                                                if(StringAt((current + 1), 4, IER , ))
                                                        MetaphAdd(J);
                                                else
                                                        MetaphAdd(J, K);
                                        current += 2;
                                        break;"
                                (((inputKey copyFrom: currentIndex + 1 to: (currentIndex + 5 min: inputKey size)), '    ') startsWith: 'IER ')
                                ifTrue: [
                                        self addPrimaryTranslation: 'J';
                                        addSecondaryTranslation: 'J'.
                                ] ifFalse: [
                                        self addPrimaryTranslation: 'J';
                                        addSecondaryTranslation: 'K'.
                                ].

                        ].
                        skipCount := skipCount + 1.
                        ^self.       
                ].                      

        " if(GetAt(current + 1) == 'G')
             current += 2;
         else
             current += 1;
         MetaphAdd(K);
            break;"

                (self keyAt: (currentIndex + 1)) = $G
                ifTrue: [
                        skipCount := skipCount + 1.
                ].
                self addPrimaryTranslation: 'K';
                addSecondaryTranslation: 'K'.

    "Modified: / 28-07-2017 / 11:31:33 / cg"
!

processH
        "http://aspell.sourceforge.net/metaphone/dmetaph.cpp
        case 'H':
                                //only keep if first & before vowel or btw. 2 vowels
                                if(((current == 0) OR IsVowel(current - 1)) 
                                        AND IsVowel(current + 1))
                                {
                                        MetaphAdd(H);
                                        current += 2;
                                }else//also takes care of 'HH'
                                        current += 1;
                                break;
"

        (((currentIndex = 1) 
                or: [ (self keyAt: currentIndex - 1) isVowel]) 
        and: [(self keyAt: currentIndex + 1) isVowel])
        ifTrue: [               
                self addPrimaryTranslation: 'H';
                addSecondaryTranslation: 'H'.
                skipCount := skipCount + 1.
                ^self.
        ]

    "Modified: / 28-07-2017 / 11:29:52 / cg"
!

processJ
        "http://aspell.sourceforge.net/metaphone/dmetaph.cpp
        case 'J':
                                //obvious spanish, 'jose', 'san jacinto'
                                if(StringAt(current, 4, JOSE, ) OR StringAt(0, 4, SAN , ) )
                                {
                                        if(((current == 0) AND (GetAt(current + 4) == ' ')) OR StringAt(0, 4, SAN , ) )
                                                MetaphAdd(H);
                                        else
                                        {
                                                MetaphAdd(J, H);
                                        }
                                        current +=1;
                                        break;
                                }

                                if((current == 0) AND !!StringAt(current, 4, JOSE, ))
                                        MetaphAdd(J, A);//Yankelovich/Jankelowicz
                                else
                                        //spanish pron. of e.g. 'bajador'
                                        if(IsVowel(current - 1) 
                                                AND !!SlavoGermanic()
                                                        AND ((GetAt(current + 1) == 'A') OR (GetAt(current + 1) == 'O')))
                                                MetaphAdd(J, H);
                                        else
                                                if(current == last)
                                                        MetaphAdd(J,  );
                                                else
                                                        if(!!StringAt((current + 1), 1, L, T, K, S, N, M, B, Z, ) 
                                                                        AND !!StringAt((current - 1), 1, S, K, L, ))
                                                                MetaphAdd(J);

                                if(GetAt(current + 1) == 'J')//it could happen!!
                                        current += 2;
                                else
                                        current += 1;
                                break;
"
        | currentWord firstWord nextLetter |
        currentWord := inputKey copyFrom: currentIndex to: (currentIndex + 3 min: inputKey size).
        firstWord := inputKey copyFrom: 1 to: (4 min: inputKey size).
        nextLetter := self keyAt: currentIndex + 1.
        (currentWord = 'JOSE' or: [firstWord = 'SAN '])
        ifTrue: [       
                ((currentIndex = 1 and: [inputKey size == 4 or: [inputKey size >= 5 and: [self keyAt: currentIndex + 4 = $ ]]])
                        or: [firstWord = 'SAN '])
                ifTrue: [
                        self addPrimaryTranslation: 'H';
                        addSecondaryTranslation: 'H'.
                ] ifFalse: [
                        self addPrimaryTranslation: 'J';
                        addSecondaryTranslation: 'H'.
                ].
                ^self.
        ].
        (currentIndex = 1 and: [firstWord ~= 'JOSE'])
        ifTrue: [
                self addPrimaryTranslation: 'J';
                addSecondaryTranslation: 'A'.
        ] ifFalse: [
                ((currentIndex > 1 and: [(self keyAt: currentIndex -1) isVowel])
                and: [(self isSlavoGermanic: inputKey) not and: [nextLetter == $A or: [nextLetter == $O]]])
                ifTrue: [
                        self addPrimaryTranslation: 'J';
                        addSecondaryTranslation: 'H'.
                ] ifFalse: [
                        currentIndex = inputKey size 
                        ifTrue: [
                                self addPrimaryTranslation: 'J';
                                addSecondaryTranslation: ' '.
                        ] ifFalse: [
                                ((#($L $T $K $S $N $M $B $Z) includes: nextLetter) not and: [(#($S $K $L) includes: (self keyAt: currentIndex - 1)) not])
                                ifTrue: [
                                        self addPrimaryTranslation: 'J';
                                        addSecondaryTranslation: 'J'.
                                ].
                        ].
                ].
        ].
        nextLetter == $J
        ifTrue: [
                skipCount := skipCount + 1.
        ].

    "Modified: / 28-07-2017 / 11:31:41 / cg"
!

processK
        "http://aspell.sourceforge.net/metaphone/dmetaph.cpp
        case 'K':
                                if(GetAt(current + 1) == 'K')
                                        current += 2;
                                else
                                        current += 1;
                                MetaphAdd(K);
                                break;
        "

        (self keyAt: currentIndex + 1) = $K
        ifTrue: [
                skipCount := skipCount + 1
        ].
        self addPrimaryTranslation: 'K';
        addSecondaryTranslation: 'K'.

    "Modified: / 28-07-2017 / 11:31:46 / cg"
!

processL

"case 'L':
                                if(GetAt(current + 1) == 'L')
                                {
                                        //spanish e.g. 'cabrillo', 'gallegos'
                                        if(((current == (length - 3)) 
                                                AND StringAt((current - 1), 4, ILLO, ILLA, ALLE, ))
                                                         OR ((StringAt((last - 1), 2, AS, OS, ) OR StringAt(last, 1, A, O, )) 
                                                                AND StringAt((current - 1), 4, ALLE, )) )
                                        {
                                                MetaphAdd(L,  );
                                                current += 2;
                                                break;
                                        }
                                        current += 2;
                                }else
                                        current += 1;
                                MetaphAdd(L);
                                break;
"
        | currentWord |
        (self keyAt: currentIndex + 1) = $L 
        ifTrue: [
                (((currentIndex = (inputKey size - 2))
                and: [(currentIndex > 1 and: [#('ILLO' 'ILLA' 'ALLE') includes: (currentWord := inputKey copyFrom: currentIndex - 1 to: (currentIndex + 2 min: inputKey size))])])
                or: [((#('AS' 'OS') includes: (inputKey copyFrom: inputKey size - 1 to: inputKey size)) or: [#($A $O) includes: (self keyAt: inputKey size)]) and: [currentWord = 'ALLE']
                        ])
                ifTrue: [
                        self addPrimaryTranslation: 'L';
                        addSecondaryTranslation: ' '.
                        skipCount := skipCount + 1.
                        ^self.
                ].
                skipCount := skipCount + 1.
        ].
        self addPrimaryTranslation: 'L';
        addSecondaryTranslation: 'L'.

    "Modified: / 28-07-2017 / 11:32:03 / cg"
!

processM

"case 'M':
                                if((StringAt((current - 1), 3, UMB, ) 
                                        AND (((current + 1) == last) OR StringAt((current + 2), 2, ER, )))
                                                //'dumb','thumb'
                                                OR  (GetAt(current + 1) == 'M') )
                                        current += 2;
                                else
                                        current += 1;
                                MetaphAdd(M);
                                break;
"
        (((currentIndex > 1 and: [(inputKey copyFrom: currentIndex - 1 to: (currentIndex +1 min: inputKey size)) = 'UMB'])
                and: [currentIndex + 1 = inputKey size or: [(inputKey copyFrom: (currentIndex + 2 min: inputKey size) to: (currentIndex + 4 min: inputKey size)) = 'ER']])
                or: [(self keyAt: currentIndex + 1) = $M])
                ifTrue: [
                        skipCount := skipCount + 1.
                ].
                self addPrimaryTranslation: 'M';
                addSecondaryTranslation: 'M'.

    "Modified: / 28-07-2017 / 11:32:08 / cg"
!

processN
        "http://aspell.sourceforge.net/metaphone/dmetaph.cpp
        case 'N':
                                if(GetAt(current + 1) == 'N')
                                        current += 2;
                                else
                                        current += 1;
                                MetaphAdd(N);
                                break;

        "

        (self keyAt: currentIndex + 1) = $N
        ifTrue: [
                skipCount := skipCount + 1
        ].
        self addPrimaryTranslation: 'N';
        addSecondaryTranslation: 'N'.

    "Modified: / 28-07-2017 / 11:32:14 / cg"
!

processNtilde
        "case 'Ñ':
                                current += 1;
                                MetaphAdd(N);
                                break;
        "
        self addPrimaryTranslation: 'N';
        addSecondaryTranslation: 'N'.
!

processP
        "case 'P':
                                if(GetAt(current + 1) == 'H')
                                {
                                        MetaphAdd(F);
                                        current += 2;
                                        break;
                                }

                                //also account for campbell, raspberry
                                if(StringAt((current + 1), 1, P, B, ))
                                        current += 2;
                                else
                                        current += 1;
                                        MetaphAdd(P);
                                break;
"
        | nextLetter |
        (nextLetter := self keyAt: currentIndex + 1) = $H
        ifTrue: [
                self addPrimaryTranslation: 'F';
                addSecondaryTranslation: 'F'.
                skipCount := skipCount + 1.
                ^self.
        ].
        (#($P $B) includes: nextLetter)
        ifTrue: [
                skipCount := skipCount + 1.
        ] ifFalse: [
                self addPrimaryTranslation: 'P';
                addSecondaryTranslation: 'P'.
        ].

    "Modified: / 28-07-2017 / 11:32:28 / cg"
!

processQ
        "http://aspell.sourceforge.net/metaphone/dmetaph.cpp
        case 'Q':
                                if(GetAt(current + 1) == 'Q')
                                        current += 2;
                                else
                                        current += 1;
                                MetaphAdd(K);
                                break;

        "

        (self keyAt: currentIndex + 1) = $Q
        ifTrue: [
                skipCount := skipCount + 1
        ].
        self addPrimaryTranslation: 'K';
        addSecondaryTranslation: 'K'.

    "Modified: / 28-07-2017 / 11:32:32 / cg"
!

processR
        "http://aspell.sourceforge.net/metaphone/dmetaph.cpp
        case 'R':
                                //french e.g. 'rogier', but exclude 'hochmeier'
                                if((current == last)
                                        AND !!SlavoGermanic()
                                                AND StringAt((current - 2), 2, IE, ) 
                                                        AND !!StringAt((current - 4), 2, ME, MA, ))
                                        MetaphAdd(, R);
                                else
                                        MetaphAdd(R);

                                if(GetAt(current + 1) == 'R')
                                        current += 2;
                                else
                                        current += 1;
                                break;
        "
        (currentIndex = inputKey size and: [
                (self isSlavoGermanic: inputKey) not and: [
                        (inputKey copyFrom: ((currentIndex - 2) max: 1) to: ((currentIndex - 1) max: 1)) = 'IE' and: [
                                (#('ME' 'MA') includes: (inputKey copyFrom: ((currentIndex - 4) max: 1) to: ((currentIndex - 3) max: 1))) not
                        ]
                ]
        ])
        ifTrue: [
                self addPrimaryTranslation: '';
                addSecondaryTranslation: 'R'.
        ] ifFalse: [
                self addPrimaryTranslation: 'R';
                addSecondaryTranslation: 'R'.
        ].
        (self keyAt: currentIndex + 1) = $R
        ifTrue: [
                skipCount := skipCount + 1
        ].

    "Modified: / 28-07-2017 / 11:32:37 / cg"
!

processRemainingCharacters
    startIndex to: inputKey size do:[ :i | 
        | c methodSelector |

        skipCount = 0 ifTrue:[ 
            ((primaryTranslation size > 4) and: [ secondaryTranslation size > 4 ])
                ifTrue: [ ^self ].

            currentIndex := i.
            c := self keyAt: i.

            (c isVowel not and: [c ~= $Y]) ifTrue:[ 
                c == $Ç ifTrue: [ 
                    methodSelector := #processCedille 
                ] ifFalse: [ c == $Ñ ifTrue: [ 
                    methodSelector := #processNtilde 
                ] ifFalse: [ 
                    methodSelector := ('process', c asString) asSymbol 
                ]].
                self perform: methodSelector 
            ] 
        ] ifFalse: [ 
            skipCount := skipCount - 1
        ] 
    ]

    "Modified: / 28-07-2017 / 11:24:15 / cg"
!

processS
        "http://aspell.sourceforge.net/metaphone/dmetaph.cpp
        case 'S':
                                //special cases 'island', 'isle', 'carlisle', 'carlysle'
                                if(StringAt((current - 1), 3, ISL, YSL, ))
                                {
                                        current += 1;
                                        break;
                                }

                                //special case 'sugar-'
                                if((current == 0) AND StringAt(current, 5, SUGAR, ))
                                {
                                        MetaphAdd(X, S);
                                        current += 1;
                                        break;
                                }

                                if(StringAt(current, 2, SH, ))
                                {
                                        //germanic
                                        if(StringAt((current + 1), 4, HEIM, HOEK, HOLM, HOLZ, ))
                                                MetaphAdd(S);
                                        else
                                                MetaphAdd(X);
                                        current += 2;
                                        break;
                                }

                                //italian & armenian
                                if(StringAt(current, 3, SIO, SIA, ) OR StringAt(current, 4, SIAN, ))
                                {
                                        if(!!SlavoGermanic())
                                                MetaphAdd(S, X);
                                        else
                                                MetaphAdd(S);
                                        current += 3;
                                        break;
                                }

                                //german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
                                //also, -sz- in slavic language altho in hungarian it is pronounced 's'
                                if(((current == 0) 
                                                AND StringAt((current + 1), 1, M, N, L, W, ))
                                                        OR StringAt((current + 1), 1, Z, ))
                                {
                                        MetaphAdd(S, X);
                                        if(StringAt((current + 1), 1, Z, ))
                                                current += 2;
                                        else
                                                current += 1;
                                        break;
                                }

                                if(StringAt(current, 2, SC, ))
                                {
                                        //Schlesinger's rule
                                        if(GetAt(current + 2) == 'H')
                                                //dutch origin, e.g. 'school', 'schooner'
                                                if(StringAt((current + 3), 2, OO, ER, EN, UY, ED, EM, ))
                                                {
                                                        //'schermerhorn', 'schenker'
                                                        if(StringAt((current + 3), 2, ER, EN, ))
                                                        {
                                                                MetaphAdd(X, SK);
                                                        }else
                                                                MetaphAdd(SK);
                                                        current += 3;
                                                        break;
                                                }else{
                                                        if((current == 0) AND !!IsVowel(3) AND (GetAt(3) !!= 'W'))
                                                                MetaphAdd(X, S);
                                                        else
                                                                MetaphAdd(X);
                                                        current += 3;
                                                        break;
                                                }

                                        if(StringAt((current + 2), 1, I, E, Y, ))
                                        {
                                                MetaphAdd(S);
                                                current += 3;
                                                break;
                                        }
                                        //else
                                        MetaphAdd(SK);
                                        current += 3;
                                        break;
                                }

                                //french e.g. 'resnais', 'artois'
                                if((current == last) AND StringAt((current - 2), 2, AI, OI, ))
                                        MetaphAdd(, S);
                                else
                                        MetaphAdd(S);

                                if(StringAt((current + 1), 1, S, Z, ))
                                        current += 2;
                                else
                                        current += 1;
                                break;
"

        | nextChar char2 chars char |
        (#('ISL' 'YSL') includes: (inputKey copyFrom: (currentIndex - 1 max: 1) to: (currentIndex + 1 min: inputKey size))) 
        ifTrue: [
                ^self
        ].
        (currentIndex = 1 and: [(inputKey copyFrom: 1 to: (5 min: inputKey size)) = 'SUGAR'])
        ifTrue: [
                self addPrimaryTranslation: 'X';
                addSecondaryTranslation: 'S'.
                ^self.
        ].
        (inputKey copyFrom: currentIndex to: ((currentIndex + 1) min: inputKey size)) = 'SH'
        ifTrue: [
                (#('HEIM' 'HOEK' 'HOLM' 'HOLZ') includes: (inputKey copyFrom: (currentIndex + 1 min: inputKey size) to: ((currentIndex + 5) min: inputKey size)))
                ifTrue: [
                        self addPrimaryTranslation: 'S';
                        addSecondaryTranslation: 'S'.
                ] ifFalse: [
                        self addPrimaryTranslation: 'X';
                        addSecondaryTranslation: 'X'.
                ].
                skipCount := skipCount + 1.
                ^self 
        ].
        ((#('SIO' 'SIA') includes: (inputKey copyFrom: currentIndex to: (currentIndex + 2 min: inputKey size)))
                or: [(inputKey copyFrom: currentIndex to: (currentIndex + 3 min: inputKey size)) = 'SIAN'])
        ifTrue: [
                (self isSlavoGermanic: inputKey) not
                ifTrue: [
                        self addPrimaryTranslation: 'S';
                        addSecondaryTranslation: 'X'.
                ] ifFalse: [
                        self addPrimaryTranslation: 'S';
                        addSecondaryTranslation: 'S'.
                ].
                skipCount := skipCount + 2.
                ^self 
        ].
        ((currentIndex = 1 and: [#($M $N $L $W) includes: (self keyAt: currentIndex + 1)])
                or: [(nextChar := self keyAt: currentIndex + 1) = $Z])
        ifTrue: [
                self addPrimaryTranslation: 'S';
                addSecondaryTranslation: 'X'.
                nextChar == $Z
                ifTrue: [
                    skipCount := skipCount + 1.
                        ^self.
                ].
                ^self.
        ].
        ((inputKey copyFrom: currentIndex to: ((currentIndex + 1) min: inputKey size)) = 'SC')
        ifTrue: [
                (char2 := self keyAt: currentIndex + 2) = $H
                ifTrue: [
                        (#('OO' 'ER' 'EN' 'UY' 'ED' 'EM') includes: (chars := inputKey copyFrom: ((currentIndex + 3) min: inputKey size) to: ((currentIndex + 4) min: inputKey size)))
                        ifTrue: [
                                (#('ER' 'EN') includes: chars)
                                ifTrue: [
                                        self addPrimaryTranslation: 'X';
                                        addSecondaryTranslation: 'SK'.
                                ] ifFalse: [
                                        self addPrimaryTranslation: 'SK';
                                        addSecondaryTranslation: 'SK'.
                                ].
                                skipCount := skipCount + 2.
                                ^self.
                        ] ifFalse: [
                                ((currentIndex = 1 and: [(char := inputKey at: 4 ifAbsent: [$b]) isVowel not]) and: [char ~= $W])
                                ifTrue: [
                                        self addPrimaryTranslation: 'X';
                                        addSecondaryTranslation: 'S'.
                                ] ifFalse: [
                                        self addPrimaryTranslation: 'X';
                                        addSecondaryTranslation: 'X'.
                                ].
                                skipCount := skipCount + 2.
                                ^self .
                        ].
                ] ifFalse: [
                        (#($I $E $Y) includes: char2)
                        ifTrue: [
                                self addPrimaryTranslation: 'S';
                                addSecondaryTranslation: 'S'.
                                skipCount := skipCount + 2.
                                ^self .
                        ] ifFalse: [
                                self addPrimaryTranslation: 'SK';
                                addSecondaryTranslation: 'SK'.
                                skipCount := skipCount + 2.
                                ^self.
                        ]
                ].
        ].
        (currentIndex = inputKey size and: [(#('AI' 'OI') includes: (inputKey copyFrom: ((currentIndex - 2) max: 1) to: ((currentIndex - 1) max: 1)))])
        ifTrue: [
                self addPrimaryTranslation: '';
                addSecondaryTranslation: 'S'.
        ] ifFalse: [
                self addPrimaryTranslation: 'S';
                addSecondaryTranslation: 'S'.
        ].
        (#($S $Z) includes: (self keyAt: currentIndex + 1))
        ifTrue: [
            skipCount := skipCount + 1.
                ^self.
        ].

    "Modified: / 28-07-2017 / 11:34:18 / cg"
!

processT
        "http://aspell.sourceforge.net/metaphone/dmetaph.cpp
        case 'T':
                                if(StringAt(current, 4, TION, ))
                                {
                                        MetaphAdd(X);
                                        current += 3;
                                        break;
                                }

                                if(StringAt(current, 3, TIA, TCH, ))
                                {
                                        MetaphAdd(X);
                                        current += 3;
                                        break;
                                }

                                if(StringAt(current, 2, TH, ) 
                                        OR StringAt(current, 3, TTH, ))
                                {
                                        //special case 'thomas', 'thames' or germanic
                                        if(StringAt((current + 2), 2, OM, AM, ) 
                                                OR StringAt(0, 4, VAN , VON , ) 
                                                        OR StringAt(0, 3, SCH, ))
                                        {
                                                MetaphAdd(T);
                                        }else{
                                                MetaphAdd(0, T);
                                        }
                                        current += 2;
                                        break;
                                }

                                if(StringAt((current + 1), 1, T, D, ))
                                        current += 2;
                                else
                                        current += 1;
                                MetaphAdd(T);
                                break;
"
        ((inputKey copyFrom: currentIndex to: ((currentIndex + 3) min: inputKey size)) = 'TION')
        ifTrue: [
                self addPrimaryTranslation: 'X';
                addSecondaryTranslation: 'X'.
                skipCount := skipCount + 2.
                ^self.
        ].
        (#('TIA' 'TCH') includes: (inputKey copyFrom: currentIndex to: ((currentIndex + 2) min: inputKey size)))
        ifTrue: [
                self addPrimaryTranslation: 'X';
                addSecondaryTranslation: 'X'. 
                skipCount := skipCount + 2.
                ^self.
        ].
        (((inputKey copyFrom: currentIndex to: ((currentIndex + 1) min: inputKey size)) = 'TH') or: [
                ((inputKey copyFrom: currentIndex to: ((currentIndex + 2) min: inputKey size)) = 'TTH')
        ])
        ifTrue: [
                ((#('OM' 'AM') includes: (inputKey copyFrom: currentIndex + 2 to: ((currentIndex + 3) min: inputKey size)))
                or: [(#('VAN ' 'VON ') includes: (inputKey copyFrom: 1 to: (4 min: inputKey size)))
                        or: [(inputKey copyFrom: 1 to: (3 min: inputKey size)) = 'SCH']
                        ])
                ifTrue: [
                        self addPrimaryTranslation: 'T';
                        addSecondaryTranslation: 'T'.   
                ] ifFalse: [
                        self addPrimaryTranslation: '0';
                        addSecondaryTranslation: 'T'.   
                ].
                skipCount := skipCount + 1.
                ^self.
        ].
        (#($T $D) includes: (self keyAt: currentIndex + 1))
        ifTrue: [
                skipCount := skipCount + 1.
        ].
        self addPrimaryTranslation: 'T';
        addSecondaryTranslation: 'T'.

    "Modified: / 28-07-2017 / 11:33:33 / cg"
!

processV
        "http://aspell.sourceforge.net/metaphone/dmetaph.cpp
        case 'V':
                                if(GetAt(current + 1) == 'V')
                                        current += 2;
                                else
                                        current += 1;
                                MetaphAdd(F);
                                break;


        "

        (self keyAt: currentIndex + 1) = $V
        ifTrue: [
                skipCount := skipCount + 1
        ].
        self addPrimaryTranslation: 'F';
        addSecondaryTranslation: 'F'.

    "Modified: / 28-07-2017 / 11:34:27 / cg"
!

processW
        "http://aspell.sourceforge.net/metaphone/dmetaph.cpp
        case 'W':
                                //can also be in middle of word
                                if(StringAt(current, 2, WR, ))
                                {
                                        MetaphAdd(R);
                                        current += 2;
                                        break;
                                }

                                if((current == 0) 
                                        AND (IsVowel(current + 1) OR StringAt(current, 2, WH, )))
                                {
                                        //Wasserman should match Vasserman
                                        if(IsVowel(current + 1))
                                                MetaphAdd(A, F);
                                        else
                                                //need Uomo to match Womo
                                                MetaphAdd(A);
                                }

                                //Arnow should match Arnoff
                                if(((current == last) AND IsVowel(current - 1)) 
                                        OR StringAt((current - 1), 5, EWSKI, EWSKY, OWSKI, OWSKY, ) 
                                                        OR StringAt(0, 3, SCH, ))
                                  {
                                        MetaphAdd(, F);
                                        current +=1;
                                        break;
                                }

                                //polish e.g. 'filipowicz'
                                if(StringAt(current, 4, WICZ, WITZ, ))
                                {
                                        MetaphAdd(TS, FX);
                                        current +=4;
                                        break;
                                }

                                //else skip it
                                current +=1;
                                break;
"
        | word nextLetter |
        ((word := inputKey copyFrom: currentIndex to: (currentIndex + 1 min: inputKey size)) = 'WR')
        ifTrue: [
                self addPrimaryTranslation: 'R';
                addSecondaryTranslation: 'R'.
                skipCount := skipCount + 1.
                ^self
        ].
        ((currentIndex = 1 and: [(nextLetter := self keyAt: currentIndex + 1) isVowel]) or: [
                word = 'WH'
        ])
        ifTrue: [
                nextLetter isVowel
                ifTrue: [
                        self addPrimaryTranslation: 'A';
                        addSecondaryTranslation: 'F'.
                ] ifFalse: [
                        self addPrimaryTranslation: 'A';
                        addSecondaryTranslation: 'A'.
                ]
        ].
        ((((currentIndex = inputKey size) and: [(self keyAt: currentIndex - 1) isVowel])
                or: [#('EWSKI' 'EWSKY' 'OWSKI' 'OWSKY') includes: (inputKey copyFrom: ((currentIndex - 1) max: 1) to: (currentIndex + 3 min: inputKey size))])
                        or: [inputKey startsWith:'SCH'])
        ifTrue: [
                self addPrimaryTranslation: '';
                addSecondaryTranslation: 'F'.
                ^self.
        ].
        (#('WICZ' 'WITZ') includes: (inputKey copyFrom: currentIndex to: (currentIndex + 4 min: inputKey size)))
        ifTrue: [
                self addPrimaryTranslation: 'TS';
                addSecondaryTranslation: 'FX'.
                skipCount := skipCount + 3.
                ^self
        ].

    "Modified: / 28-07-2017 / 11:34:51 / cg"
!

processX
        "http://aspell.sourceforge.net/metaphone/dmetaph.cpp
        case 'X':
                                //french e.g. breaux
                                if(!!((current == last) 
                                        AND (StringAt((current - 3), 3, IAU, EAU, ) 
                                                        OR StringAt((current - 2), 2, AU, OU, ))) )
                                        MetaphAdd(KS);

                                if(StringAt((current + 1), 1, C, X, ))
                                        current += 2;
                                else
                                        current += 1;
                                break;
"


        ((currentIndex = inputKey size) 
        and: [(#('IAU' 'EAU') includes: (inputKey copyFrom: ((currentIndex - 3) min: 1) to: currentIndex)) 
              or: [(#('AU' 'OU') includes: (inputKey copyFrom: ((currentIndex - 2) min: 1) to: currentIndex))]]) 
        ifFalse: [
                self addPrimaryTranslation: 'KS';
                addSecondaryTranslation: 'KS'.
        ].
        (#($C $X) includes: (self keyAt: currentIndex + 1))
        ifTrue: [
            skipCount := skipCount + 1.
                ^self
        ]

    "Modified: / 28-07-2017 / 11:34:44 / cg"
!

processZ
        "http://aspell.sourceforge.net/metaphone/dmetaph.cpp
        case 'Z':
                                //chinese pinyin e.g. 'zhao'
                                if(GetAt(current + 1) == 'H')
                                {
                                        MetaphAdd(J);
                                        current += 2;
                                        break;
                                }else
                                        if(StringAt((current + 1), 2, ZO, ZI, ZA, ) 
                                                OR (SlavoGermanic() AND ((current > 0) AND GetAt(current - 1) !!= 'T')))
                                        {
                                                MetaphAdd(S, TS);
                                        }
                                        else
                                                MetaphAdd(S);

                                if(GetAt(current + 1) == 'Z')
                                        current += 2;
                                else
                                        current += 1;
                                break;
"

        (self keyAt: currentIndex + 1) = $H
        ifTrue: [
                self addPrimaryTranslation: 'J';
                addSecondaryTranslation: 'J'.
                skipCount := skipCount + 1.
                ^self
        ] ifFalse: [
                ((#('ZO' 'ZI' 'ZA') includes: (inputKey copyFrom: ((currentIndex + 1) min: inputKey size) to: ((currentIndex + 2) min: inputKey size))) or: [
                        (self isSlavoGermanic: inputKey) and: [(currentIndex > 1 and: [(self keyAt: currentIndex - 1) ~= 'T'])]
                ])
                ifTrue: [
                        self addPrimaryTranslation: 'S';
                        addSecondaryTranslation: 'TS'.
                ] ifFalse: [
                        self addPrimaryTranslation: 'S';
                        addSecondaryTranslation: 'S'.
                ].
                (self keyAt: currentIndex + 1) = $Z
                ifTrue: [
                    skipCount := skipCount + 1.
                        ^self 
                ].
        ]

    "Modified: / 28-07-2017 / 11:35:12 / cg"
! !

!PhoneticStringUtilities::ExtendedSoundexStringComparator class methodsFor:'documentation'!

documentation
"
    There are many extended and enhanced soundex variants around;
    here is one, called 'extended soundex'. It is destribed for example in
    http://www.epidata.dk/documentation.php.
    An author or origin is unknown.

    The number of digits is increased to 5 or 8;
    The first character is not used literally; instead it is encoded like the rest.
    This might have a negative effect on names starting with a vovel, though.

    Overall, it can be doubted if this is really an enhancement after all.
"
! !

!PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'api'!

phoneticStringsFor:aString
    "generates both an extended soundex of length 5 and one of length 8"

    |first second u t prevCode|

    u := aString asUppercase.
    first := second := ''.
    u do:[:c | 
        t := self translate:c.
        (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
            first := first , t.
            second := second , t.
            second size == 8 ifTrue:[
                ^ Array with:(first copyTo:5) with:second 
            ].
        ].
        prevCode := t
    ].
    [ first size < 5 ] whileTrue:[
        first := first , '0'.
        second := second , '0'.
    ].
    [ second size < 8 ] whileTrue:[
        second := second , '0'
    ].
    ^ Array with:first with:second

    "
     self basicNew phoneticStringsFor:'müller'  #('87900' '87900000')  
     self basicNew phoneticStringsFor:'miller'  #('87900' '87900000')   
     self basicNew phoneticStringsFor:'muller'  #('87900' '87900000')    
     self basicNew phoneticStringsFor:'muler'   #('87900' '87900000')
     self basicNew phoneticStringsFor:'schmidt'    #('38600' '38600000')
     self basicNew phoneticStringsFor:'schneider'  #('38690' '38690000')
     self basicNew phoneticStringsFor:'fischer'    #('23900' '23900000')
     self basicNew phoneticStringsFor:'weber'      #('19000' '19000000')
     self basicNew phoneticStringsFor:'meyer'      #('89000' '89000000')
     self basicNew phoneticStringsFor:'wagner'     #('48900' '48900000')
     self basicNew phoneticStringsFor:'schulz'     #('37500' '37500000')
     self basicNew phoneticStringsFor:'becker'     #('13900' '13900000')
     self basicNew phoneticStringsFor:'hoffmann'   #('28800' '28800000')
     self basicNew phoneticStringsFor:'schäfer'    #('32900' '32900000')
    "
! !

!PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'private'!

translate:aCharacter
    "use simple if's for more speed when compiled"

    "vowels serve as separators"
    aCharacter == $A ifTrue:[^ '0' ].         
    aCharacter == $E ifTrue:[^ '0' ].
    aCharacter == $I ifTrue:[^ '0' ].
    aCharacter == $O ifTrue:[^ '0' ].
    aCharacter == $U ifTrue:[^ '0' ].
    aCharacter == $Y ifTrue:[^ '0' ].

    aCharacter == $B ifTrue:[^ '1' ]. 
    aCharacter == $P ifTrue:[^ '1' ].

    aCharacter == $F ifTrue:[^ '2' ]. 
    aCharacter == $V ifTrue:[^ '2' ]. 

    aCharacter == $C ifTrue:[^ '3' ]. 
    aCharacter == $S ifTrue:[^ '3' ]. 
    aCharacter == $K ifTrue:[^ '3' ].

    aCharacter == $G ifTrue:[^ '4' ]. 
    aCharacter == $J ifTrue:[^ '4' ].

    aCharacter == $Q ifTrue:[^ '5' ]. 
    aCharacter == $X ifTrue:[^ '5' ]. 
    aCharacter == $Z ifTrue:[^ '5' ]. 

    aCharacter == $D ifTrue:[^ '6' ]. 
    aCharacter == $G ifTrue:[^ '6' ]. 
    aCharacter == $T ifTrue:[^ '6' ]. 

    aCharacter == $L ifTrue:[^ '7' ]. 

    aCharacter == $M ifTrue:[^ '8' ]. 
    aCharacter == $N ifTrue:[^ '8' ]. 

    aCharacter == $R ifTrue:[^ '9' ]. 
    ^ nil
! !

!PhoneticStringUtilities::SingleResultPhoneticStringComparator class methodsFor:'documentation'!

documentation
"
    documentation to be added.

    [author:]
        cg

    [instance variables:]

    [class variables:]

    [see also:]

"
! !

!PhoneticStringUtilities::SingleResultPhoneticStringComparator methodsFor:'api'!

encode:word
    ^ self subclassResponsibility

    "Created: / 28-07-2017 / 15:20:49 / cg"
!

phoneticStringsFor:word 
    ^ Array with:(self encode:word)

    "Created: / 28-07-2017 / 15:20:38 / cg"
! !

!PhoneticStringUtilities::MRAStringComparator class methodsFor:'documentation'!

documentation
"
    Match Rating Approach Encoder

    The Western Airlines matching rating approach name encoder

    [see also:]
        https://en.wikipedia.org/wiki/Match_Rating_Approach
        
        G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery,
            ''Accessing Individual Records from Personal Data Files Using Nonunique Identifiers'' 
            US National Institute of Standards and Technology, SP-500-2 (1977), p. 17.
"
!

rCode
"<<END
## Copyright (c) 2015, James P. Howard, II <jh@jameshoward.us>
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are
## met:
##
##     Redistributions of source code must retain the above copyright
##     notice, this list of conditions and the following disclaimer.
##
##     Redistributions in binary form must reproduce the above copyright
##     notice, this list of conditions and the following disclaimer in
##     the documentation and/or other materials provided with the
##     distribution.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
## DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
## THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#' @rdname mra
#' @title Match Rating Approach Encoder
#'
#' @description
#' The Western Airlines matching rating approach name encoder
#'
#' @param word string or vector of strings to encode
#' @param x MRA-encoded character vector
#' @param y MRA-encoded character vector
#'
#' @details
#'
#' The variable \code{word} is the name to be encoded.  The variable
#' \code{maxCodeLen} is \emph{not} supported in this algorithm encoder
#' because the algorithm itself is dependent upon its six-character
#' length.  The variables \code{x} and \code{y} are MRA-encoded and are
#' compared to each other using the MRA comparison specification.
#'
#' @return The \code{mra_encode} function returns match rating approach
#' encoded character vector.  The \code{mra_compare} returns a boolean
#' vector which is \code{TRUE} if \code{x} and \code{y} pass the MRA
#' comparison test.
#'
#' @references
#'
#' G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery,
#' \emph{Accessing Individual Records from Personal Data Files Using
#' Nonunique Identifiers,} US National Institute of Standards and
#' Technology, SP-500-2 (1977), p. 17.
#'
#' @family phonics
#'
#' @examples
#' mra_encode("William")
#' mra_encode(c("Peter", "Peady"))
#' mra_encode("Stevenson")

#' @rdname mra
#' @name mra_encode
#' @export
mra_encode <- function(word) {

    ## First, remove any nonalphabetical characters and uppercase it
    word <- gsub("[^[:alpha:]]*", "", word)
    word <- toupper(word)

    ## First character of key = first character of name
    first <- substr(word, 1, 1)
    word <- substr(word, 2, nchar(word))

    ## Delete vowels not at the start of the word
    word <- gsub("[AEIOU]", "", word)
    word <- paste(first, word, sep = "")

    ## Remove duplicate consecutive characters
    word <- gsub("([A-Z])\\1+", "\\1", word)

    ## If longer than 6 characters, take first and last 3...and we have
    ## to vectorize it
    for(i in 1:length(word)) {
        if((l = nchar(word[i])) > 6) {
            first <- substr(word[i], 1, 3)
            last <- substr(word[i], l - 2, l)
            word[i] <- paste(first, last, sep = "");
        }
    }

    return(word)
}

#' @rdname mra
#' @name mra_compare
#' @export
mra_compare <- function(x, y) {
    mra <- data.frame(x = x, y = y, sim = 0, min = 100, stringsAsFactors = FALSE)

    ## Obtain the minimum rating value by calculating the length sum of
    ## the encoded strings and using table A (from Wikipedia).  We start
    ## by setting the minimum to be the sum and move from there.
    mra$lensum <- nchar(mra$x) + nchar(mra$y)
    mra$min[mra$lensum == 12] <- 2
    mra$min[mra$lensum > 7 && mra$lensum <= 11] <- 3
    mra$min[mra$lensum > 4 && mra$lensum <= 7] <- 4
    mra$min[mra$lensum <= 4] <- 5

    ## If the length difference between the encoded strings is 3 or
    ## greater, then no similarity comparison is done.  For us, we
    ## continue the similarity comparison out of laziness and ensure the
    ## minimum is impossibly high to meet.
    mra$min[abs(nchar(mra$x) - nchar(mra$y)) >= 3] <- 100

    ## Start the comparison.
    x <- strsplit(mra$x, split = "")
    y <- strsplit(mra$y, split = "")
    rows <- nrow(mra)
    for(i in 1:rows) {
        ## Process the encoded strings from left to right and remove any
        ## identical characters found from both strings respectively.
        j <- 1
        while(j < min(length(x[[i]]), length(y[[i]]))) {
            if(x[[i]][j] == y[[i]][j]) {
                x[[i]] <- x[[i]][-j]
                y[[i]] <- y[[i]][-j]
            } else
                j <- j + 1
        }

        ## Process the unmatched characters from right to left and
        ## remove any identical characters found from both names
        ## respectively.
        x[[i]] <- rev(x[[i]])
        y[[i]] <- rev(y[[i]])
        j <- 1
        while(j < min(length(x[[i]]), length(y[[i]]))) {
            if(x[[i]][j] == y[[i]][j]) {
                x[[i]] <- x[[i]][-j]
                y[[i]] <- y[[i]][-j]
            } else
                j <- j + 1
        }
        ## Subtract the number of unmatched characters from 6 in the
        ## longer string. This is the similarity rating.
        len <- min(length(x[[i]]), length(y[[i]]))
        mra$sim[i] <- 6 - len
    }

    ## If the similarity is greater than or equal to the minimum
    ## required, it is a successful match.
    mra$match <- (mra$sim >= mra$min)
    return(mra$match)
}

END>>
! !

!PhoneticStringUtilities::MRAStringComparator methodsFor:'api'!

encode:wordIn 
    "see https://en.wikipedia.org/wiki/Match_Rating_Approach"
    
    |word prev|

    word := wordIn.
    
    "/ First, remove any nonalphabetical characters and uppercase it

    word := word select:#isLetter thenCollect:#asUppercase.

    "/ Delete vowels not at the start of the word

    word := word first asString , ((word from:2) reject:#isVowel).

    "/ Remove duplicate consecutive characters

    prev := nil.
    word := word 
                collect:[:char |
                    char == prev ifTrue:[
                        $*
                    ] ifFalse:[
                        prev := char.
                        char.
                    ].    
                ]
                thenSelect:[:char | char ~~ $*].

    "/ If longer than 6 characters, take first and last 3            
    word size > 6 ifTrue:[
        word := (word copyFirst:3),(word copyLast:3)
    ].
    ^ word.

    "
     self new encode:'Catherine'            -> 'CTHRN'
     self new encode:'CatherineCatherine'   -> 'CTHHRN'
     self new encode:'Butter'               -> 'BTR'
     self new encode:'Byrne'                -> 'BYRN'
     self new encode:'Boern'                -> 'BRN'
     self new encode:'Smith'                -> 'SMTH'
     self new encode:'Smyth'                -> 'SMYTH'
     self new encode:'Kathryn'              -> 'KTHRYN'
    "

    "Created: / 28-07-2017 / 15:19:22 / cg"
    "Modified (comment): / 31-07-2017 / 15:14:31 / cg"
! !

!PhoneticStringUtilities::MetaphoneStringComparator class methodsFor:'documentation'!

documentation
"
   Ongoing work - do not use at the moment
   
   Encodes a string into a Metaphone value.

   Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
   Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.

    Hanging on the Metaphone by Lawrence Philips in Computer Language of Dec. 1990, p 39.
    Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
    https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm6

  They have had undocumented changes from the originally published algorithm.
  For more information, see https://issues.apache.org/jira/browse/CODEC-57

  Metaphone uses the following rules:

    Doubled letters except 'c' -> drop 2nd letter.
    Vowels are only kept when they are the first letter.
    B -> B unless at the end of a word after 'm' as in 'dumb'
    C -> X (sh) if -cia- or -ch-
    S if -ci-, -ce- or -cy-
    K otherwise, including -sch-
    D -> J if in -dge-, -dgy- or -dgi-; T otherwise
    F -> F
    G -> silent if in -gh- and not at end or before a vowel in -gn- or -gned- (also see dge etc. above)
    J if before i or e or y if not double gg; K otherwise
    H -> silent if after vowel and no vowel follows; H otherwise
    J -> J
    K -> silent if after 'c'; K otherwise
    L -> L
    M -> M
    N -> N
    P -> F if before 'h'; P otherwise
    Q -> K
    R -> R
    S -> X (sh) if before 'h' or in -sio- or -sia-; S otherwise
    T -> X (sh) if -tia- or -tio- 0 (th) if before 'h' silent if in -tch-; T otherwise
    V -> F
    W -> silent if not followed by a vowel W if followed by a vowel
    X -> KS
    Y -> silent if not followed by a vowel Y if followed by a vowel
    Z -> S

    Initial Letter Exceptions

    Initial kn-, gn- pn, ae- or wr- -> drop first letter
    Initial x- -> change to 's'
    Initial wh- -> change to 'w'


     self new encode:'a'
     self new encode:'dumb'
     self new encode:'MILLER'
     self new encode:'schmidt'
     self new encode:'schneider'
     self new encode:'FISCHER'
     self new encode:'HEDGY'
     self new encode:'weber'
     self new encode:'wagner'
     self new encode:'van gogh'
"
!

javaCode
"<<END
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.language;

import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;

/**
 * Encodes a string into a Metaphone value.
 * <p>
 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
 * <p>
 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
 * p 39.</CITE>
 * <p>
 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
 * </p>
 * <ul>
 * <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
 *  (broken link 4/30/2013) </li>
 * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
 *  (link checked 4/30/2013) </li>
 * </ul>
 * <p>
 * They have had undocumented changes from the originally published algorithm.
 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
 * <p>
 * This class is conditionally thread-safe.
 * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)}
 * but is not volatile, and accesses are not synchronized.
 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
 * after initial setup.
 *
 * @version $Id$
 */
public class Metaphone implements StringEncoder {

    /**
     * Five values in the English language
     */
    private static final String VOWELS = "AEIOU";

    /**
     * Variable used in Metaphone algorithm
     */
    private static final String FRONTV = "EIY";

    /**
     * Variable used in Metaphone algorithm
     */
    private static final String VARSON = "CSPTG";

    /**
     * The max code length for metaphone is 4
     */
    private int maxCodeLen = 4;

    /**
     * Creates an instance of the Metaphone encoder
     */
    public Metaphone() {
        super();
    }

    /**
     * Find the metaphone value of a String. This is similar to the
     * soundex algorithm, but better at finding similar sounding words.
     * All input is converted to upper case.
     * Limitations: Input format is expected to be a single ASCII word
     * with only characters in the A - Z range, no punctuation or numbers.
     *
     * @param txt String to find the metaphone code for
     * @return A metaphone code corresponding to the String supplied
     */
    public String metaphone(final String txt) {
        boolean hard = false;
        int txtLength;
        if (txt == null || (txtLength = txt.length()) == 0) {
            return "";
        }
        // single character is itself
        if (txtLength == 1) {
            return txt.toUpperCase(java.util.Locale.ENGLISH);
        }

        final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();

        final StringBuilder local = new StringBuilder(40); // manipulate
        final StringBuilder code = new StringBuilder(10); //   output
        // handle initial 2 characters exceptions
        switch(inwd[0]) {
        case 'K':
        case 'G':
        case 'P': /* looking for KN, etc*/
            if (inwd[1] == 'N') {
                local.append(inwd, 1, inwd.length - 1);
            } else {
                local.append(inwd);
            }
            break;
        case 'A': /* looking for AE */
            if (inwd[1] == 'E') {
                local.append(inwd, 1, inwd.length - 1);
            } else {
                local.append(inwd);
            }
            break;
        case 'W': /* looking for WR or WH */
            if (inwd[1] == 'R') {   // WR -> R
                local.append(inwd, 1, inwd.length - 1);
                break;
            }
            if (inwd[1] == 'H') {
                local.append(inwd, 1, inwd.length - 1);
                local.setCharAt(0, 'W'); // WH -> W
            } else {
                local.append(inwd);
            }
            break;
        case 'X': /* initial X becomes S */
            inwd[0] = 'S';
            local.append(inwd);
            break;
        default:
            local.append(inwd);
        } // now local has working string with initials fixed

        final int wdsz = local.length();
        int n = 0;

        while (code.length() < this.getMaxCodeLen() &&
               n < wdsz ) { // max code size of 4 works well
            final char symb = local.charAt(n);
            // remove duplicate letters except C
            if (symb !!= 'C' && isPreviousChar( local, n, symb ) ) {
                n++;
            } else { // not dup
                switch(symb) {
                case 'A':
                case 'E':
                case 'I':
                case 'O':
                case 'U':
                    if (n == 0) {
                        code.append(symb);
                    }
                    break; // only use vowel if leading char
                case 'B':
                    if ( isPreviousChar(local, n, 'M') &&
                         isLastChar(wdsz, n) ) { // B is silent if word ends in MB
                        break;
                    }
                    code.append(symb);
                    break;
                case 'C': // lots of C special cases
                    /* discard if SCI, SCE or SCY */
                    if ( isPreviousChar(local, n, 'S') &&
                         !!isLastChar(wdsz, n) &&
                         FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
                        break;
                    }
                    if (regionMatch(local, n, "CIA")) { // "CIA" -> X
                        code.append('X');
                        break;
                    }
                    if (!!isLastChar(wdsz, n) &&
                        FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
                        code.append('S');
                        break; // CI,CE,CY -> S
                    }
                    if (isPreviousChar(local, n, 'S') &&
                        isNextChar(local, n, 'H') ) { // SCH->sk
                        code.append('K');
                        break;
                    }
                    if (isNextChar(local, n, 'H')) { // detect CH
                        if (n == 0 &&
                            wdsz >= 3 &&
                            isVowel(local,2) ) { // CH consonant -> K consonant
                            code.append('K');
                        } else {
                            code.append('X'); // CHvowel -> X
                        }
                    } else {
                        code.append('K');
                    }
                    break;
                case 'D':
                    if (!!isLastChar(wdsz, n + 1) &&
                        isNextChar(local, n, 'G') &&
                        FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
                        code.append('J'); n += 2;
                    } else {
                        code.append('T');
                    }
                    break;
                case 'G': // GH silent at end or before consonant
                    if (isLastChar(wdsz, n + 1) &&
                        isNextChar(local, n, 'H')) {
                        break;
                    }
                    if (!!isLastChar(wdsz, n + 1) &&
                        isNextChar(local,n,'H') &&
                        !!isVowel(local,n+2)) {
                        break;
                    }
                    if (n > 0 &&
                        ( regionMatch(local, n, "GN") ||
                          regionMatch(local, n, "GNED") ) ) {
                        break; // silent G
                    }
                    if (isPreviousChar(local, n, 'G')) {
                        // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
                        hard = true;
                    } else {
                        hard = false;
                    }
                    if (!!isLastChar(wdsz, n) &&
                        FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
                        !!hard) {
                        code.append('J');
                    } else {
                        code.append('K');
                    }
                    break;
                case 'H':
                    if (isLastChar(wdsz, n)) {
                        break; // terminal H
                    }
                    if (n > 0 &&
                        VARSON.indexOf(local.charAt(n - 1)) >= 0) {
                        break;
                    }
                    if (isVowel(local,n+1)) {
                        code.append('H'); // Hvowel
                    }
                    break;
                case 'F':
                case 'J':
                case 'L':
                case 'M':
                case 'N':
                case 'R':
                    code.append(symb);
                    break;
                case 'K':
                    if (n > 0) { // not initial
                        if (!!isPreviousChar(local, n, 'C')) {
                            code.append(symb);
                        }
                    } else {
                        code.append(symb); // initial K
                    }
                    break;
                case 'P':
                    if (isNextChar(local,n,'H')) {
                        // PH -> F
                        code.append('F');
                    } else {
                        code.append(symb);
                    }
                    break;
                case 'Q':
                    code.append('K');
                    break;
                case 'S':
                    if (regionMatch(local,n,"SH") ||
                        regionMatch(local,n,"SIO") ||
                        regionMatch(local,n,"SIA")) {
                        code.append('X');
                    } else {
                        code.append('S');
                    }
                    break;
                case 'T':
                    if (regionMatch(local,n,"TIA") ||
                        regionMatch(local,n,"TIO")) {
                        code.append('X');
                        break;
                    }
                    if (regionMatch(local,n,"TCH")) {
                        // Silent if in "TCH"
                        break;
                    }
                    // substitute numeral 0 for TH (resembles theta after all)
                    if (regionMatch(local,n,"TH")) {
                        code.append('0');
                    } else {
                        code.append('T');
                    }
                    break;
                case 'V':
                    code.append('F'); break;
                case 'W':
                case 'Y': // silent if not followed by vowel
                    if (!!isLastChar(wdsz,n) &&
                        isVowel(local,n+1)) {
                        code.append(symb);
                    }
                    break;
                case 'X':
                    code.append('K');
                    code.append('S');
                    break;
                case 'Z':
                    code.append('S');
                    break;
                default:
                    // do nothing
                    break;
                } // end switch
                n++;
            } // end else from symb !!= 'C'
            if (code.length() > this.getMaxCodeLen()) {
                code.setLength(this.getMaxCodeLen());
            }
        }
        return code.toString();
    }

    private boolean isVowel(final StringBuilder string, final int index) {
        return VOWELS.indexOf(string.charAt(index)) >= 0;
    }

    private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
        boolean matches = false;
        if( index > 0 &&
            index < string.length() ) {
            matches = string.charAt(index - 1) == c;
        }
        return matches;
    }

    private boolean isNextChar(final StringBuilder string, final int index, final char c) {
        boolean matches = false;
        if( index >= 0 &&
            index < string.length() - 1 ) {
            matches = string.charAt(index + 1) == c;
        }
        return matches;
    }

    private boolean regionMatch(final StringBuilder string, final int index, final String test) {
        boolean matches = false;
        if( index >= 0 &&
            index + test.length() - 1 < string.length() ) {
            final String substring = string.substring( index, index + test.length());
            matches = substring.equals( test );
        }
        return matches;
    }

    private boolean isLastChar(final int wdsz, final int n) {
        return n + 1 == wdsz;
    }


    /**
     * Encodes an Object using the metaphone algorithm.  This method
     * is provided in order to satisfy the requirements of the
     * Encoder interface, and will throw an EncoderException if the
     * supplied object is not of type java.lang.String.
     *
     * @param obj Object to encode
     * @return An object (or type java.lang.String) containing the
     *         metaphone code which corresponds to the String supplied.
     * @throws EncoderException if the parameter supplied is not
     *                          of type java.lang.String
     */
    @Override
    public Object encode(final Object obj) throws EncoderException {
        if (!!(obj instanceof String)) {
            throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
        }
        return metaphone((String) obj);
    }

    /**
     * Encodes a String using the Metaphone algorithm.
     *
     * @param str String object to encode
     * @return The metaphone code corresponding to the String supplied
     */
    @Override
    public String encode(final String str) {
        return metaphone(str);
    }

    /**
     * Tests is the metaphones of two strings are identical.
     *
     * @param str1 First of two strings to compare
     * @param str2 Second of two strings to compare
     * @return <code>true</code> if the metaphones of these strings are identical,
     *        <code>false</code> otherwise.
     */
    public boolean isMetaphoneEqual(final String str1, final String str2) {
        return metaphone(str1).equals(metaphone(str2));
    }

    /**
     * Returns the maxCodeLen.
     * @return int
     */
    public int getMaxCodeLen() { return this.maxCodeLen; }

    /**
     * Sets the maxCodeLen.
     * @param maxCodeLen The maxCodeLen to set
     */
    public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }

}
END>>"
! !

!PhoneticStringUtilities::MetaphoneStringComparator methodsFor:'api'!

encode:txt
    "
     self new encode:'a'
     self new encode:'MILLER'
     self new encode:'schmidt'
     self new encode:'schneider'
     self new encode:'FISCHER'
     self new encode:'HEDGY'
     self new encode:'weber'
     self new encode:'wagner'
     self new encode:'van gogh'
     self new encode:'dumb'
    "
    
    |hard txtLength local code inwd ch ch2 wdsz n|

    inwd := txt.
    hard := false.
    txtLength := 0.
    
    (txtLength := txt size) == 0 ifTrue:[^ ''].

    inwd := txt asUppercase.
    "/ single character is itself
    (txtLength == 1) ifTrue:[
        ^ inwd        
    ].

    code := '' writeStream.
    local := inwd.
    
    "/ handle initial 2 characters exceptions
    ch := inwd at:(0+1).
    ch2 := inwd at:(1+1).
    ('KGP' includes:ch) ifTrue:[  
        "/ looking for KN, etc
        "/ KNx -> Nx 
        "/ GNx -> Nx 
        "/ PNx -> Nx 
        (ch2 == $N) ifTrue:[
            local := (inwd from:1+1)
        ].
    ] ifFalse:[
    ('A' includes:ch) ifTrue:[  
        "/ looking for AE
        "/ AEx -> Ex 
        (ch2 == $E) ifTrue:[
            local := (inwd from:1+1)
        ].
    ] ifFalse:[
    ('W' includes:ch) ifTrue:[  
        "/ looking for WR or WH 
        (ch2 == $R) ifTrue:[
            "/ WRx -> Wx 
            local := (inwd from:1+1)
        ] ifFalse:[
            (ch2 == $H) ifTrue:[
                "/ // WH -> W 
                local := 'W',(inwd from:2+1).
            ]
        ]
    ] ifFalse:[
    ('X' includes:ch) ifTrue:[  
        "/ initial X becomes S */
        "/ Xx -> Sx 
        local := 'S',(inwd from:1+1).
    ]]]].
    
    "/ now local has working string with initials fixed
    
    wdsz := local size.
    n := 1.

    [ n <= wdsz ] whileTrue:[
        "/ max code size of 4 works well

        |symb prevChar nextChar nextNextChar isLastChar isPrevToLastChar|

        symb := local at:n.
        (n > 1) ifTrue:[ prevChar := local at:(n-1) ]. 
        (isLastChar := (n == wdsz)) ifFalse:[
            nextChar := local at:(n+1) 
        ].    
        isPrevToLastChar := (n == (wdsz-1)).
        (n+2) <= wdsz ifTrue:[
            nextNextChar := local at:(n+2)
        ].
        
        "/ remove duplicate letters except C and except first
        (symb == $C or:[ nextChar ~~ symb or:[ n == 1] ]) ifTrue:[
            "/ not dup
            ('AEIOU' includes:symb) ifTrue:[
                "/ only use vowel if leading char
                (n == 1) ifTrue:[
                    code nextPut:symb
                ]
            ] ifFalse:[
            ('B' includes:symb) ifTrue:[
                "/    if ( isPreviousChar(local, n, 'M') &&
                "/         isLastChar(wdsz, n) ) { // B is silent if word ends in MB
                "/        break;
                "/    }
                "/    code.append(symb);
                "/    break;
                (isLastChar and:[ prevChar == $M]) ifTrue:[
                    "/ B is silent if word ends in MB 
                ] ifFalse:[
                    code nextPut:symb.
                ].    
            ] ifFalse:[
            ('C' includes:symb) ifTrue:[
                "/ lots of C special cases    
                "/    /* discard if SCI, SCE or SCY */
                "/    if ( isPreviousChar(local, n, 'S') &&
                "/         !!isLastChar(wdsz, n) &&
                "/         FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
                "/        break;
                "/    }
                "/    if (regionMatch(local, n, "CIA")) { // "CIA" -> X
                "/        code.append('X');
                "/        break;
                "/    }
                "/    if (!!isLastChar(wdsz, n) &&
                "/        FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
                "/        code.append('S');
                "/        break; // CI,CE,CY -> S
                "/    }
                "/    if (isPreviousChar(local, n, 'S') &&
                "/        isNextChar(local, n, 'H') ) { // SCH->sk
                "/        code.append('K');
                "/        break;
                "/    }
                "/    if (isNextChar(local, n, 'H')) { // detect CH
                "/        if (n == 0 &&
                "/            wdsz >= 3 &&
                "/            isVowel(local,2) ) { // CH consonant -> K consonant
                "/            code.append('K');
                "/        } else {
                "/            code.append('X'); // CHvowel -> X
                "/        }
                "/    } else {
                "/        code.append('K');
                "/    }
                "/    break;
                (prevChar == $S and:[ 'EIY' includes:nextChar ]) ifTrue:[
                    "/ discard if SCI, SCE or SCY
                ] ifFalse:[
                    ((nextChar == $I) and:[ nextNextChar == $A ]) ifTrue:[
                        "/  "CIA" -> X 
                        code nextPut:$X
                    ] ifFalse:[
                        ('IEY' includes:nextChar) ifTrue:[
                            "/ CI,CE,CY -> S
                            code nextPut:$S
                        ] ifFalse:[ 
                           ((prevChar == $S) and:[ nextChar == $H ]) ifTrue:[
                               "/ SCH->sk
                                code nextPut:$K
                            ] ifFalse:[ 
                                nextChar == $H ifTrue:[
                                    "/ CH
                                    ('AEIOU' includes:nextNextChar) ifTrue:[
                                        code nextPut:$K "/ CH consonant -> K consonant 
                                    ] ifFalse:[    
                                        code nextPut:$X "/ CHvowel -> X
                                    ]    
                                ] ifFalse:[
                                    code nextPut:$K
                                ].    
                            ]
                        ]
                    ]
                ].    
                
            ] ifFalse:[
            ('D' includes:symb) ifTrue:[
                "/    if (!!isLastChar(wdsz, n + 1) &&
                "/        isNextChar(local, n, 'G') &&
                "/        FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
                "/        code.append('J'); n += 2;
                "/    } else {
                "/        code.append('T');
                "/    }
                "/    break;
                ((nextChar == $G)
                and:[ (local from:n) startsWithAnyOf:#('DGE' 'DGI' 'DGY') ])
                ifTrue:[
                    code nextPut:$J.
                    n := n + 2.
                ] ifFalse:[    
                    code nextPut:$T.
                ].    
            ] ifFalse:[
            ('G' includes:symb) ifTrue:[
                "/    GH silent at end or before consonant
                "/    if (isLastChar(wdsz, n + 1) &&
                "/        isNextChar(local, n, 'H')) {
                "/        break;
                "/    }
                "/    if (!!isLastChar(wdsz, n + 1) &&
                "/        isNextChar(local,n,'H') &&
                "/        !!isVowel(local,n+2)) {
                "/        break;
                "/    }
                "/    if (n > 0 &&
                "/        ( regionMatch(local, n, "GN") ||
                "/          regionMatch(local, n, "GNED") ) ) {
                "/        break; // silent G
                "/    }
                "/    if (isPreviousChar(local, n, 'G')) {
                "/        // NOTE: Given that duplicated chars are removed, I dont see how this can ever be true
                "/        hard = true;
                "/    } else {
                "/        hard = false;
                "/    }
                "/    if (!!isLastChar(wdsz, n) &&
                "/        FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
                "/        !!hard) {
                "/        code.append('J');
                "/    } else {
                "/        code.append('K');
                "/    }
                "/    break;
                (isPrevToLastChar and:[ nextChar == $H ]) ifTrue:[
                    "/ GH silent at end
                ] ifFalse:[
                    (isPrevToLastChar not and:[ nextChar == $H 
                      and:[ ('AEIOU' includes:nextNextChar) not ]]) ifTrue:[
                        "/ GH silent before consonant
                    ] ifFalse:[
                        (n > 1 and:[ nextChar == $N ]) ifTrue:[
                            "/ GN -> silent G
                        ] ifFalse:[
                            hard := (prevChar == $G).
                            (isLastChar not and:[ hard not and:[ ('EIY' includes:nextChar) ]]) ifTrue:[
                                code nextPut:$J
                            ] ifFalse:[
                                code nextPut:$K
                            ].    
                        ].    
                    ].    
                ].    
            ] ifFalse:[
            ('H' includes:symb) ifTrue:[
                "/    case 'H':
                "/        if (isLastChar(wdsz, n)) {
                "/            break; // terminal H
                "/        }
                "/        if (n > 0 &&
                "/            VARSON.indexOf(local.charAt(n - 1)) >= 0) {
                "/            break;
                "/        }
                "/        if (isVowel(local,n+1)) {
                "/            code.append('H'); // Hvowel
                "/        }
                "/        break;
                isLastChar ifTrue:[
                    "/ ignore terminal H
                ] ifFalse:[
                    ('CSPTG' includes:prevChar) ifTrue:[
                        "/ ignore CH, SH, PH, TH, GH (H treated there)
                    ] ifFalse:[
                        ('AEIOU' includes:nextChar) ifTrue:[
                            "/ Hvowel
                            code nextPut:$H
                        ].    
                    ].    
                ].    
            ] ifFalse:[
            ('FJLMNR' includes:symb) ifTrue:[
                "/    case 'F':
                "/    case 'J':
                "/    case 'L':
                "/    case 'M':
                "/    case 'N':
                "/    case 'R':
                "/        code.append(symb);
                "/        break;
                code nextPut:symb.
            ] ifFalse:[
            ('K' includes:symb) ifTrue:[
                "/    case 'K':
                "/        if (n > 0) { // not initial
                "/            if (!!isPreviousChar(local, n, 'C')) {
                "/                code.append(symb);
                "/            }
                "/        } else {
                "/            code.append(symb); // initial K
                "/        }
                "/        break;
                n > 1 ifTrue:[
                    "/ not initial
                    prevChar ~~ $C ifTrue:[
                        code nextPut:$K. "/ initial K
                    ].    
                ] ifFalse:[
                    code nextPut:$K. "/ initial K
                ].
            ] ifFalse:[
            ('P' includes:symb) ifTrue:[
                "/    case 'P':
                "/        if (isNextChar(local,n,'H')) {
                "/            // PH -> F
                "/            code.append('F');
                "/        } else {
                "/            code.append(symb);
                "/        }
                "/        break;
                nextChar == $H ifTrue:[
                    "/ PH -> F
                    code nextPut:$F.
                ] ifFalse:[
                    code nextPut:symb.
                ].    
            ] ifFalse:[
            ('Q' includes:symb) ifTrue:[
                "/    case 'Q':
                "/        code.append('K');
                "/        break;
                code nextPut:$K

            ] ifFalse:[
            ('S' includes:symb) ifTrue:[
                "/    case 'S':
                "/        if (regionMatch(local,n,"SH") ||
                "/            regionMatch(local,n,"SIO") ||
                "/            regionMatch(local,n,"SIA")) {
                "/            code.append('X');
                "/        } else {
                "/            code.append('S');
                "/        }
                "/        break;
                "/ SH -> X  (as in shave or ashton)
                "/ SIO -> X 
                "/ SIA -> X (as in ASIA)
                ((nextChar == $H) 
                  or:[ ((nextChar == $I) and:[ 'OA' includes:nextNextChar])]
                ) ifTrue:[
                    code nextPut:$X
                ] ifFalse:[
                    code nextPut:$S
                ]
            ] ifFalse:[
            ('T' includes:symb) ifTrue:[
                "/    case 'T':
                "/        if (regionMatch(local,n,"TIA") ||
                "/            regionMatch(local,n,"TIO")) {
                "/            code.append('X');
                "/            break;
                "/        }
                "/        if (regionMatch(local,n,"TCH")) {
                "/            // Silent if in "TCH"
                "/            break;
                "/        }
                "/        // substitute numeral 0 for TH (resembles theta after all)
                "/        if (regionMatch(local,n,"TH")) {
                "/            code.append('0');
                "/        } else {
                "/            code.append('T');
                "/        }
                "/        break;
                (nextChar == $I and:[ 'AO' includes:nextNextChar]) ifTrue:[
                    code nextPut:$X.
                ] ifFalse:[
                    (nextChar == $C and:[ nextNextChar == $H]) ifTrue:[
                        "/ Silent if in "TCH"
                        "/ cg - huh; hutch - methinksthereisat
                    ] ifFalse:[
                        "/ substitute numeral 0 for TH (resembles theta after all)
                        nextChar == $H ifTrue:[
                            code nextPut:$0.
                        ] ifFalse:[
                            code nextPut:$T.
                        ].    
                    ].    
                ].    
            ] ifFalse:[
            ('V' includes:symb) ifTrue:[
                "/    case 'V':
                "/        code.append('F'); break;
                code nextPut:$F

            ] ifFalse:[
            ('WY' includes:symb) ifTrue:[
                "/    case 'W':
                "/    case 'Y': // silent if not followed by vowel
                "/        if (!!isLastChar(wdsz,n) &&
                "/            isVowel(local,n+1)) {
                "/            code.append(symb);
                "/        }
                "/        break;

                "/ silent if not followed by vowel 
                (isLastChar not and:[ 'AEIOU' includes:nextChar ]) ifTrue:[
                    code nextPut:symb
                ].    
            ] ifFalse:[
            ('X' includes:symb) ifTrue:[
                "/    case 'X':
                "/        code.append('K');
                "/        code.append('S');
                "/        break;
                code nextPutAll:'KS'
            ] ifFalse:[
            ('Z' includes:symb) ifTrue:[
                "/    case 'Z':
                "/        code.append('S');
                "/        break;
                code nextPut:$S
            ] ifFalse:[
                "/    default:
                "/        // do nothing
                "/        break;
            ]]]]]]]]]]]]]]]]. "/ end switch
        ]. "/ end else from symb !!= 'C'
        n := n + 1.
    ].
    ^ code contents

    "Created: / 02-08-2017 / 09:51:31 / cg"
    "Modified: / 03-08-2017 / 14:55:22 / cg"
! !

!PhoneticStringUtilities::SoundexStringComparator class methodsFor:'documentation'!

documentation
"
    WARNING: this is the so called 'simplified soundex' algorithm;
      there are more variants like miracode (american soundex) or
      mysqlSoundex around.
      
      Be sure to use the correct algorithm, if the generated strings must be compatible
      (otherwise, the differences are probably too small to be noticed as effect, but
      your search will be different)

    The following was copied from http://www.civilsolutions.com.au/publications/dedup.htm

    SOUNDEX is a phonetic coding algorithm that ignores many of the unreliable
    components of names, but by doing so reports more matches. 

    There are some variations around in the literature; 
    the following is called 'simplified soundex', and the rules for coding a name are:

    1. The first letter of the name is used in its un-coded form to serve as the prefix
       character of the code. (The rest of the code is numerical).

    2. Thereafter, W and H are ignored entirely.

    3. A, E, I, 0, U, Y are not assigned a code number, but do serve as 'separators' (see Step 5).

    4. Other letters of the name are converted to a numerical equivalent:
                 B, P, F, V              1 
                 C, G, J, K, Q, S, X, Z  2 
                 D, T                    3 
                 L                       4 
                 M, N                    5 
                 R                       6 

    5. There are two exceptions: 
        1. Letters that follow prefix letters which would, if coded, have the same
           numerical code, are ignored in all cases unless a ''separator'' (see Step 3) precedes them.

        2. The second letter of any pair of consonants having the same code number is likewise ignored, 
           i.e. unless there is a ''separator'' between them in the name.

    6. The final SOUNDEX code consists of the prefix letter plus three numerical characters.
       Longer codes are truncated to this length, and shorter codes are extended to it by adding zeros.

    Notice, that in another variant, w and h are treated slightly differently.
    This is only of relevance, if you need to reconstruct original soundex codes of other programs
    or for the original 1880 us census data.
     SoundexStringComparator  new encode:'Ashcraft' -> 'A226'
    vs.
     MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
    
    Also notice, that soundex deals better with english. 
    For german and other languages, other algorithms may provide better results.
"
! !

!PhoneticStringUtilities::SoundexStringComparator methodsFor:'api'!

encode:word 
    |u p t prevCode|

    u := word asUppercase.
    p := u first asString.
    prevCode := self translate:u first.
    u from:2 to:u size do:[:c | 
        t := self translate:c.
        (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
            p := p , t.
            p size == 4 ifTrue:[^ p ].
        ].
        prevCode := t
    ].
    [ p size < 4 ] whileTrue:[
        p := p , '0'
    ].
    ^ (p copyFrom:1 to:4)

    "
     self new encode:'washington' -> 'W252'
     self new encode:'lee'        -> 'L000'
     self new encode:'Gutierrez'  -> 'G362'
     self new encode:'Pfister'    -> 'P236'
     self new encode:'Jackson'    -> 'J250'
     self new encode:'Tymczak'    -> 'T522'
    "
    
    "notice:
     MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
     self new encode:'Ashcraft'   -> 'A226'
    "

    "Created: / 28-07-2017 / 15:21:23 / cg"
    "Modified (comment): / 01-08-2017 / 19:01:43 / cg"
! !

!PhoneticStringUtilities::SoundexStringComparator methodsFor:'private'!

translate:aCharacter
    "use simple if's for more speed when compiled"

    "vowels serve as separators"
    aCharacter == $A ifTrue:[^ '0' ].         
    aCharacter == $E ifTrue:[^ '0' ].
    aCharacter == $I ifTrue:[^ '0' ].
    aCharacter == $O ifTrue:[^ '0' ].
    aCharacter == $U ifTrue:[^ '0' ].
    aCharacter == $Y ifTrue:[^ '0' ].

    aCharacter == $B ifTrue:[^ '1' ]. 
    aCharacter == $P ifTrue:[^ '1' ]. 
    aCharacter == $F ifTrue:[^ '1' ]. 
    aCharacter == $V ifTrue:[^ '1' ]. 

    aCharacter == $C ifTrue:[^ '2' ]. 
    aCharacter == $S ifTrue:[^ '2' ]. 
    aCharacter == $K ifTrue:[^ '2' ]. 
    aCharacter == $G ifTrue:[^ '2' ]. 
    aCharacter == $J ifTrue:[^ '2' ]. 
    aCharacter == $Q ifTrue:[^ '2' ]. 
    aCharacter == $X ifTrue:[^ '2' ]. 
    aCharacter == $Z ifTrue:[^ '2' ]. 

    aCharacter == $D ifTrue:[^ '3' ]. 
    aCharacter == $T ifTrue:[^ '3' ]. 

    aCharacter == $L ifTrue:[^ '4' ]. 

    aCharacter == $M ifTrue:[^ '5' ]. 
    aCharacter == $N ifTrue:[^ '5' ]. 

    aCharacter == $R ifTrue:[^ '6' ]. 
    ^ nil

    "Modified: / 02-08-2017 / 01:35:40 / cg"
    "Modified (comment): / 02-08-2017 / 14:30:11 / cg"
! !

!PhoneticStringUtilities::MySQLSoundexStringComparator class methodsFor:'documentation'!

documentation
"
    MySQL soundex is like american Soundex (i.e. miracode) without the 4 character limitation,
    and also removing vokals first, then removing duplicate codes
    (whereas the soundex code does this in reverse order).

    These variations are important, if you need the miracode soundex codes to be generated.
"
! !

!PhoneticStringUtilities::MySQLSoundexStringComparator methodsFor:'api'!

encode:word 
    "same as inherited, but cares for 0, W and H"

    |u p t prevCode|

    u := word asUppercase.
    p := u first asString.
    prevCode := self translate:u first.
    u from:2 to:u size do:[:c |
        t := self translate:c.
        (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
            p := p , t.
        ].
        (t ~= '0' and:[ c ~= $W and:[c ~= $H]]) ifTrue:[
            prevCode := t.
        ].
    ].
    [ p size < 4 ] whileTrue:[
        p := p , '0'
    ].
    ^ p

    "Created: / 28-07-2017 / 15:23:41 / cg"
    "Modified: / 31-07-2017 / 17:53:51 / cg"
    "Modified (comment): / 02-08-2017 / 14:31:15 / cg"
! !

!PhoneticStringUtilities::NYSIISStringComparator class methodsFor:'documentation'!

documentation
"
    NYSIIS Algorithm:

    1.
        remove all ''S'' and ''Z'' chars from the end of the surname 

    2.
        transcode initial strings
            MAC => MC
            PF => F

    3.
        Transcode trailing strings as follows,
        
            IX => IC
            EX => EC
            YE,EE,IE => Y
            NT,ND => D 

    4.
        transcode ''EV'' to ''EF'' if not at start of name

    5.
        use first character of name as first character of key 

    6.
        remove any ''W'' that follows a vowel 

    7.
        replace all vowels with ''A'' 

    8.
        transcode ''GHT'' to ''GT'' 

    9.
        transcode ''DG'' to ''G'' 

    10.
        transcode ''PH'' to ''F'' 

    11.
        if not first character, eliminate all ''H'' preceded or followed by a vowel 

    12.
        change ''KN'' to ''N'', else ''K'' to ''C'' 

    13.
        if not first character, change ''M'' to ''N'' 

    14.
        if not first character, change ''Q'' to ''G'' 

    15.
        transcode ''SH'' to ''S'' 

    16.
        transcode ''SCH'' to ''S'' 

    17.
        transcode ''YW'' to ''Y'' 

    18.
        if not first or last character, change ''Y'' to ''A'' 

    19.
        transcode ''WR'' to ''R'' 

    20.
        if not first character, change ''Z'' to ''S'' 

    21.
        transcode terminal ''AY'' to ''Y'' 

    22.
        remove traling vowels 

    23.
        collapse all strings of repeated characters 

    24.
        if first char of original surname was a vowel, append it to the code
"
! !

!PhoneticStringUtilities::NYSIISStringComparator methodsFor:'api'!

encode:aString 
    |k|

    k := self rule1:(aString asUppercase).
    "2. Transcode initial strings:  MAC => MC   PF => F"
    k := self rule2:k.
    k := self rule3:k.
    k := self rule4:k.
    k := self rule5:k.
    k := self rule6:k.
    k := self rule7:k.
    k := self rule8:k.
    k := self rule9:k.
    k := self rule10:k.
    k := self rule11:k.
    k := self rule12:k.
    k := self rule13:k.
    k := self rule14:k.
    k := self rule15:k.
    k := self rule16:k.
    k := self rule17:k.
    k := self rule18:k.
    k := self rule19:k.
    k := self rule20:k.
    k := self rule21:k.
    k := self rule22:k.
    k := self rule23:k.
    k := self rule24:k originalKey:aString.
    ^ k

    "
     self new encode:'hello'
     self new encode:'bliss'
    "
    "
     self new phoneticStringsFor:'hello'
     self new phoneticStringsFor:'bliss'
    "

    "Created: / 28-07-2017 / 15:34:52 / cg"
    "Modified (comment): / 02-08-2017 / 14:31:47 / cg"
! !

!PhoneticStringUtilities::NYSIISStringComparator methodsFor:'private'!

rule10:key 
    "10. transcode 'PH' to 'F' "
    
    ^ self transcodeAll:'PH' of:key to:'F' startingAt:1

    "Modified (format): / 02-08-2017 / 14:34:27 / cg"
!

rule11:key 
    |k c|

    "11. if not first character, eliminate all 'H' preceded or followed by a vowel "
    k := key copy.
    c := SortedCollection sortBlock:[:a :b | b < a ].
    2 to:key size do:[:i | 
        (key at:i) = $H ifTrue:[
            ((key at:i - 1) isVowel 
                or:[ (i < key size) and:[ (key at:i + 1) isVowel ] ]) ifTrue:[ c add:i ]
        ]
    ].
    c do:[:n | 
        k := (k copyFrom:1 to:n - 1) , (k copyFrom:n + 1 to:k size)
    ].
    ^ k
!

rule12:key 
    |k|

    "12. change 'KN' to 'N', else 'K' to 'C' "
    k := self transcodeAll:'KN' of:key to:'K' startingAt:1.
    k := self transcodeAll:'K' of:k to:'C' startingAt:1.
    ^ k

    "Modified (format): / 02-08-2017 / 14:34:48 / cg"
!

rule13:key 
    "13. if not first character, change 'M' to 'N' "
    
    ^ self transcodeAll:'M' of:key to:'N' startingAt:2

    "Modified (format): / 02-08-2017 / 14:35:00 / cg"
!

rule14:key 
    "14. if not first character, change 'Q' to 'G' "
    
    ^ self transcodeAll:'Q' of:key to:'G' startingAt:2

    "Modified (format): / 02-08-2017 / 14:35:08 / cg"
!

rule15:key 
    "15. transcode 'SH' to 'S' "
    
    ^ self transcodeAll:'SH' of:key to:'S' startingAt:1

    "Modified (format): / 02-08-2017 / 14:35:18 / cg"
!

rule16:key 
    "16. transcode 'SCH' to 'S' "
    
    ^ self transcodeAll:'SCH' of:key to:'S' startingAt:1

    "Modified (format): / 02-08-2017 / 14:35:25 / cg"
!

rule17:key 
    "17. transcode 'YW' to 'Y' "
    
    ^ self transcodeAll:'YW' of:key to:'Y' startingAt:1

    "Modified (format): / 02-08-2017 / 14:35:33 / cg"
!

rule18:key 
    |k|

    "18. if not first or last character, change 'Y' to 'A' "
    k := self transcodeAll:'Y' of:key to:'A' startingAt:2.
    key last = $Y ifTrue:[
        k at:k size put:$Y
    ].
    ^ k

    "Modified (format): / 02-08-2017 / 14:35:44 / cg"
!

rule19:key 
    "19. transcode 'WR' to 'R' "
    
    ^ self transcodeAll:'WR' of:key to:'R' startingAt:1

    "Modified (format): / 02-08-2017 / 14:35:52 / cg"
!

rule1:key 
    |k|

    k := key copy.
     "1. Remove all 'S' and 'Z' chars from the end of the name"
    [
        'SZ' includes:k last
    ] whileTrue:[ k := k copyFrom:1 to:(k size - 1) ].
    ^ k
!

rule20:key 
    "20. if not first character, change 'Z' to 'S' "
    
    ^ self transcodeAll:'Z' of:key to:'S' startingAt:2

    "Modified (format): / 02-08-2017 / 14:36:00 / cg"
!

rule21:key 
    "21. transcode terminal 'AY' to 'Y' "
    
    ^ self transcodeAll:'AY' of:key to:'Y' startingAt:key size - 1

    "Modified (format): / 02-08-2017 / 14:36:08 / cg"
!

rule22:key 
    |k|

    "22. remove trailing vowels "
    k := key copy.
    [ k last isVowel ] whileTrue:[
        k := k copyButLast
    ].
    ^ k

    "Modified: / 02-08-2017 / 14:36:42 / cg"
!

rule23:key 
    |k c|

    "23. collapse all strings of repeated characters "
    k := key copy.
    c := SortedCollection sortBlock:[:a :b | b < a ].
    k size to:2 do:[:i | 
        (k at:i) = (k at:i - 1) ifTrue:[
            c add:i
        ]
    ].
    c do:[:n | 
        k := (k copyFrom:1 to:n - 1) , (k copyFrom:n + 1 to:k size)
    ].
    ^ k
!

rule24:key originalKey:originalKey 
    |k|

    "24. if first char of original surname was a vowel, append it to the code"
    k := key copy.
    originalKey first isVowel ifTrue:[
        k := k , originalKey first asString asUppercase
    ].
    ^ k
!

rule2:key 
     "2. Transcode initial strings:  MAC => MC   PF => F"

    |k|

    k := key copy.
    (k startsWith:'MAC') ifTrue:[
        k := 'MC' , (k copyFrom:4)
    ].
    (k startsWith:'PF') ifTrue:[
        k := 'F' , (k copyFrom:3)
    ].
    ^ k

    "Modified (format): / 02-08-2017 / 14:31:40 / cg"
!

rule3:key 
    |k|

    "3. Transcode trailing strings as follows:
        IX => IC
          EX => EC
          YE, EE, IE => Y
           NT, ND => D"
           
    k := key copy.
    k := self transcodeTrailing:#( 'IX' ) of:k to:'IC'.
    k := self transcodeTrailing:#( 'EX' ) of:k to:'EC'.
    k := self transcodeTrailing:#( 'YE' 'EE' 'IE' ) of:k to:'Y'.
    k := self transcodeTrailing:#( 'NT' 'ND' ) of:k to:'D'.
    ^ k

    "Modified (format): / 02-08-2017 / 14:32:24 / cg"
!

rule4:key 
    "4. Transcode 'EV' to 'EF' if not at start of name"
    
    ^ self transcodeAll:'EV' of:key to:'EF' startingAt:2

    "Modified (format): / 02-08-2017 / 14:32:35 / cg"
!

rule5:key 
    "5. Use first character of name as first character of key.  
        Ignored because we're doing an in-place conversion"
    
    ^ key

    "Modified (comment): / 02-08-2017 / 14:32:45 / cg"
!

rule6:key 
    |k i|

    "6. Remove any 'W' that follows a vowel"
    k := key copy.
    i := 2.
    [
        (i := k indexOf:$W startingAt:i) > 0
    ] whileTrue:[
        (k at:i - 1) isVowel ifTrue:[
            k := (k copyFrom:1 to:i - 1) , (k copyFrom:i + 1 to:k size).
            i := i - 1
        ]
    ].
    ^ k
!

rule7:key 
    "7. replace all vowels with 'A' "
    ^ key collect:[:ch | ch isVowel ifTrue:[$A] ifFalse:[ch]].

    "Modified: / 02-08-2017 / 14:33:56 / cg"
!

rule8:key 
    "8. transcode 'GHT' to 'GT' "
    
    ^ self transcodeAll:'GHT' of:key to:'GT' startingAt:1

    "Modified (format): / 02-08-2017 / 14:34:05 / cg"
!

rule9:key 
    "9. transcode 'DG' to 'G' "
    
    ^ self transcodeAll:'DG' of:key to:'G' startingAt:1

    "Modified (format): / 02-08-2017 / 14:34:15 / cg"
!

transcodeAll:aString of:key to:replacementString startingAt:start 
    |k i|

    k := key copy.
    [
        (i := k indexOfSubCollection:aString startingAt:start) > 0
    ] whileTrue:[
        k := (k copyFrom:1 to:i - 1) , replacementString 
                    , (k copyFrom:i + aString size to:k size)
    ].
    ^ k
!

transcodeTrailing:anArrayOfStrings of:key to:replacementString 
    |answer|

    answer := key copy.
    anArrayOfStrings do:[:aString | 
        answer := self 
                    transcodeAll:aString
                    of:answer
                    to:replacementString
                    startingAt:(answer size - aString size) + 1
    ].
    ^ answer
! !

!PhoneticStringUtilities::PhonemStringComparator class methodsFor:'documentation'!

documentation
"
    Implementation of the PHONEM algorithm, as described in
    'Georg Wilde and Carsten Meyer, Doppelgaenger gesucht -
    Ein Programm fuer kontextsensitive phonetische Textumwandlung
    ct Magazin fuer Computer & Technik 25/1998'
    
    This algorithm deals better with the german language (it cares for umlauts)
"
! !

!PhoneticStringUtilities::PhonemStringComparator methodsFor:'api'!

encode:aString 
    |s idx t t2|

    s := aString asUppercase.

    idx := 1.
    [idx < (s size-1)] whileTrue:[
        t2 := nil.
        t := s copyFrom:idx to:idx+1.
        t = 'SC' ifTrue:[ t2 := 'C' ]
        ifFalse:[ t = 'SZ' ifTrue:[ t2 := 'C' ]
        ifFalse:[ t = 'CZ' ifTrue:[ t2 := 'C' ]
        ifFalse:[ t = 'TZ' ifTrue:[ t2 := 'C' ]
        ifFalse:[ t = 'TS' ifTrue:[ t2 := 'C' ]
        ifFalse:[ t = 'KS' ifTrue:[ t2 := 'X' ]
        ifFalse:[ t = 'PF' ifTrue:[ t2 := 'V' ]
        ifFalse:[ t = 'QU' ifTrue:[ t2 := 'KW' ]
        ifFalse:[ t = 'PH' ifTrue:[ t2 := 'V' ]
        ifFalse:[ t = 'UE' ifTrue:[ t2 := 'Y' ]
        ifFalse:[ t = 'AE' ifTrue:[ t2 := 'E' ]
        ifFalse:[ t = 'OE' ifTrue:[ t2 := 'Ö' ]
        ifFalse:[ t = 'EI' ifTrue:[ t2 := 'AY' ]
        ifFalse:[ t = 'EY' ifTrue:[ t2 := 'AY' ]
        ifFalse:[ t = 'EU' ifTrue:[ t2 := 'OY' ]
        ifFalse:[ t = 'AU' ifTrue:[ t2 := 'A§' ]
        ifFalse:[ t = 'OU' ifTrue:[ t2 := '§ ' ]]]]]]]]]]]]]]]]].
        t2 notNil ifTrue:[
            s := (s copyTo:idx-1),t2,(s copyFrom:idx+2)
        ] ifFalse:[
            idx := idx + 1.
        ].
    ].

    "/ single character substitutions via tr
    s := s copyTransliterating:'ÖÄZKGQÜIJFWPT§' to:'YECCCCYYYVVDDUA'.
    s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'' complement:true squashDuplicates:false.
    s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'ABCDLMNORSUVWXY' complement:false squashDuplicates:true.
    ^ s

    "
     self basicNew encode:'müller'  -> 'MYLR'    
     self basicNew encode:'mueller' -> 'MYLR'    
     self basicNew encode:'möller'  -> 'MYLR'
     self basicNew encode:'miller'  -> 'MYLR'     
     self basicNew encode:'muller'  -> 'MULR' 
     self basicNew encode:'muler'   -> 'MULR' 

     self basicNew phoneticStringsFor:'müller'  #('MYLR')    
     self basicNew phoneticStringsFor:'mueller' #('MYLR')    
     self basicNew phoneticStringsFor:'möller'  #('MYLR')
     self basicNew phoneticStringsFor:'miller'  #('MYLR')     
     self basicNew phoneticStringsFor:'muller'  #('MULR') 
     self basicNew phoneticStringsFor:'muler'   #('MULR') 
     
     self basicNew phoneticStringsFor:'schmidt'     #('CMYD')
     self basicNew phoneticStringsFor:'schneider'   #('CNAYDR')
     self basicNew phoneticStringsFor:'fischer'     #('VYCR')
     self basicNew phoneticStringsFor:'weber'       #('VBR')
     self basicNew phoneticStringsFor:'weeber'      #('VBR')
     self basicNew phoneticStringsFor:'webber'      #('VBR')
     self basicNew phoneticStringsFor:'wepper'      #('VBR')
     
     self basicNew phoneticStringsFor:'meyer'       #('MAYR')
     self basicNew phoneticStringsFor:'maier'       #('MAYR')
     self basicNew phoneticStringsFor:'mayer'       #('MAYR')
     self basicNew phoneticStringsFor:'mayr'        #('MAYR')
     self basicNew phoneticStringsFor:'meir'        #('MAYR')
     
     self basicNew phoneticStringsFor:'wagner'      #('VACNR')
     self basicNew phoneticStringsFor:'schulz'      #('CULC')
     self basicNew phoneticStringsFor:'becker'      #('BCR')
     self basicNew phoneticStringsFor:'hoffmann'    #('OVMAN')
     self basicNew phoneticStringsFor:'haus'        #('AUS')
     
     self basicNew phoneticStringsFor:'schäfer'     #('CVR')
     self basicNew phoneticStringsFor:'scheffer'    #('CVR')
     self basicNew phoneticStringsFor:'schaeffer'   #('CVR')
     self basicNew phoneticStringsFor:'schaefer'    #('CVR')
    "

    "Created: / 28-07-2017 / 15:38:08 / cg"
! !

!PhoneticStringUtilities::Caverphone2StringComparator class methodsFor:'documentation'!

documentation
"
    Caverphone (2) Algorithm:

    see http://caversham.otago.ac.nz/files/working/ctp150804.pdf
    
    Caverphone 2.0 is being made available for free use for the benefit of anyone who has a use for it,
    with the proviso that the Caversham Project at the University of Otago should be acknowledged as the
    original source (which is hereby done ;-).

    •  Start with a Surname or Firstname
    •  Convert to lowercase
        This coding system is case sensitive, implementations should acknowledge that a is not the same as A
    •  Remove anything not A-Z
        The main intention of this is to remove spaces, hyphens, and apostrophes.
        example:  o'brian becomes obrian
    •  If the name starts with cough make it cou2f
        2 is being used as a temporary placeholder to indicate a consonant which we are no longer interested in.
    •  If the name starts with rough make it rou2f
    •  If the name starts with tough make it tou2f
    •  If the name starts with enough make it enou2f
    •  If the name starts with gn make it 2n
    •  If the name ends with mb make it m2
    •  replace cq with 2q
    •  replace ci with si
    •  replace ce with se
    •  replace cy with sy
    •  replace tch with 2ch
    •  replace c with k
    •  replace q with k
    •  replace x with k
    •  replace v with f
    •  replace dg with 2g
    •  replace tio with sio
    •  replace tia with sia
    •  replace d with t
    •  replace ph with fh
    •  replace b with p
    •  replace sh with s2
    •  replace z with s
    •  replace and initial vowel with an A
    •  replace all other vowels with a 3
        3 is a temporary placeholder marking a vowel
    •  replace 3gh3 with 3kh3
        Exceptions are dealt with before the general case. gh between vowels is an except of the more general gh rule.
    •  replace gh with 22
    •  replace g with k
    •  replace groups of the letter s with a S
        Continuous strings of s are replace by a single S
    •  replace groups of the letter t with a T
    •  replace groups of the letter p with a P
    •  replace groups of the letter k with a K
    •  replace groups of the letter f with a F
    •  replace groups of the letter m with a M
    •  replace groups of the letter n with a N
    •  replace w3 with W3
    •  replace wy with Wy
    •  replace wh3 with Wh3
    •  replace why with Why
    •  replace w with 2
    •  replace and initial h with an A
    •  replace all other occurrences of h with a 2
    •  replace r3 with R3
    •  replace ry with Ry
    •  replace r with 2
    •  replace l3 with L3
    •  replace ly with Ly
    •  replace l with 2
    •  replace j with y
    •  replace y3 with Y3
    •  replace y with 2
    •  remove all 2s
    •  remove all 3s
    •  put six (v1) / ten (v2) 1s on the end
    •  take the first six characters as the code (caverphone 1);
       / take the first ten characters as the code (caverphone 2);

     self new encode:'david'      -> 'TFT1111111'
     self new encode:'whittle'    -> 'WTA1111111'

     self new encode:'Stevenson'  -> 'STFNSN1111'
     self new encode:'Peter'      -> 'PTA1111111'

     self new encode:'washington' -> 'WSNKTN1111'
     self new encode:'lee'        -> 'LA11111111'
     self new encode:'Gutierrez'  -> 'KTRS111111'
     self new encode:'Pfister'    -> 'PFSTA11111'
     self new encode:'Jackson'    -> 'YKSN111111'
     self new encode:'Tymczak'    -> 'TMKSK11111'

     self new encode:'add'        -> 'AT11111111'
     self new encode:'aid'        -> 'AT11111111'
     self new encode:'at'         -> 'AT11111111'
     self new encode:'art'        -> 'AT11111111'
     self new encode:'earth'      -> 'AT11111111'
     self new encode:'head'       -> 'AT11111111'
     self new encode:'old'        -> 'AT11111111'

     self new encode:'ready'      -> 'RTA1111111'
     self new encode:'rather'     -> 'RTA1111111'
     self new encode:'able'       -> 'APA1111111'
     self new encode:'appear'     -> 'APA1111111'

     self new encode:'Deedee'     -> 'TTA1111111'
"
! !

!PhoneticStringUtilities::Caverphone2StringComparator methodsFor:'api'!

encode:word 
    |txt|

    word size == 0 ifTrue:[^ '1111111111' ].
    
    "/ 1. Convert to lowercase
    txt := word asLowercase.

    "/ 2. Remove anything not A-Z
    txt := txt select:#isLetter.

    #(
    "/  oldSeq newSeq repeat

    "/ 2.5. Remove final e
        'e$' '' false
    "/ 3. Handle various start options
        '^cough' 'cou2f' false
        '^rough' 'rou2f' false
        '^tough' 'tou2f' false
        '^enough' 'enou2f' false
        '^trough' 'trou2f' false

        '^gn' '2n' false
        'mb$' 'm2' false
        
    "/ 4. Handle replacements
        'cq' '2q' true
        'ci' 'si' true
        'ce' 'se' true
        'cy' 'sy' true
        'tch' '2ch' true
        'c' 'k' true
        'q' 'k' true
        'x' 'k' true
        'v' 'f' true
        'dg' '2g' true
        'tio' 'sio' true
        'tia' 'sia' true
        'd' 't' true
        'ph' 'fh' true
        'b' 'p' true
        'sh' 's2' true
        'z' 's' true
        
        '^a' 'A' false
        '^e' 'A' false
        '^i' 'A' false
        '^o' 'A' false
        '^u' 'A' false
        
        'a' '3' true
        'e' '3' true
        'i' '3' true
        'o' '3' true
        'u' '3' true
        'j' 'y' true 
        
        '^y3' 'Y3' false 
        '^y' 'A' false

        'y' '3'  true
        '3gh3' '3kh3' true
        'gh' '22' true
        'g' 'k' true
        's'  'S' true
        'SS' 'S' true
        't'  'T' true
        'TT' 'T' true
        'p'  'P' true
        'PP' 'P' true
        'k'  'K' true
        'KK' 'K' true
        'f'  'F' true
        'FF' 'F' true
        'm'  'M' true
        'MM' 'M' true
        'n'  'N' true
        'NN' 'N' true
        'w3' 'W3' true
        'wh3' 'Wh3' true
        'w$' '3'  false
        'w' '2' true
        '^h' 'A' false
        'h' '2' true
        'r3' 'R3' true
        'r$' '3'  false
        'r' '2' true
        'l3' 'L3' true
        'l$' '3' false
        'l' '2' true

    "/ 5. removals

        '2' '' true
        '3$' 'A' true
        '3' '' true
    ) inGroupsOf:3 do:[:pat :repl :repeat|
        |s txtBefore|

        txtBefore := txt.
        (pat startsWith:$^) ifTrue:[
            s := pat copyButFirst.
            repeat ifTrue:[
                [txt startsWith:s] whileTrue:[ txt := repl,(txt copyButFirst:s size) ]
            ] ifFalse:[
                (txt startsWith:s) ifTrue:[ txt := repl,(txt copyButFirst:s size) ]
            ].    
        ] ifFalse:[
            (pat endsWith:$$) ifTrue:[
                s := pat copyButLast.
                repeat ifTrue:[
                    [txt endsWith:s] whileTrue:[ txt := (txt copyButLast:s size),repl ]
                ] ifFalse:[
                    (txt endsWith:s) ifTrue:[ txt := (txt copyButLast:s size),repl ]
                ]
            ] ifFalse:[
                repeat ifTrue:[
                    txt := txt copyReplaceAllSubcollections:pat with:repl
                ] ifFalse:[
                    txt := txt copyReplaceSubcollection:pat with:repl
                ]    
            ]    
        ].
        "/ txt ~= txtBefore ifTrue:[
        "/     Transcript showCR:(pat,' | ',repl,' -> ',txt).
        "/ ].    
    ].    

    "/ 6. put ten 1s on the end
    txt := txt,'1111111111'.
    
    "/ 7. take the first ten characters as the code
    ^ txt copyTo:10

    "
     self new encode:'david'      -> 'TFT1111111'
     self new encode:'whittle'    -> 'WTA1111111'

     self new encode:'Stevenson'  -> 'STFNSN1111'
     self new encode:'Peter'      -> 'PTA1111111'

     self new encode:'washington' -> 'WSNKTN1111'
     self new encode:'lee'        -> 'LA11111111'
     self new encode:'Gutierrez'  -> 'KTRS111111'
     self new encode:'Pfister'    -> 'PFSTA11111'
     self new encode:'Jackson'    -> 'YKSN111111'
     self new encode:'Tymczak'    -> 'TMKSK11111'

     self new encode:'add'        -> 'AT11111111'
     self new encode:'aid'        -> 'AT11111111'
     self new encode:'at'         -> 'AT11111111'
     self new encode:'art'        -> 'AT11111111'
     self new encode:'earth'      -> 'AT11111111'
     self new encode:'head'       -> 'AT11111111'
     self new encode:'old'        -> 'AT11111111'

     self new encode:'ready'      -> 'RTA1111111'
     self new encode:'rather'     -> 'RTA1111111'
     self new encode:'able'       -> 'APA1111111'
     self new encode:'appear'     -> 'APA1111111'

     self new encode:'Deedee'     -> 'TTA1111111'
    "

    "Created: / 28-07-2017 / 15:21:23 / cg"
    "Modified: / 02-08-2017 / 01:42:35 / cg"
! !

!PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator class methodsFor:'documentation'!

documentation
"
     The 'Kölner Phonetik' (cologne phonetic) code is for the german language 
     what the soundex code is for english:
        it returns similar strings for similar sounding words 
     (but is specifically aware of the pronunciation of German and eastern languages) . 

     There are some other differences to soundex, though: 
        its length is not limited to 4, but depends on the length of the original string;
        it does not start with the first character of the input, but returns a pure numeric string.

     This algorithm was described by Postel 1969,
     See  http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik

    self new phoneticStringsFor:'Müller-Lüdenscheidt' -> #('65752682')
"
!

examples
"
   words sounding similar (german pronunciation) will deliver a similar code:
   
     #(
        'Müller'
        'Miller'
        'Mueller'
        'Mühler'
        'Mühlherr'
        'Mülherr'
        'Myler'
        'Millar'
        'Myller'
        'Müllar'
        'Müler'
        'Muehler'
        'Mülller'
        'Müllerr'
        'Muehlherr'
        'Muellar'
        'Mueler'
        'Mülleer'
        'Mueller'
        'Nüller'
        'Nyller'
        'Niler'
        'Czerny'
        'Tscherny'
        'Czernie'
        'Tschernie'
        'Schernie'
        'Scherny'
        'Scherno'
        'Czerne'
        'Zerny'
        'Tzernie'
        'Breschnew'
        'Breschnew'
        'Breschneff'
        'Breschnjeff'
        'Braeschneff'
        'Braessneff' 
        'Pressneff' 
        'Presznäph'
        'Präschnäf' 
        'Breschnjeff' 
        'Breschnijeff' 
        'Breschnieff' 
        'Bräschnieff' 
        'Braschnieff' 
        'Broschnieff' 
     ) do:[:w |
         Transcript show:w; show:'->'; showCR:(PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:w)
     ].
"
! !

!PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator methodsFor:'api'!

encode: aString
    "return a koelner phonetic code.
     The koelnerPhonetic code is for the german language what the soundex code is for english;
     it returns simular strings for similar sounding words. 
     There are some differences to soundex, though: 
        its length is not limited to 4, but depends on the length of the original string;
        it does not start with the first character of the input.
     This algorithm is described by Postel 1969"

    |in ret val rslt|

    in := aString withoutSeparators asLowercase.
    in := in copyReplaceString:'ph' withString:'f'.
    (in includesAny:'öäüß') ifTrue:[
        in := in copyReplaceAll:$ü withAll:'u'.
        in := in copyReplaceAll:$ä withAll:'a'.
        in := in copyReplaceAll:$ö withAll:'o'.
        in := in copyReplaceAll:$ß withAll:'ss'.
    ].
    in := in select:[:ch | ch isLetter].
    in := '#',in,'#'.

    ret := ''.
    1 to:in size-2 do:[:i |
        |sub|

        sub := in copyFrom:i to:i+2.
        val := (i==1) 
                    ifTrue:[ self convertFirst:sub ] 
                    ifFalse:[ self convertRest:sub ].
        ret := ret,val
    ].

    ret := ret select:[:ch | ch ~= $-].

    (ret startsWith:'0') ifTrue:[
        ret := '0',(ret select:[:ch | ch ~= $0]).
    ] ifFalse:[
        ret := ret select:[:ch | ch ~= $0].
    ].

    rslt := String streamContents:[:s |
        |prev|

        ret do:[:ch |
            ch ~= prev ifTrue:[
                s nextPut:ch
            ].
            prev := ch.
        ].
      ].
    ^ rslt.

    "
     #(
        'Müller'
        'Miller'
        'Mueller'
        'Mühler'
        'Mühlherr'
        'Mülherr'
        'Myler'
        'Millar'
        'Myller'
        'Müllar'
        'Müler'
        'Muehler'
        'Mülller'
        'Müllerr'
        'Muehlherr'
        'Muellar'
        'Mueler'
        'Mülleer'
        'Mueller'
        'Nüller'
        'Nyller'
        'Niler'
        'Czerny'
        'Tscherny'
        'Czernie'
        'Tschernie'
        'Schernie'
        'Scherny'
        'Scherno'
        'Czerne'
        'Zerny'
        'Tzernie'
        'Breschnew'
        'Breschnew'
        'Breschneff'
        'Breschnjeff'
        'Braeschneff'
        'Braessneff' 
        'Pressneff' 
        'Presznäph'
        'Präschnäf' 
        'Breschnjeff' 
        'Breschnijeff' 
        'Breschnieff' 
     ) do:[:w |
         Transcript show:w; show:'->'; showCR:(PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:w)
     ].
    "

    "
     PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Breschnew' -> '17863'
     PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Breschneff' -> '17863'
     PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Braeschneff' -> '17863'
     PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Braessneff' -> '17863'
     PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Pressneff' -> '17863'
     PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Presznäph' -> '17863'
     PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Präschnäf' -> '17863'
     PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Breschnjeff' -> '17863'
     PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Breschnijeff' -> '17863'
     PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Breschnieff' -> '17863'
    "
    "
     self basicNew encode:'müller'      -> '657'   
     self basicNew encode:'möller'      -> '657'
     self basicNew encode:'miller'      -> '657'     
     self basicNew encode:'muller'      -> '657'
     self basicNew encode:'muler'       -> '657'
     self basicNew encode:'schmidt'     -> '862'   
     self basicNew encode:'schneider'   -> '8627' 
     self basicNew encode:'fischer'     -> '387' 
     self basicNew encode:'weber'       -> '317' 
     self basicNew encode:'meyer'       -> '67' 
     self basicNew encode:'wagner'      -> '3467' 
     self basicNew encode:'schulz'      -> '858'
     self basicNew encode:'becker'      -> '147'
     self basicNew encode:'hoffmann'    -> '036'
     self basicNew encode:'schäfer'     -> '837' 
    "

    "Created: / 28-07-2017 / 15:24:33 / cg"
! !

!PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator methodsFor:'private'!

convertFirst:chars
    |c2 c3|
    
    chars size == 3 ifTrue:[
        c2 := (chars at:2).
        c2 == $a ifTrue:[^ '0'].
        c2 == $e ifTrue:[^ '0'].
        c2 == $i ifTrue:[^ '0'].
        c2 == $j ifTrue:[^ '0'].
        c2 == $y ifTrue:[^ '0'].
        c2 == $o ifTrue:[^ '0'].
        c2 == $u ifTrue:[^ '0'].

        c2 == $c ifTrue:[
            c3 := (chars at:3).
            (c3 == $a) ifTrue:[^ '4'].
            (c3 == $h) ifTrue:[^ '4'].
            (c3 == $k) ifTrue:[^ '4'].
            (c3 == $l) ifTrue:[^ '4'].
            (c3 == $o) ifTrue:[^ '4'].
            (c3 == $q) ifTrue:[^ '4'].
            (c3 == $r) ifTrue:[^ '4'].
            (c3 == $u) ifTrue:[^ '4'].
            (c3 == $x) ifTrue:[^ '4'].
            ^ '8'
        ].    
        
"/        #(
"/            ('#a#' '0')
"/            ('#e#' '0')
"/            ('#i#' '0')
"/            ('#j#' '0')
"/            ('#y#' '0')
"/            ('#o#' '0')
"/            ('#u#' '0')
"/
"/            ('#ca' '4')
"/            ('#ch' '4')
"/            ('#ck' '4')
"/            ('#cl' '4')
"/            ('#co' '4')
"/            ('#cq' '4')
"/            ('#cr' '4')
"/            ('#cu' '4')
"/            ('#cx' '4')
"/
"/            ('#c#' '8')
"/        ) do:[:pair | 
"/            (pair first match:chars) ifTrue:[
"/                ^ pair second
"/            ]
"/        ].
    ].
    
    ^ self convertRest:chars

    "Modified: / 29-07-2017 / 14:22:20 / cg"
!

convertRest:chars
    chars size == 3 ifFalse:[
        self error:'cannot happen'.
        ^ '?' 
    ].
    
    #(
        "/ used to be matchpattern code,
        "/ but doing these glob-matches is too slow.
        "/ changed to:
        "/    start nil  code
        "/    nil   end  code
        "/    nil   char code
        "/    
        (nil 'ds' " '#ds' " '8')
        (nil 'dc' " '#dc' " '8')
        (nil 'dz' " '#dz' " '8')
        (nil 'ts' " '#ts' " '8')
        (nil 'tc' " '#tc' " '8')
        (nil 'tz' " '#tz' " '8')
        (nil $d   " '#d#' " '2')
        (nil $t   " '#t#' " '2')
        ('cx' nil " 'cx#' " '8')
        ('kx' nil " 'kx#' " '8')
        ('qx' nil " 'qx#' " '8')
        (nil $x   " '#x#' " '48')
        ('sc' nil " 'sc#' " '8')
        ('sz' nil " 'sz#' " '8')
        (nil 'ca' " '#ca' " '4')
        (nil 'co' " '#co' " '4')
        (nil 'cu' " '#cu' " '4')
        (nil 'ch' " '#ch' " '4')
        (nil 'ck' " '#ck' " '4')
        (nil 'cx' " '#cx' " '4')
        (nil 'cq' " '#cq' " '4')
        (nil $c   " '#c#' " '8')
        (nil $a   " '#a#' " '0')
        (nil $e   " '#e#' " '0')
        (nil $i   " '#i#' " '0')
        (nil $j   " '#j#' " '0')
        (nil $y   " '#y#' " '0')
        (nil $o   " '#o#' " '0')
        (nil $u   " '#u#' " '0')
        (nil $h   " '#h#' " '-')
        (nil $l   " '#l#' " '5')
        (nil $r   " '#r#' " '7')
        (nil $m   " '#m#' " '6')
        (nil $n   " '#n#' " '6')
        (nil $s   " '#s#' " '8')
        (nil $z   " '#z#' " '8')
        (nil $b   " '#b#' " '1')
        (nil $p   " '#p#' " '1')
        (nil $f   " '#f#' " '3')
        (nil $v   " '#v#' " '3')
        (nil $w   " '#w#' " '3')
        (nil $g   " '#g#' " '4')
        (nil $k   " '#k#' " '4')
        (nil $q   " '#q#' " '4')
        (nil nil  " '###' " '?')
    ) do:[:vector |
        |v1 v2|
        
        (v1 := vector at:1) notNil ifTrue:[
            "/ prefix
            (chars startsWith:v1) ifTrue:[^ (vector at:3) ].
        ] ifFalse:[                       
            (v2 := vector at:2) isCharacter ifTrue:[
                "/ middle character compare
                (chars at:2) == v2 ifTrue:[^ (vector at:3) ]. 
            ] ifFalse:[    
                v2 isString ifTrue:[
                    "/ suffix
                    (chars endsWith:v2) ifTrue:[^ (vector at:3) ].
                ] ifFalse:[
                   ^ '?' 
                ]
            ]
        ].
        
        "/ (vector first match:chars) ifTrue:[
        "/     ^ vector second
        "/ ]
    ].

    self error:'cannot happen'

    "Modified: / 29-07-2017 / 14:17:38 / cg"
! !

!PhoneticStringUtilities::MiracodeStringComparator class methodsFor:'documentation'!

documentation
"
    Miracode (also called << American Soundex >>) is like Soundex with the 
    addition that h and w are discarded if they separate consonants.

    These variants may be specifically important because they were used in 
    U.S. National Archives. 
    Most archive data were encoded with Miracode, 
    but there are some (older) entries encoded with Simplified Soundex. 

    The HW-rule was documented as a standard in 1910, 
    but actually data of 1880, 1900 and 1910 
    censuses were encoded with mixed methods.

     self new encode:'washington' -> 'W252'
     self new encode:'lee'        -> 'L000'
     self new encode:'Gutierrez'  -> 'G362'
     self new encode:'Pfister'    -> 'P236'
     self new encode:'Jackson'    -> 'J250'
     self new encode:'Tymczak'    -> 'T522'

    notice:
     MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
     SoundexStringComparator new encode:'Ashcraft'  -> 'A226'

    see also:            
        https://www.archives.gov/research/census/soundex.html
"
! !

!PhoneticStringUtilities::MiracodeStringComparator methodsFor:'private'!

encode:word 
    "same as inherited, but cares for W and H"
    
    |u p t prevCode|

    u := word asUppercase.
    p := u first asString.
    prevCode := self translate:u first.
    u from:2 to:u size do:[:c | 
        t := self translate:c.
        (t notNil 
        and:[ t ~= '0' 
        and:[ t ~= prevCode ]]) ifTrue:[
            p := p , t.
            p size == 4 ifTrue:[^ p ].
        ].
        (c ~= $W and:[c ~= $H]) ifTrue:[
            prevCode := t.
        ].
    ].
    [ p size < 4 ] whileTrue:[
        p := p , '0'
    ].
    ^ (p copyFrom:1 to:4)

    "Created: / 02-08-2017 / 00:19:47 / cg"
    "Modified (comment): / 02-08-2017 / 14:30:47 / cg"
! !

!PhoneticStringUtilities::SpanishPhoneticCodeStringComparator class methodsFor:'documentation'!

documentation
"
     The 'Spanish Phonetik' (spanish phonetic) code is for the spanish language 
     what the soundex code is for english:
        it returns similar strings for similar sounding words 
     (but is specifically aware of the pronunciation of spanish) . 

     There are some other differences to soundex, though: 
        its length is not limited to 4, but depends on the length of the original string;
        it does not start with the first character of the input, 
        but returns a pure numeric string,
        it uses different character groups

     This algorithm was described by Marıa del Pilar Angeles, Adrian Espino-Gamez, 
     and Jonathan Gil-Moncada, in 
        'Comparison of a Modiﬁed Spanish phonetic,
         Soundex, and Phonex coding functions during data matching process'
     See  https://www.researchgate.net/publication/285589803_Comparison_of_a_Modified_Spanish_Phonetic_Soundex_and_Phonex_coding_functions_during_data_matching_process

"
!

examples
"
   words sounding similar (german pronunciation) will deliver a similar code:
   
     #(
        'Marıa'
        'Pilar'
        'Angeles'
        'Adrian'
        'Gamez'
     ) do:[:w |
         Transcript show:w; show:'->'; showCR:(PhoneticStringUtilities::SpanishPhoneticCodeStringComparator new encode:w)
     ].
"
! !

!PhoneticStringUtilities::SpanishPhoneticCodeStringComparator methodsFor:'api'!

encode: aString
    "return a spanish phonetic code.
     The spanishPhonetic code is for the spanish language what the soundex code is for english;
     it returns simular strings for similar sounding words. 
     There are some differences to soundex, though: 
        its length is not limited to 4, but depends on the length of the original string;
        it does not start with the first character of the input,
        it uses different character groups.
     This algorithm is described by Marıa del Pilar Angeles, Adrian Espino-Gamez, 
     Jonathan Gil-Moncada."

    |in|

    in := aString withoutSeparators asUppercase.
    
    ^ String streamContents:[:out |
        |prev|
        
        in do:[:ch |
            ch == prev ifFalse:[
                ch == $P ifTrue:[
                    out nextPut:$0.
                ] ifFalse:[ ('BV' includes:ch)  ifTrue:[
                    out nextPut:$1.
                ] ifFalse:[ ('FH' includes:ch)  ifTrue:[
                    out nextPut:$2.
                ] ifFalse:[ ('DT' includes:ch)  ifTrue:[
                    out nextPut:$3.
                ] ifFalse:[ ('SZCX' includes:ch)  ifTrue:[
                    out nextPut:$4.
                ] ifFalse:[ ('YL' includes:ch)  ifTrue:[
                    out nextPut:$5.
                ] ifFalse:[ ('NŃM' includes:ch)  ifTrue:[
                    out nextPut:$6.
                ] ifFalse:[ ('QK' includes:ch)  ifTrue:[
                    out nextPut:$7.
                ] ifFalse:[ ('GJ' includes:ch)  ifTrue:[
                    out nextPut:$8.
                ] ifFalse:[ ('R' includes:ch)  ifTrue:[
                    out nextPut:$9.
                ]]]]]]]]]].
                prev := ch.
            ].
        ].
    ].

    "
     self new encode:'Jose'
    "

    "Created: / 28-07-2017 / 15:24:33 / cg"
    "Modified: / 01-08-2017 / 18:48:50 / cg"
! !

!PhoneticStringUtilities::SpanishPhoneticCodeStringComparator methodsFor:'private'!

convertFirst:chars
    |c2 c3|
    
    chars size == 3 ifTrue:[
        c2 := (chars at:2).
        c2 == $a ifTrue:[^ '0'].
        c2 == $e ifTrue:[^ '0'].
        c2 == $i ifTrue:[^ '0'].
        c2 == $j ifTrue:[^ '0'].
        c2 == $y ifTrue:[^ '0'].
        c2 == $o ifTrue:[^ '0'].
        c2 == $u ifTrue:[^ '0'].

        c2 == $c ifTrue:[
            c3 := (chars at:3).
            (c3 == $a) ifTrue:[^ '4'].
            (c3 == $h) ifTrue:[^ '4'].
            (c3 == $k) ifTrue:[^ '4'].
            (c3 == $l) ifTrue:[^ '4'].
            (c3 == $o) ifTrue:[^ '4'].
            (c3 == $q) ifTrue:[^ '4'].
            (c3 == $r) ifTrue:[^ '4'].
            (c3 == $u) ifTrue:[^ '4'].
            (c3 == $x) ifTrue:[^ '4'].
            ^ '8'
        ].    
        
"/        #(
"/            ('#a#' '0')
"/            ('#e#' '0')
"/            ('#i#' '0')
"/            ('#j#' '0')
"/            ('#y#' '0')
"/            ('#o#' '0')
"/            ('#u#' '0')
"/
"/            ('#ca' '4')
"/            ('#ch' '4')
"/            ('#ck' '4')
"/            ('#cl' '4')
"/            ('#co' '4')
"/            ('#cq' '4')
"/            ('#cr' '4')
"/            ('#cu' '4')
"/            ('#cx' '4')
"/
"/            ('#c#' '8')
"/        ) do:[:pair | 
"/            (pair first match:chars) ifTrue:[
"/                ^ pair second
"/            ]
"/        ].
    ].
    
    ^ self convertRest:chars

    "Modified: / 29-07-2017 / 14:22:20 / cg"
!

convertRest:chars
    chars size == 3 ifFalse:[
        self error:'cannot happen'.
        ^ '?' 
    ].
    
    #(
        "/ used to be matchpattern code,
        "/ but doing these glob-matches is too slow.
        "/ changed to:
        "/    start nil  code
        "/    nil   end  code
        "/    nil   char code
        "/    
        (nil 'ds' " '#ds' " '8')
        (nil 'dc' " '#dc' " '8')
        (nil 'dz' " '#dz' " '8')
        (nil 'ts' " '#ts' " '8')
        (nil 'tc' " '#tc' " '8')
        (nil 'tz' " '#tz' " '8')
        (nil $d   " '#d#' " '2')
        (nil $t   " '#t#' " '2')
        ('cx' nil " 'cx#' " '8')
        ('kx' nil " 'kx#' " '8')
        ('qx' nil " 'qx#' " '8')
        (nil $x   " '#x#' " '48')
        ('sc' nil " 'sc#' " '8')
        ('sz' nil " 'sz#' " '8')
        (nil 'ca' " '#ca' " '4')
        (nil 'co' " '#co' " '4')
        (nil 'cu' " '#cu' " '4')
        (nil 'ch' " '#ch' " '4')
        (nil 'ck' " '#ck' " '4')
        (nil 'cx' " '#cx' " '4')
        (nil 'cq' " '#cq' " '4')
        (nil $c   " '#c#' " '8')
        (nil $a   " '#a#' " '0')
        (nil $e   " '#e#' " '0')
        (nil $i   " '#i#' " '0')
        (nil $j   " '#j#' " '0')
        (nil $y   " '#y#' " '0')
        (nil $o   " '#o#' " '0')
        (nil $u   " '#u#' " '0')
        (nil $h   " '#h#' " '-')
        (nil $l   " '#l#' " '5')
        (nil $r   " '#r#' " '7')
        (nil $m   " '#m#' " '6')
        (nil $n   " '#n#' " '6')
        (nil $s   " '#s#' " '8')
        (nil $z   " '#z#' " '8')
        (nil $b   " '#b#' " '1')
        (nil $p   " '#p#' " '1')
        (nil $f   " '#f#' " '3')
        (nil $v   " '#v#' " '3')
        (nil $w   " '#w#' " '3')
        (nil $g   " '#g#' " '4')
        (nil $k   " '#k#' " '4')
        (nil $q   " '#q#' " '4')
        (nil nil  " '###' " '?')
    ) do:[:vector |
        |v1 v2|
        
        (v1 := vector at:1) notNil ifTrue:[
            "/ prefix
            (chars startsWith:v1) ifTrue:[^ (vector at:3) ].
        ] ifFalse:[                       
            (v2 := vector at:2) isCharacter ifTrue:[
                "/ middle character compare
                (chars at:2) == v2 ifTrue:[^ (vector at:3) ]. 
            ] ifFalse:[    
                v2 isString ifTrue:[
                    "/ suffix
                    (chars endsWith:v2) ifTrue:[^ (vector at:3) ].
                ] ifFalse:[
                   ^ '?' 
                ]
            ]
        ].
        
        "/ (vector first match:chars) ifTrue:[
        "/     ^ vector second
        "/ ]
    ].

    self error:'cannot happen'

    "Modified: / 29-07-2017 / 14:17:38 / cg"
! !

!PhoneticStringUtilities class methodsFor:'documentation'!

version
    ^ '$Header$'
!

version_CVS
    ^ '$Header$'
! !
author	Claus Gittinger <cg@exept.de>
	Sat, 02 May 2020 21:40:13 +0200
changeset 5476	7355a4b11cb6
parent 5456	3040ec2b4531
permissions	-rw-r--r--