hg/stx-libbasic2: comparison PhoneticStringUtilities.st

equal deleted inserted replaced

-:33b5fbfc4b5d
+:d6c31bb1e928
 	classVariableNames:''
 	poolDictionaries:''
 	privateIn:PhoneticStringUtilities
 !
+PhoneticStringUtilities::PhoneticStringComparator subclass:#DaitchMokotoffStringComparator
+	instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex
+		currentIndex skipCount'
+	classVariableNames:''
+	poolDictionaries:''
+	privateIn:PhoneticStringUtilities
+!
+PhoneticStringUtilities::PhoneticStringComparator subclass:#DoubleMetaphoneStringComparator
+	instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex
+		currentIndex skipCount'
+	classVariableNames:''
+	poolDictionaries:''
+	privateIn:PhoneticStringUtilities
+!
 PhoneticStringUtilities::PhoneticStringComparator subclass:#ExtendedSoundexStringComparator
 	instanceVariableNames:''
 	classVariableNames:'CharacterTranslationDict'
 	poolDictionaries:''
 	privateIn:PhoneticStringUtilities
 	classVariableNames:'CharacterTranslationDict'
 	poolDictionaries:''
 	privateIn:PhoneticStringUtilities
 !
+PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#MetaphoneStringComparator
+	instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex
+		currentIndex skipCount'
+	classVariableNames:''
+	poolDictionaries:''
+	privateIn:PhoneticStringUtilities
+!
 PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#SoundexStringComparator
 	instanceVariableNames:''
 	classVariableNames:'CharacterTranslationDict'
 	poolDictionaries:''
 	privateIn:PhoneticStringUtilities
 	classVariableNames:'CharacterTranslationDict'
 	poolDictionaries:''
 	privateIn:PhoneticStringUtilities
 !
-PhoneticStringUtilities::PhoneticStringComparator subclass:#DoubleMetaphoneStringComparator
+PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#Caverphone2StringComparator
-	instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex
+	instanceVariableNames:''
-		currentIndex skipCount'
+	classVariableNames:'CharacterTranslationDict'
-	classVariableNames:''
 	poolDictionaries:''
 	privateIn:PhoneticStringUtilities
 !
 PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#KoelnerPhoneticCodeStringComparator
 phonem
 described in Georg Wilde and Carsten Meyer, 'Doppelgaenger gesucht - Ein Programm fuer kontextsensitive phonetische Textumwandlung'
 from 'ct Magazin fuer Computer & Technik 25/1999'.
+mra
+Match Rating Approach Phonetic Algorithm Developed by Western Airlines in 1977.
+caverphone2
+better than soundex
+spanish phonetic code
+an algorithm slightly adjusted for Spanish names
 More info for German readers is found in:
 http://www.uni-koeln.de/phil-fak/phonetik/Lehre/MA-Arbeiten/magister_wilz.pdf
 "
 !
 sampleData
 "
 for the 50 most common German names, we get:
 ext.
-name        soundex   soundex   metaphone   phonet  phonet2     phonix      daitsch phonem      koeln
+name        soundex   soundex   metaphone   phonet  phonet2     phonix      daitsch phonem      koeln  caverphone2  mra
-müller      M460    54600000    MLR         MÜLA    NILA        M4000000    689000  MYLR        657
+müller      M460    54600000    MLR         MÜLA    NILA        M4000000    689000  MYLR        657    MLA1111111   MLR
-schmidt     S253    25300000    SKMTT       SHMIT   ZNIT        S5300000    463000  CMYD        8628
+schmidt     S530    25300000    SKMTT       SHMIT   ZNIT        S5300000    463000  CMYD        862    SKMT111111   SCHMDT
-schneider   S253    25360000    SKNTR       SHNEIDA ZNEITA      S5300000    463900  CNAYDR      8627
+schneider   S536    25360000    SKNTR       SHNEIDA ZNEITA      S5300000    463900  CNAYDR      8627   SKNTA11111   SCHNDR
-fischer     F260    12600000    FSKR        FISHA   FIZA        F8000000    749000  VYCR        387
+fischer     F260    12600000    FSKR        FISHA   FIZA        F8000000    749000  VYCR        387    FSKA111111   FSCHR
-weber       W160    16000000    WBR         WEBA    FEBA        $1000000    779000  VBR         317
+weber       W160    16000000    WBR         WEBA    FEBA        $1000000    779000  VBR         317    WPA1111111   WBR
-meyer       M600    56000000    MYR         MEIA    NEIA        M0000000    619000  MAYR        67
+meyer       M600    56000000    MYR         MEIA    NEIA        M0000000    619000  MAYR        67     MA11111111   MYR
-wagner      W256    25600000    WKNR        WAKNA   FAKNA       $2500000    756900  VACNR       367
+wagner      W256    25600000    WKNR        WAKNA   FAKNA       $2500000    756900  VACNR       3467   WKNA111111   WGNR
-schulz      S242    24200000    SKLS        SHULS   ZULZ        S4800000    484000  CULC        85
+schulz      S420    24200000    SKLS        SHULS   ZULZ        S4800000    484000  CULC        858    SKS1111111   SCHLZ
-becker      B260    12600000    BKR         BEKA    BEKA        B2000000    759000  BCR         147
+becker      B260    12600000    BKR         BEKA    BEKA        B2000000    759000  BCR         147    PKA1111111   BCKR
-hoffmann    H155    15500000    HFMN        HOFMAN  UFNAN       $7550000    576600  OVMAN       036
+hoffmann    H155    15500000    HFMN        HOFMAN  UFNAN       $7550000    576600  OVMAN       036    AFMN111111   HFMN
-schäfer     S216    21600000    SKFR        SHEFA   ZEFA        S7000000    479000  CVR         837
+schäfer     S160    21600000    SKFR        SHEFA   ZEFA        S7000000    479000  CVR         837    SKFA111111   SCHFR
+|cls|
+cls := MRAStringComparator.
+cls := SoundexStringComparator.
+cls := KoelnerPhoneticCodeStringComparator.
+cls := Caverphone2StringComparator.
+#('müller' 'schmidt' 'schneider' 'fischer' 'weber' 'meyer'
+'wagner' 'schulz'  'becker'    'hoffmann' 'schäfer')
+do:[:name |
+Transcript show:''''; show:name; show:''' -> '''; show:(cls encode:name); showCR:''''.
+].
+KoelnerPhoneticCodeStringComparator encode:'Müller-Lüdenscheidt'  -> '65752682'
 "
 ! !
 !PhoneticStringUtilities class methodsFor:'phonetic codes'!
 isAbstract
 ^ self == PhoneticStringUtilities::PhoneticStringComparator
 ! !
+!PhoneticStringUtilities::PhoneticStringComparator class methodsFor:'utilities'!
+encode:word
+^ (self new phoneticStringsFor:word) first
+"
+SoundexStringComparator encode:'Fischer'             -> 'F260'
+Caverphone2StringComparator encode:'Fischer'         -> 'FSKA111111'
+KoelnerPhoneticCodeStringComparator encode:'Fischer' -> '387'
+MRAStringComparator encode:'Fischer'                 -> 'FSCHR'
+SpanishPhoneticCodeStringComparator encode:'Fischer' -> '24429'
+"
+"Created: / 02-08-2017 / 01:15:50 / cg"
+! !
 !PhoneticStringUtilities::PhoneticStringComparator methodsFor:'api'!
 does:aString soundLike:anotherString
 |translations1 translations2|
 "/ please change as required (and remove this comment)
 "/ super initialize.   -- commented since inherited method does nothing
 ! !
-!PhoneticStringUtilities::ExtendedSoundexStringComparator class methodsFor:'documentation'!
+!PhoneticStringUtilities::DaitchMokotoffStringComparator class methodsFor:'documentation'!
 documentation
 "
-There are many extended and enhanced soundex variants around;
+self encode:'AUERBACH' -> 097400, 097500
-here is one, called 'extended soundex'. It is destribed for example in
-http://www.epidata.dk/documentation.php.
+Encodes a string into a Daitch-Mokotoff Soundex value.
-An author or origin is unknown.
+The Daitch-Mokotoff Soundex algorithm is a refinement of the Russell and American Soundex algorithms,
+yielding greater accuracy in matching especially Slavic and Yiddish surnames with similar pronunciation
-The number of digits is increased to 5 or 8;
+but differences in spelling.
-The first character is not used literally; instead it is encoded like the rest.
-This might have a negative effect on names starting with a vovel, though.
+The main differences compared to the other soundex variants are:
+- coded names are 6 digits long
-Overall, it can be doubted if this is really an enhancement after all.
+- the initial character of the name is coded
+- rules to encode multi-character n-grams
+- multiple possible encodings for the same name (branching)
+This implementation supports branching, depending on the used method:
+encode:aString            - branching disabled, only the first code will be returned
+phoneticStringsFor:aString - branching enabled, all codes will be returned, separated by '|'
+[see also:]
+'Wikipedia - Daitch-Mokotoff Soundex'
+http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex
+'Avotaynu - Soundexing and Genealogy'
+http://www.avotaynu.com/soundex.htm
 "
-! !
+!
-!PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'api'!
+javaCode
-phoneticStringsFor:aString
-"generates both an extended soundex of length 5 and one of length 8"
-|first second u t prevCode|
-u := aString asUppercase.
-first := second := ''.
-u do:[:c |
-t := self translate:c.
-(t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
-first := first , t.
-second := second , t.
-second size == 8 ifTrue:[
-^ Array with:(first copyTo:5) with:second
-].
-].
-prevCode := t
-].
-[ first size < 5 ] whileTrue:[
-first := first , '0'.
-second := second , '0'.
-].
-[ second size < 8 ] whileTrue:[
-second := second , '0'
-].
-^ Array with:first with:second
-"
-self basicNew phoneticStringsFor:'müller'  #('87900' '87900000')
-self basicNew phoneticStringsFor:'miller'  #('87900' '87900000')
-self basicNew phoneticStringsFor:'muller'  #('87900' '87900000')
-self basicNew phoneticStringsFor:'muler'   #('87900' '87900000')
-self basicNew phoneticStringsFor:'schmidt'    #('38600' '38600000')
-self basicNew phoneticStringsFor:'schneider'  #('38690' '38690000')
-self basicNew phoneticStringsFor:'fischer'    #('23900' '23900000')
-self basicNew phoneticStringsFor:'weber'      #('19000' '19000000')
-self basicNew phoneticStringsFor:'meyer'      #('89000' '89000000')
-self basicNew phoneticStringsFor:'wagner'     #('48900' '48900000')
-self basicNew phoneticStringsFor:'schulz'     #('37500' '37500000')
-self basicNew phoneticStringsFor:'becker'     #('13900' '13900000')
-self basicNew phoneticStringsFor:'hoffmann'   #('28800' '28800000')
-self basicNew phoneticStringsFor:'schäfer'    #('32900' '32900000')
-"
-! !
-!PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'private'!
-translate:aCharacter
-"use simple if's for more speed when compiled"
-"vowels serve as separators"
-aCharacter == $A ifTrue:[^ '0' ].
-aCharacter == $E ifTrue:[^ '0' ].
-aCharacter == $I ifTrue:[^ '0' ].
-aCharacter == $O ifTrue:[^ '0' ].
-aCharacter == $U ifTrue:[^ '0' ].
-aCharacter == $Y ifTrue:[^ '0' ].
-aCharacter == $B ifTrue:[^ '1' ].
-aCharacter == $P ifTrue:[^ '1' ].
-aCharacter == $F ifTrue:[^ '2' ].
-aCharacter == $V ifTrue:[^ '2' ].
-aCharacter == $C ifTrue:[^ '3' ].
-aCharacter == $S ifTrue:[^ '3' ].
-aCharacter == $K ifTrue:[^ '3' ].
-aCharacter == $G ifTrue:[^ '4' ].
-aCharacter == $J ifTrue:[^ '4' ].
-aCharacter == $Q ifTrue:[^ '5' ].
-aCharacter == $X ifTrue:[^ '5' ].
-aCharacter == $Z ifTrue:[^ '5' ].
-aCharacter == $D ifTrue:[^ '6' ].
-aCharacter == $G ifTrue:[^ '6' ].
-aCharacter == $T ifTrue:[^ '6' ].
-aCharacter == $L ifTrue:[^ '7' ].
-aCharacter == $M ifTrue:[^ '8' ].
-aCharacter == $N ifTrue:[^ '8' ].
-aCharacter == $R ifTrue:[^ '9' ].
-^ nil
-! !
-!PhoneticStringUtilities::SingleResultPhoneticStringComparator class methodsFor:'documentation'!
-documentation
-"
-documentation to be added.
-[author:]
-cg
-[instance variables:]
-[class variables:]
-[see also:]
-"
-! !
-!PhoneticStringUtilities::SingleResultPhoneticStringComparator methodsFor:'api'!
-encode:word
-^ self subclassResponsibility
-"Created: / 28-07-2017 / 15:20:49 / cg"
-!
-phoneticStringsFor:word
-^ Array with:(self encode:word)
-"Created: / 28-07-2017 / 15:20:38 / cg"
-! !
-!PhoneticStringUtilities::MRAStringComparator class methodsFor:'documentation'!
-documentation
-"
-Match Rating Approach Encoder
-The Western Airlines matching rating approach name encoder
-[see also:]
-https://en.wikipedia.org/wiki/Match_Rating_Approach
-G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery,
-''Accessing Individual Records from Personal Data Files Using Nonunique Identifiers''
-US National Institute of Standards and Technology, SP-500-2 (1977), p. 17.
-"
-!
-rCode
 "<<END
-## Copyright (c) 2015, James P. Howard, II <jh@jameshoward.us>
+/*
-##
+* Licensed to the Apache Software Foundation (ASF) under one or more
-## Redistribution and use in source and binary forms, with or without
+* contributor license agreements.  See the NOTICE file distributed with
-## modification, are permitted provided that the following conditions are
+* this work for additional information regarding copyright ownership.
-## met:
+* The ASF licenses this file to You under the Apache License, Version 2.0
-##
+* (the "License"); you may not use this file except in compliance with
-##     Redistributions of source code must retain the above copyright
+* the License.  You may obtain a copy of the License at
-##     notice, this list of conditions and the following disclaimer.
+*
-##
+*      http://www.apache.org/licenses/LICENSE-2.0
-##     Redistributions in binary form must reproduce the above copyright
+*
-##     notice, this list of conditions and the following disclaimer in
+* Unless required by applicable law or agreed to in writing, software
-##     the documentation and/or other materials provided with the
+* distributed under the License is distributed on an "AS IS" BASIS,
-##     distribution.
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-##
+* See the License for the specific language governing permissions and
-## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* limitations under the License.
-## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+*/
-## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+package org.apache.commons.codec.language;
-## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+import org.apache.commons.codec.CharEncoding;
-## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+import org.apache.commons.codec.EncoderException;
-## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+import org.apache.commons.codec.StringEncoder;
-## DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-## THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+import java.io.InputStream;
-## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+import java.util.*;
-## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/**
-#' @rdname mra
+* Encodes a string into a Daitch-Mokotoff Soundex value.
-#' @title Match Rating Approach Encoder
+* <p>
-#'
+* The Daitch-Mokotoff Soundex algorithm is a refinement of the Russell and American Soundex algorithms, yielding greater
-#' @description
+* accuracy in matching especially Slavic and Yiddish surnames with similar pronunciation but differences in spelling.
-#' The Western Airlines matching rating approach name encoder
+* </p>
-#'
+* <p>
-#' @param word string or vector of strings to encode
+* The main differences compared to the other soundex variants are:
-#' @param x MRA-encoded character vector
+* </p>
-#' @param y MRA-encoded character vector
+* <ul>
-#'
+* <li>coded names are 6 digits long
-#' @details
+* <li>the initial character of the name is coded
-#'
+* <li>rules to encode multi-character n-grams
-#' The variable \code{word} is the name to be encoded.  The variable
+* <li>multiple possible encodings for the same name (branching)
-#' \code{maxCodeLen} is \emph{not} supported in this algorithm encoder
+* </ul>
-#' because the algorithm itself is dependent upon its six-character
+* <p>
-#' length.  The variables \code{x} and \code{y} are MRA-encoded and are
+* This implementation supports branching, depending on the used method:
-#' compared to each other using the MRA comparison specification.
+* <ul>
-#'
+* <li>{@link #encode(String)} - branching disabled, only the first code will be returned
-#' @return The \code{mra_encode} function returns match rating approach
+* <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
-#' encoded character vector.  The \code{mra_compare} returns a boolean
+* </ul>
-#' vector which is \code{TRUE} if \code{x} and \code{y} pass the MRA
+* <p>
-#' comparison test.
+* Note: this implementation has additional branching rules compared to the original description of the algorithm. The
-#'
+* rules can be customized by overriding the default rules contained in the resource file
-#' @references
+* {@code org/apache/commons/codec/language/dmrules.txt}.
-#'
+* </p>
-#' G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery,
+* <p>
-#' \emph{Accessing Individual Records from Personal Data Files Using
+* This class is thread-safe.
-#' Nonunique Identifiers,} US National Institute of Standards and
+* </p>
-#' Technology, SP-500-2 (1977), p. 17.
+*
-#'
+* @see Soundex
-#' @family phonics
+* @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
-#'
+* @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
-#' @examples
+*
-#' mra_encode("William")
+* @version $Id$
-#' mra_encode(c("Peter", "Peady"))
+* @since 1.10
-#' mra_encode("Stevenson")
+*/
+public class DaitchMokotoffSoundex implements StringEncoder {
-#' @rdname mra
-#' @name mra_encode
+/**
-#' @export
+* Inner class representing a branch during DM soundex encoding.
-mra_encode <- function(word) {
+*/
+private static final class Branch {
-## First, remove any nonalphabetical characters and uppercase it
+private final StringBuilder builder;
-word <- gsub("[^[:alpha:]]*", "", word)
+private String cachedString;
-word <- toupper(word)
+private String lastReplacement;
-## First character of key = first character of name
+private Branch() {
-first <- substr(word, 1, 1)
+builder = new StringBuilder();
-word <- substr(word, 2, nchar(word))
+lastReplacement = null;
+cachedString = null;
-## Delete vowels not at the start of the word
+}
-word <- gsub("[AEIOU]", "", word)
-word <- paste(first, word, sep = "")
+/**
+* Creates a new branch, identical to this branch.
-## Remove duplicate consecutive characters
+*
-word <- gsub("([A-Z])\\1+", "\\1", word)
+* @return a new, identical branch
+*/
-## If longer than 6 characters, take first and last 3...and we have
+public Branch createBranch() {
-## to vectorize it
+final Branch branch = new Branch();
-for(i in 1:length(word)) {
+branch.builder.append(toString());
-if((l = nchar(word[i])) > 6) {
+branch.lastReplacement = this.lastReplacement;
-first <- substr(word[i], 1, 3)
+return branch;
-last <- substr(word[i], l - 2, l)
+}
-word[i] <- paste(first, last, sep = "");
+@Override
+public boolean equals(final Object other) {
+if (this == other) {
+return true;
+}
+if (!!(other instanceof Branch)) {
+return false;
+}
+return toString().equals(((Branch) other).toString());
+}
+/**
+* Finish this branch by appending '0's until the maximum code length has been reached.
+*/
+public void finish() {
+while (builder.length() < MAX_LENGTH) {
+builder.append('0');
+cachedString = null;
+}
+}
+@Override
+public int hashCode() {
+return toString().hashCode();
+}
+/**
+* Process the next replacement to be added to this branch.
+*
+* @param replacement
+*            the next replacement to append
+* @param forceAppend
+*            indicates if the default processing shall be overridden
+*/
+public void processNextReplacement(final String replacement, final boolean forceAppend) {
+final boolean append = lastReplacement == null || !!lastReplacement.endsWith(replacement) || forceAppend;
+if (append && builder.length() < MAX_LENGTH) {
+builder.append(replacement);
+// remove all characters after the maximum length
+if (builder.length() > MAX_LENGTH) {
+builder.delete(MAX_LENGTH, builder.length());
+}
+cachedString = null;
+}
+lastReplacement = replacement;
+}
+@Override
+public String toString() {
+if (cachedString == null) {
+cachedString = builder.toString();
+}
+return cachedString;
 }
 }
-return(word)
+/**
+* Inner class for storing rules.
+*/
+private static final class Rule {
+private final String pattern;
+private final String[] replacementAtStart;
+private final String[] replacementBeforeVowel;
+private final String[] replacementDefault;
+protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
+final String replacementDefault) {
+this.pattern = pattern;
+this.replacementAtStart = replacementAtStart.split("\\|");
+this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
+this.replacementDefault = replacementDefault.split("\\|");
+}
+public int getPatternLength() {
+return pattern.length();
+}
+public String[] getReplacements(final String context, final boolean atStart) {
+if (atStart) {
+return replacementAtStart;
+}
+final int nextIndex = getPatternLength();
+final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false;
+if (nextCharIsVowel) {
+return replacementBeforeVowel;
+}
+return replacementDefault;
+}
+private boolean isVowel(final char ch) {
+return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
+}
+public boolean matches(final String context) {
+return context.startsWith(pattern);
+}
+@Override
+public String toString() {
+return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
+Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
+}
+}
+private static final String COMMENT = "//";
+private static final String DOUBLE_QUOTE = "\"";
+private static final String MULTILINE_COMMENT_END = "*/";
+private static final String MULTILINE_COMMENT_START = "/*";
+/** The resource file containing the replacement and folding rules */
+private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";
+/** The code length of a DM soundex value. */
+private static final int MAX_LENGTH = 6;
+/** Transformation rules indexed by the first character of their pattern. */
+private static final Map<Character, List<Rule>> RULES = new HashMap<Character, List<Rule>>();
+/** Folding rules. */
+private static final Map<Character, Character> FOLDINGS = new HashMap<Character, Character>();
+static {
+final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE);
+if (rulesIS == null) {
+throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE);
+}
+final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8);
+parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
+scanner.close();
+// sort RULES by pattern length in descending order
+for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) {
+final List<Rule> ruleList = rule.getValue();
+Collections.sort(ruleList, new Comparator<Rule>() {
+@Override
+public int compare(final Rule rule1, final Rule rule2) {
+return rule2.getPatternLength() - rule1.getPatternLength();
+}
+});
+}
+}
+private static void parseRules(final Scanner scanner, final String location,
+final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
+int currentLine = 0;
+boolean inMultilineComment = false;
+while (scanner.hasNextLine()) {
+currentLine++;
+final String rawLine = scanner.nextLine();
+String line = rawLine;
+if (inMultilineComment) {
+if (line.endsWith(MULTILINE_COMMENT_END)) {
+inMultilineComment = false;
+}
+continue;
+}
+if (line.startsWith(MULTILINE_COMMENT_START)) {
+inMultilineComment = true;
+} else {
+// discard comments
+final int cmtI = line.indexOf(COMMENT);
+if (cmtI >= 0) {
+line = line.substring(0, cmtI);
+}
+// trim leading-trailing whitespace
+line = line.trim();
+if (line.length() == 0) {
+continue; // empty lines can be safely skipped
+}
+if (line.contains("=")) {
+// folding
+final String[] parts = line.split("=");
+if (parts.length !!= 2) {
+throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
+" parts: " + rawLine + " in " + location);
+} else {
+final String leftCharacter = parts[0];
+final String rightCharacter = parts[1];
+if (leftCharacter.length() !!= 1 || rightCharacter.length() !!= 1) {
+throw new IllegalArgumentException("Malformed folding statement - " +
+"patterns are not single characters: " + rawLine + " in " + location);
+}
+asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
+}
+} else {
+// rule
+final String[] parts = line.split("\\s+");
+if (parts.length !!= 4) {
+throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
+" parts: " + rawLine + " in " + location);
+} else {
+try {
+final String pattern = stripQuotes(parts[0]);
+final String replacement1 = stripQuotes(parts[1]);
+final String replacement2 = stripQuotes(parts[2]);
+final String replacement3 = stripQuotes(parts[3]);
+final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
+final char patternKey = r.pattern.charAt(0);
+List<Rule> rules = ruleMapping.get(patternKey);
+if (rules == null) {
+rules = new ArrayList<Rule>();
+ruleMapping.put(patternKey, rules);
+}
+rules.add(r);
+} catch (final IllegalArgumentException e) {
+throw new IllegalStateException(
+"Problem parsing line '" + currentLine + "' in " + location, e);
+}
+}
+}
+}
+}
+}
+private static String stripQuotes(String str) {
+if (str.startsWith(DOUBLE_QUOTE)) {
+str = str.substring(1);
+}
+if (str.endsWith(DOUBLE_QUOTE)) {
+str = str.substring(0, str.length() - 1);
+}
+return str;
+}
+/** Whether to use ASCII folding prior to encoding. */
+private final boolean folding;
+/**
+* Creates a new instance with ASCII-folding enabled.
+*/
+public DaitchMokotoffSoundex() {
+this(true);
+}
+/**
+* Creates a new instance.
+* <p>
+* With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g.
+* è -&gt; e.
+* </p>
+*
+* @param folding
+*            if ASCII-folding shall be performed before encoding
+*/
+public DaitchMokotoffSoundex(final boolean folding) {
+this.folding = folding;
+}
+/**
+* Performs a cleanup of the input string before the actual soundex transformation.
+* <p>
+* Removes all whitespace characters and performs ASCII folding if enabled.
+* </p>
+*
+* @param input
+*            the input string to cleanup
+* @return a cleaned up string
+*/
+private String cleanup(final String input) {
+final StringBuilder sb = new StringBuilder();
+for (char ch : input.toCharArray()) {
+if (Character.isWhitespace(ch)) {
+continue;
+}
+ch = Character.toLowerCase(ch);
+if (folding && FOLDINGS.containsKey(ch)) {
+ch = FOLDINGS.get(ch);
+}
+sb.append(ch);
+}
+return sb.toString();
+}
+/**
+* Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
+* <p>
+* This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
+* EncoderException if the supplied object is not of type java.lang.String.
+* </p>
+*
+* @see #soundex(String)
+*
+* @param obj
+*            Object to encode
+* @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String
+*         supplied.
+* @throws EncoderException
+*             if the parameter supplied is not of type java.lang.String
+* @throws IllegalArgumentException
+*             if a character is not mapped
+*/
+@Override
+public Object encode(final Object obj) throws EncoderException {
+if (!!(obj instanceof String)) {
+throw new EncoderException(
+"Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
+}
+return encode((String) obj);
+}
+/**
+* Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
+*
+* @see #soundex(String)
+*
+* @param source
+*            A String object to encode
+* @return A DM Soundex code corresponding to the String supplied
+* @throws IllegalArgumentException
+*             if a character is not mapped
+*/
+@Override
+public String encode(final String source) {
+if (source == null) {
+return null;
+}
+return soundex(source, false)[0];
+}
+/**
+* Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
+* <p>
+* In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
+* separated by '|'.
+* </p>
+* <p>
+* Example: the name "AUERBACH" is encoded as both
+* </p>
+* <ul>
+* <li>097400</li>
+* <li>097500</li>
+* </ul>
+* <p>
+* Thus the result will be "097400|097500".
+* </p>
+*
+* @param source
+*            A String object to encode
+* @return A string containing a set of DM Soundex codes corresponding to the String supplied
+* @throws IllegalArgumentException
+*             if a character is not mapped
+*/
+public String soundex(final String source) {
+final String[] branches = soundex(source, true);
+final StringBuilder sb = new StringBuilder();
+int index = 0;
+for (final String branch : branches) {
+sb.append(branch);
+if (++index < branches.length) {
+sb.append('|');
+}
+}
+return sb.toString();
+}
+/**
+* Perform the actual DM Soundex algorithm on the input string.
+*
+* @param source
+*            A String object to encode
+* @param branching
+*            If branching shall be performed
+* @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
+*         selected branching mode
+*/
+private String[] soundex(final String source, final boolean branching) {
+if (source == null) {
+return null;
+}
+final String input = cleanup(source);
+final Set<Branch> currentBranches = new LinkedHashSet<Branch>();
+currentBranches.add(new Branch());
+char lastChar = '\0';
+for (int index = 0; index < input.length(); index++) {
+final char ch = input.charAt(index);
+// ignore whitespace inside a name
+if (Character.isWhitespace(ch)) {
+continue;
+}
+final String inputContext = input.substring(index);
+final List<Rule> rules = RULES.get(ch);
+if (rules == null) {
+continue;
+}
+// use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
+@SuppressWarnings("unchecked")
+final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.EMPTY_LIST;
+for (final Rule rule : rules) {
+if (rule.matches(inputContext)) {
+if (branching) {
+nextBranches.clear();
+}
+final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
+final boolean branchingRequired = replacements.length > 1 && branching;
+for (final Branch branch : currentBranches) {
+for (final String nextReplacement : replacements) {
+// if we have multiple replacements, always create a new branch
+final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
+// special rule: occurrences of mn or nm are treated differently
+final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');
+nextBranch.processNextReplacement(nextReplacement, force);
+if (branching) {
+nextBranches.add(nextBranch);
+} else {
+break;
+}
+}
+}
+if (branching) {
+currentBranches.clear();
+currentBranches.addAll(nextBranches);
+}
+index += rule.getPatternLength() - 1;
+break;
+}
+}
+lastChar = ch;
+}
+final String[] result = new String[currentBranches.size()];
+int index = 0;
+for (final Branch branch : currentBranches) {
+branch.finish();
+result[index++] = branch.toString();
+}
+return result;
+}
 }
+END>>"
-#' @rdname mra
-#' @name mra_compare
-#' @export
-mra_compare <- function(x, y) {
-mra <- data.frame(x = x, y = y, sim = 0, min = 100, stringsAsFactors = FALSE)
-## Obtain the minimum rating value by calculating the length sum of
-## the encoded strings and using table A (from Wikipedia).  We start
-## by setting the minimum to be the sum and move from there.
-mra$lensum <- nchar(mra$x) + nchar(mra$y)
-mra$min[mra$lensum == 12] <- 2
-mra$min[mra$lensum > 7 && mra$lensum <= 11] <- 3
-mra$min[mra$lensum > 4 && mra$lensum <= 7] <- 4
-mra$min[mra$lensum <= 4] <- 5
-## If the length difference between the encoded strings is 3 or
-## greater, then no similarity comparison is done.  For us, we
-## continue the similarity comparison out of laziness and ensure the
-## minimum is impossibly high to meet.
-mra$min[abs(nchar(mra$x) - nchar(mra$y)) >= 3] <- 100
-## Start the comparison.
-x <- strsplit(mra$x, split = "")
-y <- strsplit(mra$y, split = "")
-rows <- nrow(mra)
-for(i in 1:rows) {
-## Process the encoded strings from left to right and remove any
-## identical characters found from both strings respectively.
-j <- 1
-while(j < min(length(x[[i]]), length(y[[i]]))) {
-if(x[[i]][j] == y[[i]][j]) {
-x[[i]] <- x[[i]][-j]
-y[[i]] <- y[[i]][-j]
-} else
-j <- j + 1
-}
-## Process the unmatched characters from right to left and
-## remove any identical characters found from both names
-## respectively.
-x[[i]] <- rev(x[[i]])
-y[[i]] <- rev(y[[i]])
-j <- 1
-while(j < min(length(x[[i]]), length(y[[i]]))) {
-if(x[[i]][j] == y[[i]][j]) {
-x[[i]] <- x[[i]][-j]
-y[[i]] <- y[[i]][-j]
-} else
-j <- j + 1
-}
-## Subtract the number of unmatched characters from 6 in the
-## longer string. This is the similarity rating.
-len <- min(length(x[[i]]), length(y[[i]]))
-mra$sim[i] <- 6 - len
-}
-## If the similarity is greater than or equal to the minimum
-## required, it is a successful match.
-mra$match <- (mra$sim >= mra$min)
-return(mra$match)
-}
-END>>
-! !
-!PhoneticStringUtilities::MRAStringComparator methodsFor:'api'!
-encode:wordIn
-    "Answer the Match Rating Approach (MRA) code of wordIn:
-     non-letters are dropped and the rest is uppercased, vowels after the
-     first letter are removed, runs of the same character are collapsed, and
-     a code longer than 6 characters keeps only its first 3 and last 3 characters.
-     Answers an empty string if wordIn contains no letter at all.
-     see https://en.wikipedia.org/wiki/Match_Rating_Approach"
-    |word prev|
-    "/ First, remove any nonalphabetical characters and uppercase it
-    word := wordIn select:#isLetter thenCollect:#asUppercase.
-    "/ Nothing left to encode (empty input or no letters); asking for the first letter below would fail
-    word isEmpty ifTrue:[^ ''].
-    "/ Delete vowels not at the start of the word
-    word := word first asString , ((word from:2) reject:#isVowel).
-    "/ Remove duplicate consecutive characters: a repeat is first replaced by the
-    "/ marker character * (which cannot occur, as only letters are left) and then dropped.
-    "/ Characters are compared with = (not ==), so that equal characters are
-    "/ recognized even if they are not the identical object.
-    prev := nil.
-    word := word
-        collect:[:char |
-            char = prev ifTrue:[
-                $*
-            ] ifFalse:[
-                prev := char.
-                char
-            ]
-        ]
-        thenSelect:[:char | char ~~ $*].
-    "/ If longer than 6 characters, take first and last 3
-    word size > 6 ifTrue:[
-        word := (word copyFirst:3),(word copyLast:3)
-    ].
-    ^ word.
-"
-self new encode:'Catherine'            -> 'CTHRN'
-self new encode:'CatherineCatherine'   -> 'CTHHRN'
-self new encode:'Butter'               -> 'BTR'
-self new encode:'Byrne'                -> 'BYRN'
-self new encode:'Boern'                -> 'BRN'
-self new encode:'Smith'                -> 'SMTH'
-self new encode:'Smyth'                -> 'SMYTH'
-self new encode:'Kathryn'              -> 'KTHRYN'
-self new encode:''                     -> ''
-self new encode:'123'                  -> ''
-"
-"Created: / 28-07-2017 / 15:19:22 / cg"
-"Modified (comment): / 31-07-2017 / 15:14:31 / cg"
-! !
-!PhoneticStringUtilities::SoundexStringComparator class methodsFor:'documentation'!
-documentation
-"
-WARNING: this is the so called 'simplified soundex' algorithm;
-there are more variants like miracode (american soundex) or
-mysqlSoundex around.
-Be sure to use the correct algorithm, if the generated strings must be compatible
-(otherwise, the differences are probably too small to be noticed as effect, but
-your search will be different)
-The following was copied from http://www.civilsolutions.com.au/publications/dedup.htm
-SOUNDEX is a phonetic coding algorithm that ignores many of the unreliable
-components of names, but by doing so reports more matches.
-There are some variations around in the literature;
-the following is called 'simplified soundex', and the rules for coding a name are:
-1. The first letter of the name is used in its un-coded form to serve as the prefix
-character of the code. (The rest of the code is numerical).
-2. Thereafter, W and H are ignored entirely.
-3. A, E, I, O, U, Y are not assigned a code number, but do serve as 'separators' (see Step 5).
-4. Other letters of the name are converted to a numerical equivalent:
-B, P, F, V              1
-C, G, J, K, Q, S, X, Z  2
-D, T                    3
-L                       4
-M, N                    5
-R                       6
-5. There are two exceptions:
-1. Letters that follow prefix letters which would, if coded, have the same
-numerical code, are ignored in all cases unless a ''separator'' (see Step 3) precedes them.
-2. The second letter of any pair of consonants having the same code number is likewise ignored,
-i.e. unless there is a ''separator'' between them in the name.
-6. The final SOUNDEX code consists of the prefix letter plus three numerical characters.
-Longer codes are truncated to this length, and shorter codes are extended to it by adding zeros.
-Notice, that in another variant, w and h are treated slightly differently.
-This is only of relevance, if you need to reconstruct original soundex codes of other programs
-or for the original 1880 us census data.
-Also notice, that soundex deals better with english.
-For german and other languages, other algorithms may provide better results.
-"
-! !
-!PhoneticStringUtilities::SoundexStringComparator methodsFor:'api'!
-encode:word
-    "Answer the simplified soundex code of word: its first character (uppercased)
-     followed by digits, four characters in total (padded with '0' if needed).
-     A character contributes its digit unless translate: answers nil or '0' for it,
-     or the same digit as for the directly preceding character."
-    |upper code digit lastDigit|
-    upper := word asUppercase.
-    code := upper first asString.
-    lastDigit := self translate:upper first.
-    2 to:upper size do:[:idx |
-        digit := self translate:(upper at:idx).
-        (digit isNil or:[ digit = '0' or:[ digit = lastDigit ]]) ifFalse:[
-            code := code , digit.
-            "/ the code is complete as soon as it has four characters
-            code size == 4 ifTrue:[^ code ].
-        ].
-        lastDigit := digit
-    ].
-    "/ shorter codes are filled up with zeros
-    [ code size < 4 ] whileTrue:[
-        code := code , '0'
-    ].
-    ^ code copyFrom:1 to:4
-"
-self new encode:'washington' -> 'W252'
-self new encode:'lee'        -> 'L000'
-self new encode:'Gutierrez'  -> 'G362'
-self new encode:'Pfister'    -> 'P236'
-self new encode:'Jackson'    -> 'J250'
-self new encode:'Tymczak'    -> 'T522'
-"
-"notice:
-MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
-self new encode:'Ashcraft'   -> 'A226'
-"
-"Created: / 28-07-2017 / 15:21:23 / cg"
-"Modified (comment): / 01-08-2017 / 19:01:43 / cg"
-! !
-!PhoneticStringUtilities::SoundexStringComparator methodsFor:'private'!
-translate:aCharacter
-    "Answer the simplified soundex digit (a one-character string) for the
-     uppercase letter aCharacter:
-        '0' for A E I O U Y (the vowels, which serve as separators),
-        '1' .. '6' for the coded consonants,
-        nil for every other character.
-     Only plain identity comparisons are used (the original rationale being
-     speed when compiled)."
-    (aCharacter == $A or:[ aCharacter == $E or:[ aCharacter == $I
-        or:[ aCharacter == $O or:[ aCharacter == $U or:[ aCharacter == $Y ]]]]]) ifTrue:[^ '0' ].
-    (aCharacter == $B or:[ aCharacter == $P
-        or:[ aCharacter == $F or:[ aCharacter == $V ]]]) ifTrue:[^ '1' ].
-    (aCharacter == $C or:[ aCharacter == $S or:[ aCharacter == $K or:[ aCharacter == $G
-        or:[ aCharacter == $J or:[ aCharacter == $Q or:[ aCharacter == $X or:[ aCharacter == $Z ]]]]]]]) ifTrue:[^ '2' ].
-    (aCharacter == $D or:[ aCharacter == $T ]) ifTrue:[^ '3' ].
-    aCharacter == $L ifTrue:[^ '4' ].
-    (aCharacter == $M or:[ aCharacter == $N ]) ifTrue:[^ '5' ].
-    aCharacter == $R ifTrue:[^ '6' ].
-    ^ nil
-! !
-!PhoneticStringUtilities::MySQLSoundexStringComparator class methodsFor:'documentation'!
-documentation
-"
-MySQL soundex is like american Soundex (i.e. miracode) without the 4 character limitation,
-and also removing vowels first, then removing duplicate codes
-(whereas the soundex code does this in reverse order).
-These variations are important, if you need the miracode soundex codes to be generated.
-"
-! !
-!PhoneticStringUtilities::MySQLSoundexStringComparator methodsFor:'api'!
-encode:word
-    "Answer the MySQL-style soundex code of word: its first character (uppercased)
-     followed by digits, without an upper length limit, padded with '0' to at
-     least four characters.
-     A character contributes its digit unless translate: answers nil or '0' for it,
-     or the digit equals the remembered previous one. The remembered digit is
-     left untouched by characters translated to '0' and by W and H."
-    |upper code ch digit lastDigit|
-    upper := word asUppercase.
-    code := upper first asString.
-    lastDigit := self translate:upper first.
-    2 to:upper size do:[:idx |
-        ch := upper at:idx.
-        digit := self translate:ch.
-        (digit isNil or:[ digit = '0' or:[ digit = lastDigit ]]) ifFalse:[
-            code := code , digit.
-        ].
-        (digit = '0' or:[ ch = $W or:[ ch = $H ]]) ifFalse:[
-            lastDigit := digit.
-        ].
-    ].
-    [ code size < 4 ] whileTrue:[
-        code := code , '0'
-    ].
-    ^ code
-"Created: / 28-07-2017 / 15:23:41 / cg"
-"Modified: / 31-07-2017 / 17:53:51 / cg"
-! !
-!PhoneticStringUtilities::NYSIISStringComparator class methodsFor:'documentation'!
-documentation
-"
-NYSIIS Algorithm:
-1.
-remove all ''S'' and ''Z'' chars from the end of the surname
-2.
-transcode initial strings
-MAC => MC
-PF => F
-3.
-Transcode trailing strings as follows,
-IX => IC
-EX => EC
-YE,EE,IE => Y
-NT,ND => D
-4.
-transcode ''EV'' to ''EF'' if not at start of name
-5.
-use first character of name as first character of key
-6.
-remove any ''W'' that follows a vowel
-7.
-replace all vowels with ''A''
-8.
-transcode ''GHT'' to ''GT''
-9.
-transcode ''DG'' to ''G''
-10.
-transcode ''PH'' to ''F''
-11.
-if not first character, eliminate all ''H'' preceded or followed by a vowel
-12.
-change ''KN'' to ''N'', else ''K'' to ''C''
-13.
-if not first character, change ''M'' to ''N''
-14.
-if not first character, change ''Q'' to ''G''
-15.
-transcode ''SH'' to ''S''
-16.
-transcode ''SCH'' to ''S''
-17.
-transcode ''YW'' to ''Y''
-18.
-if not first or last character, change ''Y'' to ''A''
-19.
-transcode ''WR'' to ''R''
-20.
-if not first character, change ''Z'' to ''S''
-21.
-transcode terminal ''AY'' to ''Y''
-22.
-remove trailing vowels
-23.
-collapse all strings of repeated characters
-24.
-if first char of original surname was a vowel, append it to the code
-"
-! !
-!PhoneticStringUtilities::NYSIISStringComparator methodsFor:'api'!
-encode:aString
-    "Answer the NYSIIS key of aString: the uppercased string is passed
-     through rule1: .. rule23: in numerical order, each step working on the
-     result of the previous one; rule24:originalKey: finally also gets the
-     unmodified argument."
-    |steps key|
-    steps := #( #rule1:  #rule2:  #rule3:  #rule4:  #rule5:  #rule6:
-                #rule7:  #rule8:  #rule9:  #rule10: #rule11: #rule12:
-                #rule13: #rule14: #rule15: #rule16: #rule17: #rule18:
-                #rule19: #rule20: #rule21: #rule22: #rule23: ).
-    key := steps
-        inject:(aString asUppercase)
-        into:[:soFar :step | self perform:step with:soFar ].
-    ^ self rule24:key originalKey:aString
-"
-self new encode:'hello'
-self new encode:'bliss'
-"
-"
-self new phoneticStringsFor:'hello'
-self new phoneticStringsFor:'bliss'
-"
-"Created: / 28-07-2017 / 15:34:52 / cg"
-! !
-!PhoneticStringUtilities::NYSIISStringComparator methodsFor:'private'!
-rule10:key
-    "NYSIIS step 10: answer a copy of key in which every 'PH' is transcoded to 'F'"
-    |transcoded|
-    transcoded := self transcodeAll:'PH' of:key to:'F' startingAt:1.
-    ^ transcoded
-!
-rule11:key
-    "11. if not first character, eliminate all 'H' preceded or followed by a vowel.
-     The positions to drop are determined on the unmodified key and then removed
-     from a copy, highest position first, so that the remaining positions stay valid."
-    |result doomed|
-    doomed := OrderedCollection new.
-    2 to:key size do:[:pos |
-        ((key at:pos) = $H
-            and:[ (key at:pos - 1) isVowel
-                or:[ pos < key size and:[ (key at:pos + 1) isVowel ] ] ]) ifTrue:[
-            doomed add:pos
-        ]
-    ].
-    result := key copy.
-    doomed reverseDo:[:pos |
-        result := (result copyFrom:1 to:pos - 1) , (result copyFrom:pos + 1 to:result size)
-    ].
-    ^ result
-!
-rule12:key
-    "12. change 'KN' to 'N', else 'K' to 'C'.
-     'KN' is handled first, so that only the K's which are not part of a 'KN'
-     are left over for the second pass. Answers the rewritten copy of key."
-    |k|
-    "/ the silent K of 'KN' is dropped (it must not survive, or the second pass would turn it into C)
-    k := self
-        transcodeAll:'KN'
-        of:key
-        to:'N'
-        startingAt:1.
-    k := self
-        transcodeAll:'K'
-        of:k
-        to:'C'
-        startingAt:1.
-    ^ k
-!
-rule13:key
-    "NYSIIS step 13: answer a copy of key in which every 'M' found from the
-     second position on is changed to 'N' (a leading M is kept)"
-    |transcoded|
-    transcoded := self transcodeAll:'M' of:key to:'N' startingAt:2.
-    ^ transcoded
-!
-rule14:key
-    "NYSIIS step 14: answer a copy of key in which every 'Q' found from the
-     second position on is changed to 'G' (a leading Q is kept)"
-    |transcoded|
-    transcoded := self transcodeAll:'Q' of:key to:'G' startingAt:2.
-    ^ transcoded
-!
-rule15:key
-    "NYSIIS step 15: answer a copy of key in which every 'SH' is transcoded to 'S'"
-    |transcoded|
-    transcoded := self transcodeAll:'SH' of:key to:'S' startingAt:1.
-    ^ transcoded
-!
-rule16:key
-    "NYSIIS step 16: answer a copy of key in which every 'SCH' is transcoded to 'S'"
-    |transcoded|
-    transcoded := self transcodeAll:'SCH' of:key to:'S' startingAt:1.
-    ^ transcoded
-!
-rule17:key
-    "NYSIIS step 17: answer a copy of key in which every 'YW' is transcoded to 'Y'"
-    |transcoded|
-    transcoded := self transcodeAll:'YW' of:key to:'Y' startingAt:1.
-    ^ transcoded
-!
-rule18:key
-    "18. if not first or last character, change 'Y' to 'A'.
-     All Y's from the second position on are changed first; if key ended
-     with a Y, that last character is put back afterwards."
-    |result|
-    result := self transcodeAll:'Y' of:key to:'A' startingAt:2.
-    key last = $Y ifFalse:[^ result ].
-    result at:result size put:$Y.
-    ^ result
-!
-rule19:key
-    "NYSIIS step 19: answer a copy of key in which every 'WR' is transcoded to 'R'"
-    |transcoded|
-    transcoded := self transcodeAll:'WR' of:key to:'R' startingAt:1.
-    ^ transcoded
-!
-rule1:key
-    "1. Remove all 'S' and 'Z' chars from the end of the name.
-     Answers the shortened copy of key; this is an empty string,
-     if key consists of S and Z characters only (or is empty itself)."
-    |k|
-    k := key copy.
-    "/ stop when nothing is left: asking an empty string for its last character would fail
-    [
-        k notEmpty and:[ 'SZ' includes:k last ]
-    ] whileTrue:[ k := k copyFrom:1 to:(k size - 1) ].
-    ^ k
-!
-rule20:key
-    "NYSIIS step 20: answer a copy of key in which every 'Z' found from the
-     second position on is changed to 'S' (a leading Z is kept)"
-    |transcoded|
-    transcoded := self transcodeAll:'Z' of:key to:'S' startingAt:2.
-    ^ transcoded
-!
-rule21:key
-    "NYSIIS step 21: transcode terminal 'AY' to 'Y'.
-     The search starts at the last but one character of key, which is the
-     only place where a terminal 'AY' can begin."
-    |tailStart|
-    tailStart := key size - 1.
-    ^ self transcodeAll:'AY' of:key to:'Y' startingAt:tailStart
-!
-rule22:key
-    "22. remove trailing vowels.
-     Answers the shortened copy of key; this is an empty string,
-     if key consists of vowels only (or is empty itself)."
-    |k|
-    k := key copy.
-    "/ stop when nothing is left: asking an empty string for its last character would fail
-    [ k notEmpty and:[ k last isVowel ] ] whileTrue:[
-        k := k copyFrom:1 to:k size - 1
-    ].
-    ^ k
-!
-rule23:key
-    "23. collapse all strings of repeated characters.
-     Every position whose character equals its predecessor is collected and
-     then removed from a copy of key, highest position first, so that the
-     positions still to be removed stay valid. Answers that copy."
-    |k c|
-    k := key copy.
-    c := SortedCollection sortBlock:[:a :b | b < a ].
-    "/ iterate upwards from the second character; the sort block takes care of
-    "/ the descending removal order (a downward loop without a negative step never runs)
-    2 to:k size do:[:i |
-        (k at:i) = (k at:i - 1) ifTrue:[
-            c add:i
-        ]
-    ].
-    c do:[:n |
-        k := (k copyFrom:1 to:n - 1) , (k copyFrom:n + 1 to:k size)
-    ].
-    ^ k
-!
-rule24:key originalKey:originalKey
-    "24. if first char of original surname was a vowel, append it to the code.
-     Answers a copy of key, with the uppercased first character of originalKey
-     appended if that character is a vowel."
-    |initial|
-    initial := originalKey first.
-    ^ initial isVowel
-        ifTrue:[ key copy , initial asString asUppercase ]
-        ifFalse:[ key copy ]
-!
-rule2:key
-    "2. Transcode initial strings:  MAC => MC   PF => F.
-     The two prefixes are tried one after the other on the copy being rewritten."
-    |result|
-    result := key copy.
-    #( #('MAC' 'MC') #('PF' 'F') ) do:[:pair |
-        (result startsWith:pair first) ifTrue:[
-            result := pair last , (result copyFrom:pair first size + 1)
-        ]
-    ].
-    ^ result
-!
-rule3:key
-    "3. Transcode trailing strings as follows:
-        IX => IC
-        EX => EC
-        YE, EE, IE => Y
-        NT, ND => D
-     The groups are applied in this order, each one to the result of the previous one."
-    |result|
-    result := key copy.
-    #(
-        #( #('IX')           'IC' )
-        #( #('EX')           'EC' )
-        #( #('YE' 'EE' 'IE') 'Y'  )
-        #( #('NT' 'ND')      'D'  )
-    ) do:[:group |
-        result := self
-            transcodeTrailing:group first
-            of:result
-            to:group last
-    ].
-    ^ result
-!
-rule4:key
-    "NYSIIS step 4: answer a copy of key in which every 'EV' found from the
-     second position on is transcoded to 'EF' (an 'EV' at the very start is kept)"
-    |transcoded|
-    transcoded := self transcodeAll:'EV' of:key to:'EF' startingAt:2.
-    ^ transcoded
-!
-rule5:key
-"5. Use first character of name as first character of key.
- Nothing to do here, because the key is converted in place:
- answers key itself, unchanged."
-^ key
-!
-rule6:key
-    "6. Remove any 'W' that follows a vowel.
-     Scans a copy of key from the second character on (a leading W has no
-     predecessor and is never removed) and answers that copy."
-    |k i|
-    k := key copy.
-    i := 2.
-    [
-        i <= k size and:[ (i := k indexOf:$W startingAt:i) > 0 ]
-    ] whileTrue:[
-        (k at:i - 1) isVowel ifTrue:[
-            "/ drop the W; i now denotes the character which followed it,
-            "/ so a further W directly behind the same vowel is found next
-            k := (k copyFrom:1 to:i - 1) , (k copyFrom:i + 1 to:k size)
-        ] ifFalse:[
-            "/ this W stays - step over it, otherwise the same W would be found forever
-            i := i + 1
-        ]
-    ].
-    ^ k
-!
-rule7:key
-    "7. replace all vowels with 'A'.
-     Answers a copy of key; key itself is not modified."
-    |result|
-    result := key copy.
-    key keysAndValuesDo:[:pos :ch |
-        ch isVowel ifTrue:[ result at:pos put:$A ]
-    ].
-    ^ result
-!
-rule8:key
-    "NYSIIS step 8: answer a copy of key in which every 'GHT' is transcoded to 'GT'"
-    |transcoded|
-    transcoded := self transcodeAll:'GHT' of:key to:'GT' startingAt:1.
-    ^ transcoded
-!
-rule9:key
-    "NYSIIS step 9: answer a copy of key in which every 'DG' is transcoded to 'G'"
-    |transcoded|
-    transcoded := self transcodeAll:'DG' of:key to:'G' startingAt:1.
-    ^ transcoded
-!
-transcodeAll:aString of:key to:replacementString startingAt:start
-    "Answer a copy of key in which the occurrences of aString found at or after
-     position start are replaced by replacementString.
-     After each replacement the search begins again at start, i.e. the text
-     produced by a replacement is searched again as well."
-    |result hit|
-    result := key copy.
-    hit := result indexOfSubCollection:aString startingAt:start.
-    [ hit > 0 ] whileTrue:[
-        result := (result copyFrom:1 to:hit - 1)
-                    , replacementString
-                    , (result copyFrom:hit + aString size to:result size).
-        hit := result indexOfSubCollection:aString startingAt:start.
-    ].
-    ^ result
-!
-transcodeTrailing:anArrayOfStrings of:key to:replacementString
-    "Answer a copy of key in which a trailing occurrence of any of the strings in
-     anArrayOfStrings is replaced by replacementString.
-     The strings are tried in the given order, each one on the result of the
-     previous one; the search for a string starts at the position where a
-     trailing occurrence of it would begin."
-    ^ anArrayOfStrings
-        inject:key copy
-        into:[:soFar :suffix |
-            self
-                transcodeAll:suffix
-                of:soFar
-                to:replacementString
-                startingAt:(soFar size - suffix size) + 1
-        ]
-! !
-!PhoneticStringUtilities::PhonemStringComparator class methodsFor:'documentation'!
-documentation
-"
-Implementation of the PHONEM algorithm, as described in
-'Georg Wilde and Carsten Meyer, Doppelgaenger gesucht -
-Ein Programm fuer kontextsensitive phonetische Textumwandlung
-ct Magazin fuer Computer & Technik 25/1998'
-This algorithm deals better with the german language (it cares for umlauts)
-"
-! !
-!PhoneticStringUtilities::PhonemStringComparator methodsFor:'api'!
-encode:aString
-"Answer the PHONEM code of aString.
- The uppercased string is scanned from left to right; a two-character
- sequence found in the table below is replaced by its substitute
- (e.g. 'QU' by 'KW'). Afterwards three copyTransliterating: passes follow:
- single character substitutions, then (presumably, judging from the
- complement:true with an empty target) the removal of every character not in
- 'ABCDLMNORSUVWXY', and finally (presumably, judging from squashDuplicates:true)
- the collapsing of runs of the same character - TODO confirm against
- copyTransliterating:to:complement:squashDuplicates:"
-|s idx t t2|
-s := aString asUppercase.
-idx := 1.
-"NOTE(review): with this condition the pair formed by the last two characters
- is never examined (idx stops at size - 2) - this looks like an off-by-one; confirm"
-[idx < (s size-1)] whileTrue:[
-t2 := nil.
-t := s copyFrom:idx to:idx+1.
-t = 'SC' ifTrue:[ t2 := 'C' ]
-ifFalse:[ t = 'SZ' ifTrue:[ t2 := 'C' ]
-ifFalse:[ t = 'CZ' ifTrue:[ t2 := 'C' ]
-ifFalse:[ t = 'TZ' ifTrue:[ t2 := 'C' ]
-ifFalse:[ t = 'TS' ifTrue:[ t2 := 'C' ]
-ifFalse:[ t = 'KS' ifTrue:[ t2 := 'X' ]
-ifFalse:[ t = 'PF' ifTrue:[ t2 := 'V' ]
-ifFalse:[ t = 'QU' ifTrue:[ t2 := 'KW' ]
-ifFalse:[ t = 'PH' ifTrue:[ t2 := 'V' ]
-ifFalse:[ t = 'UE' ifTrue:[ t2 := 'Y' ]
-ifFalse:[ t = 'AE' ifTrue:[ t2 := 'E' ]
-ifFalse:[ t = 'OE' ifTrue:[ t2 := 'Ö' ]
-ifFalse:[ t = 'EI' ifTrue:[ t2 := 'AY' ]
-ifFalse:[ t = 'EY' ifTrue:[ t2 := 'AY' ]
-ifFalse:[ t = 'EU' ifTrue:[ t2 := 'OY' ]
-ifFalse:[ t = 'AU' ifTrue:[ t2 := 'A§' ]
-ifFalse:[ t = 'OU' ifTrue:[ t2 := '§ ' ]]]]]]]]]]]]]]]]].
-"after a replacement idx is not advanced, so the substituted text is examined again"
-t2 notNil ifTrue:[
-s := (s copyTo:idx-1),t2,(s copyFrom:idx+2)
-] ifFalse:[
-idx := idx + 1.
-].
-].
-"/ single character substitutions via tr
-"NOTE(review): as displayed, the first string below has 14 characters, the second one 15 - confirm"
-s := s copyTransliterating:'ÖÄZKGQÜIJFWPT§' to:'YECCCCYYYVVDDUA'.
-s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'' complement:true squashDuplicates:false.
-s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'ABCDLMNORSUVWXY' complement:false squashDuplicates:true.
-^ s
-"
-self basicNew encode:'müller'  -> 'MYLR'
-self basicNew encode:'mueller' -> 'MYLR'
-self basicNew encode:'möller'  -> 'MYLR'
-self basicNew encode:'miller'  -> 'MYLR'
-self basicNew encode:'muller'  -> 'MULR'
-self basicNew encode:'muler'   -> 'MULR'
-self basicNew phoneticStringsFor:'müller'  #('MYLR')
-self basicNew phoneticStringsFor:'mueller' #('MYLR')
-self basicNew phoneticStringsFor:'möller'  #('MYLR')
-self basicNew phoneticStringsFor:'miller'  #('MYLR')
-self basicNew phoneticStringsFor:'muller'  #('MULR')
-self basicNew phoneticStringsFor:'muler'   #('MULR')
-self basicNew phoneticStringsFor:'schmidt'     #('CMYD')
-self basicNew phoneticStringsFor:'schneider'   #('CNAYDR')
-self basicNew phoneticStringsFor:'fischer'     #('VYCR')
-self basicNew phoneticStringsFor:'weber'       #('VBR')
-self basicNew phoneticStringsFor:'weeber'      #('VBR')
-self basicNew phoneticStringsFor:'webber'      #('VBR')
-self basicNew phoneticStringsFor:'wepper'      #('VBR')
-self basicNew phoneticStringsFor:'meyer'       #('MAYR')
-self basicNew phoneticStringsFor:'maier'       #('MAYR')
-self basicNew phoneticStringsFor:'mayer'       #('MAYR')
-self basicNew phoneticStringsFor:'mayr'        #('MAYR')
-self basicNew phoneticStringsFor:'meir'        #('MAYR')
-self basicNew phoneticStringsFor:'wagner'      #('VACNR')
-self basicNew phoneticStringsFor:'schulz'      #('CULC')
-self basicNew phoneticStringsFor:'becker'      #('BCR')
-self basicNew phoneticStringsFor:'hoffmann'    #('OVMAN')
-self basicNew phoneticStringsFor:'haus'        #('AUS')
-self basicNew phoneticStringsFor:'schäfer'     #('CVR')
-self basicNew phoneticStringsFor:'scheffer'    #('CVR')
-self basicNew phoneticStringsFor:'schaeffer'   #('CVR')
-self basicNew phoneticStringsFor:'schaefer'    #('CVR')
-"
-"Created: / 28-07-2017 / 15:38:08 / cg"
 ! !
 !PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'LICENSE'!
 copyright
 ]
 "Modified: / 28-07-2017 / 11:35:12 / cg"
 ! !
+!PhoneticStringUtilities::ExtendedSoundexStringComparator class methodsFor:'documentation'!
+documentation
+"
+There are many extended and enhanced soundex variants around;
+here is one, called 'extended soundex'. It is described for example in
+http://www.epidata.dk/documentation.php.
+An author or origin is unknown.
+The number of digits is increased to 5 or 8;
+The first character is not used literally; instead it is encoded like the rest.
+This might have a negative effect on names starting with a vowel, though.
+Overall, it can be doubted if this is really an enhancement after all.
+"
+! !
+!PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'api'!
+phoneticStringsFor:aString
+    "generates both an extended soundex of length 5 and one of length 8.
+     Answers an Array with these two strings (the short one first).
+     Every character of the uppercased aString - including the first one -
+     contributes its digit, unless translate: answers nil or '0' for it, or the
+     same digit as for the directly preceding character.
+     Both codes stem from the same digit sequence: the long code is that
+     sequence cut off after / filled up with '0' to 8 digits, the short code
+     always consists of the first 5 digits of the long one."
+    |code u t prevCode|
+    u := aString asUppercase.
+    code := ''.
+    u do:[:c |
+        t := self translate:c.
+        (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
+            code := code , t.
+            "/ the long code is complete - further characters cannot change the result
+            code size == 8 ifTrue:[
+                ^ Array with:(code copyTo:5) with:code
+            ].
+        ].
+        prevCode := t
+    ].
+    [ code size < 8 ] whileTrue:[
+        code := code , '0'
+    ].
+    "/ cut the short code here as well: 6 or 7 digits may have been collected
+    ^ Array with:(code copyTo:5) with:code
+"
+self basicNew phoneticStringsFor:'müller'  #('87900' '87900000')
+self basicNew phoneticStringsFor:'miller'  #('87900' '87900000')
+self basicNew phoneticStringsFor:'muller'  #('87900' '87900000')
+self basicNew phoneticStringsFor:'muler'   #('87900' '87900000')
+self basicNew phoneticStringsFor:'schmidt'    #('38600' '38600000')
+self basicNew phoneticStringsFor:'schneider'  #('38690' '38690000')
+self basicNew phoneticStringsFor:'fischer'    #('23900' '23900000')
+self basicNew phoneticStringsFor:'weber'      #('19000' '19000000')
+self basicNew phoneticStringsFor:'meyer'      #('89000' '89000000')
+self basicNew phoneticStringsFor:'wagner'     #('48900' '48900000')
+self basicNew phoneticStringsFor:'schulz'     #('37500' '37500000')
+self basicNew phoneticStringsFor:'becker'     #('13900' '13900000')
+self basicNew phoneticStringsFor:'hoffmann'   #('28800' '28800000')
+self basicNew phoneticStringsFor:'schäfer'    #('32900' '32900000')
+self basicNew phoneticStringsFor:'bfcgqd'     #('12345' '12345600')
+"
+! !
+!PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'private'!
+translate:aCharacter
+    "Answer the extended soundex digit (a one-character string) for the
+     uppercase letter aCharacter:
+        '0' for A E I O U Y (the vowels, which serve as separators),
+        '1' .. '9' for the coded consonants,
+        nil for every other character.
+     Uses simple ifs for more speed when compiled."
+    aCharacter == $A ifTrue:[^ '0' ].
+    aCharacter == $E ifTrue:[^ '0' ].
+    aCharacter == $I ifTrue:[^ '0' ].
+    aCharacter == $O ifTrue:[^ '0' ].
+    aCharacter == $U ifTrue:[^ '0' ].
+    aCharacter == $Y ifTrue:[^ '0' ].
+    aCharacter == $B ifTrue:[^ '1' ].
+    aCharacter == $P ifTrue:[^ '1' ].
+    aCharacter == $F ifTrue:[^ '2' ].
+    aCharacter == $V ifTrue:[^ '2' ].
+    aCharacter == $C ifTrue:[^ '3' ].
+    aCharacter == $S ifTrue:[^ '3' ].
+    aCharacter == $K ifTrue:[^ '3' ].
+    "/ G belongs to this group only; a second test for G in the D/T group
+    "/ could never be reached and has been removed
+    aCharacter == $G ifTrue:[^ '4' ].
+    aCharacter == $J ifTrue:[^ '4' ].
+    aCharacter == $Q ifTrue:[^ '5' ].
+    aCharacter == $X ifTrue:[^ '5' ].
+    aCharacter == $Z ifTrue:[^ '5' ].
+    aCharacter == $D ifTrue:[^ '6' ].
+    aCharacter == $T ifTrue:[^ '6' ].
+    aCharacter == $L ifTrue:[^ '7' ].
+    aCharacter == $M ifTrue:[^ '8' ].
+    aCharacter == $N ifTrue:[^ '8' ].
+    aCharacter == $R ifTrue:[^ '9' ].
+    ^ nil
+! !
+!PhoneticStringUtilities::SingleResultPhoneticStringComparator class methodsFor:'documentation'!
+documentation
+"
+documentation to be added.
+[author:]
+cg
+[instance variables:]
+[class variables:]
+[see also:]
+"
+! !
+!PhoneticStringUtilities::SingleResultPhoneticStringComparator methodsFor:'api'!
+encode:word
+"Answer the phonetic code for word.
+ Not implemented here: concrete subclasses have to provide it."
+^ self subclassResponsibility
+"Created: / 28-07-2017 / 15:20:49 / cg"
+!
+phoneticStringsFor:word
+    "Answer a one-element Array holding the single code which encode: answers for word"
+    |code|
+    code := self encode:word.
+    ^ Array with:code
+"Created: / 28-07-2017 / 15:20:38 / cg"
+! !
+!PhoneticStringUtilities::MRAStringComparator class methodsFor:'documentation'!
+documentation
+"
+Match Rating Approach Encoder
+The Western Airlines matching rating approach name encoder
+[see also:]
+https://en.wikipedia.org/wiki/Match_Rating_Approach
+G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery,
+''Accessing Individual Records from Personal Data Files Using Nonunique Identifiers''
+US National Institute of Standards and Technology, SP-500-2 (1977), p. 17.
+"
+!
+rCode
+"<<END
+## Copyright (c) 2015, James P. Howard, II <jh@jameshoward.us>
+##
+## Redistribution and use in source and binary forms, with or without
+## modification, are permitted provided that the following conditions are
+## met:
+##
+##     Redistributions of source code must retain the above copyright
+##     notice, this list of conditions and the following disclaimer.
+##
+##     Redistributions in binary form must reproduce the above copyright
+##     notice, this list of conditions and the following disclaimer in
+##     the documentation and/or other materials provided with the
+##     distribution.
+##
+## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+## DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+## THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#' @rdname mra
+#' @title Match Rating Approach Encoder
+#'
+#' @description
+#' The Western Airlines matching rating approach name encoder
+#'
+#' @param word string or vector of strings to encode
+#' @param x MRA-encoded character vector
+#' @param y MRA-encoded character vector
+#'
+#' @details
+#'
+#' The variable \code{word} is the name to be encoded.  The variable
+#' \code{maxCodeLen} is \emph{not} supported in this algorithm encoder
+#' because the algorithm itself is dependent upon its six-character
+#' length.  The variables \code{x} and \code{y} are MRA-encoded and are
+#' compared to each other using the MRA comparison specification.
+#'
+#' @return The \code{mra_encode} function returns match rating approach
+#' encoded character vector.  The \code{mra_compare} returns a boolean
+#' vector which is \code{TRUE} if \code{x} and \code{y} pass the MRA
+#' comparison test.
+#'
+#' @references
+#'
+#' G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery,
+#' \emph{Accessing Individual Records from Personal Data Files Using
+#' Nonunique Identifiers,} US National Institute of Standards and
+#' Technology, SP-500-2 (1977), p. 17.
+#'
+#' @family phonics
+#'
+#' @examples
+#' mra_encode("William")
+#' mra_encode(c("Peter", "Peady"))
+#' mra_encode("Stevenson")
+#' @rdname mra
+#' @name mra_encode
+#' @export
+mra_encode <- function(word) {
+## First, remove any nonalphabetical characters and uppercase it
+word <- gsub("[^[:alpha:]]*", "", word)
+word <- toupper(word)
+## First character of key = first character of name
+first <- substr(word, 1, 1)
+word <- substr(word, 2, nchar(word))
+## Delete vowels not at the start of the word
+word <- gsub("[AEIOU]", "", word)
+word <- paste(first, word, sep = "")
+## Remove duplicate consecutive characters
+word <- gsub("([A-Z])\\1+", "\\1", word)
+## If longer than 6 characters, take first and last 3...and we have
+## to vectorize it
+for(i in 1:length(word)) {
+if((l = nchar(word[i])) > 6) {
+first <- substr(word[i], 1, 3)
+last <- substr(word[i], l - 2, l)
+word[i] <- paste(first, last, sep = "");
+}
+}
+return(word)
+}
+#' @rdname mra
+#' @name mra_compare
+#' @export
+mra_compare <- function(x, y) {
+mra <- data.frame(x = x, y = y, sim = 0, min = 100, stringsAsFactors = FALSE)
+## Obtain the minimum rating value by calculating the length sum of
+## the encoded strings and using table A (from Wikipedia).  We start
+## by setting the minimum to be the sum and move from there.
+mra$lensum <- nchar(mra$x) + nchar(mra$y)
+mra$min[mra$lensum == 12] <- 2
+mra$min[mra$lensum > 7 && mra$lensum <= 11] <- 3
+mra$min[mra$lensum > 4 && mra$lensum <= 7] <- 4
+mra$min[mra$lensum <= 4] <- 5
+## If the length difference between the encoded strings is 3 or
+## greater, then no similarity comparison is done.  For us, we
+## continue the similarity comparison out of laziness and ensure the
+## minimum is impossibly high to meet.
+mra$min[abs(nchar(mra$x) - nchar(mra$y)) >= 3] <- 100
+## Start the comparison.
+x <- strsplit(mra$x, split = "")
+y <- strsplit(mra$y, split = "")
+rows <- nrow(mra)
+for(i in 1:rows) {
+## Process the encoded strings from left to right and remove any
+## identical characters found from both strings respectively.
+j <- 1
+while(j < min(length(x[[i]]), length(y[[i]]))) {
+if(x[[i]][j] == y[[i]][j]) {
+x[[i]] <- x[[i]][-j]
+y[[i]] <- y[[i]][-j]
+} else
+j <- j + 1
+}
+## Process the unmatched characters from right to left and
+## remove any identical characters found from both names
+## respectively.
+x[[i]] <- rev(x[[i]])
+y[[i]] <- rev(y[[i]])
+j <- 1
+while(j < min(length(x[[i]]), length(y[[i]]))) {
+if(x[[i]][j] == y[[i]][j]) {
+x[[i]] <- x[[i]][-j]
+y[[i]] <- y[[i]][-j]
+} else
+j <- j + 1
+}
+## Subtract the number of unmatched characters from 6 in the
+## longer string. This is the similarity rating.
+len <- min(length(x[[i]]), length(y[[i]]))
+mra$sim[i] <- 6 - len
+}
+## If the similarity is greater than or equal to the minimum
+## required, it is a successful match.
+mra$match <- (mra$sim >= mra$min)
+return(mra$match)
+}
+END>>
+! !
+!PhoneticStringUtilities::MRAStringComparator methodsFor:'api'!
+encode:wordIn
+    "Answer the Match Rating Approach (MRA) code of wordIn:
+     non-letters are dropped and the rest is uppercased, vowels after the
+     first letter are removed, runs of the same character are collapsed, and
+     a code longer than 6 characters keeps only its first 3 and last 3 characters.
+     Answers an empty string if wordIn contains no letter at all.
+     see https://en.wikipedia.org/wiki/Match_Rating_Approach"
+    |word prev|
+    "/ First, remove any nonalphabetical characters and uppercase it
+    word := wordIn select:#isLetter thenCollect:#asUppercase.
+    "/ Nothing left to encode (empty input or no letters); asking for the first letter below would fail
+    word isEmpty ifTrue:[^ ''].
+    "/ Delete vowels not at the start of the word
+    word := word first asString , ((word from:2) reject:#isVowel).
+    "/ Remove duplicate consecutive characters: a repeat is first replaced by the
+    "/ marker character * (which cannot occur, as only letters are left) and then dropped.
+    "/ Characters are compared with = (not ==), so that equal characters are
+    "/ recognized even if they are not the identical object.
+    prev := nil.
+    word := word
+        collect:[:char |
+            char = prev ifTrue:[
+                $*
+            ] ifFalse:[
+                prev := char.
+                char
+            ]
+        ]
+        thenSelect:[:char | char ~~ $*].
+    "/ If longer than 6 characters, take first and last 3
+    word size > 6 ifTrue:[
+        word := (word copyFirst:3),(word copyLast:3)
+    ].
+    ^ word.
+"
+self new encode:'Catherine'            -> 'CTHRN'
+self new encode:'CatherineCatherine'   -> 'CTHHRN'
+self new encode:'Butter'               -> 'BTR'
+self new encode:'Byrne'                -> 'BYRN'
+self new encode:'Boern'                -> 'BRN'
+self new encode:'Smith'                -> 'SMTH'
+self new encode:'Smyth'                -> 'SMYTH'
+self new encode:'Kathryn'              -> 'KTHRYN'
+self new encode:''                     -> ''
+self new encode:'123'                  -> ''
+"
+"Created: / 28-07-2017 / 15:19:22 / cg"
+"Modified (comment): / 31-07-2017 / 15:14:31 / cg"
+! !
+!PhoneticStringUtilities::MetaphoneStringComparator class methodsFor:'documentation'!
+documentation
+"
+Encodes a string into a Metaphone value.
+Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
+Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
+Hanging on the Metaphone by Lawrence Philips in Computer Language of Dec. 1990, p 39.
+Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
+https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm
+They have had undocumented changes from the originally published algorithm.
+For more information, see https://issues.apache.org/jira/browse/CODEC-57
+Metaphone uses the following rules:
+Doubled letters except 'c' -> drop 2nd letter.
+Vowels are only kept when they are the first letter.
+B -> B unless at the end of a word after 'm' as in 'dumb'
+C -> X (sh) if -cia- or -ch-
+S if -ci-, -ce- or -cy-
+K otherwise, including -sch-
+D -> J if in -dge-, -dgy- or -dgi-; T otherwise
+F -> F
+G -> silent if in -gh- and not at end or before a vowel in -gn- or -gned- (also see dge etc. above)
+J if before i or e or y if not double gg; K otherwise
+H -> silent if after vowel and no vowel follows; H otherwise
+J -> J
+K -> silent if after 'c'; K otherwise
+L -> L
+M -> M
+N -> N
+P -> F if before 'h'; P otherwise
+Q -> K
+R -> R
+S -> X (sh) if before 'h' or in -sio- or -sia-; S otherwise
+T -> X (sh) if -tia- or -tio-; 0 (th) if before 'h'; silent if in -tch-; T otherwise
+V -> F
+W -> silent if not followed by a vowel; W if followed by a vowel
+X -> KS
+Y -> silent if not followed by a vowel; Y if followed by a vowel
+Z -> S
+Initial Letter Exceptions
+Initial kn-, gn-, pn-, ae- or wr- -> drop first letter
+Initial x- -> change to 's'
+Initial wh- -> change to 'w'
+self new encode:'a'
+self new encode:'dumb'
+self new encode:'MILLER'
+self new encode:'schmidt'
+self new encode:'schneider'
+self new encode:'FISCHER'
+self new encode:'HEDGY'
+self new encode:'weber'
+self new encode:'wagner'
+self new encode:'van gogh'
+"
+!
+javaCode
+"<<END
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.commons.codec.language;
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoder;
+/**
+* Encodes a string into a Metaphone value.
+* <p>
+* Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
+* Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
+* <p>
+* <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
+* p 39.</CITE>
+* <p>
+* Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
+* </p>
+* <ul>
+* <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
+*  (broken link 4/30/2013) </li>
+* <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
+*  (link checked 4/30/2013) </li>
+* </ul>
+* <p>
+* They have had undocumented changes from the originally published algorithm.
+* For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
+* <p>
+* This class is conditionally thread-safe.
+* The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)}
+* but is not volatile, and accesses are not synchronized.
+* If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
+* is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
+* after initial setup.
+*
+* @version $Id$
+*/
+public class Metaphone implements StringEncoder {
+/**
+* Five values in the English language
+*/
+private static final String VOWELS = "AEIOU";
+/**
+* Variable used in Metaphone algorithm
+*/
+private static final String FRONTV = "EIY";
+/**
+* Variable used in Metaphone algorithm
+*/
+private static final String VARSON = "CSPTG";
+/**
+* The max code length for metaphone is 4
+*/
+private int maxCodeLen = 4;
+/**
+* Creates an instance of the Metaphone encoder
+*/
+public Metaphone() {
+super();
+}
+/**
+* Find the metaphone value of a String. This is similar to the
+* soundex algorithm, but better at finding similar sounding words.
+* All input is converted to upper case.
+* Limitations: Input format is expected to be a single ASCII word
+* with only characters in the A - Z range, no punctuation or numbers.
+*
+* @param txt String to find the metaphone code for
+* @return A metaphone code corresponding to the String supplied
+*/
+public String metaphone(final String txt) {
+boolean hard = false;
+int txtLength;
+if (txt == null || (txtLength = txt.length()) == 0) {
+return "";
+}
+// single character is itself
+if (txtLength == 1) {
+return txt.toUpperCase(java.util.Locale.ENGLISH);
+}
+final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
+final StringBuilder local = new StringBuilder(40); // manipulate
+final StringBuilder code = new StringBuilder(10); //   output
+// handle initial 2 characters exceptions
+switch(inwd[0]) {
+case 'K':
+case 'G':
+case 'P': /* looking for KN, etc*/
+if (inwd[1] == 'N') {
+local.append(inwd, 1, inwd.length - 1);
+} else {
+local.append(inwd);
+}
+break;
+case 'A': /* looking for AE */
+if (inwd[1] == 'E') {
+local.append(inwd, 1, inwd.length - 1);
+} else {
+local.append(inwd);
+}
+break;
+case 'W': /* looking for WR or WH */
+if (inwd[1] == 'R') {   // WR -> R
+local.append(inwd, 1, inwd.length - 1);
+break;
+}
+if (inwd[1] == 'H') {
+local.append(inwd, 1, inwd.length - 1);
+local.setCharAt(0, 'W'); // WH -> W
+} else {
+local.append(inwd);
+}
+break;
+case 'X': /* initial X becomes S */
+inwd[0] = 'S';
+local.append(inwd);
+break;
+default:
+local.append(inwd);
+} // now local has working string with initials fixed
+final int wdsz = local.length();
+int n = 0;
+while (code.length() < this.getMaxCodeLen() &&
+n < wdsz ) { // max code size of 4 works well
+final char symb = local.charAt(n);
+// remove duplicate letters except C
+if (symb !!= 'C' && isPreviousChar( local, n, symb ) ) {
+n++;
+} else { // not dup
+switch(symb) {
+case 'A':
+case 'E':
+case 'I':
+case 'O':
+case 'U':
+if (n == 0) {
+code.append(symb);
+}
+break; // only use vowel if leading char
+case 'B':
+if ( isPreviousChar(local, n, 'M') &&
+isLastChar(wdsz, n) ) { // B is silent if word ends in MB
+break;
+}
+code.append(symb);
+break;
+case 'C': // lots of C special cases
+/* discard if SCI, SCE or SCY */
+if ( isPreviousChar(local, n, 'S') &&
+!!isLastChar(wdsz, n) &&
+FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
+break;
+}
+if (regionMatch(local, n, "CIA")) { // "CIA" -> X
+code.append('X');
+break;
+}
+if (!!isLastChar(wdsz, n) &&
+FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
+code.append('S');
+break; // CI,CE,CY -> S
+}
+if (isPreviousChar(local, n, 'S') &&
+isNextChar(local, n, 'H') ) { // SCH->sk
+code.append('K');
+break;
+}
+if (isNextChar(local, n, 'H')) { // detect CH
+if (n == 0 &&
+wdsz >= 3 &&
+isVowel(local,2) ) { // CH consonant -> K consonant
+code.append('K');
+} else {
+code.append('X'); // CHvowel -> X
+}
+} else {
+code.append('K');
+}
+break;
+case 'D':
+if (!!isLastChar(wdsz, n + 1) &&
+isNextChar(local, n, 'G') &&
+FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
+code.append('J'); n += 2;
+} else {
+code.append('T');
+}
+break;
+case 'G': // GH silent at end or before consonant
+if (isLastChar(wdsz, n + 1) &&
+isNextChar(local, n, 'H')) {
+break;
+}
+if (!!isLastChar(wdsz, n + 1) &&
+isNextChar(local,n,'H') &&
+!!isVowel(local,n+2)) {
+break;
+}
+if (n > 0 &&
+( regionMatch(local, n, "GN") ||
+regionMatch(local, n, "GNED") ) ) {
+break; // silent G
+}
+if (isPreviousChar(local, n, 'G')) {
+// NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
+hard = true;
+} else {
+hard = false;
+}
+if (!!isLastChar(wdsz, n) &&
+FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
+!!hard) {
+code.append('J');
+} else {
+code.append('K');
+}
+break;
+case 'H':
+if (isLastChar(wdsz, n)) {
+break; // terminal H
+}
+if (n > 0 &&
+VARSON.indexOf(local.charAt(n - 1)) >= 0) {
+break;
+}
+if (isVowel(local,n+1)) {
+code.append('H'); // Hvowel
+}
+break;
+case 'F':
+case 'J':
+case 'L':
+case 'M':
+case 'N':
+case 'R':
+code.append(symb);
+break;
+case 'K':
+if (n > 0) { // not initial
+if (!!isPreviousChar(local, n, 'C')) {
+code.append(symb);
+}
+} else {
+code.append(symb); // initial K
+}
+break;
+case 'P':
+if (isNextChar(local,n,'H')) {
+// PH -> F
+code.append('F');
+} else {
+code.append(symb);
+}
+break;
+case 'Q':
+code.append('K');
+break;
+case 'S':
+if (regionMatch(local,n,"SH") ||
+regionMatch(local,n,"SIO") ||
+regionMatch(local,n,"SIA")) {
+code.append('X');
+} else {
+code.append('S');
+}
+break;
+case 'T':
+if (regionMatch(local,n,"TIA") ||
+regionMatch(local,n,"TIO")) {
+code.append('X');
+break;
+}
+if (regionMatch(local,n,"TCH")) {
+// Silent if in "TCH"
+break;
+}
+// substitute numeral 0 for TH (resembles theta after all)
+if (regionMatch(local,n,"TH")) {
+code.append('0');
+} else {
+code.append('T');
+}
+break;
+case 'V':
+code.append('F'); break;
+case 'W':
+case 'Y': // silent if not followed by vowel
+if (!!isLastChar(wdsz,n) &&
+isVowel(local,n+1)) {
+code.append(symb);
+}
+break;
+case 'X':
+code.append('K');
+code.append('S');
+break;
+case 'Z':
+code.append('S');
+break;
+default:
+// do nothing
+break;
+} // end switch
+n++;
+} // end else from symb !!= 'C'
+if (code.length() > this.getMaxCodeLen()) {
+code.setLength(this.getMaxCodeLen());
+}
+}
+return code.toString();
+}
+private boolean isVowel(final StringBuilder string, final int index) {
+return VOWELS.indexOf(string.charAt(index)) >= 0;
+}
+private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
+boolean matches = false;
+if( index > 0 &&
+index < string.length() ) {
+matches = string.charAt(index - 1) == c;
+}
+return matches;
+}
+private boolean isNextChar(final StringBuilder string, final int index, final char c) {
+boolean matches = false;
+if( index >= 0 &&
+index < string.length() - 1 ) {
+matches = string.charAt(index + 1) == c;
+}
+return matches;
+}
+private boolean regionMatch(final StringBuilder string, final int index, final String test) {
+boolean matches = false;
+if( index >= 0 &&
+index + test.length() - 1 < string.length() ) {
+final String substring = string.substring( index, index + test.length());
+matches = substring.equals( test );
+}
+return matches;
+}
+private boolean isLastChar(final int wdsz, final int n) {
+return n + 1 == wdsz;
+}
+/**
+* Encodes an Object using the metaphone algorithm.  This method
+* is provided in order to satisfy the requirements of the
+* Encoder interface, and will throw an EncoderException if the
+* supplied object is not of type java.lang.String.
+*
+* @param obj Object to encode
+* @return An object (or type java.lang.String) containing the
+*         metaphone code which corresponds to the String supplied.
+* @throws EncoderException if the parameter supplied is not
+*                          of type java.lang.String
+*/
+@Override
+public Object encode(final Object obj) throws EncoderException {
+if (!!(obj instanceof String)) {
+throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
+}
+return metaphone((String) obj);
+}
+/**
+* Encodes a String using the Metaphone algorithm.
+*
+* @param str String object to encode
+* @return The metaphone code corresponding to the String supplied
+*/
+@Override
+public String encode(final String str) {
+return metaphone(str);
+}
+/**
+* Tests is the metaphones of two strings are identical.
+*
+* @param str1 First of two strings to compare
+* @param str2 Second of two strings to compare
+* @return <code>true</code> if the metaphones of these strings are identical,
+*        <code>false</code> otherwise.
+*/
+public boolean isMetaphoneEqual(final String str1, final String str2) {
+return metaphone(str1).equals(metaphone(str2));
+}
+/**
+* Returns the maxCodeLen.
+* @return int
+*/
+public int getMaxCodeLen() { return this.maxCodeLen; }
+/**
+* Sets the maxCodeLen.
+* @param maxCodeLen The maxCodeLen to set
+*/
+public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
+}
+END>>"
+! !
+!PhoneticStringUtilities::MetaphoneStringComparator methodsFor:'api'!
+encode:txt
+    "Answer the Metaphone code of txt, at most (self maxCodeLen) characters long.
+     The input is converted to uppercase first; an empty input answers an empty
+     string and a single character answers itself (uppercased).
+     This follows the metaphone() method of the Apache commons-codec Metaphone
+     class: leading-letter exceptions are applied first, then each letter is
+     translated depending on its neighbours. The digit 0 stands for 'th'."
+
+    "
+     self new encode:'a'            -> 'A'
+     self new encode:'MILLER'       -> 'MLR'
+     self new encode:'schmidt'      -> 'SKMT'
+     self new encode:'schneider'    -> 'SKNT'
+     self new encode:'FISCHER'      -> 'FSKR'
+     self new encode:'HEDGY'        -> 'HJ'
+     self new encode:'weber'        -> 'WBR'
+     self new encode:'wagner'       -> 'WNR'
+     self new encode:'van gogh'     -> 'FNK'
+     self new encode:'dumb'         -> 'TM'
+     self new encode:'thomas'       -> '0MS'
+    "
+
+    |upper local code result wdsz n maxCodeLen first second
+     symb prevChar nextChar nextNextChar isLastChar|
+
+    txt size == 0 ifTrue:[^ ''].
+    upper := txt asUppercase.
+    "/ a single character is its own code
+    upper size == 1 ifTrue:[^ upper].
+    maxCodeLen := self maxCodeLen.
+
+    "/ initial letter exceptions (the tests are mutually exclusive on the first letter)
+    local := upper.
+    first := upper at:1.
+    second := upper at:2.
+    (second == $N and:[ 'KGP' includes:first ]) ifTrue:[
+        "/ KN-, GN-, PN- -> N-
+        local := upper copyFrom:2
+    ].
+    (first == $A and:[ second == $E ]) ifTrue:[
+        "/ AE- -> E-
+        local := upper copyFrom:2
+    ].
+    first == $W ifTrue:[
+        "/ WR- -> R-
+        second == $R ifTrue:[ local := upper copyFrom:2 ].
+        "/ WH- -> W-
+        second == $H ifTrue:[ local := 'W' , (upper copyFrom:3) ].
+    ].
+    first == $X ifTrue:[
+        "/ initial X becomes S
+        local := 'S' , (upper copyFrom:2)
+    ].
+
+    "/ now local holds the working string with initials fixed
+    code := '' writeStream.
+    wdsz := local size.
+    n := 1.
+    [ (code size < maxCodeLen) and:[ n <= wdsz ] ] whileTrue:[
+        "/ the neighbours are assigned explicitly (nil when absent) on every round,
+        "/ so nothing can leak over from the previous iteration
+        symb := local at:n.
+        prevChar := n > 1 ifTrue:[ local at:n - 1 ] ifFalse:[ nil ].
+        isLastChar := (n == wdsz).
+        nextChar := isLastChar ifTrue:[ nil ] ifFalse:[ local at:n + 1 ].
+        nextNextChar := (n + 2) <= wdsz ifTrue:[ local at:n + 2 ] ifFalse:[ nil ].
+
+        "/ doubled letters except C: the 2nd letter is dropped
+        (symb ~~ $C and:[ prevChar == symb ]) ifFalse:[
+            "/ the following tests are mutually exclusive on symb,
+            "/ so they act like the branches of a switch
+
+            "/ vowels are only kept as leading character
+            (('AEIOU' includes:symb) and:[ n == 1 ]) ifTrue:[
+                code nextPut:symb
+            ].
+
+            symb == $B ifTrue:[
+                "/ B is silent if the word ends in MB
+                (prevChar == $M and:[ isLastChar ]) ifFalse:[
+                    code nextPut:$B
+                ]
+            ].
+
+            symb == $C ifTrue:[
+                "/ silent in SCI, SCE, SCY
+                (prevChar == $S and:[ 'EIY' includes:nextChar ]) ifFalse:[
+                    (nextChar == $I and:[ nextNextChar == $A ]) ifTrue:[
+                        "/ CIA -> X
+                        code nextPut:$X
+                    ] ifFalse:[
+                        ('EIY' includes:nextChar) ifTrue:[
+                            "/ CI, CE, CY -> S
+                            code nextPut:$S
+                        ] ifFalse:[
+                            nextChar == $H ifTrue:[
+                                "/ SCH -> K; word-initial CH followed by a vowel -> K
+                                "/ (as tested by the Java reference); any other CH -> X
+                                (prevChar == $S
+                                  or:[ n == 1 and:[ 'AEIOU' includes:nextNextChar ] ])
+                                    ifTrue:[ code nextPut:$K ]
+                                    ifFalse:[ code nextPut:$X ]
+                            ] ifFalse:[
+                                code nextPut:$K
+                            ]
+                        ]
+                    ]
+                ]
+            ].
+
+            symb == $D ifTrue:[
+                (nextChar == $G and:[ 'EIY' includes:nextNextChar ]) ifTrue:[
+                    "/ DGE, DGI, DGY -> J; the G and the front vowel are consumed as well
+                    code nextPut:$J.
+                    n := n + 2.
+                ] ifFalse:[
+                    code nextPut:$T
+                ]
+            ].
+
+            symb == $G ifTrue:[
+                "/ silent in GH at the end or before a consonant, and in a non-initial GN.
+                "/ (a G preceded by G never gets here, because doubled letters were dropped)
+                ((nextChar == $H and:[ ('AEIOU' includes:nextNextChar) not ])
+                  or:[ n > 1 and:[ nextChar == $N ] ]) ifFalse:[
+                    ('EIY' includes:nextChar)
+                        ifTrue:[ code nextPut:$J ]
+                        ifFalse:[ code nextPut:$K ]
+                ]
+            ].
+
+            symb == $H ifTrue:[
+                "/ silent at the end, after C, S, P, T, G (handled with those letters)
+                "/ and when no vowel follows
+                (isLastChar not
+                  and:[ ('CSPTG' includes:prevChar) not
+                  and:[ 'AEIOU' includes:nextChar ] ]) ifTrue:[
+                    code nextPut:$H
+                ]
+            ].
+
+            ('FJLMNR' includes:symb) ifTrue:[
+                code nextPut:symb
+            ].
+
+            symb == $K ifTrue:[
+                "/ silent after C; prevChar is nil for an initial K
+                prevChar == $C ifFalse:[
+                    code nextPut:$K
+                ]
+            ].
+
+            symb == $P ifTrue:[
+                "/ PH -> F
+                nextChar == $H
+                    ifTrue:[ code nextPut:$F ]
+                    ifFalse:[ code nextPut:$P ]
+            ].
+
+            symb == $Q ifTrue:[
+                code nextPut:$K
+            ].
+
+            symb == $S ifTrue:[
+                "/ SH, SIO, SIA -> X
+                (nextChar == $H
+                  or:[ nextChar == $I
+                       and:[ nextNextChar == $O or:[ nextNextChar == $A ] ] ])
+                    ifTrue:[ code nextPut:$X ]
+                    ifFalse:[ code nextPut:$S ]
+            ].
+
+            symb == $T ifTrue:[
+                (nextChar == $I
+                  and:[ nextNextChar == $A or:[ nextNextChar == $O ] ]) ifTrue:[
+                    "/ TIA, TIO -> X
+                    code nextPut:$X
+                ] ifFalse:[
+                    "/ silent in TCH
+                    (nextChar == $C and:[ nextNextChar == $H ]) ifFalse:[
+                        "/ the numeral 0 is substituted for TH (it resembles theta)
+                        nextChar == $H
+                            ifTrue:[ code nextPut:$0 ]
+                            ifFalse:[ code nextPut:$T ]
+                    ]
+                ]
+            ].
+
+            symb == $V ifTrue:[
+                code nextPut:$F
+            ].
+
+            "/ W and Y are silent if not followed by a vowel
+            (('WY' includes:symb) and:[ 'AEIOU' includes:nextChar ]) ifTrue:[
+                code nextPut:symb
+            ].
+
+            symb == $X ifTrue:[
+                code nextPutAll:'KS'
+            ].
+
+            symb == $Z ifTrue:[
+                code nextPut:$S
+            ].
+
+            "/ anything else (blanks, digits, punctuation) contributes nothing
+        ].
+        n := n + 1.
+    ].
+
+    "/ X -> KS may have pushed the code one character beyond the limit
+    result := code contents.
+    result size > maxCodeLen ifTrue:[
+        result := result copyFrom:1 to:maxCodeLen
+    ].
+    ^ result
+
+    "Created: / 02-08-2017 / 09:51:31 / cg"
+    "Modified: / 02-08-2017 / 12:00:38 / cg"
+!
+maxCodeLen
+"answer the maximum number of characters of a generated metaphone code;
+encode: reads this value as its length limit"
+^ 4
+"Created: / 02-08-2017 / 09:51:59 / cg"
+! !
+!PhoneticStringUtilities::SoundexStringComparator class methodsFor:'documentation'!
+documentation
+"
+WARNING: this is the so called 'simplified soundex' algorithm;
+there are more variants like miracode (american soundex) or
+mysqlSoundex around.
+Be sure to use the correct algorithm, if the generated strings must be compatible
+(otherwise, the differences are probably too small to be noticed as effect, but
+your search will be different)
+The following was copied from http://www.civilsolutions.com.au/publications/dedup.htm
+SOUNDEX is a phonetic coding algorithm that ignores many of the unreliable
+components of names, but by doing so reports more matches.
+There are some variations around in the literature;
+the following is called 'simplified soundex', and the rules for coding a name are:
+1. The first letter of the name is used in its un-coded form to serve as the prefix
+character of the code. (The rest of the code is numerical).
+2. Thereafter, W and H are ignored entirely.
+3. A, E, I, O, U, Y are not assigned a code number, but do serve as 'separators' (see Step 5).
+4. Other letters of the name are converted to a numerical equivalent:
+B, P, F, V              1
+C, G, J, K, Q, S, X, Z  2
+D, T                    3
+L                       4
+M, N                    5
+R                       6
+5. There are two exceptions:
+1. Letters that follow prefix letters which would, if coded, have the same
+numerical code, are ignored in all cases unless a ''separator'' (see Step 3) precedes them.
+2. The second letter of any pair of consonants having the same code number is likewise ignored,
+i.e. unless there is a ''separator'' between them in the name.
+6. The final SOUNDEX code consists of the prefix letter plus three numerical characters.
+Longer codes are truncated to this length, and shorter codes are extended to it by adding zeros.
+Notice, that in another variant, w and h are treated slightly differently.
+This is only of relevance, if you need to reconstruct original soundex codes of other programs
+or for the original 1880 us census data.
+SoundexStringComparator  new encode:'Ashcraft' -> 'A226'
+vs.
+MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
+Also notice, that soundex deals better with english.
+For german and other languages, other algorithms may provide better results.
+"
+! !
+!PhoneticStringUtilities::SoundexStringComparator methodsFor:'api'!
+encode:word
+    "Answer the 4-character simplified soundex code of word:
+     the uppercased first letter followed by up to three digits, padded with zeros.
+     A letter contributes its digit unless translate: answers nil or '0' for it,
+     or the digit equals the one of the immediately preceding letter."
+
+    |upper code lastDigit|
+
+    upper := word asUppercase.
+    code := upper first asString.
+    lastDigit := self translate:upper first.
+    2 to:upper size do:[:idx |
+        |digit|
+
+        digit := self translate:(upper at:idx).
+        (digit isNil or:[ digit = '0' or:[ digit = lastDigit ] ]) ifFalse:[
+            code := code , digit.
+            code size == 4 ifTrue:[^ code ].
+        ].
+        "/ every letter (also ignored ones and separators) becomes the new predecessor
+        lastDigit := digit
+    ].
+    "/ fewer than 4 characters so far: fill up with zeros
+    ^ (code , '000') copyFrom:1 to:4
+
+    "
+     self new encode:'washington' -> 'W252'
+     self new encode:'lee'        -> 'L000'
+     self new encode:'Gutierrez'  -> 'G362'
+     self new encode:'Pfister'    -> 'P236'
+     self new encode:'Jackson'    -> 'J250'
+     self new encode:'Tymczak'    -> 'T522'
+    "
+    "notice:
+     MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
+     self new encode:'Ashcraft'   -> 'A226'
+    "
+
+    "Created: / 28-07-2017 / 15:21:23 / cg"
+    "Modified (comment): / 01-08-2017 / 19:01:43 / cg"
+! !
+!PhoneticStringUtilities::SoundexStringComparator methodsFor:'private'!
+translate:aCharacter
+    "Answer the soundex digit (as a one-character string) for an uppercase letter:
+     '0' for the separators A E I O U Y, '1'..'6' for the coded consonants,
+     and nil for anything else (including W and H).
+     Only identity comparisons are used, which the compiler can inline."
+
+    "/ vowels serve as separators
+    (aCharacter == $A or:[ aCharacter == $E or:[ aCharacter == $I
+      or:[ aCharacter == $O or:[ aCharacter == $U or:[ aCharacter == $Y ]]]]])
+        ifTrue:[^ '0' ].
+
+    (aCharacter == $B or:[ aCharacter == $P or:[ aCharacter == $F
+      or:[ aCharacter == $V ]]])
+        ifTrue:[^ '1' ].
+
+    (aCharacter == $C or:[ aCharacter == $S or:[ aCharacter == $K
+      or:[ aCharacter == $G or:[ aCharacter == $J or:[ aCharacter == $Q
+      or:[ aCharacter == $X or:[ aCharacter == $Z ]]]]]]])
+        ifTrue:[^ '2' ].
+
+    (aCharacter == $D or:[ aCharacter == $T ]) ifTrue:[^ '3' ].
+    aCharacter == $L ifTrue:[^ '4' ].
+    (aCharacter == $M or:[ aCharacter == $N ]) ifTrue:[^ '5' ].
+    aCharacter == $R ifTrue:[^ '6' ].
+    ^ nil
+
+    "Modified: / 02-08-2017 / 01:35:40 / cg"
+    "Modified (comment): / 02-08-2017 / 14:30:11 / cg"
+! !
+!PhoneticStringUtilities::MySQLSoundexStringComparator class methodsFor:'documentation'!
+documentation
+"
+MySQL soundex is like american Soundex (i.e. miracode) without the 4 character limitation,
+and also removing vowels first, then removing duplicate codes
+(whereas the soundex code does this in reverse order).
+These variations are important, if you need the miracode soundex codes to be generated.
+"
+! !
+!PhoneticStringUtilities::MySQLSoundexStringComparator methodsFor:'api'!
+encode:word
+    "Like the soundex encoding, but without cutting the code to 4 characters
+     (shorter codes are still padded with zeros), and with separators ('0'),
+     W and H not replacing the remembered previous digit, so equal digits
+     on both sides of them are collapsed."
+
+    |upper code lastDigit|
+
+    upper := word asUppercase.
+    code := upper first asString.
+    lastDigit := self translate:upper first.
+    2 to:upper size do:[:idx |
+        |ch digit|
+
+        ch := upper at:idx.
+        digit := self translate:ch.
+        (digit isNil or:[ digit = '0' or:[ digit = lastDigit ] ]) ifFalse:[
+            code := code , digit
+        ].
+        "/ separators, W and H leave the remembered digit untouched
+        (digit = '0' or:[ ch = $W or:[ ch = $H ] ]) ifFalse:[
+            lastDigit := digit
+        ].
+    ].
+    code size < 4 ifTrue:[
+        code := (code , '000') copyFrom:1 to:4
+    ].
+    ^ code
+
+    "Created: / 28-07-2017 / 15:23:41 / cg"
+    "Modified: / 31-07-2017 / 17:53:51 / cg"
+    "Modified (comment): / 02-08-2017 / 14:31:15 / cg"
+! !
+!PhoneticStringUtilities::NYSIISStringComparator class methodsFor:'documentation'!
+documentation
+"
+NYSIIS Algorithm:
+1.
+remove all ''S'' and ''Z'' chars from the end of the surname
+2.
+transcode initial strings
+MAC => MC
+PF => F
+3.
+Transcode trailing strings as follows,
+IX => IC
+EX => EC
+YE,EE,IE => Y
+NT,ND => D
+4.
+transcode ''EV'' to ''EF'' if not at start of name
+5.
+use first character of name as first character of key
+6.
+remove any ''W'' that follows a vowel
+7.
+replace all vowels with ''A''
+8.
+transcode ''GHT'' to ''GT''
+9.
+transcode ''DG'' to ''G''
+10.
+transcode ''PH'' to ''F''
+11.
+if not first character, eliminate all ''H'' preceded or followed by a vowel
+12.
+change ''KN'' to ''N'', else ''K'' to ''C''
+13.
+if not first character, change ''M'' to ''N''
+14.
+if not first character, change ''Q'' to ''G''
+15.
+transcode ''SH'' to ''S''
+16.
+transcode ''SCH'' to ''S''
+17.
+transcode ''YW'' to ''Y''
+18.
+if not first or last character, change ''Y'' to ''A''
+19.
+transcode ''WR'' to ''R''
+20.
+if not first character, change ''Z'' to ''S''
+21.
+transcode terminal ''AY'' to ''Y''
+22.
+remove trailing vowels
+23.
+collapse all strings of repeated characters
+24.
+if first char of original surname was a vowel, append it to the code
+"
+! !
+!PhoneticStringUtilities::NYSIISStringComparator methodsFor:'api'!
+encode:aString
+    "Answer the NYSIIS key of aString: the uppercased name is passed through
+     rule1: .. rule23: in numerical order, each rule receiving the previous
+     rule's result; rule24 finally also gets the original (unconverted) name."
+
+    |k|
+
+    k := aString asUppercase.
+    #(
+        #rule1:  #rule2:  #rule3:  #rule4:  #rule5:  #rule6:
+        #rule7:  #rule8:  #rule9:  #rule10: #rule11: #rule12:
+        #rule13: #rule14: #rule15: #rule16: #rule17: #rule18:
+        #rule19: #rule20: #rule21: #rule22: #rule23:
+    ) do:[:ruleSelector |
+        k := self perform:ruleSelector with:k
+    ].
+    ^ self rule24:k originalKey:aString
+
+    "
+     self new encode:'hello'
+     self new encode:'bliss'
+    "
+    "
+     self new phoneticStringsFor:'hello'
+     self new phoneticStringsFor:'bliss'
+    "
+
+    "Created: / 28-07-2017 / 15:34:52 / cg"
+    "Modified (comment): / 02-08-2017 / 14:31:47 / cg"
+! !
+!PhoneticStringUtilities::NYSIISStringComparator methodsFor:'private'!
+rule10:key
+"NYSIIS rule 10: transcode 'PH' to 'F'.
+Answers the result of replacing every 'PH' in key, searching from index 1."
+^ self transcodeAll:'PH' of:key to:'F' startingAt:1
+"Modified (format): / 02-08-2017 / 14:34:27 / cg"
+!
+rule11:key
+    "11. if not first character, eliminate all 'H' preceded or followed by a vowel.
+     The neighbours are always looked up in the unmodified key."
+
+    |k|
+
+    k := key copy.
+    "/ walking from the end keeps the lower indices of k valid while removing
+    key size to:2 by:-1 do:[:i |
+        ((key at:i) = $H
+          and:[ (key at:i - 1) isVowel
+                or:[ (i < key size) and:[ (key at:i + 1) isVowel ] ] ]) ifTrue:[
+            k := (k copyFrom:1 to:i - 1) , (k copyFrom:i + 1 to:k size)
+        ]
+    ].
+    ^ k
+!
+rule12:key
+    "12. change 'KN' to 'N', else 'K' to 'C'.
+     'KN' must be handled first: every K which is still present afterwards
+     is not followed by N and is changed to C."
+
+    |k|
+
+    k := self transcodeAll:'KN' of:key to:'N' startingAt:1.
+    k := self transcodeAll:'K' of:k to:'C' startingAt:1.
+    ^ k
+
+    "Modified (format): / 02-08-2017 / 14:34:48 / cg"
+!
+rule13:key
+"NYSIIS rule 13: if not first character, change 'M' to 'N'.
+The search starts at index 2, so a leading M is kept."
+^ self transcodeAll:'M' of:key to:'N' startingAt:2
+"Modified (format): / 02-08-2017 / 14:35:00 / cg"
+!
+rule14:key
+"NYSIIS rule 14: if not first character, change 'Q' to 'G'.
+The search starts at index 2, so a leading Q is kept."
+^ self transcodeAll:'Q' of:key to:'G' startingAt:2
+"Modified (format): / 02-08-2017 / 14:35:08 / cg"
+!
+rule15:key
+"NYSIIS rule 15: transcode 'SH' to 'S'.
+Answers the result of replacing every 'SH' in key, searching from index 1."
+^ self transcodeAll:'SH' of:key to:'S' startingAt:1
+"Modified (format): / 02-08-2017 / 14:35:18 / cg"
+!
+rule16:key
+"NYSIIS rule 16: transcode 'SCH' to 'S'.
+Answers the result of replacing every 'SCH' in key, searching from index 1."
+^ self transcodeAll:'SCH' of:key to:'S' startingAt:1
+"Modified (format): / 02-08-2017 / 14:35:25 / cg"
+!
+rule17:key
+"NYSIIS rule 17: transcode 'YW' to 'Y'.
+Answers the result of replacing every 'YW' in key, searching from index 1."
+^ self transcodeAll:'YW' of:key to:'Y' startingAt:1
+"Modified (format): / 02-08-2017 / 14:35:33 / cg"
+!
+rule18:key
+    "18. if not first or last character, change 'Y' to 'A'.
+     All Ys from index 2 on are changed first; a Y which was the last
+     character of key is then restored. An empty key is answered unchanged."
+
+    |k|
+
+    k := self transcodeAll:'Y' of:key to:'A' startingAt:2.
+    "/ the emptiness test avoids sending #last to an empty key
+    (key notEmpty and:[ key last = $Y ]) ifTrue:[
+        k at:k size put:$Y
+    ].
+    ^ k
+
+    "Modified (format): / 02-08-2017 / 14:35:44 / cg"
+!
+rule19:key
+"NYSIIS rule 19: transcode 'WR' to 'R'.
+Answers the result of replacing every 'WR' in key, searching from index 1."
+^ self transcodeAll:'WR' of:key to:'R' startingAt:1
+"Modified (format): / 02-08-2017 / 14:35:52 / cg"
+!
+rule1:key
+    "1. Remove all 'S' and 'Z' chars from the end of the name.
+     Answers a copy; a name consisting only of S and Z characters
+     yields an empty string."
+
+    |k|
+
+    k := key copy.
+    "/ the emptiness test avoids sending #last to an empty string
+    [ k notEmpty and:[ 'SZ' includes:k last ] ] whileTrue:[
+        k := k copyFrom:1 to:(k size - 1)
+    ].
+    ^ k
+!
+rule20:key
+"NYSIIS rule 20: if not first character, change 'Z' to 'S'.
+The search starts at index 2, so a leading Z is kept."
+^ self transcodeAll:'Z' of:key to:'S' startingAt:2
+"Modified (format): / 02-08-2017 / 14:36:00 / cg"
+!
+rule21:key
+    "21. transcode terminal 'AY' to 'Y'.
+     Answers a copy of key if it does not end with 'AY'. Testing the ending
+     directly avoids a search start index below 1 for keys shorter than
+     two characters."
+
+    (key endsWith:'AY') ifFalse:[^ key copy].
+    ^ (key copyFrom:1 to:key size - 2) , 'Y'
+
+    "Modified (format): / 02-08-2017 / 14:36:08 / cg"
+!
+rule22:key
+    "22. remove trailing vowels.
+     Answers a copy; a key consisting only of vowels yields an empty string."
+
+    |k|
+
+    k := key copy.
+    "/ the emptiness test avoids sending #last to an empty string
+    [ k notEmpty and:[ k last isVowel ] ] whileTrue:[
+        k := k copyButLast
+    ].
+    ^ k
+
+    "Modified: / 02-08-2017 / 14:36:42 / cg"
+!
+rule23:key
+    "23. collapse all strings of repeated characters.
+     Answers a copy of key in which every character equal to its predecessor
+     has been removed."
+
+    |k|
+
+    k := key copy.
+    "/ a downward loop needs an explicit negative step;
+    "/ walking from the end keeps the lower indices of k valid while removing
+    key size to:2 by:-1 do:[:i |
+        (key at:i) = (key at:i - 1) ifTrue:[
+            k := (k copyFrom:1 to:i - 1) , (k copyFrom:i + 1 to:k size)
+        ]
+    ].
+    ^ k
+!
+rule24:key originalKey:originalKey
+    "24. if first char of original surname was a vowel, append it to the code.
+     The appended character is uppercased; otherwise a copy of key is answered."
+
+    |initial|
+
+    initial := originalKey first.
+    initial isVowel ifFalse:[^ key copy].
+    ^ key copy , initial asString asUppercase
+!
+rule2:key
+    "2. Transcode initial strings:  MAC => MC   PF => F.
+     The two prefixes exclude each other, so at most one replacement applies;
+     without a match a copy of key is answered."
+
+    (key startsWith:'MAC') ifTrue:[^ 'MC' , (key copyFrom:4)].
+    (key startsWith:'PF') ifTrue:[^ 'F' , (key copyFrom:3)].
+    ^ key copy
+
+    "Modified (format): / 02-08-2017 / 14:31:40 / cg"
+!
+rule3:key
+    "Rule 3: rewrite the ending of the key:
+        IX         => IC
+        EX         => EC
+        YE, EE, IE => Y
+        NT, ND     => D
+     The four substitutions run one after the other, each one on the
+     outcome of the previous one."
+
+    |afterIX afterEX afterY|
+
+    afterIX := self transcodeTrailing:#( 'IX' ) of:key copy to:'IC'.
+    afterEX := self transcodeTrailing:#( 'EX' ) of:afterIX to:'EC'.
+    afterY := self transcodeTrailing:#( 'YE' 'EE' 'IE' ) of:afterEX to:'Y'.
+    ^ self transcodeTrailing:#( 'NT' 'ND' ) of:afterY to:'D'
+
+"Modified (format): / 02-08-2017 / 14:32:24 / cg"
+!
+rule4:key
+    "Rule 4: every 'EV' which does not begin at the first character
+     is rewritten to 'EF'.
+     The search starts at index 2, which is what spares an 'EV' at the start;
+     the rewritten copy is answered."
+
+    |firstCandidateIndex|
+
+    firstCandidateIndex := 2.
+    ^ self transcodeAll:'EV' of:key to:'EF' startingAt:firstCandidateIndex
+
+"Modified (format): / 02-08-2017 / 14:32:35 / cg"
+!
+rule5:key
+"5. Use first character of name as first character of key.
+Ignored because we're doing an in-place conversion.
+This step is therefore a no-op: key itself (not a copy) is answered unchanged."
+^ key
+"Modified (comment): / 02-08-2017 / 14:32:45 / cg"
+!
+rule6:key
+    "6. Remove any 'W' that follows a vowel.
+     Answers a copy of key in which every W whose predecessor is a vowel
+     has been dropped. The scan starts at index 2, so the first character
+     is never removed. A W which follows a non-vowel is kept."
+
+    |k i|
+
+    k := key copy.
+    i := 2.
+    [ i <= k size ] whileTrue:[
+        ((k at:i) = $W and:[ (k at:i - 1) isVowel ]) ifTrue:[
+            "drop the W; i stays where it is, so that the character which
+             moved into slot i is checked against the same vowel next"
+            k := (k copyFrom:1 to:i - 1) , (k copyFrom:i + 1 to:k size)
+        ] ifFalse:[
+            "advance in every other case - a W behind a consonant has to be
+             stepped over, otherwise the scan finds that same W again forever"
+            i := i + 1
+        ]
+    ].
+    ^ k
+!
+rule7:key
+    "Rule 7: answer a copy of key with each vowel replaced by an 'A';
+     every other character is carried over unchanged."
+
+    ^ key collect:[:each |
+        each isVowel
+            ifTrue:[ $A ]
+            ifFalse:[ each ]
+    ]
+
+"Modified: / 02-08-2017 / 14:33:56 / cg"
+!
+rule8:key
+    "Rule 8: every 'GHT' found in key is rewritten to 'GT'.
+     The search covers the whole key (it starts at index 1);
+     the rewritten copy is answered."
+
+    |pattern replacement|
+
+    pattern := 'GHT'.
+    replacement := 'GT'.
+    ^ self transcodeAll:pattern of:key to:replacement startingAt:1
+
+"Modified (format): / 02-08-2017 / 14:34:05 / cg"
+!
+rule9:key
+    "Rule 9: every 'DG' found in key is rewritten to 'G'.
+     The search covers the whole key (it starts at index 1);
+     the rewritten copy is answered."
+
+    |pattern replacement|
+
+    pattern := 'DG'.
+    replacement := 'G'.
+    ^ self transcodeAll:pattern of:key to:replacement startingAt:1
+
+"Modified (format): / 02-08-2017 / 14:34:15 / cg"
+!
+transcodeAll:aString of:key to:replacementString startingAt:start
+    "Answer a copy of key in which each occurrence of aString found at or
+     after index start has been replaced by replacementString.
+     After every replacement the search is restarted at start, so an
+     occurrence which only comes into being through a replacement is
+     rewritten as well.
+     Callers derive start from the key size (key size - 1, or
+     key size - pattern size + 1), which gives values below 1 for keys
+     shorter than the pattern; such a start is treated as 1. No match is
+     possible in that situation, so the answer is a plain copy of key.
+     NOTE(review): the loop does not end if a replacement leaves aString at
+     or after start again and again (e.g. replacementString containing aString);
+     none of the callers visible in this class does that - confirm for new callers."
+
+    |k i from|
+
+    k := key copy.
+    "never begin the search before the first character"
+    from := start max:1.
+    [
+        (i := k indexOfSubCollection:aString startingAt:from) > 0
+    ] whileTrue:[
+        k := (k copyFrom:1 to:i - 1) , replacementString
+                , (k copyFrom:i + aString size to:k size)
+    ].
+    ^ k
+!
+transcodeTrailing:anArrayOfStrings of:key to:replacementString
+    "Answer a copy of key in which a trailing occurrence of any of the
+     strings in anArrayOfStrings has been replaced by replacementString.
+     The candidates are tried in the order given, each one on the outcome
+     of the previous one. The search for a candidate starts at the one
+     index where it would end exactly at the end of the string."
+
+    ^ anArrayOfStrings
+        inject:key copy
+        into:[:soFar :suffix |
+            self
+                transcodeAll:suffix
+                of:soFar
+                to:replacementString
+                startingAt:(soFar size - suffix size) + 1
+        ]
+! !
+!PhoneticStringUtilities::PhonemStringComparator class methodsFor:'documentation'!
+documentation
+"
+Implementation of the PHONEM algorithm, as described in
+'Georg Wilde and Carsten Meyer, Doppelgaenger gesucht -
+Ein Programm fuer kontextsensitive phonetische Textumwandlung
+ct Magazin fuer Computer & Technik 25/1998'
+This algorithm copes better with the German language (it takes umlauts into account).
+"
+! !
+!PhoneticStringUtilities::PhonemStringComparator methodsFor:'api'!
+encode:aString
+    "Answer the PHONEM code of aString.
+     Works in four steps on the uppercased input:
+       1. two-character sequences are replaced, scanning left to right
+       2. single characters are mapped (transliterated)
+       3. every character outside 'ABCDLMNORSUVWXY' is dropped
+       4. runs of the same remaining character are squashed into one
+     The section sign is an intermediate placeholder produced by step 1
+     (for AU / OU) and mapped to U in step 2."
+
+    |s idx t t2 digraphs|
+
+    "/ step 1 table: sequence, replacement, sequence, replacement, ...
+    digraphs := #(
+        'SC' 'C'
+        'SZ' 'C'
+        'CZ' 'C'
+        'TZ' 'C'
+        'TS' 'C'
+        'KS' 'X'
+        'PF' 'V'
+        'QU' 'KW'
+        'PH' 'V'
+        'UE' 'Y'
+        'AE' 'E'
+        'OE' 'Ö'
+        'EI' 'AY'
+        'EY' 'AY'
+        'EU' 'OY'
+        'AU' 'A§'
+        'OU' '§ '
+    ).
+
+    s := aString asUppercase.
+    idx := 1.
+    "/ idx has to reach (s size - 1); otherwise a sequence made up of
+    "/ the final two characters (as in 'matz') is never looked at
+    [idx < s size] whileTrue:[
+        t := s copyFrom:idx to:idx+1.
+        t2 := nil.
+        1 to:digraphs size by:2 do:[:i |
+            (t2 isNil and:[ t = (digraphs at:i) ]) ifTrue:[
+                t2 := digraphs at:i+1
+            ]
+        ].
+        t2 notNil ifTrue:[
+            "/ idx is not advanced here: the text now found at idx is probed again
+            s := (s copyTo:idx-1),t2,(s copyFrom:idx+2)
+        ] ifFalse:[
+            idx := idx + 1.
+        ].
+    ].
+
+    "/ single character substitutions via tr
+    "/ P has to become B (not D), see the weber / wepper examples below.
+    "/ NOTE(review): the target string is one character longer than the source string;
+    "/ the trailing A has no partner - presumably ignored, confirm and clean up.
+    s := s copyTransliterating:'ÖÄZKGQÜIJFWPT§' to:'YECCCCYYYVVBDUA'.
+    s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'' complement:true squashDuplicates:false.
+    s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'ABCDLMNORSUVWXY' complement:false squashDuplicates:true.
+    ^ s
+
+"
+self basicNew encode:'müller'  -> 'MYLR'
+self basicNew encode:'mueller' -> 'MYLR'
+self basicNew encode:'möller'  -> 'MYLR'
+self basicNew encode:'miller'  -> 'MYLR'
+self basicNew encode:'muller'  -> 'MULR'
+self basicNew encode:'muler'   -> 'MULR'
+self basicNew phoneticStringsFor:'müller'  #('MYLR')
+self basicNew phoneticStringsFor:'mueller' #('MYLR')
+self basicNew phoneticStringsFor:'möller'  #('MYLR')
+self basicNew phoneticStringsFor:'miller'  #('MYLR')
+self basicNew phoneticStringsFor:'muller'  #('MULR')
+self basicNew phoneticStringsFor:'muler'   #('MULR')
+self basicNew phoneticStringsFor:'schmidt'     #('CMYD')
+self basicNew phoneticStringsFor:'schneider'   #('CNAYDR')
+self basicNew phoneticStringsFor:'fischer'     #('VYCR')
+self basicNew phoneticStringsFor:'weber'       #('VBR')
+self basicNew phoneticStringsFor:'weeber'      #('VBR')
+self basicNew phoneticStringsFor:'webber'      #('VBR')
+self basicNew phoneticStringsFor:'wepper'      #('VBR')
+self basicNew phoneticStringsFor:'meyer'       #('MAYR')
+self basicNew phoneticStringsFor:'maier'       #('MAYR')
+self basicNew phoneticStringsFor:'mayer'       #('MAYR')
+self basicNew phoneticStringsFor:'mayr'        #('MAYR')
+self basicNew phoneticStringsFor:'meir'        #('MAYR')
+self basicNew phoneticStringsFor:'wagner'      #('VACNR')
+self basicNew phoneticStringsFor:'schulz'      #('CULC')
+self basicNew phoneticStringsFor:'becker'      #('BCR')
+self basicNew phoneticStringsFor:'hoffmann'    #('OVMAN')
+self basicNew phoneticStringsFor:'haus'        #('AUS')
+self basicNew phoneticStringsFor:'schäfer'     #('CVR')
+self basicNew phoneticStringsFor:'scheffer'    #('CVR')
+self basicNew phoneticStringsFor:'schaeffer'   #('CVR')
+self basicNew phoneticStringsFor:'schaefer'    #('CVR')
+"
+"Created: / 28-07-2017 / 15:38:08 / cg"
+! !
+!PhoneticStringUtilities::Caverphone2StringComparator class methodsFor:'documentation'!
+documentation
+"
+Caverphone (2) Algorithm:
+see http://caversham.otago.ac.nz/files/working/ctp150804.pdf
+Caverphone 2.0 is being made available for free use for the benefit of anyone who has a use for it,
+with the proviso that the Caversham Project at the University of Otago should be acknowledged as the
+original source (which is hereby done ;-).
+•  Start with a Surname or Firstname
+•  Convert to lowercase
+This coding system is case sensitive, implementations should acknowledge that a is not the same as A
+•  Remove anything not A-Z
+The main intention of this is to remove spaces, hyphens, and apostrophes.
+example:  o'brian becomes obrian
+•  If the name starts with cough make it cou2f
+2 is being used as a temporary placeholder to indicate a consonant which we are no longer interested in.
+•  If the name starts with rough make it rou2f
+•  If the name starts with tough make it tou2f
+•  If the name starts with enough make it enou2f
+•  If the name starts with gn make it 2n
+•  If the name ends with mb make it m2
+•  replace cq with 2q
+•  replace ci with si
+•  replace ce with se
+•  replace cy with sy
+•  replace tch with 2ch
+•  replace c with k
+•  replace q with k
+•  replace x with k
+•  replace v with f
+•  replace dg with 2g
+•  replace tio with sio
+•  replace tia with sia
+•  replace d with t
+•  replace ph with fh
+•  replace b with p
+•  replace sh with s2
+•  replace z with s
+•  replace an initial vowel with an A
+•  replace all other vowels with a 3
+3 is a temporary placeholder marking a vowel
+•  replace 3gh3 with 3kh3
+Exceptions are dealt with before the general case. gh between vowels is an exception to the more general gh rule.
+•  replace gh with 22
+•  replace g with k
+•  replace groups of the letter s with a S
+Contiguous runs of s are replaced by a single S
+•  replace groups of the letter t with a T
+•  replace groups of the letter p with a P
+•  replace groups of the letter k with a K
+•  replace groups of the letter f with a F
+•  replace groups of the letter m with a M
+•  replace groups of the letter n with a N
+•  replace w3 with W3
+•  replace wy with Wy
+•  replace wh3 with Wh3
+•  replace why with Why
+•  replace w with 2
+•  replace an initial h with an A
+•  replace all other occurrences of h with a 2
+•  replace r3 with R3
+•  replace ry with Ry
+•  replace r with 2
+•  replace l3 with L3
+•  replace ly with Ly
+•  replace l with 2
+•  replace j with y
+•  replace y3 with Y3
+•  replace y with 2
+•  remove all 2s
+•  remove all 3s
+•  put six (v1) / ten (v2) 1s on the end
+•  take the first six characters as the code (caverphone 1);
+/ take the first ten characters as the code (caverphone 2);
+self new encode:'david'      -> 'TFT1111111'
+self new encode:'whittle'    -> 'WTA1111111'
+self new encode:'Stevenson'  -> 'STFNSN1111'
+self new encode:'Peter'      -> 'PTA1111111'
+self new encode:'washington' -> 'WSNKTN1111'
+self new encode:'lee'        -> 'LA11111111'
+self new encode:'Gutierrez'  -> 'KTRS111111'
+self new encode:'Pfister'    -> 'PFSTA11111'
+self new encode:'Jackson'    -> 'YKSN111111'
+self new encode:'Tymczak'    -> 'TMKSK11111'
+self new encode:'add'        -> 'AT11111111'
+self new encode:'aid'        -> 'AT11111111'
+self new encode:'at'         -> 'AT11111111'
+self new encode:'art'        -> 'AT11111111'
+self new encode:'earth'      -> 'AT11111111'
+self new encode:'head'       -> 'AT11111111'
+self new encode:'old'        -> 'AT11111111'
+self new encode:'ready'      -> 'RTA1111111'
+self new encode:'rather'     -> 'RTA1111111'
+self new encode:'able'       -> 'APA1111111'
+self new encode:'appear'     -> 'APA1111111'
+self new encode:'Deedee'     -> 'TTA1111111'
+"
+! !
+!PhoneticStringUtilities::Caverphone2StringComparator methodsFor:'api'!
+encode:word
+    "Answer the Caverphone 2 code of word, a string of exactly ten characters.
+     An empty word is encoded as ten 1s.
+     The rules are kept in a table of (pattern replacement repeat) triples,
+     which are applied strictly in table order:
+       - a pattern starting with ^ is matched at the beginning of the text only
+       - a pattern ending with $ is matched at the end of the text only
+       - any other pattern is matched anywhere in the text
+       - repeat tells whether all matches or only one match is replaced
+     The digits 2 and 3 are temporary placeholders (dropped consonant / vowel),
+     which are removed by the last rules."
+
+    |txt|
+
+    word size == 0 ifTrue:[^ '1111111111' ].
+    "/ 1. Convert to lowercase
+    txt := word asLowercase.
+    "/ 2. Remove anything not A-Z
+    "/ (an explicit range check - isLetter would let umlauts and accented
+    "/ letters through, which no rule below handles)
+    txt := txt select:[:ch | ch between:$a and:$z].
+    #(
+        "/  oldSeq newSeq repeat
+        "/ 2.5. Remove final e
+        'e$' '' false
+        "/ 3. Handle various start options
+        '^cough' 'cou2f' false
+        '^rough' 'rou2f' false
+        '^tough' 'tou2f' false
+        '^enough' 'enou2f' false
+        '^trough' 'trou2f' false
+        '^gn' '2n' false
+        'mb$' 'm2' false
+        "/ 4. Handle replacements
+        'cq' '2q' true
+        'ci' 'si' true
+        'ce' 'se' true
+        'cy' 'sy' true
+        'tch' '2ch' true
+        'c' 'k' true
+        'q' 'k' true
+        'x' 'k' true
+        'v' 'f' true
+        'dg' '2g' true
+        'tio' 'sio' true
+        'tia' 'sia' true
+        'd' 't' true
+        'ph' 'fh' true
+        'b' 'p' true
+        'sh' 's2' true
+        'z' 's' true
+        '^a' 'A' false
+        '^e' 'A' false
+        '^i' 'A' false
+        '^o' 'A' false
+        '^u' 'A' false
+        'a' '3' true
+        'e' '3' true
+        'i' '3' true
+        'o' '3' true
+        'u' '3' true
+        'j' 'y' true
+        '^y3' 'Y3' false
+        '^y' 'A' false
+        'y' '3'  true
+        '3gh3' '3kh3' true
+        'gh' '22' true
+        'g' 'k' true
+        's'  'S' true
+        'SS' 'S' true
+        't'  'T' true
+        'TT' 'T' true
+        'p'  'P' true
+        'PP' 'P' true
+        'k'  'K' true
+        'KK' 'K' true
+        'f'  'F' true
+        'FF' 'F' true
+        'm'  'M' true
+        'MM' 'M' true
+        'n'  'N' true
+        'NN' 'N' true
+        'w3' 'W3' true
+        'wh3' 'Wh3' true
+        'w$' '3'  false
+        'w' '2' true
+        '^h' 'A' false
+        'h' '2' true
+        'r3' 'R3' true
+        'r$' '3'  false
+        'r' '2' true
+        'l3' 'L3' true
+        'l$' '3' false
+        'l' '2' true
+        "/ 5. removals
+        '2' '' true
+        '3$' 'A' true
+        '3' '' true
+    ) inGroupsOf:3 do:[:pat :repl :repeat|
+        |s txtBefore|
+        "/ txtBefore is only needed by the trace code commented out below
+        txtBefore := txt.
+        (pat startsWith:$^) ifTrue:[
+            "/ anchored at the start: s is the pattern without the ^
+            s := pat copyButFirst.
+            repeat ifTrue:[
+                [txt startsWith:s] whileTrue:[ txt := repl,(txt copyButFirst:s size) ]
+            ] ifFalse:[
+                (txt startsWith:s) ifTrue:[ txt := repl,(txt copyButFirst:s size) ]
+            ].
+        ] ifFalse:[
+            (pat endsWith:$$) ifTrue:[
+                "/ anchored at the end: s is the pattern without the $
+                s := pat copyButLast.
+                repeat ifTrue:[
+                    [txt endsWith:s] whileTrue:[ txt := (txt copyButLast:s size),repl ]
+                ] ifFalse:[
+                    (txt endsWith:s) ifTrue:[ txt := (txt copyButLast:s size),repl ]
+                ]
+            ] ifFalse:[
+                "/ not anchored
+                repeat ifTrue:[
+                    txt := txt copyReplaceAllSubcollections:pat with:repl
+                ] ifFalse:[
+                    txt := txt copyReplaceSubcollection:pat with:repl
+                ]
+            ]
+        ].
+        "/ txt ~= txtBefore ifTrue:[
+        "/     Transcript showCR:(pat,' | ',repl,' -> ',txt).
+        "/ ].
+    ].
+    "/ 6. put ten 1s on the end
+    txt := txt,'1111111111'.
+    "/ 7. take the first ten characters as the code
+    ^ txt copyTo:10
+
+"
+self new encode:'david'      -> 'TFT1111111'
+self new encode:'whittle'    -> 'WTA1111111'
+self new encode:'Stevenson'  -> 'STFNSN1111'
+self new encode:'Peter'      -> 'PTA1111111'
+self new encode:'washington' -> 'WSNKTN1111'
+self new encode:'lee'        -> 'LA11111111'
+self new encode:'Gutierrez'  -> 'KTRS111111'
+self new encode:'Pfister'    -> 'PFSTA11111'
+self new encode:'Jackson'    -> 'YKSN111111'
+self new encode:'Tymczak'    -> 'TMKSK11111'
+self new encode:'add'        -> 'AT11111111'
+self new encode:'aid'        -> 'AT11111111'
+self new encode:'at'         -> 'AT11111111'
+self new encode:'art'        -> 'AT11111111'
+self new encode:'earth'      -> 'AT11111111'
+self new encode:'head'       -> 'AT11111111'
+self new encode:'old'        -> 'AT11111111'
+self new encode:'ready'      -> 'RTA1111111'
+self new encode:'rather'     -> 'RTA1111111'
+self new encode:'able'       -> 'APA1111111'
+self new encode:'appear'     -> 'APA1111111'
+self new encode:'Deedee'     -> 'TTA1111111'
+"
+"Created: / 28-07-2017 / 15:21:23 / cg"
+"Modified: / 02-08-2017 / 01:42:35 / cg"
+! !
 !PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator class methodsFor:'documentation'!
 documentation
 "
 The 'Kölner Phonetik' (cologne phonetic) code is for the german language
 self new encode:'Pfister'    -> 'P236'
 self new encode:'Jackson'    -> 'J250'
 self new encode:'Tymczak'    -> 'T522'
 notice:
-MiracodeStringComparator new
+MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
-encode:'Ashcraft' -> 'A261'
+SoundexStringComparator new encode:'Ashcraft'  -> 'A226'
-SoundexStringComparator
-new encode:'Ashcraft' -> 'A226'
 see also:
 https://www.archives.gov/research/census/soundex.html
 "
 ! !
-!PhoneticStringUtilities::MiracodeStringComparator methodsFor:'api'!
+!PhoneticStringUtilities::MiracodeStringComparator methodsFor:'private'!
 encode:word
+"same as inherited, but cares for W and H"
 |u p t prevCode|
 u := word asUppercase.
 p := u first asString.
 prevCode := self translate:u first.
 [ p size < 4 ] whileTrue:[
 p := p , '0'
 ].
 ^ (p copyFrom:1 to:4)
-"
+"Created: / 02-08-2017 / 00:19:47 / cg"
-self new encode:'washington' -> 'W252'
+"Modified (comment): / 02-08-2017 / 14:30:47 / cg"
-self new encode:'lee'        -> 'L000'
-self new encode:'Gutierrez'  -> 'G362'
-self new encode:'Pfister'    -> 'P236'
-self new encode:'Jackson'    -> 'J250'
-self new encode:'Tymczak'    -> 'T522'
-"
-"notice:
-MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
-self new encode:'Ashcraft'   -> 'A226'
-"
-"Created: / 28-07-2017 / 15:23:16 / cg"
-"Modified (comment): / 01-08-2017 / 19:01:51 / cg"
 ! !
 !PhoneticStringUtilities::SpanishPhoneticCodeStringComparator class methodsFor:'documentation'!
 documentation

changeset 4491	d6c31bb1e928
parent 4490	33b5fbfc4b5d
child 4495	5d2da4bddbda