--- a/PhoneticStringUtilities.st Tue Aug 01 19:32:27 2017 +0200
+++ b/PhoneticStringUtilities.st Wed Aug 02 14:37:29 2017 +0200
@@ -30,6 +30,22 @@
privateIn:PhoneticStringUtilities
!
+PhoneticStringUtilities::PhoneticStringComparator subclass:#DaitchMokotoffStringComparator
+ instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex
+ currentIndex skipCount'
+ classVariableNames:''
+ poolDictionaries:''
+ privateIn:PhoneticStringUtilities
+!
+
+PhoneticStringUtilities::PhoneticStringComparator subclass:#DoubleMetaphoneStringComparator
+ instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex
+ currentIndex skipCount'
+ classVariableNames:''
+ poolDictionaries:''
+ privateIn:PhoneticStringUtilities
+!
+
PhoneticStringUtilities::PhoneticStringComparator subclass:#ExtendedSoundexStringComparator
instanceVariableNames:''
classVariableNames:'CharacterTranslationDict'
@@ -51,6 +67,14 @@
privateIn:PhoneticStringUtilities
!
+PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#MetaphoneStringComparator
+ instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex
+ currentIndex skipCount'
+ classVariableNames:''
+ poolDictionaries:''
+ privateIn:PhoneticStringUtilities
+!
+
PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#SoundexStringComparator
instanceVariableNames:''
classVariableNames:'CharacterTranslationDict'
@@ -79,10 +103,9 @@
privateIn:PhoneticStringUtilities
!
-PhoneticStringUtilities::PhoneticStringComparator subclass:#DoubleMetaphoneStringComparator
- instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex
- currentIndex skipCount'
- classVariableNames:''
+PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#Caverphone2StringComparator
+ instanceVariableNames:''
+ classVariableNames:'CharacterTranslationDict'
poolDictionaries:''
privateIn:PhoneticStringUtilities
!
@@ -153,6 +176,15 @@
described in Georg Wilde and Carsten Meyer, 'Doppelgaenger gesucht - Ein Programm fuer kontextsensitive phonetische Textumwandlung'
from 'ct Magazin fuer Computer & Technik 25/1999'.
+ mra
+ Match Rating Approach: a phonetic algorithm developed by Western Airlines in 1977.
+
+ caverphone2
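+ the revised (2.0) Caverphone algorithm from the Caversham Project, University of Otago; intended as an improvement over soundex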
+
+ spanish phonetic code
+ an algorithm slightly adjusted to spanish names
+
More info for german readers is found in:
http://www.uni-koeln.de/phil-fak/phonetik/Lehre/MA-Arbeiten/magister_wilz.pdf
"
@@ -163,19 +195,33 @@
for the 50 most common german names, we get:
ext.
- name soundex soundex metaphone phonet phonet2 phonix daitsch phonem koeln
-
- müller M460 54600000 MLR MÜLA NILA M4000000 689000 MYLR 657
- schmidt S253 25300000 SKMTT SHMIT ZNIT S5300000 463000 CMYD 8628
- schneider S253 25360000 SKNTR SHNEIDA ZNEITA S5300000 463900 CNAYDR 8627
- fischer F260 12600000 FSKR FISHA FIZA F8000000 749000 VYCR 387
- weber W160 16000000 WBR WEBA FEBA $1000000 779000 VBR 317
- meyer M600 56000000 MYR MEIA NEIA M0000000 619000 MAYR 67
- wagner W256 25600000 WKNR WAKNA FAKNA $2500000 756900 VACNR 367
- schulz S242 24200000 SKLS SHULS ZULZ S4800000 484000 CULC 85
- becker B260 12600000 BKR BEKA BEKA B2000000 759000 BCR 147
- hoffmann H155 15500000 HFMN HOFMAN UFNAN $7550000 576600 OVMAN 036
- schäfer S216 21600000 SKFR SHEFA ZEFA S7000000 479000 CVR 837
+ name soundex soundex metaphone phonet phonet2 phonix daitsch phonem koeln caverphone2 mra
+
+ müller M460 54600000 MLR MÜLA NILA M4000000 689000 MYLR 657 MLA1111111 MLR
+ schmidt S530 25300000 SKMTT SHMIT ZNIT S5300000 463000 CMYD 862 SKMT111111 SCHMDT
+ schneider S536 25360000 SKNTR SHNEIDA ZNEITA S5300000 463900 CNAYDR 8627 SKNTA11111 SCHNDR
+ fischer F260 12600000 FSKR FISHA FIZA F8000000 749000 VYCR 387 FSKA111111 FSCHR
+ weber W160 16000000 WBR WEBA FEBA $1000000 779000 VBR 317 WPA1111111 WBR
+ meyer M600 56000000 MYR MEIA NEIA M0000000 619000 MAYR 67 MA11111111 MYR
+ wagner W256 25600000 WKNR WAKNA FAKNA $2500000 756900 VACNR 3467 WKNA111111 WGNR
+ schulz S420 24200000 SKLS SHULS ZULZ S4800000 484000 CULC 858 SKS1111111 SCHLZ
+ becker B260 12600000 BKR BEKA BEKA B2000000 759000 BCR 147 PKA1111111 BCKR
+ hoffmann H155 15500000 HFMN HOFMAN UFNAN $7550000 576600 OVMAN 036 AFMN111111 HFMN
+ schäfer S160 21600000 SKFR SHEFA ZEFA S7000000 479000 CVR 837 SKFA111111 SCHFR
+
+ |cls|
+
+ cls := MRAStringComparator.
+ cls := SoundexStringComparator.
+ cls := KoelnerPhoneticCodeStringComparator.
+ cls := Caverphone2StringComparator.
+ #('müller' 'schmidt' 'schneider' 'fischer' 'weber' 'meyer'
+ 'wagner' 'schulz' 'becker' 'hoffmann' 'schäfer')
+ do:[:name |
+ Transcript show:''''; show:name; show:''' -> '''; show:(cls encode:name); showCR:''''.
+ ].
+
+ KoelnerPhoneticCodeStringComparator encode:'Müller-Lüdenscheidt' -> '65752682'
"
! !
@@ -463,6 +509,22 @@
^ self == PhoneticStringUtilities::PhoneticStringComparator
! !
+!PhoneticStringUtilities::PhoneticStringComparator class methodsFor:'utilities'!
+
+encode:word
+ "answer the first of the phonetic strings which a new instance of the receiver generates for word"
+ ^ (self new phoneticStringsFor:word) first
+ "
+ SoundexStringComparator encode:'Fischer' -> 'F260'
+ Caverphone2StringComparator encode:'Fischer' -> 'FSKA111111'
+ KoelnerPhoneticCodeStringComparator encode:'Fischer' -> '387'
+ MRAStringComparator encode:'Fischer' -> 'FSCHR'
+ SpanishPhoneticCodeStringComparator encode:'Fischer' -> '24429'
+ "
+
+ "Created: / 02-08-2017 / 01:15:50 / cg"
+! !
+
!PhoneticStringUtilities::PhoneticStringComparator methodsFor:'api'!
does:aString soundLike:anotherString
@@ -516,1100 +578,591 @@
"/ super initialize. -- commented since inherited method does nothing
! !
-!PhoneticStringUtilities::ExtendedSoundexStringComparator class methodsFor:'documentation'!
-
-documentation
-"
- There are many extended and enhanced soundex variants around;
- here is one, called 'extended soundex'. It is destribed for example in
- http://www.epidata.dk/documentation.php.
- An author or origin is unknown.
-
- The number of digits is increased to 5 or 8;
- The first character is not used literally; instead it is encoded like the rest.
- This might have a negative effect on names starting with a vovel, though.
-
- Overall, it can be doubted if this is really an enhancement after all.
-"
-! !
-
-!PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'api'!
-
-phoneticStringsFor:aString
- "generates both an extended soundex of length 5 and one of length 8"
-
- |first second u t prevCode|
-
- u := aString asUppercase.
- first := second := ''.
- u do:[:c |
- t := self translate:c.
- (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
- first := first , t.
- second := second , t.
- second size == 8 ifTrue:[
- ^ Array with:(first copyTo:5) with:second
- ].
- ].
- prevCode := t
- ].
- [ first size < 5 ] whileTrue:[
- first := first , '0'.
- second := second , '0'.
- ].
- [ second size < 8 ] whileTrue:[
- second := second , '0'
- ].
- ^ Array with:first with:second
-
- "
- self basicNew phoneticStringsFor:'müller' #('87900' '87900000')
- self basicNew phoneticStringsFor:'miller' #('87900' '87900000')
- self basicNew phoneticStringsFor:'muller' #('87900' '87900000')
- self basicNew phoneticStringsFor:'muler' #('87900' '87900000')
- self basicNew phoneticStringsFor:'schmidt' #('38600' '38600000')
- self basicNew phoneticStringsFor:'schneider' #('38690' '38690000')
- self basicNew phoneticStringsFor:'fischer' #('23900' '23900000')
- self basicNew phoneticStringsFor:'weber' #('19000' '19000000')
- self basicNew phoneticStringsFor:'meyer' #('89000' '89000000')
- self basicNew phoneticStringsFor:'wagner' #('48900' '48900000')
- self basicNew phoneticStringsFor:'schulz' #('37500' '37500000')
- self basicNew phoneticStringsFor:'becker' #('13900' '13900000')
- self basicNew phoneticStringsFor:'hoffmann' #('28800' '28800000')
- self basicNew phoneticStringsFor:'schäfer' #('32900' '32900000')
- "
-! !
-
-!PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'private'!
-
-translate:aCharacter
- "use simple if's for more speed when compiled"
-
- "vowels serve as separators"
- aCharacter == $A ifTrue:[^ '0' ].
- aCharacter == $E ifTrue:[^ '0' ].
- aCharacter == $I ifTrue:[^ '0' ].
- aCharacter == $O ifTrue:[^ '0' ].
- aCharacter == $U ifTrue:[^ '0' ].
- aCharacter == $Y ifTrue:[^ '0' ].
-
- aCharacter == $B ifTrue:[^ '1' ].
- aCharacter == $P ifTrue:[^ '1' ].
-
- aCharacter == $F ifTrue:[^ '2' ].
- aCharacter == $V ifTrue:[^ '2' ].
-
- aCharacter == $C ifTrue:[^ '3' ].
- aCharacter == $S ifTrue:[^ '3' ].
- aCharacter == $K ifTrue:[^ '3' ].
-
- aCharacter == $G ifTrue:[^ '4' ].
- aCharacter == $J ifTrue:[^ '4' ].
-
- aCharacter == $Q ifTrue:[^ '5' ].
- aCharacter == $X ifTrue:[^ '5' ].
- aCharacter == $Z ifTrue:[^ '5' ].
-
- aCharacter == $D ifTrue:[^ '6' ].
- aCharacter == $G ifTrue:[^ '6' ].
- aCharacter == $T ifTrue:[^ '6' ].
-
- aCharacter == $L ifTrue:[^ '7' ].
-
- aCharacter == $M ifTrue:[^ '8' ].
- aCharacter == $N ifTrue:[^ '8' ].
-
- aCharacter == $R ifTrue:[^ '9' ].
- ^ nil
-! !
-
-!PhoneticStringUtilities::SingleResultPhoneticStringComparator class methodsFor:'documentation'!
-
-documentation
-"
- documentation to be added.
-
- [author:]
- cg
-
- [instance variables:]
-
- [class variables:]
-
- [see also:]
-
-"
-! !
-
-!PhoneticStringUtilities::SingleResultPhoneticStringComparator methodsFor:'api'!
-
-encode:word
- ^ self subclassResponsibility
-
- "Created: / 28-07-2017 / 15:20:49 / cg"
-!
-
-phoneticStringsFor:word
- ^ Array with:(self encode:word)
-
- "Created: / 28-07-2017 / 15:20:38 / cg"
-! !
-
-!PhoneticStringUtilities::MRAStringComparator class methodsFor:'documentation'!
-
-documentation
-"
- Match Rating Approach Encoder
-
- The Western Airlines matching rating approach name encoder
-
- [see also:]
- https://en.wikipedia.org/wiki/Match_Rating_Approach
-
- G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery,
- ''Accessing Individual Records from Personal Data Files Using Nonunique Identifiers''
- US National Institute of Standards and Technology, SP-500-2 (1977), p. 17.
-"
-!
-
-rCode
-"<<END
-## Copyright (c) 2015, James P. Howard, II <jh@jameshoward.us>
-##
-## Redistribution and use in source and binary forms, with or without
-## modification, are permitted provided that the following conditions are
-## met:
-##
-## Redistributions of source code must retain the above copyright
-## notice, this list of conditions and the following disclaimer.
-##
-## Redistributions in binary form must reproduce the above copyright
-## notice, this list of conditions and the following disclaimer in
-## the documentation and/or other materials provided with the
-## distribution.
-##
-## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-## DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-## THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#' @rdname mra
-#' @title Match Rating Approach Encoder
-#'
-#' @description
-#' The Western Airlines matching rating approach name encoder
-#'
-#' @param word string or vector of strings to encode
-#' @param x MRA-encoded character vector
-#' @param y MRA-encoded character vector
-#'
-#' @details
-#'
-#' The variable \code{word} is the name to be encoded. The variable
-#' \code{maxCodeLen} is \emph{not} supported in this algorithm encoder
-#' because the algorithm itself is dependent upon its six-character
-#' length. The variables \code{x} and \code{y} are MRA-encoded and are
-#' compared to each other using the MRA comparison specification.
-#'
-#' @return The \code{mra_encode} function returns match rating approach
-#' encoded character vector. The \code{mra_compare} returns a boolean
-#' vector which is \code{TRUE} if \code{x} and \code{y} pass the MRA
-#' comparison test.
-#'
-#' @references
-#'
-#' G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery,
-#' \emph{Accessing Individual Records from Personal Data Files Using
-#' Nonunique Identifiers,} US National Institute of Standards and
-#' Technology, SP-500-2 (1977), p. 17.
-#'
-#' @family phonics
-#'
-#' @examples
-#' mra_encode("William")
-#' mra_encode(c("Peter", "Peady"))
-#' mra_encode("Stevenson")
-
-#' @rdname mra
-#' @name mra_encode
-#' @export
-mra_encode <- function(word) {
-
- ## First, remove any nonalphabetical characters and uppercase it
- word <- gsub("[^[:alpha:]]*", "", word)
- word <- toupper(word)
-
- ## First character of key = first character of name
- first <- substr(word, 1, 1)
- word <- substr(word, 2, nchar(word))
-
- ## Delete vowels not at the start of the word
- word <- gsub("[AEIOU]", "", word)
- word <- paste(first, word, sep = "")
-
- ## Remove duplicate consecutive characters
- word <- gsub("([A-Z])\\1+", "\\1", word)
-
- ## If longer than 6 characters, take first and last 3...and we have
- ## to vectorize it
- for(i in 1:length(word)) {
- if((l = nchar(word[i])) > 6) {
- first <- substr(word[i], 1, 3)
- last <- substr(word[i], l - 2, l)
- word[i] <- paste(first, last, sep = "");
- }
- }
-
- return(word)
-}
-
-#' @rdname mra
-#' @name mra_compare
-#' @export
-mra_compare <- function(x, y) {
- mra <- data.frame(x = x, y = y, sim = 0, min = 100, stringsAsFactors = FALSE)
-
- ## Obtain the minimum rating value by calculating the length sum of
- ## the encoded strings and using table A (from Wikipedia). We start
- ## by setting the minimum to be the sum and move from there.
- mra$lensum <- nchar(mra$x) + nchar(mra$y)
- mra$min[mra$lensum == 12] <- 2
- mra$min[mra$lensum > 7 && mra$lensum <= 11] <- 3
- mra$min[mra$lensum > 4 && mra$lensum <= 7] <- 4
- mra$min[mra$lensum <= 4] <- 5
-
- ## If the length difference between the encoded strings is 3 or
- ## greater, then no similarity comparison is done. For us, we
- ## continue the similarity comparison out of laziness and ensure the
- ## minimum is impossibly high to meet.
- mra$min[abs(nchar(mra$x) - nchar(mra$y)) >= 3] <- 100
-
- ## Start the comparison.
- x <- strsplit(mra$x, split = "")
- y <- strsplit(mra$y, split = "")
- rows <- nrow(mra)
- for(i in 1:rows) {
- ## Process the encoded strings from left to right and remove any
- ## identical characters found from both strings respectively.
- j <- 1
- while(j < min(length(x[[i]]), length(y[[i]]))) {
- if(x[[i]][j] == y[[i]][j]) {
- x[[i]] <- x[[i]][-j]
- y[[i]] <- y[[i]][-j]
- } else
- j <- j + 1
- }
-
- ## Process the unmatched characters from right to left and
- ## remove any identical characters found from both names
- ## respectively.
- x[[i]] <- rev(x[[i]])
- y[[i]] <- rev(y[[i]])
- j <- 1
- while(j < min(length(x[[i]]), length(y[[i]]))) {
- if(x[[i]][j] == y[[i]][j]) {
- x[[i]] <- x[[i]][-j]
- y[[i]] <- y[[i]][-j]
- } else
- j <- j + 1
- }
- ## Subtract the number of unmatched characters from 6 in the
- ## longer string. This is the similarity rating.
- len <- min(length(x[[i]]), length(y[[i]]))
- mra$sim[i] <- 6 - len
- }
-
- ## If the similarity is greater than or equal to the minimum
- ## required, it is a successful match.
- mra$match <- (mra$sim >= mra$min)
- return(mra$match)
-}
-
-END>>
-! !
-
-!PhoneticStringUtilities::MRAStringComparator methodsFor:'api'!
-
-encode:wordIn
- "see https://en.wikipedia.org/wiki/Match_Rating_Approach"
-
- |word prev|
-
- word := wordIn.
-
- "/ First, remove any nonalphabetical characters and uppercase it
-
- word := word select:#isLetter thenCollect:#asUppercase.
-
- "/ Delete vowels not at the start of the word
-
- word := word first asString , ((word from:2) reject:#isVowel).
-
- "/ Remove duplicate consecutive characters
-
- prev := nil.
- word := word
- collect:[:char |
- char == prev ifTrue:[
- $*
- ] ifFalse:[
- prev := char.
- char.
- ].
- ]
- thenSelect:[:char | char ~~ $*].
-
- "/ If longer than 6 characters, take first and last 3
- word size > 6 ifTrue:[
- word := (word copyFirst:3),(word copyLast:3)
- ].
- ^ word.
-
- "
- self new encode:'Catherine' -> 'CTHRN'
- self new encode:'CatherineCatherine' -> 'CTHHRN'
- self new encode:'Butter' -> 'BTR'
- self new encode:'Byrne' -> 'BYRN'
- self new encode:'Boern' -> 'BRN'
- self new encode:'Smith' -> 'SMTH'
- self new encode:'Smyth' -> 'SMYTH'
- self new encode:'Kathryn' -> 'KTHRYN'
- "
-
- "Created: / 28-07-2017 / 15:19:22 / cg"
- "Modified (comment): / 31-07-2017 / 15:14:31 / cg"
-! !
-
-!PhoneticStringUtilities::SoundexStringComparator class methodsFor:'documentation'!
-
-documentation
-"
- WARNING: this is the so called 'simplified soundex' algorithm;
- there are more variants like miracode (american soundex) or
- mysqlSoundex around.
-
- Be sure to use the correct algorithm, if the generated strings must be compatible
- (otherwise, the differences are probably too small to be noticed as effect, but
- your search will be different)
-
- The following was copied from http://www.civilsolutions.com.au/publications/dedup.htm
-
- SOUNDEX is a phonetic coding algorithm that ignores many of the unreliable
- components of names, but by doing so reports more matches.
-
- There are some variations around in the literature;
- the following is called 'simplified soundex', and the rules for coding a name are:
-
- 1. The first letter of the name is used in its un-coded form to serve as the prefix
- character of the code. (The rest of the code is numerical).
-
- 2. Thereafter, W and H are ignored entirely.
-
- 3. A, E, I, 0, U, Y are not assigned a code number, but do serve as 'separators' (see Step 5).
-
- 4. Other letters of the name are converted to a numerical equivalent:
- B, P, F, V 1
- C, G, J, K, Q, S, X, Z 2
- D, T 3
- L 4
- M, N 5
- R 6
-
- 5. There are two exceptions:
- 1. Letters that follow prefix letters which would, if coded, have the same
- numerical code, are ignored in all cases unless a ''separator'' (see Step 3) precedes them.
-
- 2. The second letter of any pair of consonants having the same code number is likewise ignored,
- i.e. unless there is a ''separator'' between them in the name.
-
- 6. The final SOUNDEX code consists of the prefix letter plus three numerical characters.
- Longer codes are truncated to this length, and shorter codes are extended to it by adding zeros.
-
- Notice, that in another variant, w and h are treated slightly differently.
- This is only of relevance, if you need to reconstruct original soundex codes of other programs
- or for the original 1880 us census data.
-
- Also notice, that soundex deals better with english.
- For german and other languages, other algorithms may provide better results.
-"
-! !
-
-!PhoneticStringUtilities::SoundexStringComparator methodsFor:'api'!
-
-encode:word
- |u p t prevCode|
-
- u := word asUppercase.
- p := u first asString.
- prevCode := self translate:u first.
- u from:2 to:u size do:[:c |
- t := self translate:c.
- (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
- p := p , t.
- p size == 4 ifTrue:[^ p ].
- ].
- prevCode := t
- ].
- [ p size < 4 ] whileTrue:[
- p := p , '0'
- ].
- ^ (p copyFrom:1 to:4)
-
- "
- self new encode:'washington' -> 'W252'
- self new encode:'lee' -> 'L000'
- self new encode:'Gutierrez' -> 'G362'
- self new encode:'Pfister' -> 'P236'
- self new encode:'Jackson' -> 'J250'
- self new encode:'Tymczak' -> 'T522'
- "
-
- "notice:
- MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
- self new encode:'Ashcraft' -> 'A226'
- "
-
- "Created: / 28-07-2017 / 15:21:23 / cg"
- "Modified (comment): / 01-08-2017 / 19:01:43 / cg"
-! !
-
-!PhoneticStringUtilities::SoundexStringComparator methodsFor:'private'!
-
-translate:aCharacter
- "use simple if's for more speed when compiled"
-
- "vowels serve as separators"
- aCharacter == $A ifTrue:[^ '0' ].
- aCharacter == $E ifTrue:[^ '0' ].
- aCharacter == $I ifTrue:[^ '0' ].
- aCharacter == $O ifTrue:[^ '0' ].
- aCharacter == $U ifTrue:[^ '0' ].
- aCharacter == $Y ifTrue:[^ '0' ].
-
- aCharacter == $B ifTrue:[^ '1' ].
- aCharacter == $P ifTrue:[^ '1' ].
- aCharacter == $F ifTrue:[^ '1' ].
- aCharacter == $V ifTrue:[^ '1' ].
-
- aCharacter == $C ifTrue:[^ '2' ].
- aCharacter == $S ifTrue:[^ '2' ].
- aCharacter == $K ifTrue:[^ '2' ].
- aCharacter == $G ifTrue:[^ '2' ].
- aCharacter == $J ifTrue:[^ '2' ].
- aCharacter == $Q ifTrue:[^ '2' ].
- aCharacter == $X ifTrue:[^ '2' ].
- aCharacter == $Z ifTrue:[^ '2' ].
-
- aCharacter == $D ifTrue:[^ '3' ].
- aCharacter == $T ifTrue:[^ '3' ].
-
- aCharacter == $L ifTrue:[^ '4' ].
-
- aCharacter == $M ifTrue:[^ '5' ].
- aCharacter == $N ifTrue:[^ '5' ].
-
- aCharacter == $R ifTrue:[^ '6' ].
- ^ nil
-! !
-
-!PhoneticStringUtilities::MySQLSoundexStringComparator class methodsFor:'documentation'!
-
-documentation
-"
- MySQL soundex is like american Soundex (i.e. miracode) without the 4 character limitation,
- and also removing vokals first, then removing duplicate codes
- (whereas the soundex code does this in reverse order).
-
- These variations are important, if you need the miracode soundex codes to be generated.
-"
-! !
-
-!PhoneticStringUtilities::MySQLSoundexStringComparator methodsFor:'api'!
-
-encode:word
- |u p t prevCode|
-
- u := word asUppercase.
- p := u first asString.
- prevCode := self translate:u first.
- u from:2 to:u size do:[:c |
- t := self translate:c.
- (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
- p := p , t.
- ].
- (t ~= '0' and:[ c ~= $W and:[c ~= $H]]) ifTrue:[
- prevCode := t.
- ].
- ].
- [ p size < 4 ] whileTrue:[
- p := p , '0'
- ].
- ^ p
-
- "Created: / 28-07-2017 / 15:23:41 / cg"
- "Modified: / 31-07-2017 / 17:53:51 / cg"
-! !
-
-!PhoneticStringUtilities::NYSIISStringComparator class methodsFor:'documentation'!
+!PhoneticStringUtilities::DaitchMokotoffStringComparator class methodsFor:'documentation'!
documentation
"
- NYSIIS Algorithm:
-
- 1.
- remove all ''S'' and ''Z'' chars from the end of the surname
-
- 2.
- transcode initial strings
- MAC => MC
- PF => F
-
- 3.
- Transcode trailing strings as follows,
-
- IX => IC
- EX => EC
- YE,EE,IE => Y
- NT,ND => D
-
- 4.
- transcode ''EV'' to ''EF'' if not at start of name
-
- 5.
- use first character of name as first character of key
-
- 6.
- remove any ''W'' that follows a vowel
-
- 7.
- replace all vowels with ''A''
-
- 8.
- transcode ''GHT'' to ''GT''
-
- 9.
- transcode ''DG'' to ''G''
-
- 10.
- transcode ''PH'' to ''F''
-
- 11.
- if not first character, eliminate all ''H'' preceded or followed by a vowel
-
- 12.
- change ''KN'' to ''N'', else ''K'' to ''C''
-
- 13.
- if not first character, change ''M'' to ''N''
-
- 14.
- if not first character, change ''Q'' to ''G''
-
- 15.
- transcode ''SH'' to ''S''
-
- 16.
- transcode ''SCH'' to ''S''
-
- 17.
- transcode ''YW'' to ''Y''
-
- 18.
- if not first or last character, change ''Y'' to ''A''
-
- 19.
- transcode ''WR'' to ''R''
-
- 20.
- if not first character, change ''Z'' to ''S''
-
- 21.
- transcode terminal ''AY'' to ''Y''
-
- 22.
- remove traling vowels
-
- 23.
- collapse all strings of repeated characters
-
- 24.
- if first char of original surname was a vowel, append it to the code
+ self encode:'AUERBACH' -> 097400, 097500
+
+ Encodes a string into a Daitch-Mokotoff Soundex value.
+ The Daitch-Mokotoff Soundex algorithm is a refinement of the Russell and American Soundex algorithms,
+ yielding greater accuracy in matching especially Slavic and Yiddish surnames with similar pronunciation
+ but differences in spelling.
+
+ The main differences compared to the other soundex variants are:
+ - coded names are 6 digits long
+ - the initial character of the name is coded
+ - rules to encode multi-character n-grams
+ - multiple possible encodings for the same name (branching)
+
+ This implementation supports branching, depending on the used method:
+ encode:aString - branching disabled, only the first code will be returned
+ phoneticStringsFor:aString - branching enabled, all codes will be returned, separated by '|'
+
+ [see also:]
+ 'Wikipedia - Daitch-Mokotoff Soundex'
+ http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex
+
+ 'Avotaynu - Soundexing and Genealogy'
+ http://www.avotaynu.com/soundex.htm
"
-! !
-
-!PhoneticStringUtilities::NYSIISStringComparator methodsFor:'api'!
-
-encode:aString
- |k|
-
- k := self rule1:(aString asUppercase).
- k := self rule2:k.
- k := self rule3:k.
- k := self rule4:k.
- k := self rule5:k.
- k := self rule6:k.
- k := self rule7:k.
- k := self rule8:k.
- k := self rule9:k.
- k := self rule10:k.
- k := self rule11:k.
- k := self rule12:k.
- k := self rule13:k.
- k := self rule14:k.
- k := self rule15:k.
- k := self rule16:k.
- k := self rule17:k.
- k := self rule18:k.
- k := self rule19:k.
- k := self rule20:k.
- k := self rule21:k.
- k := self rule22:k.
- k := self rule23:k.
- k := self rule24:k originalKey:aString.
- ^ k
-
- "
- self new encode:'hello'
- self new encode:'bliss'
- "
- "
- self new phoneticStringsFor:'hello'
- self new phoneticStringsFor:'bliss'
- "
-
- "Created: / 28-07-2017 / 15:34:52 / cg"
-! !
-
-!PhoneticStringUtilities::NYSIISStringComparator methodsFor:'private'!
-
-rule10:key
- "10. transcode 'PH' to 'F' "
-
- ^ self
- transcodeAll:'PH'
- of:key
- to:'F'
- startingAt:1
-!
-
-rule11:key
- |k c|
-
- "11. if not first character, eliminate all 'H' preceded or followed by a vowel "
- k := key copy.
- c := SortedCollection sortBlock:[:a :b | b < a ].
- 2 to:key size do:[:i |
- (key at:i) = $H ifTrue:[
- ((key at:i - 1) isVowel
- or:[ (i < key size) and:[ (key at:i + 1) isVowel ] ]) ifTrue:[ c add:i ]
- ]
- ].
- c do:[:n |
- k := (k copyFrom:1 to:n - 1) , (k copyFrom:n + 1 to:k size)
- ].
- ^ k
-!
-
-rule12:key
- |k|
-
- "12. change 'KN' to 'N', else 'K' to 'C' "
- k := self
- transcodeAll:'KN'
- of:key
- to:'K'
- startingAt:1.
- k := self
- transcodeAll:'K'
- of:k
- to:'C'
- startingAt:1.
- ^ k
-!
-
-rule13:key
- "13. if not first character, change 'M' to 'N' "
-
- ^ self
- transcodeAll:'M'
- of:key
- to:'N'
- startingAt:2
-!
-
-rule14:key
- "14. if not first character, change 'Q' to 'G' "
-
- ^ self
- transcodeAll:'Q'
- of:key
- to:'G'
- startingAt:2
-!
-
-rule15:key
- "15. transcode 'SH' to 'S' "
-
- ^ self
- transcodeAll:'SH'
- of:key
- to:'S'
- startingAt:1
-!
-
-rule16:key
- "16. transcode 'SCH' to 'S' "
-
- ^ self
- transcodeAll:'SCH'
- of:key
- to:'S'
- startingAt:1
-!
-
-rule17:key
- "17. transcode 'YW' to 'Y' "
-
- ^ self
- transcodeAll:'YW'
- of:key
- to:'Y'
- startingAt:1
-!
-
-rule18:key
- |k|
-
- "18. if not first or last character, change 'Y' to 'A' "
- k := self
- transcodeAll:'Y'
- of:key
- to:'A'
- startingAt:2.
- key last = $Y ifTrue:[
- k at:k size put:$Y
- ].
- ^ k
-!
-
-rule19:key
- "19. transcode 'WR' to 'R' "
-
- ^ self
- transcodeAll:'WR'
- of:key
- to:'R'
- startingAt:1
-!
-
-rule1:key
- |k|
-
- k := key copy.
- "1. Remove all 'S' and 'Z' chars from the end of the name"
- [
- 'SZ' includes:k last
- ] whileTrue:[ k := k copyFrom:1 to:(k size - 1) ].
- ^ k
-!
-
-rule20:key
- "20. if not first character, change 'Z' to 'S' "
-
- ^ self
- transcodeAll:'Z'
- of:key
- to:'S'
- startingAt:2
!
-rule21:key
- "21. transcode terminal 'AY' to 'Y' "
-
- ^ self
- transcodeAll:'AY'
- of:key
- to:'Y'
- startingAt:key size - 1
-!
-
-rule22:key
- |k|
-
- "22. remove trailing vowels "
- k := key copy.
- [ k last isVowel ] whileTrue:[
- k := k copyFrom:1 to:k size - 1
- ].
- ^ k
-!
-
-rule23:key
- |k c|
-
- "23. collapse all strings of repeated characters "
- k := key copy.
- c := SortedCollection sortBlock:[:a :b | b < a ].
- k size to:2 do:[:i |
- (k at:i) = (k at:i - 1) ifTrue:[
- c add:i
- ]
- ].
- c do:[:n |
- k := (k copyFrom:1 to:n - 1) , (k copyFrom:n + 1 to:k size)
- ].
- ^ k
-!
-
-rule24:key originalKey:originalKey
- |k|
-
- "24. if first char of original surname was a vowel, append it to the code"
- k := key copy.
- originalKey first isVowel ifTrue:[
- k := k , originalKey first asString asUppercase
- ].
- ^ k
-!
-
-rule2:key
- |k|
-
- k := key copy.
- "2. Transcode initial strings: MAC => MC PF => F"
- (k startsWith:'MAC') ifTrue:[
- k := 'MC' , (k copyFrom:4)
- ].
- (k startsWith:'PF') ifTrue:[
- k := 'F' , (k copyFrom:3)
- ].
- ^ k
-!
-
-rule3:key
- |k|
-
- "3. Transcode trailing strings as follows:
- IX => IC
- EX => EC
- YE, EE, IE => Y
- NT, ND => D"
- k := key copy.
- k := self
- transcodeTrailing:#( 'IX' )
- of:k
- to:'IC'.
- k := self
- transcodeTrailing:#( 'EX' )
- of:k
- to:'EC'.
- k := self
- transcodeTrailing:#( 'YE' 'EE' 'IE' )
- of:k
- to:'Y'.
- k := self
- transcodeTrailing:#( 'NT' 'ND' )
- of:k
- to:'D'.
- ^ k
-!
-
-rule4:key
- "4. Transcode 'EV' to 'EF' if not at start of name"
-
- ^ self
- transcodeAll:'EV'
- of:key
- to:'EF'
- startingAt:2
-!
-
-rule5:key
- "5. Use first character of name as first character of key. Ignored because we're doing an in-place conversion"
-
- ^ key
-!
-
-rule6:key
- |k i|
-
- "6. Remove any 'W' that follows a vowel"
- k := key copy.
- i := 2.
- [
- (i := k indexOf:$W startingAt:i) > 0
- ] whileTrue:[
- (k at:i - 1) isVowel ifTrue:[
- k := (k copyFrom:1 to:i - 1) , (k copyFrom:i + 1 to:k size).
- i := i - 1
- ]
- ].
- ^ k
-!
-
-rule7:key
- |k|
-
- "7. replace all vowels with 'A' "
- k := key copy.
- 1 to:key size do:[:i |
- (key at:i) isVowel ifTrue:[
- k at:i put:$A
- ]
- ].
- ^ k
-!
-
-rule8:key
- "8. transcode 'GHT' to 'GT' "
-
- ^ self
- transcodeAll:'GHT'
- of:key
- to:'GT'
- startingAt:1
-!
-
-rule9:key
- "9. transcode 'DG' to 'G' "
-
- ^ self
- transcodeAll:'DG'
- of:key
- to:'G'
- startingAt:1
-!
-
-transcodeAll:aString of:key to:replacementString startingAt:start
- |k i|
-
- k := key copy.
- [
- (i := k indexOfSubCollection:aString startingAt:start) > 0
- ] whileTrue:[
- k := (k copyFrom:1 to:i - 1) , replacementString
- , (k copyFrom:i + aString size to:k size)
- ].
- ^ k
-!
-
-transcodeTrailing:anArrayOfStrings of:key to:replacementString
- |answer|
-
- answer := key copy.
- anArrayOfStrings do:[:aString |
- answer := self
- transcodeAll:aString
- of:answer
- to:replacementString
- startingAt:(answer size - aString size) + 1
- ].
- ^ answer
-! !
-
-!PhoneticStringUtilities::PhonemStringComparator class methodsFor:'documentation'!
-
-documentation
-"
- Implementation of the PHONEM algorithm, as described in
- 'Georg Wilde and Carsten Meyer, Doppelgaenger gesucht -
- Ein Programm fuer kontextsensitive phonetische Textumwandlung
- ct Magazin fuer Computer & Technik 25/1998'
-
- This algorithm deals better with the german language (it cares for umlauts)
-"
-! !
-
-!PhoneticStringUtilities::PhonemStringComparator methodsFor:'api'!
-
-encode:aString
- |s idx t t2|
-
- s := aString asUppercase.
-
- idx := 1.
- [idx < (s size-1)] whileTrue:[
- t2 := nil.
- t := s copyFrom:idx to:idx+1.
- t = 'SC' ifTrue:[ t2 := 'C' ]
- ifFalse:[ t = 'SZ' ifTrue:[ t2 := 'C' ]
- ifFalse:[ t = 'CZ' ifTrue:[ t2 := 'C' ]
- ifFalse:[ t = 'TZ' ifTrue:[ t2 := 'C' ]
- ifFalse:[ t = 'TS' ifTrue:[ t2 := 'C' ]
- ifFalse:[ t = 'KS' ifTrue:[ t2 := 'X' ]
- ifFalse:[ t = 'PF' ifTrue:[ t2 := 'V' ]
- ifFalse:[ t = 'QU' ifTrue:[ t2 := 'KW' ]
- ifFalse:[ t = 'PH' ifTrue:[ t2 := 'V' ]
- ifFalse:[ t = 'UE' ifTrue:[ t2 := 'Y' ]
- ifFalse:[ t = 'AE' ifTrue:[ t2 := 'E' ]
- ifFalse:[ t = 'OE' ifTrue:[ t2 := 'Ö' ]
- ifFalse:[ t = 'EI' ifTrue:[ t2 := 'AY' ]
- ifFalse:[ t = 'EY' ifTrue:[ t2 := 'AY' ]
- ifFalse:[ t = 'EU' ifTrue:[ t2 := 'OY' ]
- ifFalse:[ t = 'AU' ifTrue:[ t2 := 'A§' ]
- ifFalse:[ t = 'OU' ifTrue:[ t2 := '§ ' ]]]]]]]]]]]]]]]]].
- t2 notNil ifTrue:[
- s := (s copyTo:idx-1),t2,(s copyFrom:idx+2)
- ] ifFalse:[
- idx := idx + 1.
- ].
- ].
-
- "/ single character substitutions via tr
- s := s copyTransliterating:'ÖÄZKGQÜIJFWPT§' to:'YECCCCYYYVVDDUA'.
- s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'' complement:true squashDuplicates:false.
- s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'ABCDLMNORSUVWXY' complement:false squashDuplicates:true.
- ^ s
-
- "
- self basicNew encode:'müller' -> 'MYLR'
- self basicNew encode:'mueller' -> 'MYLR'
- self basicNew encode:'möller' -> 'MYLR'
- self basicNew encode:'miller' -> 'MYLR'
- self basicNew encode:'muller' -> 'MULR'
- self basicNew encode:'muler' -> 'MULR'
-
- self basicNew phoneticStringsFor:'müller' #('MYLR')
- self basicNew phoneticStringsFor:'mueller' #('MYLR')
- self basicNew phoneticStringsFor:'möller' #('MYLR')
- self basicNew phoneticStringsFor:'miller' #('MYLR')
- self basicNew phoneticStringsFor:'muller' #('MULR')
- self basicNew phoneticStringsFor:'muler' #('MULR')
-
- self basicNew phoneticStringsFor:'schmidt' #('CMYD')
- self basicNew phoneticStringsFor:'schneider' #('CNAYDR')
- self basicNew phoneticStringsFor:'fischer' #('VYCR')
- self basicNew phoneticStringsFor:'weber' #('VBR')
- self basicNew phoneticStringsFor:'weeber' #('VBR')
- self basicNew phoneticStringsFor:'webber' #('VBR')
- self basicNew phoneticStringsFor:'wepper' #('VBR')
-
- self basicNew phoneticStringsFor:'meyer' #('MAYR')
- self basicNew phoneticStringsFor:'maier' #('MAYR')
- self basicNew phoneticStringsFor:'mayer' #('MAYR')
- self basicNew phoneticStringsFor:'mayr' #('MAYR')
- self basicNew phoneticStringsFor:'meir' #('MAYR')
-
- self basicNew phoneticStringsFor:'wagner' #('VACNR')
- self basicNew phoneticStringsFor:'schulz' #('CULC')
- self basicNew phoneticStringsFor:'becker' #('BCR')
- self basicNew phoneticStringsFor:'hoffmann' #('OVMAN')
- self basicNew phoneticStringsFor:'haus' #('AUS')
-
- self basicNew phoneticStringsFor:'schäfer' #('CVR')
- self basicNew phoneticStringsFor:'scheffer' #('CVR')
- self basicNew phoneticStringsFor:'schaeffer' #('CVR')
- self basicNew phoneticStringsFor:'schaefer' #('CVR')
- "
-
- "Created: / 28-07-2017 / 15:38:08 / cg"
+javaCode
+"<<END
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.codec.language;
+
+import org.apache.commons.codec.CharEncoding;
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoder;
+
+import java.io.InputStream;
+import java.util.*;
+
+/**
+ * Encodes a string into a Daitch-Mokotoff Soundex value.
+ * <p>
+ * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
+ * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
+ * </p>
+ * <p>
+ * The main differences compared to the other soundex variants are:
+ * </p>
+ * <ul>
+ * <li>coded names are 6 digits long
+ * <li>the initial character of the name is coded
+ * <li>rules to encoded multi-character n-grams
+ * <li>multiple possible encodings for the same name (branching)
+ * </ul>
+ * <p>
+ * This implementation supports branching, depending on the used method:
+ * <ul>
+ * <li>{@link #encode(String)} - branching disabled, only the first code will be returned
+ * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
+ * </ul>
+ * <p>
+ * Note: this implementation has additional branching rules compared to the original description of the algorithm. The
+ * rules can be customized by overriding the default rules contained in the resource file
+ * {@code org/apache/commons/codec/language/dmrules.txt}.
+ * </p>
+ * <p>
+ * This class is thread-safe.
+ * </p>
+ *
+ * @see Soundex
+ * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
+ * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
+ *
+ * @version $Id$
+ * @since 1.10
+ */
+public class DaitchMokotoffSoundex implements StringEncoder {
+
+ /**
+ * Inner class representing a branch during DM soundex encoding.
+ */
+ private static final class Branch {
+ private final StringBuilder builder;
+ private String cachedString;
+ private String lastReplacement;
+
+ private Branch() {
+ builder = new StringBuilder();
+ lastReplacement = null;
+ cachedString = null;
+ }
+
+ /**
+ * Creates a new branch, identical to this branch.
+ *
+ * @return a new, identical branch
+ */
+ public Branch createBranch() {
+ final Branch branch = new Branch();
+ branch.builder.append(toString());
+ branch.lastReplacement = this.lastReplacement;
+ return branch;
+ }
+
+ @Override
+ public boolean equals(final Object other) {
+ if (this == other) {
+ return true;
+ }
+ if (!!(other instanceof Branch)) {
+ return false;
+ }
+
+ return toString().equals(((Branch) other).toString());
+ }
+
+ /**
+ * Finish this branch by appending '0's until the maximum code length has been reached.
+ */
+ public void finish() {
+ while (builder.length() < MAX_LENGTH) {
+ builder.append('0');
+ cachedString = null;
+ }
+ }
+
+ @Override
+ public int hashCode() {
+ return toString().hashCode();
+ }
+
+ /**
+ * Process the next replacement to be added to this branch.
+ *
+ * @param replacement
+ * the next replacement to append
+ * @param forceAppend
+ * indicates if the default processing shall be overridden
+ */
+ public void processNextReplacement(final String replacement, final boolean forceAppend) {
+ final boolean append = lastReplacement == null || !!lastReplacement.endsWith(replacement) || forceAppend;
+
+ if (append && builder.length() < MAX_LENGTH) {
+ builder.append(replacement);
+ // remove all characters after the maximum length
+ if (builder.length() > MAX_LENGTH) {
+ builder.delete(MAX_LENGTH, builder.length());
+ }
+ cachedString = null;
+ }
+
+ lastReplacement = replacement;
+ }
+
+ @Override
+ public String toString() {
+ if (cachedString == null) {
+ cachedString = builder.toString();
+ }
+ return cachedString;
+ }
+ }
+
+ /**
+ * Inner class for storing rules.
+ */
+ private static final class Rule {
+ private final String pattern;
+ private final String[] replacementAtStart;
+ private final String[] replacementBeforeVowel;
+ private final String[] replacementDefault;
+
+ protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
+ final String replacementDefault) {
+ this.pattern = pattern;
+ this.replacementAtStart = replacementAtStart.split("\\|");
+ this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
+ this.replacementDefault = replacementDefault.split("\\|");
+ }
+
+ public int getPatternLength() {
+ return pattern.length();
+ }
+
+ public String[] getReplacements(final String context, final boolean atStart) {
+ if (atStart) {
+ return replacementAtStart;
+ }
+
+ final int nextIndex = getPatternLength();
+ final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false;
+ if (nextCharIsVowel) {
+ return replacementBeforeVowel;
+ }
+
+ return replacementDefault;
+ }
+
+ private boolean isVowel(final char ch) {
+ return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
+ }
+
+ public boolean matches(final String context) {
+ return context.startsWith(pattern);
+ }
+
+ @Override
+ public String toString() {
+ return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
+ Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
+ }
+ }
+
+ private static final String COMMENT = "//";
+ private static final String DOUBLE_QUOTE = "\"";
+
+ private static final String MULTILINE_COMMENT_END = "*/";
+
+ private static final String MULTILINE_COMMENT_START = "/*";
+
+ /** The resource file containing the replacement and folding rules */
+ private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";
+
+ /** The code length of a DM soundex value. */
+ private static final int MAX_LENGTH = 6;
+
+ /** Transformation rules indexed by the first character of their pattern. */
+ private static final Map<Character, List<Rule>> RULES = new HashMap<Character, List<Rule>>();
+
+ /** Folding rules. */
+ private static final Map<Character, Character> FOLDINGS = new HashMap<Character, Character>();
+
+ static {
+ final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE);
+ if (rulesIS == null) {
+ throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE);
+ }
+
+ final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8);
+ parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
+ scanner.close();
+
+ // sort RULES by pattern length in descending order
+ for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) {
+ final List<Rule> ruleList = rule.getValue();
+ Collections.sort(ruleList, new Comparator<Rule>() {
+ @Override
+ public int compare(final Rule rule1, final Rule rule2) {
+ return rule2.getPatternLength() - rule1.getPatternLength();
+ }
+ });
+ }
+ }
+
+ private static void parseRules(final Scanner scanner, final String location,
+ final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
+ int currentLine = 0;
+ boolean inMultilineComment = false;
+
+ while (scanner.hasNextLine()) {
+ currentLine++;
+ final String rawLine = scanner.nextLine();
+ String line = rawLine;
+
+ if (inMultilineComment) {
+ if (line.endsWith(MULTILINE_COMMENT_END)) {
+ inMultilineComment = false;
+ }
+ continue;
+ }
+
+ if (line.startsWith(MULTILINE_COMMENT_START)) {
+ inMultilineComment = true;
+ } else {
+ // discard comments
+ final int cmtI = line.indexOf(COMMENT);
+ if (cmtI >= 0) {
+ line = line.substring(0, cmtI);
+ }
+
+ // trim leading-trailing whitespace
+ line = line.trim();
+
+ if (line.length() == 0) {
+ continue; // empty lines can be safely skipped
+ }
+
+ if (line.contains("=")) {
+ // folding
+ final String[] parts = line.split("=");
+ if (parts.length !!= 2) {
+ throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
+ " parts: " + rawLine + " in " + location);
+ } else {
+ final String leftCharacter = parts[0];
+ final String rightCharacter = parts[1];
+
+ if (leftCharacter.length() !!= 1 || rightCharacter.length() !!= 1) {
+ throw new IllegalArgumentException("Malformed folding statement - " +
+ "patterns are not single characters: " + rawLine + " in " + location);
+ }
+
+ asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
+ }
+ } else {
+ // rule
+ final String[] parts = line.split("\\s+");
+ if (parts.length !!= 4) {
+ throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
+ " parts: " + rawLine + " in " + location);
+ } else {
+ try {
+ final String pattern = stripQuotes(parts[0]);
+ final String replacement1 = stripQuotes(parts[1]);
+ final String replacement2 = stripQuotes(parts[2]);
+ final String replacement3 = stripQuotes(parts[3]);
+
+ final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
+ final char patternKey = r.pattern.charAt(0);
+ List<Rule> rules = ruleMapping.get(patternKey);
+ if (rules == null) {
+ rules = new ArrayList<Rule>();
+ ruleMapping.put(patternKey, rules);
+ }
+ rules.add(r);
+ } catch (final IllegalArgumentException e) {
+ throw new IllegalStateException(
+ "Problem parsing line '" + currentLine + "' in " + location, e);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ private static String stripQuotes(String str) {
+ if (str.startsWith(DOUBLE_QUOTE)) {
+ str = str.substring(1);
+ }
+
+ if (str.endsWith(DOUBLE_QUOTE)) {
+ str = str.substring(0, str.length() - 1);
+ }
+
+ return str;
+ }
+
+ /** Whether to use ASCII folding prior to encoding. */
+ private final boolean folding;
+
+ /**
+ * Creates a new instance with ASCII-folding enabled.
+ */
+ public DaitchMokotoffSoundex() {
+ this(true);
+ }
+
+ /**
+ * Creates a new instance.
+ * <p>
+ * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g.
+ * è -> e.
+ * </p>
+ *
+ * @param folding
+ * if ASCII-folding shall be performed before encoding
+ */
+ public DaitchMokotoffSoundex(final boolean folding) {
+ this.folding = folding;
+ }
+
+ /**
+ * Performs a cleanup of the input string before the actual soundex transformation.
+ * <p>
+ * Removes all whitespace characters and performs ASCII folding if enabled.
+ * </p>
+ *
+ * @param input
+ * the input string to cleanup
+ * @return a cleaned up string
+ */
+ private String cleanup(final String input) {
+ final StringBuilder sb = new StringBuilder();
+ for (char ch : input.toCharArray()) {
+ if (Character.isWhitespace(ch)) {
+ continue;
+ }
+
+ ch = Character.toLowerCase(ch);
+ if (folding && FOLDINGS.containsKey(ch)) {
+ ch = FOLDINGS.get(ch);
+ }
+ sb.append(ch);
+ }
+ return sb.toString();
+ }
+
+ /**
+ * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
+ * <p>
+ * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
+ * EncoderException if the supplied object is not of type java.lang.String.
+ * </p>
+ *
+ * @see #soundex(String)
+ *
+ * @param obj
+ * Object to encode
+ * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String
+ * supplied.
+ * @throws EncoderException
+ * if the parameter supplied is not of type java.lang.String
+ * @throws IllegalArgumentException
+ * if a character is not mapped
+ */
+ @Override
+ public Object encode(final Object obj) throws EncoderException {
+ if (!!(obj instanceof String)) {
+ throw new EncoderException(
+ "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
+ }
+ return encode((String) obj);
+ }
+
+ /**
+ * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
+ *
+ * @see #soundex(String)
+ *
+ * @param source
+ * A String object to encode
+ * @return A DM Soundex code corresponding to the String supplied
+ * @throws IllegalArgumentException
+ * if a character is not mapped
+ */
+ @Override
+ public String encode(final String source) {
+ if (source == null) {
+ return null;
+ }
+ return soundex(source, false)[0];
+ }
+
+ /**
+ * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
+ * <p>
+ * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
+ * separated by '|'.
+ * </p>
+ * <p>
+ * Example: the name "AUERBACH" is encoded as both
+ * </p>
+ * <ul>
+ * <li>097400</li>
+ * <li>097500</li>
+ * </ul>
+ * <p>
+ * Thus the result will be "097400|097500".
+ * </p>
+ *
+ * @param source
+ * A String object to encode
+ * @return A string containing a set of DM Soundex codes corresponding to the String supplied
+ * @throws IllegalArgumentException
+ * if a character is not mapped
+ */
+ public String soundex(final String source) {
+ final String[] branches = soundex(source, true);
+ final StringBuilder sb = new StringBuilder();
+ int index = 0;
+ for (final String branch : branches) {
+ sb.append(branch);
+ if (++index < branches.length) {
+ sb.append('|');
+ }
+ }
+ return sb.toString();
+ }
+
+ /**
+ * Perform the actual DM Soundex algorithm on the input string.
+ *
+ * @param source
+ * A String object to encode
+ * @param branching
+ * If branching shall be performed
+ * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
+ * selected branching mode
+ */
+ private String[] soundex(final String source, final boolean branching) {
+ if (source == null) {
+ return null;
+ }
+
+ final String input = cleanup(source);
+
+ final Set<Branch> currentBranches = new LinkedHashSet<Branch>();
+ currentBranches.add(new Branch());
+
+ char lastChar = '\0';
+ for (int index = 0; index < input.length(); index++) {
+ final char ch = input.charAt(index);
+
+ // ignore whitespace inside a name
+ if (Character.isWhitespace(ch)) {
+ continue;
+ }
+
+ final String inputContext = input.substring(index);
+ final List<Rule> rules = RULES.get(ch);
+ if (rules == null) {
+ continue;
+ }
+
+ // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
+ @SuppressWarnings("unchecked")
+ final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.EMPTY_LIST;
+
+ for (final Rule rule : rules) {
+ if (rule.matches(inputContext)) {
+ if (branching) {
+ nextBranches.clear();
+ }
+ final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
+ final boolean branchingRequired = replacements.length > 1 && branching;
+
+ for (final Branch branch : currentBranches) {
+ for (final String nextReplacement : replacements) {
+ // if we have multiple replacements, always create a new branch
+ final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
+
+ // special rule: occurrences of mn or nm are treated differently
+ final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');
+
+ nextBranch.processNextReplacement(nextReplacement, force);
+
+ if (branching) {
+ nextBranches.add(nextBranch);
+ } else {
+ break;
+ }
+ }
+ }
+
+ if (branching) {
+ currentBranches.clear();
+ currentBranches.addAll(nextBranches);
+ }
+ index += rule.getPatternLength() - 1;
+ break;
+ }
+ }
+
+ lastChar = ch;
+ }
+
+ final String[] result = new String[currentBranches.size()];
+ int index = 0;
+ for (final Branch branch : currentBranches) {
+ branch.finish();
+ result[index++] = branch.toString();
+ }
+
+ return result;
+ }
+}
+END>>"
! !
!PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'LICENSE'!
@@ -3140,6 +2693,2281 @@
"Modified: / 28-07-2017 / 11:35:12 / cg"
! !
+!PhoneticStringUtilities::ExtendedSoundexStringComparator class methodsFor:'documentation'!
+
+documentation
+"
+ There are many extended and enhanced soundex variants around;
+ here is one, called 'extended soundex'. It is described for example in
+ http://www.epidata.dk/documentation.php.
+ An author or origin is unknown.
+
+ The number of digits is increased to 5 or 8;
+ The first character is not used literally; instead it is encoded like the rest.
+ This might have a negative effect on names starting with a vowel, though.
+
+ Overall, it can be doubted if this is really an enhancement after all.
+"
+! !
+
+!PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'api'!
+
+phoneticStringsFor:aString
+    "generates both an extended soundex of length 5 and one of length 8;
+     answers an Array with the 5-digit key first and the 8-digit key second"
+    |first second u t prevCode|
+
+    u := aString asUppercase.
+    first := second := ''.
+    u do:[:c |
+        t := self translate:c.
+        (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
+            first := first , t.
+            second := second , t.
+            second size == 8 ifTrue:[
+                ^ Array with:(first copyTo:5) with:second
+            ].
+        ].
+        prevCode := t
+    ].
+    "/ fewer than 8 codes were collected: pad both keys with zeros up to the long key's size
+    [ second size < 8 ] whileTrue:[
+        first := first , '0'.
+        second := second , '0'
+    ].
+    "/ the short key must always be cut to 5 digits, as in the early return above;
+    "/ it used to be answered untruncated when 6 or 7 codes had been collected
+    ^ Array with:(first copyTo:5) with:second
+
+    "
+     self basicNew phoneticStringsFor:'müller' #('87900' '87900000')
+     self basicNew phoneticStringsFor:'miller' #('87900' '87900000')
+     self basicNew phoneticStringsFor:'muller' #('87900' '87900000')
+     self basicNew phoneticStringsFor:'muler' #('87900' '87900000')
+     self basicNew phoneticStringsFor:'schmidt' #('38600' '38600000')
+     self basicNew phoneticStringsFor:'schneider' #('38690' '38690000')
+     self basicNew phoneticStringsFor:'fischer' #('23900' '23900000')
+     self basicNew phoneticStringsFor:'weber' #('19000' '19000000')
+     self basicNew phoneticStringsFor:'meyer' #('89000' '89000000')
+     self basicNew phoneticStringsFor:'wagner' #('48900' '48900000')
+     self basicNew phoneticStringsFor:'schulz' #('37500' '37500000')
+     self basicNew phoneticStringsFor:'becker' #('13900' '13900000')
+     self basicNew phoneticStringsFor:'hoffmann' #('28800' '28800000')
+     self basicNew phoneticStringsFor:'schäfer' #('32900' '32900000')
+    "
+! !
+
+!PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'private'!
+
+translate:aCharacter
+    "use simple if's for more speed when compiled.
+     Answers the code digit (as a string) for an uppercase letter, nil for anything not listed"
+    "vowels serve as separators"
+    aCharacter == $A ifTrue:[^ '0' ].
+    aCharacter == $E ifTrue:[^ '0' ].
+    aCharacter == $I ifTrue:[^ '0' ].
+    aCharacter == $O ifTrue:[^ '0' ].
+    aCharacter == $U ifTrue:[^ '0' ].
+    aCharacter == $Y ifTrue:[^ '0' ].
+
+    aCharacter == $B ifTrue:[^ '1' ].
+    aCharacter == $P ifTrue:[^ '1' ].
+
+    aCharacter == $F ifTrue:[^ '2' ].
+    aCharacter == $V ifTrue:[^ '2' ].
+
+    aCharacter == $C ifTrue:[^ '3' ].
+    aCharacter == $S ifTrue:[^ '3' ].
+    aCharacter == $K ifTrue:[^ '3' ].
+
+    aCharacter == $G ifTrue:[^ '4' ].
+    aCharacter == $J ifTrue:[^ '4' ].
+
+    aCharacter == $Q ifTrue:[^ '5' ].
+    aCharacter == $X ifTrue:[^ '5' ].
+    aCharacter == $Z ifTrue:[^ '5' ].
+
+    aCharacter == $D ifTrue:[^ '6' ].
+    "/ no test for $G here: G is already answered as '4' above, so it could never match
+    aCharacter == $T ifTrue:[^ '6' ].
+
+    aCharacter == $L ifTrue:[^ '7' ].
+
+    aCharacter == $M ifTrue:[^ '8' ].
+    aCharacter == $N ifTrue:[^ '8' ].
+
+    aCharacter == $R ifTrue:[^ '9' ].
+    ^ nil
+! !
+
+!PhoneticStringUtilities::SingleResultPhoneticStringComparator class methodsFor:'documentation'!
+
+documentation
+"
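+ Abstract superclass for comparators which produce one phonetic key per word: subclasses implement encode:, and phoneticStringsFor: wraps its result into an Array.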
+
+ [author:]
+ cg
+
+ [instance variables:]
+
+ [class variables:]
+
+ [see also:]
+
+"
+! !
+
+!PhoneticStringUtilities::SingleResultPhoneticStringComparator methodsFor:'api'!
+
+encode:word
+ "encode word into its phonetic key; must be redefined in concrete subclasses"
+ ^ self subclassResponsibility
+ "Created: / 28-07-2017 / 15:20:49 / cg"
+!
+
+phoneticStringsFor:word
+ "answer a one-element Array holding the result of encode: for word"
+ ^ Array with:(self encode:word)
+ "Created: / 28-07-2017 / 15:20:38 / cg"
+! !
+
+!PhoneticStringUtilities::MRAStringComparator class methodsFor:'documentation'!
+
+documentation
+"
+ Match Rating Approach Encoder
+
+ The Western Airlines matching rating approach name encoder
+
+ [see also:]
+ https://en.wikipedia.org/wiki/Match_Rating_Approach
+
+ G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery,
+ ''Accessing Individual Records from Personal Data Files Using Nonunique Identifiers''
+ US National Institute of Standards and Technology, SP-500-2 (1977), p. 17.
+"
+!
+
+rCode
+"<<END
+## Copyright (c) 2015, James P. Howard, II <jh@jameshoward.us>
+##
+## Redistribution and use in source and binary forms, with or without
+## modification, are permitted provided that the following conditions are
+## met:
+##
+## Redistributions of source code must retain the above copyright
+## notice, this list of conditions and the following disclaimer.
+##
+## Redistributions in binary form must reproduce the above copyright
+## notice, this list of conditions and the following disclaimer in
+## the documentation and/or other materials provided with the
+## distribution.
+##
+## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+## DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+## THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#' @rdname mra
+#' @title Match Rating Approach Encoder
+#'
+#' @description
+#' The Western Airlines matching rating approach name encoder
+#'
+#' @param word string or vector of strings to encode
+#' @param x MRA-encoded character vector
+#' @param y MRA-encoded character vector
+#'
+#' @details
+#'
+#' The variable \code{word} is the name to be encoded. The variable
+#' \code{maxCodeLen} is \emph{not} supported in this algorithm encoder
+#' because the algorithm itself is dependent upon its six-character
+#' length. The variables \code{x} and \code{y} are MRA-encoded and are
+#' compared to each other using the MRA comparison specification.
+#'
+#' @return The \code{mra_encode} function returns match rating approach
+#' encoded character vector. The \code{mra_compare} returns a boolean
+#' vector which is \code{TRUE} if \code{x} and \code{y} pass the MRA
+#' comparison test.
+#'
+#' @references
+#'
+#' G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery,
+#' \emph{Accessing Individual Records from Personal Data Files Using
+#' Nonunique Identifiers,} US National Institute of Standards and
+#' Technology, SP-500-2 (1977), p. 17.
+#'
+#' @family phonics
+#'
+#' @examples
+#' mra_encode("William")
+#' mra_encode(c("Peter", "Peady"))
+#' mra_encode("Stevenson")
+
+#' @rdname mra
+#' @name mra_encode
+#' @export
+mra_encode <- function(word) {
+
+ ## First, remove any nonalphabetical characters and uppercase it
+ word <- gsub("[^[:alpha:]]*", "", word)
+ word <- toupper(word)
+
+ ## First character of key = first character of name
+ first <- substr(word, 1, 1)
+ word <- substr(word, 2, nchar(word))
+
+ ## Delete vowels not at the start of the word
+ word <- gsub("[AEIOU]", "", word)
+ word <- paste(first, word, sep = "")
+
+ ## Remove duplicate consecutive characters
+ word <- gsub("([A-Z])\\1+", "\\1", word)
+
+ ## If longer than 6 characters, take first and last 3...and we have
+ ## to vectorize it
+ for(i in 1:length(word)) {
+ if((l = nchar(word[i])) > 6) {
+ first <- substr(word[i], 1, 3)
+ last <- substr(word[i], l - 2, l)
+ word[i] <- paste(first, last, sep = "");
+ }
+ }
+
+ return(word)
+}
+
+#' @rdname mra
+#' @name mra_compare
+#' @export
+mra_compare <- function(x, y) {
+ mra <- data.frame(x = x, y = y, sim = 0, min = 100, stringsAsFactors = FALSE)
+
+ ## Obtain the minimum rating value by calculating the length sum of
+ ## the encoded strings and using table A (from Wikipedia). We start
+ ## by setting the minimum to be the sum and move from there.
+ mra$lensum <- nchar(mra$x) + nchar(mra$y)
+ mra$min[mra$lensum == 12] <- 2
+ mra$min[mra$lensum > 7 && mra$lensum <= 11] <- 3
+ mra$min[mra$lensum > 4 && mra$lensum <= 7] <- 4
+ mra$min[mra$lensum <= 4] <- 5
+
+ ## If the length difference between the encoded strings is 3 or
+ ## greater, then no similarity comparison is done. For us, we
+ ## continue the similarity comparison out of laziness and ensure the
+ ## minimum is impossibly high to meet.
+ mra$min[abs(nchar(mra$x) - nchar(mra$y)) >= 3] <- 100
+
+ ## Start the comparison.
+ x <- strsplit(mra$x, split = "")
+ y <- strsplit(mra$y, split = "")
+ rows <- nrow(mra)
+ for(i in 1:rows) {
+ ## Process the encoded strings from left to right and remove any
+ ## identical characters found from both strings respectively.
+ j <- 1
+ while(j < min(length(x[[i]]), length(y[[i]]))) {
+ if(x[[i]][j] == y[[i]][j]) {
+ x[[i]] <- x[[i]][-j]
+ y[[i]] <- y[[i]][-j]
+ } else
+ j <- j + 1
+ }
+
+ ## Process the unmatched characters from right to left and
+ ## remove any identical characters found from both names
+ ## respectively.
+ x[[i]] <- rev(x[[i]])
+ y[[i]] <- rev(y[[i]])
+ j <- 1
+ while(j < min(length(x[[i]]), length(y[[i]]))) {
+ if(x[[i]][j] == y[[i]][j]) {
+ x[[i]] <- x[[i]][-j]
+ y[[i]] <- y[[i]][-j]
+ } else
+ j <- j + 1
+ }
+ ## Subtract the number of unmatched characters from 6 in the
+ ## longer string. This is the similarity rating.
+ len <- min(length(x[[i]]), length(y[[i]]))
+ mra$sim[i] <- 6 - len
+ }
+
+ ## If the similarity is greater than or equal to the minimum
+ ## required, it is a successful match.
+ mra$match <- (mra$sim >= mra$min)
+ return(mra$match)
+}
+
+END>>
+! !
+
+!PhoneticStringUtilities::MRAStringComparator methodsFor:'api'!
+
+encode:wordIn
+    "see https://en.wikipedia.org/wiki/Match_Rating_Approach
+     Answers the MRA key (at most 6 uppercase letters); answers '' if wordIn contains no letter"
+    |word prev|
+
+    word := wordIn.
+
+    "/ First, remove any nonalphabetical characters and uppercase it
+
+    word := word select:#isLetter thenCollect:#asUppercase.
+    "/ nothing to encode; 'word first' below would raise an error on an empty collection
+    word isEmpty ifTrue:[^ ''].
+    "/ Delete vowels not at the start of the word
+    word := word first asString , ((word from:2) reject:#isVowel).
+
+    "/ Remove duplicate consecutive characters
+
+    prev := nil.
+    word := word
+        collect:[:char |
+            char == prev ifTrue:[
+                $*
+            ] ifFalse:[
+                prev := char.
+                char.
+            ].
+        ]
+        thenSelect:[:char | char ~~ $*].
+
+    "/ If longer than 6 characters, take first and last 3
+    word size > 6 ifTrue:[
+        word := (word copyFirst:3),(word copyLast:3)
+    ].
+    ^ word.
+
+    "
+     self new encode:'Catherine' -> 'CTHRN'
+     self new encode:'CatherineCatherine' -> 'CTHHRN'
+     self new encode:'Butter' -> 'BTR'
+     self new encode:'Byrne' -> 'BYRN'
+     self new encode:'Boern' -> 'BRN'
+     self new encode:'Smith' -> 'SMTH'
+     self new encode:'Smyth' -> 'SMYTH'
+     self new encode:'Kathryn' -> 'KTHRYN'
+    "
+
+    "Created: / 28-07-2017 / 15:19:22 / cg"
+    "Modified (comment): / 31-07-2017 / 15:14:31 / cg"
+! !
+
+!PhoneticStringUtilities::MetaphoneStringComparator class methodsFor:'documentation'!
+
+documentation
+"
+ Encodes a string into a Metaphone value.
+
+ Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
+ Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
+
+ Hanging on the Metaphone by Lawrence Philips in Computer Language of Dec. 1990, p 39.
+ Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
+ https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm
+
+ They have had undocumented changes from the originally published algorithm.
+ For more information, see https://issues.apache.org/jira/browse/CODEC-57
+
+ Metaphone uses the following rules:
+
+ Doubled letters except 'c' -> drop 2nd letter.
+ Vowels are only kept when they are the first letter.
+ B -> B unless at the end of a word after 'm' as in 'dumb'
+ C -> X (sh) if -cia- or -ch-
+ S if -ci-, -ce- or -cy-
+ K otherwise, including -sch-
+ D -> J if in -dge-, -dgy- or -dgi-; T otherwise
+ F -> F
+ G -> silent if in -gh- and not at end or before a vowel in -gn- or -gned- (also see dge etc. above)
+ J if before i or e or y if not double gg; K otherwise
+ H -> silent if after vowel and no vowel follows; H otherwise
+ J -> J
+ K -> silent if after 'c'; K otherwise
+ L -> L
+ M -> M
+ N -> N
+ P -> F if before 'h'; P otherwise
+ Q -> K
+ R -> R
+ S -> X (sh) if before 'h' or in -sio- or -sia-; S otherwise
+ T -> X (sh) if -tia- or -tio-; 0 (th) if before 'h'; silent if in -tch-; T otherwise
+ V -> F
+ W -> silent if not followed by a vowel; W if followed by a vowel
+ X -> KS
+ Y -> silent if not followed by a vowel; Y if followed by a vowel
+ Z -> S
+
+ Initial Letter Exceptions
+
+ Initial kn-, gn-, pn-, ae- or wr- -> drop first letter
+ Initial x- -> change to 's'
+ Initial wh- -> change to 'w'
+
+
+ self new encode:'a'
+ self new encode:'dumb'
+ self new encode:'MILLER'
+ self new encode:'schmidt'
+ self new encode:'schneider'
+ self new encode:'FISCHER'
+ self new encode:'HEDGY'
+ self new encode:'weber'
+ self new encode:'wagner'
+ self new encode:'van gogh'
+"
+!
+
+javaCode
+"<<END
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.codec.language;
+
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoder;
+
+/**
+ * Encodes a string into a Metaphone value.
+ * <p>
+ * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
+ * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
+ * <p>
+ * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
+ * p 39.</CITE>
+ * <p>
+ * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
+ * </p>
+ * <ul>
+ * <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
+ * (broken link 4/30/2013) </li>
+ * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
+ * (link checked 4/30/2013) </li>
+ * </ul>
+ * <p>
+ * They have had undocumented changes from the originally published algorithm.
+ * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
+ * <p>
+ * This class is conditionally thread-safe.
+ * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)}
+ * but is not volatile, and accesses are not synchronized.
+ * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
+ * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
+ * after initial setup.
+ *
+ * @version $Id$
+ */
+public class Metaphone implements StringEncoder {
+
+ /**
+ * Five values in the English language
+ */
+ private static final String VOWELS = "AEIOU";
+
+ /**
+ * Variable used in Metaphone algorithm
+ */
+ private static final String FRONTV = "EIY";
+
+ /**
+ * Variable used in Metaphone algorithm
+ */
+ private static final String VARSON = "CSPTG";
+
+ /**
+ * The max code length for metaphone is 4
+ */
+ private int maxCodeLen = 4;
+
+ /**
+ * Creates an instance of the Metaphone encoder
+ */
+ public Metaphone() {
+ super();
+ }
+
+ /**
+ * Find the metaphone value of a String. This is similar to the
+ * soundex algorithm, but better at finding similar sounding words.
+ * All input is converted to upper case.
+ * Limitations: Input format is expected to be a single ASCII word
+ * with only characters in the A - Z range, no punctuation or numbers.
+ *
+ * @param txt String to find the metaphone code for
+ * @return A metaphone code corresponding to the String supplied
+ */
+ public String metaphone(final String txt) {
+ boolean hard = false;
+ int txtLength;
+ if (txt == null || (txtLength = txt.length()) == 0) {
+ return "";
+ }
+ // single character is itself
+ if (txtLength == 1) {
+ return txt.toUpperCase(java.util.Locale.ENGLISH);
+ }
+
+ final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
+
+ final StringBuilder local = new StringBuilder(40); // manipulate
+ final StringBuilder code = new StringBuilder(10); // output
+ // handle initial 2 characters exceptions
+ switch(inwd[0]) {
+ case 'K':
+ case 'G':
+ case 'P': /* looking for KN, etc*/
+ if (inwd[1] == 'N') {
+ local.append(inwd, 1, inwd.length - 1);
+ } else {
+ local.append(inwd);
+ }
+ break;
+ case 'A': /* looking for AE */
+ if (inwd[1] == 'E') {
+ local.append(inwd, 1, inwd.length - 1);
+ } else {
+ local.append(inwd);
+ }
+ break;
+ case 'W': /* looking for WR or WH */
+ if (inwd[1] == 'R') { // WR -> R
+ local.append(inwd, 1, inwd.length - 1);
+ break;
+ }
+ if (inwd[1] == 'H') {
+ local.append(inwd, 1, inwd.length - 1);
+ local.setCharAt(0, 'W'); // WH -> W
+ } else {
+ local.append(inwd);
+ }
+ break;
+ case 'X': /* initial X becomes S */
+ inwd[0] = 'S';
+ local.append(inwd);
+ break;
+ default:
+ local.append(inwd);
+ } // now local has working string with initials fixed
+
+ final int wdsz = local.length();
+ int n = 0;
+
+ while (code.length() < this.getMaxCodeLen() &&
+ n < wdsz ) { // max code size of 4 works well
+ final char symb = local.charAt(n);
+ // remove duplicate letters except C
+ if (symb !!= 'C' && isPreviousChar( local, n, symb ) ) {
+ n++;
+ } else { // not dup
+ switch(symb) {
+ case 'A':
+ case 'E':
+ case 'I':
+ case 'O':
+ case 'U':
+ if (n == 0) {
+ code.append(symb);
+ }
+ break; // only use vowel if leading char
+ case 'B':
+ if ( isPreviousChar(local, n, 'M') &&
+ isLastChar(wdsz, n) ) { // B is silent if word ends in MB
+ break;
+ }
+ code.append(symb);
+ break;
+ case 'C': // lots of C special cases
+ /* discard if SCI, SCE or SCY */
+ if ( isPreviousChar(local, n, 'S') &&
+ !!isLastChar(wdsz, n) &&
+ FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
+ break;
+ }
+ if (regionMatch(local, n, "CIA")) { // "CIA" -> X
+ code.append('X');
+ break;
+ }
+ if (!!isLastChar(wdsz, n) &&
+ FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
+ code.append('S');
+ break; // CI,CE,CY -> S
+ }
+ if (isPreviousChar(local, n, 'S') &&
+ isNextChar(local, n, 'H') ) { // SCH->sk
+ code.append('K');
+ break;
+ }
+ if (isNextChar(local, n, 'H')) { // detect CH
+ if (n == 0 &&
+ wdsz >= 3 &&
+ isVowel(local,2) ) { // CH consonant -> K consonant
+ code.append('K');
+ } else {
+ code.append('X'); // CHvowel -> X
+ }
+ } else {
+ code.append('K');
+ }
+ break;
+ case 'D':
+ if (!!isLastChar(wdsz, n + 1) &&
+ isNextChar(local, n, 'G') &&
+ FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
+ code.append('J'); n += 2;
+ } else {
+ code.append('T');
+ }
+ break;
+ case 'G': // GH silent at end or before consonant
+ if (isLastChar(wdsz, n + 1) &&
+ isNextChar(local, n, 'H')) {
+ break;
+ }
+ if (!!isLastChar(wdsz, n + 1) &&
+ isNextChar(local,n,'H') &&
+ !!isVowel(local,n+2)) {
+ break;
+ }
+ if (n > 0 &&
+ ( regionMatch(local, n, "GN") ||
+ regionMatch(local, n, "GNED") ) ) {
+ break; // silent G
+ }
+ if (isPreviousChar(local, n, 'G')) {
+ // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
+ hard = true;
+ } else {
+ hard = false;
+ }
+ if (!!isLastChar(wdsz, n) &&
+ FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
+ !!hard) {
+ code.append('J');
+ } else {
+ code.append('K');
+ }
+ break;
+ case 'H':
+ if (isLastChar(wdsz, n)) {
+ break; // terminal H
+ }
+ if (n > 0 &&
+ VARSON.indexOf(local.charAt(n - 1)) >= 0) {
+ break;
+ }
+ if (isVowel(local,n+1)) {
+ code.append('H'); // Hvowel
+ }
+ break;
+ case 'F':
+ case 'J':
+ case 'L':
+ case 'M':
+ case 'N':
+ case 'R':
+ code.append(symb);
+ break;
+ case 'K':
+ if (n > 0) { // not initial
+ if (!!isPreviousChar(local, n, 'C')) {
+ code.append(symb);
+ }
+ } else {
+ code.append(symb); // initial K
+ }
+ break;
+ case 'P':
+ if (isNextChar(local,n,'H')) {
+ // PH -> F
+ code.append('F');
+ } else {
+ code.append(symb);
+ }
+ break;
+ case 'Q':
+ code.append('K');
+ break;
+ case 'S':
+ if (regionMatch(local,n,"SH") ||
+ regionMatch(local,n,"SIO") ||
+ regionMatch(local,n,"SIA")) {
+ code.append('X');
+ } else {
+ code.append('S');
+ }
+ break;
+ case 'T':
+ if (regionMatch(local,n,"TIA") ||
+ regionMatch(local,n,"TIO")) {
+ code.append('X');
+ break;
+ }
+ if (regionMatch(local,n,"TCH")) {
+ // Silent if in "TCH"
+ break;
+ }
+ // substitute numeral 0 for TH (resembles theta after all)
+ if (regionMatch(local,n,"TH")) {
+ code.append('0');
+ } else {
+ code.append('T');
+ }
+ break;
+ case 'V':
+ code.append('F'); break;
+ case 'W':
+ case 'Y': // silent if not followed by vowel
+ if (!!isLastChar(wdsz,n) &&
+ isVowel(local,n+1)) {
+ code.append(symb);
+ }
+ break;
+ case 'X':
+ code.append('K');
+ code.append('S');
+ break;
+ case 'Z':
+ code.append('S');
+ break;
+ default:
+ // do nothing
+ break;
+ } // end switch
+ n++;
+ } // end else from symb !!= 'C'
+ if (code.length() > this.getMaxCodeLen()) {
+ code.setLength(this.getMaxCodeLen());
+ }
+ }
+ return code.toString();
+ }
+
+ private boolean isVowel(final StringBuilder string, final int index) {
+ return VOWELS.indexOf(string.charAt(index)) >= 0;
+ }
+
+ private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
+ boolean matches = false;
+ if( index > 0 &&
+ index < string.length() ) {
+ matches = string.charAt(index - 1) == c;
+ }
+ return matches;
+ }
+
+ private boolean isNextChar(final StringBuilder string, final int index, final char c) {
+ boolean matches = false;
+ if( index >= 0 &&
+ index < string.length() - 1 ) {
+ matches = string.charAt(index + 1) == c;
+ }
+ return matches;
+ }
+
+ private boolean regionMatch(final StringBuilder string, final int index, final String test) {
+ boolean matches = false;
+ if( index >= 0 &&
+ index + test.length() - 1 < string.length() ) {
+ final String substring = string.substring( index, index + test.length());
+ matches = substring.equals( test );
+ }
+ return matches;
+ }
+
+ private boolean isLastChar(final int wdsz, final int n) {
+ return n + 1 == wdsz;
+ }
+
+
+ /**
+ * Encodes an Object using the metaphone algorithm. This method
+ * is provided in order to satisfy the requirements of the
+ * Encoder interface, and will throw an EncoderException if the
+ * supplied object is not of type java.lang.String.
+ *
+ * @param obj Object to encode
+ * @return An object (or type java.lang.String) containing the
+ * metaphone code which corresponds to the String supplied.
+ * @throws EncoderException if the parameter supplied is not
+ * of type java.lang.String
+ */
+ @Override
+ public Object encode(final Object obj) throws EncoderException {
+ if (!!(obj instanceof String)) {
+ throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
+ }
+ return metaphone((String) obj);
+ }
+
+ /**
+ * Encodes a String using the Metaphone algorithm.
+ *
+ * @param str String object to encode
+ * @return The metaphone code corresponding to the String supplied
+ */
+ @Override
+ public String encode(final String str) {
+ return metaphone(str);
+ }
+
+ /**
+ * Tests is the metaphones of two strings are identical.
+ *
+ * @param str1 First of two strings to compare
+ * @param str2 Second of two strings to compare
+ * @return <code>true</code> if the metaphones of these strings are identical,
+ * <code>false</code> otherwise.
+ */
+ public boolean isMetaphoneEqual(final String str1, final String str2) {
+ return metaphone(str1).equals(metaphone(str2));
+ }
+
+ /**
+ * Returns the maxCodeLen.
+ * @return int
+ */
+ public int getMaxCodeLen() { return this.maxCodeLen; }
+
+ /**
+ * Sets the maxCodeLen.
+ * @param maxCodeLen The maxCodeLen to set
+ */
+ public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
+
+}
+END>>"
+! !
+
+!PhoneticStringUtilities::MetaphoneStringComparator methodsFor:'api'!
+
+encode:txt
+    "Answer the Metaphone key of txt, at most 'self maxCodeLen' characters long.
+     The input is converted to uppercase first. A nil or empty input answers an
+     empty string; a single character answers itself (uppercased).
+     The rules follow the Java reference implementation (Apache commons-codec
+     Metaphone.metaphone()):
+        - a letter equal to its predecessor is dropped (except C)
+        - vowels are only coded in leading position
+        - the remaining letters are coded by the per-letter rules below."
+
+    "
+     self new encode:'a'
+     self new encode:'MILLER'
+     self new encode:'schmidt'
+     self new encode:'schneider'
+     self new encode:'FISCHER'
+     self new encode:'HEDGY'
+     self new encode:'weber'
+     self new encode:'wagner'
+     self new encode:'van gogh'
+     self new encode:'dumb'
+     self new encode:'Thomas'
+     self new encode:'Richard'
+    "
+
+    |inwd local code wdsz n maxCodeLen ch ch2
+     symb prevChar nextChar nextNextChar isLastChar
+     isVowel isFrontVowel silent result|
+
+    (txt isNil or:[ txt size == 0 ]) ifTrue:[^ ''].
+
+    inwd := txt asUppercase.
+    "/ single character is itself
+    inwd size == 1 ifTrue:[^ inwd].
+
+    maxCodeLen := self maxCodeLen.
+
+    "/ nil-tolerant character class tests (neighbours outside the word are nil)
+    isVowel := [:c | c notNil and:[ 'AEIOU' includes:c ]].
+    isFrontVowel := [:c | c notNil and:[ 'EIY' includes:c ]].
+
+    "/ handle the exceptions for the initial 2 characters
+    local := inwd.
+    ch := inwd at:1.
+    ch2 := inwd at:2.
+    (('KGP' includes:ch) and:[ ch2 == $N ]) ifTrue:[
+        "/ KNx, GNx, PNx -> Nx
+        local := inwd copyFrom:2 to:inwd size
+    ].
+    (ch == $A and:[ ch2 == $E ]) ifTrue:[
+        "/ AEx -> Ex
+        local := inwd copyFrom:2 to:inwd size
+    ].
+    (ch == $W and:[ ch2 == $R ]) ifTrue:[
+        "/ WRx -> Rx
+        local := inwd copyFrom:2 to:inwd size
+    ].
+    (ch == $W and:[ ch2 == $H ]) ifTrue:[
+        "/ WHx -> Wx
+        local := 'W' , (inwd copyFrom:3 to:inwd size)
+    ].
+    ch == $X ifTrue:[
+        "/ initial X becomes S
+        local := 'S' , (inwd copyFrom:2 to:inwd size)
+    ].
+
+    "/ now local has the working string with initials fixed
+    code := '' writeStream.
+    wdsz := local size.
+    n := 1.
+
+    [ (code contents size < maxCodeLen) and:[ n <= wdsz ] ] whileTrue:[
+        "/ all neighbour variables are (re)assigned in every round,
+        "/ so no value of a previous round can leak into this one
+        symb := local at:n.
+        prevChar := (n > 1) ifTrue:[ local at:n - 1 ] ifFalse:[ nil ].
+        nextChar := (n < wdsz) ifTrue:[ local at:n + 1 ] ifFalse:[ nil ].
+        nextNextChar := (n + 2 <= wdsz) ifTrue:[ local at:n + 2 ] ifFalse:[ nil ].
+        isLastChar := (n == wdsz).
+
+        "/ drop a letter which repeats its predecessor (except C).
+        "/ The cases below are mutually exclusive (each tests symb for a
+        "/ different letter), so they can be written as a flat sequence.
+        (symb ~~ $C and:[ prevChar == symb ]) ifFalse:[
+
+            ('AEIOU' includes:symb) ifTrue:[
+                "/ only use a vowel if it is the leading char
+                n == 1 ifTrue:[ code nextPut:symb ]
+            ].
+
+            symb == $B ifTrue:[
+                "/ B is silent if the word ends in MB
+                (prevChar == $M and:[ isLastChar ]) ifFalse:[ code nextPut:$B ]
+            ].
+
+            symb == $C ifTrue:[
+                "/ C is dropped in SCI, SCE and SCY
+                (prevChar == $S and:[ isFrontVowel value:nextChar ]) ifFalse:[
+                    (nextChar == $I and:[ nextNextChar == $A ]) ifTrue:[
+                        code nextPut:$X             "/ CIA -> X
+                    ] ifFalse:[
+                        (isFrontVowel value:nextChar) ifTrue:[
+                            code nextPut:$S         "/ CI, CE, CY -> S
+                        ] ifFalse:[
+                            nextChar == $H ifTrue:[
+                                "/ SCH -> K; a word-initial CH followed by
+                                "/ a vowel -> K (as the reference code does);
+                                "/ any other CH -> X
+                                (prevChar == $S
+                                 or:[ n == 1 and:[ isVowel value:nextNextChar ]])
+                                    ifTrue:[ code nextPut:$K ]
+                                    ifFalse:[ code nextPut:$X ]
+                            ] ifFalse:[
+                                code nextPut:$K
+                            ]
+                        ]
+                    ]
+                ]
+            ].
+
+            symb == $D ifTrue:[
+                (nextChar == $G and:[ isFrontVowel value:nextNextChar ]) ifTrue:[
+                    "/ DGE, DGI, DGY -> J; the G and the vowel are consumed
+                    code nextPut:$J.
+                    n := n + 2
+                ] ifFalse:[
+                    code nextPut:$T
+                ]
+            ].
+
+            symb == $G ifTrue:[
+                "/ GH is silent at the end of the word or if no vowel follows;
+                "/ a non-initial G is silent in front of N
+                silent := (nextChar == $H and:[ (isVowel value:nextNextChar) not ])
+                          or:[ n > 1 and:[ nextChar == $N ]].
+                silent ifFalse:[
+                    "/ a doubled G never gets here (the second one was dropped above),
+                    "/ so there is no 'hard G' case to consider
+                    (isFrontVowel value:nextChar)
+                        ifTrue:[ code nextPut:$J ]
+                        ifFalse:[ code nextPut:$K ]
+                ]
+            ].
+
+            symb == $H ifTrue:[
+                "/ H is only coded in front of a vowel,
+                "/ and not after C, S, P, T or G (CH, SH, PH, TH, GH are handled there)
+                ((isVowel value:nextChar)
+                 and:[ (prevChar notNil and:[ 'CSPTG' includes:prevChar ]) not ])
+                    ifTrue:[ code nextPut:$H ]
+            ].
+
+            ('FJLMNR' includes:symb) ifTrue:[
+                code nextPut:symb
+            ].
+
+            symb == $K ifTrue:[
+                "/ K is silent after C
+                prevChar == $C ifFalse:[ code nextPut:$K ]
+            ].
+
+            symb == $P ifTrue:[
+                "/ PH -> F
+                nextChar == $H
+                    ifTrue:[ code nextPut:$F ]
+                    ifFalse:[ code nextPut:$P ]
+            ].
+
+            symb == $Q ifTrue:[
+                code nextPut:$K
+            ].
+
+            symb == $S ifTrue:[
+                "/ SH, SIO, SIA -> X
+                (nextChar == $H
+                 or:[ nextChar == $I
+                      and:[ nextNextChar == $O or:[ nextNextChar == $A ]]])
+                    ifTrue:[ code nextPut:$X ]
+                    ifFalse:[ code nextPut:$S ]
+            ].
+
+            symb == $T ifTrue:[
+                (nextChar == $I
+                 and:[ nextNextChar == $A or:[ nextNextChar == $O ]]) ifTrue:[
+                    code nextPut:$X                 "/ TIA, TIO -> X
+                ] ifFalse:[
+                    "/ T is silent in TCH
+                    (nextChar == $C and:[ nextNextChar == $H ]) ifFalse:[
+                        "/ the digit 0 stands for TH (it resembles a theta)
+                        nextChar == $H
+                            ifTrue:[ code nextPut:$0 ]
+                            ifFalse:[ code nextPut:$T ]
+                    ]
+                ]
+            ].
+
+            symb == $V ifTrue:[
+                code nextPut:$F
+            ].
+
+            ('WY' includes:symb) ifTrue:[
+                "/ silent if not followed by a vowel
+                (isVowel value:nextChar) ifTrue:[ code nextPut:symb ]
+            ].
+
+            symb == $X ifTrue:[
+                code nextPutAll:'KS'
+            ].
+
+            symb == $Z ifTrue:[
+                code nextPut:$S
+            ].
+
+            "/ any other character (digits, blanks, punctuation) is ignored
+        ].
+        n := n + 1.
+    ].
+
+    "/ X -> KS may have pushed the key one character beyond the limit
+    result := code contents.
+    ^ (result size > maxCodeLen)
+        ifTrue:[ result copyFrom:1 to:maxCodeLen ]
+        ifFalse:[ result ]
+
+    "Created: / 02-08-2017 / 09:51:31 / cg"
+    "Modified: / 02-08-2017 / 12:00:38 / cg"
+!
+
+maxCodeLen
+ "Answer the maximum length of a generated key; this implementation always answers 4."
+
+ ^ 4
+
+ "Created: / 02-08-2017 / 09:51:59 / cg"
+! !
+
+!PhoneticStringUtilities::SoundexStringComparator class methodsFor:'documentation'!
+
+documentation
+"
+ WARNING: this is the so called 'simplified soundex' algorithm;
+ there are more variants like miracode (american soundex) or
+ mysqlSoundex around.
+
+ Be sure to use the correct algorithm, if the generated strings must be compatible
+ (otherwise, the differences are probably too small to be noticed as effect, but
+ your search will be different)
+
+ The following was copied from http://www.civilsolutions.com.au/publications/dedup.htm
+
+ SOUNDEX is a phonetic coding algorithm that ignores many of the unreliable
+ components of names, but by doing so reports more matches.
+
+ There are some variations around in the literature;
+ the following is called 'simplified soundex', and the rules for coding a name are:
+
+ 1. The first letter of the name is used in its un-coded form to serve as the prefix
+ character of the code. (The rest of the code is numerical).
+
+ 2. Thereafter, W and H are ignored entirely.
+
+ 3. A, E, I, O, U, Y are not assigned a code number, but do serve as 'separators' (see Step 5).
+
+ 4. Other letters of the name are converted to a numerical equivalent:
+ B, P, F, V 1
+ C, G, J, K, Q, S, X, Z 2
+ D, T 3
+ L 4
+ M, N 5
+ R 6
+
+ 5. There are two exceptions:
+ 1. Letters that follow prefix letters which would, if coded, have the same
+ numerical code, are ignored in all cases unless a ''separator'' (see Step 3) precedes them.
+
+ 2. The second letter of any pair of consonants having the same code number is likewise ignored,
+ i.e. unless there is a ''separator'' between them in the name.
+
+ 6. The final SOUNDEX code consists of the prefix letter plus three numerical characters.
+ Longer codes are truncated to this length, and shorter codes are extended to it by adding zeros.
+
+ Notice, that in another variant, w and h are treated slightly differently.
+ This is only of relevance, if you need to reconstruct original soundex codes of other programs
+ or for the original 1880 us census data.
+ SoundexStringComparator new encode:'Ashcraft' -> 'A226'
+ vs.
+ MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
+
+ Also notice, that soundex deals better with english.
+ For german and other languages, other algorithms may provide better results.
+"
+! !
+
+!PhoneticStringUtilities::SoundexStringComparator methodsFor:'api'!
+
+encode:word
+    "Answer the 4-character simplified-soundex key of word:
+     its first letter (uppercased), followed by the digits of the following letters.
+     A letter adds its digit unless it has no code, is a separator ('0'),
+     or has the same code as the letter immediately before it.
+     Shorter keys are filled up with '0'."
+
+    |upper key lastDigit digit|
+
+    upper := word asUppercase.
+    key := upper first asString.
+    lastDigit := self translate:upper first.
+    2 to:upper size do:[:index |
+        digit := self translate:(upper at:index).
+        (digit isNil or:[ digit = '0' or:[ digit = lastDigit ]]) ifFalse:[
+            key := key , digit.
+            key size == 4 ifTrue:[^ key ].
+        ].
+        "/ every letter (coded or not) becomes the new predecessor
+        lastDigit := digit
+    ].
+    "/ here the key has less than 4 characters
+    (4 - key size) timesRepeat:[ key := key , '0' ].
+    ^ key
+
+    "
+     self new encode:'washington' -> 'W252'
+     self new encode:'lee' -> 'L000'
+     self new encode:'Gutierrez' -> 'G362'
+     self new encode:'Pfister' -> 'P236'
+     self new encode:'Jackson' -> 'J250'
+     self new encode:'Tymczak' -> 'T522'
+    "
+
+    "notice:
+     MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
+     self new encode:'Ashcraft' -> 'A226'
+    "
+
+    "Created: / 28-07-2017 / 15:21:23 / cg"
+    "Modified (comment): / 01-08-2017 / 19:01:43 / cg"
+! !
+
+!PhoneticStringUtilities::SoundexStringComparator methodsFor:'private'!
+
+translate:aCharacter
+    "Answer the soundex code of aCharacter as a one-character string,
+     or nil if the character has no code (W, H and anything which is
+     not an uppercase letter)."
+
+    "vowels (and Y) serve as separators"
+    ('AEIOUY' includes:aCharacter) ifTrue:[^ '0' ].
+
+    ('BPFV' includes:aCharacter) ifTrue:[^ '1' ].
+    ('CSKGJQXZ' includes:aCharacter) ifTrue:[^ '2' ].
+    ('DT' includes:aCharacter) ifTrue:[^ '3' ].
+    aCharacter == $L ifTrue:[^ '4' ].
+    ('MN' includes:aCharacter) ifTrue:[^ '5' ].
+    aCharacter == $R ifTrue:[^ '6' ].
+    ^ nil
+
+    "Modified: / 02-08-2017 / 01:35:40 / cg"
+    "Modified (comment): / 02-08-2017 / 14:30:11 / cg"
+! !
+
+!PhoneticStringUtilities::MySQLSoundexStringComparator class methodsFor:'documentation'!
+
+documentation
+"
+ MySQL soundex is like american Soundex (i.e. miracode) without the 4 character limitation,
+ and also removing vowels first, then removing duplicate codes
+ (whereas the soundex code does this in reverse order).
+
+ These variations are important, if you need the miracode soundex codes to be generated.
+"
+! !
+
+!PhoneticStringUtilities::MySQLSoundexStringComparator methodsFor:'api'!
+
+encode:word
+    "Answer the soundex key of word without the 4-character limit:
+     the first letter (uppercased) followed by the digits of the following letters.
+     Separators ('0'), W and H do not change the remembered previous code,
+     so equal codes on both sides of them collapse into one digit.
+     Keys shorter than 4 characters are filled up with '0'."
+
+    |upper key lastDigit digit letter|
+
+    upper := word asUppercase.
+    key := upper first asString.
+    lastDigit := self translate:upper first.
+    2 to:upper size do:[:index |
+        letter := upper at:index.
+        digit := self translate:letter.
+        (digit isNil or:[ digit = '0' or:[ digit = lastDigit ]]) ifFalse:[
+            key := key , digit.
+        ].
+        (digit = '0' or:[ letter = $W or:[ letter = $H ]]) ifFalse:[
+            lastDigit := digit.
+        ].
+    ].
+    (4 - key size) timesRepeat:[ key := key , '0' ].
+    ^ key
+
+    "Created: / 28-07-2017 / 15:23:41 / cg"
+    "Modified: / 31-07-2017 / 17:53:51 / cg"
+    "Modified (comment): / 02-08-2017 / 14:31:15 / cg"
+! !
+
+!PhoneticStringUtilities::NYSIISStringComparator class methodsFor:'documentation'!
+
+documentation
+"
+ NYSIIS Algorithm:
+
+ 1.
+ remove all ''S'' and ''Z'' chars from the end of the surname
+
+ 2.
+ transcode initial strings
+ MAC => MC
+ PF => F
+
+ 3.
+ Transcode trailing strings as follows,
+
+ IX => IC
+ EX => EC
+ YE,EE,IE => Y
+ NT,ND => D
+
+ 4.
+ transcode ''EV'' to ''EF'' if not at start of name
+
+ 5.
+ use first character of name as first character of key
+
+ 6.
+ remove any ''W'' that follows a vowel
+
+ 7.
+ replace all vowels with ''A''
+
+ 8.
+ transcode ''GHT'' to ''GT''
+
+ 9.
+ transcode ''DG'' to ''G''
+
+ 10.
+ transcode ''PH'' to ''F''
+
+ 11.
+ if not first character, eliminate all ''H'' preceded or followed by a vowel
+
+ 12.
+ change ''KN'' to ''N'', else ''K'' to ''C''
+
+ 13.
+ if not first character, change ''M'' to ''N''
+
+ 14.
+ if not first character, change ''Q'' to ''G''
+
+ 15.
+ transcode ''SH'' to ''S''
+
+ 16.
+ transcode ''SCH'' to ''S''
+
+ 17.
+ transcode ''YW'' to ''Y''
+
+ 18.
+ if not first or last character, change ''Y'' to ''A''
+
+ 19.
+ transcode ''WR'' to ''R''
+
+ 20.
+ if not first character, change ''Z'' to ''S''
+
+ 21.
+ transcode terminal ''AY'' to ''Y''
+
+ 22.
+ remove trailing vowels
+
+ 23.
+ collapse all strings of repeated characters
+
+ 24.
+ if first char of original surname was a vowel, append it to the code
+"
+! !
+
+!PhoneticStringUtilities::NYSIISStringComparator methodsFor:'api'!
+
+encode:aString
+    "Answer the NYSIIS key of aString: its uppercased form is passed through
+     rule1: .. rule23: in that order, each rule receiving the result of the
+     previous one; rule 24 finally also gets the original string."
+
+    |key|
+
+    key := aString asUppercase.
+    #( #rule1:  #rule2:  #rule3:  #rule4:  #rule5:  #rule6:  #rule7:  #rule8:
+       #rule9:  #rule10: #rule11: #rule12: #rule13: #rule14: #rule15: #rule16:
+       #rule17: #rule18: #rule19: #rule20: #rule21: #rule22: #rule23:
+    ) do:[:ruleSelector |
+        key := self perform:ruleSelector with:key
+    ].
+    ^ self rule24:key originalKey:aString
+
+    "
+     self new encode:'hello'
+     self new encode:'bliss'
+    "
+    "
+     self new phoneticStringsFor:'hello'
+     self new phoneticStringsFor:'bliss'
+    "
+
+    "Created: / 28-07-2017 / 15:34:52 / cg"
+    "Modified (comment): / 02-08-2017 / 14:31:47 / cg"
+! !
+
+!PhoneticStringUtilities::NYSIISStringComparator methodsFor:'private'!
+
+rule10:key
+ "10. transcode 'PH' to 'F'.
+ Answers a copy of key in which every 'PH' has been replaced."
+
+ ^ self transcodeAll:'PH' of:key to:'F' startingAt:1
+
+ "Modified (format): / 02-08-2017 / 14:34:27 / cg"
+!
+
+rule11:key
+    "11. if not first character, eliminate all 'H' preceded or followed by a vowel.
+     The neighbours are always looked up in the unmodified key; answers a copy."
+
+    |result|
+
+    result := key copy.
+    "/ walking from the end keeps the positions in front of index
+    "/ identical in key and result, even after a removal
+    key size to:2 by:-1 do:[:index |
+        ((key at:index) = $H
+         and:[ (key at:index - 1) isVowel
+               or:[ index < key size and:[ (key at:index + 1) isVowel ]]])
+        ifTrue:[
+            result := (result copyFrom:1 to:index - 1)
+                      , (result copyFrom:index + 1 to:result size)
+        ]
+    ].
+    ^ result
+!
+
+rule12:key
+    "12. change 'KN' to 'N', else 'K' to 'C'.
+     Answers a copy of key; the 'KN' replacement is done first, so that
+     only the remaining K's are turned into C."
+
+    |k|
+
+    "/ the K of KN is silent: it is the K which has to go, not the N
+    k := self transcodeAll:'KN' of:key to:'N' startingAt:1.
+    k := self transcodeAll:'K' of:k to:'C' startingAt:1.
+    ^ k
+
+    "Modified (format): / 02-08-2017 / 14:34:48 / cg"
+!
+
+rule13:key
+ "13. if not first character, change 'M' to 'N'.
+ The search starts at index 2, so a leading M is kept; answers a copy of key."
+
+ ^ self transcodeAll:'M' of:key to:'N' startingAt:2
+
+ "Modified (format): / 02-08-2017 / 14:35:00 / cg"
+!
+
+rule14:key
+ "14. if not first character, change 'Q' to 'G'.
+ The search starts at index 2, so a leading Q is kept; answers a copy of key."
+
+ ^ self transcodeAll:'Q' of:key to:'G' startingAt:2
+
+ "Modified (format): / 02-08-2017 / 14:35:08 / cg"
+!
+
+rule15:key
+ "15. transcode 'SH' to 'S'.
+ Answers a copy of key in which every 'SH' has been replaced."
+
+ ^ self transcodeAll:'SH' of:key to:'S' startingAt:1
+
+ "Modified (format): / 02-08-2017 / 14:35:18 / cg"
+!
+
+rule16:key
+ "16. transcode 'SCH' to 'S'.
+ Answers a copy of key in which every 'SCH' has been replaced."
+
+ ^ self transcodeAll:'SCH' of:key to:'S' startingAt:1
+
+ "Modified (format): / 02-08-2017 / 14:35:25 / cg"
+!
+
+rule17:key
+ "17. transcode 'YW' to 'Y'.
+ Answers a copy of key in which every 'YW' has been replaced."
+
+ ^ self transcodeAll:'YW' of:key to:'Y' startingAt:1
+
+ "Modified (format): / 02-08-2017 / 14:35:33 / cg"
+!
+
+rule18:key
+    "18. if not first or last character, change 'Y' to 'A'.
+     Answers a copy of key. All Y's from index 2 on are replaced first;
+     a Y which was the last character of key is then restored.
+     An empty key is answered unchanged (as a copy)."
+
+    |k|
+
+    k := self transcodeAll:'Y' of:key to:'A' startingAt:2.
+    "/ guard against an empty key - 'last' must not be sent to an empty string
+    (key notEmpty and:[ key last = $Y ]) ifTrue:[
+        k at:k size put:$Y
+    ].
+    ^ k
+
+    "Modified (format): / 02-08-2017 / 14:35:44 / cg"
+!
+
+rule19:key
+ "19. transcode 'WR' to 'R'.
+ Answers a copy of key in which every 'WR' has been replaced."
+
+ ^ self transcodeAll:'WR' of:key to:'R' startingAt:1
+
+ "Modified (format): / 02-08-2017 / 14:35:52 / cg"
+!
+
+rule1:key
+    "1. Remove all 'S' and 'Z' chars from the end of the name.
+     Answers a copy of key; a key which consists of S and Z only
+     (or is empty) yields an empty string."
+
+    |k|
+
+    k := key copy.
+    "/ the notEmpty test stops the loop once everything has been stripped
+    "/ ('last' must not be sent to an empty string)
+    [
+        k notEmpty and:[ 'SZ' includes:k last ]
+    ] whileTrue:[ k := k copyFrom:1 to:(k size - 1) ].
+    ^ k
+!
+
+rule20:key
+ "20. if not first character, change 'Z' to 'S'.
+ The search starts at index 2, so a leading Z is kept; answers a copy of key."
+
+ ^ self transcodeAll:'Z' of:key to:'S' startingAt:2
+
+ "Modified (format): / 02-08-2017 / 14:36:00 / cg"
+!
+
+rule21:key
+    "21. transcode terminal 'AY' to 'Y'.
+     Answers a copy of key; keys which do not end in 'AY' (including keys
+     with less than two characters) are answered unchanged."
+
+    "/ testing the suffix directly avoids computing a search start
+    "/ index of 0 or -1 for very short keys
+    (key endsWith:'AY') ifTrue:[
+        ^ (key copyFrom:1 to:key size - 2) , 'Y'
+    ].
+    ^ key copy
+
+    "Modified (format): / 02-08-2017 / 14:36:08 / cg"
+!
+
+rule22:key
+    "22. remove trailing vowels.
+     Answers a copy of key; a key which consists of vowels only
+     (or is empty) yields an empty string."
+
+    |k|
+
+    k := key copy.
+    "/ the notEmpty test stops the loop once everything has been stripped
+    "/ ('last' must not be sent to an empty string)
+    [ k notEmpty and:[ k last isVowel ] ] whileTrue:[
+        k := k copyButLast
+    ].
+    ^ k
+
+    "Modified: / 02-08-2017 / 14:36:42 / cg"
+!
+
+rule23:key
+    "23. collapse all strings of repeated characters.
+     Answers a copy of key in which every run of equal characters
+     has been reduced to a single character."
+
+    |k|
+
+    k := key copy.
+    "/ count downward (by:-1 is required, a plain to:do: with start > stop
+    "/ does not iterate at all); removing the character at index leaves
+    "/ everything in front of it in place, so the remaining indices stay valid
+    k size to:2 by:-1 do:[:index |
+        (k at:index) = (k at:index - 1) ifTrue:[
+            k := (k copyFrom:1 to:index - 1) , (k copyFrom:index + 1 to:k size)
+        ]
+    ].
+    ^ k
+!
+
+rule24:key originalKey:originalKey
+    "24. if first char of original surname was a vowel, append it to the code.
+     The appended character is uppercased; otherwise a copy of key is answered."
+
+    |leadingChar|
+
+    leadingChar := originalKey first.
+    leadingChar isVowel ifFalse:[^ key copy].
+    ^ key , leadingChar asString asUppercase
+!
+
+rule2:key
+    "2. Transcode initial strings: MAC => MC PF => F.
+     The PF test is applied to the result of the MAC step; answers a new string."
+
+    |k|
+
+    k := (key startsWith:'MAC')
+            ifTrue:[ 'MC' , (key copyFrom:4) ]
+            ifFalse:[ key copy ].
+    (k startsWith:'PF') ifTrue:[
+        ^ 'F' , (k copyFrom:3)
+    ].
+    ^ k
+
+    "Modified (format): / 02-08-2017 / 14:31:40 / cg"
+!
+
+rule3:key
+    "3. Transcode trailing strings as follows:
+            IX => IC
+            EX => EC
+            YE, EE, IE => Y
+            NT, ND => D
+     The groups are applied in the order listed, each one working on the
+     result of the previous one."
+
+    |k|
+
+    k := key copy.
+    #(
+        "/ suffixes             replacement
+        #( 'IX' )               'IC'
+        #( 'EX' )               'EC'
+        #( 'YE' 'EE' 'IE' )     'Y'
+        #( 'NT' 'ND' )          'D'
+    ) inGroupsOf:2 do:[:suffixes :replacement |
+        k := self transcodeTrailing:suffixes of:k to:replacement
+    ].
+    ^ k
+
+    "Modified (format): / 02-08-2017 / 14:32:24 / cg"
+!
+
+rule4:key
+    "4. Transcode 'EV' to 'EF' if not at start of name.
+     The search starts at index 2, so a leading 'EV' is kept."
+
+    |result|
+
+    result := self transcodeAll:'EV' of:key to:'EF' startingAt:2.
+    ^ result
+
+    "Modified (format): / 02-08-2017 / 14:32:35 / cg"
+!
+
+rule5:key
+    "5. Use first character of name as first character of key.
+     Nothing to do here: the conversion is done in place, so the first
+     character is already where it belongs. The argument is answered
+     unchanged (the very same object, not a copy)."
+
+    ^ key
+
+    "Modified (comment): / 02-08-2017 / 14:32:45 / cg"
+!
+
+rule6:key
+    "6. Remove any 'W' that follows a vowel.
+     Answers a copy of key. A 'W' in first position is never removed
+     (the search starts at index 2); a 'W' that follows a non-vowel is kept."
+
+    |k i|
+
+    k := key copy.
+    i := 2.
+    [
+        "/ the bounds check keeps the search index inside the string
+        i <= k size and:[ (i := k indexOf:$W startingAt:i) > 0 ]
+    ] whileTrue:[
+        (k at:i - 1) isVowel ifTrue:[
+            "/ drop the W; i now addresses the character which followed it,
+            "/ so a further W directly behind the same vowel is found next
+            k := (k copyFrom:1 to:i - 1) , (k copyFrom:i + 1 to:k size)
+        ] ifFalse:[
+            "/ keep this W and step over it - without advancing, the same W
+            "/ would be found again and again (endless loop)
+            i := i + 1
+        ]
+    ].
+    ^ k
+!
+
+rule7:key
+    "7. replace all vowels with 'A' (all other characters are kept as they are)"
+
+    |vowelToA|
+
+    vowelToA := [:each | each isVowel ifFalse:[each] ifTrue:[$A]].
+    ^ key collect:vowelToA
+
+    "Modified: / 02-08-2017 / 14:33:56 / cg"
+!
+
+rule8:key
+    "8. transcode 'GHT' to 'GT' (every occurrence, searched from the first character on)"
+
+    |transcoded|
+
+    transcoded := self transcodeAll:'GHT' of:key to:'GT' startingAt:1.
+    ^ transcoded
+
+    "Modified (format): / 02-08-2017 / 14:34:05 / cg"
+!
+
+rule9:key
+    "9. transcode 'DG' to 'G' (every occurrence, searched from the first character on)"
+
+    |transcoded|
+
+    transcoded := self transcodeAll:'DG' of:key to:'G' startingAt:1.
+    ^ transcoded
+
+    "Modified (format): / 02-08-2017 / 14:34:15 / cg"
+!
+
+transcodeAll:aString of:key to:replacementString startingAt:start
+    "Answer a copy of key in which every occurrence of aString found at or
+     after index start has been replaced by replacementString.
+     After each replacement the search begins again at start, so an
+     occurrence which only comes into existence through a replacement is
+     transcoded as well. The loop ends once aString is no longer found;
+     callers must therefore not pass a replacementString which brings
+     aString back at or after start."
+
+    |k i from|
+
+    "/ callers derive start from the key size (e.g. 'key size - 1'), which
+    "/ drops below 1 for very short keys; begin at the first valid index then
+    from := start max:1.
+    k := key copy.
+    [
+        (i := k indexOfSubCollection:aString startingAt:from) > 0
+    ] whileTrue:[
+        k := (k copyFrom:1 to:i - 1) , replacementString
+                , (k copyFrom:i + aString size to:k size)
+    ].
+    ^ k
+!
+
+transcodeTrailing:anArrayOfStrings of:key to:replacementString
+    "Answer a copy of key in which a trailing occurrence of any of the given
+     strings has been replaced by replacementString. The candidates are
+     tried in array order, each one against the result of the previous one;
+     the search for a candidate starts where it would have to begin in order
+     to end exactly at the end of the string."
+
+    ^ anArrayOfStrings
+        inject:key copy
+        into:[:soFar :suffix |
+            self
+                transcodeAll:suffix
+                of:soFar
+                to:replacementString
+                startingAt:(soFar size - suffix size) + 1
+        ]
+! !
+
+!PhoneticStringUtilities::PhonemStringComparator class methodsFor:'documentation'!
+
+documentation
+"
+ Implementation of the PHONEM algorithm, as described in
+ 'Georg Wilde and Carsten Meyer, Doppelgaenger gesucht -
+ Ein Programm fuer kontextsensitive phonetische Textumwandlung
+ ct Magazin fuer Computer & Technik 25/1998'
+
+ This algorithm deals better with the German language (it takes umlauts into account)
+"
+! !
+
+!PhoneticStringUtilities::PhonemStringComparator methodsFor:'api'!
+
+encode:aString
+    "Answer the PHONEM code of aString.
+     Works in three steps on the uppercased input:
+       1. two-character sequences are replaced according to the table below
+          (scanning left to right; after a replacement the same position is
+          looked at again),
+       2. single characters are mapped via copyTransliterating:to:,
+       3. every character outside 'ABCDLMNORSUVWXY' is removed and runs of
+          equal characters are squashed."
+
+    |s idx t t2|
+
+    s := aString asUppercase.
+
+    idx := 1.
+    "/ idx addresses the first character of the pair (idx, idx+1), so it may
+    "/ go up to 's size - 1'. The former bound 'idx < (s size-1)' stopped one
+    "/ position early and never looked at the last pair (e.g. a trailing 'PH').
+    [idx < s size] whileTrue:[
+        t2 := nil.
+        t := s copyFrom:idx to:idx+1.
+        t = 'SC' ifTrue:[ t2 := 'C' ]
+        ifFalse:[ t = 'SZ' ifTrue:[ t2 := 'C' ]
+        ifFalse:[ t = 'CZ' ifTrue:[ t2 := 'C' ]
+        ifFalse:[ t = 'TZ' ifTrue:[ t2 := 'C' ]
+        ifFalse:[ t = 'TS' ifTrue:[ t2 := 'C' ]
+        ifFalse:[ t = 'KS' ifTrue:[ t2 := 'X' ]
+        ifFalse:[ t = 'PF' ifTrue:[ t2 := 'V' ]
+        ifFalse:[ t = 'QU' ifTrue:[ t2 := 'KW' ]
+        ifFalse:[ t = 'PH' ifTrue:[ t2 := 'V' ]
+        ifFalse:[ t = 'UE' ifTrue:[ t2 := 'Y' ]
+        ifFalse:[ t = 'AE' ifTrue:[ t2 := 'E' ]
+        ifFalse:[ t = 'OE' ifTrue:[ t2 := 'Ö' ]
+        ifFalse:[ t = 'EI' ifTrue:[ t2 := 'AY' ]
+        ifFalse:[ t = 'EY' ifTrue:[ t2 := 'AY' ]
+        ifFalse:[ t = 'EU' ifTrue:[ t2 := 'OY' ]
+        ifFalse:[ t = 'AU' ifTrue:[ t2 := 'A§' ]
+        ifFalse:[ t = 'OU' ifTrue:[ t2 := '§ ' ]]]]]]]]]]]]]]]]].
+        t2 notNil ifTrue:[
+            "/ idx is deliberately not advanced: the replacement may form a new pair
+            s := (s copyTo:idx-1),t2,(s copyFrom:idx+2)
+        ] ifFalse:[
+            idx := idx + 1.
+        ].
+    ].
+
+    "/ single character substitutions via tr
+    "/ NOTE(review): the 'to' string looks one character longer than the 'from'
+    "/ string, and position-wise P seems to map to D although the examples below
+    "/ expect 'wepper' -> 'VBR' - please confirm the table
+    s := s copyTransliterating:'ÖÄZKGQÜIJFWPT§' to:'YECCCCYYYVVDDUA'.
+    s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'' complement:true squashDuplicates:false.
+    s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'ABCDLMNORSUVWXY' complement:false squashDuplicates:true.
+    ^ s
+
+    "
+     self basicNew encode:'müller' -> 'MYLR'
+     self basicNew encode:'mueller' -> 'MYLR'
+     self basicNew encode:'möller' -> 'MYLR'
+     self basicNew encode:'miller' -> 'MYLR'
+     self basicNew encode:'muller' -> 'MULR'
+     self basicNew encode:'muler' -> 'MULR'
+
+     self basicNew phoneticStringsFor:'müller' #('MYLR')
+     self basicNew phoneticStringsFor:'mueller' #('MYLR')
+     self basicNew phoneticStringsFor:'möller' #('MYLR')
+     self basicNew phoneticStringsFor:'miller' #('MYLR')
+     self basicNew phoneticStringsFor:'muller' #('MULR')
+     self basicNew phoneticStringsFor:'muler' #('MULR')
+
+     self basicNew phoneticStringsFor:'schmidt' #('CMYD')
+     self basicNew phoneticStringsFor:'schneider' #('CNAYDR')
+     self basicNew phoneticStringsFor:'fischer' #('VYCR')
+     self basicNew phoneticStringsFor:'weber' #('VBR')
+     self basicNew phoneticStringsFor:'weeber' #('VBR')
+     self basicNew phoneticStringsFor:'webber' #('VBR')
+     self basicNew phoneticStringsFor:'wepper' #('VBR')
+
+     self basicNew phoneticStringsFor:'meyer' #('MAYR')
+     self basicNew phoneticStringsFor:'maier' #('MAYR')
+     self basicNew phoneticStringsFor:'mayer' #('MAYR')
+     self basicNew phoneticStringsFor:'mayr' #('MAYR')
+     self basicNew phoneticStringsFor:'meir' #('MAYR')
+
+     self basicNew phoneticStringsFor:'wagner' #('VACNR')
+     self basicNew phoneticStringsFor:'schulz' #('CULC')
+     self basicNew phoneticStringsFor:'becker' #('BCR')
+     self basicNew phoneticStringsFor:'hoffmann' #('OVMAN')
+     self basicNew phoneticStringsFor:'haus' #('AUS')
+
+     self basicNew phoneticStringsFor:'schäfer' #('CVR')
+     self basicNew phoneticStringsFor:'scheffer' #('CVR')
+     self basicNew phoneticStringsFor:'schaeffer' #('CVR')
+     self basicNew phoneticStringsFor:'schaefer' #('CVR')
+    "
+
+    "Created: / 28-07-2017 / 15:38:08 / cg"
+! !
+
+!PhoneticStringUtilities::Caverphone2StringComparator class methodsFor:'documentation'!
+
+documentation
+"
+ Caverphone (2) Algorithm:
+
+ see http://caversham.otago.ac.nz/files/working/ctp150804.pdf
+
+ Caverphone 2.0 is being made available for free use for the benefit of anyone who has a use for it,
+ with the proviso that the Caversham Project at the University of Otago should be acknowledged as the
+ original source (which is hereby done ;-).
+
+ • Start with a Surname or Firstname
+ • Convert to lowercase
+ This coding system is case sensitive, implementations should acknowledge that a is not the same as A
+ • Remove anything not A-Z
+ The main intention of this is to remove spaces, hyphens, and apostrophes.
+ example: o'brian becomes obrian
+ • If the name starts with cough make it cou2f
+ 2 is being used as a temporary placeholder to indicate a consonant which we are no longer interested in.
+ • If the name starts with rough make it rou2f
+ • If the name starts with tough make it tou2f
+ • If the name starts with enough make it enou2f
+ • If the name starts with gn make it 2n
+ • If the name ends with mb make it m2
+ • replace cq with 2q
+ • replace ci with si
+ • replace ce with se
+ • replace cy with sy
+ • replace tch with 2ch
+ • replace c with k
+ • replace q with k
+ • replace x with k
+ • replace v with f
+ • replace dg with 2g
+ • replace tio with sio
+ • replace tia with sia
+ • replace d with t
+ • replace ph with fh
+ • replace b with p
+ • replace sh with s2
+ • replace z with s
+ • replace an initial vowel with an A
+ • replace all other vowels with a 3
+ 3 is a temporary placeholder marking a vowel
+ • replace 3gh3 with 3kh3
+ Exceptions are dealt with before the general case. gh between vowels is an exception to the more general gh rule.
+ • replace gh with 22
+ • replace g with k
+ • replace groups of the letter s with a S
+ Continuous strings of s are replaced by a single S
+ • replace groups of the letter t with a T
+ • replace groups of the letter p with a P
+ • replace groups of the letter k with a K
+ • replace groups of the letter f with a F
+ • replace groups of the letter m with a M
+ • replace groups of the letter n with a N
+ • replace w3 with W3
+ • replace wy with Wy
+ • replace wh3 with Wh3
+ • replace why with Why
+ • replace w with 2
+ • replace an initial h with an A
+ • replace all other occurrences of h with a 2
+ • replace r3 with R3
+ • replace ry with Ry
+ • replace r with 2
+ • replace l3 with L3
+ • replace ly with Ly
+ • replace l with 2
+ • replace j with y
+ • replace y3 with Y3
+ • replace y with 2
+ • remove all 2s
+ • remove all 3s
+ • put six (v1) / ten (v2) 1s on the end
+ • take the first six characters as the code (caverphone 1);
+ / take the first ten characters as the code (caverphone 2);
+
+ self new encode:'david' -> 'TFT1111111'
+ self new encode:'whittle' -> 'WTA1111111'
+
+ self new encode:'Stevenson' -> 'STFNSN1111'
+ self new encode:'Peter' -> 'PTA1111111'
+
+ self new encode:'washington' -> 'WSNKTN1111'
+ self new encode:'lee' -> 'LA11111111'
+ self new encode:'Gutierrez' -> 'KTRS111111'
+ self new encode:'Pfister' -> 'PFSTA11111'
+ self new encode:'Jackson' -> 'YKSN111111'
+ self new encode:'Tymczak' -> 'TMKSK11111'
+
+ self new encode:'add' -> 'AT11111111'
+ self new encode:'aid' -> 'AT11111111'
+ self new encode:'at' -> 'AT11111111'
+ self new encode:'art' -> 'AT11111111'
+ self new encode:'earth' -> 'AT11111111'
+ self new encode:'head' -> 'AT11111111'
+ self new encode:'old' -> 'AT11111111'
+
+ self new encode:'ready' -> 'RTA1111111'
+ self new encode:'rather' -> 'RTA1111111'
+ self new encode:'able' -> 'APA1111111'
+ self new encode:'appear' -> 'APA1111111'
+
+ self new encode:'Deedee' -> 'TTA1111111'
+"
+! !
+
+!PhoneticStringUtilities::Caverphone2StringComparator methodsFor:'api'!
+
+encode:word
+    "Answer the ten character Caverphone 2 code of word.
+     An empty word yields '1111111111'.
+     The word is lowercased and reduced to its letters; then the rule table
+     below is applied top to bottom. Each rule is a triple
+        pattern replacement repeat
+     where a pattern starting with ^ only matches at the beginning of the
+     text, a pattern ending with $ only at its end, and any other pattern
+     anywhere in the text. With repeat being true, all occurrences are
+     replaced, otherwise only one.
+     Finally the result is padded with 1s and cut to ten characters."
+
+    |txt|
+
+    word size == 0 ifTrue:[^ '1111111111' ].
+
+    "/ 1. Convert to lowercase
+    txt := word asLowercase.
+
+    "/ 2. Remove anything not A-Z
+    txt := txt select:#isLetter.
+
+    #(
+        "/ oldSeq newSeq repeat
+
+        "/ 2.5. Remove final e
+        'e$'        ''          false
+        "/ 3. Handle various start options
+        '^cough'    'cou2f'     false
+        '^rough'    'rou2f'     false
+        '^tough'    'tou2f'     false
+        '^enough'   'enou2f'    false
+        '^trough'   'trou2f'    false
+
+        '^gn'       '2n'        false
+        'mb$'       'm2'        false
+
+        "/ 4. Handle replacements
+        'cq'        '2q'        true
+        'ci'        'si'        true
+        'ce'        'se'        true
+        'cy'        'sy'        true
+        'tch'       '2ch'       true
+        'c'         'k'         true
+        'q'         'k'         true
+        'x'         'k'         true
+        'v'         'f'         true
+        'dg'        '2g'        true
+        'tio'       'sio'       true
+        'tia'       'sia'       true
+        'd'         't'         true
+        'ph'        'fh'        true
+        'b'         'p'         true
+        'sh'        's2'        true
+        'z'         's'         true
+
+        '^a'        'A'         false
+        '^e'        'A'         false
+        '^i'        'A'         false
+        '^o'        'A'         false
+        '^u'        'A'         false
+
+        'a'         '3'         true
+        'e'         '3'         true
+        'i'         '3'         true
+        'o'         '3'         true
+        'u'         '3'         true
+        'j'         'y'         true
+
+        '^y3'       'Y3'        false
+        '^y'        'A'         false
+
+        'y'         '3'         true
+        '3gh3'      '3kh3'      true
+        'gh'        '22'        true
+        'g'         'k'         true
+        's'         'S'         true
+        'SS'        'S'         true
+        't'         'T'         true
+        'TT'        'T'         true
+        'p'         'P'         true
+        'PP'        'P'         true
+        'k'         'K'         true
+        'KK'        'K'         true
+        'f'         'F'         true
+        'FF'        'F'         true
+        'm'         'M'         true
+        'MM'        'M'         true
+        'n'         'N'         true
+        'NN'        'N'         true
+        'w3'        'W3'        true
+        'wh3'       'Wh3'       true
+        'w$'        '3'         false
+        'w'         '2'         true
+        '^h'        'A'         false
+        'h'         '2'         true
+        'r3'        'R3'        true
+        'r$'        '3'         false
+        'r'         '2'         true
+        'l3'        'L3'        true
+        'l$'        '3'         false
+        'l'         '2'         true
+
+        "/ 5. removals
+
+        '2'         ''          true
+        '3$'        'A'         true
+        '3'         ''          true
+    ) inGroupsOf:3 do:[:pat :repl :repeat|
+        |s txtBefore|
+
+        txtBefore := txt.
+        (pat startsWith:$^) ifTrue:[
+            s := pat copyButFirst.
+            repeat ifTrue:[
+                [txt startsWith:s] whileTrue:[ txt := repl,(txt copyButFirst:s size) ]
+            ] ifFalse:[
+                (txt startsWith:s) ifTrue:[ txt := repl,(txt copyButFirst:s size) ]
+            ].
+        ] ifFalse:[
+            (pat endsWith:$$) ifTrue:[
+                s := pat copyButLast.
+                repeat ifTrue:[
+                    [txt endsWith:s] whileTrue:[ txt := (txt copyButLast:s size),repl ]
+                ] ifFalse:[
+                    (txt endsWith:s) ifTrue:[ txt := (txt copyButLast:s size),repl ]
+                ]
+            ] ifFalse:[
+                repeat ifTrue:[
+                    txt := txt copyReplaceAllSubcollections:pat with:repl.
+                    "/ the collapse rules ('SS' -> 'S' etc.) stand for 'groups of
+                    "/ the letter'; one replacement pass only halves a run
+                    "/ ('SSS' -> 'SS'), so repeat until no doubled letter is left.
+                    "/ Restricted to those rules on purpose: rescanning the other
+                    "/ patterns (e.g. '3gh3') could change their result.
+                    pat = (repl , repl) ifTrue:[
+                        [ (txt indexOfSubCollection:pat startingAt:1) > 0 ] whileTrue:[
+                            txt := txt copyReplaceAllSubcollections:pat with:repl
+                        ]
+                    ]
+                ] ifFalse:[
+                    txt := txt copyReplaceSubcollection:pat with:repl
+                ]
+            ]
+        ].
+"/        txt ~= txtBefore ifTrue:[
+"/            Transcript showCR:(pat,' | ',repl,' -> ',txt).
+"/        ].
+    ].
+
+    "/ 6. put ten 1s on the end
+    txt := txt,'1111111111'.
+
+    "/ 7. take the first ten characters as the code
+    ^ txt copyTo:10
+
+    "
+     self new encode:'david' -> 'TFT1111111'
+     self new encode:'whittle' -> 'WTA1111111'
+
+     self new encode:'Stevenson' -> 'STFNSN1111'
+     self new encode:'Peter' -> 'PTA1111111'
+
+     self new encode:'washington' -> 'WSNKTN1111'
+     self new encode:'lee' -> 'LA11111111'
+     self new encode:'Gutierrez' -> 'KTRS111111'
+     self new encode:'Pfister' -> 'PFSTA11111'
+     self new encode:'Jackson' -> 'YKSN111111'
+     self new encode:'Tymczak' -> 'TMKSK11111'
+
+     self new encode:'add' -> 'AT11111111'
+     self new encode:'aid' -> 'AT11111111'
+     self new encode:'at' -> 'AT11111111'
+     self new encode:'art' -> 'AT11111111'
+     self new encode:'earth' -> 'AT11111111'
+     self new encode:'head' -> 'AT11111111'
+     self new encode:'old' -> 'AT11111111'
+
+     self new encode:'ready' -> 'RTA1111111'
+     self new encode:'rather' -> 'RTA1111111'
+     self new encode:'able' -> 'APA1111111'
+     self new encode:'appear' -> 'APA1111111'
+
+     self new encode:'Deedee' -> 'TTA1111111'
+
+     self new encode:'asssa' -> 'ASA1111111'
+    "
+
+    "Created: / 28-07-2017 / 15:21:23 / cg"
+    "Modified: / 02-08-2017 / 01:42:35 / cg"
+! !
+
!PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator class methodsFor:'documentation'!
documentation
@@ -3531,19 +5359,19 @@
self new encode:'Tymczak' -> 'T522'
notice:
- MiracodeStringComparator new
- encode:'Ashcraft' -> 'A261'
- SoundexStringComparator
- new encode:'Ashcraft' -> 'A226'
+ MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
+ SoundexStringComparator new encode:'Ashcraft' -> 'A226'
see also:
https://www.archives.gov/research/census/soundex.html
"
! !
-!PhoneticStringUtilities::MiracodeStringComparator methodsFor:'api'!
+!PhoneticStringUtilities::MiracodeStringComparator methodsFor:'private'!
encode:word
+ "same as inherited, but cares for W and H"
+
|u p t prevCode|
u := word asUppercase.
@@ -3566,22 +5394,8 @@
].
^ (p copyFrom:1 to:4)
- "
- self new encode:'washington' -> 'W252'
- self new encode:'lee' -> 'L000'
- self new encode:'Gutierrez' -> 'G362'
- self new encode:'Pfister' -> 'P236'
- self new encode:'Jackson' -> 'J250'
- self new encode:'Tymczak' -> 'T522'
- "
-
- "notice:
- MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
- self new encode:'Ashcraft' -> 'A226'
- "
-
- "Created: / 28-07-2017 / 15:23:16 / cg"
- "Modified (comment): / 01-08-2017 / 19:01:51 / cg"
+ "Created: / 02-08-2017 / 00:19:47 / cg"
+ "Modified (comment): / 02-08-2017 / 14:30:47 / cg"
! !
!PhoneticStringUtilities::SpanishPhoneticCodeStringComparator class methodsFor:'documentation'!