author | Claus Gittinger <cg@exept.de> |
Sat, 02 May 2020 21:40:13 +0200 | |
changeset 5476 | 7355a4b11cb6 |
parent 5456 | 3040ec2b4531 |
permissions | -rw-r--r-- |
4488 | 1 |
"{ Encoding: utf8 }" |
2 |
||
2197 | 3 |
" |
4 |
COPYRIGHT (c) 1994 by Claus Gittinger |
|
5 |
COPYRIGHT (c) 2009 by eXept Software AG |
|
6 |
All Rights Reserved |
|
7 |
||
8 |
This software is furnished under a license and may be used |
|
9 |
only in accordance with the terms of that license and with the |
|
10 |
inclusion of the above copyright notice. This software may not |
|
11 |
be provided or otherwise made available to, or used by, any |
|
12 |
other person. No title to or ownership of the software is |
|
13 |
hereby transferred. |
|
14 |
" |
|
15 |
"{ Package: 'stx:libbasic2' }" |
|
16 |
||
3488
5a69e672d7f8
class: PhoneticStringUtilities
Claus Gittinger <cg@exept.de>
parents:
3185
diff
changeset
|
17 |
"{ NameSpace: Smalltalk }" |
5a69e672d7f8
class: PhoneticStringUtilities
Claus Gittinger <cg@exept.de>
parents:
3185
diff
changeset
|
18 |
|
2197 | 19 |
Object subclass:#PhoneticStringUtilities |
20 |
instanceVariableNames:'' |
|
21 |
classVariableNames:'' |
|
22 |
poolDictionaries:'' |
|
23 |
category:'Collections-Text-Support' |
|
24 |
! |
|
25 |
||
2208 | 26 |
Object subclass:#PhoneticStringComparator |
27 |
instanceVariableNames:'' |
|
28 |
classVariableNames:'' |
|
29 |
poolDictionaries:'' |
|
30 |
privateIn:PhoneticStringUtilities |
|
31 |
! |
|
32 |
||
4491 | 33 |
PhoneticStringUtilities::PhoneticStringComparator subclass:#DaitchMokotoffStringComparator |
34 |
instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex |
|
35 |
currentIndex skipCount' |
|
36 |
classVariableNames:'' |
|
37 |
poolDictionaries:'' |
|
38 |
privateIn:PhoneticStringUtilities |
|
39 |
! |
|
40 |
||
41 |
PhoneticStringUtilities::PhoneticStringComparator subclass:#DoubleMetaphoneStringComparator |
|
42 |
instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex |
|
43 |
currentIndex skipCount' |
|
44 |
classVariableNames:'' |
|
45 |
poolDictionaries:'' |
|
46 |
privateIn:PhoneticStringUtilities |
|
47 |
! |
|
48 |
||
2211 | 49 |
PhoneticStringUtilities::PhoneticStringComparator subclass:#ExtendedSoundexStringComparator |
50 |
instanceVariableNames:'' |
|
51 |
classVariableNames:'CharacterTranslationDict' |
|
52 |
poolDictionaries:'' |
|
53 |
privateIn:PhoneticStringUtilities |
|
54 |
! |
|
55 |
||
4488 | 56 |
PhoneticStringUtilities::PhoneticStringComparator subclass:#SingleResultPhoneticStringComparator |
57 |
instanceVariableNames:'' |
|
58 |
classVariableNames:'' |
|
59 |
poolDictionaries:'' |
|
60 |
privateIn:PhoneticStringUtilities |
|
61 |
! |
|
62 |
||
63 |
PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#MRAStringComparator |
|
2208 | 64 |
instanceVariableNames:'' |
65 |
classVariableNames:'CharacterTranslationDict' |
|
66 |
poolDictionaries:'' |
|
67 |
privateIn:PhoneticStringUtilities |
|
68 |
! |
|
69 |
||
4491 | 70 |
PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#MetaphoneStringComparator |
71 |
instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex |
|
72 |
currentIndex skipCount' |
|
73 |
classVariableNames:'' |
|
74 |
poolDictionaries:'' |
|
75 |
privateIn:PhoneticStringUtilities |
|
76 |
! |
|
77 |
||
4488 | 78 |
PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#SoundexStringComparator |
2208 | 79 |
instanceVariableNames:'' |
80 |
classVariableNames:'CharacterTranslationDict' |
|
81 |
poolDictionaries:'' |
|
82 |
privateIn:PhoneticStringUtilities |
|
83 |
! |
|
84 |
||
85 |
PhoneticStringUtilities::SoundexStringComparator subclass:#MySQLSoundexStringComparator |
|
86 |
instanceVariableNames:'' |
|
87 |
classVariableNames:'' |
|
88 |
poolDictionaries:'' |
|
89 |
privateIn:PhoneticStringUtilities |
|
90 |
! |
|
91 |
||
4488 | 92 |
PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#NYSIISStringComparator |
2208 | 93 |
instanceVariableNames:'' |
94 |
classVariableNames:'' |
|
95 |
poolDictionaries:'' |
|
96 |
privateIn:PhoneticStringUtilities |
|
97 |
! |
|
98 |
||
4488 | 99 |
PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#PhonemStringComparator |
2211 | 100 |
instanceVariableNames:'' |
101 |
classVariableNames:'CharacterTranslationDict' |
|
102 |
poolDictionaries:'' |
|
103 |
privateIn:PhoneticStringUtilities |
|
104 |
! |
|
105 |
||
4491 | 106 |
PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#Caverphone2StringComparator |
107 |
instanceVariableNames:'' |
|
108 |
classVariableNames:'CharacterTranslationDict' |
|
2208 | 109 |
poolDictionaries:'' |
110 |
privateIn:PhoneticStringUtilities |
|
111 |
! |
|
112 |
||
4488 | 113 |
PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#KoelnerPhoneticCodeStringComparator |
114 |
instanceVariableNames:'' |
|
115 |
classVariableNames:'CharacterTranslationDict' |
|
116 |
poolDictionaries:'' |
|
117 |
privateIn:PhoneticStringUtilities |
|
118 |
! |
|
119 |
||
2208 | 120 |
PhoneticStringUtilities::SoundexStringComparator subclass:#MiracodeStringComparator |
121 |
instanceVariableNames:'' |
|
122 |
classVariableNames:'' |
|
123 |
poolDictionaries:'' |
|
124 |
privateIn:PhoneticStringUtilities |
|
125 |
! |
|
126 |
||
4489 | 127 |
PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#SpanishPhoneticCodeStringComparator |
128 |
instanceVariableNames:'' |
|
129 |
classVariableNames:'CharacterTranslationDict' |
|
130 |
poolDictionaries:'' |
|
131 |
privateIn:PhoneticStringUtilities |
|
132 |
! |
|
133 |
||
2197 | 134 |
!PhoneticStringUtilities class methodsFor:'documentation'! |
135 |
||
136 |
copyright |
|
137 |
" |
|
138 |
COPYRIGHT (c) 1994 by Claus Gittinger |
|
139 |
COPYRIGHT (c) 2009 by eXept Software AG |
|
140 |
All Rights Reserved |
|
141 |
||
142 |
This software is furnished under a license and may be used |
|
143 |
only in accordance with the terms of that license and with the |
|
144 |
inclusion of the above copyright notice. This software may not |
|
145 |
be provided or otherwise made available to, or used by, any |
|
146 |
other person. No title to or ownership of the software is |
|
147 |
hereby transferred. |
|
148 |
" |
|
149 |
! |
|
150 |
||
151 |
documentation |
|
152 |
" |
|
2445 | 153 |
Utilities which are helpful to perform phonetic string searches or comparisons. |
154 |
These are all variations or improvements of the soundex algorithm, which usually fails |
|
155 |
to provide good results for non-english languages. |
|
2285 | 156 |
|
2208 | 157 |
soundexCode |
158 |
this algorithm was originally contained in the CharacterArray class; |
|
159 |
||
160 |
nysiis |
|
161 |
a modified soundex algorithm |
|
162 |
||
2209 | 163 |
miracode |
164 |
another modified soundex algorithm ('american soundex') used in the 1880 census. |
|
165 |
||
166 |
mySQLSoundex |
|
167 |
another modified soundex algorithm used in mySQL. |
|
168 |
||
2208 | 169 |
koelner phoneticCode |
170 |
provides a functionality similar to soundex, but much more tuned towards the German language |
|
171 |
||
172 |
Double metaphone |
|
173 |
works with most european languages. |
|
2211 | 174 |
|
175 |
phonem |
|
176 |
described in Georg Wilde and Carsten Meyer, 'Doppelgaenger gesucht - Ein Programm fuer kontextsensitive phonetische Textumwandlung' |
|
177 |
from 'ct Magazin fuer Computer & Technik 25/1999'. |
|
178 |
||
4491 | 179 |
mra |
180 |
Match Rating Approach Phonetic Algorithm Developed by Western Airlines in 1977. |
|
181 |
||
182 |
caverphone2 |
|
183 |
better than soundex |
|
184 |
||
185 |
spanish phonetic code |
|
186 |
an algorithm slightly adjusted to spanish names |
|
187 |
||
2211 | 188 |
More info for german readers is found in: |
189 |
http://www.uni-koeln.de/phil-fak/phonetik/Lehre/MA-Arbeiten/magister_wilz.pdf |
|
190 |
" |
|
191 |
! |
|
192 |
||
193 |
sampleData |
|
194 |
" |
|
195 |
for the 50 most common german names, we get: |
|
196 |
||
197 |
ext. |
|
4491 | 198 |
name soundex soundex metaphone phonet phonet2 phonix daitsch phonem koeln caverphone2 mra |
199 |
||
200 |
müller M460 54600000 MLR MÜLA NILA M4000000 689000 MYLR 657 MLA1111111 MLR |
|
201 |
schmidt S530 25300000 SKMTT SHMIT ZNIT S5300000 463000 CMYD 862 SKMT111111 SCHMDT |
|
202 |
schneider S536 25360000 SKNTR SHNEIDA ZNEITA S5300000 463900 CNAYDR 8627 SKNTA11111 SCHNDR |
|
203 |
fischer F260 12600000 FSKR FISHA FIZA F8000000 749000 VYCR 387 FSKA111111 FSCHR |
|
204 |
weber W160 16000000 WBR WEBA FEBA $1000000 779000 VBR 317 WPA1111111 WBR |
|
205 |
meyer M600 56000000 MYR MEIA NEIA M0000000 619000 MAYR 67 MA11111111 MYR |
|
206 |
wagner W256 25600000 WKNR WAKNA FAKNA $2500000 756900 VACNR 3467 WKNA111111 WGNR |
|
207 |
schulz S420 24200000 SKLS SHULS ZULZ S4800000 484000 CULC 858 SKS1111111 SCHLZ |
|
208 |
becker B260 12600000 BKR BEKA BEKA B2000000 759000 BCR 147 PKA1111111 BCKR |
|
209 |
hoffmann H155 15500000 HFMN HOFMAN UFNAN $7550000 576600 OVMAN 036 AFMN111111 HFMN |
|
210 |
schäfer S16ß 21600000 SKFR SHEFA ZEFA S7000000 479000 CVR 837 SKFA111111 SCHFR |
|
211 |
||
212 |
|cls| |
|
213 |
||
214 |
cls := MRAStringComparator. |
|
215 |
cls := SoundexStringComparator. |
|
216 |
cls := KoelnerPhoneticCodeStringComparator. |
|
217 |
cls := Caverphone2StringComparator. |
|
218 |
#('müller' 'schmidt' 'schneider' 'fischer' 'weber' 'meyer' |
|
219 |
'wagner' 'schulz' 'becker' 'hoffmann' 'schäfer') |
|
220 |
do:[:name | |
|
221 |
Transcript show:''''; show:name; show:''' -> '''; show:(cls encode:name); showCR:''''. |
|
222 |
]. |
|
223 |
||
224 |
KoelnerPhoneticCodeStringComparator encode:'Müller-Lüdenscheidt' -> '65752682' |
|
2197 | 225 |
" |
226 |
! ! |
|
227 |
||
228 |
!PhoneticStringUtilities class methodsFor:'phonetic codes'! |
|
229 |
||
230 |
koelnerPhoneticCodeOf:aString |
|
231 |
"return a koelner phonetic code. |
|
232 |
The koelnerPhonetic code is for the german language what the soundex code is for english; |
|
233 |
it returns similar strings for similar sounding words. |
|
234 |
There are some differences to soundex, though: |
|
235 |
its length is not limited to 4, but depends on the length of the original string; |
|
2207 | 236 |
it does not start with the first character of the input. |
237 |
This algorithm is described by Postel 1969" |
|
2197 | 238 |
|
2209 | 239 |
^ (KoelnerPhoneticCodeStringComparator new phoneticStringsFor:aString) first |
2197 | 240 |
|
241 |
" |
|
242 |
#( |
|
4488 | 243 |
'Müller' |
2197 | 244 |
'Miller' |
245 |
'Mueller' |
|
4488 | 246 |
'Mühler' |
247 |
'Mühlherr' |
|
248 |
'Mülherr' |
|
2197 | 249 |
'Myler' |
250 |
'Millar' |
|
251 |
'Myller' |
|
4488 | 252 |
'Müllar' |
253 |
'Müler' |
|
2197 | 254 |
'Muehler' |
4488 | 255 |
'Mülller' |
256 |
'Müllerr' |
|
2197 | 257 |
'Muehlherr' |
258 |
'Muellar' |
|
259 |
'Mueler' |
|
4488 | 260 |
'Mülleer' |
2197 | 261 |
'Mueller' |
4488 | 262 |
'Nüller' |
2197 | 263 |
'Nyller' |
264 |
'Niler' |
|
265 |
'Czerny' |
|
266 |
'Tscherny' |
|
267 |
'Czernie' |
|
268 |
'Tschernie' |
|
269 |
'Schernie' |
|
270 |
'Scherny' |
|
271 |
'Scherno' |
|
272 |
'Czerne' |
|
273 |
'Zerny' |
|
274 |
'Tzernie' |
|
275 |
'Breschnew' |
|
276 |
) do:[:w | |
|
277 |
Transcript show:w; show:'->'; showCR:(PhoneticStringUtilities koelnerPhoneticCodeOf:w) |
|
278 |
]. |
|
279 |
" |
|
280 |
||
281 |
" |
|
2209 | 282 |
PhoneticStringUtilities koelnerPhoneticCodeOf:'Breschnew'. '17863'. |
283 |
PhoneticStringUtilities koelnerPhoneticCodeOf:'Breschneff'. '17863'. |
|
284 |
PhoneticStringUtilities koelnerPhoneticCodeOf:'Braeschneff'. '17863'. |
|
285 |
PhoneticStringUtilities koelnerPhoneticCodeOf:'Braessneff'. '17863'. |
|
286 |
PhoneticStringUtilities koelnerPhoneticCodeOf:'Pressneff'. '17863'. |
|
4488 | 287 |
PhoneticStringUtilities koelnerPhoneticCodeOf:'Presznäph'. '17863'. |
2209 | 288 |
PhoneticStringUtilities koelnerPhoneticCodeOf:'Preschnjiev'. '17863'. |
289 |
" |
|
290 |
! |
|
291 |
||
4488 | 292 |
miracodeCodeOf:aString |
293 |
"return a miracode soundex phonetic code or nil. |
|
294 |
Miracode is a slightly modified soundex algorithm. |
|
295 |
Notice that there are better algorithms around (doubleMetaphone) " |
|
296 |
||
297 |
^ (MiracodeStringComparator new phoneticStringsFor:aString) first |
|
298 |
||
299 |
" |
|
300 |
PhoneticStringUtilities miracodeCodeOf:'claus' |
|
301 |
PhoneticStringUtilities miracodeCodeOf:'clause' |
|
302 |
PhoneticStringUtilities miracodeCodeOf:'close' |
|
303 |
PhoneticStringUtilities miracodeCodeOf:'smalltalk' |
|
304 |
PhoneticStringUtilities miracodeCodeOf:'smaltalk' |
|
305 |
PhoneticStringUtilities miracodeCodeOf:'smaltak' |
|
306 |
PhoneticStringUtilities miracodeCodeOf:'smaltok' |
|
307 |
PhoneticStringUtilities miracodeCodeOf:'smoltok' |
|
308 |
PhoneticStringUtilities miracodeCodeOf:'aa' |
|
309 |
PhoneticStringUtilities miracodeCodeOf:'by' |
|
310 |
PhoneticStringUtilities miracodeCodeOf:'bab' |
|
311 |
PhoneticStringUtilities miracodeCodeOf:'bob' |
|
312 |
PhoneticStringUtilities miracodeCodeOf:'bop' |
|
313 |
PhoneticStringUtilities miracodeCodeOf:'pub' |
|
314 |
" |
|
315 |
||
316 |
"Created: / 28-07-2017 / 15:32:41 / cg" |
|
317 |
! |
|
318 |
||
2209 | 319 |
mySQLSoundexCodeOf:aString |
320 |
"return the mySQL soundex code. The mysql soundex code is different from the miracode 'american' soundex |
|
4488 | 321 |
(no 4char limitation; different order of duplicate vowel vs. duplicate code elimination). |
322 |
Notice that there are better algorithms around (doubleMetaphone) " |
|
2209 | 323 |
|
324 |
^ (MySQLSoundexStringComparator new phoneticStringsFor:aString) first |
|
325 |
||
326 |
" |
|
327 |
#( |
|
4488 | 328 |
'Müller' |
2209 | 329 |
'Miller' |
330 |
'Mueller' |
|
4488 | 331 |
'Mühler' |
332 |
'Mühlherr' |
|
333 |
'Mülherr' |
|
2209 | 334 |
'Myler' |
335 |
'Millar' |
|
336 |
'Myller' |
|
4488 | 337 |
'Müllar' |
338 |
'Müler' |
|
2209 | 339 |
'Muehler' |
4488 | 340 |
'Mülller' |
341 |
'Müllerr' |
|
2209 | 342 |
'Muehlherr' |
343 |
'Muellar' |
|
344 |
'Mueler' |
|
4488 | 345 |
'Mülleer' |
2209 | 346 |
'Mueller' |
4488 | 347 |
'Nüller' |
2209 | 348 |
'Nyller' |
349 |
'Niler' |
|
350 |
'Czerny' |
|
351 |
'Tscherny' |
|
352 |
'Czernie' |
|
353 |
'Tschernie' |
|
354 |
'Schernie' |
|
355 |
'Scherny' |
|
356 |
'Scherno' |
|
357 |
'Czerne' |
|
358 |
'Zerny' |
|
359 |
'Tzernie' |
|
360 |
'Breschnew' |
|
361 |
) do:[:w | |
|
362 |
Transcript show:w; show:'->'; showCR:(PhoneticStringUtilities mySQLSoundexCodeOf:w) |
|
363 |
]. |
|
364 |
" |
|
365 |
||
366 |
" |
|
367 |
PhoneticStringUtilities mySQLSoundexCodeOf:'Breschnew'. |
|
368 |
PhoneticStringUtilities mySQLSoundexCodeOf:'Breschneff'. |
|
369 |
PhoneticStringUtilities mySQLSoundexCodeOf:'Braeschneff'. |
|
370 |
PhoneticStringUtilities mySQLSoundexCodeOf:'Braessneff'. |
|
371 |
PhoneticStringUtilities mySQLSoundexCodeOf:'Pressneff'. |
|
4488 | 372 |
PhoneticStringUtilities mySQLSoundexCodeOf:'Presznäph'. |
2209 | 373 |
PhoneticStringUtilities mySQLSoundexCodeOf:'Preschnjiev'. |
2197 | 374 |
" |
4488 | 375 |
|
376 |
"Modified (comment): / 28-07-2017 / 15:34:03 / cg" |
|
2197 | 377 |
! |
378 |
||
379 |
soundexCodeOf:aString |
|
380 |
"return a soundex phonetic code or nil. |
|
2207 | 381 |
Soundex (1918, 1922) returns similar codes for similar sounding words, making it a useful |
2197 | 382 |
tool when searching for words where the correct spelling is unknown. |
4194 | 383 |
(read Knuth or search the web if you don't know what a soundex code is). |
4488 | 384 |
Caveat: 'similar sounding words' means: 'similar sounding in english'. |
385 |
Notice that there are better algorithms around (doubleMetaphone) " |
|
2197 | 386 |
|
2210 | 387 |
^ (SoundexStringComparator new phoneticStringsFor:aString) first |
2197 | 388 |
|
2210 | 389 |
"/ old code - now use code in private class... |
390 |
"/ |inStream codeStream ch last lch codeLength codes code lastCode| |
|
391 |
"/ |
|
392 |
"/ inStream := aString readStream. |
|
393 |
"/ inStream skipSeparators. |
|
394 |
"/ inStream atEnd ifTrue:[ |
|
395 |
"/ ^ nil |
|
396 |
"/ ]. |
|
397 |
"/ |
|
398 |
"/ ch := inStream next. |
|
399 |
"/ ch isLetter ifFalse:[ |
|
400 |
"/ ^ nil |
|
401 |
"/ ]. |
|
402 |
"/ codeLength := 0. |
|
403 |
"/ |
|
404 |
"/ codes := Dictionary new. |
|
405 |
"/ codes atAll:'bpfv' put:$1. |
|
406 |
"/ codes atAll:'cskgjqxz' put:$2. |
|
407 |
"/ codes atAll:'dt' put:$3. |
|
408 |
"/ codes atAll:'l' put:$4. |
|
409 |
"/ codes atAll:'nm' put:$5. |
|
410 |
"/ codes atAll:'r' put:$6. |
|
411 |
"/ |
|
412 |
"/ codeStream := WriteStream on:(String new:4). |
|
413 |
"/ codeStream nextPut:(ch asUppercase). |
|
414 |
"/ last := ch asLowercase. |
|
415 |
"/ lastCode := codes at:last ifAbsent:nil. |
|
416 |
"/ |
|
417 |
"/ [inStream atEnd] whileFalse:[ |
|
418 |
"/ ch := inStream next. |
|
419 |
"/ lch := ch asLowercase. |
|
420 |
"/ lch = last ifFalse:[ |
|
421 |
"/ last := lch. |
|
422 |
"/ |
|
423 |
"/ code := codes at:lch ifAbsent:nil. |
|
424 |
"/ (code notNil and:[ code ~= lastCode]) ifTrue:[ |
|
425 |
"/ codeLength < 3 ifTrue:[ |
|
426 |
"/ codeStream nextPut:code. |
|
427 |
"/ codeLength := codeLength + 1. |
|
428 |
"/ codeLength > 3 ifTrue:[^ codeStream contents]. |
|
429 |
"/ ]. |
|
430 |
"/ ]. |
|
431 |
"/ lastCode := code. |
|
432 |
"/ ] |
|
433 |
"/ ]. |
|
434 |
"/ [ codeLength < 3 ] whileTrue:[ |
|
435 |
"/ codeStream nextPut:$0. |
|
436 |
"/ codeLength := codeLength + 1. |
|
437 |
"/ ]. |
|
438 |
"/ |
|
439 |
"/ ^ codeStream contents |
|
2197 | 440 |
|
441 |
" |
|
442 |
PhoneticStringUtilities soundexCodeOf:'claus' |
|
443 |
PhoneticStringUtilities soundexCodeOf:'clause' |
|
444 |
PhoneticStringUtilities soundexCodeOf:'close' |
|
445 |
PhoneticStringUtilities soundexCodeOf:'smalltalk' |
|
446 |
PhoneticStringUtilities soundexCodeOf:'smaltalk' |
|
447 |
PhoneticStringUtilities soundexCodeOf:'smaltak' |
|
448 |
PhoneticStringUtilities soundexCodeOf:'smaltok' |
|
449 |
PhoneticStringUtilities soundexCodeOf:'smoltok' |
|
450 |
PhoneticStringUtilities soundexCodeOf:'aa' |
|
451 |
PhoneticStringUtilities soundexCodeOf:'by' |
|
452 |
PhoneticStringUtilities soundexCodeOf:'bab' |
|
453 |
PhoneticStringUtilities soundexCodeOf:'bob' |
|
454 |
PhoneticStringUtilities soundexCodeOf:'bop' |
|
455 |
" |
|
4488 | 456 |
|
457 |
"Modified (comment): / 28-07-2017 / 15:33:53 / cg" |
|
2197 | 458 |
! ! |
459 |
||
3648 | 460 |
!PhoneticStringUtilities class methodsFor:'queries'! |
461 |
||
462 |
isUtilityClass |
"answer true if the receiver is PhoneticStringUtilities itself (identity test); any other receiver, e.g. a subclass, answers false" |
|
463 |
^ self == PhoneticStringUtilities |
|
464 |
! ! |
|
465 |
||
2208 | 466 |
!PhoneticStringUtilities::PhoneticStringComparator class methodsFor:'constant'! |
467 |
||
468 |
defaultClass |
"answer the comparator class offered as the default choice: SoundexStringComparator" |
|
469 |
^SoundexStringComparator |
|
470 |
! ! |
|
471 |
||
3646 | 472 |
!PhoneticStringUtilities::PhoneticStringComparator class methodsFor:'documentation'! |
473 |
||
474 |
documentation |
|
475 |
" |
|
476 |
abstract superclass for various phonetic comparators. |
|
477 |
They return similar strings for similar sounding words, which can be used |
|
478 |
to find similar sounding words in a search list. |
|
479 |
||
480 |
Notice, that some comparators are better for particular languages. |
|
481 |
" |
|
4467 | 482 |
! |
483 |
||
484 |
examples |
|
485 |
" |
|
486 |
PhoneticStringUtilities::SoundexStringComparator new |
|
487 |
does:'miller' soundLike:'miler'. |
|
488 |
||
489 |
PhoneticStringUtilities::SoundexStringComparator new |
|
490 |
does:'miller' soundLike:'milner'. |
|
491 |
||
492 |
PhoneticStringUtilities::SoundexStringComparator new |
|
4488 | 493 |
does:'müller' soundLike:'mueller'. |
4467 | 494 |
|
495 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new |
|
4488 | 496 |
does:'müller' soundLike:'mueller'. |
4467 | 497 |
" |
3646 | 498 |
! ! |
499 |
||
2208 | 500 |
!PhoneticStringUtilities::PhoneticStringComparator class methodsFor:'instance creation'! |
501 |
||
502 |
new |
"answer a new instance: allocate it with basicNew, send it #initialize and answer what that send returns" |
|
503 |
^ self basicNew initialize. |
|
504 |
! ! |
|
505 |
||
3646 | 506 |
!PhoneticStringUtilities::PhoneticStringComparator class methodsFor:'queries'! |
507 |
||
508 |
isAbstract |
"answer true only when the receiver is PhoneticStringComparator itself (identity test); any other receiver, e.g. a subclass, answers false" |
|
509 |
^ self == PhoneticStringUtilities::PhoneticStringComparator |
|
510 |
! ! |
|
511 |
||
4491 | 512 |
!PhoneticStringUtilities::PhoneticStringComparator class methodsFor:'utilities'! |
513 |
||
5236 | 514 |
does:aString soundLike:anotherString |
515 |
"return true, if aString sounds similar to anotherString. Convenience entry point: a new instance of the receiver is created and the question is forwarded to its instance-side does:soundLike:" |
|
516 |
||
517 |
^ self new does:aString soundLike:anotherString. |
|
518 |
||
519 |
" |
|
520 |
PhoneticStringUtilities::SoundexStringComparator does:'miller' soundLike:'miler'. |
|
521 |
||
522 |
PhoneticStringUtilities::SoundexStringComparator does:'miller' soundLike:'milner'. |
|
523 |
||
524 |
PhoneticStringUtilities::SoundexStringComparator does:'müller' soundLike:'mueller'. |
|
525 |
||
526 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator does:'müller' soundLike:'mueller'. |
|
527 |
PhoneticStringUtilities::DoubleMetaphoneStringComparator does:'müller' soundLike:'mueller'. |
|
528 |
" |
|
529 |
! |
|
530 |
||
4491 | 531 |
encode:word |
5236 | 532 |
"return a phonetic encoding for a word: a new instance of the receiver is asked for phoneticStringsFor:word and only the first element of the answer is returned. |
533 |
This can e.g. be used as a key to map/hash similar sounding words" |
|
534 |
||
4491 | 535 |
^ (self new phoneticStringsFor:word) first |
536 |
||
537 |
" |
|
538 |
SoundexStringComparator encode:'Fischer' -> 'F260' |
|
5236 | 539 |
SoundexStringComparator encode:'Fiescher' -> 'F260' |
4491 | 540 |
Caverphone2StringComparator encode:'Fischer' -> 'FSKA111111' |
5236 | 541 |
Caverphone2StringComparator encode:'Fiescher' -> 'FSKA111111' |
4491 | 542 |
MRAStringComparator encode:'Fischer' -> 'FSCHR' |
5236 | 543 |
MRAStringComparator encode:'Fiescher' -> 'FSCHR' |
544 |
SpanishPhoneticCodeStringComparator encode:'Fischer' -> '24429' |
|
545 |
SpanishPhoneticCodeStringComparator encode:'Fiescher' -> '24429' |
|
546 |
DoubleMetaphoneStringComparator encode:'Fischer' -> 'FXR' |
|
547 |
DoubleMetaphoneStringComparator encode:'Fiescher' -> 'FXR' |
|
4491 | 548 |
" |
549 |
||
550 |
"Created: / 02-08-2017 / 01:15:50 / cg" |
|
551 |
! ! |
|
552 |
||
2208 | 553 |
!PhoneticStringUtilities::PhoneticStringComparator methodsFor:'api'! |
554 |
||
555 |
does:aString soundLike:anotherString |
|
5236 | 556 |
"return true, if aString sounds similar to anotherString, i.e. if at least one of the phonetic strings computed for aString is equal to one of those computed for anotherString" |
557 |
||
2208 | 558 |
|translations1 translations2| |
559 |
||
5236 | 560 |
translations1 := self phoneticStringsFor:aString. |
561 |
translations2 := self phoneticStringsFor:anotherString. |
|
2208 | 562 |
|
563 |
^ translations1 contains:[:t1 | |
|
5236 | 564 |
translations2 contains:[:t2 | t1 = t2] |
565 |
] |
|
2208 | 566 |
|
567 |
" |
|
5236 | 568 |
PhoneticStringUtilities::SoundexStringComparator new does:'miller' soundLike:'miler'. |
4467 | 569 |
|
2208 | 570 |
PhoneticStringUtilities::SoundexStringComparator new |
571 |
does:'miller' soundLike:'milner'. |
|
4467 | 572 |
|
573 |
PhoneticStringUtilities::SoundexStringComparator new |
|
4488 | 574 |
does:'müller' soundLike:'mueller'. |
4467 | 575 |
|
576 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new |
|
4488 | 577 |
does:'müller' soundLike:'mueller'. |
2208 | 578 |
" |
4467 | 579 |
|
580 |
"Modified (comment): / 13-07-2017 / 17:51:43 / cg" |
|
2208 | 581 |
! |
582 |
||
583 |
phoneticStringsFor: aString |
|
584 |
"Should answer an array of alternate phonetic strings for the given input string. Abstract in this class (sends subclassResponsibility); concrete comparator subclasses are expected to implement it." |
|
4485 | 585 |
|
2208 | 586 |
self subclassResponsibility |
587 |
||
588 |
" |
|
589 |
(PhoneticStringUtilities::SoundexStringComparator new |
|
4485 | 590 |
phoneticStringsFor:'miller') first |
591 |
||
2208 | 592 |
'miller' asSoundexCode |
593 |
" |
|
4485 | 594 |
|
595 |
"Modified (comment): / 27-07-2017 / 15:07:59 / cg" |
|
2208 | 596 |
! ! |
597 |
||
598 |
!PhoneticStringUtilities::PhoneticStringComparator methodsFor:'initialization'! |
|
599 |
||
600 |
initialize |
|
601 |
"Invoked when a new instance is created (sent by the class-side #new). Does nothing in this class." |
|
602 |
||
603 |
"/ nothing to set up here; this method consists of comments only |
|
604 |
||
605 |
"/ super initialize. -- commented since inherited method does nothing |
|
606 |
! ! |
|
607 |
||
4491 | 608 |
!PhoneticStringUtilities::DaitchMokotoffStringComparator class methodsFor:'documentation'! |
2208 | 609 |
|
610 |
documentation |
|
611 |
" |
|
4491 | 612 |
self encode:'AUERBACH' -> 097400, 097500 |
613 |
||
614 |
Encodes a string into a Daitch-Mokotoff Soundex value. |
|
615 |
The Daitch-Mokotoff Soundex algorithm is a refinement of the Russell and American Soundex algorithms, |
|
616 |
yielding greater accuracy in matching especially Slavic and Yiddish surnames with similar pronunciation |
|
617 |
but differences in spelling. |
|
618 |
||
619 |
The main differences compared to the other soundex variants are: |
|
620 |
- coded names are 6 digits long |
|
621 |
- the initial character of the name is coded |
|
622 |
- rules to encode multi-character n-grams |
|
623 |
- multiple possible encodings for the same name (branching) |
|
624 |
||
625 |
This implementation supports branching, depending on the used method: |
|
626 |
encode:aString - branching disabled, only the first code will be returned |
|
627 |
phoneticStringsFor:String - branching enabled, all codes will be returned, separated by '|' |
|
628 |
||
629 |
[see also:] |
|
630 |
'Wikipedia - Daitch-Mokotoff Soundex' |
|
631 |
http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex |
|
632 |
||
633 |
'Avotaynu - Soundexing and Genealogy' |
|
634 |
http://www.avotaynu.com/soundex.htm |
|
2208 | 635 |
" |
636 |
! |
|
637 |
||
4491 | 638 |
javaCode |
639 |
"<<END |
|
640 |
/* |
|
641 |
* Licensed to the Apache Software Foundation (ASF) under one or more |
|
642 |
* contributor license agreements. See the NOTICE file distributed with |
|
643 |
* this work for additional information regarding copyright ownership. |
|
644 |
* The ASF licenses this file to You under the Apache License, Version 2.0 |
|
645 |
* (the "License"); you may not use this file except in compliance with |
|
646 |
* the License. You may obtain a copy of the License at |
|
647 |
* |
|
648 |
* http://www.apache.org/licenses/LICENSE-2.0 |
|
649 |
* |
|
650 |
* Unless required by applicable law or agreed to in writing, software |
|
651 |
* distributed under the License is distributed on an "AS IS" BASIS, |
|
652 |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
653 |
* See the License for the specific language governing permissions and |
|
654 |
* limitations under the License. |
|
655 |
*/ |
|
656 |
package org.apache.commons.codec.language; |
|
657 |
||
658 |
import org.apache.commons.codec.CharEncoding; |
|
659 |
import org.apache.commons.codec.EncoderException; |
|
660 |
import org.apache.commons.codec.StringEncoder; |
|
661 |
||
662 |
import java.io.InputStream; |
|
663 |
import java.util.*; |
|
664 |
||
665 |
/** |
|
666 |
* Encodes a string into a Daitch-Mokotoff Soundex value. |
|
667 |
* <p> |
|
668 |
* The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater |
|
669 |
* accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling. |
|
670 |
* </p> |
|
671 |
* <p> |
|
672 |
* The main differences compared to the other soundex variants are: |
|
673 |
* </p> |
|
674 |
* <ul> |
|
675 |
* <li>coded names are 6 digits long |
|
676 |
* <li>the initial character of the name is coded |
|
677 |
* <li>rules to encoded multi-character n-grams |
|
678 |
* <li>multiple possible encodings for the same name (branching) |
|
679 |
* </ul> |
|
680 |
* <p> |
|
681 |
* This implementation supports branching, depending on the used method: |
|
682 |
* <ul> |
|
683 |
* <li>{@link #encode(String)} - branching disabled, only the first code will be returned |
|
684 |
* <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|' |
|
685 |
* </ul> |
|
686 |
* <p> |
|
687 |
* Note: this implementation has additional branching rules compared to the original description of the algorithm. The |
|
688 |
* rules can be customized by overriding the default rules contained in the resource file |
|
689 |
* {@code org/apache/commons/codec/language/dmrules.txt}. |
|
690 |
* </p> |
|
691 |
* <p> |
|
692 |
* This class is thread-safe. |
|
693 |
* </p> |
|
694 |
* |
|
695 |
* @see Soundex |
|
696 |
* @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a> |
|
697 |
* @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a> |
|
698 |
* |
|
699 |
* @version $Id$ |
|
700 |
* @since 1.10 |
|
701 |
*/ |
|
702 |
public class DaitchMokotoffSoundex implements StringEncoder { |
|
703 |
||
704 |
/** |
|
705 |
* Inner class representing a branch during DM soundex encoding. |
|
706 |
*/ |
|
707 |
private static final class Branch { |
|
708 |
private final StringBuilder builder; |
|
709 |
private String cachedString; |
|
710 |
private String lastReplacement; |
|
711 |
||
712 |
private Branch() { |
|
713 |
builder = new StringBuilder(); |
|
714 |
lastReplacement = null; |
|
715 |
cachedString = null; |
|
716 |
} |
|
717 |
||
718 |
/** |
|
719 |
* Creates a new branch, identical to this branch. |
|
720 |
* |
|
721 |
* @return a new, identical branch |
|
722 |
*/ |
|
723 |
public Branch createBranch() { |
|
724 |
final Branch branch = new Branch(); |
|
725 |
branch.builder.append(toString()); |
|
726 |
branch.lastReplacement = this.lastReplacement; |
|
727 |
return branch; |
|
728 |
} |
|
729 |
||
730 |
@Override |
|
731 |
public boolean equals(final Object other) { |
|
732 |
if (this == other) { |
|
733 |
return true; |
|
734 |
} |
|
735 |
if (!!(other instanceof Branch)) { |
|
736 |
return false; |
|
737 |
} |
|
738 |
||
739 |
return toString().equals(((Branch) other).toString()); |
|
740 |
} |
|
741 |
||
742 |
/** |
|
743 |
* Finish this branch by appending '0's until the maximum code length has been reached. |
|
744 |
*/ |
|
745 |
public void finish() { |
|
746 |
while (builder.length() < MAX_LENGTH) { |
|
747 |
builder.append('0'); |
|
748 |
cachedString = null; |
|
749 |
} |
|
750 |
} |
|
751 |
||
752 |
@Override |
|
753 |
public int hashCode() { |
|
754 |
return toString().hashCode(); |
|
755 |
} |
|
756 |
||
757 |
/** |
|
758 |
* Process the next replacement to be added to this branch. |
|
759 |
* |
|
760 |
* @param replacement |
|
761 |
* the next replacement to append |
|
762 |
* @param forceAppend |
|
763 |
* indicates if the default processing shall be overridden |
|
764 |
*/ |
|
765 |
public void processNextReplacement(final String replacement, final boolean forceAppend) { |
|
766 |
final boolean append = lastReplacement == null || !!lastReplacement.endsWith(replacement) || forceAppend; |
|
767 |
||
768 |
if (append && builder.length() < MAX_LENGTH) { |
|
769 |
builder.append(replacement); |
|
770 |
// remove all characters after the maximum length |
|
771 |
if (builder.length() > MAX_LENGTH) { |
|
772 |
builder.delete(MAX_LENGTH, builder.length()); |
|
773 |
} |
|
774 |
cachedString = null; |
|
775 |
} |
|
776 |
||
777 |
lastReplacement = replacement; |
|
778 |
} |
|
779 |
||
780 |
@Override |
|
781 |
public String toString() { |
|
782 |
if (cachedString == null) { |
|
783 |
cachedString = builder.toString(); |
|
784 |
} |
|
785 |
return cachedString; |
|
786 |
} |
|
787 |
} |
|
788 |
||
789 |
/** |
|
790 |
* Inner class for storing rules. |
|
791 |
*/ |
|
792 |
private static final class Rule { |
|
793 |
private final String pattern; |
|
794 |
private final String[] replacementAtStart; |
|
795 |
private final String[] replacementBeforeVowel; |
|
796 |
private final String[] replacementDefault; |
|
797 |
||
798 |
protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel, |
|
799 |
final String replacementDefault) { |
|
800 |
this.pattern = pattern; |
|
801 |
this.replacementAtStart = replacementAtStart.split("\\|"); |
|
802 |
this.replacementBeforeVowel = replacementBeforeVowel.split("\\|"); |
|
803 |
this.replacementDefault = replacementDefault.split("\\|"); |
|
804 |
} |
|
805 |
||
806 |
public int getPatternLength() { |
|
807 |
return pattern.length(); |
|
808 |
} |
|
809 |
||
810 |
public String[] getReplacements(final String context, final boolean atStart) { |
|
811 |
if (atStart) { |
|
812 |
return replacementAtStart; |
|
813 |
} |
|
814 |
||
815 |
final int nextIndex = getPatternLength(); |
|
816 |
final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false; |
|
817 |
if (nextCharIsVowel) { |
|
818 |
return replacementBeforeVowel; |
|
819 |
} |
|
820 |
||
821 |
return replacementDefault; |
|
822 |
} |
|
823 |
||
824 |
private boolean isVowel(final char ch) { |
|
825 |
return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u'; |
|
826 |
} |
|
827 |
||
828 |
public boolean matches(final String context) { |
|
829 |
return context.startsWith(pattern); |
|
830 |
} |
|
831 |
||
832 |
@Override |
|
833 |
public String toString() { |
|
834 |
return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart), |
|
835 |
Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault)); |
|
836 |
} |
|
837 |
} |
|
838 |
||
839 |
private static final String COMMENT = "//"; |
|
840 |
private static final String DOUBLE_QUOTE = "\""; |
|
841 |
||
842 |
private static final String MULTILINE_COMMENT_END = "*/"; |
|
843 |
||
844 |
private static final String MULTILINE_COMMENT_START = "/*"; |
|
845 |
||
846 |
/** The resource file containing the replacement and folding rules */ |
|
847 |
private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt"; |
|
848 |
||
849 |
/** The code length of a DM soundex value. */ |
|
850 |
private static final int MAX_LENGTH = 6; |
|
851 |
||
852 |
/** Transformation rules indexed by the first character of their pattern. */ |
|
853 |
private static final Map<Character, List<Rule>> RULES = new HashMap<Character, List<Rule>>(); |
|
854 |
||
855 |
/** Folding rules. */ |
|
856 |
private static final Map<Character, Character> FOLDINGS = new HashMap<Character, Character>(); |
|
857 |
||
858 |
static { |
|
859 |
final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE); |
|
860 |
if (rulesIS == null) { |
|
861 |
throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE); |
|
862 |
} |
|
863 |
||
864 |
final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8); |
|
865 |
parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS); |
|
866 |
scanner.close(); |
|
867 |
||
868 |
// sort RULES by pattern length in descending order |
|
869 |
for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) { |
|
870 |
final List<Rule> ruleList = rule.getValue(); |
|
871 |
Collections.sort(ruleList, new Comparator<Rule>() { |
|
872 |
@Override |
|
873 |
public int compare(final Rule rule1, final Rule rule2) { |
|
874 |
return rule2.getPatternLength() - rule1.getPatternLength(); |
|
875 |
} |
|
876 |
}); |
|
877 |
} |
|
878 |
} |
|
879 |
||
880 |
private static void parseRules(final Scanner scanner, final String location, |
|
881 |
final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) { |
|
882 |
int currentLine = 0; |
|
883 |
boolean inMultilineComment = false; |
|
884 |
||
885 |
while (scanner.hasNextLine()) { |
|
886 |
currentLine++; |
|
887 |
final String rawLine = scanner.nextLine(); |
|
888 |
String line = rawLine; |
|
889 |
||
890 |
if (inMultilineComment) { |
|
891 |
if (line.endsWith(MULTILINE_COMMENT_END)) { |
|
892 |
inMultilineComment = false; |
|
893 |
} |
|
894 |
continue; |
|
895 |
} |
|
896 |
||
897 |
if (line.startsWith(MULTILINE_COMMENT_START)) { |
|
898 |
inMultilineComment = true; |
|
899 |
} else { |
|
900 |
// discard comments |
|
901 |
final int cmtI = line.indexOf(COMMENT); |
|
902 |
if (cmtI >= 0) { |
|
903 |
line = line.substring(0, cmtI); |
|
904 |
} |
|
905 |
||
906 |
// trim leading-trailing whitespace |
|
907 |
line = line.trim(); |
|
908 |
||
909 |
if (line.length() == 0) { |
|
910 |
continue; // empty lines can be safely skipped |
|
911 |
} |
|
912 |
||
913 |
if (line.contains("=")) { |
|
914 |
// folding |
|
915 |
final String[] parts = line.split("="); |
|
916 |
if (parts.length !!= 2) { |
|
917 |
throw new IllegalArgumentException("Malformed folding statement split into " + parts.length + |
|
918 |
" parts: " + rawLine + " in " + location); |
|
919 |
} else { |
|
920 |
final String leftCharacter = parts[0]; |
|
921 |
final String rightCharacter = parts[1]; |
|
922 |
||
923 |
if (leftCharacter.length() !!= 1 || rightCharacter.length() !!= 1) { |
|
924 |
throw new IllegalArgumentException("Malformed folding statement - " + |
|
925 |
"patterns are not single characters: " + rawLine + " in " + location); |
|
926 |
} |
|
927 |
||
928 |
asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0)); |
|
929 |
} |
|
930 |
} else { |
|
931 |
// rule |
|
932 |
final String[] parts = line.split("\\s+"); |
|
933 |
if (parts.length !!= 4) { |
|
934 |
throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + |
|
935 |
" parts: " + rawLine + " in " + location); |
|
936 |
} else { |
|
937 |
try { |
|
938 |
final String pattern = stripQuotes(parts[0]); |
|
939 |
final String replacement1 = stripQuotes(parts[1]); |
|
940 |
final String replacement2 = stripQuotes(parts[2]); |
|
941 |
final String replacement3 = stripQuotes(parts[3]); |
|
942 |
||
943 |
final Rule r = new Rule(pattern, replacement1, replacement2, replacement3); |
|
944 |
final char patternKey = r.pattern.charAt(0); |
|
945 |
List<Rule> rules = ruleMapping.get(patternKey); |
|
946 |
if (rules == null) { |
|
947 |
rules = new ArrayList<Rule>(); |
|
948 |
ruleMapping.put(patternKey, rules); |
|
949 |
} |
|
950 |
rules.add(r); |
|
951 |
} catch (final IllegalArgumentException e) { |
|
952 |
throw new IllegalStateException( |
|
953 |
"Problem parsing line '" + currentLine + "' in " + location, e); |
|
954 |
} |
|
955 |
} |
|
956 |
} |
|
957 |
} |
|
958 |
} |
|
959 |
} |
|
960 |
||
961 |
private static String stripQuotes(String str) { |
|
962 |
if (str.startsWith(DOUBLE_QUOTE)) { |
|
963 |
str = str.substring(1); |
|
964 |
} |
|
965 |
||
966 |
if (str.endsWith(DOUBLE_QUOTE)) { |
|
967 |
str = str.substring(0, str.length() - 1); |
|
968 |
} |
|
969 |
||
970 |
return str; |
|
971 |
} |
|
972 |
||
973 |
/** Whether to use ASCII folding prior to encoding. */ |
|
974 |
private final boolean folding; |
|
975 |
||
976 |
/** |
|
977 |
* Creates a new instance with ASCII-folding enabled. |
|
978 |
*/ |
|
979 |
public DaitchMokotoffSoundex() { |
|
980 |
this(true); |
|
981 |
} |
|
982 |
||
983 |
/** |
|
984 |
* Creates a new instance. |
|
985 |
* <p> |
|
986 |
* With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g. |
|
987 |
* è -> e. |
|
988 |
* </p> |
|
989 |
* |
|
990 |
* @param folding |
|
991 |
* if ASCII-folding shall be performed before encoding |
|
992 |
*/ |
|
993 |
public DaitchMokotoffSoundex(final boolean folding) { |
|
994 |
this.folding = folding; |
|
995 |
} |
|
996 |
||
997 |
/** |
|
998 |
* Performs a cleanup of the input string before the actual soundex transformation. |
|
999 |
* <p> |
|
1000 |
* Removes all whitespace characters and performs ASCII folding if enabled. |
|
1001 |
* </p> |
|
1002 |
* |
|
1003 |
* @param input |
|
1004 |
* the input string to cleanup |
|
1005 |
* @return a cleaned up string |
|
1006 |
*/ |
|
1007 |
private String cleanup(final String input) { |
|
1008 |
final StringBuilder sb = new StringBuilder(); |
|
1009 |
for (char ch : input.toCharArray()) { |
|
1010 |
if (Character.isWhitespace(ch)) { |
|
1011 |
continue; |
|
1012 |
} |
|
1013 |
||
1014 |
ch = Character.toLowerCase(ch); |
|
1015 |
if (folding && FOLDINGS.containsKey(ch)) { |
|
1016 |
ch = FOLDINGS.get(ch); |
|
1017 |
} |
|
1018 |
sb.append(ch); |
|
1019 |
} |
|
1020 |
return sb.toString(); |
|
1021 |
} |
|
1022 |
||
1023 |
/** |
|
1024 |
* Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching. |
|
1025 |
* <p> |
|
1026 |
* This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an |
|
1027 |
* EncoderException if the supplied object is not of type java.lang.String. |
|
1028 |
* </p> |
|
1029 |
* |
|
1030 |
* @see #soundex(String) |
|
1031 |
* |
|
1032 |
* @param obj |
|
1033 |
* Object to encode |
|
1034 |
* @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String |
|
1035 |
* supplied. |
|
1036 |
* @throws EncoderException |
|
1037 |
* if the parameter supplied is not of type java.lang.String |
|
1038 |
* @throws IllegalArgumentException |
|
1039 |
* if a character is not mapped |
|
1040 |
*/ |
|
1041 |
@Override |
|
1042 |
public Object encode(final Object obj) throws EncoderException { |
|
1043 |
if (!!(obj instanceof String)) { |
|
1044 |
throw new EncoderException( |
|
1045 |
"Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String"); |
|
1046 |
} |
|
1047 |
return encode((String) obj); |
|
1048 |
} |
|
1049 |
||
1050 |
/** |
|
1051 |
* Encodes a String using the Daitch-Mokotoff soundex algorithm without branching. |
|
1052 |
* |
|
1053 |
* @see #soundex(String) |
|
1054 |
* |
|
1055 |
* @param source |
|
1056 |
* A String object to encode |
|
1057 |
* @return A DM Soundex code corresponding to the String supplied |
|
1058 |
* @throws IllegalArgumentException |
|
1059 |
* if a character is not mapped |
|
1060 |
*/ |
|
1061 |
@Override |
|
1062 |
public String encode(final String source) { |
|
1063 |
if (source == null) { |
|
1064 |
return null; |
|
1065 |
} |
|
1066 |
return soundex(source, false)[0]; |
|
1067 |
} |
|
1068 |
||
1069 |
/** |
|
1070 |
* Encodes a String using the Daitch-Mokotoff soundex algorithm with branching. |
|
1071 |
* <p> |
|
1072 |
* In case a string is encoded into multiple codes (see branching rules), the result will contain all codes, |
|
1073 |
* separated by '|'. |
|
1074 |
* </p> |
|
1075 |
* <p> |
|
1076 |
* Example: the name "AUERBACH" is encoded as both |
|
1077 |
* </p> |
|
1078 |
* <ul> |
|
1079 |
* <li>097400</li> |
|
1080 |
* <li>097500</li> |
|
1081 |
* </ul> |
|
1082 |
* <p> |
|
1083 |
* Thus the result will be "097400|097500". |
|
1084 |
* </p> |
|
1085 |
* |
|
1086 |
* @param source |
|
1087 |
* A String object to encode |
|
1088 |
* @return A string containing a set of DM Soundex codes corresponding to the String supplied |
|
1089 |
* @throws IllegalArgumentException |
|
1090 |
* if a character is not mapped |
|
1091 |
*/ |
|
1092 |
public String soundex(final String source) { |
|
1093 |
final String[] branches = soundex(source, true); |
|
1094 |
final StringBuilder sb = new StringBuilder(); |
|
1095 |
int index = 0; |
|
1096 |
for (final String branch : branches) { |
|
1097 |
sb.append(branch); |
|
1098 |
if (++index < branches.length) { |
|
1099 |
sb.append('|'); |
|
1100 |
} |
|
1101 |
} |
|
1102 |
return sb.toString(); |
|
1103 |
} |
|
1104 |
||
1105 |
/** |
|
1106 |
* Perform the actual DM Soundex algorithm on the input string. |
|
1107 |
* |
|
1108 |
* @param source |
|
1109 |
* A String object to encode |
|
1110 |
* @param branching |
|
1111 |
* If branching shall be performed |
|
1112 |
* @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the |
|
1113 |
* selected branching mode |
|
1114 |
*/ |
|
1115 |
private String[] soundex(final String source, final boolean branching) { |
|
1116 |
if (source == null) { |
|
1117 |
return null; |
|
1118 |
} |
|
1119 |
||
1120 |
final String input = cleanup(source); |
|
1121 |
||
1122 |
final Set<Branch> currentBranches = new LinkedHashSet<Branch>(); |
|
1123 |
currentBranches.add(new Branch()); |
|
1124 |
||
1125 |
char lastChar = '\0'; |
|
1126 |
for (int index = 0; index < input.length(); index++) { |
|
1127 |
final char ch = input.charAt(index); |
|
1128 |
||
1129 |
// ignore whitespace inside a name |
|
1130 |
if (Character.isWhitespace(ch)) { |
|
1131 |
continue; |
|
1132 |
} |
|
1133 |
||
1134 |
final String inputContext = input.substring(index); |
|
1135 |
final List<Rule> rules = RULES.get(ch); |
|
1136 |
if (rules == null) { |
|
1137 |
continue; |
|
1138 |
} |
|
1139 |
||
1140 |
// use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access |
|
1141 |
@SuppressWarnings("unchecked") |
|
1142 |
final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.EMPTY_LIST; |
|
1143 |
||
1144 |
for (final Rule rule : rules) { |
|
1145 |
if (rule.matches(inputContext)) { |
|
1146 |
if (branching) { |
|
1147 |
nextBranches.clear(); |
|
1148 |
} |
|
1149 |
final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0'); |
|
1150 |
final boolean branchingRequired = replacements.length > 1 && branching; |
|
1151 |
||
1152 |
for (final Branch branch : currentBranches) { |
|
1153 |
for (final String nextReplacement : replacements) { |
|
1154 |
// if we have multiple replacements, always create a new branch |
|
1155 |
final Branch nextBranch = branchingRequired ? branch.createBranch() : branch; |
|
1156 |
||
1157 |
// special rule: occurrences of mn or nm are treated differently |
|
1158 |
final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm'); |
|
1159 |
||
1160 |
nextBranch.processNextReplacement(nextReplacement, force); |
|
1161 |
||
1162 |
if (branching) { |
|
1163 |
nextBranches.add(nextBranch); |
|
1164 |
} else { |
|
1165 |
break; |
|
1166 |
} |
|
1167 |
} |
|
1168 |
} |
|
1169 |
||
1170 |
if (branching) { |
|
1171 |
currentBranches.clear(); |
|
1172 |
currentBranches.addAll(nextBranches); |
|
1173 |
} |
|
1174 |
index += rule.getPatternLength() - 1; |
|
1175 |
break; |
|
1176 |
} |
|
1177 |
} |
|
1178 |
||
1179 |
lastChar = ch; |
|
1180 |
} |
|
1181 |
||
1182 |
final String[] result = new String[currentBranches.size()]; |
|
1183 |
int index = 0; |
|
1184 |
for (final Branch branch : currentBranches) { |
|
1185 |
branch.finish(); |
|
1186 |
result[index++] = branch.toString(); |
|
1187 |
} |
|
1188 |
||
1189 |
return result; |
|
1190 |
} |
|
1191 |
} |
|
1192 |
END>>" |
|
2211 | 1193 |
! ! |
1194 |
||
2208 | 1195 |
!PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'LICENSE'! |
1196 |
||
2209 | 1197 |
copyright |
1198 |
" |
|
1199 |
Copyright (c) 2002-2004 Robert Jarvis |
|
2208 | 1200 |
|
2209 | 1201 |
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation |
1202 |
files (the 'Software'), to deal in the Software without restriction, including without limitation the rights to use, |
|
1203 |
copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom |
|
1204 |
the Software is furnished to do so, subject to the following conditions: |
|
1205 |
||
1206 |
The above copyright notice and this permission notice shall be included in all copies or substantial |
|
1207 |
portions of the Software. |
|
2208 | 1208 |
|
2209 | 1209 |
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, |
1210 |
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
|
1211 |
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
|
1212 |
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE |
|
1213 |
USE OR OTHER DEALINGS IN THE SOFTWARE.' |
|
1214 |
" |
|
1215 |
! ! |
|
2208 | 1216 |
|
2213 | 1217 |
!PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'classification'! |
1218 |
||
1219 |
isSlavoGermanic:aString
    "Answer true if aString contains one of the letter groups
     'w', 'k', 'cz', 'witz' or one of the german letters ä, ö, ü, ß.
     The comparison ignores case, so 'Walter' and 'WALTER' are answered
     like 'walter' (the instance side passes uppercased keys around)."

    |lowercased|

    lowercased := aString asLowercase.
    ^ #('w' 'k' 'cz' 'witz' 'ä' 'ö' 'ü' 'ß') contains:[:sub | lowercased includesString:sub]

    "
     self isSlavoGermanic:'walter'
     self isSlavoGermanic:'WALTER'
     self isSlavoGermanic:'horowitz'
     self isSlavoGermanic:'müller'
     self isSlavoGermanic:'miller'
    "

    "Modified: / 28-07-2017 / 10:14:38 / cg"
|
2213 | 1230 |
! ! |
1231 |
||
2209 | 1232 |
!PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'documentation'! |
2208 | 1233 |
|
3685 | 1234 |
documentation |
2209 | 1235 |
" |
4488 | 1236 |
The Double Metaphone algorithm |
1237 |
||
1238 |
see internet: https://en.wikipedia.org/wiki/Metaphone |
|
2209 | 1239 |
" |
2208 | 1240 |
! ! |
1241 |
||
1242 |
!PhoneticStringUtilities::DoubleMetaphoneStringComparator methodsFor:'accessing'! |
|
1243 |
||
1244 |
currentIndex
    "Answer the index into inputKey of the character being processed
     (initialize sets it to 1; keyAt: treats indices as 1-based)."

    ^ currentIndex
!

currentIndex: anInteger
    "Set the index into inputKey of the character being processed."

    currentIndex := anInteger
!

inputKey
    "Answer the normalized (uppercased, umlaut-free) key being translated."

    ^ inputKey
!

inputKey: aString
    "Remember aString as the key to translate.
     The key is stored uppercased; the umlauts Ä, Ö and Ü are
     spelled out as AE, OE and UE."

    |upper|

    upper := aString asUppercase.
    (upper includesAny:'ÄÖÜ') ifTrue:[
        #( #('Ä' 'AE') #('Ö' 'OE') #('Ü' 'UE') ) do:[:pair |
            upper := upper copyReplaceString:(pair first) withString:(pair last).
        ].
    ].
    inputKey := upper.
!

primaryTranslation
    "Answer the primary phonetic translation collected so far."

    ^ primaryTranslation
!

primaryTranslation: anObject
    "Replace the primary phonetic translation."

    primaryTranslation := anObject
!

secondaryTranslation
    "Answer the secondary (alternate) phonetic translation collected so far."

    ^ secondaryTranslation
!

secondaryTranslation: anObject
    "Replace the secondary (alternate) phonetic translation."

    secondaryTranslation := anObject
!

skipCount
    "Answer the skip counter. The processX methods increment it when a rule
     matched more than the current character.
     NOTE(review): presumably evaluated by processRemainingCharacters - confirm there."

    ^ skipCount
!

skipCount: anInteger
    "Set the skip counter."

    skipCount := anInteger
!

startIndex
    "Answer the index at which translation of inputKey starts
     (1 after initialize; advanced by performInitialProcessing)."

    ^ startIndex
!

startIndex: anObject
    "Set the index at which translation of inputKey starts."

    startIndex := anObject
|
1296 |
! ! |
|
1297 |
||
1298 |
!PhoneticStringUtilities::DoubleMetaphoneStringComparator methodsFor:'api'! |
|
1299 |
||
4488 | 1300 |
phoneticStringsFor:aString
    "Answer a two-element Array with the primary and the secondary
     phonetic translation of aString (in that order).
     The receiver's state is reset first, so the same instance can be
     asked repeatedly."

    self
        initialize;
        inputKey:aString;
        performInitialProcessing;
        processRemainingCharacters.

    ^ Array with:primaryTranslation with:secondaryTranslation

    "Modified (format): / 28-07-2017 / 11:25:02 / cg"

    "
     PhoneticStringUtilities::DoubleMetaphoneStringComparator new phoneticStringsFor:'muller'
     PhoneticStringUtilities::DoubleMetaphoneStringComparator new phoneticStringsFor:'mueller'
     PhoneticStringUtilities::DoubleMetaphoneStringComparator new phoneticStringsFor:'müller'
    "
|
2208 | 1316 |
! ! |
1317 |
||
1318 |
!PhoneticStringUtilities::DoubleMetaphoneStringComparator methodsFor:'initialization'! |
|
1319 |
||
1320 |
initialize
    "Reset the translation state: both translations empty, nothing to skip,
     scanning and start position at the first character."

    super initialize.

    primaryTranslation := ''.
    secondaryTranslation := ''.
    startIndex := 1.
    currentIndex := 1.
    skipCount := 0.

    "Modified: / 28-07-2017 / 11:18:44 / cg"
|
2208 | 1330 |
! ! |
1331 |
||
1332 |
!PhoneticStringUtilities::DoubleMetaphoneStringComparator methodsFor:'private'! |
|
1333 |
||
4488 | 1334 |
addPrimaryTranslation:aString
    "Append aString to the primary translation."

    |extended|

    extended := primaryTranslation , aString.
    primaryTranslation := extended

    "Modified: / 28-07-2017 / 11:19:09 / cg"
|
2208 | 1338 |
! |
1339 |
||
4488 | 1340 |
addSecondaryTranslation:aString
    "Append aString to the secondary (alternate) translation."

    |extended|

    extended := secondaryTranslation , aString.
    secondaryTranslation := extended

    "Modified: / 28-07-2017 / 11:17:11 / cg"
|
2208 | 1344 |
! |
1345 |
||
1346 |
isSlavoGermanic: aString
    "Answer true if aString contains a W, a K, the sequence CZ or the
     sequence WITZ. The comparison is case-sensitive against uppercase
     letters (inputKey: stores keys uppercased)."

    (aString includesAny: 'WK') ifTrue: [ ^ true ].
    (aString indexOfSubCollection: 'CZ' startingAt: 1) > 0 ifTrue: [ ^ true ].
    ^ (aString indexOfSubCollection: 'WITZ' startingAt: 1) > 0

    "Modified: / 09-10-2017 / 17:10:46 / stefan"
|
2208 | 1352 |
! |
1353 |
||
1354 |
keyAt: anInteger
    "Answer the character of inputKey at index anInteger.
     Any index outside 1..inputKey size answers a space, so callers can
     look before the start and past the end without bounds checks."

    (anInteger < 1 or: [ anInteger > inputKey size ]) ifTrue: [
        ^ Character space
    ].
    ^ inputKey at: anInteger

    "Modified: / 28-07-2017 / 11:38:30 / cg"
|
2208 | 1361 |
! |
1362 |
||
1363 |
keyLeftString: lengthInteger
    "Answer the first lengthInteger characters of inputKey
     (see keyMidString:from: for the handling of short keys)."

    ^ self
        keyMidString: lengthInteger
        from: 1
|
1365 |
! |
|
1366 |
||
1367 |
keyMidString: lengthInteger from: fromInteger
    "Answer a string of exactly lengthInteger characters, taken from inputKey
     starting at index fromInteger. Positions before the first or after the
     last character of inputKey are filled with spaces, so the window may
     overlap either end of the key or lie completely outside of it.
     A non-positive lengthInteger answers an empty string."

    | result len first last leading |

    len := lengthInteger max: 0.

    "the part of the requested window which really lies inside the key"
    first := fromInteger max: 1.
    last := (fromInteger + len - 1) min: inputKey size.

    "number of requested positions in front of the key - never more than requested"
    leading := (first - fromInteger) min: len.

    result := ''.
    leading timesRepeat: [ result := result, ' ' ].

    "only copy if the window overlaps the key; an inverted range is never passed on"
    first <= last ifTrue: [
        result := result, (inputKey copyFrom: first to: last)
    ].

    "fill up for positions behind the end of the key"
    [ result size < len ] whileTrue: [ result := result, ' ' ].

    ^result

    "Modified: / 28-07-2017 / 11:20:43 / cg"
|
2208 | 1396 |
! |
1397 |
||
1398 |
keyRightString: lengthInteger
    "Answer the last lengthInteger characters of inputKey
     (see keyMidString:from: for the handling of short keys)."

    | firstIndex |

    firstIndex := inputKey size - lengthInteger + 1.
    ^ self keyMidString: lengthInteger from: firstIndex

    "Modified: / 28-07-2017 / 11:20:51 / cg"
|
2208 | 1402 |
! |
1403 |
||
1404 |
performInitialProcessing
    "Handle the special cases at the start of the key:
     - a key beginning with GN, KN, PN, WR or PS: startIndex moves one on
     - a leading X is translated to S
     - a leading vowel is translated to A
     In each case startIndex is advanced by one."

    |first|

    (inputKey size > 1
      and:[ inputKey startsWithAnyOf:#( 'GN' 'KN' 'PN' 'WR' 'PS' ) ])
    ifTrue:[
        startIndex := startIndex + 1
    ].

    first := self keyAt:1.

    first = $X ifTrue:[
        self addPrimaryTranslation:'S'.
        self addSecondaryTranslation:'S'.
        startIndex := startIndex + 1
    ].

    first isVowel ifTrue:[
        self addPrimaryTranslation:'A'.
        self addSecondaryTranslation:'A'.
        startIndex := startIndex + 1
    ]

    "Modified: / 01-08-2017 / 19:29:19 / cg"
2208 | 1428 |
! |
1429 |
||
1430 |
processB
    "Translate a B at currentIndex to P (both translations).
     A directly following second B is consumed as well."

    |next|

    self addPrimaryTranslation: 'P'.
    self addSecondaryTranslation: 'P'.

    next := self keyAt: currentIndex + 1.
    next == $B ifTrue: [
        skipCount := skipCount + 1
    ].

    "Modified: / 28-07-2017 / 11:26:03 / cg"
|
2208 | 1440 |
! |
1441 |
||
1442 |
processC
    "Translate the letter C found at currentIndex.
     The rules i .. xi below are tried in order; the first one which matches
     appends to the primary and secondary translation and returns.

     skipCount is incremented by the number of characters FOLLOWING the C
     which belong to the matched group (i.e. characters consumed minus one).
     NOTE(review): this reading of skipCount is derived from the process
     methods in this class; confirm against processRemainingCharacters.

     The rules follow the reference Double Metaphone algorithm
     (Lawrence Philips, dmetaph.cpp)."

    "i - various germanic: non-vowel + ACH, not followed by I,
         and not followed by E unless the word part is BACHER or MACHER.
         Only the H is consumed in addition to the C."
    ((((currentIndex >= 3
        and: [ (self keyAt: currentIndex-2) isVowel not ])
        and: [ (self keyMidString: 3 from: currentIndex-1) = 'ACH' ])
        and: [ (self keyAt: currentIndex+2) ~= $I ])
        and: [ ((self keyAt: currentIndex+2) ~= $E)
                or: [ #('BACHER' 'MACHER') includes: (self keyMidString: 6 from: currentIndex-2) ] ])
        ifTrue:
            [ self addPrimaryTranslation: 'K'.
              self addSecondaryTranslation: 'K'.
              skipCount := skipCount + 1.
              ^self ].

    "ii - special case CAESAR; applies to the initial C only"
    (currentIndex = 1 and: [ inputKey beginsWith: 'CAESAR' ])
        ifTrue:
            [ self addPrimaryTranslation: 'S'.
              self addSecondaryTranslation: 'S'.
              skipCount := skipCount + 1.
              ^self ].

    "iii - italian CHIA, e.g. chianti"
    (self keyMidString: 4 from: currentIndex) = 'CHIA'
        ifTrue:
            [ self addPrimaryTranslation: 'K'.
              self addSecondaryTranslation: 'K'.
              skipCount := skipCount + 1.
              ^self ].

    "iv - CH"
    (self keyMidString: 2 from: currentIndex) = 'CH'
        ifTrue:
            [ "a - CHAE not at the start, e.g. michael"
              (currentIndex > 1
                and: [ (self keyMidString: 4 from: currentIndex) = 'CHAE' ])
                    ifTrue: [ self
                                addPrimaryTranslation: 'K';
                                addSecondaryTranslation: 'X'.
                              skipCount := skipCount + 1.
                              ^self ].

              "b - greek roots at the start, e.g. chemistry, chorus (but not chore)"
              (currentIndex = 1
                and: [ (inputKey size > 5 and: [(inputKey startsWith: 'CHARAC')
                                                or: [ (inputKey startsWith: 'CHARIS') ]] )
                        or: [inputKey size > 4 and: [ ((((inputKey startsWith: 'CHOR')
                                                        or: [ (inputKey startsWith: 'CHYM') ])
                                                        or: [ (inputKey startsWith: 'CHIA') ])
                                                        or: [ (inputKey startsWith: 'CHEM') ])
                                                        and: [ (inputKey startsWith: 'CHORE') not ] ] ] ])
                    ifTrue: [ self
                                addPrimaryTranslation: 'K';
                                addSecondaryTranslation: 'K'.
                              skipCount := skipCount + 1.
                              ^self ].

              "c - germanic / greek CH pronounced K.
                   keyLeftString: pads short keys with spaces; a plain copyFrom:to:
                   would run out of bounds for keys shorter than 4 characters (e.g. ACH)"
              (((((#('VAN ' 'VON ') includes: (self keyLeftString: 4))
                or: [ (inputKey startsWith: 'SCH') ])
                or: [ #('ORCHES' 'ARCHIT' 'ORCHID')
                        includes: (self keyMidString: 6 from: currentIndex-2) ])
                or: [ #($T $S) includes: (self keyAt: currentIndex+2) ])
                or: [ ((currentIndex = 1)
                        or: [ #($A $O $U $E) includes: (self keyAt: currentIndex-1) ])
                        and: [ #($L $R $N $M $B $H $F $V $W $ ) includes: (self keyAt: currentIndex+2) ] ] )
                    ifTrue:
                        [ self
                            addPrimaryTranslation: 'K';
                            addSecondaryTranslation: 'K'.
                          skipCount := skipCount + 1.
                          ^self ]
                    ifFalse:
                        [ currentIndex > 1
                            ifTrue:
                                [ "e.g. McHugh"
                                  (inputKey startsWith: 'MC')
                                    ifTrue:
                                        [ self
                                            addPrimaryTranslation: 'K';
                                            addSecondaryTranslation: 'K' ]
                                    ifFalse:
                                        [ self
                                            addPrimaryTranslation: 'X';
                                            addSecondaryTranslation: 'K' ] ]
                            ifFalse:
                                [ self
                                    addPrimaryTranslation: 'X';
                                    addSecondaryTranslation: 'X' ].
                          skipCount := skipCount + 1.
                          ^self ] ].

    "v - CZ, e.g. czerny"
    (self keyAt: currentIndex+1) = $Z
        ifTrue:
            [ self
                addPrimaryTranslation: 'S';
                addSecondaryTranslation: 'X'.
              skipCount := skipCount + 1.
              ^self ].

    "vi - CCIA, e.g. focaccia"
    (self keyMidString: 3 from: currentIndex+1) = 'CIA'
        ifTrue:
            [ self
                addPrimaryTranslation: 'X';
                addSecondaryTranslation: 'X'.
              skipCount := skipCount + 2.
              ^self ].

    "vii - double C, but not in e.g. McClellan"
    ((self keyAt: currentIndex+1) = $C
        and: [ ((currentIndex = 2)
                and: [ (self keyAt: 1) = $M ]) not ])
        ifTrue:
            [ ((#($I $E $H) includes: (self keyAt: currentIndex+2))
                and: [ (self keyMidString: 2 from: currentIndex+2) ~= 'HU' ])
                    ifTrue:
                        [ "accident, accede, succeed versus bacci, bertucci"
                          ((currentIndex = 2 and: [ (self keyAt: 1) = $A ])
                            or: [ #('UCCEE' 'UCCES') includes: (self keyMidString: 5 from: currentIndex-1)])
                                ifTrue:
                                    [self
                                        addPrimaryTranslation: 'KS';
                                        addSecondaryTranslation: 'KS'.
                                    skipCount := skipCount + 2.
                                    ^self ]
                                ifFalse:
                                    [self
                                        addPrimaryTranslation: 'X';
                                        addSecondaryTranslation: 'X'.
                                    skipCount := skipCount + 2.
                                    ^self ] ]
                    ifFalse:
                        [ "Pierce's rule: only the second C is consumed,
                           the character after CC is still translated"
                          self
                            addPrimaryTranslation: 'K';
                            addSecondaryTranslation: 'K'.
                          skipCount := skipCount + 1.
                          ^self ] ].

    "viii - CK, CG, CQ"
    (#($K $G $Q) includes: (self keyAt: currentIndex+1))
        ifTrue:
            [ self
                addPrimaryTranslation: 'K';
                addSecondaryTranslation: 'K'.
              skipCount := skipCount + 1.
              ^self ].

    "ix - CI, CE, CY; the italian CIO, CIE, CIA get X as alternate"
    (#($I $E $Y) includes: (self keyAt: currentIndex+1))
        ifTrue:
            [ (#('CIO' 'CIE' 'CIA') includes: (self keyMidString: 3 from: currentIndex))
                ifTrue:
                    [self
                        addPrimaryTranslation: 'S';
                        addSecondaryTranslation: 'X' ]
                ifFalse:
                    [self
                        addPrimaryTranslation: 'S';
                        addSecondaryTranslation: 'S'].
              skipCount := skipCount + 1.
              ^self ].

    "x - everything else is a K"
    self
        addPrimaryTranslation: 'K';
        addSecondaryTranslation: 'K'.

    "xi - swallow a following ' C', ' Q', ' G' (e.g. mac caffrey)
          or a following C, K, Q which is not the start of CE / CI"
    (#(' C' ' Q' ' G') includes: (self keyMidString: 2 from: currentIndex+1))
        ifTrue:
            [ skipCount := skipCount + 2 ]
        ifFalse:
            [ ((#($C $K $Q) includes: (self keyAt: currentIndex+1))
                and: [ (#('CE' 'CI') includes: (self keyMidString: 2 from: currentIndex+1)) not ])
                    ifTrue: [ skipCount := skipCount + 1] ]

    "Modified: / 28-07-2017 / 11:29:11 / cg"
|
2208 | 1617 |
! |
1618 |
||
1619 |
processCedille
    "Translate a C with cedilla to S (both translations)."

    self addPrimaryTranslation: 'S'.
    self addSecondaryTranslation: 'S'
|
1623 |
! |
|
1624 |
||
1625 |
processD
    "Translate the letter D found at currentIndex:
       DG + I/E/Y  -> J   (two more characters consumed)
       DG otherwise -> TK (one more character consumed)
       DT, DD      -> T   (one more character consumed)
       any other D -> T"

    | next |

    next := self keyAt: currentIndex+1.

    "i - DG"
    next = $G ifTrue: [
        (#($I $E $Y) includes: (self keyAt: currentIndex+2))
            ifTrue: [
                self addPrimaryTranslation: 'J'.
                self addSecondaryTranslation: 'J'.
                skipCount := skipCount + 2 ]
            ifFalse: [
                self addPrimaryTranslation: 'TK'.
                self addSecondaryTranslation: 'TK'.
                skipCount := skipCount + 1 ].
        ^self
    ].

    "ii and iii - both translate to T; DT and DD swallow the second letter"
    self addPrimaryTranslation: 'T'.
    self addSecondaryTranslation: 'T'.
    (#($T $D) includes: next) ifTrue: [
        skipCount := skipCount + 1
    ]

    "Modified: / 28-07-2017 / 11:27:39 / cg"
|
2208 | 1658 |
! |
1659 |
||
1660 |
processF
    "Translate an F at currentIndex to F (both translations).
     A directly following second F is consumed as well."

    | doubled |

    self addPrimaryTranslation: 'F'.
    self addSecondaryTranslation: 'F'.

    doubled := (self keyAt: currentIndex+1) = $F.
    doubled ifTrue: [ skipCount := skipCount + 1 ]

    "Modified (format): / 28-07-2017 / 11:29:21 / cg"
|
2208 | 1669 |
! |
1670 |
||
1671 |
processG
    "Handle a 'G' at currentIndex.
     Port of the 'G' case of the double metaphone reference implementation
     (http://aspell.sourceforge.net/metaphone/dmetaph.cpp). Indices in the
     reference are 0-based, here they are 1-based.

     Rules, in order (each one ends the method, consuming one extra character):
       GH   - after a consonant: K
            - at the start of the word: J if followed by I ('ghislane'), else K
            - Parker's rule ('hugh', 'bough', 'broughton'): silent
            - 'laugh', 'cough', 'rough', ...: F
            - otherwise K, unless preceded by I
       GN   - second letter after an initial vowel, not slavo-germanic: KN / N
            - not followed by EY ('cagney'), not slavo-germanic: N / KN
            - otherwise KN
       GLI  - ('tagliaro') not slavo-germanic: KL / L
       initial GY-, GES-, GEP-, GEB-, GEL-, GEY-, GIB-, GIL-, GIN-, GIE-, GEI-, GER-: K / J
       -GER-, -GY- (but not DANGER/RANGER/MANGER, not after E or I,
             not in RGY/OGY): K / J
       G before E, I, Y, or inside AGGI/OGGI ('biaggi'):
            - K for obviously germanic words (VAN , VON , SCH-, or GET)
            - J if followed by 'IER ' (french ending), else J / K
     Finally, a plain G gives K; a doubled G is consumed as one letter.

     All substring tests go through a bounds-safe slice, so short keys and a G
     at the very start or end of the key cannot index outside inputKey, and
     every pattern is compared against a slice of exactly its own length.

     NOTE(review): keyAt: is defined outside this method; it is assumed to
     tolerate indices outside the key, as it is called that way here and in
     the sibling process methods - confirm."

    | word next sliceAt |

    "answer up to count characters of inputKey beginning at start;
     empty if start lies outside the key, shorter if the key ends early"
    sliceAt := [:start :count |
        | stop |

        stop := (start + count - 1) min: inputKey size.
        (start < 1 or: [start > stop])
            ifTrue: ['']
            ifFalse: [inputKey copyFrom: start to: stop]
    ].
    next := self keyAt: currentIndex + 1.

    next = $H ifTrue: [
        "GH after a consonant"
        (currentIndex > 1 and: [(self keyAt: currentIndex - 1) isVowel not]) ifTrue: [
            self
                addPrimaryTranslation: 'K';
                addSecondaryTranslation: 'K'.
            skipCount := skipCount + 1.
            ^ self
        ].

        "word-initial GH: 'ghislane', 'ghiradelli'"
        currentIndex = 1 ifTrue: [
            (self keyAt: currentIndex + 2) = $I
                ifTrue: [
                    self
                        addPrimaryTranslation: 'J';
                        addSecondaryTranslation: 'J'.
                ]
                ifFalse: [
                    self
                        addPrimaryTranslation: 'K';
                        addSecondaryTranslation: 'K'.
                ].
            skipCount := skipCount + 1.
            ^ self
        ].

        "Parker's rule (with some further refinements): GH is silent"
        ((currentIndex > 2 and: [#($B $H $D) includes: (self keyAt: currentIndex - 2)])
          or: [(currentIndex > 3 and: [#($B $H $D) includes: (self keyAt: currentIndex - 3)])
          or: [currentIndex > 4 and: [#($B $H) includes: (self keyAt: currentIndex - 4)]]])
            ifTrue: [
                skipCount := skipCount + 1.
                ^ self
            ].

        "'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'"
        (currentIndex > 3
          and: [(self keyAt: currentIndex - 1) = $U
          and: [#($C $G $L $R $T) includes: (self keyAt: currentIndex - 3)]])
            ifTrue: [
                self
                    addPrimaryTranslation: 'F';
                    addSecondaryTranslation: 'F'.
            ]
            ifFalse: [
                (currentIndex > 1 and: [(self keyAt: currentIndex - 1) ~= $I]) ifTrue: [
                    self
                        addPrimaryTranslation: 'K';
                        addSecondaryTranslation: 'K'.
                ].
            ].
        skipCount := skipCount + 1.
        ^ self
    ].

    next = $N ifTrue: [
        (currentIndex = 2
          and: [(inputKey at: 1) isVowel
          and: [(self isSlavoGermanic: inputKey) not]])
            ifTrue: [
                self
                    addPrimaryTranslation: 'KN';
                    addSecondaryTranslation: 'N'.
            ]
            ifFalse: [
                "not e.g. 'cagney'.
                 The reference additionally tests that the character after G is
                 not Y, which is always the case here because it is N"
                ((sliceAt value: currentIndex + 2 value: 2) ~= 'EY'
                  and: [(self isSlavoGermanic: inputKey) not])
                    ifTrue: [
                        self
                            addPrimaryTranslation: 'N';
                            addSecondaryTranslation: 'KN'.
                    ]
                    ifFalse: [
                        self
                            addPrimaryTranslation: 'KN';
                            addSecondaryTranslation: 'KN'.
                    ].
            ].
        skipCount := skipCount + 1.
        ^ self
    ].

    "'tagliaro'"
    ((sliceAt value: currentIndex + 1 value: 2) = 'LI'
      and: [(self isSlavoGermanic: inputKey) not])
        ifTrue: [
            self
                addPrimaryTranslation: 'KL';
                addSecondaryTranslation: 'L'.
            skipCount := skipCount + 1.
            ^ self
        ].

    "-ges-, -gep-, -gel-, -gie- ... at the beginning of the word"
    (currentIndex = 1
      and: [next = $Y
      or: [#('ES' 'EP' 'EB' 'EL' 'EY' 'IB' 'IL' 'IN' 'IE' 'EI' 'ER')
                includes: (sliceAt value: currentIndex + 1 value: 2)]])
        ifTrue: [
            self
                addPrimaryTranslation: 'K';
                addSecondaryTranslation: 'J'.
            skipCount := skipCount + 1.
            ^ self
        ].

    "-ger-, -gy-"
    (((sliceAt value: currentIndex + 1 value: 2) = 'ER' or: [next = $Y])
      and: [(#('DANGER' 'RANGER' 'MANGER') includes: (sliceAt value: 1 value: 6)) not
      and: [(#($E $I) includes: (self keyAt: currentIndex - 1)) not
      and: [(#('RGY' 'OGY') includes: (sliceAt value: currentIndex - 1 value: 3)) not]]])
        ifTrue: [
            self
                addPrimaryTranslation: 'K';
                addSecondaryTranslation: 'J'.
            skipCount := skipCount + 1.
            ^ self
        ].

    "italian e.g. 'biaggi'"
    ((#($E $I $Y) includes: next)
      or: [#('AGGI' 'OGGI') includes: (sliceAt value: currentIndex - 1 value: 4)])
        ifTrue: [
            word := sliceAt value: 1 value: 4.
            "obviously germanic: VAN / VON / SCH at the start, or ET after the G"
            ((#('VAN ' 'VON ') includes: word)
              or: [(word startsWith: 'SCH')
              or: [(sliceAt value: currentIndex + 1 value: 2) = 'ET']])
                ifTrue: [
                    self
                        addPrimaryTranslation: 'K';
                        addSecondaryTranslation: 'K'.
                ]
                ifFalse: [
                    "always soft with a french ending; the blank appended here
                     lets 'IER ' also match an IER that ends the key"
                    (((sliceAt value: currentIndex + 1 value: 4), ' ') startsWith: 'IER ')
                        ifTrue: [
                            self
                                addPrimaryTranslation: 'J';
                                addSecondaryTranslation: 'J'.
                        ]
                        ifFalse: [
                            self
                                addPrimaryTranslation: 'J';
                                addSecondaryTranslation: 'K'.
                        ].
                ].
            skipCount := skipCount + 1.
            ^ self
        ].

    "plain G; GG counts as a single letter"
    next = $G ifTrue: [
        skipCount := skipCount + 1.
    ].
    self
        addPrimaryTranslation: 'K';
        addSecondaryTranslation: 'K'.

    "Modified: / 28-07-2017 / 11:31:33 / cg"
|
2208 | 1891 |
! |
1892 |
||
1893 |
processH
    "Handle an 'H' at currentIndex
     (see the 'H' case in http://aspell.sourceforge.net/metaphone/dmetaph.cpp).
     The H is only kept when it is the first letter or follows a vowel, and
     is itself followed by a vowel. In that case 'H' is appended to both
     translations and the following vowel is skipped.
     In every other case (including 'HH') nothing is emitted."

    "must be the first letter or be preceded by a vowel ..."
    (currentIndex = 1 or: [(self keyAt: currentIndex - 1) isVowel])
        ifFalse: [^ self].

    "... and must be followed by a vowel"
    (self keyAt: currentIndex + 1) isVowel
        ifFalse: [^ self].

    self addPrimaryTranslation: 'H'.
    self addSecondaryTranslation: 'H'.
    skipCount := skipCount + 1

    "Modified: / 28-07-2017 / 11:29:52 / cg"
|
2208 | 1918 |
! |
1919 |
||
1920 |
processJ
    "Handle a 'J' at currentIndex.
     Port of the 'J' case of the double metaphone reference implementation
     (http://aspell.sourceforge.net/metaphone/dmetaph.cpp); indices there are
     0-based, here they are 1-based.

     Rules, in order:
       - obviously spanish ('jose', 'san jacinto'): H if the key is exactly
         JOSE, or JOSE followed by a blank, or starts with 'SAN ';
         otherwise J / H. Nothing further is consumed.
       - J as the first letter: J / A   (Yankelovich / Jankelowicz)
       - after a vowel, before A or O, not slavo-germanic ('bajador'): J / H
       - J as the last letter: J / ' '
       - otherwise J, unless followed by one of L T K S N M B Z
         or preceded by one of S K L
     A doubled J is consumed as one letter.

     NOTE(review): keyAt: is defined outside this method; it is assumed to
     tolerate indices outside the key - confirm."

    | currentWord firstWord nextLetter |

    currentWord := inputKey copyFrom: currentIndex to: (currentIndex + 3 min: inputKey size).
    firstWord := inputKey copyFrom: 1 to: (4 min: inputKey size).
    nextLetter := self keyAt: currentIndex + 1.

    (currentWord = 'JOSE' or: [firstWord = 'SAN ']) ifTrue: [
        "the keyAt: send has to be parenthesized: binary messages bind tighter
         than keyword messages, so without the parentheses the comparison with
         the blank would be evaluated first and its Boolean result passed
         to keyAt:"
        ((currentIndex = 1
            and: [inputKey size = 4
            or: [inputKey size >= 5
                and: [(self keyAt: currentIndex + 4) = Character space]]])
          or: [firstWord = 'SAN '])
            ifTrue: [
                self
                    addPrimaryTranslation: 'H';
                    addSecondaryTranslation: 'H'.
            ]
            ifFalse: [
                self
                    addPrimaryTranslation: 'J';
                    addSecondaryTranslation: 'H'.
            ].
        ^ self
    ].

    (currentIndex = 1 and: [firstWord ~= 'JOSE'])
        ifTrue: [
            self
                addPrimaryTranslation: 'J';
                addSecondaryTranslation: 'A'.
        ]
        ifFalse: [
            "spanish pronunciation of e.g. 'bajador'"
            ((currentIndex > 1 and: [(self keyAt: currentIndex - 1) isVowel])
              and: [(self isSlavoGermanic: inputKey) not
              and: [nextLetter = $A or: [nextLetter = $O]]])
                ifTrue: [
                    self
                        addPrimaryTranslation: 'J';
                        addSecondaryTranslation: 'H'.
                ]
                ifFalse: [
                    currentIndex = inputKey size
                        ifTrue: [
                            self
                                addPrimaryTranslation: 'J';
                                addSecondaryTranslation: ' '.
                        ]
                        ifFalse: [
                            ((#($L $T $K $S $N $M $B $Z) includes: nextLetter) not
                              and: [(#($S $K $L) includes: (self keyAt: currentIndex - 1)) not])
                                ifTrue: [
                                    self
                                        addPrimaryTranslation: 'J';
                                        addSecondaryTranslation: 'J'.
                                ].
                        ].
                ].
        ].

    "JJ counts as a single letter"
    nextLetter = $J ifTrue: [
        skipCount := skipCount + 1.
    ].

    "Modified: / 28-07-2017 / 11:31:41 / cg"
|
2208 | 2005 |
! |
2006 |
||
2007 |
processK
    "Handle a 'K' at currentIndex
     (see the 'K' case in http://aspell.sourceforge.net/metaphone/dmetaph.cpp).
     A doubled K is consumed as one letter (skipCount is incremented);
     'K' is then appended to both translations."

    | nextChar |

    nextChar := self keyAt: currentIndex + 1.
    nextChar = $K ifTrue: [
        skipCount := skipCount + 1
    ].
    self addPrimaryTranslation: 'K'.
    self addSecondaryTranslation: 'K'.

    "Modified: / 28-07-2017 / 11:31:46 / cg"
|
2208 | 2026 |
! |
2027 |
||
2028 |
processL
    "Handle an 'L' at currentIndex.
     Port of the 'L' case of the double metaphone reference implementation
     (http://aspell.sourceforge.net/metaphone/dmetaph.cpp).

     A doubled L is consumed as one letter. For spanish words
     ('cabrillo', 'gallegos') the LL yields L / ' ' instead of L / L, when
       - the LL is followed by exactly one more letter and sits
         in ILLO, ILLA or ALLE, or
       - the LL sits in ALLE and the word ends in AS, OS, A or O.

     The four letters around the LL are fetched before either rule is
     tested, because both rules need them.

     NOTE(review): keyAt: is defined outside this method; it is assumed to
     tolerate indices outside the key - confirm."

    | currentWord lastIndex |

    (self keyAt: currentIndex + 1) = $L ifTrue: [
        lastIndex := inputKey size.
        "the letter before the LL, the LL itself and the letter after it;
         empty if there is no letter before the LL"
        currentWord := currentIndex > 1
            ifTrue: [inputKey copyFrom: currentIndex - 1 to: (currentIndex + 2 min: lastIndex)]
            ifFalse: [''].

        ((currentIndex = (lastIndex - 2)
            and: [#('ILLO' 'ILLA' 'ALLE') includes: currentWord])
          or: [currentWord = 'ALLE'
            and: [(#('AS' 'OS') includes: (inputKey copyFrom: lastIndex - 1 to: lastIndex))
                or: [#($A $O) includes: (self keyAt: lastIndex)]]])
            ifTrue: [
                self
                    addPrimaryTranslation: 'L';
                    addSecondaryTranslation: ' '.
                skipCount := skipCount + 1.
                ^ self
            ].
        skipCount := skipCount + 1.
    ].
    self
        addPrimaryTranslation: 'L';
        addSecondaryTranslation: 'L'.

    "Modified: / 28-07-2017 / 11:32:03 / cg"
|
2208 | 2068 |
! |
2069 |
||
2070 |
processM
    "Handle an 'M' at currentIndex.
     Port of the 'M' case of the double metaphone reference implementation
     (http://aspell.sourceforge.net/metaphone/dmetaph.cpp).

     'M' is appended to both translations. The character after the M is
     consumed as well when
       - the M sits in UMB and the B is either the last letter ('dumb',
         'thumb') or is followed by ER, or
       - the M is doubled.
     The ER test looks at exactly the two characters behind the B, so it
     matches in the middle of a word as well as at its end.

     NOTE(review): keyAt: is defined outside this method; it is assumed to
     tolerate indices outside the key - confirm."

    | lastIndex silentB |

    lastIndex := inputKey size.
    silentB :=
        (currentIndex > 1
            and: [(inputKey copyFrom: currentIndex - 1 to: (currentIndex + 1 min: lastIndex)) = 'UMB'])
        and: [currentIndex + 1 = lastIndex
            or: [(inputKey
                    copyFrom: (currentIndex + 2 min: lastIndex)
                    to: (currentIndex + 3 min: lastIndex)) = 'ER']].

    (silentB or: [(self keyAt: currentIndex + 1) = $M]) ifTrue: [
        skipCount := skipCount + 1.
    ].
    self
        addPrimaryTranslation: 'M';
        addSecondaryTranslation: 'M'.

    "Modified: / 28-07-2017 / 11:32:08 / cg"
|
2208 | 2093 |
! |
2094 |
||
2095 |
processN
    "Handle an 'N' at currentIndex
     (see the 'N' case in http://aspell.sourceforge.net/metaphone/dmetaph.cpp).
     A doubled N is consumed as one letter (skipCount is incremented);
     'N' is then appended to both translations."

    | nextChar |

    nextChar := self keyAt: currentIndex + 1.
    nextChar = $N ifTrue: [
        skipCount := skipCount + 1
    ].
    self addPrimaryTranslation: 'N'.
    self addSecondaryTranslation: 'N'.

    "Modified: / 28-07-2017 / 11:32:14 / cg"
|
2208 | 2115 |
! |
2116 |
||
2117 |
processNtilde
    "Handle an N with tilde at currentIndex: it is translated like a
     plain N, i.e. 'N' is appended to both translations.
     No following character is consumed."

    self addPrimaryTranslation: 'N'.
    self addSecondaryTranslation: 'N'.
|
2125 |
! |
|
2126 |
||
2127 |
processP
    "Handle a 'P' at currentIndex.
     Port of the 'P' case of the double metaphone reference implementation
     (http://aspell.sourceforge.net/metaphone/dmetaph.cpp).

       - PH yields F; the H is consumed.
       - otherwise 'P' is appended to both translations. If the P is followed
         by another P or by a B ('campbell', 'raspberry'), that following
         letter is consumed too, so the pair yields a single P.

     NOTE(review): keyAt: is defined outside this method; it is assumed to
     tolerate indices outside the key - confirm."

    | nextLetter |

    nextLetter := self keyAt: currentIndex + 1.
    nextLetter = $H ifTrue: [
        self
            addPrimaryTranslation: 'F';
            addSecondaryTranslation: 'F'.
        skipCount := skipCount + 1.
        ^ self
    ].

    "PP and PB: swallow the second letter, but still emit the P"
    (#($P $B) includes: nextLetter) ifTrue: [
        skipCount := skipCount + 1.
    ].
    self
        addPrimaryTranslation: 'P';
        addSecondaryTranslation: 'P'.

    "Modified: / 28-07-2017 / 11:32:28 / cg"
|
2208 | 2161 |
! |
2162 |
||
2163 |
processQ
    "Handle a 'Q' at currentIndex
     (see the 'Q' case in http://aspell.sourceforge.net/metaphone/dmetaph.cpp).
     A doubled Q is consumed as one letter (skipCount is incremented);
     'K' is then appended to both translations."

    | nextChar |

    nextChar := self keyAt: currentIndex + 1.
    nextChar = $Q ifTrue: [
        skipCount := skipCount + 1
    ].
    self addPrimaryTranslation: 'K'.
    self addSecondaryTranslation: 'K'.

    "Modified: / 28-07-2017 / 11:32:32 / cg"
|
2208 | 2183 |
! |
2184 |
||
2185 |
processR
    "Handle an 'R' at currentIndex
     (see the 'R' case in http://aspell.sourceforge.net/metaphone/dmetaph.cpp).

     For a french style ending such as 'rogier' (R is the last letter, the
     word is not slavo-germanic, the two letters before the R are IE, and the
     two letters before those are neither ME nor MA, which excludes
     'hochmeier'), the primary translation gets an empty string and only
     the secondary translation gets 'R'.
     In every other case 'R' is appended to both translations.
     A doubled R is consumed as one letter."

    | lastIndex frenchEnding primaryCode |

    lastIndex := inputKey size.
    frenchEnding :=
        currentIndex = lastIndex
        and: [(self isSlavoGermanic: inputKey) not
        and: [(inputKey
                copyFrom: (currentIndex - 2 max: 1)
                to: (currentIndex - 1 max: 1)) = 'IE'
        and: [(#('ME' 'MA') includes:
                (inputKey
                    copyFrom: (currentIndex - 4 max: 1)
                    to: (currentIndex - 3 max: 1))) not]]].

    primaryCode := frenchEnding ifTrue: [''] ifFalse: ['R'].
    self addPrimaryTranslation: primaryCode.
    self addSecondaryTranslation: 'R'.

    (self keyAt: currentIndex + 1) = $R ifTrue: [
        skipCount := skipCount + 1
    ].

    "Modified: / 28-07-2017 / 11:32:37 / cg"
|
2208 | 2223 |
! |
2224 |
||
2225 |
processRemainingCharacters
    "Walk over inputKey from startIndex to its end and dispatch each
     character to its handler method.

     While skipCount is non-zero, the character at hand has already been
     consumed by an earlier handler: it is not dispatched, skipCount is just
     decremented.
     Otherwise, the walk ends as soon as both translations are longer than
     4 characters. Vowels and Y are passed over. Every other character is
     handled by the method named 'process' followed by that character; the
     C with cedilla and the N with tilde are mapped to #processCedille and
     #processNtilde. Before dispatching, currentIndex is set to the
     character's position."

    startIndex to: inputKey size do: [:position |
        | letter handler |

        skipCount ~= 0
            ifTrue: [
                skipCount := skipCount - 1
            ]
            ifFalse: [
                (primaryTranslation size > 4 and: [secondaryTranslation size > 4])
                    ifTrue: [^ self].

                currentIndex := position.
                letter := self keyAt: position.

                (letter isVowel or: [letter = $Y]) ifFalse: [
                    handler := letter == $Ç
                        ifTrue: [#processCedille]
                        ifFalse: [
                            letter == $Ñ
                                ifTrue: [#processNtilde]
                                ifFalse: [('process', letter asString) asSymbol]].
                    self perform: handler
                ]
            ]
    ]

    "Modified: / 28-07-2017 / 11:24:15 / cg"
|
2208 | 2252 |
! |
2253 |
||
2254 |
processS
    "Handle an 'S' at currentIndex.
     Port of the 'S' case of the double metaphone reference implementation
     (http://aspell.sourceforge.net/metaphone/dmetaph.cpp); indices there are
     0-based, here they are 1-based.

     Rules, in order:
       - ISL, YSL ('island', 'isle', 'carlisle', 'carlysle'): S is silent
       - initial SUGAR: X / S
       - SH: S for germanic SHEIM, SHOEK, SHOLM, SHOLZ, otherwise X;
         the H is consumed
       - SIO, SIA, SIAN (italian, armenian): S / X if not slavo-germanic,
         otherwise S; two more characters are consumed
       - initial SM, SN, SL, SW, or SZ anywhere: S / X; a Z is consumed
       - SC (three characters are handled in total):
           SCH + OO ER EN UY ED EM: X / SK for ER and EN, otherwise SK
           SCH otherwise: X / S at the start of the word when the 4th letter
               is neither a vowel nor W, otherwise X
           SCI, SCE, SCY: S
           otherwise SK
       - final S after AI or OI ('resnais', 'artois'): '' / S
       - otherwise S; a following S or Z is consumed.

     NOTE(review): keyAt: is defined outside this method; it is assumed to
     tolerate indices outside the key - confirm."

    | nextChar char2 chars char lastIndex |

    lastIndex := inputKey size.

    "special cases 'island', 'isle', 'carlisle', 'carlysle'"
    (#('ISL' 'YSL') includes:
        (inputKey copyFrom: (currentIndex - 1 max: 1) to: (currentIndex + 1 min: lastIndex)))
            ifTrue: [^ self].

    "special case 'sugar-'"
    (currentIndex = 1 and: [(inputKey copyFrom: 1 to: (5 min: lastIndex)) = 'SUGAR'])
        ifTrue: [
            self
                addPrimaryTranslation: 'X';
                addSecondaryTranslation: 'S'.
            ^ self
        ].

    (inputKey copyFrom: currentIndex to: (currentIndex + 1 min: lastIndex)) = 'SH'
        ifTrue: [
            "germanic; the four characters starting at the H are compared"
            (#('HEIM' 'HOEK' 'HOLM' 'HOLZ') includes:
                (inputKey copyFrom: currentIndex + 1 to: (currentIndex + 4 min: lastIndex)))
                    ifTrue: [
                        self
                            addPrimaryTranslation: 'S';
                            addSecondaryTranslation: 'S'.
                    ]
                    ifFalse: [
                        self
                            addPrimaryTranslation: 'X';
                            addSecondaryTranslation: 'X'.
                    ].
            skipCount := skipCount + 1.
            ^ self
        ].

    "italian & armenian"
    ((#('SIO' 'SIA') includes:
            (inputKey copyFrom: currentIndex to: (currentIndex + 2 min: lastIndex)))
      or: [(inputKey copyFrom: currentIndex to: (currentIndex + 3 min: lastIndex)) = 'SIAN'])
        ifTrue: [
            (self isSlavoGermanic: inputKey) not
                ifTrue: [
                    self
                        addPrimaryTranslation: 'S';
                        addSecondaryTranslation: 'X'.
                ]
                ifFalse: [
                    self
                        addPrimaryTranslation: 'S';
                        addSecondaryTranslation: 'S'.
                ].
            skipCount := skipCount + 2.
            ^ self
        ].

    "german & anglicisations, e.g. 'smith' matches 'schmidt',
     'snider' matches 'schneider'; also -sz- in slavic languages"
    nextChar := self keyAt: currentIndex + 1.
    ((currentIndex = 1 and: [#($M $N $L $W) includes: nextChar])
      or: [nextChar = $Z])
        ifTrue: [
            self
                addPrimaryTranslation: 'S';
                addSecondaryTranslation: 'X'.
            nextChar = $Z ifTrue: [
                skipCount := skipCount + 1.
            ].
            ^ self
        ].

    (inputKey copyFrom: currentIndex to: (currentIndex + 1 min: lastIndex)) = 'SC'
        ifTrue: [
            char2 := self keyAt: currentIndex + 2.
            char2 = $H
                ifTrue: [
                    "Schlesinger's rule"
                    chars := inputKey
                                copyFrom: (currentIndex + 3 min: lastIndex)
                                to: (currentIndex + 4 min: lastIndex).
                    (#('OO' 'ER' 'EN' 'UY' 'ED' 'EM') includes: chars)
                        ifTrue: [
                            "dutch origin, e.g. 'school', 'schooner';
                             'schermerhorn', 'schenker'"
                            (#('ER' 'EN') includes: chars)
                                ifTrue: [
                                    self
                                        addPrimaryTranslation: 'X';
                                        addSecondaryTranslation: 'SK'.
                                ]
                                ifFalse: [
                                    self
                                        addPrimaryTranslation: 'SK';
                                        addSecondaryTranslation: 'SK'.
                                ].
                        ]
                        ifFalse: [
                            char := inputKey at: 4 ifAbsent: [$b].
                            (currentIndex = 1 and: [char isVowel not and: [char ~= $W]])
                                ifTrue: [
                                    self
                                        addPrimaryTranslation: 'X';
                                        addSecondaryTranslation: 'S'.
                                ]
                                ifFalse: [
                                    self
                                        addPrimaryTranslation: 'X';
                                        addSecondaryTranslation: 'X'.
                                ].
                        ].
                ]
                ifFalse: [
                    (#($I $E $Y) includes: char2)
                        ifTrue: [
                            self
                                addPrimaryTranslation: 'S';
                                addSecondaryTranslation: 'S'.
                        ]
                        ifFalse: [
                            self
                                addPrimaryTranslation: 'SK';
                                addSecondaryTranslation: 'SK'.
                        ].
                ].
            skipCount := skipCount + 2.
            ^ self
        ].

    "french e.g. 'resnais', 'artois'"
    (currentIndex = lastIndex
      and: [#('AI' 'OI') includes:
                (inputKey copyFrom: (currentIndex - 2 max: 1) to: (currentIndex - 1 max: 1))])
        ifTrue: [
            self
                addPrimaryTranslation: '';
                addSecondaryTranslation: 'S'.
        ]
        ifFalse: [
            self
                addPrimaryTranslation: 'S';
                addSecondaryTranslation: 'S'.
        ].

    (#($S $Z) includes: (self keyAt: currentIndex + 1)) ifTrue: [
        skipCount := skipCount + 1.
    ].

    "Modified: / 28-07-2017 / 11:34:18 / cg"
|
2208 | 2465 |
! |
2466 |
||
2467 |
processT
    "Handle a 'T' at currentIndex
     (see the 'T' case in http://aspell.sourceforge.net/metaphone/dmetaph.cpp).

     Rules, in order:
       - TION, TIA, TCH: X; two more characters are consumed
       - TH, TTH: T if followed by OM or AM ('thomas', 'thames') or if the
         word is germanic (starts with 'VAN ', 'VON ' or SCH);
         otherwise '0' / T. One more character is consumed.
       - otherwise T; a following T or D is consumed."

    | lastIndex twoChars threeChars fourChars hardT |

    lastIndex := inputKey size.
    fourChars := inputKey copyFrom: currentIndex to: (currentIndex + 3 min: lastIndex).
    threeChars := inputKey copyFrom: currentIndex to: (currentIndex + 2 min: lastIndex).
    twoChars := inputKey copyFrom: currentIndex to: (currentIndex + 1 min: lastIndex).

    (fourChars = 'TION' or: [#('TIA' 'TCH') includes: threeChars]) ifTrue: [
        self addPrimaryTranslation: 'X'.
        self addSecondaryTranslation: 'X'.
        skipCount := skipCount + 2.
        ^ self
    ].

    (twoChars = 'TH' or: [threeChars = 'TTH']) ifTrue: [
        "special case 'thomas', 'thames' or germanic"
        hardT :=
            (#('OM' 'AM') includes:
                (inputKey copyFrom: currentIndex + 2 to: (currentIndex + 3 min: lastIndex)))
            or: [(#('VAN ' 'VON ') includes: (inputKey copyFrom: 1 to: (4 min: lastIndex)))
            or: [(inputKey copyFrom: 1 to: (3 min: lastIndex)) = 'SCH']].

        self addPrimaryTranslation: (hardT ifTrue: ['T'] ifFalse: ['0']).
        self addSecondaryTranslation: 'T'.
        skipCount := skipCount + 1.
        ^ self
    ].

    (#($T $D) includes: (self keyAt: currentIndex + 1)) ifTrue: [
        skipCount := skipCount + 1
    ].
    self addPrimaryTranslation: 'T'.
    self addSecondaryTranslation: 'T'.

    "Modified: / 28-07-2017 / 11:33:33 / cg"
|
2208 | 2547 |
! |
2548 |
||
2549 |
processV
    "Handle a 'V' at currentIndex
     (see the 'V' case in http://aspell.sourceforge.net/metaphone/dmetaph.cpp).
     A doubled V is consumed as one letter (skipCount is incremented);
     'F' is then appended to both translations."

    | nextChar |

    nextChar := self keyAt: currentIndex + 1.
    nextChar = $V ifTrue: [
        skipCount := skipCount + 1
    ].
    self addPrimaryTranslation: 'F'.
    self addSecondaryTranslation: 'F'.

    "Modified: / 28-07-2017 / 11:34:27 / cg"
|
2208 | 2570 |
! |
2571 |
||
2572 |
processW
    "Handle a 'W' at currentIndex.
     Port of the 'W' case of the double metaphone reference implementation
     (http://aspell.sourceforge.net/metaphone/dmetaph.cpp); indices there are
     0-based, here they are 1-based.

     Rules, in order:
       - WR (also in the middle of a word): R; the R is consumed
       - W as the first letter, followed by a vowel or by H:
         A / F before a vowel ('Wasserman' should match 'Vasserman'),
         otherwise A ('Uomo' should match 'Womo').
         Processing continues with the rules below.
       - final W after a vowel ('Arnow' should match 'Arnoff'), W inside
         EWSKI, EWSKY, OWSKI, OWSKY, or any W in a word starting with SCH:
         '' / F
       - WICZ, WITZ (polish, e.g. 'filipowicz'): TS / FX; three more
         characters are consumed
       - otherwise the W is dropped.

     The WH test only applies to the first letter, as in the reference.

     NOTE(review): keyAt: is defined outside this method; it is assumed to
     tolerate indices outside the key - confirm."

    | word nextLetter lastIndex |

    lastIndex := inputKey size.
    word := inputKey copyFrom: currentIndex to: (currentIndex + 1 min: lastIndex).

    word = 'WR' ifTrue: [
        self
            addPrimaryTranslation: 'R';
            addSecondaryTranslation: 'R'.
        skipCount := skipCount + 1.
        ^ self
    ].

    nextLetter := self keyAt: currentIndex + 1.
    (currentIndex = 1 and: [nextLetter isVowel or: [word = 'WH']]) ifTrue: [
        nextLetter isVowel
            ifTrue: [
                self
                    addPrimaryTranslation: 'A';
                    addSecondaryTranslation: 'F'.
            ]
            ifFalse: [
                self
                    addPrimaryTranslation: 'A';
                    addSecondaryTranslation: 'A'.
            ].
    ].

    (((currentIndex = lastIndex and: [(self keyAt: currentIndex - 1) isVowel])
        or: [#('EWSKI' 'EWSKY' 'OWSKI' 'OWSKY') includes:
                (inputKey copyFrom: (currentIndex - 1 max: 1) to: (currentIndex + 3 min: lastIndex))])
        or: [inputKey startsWith: 'SCH'])
            ifTrue: [
                self
                    addPrimaryTranslation: '';
                    addSecondaryTranslation: 'F'.
                ^ self
            ].

    "the four characters starting at the W are compared"
    (#('WICZ' 'WITZ') includes:
        (inputKey copyFrom: currentIndex to: (currentIndex + 3 min: lastIndex)))
            ifTrue: [
                self
                    addPrimaryTranslation: 'TS';
                    addSecondaryTranslation: 'FX'.
                skipCount := skipCount + 3.
                ^ self
            ].

    "Modified: / 28-07-2017 / 11:34:51 / cg"
|
2208 | 2654 |
! |
2655 |
||
2656 |
processX
    "Handle the letter X at currentIndex, following the 'X' case of the
     reference implementation quoted below: X contributes 'KS' to both
     translations, unless it is the last letter of a french-style ending
     (...IAUX, ...EAUX, ...AUX, ...OUX). A following C or X is skipped.

     Fix: the letters compared against IAU/EAU/AU/OU are the ones immediately
     BEFORE the X (indices currentIndex-3 .. currentIndex-1, resp.
     currentIndex-2 .. currentIndex-1). The previous version clamped the start
     index with min: instead of max: and included the X itself, so the french
     ending was never recognized.

     http://aspell.sourceforge.net/metaphone/dmetaph.cpp
        case 'X':
                //french e.g. breaux
                if(!!((current == last)
                        AND (StringAt((current - 3), 3, IAU, EAU, )
                         OR StringAt((current - 2), 2, AU, OU, ))) )
                        MetaphAdd(KS);

                if(StringAt((current + 1), 1, C, X, ))
                        current += 2;
                else
                        current += 1;
                break;
    "

    | isFrenchEnding |

    "max: 1 keeps the start index valid for short keys;
     for a key consisting of a single X the copied range is empty"
    isFrenchEnding :=
        (currentIndex = inputKey size)
        and: [
            (#('IAU' 'EAU') includes: (inputKey copyFrom: ((currentIndex - 3) max: 1) to: currentIndex - 1))
            or: [#('AU' 'OU') includes: (inputKey copyFrom: ((currentIndex - 2) max: 1) to: currentIndex - 1)]
        ].

    isFrenchEnding ifFalse: [
        self addPrimaryTranslation: 'KS';
             addSecondaryTranslation: 'KS'.
    ].

    "XC / XX: skip the following letter as well"
    (#($C $X) includes: (self keyAt: currentIndex + 1)) ifTrue: [
        skipCount := skipCount + 1.
    ].

    "Modified: / 28-07-2017 / 11:34:44 / cg"
2208 | 2687 |
! |
2688 |
||
2689 |
processZ
    "Handle the letter Z at currentIndex, following the 'Z' case of the
     reference implementation quoted below.

     Fix: the letter before the Z is now compared against the character $T.
     The previous version compared it against the string 'T', which is never
     equal to a character, so the 'not preceded by T' test was always true.

     http://aspell.sourceforge.net/metaphone/dmetaph.cpp
        case 'Z':
                //chinese pinyin e.g. 'zhao'
                if(GetAt(current + 1) == 'H')
                {
                        MetaphAdd(J);
                        current += 2;
                        break;
                }else
                        if(StringAt((current + 1), 2, ZO, ZI, ZA, )
                                OR (SlavoGermanic() AND ((current > 0) AND GetAt(current - 1) !!= 'T')))
                        {
                                MetaphAdd(S, TS);
                        }
                        else
                                MetaphAdd(S);

                if(GetAt(current + 1) == 'Z')
                        current += 2;
                else
                        current += 1;
                break;
    "

    | followingPair |

    "chinese pinyin e.g. 'zhao'"
    (self keyAt: currentIndex + 1) = $H ifTrue: [
        self addPrimaryTranslation: 'J';
             addSecondaryTranslation: 'J'.
        skipCount := skipCount + 1.
        ^ self
    ].

    "the (up to) two letters following the Z, clamped to the end of the key"
    followingPair := inputKey
                        copyFrom: ((currentIndex + 1) min: inputKey size)
                        to: ((currentIndex + 2) min: inputKey size).

    ((#('ZO' 'ZI' 'ZA') includes: followingPair)
        or: [
            (self isSlavoGermanic: inputKey)
            and: [currentIndex > 1 and: [(self keyAt: currentIndex - 1) ~= $T]]
        ])
    ifTrue: [
        self addPrimaryTranslation: 'S';
             addSecondaryTranslation: 'TS'.
    ] ifFalse: [
        self addPrimaryTranslation: 'S';
             addSecondaryTranslation: 'S'.
    ].

    "ZZ: skip the second Z"
    (self keyAt: currentIndex + 1) = $Z ifTrue: [
        skipCount := skipCount + 1.
    ].

    "Modified: / 28-07-2017 / 11:35:12 / cg"
|
2739 |
! ! |
|
2740 |
||
4491 | 2741 |
!PhoneticStringUtilities::ExtendedSoundexStringComparator class methodsFor:'documentation'! |
2742 |
||
2743 |
documentation |
|
2744 |
" |
|
2745 |
There are many extended and enhanced soundex variants around; |
|
2746 |
here is one, called 'extended soundex'. It is described for example in |
|
2747 |
http://www.epidata.dk/documentation.php. |
|
2748 |
An author or origin is unknown. |
|
2749 |
||
2750 |
The number of digits is increased to 5 or 8; |
|
2751 |
The first character is not used literally; instead it is encoded like the rest. |
|
2752 |
This might have a negative effect on names starting with a vowel, though. |
|
2753 |
||
2754 |
Overall, it can be doubted if this is really an enhancement after all. |
|
2755 |
" |
|
2756 |
! ! |
|
2757 |
||
2758 |
!PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'api'! |
|
2759 |
||
2760 |
phoneticStringsFor:aString
    "generates both an extended soundex of length 5 and one of length 8.

     Each character of the uppercased input is mapped via #translate:.
     Characters without a code (nil) and separators ('0') are not emitted, and a
     code equal to the code of the immediately preceding character is dropped.
     At most 8 codes are collected; shorter results are padded with '0'.
     Answers an Array with the 5-digit code and the 8-digit code.

     Fix: the first element is now always exactly 5 digits long. Previously, an
     input producing 6 or 7 codes returned those 6 or 7 digits as the 'length 5'
     code, because truncation only happened when 8 codes had been reached."

    |codes t prevCode|

    codes := ''.
    aString asUppercase do:[:c |
        t := self translate:c.
        (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
            codes := codes , t.
            codes size == 8 ifTrue:[
                ^ Array with:(codes copyTo:5) with:codes
            ].
        ].
        "remembered even if nil or '0', so a separator in between allows a repeated code"
        prevCode := t
    ].
    [ codes size < 8 ] whileTrue:[
        codes := codes , '0'
    ].
    ^ Array with:(codes copyTo:5) with:codes

    "
     self basicNew phoneticStringsFor:'müller'      #('87900' '87900000')
     self basicNew phoneticStringsFor:'miller'      #('87900' '87900000')
     self basicNew phoneticStringsFor:'muller'      #('87900' '87900000')
     self basicNew phoneticStringsFor:'muler'       #('87900' '87900000')
     self basicNew phoneticStringsFor:'schmidt'     #('38600' '38600000')
     self basicNew phoneticStringsFor:'schneider'   #('38690' '38690000')
     self basicNew phoneticStringsFor:'fischer'     #('23900' '23900000')
     self basicNew phoneticStringsFor:'weber'       #('19000' '19000000')
     self basicNew phoneticStringsFor:'meyer'       #('89000' '89000000')
     self basicNew phoneticStringsFor:'wagner'      #('48900' '48900000')
     self basicNew phoneticStringsFor:'schulz'      #('37500' '37500000')
     self basicNew phoneticStringsFor:'becker'      #('13900' '13900000')
     self basicNew phoneticStringsFor:'hoffmann'    #('28800' '28800000')
     self basicNew phoneticStringsFor:'schäfer'     #('32900' '32900000')
     self basicNew phoneticStringsFor:'christopher' #('39361' '39361900')
    "
|
2803 |
! ! |
|
2804 |
||
2805 |
!PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'private'! |
|
2806 |
||
2807 |
translate:aCharacter
    "Answer the extended soundex code (a one-character string) for the given
     uppercase character, '0' for the separator characters A E I O U Y,
     or nil if the character has no code.

     use simple if's for more speed when compiled.

     Note: an earlier version tested $G a second time in the D/T group ('6');
     that test could never be reached because $G already answered '4' above,
     so it has been removed. G keeps its code '4'."

    "vowels serve as separators"
    aCharacter == $A ifTrue:[^ '0' ].
    aCharacter == $E ifTrue:[^ '0' ].
    aCharacter == $I ifTrue:[^ '0' ].
    aCharacter == $O ifTrue:[^ '0' ].
    aCharacter == $U ifTrue:[^ '0' ].
    aCharacter == $Y ifTrue:[^ '0' ].

    aCharacter == $B ifTrue:[^ '1' ].
    aCharacter == $P ifTrue:[^ '1' ].

    aCharacter == $F ifTrue:[^ '2' ].
    aCharacter == $V ifTrue:[^ '2' ].

    aCharacter == $C ifTrue:[^ '3' ].
    aCharacter == $S ifTrue:[^ '3' ].
    aCharacter == $K ifTrue:[^ '3' ].

    aCharacter == $G ifTrue:[^ '4' ].
    aCharacter == $J ifTrue:[^ '4' ].

    aCharacter == $Q ifTrue:[^ '5' ].
    aCharacter == $X ifTrue:[^ '5' ].
    aCharacter == $Z ifTrue:[^ '5' ].

    aCharacter == $D ifTrue:[^ '6' ].
    aCharacter == $T ifTrue:[^ '6' ].

    aCharacter == $L ifTrue:[^ '7' ].

    aCharacter == $M ifTrue:[^ '8' ].
    aCharacter == $N ifTrue:[^ '8' ].

    aCharacter == $R ifTrue:[^ '9' ].
    "anything else (H, W, lowercase, non-letters, ...) has no code"
    ^ nil
|
2846 |
! ! |
|
2847 |
||
2848 |
!PhoneticStringUtilities::SingleResultPhoneticStringComparator class methodsFor:'documentation'! |
|
2849 |
||
2850 |
documentation |
|
2851 |
" |
|
2852 |
documentation to be added. |
|
2853 |
||
2854 |
[author:] |
|
2855 |
cg |
|
2856 |
||
2857 |
[instance variables:] |
|
2858 |
||
2859 |
[class variables:] |
|
2860 |
||
2861 |
[see also:] |
|
2862 |
||
2863 |
" |
|
2864 |
! ! |
|
2865 |
||
2866 |
!PhoneticStringUtilities::SingleResultPhoneticStringComparator methodsFor:'api'! |
|
2867 |
||
2868 |
encode:word
    "Answer the phonetic encoding of word.
     Must be implemented by concrete subclasses."

    ^ self subclassResponsibility

    "Created: / 28-07-2017 / 15:20:49 / cg"
!

phoneticStringsFor:word
    "Answer a one-element Array holding the result of #encode: for word."

    | encoded |

    encoded := self encode:word.
    ^ Array with:encoded

    "Created: / 28-07-2017 / 15:20:38 / cg"
|
2878 |
! ! |
|
2879 |
||
2880 |
!PhoneticStringUtilities::MRAStringComparator class methodsFor:'documentation'! |
|
2881 |
||
2882 |
documentation |
|
2883 |
" |
|
2884 |
Match Rating Approach Encoder |
|
2885 |
||
2886 |
The Western Airlines matching rating approach name encoder |
|
2887 |
||
2888 |
[see also:] |
|
2889 |
https://en.wikipedia.org/wiki/Match_Rating_Approach |
|
2890 |
||
2891 |
G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery, |
|
2892 |
''Accessing Individual Records from Personal Data Files Using Nonunique Identifiers'' |
|
2893 |
US National Institute of Standards and Technology, SP-500-2 (1977), p. 17. |
|
2894 |
" |
|
2895 |
! |
|
2896 |
||
2897 |
rCode |
|
2898 |
"<<END |
|
2899 |
## Copyright (c) 2015, James P. Howard, II <jh@jameshoward.us> |
|
2900 |
## |
|
2901 |
## Redistribution and use in source and binary forms, with or without |
|
2902 |
## modification, are permitted provided that the following conditions are |
|
2903 |
## met: |
|
2904 |
## |
|
2905 |
## Redistributions of source code must retain the above copyright |
|
2906 |
## notice, this list of conditions and the following disclaimer. |
|
2907 |
## |
|
2908 |
## Redistributions in binary form must reproduce the above copyright |
|
2909 |
## notice, this list of conditions and the following disclaimer in |
|
2910 |
## the documentation and/or other materials provided with the |
|
2911 |
## distribution. |
|
2912 |
## |
|
2913 |
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
|
2914 |
## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
|
2915 |
## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
|
2916 |
## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
|
2917 |
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|
2918 |
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
|
2919 |
## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
|
2920 |
## DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
|
2921 |
## THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
2922 |
## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
2923 |
## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
2924 |
||
2925 |
#' @rdname mra |
|
2926 |
#' @title Match Rating Approach Encoder |
|
2927 |
#' |
|
2928 |
#' @description |
|
2929 |
#' The Western Airlines matching rating approach name encoder |
|
2930 |
#' |
|
2931 |
#' @param word string or vector of strings to encode |
|
2932 |
#' @param x MRA-encoded character vector |
|
2933 |
#' @param y MRA-encoded character vector |
|
2934 |
#' |
|
2935 |
#' @details |
|
2936 |
#' |
|
2937 |
#' The variable \code{word} is the name to be encoded. The variable |
|
2938 |
#' \code{maxCodeLen} is \emph{not} supported in this algorithm encoder |
|
2939 |
#' because the algorithm itself is dependent upon its six-character |
|
2940 |
#' length. The variables \code{x} and \code{y} are MRA-encoded and are |
|
2941 |
#' compared to each other using the MRA comparison specification. |
|
2942 |
#' |
|
2943 |
#' @return The \code{mra_encode} function returns match rating approach |
|
2944 |
#' encoded character vector. The \code{mra_compare} returns a boolean |
|
2945 |
#' vector which is \code{TRUE} if \code{x} and \code{y} pass the MRA |
|
2946 |
#' comparison test. |
|
2947 |
#' |
|
2948 |
#' @references |
|
2949 |
#' |
|
2950 |
#' G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery, |
|
2951 |
#' \emph{Accessing Individual Records from Personal Data Files Using |
|
2952 |
#' Nonunique Identifiers,} US National Institute of Standards and |
|
2953 |
#' Technology, SP-500-2 (1977), p. 17. |
|
2954 |
#' |
|
2955 |
#' @family phonics |
|
2956 |
#' |
|
2957 |
#' @examples |
|
2958 |
#' mra_encode("William") |
|
2959 |
#' mra_encode(c("Peter", "Peady")) |
|
2960 |
#' mra_encode("Stevenson") |
|
2961 |
||
2962 |
#' @rdname mra |
|
2963 |
#' @name mra_encode |
|
2964 |
#' @export |
|
2965 |
mra_encode <- function(word) { |
|
2966 |
||
2967 |
## First, remove any nonalphabetical characters and uppercase it |
|
2968 |
word <- gsub("[^[:alpha:]]*", "", word) |
|
2969 |
word <- toupper(word) |
|
2970 |
||
2971 |
## First character of key = first character of name |
|
2972 |
first <- substr(word, 1, 1) |
|
2973 |
word <- substr(word, 2, nchar(word)) |
|
2974 |
||
2975 |
## Delete vowels not at the start of the word |
|
2976 |
word <- gsub("[AEIOU]", "", word) |
|
2977 |
word <- paste(first, word, sep = "") |
|
2978 |
||
2979 |
## Remove duplicate consecutive characters |
|
2980 |
word <- gsub("([A-Z])\\1+", "\\1", word) |
|
2981 |
||
2982 |
## If longer than 6 characters, take first and last 3...and we have |
|
2983 |
## to vectorize it |
|
2984 |
for(i in 1:length(word)) { |
|
2985 |
if((l = nchar(word[i])) > 6) { |
|
2986 |
first <- substr(word[i], 1, 3) |
|
2987 |
last <- substr(word[i], l - 2, l) |
|
2988 |
word[i] <- paste(first, last, sep = ""); |
|
2989 |
} |
|
2990 |
} |
|
2991 |
||
2992 |
return(word) |
|
2993 |
} |
|
2994 |
||
2995 |
#' @rdname mra |
|
2996 |
#' @name mra_compare |
|
2997 |
#' @export |
|
2998 |
mra_compare <- function(x, y) { |
|
2999 |
mra <- data.frame(x = x, y = y, sim = 0, min = 100, stringsAsFactors = FALSE) |
|
3000 |
||
3001 |
## Obtain the minimum rating value by calculating the length sum of |
|
3002 |
## the encoded strings and using table A (from Wikipedia). We start |
|
3003 |
## by setting the minimum to be the sum and move from there. |
|
3004 |
mra$lensum <- nchar(mra$x) + nchar(mra$y) |
|
3005 |
mra$min[mra$lensum == 12] <- 2 |
|
3006 |
mra$min[mra$lensum > 7 && mra$lensum <= 11] <- 3 |
|
3007 |
mra$min[mra$lensum > 4 && mra$lensum <= 7] <- 4 |
|
3008 |
mra$min[mra$lensum <= 4] <- 5 |
|
3009 |
||
3010 |
## If the length difference between the encoded strings is 3 or |
|
3011 |
## greater, then no similarity comparison is done. For us, we |
|
3012 |
## continue the similarity comparison out of laziness and ensure the |
|
3013 |
## minimum is impossibly high to meet. |
|
3014 |
mra$min[abs(nchar(mra$x) - nchar(mra$y)) >= 3] <- 100 |
|
3015 |
||
3016 |
## Start the comparison. |
|
3017 |
x <- strsplit(mra$x, split = "") |
|
3018 |
y <- strsplit(mra$y, split = "") |
|
3019 |
rows <- nrow(mra) |
|
3020 |
for(i in 1:rows) { |
|
3021 |
## Process the encoded strings from left to right and remove any |
|
3022 |
## identical characters found from both strings respectively. |
|
3023 |
j <- 1 |
|
3024 |
while(j < min(length(x[[i]]), length(y[[i]]))) { |
|
3025 |
if(x[[i]][j] == y[[i]][j]) { |
|
3026 |
x[[i]] <- x[[i]][-j] |
|
3027 |
y[[i]] <- y[[i]][-j] |
|
3028 |
} else |
|
3029 |
j <- j + 1 |
|
3030 |
} |
|
3031 |
||
3032 |
## Process the unmatched characters from right to left and |
|
3033 |
## remove any identical characters found from both names |
|
3034 |
## respectively. |
|
3035 |
x[[i]] <- rev(x[[i]]) |
|
3036 |
y[[i]] <- rev(y[[i]]) |
|
3037 |
j <- 1 |
|
3038 |
while(j < min(length(x[[i]]), length(y[[i]]))) { |
|
3039 |
if(x[[i]][j] == y[[i]][j]) { |
|
3040 |
x[[i]] <- x[[i]][-j] |
|
3041 |
y[[i]] <- y[[i]][-j] |
|
3042 |
} else |
|
3043 |
j <- j + 1 |
|
3044 |
} |
|
3045 |
## Subtract the number of unmatched characters from 6 in the |
|
3046 |
## longer string. This is the similarity rating. |
|
3047 |
len <- min(length(x[[i]]), length(y[[i]])) |
|
3048 |
mra$sim[i] <- 6 - len |
|
3049 |
} |
|
3050 |
||
3051 |
## If the similarity is greater than or equal to the minimum |
|
3052 |
## required, it is a successful match. |
|
3053 |
mra$match <- (mra$sim >= mra$min) |
|
3054 |
return(mra$match) |
|
3055 |
} |
|
3056 |
||
3057 |
END>> |
|
3058 |
! ! |
|
3059 |
||
3060 |
!PhoneticStringUtilities::MRAStringComparator methodsFor:'api'! |
|
3061 |
||
3062 |
encode:wordIn
    "Answer the Match Rating Approach code for wordIn.
     see https://en.wikipedia.org/wiki/Match_Rating_Approach

     Steps: keep letters only and uppercase them; drop vowels except a leading
     one; collapse runs of identical consecutive characters; if more than 6
     characters remain, keep the first 3 and the last 3.

     Fix: an input without any letter (e.g. the empty string) now answers an
     empty string; previously 'word first' was sent to an empty collection."

    |word prev|

    word := wordIn.

    "/ First, remove any nonalphabetical characters and uppercase it

    word := word select:#isLetter thenCollect:#asUppercase.

    "/ nothing left to encode (empty input, or input without letters)
    word isEmpty ifTrue:[^ ''].

    "/ Delete vowels not at the start of the word

    word := word first asString , ((word from:2) reject:#isVowel).

    "/ Remove duplicate consecutive characters
    "/ (duplicates are first marked with $* and then filtered out)

    prev := nil.
    word := word
                collect:[:char |
                    char == prev ifTrue:[
                        $*
                    ] ifFalse:[
                        prev := char.
                        char.
                    ].
                ]
                thenSelect:[:char | char ~~ $*].

    "/ If longer than 6 characters, take first and last 3
    word size > 6 ifTrue:[
        word := (word copyFirst:3),(word copyLast:3)
    ].
    ^ word.

    "
     self new encode:'Catherine'          -> 'CTHRN'
     self new encode:'CatherineCatherine' -> 'CTHHRN'
     self new encode:'Butter'             -> 'BTR'
     self new encode:'Byrne'              -> 'BYRN'
     self new encode:'Boern'              -> 'BRN'
     self new encode:'Smith'              -> 'SMTH'
     self new encode:'Smyth'              -> 'SMYTH'
     self new encode:'Kathryn'            -> 'KTHRYN'
     self new encode:''                   -> ''
     self new encode:'123'                -> ''
    "

    "Created: / 28-07-2017 / 15:19:22 / cg"
    "Modified (comment): / 31-07-2017 / 15:14:31 / cg"
|
3110 |
! ! |
|
3111 |
||
3112 |
!PhoneticStringUtilities::MetaphoneStringComparator class methodsFor:'documentation'! |
|
3113 |
||
3114 |
documentation |
|
3115 |
" |
|
4495 | 3116 |
Ongoing work - do not use at the moment |
3117 |
||
4491 | 3118 |
Encodes a string into a Metaphone value. |
3119 |
||
3120 |
Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. |
|
3121 |
Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. |
|
3122 |
||
3123 |
Hanging on the Metaphone by Lawrence Philips in Computer Language of Dec. 1990, p 39. |
|
3124 |
Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations: |
|
3125 |
https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm |
|
3126 |
||
3127 |
They have had undocumented changes from the originally published algorithm. |
|
3128 |
For more information, see https://issues.apache.org/jira/browse/CODEC-57 |
|
3129 |
||
3130 |
Metaphone uses the following rules: |
|
3131 |
||
3132 |
Doubled letters except 'c' -> drop 2nd letter. |
|
3133 |
Vowels are only kept when they are the first letter. |
|
3134 |
B -> B unless at the end of a word after 'm' as in 'dumb' |
|
3135 |
C -> X (sh) if -cia- or -ch- |
|
3136 |
S if -ci-, -ce- or -cy- |
|
3137 |
K otherwise, including -sch- |
|
3138 |
D -> J if in -dge-, -dgy- or -dgi-; T otherwise |
|
3139 |
F -> F |
|
3140 |
G -> silent if in -gh- and not at end or before a vowel in -gn- or -gned- (also see dge etc. above) |
|
3141 |
J if before i or e or y if not double gg; K otherwise |
|
3142 |
H -> silent if after vowel and no vowel follows; H otherwise |
|
3143 |
J -> J |
|
3144 |
K -> silent if after 'c'; K otherwise |
|
3145 |
L -> L |
|
3146 |
M -> M |
|
3147 |
N -> N |
|
3148 |
P -> F if before 'h'; P otherwise |
|
3149 |
Q -> K |
|
3150 |
R -> R |
|
3151 |
S -> X (sh) if before 'h' or in -sio- or -sia-; S otherwise |
|
3152 |
T -> X (sh) if -tia- or -tio- 0 (th) if before 'h' silent if in -tch-; T otherwise |
|
3153 |
V -> F |
|
3154 |
W -> silent if not followed by a vowel W if followed by a vowel |
|
3155 |
X -> KS |
|
3156 |
Y -> silent if not followed by a vowel Y if followed by a vowel |
|
3157 |
Z -> S |
|
3158 |
||
3159 |
Initial Letter Exceptions |
|
3160 |
||
3161 |
Initial kn-, gn- pn, ae- or wr- -> drop first letter |
|
3162 |
Initial x- -> change to 's' |
|
3163 |
Initial wh- -> change to 'w' |
|
3164 |
||
3165 |
||
3166 |
self new encode:'a' |
|
3167 |
self new encode:'dumb' |
|
3168 |
self new encode:'MILLER' |
|
3169 |
self new encode:'schmidt' |
|
3170 |
self new encode:'schneider' |
|
3171 |
self new encode:'FISCHER' |
|
3172 |
self new encode:'HEDGY' |
|
3173 |
self new encode:'weber' |
|
3174 |
self new encode:'wagner' |
|
3175 |
self new encode:'van gogh' |
|
3176 |
" |
|
3177 |
! |
|
3178 |
||
3179 |
javaCode |
|
3180 |
"<<END |
|
3181 |
/* |
|
3182 |
* Licensed to the Apache Software Foundation (ASF) under one or more |
|
3183 |
* contributor license agreements. See the NOTICE file distributed with |
|
3184 |
* this work for additional information regarding copyright ownership. |
|
3185 |
* The ASF licenses this file to You under the Apache License, Version 2.0 |
|
3186 |
* (the "License"); you may not use this file except in compliance with |
|
3187 |
* the License. You may obtain a copy of the License at |
|
3188 |
* |
|
3189 |
* http://www.apache.org/licenses/LICENSE-2.0 |
|
3190 |
* |
|
3191 |
* Unless required by applicable law or agreed to in writing, software |
|
3192 |
* distributed under the License is distributed on an "AS IS" BASIS, |
|
3193 |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
3194 |
* See the License for the specific language governing permissions and |
|
3195 |
* limitations under the License. |
|
3196 |
*/ |
|
3197 |
||
3198 |
package org.apache.commons.codec.language; |
|
3199 |
||
3200 |
import org.apache.commons.codec.EncoderException; |
|
3201 |
import org.apache.commons.codec.StringEncoder; |
|
3202 |
||
3203 |
/** |
|
3204 |
* Encodes a string into a Metaphone value. |
|
3205 |
* <p> |
|
3206 |
* Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. |
|
3207 |
* Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. |
|
3208 |
* <p> |
|
3209 |
* <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, |
|
3210 |
* p 39.</CITE> |
|
3211 |
* <p> |
|
3212 |
* Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations: |
|
3213 |
* </p> |
|
3214 |
* <ul> |
|
3215 |
* <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a> |
|
3216 |
* (broken link 4/30/2013) </li> |
|
3217 |
* <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a> |
|
3218 |
* (link checked 4/30/2013) </li> |
|
3219 |
* </ul> |
|
3220 |
* <p> |
|
3221 |
* They have had undocumented changes from the originally published algorithm. |
|
3222 |
* For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>. |
|
3223 |
* <p> |
|
3224 |
* This class is conditionally thread-safe. |
|
3225 |
* The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)} |
|
3226 |
* but is not volatile, and accesses are not synchronized. |
|
3227 |
* If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization |
|
3228 |
* is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} |
|
3229 |
* after initial setup. |
|
3230 |
* |
|
3231 |
* @version $Id$ |
|
3232 |
*/ |
|
3233 |
public class Metaphone implements StringEncoder { |
|
3234 |
||
3235 |
/** |
|
3236 |
* Five values in the English language |
|
3237 |
*/ |
|
3238 |
private static final String VOWELS = "AEIOU"; |
|
3239 |
||
3240 |
/** |
|
3241 |
* Variable used in Metaphone algorithm |
|
3242 |
*/ |
|
3243 |
private static final String FRONTV = "EIY"; |
|
3244 |
||
3245 |
/** |
|
3246 |
* Variable used in Metaphone algorithm |
|
3247 |
*/ |
|
3248 |
private static final String VARSON = "CSPTG"; |
|
3249 |
||
3250 |
/** |
|
3251 |
* The max code length for metaphone is 4 |
|
3252 |
*/ |
|
3253 |
private int maxCodeLen = 4; |
|
3254 |
||
3255 |
/** |
|
3256 |
* Creates an instance of the Metaphone encoder |
|
3257 |
*/ |
|
3258 |
public Metaphone() { |
|
3259 |
super(); |
|
3260 |
} |
|
3261 |
||
3262 |
/** |
|
3263 |
* Find the metaphone value of a String. This is similar to the |
|
3264 |
* soundex algorithm, but better at finding similar sounding words. |
|
3265 |
* All input is converted to upper case. |
|
3266 |
* Limitations: Input format is expected to be a single ASCII word |
|
3267 |
* with only characters in the A - Z range, no punctuation or numbers. |
|
3268 |
* |
|
3269 |
* @param txt String to find the metaphone code for |
|
3270 |
* @return A metaphone code corresponding to the String supplied |
|
3271 |
*/ |
|
3272 |
public String metaphone(final String txt) { |
|
3273 |
boolean hard = false; |
|
3274 |
int txtLength; |
|
3275 |
if (txt == null || (txtLength = txt.length()) == 0) { |
|
3276 |
return ""; |
|
3277 |
} |
|
3278 |
// single character is itself |
|
3279 |
if (txtLength == 1) { |
|
3280 |
return txt.toUpperCase(java.util.Locale.ENGLISH); |
|
3281 |
} |
|
3282 |
||
3283 |
final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray(); |
|
3284 |
||
3285 |
final StringBuilder local = new StringBuilder(40); // manipulate |
|
3286 |
final StringBuilder code = new StringBuilder(10); // output |
|
3287 |
// handle initial 2 characters exceptions |
|
3288 |
switch(inwd[0]) { |
|
3289 |
case 'K': |
|
3290 |
case 'G': |
|
3291 |
case 'P': /* looking for KN, etc*/ |
|
3292 |
if (inwd[1] == 'N') { |
|
3293 |
local.append(inwd, 1, inwd.length - 1); |
|
3294 |
} else { |
|
3295 |
local.append(inwd); |
|
3296 |
} |
|
3297 |
break; |
|
3298 |
case 'A': /* looking for AE */ |
|
3299 |
if (inwd[1] == 'E') { |
|
3300 |
local.append(inwd, 1, inwd.length - 1); |
|
3301 |
} else { |
|
3302 |
local.append(inwd); |
|
3303 |
} |
|
3304 |
break; |
|
3305 |
case 'W': /* looking for WR or WH */ |
|
3306 |
if (inwd[1] == 'R') { // WR -> R |
|
3307 |
local.append(inwd, 1, inwd.length - 1); |
|
3308 |
break; |
|
3309 |
} |
|
3310 |
if (inwd[1] == 'H') { |
|
3311 |
local.append(inwd, 1, inwd.length - 1); |
|
3312 |
local.setCharAt(0, 'W'); // WH -> W |
|
3313 |
} else { |
|
3314 |
local.append(inwd); |
|
3315 |
} |
|
3316 |
break; |
|
3317 |
case 'X': /* initial X becomes S */ |
|
3318 |
inwd[0] = 'S'; |
|
3319 |
local.append(inwd); |
|
3320 |
break; |
|
3321 |
default: |
|
3322 |
local.append(inwd); |
|
3323 |
} // now local has working string with initials fixed |
|
3324 |
||
3325 |
final int wdsz = local.length(); |
|
3326 |
int n = 0; |
|
3327 |
||
3328 |
while (code.length() < this.getMaxCodeLen() && |
|
3329 |
n < wdsz ) { // max code size of 4 works well |
|
3330 |
final char symb = local.charAt(n); |
|
3331 |
// remove duplicate letters except C |
|
3332 |
if (symb !!= 'C' && isPreviousChar( local, n, symb ) ) { |
|
3333 |
n++; |
|
3334 |
} else { // not dup |
|
3335 |
switch(symb) { |
|
3336 |
case 'A': |
|
3337 |
case 'E': |
|
3338 |
case 'I': |
|
3339 |
case 'O': |
|
3340 |
case 'U': |
|
3341 |
if (n == 0) { |
|
3342 |
code.append(symb); |
|
3343 |
} |
|
3344 |
break; // only use vowel if leading char |
|
3345 |
case 'B': |
|
3346 |
if ( isPreviousChar(local, n, 'M') && |
|
3347 |
isLastChar(wdsz, n) ) { // B is silent if word ends in MB |
|
3348 |
break; |
|
3349 |
} |
|
3350 |
code.append(symb); |
|
3351 |
break; |
|
3352 |
case 'C': // lots of C special cases |
|
3353 |
/* discard if SCI, SCE or SCY */ |
|
3354 |
if ( isPreviousChar(local, n, 'S') && |
|
3355 |
!!isLastChar(wdsz, n) && |
|
3356 |
FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) { |
|
3357 |
break; |
|
3358 |
} |
|
3359 |
if (regionMatch(local, n, "CIA")) { // "CIA" -> X |
|
3360 |
code.append('X'); |
|
3361 |
break; |
|
3362 |
} |
|
3363 |
if (!!isLastChar(wdsz, n) && |
|
3364 |
FRONTV.indexOf(local.charAt(n + 1)) >= 0) { |
|
3365 |
code.append('S'); |
|
3366 |
break; // CI,CE,CY -> S |
|
3367 |
} |
|
3368 |
if (isPreviousChar(local, n, 'S') && |
|
3369 |
isNextChar(local, n, 'H') ) { // SCH->sk |
|
3370 |
code.append('K'); |
|
3371 |
break; |
|
3372 |
} |
|
3373 |
if (isNextChar(local, n, 'H')) { // detect CH |
|
3374 |
if (n == 0 && |
|
3375 |
wdsz >= 3 && |
|
3376 |
isVowel(local,2) ) { // CH consonant -> K consonant |
|
3377 |
code.append('K'); |
|
3378 |
} else { |
|
3379 |
code.append('X'); // CHvowel -> X |
|
3380 |
} |
|
3381 |
} else { |
|
3382 |
code.append('K'); |
|
3383 |
} |
|
3384 |
break; |
|
3385 |
case 'D': |
|
3386 |
if (!!isLastChar(wdsz, n + 1) && |
|
3387 |
isNextChar(local, n, 'G') && |
|
3388 |
FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J |
|
3389 |
code.append('J'); n += 2; |
|
3390 |
} else { |
|
3391 |
code.append('T'); |
|
3392 |
} |
|
3393 |
break; |
|
3394 |
case 'G': // GH silent at end or before consonant |
|
3395 |
if (isLastChar(wdsz, n + 1) && |
|
3396 |
isNextChar(local, n, 'H')) { |
|
3397 |
break; |
|
3398 |
} |
|
3399 |
if (!!isLastChar(wdsz, n + 1) && |
|
3400 |
isNextChar(local,n,'H') && |
|
3401 |
!!isVowel(local,n+2)) { |
|
3402 |
break; |
|
3403 |
} |
|
3404 |
if (n > 0 && |
|
3405 |
( regionMatch(local, n, "GN") || |
|
3406 |
regionMatch(local, n, "GNED") ) ) { |
|
3407 |
break; // silent G |
|
3408 |
} |
|
3409 |
if (isPreviousChar(local, n, 'G')) { |
|
3410 |
// NOTE: Given that duplicated chars are removed, I don't see how this can ever be true |
|
3411 |
hard = true; |
|
3412 |
} else { |
|
3413 |
hard = false; |
|
3414 |
} |
|
3415 |
if (!!isLastChar(wdsz, n) && |
|
3416 |
FRONTV.indexOf(local.charAt(n + 1)) >= 0 && |
|
3417 |
!!hard) { |
|
3418 |
code.append('J'); |
|
3419 |
} else { |
|
3420 |
code.append('K'); |
|
3421 |
} |
|
3422 |
break; |
|
3423 |
case 'H': |
|
3424 |
if (isLastChar(wdsz, n)) { |
|
3425 |
break; // terminal H |
|
3426 |
} |
|
3427 |
if (n > 0 && |
|
3428 |
VARSON.indexOf(local.charAt(n - 1)) >= 0) { |
|
3429 |
break; |
|
3430 |
} |
|
3431 |
if (isVowel(local,n+1)) { |
|
3432 |
code.append('H'); // Hvowel |
|
3433 |
} |
|
3434 |
break; |
|
3435 |
case 'F': |
|
3436 |
case 'J': |
|
3437 |
case 'L': |
|
3438 |
case 'M': |
|
3439 |
case 'N': |
|
3440 |
case 'R': |
|
3441 |
code.append(symb); |
|
3442 |
break; |
|
3443 |
case 'K': |
|
3444 |
if (n > 0) { // not initial |
|
3445 |
if (!!isPreviousChar(local, n, 'C')) { |
|
3446 |
code.append(symb); |
|
3447 |
} |
|
3448 |
} else { |
|
3449 |
code.append(symb); // initial K |
|
3450 |
} |
|
3451 |
break; |
|
3452 |
case 'P': |
|
3453 |
if (isNextChar(local,n,'H')) { |
|
3454 |
// PH -> F |
|
3455 |
code.append('F'); |
|
3456 |
} else { |
|
3457 |
code.append(symb); |
|
3458 |
} |
|
3459 |
break; |
|
3460 |
case 'Q': |
|
3461 |
code.append('K'); |
|
3462 |
break; |
|
3463 |
case 'S': |
|
3464 |
if (regionMatch(local,n,"SH") || |
|
3465 |
regionMatch(local,n,"SIO") || |
|
3466 |
regionMatch(local,n,"SIA")) { |
|
3467 |
code.append('X'); |
|
3468 |
} else { |
|
3469 |
code.append('S'); |
|
3470 |
} |
|
3471 |
break; |
|
3472 |
case 'T': |
|
3473 |
if (regionMatch(local,n,"TIA") || |
|
3474 |
regionMatch(local,n,"TIO")) { |
|
3475 |
code.append('X'); |
|
3476 |
break; |
|
3477 |
} |
|
3478 |
if (regionMatch(local,n,"TCH")) { |
|
3479 |
// Silent if in "TCH" |
|
3480 |
break; |
|
3481 |
} |
|
3482 |
// substitute numeral 0 for TH (resembles theta after all) |
|
3483 |
if (regionMatch(local,n,"TH")) { |
|
3484 |
code.append('0'); |
|
3485 |
} else { |
|
3486 |
code.append('T'); |
|
3487 |
} |
|
3488 |
break; |
|
3489 |
case 'V': |
|
3490 |
code.append('F'); break; |
|
3491 |
case 'W': |
|
3492 |
case 'Y': // silent if not followed by vowel |
|
3493 |
if (!!isLastChar(wdsz,n) && |
|
3494 |
isVowel(local,n+1)) { |
|
3495 |
code.append(symb); |
|
3496 |
} |
|
3497 |
break; |
|
3498 |
case 'X': |
|
3499 |
code.append('K'); |
|
3500 |
code.append('S'); |
|
3501 |
break; |
|
3502 |
case 'Z': |
|
3503 |
code.append('S'); |
|
3504 |
break; |
|
3505 |
default: |
|
3506 |
// do nothing |
|
3507 |
break; |
|
3508 |
} // end switch |
|
3509 |
n++; |
|
3510 |
} // end else from symb !!= 'C' |
|
3511 |
if (code.length() > this.getMaxCodeLen()) { |
|
3512 |
code.setLength(this.getMaxCodeLen()); |
|
3513 |
} |
|
3514 |
} |
|
3515 |
return code.toString(); |
|
3516 |
} |
|
3517 |
||
3518 |
private boolean isVowel(final StringBuilder string, final int index) { |
|
3519 |
return VOWELS.indexOf(string.charAt(index)) >= 0; |
|
3520 |
} |
|
3521 |
||
3522 |
private boolean isPreviousChar(final StringBuilder string, final int index, final char c) { |
|
3523 |
boolean matches = false; |
|
3524 |
if( index > 0 && |
|
3525 |
index < string.length() ) { |
|
3526 |
matches = string.charAt(index - 1) == c; |
|
3527 |
} |
|
3528 |
return matches; |
|
3529 |
} |
|
3530 |
||
3531 |
private boolean isNextChar(final StringBuilder string, final int index, final char c) { |
|
3532 |
boolean matches = false; |
|
3533 |
if( index >= 0 && |
|
3534 |
index < string.length() - 1 ) { |
|
3535 |
matches = string.charAt(index + 1) == c; |
|
3536 |
} |
|
3537 |
return matches; |
|
3538 |
} |
|
3539 |
||
3540 |
private boolean regionMatch(final StringBuilder string, final int index, final String test) { |
|
3541 |
boolean matches = false; |
|
3542 |
if( index >= 0 && |
|
3543 |
index + test.length() - 1 < string.length() ) { |
|
3544 |
final String substring = string.substring( index, index + test.length()); |
|
3545 |
matches = substring.equals( test ); |
|
3546 |
} |
|
3547 |
return matches; |
|
3548 |
} |
|
3549 |
||
3550 |
private boolean isLastChar(final int wdsz, final int n) { |
|
3551 |
return n + 1 == wdsz; |
|
3552 |
} |
|
3553 |
||
3554 |
||
3555 |
/** |
|
3556 |
* Encodes an Object using the metaphone algorithm. This method |
|
3557 |
* is provided in order to satisfy the requirements of the |
|
3558 |
* Encoder interface, and will throw an EncoderException if the |
|
3559 |
* supplied object is not of type java.lang.String. |
|
3560 |
* |
|
3561 |
* @param obj Object to encode |
|
3562 |
* @return An object (or type java.lang.String) containing the |
|
3563 |
* metaphone code which corresponds to the String supplied. |
|
3564 |
* @throws EncoderException if the parameter supplied is not |
|
3565 |
* of type java.lang.String |
|
3566 |
*/ |
|
3567 |
@Override |
|
3568 |
public Object encode(final Object obj) throws EncoderException { |
|
3569 |
if (!!(obj instanceof String)) { |
|
3570 |
throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); |
|
3571 |
} |
|
3572 |
return metaphone((String) obj); |
|
3573 |
} |
|
3574 |
||
3575 |
/** |
|
3576 |
* Encodes a String using the Metaphone algorithm. |
|
3577 |
* |
|
3578 |
* @param str String object to encode |
|
3579 |
* @return The metaphone code corresponding to the String supplied |
|
3580 |
*/ |
|
3581 |
@Override |
|
3582 |
public String encode(final String str) { |
|
3583 |
return metaphone(str); |
|
3584 |
} |
|
3585 |
||
3586 |
/** |
|
3587 |
* Tests is the metaphones of two strings are identical. |
|
3588 |
* |
|
3589 |
* @param str1 First of two strings to compare |
|
3590 |
* @param str2 Second of two strings to compare |
|
3591 |
* @return <code>true</code> if the metaphones of these strings are identical, |
|
3592 |
* <code>false</code> otherwise. |
|
3593 |
*/ |
|
3594 |
public boolean isMetaphoneEqual(final String str1, final String str2) { |
|
3595 |
return metaphone(str1).equals(metaphone(str2)); |
|
3596 |
} |
|
3597 |
||
3598 |
/** |
|
3599 |
* Returns the maxCodeLen. |
|
3600 |
* @return int |
|
3601 |
*/ |
|
3602 |
public int getMaxCodeLen() { return this.maxCodeLen; } |
|
3603 |
||
3604 |
/** |
|
3605 |
* Sets the maxCodeLen. |
|
3606 |
* @param maxCodeLen The maxCodeLen to set |
|
3607 |
*/ |
|
3608 |
public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; } |
|
3609 |
||
3610 |
} |
|
3611 |
END>>" |
|
3612 |
! ! |
|
3613 |
||
3614 |
!PhoneticStringUtilities::MetaphoneStringComparator methodsFor:'api'! |
|
3615 |
||
3616 |
encode:txt
    "Answer the Metaphone key of txt as an uppercase string.
     The rules are those of the Java implementation which is quoted in a comment
     earlier in this file, with two differences that are visible in the code below:
        - the key is not truncated to a maximum length
        - of doubled letters (other than C) the first one is dropped instead of
          the second one, and nothing is dropped at the very start of the word.
     An empty txt answers the empty string;
     a txt consisting of a single character answers that character in uppercase."

    "
     self new encode:'a'
     self new encode:'MILLER'
     self new encode:'schmidt'
     self new encode:'schneider'
     self new encode:'FISCHER'
     self new encode:'HEDGY'
     self new encode:'weber'
     self new encode:'wagner'
     self new encode:'van gogh'
     self new encode:'dumb'
     self new encode:'teacher'
    "

    |local code wdsz n first second|

    txt size == 0 ifTrue:[^ ''].

    local := txt asUppercase.
    "/ a single character is its own code
    txt size == 1 ifTrue:[^ local].

    "/ exceptions for the first two characters;
    "/ the tests below are mutually exclusive, because they look at the
    "/ original first/second character and not at the already shortened word
    first := local at:1.
    second := local at:2.
    (('KGP' includes:first) and:[ second == $N ]) ifTrue:[
        "/ KNx, GNx, PNx -> Nx
        local := local from:2
    ].
    (first == $A and:[ second == $E ]) ifTrue:[
        "/ AEx -> Ex
        local := local from:2
    ].
    first == $W ifTrue:[
        second == $R ifTrue:[
            "/ WRx -> Rx
            local := local from:2
        ].
        second == $H ifTrue:[
            "/ WHx -> Wx
            local := 'W' , (local from:3)
        ].
    ].
    first == $X ifTrue:[
        "/ initial X becomes S
        local := 'S' , (local from:2)
    ].

    "/ now local is the working string with the initials fixed
    code := '' writeStream.
    wdsz := local size.
    n := 1.

    [ n <= wdsz ] whileTrue:[
        |symb prevChar nextChar nextNextChar isFirstChar isLastChar|

        symb := local at:n.
        isFirstChar := (n == 1).
        isLastChar := (n == wdsz).
        "/ the neighbours are nil where the word ends; each of them is assigned
        "/ on every round, so that nothing is left over from the previous character
        prevChar := isFirstChar ifTrue:[nil] ifFalse:[local at:n - 1].
        nextChar := isLastChar ifTrue:[nil] ifFalse:[local at:n + 1].
        nextNextChar := (n + 2) <= wdsz ifTrue:[local at:n + 2] ifFalse:[nil].

        "/ a letter which is followed by the same letter is skipped,
        "/ except for C and except for the first character of the word.
        "/ The tests inside are independent of each other, because every one
        "/ of them checks symb against a different set of letters.
        (symb == $C or:[ nextChar ~~ symb or:[ isFirstChar ] ]) ifTrue:[

            ('AEIOU' includes:symb) ifTrue:[
                "/ a vowel is only coded as leading character
                isFirstChar ifTrue:[ code nextPut:symb ].
            ].

            symb == $B ifTrue:[
                "/ B is silent if the word ends in MB
                (isLastChar and:[ prevChar == $M ]) ifFalse:[
                    code nextPut:symb
                ].
            ].

            symb == $C ifTrue:[
                (prevChar == $S and:[ 'EIY' includes:nextChar ]) ifFalse:[
                    "/ (in SCE, SCI, SCY the C is dropped)
                    (nextChar == $I and:[ nextNextChar == $A ]) ifTrue:[
                        "/ CIA -> X
                        code nextPut:$X
                    ] ifFalse:[
                        ('EIY' includes:nextChar) ifTrue:[
                            "/ CE, CI, CY -> S
                            code nextPut:$S
                        ] ifFalse:[
                            nextChar == $H ifTrue:[
                                "/ SCH -> SK;
                                "/ CH + vowel at the very start of the word -> K
                                "/ (as in the Java code: n == 0 && wdsz >= 3 && isVowel(local,2));
                                "/ any other CH -> X
                                (prevChar == $S
                                 or:[ isFirstChar and:[ 'AEIOU' includes:nextNextChar ] ])
                                ifTrue:[
                                    code nextPut:$K
                                ] ifFalse:[
                                    code nextPut:$X
                                ]
                            ] ifFalse:[
                                code nextPut:$K
                            ]
                        ]
                    ]
                ].
            ].

            symb == $D ifTrue:[
                (nextChar == $G and:[ 'EIY' includes:nextNextChar ]) ifTrue:[
                    "/ DGE, DGI, DGY -> J; the G and the vowel are consumed as well
                    code nextPut:$J.
                    n := n + 2.
                ] ifFalse:[
                    code nextPut:$T.
                ].
            ].

            symb == $G ifTrue:[
                |silent hard|

                "/ GH is silent at the end of the word and before a consonant
                "/ (at the end, nextNextChar is nil, which is no vowel either);
                "/ G is silent in GN, unless it is the first character
                silent := (nextChar == $H and:[ ('AEIOU' includes:nextNextChar) not ])
                          or:[ isFirstChar not and:[ nextChar == $N ] ].
                silent ifFalse:[
                    "/ the second G of a GG is hard
                    hard := (prevChar == $G).
                    (hard not and:[ 'EIY' includes:nextChar ]) ifTrue:[
                        code nextPut:$J
                    ] ifFalse:[
                        code nextPut:$K
                    ].
                ].
            ].

            symb == $H ifTrue:[
                "/ H is only coded in front of a vowel (so never at the end), and not
                "/ after C, S, P, T, G - those digraphs are handled at the first letter
                (('CSPTG' includes:prevChar) not and:[ 'AEIOU' includes:nextChar ]) ifTrue:[
                    code nextPut:$H
                ].
            ].

            ('FJLMNR' includes:symb) ifTrue:[
                code nextPut:symb.
            ].

            symb == $K ifTrue:[
                "/ K is silent after C (prevChar is nil for an initial K)
                prevChar == $C ifFalse:[
                    code nextPut:$K
                ].
            ].

            symb == $P ifTrue:[
                nextChar == $H ifTrue:[
                    "/ PH -> F
                    code nextPut:$F
                ] ifFalse:[
                    code nextPut:symb
                ].
            ].

            symb == $Q ifTrue:[
                code nextPut:$K
            ].

            symb == $S ifTrue:[
                "/ SH  -> X (as in shave or ashton)
                "/ SIO -> X
                "/ SIA -> X (as in ASIA)
                (nextChar == $H
                 or:[ nextChar == $I and:[ 'OA' includes:nextNextChar ] ])
                ifTrue:[
                    code nextPut:$X
                ] ifFalse:[
                    code nextPut:$S
                ].
            ].

            symb == $T ifTrue:[
                (nextChar == $I and:[ 'AO' includes:nextNextChar ]) ifTrue:[
                    "/ TIA, TIO -> X
                    code nextPut:$X
                ] ifFalse:[
                    (nextChar == $C and:[ nextNextChar == $H ]) ifFalse:[
                        "/ (T is silent in TCH)
                        nextChar == $H ifTrue:[
                            "/ the digit 0 stands for TH (it resembles a theta)
                            code nextPut:$0
                        ] ifFalse:[
                            code nextPut:$T
                        ].
                    ].
                ].
            ].

            symb == $V ifTrue:[
                code nextPut:$F
            ].

            ('WY' includes:symb) ifTrue:[
                "/ silent if not followed by a vowel
                (isLastChar not and:[ 'AEIOU' includes:nextChar ]) ifTrue:[
                    code nextPut:symb
                ].
            ].

            symb == $X ifTrue:[
                code nextPutAll:'KS'
            ].

            symb == $Z ifTrue:[
                code nextPut:$S
            ].

            "/ any other character is ignored
        ].
        n := n + 1.
    ].
    ^ code contents

    "Created: / 02-08-2017 / 09:51:31 / cg"
    "Modified: / 03-08-2017 / 14:55:22 / cg"
4491 | 4037 |
! ! |
4038 |
||
4039 |
!PhoneticStringUtilities::SoundexStringComparator class methodsFor:'documentation'! |
|
4040 |
||
4041 |
documentation |
|
4042 |
" |
|
4043 |
WARNING: this is the so called 'simplified soundex' algorithm; |
|
4044 |
there are more variants like miracode (american soundex) or |
|
4045 |
mysqlSoundex around. |
|
4046 |
||
4047 |
Be sure to use the correct algorithm, if the generated strings must be compatible |
|
4048 |
(otherwise, the differences are probably too small to be noticed as effect, but |
|
4049 |
your search will be different) |
|
4050 |
||
4051 |
The following was copied from http://www.civilsolutions.com.au/publications/dedup.htm |
|
4052 |
||
4053 |
SOUNDEX is a phonetic coding algorithm that ignores many of the unreliable |
|
4054 |
components of names, but by doing so reports more matches. |
|
4055 |
||
4056 |
There are some variations around in the literature; |
|
4057 |
the following is called 'simplified soundex', and the rules for coding a name are: |
|
4058 |
||
4059 |
1. The first letter of the name is used in its un-coded form to serve as the prefix |
|
4060 |
character of the code. (The rest of the code is numerical). |
|
4061 |
||
4062 |
2. Thereafter, W and H are ignored entirely. |
|
4063 |
||
4064 |
3. A, E, I, O, U, Y are not assigned a code number, but do serve as 'separators' (see Step 5). |
|
4065 |
||
4066 |
4. Other letters of the name are converted to a numerical equivalent: |
|
4067 |
B, P, F, V 1 |
|
4068 |
C, G, J, K, Q, S, X, Z 2 |
|
4069 |
D, T 3 |
|
4070 |
L 4 |
|
4071 |
M, N 5 |
|
4072 |
R 6 |
|
4073 |
||
4074 |
5. There are two exceptions: |
|
4075 |
1. Letters that follow prefix letters which would, if coded, have the same |
|
4076 |
numerical code, are ignored in all cases unless a ''separator'' (see Step 3) precedes them. |
|
4077 |
||
4078 |
2. The second letter of any pair of consonants having the same code number is likewise ignored, |
|
4079 |
i.e. unless there is a ''separator'' between them in the name. |
|
4080 |
||
4081 |
6. The final SOUNDEX code consists of the prefix letter plus three numerical characters. |
|
4082 |
Longer codes are truncated to this length, and shorter codes are extended to it by adding zeros. |
|
4083 |
||
4084 |
Notice, that in another variant, w and h are treated slightly differently. |
|
4085 |
This is only of relevance, if you need to reconstruct original soundex codes of other programs |
|
4086 |
or for the original 1880 us census data. |
|
4087 |
SoundexStringComparator new encode:'Ashcraft' -> 'A226' |
|
4088 |
vs. |
|
4089 |
MiracodeStringComparator new encode:'Ashcraft' -> 'A261' |
|
4090 |
||
4091 |
Also notice, that soundex deals better with english. |
|
4092 |
For german and other languages, other algorithms may provide better results. |
|
4093 |
" |
|
4094 |
! ! |
|
4095 |
||
4096 |
!PhoneticStringUtilities::SoundexStringComparator methodsFor:'api'! |
|
4097 |
||
4098 |
encode:word
    "Answer the simplified soundex code of word:
     the uppercased first character, followed by code digits, four characters in total.
     Characters for which translate: answers '0' (the separators) or nil add no digit;
     a digit is also not added if it equals the code of the directly preceding character.
     Shorter codes are padded with '0'.
     An empty word answers the empty string (there is no first character to start from)."

    |u p t prevCode|

    word size == 0 ifTrue:[^ ''].

    u := word asUppercase.
    p := u first asString.
    prevCode := self translate:u first.
    u from:2 to:u size do:[:c |
        t := self translate:c.
        (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
            p := p , t.
            "/ longer codes are cut off here
            p size == 4 ifTrue:[^ p ].
        ].
        "/ every character (also separators and ignored ones) ends a run of equal codes
        prevCode := t
    ].
    [ p size < 4 ] whileTrue:[
        p := p , '0'
    ].
    ^ (p copyFrom:1 to:4)

    "
     self new encode:'washington' -> 'W252'
     self new encode:'lee'        -> 'L000'
     self new encode:'Gutierrez'  -> 'G362'
     self new encode:'Pfister'    -> 'P236'
     self new encode:'Jackson'    -> 'J250'
     self new encode:'Tymczak'    -> 'T522'
     self new encode:''           -> ''
    "

    "notice:
     MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
     self new encode:'Ashcraft'                     -> 'A226'
    "

    "Created: / 28-07-2017 / 15:21:23 / cg"
    "Modified (comment): / 01-08-2017 / 19:01:43 / cg"
|
4133 |
! ! |
|
4134 |
||
4135 |
!PhoneticStringUtilities::SoundexStringComparator methodsFor:'private'! |
|
4136 |
||
4137 |
translate:aCharacter
    "Answer the soundex code of aCharacter as a one-character string:
     '0' for A, E, I, O, U, Y (which serve as separators),
     '1' .. '6' for the coded uppercase consonants,
     nil for anything else (for example W and H, or a lowercase letter)."

    ('AEIOUY' includes:aCharacter) ifTrue:[^ '0' ].
    ('BPFV' includes:aCharacter) ifTrue:[^ '1' ].
    ('CSKGJQXZ' includes:aCharacter) ifTrue:[^ '2' ].
    ('DT' includes:aCharacter) ifTrue:[^ '3' ].
    aCharacter == $L ifTrue:[^ '4' ].
    ('MN' includes:aCharacter) ifTrue:[^ '5' ].
    aCharacter == $R ifTrue:[^ '6' ].
    ^ nil

    "Modified: / 02-08-2017 / 01:35:40 / cg"
    "Modified (comment): / 02-08-2017 / 14:30:11 / cg"
|
4175 |
! ! |
|
4176 |
||
4177 |
!PhoneticStringUtilities::MySQLSoundexStringComparator class methodsFor:'documentation'! |
|
4178 |
||
4179 |
documentation |
|
4180 |
" |
|
4181 |
MySQL soundex is like american Soundex (i.e. miracode) without the 4 character limitation, |
|
4182 |
and also removing vowels first, then removing duplicate codes |
|
4183 |
(whereas the soundex code does this in reverse order). |
|
4184 |
||
4185 |
These variations are important, if you need the miracode soundex codes to be generated. |
|
4186 |
" |
|
4187 |
! ! |
|
4188 |
||
4189 |
!PhoneticStringUtilities::MySQLSoundexStringComparator methodsFor:'api'! |
|
4190 |
||
4191 |
encode:word
    "Answer the soundex code of word without the limit to four characters
     (codes shorter than four characters are still padded with '0').
     Other than in the simplified soundex, a separator (translate: answers '0'),
     a W or an H between two characters with the same code does not make
     the second one count again.
     An empty word answers the empty string (there is no first character to start from)."

    |u p t prevCode|

    word size == 0 ifTrue:[^ ''].

    u := word asUppercase.
    p := u first asString.
    prevCode := self translate:u first.
    u from:2 to:u size do:[:c |
        t := self translate:c.
        (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[
            p := p , t.
        ].
        "/ separators, W and H leave the remembered code as it is
        (t ~= '0' and:[ c ~= $W and:[c ~= $H]]) ifTrue:[
            prevCode := t.
        ].
    ].
    [ p size < 4 ] whileTrue:[
        p := p , '0'
    ].
    ^ p

    "
     self new encode:'Ashcraft' -> 'A2613'
     self new encode:'lee'      -> 'L000'
     self new encode:''         -> ''
    "

    "Created: / 28-07-2017 / 15:23:41 / cg"
    "Modified: / 31-07-2017 / 17:53:51 / cg"
    "Modified (comment): / 02-08-2017 / 14:31:15 / cg"
|
4216 |
! ! |
|
4217 |
||
4218 |
!PhoneticStringUtilities::NYSIISStringComparator class methodsFor:'documentation'! |
|
4219 |
||
4220 |
documentation |
|
4221 |
" |
|
4222 |
NYSIIS Algorithm: |
|
4223 |
||
4224 |
1. |
|
4225 |
remove all ''S'' and ''Z'' chars from the end of the surname |
|
4226 |
||
4227 |
2. |
|
4228 |
transcode initial strings |
|
4229 |
MAC => MC |
|
4230 |
PF => F |
|
4231 |
||
4232 |
3. |
|
4233 |
Transcode trailing strings as follows, |
|
4234 |
||
4235 |
IX => IC |
|
4236 |
EX => EC |
|
4237 |
YE,EE,IE => Y |
|
4238 |
NT,ND => D |
|
4239 |
||
4240 |
4. |
|
4241 |
transcode ''EV'' to ''EF'' if not at start of name |
|
4242 |
||
4243 |
5. |
|
4244 |
use first character of name as first character of key |
|
4245 |
||
4246 |
6. |
|
4247 |
remove any ''W'' that follows a vowel |
|
4248 |
||
4249 |
7. |
|
4250 |
replace all vowels with ''A'' |
|
4251 |
||
4252 |
8. |
|
4253 |
transcode ''GHT'' to ''GT'' |
|
4254 |
||
4255 |
9. |
|
4256 |
transcode ''DG'' to ''G'' |
|
4257 |
||
4258 |
10. |
|
4259 |
transcode ''PH'' to ''F'' |
|
4260 |
||
4261 |
11. |
|
4262 |
if not first character, eliminate all ''H'' preceded or followed by a vowel |
|
4263 |
||
4264 |
12. |
|
4265 |
change ''KN'' to ''N'', else ''K'' to ''C'' |
|
4266 |
||
4267 |
13. |
|
4268 |
if not first character, change ''M'' to ''N'' |
|
4269 |
||
4270 |
14. |
|
4271 |
if not first character, change ''Q'' to ''G'' |
|
4272 |
||
4273 |
15. |
|
4274 |
transcode ''SH'' to ''S'' |
|
4275 |
||
4276 |
16. |
|
4277 |
transcode ''SCH'' to ''S'' |
|
4278 |
||
4279 |
17. |
|
4280 |
transcode ''YW'' to ''Y'' |
|
4281 |
||
4282 |
18. |
|
4283 |
if not first or last character, change ''Y'' to ''A'' |
|
4284 |
||
4285 |
19. |
|
4286 |
transcode ''WR'' to ''R'' |
|
4287 |
||
4288 |
20. |
|
4289 |
if not first character, change ''Z'' to ''S'' |
|
4290 |
||
4291 |
21. |
|
4292 |
transcode terminal ''AY'' to ''Y'' |
|
4293 |
||
4294 |
22. |
|
4295 |
remove trailing vowels
|
4296 |
||
4297 |
23. |
|
4298 |
collapse all strings of repeated characters |
|
4299 |
||
4300 |
24. |
|
4301 |
if first char of original surname was a vowel, append it to the code |
|
4302 |
" |
|
4303 |
! ! |
|
4304 |
||
4305 |
!PhoneticStringUtilities::NYSIISStringComparator methodsFor:'api'! |
|
4306 |
||
4307 |
encode:aString
    "Answer the NYSIIS key for aString.
     The uppercased name is passed through rule 1, then through rules 2..23
     in numeric order (each rule takes the key produced by the previous one),
     and finally through rule 24, which also needs the original name."

    |key|

    key := self rule1:aString asUppercase.
    #(
        #rule2:  #rule3:  #rule4:  #rule5:  #rule6:  #rule7:
        #rule8:  #rule9:  #rule10: #rule11: #rule12: #rule13:
        #rule14: #rule15: #rule16: #rule17: #rule18: #rule19:
        #rule20: #rule21: #rule22: #rule23:
    ) do:[:ruleSelector |
        key := self perform:ruleSelector with:key
    ].
    ^ self rule24:key originalKey:aString
|
4336 |
||
4337 |
" |
|
4338 |
self new encode:'hello' |
|
4339 |
self new encode:'bliss' |
|
4340 |
" |
|
4341 |
" |
|
4342 |
self new phoneticStringsFor:'hello' |
|
4343 |
self new phoneticStringsFor:'bliss' |
|
4344 |
" |
|
4345 |
||
4346 |
"Created: / 28-07-2017 / 15:34:52 / cg" |
|
4347 |
"Modified (comment): / 02-08-2017 / 14:31:47 / cg" |
|
4348 |
! ! |
|
4349 |
||
4350 |
!PhoneticStringUtilities::NYSIISStringComparator methodsFor:'private'! |
|
4351 |
||
4352 |
rule10:key
    "NYSIIS rule 10: every 'PH' in key becomes 'F'.
     Answers a new string."

    |transcoded|

    transcoded := self
                    transcodeAll:'PH'
                    of:key
                    to:'F'
                    startingAt:1.
    ^ transcoded
|
4356 |
||
4357 |
"Modified (format): / 02-08-2017 / 14:34:27 / cg" |
|
4358 |
! |
|
4359 |
||
4360 |
rule11:key
    "NYSIIS rule 11: except at the first position, drop every 'H' that is
     preceded or followed by a vowel.
     The decision for each position is taken on the unmodified key; the
     scan runs from the end so that a removal never shifts a position that
     is still to be visited."

    |result|

    result := key copy.
    key size to:2 by:-1 do:[:pos |
        |nextToVowel|

        (key at:pos) = $H ifTrue:[
            nextToVowel := (key at:pos - 1) isVowel
                            or:[ pos < key size and:[ (key at:pos + 1) isVowel ] ].
            nextToVowel ifTrue:[
                result := (result copyFrom:1 to:pos - 1)
                          , (result copyFrom:pos + 1 to:result size)
            ]
        ]
    ].
    ^ result
|
4376 |
! |
|
4377 |
||
4378 |
rule12:key
    "NYSIIS rule 12: change 'KN' to 'N', else 'K' to 'C'.
     'KN' must be rewritten first and must lose its K; otherwise the
     second pass would turn the K into a 'C' instead of dropping it
     (e.g. 'KNAT' has to become 'NAT', not 'CAT')."

    |k|

    k := self transcodeAll:'KN' of:key to:'N' startingAt:1.
    k := self transcodeAll:'K' of:k to:'C' startingAt:1.
    ^ k
|
4385 |
||
4386 |
"Modified (format): / 02-08-2017 / 14:34:48 / cg" |
|
4387 |
! |
|
4388 |
||
4389 |
rule13:key
    "NYSIIS rule 13: every 'M' except one at the first position becomes 'N'.
     Answers a new string."

    |transcoded|

    transcoded := self
                    transcodeAll:'M'
                    of:key
                    to:'N'
                    startingAt:2.
    ^ transcoded
|
4393 |
||
4394 |
"Modified (format): / 02-08-2017 / 14:35:00 / cg" |
|
4395 |
! |
|
4396 |
||
4397 |
rule14:key
    "NYSIIS rule 14: every 'Q' except one at the first position becomes 'G'.
     Answers a new string."

    |transcoded|

    transcoded := self
                    transcodeAll:'Q'
                    of:key
                    to:'G'
                    startingAt:2.
    ^ transcoded
|
4401 |
||
4402 |
"Modified (format): / 02-08-2017 / 14:35:08 / cg" |
|
4403 |
! |
|
4404 |
||
4405 |
rule15:key
    "NYSIIS rule 15: every 'SH' in key becomes 'S'.
     Answers a new string."

    |transcoded|

    transcoded := self
                    transcodeAll:'SH'
                    of:key
                    to:'S'
                    startingAt:1.
    ^ transcoded
|
4409 |
||
4410 |
"Modified (format): / 02-08-2017 / 14:35:18 / cg" |
|
4411 |
! |
|
4412 |
||
4413 |
rule16:key
    "NYSIIS rule 16: every 'SCH' in key becomes 'S'.
     Answers a new string."

    |transcoded|

    transcoded := self
                    transcodeAll:'SCH'
                    of:key
                    to:'S'
                    startingAt:1.
    ^ transcoded
|
4417 |
||
4418 |
"Modified (format): / 02-08-2017 / 14:35:25 / cg" |
|
4419 |
! |
|
4420 |
||
4421 |
rule17:key
    "NYSIIS rule 17: every 'YW' in key becomes 'Y'.
     Answers a new string."

    |transcoded|

    transcoded := self
                    transcodeAll:'YW'
                    of:key
                    to:'Y'
                    startingAt:1.
    ^ transcoded
|
4425 |
||
4426 |
"Modified (format): / 02-08-2017 / 14:35:33 / cg" |
|
4427 |
! |
|
4428 |
||
4429 |
rule18:key
    "NYSIIS rule 18: a 'Y' that is neither the first nor the last character
     becomes 'A'.
     Only the inner positions 2 .. size-1 are visited, so the first and the
     last character never change and an empty or one-character key is simply
     copied (the previous version sent #last to the key and therefore failed
     on an empty one).
     Answers a new string."

    |k|

    k := key copy.
    2 to:k size - 1 do:[:i |
        (k at:i) = $Y ifTrue:[ k at:i put:$A ]
    ].
    ^ k
|
4438 |
||
4439 |
"Modified (format): / 02-08-2017 / 14:35:44 / cg" |
|
4440 |
! |
|
4441 |
||
4442 |
rule19:key
    "NYSIIS rule 19: every 'WR' in key becomes 'R'.
     Answers a new string."

    |transcoded|

    transcoded := self
                    transcodeAll:'WR'
                    of:key
                    to:'R'
                    startingAt:1.
    ^ transcoded
|
4446 |
||
4447 |
"Modified (format): / 02-08-2017 / 14:35:52 / cg" |
|
4448 |
! |
|
4449 |
||
4450 |
rule1:key
    "NYSIIS rule 1: remove all 'S' and 'Z' characters from the end of the name.
     Scans backwards for the last character to keep and copies once.
     The size check lets an empty key, or one consisting only of S/Z,
     answer an empty string instead of failing on #last.
     Answers a new string."

    |end|

    end := key size.
    [ end > 0 and:[ 'SZ' includes:(key at:end) ] ] whileTrue:[
        end := end - 1
    ].
    ^ key copyFrom:1 to:end
|
4459 |
! |
|
4460 |
||
4461 |
rule20:key
    "NYSIIS rule 20: every 'Z' except one at the first position becomes 'S'.
     Answers a new string."

    |transcoded|

    transcoded := self
                    transcodeAll:'Z'
                    of:key
                    to:'S'
                    startingAt:2.
    ^ transcoded
|
4465 |
||
4466 |
"Modified (format): / 02-08-2017 / 14:36:00 / cg" |
|
4467 |
! |
|
4468 |
||
4469 |
rule21:key
    "NYSIIS rule 21: a terminal 'AY' becomes 'Y'.
     Tests the suffix directly instead of searching from index (size - 1),
     which was 0 or negative for keys shorter than two characters.
     Answers a new string."

    (key endsWith:'AY') ifTrue:[
        ^ (key copyButLast:2) , 'Y'
    ].
    ^ key copy
|
4473 |
||
4474 |
"Modified (format): / 02-08-2017 / 14:36:08 / cg" |
|
4475 |
! |
|
4476 |
||
4477 |
rule22:key
    "NYSIIS rule 22: remove trailing vowels.
     Scans backwards for the last non-vowel and copies once.
     The size check lets a key consisting only of vowels (for example 'AA'
     after rule 7) answer an empty string instead of failing on #last.
     Answers a new string."

    |end|

    end := key size.
    [ end > 0 and:[ (key at:end) isVowel ] ] whileTrue:[
        end := end - 1
    ].
    ^ key copyFrom:1 to:end
|
4486 |
||
4487 |
"Modified: / 02-08-2017 / 14:36:42 / cg" |
|
4488 |
! |
|
4489 |
||
4490 |
rule23:key
    "NYSIIS rule 23: collapse every run of repeated characters to a single one.
     The scan has to run downwards explicitly (by:-1): a plain
     'k size to:2 do:' counts upwards and therefore never executes for keys
     longer than two characters, leaving the repeats in place.
     Going from the end, removing position i does not disturb the positions
     below i that are still to be compared.
     Answers a new string."

    |k|

    k := key copy.
    k size to:2 by:-1 do:[:i |
        (k at:i) = (k at:i - 1) ifTrue:[
            k := (k copyFrom:1 to:i - 1) , (k copyFrom:i + 1 to:k size)
        ]
    ].
    ^ k
|
4505 |
! |
|
4506 |
||
4507 |
rule24:key originalKey:originalKey
    "NYSIIS rule 24: if the first character of the original name is a vowel,
     append it (uppercased) to the code.
     An empty original name has no first character; the key is then
     answered unchanged instead of failing on #first.
     Answers a new string."

    (originalKey size > 0 and:[ originalKey first isVowel ]) ifTrue:[
        ^ key , originalKey first asString asUppercase
    ].
    ^ key copy
|
4516 |
! |
|
4517 |
||
4518 |
rule2:key
    "NYSIIS rule 2: transcode the initial strings MAC => MC and PF => F.
     At most one of the two prefixes can apply, so each case answers
     immediately. Answers a new string."

    (key startsWith:'MAC') ifTrue:[
        ^ 'MC' , (key copyFrom:4)
    ].
    (key startsWith:'PF') ifTrue:[
        ^ 'F' , (key copyFrom:3)
    ].
    ^ key copy
|
4531 |
||
4532 |
"Modified (format): / 02-08-2017 / 14:31:40 / cg" |
|
4533 |
! |
|
4534 |
||
4535 |
rule3:key
    "NYSIIS rule 3: transcode trailing strings, applied in this order:
        IX          => IC
        EX          => EC
        YE, EE, IE  => Y
        NT, ND      => D
     Each step works on the result of the previous one.
     Answers a new string."

    |k|

    k := key.
    #(
        #( #('IX')              'IC' )
        #( #('EX')              'EC' )
        #( #('YE' 'EE' 'IE')    'Y'  )
        #( #('NT' 'ND')         'D'  )
    ) do:[:entry |
        k := self transcodeTrailing:(entry at:1) of:k to:(entry at:2)
    ].
    ^ k
|
4550 |
||
4551 |
"Modified (format): / 02-08-2017 / 14:32:24 / cg" |
|
4552 |
! |
|
4553 |
||
4554 |
rule4:key
    "NYSIIS rule 4: every 'EV' that does not start the name becomes 'EF'.
     Answers a new string."

    |transcoded|

    transcoded := self
                    transcodeAll:'EV'
                    of:key
                    to:'EF'
                    startingAt:2.
    ^ transcoded
|
4558 |
||
4559 |
"Modified (format): / 02-08-2017 / 14:32:35 / cg" |
|
4560 |
! |
|
4561 |
||
4562 |
rule5:key

4563 |
"5. Use the first character of the name as the first character of the key.

4564 |
Deliberately a no-op that answers key unchanged: the rules rewrite the key in place, so no separate first character is carried along."

4565 |
||
4566 |
^ key
|
4567 |
||
4568 |
"Modified (comment): / 02-08-2017 / 14:32:45 / cg" |
|
4569 |
! |
|
4570 |
||
4571 |
rule6:key
    "NYSIIS rule 6: remove any 'W' that follows a vowel.
     Walks the key from the second character on. After a removal the index
     is kept, because the character that moved into the slot may be another
     W following the same vowel. In every other case the index advances.
     The previous version never advanced past a W that was not preceded by
     a vowel and looped forever on names such as 'SWAN'.
     Answers a new string."

    |k i|

    k := key copy.
    i := 2.
    [ i <= k size ] whileTrue:[
        ((k at:i) = $W and:[ (k at:i - 1) isVowel ]) ifTrue:[
            k := (k copyFrom:1 to:i - 1) , (k copyFrom:i + 1 to:k size)
        ] ifFalse:[
            i := i + 1
        ]
    ].
    ^ k
|
4586 |
! |
|
4587 |
||
4588 |
rule7:key
    "NYSIIS rule 7: replace all vowels with 'A'.
     Works on a copy of key and overwrites the vowel positions there."

    |result|

    result := key copy.
    1 to:result size do:[:i |
        (result at:i) isVowel ifTrue:[ result at:i put:$A ]
    ].
    ^ result
|
4591 |
||
4592 |
"Modified: / 02-08-2017 / 14:33:56 / cg" |
|
4593 |
! |
|
4594 |
||
4595 |
rule8:key
    "NYSIIS rule 8: every 'GHT' in key becomes 'GT'.
     Answers a new string."

    |transcoded|

    transcoded := self
                    transcodeAll:'GHT'
                    of:key
                    to:'GT'
                    startingAt:1.
    ^ transcoded
|
4599 |
||
4600 |
"Modified (format): / 02-08-2017 / 14:34:05 / cg" |
|
4601 |
! |
|
4602 |
||
4603 |
rule9:key
    "NYSIIS rule 9: every 'DG' in key becomes 'G'.
     Answers a new string."

    |transcoded|

    transcoded := self
                    transcodeAll:'DG'
                    of:key
                    to:'G'
                    startingAt:1.
    ^ transcoded
|
4607 |
||
4608 |
"Modified (format): / 02-08-2017 / 14:34:15 / cg" |
|
4609 |
! |
|
4610 |
||
4611 |
transcodeAll:aString of:key to:replacementString startingAt:start
    "Answer a copy of key in which aString has been replaced by
     replacementString, considering only occurrences found at or after
     index start.
     The search is restarted from start after every replacement, so text
     produced by a replacement is itself searched again; the loop ends when
     no occurrence is left."

    |result pos|

    result := key copy.
    [
        pos := result indexOfSubCollection:aString startingAt:start.
        pos > 0
    ] whileTrue:[
        |head tail|

        head := result copyFrom:1 to:pos - 1.
        tail := result copyFrom:pos + aString size to:result size.
        result := head , replacementString , tail
    ].
    ^ result
|
4622 |
! |
|
4623 |
||
4624 |
transcodeTrailing:anArrayOfStrings of:key to:replacementString
    "Answer a copy of key in which a trailing occurrence of any of the
     strings in anArrayOfStrings has been replaced by replacementString.
     The candidates are tried in array order, each against the result of
     the previous one.
     The suffix is tested directly: the previous version searched from index
     (size - suffixSize + 1), which is zero or negative whenever the key is
     shorter than the suffix."

    |answer|

    answer := key copy.
    anArrayOfStrings do:[:suffix |
        (answer endsWith:suffix) ifTrue:[
            answer := (answer copyButLast:suffix size) , replacementString
        ]
    ].
    ^ answer
|
4636 |
! ! |
|
4637 |
||
4638 |
!PhoneticStringUtilities::PhonemStringComparator class methodsFor:'documentation'! |
|
4639 |
||
4640 |
documentation |
|
4641 |
" |
|
4642 |
Implementation of the PHONEM algorithm, as described in |
|
4643 |
'Georg Wilde and Carsten Meyer, Doppelgaenger gesucht - |
|
4644 |
Ein Programm fuer kontextsensitive phonetische Textumwandlung |
|
4645 |
ct Magazin fuer Computer & Technik 25/1998' |
|
4646 |
||
4647 |
This algorithm deals better with the german language (it cares for umlauts) |
|
4648 |
" |
|
4649 |
! ! |
|
4650 |
||
4651 |
!PhoneticStringUtilities::PhonemStringComparator methodsFor:'api'! |
|
4652 |
||
4653 |
encode:aString
    "Answer the PHONEM code of aString.
     Step 1 rewrites two-character groups in the uppercased input, scanning
     from left to right; after a replacement the same position is examined
     again, otherwise the scan moves on by one character.
     Step 2 maps single characters, deletes everything outside the set
     ABCDLMNORSUVWXY and finally squashes runs of equal characters.

     The scan now includes the last character pair (idx < s size); the old
     bound (idx < s size - 1) skipped it, so a group at the very end of a
     name was coded differently from the same group elsewhere."

    |s idx pair replacement digraphs|

    "/ from / to pairs; the first matching entry wins
    digraphs := #(
        'SC' 'C'
        'SZ' 'C'
        'CZ' 'C'
        'TZ' 'C'
        'TS' 'C'
        'KS' 'X'
        'PF' 'V'
        'QU' 'KW'
        'PH' 'V'
        'UE' 'Y'
        'AE' 'E'
        'OE' 'Ö'
        'EI' 'AY'
        'EY' 'AY'
        'EU' 'OY'
        'AU' 'A§'
        'OU' '§ '
    ).

    s := aString asUppercase.

    idx := 1.
    [idx < s size] whileTrue:[
        pair := s copyFrom:idx to:idx + 1.
        replacement := nil.
        digraphs inGroupsOf:2 do:[:from :to |
            (replacement isNil and:[ pair = from ]) ifTrue:[
                replacement := to
            ]
        ].
        replacement notNil ifTrue:[
            s := (s copyTo:idx - 1) , replacement , (s copyFrom:idx + 2)
        ] ifFalse:[
            idx := idx + 1.
        ].
    ].

    "/ single character substitutions via tr
    "/ NOTE(review): as the literals appear here, the source set has 14 characters
    "/ and the target set 15, and P is mapped to D although the examples given for
    "/ this method expect 'wepper' -> 'VBR' - confirm the intended mapping.
    s := s copyTransliterating:'ÖÄZKGQÜIJFWPT§' to:'YECCCCYYYVVDDUA'.
    s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'' complement:true squashDuplicates:false.
    s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'ABCDLMNORSUVWXY' complement:false squashDuplicates:true.
    ^ s
|
4691 |
||
4692 |
" |
|
4693 |
self basicNew encode:'müller' -> 'MYLR' |
|
4694 |
self basicNew encode:'mueller' -> 'MYLR' |
|
4695 |
self basicNew encode:'möller' -> 'MYLR' |
|
4696 |
self basicNew encode:'miller' -> 'MYLR' |
|
4697 |
self basicNew encode:'muller' -> 'MULR' |
|
4698 |
self basicNew encode:'muler' -> 'MULR' |
|
4699 |
||
4700 |
self basicNew phoneticStringsFor:'müller' #('MYLR') |
|
4701 |
self basicNew phoneticStringsFor:'mueller' #('MYLR') |
|
4702 |
self basicNew phoneticStringsFor:'möller' #('MYLR') |
|
4703 |
self basicNew phoneticStringsFor:'miller' #('MYLR') |
|
4704 |
self basicNew phoneticStringsFor:'muller' #('MULR') |
|
4705 |
self basicNew phoneticStringsFor:'muler' #('MULR') |
|
4706 |
||
4707 |
self basicNew phoneticStringsFor:'schmidt' #('CMYD') |
|
4708 |
self basicNew phoneticStringsFor:'schneider' #('CNAYDR') |
|
4709 |
self basicNew phoneticStringsFor:'fischer' #('VYCR') |
|
4710 |
self basicNew phoneticStringsFor:'weber' #('VBR') |
|
4711 |
self basicNew phoneticStringsFor:'weeber' #('VBR') |
|
4712 |
self basicNew phoneticStringsFor:'webber' #('VBR') |
|
4713 |
self basicNew phoneticStringsFor:'wepper' #('VBR') |
|
4714 |
||
4715 |
self basicNew phoneticStringsFor:'meyer' #('MAYR') |
|
4716 |
self basicNew phoneticStringsFor:'maier' #('MAYR') |
|
4717 |
self basicNew phoneticStringsFor:'mayer' #('MAYR') |
|
4718 |
self basicNew phoneticStringsFor:'mayr' #('MAYR') |
|
4719 |
self basicNew phoneticStringsFor:'meir' #('MAYR') |
|
4720 |
||
4721 |
self basicNew phoneticStringsFor:'wagner' #('VACNR') |
|
4722 |
self basicNew phoneticStringsFor:'schulz' #('CULC') |
|
4723 |
self basicNew phoneticStringsFor:'becker' #('BCR') |
|
4724 |
self basicNew phoneticStringsFor:'hoffmann' #('OVMAN') |
|
4725 |
self basicNew phoneticStringsFor:'haus' #('AUS') |
|
4726 |
||
4727 |
self basicNew phoneticStringsFor:'schäfer' #('CVR') |
|
4728 |
self basicNew phoneticStringsFor:'scheffer' #('CVR') |
|
4729 |
self basicNew phoneticStringsFor:'schaeffer' #('CVR') |
|
4730 |
self basicNew phoneticStringsFor:'schaefer' #('CVR') |
|
4731 |
" |
|
4732 |
||
4733 |
"Created: / 28-07-2017 / 15:38:08 / cg" |
|
4734 |
! ! |
|
4735 |
||
4736 |
!PhoneticStringUtilities::Caverphone2StringComparator class methodsFor:'documentation'! |
|
4737 |
||
4738 |
documentation |
|
4739 |
" |
|
4740 |
Caverphone (2) Algorithm: |
|
4741 |
||
4742 |
see http://caversham.otago.ac.nz/files/working/ctp150804.pdf |
|
4743 |
||
4744 |
Caverphone 2.0 is being made available for free use for the benefit of anyone who has a use for it, |
|
4745 |
with the proviso that the Caversham Project at the University of Otago should be acknowledged as the |
|
4746 |
original source (which is hereby done ;-). |
|
4747 |
||
4748 |
• Start with a Surname or Firstname |
|
4749 |
• Convert to lowercase |
|
4750 |
This coding system is case sensitive, implementations should acknowledge that a is not the same as A |
|
4751 |
• Remove anything not A-Z |
|
4752 |
The main intention of this is to remove spaces, hyphens, and apostrophes. |
|
4753 |
example: o'brian becomes obrian |
|
4754 |
• If the name starts with cough make it cou2f |
|
4755 |
2 is being used as a temporary placeholder to indicate a consonant which we are no longer interested in. |
|
4756 |
• If the name starts with rough make it rou2f |
|
4757 |
• If the name starts with tough make it tou2f |
|
4758 |
• If the name starts with enough make it enou2f |
|
4759 |
• If the name starts with gn make it 2n |
|
4760 |
• If the name ends with mb make it m2 |
|
4761 |
• replace cq with 2q |
|
4762 |
• replace ci with si |
|
4763 |
• replace ce with se |
|
4764 |
• replace cy with sy |
|
4765 |
• replace tch with 2ch |
|
4766 |
• replace c with k |
|
4767 |
• replace q with k |
|
4768 |
• replace x with k |
|
4769 |
• replace v with f |
|
4770 |
• replace dg with 2g |
|
4771 |
• replace tio with sio |
|
4772 |
• replace tia with sia |
|
4773 |
• replace d with t |
|
4774 |
• replace ph with fh |
|
4775 |
• replace b with p |
|
4776 |
• replace sh with s2 |
|
4777 |
• replace z with s |
|
4778 |
• replace an initial vowel with an A
|
4779 |
• replace all other vowels with a 3 |
|
4780 |
3 is a temporary placeholder marking a vowel |
|
4781 |
• replace 3gh3 with 3kh3 |
|
4782 |
Exceptions are dealt with before the general case. gh between vowels is an exception to the more general gh rule.
|
4783 |
• replace gh with 22 |
|
4784 |
• replace g with k |
|
4785 |
• replace groups of the letter s with a S |
|
4786 |
Continuous strings of s are replaced by a single S
|
4787 |
• replace groups of the letter t with a T |
|
4788 |
• replace groups of the letter p with a P |
|
4789 |
• replace groups of the letter k with a K |
|
4790 |
• replace groups of the letter f with a F |
|
4791 |
• replace groups of the letter m with a M |
|
4792 |
• replace groups of the letter n with a N |
|
4793 |
• replace w3 with W3 |
|
4794 |
• replace wy with Wy |
|
4795 |
• replace wh3 with Wh3 |
|
4796 |
• replace why with Why |
|
4797 |
• replace w with 2 |
|
4798 |
• replace an initial h with an A
|
4799 |
• replace all other occurrences of h with a 2 |
|
4800 |
• replace r3 with R3 |
|
4801 |
• replace ry with Ry |
|
4802 |
• replace r with 2 |
|
4803 |
• replace l3 with L3 |
|
4804 |
• replace ly with Ly |
|
4805 |
• replace l with 2 |
|
4806 |
• replace j with y |
|
4807 |
• replace y3 with Y3 |
|
4808 |
• replace y with 2 |
|
4809 |
• remove all 2s |
|
4810 |
• remove all 3s |
|
4811 |
• put six (v1) / ten (v2) 1s on the end |
|
4812 |
• take the first six characters as the code (caverphone 1); |
|
4813 |
/ take the first ten characters as the code (caverphone 2); |
|
4814 |
||
4815 |
self new encode:'david' -> 'TFT1111111' |
|
4816 |
self new encode:'whittle' -> 'WTA1111111' |
|
4817 |
||
4818 |
self new encode:'Stevenson' -> 'STFNSN1111' |
|
4819 |
self new encode:'Peter' -> 'PTA1111111' |
|
4820 |
||
4821 |
self new encode:'washington' -> 'WSNKTN1111' |
|
4822 |
self new encode:'lee' -> 'LA11111111' |
|
4823 |
self new encode:'Gutierrez' -> 'KTRS111111' |
|
4824 |
self new encode:'Pfister' -> 'PFSTA11111' |
|
4825 |
self new encode:'Jackson' -> 'YKSN111111' |
|
4826 |
self new encode:'Tymczak' -> 'TMKSK11111' |
|
4827 |
||
4828 |
self new encode:'add' -> 'AT11111111' |
|
4829 |
self new encode:'aid' -> 'AT11111111' |
|
4830 |
self new encode:'at' -> 'AT11111111' |
|
4831 |
self new encode:'art' -> 'AT11111111' |
|
4832 |
self new encode:'earth' -> 'AT11111111' |
|
4833 |
self new encode:'head' -> 'AT11111111' |
|
4834 |
self new encode:'old' -> 'AT11111111' |
|
4835 |
||
4836 |
self new encode:'ready' -> 'RTA1111111' |
|
4837 |
self new encode:'rather' -> 'RTA1111111' |
|
4838 |
self new encode:'able' -> 'APA1111111' |
|
4839 |
self new encode:'appear' -> 'APA1111111' |
|
4840 |
||
4841 |
self new encode:'Deedee' -> 'TTA1111111' |
|
4842 |
" |
|
4843 |
! ! |
|
4844 |
||
4845 |
!PhoneticStringUtilities::Caverphone2StringComparator methodsFor:'api'! |
|
4846 |
||
4847 |
encode:word
    "Answer the 10-character Caverphone 2 code of word.
     The lowercased letters of word are run through an ordered table of
     (pattern replacement repeat) triples:
        - a pattern starting with ^ is matched at the start of the text only,
        - a pattern ending with $ is matched at the end of the text only,
        - any other pattern is matched anywhere in the text.
     With repeat = true an anchored rule is applied for as long as it
     matches and an unanchored rule replaces all occurrences; otherwise the
     rule is applied at most once.
     The result is padded with 1s and cut to ten characters.
     An empty word answers ten 1s."

    |txt|

    word size == 0 ifTrue:[^ '1111111111' ].

    "/ 1. lowercase, 2. keep letters only
    txt := word asLowercase select:[:ch | ch isLetter].

    #(
        "/ pattern  replacement  repeat

        "/ 2.5. remove final e
        'e$'        ''          false

        "/ 3. special beginnings and endings
        '^cough'    'cou2f'     false
        '^rough'    'rou2f'     false
        '^tough'    'tou2f'     false
        '^enough'   'enou2f'    false
        '^trough'   'trou2f'    false

        '^gn'       '2n'        false
        'mb$'       'm2'        false

        "/ 4. replacements
        'cq'        '2q'        true
        'ci'        'si'        true
        'ce'        'se'        true
        'cy'        'sy'        true
        'tch'       '2ch'       true
        'c'         'k'         true
        'q'         'k'         true
        'x'         'k'         true
        'v'         'f'         true
        'dg'        '2g'        true
        'tio'       'sio'       true
        'tia'       'sia'       true
        'd'         't'         true
        'ph'        'fh'        true
        'b'         'p'         true
        'sh'        's2'        true
        'z'         's'         true

        '^a'        'A'         false
        '^e'        'A'         false
        '^i'        'A'         false
        '^o'        'A'         false
        '^u'        'A'         false

        'a'         '3'         true
        'e'         '3'         true
        'i'         '3'         true
        'o'         '3'         true
        'u'         '3'         true
        'j'         'y'         true

        '^y3'       'Y3'        false
        '^y'        'A'         false

        'y'         '3'         true
        '3gh3'      '3kh3'      true
        'gh'        '22'        true
        'g'         'k'         true
        's'         'S'         true
        'SS'        'S'         true
        't'         'T'         true
        'TT'        'T'         true
        'p'         'P'         true
        'PP'        'P'         true
        'k'         'K'         true
        'KK'        'K'         true
        'f'         'F'         true
        'FF'        'F'         true
        'm'         'M'         true
        'MM'        'M'         true
        'n'         'N'         true
        'NN'        'N'         true
        'w3'        'W3'        true
        'wh3'       'Wh3'       true
        'w$'        '3'         false
        'w'         '2'         true
        '^h'        'A'         false
        'h'         '2'         true
        'r3'        'R3'        true
        'r$'        '3'         false
        'r'         '2'         true
        'l3'        'L3'        true
        'l$'        '3'         false
        'l'         '2'         true

        "/ 5. removals
        '2'         ''          true
        '3$'        'A'         true
        '3'         ''          true
    ) inGroupsOf:3 do:[:pattern :replacement :repeatedly |
        |literal applyOnce|

        (pattern startsWith:$^) ifTrue:[
            "/ anchored at the start of the text
            literal := pattern copyButFirst.
            applyOnce := [
                (txt startsWith:literal)
                    ifTrue:[ txt := replacement , (txt copyButFirst:literal size). true ]
                    ifFalse:[ false ]
            ].
            repeatedly
                ifTrue:[ [ applyOnce value ] whileTrue ]
                ifFalse:[ applyOnce value ].
        ] ifFalse:[
            (pattern endsWith:$$) ifTrue:[
                "/ anchored at the end of the text
                literal := pattern copyButLast.
                applyOnce := [
                    (txt endsWith:literal)
                        ifTrue:[ txt := (txt copyButLast:literal size) , replacement. true ]
                        ifFalse:[ false ]
                ].
                repeatedly
                    ifTrue:[ [ applyOnce value ] whileTrue ]
                    ifFalse:[ applyOnce value ].
            ] ifFalse:[
                "/ unanchored
                txt := repeatedly
                        ifTrue:[ txt copyReplaceAllSubcollections:pattern with:replacement ]
                        ifFalse:[ txt copyReplaceSubcollection:pattern with:replacement ]
            ]
        ].
    ].

    "/ 6. pad with ten 1s, 7. the first ten characters are the code
    ^ (txt , '1111111111') copyFrom:1 to:10
|
4981 |
||
4982 |
" |
|
4983 |
self new encode:'david' -> 'TFT1111111' |
|
4984 |
self new encode:'whittle' -> 'WTA1111111' |
|
4985 |
||
4986 |
self new encode:'Stevenson' -> 'STFNSN1111' |
|
4987 |
self new encode:'Peter' -> 'PTA1111111' |
|
4988 |
||
4989 |
self new encode:'washington' -> 'WSNKTN1111' |
|
4990 |
self new encode:'lee' -> 'LA11111111' |
|
4991 |
self new encode:'Gutierrez' -> 'KTRS111111' |
|
4992 |
self new encode:'Pfister' -> 'PFSTA11111' |
|
4993 |
self new encode:'Jackson' -> 'YKSN111111' |
|
4994 |
self new encode:'Tymczak' -> 'TMKSK11111' |
|
4995 |
||
4996 |
self new encode:'add' -> 'AT11111111' |
|
4997 |
self new encode:'aid' -> 'AT11111111' |
|
4998 |
self new encode:'at' -> 'AT11111111' |
|
4999 |
self new encode:'art' -> 'AT11111111' |
|
5000 |
self new encode:'earth' -> 'AT11111111' |
|
5001 |
self new encode:'head' -> 'AT11111111' |
|
5002 |
self new encode:'old' -> 'AT11111111' |
|
5003 |
||
5004 |
self new encode:'ready' -> 'RTA1111111' |
|
5005 |
self new encode:'rather' -> 'RTA1111111' |
|
5006 |
self new encode:'able' -> 'APA1111111' |
|
5007 |
self new encode:'appear' -> 'APA1111111' |
|
5008 |
||
5009 |
self new encode:'Deedee' -> 'TTA1111111' |
|
5010 |
" |
|
5011 |
||
5012 |
"Created: / 28-07-2017 / 15:21:23 / cg" |
|
5013 |
"Modified: / 02-08-2017 / 01:42:35 / cg" |
|
5014 |
! ! |
|
5015 |
||
4488 | 5016 |
!PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator class methodsFor:'documentation'! |
5017 |
||
5018 |
documentation |
|
5019 |
" |
|
5020 |
The 'Kölner Phonetik' (cologne phonetic) code is for the german language |
|
5021 |
what the soundex code is for english: |
|
5022 |
it returns similar strings for similar sounding words |
|
5023 |
(but is specifically aware of the pronunciation of German and eastern languages) . |
|
5024 |
||
5025 |
There are some other differences to soundex, though: |
|
5026 |
its length is not limited to 4, but depends on the length of the original string; |
|
5027 |
it does not start with the first character of the input, but returns a pure numeric string. |
|
5028 |
||
5029 |
This algorithm was described by Postel 1969, |
|
5030 |
See http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik |
|
5031 |
||
5032 |
self new phoneticStringsFor:'Müller-Lüdenscheidt' -> #('65752682') |
|
5033 |
" |
|
5034 |
! |
|
5035 |
||
5036 |
examples |
|
5037 |
" |
|
5038 |
words sounding similar (german pronunciation) will deliver a similar code: |
|
5039 |
||
5040 |
#( |
|
5041 |
'Müller' |
|
5042 |
'Miller' |
|
5043 |
'Mueller' |
|
5044 |
'Mühler' |
|
5045 |
'Mühlherr' |
|
5046 |
'Mülherr' |
|
5047 |
'Myler' |
|
5048 |
'Millar' |
|
5049 |
'Myller' |
|
5050 |
'Müllar' |
|
5051 |
'Müler' |
|
5052 |
'Muehler' |
|
5053 |
'Mülller' |
|
5054 |
'Müllerr' |
|
5055 |
'Muehlherr' |
|
5056 |
'Muellar' |
|
5057 |
'Mueler' |
|
5058 |
'Mülleer' |
|
5059 |
'Mueller' |
|
5060 |
'Nüller' |
|
5061 |
'Nyller' |
|
5062 |
'Niler' |
|
5063 |
'Czerny' |
|
5064 |
'Tscherny' |
|
5065 |
'Czernie' |
|
5066 |
'Tschernie' |
|
5067 |
'Schernie' |
|
5068 |
'Scherny' |
|
5069 |
'Scherno' |
|
5070 |
'Czerne' |
|
5071 |
'Zerny' |
|
5072 |
'Tzernie' |
|
5073 |
'Breschnew' |
|
5074 |
'Breschnew' |
|
5075 |
'Breschneff' |
|
5076 |
'Breschnjeff' |
|
5077 |
'Braeschneff' |
|
5078 |
'Braessneff' |
|
5079 |
'Pressneff' |
|
5080 |
'Presznäph' |
|
5081 |
'Präschnäf' |
|
5082 |
'Breschnjeff' |
|
5083 |
'Breschnijeff' |
|
5084 |
'Breschnieff' |
|
5085 |
'Bräschnieff' |
|
5086 |
'Braschnieff' |
|
5087 |
'Broschnieff' |
|
5088 |
) do:[:w | |
|
5089 |
Transcript show:w; show:'->'; showCR:(PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:w) |
|
5090 |
]. |
|
5091 |
" |
|
5092 |
! ! |
|
5093 |
||
5094 |
!PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator methodsFor:'api'! |
|
5095 |
||
5096 |
encode: aString
    "Answer the Koelner Phonetik (cologne phonetic) code of aString.
     This code is for the german language what soundex is for english:
     similar sounding words get similar codes. In contrast to soundex the
     length of the code depends on the input, and the code does not start
     with the first character of the input.
     The algorithm is described by Postel 1969.

     Steps:
        - lowercase the input, replace ph by f and transcribe umlauts / sz,
          then keep letters only;
        - slide a 3-character window (previous, current, next; the text is
          framed by # at both ends) over the text and let
          #convertFirst: / #convertRest: answer the code for each position;
        - drop the '-' placeholders;
        - drop every 0, except for a single leading one;
        - collapse runs of equal digits.
     NOTE(review): the zeros are removed before adjacent duplicates are
     collapsed; the published description seems to collapse duplicates
     first - confirm that this order is intended."

    |text codes hadLeadingZero|

    text := aString withoutSeparators asLowercase.
    text := text copyReplaceString:'ph' withString:'f'.
    (text includesAny:'öäüß') ifTrue:[
        text := text copyReplaceAll:$ü withAll:'u'.
        text := text copyReplaceAll:$ä withAll:'a'.
        text := text copyReplaceAll:$ö withAll:'o'.
        text := text copyReplaceAll:$ß withAll:'ss'.
    ].
    text := '#' , (text select:[:ch | ch isLetter]) , '#'.

    codes := (1 to:text size - 2) inject:'' into:[:soFar :start |
        |window code|

        window := text copyFrom:start to:start + 2.
        code := (start == 1)
                    ifTrue:[ self convertFirst:window ]
                    ifFalse:[ self convertRest:window ].
        soFar , code
    ].

    codes := codes reject:[:ch | ch = $-].

    hadLeadingZero := codes startsWith:'0'.
    codes := codes reject:[:ch | ch = $0].
    hadLeadingZero ifTrue:[
        codes := '0' , codes
    ].

    ^ String streamContents:[:out |
        |last|

        codes do:[:digit |
            digit = last ifFalse:[
                out nextPut:digit
            ].
            last := digit.
        ].
    ]
|
5148 |
||
5149 |
" |
|
5150 |
#( |
|
5151 |
'Müller' |
|
5152 |
'Miller' |
|
5153 |
'Mueller' |
|
5154 |
'Mühler' |
|
5155 |
'Mühlherr' |
|
5156 |
'Mülherr' |
|
5157 |
'Myler' |
|
5158 |
'Millar' |
|
5159 |
'Myller' |
|
5160 |
'Müllar' |
|
5161 |
'Müler' |
|
5162 |
'Muehler' |
|
5163 |
'Mülller' |
|
5164 |
'Müllerr' |
|
5165 |
'Muehlherr' |
|
5166 |
'Muellar' |
|
5167 |
'Mueler' |
|
5168 |
'Mülleer' |
|
5169 |
'Mueller' |
|
5170 |
'Nüller' |
|
5171 |
'Nyller' |
|
5172 |
'Niler' |
|
5173 |
'Czerny' |
|
5174 |
'Tscherny' |
|
5175 |
'Czernie' |
|
5176 |
'Tschernie' |
|
5177 |
'Schernie' |
|
5178 |
'Scherny' |
|
5179 |
'Scherno' |
|
5180 |
'Czerne' |
|
5181 |
'Zerny' |
|
5182 |
'Tzernie' |
|
5183 |
'Breschnew' |
|
5184 |
'Breschnew' |
|
5185 |
'Breschneff' |
|
5186 |
'Breschnjeff' |
|
5187 |
'Braeschneff' |
|
5188 |
'Braessneff' |
|
5189 |
'Pressneff' |
|
5190 |
'Presznäph' |
|
5191 |
'Präschnäf' |
|
5192 |
'Breschnjeff' |
|
5193 |
'Breschnijeff' |
|
5194 |
'Breschnieff' |
|
5195 |
) do:[:w | |
|
5196 |
Transcript show:w; show:'->'; showCR:(PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:w) |
|
5197 |
]. |
|
5198 |
" |
|
5199 |
||
5200 |
" |
|
5201 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Breschnew' -> '17863' |
|
5202 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Breschneff' -> '17863' |
|
5203 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Braeschneff' -> '17863' |
|
5204 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Braessneff' -> '17863' |
|
5205 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Pressneff' -> '17863' |
|
5206 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Presznäph' -> '17863' |
|
5207 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Präschnäf' -> '17863' |
|
5208 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Breschnjeff' -> '17863' |
|
5209 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Breschnijeff' -> '17863' |
|
5210 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Breschnieff' -> '17863' |
|
5211 |
" |
|
5212 |
" |
|
5213 |
self basicNew encode:'müller' -> '657' |
|
5214 |
self basicNew encode:'möller' -> '657' |
|
5215 |
self basicNew encode:'miller' -> '657' |
|
5216 |
self basicNew encode:'muller' -> '657' |
|
5217 |
self basicNew encode:'muler' -> '657' |
|
5218 |
self basicNew encode:'schmidt' -> '862' |
|
5219 |
self basicNew encode:'schneider' -> '8627' |
|
5220 |
self basicNew encode:'fischer' -> '387' |
|
5221 |
self basicNew encode:'weber' -> '317' |
|
5222 |
self basicNew encode:'meyer' -> '67' |
|
5223 |
self basicNew encode:'wagner' -> '3467' |
|
5224 |
self basicNew encode:'schulz' -> '858' |
|
5225 |
self basicNew encode:'becker' -> '147' |
|
5226 |
self basicNew encode:'hoffmann' -> '036' |
|
5227 |
self basicNew encode:'schäfer' -> '837' |
|
5228 |
" |
|
5229 |
||
5230 |
"Created: / 28-07-2017 / 15:24:33 / cg" |
|
5231 |
! ! |
|
5232 |
||
5233 |
!PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator methodsFor:'private'! |
|
5234 |
||
5235 |
convertFirst:chars
    "Answer the code string for the character at index 2 of the 3-element
     window chars (presumably used for the first letter of a word, where
     index 1 and 3 hold the neighbouring characters - TODO confirm with caller).

     Handled here:
        a e i j y o u                       -> '0'
        c followed by a h k l o q r u x     -> '4'
        c followed by anything else         -> '8'
     Everything else - and any window whose size is not 3 - is delegated
     to #convertRest:.

     (This replaces an older, slower glob-pattern table of the form
      '#a#' -> '0', '#ca' -> '4', '#c#' -> '8'.)"

    |letter follower|

    chars size == 3 ifFalse:[
        ^ self convertRest:chars
    ].

    letter := chars at:2.
    ('aeijyou' includes:letter) ifTrue:[
        ^ '0'
    ].
    letter == $c ifTrue:[
        follower := chars at:3.
        ^ ('ahkloqrux' includes:follower) ifTrue:['4'] ifFalse:['8']
    ].

    ^ self convertRest:chars

    "Modified: / 29-07-2017 / 14:22:20 / cg"
|
5292 |
! |
|
5293 |
||
5294 |
convertRest:chars
    "Answer the code string for the character at index 2 of the 3-element
     window chars, by scanning an ordered rule table; the first matching
     rule wins, so the more specific rules must stay in front of the
     generic single-character ones.

     Each rule is a triple (prefix suffixOrChar code):
        (aString nil code)     - matches if chars starts with aString
        (nil aString code)     - matches if chars ends with aString
        (nil aCharacter code)  - matches if the middle character is aCharacter
        (nil nil code)         - catch-all; answers '?'
     A window whose size is not 3 raises an error and answers '?'.

     (The table used to hold glob patterns such as '#ds' or 'cx#',
      but glob matching was too slow.)"

    |rules|

    chars size == 3 ifFalse:[
        self error:'cannot happen'.
        ^ '?'
    ].

    rules := #(
        "d or t followed by s, c or z"
        (nil 'ds' '8')
        (nil 'dc' '8')
        (nil 'dz' '8')
        (nil 'ts' '8')
        (nil 'tc' '8')
        (nil 'tz' '8')
        (nil $d   '2')
        (nil $t   '2')

        "x, depending on the preceding character"
        ('cx' nil '8')
        ('kx' nil '8')
        ('qx' nil '8')
        (nil $x   '48')

        "c or z preceded by s"
        ('sc' nil '8')
        ('sz' nil '8')

        "c, depending on the following character"
        (nil 'ca' '4')
        (nil 'co' '4')
        (nil 'cu' '4')
        (nil 'ch' '4')
        (nil 'ck' '4')
        (nil 'cx' '4')
        (nil 'cq' '4')
        (nil $c   '8')

        "context-free single characters"
        (nil $a   '0')
        (nil $e   '0')
        (nil $i   '0')
        (nil $j   '0')
        (nil $y   '0')
        (nil $o   '0')
        (nil $u   '0')
        (nil $h   '-')
        (nil $l   '5')
        (nil $r   '7')
        (nil $m   '6')
        (nil $n   '6')
        (nil $s   '8')
        (nil $z   '8')
        (nil $b   '1')
        (nil $p   '1')
        (nil $f   '3')
        (nil $v   '3')
        (nil $w   '3')
        (nil $g   '4')
        (nil $k   '4')
        (nil $q   '4')

        "catch-all"
        (nil nil  '?')
    ).

    rules do:[:rule |
        |prefix tail code|

        prefix := rule at:1.
        tail := rule at:2.
        code := rule at:3.

        prefix notNil ifTrue:[
            (chars startsWith:prefix) ifTrue:[^ code].
        ] ifFalse:[
            tail isCharacter ifTrue:[
                (chars at:2) == tail ifTrue:[^ code].
            ] ifFalse:[
                "neither a character nor a string: this is the catch-all rule"
                tail isString ifFalse:[^ '?'].
                (chars endsWith:tail) ifTrue:[^ code].
            ].
        ].
    ].

    "not reached, because the catch-all rule always answers"
    self error:'cannot happen'

    "Modified: / 29-07-2017 / 14:17:38 / cg"
|
2208 | 5381 |
! ! |
5382 |
||
5383 |
!PhoneticStringUtilities::MiracodeStringComparator class methodsFor:'documentation'! |
|
5384 |
||
5385 |
documentation |
|
5386 |
" |
|
4489 | 5387 |
Miracode (also called << American Soundex >>) is like Soundex with the |
5388 |
addition that h and w are discarded if they separate consonants. |
|
5389 |
||
5390 |
These variants may be specifically important because they were used in |
|
5391 |
U.S. National Archives. |
|
5392 |
Most archive data were encoded with Miracode, |
|
5393 |
but there are some (older) entries encoded with Simplified Soundex. |
|
5394 |
||
5395 |
The HW-rule was documented as a standard in 1910, |
|
5396 |
but actually data of 1880, 1900 and 1910 |
|
3185
9833bbba2050
class: PhoneticStringUtilities
Claus Gittinger <cg@exept.de>
parents:
2580
diff
changeset
|
5397 |
censuses were encoded with mixed methods. |
4489 | 5398 |
|
5399 |
self new encode:'washington' -> 'W252' |
|
5400 |
self new encode:'lee' -> 'L000' |
|
5401 |
self new encode:'Gutierrez' -> 'G362' |
|
5402 |
self new encode:'Pfister' -> 'P236' |
|
5403 |
self new encode:'Jackson' -> 'J250' |
|
5404 |
self new encode:'Tymczak' -> 'T522' |
|
5405 |
||
5406 |
notice: |
|
4491 | 5407 |
MiracodeStringComparator new encode:'Ashcraft' -> 'A261' |
5408 |
SoundexStringComparator new encode:'Ashcraft' -> 'A226' |
|
4489 | 5409 |
|
5410 |
see also: |
|
5411 |
https://www.archives.gov/research/census/soundex.html |
|
2208 | 5412 |
" |
5413 |
! ! |
|
5414 |
||
4491 | 5415 |
!PhoneticStringUtilities::MiracodeStringComparator methodsFor:'private'! |
2208 | 5416 |
|
4488 | 5417 |
encode:word
    "Answer the 4-character Miracode key for word.
     Works like the inherited soundex encoding, but cares for W and H:
     these two letters do not reset the remembered previous code, so
     equal codes on both sides of a W or H are emitted only once.

     The key consists of the (uppercased) first character followed by up
     to three codes as answered by #translate:; nil codes, '0' codes and
     codes equal to the remembered previous code are skipped.
     Short keys are padded with '0' to a length of 4."

    |upper key code lastCode|

    upper := word asUppercase.
    key := upper first asString.
    lastCode := self translate:upper first.

    2 to:upper size do:[:index |
        |each|

        each := upper at:index.
        code := self translate:each.
        (code isNil or:[code = '0' or:[code = lastCode]]) ifFalse:[
            key := key , code.
            key size == 4 ifTrue:[^ key].
        ].
        "W and H are transparent with respect to the previous code"
        (each = $W or:[each = $H]) ifFalse:[
            lastCode := code.
        ].
    ].

    "pad with zeros and cut to 4 characters"
    ^ (key , '000') copyFrom:1 to:4

    "Created: / 02-08-2017 / 00:19:47 / cg"
    "Modified (comment): / 02-08-2017 / 14:30:47 / cg"
|
4489 | 5444 |
! ! |
5445 |
||
5446 |
!PhoneticStringUtilities::SpanishPhoneticCodeStringComparator class methodsFor:'documentation'! |
|
5447 |
||
5448 |
documentation |
|
5449 |
" |
|
5450 |
The 'Spanish Phonetik' (spanish phonetic) code is for the spanish language |
|
5451 |
what the soundex code is for english: |
|
5452 |
it returns similar strings for similar sounding words |
|
5453 |
(but is specifically aware of the pronunciation of spanish) . |
|
5454 |
||
5455 |
There are some other differences to soundex, though: |
|
5456 |
its length is not limited to 4, but depends on the length of the original string; |
|
5457 |
it does not start with the first character of the input, |
|
5458 |
but returns a pure numeric string, |
|
5459 |
it uses different character groups |
|
5460 |
||
5461 |
This algorithm was described by María del Pilar Angeles, Adrian Espino-Gamez,
|
5462 |
and Jonathan Gil-Moncada, in |
|
5463 |
'Comparison of a Modified Spanish phonetic, |
|
5464 |
Soundex, and Phonex coding functions during data matching process' |
|
5465 |
See https://www.researchgate.net/publication/285589803_Comparison_of_a_Modified_Spanish_Phonetic_Soundex_and_Phonex_coding_functions_during_data_matching_process |
|
5466 |
||
5467 |
" |
|
5468 |
! |
|
5469 |
||
5470 |
examples |
|
5471 |
" |
|
5472 |
words sounding similar (spanish pronunciation) will deliver a similar code:
|
5473 |
||
5474 |
#( |
|
5475 |
'Marıa' |
|
5476 |
'Pilar' |
|
5477 |
'Angeles' |
|
5478 |
'Adrian' |
|
5479 |
'Gamez' |
|
5480 |
) do:[:w | |
|
5481 |
Transcript show:w; show:'->'; showCR:(PhoneticStringUtilities::SpanishPhoneticCodeStringComparator new encode:w) |
|
5482 |
]. |
|
5483 |
" |
|
5484 |
! ! |
|
5485 |
||
5486 |
!PhoneticStringUtilities::SpanishPhoneticCodeStringComparator methodsFor:'api'! |
|
5487 |
||
5488 |
encode: aString
    "return a spanish phonetic code.
     The spanishPhonetic code is for the spanish language what the soundex code is for english;
     it returns similar strings for similar sounding words.
     There are some differences to soundex, though:
        its length is not limited to 4, but depends on the length of the original string;
        it does not start with the first character of the input,
        it uses different character groups.
     This algorithm is described by María del Pilar Angeles, Adrian Espino-Gamez,
     Jonathan Gil-Moncada.

     The input is stripped of surrounding separators and uppercased;
     then each character is mapped to a digit:
        P -> 0      B V -> 1    F H -> 2    D T -> 3    S Z C X -> 4
        Y L -> 5    N Ñ M -> 6  Q K -> 7    G J -> 8    R -> 9
     Characters outside these groups produce no output.
     A character which is identical to its immediate predecessor is skipped."

    |in|

    in := aString withoutSeparators asUppercase.

    ^ String streamContents:[:out |
        |prev|

        in do:[:ch |
            ch == prev ifFalse:[
                ch == $P ifTrue:[
                    out nextPut:$0.
                ] ifFalse:[ ('BV' includes:ch) ifTrue:[
                    out nextPut:$1.
                ] ifFalse:[ ('FH' includes:ch) ifTrue:[
                    out nextPut:$2.
                ] ifFalse:[ ('DT' includes:ch) ifTrue:[
                    out nextPut:$3.
                ] ifFalse:[ ('SZCX' includes:ch) ifTrue:[
                    out nextPut:$4.
                ] ifFalse:[ ('YL' includes:ch) ifTrue:[
                    out nextPut:$5.
                "the spanish N-tilde belongs to the nasal group
                 (the group used to contain the polish N-acute by mistake)"
                ] ifFalse:[ ('NÑM' includes:ch) ifTrue:[
                    out nextPut:$6.
                ] ifFalse:[ ('QK' includes:ch) ifTrue:[
                    out nextPut:$7.
                ] ifFalse:[ ('GJ' includes:ch) ifTrue:[
                    out nextPut:$8.
                ] ifFalse:[ ('R' includes:ch) ifTrue:[
                    out nextPut:$9.
                ]]]]]]]]]].
                prev := ch.
            ].
        ].
    ].

    "
     self new encode:'Jose'     -> '84'
     self new encode:'Pilar'    -> '059'
     self new encode:'Muñoz'    -> '664'
    "

    "Created: / 28-07-2017 / 15:24:33 / cg"
    "Modified: / 01-08-2017 / 18:48:50 / cg"
|
5540 |
! ! |
|
5541 |
||
5542 |
!PhoneticStringUtilities::SpanishPhoneticCodeStringComparator methodsFor:'private'! |
|
5543 |
||
5544 |
convertFirst:chars
    "Answer the code string for the character at index 2 of the 3-element
     window chars (presumably used for the first letter of a word, where
     index 1 and 3 hold the neighbouring characters - TODO confirm with caller).

     Handled here:
        a e i j y o u                       -> '0'
        c followed by a h k l o q r u x     -> '4'
        c followed by anything else         -> '8'
     Everything else - and any window whose size is not 3 - is delegated
     to #convertRest:.

     NOTE(review): this method is textually identical to the one in
     KoelnerPhoneticCodeStringComparator, and #encode: of this class does
     not send it - presumably a leftover from copying that class;
     confirm before relying on or removing it."

    |letter follower|

    chars size == 3 ifFalse:[
        ^ self convertRest:chars
    ].

    letter := chars at:2.
    ('aeijyou' includes:letter) ifTrue:[
        ^ '0'
    ].
    letter == $c ifTrue:[
        follower := chars at:3.
        ^ ('ahkloqrux' includes:follower) ifTrue:['4'] ifFalse:['8']
    ].

    ^ self convertRest:chars

    "Modified: / 29-07-2017 / 14:22:20 / cg"
|
5601 |
! |
|
5602 |
||
5603 |
convertRest:chars
    "Answer the code string for the character at index 2 of the 3-element
     window chars, by scanning an ordered rule table; the first matching
     rule wins, so the more specific rules must stay in front of the
     generic single-character ones.

     Each rule is a triple (prefix suffixOrChar code):
        (aString nil code)     - matches if chars starts with aString
        (nil aString code)     - matches if chars ends with aString
        (nil aCharacter code)  - matches if the middle character is aCharacter
        (nil nil code)         - catch-all; answers '?'
     A window whose size is not 3 raises an error and answers '?'.

     NOTE(review): this method is textually identical to the one in
     KoelnerPhoneticCodeStringComparator, and #encode: of this class does
     not send it - presumably a leftover from copying that class;
     confirm before relying on or removing it."

    |rules|

    chars size == 3 ifFalse:[
        self error:'cannot happen'.
        ^ '?'
    ].

    rules := #(
        "d or t followed by s, c or z"
        (nil 'ds' '8')
        (nil 'dc' '8')
        (nil 'dz' '8')
        (nil 'ts' '8')
        (nil 'tc' '8')
        (nil 'tz' '8')
        (nil $d   '2')
        (nil $t   '2')

        "x, depending on the preceding character"
        ('cx' nil '8')
        ('kx' nil '8')
        ('qx' nil '8')
        (nil $x   '48')

        "c or z preceded by s"
        ('sc' nil '8')
        ('sz' nil '8')

        "c, depending on the following character"
        (nil 'ca' '4')
        (nil 'co' '4')
        (nil 'cu' '4')
        (nil 'ch' '4')
        (nil 'ck' '4')
        (nil 'cx' '4')
        (nil 'cq' '4')
        (nil $c   '8')

        "context-free single characters"
        (nil $a   '0')
        (nil $e   '0')
        (nil $i   '0')
        (nil $j   '0')
        (nil $y   '0')
        (nil $o   '0')
        (nil $u   '0')
        (nil $h   '-')
        (nil $l   '5')
        (nil $r   '7')
        (nil $m   '6')
        (nil $n   '6')
        (nil $s   '8')
        (nil $z   '8')
        (nil $b   '1')
        (nil $p   '1')
        (nil $f   '3')
        (nil $v   '3')
        (nil $w   '3')
        (nil $g   '4')
        (nil $k   '4')
        (nil $q   '4')

        "catch-all"
        (nil nil  '?')
    ).

    rules do:[:rule |
        |prefix tail code|

        prefix := rule at:1.
        tail := rule at:2.
        code := rule at:3.

        prefix notNil ifTrue:[
            (chars startsWith:prefix) ifTrue:[^ code].
        ] ifFalse:[
            tail isCharacter ifTrue:[
                (chars at:2) == tail ifTrue:[^ code].
            ] ifFalse:[
                "neither a character nor a string: this is the catch-all rule"
                tail isString ifFalse:[^ '?'].
                (chars endsWith:tail) ifTrue:[^ code].
            ].
        ].
    ].

    "not reached, because the catch-all rule always answers"
    self error:'cannot happen'

    "Modified: / 29-07-2017 / 14:17:38 / cg"
|
2208 | 5690 |
! ! |
5691 |
||
2197 | 5692 |
!PhoneticStringUtilities class methodsFor:'documentation'! |
5693 |
||
5694 |
version
    "Answer the version identification string of this class.
     The literal is a version-control keyword placeholder
     (presumably expanded by the repository tooling on checkout - TODO confirm)."

3646 | 5695 |
^ '$Header$'
2285 | 5696 |
! |
5697 |
||
5698 |
version_CVS
    "Answer the CVS version identification string of this class.
     The literal is a version-control keyword placeholder
     (presumably expanded by the repository tooling on checkout - TODO confirm)."

3646 | 5699 |
^ '$Header$'
2197 | 5700 |
! ! |
3185
9833bbba2050
class: PhoneticStringUtilities
Claus Gittinger <cg@exept.de>
parents:
2580
diff
changeset
|
5701 |