author | Claus Gittinger <cg@exept.de> |
Tue, 25 Jun 2019 14:28:51 +0200 | |
changeset 5050 | 44fa8672d102 |
parent 4521 | cfe4f333794f |
child 5212 | 76ae0b6f061e |
permissions | -rw-r--r-- |
4488 | 1 |
"{ Encoding: utf8 }" |
2 |
||
2197 | 3 |
" |
4 |
COPYRIGHT (c) 1994 by Claus Gittinger |
|
5 |
COPYRIGHT (c) 2009 by eXept Software AG |
|
6 |
All Rights Reserved |
|
7 |
||
8 |
This software is furnished under a license and may be used |
|
9 |
only in accordance with the terms of that license and with the |
|
10 |
inclusion of the above copyright notice. This software may not |
|
11 |
be provided or otherwise made available to, or used by, any |
|
12 |
other person. No title to or ownership of the software is |
|
13 |
hereby transferred. |
|
14 |
" |
|
15 |
"{ Package: 'stx:libbasic2' }" |
|
16 |
||
3488
5a69e672d7f8
class: PhoneticStringUtilities
Claus Gittinger <cg@exept.de>
parents:
3185
diff
changeset
|
17 |
"{ NameSpace: Smalltalk }" |
5a69e672d7f8
class: PhoneticStringUtilities
Claus Gittinger <cg@exept.de>
parents:
3185
diff
changeset
|
18 |
|
2197 | 19 |
Object subclass:#PhoneticStringUtilities |
20 |
instanceVariableNames:'' |
|
21 |
classVariableNames:'' |
|
22 |
poolDictionaries:'' |
|
23 |
category:'Collections-Text-Support' |
|
24 |
! |
|
25 |
||
2208 | 26 |
Object subclass:#PhoneticStringComparator |
27 |
instanceVariableNames:'' |
|
28 |
classVariableNames:'' |
|
29 |
poolDictionaries:'' |
|
30 |
privateIn:PhoneticStringUtilities |
|
31 |
! |
|
32 |
||
4491 | 33 |
PhoneticStringUtilities::PhoneticStringComparator subclass:#DaitchMokotoffStringComparator |
34 |
instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex |
|
35 |
currentIndex skipCount' |
|
36 |
classVariableNames:'' |
|
37 |
poolDictionaries:'' |
|
38 |
privateIn:PhoneticStringUtilities |
|
39 |
! |
|
40 |
||
41 |
PhoneticStringUtilities::PhoneticStringComparator subclass:#DoubleMetaphoneStringComparator |
|
42 |
instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex |
|
43 |
currentIndex skipCount' |
|
44 |
classVariableNames:'' |
|
45 |
poolDictionaries:'' |
|
46 |
privateIn:PhoneticStringUtilities |
|
47 |
! |
|
48 |
||
2211 | 49 |
PhoneticStringUtilities::PhoneticStringComparator subclass:#ExtendedSoundexStringComparator |
50 |
instanceVariableNames:'' |
|
51 |
classVariableNames:'CharacterTranslationDict' |
|
52 |
poolDictionaries:'' |
|
53 |
privateIn:PhoneticStringUtilities |
|
54 |
! |
|
55 |
||
4488 | 56 |
PhoneticStringUtilities::PhoneticStringComparator subclass:#SingleResultPhoneticStringComparator |
57 |
instanceVariableNames:'' |
|
58 |
classVariableNames:'' |
|
59 |
poolDictionaries:'' |
|
60 |
privateIn:PhoneticStringUtilities |
|
61 |
! |
|
62 |
||
63 |
PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#MRAStringComparator |
|
2208 | 64 |
instanceVariableNames:'' |
65 |
classVariableNames:'CharacterTranslationDict' |
|
66 |
poolDictionaries:'' |
|
67 |
privateIn:PhoneticStringUtilities |
|
68 |
! |
|
69 |
||
4491 | 70 |
PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#MetaphoneStringComparator |
71 |
instanceVariableNames:'inputKey primaryTranslation secondaryTranslation startIndex |
|
72 |
currentIndex skipCount' |
|
73 |
classVariableNames:'' |
|
74 |
poolDictionaries:'' |
|
75 |
privateIn:PhoneticStringUtilities |
|
76 |
! |
|
77 |
||
4488 | 78 |
PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#SoundexStringComparator |
2208 | 79 |
instanceVariableNames:'' |
80 |
classVariableNames:'CharacterTranslationDict' |
|
81 |
poolDictionaries:'' |
|
82 |
privateIn:PhoneticStringUtilities |
|
83 |
! |
|
84 |
||
85 |
PhoneticStringUtilities::SoundexStringComparator subclass:#MySQLSoundexStringComparator |
|
86 |
instanceVariableNames:'' |
|
87 |
classVariableNames:'' |
|
88 |
poolDictionaries:'' |
|
89 |
privateIn:PhoneticStringUtilities |
|
90 |
! |
|
91 |
||
4488 | 92 |
PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#NYSIISStringComparator |
2208 | 93 |
instanceVariableNames:'' |
94 |
classVariableNames:'' |
|
95 |
poolDictionaries:'' |
|
96 |
privateIn:PhoneticStringUtilities |
|
97 |
! |
|
98 |
||
4488 | 99 |
PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#PhonemStringComparator |
2211 | 100 |
instanceVariableNames:'' |
101 |
classVariableNames:'CharacterTranslationDict' |
|
102 |
poolDictionaries:'' |
|
103 |
privateIn:PhoneticStringUtilities |
|
104 |
! |
|
105 |
||
4491 | 106 |
PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#Caverphone2StringComparator |
107 |
instanceVariableNames:'' |
|
108 |
classVariableNames:'CharacterTranslationDict' |
|
2208 | 109 |
poolDictionaries:'' |
110 |
privateIn:PhoneticStringUtilities |
|
111 |
! |
|
112 |
||
4488 | 113 |
PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#KoelnerPhoneticCodeStringComparator |
114 |
instanceVariableNames:'' |
|
115 |
classVariableNames:'CharacterTranslationDict' |
|
116 |
poolDictionaries:'' |
|
117 |
privateIn:PhoneticStringUtilities |
|
118 |
! |
|
119 |
||
2208 | 120 |
PhoneticStringUtilities::SoundexStringComparator subclass:#MiracodeStringComparator |
121 |
instanceVariableNames:'' |
|
122 |
classVariableNames:'' |
|
123 |
poolDictionaries:'' |
|
124 |
privateIn:PhoneticStringUtilities |
|
125 |
! |
|
126 |
||
4489 | 127 |
PhoneticStringUtilities::SingleResultPhoneticStringComparator subclass:#SpanishPhoneticCodeStringComparator |
128 |
instanceVariableNames:'' |
|
129 |
classVariableNames:'CharacterTranslationDict' |
|
130 |
poolDictionaries:'' |
|
131 |
privateIn:PhoneticStringUtilities |
|
132 |
! |
|
133 |
||
2197 | 134 |
!PhoneticStringUtilities class methodsFor:'documentation'! |
135 |
||
136 |
copyright |
|
137 |
" |
|
138 |
COPYRIGHT (c) 1994 by Claus Gittinger |
|
139 |
COPYRIGHT (c) 2009 by eXept Software AG |
|
140 |
All Rights Reserved |
|
141 |
||
142 |
This software is furnished under a license and may be used |
|
143 |
only in accordance with the terms of that license and with the |
|
144 |
inclusion of the above copyright notice. This software may not |
|
145 |
be provided or otherwise made available to, or used by, any |
|
146 |
other person. No title to or ownership of the software is |
|
147 |
hereby transferred. |
|
148 |
" |
|
149 |
! |
|
150 |
||
151 |
documentation |
|
152 |
" |
|
2445 | 153 |
Utilities which are helpful to perform phonetic string searches or comparisons. |
154 |
These are all variations or improvements of the soundex algorithm, which usually fails |
|
155 |
to provide good results for non-english languages. |
|
2285 | 156 |
|
2208 | 157 |
soundexCode |
158 |
this algorithm was originally contained in the CharacterArray class; |
|
159 |
||
160 |
nysiis |
|
161 |
a modified soundex algorithm |
|
162 |
||
2209 | 163 |
miracode |
164 |
another modified soundex algorithm ('american soundex') used in the 1880 census. |
|
165 |
||
166 |
mySQLSoundex |
|
167 |
another modified soundex algorithm used in mySQL. |
|
168 |
||
2208 | 169 |
koelner phoneticCode |
170 |
provides a functionality similar to soundex, but much more tuned towards the German language |
|
171 |
||
172 |
Double metaphone |
|
173 |
works with most european languages. |
|
2211 | 174 |
|
175 |
phonem |
|
176 |
described in Georg Wilde and Carsten Meyer, 'Doppelgaenger gesucht - Ein Programm fuer kontextsensitive phonetische Textumwandlung' |
|
177 |
from 'ct Magazin fuer Computer & Technik 25/1999'. |
|
178 |
||
4491 | 179 |
mra |
180 |
Match Rating Approach Phonetic Algorithm Developed by Western Airlines in 1977. |
|
181 |
||
182 |
caverphone2 |
|
183 |
better than soundex |
|
184 |
||
185 |
spanish phonetic code |
|
186 |
an algorithm slightly adjusted to spanish names |
|
187 |
||
2211 | 188 |
More info for german readers is found in: |
189 |
http://www.uni-koeln.de/phil-fak/phonetik/Lehre/MA-Arbeiten/magister_wilz.pdf |
|
190 |
" |
|
191 |
! |
|
192 |
||
193 |
sampleData |
|
194 |
" |
|
195 |
for the 50 most common german names, we get: |
|
196 |
||
197 |
ext. |
|
4491 | 198 |
name soundex soundex metaphone phonet phonet2 phonix daitch phonem koeln caverphone2 mra |
199 |
||
200 |
müller M460 54600000 MLR MÜLA NILA M4000000 689000 MYLR 657 MLA1111111 MLR |
|
201 |
schmidt S530 25300000 SKMTT SHMIT ZNIT S5300000 463000 CMYD 862 SKMT111111 SCHMDT |
|
202 |
schneider S536 25360000 SKNTR SHNEIDA ZNEITA S5300000 463900 CNAYDR 8627 SKNTA11111 SCHNDR |
|
203 |
fischer F260 12600000 FSKR FISHA FIZA F8000000 749000 VYCR 387 FSKA111111 FSCHR |
|
204 |
weber W160 16000000 WBR WEBA FEBA $1000000 779000 VBR 317 WPA1111111 WBR |
|
205 |
meyer M600 56000000 MYR MEIA NEIA M0000000 619000 MAYR 67 MA11111111 MYR |
|
206 |
wagner W256 25600000 WKNR WAKNA FAKNA $2500000 756900 VACNR 3467 WKNA111111 WGNR |
|
207 |
schulz S420 24200000 SKLS SHULS ZULZ S4800000 484000 CULC 858 SKS1111111 SCHLZ |
|
208 |
becker B260 12600000 BKR BEKA BEKA B2000000 759000 BCR 147 PKA1111111 BCKR |
|
209 |
hoffmann H155 15500000 HFMN HOFMAN UFNAN $7550000 576600 OVMAN 036 AFMN111111 HFMN |
|
210 |
schäfer S160 21600000 SKFR SHEFA ZEFA S7000000 479000 CVR 837 SKFA111111 SCHFR |
|
211 |
||
212 |
|cls| |
|
213 |
||
214 |
cls := MRAStringComparator. |
|
215 |
cls := SoundexStringComparator. |
|
216 |
cls := KoelnerPhoneticCodeStringComparator. |
|
217 |
cls := Caverphone2StringComparator. |
|
218 |
#('müller' 'schmidt' 'schneider' 'fischer' 'weber' 'meyer' |
|
219 |
'wagner' 'schulz' 'becker' 'hoffmann' 'schäfer') |
|
220 |
do:[:name | |
|
221 |
Transcript show:''''; show:name; show:''' -> '''; show:(cls encode:name); showCR:''''. |
|
222 |
]. |
|
223 |
||
224 |
KoelnerPhoneticCodeStringComparator encode:'Müller-Lüdenscheidt' -> '65752682' |
|
2197 | 225 |
" |
226 |
! ! |
|
227 |
||
228 |
!PhoneticStringUtilities class methodsFor:'phonetic codes'! |
|
229 |
||
230 |
koelnerPhoneticCodeOf:aString |
|
231 |
"return a koelner phonetic code. |
|
232 |
The koelnerPhonetic code is for the german language what the soundex code is for english; |
|
233 |
it returns similar strings for similar sounding words. |
|
234 |
There are some differences to soundex, though: |
|
235 |
its length is not limited to 4, but depends on the length of the original string; |
|
2207 | 236 |
it does not start with the first character of the input. |
237 |
This algorithm is described by Postel 1969" |
|
2197 | 238 |
|
2209 | 239 |
^ (KoelnerPhoneticCodeStringComparator new phoneticStringsFor:aString) first |
2197 | 240 |
|
241 |
" |
|
242 |
#( |
|
4488 | 243 |
'Müller' |
2197 | 244 |
'Miller' |
245 |
'Mueller' |
|
4488 | 246 |
'Mühler' |
247 |
'Mühlherr' |
|
248 |
'Mülherr' |
|
2197 | 249 |
'Myler' |
250 |
'Millar' |
|
251 |
'Myller' |
|
4488 | 252 |
'Müllar' |
253 |
'Müler' |
|
2197 | 254 |
'Muehler' |
4488 | 255 |
'Mülller' |
256 |
'Müllerr' |
|
2197 | 257 |
'Muehlherr' |
258 |
'Muellar' |
|
259 |
'Mueler' |
|
4488 | 260 |
'Mülleer' |
2197 | 261 |
'Mueller' |
4488 | 262 |
'Nüller' |
2197 | 263 |
'Nyller' |
264 |
'Niler' |
|
265 |
'Czerny' |
|
266 |
'Tscherny' |
|
267 |
'Czernie' |
|
268 |
'Tschernie' |
|
269 |
'Schernie' |
|
270 |
'Scherny' |
|
271 |
'Scherno' |
|
272 |
'Czerne' |
|
273 |
'Zerny' |
|
274 |
'Tzernie' |
|
275 |
'Breschnew' |
|
276 |
) do:[:w | |
|
277 |
Transcript show:w; show:'->'; showCR:(PhoneticStringUtilities koelnerPhoneticCodeOf:w) |
|
278 |
]. |
|
279 |
" |
|
280 |
||
281 |
" |
|
2209 | 282 |
PhoneticStringUtilities koelnerPhoneticCodeOf:'Breschnew'. '17863'. |
283 |
PhoneticStringUtilities koelnerPhoneticCodeOf:'Breschneff'. '17863'. |
|
284 |
PhoneticStringUtilities koelnerPhoneticCodeOf:'Braeschneff'. '17863'. |
|
285 |
PhoneticStringUtilities koelnerPhoneticCodeOf:'Braessneff'. '17863'. |
|
286 |
PhoneticStringUtilities koelnerPhoneticCodeOf:'Pressneff'. '17863'. |
|
4488 | 287 |
PhoneticStringUtilities koelnerPhoneticCodeOf:'Presznäph'. '17863'. |
2209 | 288 |
PhoneticStringUtilities koelnerPhoneticCodeOf:'Preschnjiev'. '17863'. |
289 |
" |
|
290 |
! |
|
291 |
||
4488 | 292 |
miracodeCodeOf:aString |
293 |
"return a miracode soundex phonetic code or nil. |
|
294 |
Miracode is a slightly modified soundex algorithm. |
|
295 |
Notice that there are better algorithms around (doubleMetaphone) " |
|
296 |
||
297 |
^ (MiracodeStringComparator new phoneticStringsFor:aString) first |
|
298 |
||
299 |
" |
|
300 |
PhoneticStringUtilities miracodeCodeOf:'claus' |
|
301 |
PhoneticStringUtilities miracodeCodeOf:'clause' |
|
302 |
PhoneticStringUtilities miracodeCodeOf:'close' |
|
303 |
PhoneticStringUtilities miracodeCodeOf:'smalltalk' |
|
304 |
PhoneticStringUtilities miracodeCodeOf:'smaltalk' |
|
305 |
PhoneticStringUtilities miracodeCodeOf:'smaltak' |
|
306 |
PhoneticStringUtilities miracodeCodeOf:'smaltok' |
|
307 |
PhoneticStringUtilities miracodeCodeOf:'smoltok' |
|
308 |
PhoneticStringUtilities miracodeCodeOf:'aa' |
|
309 |
PhoneticStringUtilities miracodeCodeOf:'by' |
|
310 |
PhoneticStringUtilities miracodeCodeOf:'bab' |
|
311 |
PhoneticStringUtilities miracodeCodeOf:'bob' |
|
312 |
PhoneticStringUtilities miracodeCodeOf:'bop' |
|
313 |
PhoneticStringUtilities miracodeCodeOf:'pub' |
|
314 |
" |
|
315 |
||
316 |
"Created: / 28-07-2017 / 15:32:41 / cg" |
|
317 |
! |
|
318 |
||
2209 | 319 |
mySQLSoundexCodeOf:aString |
320 |
"return the mySQL soundex code. The mysql soundex code is different from the miracode 'american' soundex |
|
4488 | 321 |
(no 4char limitation; different order of duplicate vowel vs. duplicate code elimination). |
322 |
Notice that there are better algorithms around (doubleMetaphone) " |
|
2209 | 323 |
|
324 |
^ (MySQLSoundexStringComparator new phoneticStringsFor:aString) first |
|
325 |
||
326 |
" |
|
327 |
#( |
|
4488 | 328 |
'Müller' |
2209 | 329 |
'Miller' |
330 |
'Mueller' |
|
4488 | 331 |
'Mühler' |
332 |
'Mühlherr' |
|
333 |
'Mülherr' |
|
2209 | 334 |
'Myler' |
335 |
'Millar' |
|
336 |
'Myller' |
|
4488 | 337 |
'Müllar' |
338 |
'Müler' |
|
2209 | 339 |
'Muehler' |
4488 | 340 |
'Mülller' |
341 |
'Müllerr' |
|
2209 | 342 |
'Muehlherr' |
343 |
'Muellar' |
|
344 |
'Mueler' |
|
4488 | 345 |
'Mülleer' |
2209 | 346 |
'Mueller' |
4488 | 347 |
'Nüller' |
2209 | 348 |
'Nyller' |
349 |
'Niler' |
|
350 |
'Czerny' |
|
351 |
'Tscherny' |
|
352 |
'Czernie' |
|
353 |
'Tschernie' |
|
354 |
'Schernie' |
|
355 |
'Scherny' |
|
356 |
'Scherno' |
|
357 |
'Czerne' |
|
358 |
'Zerny' |
|
359 |
'Tzernie' |
|
360 |
'Breschnew' |
|
361 |
) do:[:w | |
|
362 |
Transcript show:w; show:'->'; showCR:(PhoneticStringUtilities mySQLSoundexCodeOf:w) |
|
363 |
]. |
|
364 |
" |
|
365 |
||
366 |
" |
|
367 |
PhoneticStringUtilities mySQLSoundexCodeOf:'Breschnew'. |
|
368 |
PhoneticStringUtilities mySQLSoundexCodeOf:'Breschneff'. |
|
369 |
PhoneticStringUtilities mySQLSoundexCodeOf:'Braeschneff'. |
|
370 |
PhoneticStringUtilities mySQLSoundexCodeOf:'Braessneff'. |
|
371 |
PhoneticStringUtilities mySQLSoundexCodeOf:'Pressneff'. |
|
4488 | 372 |
PhoneticStringUtilities mySQLSoundexCodeOf:'Presznäph'. |
2209 | 373 |
PhoneticStringUtilities mySQLSoundexCodeOf:'Preschnjiev'. |
2197 | 374 |
" |
4488 | 375 |
|
376 |
"Modified (comment): / 28-07-2017 / 15:34:03 / cg" |
|
2197 | 377 |
! |
378 |
||
379 |
soundexCodeOf:aString |
|
380 |
"return a soundex phonetic code or nil. |
|
2207 | 381 |
Soundex (1918, 1922) returns similar codes for similar sounding words, making it a useful |
2197 | 382 |
tool when searching for words where the correct spelling is unknown. |
4194 | 383 |
(read Knuth or search the web if you don't know what a soundex code is). |
4488 | 384 |
Caveat: 'similar sounding words' means: 'similar sounding in english'. |
385 |
Notice that there are better algorithms around (doubleMetaphone) " |
|
2197 | 386 |
|
2210 | 387 |
^ (SoundexStringComparator new phoneticStringsFor:aString) first |
2197 | 388 |
|
2210 | 389 |
"/ old code - now use code in private class... |
390 |
"/ |inStream codeStream ch last lch codeLength codes code lastCode| |
|
391 |
"/ |
|
392 |
"/ inStream := aString readStream. |
|
393 |
"/ inStream skipSeparators. |
|
394 |
"/ inStream atEnd ifTrue:[ |
|
395 |
"/ ^ nil |
|
396 |
"/ ]. |
|
397 |
"/ |
|
398 |
"/ ch := inStream next. |
|
399 |
"/ ch isLetter ifFalse:[ |
|
400 |
"/ ^ nil |
|
401 |
"/ ]. |
|
402 |
"/ codeLength := 0. |
|
403 |
"/ |
|
404 |
"/ codes := Dictionary new. |
|
405 |
"/ codes atAll:'bpfv' put:$1. |
|
406 |
"/ codes atAll:'cskgjqxz' put:$2. |
|
407 |
"/ codes atAll:'dt' put:$3. |
|
408 |
"/ codes atAll:'l' put:$4. |
|
409 |
"/ codes atAll:'nm' put:$5. |
|
410 |
"/ codes atAll:'r' put:$6. |
|
411 |
"/ |
|
412 |
"/ codeStream := WriteStream on:(String new:4). |
|
413 |
"/ codeStream nextPut:(ch asUppercase). |
|
414 |
"/ last := ch asLowercase. |
|
415 |
"/ lastCode := codes at:last ifAbsent:nil. |
|
416 |
"/ |
|
417 |
"/ [inStream atEnd] whileFalse:[ |
|
418 |
"/ ch := inStream next. |
|
419 |
"/ lch := ch asLowercase. |
|
420 |
"/ lch = last ifFalse:[ |
|
421 |
"/ last := lch. |
|
422 |
"/ |
|
423 |
"/ code := codes at:lch ifAbsent:nil. |
|
424 |
"/ (code notNil and:[ code ~= lastCode]) ifTrue:[ |
|
425 |
"/ codeLength < 3 ifTrue:[ |
|
426 |
"/ codeStream nextPut:code. |
|
427 |
"/ codeLength := codeLength + 1. |
|
428 |
"/ codeLength > 3 ifTrue:[^ codeStream contents]. |
|
429 |
"/ ]. |
|
430 |
"/ ]. |
|
431 |
"/ lastCode := code. |
|
432 |
"/ ] |
|
433 |
"/ ]. |
|
434 |
"/ [ codeLength < 3 ] whileTrue:[ |
|
435 |
"/ codeStream nextPut:$0. |
|
436 |
"/ codeLength := codeLength + 1. |
|
437 |
"/ ]. |
|
438 |
"/ |
|
439 |
"/ ^ codeStream contents |
|
2197 | 440 |
|
441 |
" |
|
442 |
PhoneticStringUtilities soundexCodeOf:'claus' |
|
443 |
PhoneticStringUtilities soundexCodeOf:'clause' |
|
444 |
PhoneticStringUtilities soundexCodeOf:'close' |
|
445 |
PhoneticStringUtilities soundexCodeOf:'smalltalk' |
|
446 |
PhoneticStringUtilities soundexCodeOf:'smaltalk' |
|
447 |
PhoneticStringUtilities soundexCodeOf:'smaltak' |
|
448 |
PhoneticStringUtilities soundexCodeOf:'smaltok' |
|
449 |
PhoneticStringUtilities soundexCodeOf:'smoltok' |
|
450 |
PhoneticStringUtilities soundexCodeOf:'aa' |
|
451 |
PhoneticStringUtilities soundexCodeOf:'by' |
|
452 |
PhoneticStringUtilities soundexCodeOf:'bab' |
|
453 |
PhoneticStringUtilities soundexCodeOf:'bob' |
|
454 |
PhoneticStringUtilities soundexCodeOf:'bop' |
|
455 |
" |
|
4488 | 456 |
|
457 |
"Modified (comment): / 28-07-2017 / 15:33:53 / cg" |
|
2197 | 458 |
! ! |
459 |
||
3648 | 460 |
!PhoneticStringUtilities class methodsFor:'queries'! |
461 |
||
462 |
isUtilityClass |
|
463 |
^ self == PhoneticStringUtilities |
|
464 |
! ! |
|
465 |
||
2208 | 466 |
!PhoneticStringUtilities::PhoneticStringComparator class methodsFor:'constant'! |
467 |
||
468 |
defaultClass |
|
469 |
^SoundexStringComparator |
|
470 |
! ! |
|
471 |
||
3646 | 472 |
!PhoneticStringUtilities::PhoneticStringComparator class methodsFor:'documentation'! |
473 |
||
474 |
documentation |
|
475 |
" |
|
476 |
abstract superclass for various phonetic comparators. |
|
477 |
They return similar strings for similar sounding words, which can be used |
|
478 |
to find similar sounding words in a search list. |
|
479 |
||
480 |
Note that some comparators are better for particular languages. |
|
481 |
" |
|
4467 | 482 |
! |
483 |
||
484 |
examples |
|
485 |
" |
|
486 |
PhoneticStringUtilities::SoundexStringComparator new |
|
487 |
does:'miller' soundLike:'miler'. |
|
488 |
||
489 |
PhoneticStringUtilities::SoundexStringComparator new |
|
490 |
does:'miller' soundLike:'milner'. |
|
491 |
||
492 |
PhoneticStringUtilities::SoundexStringComparator new |
|
4488 | 493 |
does:'müller' soundLike:'mueller'. |
4467 | 494 |
|
495 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new |
|
4488 | 496 |
does:'müller' soundLike:'mueller'. |
4467 | 497 |
" |
3646 | 498 |
! ! |
499 |
||
2208 | 500 |
!PhoneticStringUtilities::PhoneticStringComparator class methodsFor:'instance creation'! |
501 |
||
502 |
new |
|
503 |
^ self basicNew initialize. |
|
504 |
! ! |
|
505 |
||
3646 | 506 |
!PhoneticStringUtilities::PhoneticStringComparator class methodsFor:'queries'! |
507 |
||
508 |
isAbstract |
|
509 |
^ self == PhoneticStringUtilities::PhoneticStringComparator |
|
510 |
! ! |
|
511 |
||
4491 | 512 |
!PhoneticStringUtilities::PhoneticStringComparator class methodsFor:'utilities'! |
513 |
||
514 |
encode:word |
|
515 |
^ (self new phoneticStringsFor:word) first |
|
516 |
||
517 |
" |
|
518 |
SoundexStringComparator encode:'Fischer' -> 'F260' |
|
519 |
Caverphone2StringComparator encode:'Fischer' -> 'FSKA111111' |
|
520 |
KoelnerPhoneticCodeStringComparator encode:'Fischer' -> '387' |
|
521 |
MRAStringComparator encode:'Fischer' -> 'FSCHR' |
|
522 |
SpanishPhoneticCodeStringComparator encode:'Fischer' -> '24429' |
|
523 |
" |
|
524 |
||
525 |
"Created: / 02-08-2017 / 01:15:50 / cg" |
|
526 |
! ! |
|
527 |
||
2208 | 528 |
!PhoneticStringUtilities::PhoneticStringComparator methodsFor:'api'! |
529 |
||
530 |
does:aString soundLike:anotherString |
|
531 |
|translations1 translations2| |
|
532 |
||
533 |
translations1 := self phoneticStringsFor:aString. |
|
534 |
translations2 := self phoneticStringsFor:anotherString. |
|
535 |
||
536 |
^ translations1 contains:[:t1 | |
|
537 |
translations2 contains:[:t2 | t1 = t2]] |
|
538 |
||
539 |
" |
|
540 |
PhoneticStringUtilities::SoundexStringComparator new |
|
541 |
does:'miller' soundLike:'miler'. |
|
4467 | 542 |
|
2208 | 543 |
PhoneticStringUtilities::SoundexStringComparator new |
544 |
does:'miller' soundLike:'milner'. |
|
4467 | 545 |
|
546 |
PhoneticStringUtilities::SoundexStringComparator new |
|
4488 | 547 |
does:'müller' soundLike:'mueller'. |
4467 | 548 |
|
549 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new |
|
4488 | 550 |
does:'müller' soundLike:'mueller'. |
2208 | 551 |
" |
4467 | 552 |
|
553 |
"Modified (comment): / 13-07-2017 / 17:51:43 / cg" |
|
2208 | 554 |
! |
555 |
||
556 |
phoneticStringsFor: aString |
|
557 |
"Should answer an array of alternate phonetic strings for the given input string." |
|
4485 | 558 |
|
2208 | 559 |
self subclassResponsibility |
560 |
||
561 |
" |
|
562 |
(PhoneticStringUtilities::SoundexStringComparator new |
|
4485 | 563 |
phoneticStringsFor:'miller') first |
564 |
||
2208 | 565 |
'miller' asSoundexCode |
566 |
" |
|
4485 | 567 |
|
568 |
"Modified (comment): / 27-07-2017 / 15:07:59 / cg" |
|
2208 | 569 |
! ! |
570 |
||
571 |
!PhoneticStringUtilities::PhoneticStringComparator methodsFor:'initialization'! |
|
572 |
||
573 |
initialize |
|
574 |
"Invoked when a new instance is created." |
|
575 |
||
576 |
"/ please change as required (and remove this comment) |
|
577 |
||
578 |
"/ super initialize. -- commented since inherited method does nothing |
|
579 |
! ! |
|
580 |
||
4491 | 581 |
!PhoneticStringUtilities::DaitchMokotoffStringComparator class methodsFor:'documentation'! |
2208 | 582 |
|
583 |
documentation |
|
584 |
" |
|
4491 | 585 |
self encode:'AUERBACH' -> 097400, 097500 |
586 |
||
587 |
Encodes a string into a Daitch-Mokotoff Soundex value. |
|
588 |
The Daitch-Mokotoff Soundex algorithm is a refinement of the Russell and American Soundex algorithms, |
|
589 |
yielding greater accuracy in matching especially Slavic and Yiddish surnames with similar pronunciation |
|
590 |
but differences in spelling. |
|
591 |
||
592 |
The main differences compared to the other soundex variants are: |
|
593 |
- coded names are 6 digits long |
|
594 |
- the initial character of the name is coded |
|
595 |
- rules to encode multi-character n-grams |
|
596 |
- multiple possible encodings for the same name (branching) |
|
597 |
||
598 |
This implementation supports branching, depending on the used method: |
|
599 |
encode:aString - branching disabled, only the first code will be returned |
|
600 |
phoneticStringsFor:aString - branching enabled, all codes will be returned, separated by '|' |
|
601 |
||
602 |
[see also:] |
|
603 |
'Wikipedia - Daitch-Mokotoff Soundex' |
|
604 |
http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex |
|
605 |
||
606 |
'Avotaynu - Soundexing and Genealogy' |
|
607 |
http://www.avotaynu.com/soundex.htm |
|
2208 | 608 |
" |
609 |
! |
|
610 |
||
4491 | 611 |
javaCode |
612 |
"<<END |
|
613 |
/* |
|
614 |
* Licensed to the Apache Software Foundation (ASF) under one or more |
|
615 |
* contributor license agreements. See the NOTICE file distributed with |
|
616 |
* this work for additional information regarding copyright ownership. |
|
617 |
* The ASF licenses this file to You under the Apache License, Version 2.0 |
|
618 |
* (the "License"); you may not use this file except in compliance with |
|
619 |
* the License. You may obtain a copy of the License at |
|
620 |
* |
|
621 |
* http://www.apache.org/licenses/LICENSE-2.0 |
|
622 |
* |
|
623 |
* Unless required by applicable law or agreed to in writing, software |
|
624 |
* distributed under the License is distributed on an "AS IS" BASIS, |
|
625 |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
626 |
* See the License for the specific language governing permissions and |
|
627 |
* limitations under the License. |
|
628 |
*/ |
|
629 |
package org.apache.commons.codec.language; |
|
630 |
||
631 |
import org.apache.commons.codec.CharEncoding; |
|
632 |
import org.apache.commons.codec.EncoderException; |
|
633 |
import org.apache.commons.codec.StringEncoder; |
|
634 |
||
635 |
import java.io.InputStream; |
|
636 |
import java.util.*; |
|
637 |
||
638 |
/** |
|
639 |
* Encodes a string into a Daitch-Mokotoff Soundex value. |
|
640 |
* <p> |
|
641 |
* The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater |
|
642 |
* accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling. |
|
643 |
* </p> |
|
644 |
* <p> |
|
645 |
* The main differences compared to the other soundex variants are: |
|
646 |
* </p> |
|
647 |
* <ul> |
|
648 |
* <li>coded names are 6 digits long |
|
649 |
* <li>the initial character of the name is coded |
|
650 |
* <li>rules to encoded multi-character n-grams |
|
651 |
* <li>multiple possible encodings for the same name (branching) |
|
652 |
* </ul> |
|
653 |
* <p> |
|
654 |
* This implementation supports branching, depending on the used method: |
|
655 |
* <ul> |
|
656 |
* <li>{@link #encode(String)} - branching disabled, only the first code will be returned |
|
657 |
* <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|' |
|
658 |
* </ul> |
|
659 |
* <p> |
|
660 |
* Note: this implementation has additional branching rules compared to the original description of the algorithm. The |
|
661 |
* rules can be customized by overriding the default rules contained in the resource file |
|
662 |
* {@code org/apache/commons/codec/language/dmrules.txt}. |
|
663 |
* </p> |
|
664 |
* <p> |
|
665 |
* This class is thread-safe. |
|
666 |
* </p> |
|
667 |
* |
|
668 |
* @see Soundex |
|
669 |
* @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a> |
|
670 |
* @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a> |
|
671 |
* |
|
672 |
* @version $Id$ |
|
673 |
* @since 1.10 |
|
674 |
*/ |
|
675 |
public class DaitchMokotoffSoundex implements StringEncoder { |
|
676 |
||
677 |
/** |
|
678 |
* Inner class representing a branch during DM soundex encoding. |
|
679 |
*/ |
|
680 |
private static final class Branch { |
|
681 |
private final StringBuilder builder; |
|
682 |
private String cachedString; |
|
683 |
private String lastReplacement; |
|
684 |
||
685 |
private Branch() { |
|
686 |
builder = new StringBuilder(); |
|
687 |
lastReplacement = null; |
|
688 |
cachedString = null; |
|
689 |
} |
|
690 |
||
691 |
/** |
|
692 |
* Creates a new branch, identical to this branch. |
|
693 |
* |
|
694 |
* @return a new, identical branch |
|
695 |
*/ |
|
696 |
public Branch createBranch() { |
|
697 |
final Branch branch = new Branch(); |
|
698 |
branch.builder.append(toString()); |
|
699 |
branch.lastReplacement = this.lastReplacement; |
|
700 |
return branch; |
|
701 |
} |
|
702 |
||
703 |
@Override |
|
704 |
public boolean equals(final Object other) { |
|
705 |
if (this == other) { |
|
706 |
return true; |
|
707 |
} |
|
708 |
if (!!(other instanceof Branch)) { |
|
709 |
return false; |
|
710 |
} |
|
711 |
||
712 |
return toString().equals(((Branch) other).toString()); |
|
713 |
} |
|
714 |
||
715 |
/** |
|
716 |
* Finish this branch by appending '0's until the maximum code length has been reached. |
|
717 |
*/ |
|
718 |
public void finish() { |
|
719 |
while (builder.length() < MAX_LENGTH) { |
|
720 |
builder.append('0'); |
|
721 |
cachedString = null; |
|
722 |
} |
|
723 |
} |
|
724 |
||
725 |
@Override |
|
726 |
public int hashCode() { |
|
727 |
return toString().hashCode(); |
|
728 |
} |
|
729 |
||
730 |
/** |
|
731 |
* Process the next replacement to be added to this branch. |
|
732 |
* |
|
733 |
* @param replacement |
|
734 |
* the next replacement to append |
|
735 |
* @param forceAppend |
|
736 |
* indicates if the default processing shall be overridden |
|
737 |
*/ |
|
738 |
public void processNextReplacement(final String replacement, final boolean forceAppend) { |
|
739 |
final boolean append = lastReplacement == null || !!lastReplacement.endsWith(replacement) || forceAppend; |
|
740 |
||
741 |
if (append && builder.length() < MAX_LENGTH) { |
|
742 |
builder.append(replacement); |
|
743 |
// remove all characters after the maximum length |
|
744 |
if (builder.length() > MAX_LENGTH) { |
|
745 |
builder.delete(MAX_LENGTH, builder.length()); |
|
746 |
} |
|
747 |
cachedString = null; |
|
748 |
} |
|
749 |
||
750 |
lastReplacement = replacement; |
|
751 |
} |
|
752 |
||
753 |
@Override |
|
754 |
public String toString() { |
|
755 |
if (cachedString == null) { |
|
756 |
cachedString = builder.toString(); |
|
757 |
} |
|
758 |
return cachedString; |
|
759 |
} |
|
760 |
} |
|
761 |
||
762 |
/** |
|
763 |
* Inner class for storing rules. |
|
764 |
*/ |
|
765 |
private static final class Rule { |
|
766 |
private final String pattern; |
|
767 |
private final String[] replacementAtStart; |
|
768 |
private final String[] replacementBeforeVowel; |
|
769 |
private final String[] replacementDefault; |
|
770 |
||
771 |
protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel, |
|
772 |
final String replacementDefault) { |
|
773 |
this.pattern = pattern; |
|
774 |
this.replacementAtStart = replacementAtStart.split("\\|"); |
|
775 |
this.replacementBeforeVowel = replacementBeforeVowel.split("\\|"); |
|
776 |
this.replacementDefault = replacementDefault.split("\\|"); |
|
777 |
} |
|
778 |
||
779 |
public int getPatternLength() { |
|
780 |
return pattern.length(); |
|
781 |
} |
|
782 |
||
783 |
public String[] getReplacements(final String context, final boolean atStart) { |
|
784 |
if (atStart) { |
|
785 |
return replacementAtStart; |
|
786 |
} |
|
787 |
||
788 |
final int nextIndex = getPatternLength(); |
|
789 |
final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false; |
|
790 |
if (nextCharIsVowel) { |
|
791 |
return replacementBeforeVowel; |
|
792 |
} |
|
793 |
||
794 |
return replacementDefault; |
|
795 |
} |
|
796 |
||
797 |
private boolean isVowel(final char ch) { |
|
798 |
return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u'; |
|
799 |
} |
|
800 |
||
801 |
public boolean matches(final String context) { |
|
802 |
return context.startsWith(pattern); |
|
803 |
} |
|
804 |
||
805 |
@Override |
|
806 |
public String toString() { |
|
807 |
return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart), |
|
808 |
Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault)); |
|
809 |
} |
|
810 |
} |
|
811 |
||
812 |
private static final String COMMENT = "//"; |
|
813 |
private static final String DOUBLE_QUOTE = "\""; |
|
814 |
||
815 |
private static final String MULTILINE_COMMENT_END = "*/"; |
|
816 |
||
817 |
private static final String MULTILINE_COMMENT_START = "/*"; |
|
818 |
||
819 |
/** The resource file containing the replacement and folding rules */ |
|
820 |
private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt"; |
|
821 |
||
822 |
/** The code length of a DM soundex value. */ |
|
823 |
private static final int MAX_LENGTH = 6; |
|
824 |
||
825 |
/** Transformation rules indexed by the first character of their pattern. */ |
|
826 |
private static final Map<Character, List<Rule>> RULES = new HashMap<Character, List<Rule>>(); |
|
827 |
||
828 |
/** Folding rules. */ |
|
829 |
private static final Map<Character, Character> FOLDINGS = new HashMap<Character, Character>(); |
|
830 |
||
831 |
static { |
|
832 |
final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE); |
|
833 |
if (rulesIS == null) { |
|
834 |
throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE); |
|
835 |
} |
|
836 |
||
837 |
final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8); |
|
838 |
parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS); |
|
839 |
scanner.close(); |
|
840 |
||
841 |
// sort RULES by pattern length in descending order |
|
842 |
for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) { |
|
843 |
final List<Rule> ruleList = rule.getValue(); |
|
844 |
Collections.sort(ruleList, new Comparator<Rule>() { |
|
845 |
@Override |
|
846 |
public int compare(final Rule rule1, final Rule rule2) { |
|
847 |
return rule2.getPatternLength() - rule1.getPatternLength(); |
|
848 |
} |
|
849 |
}); |
|
850 |
} |
|
851 |
} |
|
852 |
||
853 |
private static void parseRules(final Scanner scanner, final String location, |
|
854 |
final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) { |
|
855 |
int currentLine = 0; |
|
856 |
boolean inMultilineComment = false; |
|
857 |
||
858 |
while (scanner.hasNextLine()) { |
|
859 |
currentLine++; |
|
860 |
final String rawLine = scanner.nextLine(); |
|
861 |
String line = rawLine; |
|
862 |
||
863 |
if (inMultilineComment) { |
|
864 |
if (line.endsWith(MULTILINE_COMMENT_END)) { |
|
865 |
inMultilineComment = false; |
|
866 |
} |
|
867 |
continue; |
|
868 |
} |
|
869 |
||
870 |
if (line.startsWith(MULTILINE_COMMENT_START)) { |
|
871 |
inMultilineComment = true; |
|
872 |
} else { |
|
873 |
// discard comments |
|
874 |
final int cmtI = line.indexOf(COMMENT); |
|
875 |
if (cmtI >= 0) { |
|
876 |
line = line.substring(0, cmtI); |
|
877 |
} |
|
878 |
||
879 |
// trim leading-trailing whitespace |
|
880 |
line = line.trim(); |
|
881 |
||
882 |
if (line.length() == 0) { |
|
883 |
continue; // empty lines can be safely skipped |
|
884 |
} |
|
885 |
||
886 |
if (line.contains("=")) { |
|
887 |
// folding |
|
888 |
final String[] parts = line.split("="); |
|
889 |
if (parts.length !!= 2) { |
|
890 |
throw new IllegalArgumentException("Malformed folding statement split into " + parts.length + |
|
891 |
" parts: " + rawLine + " in " + location); |
|
892 |
} else { |
|
893 |
final String leftCharacter = parts[0]; |
|
894 |
final String rightCharacter = parts[1]; |
|
895 |
||
896 |
if (leftCharacter.length() !!= 1 || rightCharacter.length() !!= 1) { |
|
897 |
throw new IllegalArgumentException("Malformed folding statement - " + |
|
898 |
"patterns are not single characters: " + rawLine + " in " + location); |
|
899 |
} |
|
900 |
||
901 |
asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0)); |
|
902 |
} |
|
903 |
} else { |
|
904 |
// rule |
|
905 |
final String[] parts = line.split("\\s+"); |
|
906 |
if (parts.length !!= 4) { |
|
907 |
throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + |
|
908 |
" parts: " + rawLine + " in " + location); |
|
909 |
} else { |
|
910 |
try { |
|
911 |
final String pattern = stripQuotes(parts[0]); |
|
912 |
final String replacement1 = stripQuotes(parts[1]); |
|
913 |
final String replacement2 = stripQuotes(parts[2]); |
|
914 |
final String replacement3 = stripQuotes(parts[3]); |
|
915 |
||
916 |
final Rule r = new Rule(pattern, replacement1, replacement2, replacement3); |
|
917 |
final char patternKey = r.pattern.charAt(0); |
|
918 |
List<Rule> rules = ruleMapping.get(patternKey); |
|
919 |
if (rules == null) { |
|
920 |
rules = new ArrayList<Rule>(); |
|
921 |
ruleMapping.put(patternKey, rules); |
|
922 |
} |
|
923 |
rules.add(r); |
|
924 |
} catch (final IllegalArgumentException e) { |
|
925 |
throw new IllegalStateException( |
|
926 |
"Problem parsing line '" + currentLine + "' in " + location, e); |
|
927 |
} |
|
928 |
} |
|
929 |
} |
|
930 |
} |
|
931 |
} |
|
932 |
} |
|
933 |
||
934 |
private static String stripQuotes(String str) { |
|
935 |
if (str.startsWith(DOUBLE_QUOTE)) { |
|
936 |
str = str.substring(1); |
|
937 |
} |
|
938 |
||
939 |
if (str.endsWith(DOUBLE_QUOTE)) { |
|
940 |
str = str.substring(0, str.length() - 1); |
|
941 |
} |
|
942 |
||
943 |
return str; |
|
944 |
} |
|
945 |
||
946 |
/** Whether to use ASCII folding prior to encoding. */ |
|
947 |
private final boolean folding; |
|
948 |
||
949 |
/** |
|
950 |
* Creates a new instance with ASCII-folding enabled. |
|
951 |
*/ |
|
952 |
public DaitchMokotoffSoundex() { |
|
953 |
this(true); |
|
954 |
} |
|
955 |
||
956 |
/** |
|
957 |
* Creates a new instance. |
|
958 |
* <p> |
|
959 |
* With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g. |
|
960 |
* è -> e. |
|
961 |
* </p> |
|
962 |
* |
|
963 |
* @param folding |
|
964 |
* if ASCII-folding shall be performed before encoding |
|
965 |
*/ |
|
966 |
public DaitchMokotoffSoundex(final boolean folding) { |
|
967 |
this.folding = folding; |
|
968 |
} |
|
969 |
||
970 |
/** |
|
971 |
* Performs a cleanup of the input string before the actual soundex transformation. |
|
972 |
* <p> |
|
973 |
* Removes all whitespace characters and performs ASCII folding if enabled. |
|
974 |
* </p> |
|
975 |
* |
|
976 |
* @param input |
|
977 |
* the input string to cleanup |
|
978 |
* @return a cleaned up string |
|
979 |
*/ |
|
980 |
private String cleanup(final String input) { |
|
981 |
final StringBuilder sb = new StringBuilder(); |
|
982 |
for (char ch : input.toCharArray()) { |
|
983 |
if (Character.isWhitespace(ch)) { |
|
984 |
continue; |
|
985 |
} |
|
986 |
||
987 |
ch = Character.toLowerCase(ch); |
|
988 |
if (folding && FOLDINGS.containsKey(ch)) { |
|
989 |
ch = FOLDINGS.get(ch); |
|
990 |
} |
|
991 |
sb.append(ch); |
|
992 |
} |
|
993 |
return sb.toString(); |
|
994 |
} |
|
995 |
||
996 |
/** |
|
997 |
* Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching. |
|
998 |
* <p> |
|
999 |
* This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an |
|
1000 |
* EncoderException if the supplied object is not of type java.lang.String. |
|
1001 |
* </p> |
|
1002 |
* |
|
1003 |
* @see #soundex(String) |
|
1004 |
* |
|
1005 |
* @param obj |
|
1006 |
* Object to encode |
|
1007 |
* @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String |
|
1008 |
* supplied. |
|
1009 |
* @throws EncoderException |
|
1010 |
* if the parameter supplied is not of type java.lang.String |
|
1011 |
* @throws IllegalArgumentException |
|
1012 |
* if a character is not mapped |
|
1013 |
*/ |
|
1014 |
@Override |
|
1015 |
public Object encode(final Object obj) throws EncoderException { |
|
1016 |
if (!!(obj instanceof String)) { |
|
1017 |
throw new EncoderException( |
|
1018 |
"Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String"); |
|
1019 |
} |
|
1020 |
return encode((String) obj); |
|
1021 |
} |
|
1022 |
||
1023 |
/** |
|
1024 |
* Encodes a String using the Daitch-Mokotoff soundex algorithm without branching. |
|
1025 |
* |
|
1026 |
* @see #soundex(String) |
|
1027 |
* |
|
1028 |
* @param source |
|
1029 |
* A String object to encode |
|
1030 |
* @return A DM Soundex code corresponding to the String supplied |
|
1031 |
* @throws IllegalArgumentException |
|
1032 |
* if a character is not mapped |
|
1033 |
*/ |
|
1034 |
@Override |
|
1035 |
public String encode(final String source) { |
|
1036 |
if (source == null) { |
|
1037 |
return null; |
|
1038 |
} |
|
1039 |
return soundex(source, false)[0]; |
|
1040 |
} |
|
1041 |
||
1042 |
/** |
|
1043 |
* Encodes a String using the Daitch-Mokotoff soundex algorithm with branching. |
|
1044 |
* <p> |
|
1045 |
* In case a string is encoded into multiple codes (see branching rules), the result will contain all codes, |
|
1046 |
* separated by '|'. |
|
1047 |
* </p> |
|
1048 |
* <p> |
|
1049 |
* Example: the name "AUERBACH" is encoded as both |
|
1050 |
* </p> |
|
1051 |
* <ul> |
|
1052 |
* <li>097400</li> |
|
1053 |
* <li>097500</li> |
|
1054 |
* </ul> |
|
1055 |
* <p> |
|
1056 |
* Thus the result will be "097400|097500". |
|
1057 |
* </p> |
|
1058 |
* |
|
1059 |
* @param source |
|
1060 |
* A String object to encode |
|
1061 |
* @return A string containing a set of DM Soundex codes corresponding to the String supplied |
|
1062 |
* @throws IllegalArgumentException |
|
1063 |
* if a character is not mapped |
|
1064 |
*/ |
|
1065 |
public String soundex(final String source) { |
|
1066 |
final String[] branches = soundex(source, true); |
|
1067 |
final StringBuilder sb = new StringBuilder(); |
|
1068 |
int index = 0; |
|
1069 |
for (final String branch : branches) { |
|
1070 |
sb.append(branch); |
|
1071 |
if (++index < branches.length) { |
|
1072 |
sb.append('|'); |
|
1073 |
} |
|
1074 |
} |
|
1075 |
return sb.toString(); |
|
1076 |
} |
|
1077 |
||
1078 |
/** |
|
1079 |
* Perform the actual DM Soundex algorithm on the input string. |
|
1080 |
* |
|
1081 |
* @param source |
|
1082 |
* A String object to encode |
|
1083 |
* @param branching |
|
1084 |
* If branching shall be performed |
|
1085 |
* @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the |
|
1086 |
* selected branching mode |
|
1087 |
*/ |
|
1088 |
private String[] soundex(final String source, final boolean branching) { |
|
1089 |
if (source == null) { |
|
1090 |
return null; |
|
1091 |
} |
|
1092 |
||
1093 |
final String input = cleanup(source); |
|
1094 |
||
1095 |
final Set<Branch> currentBranches = new LinkedHashSet<Branch>(); |
|
1096 |
currentBranches.add(new Branch()); |
|
1097 |
||
1098 |
char lastChar = '\0'; |
|
1099 |
for (int index = 0; index < input.length(); index++) { |
|
1100 |
final char ch = input.charAt(index); |
|
1101 |
||
1102 |
// ignore whitespace inside a name |
|
1103 |
if (Character.isWhitespace(ch)) { |
|
1104 |
continue; |
|
1105 |
} |
|
1106 |
||
1107 |
final String inputContext = input.substring(index); |
|
1108 |
final List<Rule> rules = RULES.get(ch); |
|
1109 |
if (rules == null) { |
|
1110 |
continue; |
|
1111 |
} |
|
1112 |
||
1113 |
// use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access |
|
1114 |
@SuppressWarnings("unchecked") |
|
1115 |
final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.EMPTY_LIST; |
|
1116 |
||
1117 |
for (final Rule rule : rules) { |
|
1118 |
if (rule.matches(inputContext)) { |
|
1119 |
if (branching) { |
|
1120 |
nextBranches.clear(); |
|
1121 |
} |
|
1122 |
final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0'); |
|
1123 |
final boolean branchingRequired = replacements.length > 1 && branching; |
|
1124 |
||
1125 |
for (final Branch branch : currentBranches) { |
|
1126 |
for (final String nextReplacement : replacements) { |
|
1127 |
// if we have multiple replacements, always create a new branch |
|
1128 |
final Branch nextBranch = branchingRequired ? branch.createBranch() : branch; |
|
1129 |
||
1130 |
// special rule: occurrences of mn or nm are treated differently |
|
1131 |
final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm'); |
|
1132 |
||
1133 |
nextBranch.processNextReplacement(nextReplacement, force); |
|
1134 |
||
1135 |
if (branching) { |
|
1136 |
nextBranches.add(nextBranch); |
|
1137 |
} else { |
|
1138 |
break; |
|
1139 |
} |
|
1140 |
} |
|
1141 |
} |
|
1142 |
||
1143 |
if (branching) { |
|
1144 |
currentBranches.clear(); |
|
1145 |
currentBranches.addAll(nextBranches); |
|
1146 |
} |
|
1147 |
index += rule.getPatternLength() - 1; |
|
1148 |
break; |
|
1149 |
} |
|
1150 |
} |
|
1151 |
||
1152 |
lastChar = ch; |
|
1153 |
} |
|
1154 |
||
1155 |
final String[] result = new String[currentBranches.size()]; |
|
1156 |
int index = 0; |
|
1157 |
for (final Branch branch : currentBranches) { |
|
1158 |
branch.finish(); |
|
1159 |
result[index++] = branch.toString(); |
|
1160 |
} |
|
1161 |
||
1162 |
return result; |
|
1163 |
} |
|
1164 |
} |
|
1165 |
END>>" |
|
2211 | 1166 |
! ! |
1167 |
||
2208 | 1168 |
!PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'LICENSE'! |
1169 |
||
2209 | 1170 |
copyright |
1171 |
" |
|
1172 |
Copyright (c) 2002-2004 Robert Jarvis |
|
2208 | 1173 |
|
2209 | 1174 |
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation |
1175 |
files (the 'Software'), to deal in the Software without restriction, including without limitation the rights to use, |
|
1176 |
copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom |
|
1177 |
the Software is furnished to do so, subject to the following conditions: |
|
1178 |
||
1179 |
The above copyright notice and this permission notice shall be included in all copies or substantial |
|
1180 |
portions of the Software. |
|
2208 | 1181 |
|
2209 | 1182 |
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, |
1183 |
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
|
1184 |
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
|
1185 |
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE |
|
1186 |
USE OR OTHER DEALINGS IN THE SOFTWARE.' |
|
1187 |
" |
|
1188 |
! ! |
|
2208 | 1189 |
|
2213 | 1190 |
!PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'classification'! |
1191 |
||
1192 |
isSlavoGermanic:aString
    "Answer true if aString contains at least one of the letter groups which are
     taken here as an indication of a slavic or germanic word.
     The letter groups are lowercase literals.
     NOTE(review): the argument is presumably expected in lowercase as well -
     confirm against the callers."

    |markers|

    markers := #('w' 'k' 'cz' 'witz' 'ä' 'ö' 'ü' 'ß').
    markers do:[:eachMarker |
        (aString includesString:eachMarker) ifTrue:[^ true].
    ].
    ^ false

    "
     self isSlavoGermanic:'walter'
     self isSlavoGermanic:'horowitz'
     self isSlavoGermanic:'müller'
     self isSlavoGermanic:'miller'
    "

    "Modified: / 28-07-2017 / 10:14:38 / cg"
|
2213 | 1203 |
! ! |
1204 |
||
2209 | 1205 |
!PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'documentation'! |
2208 | 1206 |
|
3685 | 1207 |
documentation |
2209 | 1208 |
" |
4488 | 1209 |
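    An implementation of the Double Metaphone phonetic algorithm.

    For a given input string, two phonetic codes are computed (a primary and a
    secondary translation); see #phoneticStringsFor:.
    The process* methods follow the per-letter cases of the reference
    implementation at http://aspell.sourceforge.net/metaphone/dmetaph.cpp

    See also: https://en.wikipedia.org/wiki/Metaphone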
|
2209 | 1212 |
" |
2208 | 1213 |
! ! |
1214 |
||
1215 |
!PhoneticStringUtilities::DoubleMetaphoneStringComparator methodsFor:'accessing'!

currentIndex
    "answer the (1-based) position in inputKey which the process* methods look at"

    ^ currentIndex
!

currentIndex:positionInteger
    "set the (1-based) position in inputKey which the process* methods look at"

    currentIndex := positionInteger
!

inputKey
    "answer the string being encoded"

    ^ inputKey
!

inputKey:keyString
    "set the string to be encoded; it is stored in uppercase,
     because the process* methods compare against uppercase literals"

    inputKey := keyString asUppercase
!

primaryTranslation
    "answer the primary phonetic code collected so far"

    ^ primaryTranslation
!

primaryTranslation:codeString
    "replace the primary phonetic code collected so far"

    primaryTranslation := codeString
!

secondaryTranslation
    "answer the secondary (alternate) phonetic code collected so far"

    ^ secondaryTranslation
!

secondaryTranslation:codeString
    "replace the secondary (alternate) phonetic code collected so far"

    secondaryTranslation := codeString
!

skipCount
    "answer the number of characters following the current one,
     which the process* methods have marked as already consumed"

    ^ skipCount
!

skipCount:countInteger
    "set the number of characters following the current one,
     which are to be regarded as already consumed"

    skipCount := countInteger
!

startIndex
    "answer the start position; it is 1 initially and gets incremented
     by performInitialProcessing"

    ^ startIndex
!

startIndex:positionInteger
    "set the start position"

    startIndex := positionInteger
! !
|
1264 |
||
1265 |
!PhoneticStringUtilities::DoubleMetaphoneStringComparator methodsFor:'api'! |
|
1266 |
||
4488 | 1267 |
phoneticStringsFor:aString
    "Private - Answers an array of alternate phonetic strings for the given input string.
     The first element is the primary, the second the secondary translation.

     The key is converted to uppercase (as the inputKey: accessor does),
     because the process* methods compare against uppercase literals only;
     for a key which is already uppercase, this makes no difference.

     NOTE(review): the translations collected so far are not reset here, so an
     instance is presumably meant to encode a single key only - confirm."

    inputKey := aString asUppercase.
    self performInitialProcessing.
    self processRemainingCharacters.
    ^ Array with:primaryTranslation with:secondaryTranslation

    "Modified (format): / 28-07-2017 / 11:25:02 / cg"
|
2208 | 1276 |
! ! |
1277 |
||
1278 |
!PhoneticStringUtilities::DoubleMetaphoneStringComparator methodsFor:'initialization'! |
|
1279 |
||
1280 |
initialize
    "Set up the state for encoding a key: both translations are empty,
     start and current position are 1, and no character is marked as skipped."

    super initialize.

    primaryTranslation := ''.
    secondaryTranslation := ''.
    startIndex := 1.
    currentIndex := 1.
    skipCount := 0.

    "Modified: / 28-07-2017 / 11:18:44 / cg"
|
2208 | 1290 |
! ! |
1291 |
||
1292 |
!PhoneticStringUtilities::DoubleMetaphoneStringComparator methodsFor:'private'! |
|
1293 |
||
4488 | 1294 |
addPrimaryTranslation:codeString
    "append codeString to the primary translation collected so far"

    primaryTranslation := primaryTranslation , codeString

    "Modified: / 28-07-2017 / 11:19:09 / cg"
|
2208 | 1298 |
! |
1299 |
||
4488 | 1300 |
addSecondaryTranslation:codeString
    "append codeString to the secondary translation collected so far"

    secondaryTranslation := (secondaryTranslation , codeString)

    "Modified: / 28-07-2017 / 11:17:11 / cg"
|
2208 | 1304 |
! |
1305 |
||
1306 |
isSlavoGermanic: aString
    "Answer true if aString contains a W or a K, or one of the letter groups
     CZ or WITZ. The comparison is done against uppercase letters."

    (aString includesAny: 'WK') ifTrue: [ ^ true ].
    (aString indexOfSubCollection: 'CZ' startingAt: 1) > 0 ifTrue: [ ^ true ].
    ^ (aString indexOfSubCollection: 'WITZ' startingAt: 1) > 0

    "Modified: / 09-10-2017 / 17:10:46 / stefan"
|
2208 | 1312 |
! |
1313 |
||
1314 |
keyAt: anInteger
    "Answer the character of inputKey at position anInteger;
     for a position before the first or behind the last character,
     a space is answered instead."

    (anInteger < 1 or: [ anInteger > inputKey size ]) ifTrue: [
        ^ Character space
    ].
    ^ inputKey at: anInteger

    "Modified: / 28-07-2017 / 11:38:30 / cg"
|
2208 | 1321 |
! |
1322 |
||
1323 |
keyLeftString: lengthInteger
    "Answer the first lengthInteger characters of inputKey,
     as delivered by keyMidString:from: for start position 1."

    | leftmostPosition |

    leftmostPosition := 1.
    ^ self keyMidString: lengthInteger from: leftmostPosition
|
1325 |
! |
|
1326 |
||
1327 |
keyMidString: lengthInteger from: fromInteger
    "Answer a string with lengthInteger characters, taken from inputKey starting
     at position fromInteger. Positions before the first or behind the last
     character of inputKey are delivered as spaces, so the answer always has the
     requested size, wherever the requested range lies.

     Every position is fetched via keyAt:, which already answers a space for
     positions outside of inputKey. This needs no separate arithmetic for the
     leading and trailing padding, and also covers ranges which lie completely
     before or behind the key."

    | result |

    result := inputKey species new: (lengthInteger max: 0).
    1 to: result size do: [:offset |
        result at: offset put: (self keyAt: fromInteger + offset - 1)
    ].
    ^ result

    "Modified: / 28-07-2017 / 11:20:43 / cg"
|
2208 | 1356 |
! |
1357 |
||
1358 |
keyRightString: lengthInteger
    "Answer the last lengthInteger characters of inputKey,
     as delivered by keyMidString:from: for the corresponding start position."

    | firstPosition |

    firstPosition := inputKey size - lengthInteger + 1.
    ^ self keyMidString: lengthInteger from: firstPosition

    "Modified: / 28-07-2017 / 11:20:51 / cg"
|
2208 | 1362 |
! |
1363 |
||
1364 |
performInitialProcessing
    "Handle the beginning of inputKey:
     - if the key starts with one of GN, KN, PN, WR or PS, startIndex is advanced by one;
     - if the first character is an X, S is added to both translations
       and startIndex is advanced by one;
     - if the first character is a vowel, A is added to both translations
       and startIndex is advanced by one."

    |firstChar|

    (inputKey size > 1
     and:[ #( 'GN' 'KN' 'PN' 'WR' 'PS' ) includes:(inputKey copyFrom:1 to:2) ]
    ) ifTrue:[
        startIndex := startIndex + 1
    ].

    firstChar := self keyAt:1.
    firstChar = $X ifTrue:[
        self addPrimaryTranslation:'S'.
        self addSecondaryTranslation:'S'.
        startIndex := startIndex + 1
    ].
    firstChar isVowel ifTrue:[
        self addPrimaryTranslation:'A'.
        self addSecondaryTranslation:'A'.
        startIndex := startIndex + 1
    ]

    "Modified: / 01-08-2017 / 19:29:19 / cg"
2208 | 1385 |
! |
1386 |
||
1387 |
processB
    "Encode a B: P is added to both translations.
     If the B is followed by another B, that one is marked as consumed too."

    | followedByB |

    followedByB := (self keyAt: currentIndex + 1) == $B.

    self addPrimaryTranslation: 'P'.
    self addSecondaryTranslation: 'P'.

    followedByB ifTrue: [
        skipCount := skipCount + 1
    ].

    "Modified: / 28-07-2017 / 11:26:03 / cg"
|
2208 | 1397 |
! |
1398 |
||
1399 |
processC
    "Encode the C at currentIndex, following the case C of the Double Metaphone
     reference implementation (http://aspell.sourceforge.net/metaphone/dmetaph.cpp).
     The codes are appended to the primary and secondary translation; skipCount
     is incremented by the number of following characters which are consumed
     together with the C (an increment of 1 corresponds to 'current += 2' in
     the reference, an increment of 2 to 'current += 3').

     Places where care is needed:
     - word start comparisons use keyLeftString:, which pads with spaces;
       a plain copyFrom:to: would run out of bounds for short keys like CH;
     - the CHORE exclusion has to look at 5 characters;
     - in the germanic ACH rule, an E behind the ACH is only accepted for
       BACHER and MACHER;
     - CAESAR applies to the C at the start of the word only;
     - the germanic ACH rule and Pierce's rule consume one following character."

    | nextChar twoAhead |

    nextChar := self keyAt: currentIndex + 1.
    twoAhead := self keyAt: currentIndex + 2.

    "i - various germanic: a consonant followed by ACH"
    ((((currentIndex >= 3
        and: [ (self keyAt: currentIndex - 2) isVowel not ])
        and: [ (self keyMidString: 3 from: currentIndex - 1) = 'ACH' ])
        and: [ twoAhead ~= $I ])
        and: [ twoAhead ~= $E
                or: [ #('BACHER' 'MACHER') includes: (self keyMidString: 6 from: currentIndex - 2) ] ])
        ifTrue: [
            self
                addPrimaryTranslation: 'K';
                addSecondaryTranslation: 'K'.
            "only the H is consumed together with the C"
            skipCount := skipCount + 1.
            ^ self
        ].

    "ii - special case CAESAR (at the start of the word only)"
    (currentIndex = 1 and: [ inputKey beginsWith: 'CAESAR' ])
        ifTrue: [
            self
                addPrimaryTranslation: 'S';
                addSecondaryTranslation: 'S'.
            skipCount := skipCount + 1.
            ^ self
        ].

    "iii - italian, as in chianti"
    (self keyMidString: 4 from: currentIndex) = 'CHIA'
        ifTrue: [
            self
                addPrimaryTranslation: 'K';
                addSecondaryTranslation: 'K'.
            skipCount := skipCount + 1.
            ^ self
        ].

    "iv - CH"
    (self keyMidString: 2 from: currentIndex) = 'CH'
        ifTrue: [
            "a - as in michael"
            (currentIndex > 1
                and: [ (self keyMidString: 4 from: currentIndex) = 'CHAE' ])
                ifTrue: [
                    self
                        addPrimaryTranslation: 'K';
                        addSecondaryTranslation: 'X'.
                    skipCount := skipCount + 1.
                    ^ self
                ].

            "b - greek roots at the start of the word, as in chemistry or chorus
                 (but not chore)"
            (currentIndex = 1
                and: [ ((#('CHARAC' 'CHARIS') includes: (self keyLeftString: 6))
                            or: [ #('CHOR' 'CHYM' 'CHIA' 'CHEM') includes: (self keyLeftString: 4) ])
                        and: [ (self keyLeftString: 5) ~= 'CHORE' ] ])
                ifTrue: [
                    self
                        addPrimaryTranslation: 'K';
                        addSecondaryTranslation: 'K'.
                    skipCount := skipCount + 1.
                    ^ self
                ].

            "c - germanic and greek CH pronounced as K"
            (((((#('VAN ' 'VON ') includes: (self keyLeftString: 4))
                or: [ (self keyLeftString: 3) = 'SCH' ])
                or: [ #('ORCHES' 'ARCHIT' 'ORCHID')
                        includes: (self keyMidString: 6 from: currentIndex - 2) ])
                or: [ #($T $S) includes: twoAhead ])
                or: [ (currentIndex = 1
                        or: [ #($A $O $U $E) includes: (self keyAt: currentIndex - 1) ])
                      and: [ #($L $R $N $M $B $H $F $V $W $ ) includes: twoAhead ] ])
                ifTrue: [
                    self
                        addPrimaryTranslation: 'K';
                        addSecondaryTranslation: 'K'.
                    skipCount := skipCount + 1.
                    ^ self
                ].

            currentIndex > 1
                ifTrue: [
                    "as in McHugh"
                    (inputKey copyFrom: 1 to: 2) = 'MC'
                        ifTrue: [
                            self
                                addPrimaryTranslation: 'K';
                                addSecondaryTranslation: 'K' ]
                        ifFalse: [
                            self
                                addPrimaryTranslation: 'X';
                                addSecondaryTranslation: 'K' ] ]
                ifFalse: [
                    self
                        addPrimaryTranslation: 'X';
                        addSecondaryTranslation: 'X' ].
            skipCount := skipCount + 1.
            ^ self
        ].

    "v - as in czerny"
    nextChar = $Z
        ifTrue: [
            self
                addPrimaryTranslation: 'S';
                addSecondaryTranslation: 'X'.
            skipCount := skipCount + 1.
            ^ self
        ].

    "vi - as in focaccia"
    (self keyMidString: 3 from: currentIndex + 1) = 'CIA'
        ifTrue: [
            self
                addPrimaryTranslation: 'X';
                addSecondaryTranslation: 'X'.
            skipCount := skipCount + 2.
            ^ self
        ].

    "vii - double C, but not in a name starting with MCC"
    (nextChar = $C
        and: [ (currentIndex = 2 and: [ (self keyAt: 1) = $M ]) not ])
        ifTrue: [
            ((#($I $E $H) includes: twoAhead)
                and: [ (self keyMidString: 2 from: currentIndex + 2) ~= 'HU' ])
                ifTrue: [
                    "as in accident, accede, succeed - or as in bacci, bertucci"
                    ((currentIndex = 2 and: [ (self keyAt: 1) = $A ])
                        or: [ #('UCCEE' 'UCCES') includes: (self keyMidString: 5 from: currentIndex - 1) ])
                        ifTrue: [
                            self
                                addPrimaryTranslation: 'KS';
                                addSecondaryTranslation: 'KS' ]
                        ifFalse: [
                            self
                                addPrimaryTranslation: 'X';
                                addSecondaryTranslation: 'X' ].
                    skipCount := skipCount + 2.
                    ^ self
                ].

            "Pierce's rule: only the second C is consumed together with the first"
            self
                addPrimaryTranslation: 'K';
                addSecondaryTranslation: 'K'.
            skipCount := skipCount + 1.
            ^ self
        ].

    "viii"
    (#($K $G $Q) includes: nextChar)
        ifTrue: [
            self
                addPrimaryTranslation: 'K';
                addSecondaryTranslation: 'K'.
            skipCount := skipCount + 1.
            ^ self
        ].

    "ix"
    (#($I $E $Y) includes: nextChar)
        ifTrue: [
            "italian vs. english"
            (#('CIO' 'CIE' 'CIA') includes: (self keyMidString: 3 from: currentIndex))
                ifTrue: [
                    self
                        addPrimaryTranslation: 'S';
                        addSecondaryTranslation: 'X' ]
                ifFalse: [
                    self
                        addPrimaryTranslation: 'S';
                        addSecondaryTranslation: 'S' ].
            skipCount := skipCount + 1.
            ^ self
        ].

    "x - everything else"
    self
        addPrimaryTranslation: 'K';
        addSecondaryTranslation: 'K'.

    "xi - as in mac caffrey, mac gregor"
    (#(' C' ' Q' ' G') includes: (self keyMidString: 2 from: currentIndex + 1))
        ifTrue: [
            skipCount := skipCount + 2 ]
        ifFalse: [
            ((#($C $K $Q) includes: nextChar)
                and: [ (#('CE' 'CI') includes: (self keyMidString: 2 from: currentIndex + 1)) not ])
                ifTrue: [ skipCount := skipCount + 1 ] ]

    "Modified: / 28-07-2017 / 11:29:11 / cg"
|
2208 | 1574 |
! |
1575 |
||
1576 |
processCedille
    "Encode a C with cedilla: S is added to both translations"

    self addPrimaryTranslation: 'S'.
    self addSecondaryTranslation: 'S'
|
1580 |
! |
|
1581 |
||
1582 |
processD
    "Encode a D. The same code is added to both translations:
       DG followed by I, E or Y      -> J  (two more characters consumed)
       DG followed by anything else  -> TK (one more character consumed)
       DT or DD                      -> T  (one more character consumed)
       any other D                   -> T"

    | nextChar code consumed |

    nextChar := self keyAt: currentIndex + 1.
    code := 'T'.
    consumed := 0.

    nextChar = $G
        ifTrue: [
            (#($I $E $Y) includes: (self keyAt: currentIndex + 2))
                ifTrue: [
                    code := 'J'.
                    consumed := 2 ]
                ifFalse: [
                    code := 'TK'.
                    consumed := 1 ] ]
        ifFalse: [
            (#($T $D) includes: nextChar)
                ifTrue: [ consumed := 1 ] ].

    self addPrimaryTranslation: code.
    self addSecondaryTranslation: code.
    skipCount := skipCount + consumed

    "Modified: / 28-07-2017 / 11:27:39 / cg"
|
2208 | 1615 |
! |
1616 |
||
1617 |
processF
    "Encode an F: F is added to both translations.
     If the F is followed by another F, that one is marked as consumed too."

    | followedByF |

    followedByF := (self keyAt: currentIndex + 1) = $F.

    self addPrimaryTranslation: 'F'.
    self addSecondaryTranslation: 'F'.

    followedByF ifTrue: [
        skipCount := skipCount + 1
    ]

    "Modified (format): / 28-07-2017 / 11:29:21 / cg"
|
2208 | 1626 |
! |
1627 |
||
1628 |
processG
    "Translate the G at currentIndex into the primary and secondary keys,
     following the 'case G' branch of Lawrence Philips' Double Metaphone
     reference (http://aspell.sourceforge.net/metaphone/dmetaph.cpp).
     skipCount is incremented by one wherever the reference advances by two
     letters, i.e. where the letter after the G is consumed as well.

     Rules, tried in this order:
       GH   - K/K after a consonant;
              at word start: J/J for GHI-, else K/K;
              silent (Parker's rule) if B, H or D stands two or three letters
              back, or B or H four letters back (hugh, bough, broughton);
              F/F in -UGH preceded by C, G, L, R or T (laugh, cough, tough);
              otherwise K/K unless the preceding letter is I.
       GN   - KN/N at index 2 after a vowel (not slavo-germanic);
              N/KN if not followed by EY (not slavo-germanic); else KN/KN.
       GLI  - KL/L (not slavo-germanic), e.g. tagliaro.
       word-initial GY-, GES-, GEP-, GEB-, GEL-, GEY-, GIB-, GIL-, GIN-,
              GIE-, GEI-, GER-  - K/J.
       -GER-, -GY-  - K/J, except in DANGER/RANGER/MANGER, after E or I,
              and in -RGY / -OGY.
       GE, GI, GY, -AGGI-, -OGGI-  - K/K for obviously germanic input
              (VAN , VON , SCH- or G followed by ET); J/J before IER at the
              end of the word (or followed by a space); else J/K.
       otherwise  - K/K; a doubled G is consumed.

     Every substring probe goes through the local block sliceAt, which
     answers an empty or shortened string instead of indexing outside
     inputKey, so short words no longer raise a subscript error and probes
     of length n really compare n characters."

    | sliceAt nextChar ierProbe |

    sliceAt := [:start :length |
        (start between: 1 and: inputKey size)
            ifTrue: [inputKey copyFrom: start to: (start + length - 1 min: inputKey size)]
            ifFalse: ['']].
    nextChar := self keyAt: currentIndex + 1.

    nextChar = $H ifTrue: [
        "GH after a consonant"
        (currentIndex > 1 and: [(self keyAt: currentIndex - 1) isVowel not]) ifTrue: [
            self addPrimaryTranslation: 'K'; addSecondaryTranslation: 'K'.
            skipCount := skipCount + 1.
            ^ self
        ].
        "word-initial GH: ghislane, ghiradelli"
        currentIndex = 1 ifTrue: [
            (self keyAt: currentIndex + 2) = $I
                ifTrue: [self addPrimaryTranslation: 'J'; addSecondaryTranslation: 'J']
                ifFalse: [self addPrimaryTranslation: 'K'; addSecondaryTranslation: 'K'].
            skipCount := skipCount + 1.
            ^ self
        ].
        "Parker's rule: the GH is silent, nothing is added"
        ((currentIndex > 2 and: [#($B $H $D) includes: (self keyAt: currentIndex - 2)])
            or: [(currentIndex > 3 and: [#($B $H $D) includes: (self keyAt: currentIndex - 3)])
            or: [currentIndex > 4 and: [#($B $H) includes: (self keyAt: currentIndex - 4)]]])
        ifFalse: [
            "laugh, McLaughlin, cough, gough, rough, tough"
            (currentIndex > 3
                and: [(self keyAt: currentIndex - 1) = $U
                and: [#($C $G $L $R $T) includes: (self keyAt: currentIndex - 3)]])
            ifTrue: [
                self addPrimaryTranslation: 'F'; addSecondaryTranslation: 'F'
            ] ifFalse: [
                (currentIndex > 1 and: [(self keyAt: currentIndex - 1) ~= $I]) ifTrue: [
                    self addPrimaryTranslation: 'K'; addSecondaryTranslation: 'K'
                ]
            ]
        ].
        skipCount := skipCount + 1.
        ^ self
    ].

    nextChar = $N ifTrue: [
        (currentIndex = 2
            and: [(inputKey at: 1) isVowel
            and: [(self isSlavoGermanic: inputKey) not]])
        ifTrue: [
            self addPrimaryTranslation: 'KN'; addSecondaryTranslation: 'N'
        ] ifFalse: [
            "not e.g. cagney. The reference also tests that the letter after
             the G is not Y, which is always true here because it is N.
             A word-final GN has nothing after it and so is not EY."
            ((sliceAt value: currentIndex + 2 value: 2) ~= 'EY'
                and: [(self isSlavoGermanic: inputKey) not])
            ifTrue: [
                self addPrimaryTranslation: 'N'; addSecondaryTranslation: 'KN'
            ] ifFalse: [
                self addPrimaryTranslation: 'KN'; addSecondaryTranslation: 'KN'
            ]
        ].
        skipCount := skipCount + 1.
        ^ self
    ].

    "tagliaro"
    ((sliceAt value: currentIndex + 1 value: 2) = 'LI'
        and: [(self isSlavoGermanic: inputKey) not])
    ifTrue: [
        self addPrimaryTranslation: 'KL'; addSecondaryTranslation: 'L'.
        skipCount := skipCount + 1.
        ^ self
    ].

    "-ges-, -gep-, -gel-, -gie- at beginning"
    (currentIndex = 1
        and: [nextChar = $Y
            or: [#('ES' 'EP' 'EB' 'EL' 'EY' 'IB' 'IL' 'IN' 'IE' 'EI' 'ER')
                    includes: (sliceAt value: currentIndex + 1 value: 2)]])
    ifTrue: [
        self addPrimaryTranslation: 'K'; addSecondaryTranslation: 'J'.
        skipCount := skipCount + 1.
        ^ self
    ].

    "-ger-, -gy-"
    (((sliceAt value: currentIndex + 1 value: 2) = 'ER' or: [nextChar = $Y])
        and: [(#('DANGER' 'RANGER' 'MANGER') includes: (sliceAt value: 1 value: 6)) not
        and: [(#($E $I) includes: (self keyAt: currentIndex - 1)) not
        and: [(#('RGY' 'OGY') includes: (sliceAt value: currentIndex - 1 value: 3)) not]]])
    ifTrue: [
        self addPrimaryTranslation: 'K'; addSecondaryTranslation: 'J'.
        skipCount := skipCount + 1.
        ^ self
    ].

    "italian e.g. biaggi"
    ((#($E $I $Y) includes: nextChar)
        or: [#('AGGI' 'OGGI') includes: (sliceAt value: currentIndex - 1 value: 4)])
    ifTrue: [
        "obvious germanic"
        ((#('VAN ' 'VON ') includes: (sliceAt value: 1 value: 4))
            or: [(sliceAt value: 1 value: 3) = 'SCH'
            or: [(sliceAt value: currentIndex + 1 value: 2) = 'ET']])
        ifTrue: [
            self addPrimaryTranslation: 'K'; addSecondaryTranslation: 'K'
        ] ifFalse: [
            "always soft if french ending. A three letter probe can only
             result from clamping, i.e. the IER ends the word."
            ierProbe := sliceAt value: currentIndex + 1 value: 4.
            (ierProbe = 'IER ' or: [ierProbe = 'IER'])
                ifTrue: [self addPrimaryTranslation: 'J'; addSecondaryTranslation: 'J']
                ifFalse: [self addPrimaryTranslation: 'J'; addSecondaryTranslation: 'K']
        ].
        skipCount := skipCount + 1.
        ^ self
    ].

    "plain G; a doubled G counts once"
    nextChar = $G ifTrue: [
        skipCount := skipCount + 1
    ].
    self addPrimaryTranslation: 'K'; addSecondaryTranslation: 'K'.

    "Modified: / 28-07-2017 / 11:31:33 / cg"
|
2208 | 1848 |
! |
1849 |
||
1850 |
processH
    "Translate the H at currentIndex ('case H' of
     http://aspell.sourceforge.net/metaphone/dmetaph.cpp).
     The H is kept (H/H) only when it is followed by a vowel and is either
     the first letter or preceded by a vowel; in that case the following
     vowel is consumed too (skipCount + 1). Any other H adds nothing,
     which also takes care of HH."

    | startsWordOrFollowsVowel |

    startsWordOrFollowsVowel :=
        currentIndex = 1 or: [(self keyAt: currentIndex - 1) isVowel].
    (startsWordOrFollowsVowel and: [(self keyAt: currentIndex + 1) isVowel])
        ifFalse: [^ self].

    self addPrimaryTranslation: 'H'; addSecondaryTranslation: 'H'.
    skipCount := skipCount + 1.

    "Modified: / 28-07-2017 / 11:29:52 / cg"
|
2208 | 1875 |
! |
1876 |
||
1877 |
processJ
    "Translate the J at currentIndex ('case J' of
     http://aspell.sourceforge.net/metaphone/dmetaph.cpp).

     Rules, in order:
       - obvious spanish (JOSE at currentIndex, or input starting with SAN ):
         H/H if the input starts with SAN , or if JOSE starts the input and
         is the whole input or is followed by a space; otherwise J/H.
         Nothing is skipped in this case.
       - word-initial J (not JOSE): J/A  (Yankelovich/Jankelowicz).
       - after a vowel, before A or O, not slavo-germanic: J/H (bajador).
       - J as last letter: J with a blank secondary.
       - otherwise J/J unless followed by one of L T K S N M B Z or
         preceded by one of S K L.
     Finally a doubled J is consumed (skipCount + 1).

     Fix: the space test after JOSE used to read
         self keyAt: currentIndex + 4 = <space>
     which, by binary-before-keyword precedence, passed the Boolean result
     of the comparison as the index. The comparison is now applied to the
     fetched character."

    | currentWord firstWord nextLetter |

    currentWord := inputKey copyFrom: currentIndex to: (currentIndex + 3 min: inputKey size).
    firstWord := inputKey copyFrom: 1 to: (4 min: inputKey size).
    nextLetter := self keyAt: currentIndex + 1.

    (currentWord = 'JOSE' or: [firstWord = 'SAN ']) ifTrue: [
        ((currentIndex = 1
            and: [inputKey size = 4
                or: [inputKey size >= 5
                    and: [(self keyAt: currentIndex + 4) = Character space]]])
          or: [firstWord = 'SAN '])
        ifTrue: [
            self addPrimaryTranslation: 'H'; addSecondaryTranslation: 'H'
        ] ifFalse: [
            self addPrimaryTranslation: 'J'; addSecondaryTranslation: 'H'
        ].
        ^ self
    ].

    (currentIndex = 1 and: [firstWord ~= 'JOSE']) ifTrue: [
        self addPrimaryTranslation: 'J'; addSecondaryTranslation: 'A'
    ] ifFalse: [
        ((currentIndex > 1 and: [(self keyAt: currentIndex - 1) isVowel])
            and: [(self isSlavoGermanic: inputKey) not
            and: [nextLetter == $A or: [nextLetter == $O]]])
        ifTrue: [
            self addPrimaryTranslation: 'J'; addSecondaryTranslation: 'H'
        ] ifFalse: [
            currentIndex = inputKey size ifTrue: [
                self addPrimaryTranslation: 'J'; addSecondaryTranslation: ' '
            ] ifFalse: [
                ((#($L $T $K $S $N $M $B $Z) includes: nextLetter) not
                    and: [(#($S $K $L) includes: (self keyAt: currentIndex - 1)) not])
                ifTrue: [
                    self addPrimaryTranslation: 'J'; addSecondaryTranslation: 'J'
                ]
            ]
        ]
    ].

    "it could happen: JJ"
    nextLetter == $J ifTrue: [
        skipCount := skipCount + 1
    ].

    "Modified: / 28-07-2017 / 11:31:41 / cg"
|
2208 | 1962 |
! |
1963 |
||
1964 |
processK
    "Translate the K at currentIndex ('case K' of
     http://aspell.sourceforge.net/metaphone/dmetaph.cpp):
     a doubled K is consumed as one letter, and K/K is added."

    | isDoubled |

    isDoubled := (self keyAt: currentIndex + 1) = $K.
    isDoubled ifTrue: [skipCount := skipCount + 1].
    self
        addPrimaryTranslation: 'K';
        addSecondaryTranslation: 'K'.

    "Modified: / 28-07-2017 / 11:31:46 / cg"
|
2208 | 1983 |
! |
1984 |
||
1985 |
processL
    "Translate the L at currentIndex ('case L' of the Double Metaphone
     reference, dmetaph.cpp).

     A single L adds L/L. For a doubled L the second L is consumed
     (skipCount + 1) and:
       - spanish forms add L with a blank secondary. These are
         -ILLO / -ILLA / -ALLE ending the word (cabrillo), or ALLE around
         the LL in a word ending in AS, OS, A or O (gallegos);
       - everything else adds L/L.

     Fix: the four letter window around the LL used to be assigned inside
     the first, short-circuited alternative only, so it was still nil when
     the second alternative compared it with ALLE, and that alternative
     could never match. The window is now computed before either test."

    | currentWord |

    (self keyAt: currentIndex + 1) = $L ifTrue: [
        "letters currentIndex - 1 .. currentIndex + 2; empty at word start"
        currentWord := currentIndex > 1
            ifTrue: [inputKey copyFrom: currentIndex - 1 to: (currentIndex + 2 min: inputKey size)]
            ifFalse: [''].

        ((currentIndex = (inputKey size - 2)
            and: [#('ILLO' 'ILLA' 'ALLE') includes: currentWord])
          or: [currentWord = 'ALLE'
            and: [(#('AS' 'OS') includes: (inputKey copyFrom: inputKey size - 1 to: inputKey size))
                or: [#($A $O) includes: (self keyAt: inputKey size)]]])
        ifTrue: [
            self addPrimaryTranslation: 'L'; addSecondaryTranslation: ' '.
            skipCount := skipCount + 1.
            ^ self
        ].
        skipCount := skipCount + 1
    ].
    self addPrimaryTranslation: 'L'; addSecondaryTranslation: 'L'.

    "Modified: / 28-07-2017 / 11:32:03 / cg"
|
2208 | 2025 |
! |
2026 |
||
2027 |
processM
    "Translate the M at currentIndex ('case M' of the Double Metaphone
     reference, dmetaph.cpp). M/M is always added. The following letter is
     consumed as well (skipCount + 1) when
       - the M is part of UMB and the B either ends the word or is
         followed by ER (dumb, thumb, plumber), or
       - the M is doubled.

     Fix: the ER probe used to copy three characters
     (currentIndex + 2 .. currentIndex + 4) and therefore only matched when
     clamping cut it down at the very end of the word. It now compares
     exactly the two letters after the B."

    (((currentIndex > 1
        and: [(inputKey copyFrom: currentIndex - 1 to: (currentIndex + 1 min: inputKey size)) = 'UMB'])
        and: [currentIndex + 1 = inputKey size
            or: [(inputKey
                    copyFrom: (currentIndex + 2 min: inputKey size)
                    to: (currentIndex + 3 min: inputKey size)) = 'ER']])
      or: [(self keyAt: currentIndex + 1) = $M])
    ifTrue: [
        skipCount := skipCount + 1
    ].
    self addPrimaryTranslation: 'M'; addSecondaryTranslation: 'M'.

    "Modified: / 28-07-2017 / 11:32:08 / cg"
|
2208 | 2050 |
! |
2051 |
||
2052 |
processN
    "Translate the N at currentIndex ('case N' of
     http://aspell.sourceforge.net/metaphone/dmetaph.cpp):
     NN is treated as a single N; N/N is added."

    | followedByN |

    followedByN := (self keyAt: currentIndex + 1) = $N.
    followedByN ifTrue: [skipCount := skipCount + 1].
    self
        addPrimaryTranslation: 'N';
        addSecondaryTranslation: 'N'.

    "Modified: / 28-07-2017 / 11:32:14 / cg"
|
2208 | 2072 |
! |
2073 |
||
2074 |
processNtilde
    "Translate an N with tilde (Ñ) at currentIndex: it adds N/N and never
     consumes a following letter (reference: case Ñ, MetaphAdd(N))."

    self addPrimaryTranslation: 'N'.
    self addSecondaryTranslation: 'N'.
|
2082 |
! |
|
2083 |
||
2084 |
processP
    "Translate the P at currentIndex ('case P' of the Double Metaphone
     reference, dmetaph.cpp):
       - PH adds F/F and consumes the H;
       - otherwise P/P is added, and a directly following P or B is
         consumed with it (campbell, raspberry).

     Fix: P/P used to be added only when the next letter was NOT P or B,
     so PP and PB produced nothing at all. The reference adds P in both
     cases and merely advances by two for PP / PB."

    | nextLetter |

    (nextLetter := self keyAt: currentIndex + 1) = $H ifTrue: [
        self addPrimaryTranslation: 'F'; addSecondaryTranslation: 'F'.
        skipCount := skipCount + 1.
        ^ self
    ].
    (#($P $B) includes: nextLetter) ifTrue: [
        skipCount := skipCount + 1
    ].
    self addPrimaryTranslation: 'P'; addSecondaryTranslation: 'P'.

    "Modified: / 28-07-2017 / 11:32:28 / cg"
|
2208 | 2118 |
! |
2119 |
||
2120 |
processQ
    "Translate the Q at currentIndex ('case Q' of
     http://aspell.sourceforge.net/metaphone/dmetaph.cpp):
     QQ is treated as a single Q; K/K is added."

    | followedByQ |

    followedByQ := (self keyAt: currentIndex + 1) = $Q.
    followedByQ ifTrue: [skipCount := skipCount + 1].
    self
        addPrimaryTranslation: 'K';
        addSecondaryTranslation: 'K'.

    "Modified: / 28-07-2017 / 11:32:32 / cg"
|
2208 | 2140 |
! |
2141 |
||
2142 |
processR
    "Translate the R at currentIndex ('case R' of
     http://aspell.sourceforge.net/metaphone/dmetaph.cpp).
     A word-final R after IE in a non slavo-germanic word, not preceded by
     ME or MA (french rogier, but not hochmeier), goes to the secondary key
     only; every other R adds R/R. A doubled R is consumed afterwards."

    | silentInPrimary |

    silentInPrimary :=
        currentIndex = inputKey size
            and: [(self isSlavoGermanic: inputKey) not
            and: [(inputKey
                        copyFrom: ((currentIndex - 2) max: 1)
                        to: ((currentIndex - 1) max: 1)) = 'IE'
            and: [(#('ME' 'MA') includes:
                        (inputKey
                            copyFrom: ((currentIndex - 4) max: 1)
                            to: ((currentIndex - 3) max: 1))) not]]].

    self
        addPrimaryTranslation: (silentInPrimary ifTrue: [''] ifFalse: ['R']);
        addSecondaryTranslation: 'R'.

    (self keyAt: currentIndex + 1) = $R ifTrue: [
        skipCount := skipCount + 1
    ].

    "Modified: / 28-07-2017 / 11:32:37 / cg"
|
2208 | 2180 |
! |
2181 |
||
2182 |
processRemainingCharacters
    "Walk inputKey from startIndex to its end and translate each letter.

     While skipCount is positive the current position has already been
     consumed by an earlier rule: skipCount is decremented and the position
     is ignored. Otherwise
       - translation stops once both keys are longer than 4 characters;
       - currentIndex is set to the position;
       - vowels and Y are ignored here;
       - any other character is dispatched to a process<Letter> method,
         with C-cedilla and N-tilde mapped to processCedille and
         processNtilde.

     Fix: the two accented letters are now built from their code points
     (16rC7, 16rD1) and compared with = , so the dispatch no longer depends
     on how a non-ASCII character literal in this source file is decoded.

     NOTE(review): a character without a matching process<Letter> method
     (digit, blank, punctuation) is still sent via perform: - confirm that
     callers strip such characters or that the resulting
     doesNotUnderstand is handled."

    | cedilla ntilde |

    cedilla := Character value: 16rC7.
    ntilde := Character value: 16rD1.

    startIndex to: inputKey size do: [:i |
        | c methodSelector |

        skipCount = 0 ifTrue: [
            ((primaryTranslation size > 4) and: [secondaryTranslation size > 4])
                ifTrue: [^ self].

            currentIndex := i.
            c := self keyAt: i.

            (c isVowel not and: [c ~= $Y]) ifTrue: [
                methodSelector := c = cedilla
                    ifTrue: [#processCedille]
                    ifFalse: [
                        c = ntilde
                            ifTrue: [#processNtilde]
                            ifFalse: [('process', c asString) asSymbol]].
                self perform: methodSelector
            ]
        ] ifFalse: [
            skipCount := skipCount - 1
        ]
    ]

    "Modified: / 28-07-2017 / 11:24:15 / cg"
|
2208 | 2209 |
! |
2210 |
||
2211 |
processS
    "Translate the S at currentIndex ('case S' of the Double Metaphone
     reference, http://aspell.sourceforge.net/metaphone/dmetaph.cpp).

     Rules, in order:
       - ISL / YSL around the S (island, isle, carlisle): silent.
       - word-initial SUGAR: X/S.
       - SH: S/S before -HEIM, -HOEK, -HOLM, -HOLZ (germanic), else X/X;
         the H is consumed.
       - SIO, SIA, SIAN: S/X (S/S if slavo-germanic); two letters consumed.
       - word-initial SM, SN, SL, SW, or SZ anywhere: S/X; a Z is consumed.
       - SC:
           SCH + OO, ER, EN, UY, ED, EM: X/SK for ER and EN, else SK/SK;
           other SCH: X/S at word start before a non-vowel other than W,
                      else X/X;
           SCI, SCE, SCY: S/S;
           other SC: SK/SK;
         two letters are consumed in every SC case.
       - word-final S after AI or OI (resnais, artois): secondary only.
       - otherwise S/S; a following S or Z is consumed.

     Fix: the germanic SH probe used to copy five characters
     (currentIndex + 1 .. currentIndex + 5) and compare them with four
     letter patterns, so it matched only when clamped at the end of the
     word. It now copies exactly four."

    | nextChar char2 chars char |

    (#('ISL' 'YSL') includes:
        (inputKey copyFrom: (currentIndex - 1 max: 1) to: (currentIndex + 1 min: inputKey size)))
    ifTrue: [
        ^ self
    ].

    (currentIndex = 1 and: [(inputKey copyFrom: 1 to: (5 min: inputKey size)) = 'SUGAR'])
    ifTrue: [
        self addPrimaryTranslation: 'X'; addSecondaryTranslation: 'S'.
        ^ self
    ].

    (inputKey copyFrom: currentIndex to: ((currentIndex + 1) min: inputKey size)) = 'SH'
    ifTrue: [
        "the H exists here, so currentIndex + 1 is a valid start"
        (#('HEIM' 'HOEK' 'HOLM' 'HOLZ') includes:
            (inputKey copyFrom: currentIndex + 1 to: ((currentIndex + 4) min: inputKey size)))
        ifTrue: [
            self addPrimaryTranslation: 'S'; addSecondaryTranslation: 'S'
        ] ifFalse: [
            self addPrimaryTranslation: 'X'; addSecondaryTranslation: 'X'
        ].
        skipCount := skipCount + 1.
        ^ self
    ].

    "italian & armenian"
    ((#('SIO' 'SIA') includes: (inputKey copyFrom: currentIndex to: (currentIndex + 2 min: inputKey size)))
        or: [(inputKey copyFrom: currentIndex to: (currentIndex + 3 min: inputKey size)) = 'SIAN'])
    ifTrue: [
        (self isSlavoGermanic: inputKey) not
        ifTrue: [
            self addPrimaryTranslation: 'S'; addSecondaryTranslation: 'X'
        ] ifFalse: [
            self addPrimaryTranslation: 'S'; addSecondaryTranslation: 'S'
        ].
        skipCount := skipCount + 2.
        ^ self
    ].

    "german & anglicisations (smith/schmidt, snider/schneider); also -sz-.
     nextChar stays nil when the first alternative matched; the letter is
     then one of M N L W and the Z test below is rightly false."
    ((currentIndex = 1 and: [#($M $N $L $W) includes: (self keyAt: currentIndex + 1)])
        or: [(nextChar := self keyAt: currentIndex + 1) = $Z])
    ifTrue: [
        self addPrimaryTranslation: 'S'; addSecondaryTranslation: 'X'.
        nextChar == $Z ifTrue: [
            skipCount := skipCount + 1
        ].
        ^ self
    ].

    ((inputKey copyFrom: currentIndex to: ((currentIndex + 1) min: inputKey size)) = 'SC')
    ifTrue: [
        "Schlesinger's rule"
        (char2 := self keyAt: currentIndex + 2) = $H
        ifTrue: [
            "dutch origin, e.g. school, schooner"
            (#('OO' 'ER' 'EN' 'UY' 'ED' 'EM') includes:
                (chars := inputKey
                            copyFrom: ((currentIndex + 3) min: inputKey size)
                            to: ((currentIndex + 4) min: inputKey size)))
            ifTrue: [
                "schermerhorn, schenker"
                (#('ER' 'EN') includes: chars)
                ifTrue: [
                    self addPrimaryTranslation: 'X'; addSecondaryTranslation: 'SK'
                ] ifFalse: [
                    self addPrimaryTranslation: 'SK'; addSecondaryTranslation: 'SK'
                ]
            ] ifFalse: [
                ((currentIndex = 1
                    and: [(char := inputKey at: 4 ifAbsent: [$b]) isVowel not])
                    and: [char ~= $W])
                ifTrue: [
                    self addPrimaryTranslation: 'X'; addSecondaryTranslation: 'S'
                ] ifFalse: [
                    self addPrimaryTranslation: 'X'; addSecondaryTranslation: 'X'
                ]
            ]
        ] ifFalse: [
            (#($I $E $Y) includes: char2)
            ifTrue: [
                self addPrimaryTranslation: 'S'; addSecondaryTranslation: 'S'
            ] ifFalse: [
                self addPrimaryTranslation: 'SK'; addSecondaryTranslation: 'SK'
            ]
        ].
        skipCount := skipCount + 2.
        ^ self
    ].

    "french e.g. resnais, artois"
    (currentIndex = inputKey size
        and: [#('AI' 'OI') includes:
                (inputKey copyFrom: ((currentIndex - 2) max: 1) to: ((currentIndex - 1) max: 1))])
    ifTrue: [
        self addPrimaryTranslation: ''; addSecondaryTranslation: 'S'
    ] ifFalse: [
        self addPrimaryTranslation: 'S'; addSecondaryTranslation: 'S'
    ].

    (#($S $Z) includes: (self keyAt: currentIndex + 1)) ifTrue: [
        skipCount := skipCount + 1
    ].

    "Modified: / 28-07-2017 / 11:34:18 / cg"
|
2208 | 2422 |
! |
2423 |
||
2424 |
processT
    "Translate the T at currentIndex ('case T' of
     http://aspell.sourceforge.net/metaphone/dmetaph.cpp).

       - TION, TIA, TCH: X/X, two further letters consumed.
       - TH, TTH: T/T for thomas, thames (OM / AM after the TH) and for
         germanic input (VAN , VON , SCH-), otherwise 0/T, where the digit
         zero stands for the th sound; one further letter consumed.
       - otherwise T/T; a following T or D is consumed."

    | twoLetters threeLetters fourLetters hardT |

    twoLetters := inputKey copyFrom: currentIndex to: ((currentIndex + 1) min: inputKey size).
    threeLetters := inputKey copyFrom: currentIndex to: ((currentIndex + 2) min: inputKey size).
    fourLetters := inputKey copyFrom: currentIndex to: ((currentIndex + 3) min: inputKey size).

    (fourLetters = 'TION' or: [#('TIA' 'TCH') includes: threeLetters]) ifTrue: [
        self
            addPrimaryTranslation: 'X';
            addSecondaryTranslation: 'X'.
        skipCount := skipCount + 2.
        ^ self
    ].

    (twoLetters = 'TH' or: [threeLetters = 'TTH']) ifTrue: [
        hardT :=
            (#('OM' 'AM') includes:
                (inputKey copyFrom: currentIndex + 2 to: ((currentIndex + 3) min: inputKey size)))
            or: [(#('VAN ' 'VON ') includes: (inputKey copyFrom: 1 to: (4 min: inputKey size)))
            or: [(inputKey copyFrom: 1 to: (3 min: inputKey size)) = 'SCH']].
        self
            addPrimaryTranslation: (hardT ifTrue: ['T'] ifFalse: ['0']);
            addSecondaryTranslation: 'T'.
        skipCount := skipCount + 1.
        ^ self
    ].

    (#($T $D) includes: (self keyAt: currentIndex + 1)) ifTrue: [
        skipCount := skipCount + 1
    ].
    self
        addPrimaryTranslation: 'T';
        addSecondaryTranslation: 'T'.

    "Modified: / 28-07-2017 / 11:33:33 / cg"
|
2208 | 2504 |
! |
2505 |
||
2506 |
processV
    "Translate the V at currentIndex ('case V' of
     http://aspell.sourceforge.net/metaphone/dmetaph.cpp):
     VV is treated as a single V; F/F is added."

    | followedByV |

    followedByV := (self keyAt: currentIndex + 1) = $V.
    followedByV ifTrue: [skipCount := skipCount + 1].
    self
        addPrimaryTranslation: 'F';
        addSecondaryTranslation: 'F'.

    "Modified: / 28-07-2017 / 11:34:27 / cg"
|
2208 | 2527 |
! |
2528 |
||
2529 |
processW
    "Translate the W at currentIndex ('case W' of the Double Metaphone
     reference, http://aspell.sourceforge.net/metaphone/dmetaph.cpp).

       - WR (also inside a word): R/R, the R is consumed.
       - word-initial W before a vowel: A/F (Wasserman matches Vasserman);
         word-initial WH: A/A (Uomo matches Womo).
         Processing then continues with the rules below.
       - word-final W after a vowel (Arnow matches Arnoff), W inside
         EWSKI, EWSKY, OWSKI, OWSKY, or any W in input starting with SCH:
         F in the secondary key only.
       - WICZ, WITZ (filipowicz): TS/FX, three further letters consumed.
       - any other W adds nothing.

     Fixes:
       - the letter after the W is fetched unconditionally; it used to be
         assigned inside a short-circuited block and was nil for a WH that
         is not at the start of the word, where it was then sent isVowel;
       - the word-start condition is grouped as in the reference:
         start AND (vowel OR WH), not (start AND vowel) OR WH;
       - the WICZ / WITZ probe copies four characters instead of five;
       - the look-behind for a word-final W is skipped for a one letter
         input."

    | word nextLetter |

    word := inputKey copyFrom: currentIndex to: (currentIndex + 1 min: inputKey size).
    word = 'WR' ifTrue: [
        self addPrimaryTranslation: 'R'; addSecondaryTranslation: 'R'.
        skipCount := skipCount + 1.
        ^ self
    ].

    nextLetter := self keyAt: currentIndex + 1.
    (currentIndex = 1 and: [nextLetter isVowel or: [word = 'WH']]) ifTrue: [
        nextLetter isVowel
        ifTrue: [
            self addPrimaryTranslation: 'A'; addSecondaryTranslation: 'F'
        ] ifFalse: [
            self addPrimaryTranslation: 'A'; addSecondaryTranslation: 'A'
        ]
    ].

    (((currentIndex = inputKey size
        and: [currentIndex > 1 and: [(self keyAt: currentIndex - 1) isVowel]])
        or: [#('EWSKI' 'EWSKY' 'OWSKI' 'OWSKY') includes:
                (inputKey copyFrom: ((currentIndex - 1) max: 1) to: (currentIndex + 3 min: inputKey size))])
        or: [inputKey startsWith: 'SCH'])
    ifTrue: [
        self addPrimaryTranslation: ''; addSecondaryTranslation: 'F'.
        ^ self
    ].

    "polish e.g. filipowicz"
    (#('WICZ' 'WITZ') includes:
        (inputKey copyFrom: currentIndex to: (currentIndex + 3 min: inputKey size)))
    ifTrue: [
        self addPrimaryTranslation: 'TS'; addSecondaryTranslation: 'FX'.
        skipCount := skipCount + 3.
        ^ self
    ].

    "Modified: / 28-07-2017 / 11:34:51 / cg"
|
2208 | 2611 |
! |
2612 |
||
2613 |
processX
    "Encode the letter X: 'KS' is added to both translations unless the
     X is the last letter of a french-style ending (-IAUX, -EAUX, -AUX,
     -OUX, e.g. 'breaux'), in which case it is silent.
     If the next letter is C or X, skipCount is incremented by one.

     Fixes relative to the previous version: the letters in front of
     the X are now compared (the copied range used to include the X itself
     and started at 'min: 1' instead of 'max: 1', so the french rule could
     never match and a one-letter key produced an invalid start index).

     Reference: http://aspell.sourceforge.net/metaphone/dmetaph.cpp
        case 'X':
            //french e.g. breaux
            if(!!((current == last)
                AND (StringAt((current - 3), 3, IAU, EAU, )
                 OR StringAt((current - 2), 2, AU, OU, ))) )
                MetaphAdd(KS);

            if(StringAt((current + 1), 1, C, X, ))
                current += 2;
            else
                current += 1;
            break;
    "

    |isSilentFrenchEnding|

    "the up-to-3 (resp. up-to-2) letters immediately before the X;
     for currentIndex = 1 the copied range is empty"
    isSilentFrenchEnding :=
        currentIndex = inputKey size
            and: [(#('IAU' 'EAU') includes:
                        (inputKey copyFrom: (currentIndex - 3 max: 1) to: currentIndex - 1))
                  or: [#('AU' 'OU') includes:
                        (inputKey copyFrom: (currentIndex - 2 max: 1) to: currentIndex - 1)]].

    isSilentFrenchEnding ifFalse: [
        self addPrimaryTranslation: 'KS';
             addSecondaryTranslation: 'KS'.
    ].

    (#($C $X) includes: (self keyAt: currentIndex + 1)) ifTrue: [
        skipCount := skipCount + 1.
    ].

    "Modified: / 28-07-2017 / 11:34:44 / cg"
2208 | 2644 |
! |
2645 |
||
2646 |
processZ
    "Encode the letter Z.
       ZH (pinyin, e.g. 'zhao')  -> 'J' / 'J', one extra letter skipped
       followed by ZO, ZI or ZA, or a slavo-germanic key where the Z is
       not the first letter and is not preceded by T
                                 -> 'S' / 'TS'
       otherwise                 -> 'S' / 'S'
     A doubled ZZ increments skipCount by one.

     Fix relative to the previous version: the preceding letter is now
     compared against the character $T; it used to be compared against
     the string 'T', which is never equal to a character, so the
     'not preceded by T' condition was always true.

     Reference: http://aspell.sourceforge.net/metaphone/dmetaph.cpp
        case 'Z':
            //chinese pinyin e.g. 'zhao'
            if(GetAt(current + 1) == 'H')
            {
                MetaphAdd(J);
                current += 2;
                break;
            }else
                if(StringAt((current + 1), 2, ZO, ZI, ZA, )
                    OR (SlavoGermanic() AND ((current > 0) AND GetAt(current - 1) !!= 'T')))
                {
                    MetaphAdd(S, TS);
                }
                else
                    MetaphAdd(S);

            if(GetAt(current + 1) == 'Z')
                current += 2;
            else
                current += 1;
            break;
    "

    |followingPair lastIndex|

    "chinese pinyin e.g. 'zhao'"
    (self keyAt: currentIndex + 1) = $H ifTrue: [
        self addPrimaryTranslation: 'J';
             addSecondaryTranslation: 'J'.
        skipCount := skipCount + 1.
        ^ self
    ].

    lastIndex := inputKey size.
    followingPair := inputKey
                        copyFrom: (currentIndex + 1 min: lastIndex)
                        to: (currentIndex + 2 min: lastIndex).

    ((#('ZO' 'ZI' 'ZA') includes: followingPair)
        or: [(self isSlavoGermanic: inputKey)
                and: [currentIndex > 1
                and: [(self keyAt: currentIndex - 1) ~= $T]]])
        ifTrue: [
            self addPrimaryTranslation: 'S';
                 addSecondaryTranslation: 'TS'.
        ] ifFalse: [
            self addPrimaryTranslation: 'S';
                 addSecondaryTranslation: 'S'.
        ].

    (self keyAt: currentIndex + 1) = $Z ifTrue: [
        skipCount := skipCount + 1.
    ].

    "Modified: / 28-07-2017 / 11:35:12 / cg"
|
2696 |
! ! |
|
2697 |
||
4491 | 2698 |
!PhoneticStringUtilities::ExtendedSoundexStringComparator class methodsFor:'documentation'! |
2699 |
||
2700 |
documentation |
|
2701 |
" |
|
2702 |
There are many extended and enhanced soundex variants around; |
|
2703 |
here is one, called 'extended soundex'. It is described for example in |
|
2704 |
http://www.epidata.dk/documentation.php. |
|
2705 |
An author or origin is unknown. |
|
2706 |
||
2707 |
The number of digits is increased to 5 or 8; |
|
2708 |
The first character is not used literally; instead it is encoded like the rest. |
|
2709 |
This might have a negative effect on names starting with a vowel, though. |
|
2710 |
||
2711 |
Overall, it can be doubted if this is really an enhancement after all. |
|
2712 |
" |
|
2713 |
! ! |
|
2714 |
||
2715 |
!PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'api'! |
|
2716 |
||
2717 |
phoneticStringsFor:aString
    "generates both an extended soundex of length 5 and one of length 8.
     Answers a two-element Array: the 5-digit code and the 8-digit code.

     Each uppercased character is mapped via translate:. Codes which are
     nil (character without a code), '0' (vowel) or equal to the code of
     the immediately preceding character are not appended. At most 8
     digits are collected; shorter results are padded with '0'.

     Fix relative to the previous version: the first element is now always
     cut to 5 digits; it used to be returned with 6 or 7 digits when the
     input produced that many codes."

    |codes code prevCode|

    codes := ''.
    aString asUppercase do:[:c |
        code := self translate:c.
        (code notNil and:[ code ~= '0' and:[ code ~= prevCode ]]) ifTrue:[
            codes := codes , code.
            codes size == 8 ifTrue:[
                ^ Array with:(codes copyTo:5) with:codes
            ].
        ].
        "remember even nil and '0', so that equal codes separated by
         another character are both kept"
        prevCode := code
    ].

    "pad to the full length; the short code is simply its first 5 digits"
    [ codes size < 8 ] whileTrue:[
        codes := codes , '0'
    ].
    ^ Array with:(codes copyTo:5) with:codes

    "
     self basicNew phoneticStringsFor:'müller'    #('87900' '87900000')
     self basicNew phoneticStringsFor:'miller'    #('87900' '87900000')
     self basicNew phoneticStringsFor:'muller'    #('87900' '87900000')
     self basicNew phoneticStringsFor:'muler'     #('87900' '87900000')
     self basicNew phoneticStringsFor:'schmidt'   #('38600' '38600000')
     self basicNew phoneticStringsFor:'schneider' #('38690' '38690000')
     self basicNew phoneticStringsFor:'fischer'   #('23900' '23900000')
     self basicNew phoneticStringsFor:'weber'     #('19000' '19000000')
     self basicNew phoneticStringsFor:'meyer'     #('89000' '89000000')
     self basicNew phoneticStringsFor:'wagner'    #('48900' '48900000')
     self basicNew phoneticStringsFor:'schulz'    #('37500' '37500000')
     self basicNew phoneticStringsFor:'becker'    #('13900' '13900000')
     self basicNew phoneticStringsFor:'hoffmann'  #('28800' '28800000')
     self basicNew phoneticStringsFor:'schäfer'   #('32900' '32900000')
     self basicNew phoneticStringsFor:'montgomery' #('88648' '88648900')
    "
|
2760 |
! ! |
|
2761 |
||
2762 |
!PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'private'! |
|
2763 |
||
2764 |
translate:aCharacter
    "Answer the one-digit extended soundex code (a String) for the given
     uppercase character, or nil if the character has no code.
     Use simple if's for more speed when compiled.

     A second test for $G (answering '6') used to follow the D test; it
     could never be reached because $G already answers '4' above, and
     has been removed."

    "vowels serve as separators"
    aCharacter == $A ifTrue:[^ '0' ].
    aCharacter == $E ifTrue:[^ '0' ].
    aCharacter == $I ifTrue:[^ '0' ].
    aCharacter == $O ifTrue:[^ '0' ].
    aCharacter == $U ifTrue:[^ '0' ].
    aCharacter == $Y ifTrue:[^ '0' ].

    aCharacter == $B ifTrue:[^ '1' ].
    aCharacter == $P ifTrue:[^ '1' ].

    aCharacter == $F ifTrue:[^ '2' ].
    aCharacter == $V ifTrue:[^ '2' ].

    aCharacter == $C ifTrue:[^ '3' ].
    aCharacter == $S ifTrue:[^ '3' ].
    aCharacter == $K ifTrue:[^ '3' ].

    aCharacter == $G ifTrue:[^ '4' ].
    aCharacter == $J ifTrue:[^ '4' ].

    aCharacter == $Q ifTrue:[^ '5' ].
    aCharacter == $X ifTrue:[^ '5' ].
    aCharacter == $Z ifTrue:[^ '5' ].

    aCharacter == $D ifTrue:[^ '6' ].
    aCharacter == $T ifTrue:[^ '6' ].

    aCharacter == $L ifTrue:[^ '7' ].

    aCharacter == $M ifTrue:[^ '8' ].
    aCharacter == $N ifTrue:[^ '8' ].

    aCharacter == $R ifTrue:[^ '9' ].

    "everything else (H, W, non-letters, lowercase, ...) has no code"
    ^ nil
|
2803 |
! ! |
|
2804 |
||
2805 |
!PhoneticStringUtilities::SingleResultPhoneticStringComparator class methodsFor:'documentation'! |
|
2806 |
||
2807 |
documentation |
|
2808 |
" |
|
2809 |
documentation to be added. |
|
2810 |
||
2811 |
[author:] |
|
2812 |
cg |
|
2813 |
||
2814 |
[instance variables:] |
|
2815 |
||
2816 |
[class variables:] |
|
2817 |
||
2818 |
[see also:] |
|
2819 |
||
2820 |
" |
|
2821 |
! ! |
|
2822 |
||
2823 |
!PhoneticStringUtilities::SingleResultPhoneticStringComparator methodsFor:'api'! |
|
2824 |
||
2825 |
encode:word |
"Answer the phonetic code for word. Abstract here: this implementation only sends subclassResponsibility; phoneticStringsFor: wraps the answer in an Array."

2826 |
^ self subclassResponsibility |
|
2827 |
||
2828 |
"Created: / 28-07-2017 / 15:20:49 / cg" |
|
2829 |
! |
|
2830 |
||
2831 |
phoneticStringsFor:word
    "Answer a one-element Array holding the code which encode: answers
     for the given word."

    |code|

    code := self encode:word.
    ^ Array with:code

    "Created: / 28-07-2017 / 15:20:38 / cg"
|
2835 |
! ! |
|
2836 |
||
2837 |
!PhoneticStringUtilities::MRAStringComparator class methodsFor:'documentation'! |
|
2838 |
||
2839 |
documentation |
|
2840 |
" |
|
2841 |
Match Rating Approach Encoder |
|
2842 |
||
2843 |
The Western Airlines matching rating approach name encoder |
|
2844 |
||
2845 |
[see also:] |
|
2846 |
https://en.wikipedia.org/wiki/Match_Rating_Approach |
|
2847 |
||
2848 |
G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery, |
|
2849 |
''Accessing Individual Records from Personal Data Files Using Nonunique Identifiers'' |
|
2850 |
US National Institute of Standards and Technology, SP-500-2 (1977), p. 17. |
|
2851 |
" |
|
2852 |
! |
|
2853 |
||
2854 |
rCode |
|
2855 |
"<<END |
|
2856 |
## Copyright (c) 2015, James P. Howard, II <jh@jameshoward.us> |
|
2857 |
## |
|
2858 |
## Redistribution and use in source and binary forms, with or without |
|
2859 |
## modification, are permitted provided that the following conditions are |
|
2860 |
## met: |
|
2861 |
## |
|
2862 |
## Redistributions of source code must retain the above copyright |
|
2863 |
## notice, this list of conditions and the following disclaimer. |
|
2864 |
## |
|
2865 |
## Redistributions in binary form must reproduce the above copyright |
|
2866 |
## notice, this list of conditions and the following disclaimer in |
|
2867 |
## the documentation and/or other materials provided with the |
|
2868 |
## distribution. |
|
2869 |
## |
|
2870 |
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
|
2871 |
## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
|
2872 |
## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
|
2873 |
## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
|
2874 |
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|
2875 |
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
|
2876 |
## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
|
2877 |
## DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
|
2878 |
## THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
2879 |
## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
2880 |
## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
2881 |
||
2882 |
#' @rdname mra |
|
2883 |
#' @title Match Rating Approach Encoder |
|
2884 |
#' |
|
2885 |
#' @description |
|
2886 |
#' The Western Airlines matching rating approach name encoder |
|
2887 |
#' |
|
2888 |
#' @param word string or vector of strings to encode |
|
2889 |
#' @param x MRA-encoded character vector |
|
2890 |
#' @param y MRA-encoded character vector |
|
2891 |
#' |
|
2892 |
#' @details |
|
2893 |
#' |
|
2894 |
#' The variable \code{word} is the name to be encoded. The variable |
|
2895 |
#' \code{maxCodeLen} is \emph{not} supported in this algorithm encoder |
|
2896 |
#' because the algorithm itself is dependent upon its six-character |
|
2897 |
#' length. The variables \code{x} and \code{y} are MRA-encoded and are |
|
2898 |
#' compared to each other using the MRA comparison specification. |
|
2899 |
#' |
|
2900 |
#' @return The \code{mra_encode} function returns match rating approach |
|
2901 |
#' encoded character vector. The \code{mra_compare} returns a boolean |
|
2902 |
#' vector which is \code{TRUE} if \code{x} and \code{y} pass the MRA |
|
2903 |
#' comparison test. |
|
2904 |
#' |
|
2905 |
#' @references |
|
2906 |
#' |
|
2907 |
#' G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery, |
|
2908 |
#' \emph{Accessing Individual Records from Personal Data Files Using |
|
2909 |
#' Nonunique Identifiers,} US National Institute of Standards and |
|
2910 |
#' Technology, SP-500-2 (1977), p. 17. |
|
2911 |
#' |
|
2912 |
#' @family phonics |
|
2913 |
#' |
|
2914 |
#' @examples |
|
2915 |
#' mra_encode("William") |
|
2916 |
#' mra_encode(c("Peter", "Peady")) |
|
2917 |
#' mra_encode("Stevenson") |
|
2918 |
||
2919 |
#' @rdname mra |
|
2920 |
#' @name mra_encode |
|
2921 |
#' @export |
|
2922 |
mra_encode <- function(word) { |
|
2923 |
||
2924 |
## First, remove any nonalphabetical characters and uppercase it |
|
2925 |
word <- gsub("[^[:alpha:]]*", "", word) |
|
2926 |
word <- toupper(word) |
|
2927 |
||
2928 |
## First character of key = first character of name |
|
2929 |
first <- substr(word, 1, 1) |
|
2930 |
word <- substr(word, 2, nchar(word)) |
|
2931 |
||
2932 |
## Delete vowels not at the start of the word |
|
2933 |
word <- gsub("[AEIOU]", "", word) |
|
2934 |
word <- paste(first, word, sep = "") |
|
2935 |
||
2936 |
## Remove duplicate consecutive characters |
|
2937 |
word <- gsub("([A-Z])\\1+", "\\1", word) |
|
2938 |
||
2939 |
## If longer than 6 characters, take first and last 3...and we have |
|
2940 |
## to vectorize it |
|
2941 |
for(i in 1:length(word)) { |
|
2942 |
if((l = nchar(word[i])) > 6) { |
|
2943 |
first <- substr(word[i], 1, 3) |
|
2944 |
last <- substr(word[i], l - 2, l) |
|
2945 |
word[i] <- paste(first, last, sep = ""); |
|
2946 |
} |
|
2947 |
} |
|
2948 |
||
2949 |
return(word) |
|
2950 |
} |
|
2951 |
||
2952 |
#' @rdname mra |
|
2953 |
#' @name mra_compare |
|
2954 |
#' @export |
|
2955 |
mra_compare <- function(x, y) { |
|
2956 |
mra <- data.frame(x = x, y = y, sim = 0, min = 100, stringsAsFactors = FALSE) |
|
2957 |
||
2958 |
## Obtain the minimum rating value by calculating the length sum of |
|
2959 |
## the encoded strings and using table A (from Wikipedia). We start |
|
2960 |
## by setting the minimum to be the sum and move from there. |
|
2961 |
mra$lensum <- nchar(mra$x) + nchar(mra$y) |
|
2962 |
mra$min[mra$lensum == 12] <- 2 |
|
2963 |
mra$min[mra$lensum > 7 && mra$lensum <= 11] <- 3 |
|
2964 |
mra$min[mra$lensum > 4 && mra$lensum <= 7] <- 4 |
|
2965 |
mra$min[mra$lensum <= 4] <- 5 |
|
2966 |
||
2967 |
## If the length difference between the encoded strings is 3 or |
|
2968 |
## greater, then no similarity comparison is done. For us, we |
|
2969 |
## continue the similarity comparison out of laziness and ensure the |
|
2970 |
## minimum is impossibly high to meet. |
|
2971 |
mra$min[abs(nchar(mra$x) - nchar(mra$y)) >= 3] <- 100 |
|
2972 |
||
2973 |
## Start the comparison. |
|
2974 |
x <- strsplit(mra$x, split = "") |
|
2975 |
y <- strsplit(mra$y, split = "") |
|
2976 |
rows <- nrow(mra) |
|
2977 |
for(i in 1:rows) { |
|
2978 |
## Process the encoded strings from left to right and remove any |
|
2979 |
## identical characters found from both strings respectively. |
|
2980 |
j <- 1 |
|
2981 |
while(j < min(length(x[[i]]), length(y[[i]]))) { |
|
2982 |
if(x[[i]][j] == y[[i]][j]) { |
|
2983 |
x[[i]] <- x[[i]][-j] |
|
2984 |
y[[i]] <- y[[i]][-j] |
|
2985 |
} else |
|
2986 |
j <- j + 1 |
|
2987 |
} |
|
2988 |
||
2989 |
## Process the unmatched characters from right to left and |
|
2990 |
## remove any identical characters found from both names |
|
2991 |
## respectively. |
|
2992 |
x[[i]] <- rev(x[[i]]) |
|
2993 |
y[[i]] <- rev(y[[i]]) |
|
2994 |
j <- 1 |
|
2995 |
while(j < min(length(x[[i]]), length(y[[i]]))) { |
|
2996 |
if(x[[i]][j] == y[[i]][j]) { |
|
2997 |
x[[i]] <- x[[i]][-j] |
|
2998 |
y[[i]] <- y[[i]][-j] |
|
2999 |
} else |
|
3000 |
j <- j + 1 |
|
3001 |
} |
|
3002 |
## Subtract the number of unmatched characters from 6 in the |
|
3003 |
## longer string. This is the similarity rating. |
|
3004 |
len <- min(length(x[[i]]), length(y[[i]])) |
|
3005 |
mra$sim[i] <- 6 - len |
|
3006 |
} |
|
3007 |
||
3008 |
## If the similarity is greater than or equal to the minimum |
|
3009 |
## required, it is a successful match. |
|
3010 |
mra$match <- (mra$sim >= mra$min) |
|
3011 |
return(mra$match) |
|
3012 |
} |
|
3013 |
||
3014 |
END>> |
|
3015 |
! ! |
|
3016 |
||
3017 |
!PhoneticStringUtilities::MRAStringComparator methodsFor:'api'! |
|
3018 |
||
3019 |
encode:wordIn
    "Answer the Match Rating Approach code for wordIn.
     see https://en.wikipedia.org/wiki/Match_Rating_Approach

     Steps:
       - keep only the letters and convert them to uppercase
       - keep the first letter, drop all vowels after it
       - collapse runs of equal consecutive letters into one
       - if more than 6 letters remain, keep the first 3 and the last 3

     Changes relative to the previous version:
       - an input without any letter now answers an empty result
         instead of failing in 'word first'
       - duplicate letters are detected with = rather than ==, so the
         comparison does not rely on characters being unique objects"

    |word prev|

    "/ First, remove any nonalphabetical characters and uppercase it
    word := wordIn select:#isLetter thenCollect:#asUppercase.
    word isEmpty ifTrue:[
        ^ word
    ].

    "/ Delete vowels not at the start of the word
    word := word first asString , ((word from:2) reject:#isVowel).

    "/ Remove duplicate consecutive characters;
    "/ $* marks a duplicate - it cannot occur otherwise, as only letters are left
    prev := nil.
    word := word
                collect:[:char |
                    char = prev ifTrue:[
                        $*
                    ] ifFalse:[
                        prev := char.
                        char.
                    ].
                ]
                thenSelect:[:char | char ~= $*].

    "/ If longer than 6 characters, take first and last 3
    word size > 6 ifTrue:[
        word := (word copyFirst:3),(word copyLast:3)
    ].
    ^ word.

    "
     self new encode:'Catherine'          -> 'CTHRN'
     self new encode:'CatherineCatherine' -> 'CTHHRN'
     self new encode:'Butter'             -> 'BTR'
     self new encode:'Byrne'              -> 'BYRN'
     self new encode:'Boern'              -> 'BRN'
     self new encode:'Smith'              -> 'SMTH'
     self new encode:'Smyth'              -> 'SMYTH'
     self new encode:'Kathryn'            -> 'KTHRYN'
     self new encode:''                   -> ''
     self new encode:'123'                -> ''
    "

    "Created: / 28-07-2017 / 15:19:22 / cg"
    "Modified (comment): / 31-07-2017 / 15:14:31 / cg"
|
3067 |
! ! |
|
3068 |
||
3069 |
!PhoneticStringUtilities::MetaphoneStringComparator class methodsFor:'documentation'! |
|
3070 |
||
3071 |
documentation |
|
3072 |
" |
|
4495 | 3073 |
Ongoing work - do not use at the moment |
3074 |
||
4491 | 3075 |
Encodes a string into a Metaphone value. |
3076 |
||
3077 |
Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. |
|
3078 |
Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. |
|
3079 |
||
3080 |
Hanging on the Metaphone by Lawrence Philips in Computer Language of Dec. 1990, p 39. |
|
3081 |
Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations: |
|
3082 |
https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm |
|
3083 |
||
3084 |
They have had undocumented changes from the originally published algorithm. |
|
3085 |
For more information, see https://issues.apache.org/jira/browse/CODEC-57 |
|
3086 |
||
3087 |
Metaphone uses the following rules: |
|
3088 |
||
3089 |
Doubled letters except 'c' -> drop 2nd letter. |
|
3090 |
Vowels are only kept when they are the first letter. |
|
3091 |
B -> B unless at the end of a word after 'm' as in 'dumb' |
|
3092 |
C -> X (sh) if -cia- or -ch- |
|
3093 |
S if -ci-, -ce- or -cy- |
|
3094 |
K otherwise, including -sch- |
|
3095 |
D -> J if in -dge-, -dgy- or -dgi-; T otherwise |
|
3096 |
F -> F |
|
3097 |
G -> silent if in -gh- and not at end or before a vowel in -gn- or -gned- (also see dge etc. above) |
|
3098 |
J if before i or e or y if not double gg; K otherwise |
|
3099 |
H -> silent if after vowel and no vowel follows; H otherwise |
|
3100 |
J -> J |
|
3101 |
K -> silent if after 'c'; K otherwise |
|
3102 |
L -> L |
|
3103 |
M -> M |
|
3104 |
N -> N |
|
3105 |
P -> F if before 'h'; P otherwise |
|
3106 |
Q -> K |
|
3107 |
R -> R |
|
3108 |
S -> X (sh) if before 'h' or in -sio- or -sia-; S otherwise |
|
3109 |
T -> X (sh) if -tia- or -tio- 0 (th) if before 'h' silent if in -tch-; T otherwise |
|
3110 |
V -> F |
|
3111 |
W -> silent if not followed by a vowel W if followed by a vowel |
|
3112 |
X -> KS |
|
3113 |
Y -> silent if not followed by a vowel Y if followed by a vowel |
|
3114 |
Z -> S |
|
3115 |
||
3116 |
Initial Letter Exceptions |
|
3117 |
||
3118 |
Initial kn-, gn- pn, ae- or wr- -> drop first letter |
|
3119 |
Initial x- -> change to 's' |
|
3120 |
Initial wh- -> change to 'w' |
|
3121 |
||
3122 |
||
3123 |
self new encode:'a' |
|
3124 |
self new encode:'dumb' |
|
3125 |
self new encode:'MILLER' |
|
3126 |
self new encode:'schmidt' |
|
3127 |
self new encode:'schneider' |
|
3128 |
self new encode:'FISCHER' |
|
3129 |
self new encode:'HEDGY' |
|
3130 |
self new encode:'weber' |
|
3131 |
self new encode:'wagner' |
|
3132 |
self new encode:'van gogh' |
|
3133 |
" |
|
3134 |
! |
|
3135 |
||
3136 |
javaCode |
|
3137 |
"<<END |
|
3138 |
/* |
|
3139 |
* Licensed to the Apache Software Foundation (ASF) under one or more |
|
3140 |
* contributor license agreements. See the NOTICE file distributed with |
|
3141 |
* this work for additional information regarding copyright ownership. |
|
3142 |
* The ASF licenses this file to You under the Apache License, Version 2.0 |
|
3143 |
* (the "License"); you may not use this file except in compliance with |
|
3144 |
* the License. You may obtain a copy of the License at |
|
3145 |
* |
|
3146 |
* http://www.apache.org/licenses/LICENSE-2.0 |
|
3147 |
* |
|
3148 |
* Unless required by applicable law or agreed to in writing, software |
|
3149 |
* distributed under the License is distributed on an "AS IS" BASIS, |
|
3150 |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
3151 |
* See the License for the specific language governing permissions and |
|
3152 |
* limitations under the License. |
|
3153 |
*/ |
|
3154 |
||
3155 |
package org.apache.commons.codec.language; |
|
3156 |
||
3157 |
import org.apache.commons.codec.EncoderException; |
|
3158 |
import org.apache.commons.codec.StringEncoder; |
|
3159 |
||
3160 |
/** |
|
3161 |
* Encodes a string into a Metaphone value. |
|
3162 |
* <p> |
|
3163 |
* Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. |
|
3164 |
* Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. |
|
3165 |
* <p> |
|
3166 |
* <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, |
|
3167 |
* p 39.</CITE> |
|
3168 |
* <p> |
|
3169 |
* Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations: |
|
3170 |
* </p> |
|
3171 |
* <ul> |
|
3172 |
* <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a> |
|
3173 |
* (broken link 4/30/2013) </li> |
|
3174 |
* <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a> |
|
3175 |
* (link checked 4/30/2013) </li> |
|
3176 |
* </ul> |
|
3177 |
* <p> |
|
3178 |
* They have had undocumented changes from the originally published algorithm. |
|
3179 |
* For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>. |
|
3180 |
* <p> |
|
3181 |
* This class is conditionally thread-safe. |
|
3182 |
* The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)} |
|
3183 |
* but is not volatile, and accesses are not synchronized. |
|
3184 |
* If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization |
|
3185 |
* is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} |
|
3186 |
* after initial setup. |
|
3187 |
* |
|
3188 |
* @version $Id$ |
|
3189 |
*/ |
|
3190 |
public class Metaphone implements StringEncoder { |
|
3191 |
||
3192 |
/** |
|
3193 |
* Five values in the English language |
|
3194 |
*/ |
|
3195 |
private static final String VOWELS = "AEIOU"; |
|
3196 |
||
3197 |
/** |
|
3198 |
* Variable used in Metaphone algorithm |
|
3199 |
*/ |
|
3200 |
private static final String FRONTV = "EIY"; |
|
3201 |
||
3202 |
/** |
|
3203 |
* Variable used in Metaphone algorithm |
|
3204 |
*/ |
|
3205 |
private static final String VARSON = "CSPTG"; |
|
3206 |
||
3207 |
/** |
|
3208 |
* The max code length for metaphone is 4 |
|
3209 |
*/ |
|
3210 |
private int maxCodeLen = 4; |
|
3211 |
||
3212 |
/** |
|
3213 |
* Creates an instance of the Metaphone encoder |
|
3214 |
*/ |
|
3215 |
public Metaphone() { |
|
3216 |
super(); |
|
3217 |
} |
|
3218 |
||
3219 |
/** |
|
3220 |
* Find the metaphone value of a String. This is similar to the |
|
3221 |
* soundex algorithm, but better at finding similar sounding words. |
|
3222 |
* All input is converted to upper case. |
|
3223 |
* Limitations: Input format is expected to be a single ASCII word |
|
3224 |
* with only characters in the A - Z range, no punctuation or numbers. |
|
3225 |
* |
|
3226 |
* @param txt String to find the metaphone code for |
|
3227 |
* @return A metaphone code corresponding to the String supplied |
|
3228 |
*/ |
|
3229 |
public String metaphone(final String txt) { |
|
3230 |
boolean hard = false; |
|
3231 |
int txtLength; |
|
3232 |
if (txt == null || (txtLength = txt.length()) == 0) { |
|
3233 |
return ""; |
|
3234 |
} |
|
3235 |
// single character is itself |
|
3236 |
if (txtLength == 1) { |
|
3237 |
return txt.toUpperCase(java.util.Locale.ENGLISH); |
|
3238 |
} |
|
3239 |
||
3240 |
final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray(); |
|
3241 |
||
3242 |
final StringBuilder local = new StringBuilder(40); // manipulate |
|
3243 |
final StringBuilder code = new StringBuilder(10); // output |
|
3244 |
// handle initial 2 characters exceptions |
|
3245 |
switch(inwd[0]) { |
|
3246 |
case 'K': |
|
3247 |
case 'G': |
|
3248 |
case 'P': /* looking for KN, etc*/ |
|
3249 |
if (inwd[1] == 'N') { |
|
3250 |
local.append(inwd, 1, inwd.length - 1); |
|
3251 |
} else { |
|
3252 |
local.append(inwd); |
|
3253 |
} |
|
3254 |
break; |
|
3255 |
case 'A': /* looking for AE */ |
|
3256 |
if (inwd[1] == 'E') { |
|
3257 |
local.append(inwd, 1, inwd.length - 1); |
|
3258 |
} else { |
|
3259 |
local.append(inwd); |
|
3260 |
} |
|
3261 |
break; |
|
3262 |
case 'W': /* looking for WR or WH */ |
|
3263 |
if (inwd[1] == 'R') { // WR -> R |
|
3264 |
local.append(inwd, 1, inwd.length - 1); |
|
3265 |
break; |
|
3266 |
} |
|
3267 |
if (inwd[1] == 'H') { |
|
3268 |
local.append(inwd, 1, inwd.length - 1); |
|
3269 |
local.setCharAt(0, 'W'); // WH -> W |
|
3270 |
} else { |
|
3271 |
local.append(inwd); |
|
3272 |
} |
|
3273 |
break; |
|
3274 |
case 'X': /* initial X becomes S */ |
|
3275 |
inwd[0] = 'S'; |
|
3276 |
local.append(inwd); |
|
3277 |
break; |
|
3278 |
default: |
|
3279 |
local.append(inwd); |
|
3280 |
} // now local has working string with initials fixed |
|
3281 |
||
3282 |
final int wdsz = local.length(); |
|
3283 |
int n = 0; |
|
3284 |
||
3285 |
while (code.length() < this.getMaxCodeLen() && |
|
3286 |
n < wdsz ) { // max code size of 4 works well |
|
3287 |
final char symb = local.charAt(n); |
|
3288 |
// remove duplicate letters except C |
|
3289 |
if (symb !!= 'C' && isPreviousChar( local, n, symb ) ) { |
|
3290 |
n++; |
|
3291 |
} else { // not dup |
|
3292 |
switch(symb) { |
|
3293 |
case 'A': |
|
3294 |
case 'E': |
|
3295 |
case 'I': |
|
3296 |
case 'O': |
|
3297 |
case 'U': |
|
3298 |
if (n == 0) { |
|
3299 |
code.append(symb); |
|
3300 |
} |
|
3301 |
break; // only use vowel if leading char |
|
3302 |
case 'B': |
|
3303 |
if ( isPreviousChar(local, n, 'M') && |
|
3304 |
isLastChar(wdsz, n) ) { // B is silent if word ends in MB |
|
3305 |
break; |
|
3306 |
} |
|
3307 |
code.append(symb); |
|
3308 |
break; |
|
3309 |
case 'C': // lots of C special cases |
|
3310 |
/* discard if SCI, SCE or SCY */ |
|
3311 |
if ( isPreviousChar(local, n, 'S') && |
|
3312 |
!!isLastChar(wdsz, n) && |
|
3313 |
FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) { |
|
3314 |
break; |
|
3315 |
} |
|
3316 |
if (regionMatch(local, n, "CIA")) { // "CIA" -> X |
|
3317 |
code.append('X'); |
|
3318 |
break; |
|
3319 |
} |
|
3320 |
if (!!isLastChar(wdsz, n) && |
|
3321 |
FRONTV.indexOf(local.charAt(n + 1)) >= 0) { |
|
3322 |
code.append('S'); |
|
3323 |
break; // CI,CE,CY -> S |
|
3324 |
} |
|
3325 |
if (isPreviousChar(local, n, 'S') && |
|
3326 |
isNextChar(local, n, 'H') ) { // SCH->sk |
|
3327 |
code.append('K'); |
|
3328 |
break; |
|
3329 |
} |
|
3330 |
if (isNextChar(local, n, 'H')) { // detect CH |
|
3331 |
if (n == 0 && |
|
3332 |
wdsz >= 3 && |
|
3333 |
isVowel(local,2) ) { // CH consonant -> K consonant |
|
3334 |
code.append('K'); |
|
3335 |
} else { |
|
3336 |
code.append('X'); // CHvowel -> X |
|
3337 |
} |
|
3338 |
} else { |
|
3339 |
code.append('K'); |
|
3340 |
} |
|
3341 |
break; |
|
3342 |
case 'D': |
|
3343 |
if (!!isLastChar(wdsz, n + 1) && |
|
3344 |
isNextChar(local, n, 'G') && |
|
3345 |
FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J |
|
3346 |
code.append('J'); n += 2; |
|
3347 |
} else { |
|
3348 |
code.append('T'); |
|
3349 |
} |
|
3350 |
break; |
|
3351 |
case 'G': // GH silent at end or before consonant |
|
3352 |
if (isLastChar(wdsz, n + 1) && |
|
3353 |
isNextChar(local, n, 'H')) { |
|
3354 |
break; |
|
3355 |
} |
|
3356 |
if (!!isLastChar(wdsz, n + 1) && |
|
3357 |
isNextChar(local,n,'H') && |
|
3358 |
!!isVowel(local,n+2)) { |
|
3359 |
break; |
|
3360 |
} |
|
3361 |
if (n > 0 && |
|
3362 |
( regionMatch(local, n, "GN") || |
|
3363 |
regionMatch(local, n, "GNED") ) ) { |
|
3364 |
break; // silent G |
|
3365 |
} |
|
3366 |
if (isPreviousChar(local, n, 'G')) { |
|
3367 |
// NOTE: Given that duplicated chars are removed, I don't see how this can ever be true |
|
3368 |
hard = true; |
|
3369 |
} else { |
|
3370 |
hard = false; |
|
3371 |
} |
|
3372 |
if (!!isLastChar(wdsz, n) && |
|
3373 |
FRONTV.indexOf(local.charAt(n + 1)) >= 0 && |
|
3374 |
!!hard) { |
|
3375 |
code.append('J'); |
|
3376 |
} else { |
|
3377 |
code.append('K'); |
|
3378 |
} |
|
3379 |
break; |
|
3380 |
case 'H': |
|
3381 |
if (isLastChar(wdsz, n)) { |
|
3382 |
break; // terminal H |
|
3383 |
} |
|
3384 |
if (n > 0 && |
|
3385 |
VARSON.indexOf(local.charAt(n - 1)) >= 0) { |
|
3386 |
break; |
|
3387 |
} |
|
3388 |
if (isVowel(local,n+1)) { |
|
3389 |
code.append('H'); // Hvowel |
|
3390 |
} |
|
3391 |
break; |
|
3392 |
case 'F': |
|
3393 |
case 'J': |
|
3394 |
case 'L': |
|
3395 |
case 'M': |
|
3396 |
case 'N': |
|
3397 |
case 'R': |
|
3398 |
code.append(symb); |
|
3399 |
break; |
|
3400 |
case 'K': |
|
3401 |
if (n > 0) { // not initial |
|
3402 |
if (!!isPreviousChar(local, n, 'C')) { |
|
3403 |
code.append(symb); |
|
3404 |
} |
|
3405 |
} else { |
|
3406 |
code.append(symb); // initial K |
|
3407 |
} |
|
3408 |
break; |
|
3409 |
case 'P': |
|
3410 |
if (isNextChar(local,n,'H')) { |
|
3411 |
// PH -> F |
|
3412 |
code.append('F'); |
|
3413 |
} else { |
|
3414 |
code.append(symb); |
|
3415 |
} |
|
3416 |
break; |
|
3417 |
case 'Q': |
|
3418 |
code.append('K'); |
|
3419 |
break; |
|
3420 |
case 'S': |
|
3421 |
if (regionMatch(local,n,"SH") || |
|
3422 |
regionMatch(local,n,"SIO") || |
|
3423 |
regionMatch(local,n,"SIA")) { |
|
3424 |
code.append('X'); |
|
3425 |
} else { |
|
3426 |
code.append('S'); |
|
3427 |
} |
|
3428 |
break; |
|
3429 |
case 'T': |
|
3430 |
if (regionMatch(local,n,"TIA") || |
|
3431 |
regionMatch(local,n,"TIO")) { |
|
3432 |
code.append('X'); |
|
3433 |
break; |
|
3434 |
} |
|
3435 |
if (regionMatch(local,n,"TCH")) { |
|
3436 |
// Silent if in "TCH" |
|
3437 |
break; |
|
3438 |
} |
|
3439 |
// substitute numeral 0 for TH (resembles theta after all) |
|
3440 |
if (regionMatch(local,n,"TH")) { |
|
3441 |
code.append('0'); |
|
3442 |
} else { |
|
3443 |
code.append('T'); |
|
3444 |
} |
|
3445 |
break; |
|
3446 |
case 'V': |
|
3447 |
code.append('F'); break; |
|
3448 |
case 'W': |
|
3449 |
case 'Y': // silent if not followed by vowel |
|
3450 |
if (!!isLastChar(wdsz,n) && |
|
3451 |
isVowel(local,n+1)) { |
|
3452 |
code.append(symb); |
|
3453 |
} |
|
3454 |
break; |
|
3455 |
case 'X': |
|
3456 |
code.append('K'); |
|
3457 |
code.append('S'); |
|
3458 |
break; |
|
3459 |
case 'Z': |
|
3460 |
code.append('S'); |
|
3461 |
break; |
|
3462 |
default: |
|
3463 |
// do nothing |
|
3464 |
break; |
|
3465 |
} // end switch |
|
3466 |
n++; |
|
3467 |
} // end else from symb !!= 'C' |
|
3468 |
if (code.length() > this.getMaxCodeLen()) { |
|
3469 |
code.setLength(this.getMaxCodeLen()); |
|
3470 |
} |
|
3471 |
} |
|
3472 |
return code.toString(); |
|
3473 |
} |
|
3474 |
||
3475 |
private boolean isVowel(final StringBuilder string, final int index) { |
|
3476 |
return VOWELS.indexOf(string.charAt(index)) >= 0; |
|
3477 |
} |
|
3478 |
||
3479 |
private boolean isPreviousChar(final StringBuilder string, final int index, final char c) { |
|
3480 |
boolean matches = false; |
|
3481 |
if( index > 0 && |
|
3482 |
index < string.length() ) { |
|
3483 |
matches = string.charAt(index - 1) == c; |
|
3484 |
} |
|
3485 |
return matches; |
|
3486 |
} |
|
3487 |
||
3488 |
private boolean isNextChar(final StringBuilder string, final int index, final char c) { |
|
3489 |
boolean matches = false; |
|
3490 |
if( index >= 0 && |
|
3491 |
index < string.length() - 1 ) { |
|
3492 |
matches = string.charAt(index + 1) == c; |
|
3493 |
} |
|
3494 |
return matches; |
|
3495 |
} |
|
3496 |
||
3497 |
private boolean regionMatch(final StringBuilder string, final int index, final String test) { |
|
3498 |
boolean matches = false; |
|
3499 |
if( index >= 0 && |
|
3500 |
index + test.length() - 1 < string.length() ) { |
|
3501 |
final String substring = string.substring( index, index + test.length()); |
|
3502 |
matches = substring.equals( test ); |
|
3503 |
} |
|
3504 |
return matches; |
|
3505 |
} |
|
3506 |
||
3507 |
private boolean isLastChar(final int wdsz, final int n) { |
|
3508 |
return n + 1 == wdsz; |
|
3509 |
} |
|
3510 |
||
3511 |
||
3512 |
/** |
|
3513 |
* Encodes an Object using the metaphone algorithm. This method |
|
3514 |
* is provided in order to satisfy the requirements of the |
|
3515 |
* Encoder interface, and will throw an EncoderException if the |
|
3516 |
* supplied object is not of type java.lang.String. |
|
3517 |
* |
|
3518 |
* @param obj Object to encode |
|
3519 |
* @return An object (of type java.lang.String) containing the |
|
3520 |
* metaphone code which corresponds to the String supplied. |
|
3521 |
* @throws EncoderException if the parameter supplied is not |
|
3522 |
* of type java.lang.String |
|
3523 |
*/ |
|
3524 |
@Override |
|
3525 |
public Object encode(final Object obj) throws EncoderException { |
|
3526 |
if (!!(obj instanceof String)) { |
|
3527 |
throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); |
|
3528 |
} |
|
3529 |
return metaphone((String) obj); |
|
3530 |
} |
|
3531 |
||
3532 |
/** |
|
3533 |
* Encodes a String using the Metaphone algorithm. |
|
3534 |
* |
|
3535 |
* @param str String object to encode |
|
3536 |
* @return The metaphone code corresponding to the String supplied |
|
3537 |
*/ |
|
3538 |
@Override |
|
3539 |
public String encode(final String str) { |
|
3540 |
return metaphone(str); |
|
3541 |
} |
|
3542 |
||
3543 |
/** |
|
3544 |
* Tests if the metaphones of two strings are identical. |
|
3545 |
* |
|
3546 |
* @param str1 First of two strings to compare |
|
3547 |
* @param str2 Second of two strings to compare |
|
3548 |
* @return <code>true</code> if the metaphones of these strings are identical, |
|
3549 |
* <code>false</code> otherwise. |
|
3550 |
*/ |
|
3551 |
public boolean isMetaphoneEqual(final String str1, final String str2) { |
|
3552 |
return metaphone(str1).equals(metaphone(str2)); |
|
3553 |
} |
|
3554 |
||
3555 |
/** |
|
3556 |
* Returns the maxCodeLen. |
|
3557 |
* @return int |
|
3558 |
*/ |
|
3559 |
public int getMaxCodeLen() { return this.maxCodeLen; } |
|
3560 |
||
3561 |
/** |
|
3562 |
* Sets the maxCodeLen. |
|
3563 |
* @param maxCodeLen The maxCodeLen to set |
|
3564 |
*/ |
|
3565 |
public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; } |
|
3566 |
||
3567 |
} |
|
3568 |
END>>" |
|
3569 |
! ! |
|
3570 |
||
3571 |
!PhoneticStringUtilities::MetaphoneStringComparator methodsFor:'api'! |
|
3572 |
||
3573 |
encode:txt
    "Answer the metaphone key of txt as a string of uppercase code characters.
     An empty argument yields an empty string; a single character is answered
     uppercased and otherwise unchanged.
     Vowels are coded only when they are the first letter of the word;
     consonants are coded by the per-letter rules annotated below.
     The key is not truncated to any maximum length."

    "
     self new encode:'a'
     self new encode:'MILLER'
     self new encode:'schmidt'
     self new encode:'schneider'
     self new encode:'FISCHER'
     self new encode:'HEDGY'
     self new encode:'weber'
     self new encode:'wagner'
     self new encode:'van gogh'
     self new encode:'dumb'
    "

    |upper word out len pos first second|

    txt size == 0 ifTrue:[^ ''].
    upper := txt asUppercase.
    "/ a single character is its own code
    txt size == 1 ifTrue:[^ upper].

    out := '' writeStream.
    word := upper.
    first := upper at:1.
    second := upper at:2.

    "/ word-initial exceptions:
    "/   KN, GN, PN, AE and WR lose their first letter
    (((second == $N) and:[ 'KGP' includes:first ])
      or:[ ((first == $A) and:[ second == $E ])
      or:[ (first == $W) and:[ second == $R ] ]]) ifTrue:[
        word := upper from:2
    ].
    "/   WH becomes W
    ((first == $W) and:[ second == $H ]) ifTrue:[
        word := 'W' , (upper from:3)
    ].
    "/   X becomes S
    (first == $X) ifTrue:[
        word := 'S' , (upper from:2)
    ].

    "/ word now holds the working string with its beginning fixed
    len := word size.
    pos := 1.

    [ pos <= len ] whileTrue:[
        |letter before after afterNext atEnd frontFollows vowelFollows vowelAfterNext|

        "/ the neighbours are nil where the word has no such position
        letter := word at:pos.
        before := pos > 1 ifTrue:[ word at:pos - 1 ] ifFalse:[ nil ].
        atEnd := pos == len.
        after := atEnd ifTrue:[ nil ] ifFalse:[ word at:pos + 1 ].
        afterNext := (pos + 2) <= len ifTrue:[ word at:pos + 2 ] ifFalse:[ nil ].
        frontFollows := 'EIY' includes:after.
        vowelFollows := 'AEIOU' includes:after.
        vowelAfterNext := 'AEIOU' includes:afterNext.

        "/ a letter which is immediately followed by itself is skipped;
        "/ C and the first letter of the word are always looked at
        ((letter ~~ $C) and:[ (after == letter) and:[ pos > 1 ]]) ifFalse:[

            "/ the letter groups below are disjoint, so at most one statement fires

            "/ vowels: coded only as the leading letter
            (('AEIOU' includes:letter) and:[ pos == 1 ]) ifTrue:[
                out nextPut:letter
            ].

            "/ F J L M N R: coded as themselves
            ('FJLMNR' includes:letter) ifTrue:[
                out nextPut:letter
            ].

            "/ B: silent if the word ends in MB
            ((letter == $B) and:[ (atEnd and:[ before == $M ]) not ]) ifTrue:[
                out nextPut:$B
            ].

            "/ C: dropped in SCE, SCI, SCY;
            "/    CIA -> X; CE, CI, CY -> S;
            "/    CH -> K after S or if a vowel follows the H, otherwise X;
            "/    any other C -> K
            ((letter == $C) and:[ ((before == $S) and:[ frontFollows ]) not ]) ifTrue:[
                out nextPut:(
                    ((after == $I) and:[ afterNext == $A ])
                        ifTrue:[ $X ]
                        ifFalse:[
                            frontFollows
                                ifTrue:[ $S ]
                                ifFalse:[
                                    ((after == $H) and:[ (before ~~ $S) and:[ vowelAfterNext not ]])
                                        ifTrue:[ $X ]
                                        ifFalse:[ $K ] ] ])
            ].

            "/ G: silent in GH unless a vowel follows the H;
            "/    silent in a non-initial GN;
            "/    J before E, I, Y unless the preceding letter is a G too; otherwise K
            (letter == $G) ifTrue:[
                (((after == $H) and:[ vowelAfterNext not ])
                  or:[ (pos > 1) and:[ after == $N ] ]) ifFalse:[
                    out nextPut:(((before ~~ $G) and:[ frontFollows ])
                                    ifTrue:[ $J ]
                                    ifFalse:[ $K ])
                ]
            ].

            "/ H: coded only if a vowel follows and the preceding letter
            "/    is none of C S P T G (those combinations are handled at that letter)
            ((letter == $H) and:[ vowelFollows and:[ ('CSPTG' includes:before) not ]]) ifTrue:[
                out nextPut:$H
            ].

            "/ K: silent after C (before is nil for an initial K)
            ((letter == $K) and:[ before ~~ $C ]) ifTrue:[
                out nextPut:$K
            ].

            "/ P: PH -> F, otherwise P
            (letter == $P) ifTrue:[
                out nextPut:((after == $H) ifTrue:[ $F ] ifFalse:[ $P ])
            ].

            "/ Q -> K, V -> F, Z -> S, X -> KS
            (letter == $Q) ifTrue:[ out nextPut:$K ].
            (letter == $V) ifTrue:[ out nextPut:$F ].
            (letter == $Z) ifTrue:[ out nextPut:$S ].
            (letter == $X) ifTrue:[ out nextPutAll:'KS' ].

            "/ S: SH, SIO, SIA -> X, otherwise S
            (letter == $S) ifTrue:[
                out nextPut:(((after == $H)
                                or:[ (after == $I) and:[ 'OA' includes:afterNext ] ])
                                    ifTrue:[ $X ]
                                    ifFalse:[ $S ])
            ].

            "/ T: TIA, TIO -> X; silent in TCH; TH -> 0 (the digit zero); otherwise T
            (letter == $T) ifTrue:[
                ((after == $I) and:[ 'AO' includes:afterNext ]) ifTrue:[
                    out nextPut:$X
                ] ifFalse:[
                    ((after == $C) and:[ afterNext == $H ]) ifFalse:[
                        out nextPut:((after == $H) ifTrue:[ $0 ] ifFalse:[ $T ])
                    ]
                ]
            ].

            "/ W, Y: silent unless followed by a vowel
            (('WY' includes:letter) and:[ vowelFollows ]) ifTrue:[
                out nextPut:letter
            ].

            "/ D: DGE, DGI, DGY -> J, consuming the G and the vowel as well; otherwise T.
            "/    (kept last, because it is the only rule which moves the position)
            (letter == $D) ifTrue:[
                ((after == $G) and:[ 'EIY' includes:afterNext ]) ifTrue:[
                    out nextPut:$J.
                    pos := pos + 2.
                ] ifFalse:[
                    out nextPut:$T
                ]
            ].
        ].
        pos := pos + 1.
    ].
    ^ out contents

    "Created: / 02-08-2017 / 09:51:31 / cg"
    "Modified: / 03-08-2017 / 14:55:22 / cg"
4491 | 3994 |
! ! |
3995 |
||
3996 |
!PhoneticStringUtilities::SoundexStringComparator class methodsFor:'documentation'! |
|
3997 |
||
3998 |
documentation |
|
3999 |
" |
|
4000 |
WARNING: this is the so called 'simplified soundex' algorithm; |
|
4001 |
there are more variants like miracode (american soundex) or |
|
4002 |
mysqlSoundex around. |
|
4003 |
||
4004 |
Be sure to use the correct algorithm, if the generated strings must be compatible |
|
4005 |
(otherwise, the differences are probably too small to be noticed as effect, but |
|
4006 |
your search will be different) |
|
4007 |
||
4008 |
The following was copied from http://www.civilsolutions.com.au/publications/dedup.htm |
|
4009 |
||
4010 |
SOUNDEX is a phonetic coding algorithm that ignores many of the unreliable |
|
4011 |
components of names, but by doing so reports more matches. |
|
4012 |
||
4013 |
There are some variations around in the literature; |
|
4014 |
the following is called 'simplified soundex', and the rules for coding a name are: |
|
4015 |
||
4016 |
1. The first letter of the name is used in its un-coded form to serve as the prefix |
|
4017 |
character of the code. (The rest of the code is numerical). |
|
4018 |
||
4019 |
2. Thereafter, W and H are ignored entirely. |
|
4020 |
||
4021 |
3. A, E, I, O, U, Y are not assigned a code number, but do serve as 'separators' (see Step 5). |
|
4022 |
||
4023 |
4. Other letters of the name are converted to a numerical equivalent: |
|
4024 |
B, P, F, V 1 |
|
4025 |
C, G, J, K, Q, S, X, Z 2 |
|
4026 |
D, T 3 |
|
4027 |
L 4 |
|
4028 |
M, N 5 |
|
4029 |
R 6 |
|
4030 |
||
4031 |
5. There are two exceptions: |
|
4032 |
1. Letters that follow prefix letters which would, if coded, have the same |
|
4033 |
numerical code, are ignored in all cases unless a ''separator'' (see Step 3) precedes them. |
|
4034 |
||
4035 |
2. The second letter of any pair of consonants having the same code number is likewise ignored, |
|
4036 |
i.e. unless there is a ''separator'' between them in the name. |
|
4037 |
||
4038 |
6. The final SOUNDEX code consists of the prefix letter plus three numerical characters. |
|
4039 |
Longer codes are truncated to this length, and shorter codes are extended to it by adding zeros. |
|
4040 |
||
4041 |
Notice, that in another variant, w and h are treated slightly differently. |
|
4042 |
This is only of relevance, if you need to reconstruct original soundex codes of other programs |
|
4043 |
or for the original 1880 US census data. |
|
4044 |
SoundexStringComparator new encode:'Ashcraft' -> 'A226' |
|
4045 |
vs. |
|
4046 |
MiracodeStringComparator new encode:'Ashcraft' -> 'A261' |
|
4047 |
||
4048 |
Also notice, that soundex deals better with english. |
|
4049 |
For german and other languages, other algorithms may provide better results. |
|
4050 |
" |
|
4051 |
! ! |
|
4052 |
||
4053 |
!PhoneticStringUtilities::SoundexStringComparator methodsFor:'api'! |
|
4054 |
||
4055 |
encode:word
    "Answer the simplified soundex code of word:
     its uppercased first character followed by three code digits,
     padded on the right with zeros if the word yields fewer digits.
     An empty word answers an empty string (instead of raising an error
     when fetching the first character)."

    |upper key code lastCode|

    word size == 0 ifTrue:[^ ''].

    upper := word asUppercase.
    key := upper first asString.
    lastCode := self translate:upper first.
    upper from:2 to:upper size do:[:ch |
        code := self translate:ch.
        "/ nothing is appended for a character without a code (nil),
        "/ for a separator (code 0), or for a repetition of the
        "/ code of the immediately preceding character
        (code notNil and:[ code ~= '0' and:[ code ~= lastCode ]]) ifTrue:[
            key := key , code.
            key size == 4 ifTrue:[^ key ].
        ].
        "/ every character - coded or not - becomes the new predecessor,
        "/ so separators and uncoded characters allow the same code to appear again
        lastCode := code
    ].
    "/ fewer than three digits collected: pad with zeros
    [ key size < 4 ] whileTrue:[
        key := key , '0'
    ].
    ^ key

    "
     self new encode:'washington' -> 'W252'
     self new encode:'lee'        -> 'L000'
     self new encode:'Gutierrez'  -> 'G362'
     self new encode:'Pfister'    -> 'P236'
     self new encode:'Jackson'    -> 'J250'
     self new encode:'Tymczak'    -> 'T522'
     self new encode:''           -> ''
    "

    "notice:
     MiracodeStringComparator new encode:'Ashcraft' -> 'A261'
     self new encode:'Ashcraft'                     -> 'A226'
    "

    "Created: / 28-07-2017 / 15:21:23 / cg"
    "Modified (comment): / 01-08-2017 / 19:01:43 / cg"
|
4090 |
! ! |
|
4091 |
||
4092 |
!PhoneticStringUtilities::SoundexStringComparator methodsFor:'private'! |
|
4093 |
||
4094 |
translate:aCharacter
    "Answer the soundex code of the (uppercase) letter aCharacter
     as a one-character string:
        '0'         for A E I O U Y (these only act as separators)
        '1' .. '6'  for the coded consonant groups
     Answer nil for anything else; that includes W and H."

    ('AEIOUY' includes:aCharacter) ifTrue:[^ '0' ].

    ('BPFV' includes:aCharacter) ifTrue:[^ '1' ].
    ('CSKGJQXZ' includes:aCharacter) ifTrue:[^ '2' ].
    ('DT' includes:aCharacter) ifTrue:[^ '3' ].
    aCharacter == $L ifTrue:[^ '4' ].
    ('MN' includes:aCharacter) ifTrue:[^ '5' ].
    aCharacter == $R ifTrue:[^ '6' ].
    ^ nil

    "Modified: / 02-08-2017 / 01:35:40 / cg"
    "Modified (comment): / 02-08-2017 / 14:30:11 / cg"
|
4132 |
! ! |
|
4133 |
||
4134 |
!PhoneticStringUtilities::MySQLSoundexStringComparator class methodsFor:'documentation'! |
|
4135 |
||
4136 |
documentation |
|
4137 |
" |
|
4138 |
MySQL soundex is like american Soundex (i.e. miracode) without the 4 character limitation, |
|
4139 |
and also removing vowels first, then removing duplicate codes |
|
4140 |
(whereas the soundex code does this in reverse order). |
|
4141 |
||
4142 |
These variations are important, if you need the miracode soundex codes to be generated. |
|
4143 |
" |
|
4144 |
! ! |
|
4145 |
||
4146 |
!PhoneticStringUtilities::MySQLSoundexStringComparator methodsFor:'api'! |
|
4147 |
||
4148 |
encode:word
    "Answer the soundex code of word in the MySQL flavour:
     the uppercased first character followed by the code digits.
     The result is zero-padded to at least 4 characters, but never cut.
     Separators (code 0), W and H do not change the remembered previous code,
     so equal codes on both sides of them are still collapsed into one.
     An empty word answers an empty string (instead of raising an error
     when fetching the first character)."

    |upper key code lastCode|

    word size == 0 ifTrue:[^ ''].

    upper := word asUppercase.
    key := upper first asString.
    lastCode := self translate:upper first.
    upper from:2 to:upper size do:[:ch |
        code := self translate:ch.
        "/ append every real code which differs from the remembered one
        (code notNil and:[ code ~= '0' and:[ code ~= lastCode ]]) ifTrue:[
            key := key , code.
        ].
        "/ separators, W and H leave the remembered code untouched;
        "/ any other character replaces it (by nil, if it has no code)
        (code = '0' or:[ ch = $W or:[ ch = $H ]]) ifFalse:[
            lastCode := code.
        ].
    ].
    "/ short results are padded with zeros; long ones are kept as they are
    [ key size < 4 ] whileTrue:[
        key := key , '0'
    ].
    ^ key

    "Created: / 28-07-2017 / 15:23:41 / cg"
    "Modified: / 31-07-2017 / 17:53:51 / cg"
    "Modified (comment): / 02-08-2017 / 14:31:15 / cg"
|
4173 |
! ! |
|
4174 |
||
4175 |
!PhoneticStringUtilities::NYSIISStringComparator class methodsFor:'documentation'! |
|
4176 |
||
4177 |
documentation |
|
4178 |
" |
|
4179 |
NYSIIS Algorithm: |
|
4180 |
||
4181 |
1. |
|
4182 |
remove all ''S'' and ''Z'' chars from the end of the surname |
|
4183 |
||
4184 |
2. |
|
4185 |
transcode initial strings |
|
4186 |
MAC => MC |
|
4187 |
PF => F |
|
4188 |
||
4189 |
3. |
|
4190 |
Transcode trailing strings as follows, |
|
4191 |
||
4192 |
IX => IC |
|
4193 |
EX => EC |
|
4194 |
YE,EE,IE => Y |
|
4195 |
NT,ND => D |
|
4196 |
||
4197 |
4. |
|
4198 |
transcode ''EV'' to ''EF'' if not at start of name |
|
4199 |
||
4200 |
5. |
|
4201 |
use first character of name as first character of key |
|
4202 |
||
4203 |
6. |
|
4204 |
remove any ''W'' that follows a vowel |
|
4205 |
||
4206 |
7. |
|
4207 |
replace all vowels with ''A'' |
|
4208 |
||
4209 |
8. |
|
4210 |
transcode ''GHT'' to ''GT'' |
|
4211 |
||
4212 |
9. |
|
4213 |
transcode ''DG'' to ''G'' |
|
4214 |
||
4215 |
10. |
|
4216 |
transcode ''PH'' to ''F'' |
|
4217 |
||
4218 |
11. |
|
4219 |
if not first character, eliminate all ''H'' preceded or followed by a vowel |
|
4220 |
||
4221 |
12. |
|
4222 |
change ''KN'' to ''N'', else ''K'' to ''C'' |
|
4223 |
||
4224 |
13. |
|
4225 |
if not first character, change ''M'' to ''N'' |
|
4226 |
||
4227 |
14. |
|
4228 |
if not first character, change ''Q'' to ''G'' |
|
4229 |
||
4230 |
15. |
|
4231 |
transcode ''SH'' to ''S'' |
|
4232 |
||
4233 |
16. |
|
4234 |
transcode ''SCH'' to ''S'' |
|
4235 |
||
4236 |
17. |
|
4237 |
transcode ''YW'' to ''Y'' |
|
4238 |
||
4239 |
18. |
|
4240 |
if not first or last character, change ''Y'' to ''A'' |
|
4241 |
||
4242 |
19. |
|
4243 |
transcode ''WR'' to ''R'' |
|
4244 |
||
4245 |
20. |
|
4246 |
if not first character, change ''Z'' to ''S'' |
|
4247 |
||
4248 |
21. |
|
4249 |
transcode terminal ''AY'' to ''Y'' |
|
4250 |
||
4251 |
22. |
|
4252 |
remove trailing vowels
|
4253 |
||
4254 |
23. |
|
4255 |
collapse all strings of repeated characters |
|
4256 |
||
4257 |
24. |
|
4258 |
if first char of original surname was a vowel, append it to the code |
|
4259 |
" |
|
4260 |
! ! |
|
4261 |
||
4262 |
!PhoneticStringUtilities::NYSIISStringComparator methodsFor:'api'! |
|
4263 |
||
4264 |
encode:aString
    "Answer the NYSIIS key for aString.
     The input is uppercased and handed to rule 1; rules 2 to 23 are then
     applied in ascending order, each one receiving the result of its
     predecessor. Rule 24 additionally receives the untouched input string."

    |key|

    key := self rule1:aString asUppercase.
    #(
        #rule2:  #rule3:  #rule4:  #rule5:  #rule6:  #rule7:
        #rule8:  #rule9:  #rule10: #rule11: #rule12: #rule13:
        #rule14: #rule15: #rule16: #rule17: #rule18: #rule19:
        #rule20: #rule21: #rule22: #rule23:
    ) do:[:ruleSelector |
        key := self perform:ruleSelector with:key
    ].
    ^ self rule24:key originalKey:aString
|
4293 |
||
4294 |
" |
|
4295 |
self new encode:'hello' |
|
4296 |
self new encode:'bliss' |
|
4297 |
" |
|
4298 |
" |
|
4299 |
self new phoneticStringsFor:'hello' |
|
4300 |
self new phoneticStringsFor:'bliss' |
|
4301 |
" |
|
4302 |
||
4303 |
"Created: / 28-07-2017 / 15:34:52 / cg" |
|
4304 |
"Modified (comment): / 02-08-2017 / 14:31:47 / cg" |
|
4305 |
! ! |
|
4306 |
||
4307 |
!PhoneticStringUtilities::NYSIISStringComparator methodsFor:'private'! |
|
4308 |
||
4309 |
rule10:key
    "Rule 10: answer a copy of key in which every 'PH' has been rewritten to 'F'."

    |rewritten|

    rewritten := self transcodeAll:'PH' of:key to:'F' startingAt:1.
    ^ rewritten
|
4313 |
||
4314 |
"Modified (format): / 02-08-2017 / 14:34:27 / cg" |
|
4315 |
! |
|
4316 |
||
4317 |
rule11:key
    "Rule 11: answer a copy of key without those 'H' characters (other than
     a leading one) that are directly preceded or directly followed by a vowel.
     The neighbours are always inspected in the unmodified key."

    |doomed result|

    "collect the positions first, so that removals do not shift what is tested"
    doomed := OrderedCollection new.
    2 to:key size do:[:pos |
        ((key at:pos) = $H
          and:[ (key at:pos - 1) isVowel
                or:[ pos < key size and:[ (key at:pos + 1) isVowel ] ] ])
        ifTrue:[ doomed add:pos ]
    ].

    "remove from the right, so that the remaining positions stay valid"
    result := key copy.
    doomed reverseDo:[:pos |
        result := (result copyFrom:1 to:pos - 1) , (result copyFrom:pos + 1)
    ].
    ^ result
|
4333 |
! |
|
4334 |
||
4335 |
rule12:key
    "Rule 12: change 'KN' to 'N', then change every remaining 'K' to 'C'.
     Answers a new string; key itself is not modified."

    |k|

    "the 'KN' pairs must be rewritten first - their K must not end up as a C"
    k := self transcodeAll:'KN' of:key to:'N' startingAt:1.
    k := self transcodeAll:'K' of:k to:'C' startingAt:1.
    ^ k
|
4342 |
||
4343 |
"Modified (format): / 02-08-2017 / 14:34:48 / cg" |
|
4344 |
! |
|
4345 |
||
4346 |
rule13:key
    "Rule 13: answer a copy of key in which every 'M' except a leading one
     has been rewritten to 'N'."

    |rewritten|

    rewritten := self transcodeAll:'M' of:key to:'N' startingAt:2.
    ^ rewritten
|
4350 |
||
4351 |
"Modified (format): / 02-08-2017 / 14:35:00 / cg" |
|
4352 |
! |
|
4353 |
||
4354 |
rule14:key
    "Rule 14: answer a copy of key in which every 'Q' except a leading one
     has been rewritten to 'G'."

    |rewritten|

    rewritten := self transcodeAll:'Q' of:key to:'G' startingAt:2.
    ^ rewritten
|
4358 |
||
4359 |
"Modified (format): / 02-08-2017 / 14:35:08 / cg" |
|
4360 |
! |
|
4361 |
||
4362 |
rule15:key
    "Rule 15: answer a copy of key in which every 'SH' has been rewritten to 'S'."

    |rewritten|

    rewritten := self transcodeAll:'SH' of:key to:'S' startingAt:1.
    ^ rewritten
|
4366 |
||
4367 |
"Modified (format): / 02-08-2017 / 14:35:18 / cg" |
|
4368 |
! |
|
4369 |
||
4370 |
rule16:key
    "Rule 16: answer a copy of key in which every 'SCH' has been rewritten to 'S'."

    |rewritten|

    rewritten := self transcodeAll:'SCH' of:key to:'S' startingAt:1.
    ^ rewritten
|
4374 |
||
4375 |
"Modified (format): / 02-08-2017 / 14:35:25 / cg" |
|
4376 |
! |
|
4377 |
||
4378 |
rule17:key
    "Rule 17: answer a copy of key in which every 'YW' has been rewritten to 'Y'."

    |rewritten|

    rewritten := self transcodeAll:'YW' of:key to:'Y' startingAt:1.
    ^ rewritten
|
4382 |
||
4383 |
"Modified (format): / 02-08-2017 / 14:35:33 / cg" |
|
4384 |
! |
|
4385 |
||
4386 |
rule18:key
    "Rule 18: change every 'Y' that is neither the first nor the last
     character to 'A'. Answers a new string; key itself is not modified.
     An empty key is answered unchanged (as a copy) instead of raising an
     error when asking for its last character."

    |k|

    key isEmpty ifTrue:[ ^ key copy ].

    "rewrite all Ys from the 2nd position on ..."
    k := self transcodeAll:'Y' of:key to:'A' startingAt:2.
    "... and restore a trailing Y, which the rule exempts"
    key last = $Y ifTrue:[
        k at:k size put:$Y
    ].
    ^ k
|
4395 |
||
4396 |
"Modified (format): / 02-08-2017 / 14:35:44 / cg" |
|
4397 |
! |
|
4398 |
||
4399 |
rule19:key
    "Rule 19: answer a copy of key in which every 'WR' has been rewritten to 'R'."

    |rewritten|

    rewritten := self transcodeAll:'WR' of:key to:'R' startingAt:1.
    ^ rewritten
|
4403 |
||
4404 |
"Modified (format): / 02-08-2017 / 14:35:52 / cg" |
|
4405 |
! |
|
4406 |
||
4407 |
rule1:key
    "Rule 1: answer a copy of key with all 'S' and 'Z' characters removed
     from its end.
     A key consisting only of such characters (or an empty key) yields an
     empty string; the emptiness check keeps the loop from asking an empty
     string for its last character."

    |k|

    k := key copy.
    [ k notEmpty and:[ 'SZ' includes:k last ] ] whileTrue:[
        k := k copyButLast
    ].
    ^ k
|
4416 |
! |
|
4417 |
||
4418 |
rule20:key
    "Rule 20: answer a copy of key in which every 'Z' except a leading one
     has been rewritten to 'S'."

    |rewritten|

    rewritten := self transcodeAll:'Z' of:key to:'S' startingAt:2.
    ^ rewritten
|
4422 |
||
4423 |
"Modified (format): / 02-08-2017 / 14:36:00 / cg" |
|
4424 |
! |
|
4425 |
||
4426 |
rule21:key
    "Rule 21: answer a copy of key in which a terminal 'AY' has been
     rewritten to 'Y'.
     The search starts at the second to last position, so only a match at
     the very end of key can be found.
     NOTE(review): for keys shorter than two characters the start index is
     below 1 - confirm that the substring search tolerates that."

    |tailStart|

    tailStart := key size - 1.
    ^ self transcodeAll:'AY' of:key to:'Y' startingAt:tailStart
|
4430 |
||
4431 |
"Modified (format): / 02-08-2017 / 14:36:08 / cg" |
|
4432 |
! |
|
4433 |
||
4434 |
rule22:key
    "Rule 22: answer a copy of key with all trailing vowels removed.
     A key consisting only of vowels (or an empty key) yields an empty
     string; the emptiness check keeps the loop from asking an empty
     string for its last character."

    |k|

    k := key copy.
    [ k notEmpty and:[ k last isVowel ] ] whileTrue:[
        k := k copyButLast
    ].
    ^ k
|
4443 |
||
4444 |
"Modified: / 02-08-2017 / 14:36:42 / cg" |
|
4445 |
! |
|
4446 |
||
4447 |
rule23:key
    "Rule 23: answer a copy of key in which every run of repeated characters
     has been collapsed into a single character."

    |k|

    k := key copy.
    "walk from the end towards the front (note the negative step; an
     ascending loop from size down to 2 would not run at all), so that
     removing a character never shifts a position still to be examined"
    k size to:2 by:-1 do:[:i |
        (k at:i) = (k at:i - 1) ifTrue:[
            k := (k copyFrom:1 to:i - 1) , (k copyFrom:i + 1 to:k size)
        ]
    ].
    ^ k
|
4462 |
! |
|
4463 |
||
4464 |
rule24:key originalKey:originalKey
    "Rule 24: if the first character of originalKey is a vowel, answer key
     with that character (uppercased) appended; otherwise answer a copy of key."

    |initial|

    initial := originalKey first.
    ^ initial isVowel
        ifTrue:[ key , initial asString asUppercase ]
        ifFalse:[ key copy ]
|
4473 |
! |
|
4474 |
||
4475 |
rule2:key
    "Rule 2: transcode the beginning of key: a leading 'MAC' becomes 'MC',
     a leading 'PF' becomes 'F'. Any other key is answered as a copy."

    "a key rewritten to 'MC...' can no longer start with 'PF',
     so each case may answer immediately"
    (key startsWith:'MAC') ifTrue:[ ^ 'MC' , (key copyFrom:4) ].
    (key startsWith:'PF') ifTrue:[ ^ 'F' , (key copyFrom:3) ].
    ^ key copy
|
4488 |
||
4489 |
"Modified (format): / 02-08-2017 / 14:31:40 / cg" |
|
4490 |
! |
|
4491 |
||
4492 |
rule3:key
    "Rule 3: transcode the end of key. The groups below are tried top to
     bottom, each one working on the result of the previous group:
        IX          => IC
        EX          => EC
        YE, EE, IE  => Y
        NT, ND      => D"

    |result|

    result := key copy.
    #(
        #( #('IX')            'IC' )
        #( #('EX')            'EC' )
        #( #('YE' 'EE' 'IE')  'Y'  )
        #( #('NT' 'ND')       'D'  )
    ) do:[:entry |
        result := self transcodeTrailing:entry first of:result to:entry last
    ].
    ^ result
|
4507 |
||
4508 |
"Modified (format): / 02-08-2017 / 14:32:24 / cg" |
|
4509 |
! |
|
4510 |
||
4511 |
rule4:key
    "Rule 4: answer a copy of key in which every 'EV' found from the second
     position on has been rewritten to 'EF'."

    |rewritten|

    rewritten := self transcodeAll:'EV' of:key to:'EF' startingAt:2.
    ^ rewritten
|
4515 |
||
4516 |
"Modified (format): / 02-08-2017 / 14:32:35 / cg" |
|
4517 |
! |
|
4518 |
||
4519 |
rule5:key
    "Rule 5 (use the first character of the name as first character of the
     key) needs no work here, because the key is derived from the name
     itself; key is answered as is (not a copy)."

    ^ key
|
4524 |
||
4525 |
"Modified (comment): / 02-08-2017 / 14:32:45 / cg" |
|
4526 |
! |
|
4527 |
||
4528 |
rule6:key
    "Rule 6: answer a copy of key without any 'W' that follows a vowel.
     A leading 'W' is never removed. Several Ws following the same vowel
     are all removed."

    |k i|

    k := key copy.
    i := 2.
    [
        i <= k size and:[ (i := k indexOf:$W startingAt:i) > 0 ]
    ] whileTrue:[
        (k at:i - 1) isVowel ifTrue:[
            k := (k copyFrom:1 to:i - 1) , (k copyFrom:i + 1 to:k size).
            "step back onto the vowel, so that a directly following W is found next"
            i := i - 1
        ] ifFalse:[
            "this W stays - continue behind it; without this step the same W
             would be found again and again"
            i := i + 1
        ]
    ].
    ^ k
|
4543 |
! |
|
4544 |
||
4545 |
rule7:key
    "Rule 7: answer a copy of key in which every vowel has been replaced by $A."

    |result|

    result := key copy.
    1 to:result size do:[:pos |
        (result at:pos) isVowel ifTrue:[ result at:pos put:$A ]
    ].
    ^ result
|
4548 |
||
4549 |
"Modified: / 02-08-2017 / 14:33:56 / cg" |
|
4550 |
! |
|
4551 |
||
4552 |
rule8:key
    "Rule 8: answer a copy of key in which every 'GHT' has been rewritten to 'GT'."

    |rewritten|

    rewritten := self transcodeAll:'GHT' of:key to:'GT' startingAt:1.
    ^ rewritten
|
4556 |
||
4557 |
"Modified (format): / 02-08-2017 / 14:34:05 / cg" |
|
4558 |
! |
|
4559 |
||
4560 |
rule9:key
    "Rule 9: answer a copy of key in which every 'DG' has been rewritten to 'G'."

    |rewritten|

    rewritten := self transcodeAll:'DG' of:key to:'G' startingAt:1.
    ^ rewritten
|
4564 |
||
4565 |
"Modified (format): / 02-08-2017 / 14:34:15 / cg" |
|
4566 |
! |
|
4567 |
||
4568 |
transcodeAll:aString of:key to:replacementString startingAt:start
    "Answer a copy of key in which the occurrences of aString found at or
     after position start have been replaced by replacementString.
     After each replacement the search is restarted at start, so a match
     which only arises through an earlier replacement is rewritten as well.
     The loop ends when no further occurrence is found."

    |result pos|

    result := key copy.
    [
        pos := result indexOfSubCollection:aString startingAt:start.
        pos > 0
    ] whileTrue:[
        result := (result copyTo:pos - 1)
                  , replacementString
                  , (result copyFrom:pos + aString size)
    ].
    ^ result
|
4579 |
! |
|
4580 |
||
4581 |
transcodeTrailing:anArrayOfStrings of:key to:replacementString
    "Answer a copy of key in which a trailing occurrence of any of the
     strings in anArrayOfStrings has been replaced by replacementString.
     The candidates are tried in the given order, each one against the
     result left by its predecessors.
     The suffix is tested directly, so keys shorter than a candidate are
     simply left alone (no search start index below 1 is ever computed)."

    |answer|

    answer := key copy.
    anArrayOfStrings do:[:aString |
        (answer endsWith:aString) ifTrue:[
            answer := (answer copyButLast:aString size) , replacementString
        ]
    ].
    ^ answer
|
4593 |
! ! |
|
4594 |
||
4595 |
!PhoneticStringUtilities::PhonemStringComparator class methodsFor:'documentation'! |
|
4596 |
||
4597 |
documentation |
|
4598 |
" |
|
4599 |
Implementation of the PHONEM algorithm, as described in |
|
4600 |
'Georg Wilde and Carsten Meyer, Doppelgaenger gesucht - |
|
4601 |
Ein Programm fuer kontextsensitive phonetische Textumwandlung |
|
4602 |
ct Magazin fuer Computer & Technik 25/1998' |
|
4603 |
||
4604 |
This algorithm deals better with the german language (it cares for umlauts) |
|
4605 |
" |
|
4606 |
! ! |
|
4607 |
||
4608 |
!PhoneticStringUtilities::PhonemStringComparator methodsFor:'api'! |
|
4609 |
||
4610 |
encode:aString
    "Answer the PHONEM code of aString.
     Works on the uppercased input in three steps:
       1. two-character sequences are replaced according to the table below,
          scanning left to right; after a replacement the same position is
          examined again,
       2. single characters are mapped via copyTransliterating:to:,
       3. everything outside 'ABCDLMNORSUVWXY' is dropped and runs of equal
          characters are squashed.
     The scan in step 1 covers every adjacent pair including the one formed
     by the last two characters."

    |s idx pair replacement digraphs|

    "pattern / replacement pairs; the patterns are mutually distinct"
    digraphs := #(
        'SC' 'C'
        'SZ' 'C'
        'CZ' 'C'
        'TZ' 'C'
        'TS' 'C'
        'KS' 'X'
        'PF' 'V'
        'QU' 'KW'
        'PH' 'V'
        'UE' 'Y'
        'AE' 'E'
        'OE' 'Ö'
        'EI' 'AY'
        'EY' 'AY'
        'EU' 'OY'
        'AU' 'A§'
        'OU' '§ '
    ).

    s := aString asUppercase.

    idx := 1.
    "idx < s size: a pair needs the characters at idx and idx+1"
    [ idx < s size ] whileTrue:[
        pair := s copyFrom:idx to:idx + 1.
        replacement := nil.
        1 to:digraphs size by:2 do:[:i |
            pair = (digraphs at:i) ifTrue:[ replacement := digraphs at:i + 1 ]
        ].
        replacement notNil ifTrue:[
            s := (s copyTo:idx - 1) , replacement , (s copyFrom:idx + 2)
        ] ifFalse:[
            idx := idx + 1.
        ].
    ].

    "/ single character substitutions via tr
    "/ NOTE(review): as the text stands, the first string has 14 characters
    "/ and the second one 15 - confirm the intended mapping
    s := s copyTransliterating:'ÖÄZKGQÜIJFWPT§' to:'YECCCCYYYVVDDUA'.
    s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'' complement:true squashDuplicates:false.
    s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'ABCDLMNORSUVWXY' complement:false squashDuplicates:true.
    ^ s
|
4648 |
||
4649 |
" |
|
4650 |
self basicNew encode:'müller' -> 'MYLR' |
|
4651 |
self basicNew encode:'mueller' -> 'MYLR' |
|
4652 |
self basicNew encode:'möller' -> 'MYLR' |
|
4653 |
self basicNew encode:'miller' -> 'MYLR' |
|
4654 |
self basicNew encode:'muller' -> 'MULR' |
|
4655 |
self basicNew encode:'muler' -> 'MULR' |
|
4656 |
||
4657 |
self basicNew phoneticStringsFor:'müller' #('MYLR') |
|
4658 |
self basicNew phoneticStringsFor:'mueller' #('MYLR') |
|
4659 |
self basicNew phoneticStringsFor:'möller' #('MYLR') |
|
4660 |
self basicNew phoneticStringsFor:'miller' #('MYLR') |
|
4661 |
self basicNew phoneticStringsFor:'muller' #('MULR') |
|
4662 |
self basicNew phoneticStringsFor:'muler' #('MULR') |
|
4663 |
||
4664 |
self basicNew phoneticStringsFor:'schmidt' #('CMYD') |
|
4665 |
self basicNew phoneticStringsFor:'schneider' #('CNAYDR') |
|
4666 |
self basicNew phoneticStringsFor:'fischer' #('VYCR') |
|
4667 |
self basicNew phoneticStringsFor:'weber' #('VBR') |
|
4668 |
self basicNew phoneticStringsFor:'weeber' #('VBR') |
|
4669 |
self basicNew phoneticStringsFor:'webber' #('VBR') |
|
4670 |
self basicNew phoneticStringsFor:'wepper' #('VBR') |
|
4671 |
||
4672 |
self basicNew phoneticStringsFor:'meyer' #('MAYR') |
|
4673 |
self basicNew phoneticStringsFor:'maier' #('MAYR') |
|
4674 |
self basicNew phoneticStringsFor:'mayer' #('MAYR') |
|
4675 |
self basicNew phoneticStringsFor:'mayr' #('MAYR') |
|
4676 |
self basicNew phoneticStringsFor:'meir' #('MAYR') |
|
4677 |
||
4678 |
self basicNew phoneticStringsFor:'wagner' #('VACNR') |
|
4679 |
self basicNew phoneticStringsFor:'schulz' #('CULC') |
|
4680 |
self basicNew phoneticStringsFor:'becker' #('BCR') |
|
4681 |
self basicNew phoneticStringsFor:'hoffmann' #('OVMAN') |
|
4682 |
self basicNew phoneticStringsFor:'haus' #('AUS') |
|
4683 |
||
4684 |
self basicNew phoneticStringsFor:'schäfer' #('CVR') |
|
4685 |
self basicNew phoneticStringsFor:'scheffer' #('CVR') |
|
4686 |
self basicNew phoneticStringsFor:'schaeffer' #('CVR') |
|
4687 |
self basicNew phoneticStringsFor:'schaefer' #('CVR') |
|
4688 |
" |
|
4689 |
||
4690 |
"Created: / 28-07-2017 / 15:38:08 / cg" |
|
4691 |
! ! |
|
4692 |
||
4693 |
!PhoneticStringUtilities::Caverphone2StringComparator class methodsFor:'documentation'! |
|
4694 |
||
4695 |
documentation |
|
4696 |
" |
|
4697 |
Caverphone (2) Algorithm: |
|
4698 |
||
4699 |
see http://caversham.otago.ac.nz/files/working/ctp150804.pdf |
|
4700 |
||
4701 |
Caverphone 2.0 is being made available for free use for the benefit of anyone who has a use for it, |
|
4702 |
with the proviso that the Caversham Project at the University of Otago should be acknowledged as the |
|
4703 |
original source (which is hereby done ;-). |
|
4704 |
||
4705 |
• Start with a Surname or Firstname |
|
4706 |
• Convert to lowercase |
|
4707 |
This coding system is case sensitive, implementations should acknowledge that a is not the same as A |
|
4708 |
• Remove anything not A-Z |
|
4709 |
The main intention of this is to remove spaces, hyphens, and apostrophes. |
|
4710 |
example: o'brian becomes obrian |
|
4711 |
• If the name starts with cough make it cou2f |
|
4712 |
2 is being used as a temporary placeholder to indicate a consonant which we are no longer interested in. |
|
4713 |
• If the name starts with rough make it rou2f |
|
4714 |
• If the name starts with tough make it tou2f |
|
4715 |
• If the name starts with enough make it enou2f |
|
4716 |
• If the name starts with gn make it 2n |
|
4717 |
• If the name ends with mb make it m2 |
|
4718 |
• replace cq with 2q |
|
4719 |
• replace ci with si |
|
4720 |
• replace ce with se |
|
4721 |
• replace cy with sy |
|
4722 |
• replace tch with 2ch |
|
4723 |
• replace c with k |
|
4724 |
• replace q with k |
|
4725 |
• replace x with k |
|
4726 |
• replace v with f |
|
4727 |
• replace dg with 2g |
|
4728 |
• replace tio with sio |
|
4729 |
• replace tia with sia |
|
4730 |
• replace d with t |
|
4731 |
• replace ph with fh |
|
4732 |
• replace b with p |
|
4733 |
• replace sh with s2 |
|
4734 |
• replace z with s |
|
4735 |
• replace an initial vowel with an A
|
4736 |
• replace all other vowels with a 3 |
|
4737 |
3 is a temporary placeholder marking a vowel |
|
4738 |
• replace 3gh3 with 3kh3 |
|
4739 |
Exceptions are dealt with before the general case. gh between vowels is an exception to the more general gh rule.
|
4740 |
• replace gh with 22 |
|
4741 |
• replace g with k |
|
4742 |
• replace groups of the letter s with a S |
|
4743 |
Continuous strings of s are replaced by a single S
|
4744 |
• replace groups of the letter t with a T |
|
4745 |
• replace groups of the letter p with a P |
|
4746 |
• replace groups of the letter k with a K |
|
4747 |
• replace groups of the letter f with a F |
|
4748 |
• replace groups of the letter m with a M |
|
4749 |
• replace groups of the letter n with a N |
|
4750 |
• replace w3 with W3 |
|
4751 |
• replace wy with Wy |
|
4752 |
• replace wh3 with Wh3 |
|
4753 |
• replace why with Why |
|
4754 |
• replace w with 2 |
|
4755 |
• replace an initial h with an A
|
4756 |
• replace all other occurrences of h with a 2 |
|
4757 |
• replace r3 with R3 |
|
4758 |
• replace ry with Ry |
|
4759 |
• replace r with 2 |
|
4760 |
• replace l3 with L3 |
|
4761 |
• replace ly with Ly |
|
4762 |
• replace l with 2 |
|
4763 |
• replace j with y |
|
4764 |
• replace y3 with Y3 |
|
4765 |
• replace y with 2 |
|
4766 |
• remove all 2s |
|
4767 |
• remove all 3s |
|
4768 |
• put six (v1) / ten (v2) 1s on the end |
|
4769 |
• take the first six characters as the code (caverphone 1); |
|
4770 |
/ take the first ten characters as the code (caverphone 2); |
|
4771 |
||
4772 |
self new encode:'david' -> 'TFT1111111' |
|
4773 |
self new encode:'whittle' -> 'WTA1111111' |
|
4774 |
||
4775 |
self new encode:'Stevenson' -> 'STFNSN1111' |
|
4776 |
self new encode:'Peter' -> 'PTA1111111' |
|
4777 |
||
4778 |
self new encode:'washington' -> 'WSNKTN1111' |
|
4779 |
self new encode:'lee' -> 'LA11111111' |
|
4780 |
self new encode:'Gutierrez' -> 'KTRS111111' |
|
4781 |
self new encode:'Pfister' -> 'PFSTA11111' |
|
4782 |
self new encode:'Jackson' -> 'YKSN111111' |
|
4783 |
self new encode:'Tymczak' -> 'TMKSK11111' |
|
4784 |
||
4785 |
self new encode:'add' -> 'AT11111111' |
|
4786 |
self new encode:'aid' -> 'AT11111111' |
|
4787 |
self new encode:'at' -> 'AT11111111' |
|
4788 |
self new encode:'art' -> 'AT11111111' |
|
4789 |
self new encode:'earth' -> 'AT11111111' |
|
4790 |
self new encode:'head' -> 'AT11111111' |
|
4791 |
self new encode:'old' -> 'AT11111111' |
|
4792 |
||
4793 |
self new encode:'ready' -> 'RTA1111111' |
|
4794 |
self new encode:'rather' -> 'RTA1111111' |
|
4795 |
self new encode:'able' -> 'APA1111111' |
|
4796 |
self new encode:'appear' -> 'APA1111111' |
|
4797 |
||
4798 |
self new encode:'Deedee' -> 'TTA1111111' |
|
4799 |
" |
|
4800 |
! ! |
|
4801 |
||
4802 |
!PhoneticStringUtilities::Caverphone2StringComparator methodsFor:'api'! |
|
4803 |
||
4804 |
encode:word
    "Answer the 10 character code of word.
     An empty word answers ten 1s. Otherwise the word is lowercased, reduced
     to its letters, and then rewritten by the replacement table below, one
     entry after the other. Finally ten 1s are appended and the first ten
     characters are answered.

     Table entries are triples: pattern, replacement, repeat flag.
       - a pattern starting with ^ is only matched at the beginning,
       - a pattern ending with $ is only matched at the end,
       - any other pattern is matched anywhere.
     For anchored patterns the flag tells whether the replacement is
     repeated for as long as the pattern still matches; for the others it
     selects between replacing all occurrences and a single replacement."

    |txt|

    word size == 0 ifTrue:[ ^ '1111111111' ].

    "/ lowercase, letters only
    txt := word asLowercase select:[:ch | ch isLetter].

    #(
        "/ pattern  replacement  repeat

        "/ drop a final e
        'e$'        ''      false
        "/ special beginnings and endings
        '^cough'    'cou2f' false
        '^rough'    'rou2f' false
        '^tough'    'tou2f' false
        '^enough'   'enou2f' false
        '^trough'   'trou2f' false

        '^gn'       '2n'    false
        'mb$'       'm2'    false

        "/ general replacements
        'cq'        '2q'    true
        'ci'        'si'    true
        'ce'        'se'    true
        'cy'        'sy'    true
        'tch'       '2ch'   true
        'c'         'k'     true
        'q'         'k'     true
        'x'         'k'     true
        'v'         'f'     true
        'dg'        '2g'    true
        'tio'       'sio'   true
        'tia'       'sia'   true
        'd'         't'     true
        'ph'        'fh'    true
        'b'         'p'     true
        'sh'        's2'    true
        'z'         's'     true

        '^a'        'A'     false
        '^e'        'A'     false
        '^i'        'A'     false
        '^o'        'A'     false
        '^u'        'A'     false

        'a'         '3'     true
        'e'         '3'     true
        'i'         '3'     true
        'o'         '3'     true
        'u'         '3'     true
        'j'         'y'     true

        '^y3'       'Y3'    false
        '^y'        'A'     false

        'y'         '3'     true
        '3gh3'      '3kh3'  true
        'gh'        '22'    true
        'g'         'k'     true
        's'         'S'     true
        'SS'        'S'     true
        't'         'T'     true
        'TT'        'T'     true
        'p'         'P'     true
        'PP'        'P'     true
        'k'         'K'     true
        'KK'        'K'     true
        'f'         'F'     true
        'FF'        'F'     true
        'm'         'M'     true
        'MM'        'M'     true
        'n'         'N'     true
        'NN'        'N'     true
        'w3'        'W3'    true
        'wh3'       'Wh3'   true
        'w$'        '3'     false
        'w'         '2'     true
        '^h'        'A'     false
        'h'         '2'     true
        'r3'        'R3'    true
        'r$'        '3'     false
        'r'         '2'     true
        'l3'        'L3'    true
        'l$'        '3'     false
        'l'         '2'     true

        "/ removal of the placeholders

        '2'         ''      true
        '3$'        'A'     true
        '3'         ''      true
    ) inGroupsOf:3 do:[:pattern :replacement :everywhere |
        |atStart atEnd needle again|

        atStart := pattern startsWith:$^.
        atEnd := atStart not and:[ pattern endsWith:$$ ].

        atStart ifTrue:[
            needle := pattern copyButFirst.
            again := true.
            [ again ] whileTrue:[
                again := txt startsWith:needle.
                again ifTrue:[
                    txt := replacement , (txt copyButFirst:needle size).
                    again := everywhere
                ]
            ]
        ].
        atEnd ifTrue:[
            needle := pattern copyButLast.
            again := true.
            [ again ] whileTrue:[
                again := txt endsWith:needle.
                again ifTrue:[
                    txt := (txt copyButLast:needle size) , replacement.
                    again := everywhere
                ]
            ]
        ].
        (atStart or:[ atEnd ]) ifFalse:[
            txt := everywhere
                    ifTrue:[ txt copyReplaceAllSubcollections:pattern with:replacement ]
                    ifFalse:[ txt copyReplaceSubcollection:pattern with:replacement ]
        ].
    ].

    "/ pad with ten 1s and cut to ten characters
    ^ (txt , '1111111111') copyTo:10
|
4938 |
||
4939 |
" |
|
4940 |
self new encode:'david' -> 'TFT1111111' |
|
4941 |
self new encode:'whittle' -> 'WTA1111111' |
|
4942 |
||
4943 |
self new encode:'Stevenson' -> 'STFNSN1111' |
|
4944 |
self new encode:'Peter' -> 'PTA1111111' |
|
4945 |
||
4946 |
self new encode:'washington' -> 'WSNKTN1111' |
|
4947 |
self new encode:'lee' -> 'LA11111111' |
|
4948 |
self new encode:'Gutierrez' -> 'KTRS111111' |
|
4949 |
self new encode:'Pfister' -> 'PFSTA11111' |
|
4950 |
self new encode:'Jackson' -> 'YKSN111111' |
|
4951 |
self new encode:'Tymczak' -> 'TMKSK11111' |
|
4952 |
||
4953 |
self new encode:'add' -> 'AT11111111' |
|
4954 |
self new encode:'aid' -> 'AT11111111' |
|
4955 |
self new encode:'at' -> 'AT11111111' |
|
4956 |
self new encode:'art' -> 'AT11111111' |
|
4957 |
self new encode:'earth' -> 'AT11111111' |
|
4958 |
self new encode:'head' -> 'AT11111111' |
|
4959 |
self new encode:'old' -> 'AT11111111' |
|
4960 |
||
4961 |
self new encode:'ready' -> 'RTA1111111' |
|
4962 |
self new encode:'rather' -> 'RTA1111111' |
|
4963 |
self new encode:'able' -> 'APA1111111' |
|
4964 |
self new encode:'appear' -> 'APA1111111' |
|
4965 |
||
4966 |
self new encode:'Deedee' -> 'TTA1111111' |
|
4967 |
" |
|
4968 |
||
4969 |
"Created: / 28-07-2017 / 15:21:23 / cg" |
|
4970 |
"Modified: / 02-08-2017 / 01:42:35 / cg" |
|
4971 |
! ! |
|
4972 |
||
4488 | 4973 |
!PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator class methodsFor:'documentation'! |
4974 |
||
4975 |
documentation |
|
4976 |
" |
|
4977 |
The 'Kölner Phonetik' (cologne phonetic) code is for the german language |
|
4978 |
what the soundex code is for english: |
|
4979 |
it returns similar strings for similar sounding words |
|
4980 |
(but is specifically aware of the pronunciation of German and eastern languages) . |
|
4981 |
||
4982 |
There are some other differences to soundex, though: |
|
4983 |
its length is not limited to 4, but depends on the length of the original string; |
|
4984 |
it does not start with the first character of the input, but returns a pure numeric string. |
|
4985 |
||
4986 |
This algorithm was described by Postel 1969, |
|
4987 |
See http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik |
|
4988 |
||
4989 |
self new phoneticStringsFor:'Müller-Lüdenscheidt' -> #('65752682') |
|
4990 |
" |
|
4991 |
! |
|
4992 |
||
4993 |
examples |
|
4994 |
" |
|
4995 |
words sounding similar (german pronunciation) will deliver a similar code: |
|
4996 |
||
4997 |
#( |
|
4998 |
'Müller' |
|
4999 |
'Miller' |
|
5000 |
'Mueller' |
|
5001 |
'Mühler' |
|
5002 |
'Mühlherr' |
|
5003 |
'Mülherr' |
|
5004 |
'Myler' |
|
5005 |
'Millar' |
|
5006 |
'Myller' |
|
5007 |
'Müllar' |
|
5008 |
'Müler' |
|
5009 |
'Muehler' |
|
5010 |
'Mülller' |
|
5011 |
'Müllerr' |
|
5012 |
'Muehlherr' |
|
5013 |
'Muellar' |
|
5014 |
'Mueler' |
|
5015 |
'Mülleer' |
|
5016 |
'Mueller' |
|
5017 |
'Nüller' |
|
5018 |
'Nyller' |
|
5019 |
'Niler' |
|
5020 |
'Czerny' |
|
5021 |
'Tscherny' |
|
5022 |
'Czernie' |
|
5023 |
'Tschernie' |
|
5024 |
'Schernie' |
|
5025 |
'Scherny' |
|
5026 |
'Scherno' |
|
5027 |
'Czerne' |
|
5028 |
'Zerny' |
|
5029 |
'Tzernie' |
|
5030 |
'Breschnew' |
|
5031 |
'Breschnew' |
|
5032 |
'Breschneff' |
|
5033 |
'Breschnjeff' |
|
5034 |
'Braeschneff' |
|
5035 |
'Braessneff' |
|
5036 |
'Pressneff' |
|
5037 |
'Presznäph' |
|
5038 |
'Präschnäf' |
|
5039 |
'Breschnjeff' |
|
5040 |
'Breschnijeff' |
|
5041 |
'Breschnieff' |
|
5042 |
'Bräschnieff' |
|
5043 |
'Braschnieff' |
|
5044 |
'Broschnieff' |
|
5045 |
) do:[:w | |
|
5046 |
Transcript show:w; show:'->'; showCR:(PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:w) |
|
5047 |
]. |
|
5048 |
" |
|
5049 |
! ! |
|
5050 |
||
5051 |
!PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator methodsFor:'api'! |
|
5052 |
||
5053 |
encode: aString
    "Answer the koelner phonetic (cologne phonetic) code of aString.
     The code is for the german language what the soundex code is for
     english: it returns similar strings for similar sounding words.
     Other than soundex, its length is not limited to 4 but depends on the
     length of the input, and it does not start with the first character of
     the input. The algorithm was described by Postel 1969.

     Steps performed here:
       - trim, lowercase, replace 'ph' by 'f', replace umlauts and sharp s,
         keep letters only,
       - surround the text with # and convert each 3 character window; the
         first window goes to convertFirst:, all others to convertRest:,
       - drop every $- from the concatenated results,
       - drop every $0, except that a code which started with 0 keeps one
         leading 0,
       - squash runs of equal characters.
     NOTE(review): zeros are removed before equal neighbours are squashed;
     published descriptions of the algorithm appear to squash first -
     confirm that this order is intended."

    |text rawCodes startsWithZero|

    text := aString withoutSeparators asLowercase.
    text := text copyReplaceString:'ph' withString:'f'.
    (text includesAny:'öäüß') ifTrue:[
        text := text copyReplaceAll:$ü withAll:'u'.
        text := text copyReplaceAll:$ä withAll:'a'.
        text := text copyReplaceAll:$ö withAll:'o'.
        text := text copyReplaceAll:$ß withAll:'ss'.
    ].
    "the # sentinels give the first and the last letter a full window"
    text := '#' , (text select:[:ch | ch isLetter]) , '#'.

    rawCodes := ''.
    1 to:text size - 2 do:[:pos |
        |window code|

        window := text copyFrom:pos to:pos + 2.
        code := (pos == 1)
                    ifTrue:[ self convertFirst:window ]
                    ifFalse:[ self convertRest:window ].
        rawCodes := rawCodes , code
    ].

    rawCodes := rawCodes select:[:ch | ch ~= $-].

    startsWithZero := rawCodes startsWith:'0'.
    rawCodes := rawCodes select:[:ch | ch ~= $0].
    startsWithZero ifTrue:[
        rawCodes := '0' , rawCodes
    ].

    ^ String streamContents:[:out |
        |previous|

        rawCodes do:[:digit |
            digit ~= previous ifTrue:[
                out nextPut:digit
            ].
            previous := digit.
        ].
    ]
|
5105 |
||
5106 |
" |
|
5107 |
#( |
|
5108 |
'Müller' |
|
5109 |
'Miller' |
|
5110 |
'Mueller' |
|
5111 |
'Mühler' |
|
5112 |
'Mühlherr' |
|
5113 |
'Mülherr' |
|
5114 |
'Myler' |
|
5115 |
'Millar' |
|
5116 |
'Myller' |
|
5117 |
'Müllar' |
|
5118 |
'Müler' |
|
5119 |
'Muehler' |
|
5120 |
'Mülller' |
|
5121 |
'Müllerr' |
|
5122 |
'Muehlherr' |
|
5123 |
'Muellar' |
|
5124 |
'Mueler' |
|
5125 |
'Mülleer' |
|
5126 |
'Mueller' |
|
5127 |
'Nüller' |
|
5128 |
'Nyller' |
|
5129 |
'Niler' |
|
5130 |
'Czerny' |
|
5131 |
'Tscherny' |
|
5132 |
'Czernie' |
|
5133 |
'Tschernie' |
|
5134 |
'Schernie' |
|
5135 |
'Scherny' |
|
5136 |
'Scherno' |
|
5137 |
'Czerne' |
|
5138 |
'Zerny' |
|
5139 |
'Tzernie' |
|
5140 |
'Breschnew' |
|
5141 |
'Breschnew' |
|
5142 |
'Breschneff' |
|
5143 |
'Breschnjeff' |
|
5144 |
'Braeschneff' |
|
5145 |
'Braessneff' |
|
5146 |
'Pressneff' |
|
5147 |
'Presznäph' |
|
5148 |
'Präschnäf' |
|
5149 |
'Breschnjeff' |
|
5150 |
'Breschnijeff' |
|
5151 |
'Breschnieff' |
|
5152 |
) do:[:w | |
|
5153 |
Transcript show:w; show:'->'; showCR:(PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:w) |
|
5154 |
]. |
|
5155 |
" |
|
5156 |
||
5157 |
" |
|
5158 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Breschnew' -> '17863' |
|
5159 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Breschneff' -> '17863' |
|
5160 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Braeschneff' -> '17863' |
|
5161 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Braessneff' -> '17863' |
|
5162 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Pressneff' -> '17863' |
|
5163 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Presznäph' -> '17863' |
|
5164 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Präschnäf' -> '17863' |
|
5165 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Breschnjeff' -> '17863' |
|
5166 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Breschnijeff' -> '17863' |
|
5167 |
PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator new encode:'Breschnieff' -> '17863' |
|
5168 |
" |
|
5169 |
" |
|
5170 |
self basicNew encode:'müller' -> '657' |
|
5171 |
self basicNew encode:'möller' -> '657' |
|
5172 |
self basicNew encode:'miller' -> '657' |
|
5173 |
self basicNew encode:'muller' -> '657' |
|
5174 |
self basicNew encode:'muler' -> '657' |
|
5175 |
self basicNew encode:'schmidt' -> '862' |
|
5176 |
self basicNew encode:'schneider' -> '8627' |
|
5177 |
self basicNew encode:'fischer' -> '387' |
|
5178 |
self basicNew encode:'weber' -> '317' |
|
5179 |
self basicNew encode:'meyer' -> '67' |
|
5180 |
self basicNew encode:'wagner' -> '3467' |
|
5181 |
self basicNew encode:'schulz' -> '858' |
|
5182 |
self basicNew encode:'becker' -> '147' |
|
5183 |
self basicNew encode:'hoffmann' -> '036' |
|
5184 |
self basicNew encode:'schäfer' -> '837' |
|
5185 |
" |
|
5186 |
||
5187 |
"Created: / 28-07-2017 / 15:24:33 / cg" |
|
5188 |
! ! |
|
5189 |
||
5190 |
!PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator methodsFor:'private'! |
|
5191 |
||
5192 |
convertFirst:chars
    "answer the code string for the character at the start of a word.
     chars is inspected as a 3-element window: the element at index 2 is
     the one being coded, the element at index 3 is its successor.
     Handled here (in the old glob notation, # being any single element):
        #a# #e# #i# #j# #y# #o# #u#                     -> '0'
        #ca #ch #ck #cl #co #cq #cr #cu #cx             -> '4'
        #c# (any other successor)                       -> '8'
     Everything else, including a window whose size is not 3,
     is passed on to convertRest:"

    |current following|

    chars size == 3 ifFalse:[
        ^ self convertRest:chars
    ].

    current := chars at:2.
    ('aeijyou' includes:current) ifTrue:[
        ^ '0'
    ].
    current == $c ifTrue:[
        following := chars at:3.
        ^ ('ahkloqrux' includes:following) ifTrue:['4'] ifFalse:['8']
    ].

    ^ self convertRest:chars

    "Modified: / 29-07-2017 / 14:22:20 / cg"
|
5249 |
! |
|
5250 |
||
5251 |
convertRest:chars
    "answer the code string for the middle element of the 3-element window chars
     (index 1 = predecessor, index 2 = element being coded, index 3 = successor).
     The rules below are tried top to bottom and the first matching one wins,
     so their order matters.
     If chars does not have exactly 3 elements, an error is raised and
     '?' is answered if that error is proceeded."

    |rules|

    chars size == 3 ifFalse:[
        self error:'cannot happen'.
        ^ '?'
    ].

    "/ each rule is a triple; exactly one of the first two slots is used:
    "/      (prefixString nil          code)   chars starts with prefixString
    "/      (nil          suffixString code)   chars ends with suffixString
    "/      (nil          character    code)   character equals (chars at:2)
    "/ (these were glob patterns once - shown in the inline comments -
    "/  but glob matching turned out to be too slow)
    rules := #(
        (nil  'ds'  " '#ds' "   '8')
        (nil  'dc'  " '#dc' "   '8')
        (nil  'dz'  " '#dz' "   '8')
        (nil  'ts'  " '#ts' "   '8')
        (nil  'tc'  " '#tc' "   '8')
        (nil  'tz'  " '#tz' "   '8')
        (nil  $d    " '#d#' "   '2')
        (nil  $t    " '#t#' "   '2')
        ('cx' nil   " 'cx#' "   '8')
        ('kx' nil   " 'kx#' "   '8')
        ('qx' nil   " 'qx#' "   '8')
        (nil  $x    " '#x#' "   '48')
        ('sc' nil   " 'sc#' "   '8')
        ('sz' nil   " 'sz#' "   '8')
        (nil  'ca'  " '#ca' "   '4')
        (nil  'co'  " '#co' "   '4')
        (nil  'cu'  " '#cu' "   '4')
        (nil  'ch'  " '#ch' "   '4')
        (nil  'ck'  " '#ck' "   '4')
        (nil  'cx'  " '#cx' "   '4')
        (nil  'cq'  " '#cq' "   '4')
        (nil  $c    " '#c#' "   '8')
        (nil  $a    " '#a#' "   '0')
        (nil  $e    " '#e#' "   '0')
        (nil  $i    " '#i#' "   '0')
        (nil  $j    " '#j#' "   '0')
        (nil  $y    " '#y#' "   '0')
        (nil  $o    " '#o#' "   '0')
        (nil  $u    " '#u#' "   '0')
        (nil  $h    " '#h#' "   '-')
        (nil  $l    " '#l#' "   '5')
        (nil  $r    " '#r#' "   '7')
        (nil  $m    " '#m#' "   '6')
        (nil  $n    " '#n#' "   '6')
        (nil  $s    " '#s#' "   '8')
        (nil  $z    " '#z#' "   '8')
        (nil  $b    " '#b#' "   '1')
        (nil  $p    " '#p#' "   '1')
        (nil  $f    " '#f#' "   '3')
        (nil  $v    " '#v#' "   '3')
        (nil  $w    " '#w#' "   '3')
        (nil  $g    " '#g#' "   '4')
        (nil  $k    " '#k#' "   '4')
        (nil  $q    " '#q#' "   '4')
        (nil  nil   " '###' "   '?')
    ).

    rules do:[:rule |
        |prefix key code|

        prefix := rule at:1.
        key := rule at:2.
        code := rule at:3.

        prefix notNil ifTrue:[
            "/ predecessor + current element
            (chars startsWith:prefix) ifTrue:[^ code].
        ] ifFalse:[
            key isCharacter ifTrue:[
                "/ current element alone
                (chars at:2) == key ifTrue:[^ code].
            ] ifFalse:[
                "/ neither prefix nor key given: the catch-all rule
                key isString ifFalse:[^ '?'].
                "/ current element + successor
                (chars endsWith:key) ifTrue:[^ code].
            ].
        ].
    ].

    "/ not reached as long as the table ends with the catch-all rule
    self error:'cannot happen'

    "Modified: / 29-07-2017 / 14:17:38 / cg"
|
2208 | 5338 |
! ! |
5339 |
||
5340 |
!PhoneticStringUtilities::MiracodeStringComparator class methodsFor:'documentation'! |
|
5341 |
||
5342 |
documentation |
|
5343 |
" |
|
4489 | 5344 |
Miracode (also called << American Soundex >>) is like Soundex with the |
5345 |
addition that h and w are discarded if they separate consonants. |
|
5346 |
||
5347 |
These variants may be specifically important because they were used in |
|
5348 |
U.S. National Archives. |
|
5349 |
Most archive data were encoded with Miracode, |
|
5350 |
but there are some (older) entries encoded with Simplified Soundex. |
|
5351 |
||
5352 |
The HW-rule was documented as a standard in 1910, |
|
5353 |
but actually data of 1880, 1900 and 1910 |
|
3185
9833bbba2050
class: PhoneticStringUtilities
Claus Gittinger <cg@exept.de>
parents:
2580
diff
changeset
|
5354 |
censuses were encoded with mixed methods. |
4489 | 5355 |
|
5356 |
self new encode:'washington' -> 'W252' |
|
5357 |
self new encode:'lee' -> 'L000' |
|
5358 |
self new encode:'Gutierrez' -> 'G362' |
|
5359 |
self new encode:'Pfister' -> 'P236' |
|
5360 |
self new encode:'Jackson' -> 'J250' |
|
5361 |
self new encode:'Tymczak' -> 'T522' |
|
5362 |
||
5363 |
notice: |
|
4491 | 5364 |
MiracodeStringComparator new encode:'Ashcraft' -> 'A261' |
5365 |
SoundexStringComparator new encode:'Ashcraft' -> 'A226' |
|
4489 | 5366 |
|
5367 |
see also: |
|
5368 |
https://www.archives.gov/research/census/soundex.html |
|
2208 | 5369 |
" |
5370 |
! ! |
|
5371 |
||
4491 | 5372 |
!PhoneticStringUtilities::MiracodeStringComparator methodsFor:'private'! |
2208 | 5373 |
|
4488 | 5374 |
encode:word
    "answer the 4-character code for word: its first character (uppercased),
     followed by up to three codes as answered by translate:,
     filled up with '0' on the right.
     A code is only appended if it is non-nil, not '0' and differs from the
     code remembered from the previous character.
     In contrast to the inherited variant, W and H do not replace the
     remembered code, so equal codes separated only by W or H collapse into one."

    |upper code lastCode|

    upper := word asUppercase.
    code := upper first asString.
    lastCode := self translate:upper first.

    2 to:upper size do:[:idx |
        |ch digit|

        ch := upper at:idx.
        digit := self translate:ch.
        (digit isNil or:[digit = '0' or:[digit = lastCode]]) ifFalse:[
            code := code , digit.
            code size == 4 ifTrue:[^ code].
        ].
        "/ W and H are transparent: keep what was remembered before them
        (ch = $W or:[ch = $H]) ifFalse:[
            lastCode := digit.
        ].
    ].

    [code size < 4] whileTrue:[
        code := code , '0'
    ].
    ^ code copyFrom:1 to:4

    "Created: / 02-08-2017 / 00:19:47 / cg"
    "Modified (comment): / 02-08-2017 / 14:30:47 / cg"
|
4489 | 5401 |
! ! |
5402 |
||
5403 |
!PhoneticStringUtilities::SpanishPhoneticCodeStringComparator class methodsFor:'documentation'! |
|
5404 |
||
5405 |
documentation |
|
5406 |
" |
|
5407 |
The 'Spanish Phonetic' code is for the Spanish language |
|
5408 |
what the soundex code is for english: |
|
5409 |
it returns similar strings for similar sounding words |
|
5410 |
(but is specifically aware of the pronunciation of spanish) . |
|
5411 |
||
5412 |
There are some other differences to soundex, though: |
|
5413 |
its length is not limited to 4, but depends on the length of the original string; |
|
5414 |
it does not start with the first character of the input, |
|
5415 |
but returns a pure numeric string, |
|
5416 |
it uses different character groups |
|
5417 |
||
5418 |
This algorithm was described by María del Pilar Angeles, Adrian Espino-Gamez, |
|
5419 |
and Jonathan Gil-Moncada, in |
|
5420 |
'Comparison of a Modified Spanish phonetic, |
|
5421 |
Soundex, and Phonex coding functions during data matching process' |
|
5422 |
See https://www.researchgate.net/publication/285589803_Comparison_of_a_Modified_Spanish_Phonetic_Soundex_and_Phonex_coding_functions_during_data_matching_process |
|
5423 |
||
5424 |
" |
|
5425 |
! |
|
5426 |
||
5427 |
examples |
|
5428 |
" |
|
5429 |
words sounding similar (spanish pronunciation) will deliver a similar code: |
|
5430 |
||
5431 |
#( |
|
5432 |
'María' |
|
5433 |
'Pilar' |
|
5434 |
'Angeles' |
|
5435 |
'Adrian' |
|
5436 |
'Gamez' |
|
5437 |
) do:[:w | |
|
5438 |
Transcript show:w; show:'->'; showCR:(PhoneticStringUtilities::SpanishPhoneticCodeStringComparator new encode:w) |
|
5439 |
]. |
|
5440 |
" |
|
5441 |
! ! |
|
5442 |
||
5443 |
!PhoneticStringUtilities::SpanishPhoneticCodeStringComparator methodsFor:'api'! |
|
5444 |
||
5445 |
encode: aString
    "return a spanish phonetic code.
     The spanishPhonetic code is for the spanish language what the soundex code is for english;
     it returns similar strings for similar sounding words.
     There are some differences to soundex, though:
        its length is not limited to 4, but depends on the length of the original string;
        it does not start with the first character of the input,
        it uses different character groups.
     This algorithm is described by María del Pilar Angeles, Adrian Espino-Gamez,
     Jonathan Gil-Moncada.

     The argument is stripped of leading/trailing separators and uppercased;
     then every character which differs from its immediate predecessor and
     belongs to one of the groups below contributes that group's digit.
     Characters outside all groups (vowels, for example) contribute nothing,
     but still count as predecessor for the duplicate check."

    |in|

    in := aString withoutSeparators asUppercase.

    ^ String streamContents:[:out |
        |prev|

        in do:[:ch |
            "/ compare by value, not identity, so that a doubled character is
            "/ recognized even if the two character objects are not identical
            ch = prev ifFalse:[
                "/ the groups are disjoint, so at most one of them emits a digit.
                "/ Group 6 used to contain N-acute only, so the spanish N-tilde was
                "/ never coded; N-acute is kept for backward compatibility.
                #(
                    ('P'     $0)
                    ('BV'    $1)
                    ('FH'    $2)
                    ('DT'    $3)
                    ('SZCX'  $4)
                    ('YL'    $5)
                    ('NÑŃM'  $6)
                    ('QK'    $7)
                    ('GJ'    $8)
                    ('R'     $9)
                ) do:[:group |
                    ((group at:1) includes:ch) ifTrue:[
                        out nextPut:(group at:2).
                    ].
                ].
                prev := ch.
            ].
        ].
    ].

    "
     self new encode:'Jose'     -> '84'
     self new encode:'Pilar'    -> '059'
     self new encode:'Niño'     -> '66'
    "

    "Created: / 28-07-2017 / 15:24:33 / cg"
    "Modified: / 01-08-2017 / 18:48:50 / cg"
|
5497 |
! ! |
|
5498 |
||
5499 |
!PhoneticStringUtilities::SpanishPhoneticCodeStringComparator methodsFor:'private'! |
|
5500 |
||
5501 |
convertFirst:chars
    "answer the code string for the character at the start of a word.
     chars is inspected as a 3-element window: the element at index 2 is
     the one being coded, the element at index 3 is its successor.
     Handled here (in the old glob notation, # being any single element):
        #a# #e# #i# #j# #y# #o# #u#                     -> '0'
        #ca #ch #ck #cl #co #cq #cr #cu #cx             -> '4'
        #c# (any other successor)                       -> '8'
     Everything else, including a window whose size is not 3,
     is passed on to convertRest:

     NOTE(review): this is not sent by encode: of this class; it looks like
     a leftover copy of the Koelner comparator's method - to be confirmed."

    |current following|

    chars size == 3 ifFalse:[
        ^ self convertRest:chars
    ].

    current := chars at:2.
    ('aeijyou' includes:current) ifTrue:[
        ^ '0'
    ].
    current == $c ifTrue:[
        following := chars at:3.
        ^ ('ahkloqrux' includes:following) ifTrue:['4'] ifFalse:['8']
    ].

    ^ self convertRest:chars

    "Modified: / 29-07-2017 / 14:22:20 / cg"
|
5558 |
! |
|
5559 |
||
5560 |
convertRest:chars
    "answer the code string for the middle element of the 3-element window chars
     (index 1 = predecessor, index 2 = element being coded, index 3 = successor).
     The rules below are tried top to bottom and the first matching one wins,
     so their order matters.
     If chars does not have exactly 3 elements, an error is raised and
     '?' is answered if that error is proceeded.

     NOTE(review): this is not sent by encode: of this class; it looks like
     a leftover copy of the Koelner comparator's method - to be confirmed."

    |rules|

    chars size == 3 ifFalse:[
        self error:'cannot happen'.
        ^ '?'
    ].

    "/ each rule is a triple; exactly one of the first two slots is used:
    "/      (prefixString nil          code)   chars starts with prefixString
    "/      (nil          suffixString code)   chars ends with suffixString
    "/      (nil          character    code)   character equals (chars at:2)
    "/ (these were glob patterns once - shown in the inline comments -
    "/  but glob matching turned out to be too slow)
    rules := #(
        (nil  'ds'  " '#ds' "   '8')
        (nil  'dc'  " '#dc' "   '8')
        (nil  'dz'  " '#dz' "   '8')
        (nil  'ts'  " '#ts' "   '8')
        (nil  'tc'  " '#tc' "   '8')
        (nil  'tz'  " '#tz' "   '8')
        (nil  $d    " '#d#' "   '2')
        (nil  $t    " '#t#' "   '2')
        ('cx' nil   " 'cx#' "   '8')
        ('kx' nil   " 'kx#' "   '8')
        ('qx' nil   " 'qx#' "   '8')
        (nil  $x    " '#x#' "   '48')
        ('sc' nil   " 'sc#' "   '8')
        ('sz' nil   " 'sz#' "   '8')
        (nil  'ca'  " '#ca' "   '4')
        (nil  'co'  " '#co' "   '4')
        (nil  'cu'  " '#cu' "   '4')
        (nil  'ch'  " '#ch' "   '4')
        (nil  'ck'  " '#ck' "   '4')
        (nil  'cx'  " '#cx' "   '4')
        (nil  'cq'  " '#cq' "   '4')
        (nil  $c    " '#c#' "   '8')
        (nil  $a    " '#a#' "   '0')
        (nil  $e    " '#e#' "   '0')
        (nil  $i    " '#i#' "   '0')
        (nil  $j    " '#j#' "   '0')
        (nil  $y    " '#y#' "   '0')
        (nil  $o    " '#o#' "   '0')
        (nil  $u    " '#u#' "   '0')
        (nil  $h    " '#h#' "   '-')
        (nil  $l    " '#l#' "   '5')
        (nil  $r    " '#r#' "   '7')
        (nil  $m    " '#m#' "   '6')
        (nil  $n    " '#n#' "   '6')
        (nil  $s    " '#s#' "   '8')
        (nil  $z    " '#z#' "   '8')
        (nil  $b    " '#b#' "   '1')
        (nil  $p    " '#p#' "   '1')
        (nil  $f    " '#f#' "   '3')
        (nil  $v    " '#v#' "   '3')
        (nil  $w    " '#w#' "   '3')
        (nil  $g    " '#g#' "   '4')
        (nil  $k    " '#k#' "   '4')
        (nil  $q    " '#q#' "   '4')
        (nil  nil   " '###' "   '?')
    ).

    rules do:[:rule |
        |prefix key code|

        prefix := rule at:1.
        key := rule at:2.
        code := rule at:3.

        prefix notNil ifTrue:[
            "/ predecessor + current element
            (chars startsWith:prefix) ifTrue:[^ code].
        ] ifFalse:[
            key isCharacter ifTrue:[
                "/ current element alone
                (chars at:2) == key ifTrue:[^ code].
            ] ifFalse:[
                "/ neither prefix nor key given: the catch-all rule
                key isString ifFalse:[^ '?'].
                "/ current element + successor
                (chars endsWith:key) ifTrue:[^ code].
            ].
        ].
    ].

    "/ not reached as long as the table ends with the catch-all rule
    self error:'cannot happen'

    "Modified: / 29-07-2017 / 14:17:38 / cg"
|
2208 | 5647 |
! ! |
5648 |
||
2197 | 5649 |
!PhoneticStringUtilities class methodsFor:'documentation'! |
5650 |
||
5651 |
version |
"answer this file's revision string; the literal looks like a version-control keyword placeholder which is expanded on checkout - TODO confirm"

3646 | 5652 |
^ '$Header$' |
2285 | 5653 |
! |
5654 |
||
5655 |
version_CVS |
"answer this file's revision string as kept by CVS (going by the selector name); answers the same literal as #version - TODO confirm how it gets expanded"

3646 | 5656 |
^ '$Header$' |
2197 | 5657 |
! ! |
3185
9833bbba2050
class: PhoneticStringUtilities
Claus Gittinger <cg@exept.de>
parents:
2580
diff
changeset
|
5658 |