514 "/ please change as required (and remove this comment) |
576 "/ please change as required (and remove this comment) |
515 |
577 |
516 "/ super initialize. -- commented since inherited method does nothing |
578 "/ super initialize. -- commented since inherited method does nothing |
517 ! ! |
579 ! ! |
518 |
580 |
519 !PhoneticStringUtilities::ExtendedSoundexStringComparator class methodsFor:'documentation'! |
581 !PhoneticStringUtilities::DaitchMokotoffStringComparator class methodsFor:'documentation'! |
520 |
582 |
521 documentation |
583 documentation |
522 " |
584 " |
523 There are many extended and enhanced soundex variants around; |
585 self encode:'AUERBACH' -> 097400, 097500 |
524 here is one, called 'extended soundex'. It is destribed for example in |
586 |
525 http://www.epidata.dk/documentation.php. |
587 Encodes a string into a Daitch-Mokotoff Soundex value. |
526 An author or origin is unknown. |
588 The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, |
527 |
589 yielding greater accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation |
528 The number of digits is increased to 5 or 8; |
590 but differences in spelling. |
529 The first character is not used literally; instead it is encoded like the rest. |
591 |
530 This might have a negative effect on names starting with a vovel, though. |
592 The main differences compared to the other soundex variants are: |
531 |
593 - coded names are 6 digits long |
532 Overall, it can be doubted if this is really an enhancement after all. |
594 - the initial character of the name is coded |
|
595 - rules to encoded multi-character n-grams |
|
596 - multiple possible encodings for the same name (branching) |
|
597 |
|
598 This implementation supports branching, depending on the used method: |
|
599 encode:aString - branching disabled, only the first code will be returned |
|
600 phoneticStringsFor:String - branching enabled, all codes will be returned, separated by '|' |
|
601 |
|
602 [see also:] |
|
603 'Wikipedia - Daitch-Mokotoff Soundex' |
|
604 http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex |
|
605 |
|
606 'Avotaynu - Soundexing and Genealogy' |
|
607 http://www.avotaynu.com/soundex.htm |
533 " |
608 " |
534 ! ! |
609 ! |
535 |
610 |
536 !PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'api'! |
611 javaCode |
537 |
|
538 phoneticStringsFor:aString |
|
539 "generates both an extended soundex of length 5 and one of length 8" |
|
540 |
|
541 |first second u t prevCode| |
|
542 |
|
543 u := aString asUppercase. |
|
544 first := second := ''. |
|
545 u do:[:c | |
|
546 t := self translate:c. |
|
547 (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[ |
|
548 first := first , t. |
|
549 second := second , t. |
|
550 second size == 8 ifTrue:[ |
|
551 ^ Array with:(first copyTo:5) with:second |
|
552 ]. |
|
553 ]. |
|
554 prevCode := t |
|
555 ]. |
|
556 [ first size < 5 ] whileTrue:[ |
|
557 first := first , '0'. |
|
558 second := second , '0'. |
|
559 ]. |
|
560 [ second size < 8 ] whileTrue:[ |
|
561 second := second , '0' |
|
562 ]. |
|
563 ^ Array with:first with:second |
|
564 |
|
565 " |
|
566 self basicNew phoneticStringsFor:'müller' #('87900' '87900000') |
|
567 self basicNew phoneticStringsFor:'miller' #('87900' '87900000') |
|
568 self basicNew phoneticStringsFor:'muller' #('87900' '87900000') |
|
569 self basicNew phoneticStringsFor:'muler' #('87900' '87900000') |
|
570 self basicNew phoneticStringsFor:'schmidt' #('38600' '38600000') |
|
571 self basicNew phoneticStringsFor:'schneider' #('38690' '38690000') |
|
572 self basicNew phoneticStringsFor:'fischer' #('23900' '23900000') |
|
573 self basicNew phoneticStringsFor:'weber' #('19000' '19000000') |
|
574 self basicNew phoneticStringsFor:'meyer' #('89000' '89000000') |
|
575 self basicNew phoneticStringsFor:'wagner' #('48900' '48900000') |
|
576 self basicNew phoneticStringsFor:'schulz' #('37500' '37500000') |
|
577 self basicNew phoneticStringsFor:'becker' #('13900' '13900000') |
|
578 self basicNew phoneticStringsFor:'hoffmann' #('28800' '28800000') |
|
579 self basicNew phoneticStringsFor:'schäfer' #('32900' '32900000') |
|
580 " |
|
581 ! ! |
|
582 |
|
583 !PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'private'! |
|
584 |
|
585 translate:aCharacter |
|
586 "use simple if's for more speed when compiled" |
|
587 |
|
588 "vowels serve as separators" |
|
589 aCharacter == $A ifTrue:[^ '0' ]. |
|
590 aCharacter == $E ifTrue:[^ '0' ]. |
|
591 aCharacter == $I ifTrue:[^ '0' ]. |
|
592 aCharacter == $O ifTrue:[^ '0' ]. |
|
593 aCharacter == $U ifTrue:[^ '0' ]. |
|
594 aCharacter == $Y ifTrue:[^ '0' ]. |
|
595 |
|
596 aCharacter == $B ifTrue:[^ '1' ]. |
|
597 aCharacter == $P ifTrue:[^ '1' ]. |
|
598 |
|
599 aCharacter == $F ifTrue:[^ '2' ]. |
|
600 aCharacter == $V ifTrue:[^ '2' ]. |
|
601 |
|
602 aCharacter == $C ifTrue:[^ '3' ]. |
|
603 aCharacter == $S ifTrue:[^ '3' ]. |
|
604 aCharacter == $K ifTrue:[^ '3' ]. |
|
605 |
|
606 aCharacter == $G ifTrue:[^ '4' ]. |
|
607 aCharacter == $J ifTrue:[^ '4' ]. |
|
608 |
|
609 aCharacter == $Q ifTrue:[^ '5' ]. |
|
610 aCharacter == $X ifTrue:[^ '5' ]. |
|
611 aCharacter == $Z ifTrue:[^ '5' ]. |
|
612 |
|
613 aCharacter == $D ifTrue:[^ '6' ]. |
|
614 aCharacter == $G ifTrue:[^ '6' ]. |
|
615 aCharacter == $T ifTrue:[^ '6' ]. |
|
616 |
|
617 aCharacter == $L ifTrue:[^ '7' ]. |
|
618 |
|
619 aCharacter == $M ifTrue:[^ '8' ]. |
|
620 aCharacter == $N ifTrue:[^ '8' ]. |
|
621 |
|
622 aCharacter == $R ifTrue:[^ '9' ]. |
|
623 ^ nil |
|
624 ! ! |
|
625 |
|
626 !PhoneticStringUtilities::SingleResultPhoneticStringComparator class methodsFor:'documentation'! |
|
627 |
|
628 documentation |
|
629 " |
|
630 documentation to be added. |
|
631 |
|
632 [author:] |
|
633 cg |
|
634 |
|
635 [instance variables:] |
|
636 |
|
637 [class variables:] |
|
638 |
|
639 [see also:] |
|
640 |
|
641 " |
|
642 ! ! |
|
643 |
|
644 !PhoneticStringUtilities::SingleResultPhoneticStringComparator methodsFor:'api'! |
|
645 |
|
646 encode:word |
|
647 ^ self subclassResponsibility |
|
648 |
|
649 "Created: / 28-07-2017 / 15:20:49 / cg" |
|
650 ! |
|
651 |
|
652 phoneticStringsFor:word |
|
653 ^ Array with:(self encode:word) |
|
654 |
|
655 "Created: / 28-07-2017 / 15:20:38 / cg" |
|
656 ! ! |
|
657 |
|
658 !PhoneticStringUtilities::MRAStringComparator class methodsFor:'documentation'! |
|
659 |
|
660 documentation |
|
661 " |
|
662 Match Rating Approach Encoder |
|
663 |
|
664 The Western Airlines matching rating approach name encoder |
|
665 |
|
666 [see also:] |
|
667 https://en.wikipedia.org/wiki/Match_Rating_Approach |
|
668 |
|
669 G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery, |
|
670 ''Accessing Individual Records from Personal Data Files Using Nonunique Identifiers'' |
|
671 US National Institute of Standards and Technology, SP-500-2 (1977), p. 17. |
|
672 " |
|
673 ! |
|
674 |
|
675 rCode |
|
676 "<<END |
612 "<<END |
677 ## Copyright (c) 2015, James P. Howard, II <jh@jameshoward.us> |
613 /* |
678 ## |
614 * Licensed to the Apache Software Foundation (ASF) under one or more |
679 ## Redistribution and use in source and binary forms, with or without |
615 * contributor license agreements. See the NOTICE file distributed with |
680 ## modification, are permitted provided that the following conditions are |
616 * this work for additional information regarding copyright ownership. |
681 ## met: |
617 * The ASF licenses this file to You under the Apache License, Version 2.0 |
682 ## |
618 * (the "License"); you may not use this file except in compliance with |
683 ## Redistributions of source code must retain the above copyright |
619 * the License. You may obtain a copy of the License at |
684 ## notice, this list of conditions and the following disclaimer. |
620 * |
685 ## |
621 * http://www.apache.org/licenses/LICENSE-2.0 |
686 ## Redistributions in binary form must reproduce the above copyright |
622 * |
687 ## notice, this list of conditions and the following disclaimer in |
623 * Unless required by applicable law or agreed to in writing, software |
688 ## the documentation and/or other materials provided with the |
624 * distributed under the License is distributed on an "AS IS" BASIS, |
689 ## distribution. |
625 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
690 ## |
626 * See the License for the specific language governing permissions and |
691 ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
627 * limitations under the License. |
692 ## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
628 */ |
693 ## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
629 package org.apache.commons.codec.language; |
694 ## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
630 |
695 ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
631 import org.apache.commons.codec.CharEncoding; |
696 ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
632 import org.apache.commons.codec.EncoderException; |
697 ## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
633 import org.apache.commons.codec.StringEncoder; |
698 ## DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
634 |
699 ## THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
635 import java.io.InputStream; |
700 ## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
636 import java.util.*; |
701 ## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
637 |
702 |
638 /** |
703 #' @rdname mra |
639 * Encodes a string into a Daitch-Mokotoff Soundex value. |
704 #' @title Match Rating Approach Encoder |
640 * <p> |
705 #' |
641 * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater |
706 #' @description |
642 * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling. |
707 #' The Western Airlines matching rating approach name encoder |
643 * </p> |
708 #' |
644 * <p> |
709 #' @param word string or vector of strings to encode |
645 * The main differences compared to the other soundex variants are: |
710 #' @param x MRA-encoded character vector |
646 * </p> |
711 #' @param y MRA-encoded character vector |
647 * <ul> |
712 #' |
648 * <li>coded names are 6 digits long |
713 #' @details |
649 * <li>the initial character of the name is coded |
714 #' |
650 * <li>rules to encoded multi-character n-grams |
715 #' The variable \code{word} is the name to be encoded. The variable |
651 * <li>multiple possible encodings for the same name (branching) |
716 #' \code{maxCodeLen} is \emph{not} supported in this algorithm encoder |
652 * </ul> |
717 #' because the algorithm itself is dependent upon its six-character |
653 * <p> |
718 #' length. The variables \code{x} and \code{y} are MRA-encoded and are |
654 * This implementation supports branching, depending on the used method: |
719 #' compared to each other using the MRA comparison specification. |
655 * <ul> |
720 #' |
656 * <li>{@link #encode(String)} - branching disabled, only the first code will be returned |
721 #' @return The \code{mra_encode} function returns match rating approach |
657 * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|' |
722 #' encoded character vector. The \code{mra_compare} returns a boolean |
658 * </ul> |
723 #' vector which is \code{TRUE} if \code{x} and \code{y} pass the MRA |
659 * <p> |
724 #' comparison test. |
660 * Note: this implementation has additional branching rules compared to the original description of the algorithm. The |
725 #' |
661 * rules can be customized by overriding the default rules contained in the resource file |
726 #' @references |
662 * {@code org/apache/commons/codec/language/dmrules.txt}. |
727 #' |
663 * </p> |
728 #' G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery, |
664 * <p> |
729 #' \emph{Accessing Individual Records from Personal Data Files Using |
665 * This class is thread-safe. |
730 #' Nonunique Identifiers,} US National Institute of Standards and |
666 * </p> |
731 #' Technology, SP-500-2 (1977), p. 17. |
667 * |
732 #' |
668 * @see Soundex |
733 #' @family phonics |
669 * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a> |
734 #' |
670 * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a> |
735 #' @examples |
671 * |
736 #' mra_encode("William") |
672 * @version $Id$ |
737 #' mra_encode(c("Peter", "Peady")) |
673 * @since 1.10 |
738 #' mra_encode("Stevenson") |
674 */ |
739 |
675 public class DaitchMokotoffSoundex implements StringEncoder { |
740 #' @rdname mra |
676 |
741 #' @name mra_encode |
677 /** |
742 #' @export |
678 * Inner class representing a branch during DM soundex encoding. |
743 mra_encode <- function(word) { |
679 */ |
744 |
680 private static final class Branch { |
745 ## First, remove any nonalphabetical characters and uppercase it |
681 private final StringBuilder builder; |
746 word <- gsub("[^[:alpha:]]*", "", word) |
682 private String cachedString; |
747 word <- toupper(word) |
683 private String lastReplacement; |
748 |
684 |
749 ## First character of key = first character of name |
685 private Branch() { |
750 first <- substr(word, 1, 1) |
686 builder = new StringBuilder(); |
751 word <- substr(word, 2, nchar(word)) |
687 lastReplacement = null; |
752 |
688 cachedString = null; |
753 ## Delete vowels not at the start of the word |
689 } |
754 word <- gsub("[AEIOU]", "", word) |
690 |
755 word <- paste(first, word, sep = "") |
691 /** |
756 |
692 * Creates a new branch, identical to this branch. |
757 ## Remove duplicate consecutive characters |
693 * |
758 word <- gsub("([A-Z])\\1+", "\\1", word) |
694 * @return a new, identical branch |
759 |
695 */ |
760 ## If longer than 6 characters, take first and last 3...and we have |
696 public Branch createBranch() { |
761 ## to vectorize it |
697 final Branch branch = new Branch(); |
762 for(i in 1:length(word)) { |
698 branch.builder.append(toString()); |
763 if((l = nchar(word[i])) > 6) { |
699 branch.lastReplacement = this.lastReplacement; |
764 first <- substr(word[i], 1, 3) |
700 return branch; |
765 last <- substr(word[i], l - 2, l) |
701 } |
766 word[i] <- paste(first, last, sep = ""); |
702 |
|
703 @Override |
|
704 public boolean equals(final Object other) { |
|
705 if (this == other) { |
|
706 return true; |
|
707 } |
|
708 if (!!(other instanceof Branch)) { |
|
709 return false; |
|
710 } |
|
711 |
|
712 return toString().equals(((Branch) other).toString()); |
|
713 } |
|
714 |
|
715 /** |
|
716 * Finish this branch by appending '0's until the maximum code length has been reached. |
|
717 */ |
|
718 public void finish() { |
|
719 while (builder.length() < MAX_LENGTH) { |
|
720 builder.append('0'); |
|
721 cachedString = null; |
|
722 } |
|
723 } |
|
724 |
|
725 @Override |
|
726 public int hashCode() { |
|
727 return toString().hashCode(); |
|
728 } |
|
729 |
|
730 /** |
|
731 * Process the next replacement to be added to this branch. |
|
732 * |
|
733 * @param replacement |
|
734 * the next replacement to append |
|
735 * @param forceAppend |
|
736 * indicates if the default processing shall be overridden |
|
737 */ |
|
738 public void processNextReplacement(final String replacement, final boolean forceAppend) { |
|
739 final boolean append = lastReplacement == null || !!lastReplacement.endsWith(replacement) || forceAppend; |
|
740 |
|
741 if (append && builder.length() < MAX_LENGTH) { |
|
742 builder.append(replacement); |
|
743 // remove all characters after the maximum length |
|
744 if (builder.length() > MAX_LENGTH) { |
|
745 builder.delete(MAX_LENGTH, builder.length()); |
|
746 } |
|
747 cachedString = null; |
|
748 } |
|
749 |
|
750 lastReplacement = replacement; |
|
751 } |
|
752 |
|
753 @Override |
|
754 public String toString() { |
|
755 if (cachedString == null) { |
|
756 cachedString = builder.toString(); |
|
757 } |
|
758 return cachedString; |
767 } |
759 } |
768 } |
760 } |
769 |
761 |
770 return(word) |
762 /** |
|
763 * Inner class for storing rules. |
|
764 */ |
|
765 private static final class Rule { |
|
766 private final String pattern; |
|
767 private final String[] replacementAtStart; |
|
768 private final String[] replacementBeforeVowel; |
|
769 private final String[] replacementDefault; |
|
770 |
|
771 protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel, |
|
772 final String replacementDefault) { |
|
773 this.pattern = pattern; |
|
774 this.replacementAtStart = replacementAtStart.split("\\|"); |
|
775 this.replacementBeforeVowel = replacementBeforeVowel.split("\\|"); |
|
776 this.replacementDefault = replacementDefault.split("\\|"); |
|
777 } |
|
778 |
|
779 public int getPatternLength() { |
|
780 return pattern.length(); |
|
781 } |
|
782 |
|
783 public String[] getReplacements(final String context, final boolean atStart) { |
|
784 if (atStart) { |
|
785 return replacementAtStart; |
|
786 } |
|
787 |
|
788 final int nextIndex = getPatternLength(); |
|
789 final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false; |
|
790 if (nextCharIsVowel) { |
|
791 return replacementBeforeVowel; |
|
792 } |
|
793 |
|
794 return replacementDefault; |
|
795 } |
|
796 |
|
797 private boolean isVowel(final char ch) { |
|
798 return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u'; |
|
799 } |
|
800 |
|
801 public boolean matches(final String context) { |
|
802 return context.startsWith(pattern); |
|
803 } |
|
804 |
|
805 @Override |
|
806 public String toString() { |
|
807 return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart), |
|
808 Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault)); |
|
809 } |
|
810 } |
|
811 |
|
812 private static final String COMMENT = "//"; |
|
813 private static final String DOUBLE_QUOTE = "\""; |
|
814 |
|
815 private static final String MULTILINE_COMMENT_END = "*/"; |
|
816 |
|
817 private static final String MULTILINE_COMMENT_START = "/*"; |
|
818 |
|
819 /** The resource file containing the replacement and folding rules */ |
|
820 private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt"; |
|
821 |
|
822 /** The code length of a DM soundex value. */ |
|
823 private static final int MAX_LENGTH = 6; |
|
824 |
|
825 /** Transformation rules indexed by the first character of their pattern. */ |
|
826 private static final Map<Character, List<Rule>> RULES = new HashMap<Character, List<Rule>>(); |
|
827 |
|
828 /** Folding rules. */ |
|
829 private static final Map<Character, Character> FOLDINGS = new HashMap<Character, Character>(); |
|
830 |
|
831 static { |
|
832 final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE); |
|
833 if (rulesIS == null) { |
|
834 throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE); |
|
835 } |
|
836 |
|
837 final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8); |
|
838 parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS); |
|
839 scanner.close(); |
|
840 |
|
841 // sort RULES by pattern length in descending order |
|
842 for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) { |
|
843 final List<Rule> ruleList = rule.getValue(); |
|
844 Collections.sort(ruleList, new Comparator<Rule>() { |
|
845 @Override |
|
846 public int compare(final Rule rule1, final Rule rule2) { |
|
847 return rule2.getPatternLength() - rule1.getPatternLength(); |
|
848 } |
|
849 }); |
|
850 } |
|
851 } |
|
852 |
|
853 private static void parseRules(final Scanner scanner, final String location, |
|
854 final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) { |
|
855 int currentLine = 0; |
|
856 boolean inMultilineComment = false; |
|
857 |
|
858 while (scanner.hasNextLine()) { |
|
859 currentLine++; |
|
860 final String rawLine = scanner.nextLine(); |
|
861 String line = rawLine; |
|
862 |
|
863 if (inMultilineComment) { |
|
864 if (line.endsWith(MULTILINE_COMMENT_END)) { |
|
865 inMultilineComment = false; |
|
866 } |
|
867 continue; |
|
868 } |
|
869 |
|
870 if (line.startsWith(MULTILINE_COMMENT_START)) { |
|
871 inMultilineComment = true; |
|
872 } else { |
|
873 // discard comments |
|
874 final int cmtI = line.indexOf(COMMENT); |
|
875 if (cmtI >= 0) { |
|
876 line = line.substring(0, cmtI); |
|
877 } |
|
878 |
|
879 // trim leading-trailing whitespace |
|
880 line = line.trim(); |
|
881 |
|
882 if (line.length() == 0) { |
|
883 continue; // empty lines can be safely skipped |
|
884 } |
|
885 |
|
886 if (line.contains("=")) { |
|
887 // folding |
|
888 final String[] parts = line.split("="); |
|
889 if (parts.length !!= 2) { |
|
890 throw new IllegalArgumentException("Malformed folding statement split into " + parts.length + |
|
891 " parts: " + rawLine + " in " + location); |
|
892 } else { |
|
893 final String leftCharacter = parts[0]; |
|
894 final String rightCharacter = parts[1]; |
|
895 |
|
896 if (leftCharacter.length() !!= 1 || rightCharacter.length() !!= 1) { |
|
897 throw new IllegalArgumentException("Malformed folding statement - " + |
|
898 "patterns are not single characters: " + rawLine + " in " + location); |
|
899 } |
|
900 |
|
901 asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0)); |
|
902 } |
|
903 } else { |
|
904 // rule |
|
905 final String[] parts = line.split("\\s+"); |
|
906 if (parts.length !!= 4) { |
|
907 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + |
|
908 " parts: " + rawLine + " in " + location); |
|
909 } else { |
|
910 try { |
|
911 final String pattern = stripQuotes(parts[0]); |
|
912 final String replacement1 = stripQuotes(parts[1]); |
|
913 final String replacement2 = stripQuotes(parts[2]); |
|
914 final String replacement3 = stripQuotes(parts[3]); |
|
915 |
|
916 final Rule r = new Rule(pattern, replacement1, replacement2, replacement3); |
|
917 final char patternKey = r.pattern.charAt(0); |
|
918 List<Rule> rules = ruleMapping.get(patternKey); |
|
919 if (rules == null) { |
|
920 rules = new ArrayList<Rule>(); |
|
921 ruleMapping.put(patternKey, rules); |
|
922 } |
|
923 rules.add(r); |
|
924 } catch (final IllegalArgumentException e) { |
|
925 throw new IllegalStateException( |
|
926 "Problem parsing line '" + currentLine + "' in " + location, e); |
|
927 } |
|
928 } |
|
929 } |
|
930 } |
|
931 } |
|
932 } |
|
933 |
|
934 private static String stripQuotes(String str) { |
|
935 if (str.startsWith(DOUBLE_QUOTE)) { |
|
936 str = str.substring(1); |
|
937 } |
|
938 |
|
939 if (str.endsWith(DOUBLE_QUOTE)) { |
|
940 str = str.substring(0, str.length() - 1); |
|
941 } |
|
942 |
|
943 return str; |
|
944 } |
|
945 |
|
946 /** Whether to use ASCII folding prior to encoding. */ |
|
947 private final boolean folding; |
|
948 |
|
949 /** |
|
950 * Creates a new instance with ASCII-folding enabled. |
|
951 */ |
|
952 public DaitchMokotoffSoundex() { |
|
953 this(true); |
|
954 } |
|
955 |
|
956 /** |
|
957 * Creates a new instance. |
|
958 * <p> |
|
959 * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g. |
|
960 * è -> e. |
|
961 * </p> |
|
962 * |
|
963 * @param folding |
|
964 * if ASCII-folding shall be performed before encoding |
|
965 */ |
|
966 public DaitchMokotoffSoundex(final boolean folding) { |
|
967 this.folding = folding; |
|
968 } |
|
969 |
|
970 /** |
|
971 * Performs a cleanup of the input string before the actual soundex transformation. |
|
972 * <p> |
|
973 * Removes all whitespace characters and performs ASCII folding if enabled. |
|
974 * </p> |
|
975 * |
|
976 * @param input |
|
977 * the input string to cleanup |
|
978 * @return a cleaned up string |
|
979 */ |
|
980 private String cleanup(final String input) { |
|
981 final StringBuilder sb = new StringBuilder(); |
|
982 for (char ch : input.toCharArray()) { |
|
983 if (Character.isWhitespace(ch)) { |
|
984 continue; |
|
985 } |
|
986 |
|
987 ch = Character.toLowerCase(ch); |
|
988 if (folding && FOLDINGS.containsKey(ch)) { |
|
989 ch = FOLDINGS.get(ch); |
|
990 } |
|
991 sb.append(ch); |
|
992 } |
|
993 return sb.toString(); |
|
994 } |
|
995 |
|
996 /** |
|
997 * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching. |
|
998 * <p> |
|
999 * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an |
|
1000 * EncoderException if the supplied object is not of type java.lang.String. |
|
1001 * </p> |
|
1002 * |
|
1003 * @see #soundex(String) |
|
1004 * |
|
1005 * @param obj |
|
1006 * Object to encode |
|
1007 * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String |
|
1008 * supplied. |
|
1009 * @throws EncoderException |
|
1010 * if the parameter supplied is not of type java.lang.String |
|
1011 * @throws IllegalArgumentException |
|
1012 * if a character is not mapped |
|
1013 */ |
|
1014 @Override |
|
1015 public Object encode(final Object obj) throws EncoderException { |
|
1016 if (!!(obj instanceof String)) { |
|
1017 throw new EncoderException( |
|
1018 "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String"); |
|
1019 } |
|
1020 return encode((String) obj); |
|
1021 } |
|
1022 |
|
1023 /** |
|
1024 * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching. |
|
1025 * |
|
1026 * @see #soundex(String) |
|
1027 * |
|
1028 * @param source |
|
1029 * A String object to encode |
|
1030 * @return A DM Soundex code corresponding to the String supplied |
|
1031 * @throws IllegalArgumentException |
|
1032 * if a character is not mapped |
|
1033 */ |
|
1034 @Override |
|
1035 public String encode(final String source) { |
|
1036 if (source == null) { |
|
1037 return null; |
|
1038 } |
|
1039 return soundex(source, false)[0]; |
|
1040 } |
|
1041 |
|
1042 /** |
|
1043 * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching. |
|
1044 * <p> |
|
1045 * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes, |
|
1046 * separated by '|'. |
|
1047 * </p> |
|
1048 * <p> |
|
1049 * Example: the name "AUERBACH" is encoded as both |
|
1050 * </p> |
|
1051 * <ul> |
|
1052 * <li>097400</li> |
|
1053 * <li>097500</li> |
|
1054 * </ul> |
|
1055 * <p> |
|
1056 * Thus the result will be "097400|097500". |
|
1057 * </p> |
|
1058 * |
|
1059 * @param source |
|
1060 * A String object to encode |
|
1061 * @return A string containing a set of DM Soundex codes corresponding to the String supplied |
|
1062 * @throws IllegalArgumentException |
|
1063 * if a character is not mapped |
|
1064 */ |
|
1065 public String soundex(final String source) { |
|
1066 final String[] branches = soundex(source, true); |
|
1067 final StringBuilder sb = new StringBuilder(); |
|
1068 int index = 0; |
|
1069 for (final String branch : branches) { |
|
1070 sb.append(branch); |
|
1071 if (++index < branches.length) { |
|
1072 sb.append('|'); |
|
1073 } |
|
1074 } |
|
1075 return sb.toString(); |
|
1076 } |
|
1077 |
|
1078 /** |
|
1079 * Perform the actual DM Soundex algorithm on the input string. |
|
1080 * |
|
1081 * @param source |
|
1082 * A String object to encode |
|
1083 * @param branching |
|
1084 * If branching shall be performed |
|
1085 * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the |
|
1086 * selected branching mode |
|
1087 */ |
|
1088 private String[] soundex(final String source, final boolean branching) { |
|
1089 if (source == null) { |
|
1090 return null; |
|
1091 } |
|
1092 |
|
1093 final String input = cleanup(source); |
|
1094 |
|
1095 final Set<Branch> currentBranches = new LinkedHashSet<Branch>(); |
|
1096 currentBranches.add(new Branch()); |
|
1097 |
|
1098 char lastChar = '\0'; |
|
1099 for (int index = 0; index < input.length(); index++) { |
|
1100 final char ch = input.charAt(index); |
|
1101 |
|
1102 // ignore whitespace inside a name |
|
1103 if (Character.isWhitespace(ch)) { |
|
1104 continue; |
|
1105 } |
|
1106 |
|
1107 final String inputContext = input.substring(index); |
|
1108 final List<Rule> rules = RULES.get(ch); |
|
1109 if (rules == null) { |
|
1110 continue; |
|
1111 } |
|
1112 |
|
1113 // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access |
|
1114 @SuppressWarnings("unchecked") |
|
1115 final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.EMPTY_LIST; |
|
1116 |
|
1117 for (final Rule rule : rules) { |
|
1118 if (rule.matches(inputContext)) { |
|
1119 if (branching) { |
|
1120 nextBranches.clear(); |
|
1121 } |
|
1122 final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0'); |
|
1123 final boolean branchingRequired = replacements.length > 1 && branching; |
|
1124 |
|
1125 for (final Branch branch : currentBranches) { |
|
1126 for (final String nextReplacement : replacements) { |
|
1127 // if we have multiple replacements, always create a new branch |
|
1128 final Branch nextBranch = branchingRequired ? branch.createBranch() : branch; |
|
1129 |
|
1130 // special rule: occurrences of mn or nm are treated differently |
|
1131 final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm'); |
|
1132 |
|
1133 nextBranch.processNextReplacement(nextReplacement, force); |
|
1134 |
|
1135 if (branching) { |
|
1136 nextBranches.add(nextBranch); |
|
1137 } else { |
|
1138 break; |
|
1139 } |
|
1140 } |
|
1141 } |
|
1142 |
|
1143 if (branching) { |
|
1144 currentBranches.clear(); |
|
1145 currentBranches.addAll(nextBranches); |
|
1146 } |
|
1147 index += rule.getPatternLength() - 1; |
|
1148 break; |
|
1149 } |
|
1150 } |
|
1151 |
|
1152 lastChar = ch; |
|
1153 } |
|
1154 |
|
1155 final String[] result = new String[currentBranches.size()]; |
|
1156 int index = 0; |
|
1157 for (final Branch branch : currentBranches) { |
|
1158 branch.finish(); |
|
1159 result[index++] = branch.toString(); |
|
1160 } |
|
1161 |
|
1162 return result; |
|
1163 } |
771 } |
1164 } |
772 |
1165 END>>" |
773 #' @rdname mra |
|
774 #' @name mra_compare |
|
775 #' @export |
|
776 mra_compare <- function(x, y) { |
|
777 mra <- data.frame(x = x, y = y, sim = 0, min = 100, stringsAsFactors = FALSE) |
|
778 |
|
779 ## Obtain the minimum rating value by calculating the length sum of |
|
780 ## the encoded strings and using table A (from Wikipedia). We start |
|
781 ## by setting the minimum to be the sum and move from there. |
|
782 mra$lensum <- nchar(mra$x) + nchar(mra$y) |
|
783 mra$min[mra$lensum == 12] <- 2 |
|
784 mra$min[mra$lensum > 7 && mra$lensum <= 11] <- 3 |
|
785 mra$min[mra$lensum > 4 && mra$lensum <= 7] <- 4 |
|
786 mra$min[mra$lensum <= 4] <- 5 |
|
787 |
|
788 ## If the length difference between the encoded strings is 3 or |
|
789 ## greater, then no similarity comparison is done. For us, we |
|
790 ## continue the similarity comparison out of laziness and ensure the |
|
791 ## minimum is impossibly high to meet. |
|
792 mra$min[abs(nchar(mra$x) - nchar(mra$y)) >= 3] <- 100 |
|
793 |
|
794 ## Start the comparison. |
|
795 x <- strsplit(mra$x, split = "") |
|
796 y <- strsplit(mra$y, split = "") |
|
797 rows <- nrow(mra) |
|
798 for(i in 1:rows) { |
|
799 ## Process the encoded strings from left to right and remove any |
|
800 ## identical characters found from both strings respectively. |
|
801 j <- 1 |
|
802 while(j < min(length(x[[i]]), length(y[[i]]))) { |
|
803 if(x[[i]][j] == y[[i]][j]) { |
|
804 x[[i]] <- x[[i]][-j] |
|
805 y[[i]] <- y[[i]][-j] |
|
806 } else |
|
807 j <- j + 1 |
|
808 } |
|
809 |
|
810 ## Process the unmatched characters from right to left and |
|
811 ## remove any identical characters found from both names |
|
812 ## respectively. |
|
813 x[[i]] <- rev(x[[i]]) |
|
814 y[[i]] <- rev(y[[i]]) |
|
815 j <- 1 |
|
816 while(j < min(length(x[[i]]), length(y[[i]]))) { |
|
817 if(x[[i]][j] == y[[i]][j]) { |
|
818 x[[i]] <- x[[i]][-j] |
|
819 y[[i]] <- y[[i]][-j] |
|
820 } else |
|
821 j <- j + 1 |
|
822 } |
|
823 ## Subtract the number of unmatched characters from 6 in the |
|
824 ## longer string. This is the similarity rating. |
|
825 len <- min(length(x[[i]]), length(y[[i]])) |
|
826 mra$sim[i] <- 6 - len |
|
827 } |
|
828 |
|
829 ## If the similarity is greater than or equal to the minimum |
|
830 ## required, it is a successful match. |
|
831 mra$match <- (mra$sim >= mra$min) |
|
832 return(mra$match) |
|
833 } |
|
834 |
|
835 END>> |
|
836 ! ! |
|
837 |
|
838 !PhoneticStringUtilities::MRAStringComparator methodsFor:'api'! |
|
839 |
|
840 encode:wordIn |
|
841 "see https://en.wikipedia.org/wiki/Match_Rating_Approach" |
|
842 |
|
843 |word prev| |
|
844 |
|
845 word := wordIn. |
|
846 |
|
847 "/ First, remove any nonalphabetical characters and uppercase it |
|
848 |
|
849 word := word select:#isLetter thenCollect:#asUppercase. |
|
850 |
|
851 "/ Delete vowels not at the start of the word |
|
852 |
|
853 word := word first asString , ((word from:2) reject:#isVowel). |
|
854 |
|
855 "/ Remove duplicate consecutive characters |
|
856 |
|
857 prev := nil. |
|
858 word := word |
|
859 collect:[:char | |
|
860 char == prev ifTrue:[ |
|
861 $* |
|
862 ] ifFalse:[ |
|
863 prev := char. |
|
864 char. |
|
865 ]. |
|
866 ] |
|
867 thenSelect:[:char | char ~~ $*]. |
|
868 |
|
869 "/ If longer than 6 characters, take first and last 3 |
|
870 word size > 6 ifTrue:[ |
|
871 word := (word copyFirst:3),(word copyLast:3) |
|
872 ]. |
|
873 ^ word. |
|
874 |
|
875 " |
|
876 self new encode:'Catherine' -> 'CTHRN' |
|
877 self new encode:'CatherineCatherine' -> 'CTHHRN' |
|
878 self new encode:'Butter' -> 'BTR' |
|
879 self new encode:'Byrne' -> 'BYRN' |
|
880 self new encode:'Boern' -> 'BRN' |
|
881 self new encode:'Smith' -> 'SMTH' |
|
882 self new encode:'Smyth' -> 'SMYTH' |
|
883 self new encode:'Kathryn' -> 'KTHRYN' |
|
884 " |
|
885 |
|
886 "Created: / 28-07-2017 / 15:19:22 / cg" |
|
887 "Modified (comment): / 31-07-2017 / 15:14:31 / cg" |
|
888 ! ! |
|
889 |
|
890 !PhoneticStringUtilities::SoundexStringComparator class methodsFor:'documentation'! |
|
891 |
|
892 documentation |
|
893 " |
|
894 WARNING: this is the so called 'simplified soundex' algorithm; |
|
895 there are more variants like miracode (american soundex) or |
|
896 mysqlSoundex around. |
|
897 |
|
898 Be sure to use the correct algorithm, if the generated strings must be compatible |
|
899 (otherwise, the differences are probably too small to be noticed as effect, but |
|
900 your search will be different) |
|
901 |
|
902 The following was copied from http://www.civilsolutions.com.au/publications/dedup.htm |
|
903 |
|
904 SOUNDEX is a phonetic coding algorithm that ignores many of the unreliable |
|
905 components of names, but by doing so reports more matches. |
|
906 |
|
907 There are some variations around in the literature; |
|
908 the following is called 'simplified soundex', and the rules for coding a name are: |
|
909 |
|
910 1. The first letter of the name is used in its un-coded form to serve as the prefix |
|
911 character of the code. (The rest of the code is numerical). |
|
912 |
|
913 2. Thereafter, W and H are ignored entirely. |
|
914 |
|
915 3. A, E, I, 0, U, Y are not assigned a code number, but do serve as 'separators' (see Step 5). |
|
916 |
|
917 4. Other letters of the name are converted to a numerical equivalent: |
|
918 B, P, F, V 1 |
|
919 C, G, J, K, Q, S, X, Z 2 |
|
920 D, T 3 |
|
921 L 4 |
|
922 M, N 5 |
|
923 R 6 |
|
924 |
|
925 5. There are two exceptions: |
|
926 1. Letters that follow prefix letters which would, if coded, have the same |
|
927 numerical code, are ignored in all cases unless a ''separator'' (see Step 3) precedes them. |
|
928 |
|
929 2. The second letter of any pair of consonants having the same code number is likewise ignored, |
|
930 i.e. unless there is a ''separator'' between them in the name. |
|
931 |
|
932 6. The final SOUNDEX code consists of the prefix letter plus three numerical characters. |
|
933 Longer codes are truncated to this length, and shorter codes are extended to it by adding zeros. |
|
934 |
|
935 Notice, that in another variant, w and h are treated slightly differently. |
|
936 This is only of relevance, if you need to reconstruct original soundex codes of other programs |
|
937 or for the original 1880 us census data. |
|
938 |
|
939 Also notice, that soundex deals better with english. |
|
940 For german and other languages, other algorithms may provide better results. |
|
941 " |
|
942 ! ! |
|
943 |
|
944 !PhoneticStringUtilities::SoundexStringComparator methodsFor:'api'! |
|
945 |
|
946 encode:word |
|
947 |u p t prevCode| |
|
948 |
|
949 u := word asUppercase. |
|
950 p := u first asString. |
|
951 prevCode := self translate:u first. |
|
952 u from:2 to:u size do:[:c | |
|
953 t := self translate:c. |
|
954 (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[ |
|
955 p := p , t. |
|
956 p size == 4 ifTrue:[^ p ]. |
|
957 ]. |
|
958 prevCode := t |
|
959 ]. |
|
960 [ p size < 4 ] whileTrue:[ |
|
961 p := p , '0' |
|
962 ]. |
|
963 ^ (p copyFrom:1 to:4) |
|
964 |
|
965 " |
|
966 self new encode:'washington' -> 'W252' |
|
967 self new encode:'lee' -> 'L000' |
|
968 self new encode:'Gutierrez' -> 'G362' |
|
969 self new encode:'Pfister' -> 'P236' |
|
970 self new encode:'Jackson' -> 'J250' |
|
971 self new encode:'Tymczak' -> 'T522' |
|
972 " |
|
973 |
|
974 "notice: |
|
975 MiracodeStringComparator new encode:'Ashcraft' -> 'A261' |
|
976 self new encode:'Ashcraft' -> 'A226' |
|
977 " |
|
978 |
|
979 "Created: / 28-07-2017 / 15:21:23 / cg" |
|
980 "Modified (comment): / 01-08-2017 / 19:01:43 / cg" |
|
981 ! ! |
|
982 |
|
983 !PhoneticStringUtilities::SoundexStringComparator methodsFor:'private'! |
|
984 |
|
985 translate:aCharacter |
|
986 "use simple if's for more speed when compiled" |
|
987 |
|
988 "vowels serve as separators" |
|
989 aCharacter == $A ifTrue:[^ '0' ]. |
|
990 aCharacter == $E ifTrue:[^ '0' ]. |
|
991 aCharacter == $I ifTrue:[^ '0' ]. |
|
992 aCharacter == $O ifTrue:[^ '0' ]. |
|
993 aCharacter == $U ifTrue:[^ '0' ]. |
|
994 aCharacter == $Y ifTrue:[^ '0' ]. |
|
995 |
|
996 aCharacter == $B ifTrue:[^ '1' ]. |
|
997 aCharacter == $P ifTrue:[^ '1' ]. |
|
998 aCharacter == $F ifTrue:[^ '1' ]. |
|
999 aCharacter == $V ifTrue:[^ '1' ]. |
|
1000 |
|
1001 aCharacter == $C ifTrue:[^ '2' ]. |
|
1002 aCharacter == $S ifTrue:[^ '2' ]. |
|
1003 aCharacter == $K ifTrue:[^ '2' ]. |
|
1004 aCharacter == $G ifTrue:[^ '2' ]. |
|
1005 aCharacter == $J ifTrue:[^ '2' ]. |
|
1006 aCharacter == $Q ifTrue:[^ '2' ]. |
|
1007 aCharacter == $X ifTrue:[^ '2' ]. |
|
1008 aCharacter == $Z ifTrue:[^ '2' ]. |
|
1009 |
|
1010 aCharacter == $D ifTrue:[^ '3' ]. |
|
1011 aCharacter == $T ifTrue:[^ '3' ]. |
|
1012 |
|
1013 aCharacter == $L ifTrue:[^ '4' ]. |
|
1014 |
|
1015 aCharacter == $M ifTrue:[^ '5' ]. |
|
1016 aCharacter == $N ifTrue:[^ '5' ]. |
|
1017 |
|
1018 aCharacter == $R ifTrue:[^ '6' ]. |
|
1019 ^ nil |
|
1020 ! ! |
|
1021 |
|
1022 !PhoneticStringUtilities::MySQLSoundexStringComparator class methodsFor:'documentation'! |
|
1023 |
|
1024 documentation |
|
1025 " |
|
1026 MySQL soundex is like american Soundex (i.e. miracode) without the 4 character limitation, |
|
1027 and also removing vokals first, then removing duplicate codes |
|
1028 (whereas the soundex code does this in reverse order). |
|
1029 |
|
1030 These variations are important, if you need the miracode soundex codes to be generated. |
|
1031 " |
|
1032 ! ! |
|
1033 |
|
1034 !PhoneticStringUtilities::MySQLSoundexStringComparator methodsFor:'api'! |
|
1035 |
|
1036 encode:word |
|
1037 |u p t prevCode| |
|
1038 |
|
1039 u := word asUppercase. |
|
1040 p := u first asString. |
|
1041 prevCode := self translate:u first. |
|
1042 u from:2 to:u size do:[:c | |
|
1043 t := self translate:c. |
|
1044 (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[ |
|
1045 p := p , t. |
|
1046 ]. |
|
1047 (t ~= '0' and:[ c ~= $W and:[c ~= $H]]) ifTrue:[ |
|
1048 prevCode := t. |
|
1049 ]. |
|
1050 ]. |
|
1051 [ p size < 4 ] whileTrue:[ |
|
1052 p := p , '0' |
|
1053 ]. |
|
1054 ^ p |
|
1055 |
|
1056 "Created: / 28-07-2017 / 15:23:41 / cg" |
|
1057 "Modified: / 31-07-2017 / 17:53:51 / cg" |
|
1058 ! ! |
|
1059 |
|
1060 !PhoneticStringUtilities::NYSIISStringComparator class methodsFor:'documentation'! |
|
1061 |
|
1062 documentation |
|
1063 " |
|
1064 NYSIIS Algorithm: |
|
1065 |
|
1066 1. |
|
1067 remove all ''S'' and ''Z'' chars from the end of the surname |
|
1068 |
|
1069 2. |
|
1070 transcode initial strings |
|
1071 MAC => MC |
|
1072 PF => F |
|
1073 |
|
1074 3. |
|
1075 Transcode trailing strings as follows, |
|
1076 |
|
1077 IX => IC |
|
1078 EX => EC |
|
1079 YE,EE,IE => Y |
|
1080 NT,ND => D |
|
1081 |
|
1082 4. |
|
1083 transcode ''EV'' to ''EF'' if not at start of name |
|
1084 |
|
1085 5. |
|
1086 use first character of name as first character of key |
|
1087 |
|
1088 6. |
|
1089 remove any ''W'' that follows a vowel |
|
1090 |
|
1091 7. |
|
1092 replace all vowels with ''A'' |
|
1093 |
|
1094 8. |
|
1095 transcode ''GHT'' to ''GT'' |
|
1096 |
|
1097 9. |
|
1098 transcode ''DG'' to ''G'' |
|
1099 |
|
1100 10. |
|
1101 transcode ''PH'' to ''F'' |
|
1102 |
|
1103 11. |
|
1104 if not first character, eliminate all ''H'' preceded or followed by a vowel |
|
1105 |
|
1106 12. |
|
1107 change ''KN'' to ''N'', else ''K'' to ''C'' |
|
1108 |
|
1109 13. |
|
1110 if not first character, change ''M'' to ''N'' |
|
1111 |
|
1112 14. |
|
1113 if not first character, change ''Q'' to ''G'' |
|
1114 |
|
1115 15. |
|
1116 transcode ''SH'' to ''S'' |
|
1117 |
|
1118 16. |
|
1119 transcode ''SCH'' to ''S'' |
|
1120 |
|
1121 17. |
|
1122 transcode ''YW'' to ''Y'' |
|
1123 |
|
1124 18. |
|
1125 if not first or last character, change ''Y'' to ''A'' |
|
1126 |
|
1127 19. |
|
1128 transcode ''WR'' to ''R'' |
|
1129 |
|
1130 20. |
|
1131 if not first character, change ''Z'' to ''S'' |
|
1132 |
|
1133 21. |
|
1134 transcode terminal ''AY'' to ''Y'' |
|
1135 |
|
1136 22. |
|
1137 remove traling vowels |
|
1138 |
|
1139 23. |
|
1140 collapse all strings of repeated characters |
|
1141 |
|
1142 24. |
|
1143 if first char of original surname was a vowel, append it to the code |
|
1144 " |
|
1145 ! ! |
|
1146 |
|
1147 !PhoneticStringUtilities::NYSIISStringComparator methodsFor:'api'! |
|
1148 |
|
1149 encode:aString |
|
1150 |k| |
|
1151 |
|
1152 k := self rule1:(aString asUppercase). |
|
1153 k := self rule2:k. |
|
1154 k := self rule3:k. |
|
1155 k := self rule4:k. |
|
1156 k := self rule5:k. |
|
1157 k := self rule6:k. |
|
1158 k := self rule7:k. |
|
1159 k := self rule8:k. |
|
1160 k := self rule9:k. |
|
1161 k := self rule10:k. |
|
1162 k := self rule11:k. |
|
1163 k := self rule12:k. |
|
1164 k := self rule13:k. |
|
1165 k := self rule14:k. |
|
1166 k := self rule15:k. |
|
1167 k := self rule16:k. |
|
1168 k := self rule17:k. |
|
1169 k := self rule18:k. |
|
1170 k := self rule19:k. |
|
1171 k := self rule20:k. |
|
1172 k := self rule21:k. |
|
1173 k := self rule22:k. |
|
1174 k := self rule23:k. |
|
1175 k := self rule24:k originalKey:aString. |
|
1176 ^ k |
|
1177 |
|
1178 " |
|
1179 self new encode:'hello' |
|
1180 self new encode:'bliss' |
|
1181 " |
|
1182 " |
|
1183 self new phoneticStringsFor:'hello' |
|
1184 self new phoneticStringsFor:'bliss' |
|
1185 " |
|
1186 |
|
1187 "Created: / 28-07-2017 / 15:34:52 / cg" |
|
1188 ! ! |
|
1189 |
|
1190 !PhoneticStringUtilities::NYSIISStringComparator methodsFor:'private'! |
|
1191 |
|
1192 rule10:key |
|
1193 "10. transcode 'PH' to 'F' " |
|
1194 |
|
1195 ^ self |
|
1196 transcodeAll:'PH' |
|
1197 of:key |
|
1198 to:'F' |
|
1199 startingAt:1 |
|
1200 ! |
|
1201 |
|
1202 rule11:key |
|
1203 |k c| |
|
1204 |
|
1205 "11. if not first character, eliminate all 'H' preceded or followed by a vowel " |
|
1206 k := key copy. |
|
1207 c := SortedCollection sortBlock:[:a :b | b < a ]. |
|
1208 2 to:key size do:[:i | |
|
1209 (key at:i) = $H ifTrue:[ |
|
1210 ((key at:i - 1) isVowel |
|
1211 or:[ (i < key size) and:[ (key at:i + 1) isVowel ] ]) ifTrue:[ c add:i ] |
|
1212 ] |
|
1213 ]. |
|
1214 c do:[:n | |
|
1215 k := (k copyFrom:1 to:n - 1) , (k copyFrom:n + 1 to:k size) |
|
1216 ]. |
|
1217 ^ k |
|
1218 ! |
|
1219 |
|
1220 rule12:key |
|
1221 |k| |
|
1222 |
|
1223 "12. change 'KN' to 'N', else 'K' to 'C' " |
|
1224 k := self |
|
1225 transcodeAll:'KN' |
|
1226 of:key |
|
1227 to:'K' |
|
1228 startingAt:1. |
|
1229 k := self |
|
1230 transcodeAll:'K' |
|
1231 of:k |
|
1232 to:'C' |
|
1233 startingAt:1. |
|
1234 ^ k |
|
1235 ! |
|
1236 |
|
1237 rule13:key |
|
1238 "13. if not first character, change 'M' to 'N' " |
|
1239 |
|
1240 ^ self |
|
1241 transcodeAll:'M' |
|
1242 of:key |
|
1243 to:'N' |
|
1244 startingAt:2 |
|
1245 ! |
|
1246 |
|
1247 rule14:key |
|
1248 "14. if not first character, change 'Q' to 'G' " |
|
1249 |
|
1250 ^ self |
|
1251 transcodeAll:'Q' |
|
1252 of:key |
|
1253 to:'G' |
|
1254 startingAt:2 |
|
1255 ! |
|
1256 |
|
1257 rule15:key |
|
1258 "15. transcode 'SH' to 'S' " |
|
1259 |
|
1260 ^ self |
|
1261 transcodeAll:'SH' |
|
1262 of:key |
|
1263 to:'S' |
|
1264 startingAt:1 |
|
1265 ! |
|
1266 |
|
1267 rule16:key |
|
1268 "16. transcode 'SCH' to 'S' " |
|
1269 |
|
1270 ^ self |
|
1271 transcodeAll:'SCH' |
|
1272 of:key |
|
1273 to:'S' |
|
1274 startingAt:1 |
|
1275 ! |
|
1276 |
|
1277 rule17:key |
|
1278 "17. transcode 'YW' to 'Y' " |
|
1279 |
|
1280 ^ self |
|
1281 transcodeAll:'YW' |
|
1282 of:key |
|
1283 to:'Y' |
|
1284 startingAt:1 |
|
1285 ! |
|
1286 |
|
1287 rule18:key |
|
1288 |k| |
|
1289 |
|
1290 "18. if not first or last character, change 'Y' to 'A' " |
|
1291 k := self |
|
1292 transcodeAll:'Y' |
|
1293 of:key |
|
1294 to:'A' |
|
1295 startingAt:2. |
|
1296 key last = $Y ifTrue:[ |
|
1297 k at:k size put:$Y |
|
1298 ]. |
|
1299 ^ k |
|
1300 ! |
|
1301 |
|
1302 rule19:key |
|
1303 "19. transcode 'WR' to 'R' " |
|
1304 |
|
1305 ^ self |
|
1306 transcodeAll:'WR' |
|
1307 of:key |
|
1308 to:'R' |
|
1309 startingAt:1 |
|
1310 ! |
|
1311 |
|
1312 rule1:key |
|
1313 |k| |
|
1314 |
|
1315 k := key copy. |
|
1316 "1. Remove all 'S' and 'Z' chars from the end of the name" |
|
1317 [ |
|
1318 'SZ' includes:k last |
|
1319 ] whileTrue:[ k := k copyFrom:1 to:(k size - 1) ]. |
|
1320 ^ k |
|
1321 ! |
|
1322 |
|
1323 rule20:key |
|
1324 "20. if not first character, change 'Z' to 'S' " |
|
1325 |
|
1326 ^ self |
|
1327 transcodeAll:'Z' |
|
1328 of:key |
|
1329 to:'S' |
|
1330 startingAt:2 |
|
1331 ! |
|
1332 |
|
1333 rule21:key |
|
1334 "21. transcode terminal 'AY' to 'Y' " |
|
1335 |
|
1336 ^ self |
|
1337 transcodeAll:'AY' |
|
1338 of:key |
|
1339 to:'Y' |
|
1340 startingAt:key size - 1 |
|
1341 ! |
|
1342 |
|
1343 rule22:key |
|
1344 |k| |
|
1345 |
|
1346 "22. remove trailing vowels " |
|
1347 k := key copy. |
|
1348 [ k last isVowel ] whileTrue:[ |
|
1349 k := k copyFrom:1 to:k size - 1 |
|
1350 ]. |
|
1351 ^ k |
|
1352 ! |
|
1353 |
|
1354 rule23:key |
|
1355 |k c| |
|
1356 |
|
1357 "23. collapse all strings of repeated characters " |
|
1358 k := key copy. |
|
1359 c := SortedCollection sortBlock:[:a :b | b < a ]. |
|
1360 k size to:2 do:[:i | |
|
1361 (k at:i) = (k at:i - 1) ifTrue:[ |
|
1362 c add:i |
|
1363 ] |
|
1364 ]. |
|
1365 c do:[:n | |
|
1366 k := (k copyFrom:1 to:n - 1) , (k copyFrom:n + 1 to:k size) |
|
1367 ]. |
|
1368 ^ k |
|
1369 ! |
|
1370 |
|
1371 rule24:key originalKey:originalKey |
|
1372 |k| |
|
1373 |
|
1374 "24. if first char of original surname was a vowel, append it to the code" |
|
1375 k := key copy. |
|
1376 originalKey first isVowel ifTrue:[ |
|
1377 k := k , originalKey first asString asUppercase |
|
1378 ]. |
|
1379 ^ k |
|
1380 ! |
|
1381 |
|
1382 rule2:key |
|
1383 |k| |
|
1384 |
|
1385 k := key copy. |
|
1386 "2. Transcode initial strings: MAC => MC PF => F" |
|
1387 (k startsWith:'MAC') ifTrue:[ |
|
1388 k := 'MC' , (k copyFrom:4) |
|
1389 ]. |
|
1390 (k startsWith:'PF') ifTrue:[ |
|
1391 k := 'F' , (k copyFrom:3) |
|
1392 ]. |
|
1393 ^ k |
|
1394 ! |
|
1395 |
|
1396 rule3:key |
|
1397 |k| |
|
1398 |
|
1399 "3. Transcode trailing strings as follows: |
|
1400 IX => IC |
|
1401 EX => EC |
|
1402 YE, EE, IE => Y |
|
1403 NT, ND => D" |
|
1404 k := key copy. |
|
1405 k := self |
|
1406 transcodeTrailing:#( 'IX' ) |
|
1407 of:k |
|
1408 to:'IC'. |
|
1409 k := self |
|
1410 transcodeTrailing:#( 'EX' ) |
|
1411 of:k |
|
1412 to:'EC'. |
|
1413 k := self |
|
1414 transcodeTrailing:#( 'YE' 'EE' 'IE' ) |
|
1415 of:k |
|
1416 to:'Y'. |
|
1417 k := self |
|
1418 transcodeTrailing:#( 'NT' 'ND' ) |
|
1419 of:k |
|
1420 to:'D'. |
|
1421 ^ k |
|
1422 ! |
|
1423 |
|
1424 rule4:key |
|
1425 "4. Transcode 'EV' to 'EF' if not at start of name" |
|
1426 |
|
1427 ^ self |
|
1428 transcodeAll:'EV' |
|
1429 of:key |
|
1430 to:'EF' |
|
1431 startingAt:2 |
|
1432 ! |
|
1433 |
|
1434 rule5:key |
|
1435 "5. Use first character of name as first character of key. Ignored because we're doing an in-place conversion" |
|
1436 |
|
1437 ^ key |
|
1438 ! |
|
1439 |
|
1440 rule6:key |
|
1441 |k i| |
|
1442 |
|
1443 "6. Remove any 'W' that follows a vowel" |
|
1444 k := key copy. |
|
1445 i := 2. |
|
1446 [ |
|
1447 (i := k indexOf:$W startingAt:i) > 0 |
|
1448 ] whileTrue:[ |
|
1449 (k at:i - 1) isVowel ifTrue:[ |
|
1450 k := (k copyFrom:1 to:i - 1) , (k copyFrom:i + 1 to:k size). |
|
1451 i := i - 1 |
|
1452 ] |
|
1453 ]. |
|
1454 ^ k |
|
1455 ! |
|
1456 |
|
1457 rule7:key |
|
1458 |k| |
|
1459 |
|
1460 "7. replace all vowels with 'A' " |
|
1461 k := key copy. |
|
1462 1 to:key size do:[:i | |
|
1463 (key at:i) isVowel ifTrue:[ |
|
1464 k at:i put:$A |
|
1465 ] |
|
1466 ]. |
|
1467 ^ k |
|
1468 ! |
|
1469 |
|
1470 rule8:key |
|
1471 "8. transcode 'GHT' to 'GT' " |
|
1472 |
|
1473 ^ self |
|
1474 transcodeAll:'GHT' |
|
1475 of:key |
|
1476 to:'GT' |
|
1477 startingAt:1 |
|
1478 ! |
|
1479 |
|
1480 rule9:key |
|
1481 "9. transcode 'DG' to 'G' " |
|
1482 |
|
1483 ^ self |
|
1484 transcodeAll:'DG' |
|
1485 of:key |
|
1486 to:'G' |
|
1487 startingAt:1 |
|
1488 ! |
|
1489 |
|
1490 transcodeAll:aString of:key to:replacementString startingAt:start |
|
1491 |k i| |
|
1492 |
|
1493 k := key copy. |
|
1494 [ |
|
1495 (i := k indexOfSubCollection:aString startingAt:start) > 0 |
|
1496 ] whileTrue:[ |
|
1497 k := (k copyFrom:1 to:i - 1) , replacementString |
|
1498 , (k copyFrom:i + aString size to:k size) |
|
1499 ]. |
|
1500 ^ k |
|
1501 ! |
|
1502 |
|
1503 transcodeTrailing:anArrayOfStrings of:key to:replacementString |
|
1504 |answer| |
|
1505 |
|
1506 answer := key copy. |
|
1507 anArrayOfStrings do:[:aString | |
|
1508 answer := self |
|
1509 transcodeAll:aString |
|
1510 of:answer |
|
1511 to:replacementString |
|
1512 startingAt:(answer size - aString size) + 1 |
|
1513 ]. |
|
1514 ^ answer |
|
1515 ! ! |
|
1516 |
|
1517 !PhoneticStringUtilities::PhonemStringComparator class methodsFor:'documentation'! |
|
1518 |
|
1519 documentation |
|
1520 " |
|
1521 Implementation of the PHONEM algorithm, as described in |
|
1522 'Georg Wilde and Carsten Meyer, Doppelgaenger gesucht - |
|
1523 Ein Programm fuer kontextsensitive phonetische Textumwandlung |
|
1524 ct Magazin fuer Computer & Technik 25/1998' |
|
1525 |
|
1526 This algorithm deals better with the german language (it cares for umlauts) |
|
1527 " |
|
1528 ! ! |
|
1529 |
|
1530 !PhoneticStringUtilities::PhonemStringComparator methodsFor:'api'! |
|
1531 |
|
1532 encode:aString |
|
1533 |s idx t t2| |
|
1534 |
|
1535 s := aString asUppercase. |
|
1536 |
|
1537 idx := 1. |
|
1538 [idx < (s size-1)] whileTrue:[ |
|
1539 t2 := nil. |
|
1540 t := s copyFrom:idx to:idx+1. |
|
1541 t = 'SC' ifTrue:[ t2 := 'C' ] |
|
1542 ifFalse:[ t = 'SZ' ifTrue:[ t2 := 'C' ] |
|
1543 ifFalse:[ t = 'CZ' ifTrue:[ t2 := 'C' ] |
|
1544 ifFalse:[ t = 'TZ' ifTrue:[ t2 := 'C' ] |
|
1545 ifFalse:[ t = 'TS' ifTrue:[ t2 := 'C' ] |
|
1546 ifFalse:[ t = 'KS' ifTrue:[ t2 := 'X' ] |
|
1547 ifFalse:[ t = 'PF' ifTrue:[ t2 := 'V' ] |
|
1548 ifFalse:[ t = 'QU' ifTrue:[ t2 := 'KW' ] |
|
1549 ifFalse:[ t = 'PH' ifTrue:[ t2 := 'V' ] |
|
1550 ifFalse:[ t = 'UE' ifTrue:[ t2 := 'Y' ] |
|
1551 ifFalse:[ t = 'AE' ifTrue:[ t2 := 'E' ] |
|
1552 ifFalse:[ t = 'OE' ifTrue:[ t2 := 'Ö' ] |
|
1553 ifFalse:[ t = 'EI' ifTrue:[ t2 := 'AY' ] |
|
1554 ifFalse:[ t = 'EY' ifTrue:[ t2 := 'AY' ] |
|
1555 ifFalse:[ t = 'EU' ifTrue:[ t2 := 'OY' ] |
|
1556 ifFalse:[ t = 'AU' ifTrue:[ t2 := 'A§' ] |
|
1557 ifFalse:[ t = 'OU' ifTrue:[ t2 := '§ ' ]]]]]]]]]]]]]]]]]. |
|
1558 t2 notNil ifTrue:[ |
|
1559 s := (s copyTo:idx-1),t2,(s copyFrom:idx+2) |
|
1560 ] ifFalse:[ |
|
1561 idx := idx + 1. |
|
1562 ]. |
|
1563 ]. |
|
1564 |
|
1565 "/ single character substitutions via tr |
|
1566 s := s copyTransliterating:'ÖÄZKGQÜIJFWPT§' to:'YECCCCYYYVVDDUA'. |
|
1567 s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'' complement:true squashDuplicates:false. |
|
1568 s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'ABCDLMNORSUVWXY' complement:false squashDuplicates:true. |
|
1569 ^ s |
|
1570 |
|
1571 " |
|
1572 self basicNew encode:'müller' -> 'MYLR' |
|
1573 self basicNew encode:'mueller' -> 'MYLR' |
|
1574 self basicNew encode:'möller' -> 'MYLR' |
|
1575 self basicNew encode:'miller' -> 'MYLR' |
|
1576 self basicNew encode:'muller' -> 'MULR' |
|
1577 self basicNew encode:'muler' -> 'MULR' |
|
1578 |
|
1579 self basicNew phoneticStringsFor:'müller' #('MYLR') |
|
1580 self basicNew phoneticStringsFor:'mueller' #('MYLR') |
|
1581 self basicNew phoneticStringsFor:'möller' #('MYLR') |
|
1582 self basicNew phoneticStringsFor:'miller' #('MYLR') |
|
1583 self basicNew phoneticStringsFor:'muller' #('MULR') |
|
1584 self basicNew phoneticStringsFor:'muler' #('MULR') |
|
1585 |
|
1586 self basicNew phoneticStringsFor:'schmidt' #('CMYD') |
|
1587 self basicNew phoneticStringsFor:'schneider' #('CNAYDR') |
|
1588 self basicNew phoneticStringsFor:'fischer' #('VYCR') |
|
1589 self basicNew phoneticStringsFor:'weber' #('VBR') |
|
1590 self basicNew phoneticStringsFor:'weeber' #('VBR') |
|
1591 self basicNew phoneticStringsFor:'webber' #('VBR') |
|
1592 self basicNew phoneticStringsFor:'wepper' #('VBR') |
|
1593 |
|
1594 self basicNew phoneticStringsFor:'meyer' #('MAYR') |
|
1595 self basicNew phoneticStringsFor:'maier' #('MAYR') |
|
1596 self basicNew phoneticStringsFor:'mayer' #('MAYR') |
|
1597 self basicNew phoneticStringsFor:'mayr' #('MAYR') |
|
1598 self basicNew phoneticStringsFor:'meir' #('MAYR') |
|
1599 |
|
1600 self basicNew phoneticStringsFor:'wagner' #('VACNR') |
|
1601 self basicNew phoneticStringsFor:'schulz' #('CULC') |
|
1602 self basicNew phoneticStringsFor:'becker' #('BCR') |
|
1603 self basicNew phoneticStringsFor:'hoffmann' #('OVMAN') |
|
1604 self basicNew phoneticStringsFor:'haus' #('AUS') |
|
1605 |
|
1606 self basicNew phoneticStringsFor:'schäfer' #('CVR') |
|
1607 self basicNew phoneticStringsFor:'scheffer' #('CVR') |
|
1608 self basicNew phoneticStringsFor:'schaeffer' #('CVR') |
|
1609 self basicNew phoneticStringsFor:'schaefer' #('CVR') |
|
1610 " |
|
1611 |
|
1612 "Created: / 28-07-2017 / 15:38:08 / cg" |
|
1613 ! ! |
1166 ! ! |
1614 |
1167 |
1615 !PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'LICENSE'! |
1168 !PhoneticStringUtilities::DoubleMetaphoneStringComparator class methodsFor:'LICENSE'! |
1616 |
1169 |
1617 copyright |
1170 copyright |
3138 ] |
2691 ] |
3139 |
2692 |
3140 "Modified: / 28-07-2017 / 11:35:12 / cg" |
2693 "Modified: / 28-07-2017 / 11:35:12 / cg" |
3141 ! ! |
2694 ! ! |
3142 |
2695 |
|
2696 !PhoneticStringUtilities::ExtendedSoundexStringComparator class methodsFor:'documentation'! |
|
2697 |
|
2698 documentation |
|
2699 " |
|
2700 There are many extended and enhanced soundex variants around; |
|
2701 here is one, called 'extended soundex'. It is destribed for example in |
|
2702 http://www.epidata.dk/documentation.php. |
|
2703 An author or origin is unknown. |
|
2704 |
|
2705 The number of digits is increased to 5 or 8; |
|
2706 The first character is not used literally; instead it is encoded like the rest. |
|
2707 This might have a negative effect on names starting with a vovel, though. |
|
2708 |
|
2709 Overall, it can be doubted if this is really an enhancement after all. |
|
2710 " |
|
2711 ! ! |
|
2712 |
|
2713 !PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'api'! |
|
2714 |
|
2715 phoneticStringsFor:aString |
|
2716 "generates both an extended soundex of length 5 and one of length 8" |
|
2717 |
|
2718 |first second u t prevCode| |
|
2719 |
|
2720 u := aString asUppercase. |
|
2721 first := second := ''. |
|
2722 u do:[:c | |
|
2723 t := self translate:c. |
|
2724 (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[ |
|
2725 first := first , t. |
|
2726 second := second , t. |
|
2727 second size == 8 ifTrue:[ |
|
2728 ^ Array with:(first copyTo:5) with:second |
|
2729 ]. |
|
2730 ]. |
|
2731 prevCode := t |
|
2732 ]. |
|
2733 [ first size < 5 ] whileTrue:[ |
|
2734 first := first , '0'. |
|
2735 second := second , '0'. |
|
2736 ]. |
|
2737 [ second size < 8 ] whileTrue:[ |
|
2738 second := second , '0' |
|
2739 ]. |
|
2740 ^ Array with:first with:second |
|
2741 |
|
2742 " |
|
2743 self basicNew phoneticStringsFor:'müller' #('87900' '87900000') |
|
2744 self basicNew phoneticStringsFor:'miller' #('87900' '87900000') |
|
2745 self basicNew phoneticStringsFor:'muller' #('87900' '87900000') |
|
2746 self basicNew phoneticStringsFor:'muler' #('87900' '87900000') |
|
2747 self basicNew phoneticStringsFor:'schmidt' #('38600' '38600000') |
|
2748 self basicNew phoneticStringsFor:'schneider' #('38690' '38690000') |
|
2749 self basicNew phoneticStringsFor:'fischer' #('23900' '23900000') |
|
2750 self basicNew phoneticStringsFor:'weber' #('19000' '19000000') |
|
2751 self basicNew phoneticStringsFor:'meyer' #('89000' '89000000') |
|
2752 self basicNew phoneticStringsFor:'wagner' #('48900' '48900000') |
|
2753 self basicNew phoneticStringsFor:'schulz' #('37500' '37500000') |
|
2754 self basicNew phoneticStringsFor:'becker' #('13900' '13900000') |
|
2755 self basicNew phoneticStringsFor:'hoffmann' #('28800' '28800000') |
|
2756 self basicNew phoneticStringsFor:'schäfer' #('32900' '32900000') |
|
2757 " |
|
2758 ! ! |
|
2759 |
|
2760 !PhoneticStringUtilities::ExtendedSoundexStringComparator methodsFor:'private'! |
|
2761 |
|
2762 translate:aCharacter |
|
2763 "use simple if's for more speed when compiled" |
|
2764 |
|
2765 "vowels serve as separators" |
|
2766 aCharacter == $A ifTrue:[^ '0' ]. |
|
2767 aCharacter == $E ifTrue:[^ '0' ]. |
|
2768 aCharacter == $I ifTrue:[^ '0' ]. |
|
2769 aCharacter == $O ifTrue:[^ '0' ]. |
|
2770 aCharacter == $U ifTrue:[^ '0' ]. |
|
2771 aCharacter == $Y ifTrue:[^ '0' ]. |
|
2772 |
|
2773 aCharacter == $B ifTrue:[^ '1' ]. |
|
2774 aCharacter == $P ifTrue:[^ '1' ]. |
|
2775 |
|
2776 aCharacter == $F ifTrue:[^ '2' ]. |
|
2777 aCharacter == $V ifTrue:[^ '2' ]. |
|
2778 |
|
2779 aCharacter == $C ifTrue:[^ '3' ]. |
|
2780 aCharacter == $S ifTrue:[^ '3' ]. |
|
2781 aCharacter == $K ifTrue:[^ '3' ]. |
|
2782 |
|
2783 aCharacter == $G ifTrue:[^ '4' ]. |
|
2784 aCharacter == $J ifTrue:[^ '4' ]. |
|
2785 |
|
2786 aCharacter == $Q ifTrue:[^ '5' ]. |
|
2787 aCharacter == $X ifTrue:[^ '5' ]. |
|
2788 aCharacter == $Z ifTrue:[^ '5' ]. |
|
2789 |
|
2790 aCharacter == $D ifTrue:[^ '6' ]. |
|
2791 aCharacter == $G ifTrue:[^ '6' ]. |
|
2792 aCharacter == $T ifTrue:[^ '6' ]. |
|
2793 |
|
2794 aCharacter == $L ifTrue:[^ '7' ]. |
|
2795 |
|
2796 aCharacter == $M ifTrue:[^ '8' ]. |
|
2797 aCharacter == $N ifTrue:[^ '8' ]. |
|
2798 |
|
2799 aCharacter == $R ifTrue:[^ '9' ]. |
|
2800 ^ nil |
|
2801 ! ! |
|
2802 |
|
2803 !PhoneticStringUtilities::SingleResultPhoneticStringComparator class methodsFor:'documentation'! |
|
2804 |
|
2805 documentation |
|
2806 " |
|
2807 documentation to be added. |
|
2808 |
|
2809 [author:] |
|
2810 cg |
|
2811 |
|
2812 [instance variables:] |
|
2813 |
|
2814 [class variables:] |
|
2815 |
|
2816 [see also:] |
|
2817 |
|
2818 " |
|
2819 ! ! |
|
2820 |
|
2821 !PhoneticStringUtilities::SingleResultPhoneticStringComparator methodsFor:'api'! |
|
2822 |
|
2823 encode:word |
|
2824 ^ self subclassResponsibility |
|
2825 |
|
2826 "Created: / 28-07-2017 / 15:20:49 / cg" |
|
2827 ! |
|
2828 |
|
2829 phoneticStringsFor:word |
|
2830 ^ Array with:(self encode:word) |
|
2831 |
|
2832 "Created: / 28-07-2017 / 15:20:38 / cg" |
|
2833 ! ! |
|
2834 |
|
2835 !PhoneticStringUtilities::MRAStringComparator class methodsFor:'documentation'! |
|
2836 |
|
2837 documentation |
|
2838 " |
|
2839 Match Rating Approach Encoder |
|
2840 |
|
2841 The Western Airlines matching rating approach name encoder |
|
2842 |
|
2843 [see also:] |
|
2844 https://en.wikipedia.org/wiki/Match_Rating_Approach |
|
2845 |
|
2846 G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery, |
|
2847 ''Accessing Individual Records from Personal Data Files Using Nonunique Identifiers'' |
|
2848 US National Institute of Standards and Technology, SP-500-2 (1977), p. 17. |
|
2849 " |
|
2850 ! |
|
2851 |
|
2852 rCode |
|
2853 "<<END |
|
2854 ## Copyright (c) 2015, James P. Howard, II <jh@jameshoward.us> |
|
2855 ## |
|
2856 ## Redistribution and use in source and binary forms, with or without |
|
2857 ## modification, are permitted provided that the following conditions are |
|
2858 ## met: |
|
2859 ## |
|
2860 ## Redistributions of source code must retain the above copyright |
|
2861 ## notice, this list of conditions and the following disclaimer. |
|
2862 ## |
|
2863 ## Redistributions in binary form must reproduce the above copyright |
|
2864 ## notice, this list of conditions and the following disclaimer in |
|
2865 ## the documentation and/or other materials provided with the |
|
2866 ## distribution. |
|
2867 ## |
|
2868 ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
|
2869 ## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
|
2870 ## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
|
2871 ## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
|
2872 ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|
2873 ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
|
2874 ## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
|
2875 ## DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
|
2876 ## THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
2877 ## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
2878 ## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
2879 |
|
2880 #' @rdname mra |
|
2881 #' @title Match Rating Approach Encoder |
|
2882 #' |
|
2883 #' @description |
|
2884 #' The Western Airlines matching rating approach name encoder |
|
2885 #' |
|
2886 #' @param word string or vector of strings to encode |
|
2887 #' @param x MRA-encoded character vector |
|
2888 #' @param y MRA-encoded character vector |
|
2889 #' |
|
2890 #' @details |
|
2891 #' |
|
2892 #' The variable \code{word} is the name to be encoded. The variable |
|
2893 #' \code{maxCodeLen} is \emph{not} supported in this algorithm encoder |
|
2894 #' because the algorithm itself is dependent upon its six-character |
|
2895 #' length. The variables \code{x} and \code{y} are MRA-encoded and are |
|
2896 #' compared to each other using the MRA comparison specification. |
|
2897 #' |
|
2898 #' @return The \code{mra_encode} function returns match rating approach |
|
2899 #' encoded character vector. The \code{mra_compare} returns a boolean |
|
2900 #' vector which is \code{TRUE} if \code{x} and \code{y} pass the MRA |
|
2901 #' comparison test. |
|
2902 #' |
|
2903 #' @references |
|
2904 #' |
|
2905 #' G.B. Moore, J.L. Kuhns, J.L. Treffzs, and C.A. Montgomery, |
|
2906 #' \emph{Accessing Individual Records from Personal Data Files Using |
|
2907 #' Nonunique Identifiers,} US National Institute of Standards and |
|
2908 #' Technology, SP-500-2 (1977), p. 17. |
|
2909 #' |
|
2910 #' @family phonics |
|
2911 #' |
|
2912 #' @examples |
|
2913 #' mra_encode("William") |
|
2914 #' mra_encode(c("Peter", "Peady")) |
|
2915 #' mra_encode("Stevenson") |
|
2916 |
|
2917 #' @rdname mra |
|
2918 #' @name mra_encode |
|
2919 #' @export |
|
2920 mra_encode <- function(word) { |
|
2921 |
|
2922 ## First, remove any nonalphabetical characters and uppercase it |
|
2923 word <- gsub("[^[:alpha:]]*", "", word) |
|
2924 word <- toupper(word) |
|
2925 |
|
2926 ## First character of key = first character of name |
|
2927 first <- substr(word, 1, 1) |
|
2928 word <- substr(word, 2, nchar(word)) |
|
2929 |
|
2930 ## Delete vowels not at the start of the word |
|
2931 word <- gsub("[AEIOU]", "", word) |
|
2932 word <- paste(first, word, sep = "") |
|
2933 |
|
2934 ## Remove duplicate consecutive characters |
|
2935 word <- gsub("([A-Z])\\1+", "\\1", word) |
|
2936 |
|
2937 ## If longer than 6 characters, take first and last 3...and we have |
|
2938 ## to vectorize it |
|
2939 for(i in 1:length(word)) { |
|
2940 if((l = nchar(word[i])) > 6) { |
|
2941 first <- substr(word[i], 1, 3) |
|
2942 last <- substr(word[i], l - 2, l) |
|
2943 word[i] <- paste(first, last, sep = ""); |
|
2944 } |
|
2945 } |
|
2946 |
|
2947 return(word) |
|
2948 } |
|
2949 |
|
2950 #' @rdname mra |
|
2951 #' @name mra_compare |
|
2952 #' @export |
|
2953 mra_compare <- function(x, y) { |
|
2954 mra <- data.frame(x = x, y = y, sim = 0, min = 100, stringsAsFactors = FALSE) |
|
2955 |
|
2956 ## Obtain the minimum rating value by calculating the length sum of |
|
2957 ## the encoded strings and using table A (from Wikipedia). We start |
|
2958 ## by setting the minimum to be the sum and move from there. |
|
2959 mra$lensum <- nchar(mra$x) + nchar(mra$y) |
|
2960 mra$min[mra$lensum == 12] <- 2 |
|
2961 mra$min[mra$lensum > 7 && mra$lensum <= 11] <- 3 |
|
2962 mra$min[mra$lensum > 4 && mra$lensum <= 7] <- 4 |
|
2963 mra$min[mra$lensum <= 4] <- 5 |
|
2964 |
|
2965 ## If the length difference between the encoded strings is 3 or |
|
2966 ## greater, then no similarity comparison is done. For us, we |
|
2967 ## continue the similarity comparison out of laziness and ensure the |
|
2968 ## minimum is impossibly high to meet. |
|
2969 mra$min[abs(nchar(mra$x) - nchar(mra$y)) >= 3] <- 100 |
|
2970 |
|
2971 ## Start the comparison. |
|
2972 x <- strsplit(mra$x, split = "") |
|
2973 y <- strsplit(mra$y, split = "") |
|
2974 rows <- nrow(mra) |
|
2975 for(i in 1:rows) { |
|
2976 ## Process the encoded strings from left to right and remove any |
|
2977 ## identical characters found from both strings respectively. |
|
2978 j <- 1 |
|
2979 while(j < min(length(x[[i]]), length(y[[i]]))) { |
|
2980 if(x[[i]][j] == y[[i]][j]) { |
|
2981 x[[i]] <- x[[i]][-j] |
|
2982 y[[i]] <- y[[i]][-j] |
|
2983 } else |
|
2984 j <- j + 1 |
|
2985 } |
|
2986 |
|
2987 ## Process the unmatched characters from right to left and |
|
2988 ## remove any identical characters found from both names |
|
2989 ## respectively. |
|
2990 x[[i]] <- rev(x[[i]]) |
|
2991 y[[i]] <- rev(y[[i]]) |
|
2992 j <- 1 |
|
2993 while(j < min(length(x[[i]]), length(y[[i]]))) { |
|
2994 if(x[[i]][j] == y[[i]][j]) { |
|
2995 x[[i]] <- x[[i]][-j] |
|
2996 y[[i]] <- y[[i]][-j] |
|
2997 } else |
|
2998 j <- j + 1 |
|
2999 } |
|
3000 ## Subtract the number of unmatched characters from 6 in the |
|
3001 ## longer string. This is the similarity rating. |
|
3002 len <- min(length(x[[i]]), length(y[[i]])) |
|
3003 mra$sim[i] <- 6 - len |
|
3004 } |
|
3005 |
|
3006 ## If the similarity is greater than or equal to the minimum |
|
3007 ## required, it is a successful match. |
|
3008 mra$match <- (mra$sim >= mra$min) |
|
3009 return(mra$match) |
|
3010 } |
|
3011 |
|
3012 END>> |
|
3013 ! ! |
|
3014 |
|
3015 !PhoneticStringUtilities::MRAStringComparator methodsFor:'api'! |
|
3016 |
|
3017 encode:wordIn |
|
3018 "see https://en.wikipedia.org/wiki/Match_Rating_Approach" |
|
3019 |
|
3020 |word prev| |
|
3021 |
|
3022 word := wordIn. |
|
3023 |
|
3024 "/ First, remove any nonalphabetical characters and uppercase it |
|
3025 |
|
3026 word := word select:#isLetter thenCollect:#asUppercase. |
|
3027 |
|
3028 "/ Delete vowels not at the start of the word |
|
3029 |
|
3030 word := word first asString , ((word from:2) reject:#isVowel). |
|
3031 |
|
3032 "/ Remove duplicate consecutive characters |
|
3033 |
|
3034 prev := nil. |
|
3035 word := word |
|
3036 collect:[:char | |
|
3037 char == prev ifTrue:[ |
|
3038 $* |
|
3039 ] ifFalse:[ |
|
3040 prev := char. |
|
3041 char. |
|
3042 ]. |
|
3043 ] |
|
3044 thenSelect:[:char | char ~~ $*]. |
|
3045 |
|
3046 "/ If longer than 6 characters, take first and last 3 |
|
3047 word size > 6 ifTrue:[ |
|
3048 word := (word copyFirst:3),(word copyLast:3) |
|
3049 ]. |
|
3050 ^ word. |
|
3051 |
|
3052 " |
|
3053 self new encode:'Catherine' -> 'CTHRN' |
|
3054 self new encode:'CatherineCatherine' -> 'CTHHRN' |
|
3055 self new encode:'Butter' -> 'BTR' |
|
3056 self new encode:'Byrne' -> 'BYRN' |
|
3057 self new encode:'Boern' -> 'BRN' |
|
3058 self new encode:'Smith' -> 'SMTH' |
|
3059 self new encode:'Smyth' -> 'SMYTH' |
|
3060 self new encode:'Kathryn' -> 'KTHRYN' |
|
3061 " |
|
3062 |
|
3063 "Created: / 28-07-2017 / 15:19:22 / cg" |
|
3064 "Modified (comment): / 31-07-2017 / 15:14:31 / cg" |
|
3065 ! ! |
|
3066 |
|
3067 !PhoneticStringUtilities::MetaphoneStringComparator class methodsFor:'documentation'! |
|
3068 |
|
3069 documentation |
|
3070 " |
|
3071 Encodes a string into a Metaphone value. |
|
3072 |
|
3073 Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. |
|
3074 Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. |
|
3075 |
|
3076 Hanging on the Metaphone by Lawrence Philips in Computer Language of Dec. 1990, p 39. |
|
3077 Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations: |
|
3078 https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm6 |
|
3079 |
|
3080 They have had undocumented changes from the originally published algorithm. |
|
3081 For more information, see https://issues.apache.org/jira/browse/CODEC-57 |
|
3082 |
|
3083 Metaphone uses the following rules: |
|
3084 |
|
3085 Doubled letters except 'c' -> drop 2nd letter. |
|
3086 Vowels are only kept when they are the first letter. |
|
3087 B -> B unless at the end of a word after 'm' as in 'dumb' |
|
3088 C -> X (sh) if -cia- or -ch- |
|
3089 S if -ci-, -ce- or -cy- |
|
3090 K otherwise, including -sch- |
|
3091 D -> J if in -dge-, -dgy- or -dgi-; T otherwise |
|
3092 F -> F |
|
3093 G -> silent if in -gh- and not at end or before a vowel in -gn- or -gned- (also see dge etc. above) |
|
3094 J if before i or e or y if not double gg; K otherwise |
|
3095 H -> silent if after vowel and no vowel follows; H otherwise |
|
3096 J -> J |
|
3097 K -> silent if after 'c'; K otherwise |
|
3098 L -> L |
|
3099 M -> M |
|
3100 N -> N |
|
3101 P -> F if before 'h'; P otherwise |
|
3102 Q -> K |
|
3103 R -> R |
|
3104 S -> X (sh) if before 'h' or in -sio- or -sia-; S otherwise |
|
3105 T -> X (sh) if -tia- or -tio- 0 (th) if before 'h' silent if in -tch-; T otherwise |
|
3106 V -> F |
|
3107 W -> silent if not followed by a vowel W if followed by a vowel |
|
3108 X -> KS |
|
3109 Y -> silent if not followed by a vowel Y if followed by a vowel |
|
3110 Z -> S |
|
3111 |
|
3112 Initial Letter Exceptions |
|
3113 |
|
3114 Initial kn-, gn- pn, ae- or wr- -> drop first letter |
|
3115 Initial x- -> change to 's' |
|
3116 Initial wh- -> change to 'w' |
|
3117 |
|
3118 |
|
3119 self new encode:'a' |
|
3120 self new encode:'dumb' |
|
3121 self new encode:'MILLER' |
|
3122 self new encode:'schmidt' |
|
3123 self new encode:'schneider' |
|
3124 self new encode:'FISCHER' |
|
3125 self new encode:'HEDGY' |
|
3126 self new encode:'weber' |
|
3127 self new encode:'wagner' |
|
3128 self new encode:'van gogh' |
|
3129 " |
|
3130 ! |
|
3131 |
|
3132 javaCode |
|
3133 "<<END |
|
3134 /* |
|
3135 * Licensed to the Apache Software Foundation (ASF) under one or more |
|
3136 * contributor license agreements. See the NOTICE file distributed with |
|
3137 * this work for additional information regarding copyright ownership. |
|
3138 * The ASF licenses this file to You under the Apache License, Version 2.0 |
|
3139 * (the "License"); you may not use this file except in compliance with |
|
3140 * the License. You may obtain a copy of the License at |
|
3141 * |
|
3142 * http://www.apache.org/licenses/LICENSE-2.0 |
|
3143 * |
|
3144 * Unless required by applicable law or agreed to in writing, software |
|
3145 * distributed under the License is distributed on an "AS IS" BASIS, |
|
3146 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
3147 * See the License for the specific language governing permissions and |
|
3148 * limitations under the License. |
|
3149 */ |
|
3150 |
|
3151 package org.apache.commons.codec.language; |
|
3152 |
|
3153 import org.apache.commons.codec.EncoderException; |
|
3154 import org.apache.commons.codec.StringEncoder; |
|
3155 |
|
3156 /** |
|
3157 * Encodes a string into a Metaphone value. |
|
3158 * <p> |
|
3159 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. |
|
3160 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. |
|
3161 * <p> |
|
3162 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, |
|
3163 * p 39.</CITE> |
|
3164 * <p> |
|
3165 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations: |
|
3166 * </p> |
|
3167 * <ul> |
|
3168 * <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a> |
|
3169 * (broken link 4/30/2013) </li> |
|
3170 * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a> |
|
3171 * (link checked 4/30/2013) </li> |
|
3172 * </ul> |
|
3173 * <p> |
|
3174 * They have had undocumented changes from the originally published algorithm. |
|
3175 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>. |
|
3176 * <p> |
|
3177 * This class is conditionally thread-safe. |
|
3178 * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)} |
|
3179 * but is not volatile, and accesses are not synchronized. |
|
3180 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization |
|
3181 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} |
|
3182 * after initial setup. |
|
3183 * |
|
3184 * @version $Id$ |
|
3185 */ |
|
3186 public class Metaphone implements StringEncoder { |
|
3187 |
|
3188 /** |
|
3189 * Five values in the English language |
|
3190 */ |
|
3191 private static final String VOWELS = "AEIOU"; |
|
3192 |
|
3193 /** |
|
3194 * Variable used in Metaphone algorithm |
|
3195 */ |
|
3196 private static final String FRONTV = "EIY"; |
|
3197 |
|
3198 /** |
|
3199 * Variable used in Metaphone algorithm |
|
3200 */ |
|
3201 private static final String VARSON = "CSPTG"; |
|
3202 |
|
3203 /** |
|
3204 * The max code length for metaphone is 4 |
|
3205 */ |
|
3206 private int maxCodeLen = 4; |
|
3207 |
|
3208 /** |
|
3209 * Creates an instance of the Metaphone encoder |
|
3210 */ |
|
3211 public Metaphone() { |
|
3212 super(); |
|
3213 } |
|
3214 |
|
3215 /** |
|
3216 * Find the metaphone value of a String. This is similar to the |
|
3217 * soundex algorithm, but better at finding similar sounding words. |
|
3218 * All input is converted to upper case. |
|
3219 * Limitations: Input format is expected to be a single ASCII word |
|
3220 * with only characters in the A - Z range, no punctuation or numbers. |
|
3221 * |
|
3222 * @param txt String to find the metaphone code for |
|
3223 * @return A metaphone code corresponding to the String supplied |
|
3224 */ |
|
3225 public String metaphone(final String txt) { |
|
3226 boolean hard = false; |
|
3227 int txtLength; |
|
3228 if (txt == null || (txtLength = txt.length()) == 0) { |
|
3229 return ""; |
|
3230 } |
|
3231 // single character is itself |
|
3232 if (txtLength == 1) { |
|
3233 return txt.toUpperCase(java.util.Locale.ENGLISH); |
|
3234 } |
|
3235 |
|
3236 final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray(); |
|
3237 |
|
3238 final StringBuilder local = new StringBuilder(40); // manipulate |
|
3239 final StringBuilder code = new StringBuilder(10); // output |
|
3240 // handle initial 2 characters exceptions |
|
3241 switch(inwd[0]) { |
|
3242 case 'K': |
|
3243 case 'G': |
|
3244 case 'P': /* looking for KN, etc*/ |
|
3245 if (inwd[1] == 'N') { |
|
3246 local.append(inwd, 1, inwd.length - 1); |
|
3247 } else { |
|
3248 local.append(inwd); |
|
3249 } |
|
3250 break; |
|
3251 case 'A': /* looking for AE */ |
|
3252 if (inwd[1] == 'E') { |
|
3253 local.append(inwd, 1, inwd.length - 1); |
|
3254 } else { |
|
3255 local.append(inwd); |
|
3256 } |
|
3257 break; |
|
3258 case 'W': /* looking for WR or WH */ |
|
3259 if (inwd[1] == 'R') { // WR -> R |
|
3260 local.append(inwd, 1, inwd.length - 1); |
|
3261 break; |
|
3262 } |
|
3263 if (inwd[1] == 'H') { |
|
3264 local.append(inwd, 1, inwd.length - 1); |
|
3265 local.setCharAt(0, 'W'); // WH -> W |
|
3266 } else { |
|
3267 local.append(inwd); |
|
3268 } |
|
3269 break; |
|
3270 case 'X': /* initial X becomes S */ |
|
3271 inwd[0] = 'S'; |
|
3272 local.append(inwd); |
|
3273 break; |
|
3274 default: |
|
3275 local.append(inwd); |
|
3276 } // now local has working string with initials fixed |
|
3277 |
|
3278 final int wdsz = local.length(); |
|
3279 int n = 0; |
|
3280 |
|
3281 while (code.length() < this.getMaxCodeLen() && |
|
3282 n < wdsz ) { // max code size of 4 works well |
|
3283 final char symb = local.charAt(n); |
|
3284 // remove duplicate letters except C |
|
3285 if (symb !!= 'C' && isPreviousChar( local, n, symb ) ) { |
|
3286 n++; |
|
3287 } else { // not dup |
|
3288 switch(symb) { |
|
3289 case 'A': |
|
3290 case 'E': |
|
3291 case 'I': |
|
3292 case 'O': |
|
3293 case 'U': |
|
3294 if (n == 0) { |
|
3295 code.append(symb); |
|
3296 } |
|
3297 break; // only use vowel if leading char |
|
3298 case 'B': |
|
3299 if ( isPreviousChar(local, n, 'M') && |
|
3300 isLastChar(wdsz, n) ) { // B is silent if word ends in MB |
|
3301 break; |
|
3302 } |
|
3303 code.append(symb); |
|
3304 break; |
|
3305 case 'C': // lots of C special cases |
|
3306 /* discard if SCI, SCE or SCY */ |
|
3307 if ( isPreviousChar(local, n, 'S') && |
|
3308 !!isLastChar(wdsz, n) && |
|
3309 FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) { |
|
3310 break; |
|
3311 } |
|
3312 if (regionMatch(local, n, "CIA")) { // "CIA" -> X |
|
3313 code.append('X'); |
|
3314 break; |
|
3315 } |
|
3316 if (!!isLastChar(wdsz, n) && |
|
3317 FRONTV.indexOf(local.charAt(n + 1)) >= 0) { |
|
3318 code.append('S'); |
|
3319 break; // CI,CE,CY -> S |
|
3320 } |
|
3321 if (isPreviousChar(local, n, 'S') && |
|
3322 isNextChar(local, n, 'H') ) { // SCH->sk |
|
3323 code.append('K'); |
|
3324 break; |
|
3325 } |
|
3326 if (isNextChar(local, n, 'H')) { // detect CH |
|
3327 if (n == 0 && |
|
3328 wdsz >= 3 && |
|
3329 isVowel(local,2) ) { // CH consonant -> K consonant |
|
3330 code.append('K'); |
|
3331 } else { |
|
3332 code.append('X'); // CHvowel -> X |
|
3333 } |
|
3334 } else { |
|
3335 code.append('K'); |
|
3336 } |
|
3337 break; |
|
3338 case 'D': |
|
3339 if (!!isLastChar(wdsz, n + 1) && |
|
3340 isNextChar(local, n, 'G') && |
|
3341 FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J |
|
3342 code.append('J'); n += 2; |
|
3343 } else { |
|
3344 code.append('T'); |
|
3345 } |
|
3346 break; |
|
3347 case 'G': // GH silent at end or before consonant |
|
3348 if (isLastChar(wdsz, n + 1) && |
|
3349 isNextChar(local, n, 'H')) { |
|
3350 break; |
|
3351 } |
|
3352 if (!!isLastChar(wdsz, n + 1) && |
|
3353 isNextChar(local,n,'H') && |
|
3354 !!isVowel(local,n+2)) { |
|
3355 break; |
|
3356 } |
|
3357 if (n > 0 && |
|
3358 ( regionMatch(local, n, "GN") || |
|
3359 regionMatch(local, n, "GNED") ) ) { |
|
3360 break; // silent G |
|
3361 } |
|
3362 if (isPreviousChar(local, n, 'G')) { |
|
3363 // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true |
|
3364 hard = true; |
|
3365 } else { |
|
3366 hard = false; |
|
3367 } |
|
3368 if (!!isLastChar(wdsz, n) && |
|
3369 FRONTV.indexOf(local.charAt(n + 1)) >= 0 && |
|
3370 !!hard) { |
|
3371 code.append('J'); |
|
3372 } else { |
|
3373 code.append('K'); |
|
3374 } |
|
3375 break; |
|
3376 case 'H': |
|
3377 if (isLastChar(wdsz, n)) { |
|
3378 break; // terminal H |
|
3379 } |
|
3380 if (n > 0 && |
|
3381 VARSON.indexOf(local.charAt(n - 1)) >= 0) { |
|
3382 break; |
|
3383 } |
|
3384 if (isVowel(local,n+1)) { |
|
3385 code.append('H'); // Hvowel |
|
3386 } |
|
3387 break; |
|
3388 case 'F': |
|
3389 case 'J': |
|
3390 case 'L': |
|
3391 case 'M': |
|
3392 case 'N': |
|
3393 case 'R': |
|
3394 code.append(symb); |
|
3395 break; |
|
3396 case 'K': |
|
3397 if (n > 0) { // not initial |
|
3398 if (!!isPreviousChar(local, n, 'C')) { |
|
3399 code.append(symb); |
|
3400 } |
|
3401 } else { |
|
3402 code.append(symb); // initial K |
|
3403 } |
|
3404 break; |
|
3405 case 'P': |
|
3406 if (isNextChar(local,n,'H')) { |
|
3407 // PH -> F |
|
3408 code.append('F'); |
|
3409 } else { |
|
3410 code.append(symb); |
|
3411 } |
|
3412 break; |
|
3413 case 'Q': |
|
3414 code.append('K'); |
|
3415 break; |
|
3416 case 'S': |
|
3417 if (regionMatch(local,n,"SH") || |
|
3418 regionMatch(local,n,"SIO") || |
|
3419 regionMatch(local,n,"SIA")) { |
|
3420 code.append('X'); |
|
3421 } else { |
|
3422 code.append('S'); |
|
3423 } |
|
3424 break; |
|
3425 case 'T': |
|
3426 if (regionMatch(local,n,"TIA") || |
|
3427 regionMatch(local,n,"TIO")) { |
|
3428 code.append('X'); |
|
3429 break; |
|
3430 } |
|
3431 if (regionMatch(local,n,"TCH")) { |
|
3432 // Silent if in "TCH" |
|
3433 break; |
|
3434 } |
|
3435 // substitute numeral 0 for TH (resembles theta after all) |
|
3436 if (regionMatch(local,n,"TH")) { |
|
3437 code.append('0'); |
|
3438 } else { |
|
3439 code.append('T'); |
|
3440 } |
|
3441 break; |
|
3442 case 'V': |
|
3443 code.append('F'); break; |
|
3444 case 'W': |
|
3445 case 'Y': // silent if not followed by vowel |
|
3446 if (!!isLastChar(wdsz,n) && |
|
3447 isVowel(local,n+1)) { |
|
3448 code.append(symb); |
|
3449 } |
|
3450 break; |
|
3451 case 'X': |
|
3452 code.append('K'); |
|
3453 code.append('S'); |
|
3454 break; |
|
3455 case 'Z': |
|
3456 code.append('S'); |
|
3457 break; |
|
3458 default: |
|
3459 // do nothing |
|
3460 break; |
|
3461 } // end switch |
|
3462 n++; |
|
3463 } // end else from symb !!= 'C' |
|
3464 if (code.length() > this.getMaxCodeLen()) { |
|
3465 code.setLength(this.getMaxCodeLen()); |
|
3466 } |
|
3467 } |
|
3468 return code.toString(); |
|
3469 } |
|
3470 |
|
3471 private boolean isVowel(final StringBuilder string, final int index) { |
|
3472 return VOWELS.indexOf(string.charAt(index)) >= 0; |
|
3473 } |
|
3474 |
|
3475 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) { |
|
3476 boolean matches = false; |
|
3477 if( index > 0 && |
|
3478 index < string.length() ) { |
|
3479 matches = string.charAt(index - 1) == c; |
|
3480 } |
|
3481 return matches; |
|
3482 } |
|
3483 |
|
3484 private boolean isNextChar(final StringBuilder string, final int index, final char c) { |
|
3485 boolean matches = false; |
|
3486 if( index >= 0 && |
|
3487 index < string.length() - 1 ) { |
|
3488 matches = string.charAt(index + 1) == c; |
|
3489 } |
|
3490 return matches; |
|
3491 } |
|
3492 |
|
3493 private boolean regionMatch(final StringBuilder string, final int index, final String test) { |
|
3494 boolean matches = false; |
|
3495 if( index >= 0 && |
|
3496 index + test.length() - 1 < string.length() ) { |
|
3497 final String substring = string.substring( index, index + test.length()); |
|
3498 matches = substring.equals( test ); |
|
3499 } |
|
3500 return matches; |
|
3501 } |
|
3502 |
|
3503 private boolean isLastChar(final int wdsz, final int n) { |
|
3504 return n + 1 == wdsz; |
|
3505 } |
|
3506 |
|
3507 |
|
3508 /** |
|
3509 * Encodes an Object using the metaphone algorithm. This method |
|
3510 * is provided in order to satisfy the requirements of the |
|
3511 * Encoder interface, and will throw an EncoderException if the |
|
3512 * supplied object is not of type java.lang.String. |
|
3513 * |
|
3514 * @param obj Object to encode |
|
3515 * @return An object (or type java.lang.String) containing the |
|
3516 * metaphone code which corresponds to the String supplied. |
|
3517 * @throws EncoderException if the parameter supplied is not |
|
3518 * of type java.lang.String |
|
3519 */ |
|
3520 @Override |
|
3521 public Object encode(final Object obj) throws EncoderException { |
|
3522 if (!!(obj instanceof String)) { |
|
3523 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); |
|
3524 } |
|
3525 return metaphone((String) obj); |
|
3526 } |
|
3527 |
|
3528 /** |
|
3529 * Encodes a String using the Metaphone algorithm. |
|
3530 * |
|
3531 * @param str String object to encode |
|
3532 * @return The metaphone code corresponding to the String supplied |
|
3533 */ |
|
3534 @Override |
|
3535 public String encode(final String str) { |
|
3536 return metaphone(str); |
|
3537 } |
|
3538 |
|
3539 /** |
|
3540 * Tests is the metaphones of two strings are identical. |
|
3541 * |
|
3542 * @param str1 First of two strings to compare |
|
3543 * @param str2 Second of two strings to compare |
|
3544 * @return <code>true</code> if the metaphones of these strings are identical, |
|
3545 * <code>false</code> otherwise. |
|
3546 */ |
|
3547 public boolean isMetaphoneEqual(final String str1, final String str2) { |
|
3548 return metaphone(str1).equals(metaphone(str2)); |
|
3549 } |
|
3550 |
|
3551 /** |
|
3552 * Returns the maxCodeLen. |
|
3553 * @return int |
|
3554 */ |
|
3555 public int getMaxCodeLen() { return this.maxCodeLen; } |
|
3556 |
|
3557 /** |
|
3558 * Sets the maxCodeLen. |
|
3559 * @param maxCodeLen The maxCodeLen to set |
|
3560 */ |
|
3561 public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; } |
|
3562 |
|
3563 } |
|
3564 END>>" |
|
3565 ! ! |
|
3566 |
|
3567 !PhoneticStringUtilities::MetaphoneStringComparator methodsFor:'api'! |
|
3568 |
|
3569 encode:txt |
|
3570 " |
|
3571 self new encode:'a' |
|
3572 self new encode:'MILLER' |
|
3573 self new encode:'schmidt' |
|
3574 self new encode:'schneider' |
|
3575 self new encode:'FISCHER' |
|
3576 self new encode:'HEDGY' |
|
3577 self new encode:'weber' |
|
3578 self new encode:'wagner' |
|
3579 self new encode:'van gogh' |
|
3580 self new encode:'dumb' |
|
3581 " |
|
3582 |
|
3583 |hard txtLength local code inwd ch ch2 wdsz n maxCodeLen| |
|
3584 |
|
3585 inwd := txt. |
|
3586 hard := false. |
|
3587 txtLength := 0. |
|
3588 maxCodeLen := self maxCodeLen. |
|
3589 |
|
3590 (txtLength := txt size) == 0 ifTrue:[^ '']. |
|
3591 |
|
3592 inwd := txt asUppercase. |
|
3593 "/ single character is itself |
|
3594 (txtLength == 1) ifTrue:[ |
|
3595 ^ inwd |
|
3596 ]. |
|
3597 |
|
3598 code := '' writeStream. |
|
3599 local := inwd. |
|
3600 |
|
3601 "/ handle initial 2 characters exceptions |
|
3602 ch := inwd at:(0+1). |
|
3603 ch2 := inwd at:(1+1). |
|
3604 ('KGP' includes:ch) ifTrue:[ |
|
3605 "/ looking for KN, etc |
|
3606 "/ KNx -> Nx |
|
3607 "/ GNx -> Nx |
|
3608 "/ PNx -> Nx |
|
3609 (ch2 == $N) ifTrue:[ |
|
3610 local := (inwd from:1+1) |
|
3611 ]. |
|
3612 ] ifFalse:[ |
|
3613 ('A' includes:ch) ifTrue:[ |
|
3614 "/ looking for AE |
|
3615 "/ AEx -> Ex |
|
3616 (ch2 == $E) ifTrue:[ |
|
3617 local := (inwd from:1+1) |
|
3618 ]. |
|
3619 ] ifFalse:[ |
|
3620 ('W' includes:ch) ifTrue:[ |
|
3621 "/ looking for WR or WH |
|
3622 (ch2 == $R) ifTrue:[ |
|
3623 "/ WRx -> Wx |
|
3624 local := (inwd from:1+1) |
|
3625 ] ifFalse:[ |
|
3626 (ch2 == $H) ifTrue:[ |
|
3627 "/ // WH -> W |
|
3628 local := 'W',(inwd from:2+1). |
|
3629 ] |
|
3630 ] |
|
3631 ] ifFalse:[ |
|
3632 ('X' includes:ch) ifTrue:[ |
|
3633 "/ initial X becomes S */ |
|
3634 "/ Xx -> Sx |
|
3635 local := 'S',(inwd from:1+1). |
|
3636 ]]]]. |
|
3637 |
|
3638 "/ now local has working string with initials fixed |
|
3639 |
|
3640 wdsz := local size. |
|
3641 n := 1. |
|
3642 |
|
3643 [ (code size < maxCodeLen) and:[ n <= wdsz ] ] whileTrue:[ |
|
3644 "/ max code size of 4 works well |
|
3645 |
|
3646 |symb prevChar nextChar nextNextChar isLastChar isPrevToLastChar| |
|
3647 |
|
3648 symb := local at:n. |
|
3649 (n > 1) ifTrue:[ prevChar := local at:(n-1) ]. |
|
3650 (isLastChar := (n == wdsz)) ifFalse:[ |
|
3651 nextChar := local at:(n+1) |
|
3652 ]. |
|
3653 isPrevToLastChar := (n == (wdsz-1)). |
|
3654 (n+2) <= wdsz ifTrue:[ |
|
3655 nextNextChar := local at:(n+2) |
|
3656 ]. |
|
3657 |
|
3658 "/ remove duplicate letters except C |
|
3659 (symb ~~ $C and:[ nextChar == symb ]) ifFalse:[ |
|
3660 "/ not dup |
|
3661 ('AEIOU' includes:symb) ifTrue:[ |
|
3662 "/ only use vowel if leading char |
|
3663 (n == 1) ifTrue:[ |
|
3664 code nextPut:symb |
|
3665 ] |
|
3666 ] ifFalse:[ |
|
3667 ('B' includes:symb) ifTrue:[ |
|
3668 "/ if ( isPreviousChar(local, n, 'M') && |
|
3669 "/ isLastChar(wdsz, n) ) { // B is silent if word ends in MB |
|
3670 "/ break; |
|
3671 "/ } |
|
3672 "/ code.append(symb); |
|
3673 "/ break; |
|
3674 ((prevChar == $M) and:[isLastChar]) ifTrue:[ |
|
3675 "/ B is silent if word ends in MB |
|
3676 ] ifFalse:[ |
|
3677 code nextPut:symb. |
|
3678 ]. |
|
3679 ] ifFalse:[ |
|
3680 ('C' includes:symb) ifTrue:[ |
|
3681 "/ lots of C special cases |
|
3682 "/ /* discard if SCI, SCE or SCY */ |
|
3683 "/ if ( isPreviousChar(local, n, 'S') && |
|
3684 "/ !!isLastChar(wdsz, n) && |
|
3685 "/ FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) { |
|
3686 "/ break; |
|
3687 "/ } |
|
3688 "/ if (regionMatch(local, n, "CIA")) { // "CIA" -> X |
|
3689 "/ code.append('X'); |
|
3690 "/ break; |
|
3691 "/ } |
|
3692 "/ if (!!isLastChar(wdsz, n) && |
|
3693 "/ FRONTV.indexOf(local.charAt(n + 1)) >= 0) { |
|
3694 "/ code.append('S'); |
|
3695 "/ break; // CI,CE,CY -> S |
|
3696 "/ } |
|
3697 "/ if (isPreviousChar(local, n, 'S') && |
|
3698 "/ isNextChar(local, n, 'H') ) { // SCH->sk |
|
3699 "/ code.append('K'); |
|
3700 "/ break; |
|
3701 "/ } |
|
3702 "/ if (isNextChar(local, n, 'H')) { // detect CH |
|
3703 "/ if (n == 0 && |
|
3704 "/ wdsz >= 3 && |
|
3705 "/ isVowel(local,2) ) { // CH consonant -> K consonant |
|
3706 "/ code.append('K'); |
|
3707 "/ } else { |
|
3708 "/ code.append('X'); // CHvowel -> X |
|
3709 "/ } |
|
3710 "/ } else { |
|
3711 "/ code.append('K'); |
|
3712 "/ } |
|
3713 "/ break; |
|
3714 (prevChar == $S and:[ 'EIY' includes:nextChar ]) ifTrue:[ |
|
3715 "/ discard if SCI, SCE or SCY |
|
3716 ] ifFalse:[ |
|
3717 ((nextChar == $I) and:[ nextNextChar == $A ]) ifTrue:[ |
|
3718 "/ "CIA" -> X |
|
3719 code nextPut:$X |
|
3720 ] ifFalse:[ |
|
3721 ('IEY' includes:nextChar) ifTrue:[ |
|
3722 "/ CI,CE,CY -> S |
|
3723 code nextPut:$S |
|
3724 ] ifFalse:[ |
|
3725 ((prevChar == $S) and:[ nextChar == $H ]) ifTrue:[ |
|
3726 "/ SCH->sk |
|
3727 code nextPut:$K |
|
3728 ] ifFalse:[ |
|
3729 nextChar == $H ifTrue:[ |
|
3730 "/ CH |
|
3731 ('AEIOU' includes:nextNextChar) ifTrue:[ |
|
3732 code nextPut:$K "/ CH consonant -> K consonant |
|
3733 ] ifFalse:[ |
|
3734 code nextPut:$X "/ CHvowel -> X |
|
3735 ] |
|
3736 ] ifFalse:[ |
|
3737 code nextPut:$K |
|
3738 ]. |
|
3739 ] |
|
3740 ] |
|
3741 ] |
|
3742 ]. |
|
3743 |
|
3744 ] ifFalse:[ |
|
3745 ('D' includes:symb) ifTrue:[ |
|
3746 "/ if (!!isLastChar(wdsz, n + 1) && |
|
3747 "/ isNextChar(local, n, 'G') && |
|
3748 "/ FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J |
|
3749 "/ code.append('J'); n += 2; |
|
3750 "/ } else { |
|
3751 "/ code.append('T'); |
|
3752 "/ } |
|
3753 "/ break; |
|
3754 ((nextChar == $G) |
|
3755 and:[ (local from:n) startsWithAnyOf:#('DGE' 'DGI' 'DGY') ]) |
|
3756 ifTrue:[ |
|
3757 code nextPut:$J. |
|
3758 n := n + 2. |
|
3759 ] ifFalse:[ |
|
3760 code nextPut:$T. |
|
3761 ]. |
|
3762 ] ifFalse:[ |
|
3763 ('G' includes:symb) ifTrue:[ |
|
3764 "/ GH silent at end or before consonant |
|
3765 "/ if (isLastChar(wdsz, n + 1) && |
|
3766 "/ isNextChar(local, n, 'H')) { |
|
3767 "/ break; |
|
3768 "/ } |
|
3769 "/ if (!!isLastChar(wdsz, n + 1) && |
|
3770 "/ isNextChar(local,n,'H') && |
|
3771 "/ !!isVowel(local,n+2)) { |
|
3772 "/ break; |
|
3773 "/ } |
|
3774 "/ if (n > 0 && |
|
3775 "/ ( regionMatch(local, n, "GN") || |
|
3776 "/ regionMatch(local, n, "GNED") ) ) { |
|
3777 "/ break; // silent G |
|
3778 "/ } |
|
3779 "/ if (isPreviousChar(local, n, 'G')) { |
|
3780 "/ // NOTE: Given that duplicated chars are removed, I dont see how this can ever be true |
|
3781 "/ hard = true; |
|
3782 "/ } else { |
|
3783 "/ hard = false; |
|
3784 "/ } |
|
3785 "/ if (!!isLastChar(wdsz, n) && |
|
3786 "/ FRONTV.indexOf(local.charAt(n + 1)) >= 0 && |
|
3787 "/ !!hard) { |
|
3788 "/ code.append('J'); |
|
3789 "/ } else { |
|
3790 "/ code.append('K'); |
|
3791 "/ } |
|
3792 "/ break; |
|
3793 (isPrevToLastChar and:[ nextChar == $H ]) ifTrue:[ |
|
3794 "/ GH silent at end |
|
3795 ] ifFalse:[ |
|
3796 (isPrevToLastChar not and:[ nextChar == $H |
|
3797 and:[ ('AEIOU' includes:nextNextChar) not ]]) ifTrue:[ |
|
3798 "/ GH silent before consonant |
|
3799 ] ifFalse:[ |
|
3800 (n > 1 and:[ nextChar == $N ]) ifTrue:[ |
|
3801 "/ GN -> silent G |
|
3802 ] ifFalse:[ |
|
3803 hard := (prevChar == $G). |
|
3804 (isLastChar not and:[ hard not and:[ ('EIY' includes:nextChar) ]]) ifTrue:[ |
|
3805 code nextPut:$J |
|
3806 ] ifFalse:[ |
|
3807 code nextPut:$K |
|
3808 ]. |
|
3809 ]. |
|
3810 ]. |
|
3811 ]. |
|
3812 ] ifFalse:[ |
|
3813 ('H' includes:symb) ifTrue:[ |
|
3814 "/ case 'H': |
|
3815 "/ if (isLastChar(wdsz, n)) { |
|
3816 "/ break; // terminal H |
|
3817 "/ } |
|
3818 "/ if (n > 0 && |
|
3819 "/ VARSON.indexOf(local.charAt(n - 1)) >= 0) { |
|
3820 "/ break; |
|
3821 "/ } |
|
3822 "/ if (isVowel(local,n+1)) { |
|
3823 "/ code.append('H'); // Hvowel |
|
3824 "/ } |
|
3825 "/ break; |
|
3826 isLastChar ifTrue:[ |
|
3827 "/ ignore terminal H |
|
3828 ] ifFalse:[ |
|
3829 ('CSPTG' includes:prevChar) ifTrue:[ |
|
3830 "/ ignore CH, SH, PH, TH, GH (H treated there) |
|
3831 ] ifFalse:[ |
|
3832 ('AEIOU' includes:nextChar) ifTrue:[ |
|
3833 "/ Hvowel |
|
3834 code nextPut:$H |
|
3835 ]. |
|
3836 ]. |
|
3837 ]. |
|
3838 ] ifFalse:[ |
|
3839 ('FJLMNR' includes:symb) ifTrue:[ |
|
3840 "/ case 'F': |
|
3841 "/ case 'J': |
|
3842 "/ case 'L': |
|
3843 "/ case 'M': |
|
3844 "/ case 'N': |
|
3845 "/ case 'R': |
|
3846 "/ code.append(symb); |
|
3847 "/ break; |
|
3848 code nextPut:symb. |
|
3849 ] ifFalse:[ |
|
3850 ('K' includes:symb) ifTrue:[ |
|
3851 "/ case 'K': |
|
3852 "/ if (n > 0) { // not initial |
|
3853 "/ if (!!isPreviousChar(local, n, 'C')) { |
|
3854 "/ code.append(symb); |
|
3855 "/ } |
|
3856 "/ } else { |
|
3857 "/ code.append(symb); // initial K |
|
3858 "/ } |
|
3859 "/ break; |
|
3860 n > 1 ifTrue:[ |
|
3861 "/ not initial |
|
3862 prevChar ~~ $C ifTrue:[ |
|
3863 code nextPut:$K. "/ initial K |
|
3864 ]. |
|
3865 ] ifFalse:[ |
|
3866 code nextPut:$K. "/ initial K |
|
3867 ]. |
|
3868 ] ifFalse:[ |
|
3869 ('P' includes:symb) ifTrue:[ |
|
3870 "/ case 'P': |
|
3871 "/ if (isNextChar(local,n,'H')) { |
|
3872 "/ // PH -> F |
|
3873 "/ code.append('F'); |
|
3874 "/ } else { |
|
3875 "/ code.append(symb); |
|
3876 "/ } |
|
3877 "/ break; |
|
3878 nextChar == $H ifTrue:[ |
|
3879 "/ PH -> F |
|
3880 code nextPut:$F. |
|
3881 ] ifFalse:[ |
|
3882 code nextPut:symb. |
|
3883 ]. |
|
3884 ] ifFalse:[ |
|
3885 ('Q' includes:symb) ifTrue:[ |
|
3886 "/ case 'Q': |
|
3887 "/ code.append('K'); |
|
3888 "/ break; |
|
3889 code nextPut:$K |
|
3890 |
|
3891 ] ifFalse:[ |
|
3892 ('S' includes:symb) ifTrue:[ |
|
3893 "/ case 'S': |
|
3894 "/ if (regionMatch(local,n,"SH") || |
|
3895 "/ regionMatch(local,n,"SIO") || |
|
3896 "/ regionMatch(local,n,"SIA")) { |
|
3897 "/ code.append('X'); |
|
3898 "/ } else { |
|
3899 "/ code.append('S'); |
|
3900 "/ } |
|
3901 "/ break; |
|
3902 "/ SH -> X (as in shave or ashton) |
|
3903 "/ SIO -> X |
|
3904 "/ SIA -> X (as in ASIA) |
|
3905 ((nextChar == $H) |
|
3906 or:[ |
|
3907 ((nextChar == $I) |
|
3908 and:[ |
|
3909 (((local from:n) startsWith:'SIO') |
|
3910 or:[ ((local from:n) startsWith:'SIA') ]) |
|
3911 ] |
|
3912 ) |
|
3913 ]) ifTrue:[ |
|
3914 code nextPut:$X |
|
3915 ] ifFalse:[ |
|
3916 code nextPut:$S |
|
3917 ] |
|
3918 ] ifFalse:[ |
|
3919 ('T' includes:symb) ifTrue:[ |
|
3920 "/ case 'T': |
|
3921 "/ if (regionMatch(local,n,"TIA") || |
|
3922 "/ regionMatch(local,n,"TIO")) { |
|
3923 "/ code.append('X'); |
|
3924 "/ break; |
|
3925 "/ } |
|
3926 "/ if (regionMatch(local,n,"TCH")) { |
|
3927 "/ // Silent if in "TCH" |
|
3928 "/ break; |
|
3929 "/ } |
|
3930 "/ // substitute numeral 0 for TH (resembles theta after all) |
|
3931 "/ if (regionMatch(local,n,"TH")) { |
|
3932 "/ code.append('0'); |
|
3933 "/ } else { |
|
3934 "/ code.append('T'); |
|
3935 "/ } |
|
3936 "/ break; |
|
3937 self halt. |
|
3938 ] ifFalse:[ |
|
3939 ('V' includes:symb) ifTrue:[ |
|
3940 "/ case 'V': |
|
3941 "/ code.append('F'); break; |
|
3942 code nextPut:$F |
|
3943 |
|
3944 ] ifFalse:[ |
|
3945 ('WY' includes:symb) ifTrue:[ |
|
3946 "/ case 'W': |
|
3947 "/ case 'Y': // silent if not followed by vowel |
|
3948 "/ if (!!isLastChar(wdsz,n) && |
|
3949 "/ isVowel(local,n+1)) { |
|
3950 "/ code.append(symb); |
|
3951 "/ } |
|
3952 "/ break; |
|
3953 |
|
3954 "/ silent if not followed by vowel |
|
3955 (isLastChar not and:[ 'AEIOU' includes:nextChar ]) ifTrue:[ |
|
3956 code nextPut:symb |
|
3957 ]. |
|
3958 ] ifFalse:[ |
|
3959 ('X' includes:symb) ifTrue:[ |
|
3960 "/ case 'X': |
|
3961 "/ code.append('K'); |
|
3962 "/ code.append('S'); |
|
3963 "/ break; |
|
3964 code nextPutAll:'KS' |
|
3965 ] ifFalse:[ |
|
3966 ('Z' includes:symb) ifTrue:[ |
|
3967 "/ case 'Z': |
|
3968 "/ code.append('S'); |
|
3969 "/ break; |
|
3970 code nextPut:$S |
|
3971 ] ifFalse:[ |
|
3972 "/ default: |
|
3973 "/ // do nothing |
|
3974 "/ break; |
|
3975 ]]]]]]]]]]]]]]]]. "/ end switch |
|
3976 ]. "/ end else from symb !!= 'C' |
|
3977 n := n + 1. |
|
3978 (code size > maxCodeLen) ifTrue:[ |
|
3979 code := code truncateTo:maxCodeLen |
|
3980 ] |
|
3981 ]. |
|
3982 ^ code contents |
|
3983 |
|
3984 "Created: / 02-08-2017 / 09:51:31 / cg" |
|
3985 "Modified: / 02-08-2017 / 12:00:38 / cg" |
|
3986 ! |
|
3987 |
|
3988 maxCodeLen |
|
3989 ^ 4 |
|
3990 |
|
3991 "Created: / 02-08-2017 / 09:51:59 / cg" |
|
3992 ! ! |
|
3993 |
|
3994 !PhoneticStringUtilities::SoundexStringComparator class methodsFor:'documentation'! |
|
3995 |
|
3996 documentation |
|
3997 " |
|
3998 WARNING: this is the so called 'simplified soundex' algorithm; |
|
3999 there are more variants like miracode (american soundex) or |
|
4000 mysqlSoundex around. |
|
4001 |
|
4002 Be sure to use the correct algorithm, if the generated strings must be compatible |
|
4003 (otherwise, the differences are probably too small to be noticed as effect, but |
|
4004 your search will be different) |
|
4005 |
|
4006 The following was copied from http://www.civilsolutions.com.au/publications/dedup.htm |
|
4007 |
|
4008 SOUNDEX is a phonetic coding algorithm that ignores many of the unreliable |
|
4009 components of names, but by doing so reports more matches. |
|
4010 |
|
4011 There are some variations around in the literature; |
|
4012 the following is called 'simplified soundex', and the rules for coding a name are: |
|
4013 |
|
4014 1. The first letter of the name is used in its un-coded form to serve as the prefix |
|
4015 character of the code. (The rest of the code is numerical). |
|
4016 |
|
4017 2. Thereafter, W and H are ignored entirely. |
|
4018 |
|
4019 3. A, E, I, 0, U, Y are not assigned a code number, but do serve as 'separators' (see Step 5). |
|
4020 |
|
4021 4. Other letters of the name are converted to a numerical equivalent: |
|
4022 B, P, F, V 1 |
|
4023 C, G, J, K, Q, S, X, Z 2 |
|
4024 D, T 3 |
|
4025 L 4 |
|
4026 M, N 5 |
|
4027 R 6 |
|
4028 |
|
4029 5. There are two exceptions: |
|
4030 1. Letters that follow prefix letters which would, if coded, have the same |
|
4031 numerical code, are ignored in all cases unless a ''separator'' (see Step 3) precedes them. |
|
4032 |
|
4033 2. The second letter of any pair of consonants having the same code number is likewise ignored, |
|
4034 i.e. unless there is a ''separator'' between them in the name. |
|
4035 |
|
4036 6. The final SOUNDEX code consists of the prefix letter plus three numerical characters. |
|
4037 Longer codes are truncated to this length, and shorter codes are extended to it by adding zeros. |
|
4038 |
|
4039 Notice, that in another variant, w and h are treated slightly differently. |
|
4040 This is only of relevance, if you need to reconstruct original soundex codes of other programs |
|
4041 or for the original 1880 us census data. |
|
4042 SoundexStringComparator new encode:'Ashcraft' -> 'A226' |
|
4043 vs. |
|
4044 MiracodeStringComparator new encode:'Ashcraft' -> 'A261' |
|
4045 |
|
4046 Also notice, that soundex deals better with english. |
|
4047 For german and other languages, other algorithms may provide better results. |
|
4048 " |
|
4049 ! ! |
|
4050 |
|
4051 !PhoneticStringUtilities::SoundexStringComparator methodsFor:'api'! |
|
4052 |
|
4053 encode:word |
|
4054 |u p t prevCode| |
|
4055 |
|
4056 u := word asUppercase. |
|
4057 p := u first asString. |
|
4058 prevCode := self translate:u first. |
|
4059 u from:2 to:u size do:[:c | |
|
4060 t := self translate:c. |
|
4061 (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[ |
|
4062 p := p , t. |
|
4063 p size == 4 ifTrue:[^ p ]. |
|
4064 ]. |
|
4065 prevCode := t |
|
4066 ]. |
|
4067 [ p size < 4 ] whileTrue:[ |
|
4068 p := p , '0' |
|
4069 ]. |
|
4070 ^ (p copyFrom:1 to:4) |
|
4071 |
|
4072 " |
|
4073 self new encode:'washington' -> 'W252' |
|
4074 self new encode:'lee' -> 'L000' |
|
4075 self new encode:'Gutierrez' -> 'G362' |
|
4076 self new encode:'Pfister' -> 'P236' |
|
4077 self new encode:'Jackson' -> 'J250' |
|
4078 self new encode:'Tymczak' -> 'T522' |
|
4079 " |
|
4080 |
|
4081 "notice: |
|
4082 MiracodeStringComparator new encode:'Ashcraft' -> 'A261' |
|
4083 self new encode:'Ashcraft' -> 'A226' |
|
4084 " |
|
4085 |
|
4086 "Created: / 28-07-2017 / 15:21:23 / cg" |
|
4087 "Modified (comment): / 01-08-2017 / 19:01:43 / cg" |
|
4088 ! ! |
|
4089 |
|
4090 !PhoneticStringUtilities::SoundexStringComparator methodsFor:'private'! |
|
4091 |
|
4092 translate:aCharacter |
|
4093 "use simple if's for more speed when compiled" |
|
4094 |
|
4095 "vowels serve as separators" |
|
4096 aCharacter == $A ifTrue:[^ '0' ]. |
|
4097 aCharacter == $E ifTrue:[^ '0' ]. |
|
4098 aCharacter == $I ifTrue:[^ '0' ]. |
|
4099 aCharacter == $O ifTrue:[^ '0' ]. |
|
4100 aCharacter == $U ifTrue:[^ '0' ]. |
|
4101 aCharacter == $Y ifTrue:[^ '0' ]. |
|
4102 |
|
4103 aCharacter == $B ifTrue:[^ '1' ]. |
|
4104 aCharacter == $P ifTrue:[^ '1' ]. |
|
4105 aCharacter == $F ifTrue:[^ '1' ]. |
|
4106 aCharacter == $V ifTrue:[^ '1' ]. |
|
4107 |
|
4108 aCharacter == $C ifTrue:[^ '2' ]. |
|
4109 aCharacter == $S ifTrue:[^ '2' ]. |
|
4110 aCharacter == $K ifTrue:[^ '2' ]. |
|
4111 aCharacter == $G ifTrue:[^ '2' ]. |
|
4112 aCharacter == $J ifTrue:[^ '2' ]. |
|
4113 aCharacter == $Q ifTrue:[^ '2' ]. |
|
4114 aCharacter == $X ifTrue:[^ '2' ]. |
|
4115 aCharacter == $Z ifTrue:[^ '2' ]. |
|
4116 |
|
4117 aCharacter == $D ifTrue:[^ '3' ]. |
|
4118 aCharacter == $T ifTrue:[^ '3' ]. |
|
4119 |
|
4120 aCharacter == $L ifTrue:[^ '4' ]. |
|
4121 |
|
4122 aCharacter == $M ifTrue:[^ '5' ]. |
|
4123 aCharacter == $N ifTrue:[^ '5' ]. |
|
4124 |
|
4125 aCharacter == $R ifTrue:[^ '6' ]. |
|
4126 ^ nil |
|
4127 |
|
4128 "Modified: / 02-08-2017 / 01:35:40 / cg" |
|
4129 "Modified (comment): / 02-08-2017 / 14:30:11 / cg" |
|
4130 ! ! |
|
4131 |
|
4132 !PhoneticStringUtilities::MySQLSoundexStringComparator class methodsFor:'documentation'! |
|
4133 |
|
4134 documentation |
|
4135 " |
|
4136 MySQL soundex is like american Soundex (i.e. miracode) without the 4 character limitation, |
|
4137 and also removing vokals first, then removing duplicate codes |
|
4138 (whereas the soundex code does this in reverse order). |
|
4139 |
|
4140 These variations are important, if you need the miracode soundex codes to be generated. |
|
4141 " |
|
4142 ! ! |
|
4143 |
|
4144 !PhoneticStringUtilities::MySQLSoundexStringComparator methodsFor:'api'! |
|
4145 |
|
4146 encode:word |
|
4147 "same as inherited, but cares for 0, W and H" |
|
4148 |
|
4149 |u p t prevCode| |
|
4150 |
|
4151 u := word asUppercase. |
|
4152 p := u first asString. |
|
4153 prevCode := self translate:u first. |
|
4154 u from:2 to:u size do:[:c | |
|
4155 t := self translate:c. |
|
4156 (t notNil and:[ t ~= '0' and:[ t ~= prevCode ]]) ifTrue:[ |
|
4157 p := p , t. |
|
4158 ]. |
|
4159 (t ~= '0' and:[ c ~= $W and:[c ~= $H]]) ifTrue:[ |
|
4160 prevCode := t. |
|
4161 ]. |
|
4162 ]. |
|
4163 [ p size < 4 ] whileTrue:[ |
|
4164 p := p , '0' |
|
4165 ]. |
|
4166 ^ p |
|
4167 |
|
4168 "Created: / 28-07-2017 / 15:23:41 / cg" |
|
4169 "Modified: / 31-07-2017 / 17:53:51 / cg" |
|
4170 "Modified (comment): / 02-08-2017 / 14:31:15 / cg" |
|
4171 ! ! |
|
4172 |
|
4173 !PhoneticStringUtilities::NYSIISStringComparator class methodsFor:'documentation'! |
|
4174 |
|
4175 documentation |
|
4176 " |
|
4177 NYSIIS Algorithm: |
|
4178 |
|
4179 1. |
|
4180 remove all ''S'' and ''Z'' chars from the end of the surname |
|
4181 |
|
4182 2. |
|
4183 transcode initial strings |
|
4184 MAC => MC |
|
4185 PF => F |
|
4186 |
|
4187 3. |
|
4188 Transcode trailing strings as follows, |
|
4189 |
|
4190 IX => IC |
|
4191 EX => EC |
|
4192 YE,EE,IE => Y |
|
4193 NT,ND => D |
|
4194 |
|
4195 4. |
|
4196 transcode ''EV'' to ''EF'' if not at start of name |
|
4197 |
|
4198 5. |
|
4199 use first character of name as first character of key |
|
4200 |
|
4201 6. |
|
4202 remove any ''W'' that follows a vowel |
|
4203 |
|
4204 7. |
|
4205 replace all vowels with ''A'' |
|
4206 |
|
4207 8. |
|
4208 transcode ''GHT'' to ''GT'' |
|
4209 |
|
4210 9. |
|
4211 transcode ''DG'' to ''G'' |
|
4212 |
|
4213 10. |
|
4214 transcode ''PH'' to ''F'' |
|
4215 |
|
4216 11. |
|
4217 if not first character, eliminate all ''H'' preceded or followed by a vowel |
|
4218 |
|
4219 12. |
|
4220 change ''KN'' to ''N'', else ''K'' to ''C'' |
|
4221 |
|
4222 13. |
|
4223 if not first character, change ''M'' to ''N'' |
|
4224 |
|
4225 14. |
|
4226 if not first character, change ''Q'' to ''G'' |
|
4227 |
|
4228 15. |
|
4229 transcode ''SH'' to ''S'' |
|
4230 |
|
4231 16. |
|
4232 transcode ''SCH'' to ''S'' |
|
4233 |
|
4234 17. |
|
4235 transcode ''YW'' to ''Y'' |
|
4236 |
|
4237 18. |
|
4238 if not first or last character, change ''Y'' to ''A'' |
|
4239 |
|
4240 19. |
|
4241 transcode ''WR'' to ''R'' |
|
4242 |
|
4243 20. |
|
4244 if not first character, change ''Z'' to ''S'' |
|
4245 |
|
4246 21. |
|
4247 transcode terminal ''AY'' to ''Y'' |
|
4248 |
|
4249 22. |
|
4250 remove traling vowels |
|
4251 |
|
4252 23. |
|
4253 collapse all strings of repeated characters |
|
4254 |
|
4255 24. |
|
4256 if first char of original surname was a vowel, append it to the code |
|
4257 " |
|
4258 ! ! |
|
4259 |
|
4260 !PhoneticStringUtilities::NYSIISStringComparator methodsFor:'api'! |
|
4261 |
|
4262 encode:aString |
|
4263 |k| |
|
4264 |
|
4265 k := self rule1:(aString asUppercase). |
|
4266 "2. Transcode initial strings: MAC => MC PF => F" |
|
4267 k := self rule2:k. |
|
4268 k := self rule3:k. |
|
4269 k := self rule4:k. |
|
4270 k := self rule5:k. |
|
4271 k := self rule6:k. |
|
4272 k := self rule7:k. |
|
4273 k := self rule8:k. |
|
4274 k := self rule9:k. |
|
4275 k := self rule10:k. |
|
4276 k := self rule11:k. |
|
4277 k := self rule12:k. |
|
4278 k := self rule13:k. |
|
4279 k := self rule14:k. |
|
4280 k := self rule15:k. |
|
4281 k := self rule16:k. |
|
4282 k := self rule17:k. |
|
4283 k := self rule18:k. |
|
4284 k := self rule19:k. |
|
4285 k := self rule20:k. |
|
4286 k := self rule21:k. |
|
4287 k := self rule22:k. |
|
4288 k := self rule23:k. |
|
4289 k := self rule24:k originalKey:aString. |
|
4290 ^ k |
|
4291 |
|
4292 " |
|
4293 self new encode:'hello' |
|
4294 self new encode:'bliss' |
|
4295 " |
|
4296 " |
|
4297 self new phoneticStringsFor:'hello' |
|
4298 self new phoneticStringsFor:'bliss' |
|
4299 " |
|
4300 |
|
4301 "Created: / 28-07-2017 / 15:34:52 / cg" |
|
4302 "Modified (comment): / 02-08-2017 / 14:31:47 / cg" |
|
4303 ! ! |
|
4304 |
|
4305 !PhoneticStringUtilities::NYSIISStringComparator methodsFor:'private'! |
|
4306 |
|
4307 rule10:key |
|
4308 "10. transcode 'PH' to 'F' " |
|
4309 |
|
4310 ^ self transcodeAll:'PH' of:key to:'F' startingAt:1 |
|
4311 |
|
4312 "Modified (format): / 02-08-2017 / 14:34:27 / cg" |
|
4313 ! |
|
4314 |
|
4315 rule11:key |
|
4316 |k c| |
|
4317 |
|
4318 "11. if not first character, eliminate all 'H' preceded or followed by a vowel " |
|
4319 k := key copy. |
|
4320 c := SortedCollection sortBlock:[:a :b | b < a ]. |
|
4321 2 to:key size do:[:i | |
|
4322 (key at:i) = $H ifTrue:[ |
|
4323 ((key at:i - 1) isVowel |
|
4324 or:[ (i < key size) and:[ (key at:i + 1) isVowel ] ]) ifTrue:[ c add:i ] |
|
4325 ] |
|
4326 ]. |
|
4327 c do:[:n | |
|
4328 k := (k copyFrom:1 to:n - 1) , (k copyFrom:n + 1 to:k size) |
|
4329 ]. |
|
4330 ^ k |
|
4331 ! |
|
4332 |
|
4333 rule12:key |
|
4334 |k| |
|
4335 |
|
4336 "12. change 'KN' to 'N', else 'K' to 'C' " |
|
4337 k := self transcodeAll:'KN' of:key to:'K' startingAt:1. |
|
4338 k := self transcodeAll:'K' of:k to:'C' startingAt:1. |
|
4339 ^ k |
|
4340 |
|
4341 "Modified (format): / 02-08-2017 / 14:34:48 / cg" |
|
4342 ! |
|
4343 |
|
4344 rule13:key |
|
4345 "13. if not first character, change 'M' to 'N' " |
|
4346 |
|
4347 ^ self transcodeAll:'M' of:key to:'N' startingAt:2 |
|
4348 |
|
4349 "Modified (format): / 02-08-2017 / 14:35:00 / cg" |
|
4350 ! |
|
4351 |
|
4352 rule14:key |
|
4353 "14. if not first character, change 'Q' to 'G' " |
|
4354 |
|
4355 ^ self transcodeAll:'Q' of:key to:'G' startingAt:2 |
|
4356 |
|
4357 "Modified (format): / 02-08-2017 / 14:35:08 / cg" |
|
4358 ! |
|
4359 |
|
4360 rule15:key |
|
4361 "15. transcode 'SH' to 'S' " |
|
4362 |
|
4363 ^ self transcodeAll:'SH' of:key to:'S' startingAt:1 |
|
4364 |
|
4365 "Modified (format): / 02-08-2017 / 14:35:18 / cg" |
|
4366 ! |
|
4367 |
|
4368 rule16:key |
|
4369 "16. transcode 'SCH' to 'S' " |
|
4370 |
|
4371 ^ self transcodeAll:'SCH' of:key to:'S' startingAt:1 |
|
4372 |
|
4373 "Modified (format): / 02-08-2017 / 14:35:25 / cg" |
|
4374 ! |
|
4375 |
|
4376 rule17:key |
|
4377 "17. transcode 'YW' to 'Y' " |
|
4378 |
|
4379 ^ self transcodeAll:'YW' of:key to:'Y' startingAt:1 |
|
4380 |
|
4381 "Modified (format): / 02-08-2017 / 14:35:33 / cg" |
|
4382 ! |
|
4383 |
|
4384 rule18:key |
|
4385 |k| |
|
4386 |
|
4387 "18. if not first or last character, change 'Y' to 'A' " |
|
4388 k := self transcodeAll:'Y' of:key to:'A' startingAt:2. |
|
4389 key last = $Y ifTrue:[ |
|
4390 k at:k size put:$Y |
|
4391 ]. |
|
4392 ^ k |
|
4393 |
|
4394 "Modified (format): / 02-08-2017 / 14:35:44 / cg" |
|
4395 ! |
|
4396 |
|
4397 rule19:key |
|
4398 "19. transcode 'WR' to 'R' " |
|
4399 |
|
4400 ^ self transcodeAll:'WR' of:key to:'R' startingAt:1 |
|
4401 |
|
4402 "Modified (format): / 02-08-2017 / 14:35:52 / cg" |
|
4403 ! |
|
4404 |
|
4405 rule1:key |
|
4406 |k| |
|
4407 |
|
4408 k := key copy. |
|
4409 "1. Remove all 'S' and 'Z' chars from the end of the name" |
|
4410 [ |
|
4411 'SZ' includes:k last |
|
4412 ] whileTrue:[ k := k copyFrom:1 to:(k size - 1) ]. |
|
4413 ^ k |
|
4414 ! |
|
4415 |
|
4416 rule20:key |
|
4417 "20. if not first character, change 'Z' to 'S' " |
|
4418 |
|
4419 ^ self transcodeAll:'Z' of:key to:'S' startingAt:2 |
|
4420 |
|
4421 "Modified (format): / 02-08-2017 / 14:36:00 / cg" |
|
4422 ! |
|
4423 |
|
4424 rule21:key |
|
4425 "21. transcode terminal 'AY' to 'Y' " |
|
4426 |
|
4427 ^ self transcodeAll:'AY' of:key to:'Y' startingAt:key size - 1 |
|
4428 |
|
4429 "Modified (format): / 02-08-2017 / 14:36:08 / cg" |
|
4430 ! |
|
4431 |
|
4432 rule22:key |
|
4433 |k| |
|
4434 |
|
4435 "22. remove trailing vowels " |
|
4436 k := key copy. |
|
4437 [ k last isVowel ] whileTrue:[ |
|
4438 k := k copyButLast |
|
4439 ]. |
|
4440 ^ k |
|
4441 |
|
4442 "Modified: / 02-08-2017 / 14:36:42 / cg" |
|
4443 ! |
|
4444 |
|
4445 rule23:key |
|
4446 |k c| |
|
4447 |
|
4448 "23. collapse all strings of repeated characters " |
|
4449 k := key copy. |
|
4450 c := SortedCollection sortBlock:[:a :b | b < a ]. |
|
4451 k size to:2 do:[:i | |
|
4452 (k at:i) = (k at:i - 1) ifTrue:[ |
|
4453 c add:i |
|
4454 ] |
|
4455 ]. |
|
4456 c do:[:n | |
|
4457 k := (k copyFrom:1 to:n - 1) , (k copyFrom:n + 1 to:k size) |
|
4458 ]. |
|
4459 ^ k |
|
4460 ! |
|
4461 |
|
4462 rule24:key originalKey:originalKey |
|
4463 |k| |
|
4464 |
|
4465 "24. if first char of original surname was a vowel, append it to the code" |
|
4466 k := key copy. |
|
4467 originalKey first isVowel ifTrue:[ |
|
4468 k := k , originalKey first asString asUppercase |
|
4469 ]. |
|
4470 ^ k |
|
4471 ! |
|
4472 |
|
4473 rule2:key |
|
4474 "2. Transcode initial strings: MAC => MC PF => F" |
|
4475 |
|
4476 |k| |
|
4477 |
|
4478 k := key copy. |
|
4479 (k startsWith:'MAC') ifTrue:[ |
|
4480 k := 'MC' , (k copyFrom:4) |
|
4481 ]. |
|
4482 (k startsWith:'PF') ifTrue:[ |
|
4483 k := 'F' , (k copyFrom:3) |
|
4484 ]. |
|
4485 ^ k |
|
4486 |
|
4487 "Modified (format): / 02-08-2017 / 14:31:40 / cg" |
|
4488 ! |
|
4489 |
|
4490 rule3:key |
|
4491 |k| |
|
4492 |
|
4493 "3. Transcode trailing strings as follows: |
|
4494 IX => IC |
|
4495 EX => EC |
|
4496 YE, EE, IE => Y |
|
4497 NT, ND => D" |
|
4498 |
|
4499 k := key copy. |
|
4500 k := self transcodeTrailing:#( 'IX' ) of:k to:'IC'. |
|
4501 k := self transcodeTrailing:#( 'EX' ) of:k to:'EC'. |
|
4502 k := self transcodeTrailing:#( 'YE' 'EE' 'IE' ) of:k to:'Y'. |
|
4503 k := self transcodeTrailing:#( 'NT' 'ND' ) of:k to:'D'. |
|
4504 ^ k |
|
4505 |
|
4506 "Modified (format): / 02-08-2017 / 14:32:24 / cg" |
|
4507 ! |
|
4508 |
|
4509 rule4:key |
|
4510 "4. Transcode 'EV' to 'EF' if not at start of name" |
|
4511 |
|
4512 ^ self transcodeAll:'EV' of:key to:'EF' startingAt:2 |
|
4513 |
|
4514 "Modified (format): / 02-08-2017 / 14:32:35 / cg" |
|
4515 ! |
|
4516 |
|
4517 rule5:key |
|
4518 "5. Use first character of name as first character of key. |
|
4519 Ignored because we're doing an in-place conversion" |
|
4520 |
|
4521 ^ key |
|
4522 |
|
4523 "Modified (comment): / 02-08-2017 / 14:32:45 / cg" |
|
4524 ! |
|
4525 |
|
4526 rule6:key |
|
4527 |k i| |
|
4528 |
|
4529 "6. Remove any 'W' that follows a vowel" |
|
4530 k := key copy. |
|
4531 i := 2. |
|
4532 [ |
|
4533 (i := k indexOf:$W startingAt:i) > 0 |
|
4534 ] whileTrue:[ |
|
4535 (k at:i - 1) isVowel ifTrue:[ |
|
4536 k := (k copyFrom:1 to:i - 1) , (k copyFrom:i + 1 to:k size). |
|
4537 i := i - 1 |
|
4538 ] |
|
4539 ]. |
|
4540 ^ k |
|
4541 ! |
|
4542 |
|
4543 rule7:key |
|
4544 "7. replace all vowels with 'A' " |
|
4545 ^ key collect:[:ch | ch isVowel ifTrue:[$A] ifFalse:[ch]]. |
|
4546 |
|
4547 "Modified: / 02-08-2017 / 14:33:56 / cg" |
|
4548 ! |
|
4549 |
|
4550 rule8:key |
|
4551 "8. transcode 'GHT' to 'GT' " |
|
4552 |
|
4553 ^ self transcodeAll:'GHT' of:key to:'GT' startingAt:1 |
|
4554 |
|
4555 "Modified (format): / 02-08-2017 / 14:34:05 / cg" |
|
4556 ! |
|
4557 |
|
4558 rule9:key |
|
4559 "9. transcode 'DG' to 'G' " |
|
4560 |
|
4561 ^ self transcodeAll:'DG' of:key to:'G' startingAt:1 |
|
4562 |
|
4563 "Modified (format): / 02-08-2017 / 14:34:15 / cg" |
|
4564 ! |
|
4565 |
|
4566 transcodeAll:aString of:key to:replacementString startingAt:start |
|
4567 |k i| |
|
4568 |
|
4569 k := key copy. |
|
4570 [ |
|
4571 (i := k indexOfSubCollection:aString startingAt:start) > 0 |
|
4572 ] whileTrue:[ |
|
4573 k := (k copyFrom:1 to:i - 1) , replacementString |
|
4574 , (k copyFrom:i + aString size to:k size) |
|
4575 ]. |
|
4576 ^ k |
|
4577 ! |
|
4578 |
|
4579 transcodeTrailing:anArrayOfStrings of:key to:replacementString |
|
4580 |answer| |
|
4581 |
|
4582 answer := key copy. |
|
4583 anArrayOfStrings do:[:aString | |
|
4584 answer := self |
|
4585 transcodeAll:aString |
|
4586 of:answer |
|
4587 to:replacementString |
|
4588 startingAt:(answer size - aString size) + 1 |
|
4589 ]. |
|
4590 ^ answer |
|
4591 ! ! |
|
4592 |
|
4593 !PhoneticStringUtilities::PhonemStringComparator class methodsFor:'documentation'! |
|
4594 |
|
4595 documentation |
|
4596 " |
|
4597 Implementation of the PHONEM algorithm, as described in |
|
4598 'Georg Wilde and Carsten Meyer, Doppelgaenger gesucht - |
|
4599 Ein Programm fuer kontextsensitive phonetische Textumwandlung |
|
4600 ct Magazin fuer Computer & Technik 25/1998' |
|
4601 |
|
4602 This algorithm deals better with the german language (it cares for umlauts) |
|
4603 " |
|
4604 ! ! |
|
4605 |
|
4606 !PhoneticStringUtilities::PhonemStringComparator methodsFor:'api'! |
|
4607 |
|
4608 encode:aString |
|
4609 |s idx t t2| |
|
4610 |
|
4611 s := aString asUppercase. |
|
4612 |
|
4613 idx := 1. |
|
4614 [idx < (s size-1)] whileTrue:[ |
|
4615 t2 := nil. |
|
4616 t := s copyFrom:idx to:idx+1. |
|
4617 t = 'SC' ifTrue:[ t2 := 'C' ] |
|
4618 ifFalse:[ t = 'SZ' ifTrue:[ t2 := 'C' ] |
|
4619 ifFalse:[ t = 'CZ' ifTrue:[ t2 := 'C' ] |
|
4620 ifFalse:[ t = 'TZ' ifTrue:[ t2 := 'C' ] |
|
4621 ifFalse:[ t = 'TS' ifTrue:[ t2 := 'C' ] |
|
4622 ifFalse:[ t = 'KS' ifTrue:[ t2 := 'X' ] |
|
4623 ifFalse:[ t = 'PF' ifTrue:[ t2 := 'V' ] |
|
4624 ifFalse:[ t = 'QU' ifTrue:[ t2 := 'KW' ] |
|
4625 ifFalse:[ t = 'PH' ifTrue:[ t2 := 'V' ] |
|
4626 ifFalse:[ t = 'UE' ifTrue:[ t2 := 'Y' ] |
|
4627 ifFalse:[ t = 'AE' ifTrue:[ t2 := 'E' ] |
|
4628 ifFalse:[ t = 'OE' ifTrue:[ t2 := 'Ö' ] |
|
4629 ifFalse:[ t = 'EI' ifTrue:[ t2 := 'AY' ] |
|
4630 ifFalse:[ t = 'EY' ifTrue:[ t2 := 'AY' ] |
|
4631 ifFalse:[ t = 'EU' ifTrue:[ t2 := 'OY' ] |
|
4632 ifFalse:[ t = 'AU' ifTrue:[ t2 := 'A§' ] |
|
4633 ifFalse:[ t = 'OU' ifTrue:[ t2 := '§ ' ]]]]]]]]]]]]]]]]]. |
|
4634 t2 notNil ifTrue:[ |
|
4635 s := (s copyTo:idx-1),t2,(s copyFrom:idx+2) |
|
4636 ] ifFalse:[ |
|
4637 idx := idx + 1. |
|
4638 ]. |
|
4639 ]. |
|
4640 |
|
4641 "/ single character substitutions via tr |
|
4642 s := s copyTransliterating:'ÖÄZKGQÜIJFWPT§' to:'YECCCCYYYVVDDUA'. |
|
4643 s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'' complement:true squashDuplicates:false. |
|
4644 s := s copyTransliterating:'ABCDLMNORSUVWXY' to:'ABCDLMNORSUVWXY' complement:false squashDuplicates:true. |
|
4645 ^ s |
|
4646 |
|
4647 " |
|
4648 self basicNew encode:'müller' -> 'MYLR' |
|
4649 self basicNew encode:'mueller' -> 'MYLR' |
|
4650 self basicNew encode:'möller' -> 'MYLR' |
|
4651 self basicNew encode:'miller' -> 'MYLR' |
|
4652 self basicNew encode:'muller' -> 'MULR' |
|
4653 self basicNew encode:'muler' -> 'MULR' |
|
4654 |
|
4655 self basicNew phoneticStringsFor:'müller' #('MYLR') |
|
4656 self basicNew phoneticStringsFor:'mueller' #('MYLR') |
|
4657 self basicNew phoneticStringsFor:'möller' #('MYLR') |
|
4658 self basicNew phoneticStringsFor:'miller' #('MYLR') |
|
4659 self basicNew phoneticStringsFor:'muller' #('MULR') |
|
4660 self basicNew phoneticStringsFor:'muler' #('MULR') |
|
4661 |
|
4662 self basicNew phoneticStringsFor:'schmidt' #('CMYD') |
|
4663 self basicNew phoneticStringsFor:'schneider' #('CNAYDR') |
|
4664 self basicNew phoneticStringsFor:'fischer' #('VYCR') |
|
4665 self basicNew phoneticStringsFor:'weber' #('VBR') |
|
4666 self basicNew phoneticStringsFor:'weeber' #('VBR') |
|
4667 self basicNew phoneticStringsFor:'webber' #('VBR') |
|
4668 self basicNew phoneticStringsFor:'wepper' #('VBR') |
|
4669 |
|
4670 self basicNew phoneticStringsFor:'meyer' #('MAYR') |
|
4671 self basicNew phoneticStringsFor:'maier' #('MAYR') |
|
4672 self basicNew phoneticStringsFor:'mayer' #('MAYR') |
|
4673 self basicNew phoneticStringsFor:'mayr' #('MAYR') |
|
4674 self basicNew phoneticStringsFor:'meir' #('MAYR') |
|
4675 |
|
4676 self basicNew phoneticStringsFor:'wagner' #('VACNR') |
|
4677 self basicNew phoneticStringsFor:'schulz' #('CULC') |
|
4678 self basicNew phoneticStringsFor:'becker' #('BCR') |
|
4679 self basicNew phoneticStringsFor:'hoffmann' #('OVMAN') |
|
4680 self basicNew phoneticStringsFor:'haus' #('AUS') |
|
4681 |
|
4682 self basicNew phoneticStringsFor:'schäfer' #('CVR') |
|
4683 self basicNew phoneticStringsFor:'scheffer' #('CVR') |
|
4684 self basicNew phoneticStringsFor:'schaeffer' #('CVR') |
|
4685 self basicNew phoneticStringsFor:'schaefer' #('CVR') |
|
4686 " |
|
4687 |
|
4688 "Created: / 28-07-2017 / 15:38:08 / cg" |
|
4689 ! ! |
|
4690 |
|
4691 !PhoneticStringUtilities::Caverphone2StringComparator class methodsFor:'documentation'! |
|
4692 |
|
4693 documentation |
|
4694 " |
|
4695 Caverphone (2) Algorithm: |
|
4696 |
|
4697 see http://caversham.otago.ac.nz/files/working/ctp150804.pdf |
|
4698 |
|
4699 Caverphone 2.0 is being made available for free use for the benefit of anyone who has a use for it, |
|
4700 with the proviso that the Caversham Project at the University of Otago should be acknowledged as the |
|
4701 original source (which is hereby done ;-). |
|
4702 |
|
4703 • Start with a Surname or Firstname |
|
4704 • Convert to lowercase |
|
4705 This coding system is case sensitive, implementations should acknowledge that a is not the same as A |
|
4706 • Remove anything not A-Z |
|
4707 The main intention of this is to remove spaces, hyphens, and apostrophes. |
|
4708 example: o'brian becomes obrian |
|
4709 • If the name starts with cough make it cou2f |
|
4710 2 is being used as a temporary placeholder to indicate a consonant which we are no longer interested in. |
|
4711 • If the name starts with rough make it rou2f |
|
4712 • If the name starts with tough make it tou2f |
|
4713 • If the name starts with enough make it enou2f |
|
4714 • If the name starts with gn make it 2n |
|
4715 • If the name ends with mb make it m2 |
|
4716 • replace cq with 2q |
|
4717 • replace ci with si |
|
4718 • replace ce with se |
|
4719 • replace cy with sy |
|
4720 • replace tch with 2ch |
|
4721 • replace c with k |
|
4722 • replace q with k |
|
4723 • replace x with k |
|
4724 • replace v with f |
|
4725 • replace dg with 2g |
|
4726 • replace tio with sio |
|
4727 • replace tia with sia |
|
4728 • replace d with t |
|
4729 • replace ph with fh |
|
4730 • replace b with p |
|
4731 • replace sh with s2 |
|
4732 • replace z with s |
|
4733 • replace and initial vowel with an A |
|
4734 • replace all other vowels with a 3 |
|
4735 3 is a temporary placeholder marking a vowel |
|
4736 • replace 3gh3 with 3kh3 |
|
4737 Exceptions are dealt with before the general case. gh between vowels is an except of the more general gh rule. |
|
4738 • replace gh with 22 |
|
4739 • replace g with k |
|
4740 • replace groups of the letter s with a S |
|
4741 Continuous strings of s are replace by a single S |
|
4742 • replace groups of the letter t with a T |
|
4743 • replace groups of the letter p with a P |
|
4744 • replace groups of the letter k with a K |
|
4745 • replace groups of the letter f with a F |
|
4746 • replace groups of the letter m with a M |
|
4747 • replace groups of the letter n with a N |
|
4748 • replace w3 with W3 |
|
4749 • replace wy with Wy |
|
4750 • replace wh3 with Wh3 |
|
4751 • replace why with Why |
|
4752 • replace w with 2 |
|
4753 • replace and initial h with an A |
|
4754 • replace all other occurrences of h with a 2 |
|
4755 • replace r3 with R3 |
|
4756 • replace ry with Ry |
|
4757 • replace r with 2 |
|
4758 • replace l3 with L3 |
|
4759 • replace ly with Ly |
|
4760 • replace l with 2 |
|
4761 • replace j with y |
|
4762 • replace y3 with Y3 |
|
4763 • replace y with 2 |
|
4764 • remove all 2s |
|
4765 • remove all 3s |
|
4766 • put six (v1) / ten (v2) 1s on the end |
|
4767 • take the first six characters as the code (caverphone 1); |
|
4768 / take the first ten characters as the code (caverphone 2); |
|
4769 |
|
4770 self new encode:'david' -> 'TFT1111111' |
|
4771 self new encode:'whittle' -> 'WTA1111111' |
|
4772 |
|
4773 self new encode:'Stevenson' -> 'STFNSN1111' |
|
4774 self new encode:'Peter' -> 'PTA1111111' |
|
4775 |
|
4776 self new encode:'washington' -> 'WSNKTN1111' |
|
4777 self new encode:'lee' -> 'LA11111111' |
|
4778 self new encode:'Gutierrez' -> 'KTRS111111' |
|
4779 self new encode:'Pfister' -> 'PFSTA11111' |
|
4780 self new encode:'Jackson' -> 'YKSN111111' |
|
4781 self new encode:'Tymczak' -> 'TMKSK11111' |
|
4782 |
|
4783 self new encode:'add' -> 'AT11111111' |
|
4784 self new encode:'aid' -> 'AT11111111' |
|
4785 self new encode:'at' -> 'AT11111111' |
|
4786 self new encode:'art' -> 'AT11111111' |
|
4787 self new encode:'earth' -> 'AT11111111' |
|
4788 self new encode:'head' -> 'AT11111111' |
|
4789 self new encode:'old' -> 'AT11111111' |
|
4790 |
|
4791 self new encode:'ready' -> 'RTA1111111' |
|
4792 self new encode:'rather' -> 'RTA1111111' |
|
4793 self new encode:'able' -> 'APA1111111' |
|
4794 self new encode:'appear' -> 'APA1111111' |
|
4795 |
|
4796 self new encode:'Deedee' -> 'TTA1111111' |
|
4797 " |
|
4798 ! ! |
|
4799 |
|
4800 !PhoneticStringUtilities::Caverphone2StringComparator methodsFor:'api'! |
|
4801 |
|
4802 encode:word |
|
4803 |txt| |
|
4804 |
|
4805 word size == 0 ifTrue:[^ '1111111111' ]. |
|
4806 |
|
4807 "/ 1. Convert to lowercase |
|
4808 txt := word asLowercase. |
|
4809 |
|
4810 "/ 2. Remove anything not A-Z |
|
4811 txt := txt select:#isLetter. |
|
4812 |
|
4813 #( |
|
4814 "/ oldSeq newSeq repeat |
|
4815 |
|
4816 "/ 2.5. Remove final e |
|
4817 'e$' '' false |
|
4818 "/ 3. Handle various start options |
|
4819 '^cough' 'cou2f' false |
|
4820 '^rough' 'rou2f' false |
|
4821 '^tough' 'tou2f' false |
|
4822 '^enough' 'enou2f' false |
|
4823 '^trough' 'trou2f' false |
|
4824 |
|
4825 '^gn' '2n' false |
|
4826 'mb$' 'm2' false |
|
4827 |
|
4828 "/ 4. Handle replacements |
|
4829 'cq' '2q' true |
|
4830 'ci' 'si' true |
|
4831 'ce' 'se' true |
|
4832 'cy' 'sy' true |
|
4833 'tch' '2ch' true |
|
4834 'c' 'k' true |
|
4835 'q' 'k' true |
|
4836 'x' 'k' true |
|
4837 'v' 'f' true |
|
4838 'dg' '2g' true |
|
4839 'tio' 'sio' true |
|
4840 'tia' 'sia' true |
|
4841 'd' 't' true |
|
4842 'ph' 'fh' true |
|
4843 'b' 'p' true |
|
4844 'sh' 's2' true |
|
4845 'z' 's' true |
|
4846 |
|
4847 '^a' 'A' false |
|
4848 '^e' 'A' false |
|
4849 '^i' 'A' false |
|
4850 '^o' 'A' false |
|
4851 '^u' 'A' false |
|
4852 |
|
4853 'a' '3' true |
|
4854 'e' '3' true |
|
4855 'i' '3' true |
|
4856 'o' '3' true |
|
4857 'u' '3' true |
|
4858 'j' 'y' true |
|
4859 |
|
4860 '^y3' 'Y3' false |
|
4861 '^y' 'A' false |
|
4862 |
|
4863 'y' '3' true |
|
4864 '3gh3' '3kh3' true |
|
4865 'gh' '22' true |
|
4866 'g' 'k' true |
|
4867 's' 'S' true |
|
4868 'SS' 'S' true |
|
4869 't' 'T' true |
|
4870 'TT' 'T' true |
|
4871 'p' 'P' true |
|
4872 'PP' 'P' true |
|
4873 'k' 'K' true |
|
4874 'KK' 'K' true |
|
4875 'f' 'F' true |
|
4876 'FF' 'F' true |
|
4877 'm' 'M' true |
|
4878 'MM' 'M' true |
|
4879 'n' 'N' true |
|
4880 'NN' 'N' true |
|
4881 'w3' 'W3' true |
|
4882 'wh3' 'Wh3' true |
|
4883 'w$' '3' false |
|
4884 'w' '2' true |
|
4885 '^h' 'A' false |
|
4886 'h' '2' true |
|
4887 'r3' 'R3' true |
|
4888 'r$' '3' false |
|
4889 'r' '2' true |
|
4890 'l3' 'L3' true |
|
4891 'l$' '3' false |
|
4892 'l' '2' true |
|
4893 |
|
4894 "/ 5. removals |
|
4895 |
|
4896 '2' '' true |
|
4897 '3$' 'A' true |
|
4898 '3' '' true |
|
4899 ) inGroupsOf:3 do:[:pat :repl :repeat| |
|
4900 |s txtBefore| |
|
4901 |
|
4902 txtBefore := txt. |
|
4903 (pat startsWith:$^) ifTrue:[ |
|
4904 s := pat copyButFirst. |
|
4905 repeat ifTrue:[ |
|
4906 [txt startsWith:s] whileTrue:[ txt := repl,(txt copyButFirst:s size) ] |
|
4907 ] ifFalse:[ |
|
4908 (txt startsWith:s) ifTrue:[ txt := repl,(txt copyButFirst:s size) ] |
|
4909 ]. |
|
4910 ] ifFalse:[ |
|
4911 (pat endsWith:$$) ifTrue:[ |
|
4912 s := pat copyButLast. |
|
4913 repeat ifTrue:[ |
|
4914 [txt endsWith:s] whileTrue:[ txt := (txt copyButLast:s size),repl ] |
|
4915 ] ifFalse:[ |
|
4916 (txt endsWith:s) ifTrue:[ txt := (txt copyButLast:s size),repl ] |
|
4917 ] |
|
4918 ] ifFalse:[ |
|
4919 repeat ifTrue:[ |
|
4920 txt := txt copyReplaceAllSubcollections:pat with:repl |
|
4921 ] ifFalse:[ |
|
4922 txt := txt copyReplaceSubcollection:pat with:repl |
|
4923 ] |
|
4924 ] |
|
4925 ]. |
|
4926 "/ txt ~= txtBefore ifTrue:[ |
|
4927 "/ Transcript showCR:(pat,' | ',repl,' -> ',txt). |
|
4928 "/ ]. |
|
4929 ]. |
|
4930 |
|
4931 "/ 6. put ten 1s on the end |
|
4932 txt := txt,'1111111111'. |
|
4933 |
|
4934 "/ 7. take the first ten characters as the code |
|
4935 ^ txt copyTo:10 |
|
4936 |
|
4937 " |
|
4938 self new encode:'david' -> 'TFT1111111' |
|
4939 self new encode:'whittle' -> 'WTA1111111' |
|
4940 |
|
4941 self new encode:'Stevenson' -> 'STFNSN1111' |
|
4942 self new encode:'Peter' -> 'PTA1111111' |
|
4943 |
|
4944 self new encode:'washington' -> 'WSNKTN1111' |
|
4945 self new encode:'lee' -> 'LA11111111' |
|
4946 self new encode:'Gutierrez' -> 'KTRS111111' |
|
4947 self new encode:'Pfister' -> 'PFSTA11111' |
|
4948 self new encode:'Jackson' -> 'YKSN111111' |
|
4949 self new encode:'Tymczak' -> 'TMKSK11111' |
|
4950 |
|
4951 self new encode:'add' -> 'AT11111111' |
|
4952 self new encode:'aid' -> 'AT11111111' |
|
4953 self new encode:'at' -> 'AT11111111' |
|
4954 self new encode:'art' -> 'AT11111111' |
|
4955 self new encode:'earth' -> 'AT11111111' |
|
4956 self new encode:'head' -> 'AT11111111' |
|
4957 self new encode:'old' -> 'AT11111111' |
|
4958 |
|
4959 self new encode:'ready' -> 'RTA1111111' |
|
4960 self new encode:'rather' -> 'RTA1111111' |
|
4961 self new encode:'able' -> 'APA1111111' |
|
4962 self new encode:'appear' -> 'APA1111111' |
|
4963 |
|
4964 self new encode:'Deedee' -> 'TTA1111111' |
|
4965 " |
|
4966 |
|
4967 "Created: / 28-07-2017 / 15:21:23 / cg" |
|
4968 "Modified: / 02-08-2017 / 01:42:35 / cg" |
|
4969 ! ! |
|
4970 |
3143 !PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator class methodsFor:'documentation'! |
4971 !PhoneticStringUtilities::KoelnerPhoneticCodeStringComparator class methodsFor:'documentation'! |
3144 |
4972 |
3145 documentation |
4973 documentation |
3146 " |
4974 " |
3147 The 'Kölner Phonetik' (cologne phonetic) code is for the german language |
4975 The 'Kölner Phonetik' (cologne phonetic) code is for the german language |