#FEATURE by cg
author Claus Gittinger <cg@exept.de>
Mon, 20 Jun 2016 12:56:08 +0200
changeset 20013 d2c0fcf7ac29
parent 20012 06acba6d608d
child 20014 ae568b91d01b
#FEATURE by cg class: CharacterArray added: #asDenormalizedUnicodeString changed: #setupNormalizationMaps
CharacterArray.st
--- a/CharacterArray.st	Mon Jun 20 12:48:20 2016 +0200
+++ b/CharacterArray.st	Mon Jun 20 12:56:08 2016 +0200
@@ -557,12 +557,16 @@
     |def|
 
     UnicodeNormalizationMap := Dictionary new.
-
-    def := [:ch1 :chars2 :mappedChars |
+    UnicodeDenormalizationMap := Dictionary new.
+
+    def := [:combiner :chars :mappedChars |
                |d|
 
-               d := UnicodeNormalizationMap at:ch1 ifAbsentPut:[Dictionary new].
-               chars2 with:mappedChars do:[:ch2 :mappedChar | d at:ch2 put:mappedChar ].
+               d := UnicodeNormalizationMap at:combiner ifAbsentPut:[Dictionary new].
+               chars with:mappedChars do:[:ch1 :mappedChar | 
+                    d at:ch1 put:mappedChar.
+                    UnicodeDenormalizationMap at:mappedChar put:(Unicode16String with:ch1 with:combiner).
+               ].
            ].        
     def value:(Character codePoint:16r0300) "grave"         value:'AEIOUaeiou' value:'ÀÈÌÒÙàèìòù'.
     def value:(Character codePoint:16r0301) "degu"          value:'AEIOUYaeiouy' value:'ÁÉÍÓÚÝáéíóúý'.
@@ -4596,6 +4600,37 @@
 
 !CharacterArray methodsFor:'encoding & decoding'!
 
+asDenormalizedUnicodeString
+    "return a string containing the same characters, as a denormalized Unicode string.
+     Each precomposed diacritical character (umlaut, accented char etc.) is replaced by
+     its plain base character followed by a combining diacritical (0x03xx range).
+     Pure 7-bit ASCII receivers are returned unchanged (the receiver itself, not a copy)."
+
+    |map outStream|
+
+    "/ a pure 7-bit ASCII string cannot contain any diacritical characters;
+    "/ answer the receiver without even fetching the map
+    self containsNon7BitAscii ifFalse:[^ self].
+
+    map := self class unicodeDenormalizationMap.
+    outStream := WriteStream on:(Unicode16String new:self size).
+    self do:[:char |
+        |mappedChars|
+        (mappedChars := map at:char ifAbsent:nil) notNil ifTrue:[
+            outStream nextPutAll:mappedChars.
+        ] ifFalse:[
+            outStream nextPut:char.
+        ].
+    ].
+    ^ outStream contents asSingleByteStringIfPossible.
+
+    "
+     'Ö' asDenormalizedUnicodeString
+     'aÖÄx' asDenormalizedUnicodeString
+     'abc' asDenormalizedUnicodeString
+    "
+!
+
 asNormalizedUnicodeString
     "return a new string containing the same characters, as a normalized Unicode string.
      This replaces combination characters by corresponding single characters.