#FEATURE by cg
class: CharacterArray
added: #asDenormalizedUnicodeString
changed: #setupNormalizationMaps
--- a/CharacterArray.st Mon Jun 20 12:48:20 2016 +0200
+++ b/CharacterArray.st Mon Jun 20 12:56:08 2016 +0200
@@ -557,12 +557,16 @@
|def|
UnicodeNormalizationMap := Dictionary new.
-
- def := [:ch1 :chars2 :mappedChars |
+ UnicodeDenormalizationMap := Dictionary new.
+
+ def := [:combiner :chars :mappedChars |
|d|
- d := UnicodeNormalizationMap at:ch1 ifAbsentPut:[Dictionary new].
- chars2 with:mappedChars do:[:ch2 :mappedChar | d at:ch2 put:mappedChar ].
+ d := UnicodeNormalizationMap at:combiner ifAbsentPut:[Dictionary new].
+ chars with:mappedChars do:[:ch1 :mappedChar |
+ d at:ch1 put:mappedChar.
+ UnicodeDenormalizationMap at:mappedChar put:(Unicode16String with:ch1 with:combiner).
+ ].
].
def value:(Character codePoint:16r0300) "grave" value:'AEIOUaeiou' value:'ÀÈÌÒÙàèìòù'.
def value:(Character codePoint:16r0301) "degu" value:'AEIOUYaeiouy' value:'ÁÉÍÓÚÝáéíóúý'.
@@ -4596,6 +4600,37 @@
!CharacterArray methodsFor:'encoding & decoding'!
+asDenormalizedUnicodeString
+ "return a denormalized (decomposed) Unicode version of the receiver.
+ Each character found in the class's unicodeDenormalizationMap (umlauts, accented chars etc.)
+ is replaced by its mapped sequence (i.e. a plain character followed by a combining
+ diacritical in the 0x03xx range). A receiver without non-7bit-ASCII chars is returned as is."
+
+ |map outStream|
+
+ map := self class unicodeDenormalizationMap.
+
+ self containsNon7BitAscii ifFalse:[^ self]. "/ I cannot contain any diacritical chars
+
+ outStream := WriteStream on:(Unicode16String new:self size).
+ self do:[:char |
+ |mappedChars|
+
+ (mappedChars := map at:char ifAbsent:nil) notNil ifTrue:[
+ outStream nextPutAll:mappedChars.
+ ] ifFalse:[
+ outStream nextPut:char.
+ ].
+ ].
+ ^ outStream contents asSingleByteStringIfPossible.
+
+ "
+ 'Ö' asDenormalizedUnicodeString
+ 'aÖÄx' asDenormalizedUnicodeString
+ 'abc' asDenormalizedUnicodeString
+ "
+!
+
asNormalizedUnicodeString
"return a new string containing the same characters, as a normalized Unicode string.
This replaces combination characters by corresponding single characters.