class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
author Claus Gittinger <cg@exept.de>
Fri, 27 Feb 2015 18:07:14 +0100
changeset 17564 67ae75f28757
parent 17563 4affe2d5112a
child 17565 29224f55218c
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC; class definition; comment/format in: #decompositionOf:into:, #documentation, #encodeString:; changed: #compositionOf:with:to:, #decodeString:, #initializeDecomposeMap; more characters (cedille, breve); better (easier) setup
CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st
--- a/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st	Thu Feb 26 00:10:14 2015 +0100
+++ b/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st	Fri Feb 27 18:07:14 2015 +0100
@@ -17,7 +17,7 @@
 
 ISO10646_to_UTF8 subclass:#ISO10646_to_UTF8_MAC
 	instanceVariableNames:''
-	classVariableNames:'AccentMap DecomposeMap'
+	classVariableNames:'AccentMap DecomposeMap ComposeMap'
 	poolDictionaries:''
 	category:'Collections-Text-Encodings'
 !
@@ -45,7 +45,7 @@
         - or as so called 'Normalization Form canonical Decomposition', i.e. as a regular 'a' followed by a
           combining diacritical mark (for example: acute).
 
-    MAC OSX needs the second form for its filenames.
+    MAC OSX needs the second form for its file names.
     However, OSX does not decompose the ranges U+2000-U+2FFF, U+F900-U+FAFF and U+2F800-U+2FAFF.
 
     This is a q&d hack, to at least support the first page (latin1) characters.
@@ -57,6 +57,7 @@
     [instance variables:]
 
     [class variables:]
+        ComposeMap DecomposeMap
 
     [see also:]
         http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html
@@ -67,66 +68,35 @@
 !ISO10646_to_UTF8_MAC class methodsFor:'initialization'!
 
 initializeDecomposeMap
-    DecomposeMap := Dictionary new.
+    "the map which decomposes a diacritical character into its two components"
 
-    DecomposeMap at:"À"  16rC0 put:#( 16r41 16r0300).
-    DecomposeMap at:"à" 16rE0 put:#( 16r61 16r0300).
-    DecomposeMap at:"Á"  16rC1 put:#( 16r41 16r0301).
-    DecomposeMap at:"á" 16rE1 put:#( 16r61 16r0301).
-    DecomposeMap at:"Â"  16rC2 put:#( 16r41 16r0302).
-    DecomposeMap at:"â" 16rE2 put:#( 16r61 16r0302).
-    DecomposeMap at:"Ã"  16rC3 put:#( 16r41 16r0303).
-    DecomposeMap at:"ã" 16rE3 put:#( 16r61 16r0303).
-    DecomposeMap at:"Ä"  16rC4 put:#( 16r41 16r0308).
-    DecomposeMap at:"ä" 16rE4 put:#( 16r61 16r0308).
-    DecomposeMap at:"Å"  16rC5 put:#( 16r41 16r030A).
-    DecomposeMap at:"å" 16rE5 put:#( 16r61 16r030A).
-
-    DecomposeMap at:"È"  16rC8 put:#( 16r45 16r0300).
-    DecomposeMap at:"è" 16rE8 put:#( 16r65 16r0300).
-    DecomposeMap at:"É"  16rC9 put:#( 16r45 16r0301).
-    DecomposeMap at:"é" 16rE9 put:#( 16r65 16r0301).
-    DecomposeMap at:"Ê"  16rCA put:#( 16r45 16r0302).
-    DecomposeMap at:"ê" 16rEA put:#( 16r65 16r0302).
-    DecomposeMap at:"Ë"  16rCB put:#( 16r45 16r0308).
-    DecomposeMap at:"ë" 16rEB put:#( 16r65 16r0308).
+    DecomposeMap := Dictionary new.
+    ComposeMap := Dictionary new.
 
-    DecomposeMap at:"Ì"  16rCC put:#( 16r49 16r0300).
-    DecomposeMap at:"ì" 16rEC put:#( 16r69 16r0300).
-    DecomposeMap at:"í"  16rCD put:#( 16r49 16r0301).
-    DecomposeMap at:"í" 16rED put:#( 16r69 16r0301).
-    DecomposeMap at:"Î"  16rCE put:#( 16r49 16r0302).
-    DecomposeMap at:"î" 16rEE put:#( 16r69 16r0302).
-    DecomposeMap at:"Ï"  16rCF put:#( 16r49 16r0308).
-    DecomposeMap at:"ï" 16rEF put:#( 16r69 16r0308).
-
-    DecomposeMap at:"Ñ"  16rD1 put:#( 16r4E 16r0303).
-    DecomposeMap at:"ñ" 16rF1 put:#( 16r6E 16r0303).
+    #(
+        (16r0300 "grave" 'AÀaàEÈeèIÌiìoòOÒUÙuù')
+        (16r0301 "acute"   'AÁaáEÉeéIÍiíOÓoóUÚuúyýYÝCĆcćNŃnńRŔrŕSŚsśZŹzź')
+        (16r0302 "circumflex" 'AÂaâEÊeêIÎiîOÔoôUÛuûCĈcĉGĜgĝHĤhĥJĴjĵSŜsŝWŴwŵYŶyŷ')
+        (16r0303  "tilde" 'AÃaãNÑnñOÕoõUŨuũ')
+        (16r0308  "umlaut" 'AÄaäOÖoöUÜuüIÏiïyÿYŸ')
+        (16r030A  "ring" 'AÅaåUŮuů')
+        (16r030C  "caron (hacek)" 'CČcčDĎEĚeěNŇnňRŘrřSŠsšZŽzž')
+        (16r0327  "cedille" 'CÇcçSŞsşTŢtţ')
+    ) do:[:eachPair |
+        |composeCode mapping|
 
-    DecomposeMap at:"Ò"  16rD2 put:#( 16r4F 16r0300).
-    DecomposeMap at:"ò" 16rF2 put:#( 16r6F 16r0300).
-    DecomposeMap at:"Ó"  16rD3 put:#( 16r4F 16r0301).
-    DecomposeMap at:"ó" 16rF3 put:#( 16r6F 16r0301).
-    DecomposeMap at:"Ô"  16rD4 put:#( 16r4F 16r0302).
-    DecomposeMap at:"ô" 16rF4 put:#( 16r6F 16r0302).
-    DecomposeMap at:"Õ"  16rD5 put:#( 16r4F 16r0303).
-    DecomposeMap at:"õ" 16rF5 put:#( 16r6F 16r0303).
-    DecomposeMap at:"Ö"  16rD6 put:#( 16r4F 16r0308).
-    DecomposeMap at:"ö" 16rF6 put:#( 16r6F 16r0308).
+        composeCode := eachPair first.
+        mapping := eachPair second.
+        mapping pairWiseDo:[:baseChar :composedChar |
+            "/ setup, so that we find
+            "/    DecomposeMap at:"$à codePoint" 16rE0 put:#( "$a codePoint" 16r61 "grave codePoint" 16r0300).
+            DecomposeMap 
+                at:composedChar codePoint 
+                put:(Array with:baseChar codePoint with:composeCode)
+        ].
 
-    DecomposeMap at:"Ù"  16rD9 put:#( 16r55 16r0300).
-    DecomposeMap at:"ù" 16rF9 put:#( 16r75 16r0300).
-    DecomposeMap at:"Ú"  16rDA put:#( 16r55 16r0301).
-    DecomposeMap at:"ú" 16rFA put:#( 16r75 16r0301).
-    DecomposeMap at:"Û"  16rDB put:#( 16r55 16r0302).
-    DecomposeMap at:"û" 16rDB put:#( 16r75 16r0302).
-    DecomposeMap at:"Ü"  16rDC put:#( 16r55 16r0308).
-    DecomposeMap at:"ü" 16rFC put:#( 16r75 16r0308).
-
-    DecomposeMap at:"Ý"  16rDD put:#( 16r59 16r0301).
-    DecomposeMap at:"ý" 16rFD put:#( 16r79 16r0301).
-
-    DecomposeMap at:"ÿ"  16rFF put:#( 16r79 16r0308).
+        ComposeMap at:composeCode put:mapping.
+    ].
 ! !
 
 !ISO10646_to_UTF8_MAC methodsFor:'encoding & decoding'!
@@ -138,26 +108,10 @@
     |cp map i|
 
     cp := diacriticalChar codePoint.
-    cp == 16r0300  ifTrue:[
-        "/ accent grave
-        map := 'AÀaàEÈeèIÌiìoòOÒUÙuù'.
-    ] ifFalse:[ cp == 16r0301  ifTrue:[
-        "/ accent
-        map := 'AÁaáEÉeéIÍiíOÓoóUÚuúyýYÝ'.
-    ] ifFalse:[ cp == 16r0302  ifTrue:[
-        "/ circonflex
-        map := 'AÂaâEÊeêIÎiîOÔoôUÛuû'.
-    ] ifFalse:[ cp == 16r0303  ifTrue:[
-        "/ tilde
-        map := 'AÃaãNÑnñOÕoõ'.
-    ] ifFalse:[ cp == 16r0308  ifTrue:[
-        "/ umlaut
-        map := 'AÄaäOÖoöUÜuüIÏiïyÿ'.
-    ] ifFalse:[ cp == 16r030A  ifTrue:[
-        "/ ring
-        map := 'AÅaå'.
-    ]]]]]].
+    map := ComposeMap at:cp ifAbsent:nil.
+
     map notNil ifTrue:[
+        "/ compose
         i := map indexOf: baseChar.
         i ~~ 0 ifTrue:[
             outStream nextPut: (map at:i+1).
@@ -165,6 +119,7 @@
         ].
     ].
 
+    "/ leave as is
     outStream nextPut: baseChar.
     outStream nextPut: diacriticalChar.
 !
@@ -174,19 +129,23 @@
      This is UTF-8 with compose-characters decomposed 
      (i.e. as separate codes, not as single combined characters).
 
-     For now, here is a hacked (hardwired knowledge) version, 
-     which will work for some european countries only...
+     For now, here is a limited version, which should work
+     at least for most european countries...
     "
 
     |s buff previous|
 
-    s := super  decodeString:aStringOrByteCollection.
-    (s contains:[:char | char codePoint between:16r0300 and:16r030F]) ifFalse:[^ s].
+    s := super decodeString:aStringOrByteCollection.
+    (s contains:[:char | char codePoint between:16r0300 and:16r0327]) ifFalse:[^ s].
+
+    ComposeMap isNil ifTrue:[
+        self class initializeDecomposeMap
+    ].
 
     buff := CharacterWriteStream on:''.
     previous := nil.
     s do:[:each |
-        (each codePoint between:16r0300 and:16r030F) ifTrue:[
+        (each codePoint between:16r0300 and:16r0327) ifTrue:[
             self compositionOf:previous with:each to:buff.
             previous := nil.
         ] ifFalse:[
@@ -220,7 +179,8 @@
     "if required, decompose a diacritical character into a base character and a punctuation;
      eg. ä -> a + umlaut-diacritic-mark.
      Pass both as args to the given block.
-     For non diactit. chars, pass a nil diacrit-mark value"
+     For non-diacritical chars, pass a nil diacritical-mark value.
+     Return true, if a decomposition was done."
 
     |entry|
 
@@ -238,8 +198,8 @@
      This is UTF-8 with compose-characters decompose (i.e. as separate codes, not as
      single combined characters).
 
-     For now, here is a hacked (hardwired knowledge) version, which should work
-     at least for some european countries...
+     For now, here is a limited version, which should work
+     at least for most european countries...
     "
 
     |gen s decomp codePoint composeCodePoint|
@@ -296,9 +256,10 @@
             ].
         ].
 
-    decomp := [:baseCodePointArg :composeCodePointArg | 
-                codePoint := baseCodePointArg. composeCodePoint := composeCodePointArg
-              ].
+    decomp := 
+        [:baseCodePointArg :composeCodePointArg | 
+            codePoint := baseCodePointArg. composeCodePoint := composeCodePointArg
+        ].
 
     s := WriteStream on:(String uninitializedNew:aUnicodeString size).
     aUnicodeString do:[:eachCharacter |
@@ -340,10 +301,10 @@
 !ISO10646_to_UTF8_MAC class methodsFor:'documentation'!
 
 version
-    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st,v 1.3 2015-02-20 18:50:00 cg Exp $'
+    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st,v 1.4 2015-02-27 17:07:14 cg Exp $'
 !
 
 version_CVS
-    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st,v 1.3 2015-02-20 18:50:00 cg Exp $'
+    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st,v 1.4 2015-02-27 17:07:14 cg Exp $'
 ! !