--- a/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st Thu Feb 26 00:10:14 2015 +0100
+++ b/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st Fri Feb 27 18:07:14 2015 +0100
@@ -17,7 +17,7 @@
ISO10646_to_UTF8 subclass:#ISO10646_to_UTF8_MAC
instanceVariableNames:''
- classVariableNames:'AccentMap DecomposeMap'
+ classVariableNames:'AccentMap DecomposeMap ComposeMap'
poolDictionaries:''
category:'Collections-Text-Encodings'
!
@@ -45,7 +45,7 @@
- or as so called 'Normalization Form canonical Decomposition', i.e. as a regular 'a' followed by a
combining diacritical mark (for example: acute).
- MAC OSX needs the second form for its filenames.
+ MAC OSX needs the second form for its file names.
However, OSX does not decompose the ranges U+2000-U+2FFF, U+F900-U+FAFF and U+2F800-U+2FAFF.
This is a q&d hack, to at least support the first page (latin1) characters.
@@ -57,6 +57,7 @@
[instance variables:]
[class variables:]
+ ComposeMap DecomposeMap
[see also:]
http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html
@@ -67,66 +68,35 @@
!ISO10646_to_UTF8_MAC class methodsFor:'initialization'!
initializeDecomposeMap
- DecomposeMap := Dictionary new.
+ "set up both class-side maps: DecomposeMap (composed codePoint -> #(base codePoint, combining-mark codePoint)) and ComposeMap (combining-mark codePoint -> string of base/composed character pairs)"
- DecomposeMap at:"À" 16rC0 put:#( 16r41 16r0300).
- DecomposeMap at:"à" 16rE0 put:#( 16r61 16r0300).
- DecomposeMap at:"Á" 16rC1 put:#( 16r41 16r0301).
- DecomposeMap at:"á" 16rE1 put:#( 16r61 16r0301).
- DecomposeMap at:"Â" 16rC2 put:#( 16r41 16r0302).
- DecomposeMap at:"â" 16rE2 put:#( 16r61 16r0302).
- DecomposeMap at:"Ã" 16rC3 put:#( 16r41 16r0303).
- DecomposeMap at:"ã" 16rE3 put:#( 16r61 16r0303).
- DecomposeMap at:"Ä" 16rC4 put:#( 16r41 16r0308).
- DecomposeMap at:"ä" 16rE4 put:#( 16r61 16r0308).
- DecomposeMap at:"Å" 16rC5 put:#( 16r41 16r030A).
- DecomposeMap at:"å" 16rE5 put:#( 16r61 16r030A).
-
- DecomposeMap at:"È" 16rC8 put:#( 16r45 16r0300).
- DecomposeMap at:"è" 16rE8 put:#( 16r65 16r0300).
- DecomposeMap at:"É" 16rC9 put:#( 16r45 16r0301).
- DecomposeMap at:"é" 16rE9 put:#( 16r65 16r0301).
- DecomposeMap at:"Ê" 16rCA put:#( 16r45 16r0302).
- DecomposeMap at:"ê" 16rEA put:#( 16r65 16r0302).
- DecomposeMap at:"Ë" 16rCB put:#( 16r45 16r0308).
- DecomposeMap at:"ë" 16rEB put:#( 16r65 16r0308).
+ DecomposeMap := Dictionary new.
+ ComposeMap := Dictionary new.
- DecomposeMap at:"Ì" 16rCC put:#( 16r49 16r0300).
- DecomposeMap at:"ì" 16rEC put:#( 16r69 16r0300).
- DecomposeMap at:"í" 16rCD put:#( 16r49 16r0301).
- DecomposeMap at:"í" 16rED put:#( 16r69 16r0301).
- DecomposeMap at:"Î" 16rCE put:#( 16r49 16r0302).
- DecomposeMap at:"î" 16rEE put:#( 16r69 16r0302).
- DecomposeMap at:"Ï" 16rCF put:#( 16r49 16r0308).
- DecomposeMap at:"ï" 16rEF put:#( 16r69 16r0308).
-
- DecomposeMap at:"Ñ" 16rD1 put:#( 16r4E 16r0303).
- DecomposeMap at:"ñ" 16rF1 put:#( 16r6E 16r0303).
+ #(
+ (16r0300 "grave" 'AÀaàEÈeèIÌiìoòOÒUÙuù')
+ (16r0301 "acute" 'AÁaáEÉeéIÍiíOÓoóUÚuúyýYÝCĆcćNŃnńRŔrŕSŚsśZŹzź')
+ (16r0302 "circumflex" 'AÂaâEÊeêIÎiîOÔoôUÛuûCĈcĉGĜgĝHĤhĥJĴjĵSŜsŝWŴwŵYŶyŷ')
+ (16r0303 "tilde" 'AÃaãNÑnñOÕoõUŨuũ')
+ (16r0308 "diaeresis (umlaut)" 'AÄaäOÖoöUÜuüIÏiïyÿYŸ')
+ (16r030A "ring above" 'AÅaåUŮuů')
+ (16r030C "caron (hacek); note: breve would be 16r0306" 'CČcčDĎEĚeěNŇnňRŘrřSŠsšZŽzž')
+ (16r0327 "cedilla" 'CÇcçSŞsşTŢtţ')
+ ) do:[:eachPair |
+ |composeCode mapping|
- DecomposeMap at:"Ò" 16rD2 put:#( 16r4F 16r0300).
- DecomposeMap at:"ò" 16rF2 put:#( 16r6F 16r0300).
- DecomposeMap at:"Ó" 16rD3 put:#( 16r4F 16r0301).
- DecomposeMap at:"ó" 16rF3 put:#( 16r6F 16r0301).
- DecomposeMap at:"Ô" 16rD4 put:#( 16r4F 16r0302).
- DecomposeMap at:"ô" 16rF4 put:#( 16r6F 16r0302).
- DecomposeMap at:"Õ" 16rD5 put:#( 16r4F 16r0303).
- DecomposeMap at:"õ" 16rF5 put:#( 16r6F 16r0303).
- DecomposeMap at:"Ö" 16rD6 put:#( 16r4F 16r0308).
- DecomposeMap at:"ö" 16rF6 put:#( 16r6F 16r0308).
+ composeCode := eachPair first. "/ codePoint of the combining mark
+ mapping := eachPair second. "/ string of alternating base / composed characters
+ mapping pairWiseDo:[:baseChar :composedChar |
+ "/ one DecomposeMap entry per (base, composed) pair; e.g. the pair (a, à) listed under 16r0300 gives:
+ "/ DecomposeMap at:"$à codePoint" 16rE0 put:#( "$a codePoint" 16r61 "grave codePoint" 16r0300).
+ DecomposeMap
+ at:composedChar codePoint
+ put:(Array with:baseChar codePoint with:composeCode)
+ ].
- DecomposeMap at:"Ù" 16rD9 put:#( 16r55 16r0300).
- DecomposeMap at:"ù" 16rF9 put:#( 16r75 16r0300).
- DecomposeMap at:"Ú" 16rDA put:#( 16r55 16r0301).
- DecomposeMap at:"ú" 16rFA put:#( 16r75 16r0301).
- DecomposeMap at:"Û" 16rDB put:#( 16r55 16r0302).
- DecomposeMap at:"û" 16rDB put:#( 16r75 16r0302).
- DecomposeMap at:"Ü" 16rDC put:#( 16r55 16r0308).
- DecomposeMap at:"ü" 16rFC put:#( 16r75 16r0308).
-
- DecomposeMap at:"Ý" 16rDD put:#( 16r59 16r0301).
- DecomposeMap at:"ý" 16rFD put:#( 16r79 16r0301).
-
- DecomposeMap at:"ÿ" 16rFF put:#( 16r79 16r0308).
+ ComposeMap at:composeCode put:mapping. "/ the pair string itself is kept, keyed by the combining mark's codePoint
+ ].
! !
!ISO10646_to_UTF8_MAC methodsFor:'encoding & decoding'!
@@ -138,26 +108,10 @@
|cp map i|
cp := diacriticalChar codePoint.
- cp == 16r0300 ifTrue:[
- "/ accent grave
- map := 'AÀaàEÈeèIÌiìoòOÒUÙuù'.
- ] ifFalse:[ cp == 16r0301 ifTrue:[
- "/ accent
- map := 'AÁaáEÉeéIÍiíOÓoóUÚuúyýYÝ'.
- ] ifFalse:[ cp == 16r0302 ifTrue:[
- "/ circonflex
- map := 'AÂaâEÊeêIÎiîOÔoôUÛuû'.
- ] ifFalse:[ cp == 16r0303 ifTrue:[
- "/ tilde
- map := 'AÃaãNÑnñOÕoõ'.
- ] ifFalse:[ cp == 16r0308 ifTrue:[
- "/ umlaut
- map := 'AÄaäOÖoöUÜuüIÏiïyÿ'.
- ] ifFalse:[ cp == 16r030A ifTrue:[
- "/ ring
- map := 'AÅaå'.
- ]]]]]].
+ map := ComposeMap at:cp ifAbsent:nil.
+
map notNil ifTrue:[
+ "/ compose
i := map indexOf: baseChar.
i ~~ 0 ifTrue:[
outStream nextPut: (map at:i+1).
@@ -165,6 +119,7 @@
].
].
+ "/ leave as is
outStream nextPut: baseChar.
outStream nextPut: diacriticalChar.
!
@@ -174,19 +129,23 @@
This is UTF-8 with compose-characters decomposed
(i.e. as separate codes, not as single combined characters).
- For now, here is a hacked (hardwired knowledge) version,
- which will work for some european countries only...
+ For now, here is a limited version, which should work
+ at least for most European countries...
"
|s buff previous|
- s := super decodeString:aStringOrByteCollection.
- (s contains:[:char | char codePoint between:16r0300 and:16r030F]) ifFalse:[^ s].
+ s := super decodeString:aStringOrByteCollection.
+ (s contains:[:char | char codePoint between:16r0300 and:16r0327]) ifFalse:[^ s].
+
+ ComposeMap isNil ifTrue:[
+ self class initializeDecomposeMap
+ ].
buff := CharacterWriteStream on:''.
previous := nil.
s do:[:each |
- (each codePoint between:16r0300 and:16r030F) ifTrue:[
+ (each codePoint between:16r0300 and:16r0327) ifTrue:[
self compositionOf:previous with:each to:buff.
previous := nil.
] ifFalse:[
@@ -220,7 +179,8 @@
"if required, decompose a diacritical character into a base character and a punctuation;
eg. ä -> a + umlaut-diacritic-mark.
Pass both as args to the given block.
- For non diactit. chars, pass a nil diacrit-mark value"
+ For non-diacritical chars, pass nil as the diacritical-mark value.
+ Return true, if a decomposition was done."
|entry|
@@ -238,8 +198,8 @@
This is UTF-8 with compose-characters decompose (i.e. as separate codes, not as
single combined characters).
- For now, here is a hacked (hardwired knowledge) version, which should work
- at least for some european countries...
+ For now, here is a limited version, which should work
+ at least for most European countries...
"
|gen s decomp codePoint composeCodePoint|
@@ -296,9 +256,10 @@
].
].
- decomp := [:baseCodePointArg :composeCodePointArg |
- codePoint := baseCodePointArg. composeCodePoint := composeCodePointArg
- ].
+ decomp :=
+ [:baseCodePointArg :composeCodePointArg |
+ codePoint := baseCodePointArg. composeCodePoint := composeCodePointArg
+ ].
s := WriteStream on:(String uninitializedNew:aUnicodeString size).
aUnicodeString do:[:eachCharacter |
@@ -340,10 +301,10 @@
!ISO10646_to_UTF8_MAC class methodsFor:'documentation'!
version
- ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st,v 1.3 2015-02-20 18:50:00 cg Exp $'
+ "answer the CVS Header keyword string (repository path, revision, date, author) recorded for this source file" ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st,v 1.4 2015-02-27 17:07:14 cg Exp $'
!
version_CVS
- ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st,v 1.3 2015-02-20 18:50:00 cg Exp $'
+ "answer the CVS Header keyword string for this source file; in this revision it is the same literal that #version answers" ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st,v 1.4 2015-02-27 17:07:14 cg Exp $'
! !