|
1 " |
|
2 COPYRIGHT (c) 2006 by eXept Software AG |
|
3 All Rights Reserved |
|
4 |
|
5 This software is furnished under a license and may be used |
|
6 only in accordance with the terms of that license and with the |
|
7 inclusion of the above copyright notice. This software may not |
|
8 be provided or otherwise made available to, or used by, any |
|
9 other person. No title to or ownership of the software is |
|
10 hereby transferred. |
|
11 " |
|
12 "{ Package: 'stx:libbasic' }" |
|
13 |
|
14 "{ NameSpace: CharacterEncoderImplementations }" |
|
15 |
|
"Class definition: ISO10646_to_XMLUTF8 extends ISO10646_to_UTF8, adds no
 instance variables and declares one class variable, ReplacementCharacter."

ISO10646_to_UTF8 subclass:#ISO10646_to_XMLUTF8
    instanceVariableNames:''
    classVariableNames:'ReplacementCharacter'
    poolDictionaries:''
    category:'Collections-Text-Encodings'
!
|
22 |
|
23 !ISO10646_to_XMLUTF8 class methodsFor:'documentation'! |
|
24 |
|
copyright
"
 COPYRIGHT (c) 2006 by eXept Software AG
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
!
|
38 |
|
documentation
"
    This encoder encodes characters into UTF-8 characters that may
    occur in an XML document.

    Not all Unicode characters are valid in XML, whatever encoding
    is used. For a reference, see

        http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char

    Invalid characters are replaced by ReplacementCharacter
    with $? as default.

    [author:]
        Jan Vrany <jan.vrany@fit.cvut.cz>

    [instance variables:]
        none

    [class variables:]
        ReplacementCharacter    the character emitted by #encodeString:
                                in place of every character that is not
                                valid in XML; set to $? by the class-side
                                #initialize

    [see also:]
        http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char

"
! !
|
64 |
|
65 !ISO10646_to_XMLUTF8 class methodsFor:'initialization'! |
|
66 |
|
initialize
    "Invoked at system start or when the class is dynamically loaded.
     Sets the class variable ReplacementCharacter to its default, $?.
     #encodeString: emits that character in place of every character
     that is not valid in XML."

    ReplacementCharacter := $?.

    "Modified: / 30-06-2012 / 19:55:00 / Jan Vrany <jan.vrany@fit.cvut.cz>"
! !
|
74 |
|
75 !ISO10646_to_XMLUTF8 methodsFor:'encoding & decoding'! |
|
76 |
|
encodeString:aUnicodeString
    "Answer the UTF-8 encoding of aUnicodeString, restricted to characters
     that are legal in XML. Each character whose code point is rejected by
     #isValidXMLunicode: is encoded as ReplacementCharacter instead.
     The set of legal characters is specified at

        http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char
    "

    |out continuationByte|

    "The byte layout logic is a tweaked copy of the superclass's method.
     Not ideal, but it avoids one string copy."

    "/ answers the 10xxxxxx continuation byte for the low 6 bits of its argument
    continuationByte := [:bits | Character value:((bits bitAnd:16r3F) bitOr:16r80)].

    out := WriteStream on:(String uninitializedNew:aUnicodeString size).
    aUnicodeString do:[:eachCharacter |
        |cp c1 c2 c3 c4 c5 rest "{Class: SmallInteger }"|

        cp := eachCharacter codePoint.
        (self isValidXMLunicode:cp) ifFalse:[
            cp := ReplacementCharacter codePoint.
        ].

        cp < 16r80 ifTrue:[
            "/ up to 7 bits: the code point is its own single byte
            out nextPut:(Character value:cp).
        ] ifFalse:[
            c1 := continuationByte value:cp.
            rest := cp bitShift:-6.
            rest < 16r20 ifTrue:[
                "/ up to 11 bits: 110xxxxx + 1 continuation byte
                out nextPut:(Character value:(rest bitOr:16rC0)); nextPut:c1.
            ] ifFalse:[
                c2 := continuationByte value:rest.
                rest := rest bitShift:-6.
                rest < 16r10 ifTrue:[
                    "/ up to 16 bits: 1110xxxx + 2 continuation bytes
                    out nextPut:(Character value:(rest bitOr:16rE0));
                        nextPut:c2; nextPut:c1.
                ] ifFalse:[
                    c3 := continuationByte value:rest.
                    rest := rest bitShift:-6.
                    rest < 16r08 ifTrue:[
                        "/ up to 21 bits: 11110xxx + 3 continuation bytes
                        out nextPut:(Character value:(rest bitOr:16rF0));
                            nextPut:c3; nextPut:c2; nextPut:c1.
                    ] ifFalse:[
                        c4 := continuationByte value:rest.
                        rest := rest bitShift:-6.
                        rest < 16r04 ifTrue:[
                            "/ up to 26 bits: 111110xx + 4 continuation bytes
                            out nextPut:(Character value:(rest bitOr:16rF8));
                                nextPut:c4; nextPut:c3; nextPut:c2; nextPut:c1.
                        ] ifFalse:[
                            c5 := continuationByte value:rest.
                            rest := rest bitShift:-6.
                            rest < 16r02 ifTrue:[
                                "/ up to 31 bits: 1111110x + 5 continuation bytes
                                out nextPut:(Character value:(rest bitOr:16rFC));
                                    nextPut:c5; nextPut:c4; nextPut:c3; nextPut:c2; nextPut:c1.
                            ] ifFalse:[
                                "/ cannot happen - we only support up to 30 bit characters
                                self error:'ascii value > 31bit in utf8Encode'.
                            ]
                        ].
                    ].
                ].
            ].
        ].
    ].

    ^ out contents

    "
     (self encodeString:'hello') asByteArray                             #[104 101 108 108 111]
     (self encodeString:(Character value:16r40) asString) asByteArray    #[64]
     (self encodeString:(Character value:16r7F) asString) asByteArray    #[127]
     (self encodeString:(Character value:16r80) asString) asByteArray    #[194 128]
     (self encodeString:(Character value:16rFF) asString) asByteArray    #[195 191]
     (self encodeString:(Character value:16r100) asString) asByteArray   #[196 128]
     (self encodeString:(Character value:16r200) asString) asByteArray   #[200 128]
     (self encodeString:(Character value:16r400) asString) asByteArray   #[208 128]
     (self encodeString:(Character value:16r800) asString) asByteArray   #[224 160 128]
     (self encodeString:(Character value:16r1000) asString) asByteArray  #[225 128 128]
     (self encodeString:(Character value:16r2000) asString) asByteArray  #[226 128 128]
     (self encodeString:(Character value:16r4000) asString) asByteArray  #[228 128 128]
     (self encodeString:(Character value:16r8000) asString) asByteArray  #[232 128 128]
     (self encodeString:(Character value:16rFFFF) asString) asByteArray  #[239 191 191]
    "

    "Created: / 30-06-2012 / 20:07:43 / Jan Vrany <jan.vrany@fit.cvut.cz>"
! !
|
165 |
|
166 !ISO10646_to_XMLUTF8 methodsFor:'queries'! |
|
167 |
|
isValidXMLunicode: codePoint
    "Answer true if codePoint (an Integer, not a Character) is one of
     tab, line feed or carriage return, or lies in one of the ranges
     16r20-16rD7FF, 16rE000-16rFFFD or 16r10000-16r10FFFF.
     Answer false for every other value."

    ^ codePoint == 16r0009
        or:[ codePoint == 16r000A
        or:[ codePoint == 16r000D
        or:[ (codePoint between:16r0020 and:16rD7FF)
        or:[ (codePoint between:16rE000 and:16rFFFD)
        or:[ codePoint between:16r10000 and:16r10FFFF ]]]]].

    "Created: / 30-06-2012 / 20:11:16 / Jan Vrany <jan.vrany@fit.cvut.cz>"
! !
|
183 |
|
184 !ISO10646_to_XMLUTF8 class methodsFor:'documentation'! |
|
185 |
|
version
    "Answer the CVS header string (file path, revision, date and author)
     recorded for this class's source file."

    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_XMLUTF8.st,v 1.1 2014-02-05 17:11:46 cg Exp $'
!
|
189 |
|
version_CVS
    "Answer the CVS header string (file path, revision, date and author)
     recorded for this class's source file."

    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_XMLUTF8.st,v 1.1 2014-02-05 17:11:46 cg Exp $'
! !
|
193 |
|
194 |
|
"Run the class-side #initialize when this file is loaded, so that
 ReplacementCharacter is set."
ISO10646_to_XMLUTF8 initialize!