CharacterEncoderImplementations__ISO10646_to_XMLUTF8.st
changeset 15945 94b64f8a1abe
child 17496 4dacd778d9c9
equal deleted inserted replaced
15944:8299575b9314 15945:94b64f8a1abe
       
     1 "
       
     2  COPYRIGHT (c) 2006 by eXept Software AG
       
     3 	      All Rights Reserved
       
     4 
       
     5  This software is furnished under a license and may be used
       
     6  only in accordance with the terms of that license and with the
       
     7  inclusion of the above copyright notice.   This software may not
       
     8  be provided or otherwise made available to, or used by, any
       
     9  other person.  No title to or ownership of the software is
       
    10  hereby transferred.
       
    11 "
       
    12 "{ Package: 'stx:libbasic' }"
       
    13 
       
    14 "{ NameSpace: CharacterEncoderImplementations }"
       
    15 
       
    16 ISO10646_to_UTF8 subclass:#ISO10646_to_XMLUTF8
       
    17 	instanceVariableNames:''
       
    18 	classVariableNames:'ReplacementCharacter'
       
    19 	poolDictionaries:''
       
    20 	category:'Collections-Text-Encodings'
       
    21 !
       
    22 
       
    23 !ISO10646_to_XMLUTF8 class methodsFor:'documentation'!
       
    24 
       
    25 copyright
       
    26 "
       
    27  COPYRIGHT (c) 2006 by eXept Software AG
       
    28 	      All Rights Reserved
       
    29 
       
    30  This software is furnished under a license and may be used
       
    31  only in accordance with the terms of that license and with the
       
    32  inclusion of the above copyright notice.   This software may not
       
    33  be provided or otherwise made available to, or used by, any
       
    34  other person.  No title to or ownership of the software is
       
    35  hereby transferred.
       
    36 "
       
    37 !
       
    38 
       
     39 documentation
       
     40 "
       
     41     This encoder encodes characters into UTF-8, using only characters that may
       
     42     occur in an XML document.
       
     43 
       
     44     Not all Unicode characters are valid in XML, whatever encoding
       
     45     is used. For a reference, see 
       
     46 
       
     47       http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char   
       
     48 
       
     49     Invalid characters are replaced by ReplacementCharacter
       
     50     with $? as default.
       
     51 
       
     52     [author:]
       
     53         Jan Vrany <jan.vrany@fit.cvut.cz>
       
     54 
       
     55     [instance variables:]
       
     56 
       
     57     [class variables:]
       
     58         ReplacementCharacter    <Character>     substituted for characters not valid in XML; set to $? by #initialize
       
     59     [see also:]
       
     60         http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char
       
     61 
       
     62 "
       
     63 ! !
       
    64 
       
    65 !ISO10646_to_XMLUTF8 class methodsFor:'initialization'!
       
    66 
       
     67 initialize
       
     68     "Invoked at system start or when the class is dynamically loaded; sets the default ReplacementCharacter (used in place of characters not valid in XML) to $?."
       
     69 
       
     70     ReplacementCharacter := $?.
       
     71 
       
     72     "Modified: / 30-06-2012 / 19:55:00 / Jan Vrany <jan.vrany@fit.cvut.cz>"
       
     73 ! !
       
    74 
       
    75 !ISO10646_to_XMLUTF8 methodsFor:'encoding & decoding'!
       
    76 
       
     77 encodeString:aUnicodeString
       
     78     "return the UTF-8 representation of aUnicodeString.
       
     79      The resulting string contains only valid XML unicode
       
     80      characters. Invalid characters are replaced by a
       
     81      ReplacementCharacter. For details, please see
       
     82 
       
     83      http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char
       
     84 
       
     85     "
       
     86 
       
     87     |s|
       
     88 
       
     89     "Copy-paste of superclass's method and tweaked. Not ideal, but
       
     90      it avoids 1 string copy"
       
     91 
       
     92     s := WriteStream on:(String uninitializedNew:aUnicodeString size).
       
     93     aUnicodeString do:[:eachCharacter |
       
     94         |codePoint b1 b2 b3 b4 b5 v "{Class: SmallInteger }"|
       
     95 
       
     96         codePoint := eachCharacter codePoint.
       
     97         (self isValidXMLunicode: codePoint) ifFalse:[
       
     98             codePoint := ReplacementCharacter codePoint.
       
     99         ].
       
    100 
       
    101         codePoint <= 16r7F ifTrue:[ "7 bits: emitted as a single byte"
       
    102             s nextPut:(Character value:codePoint).
       
    103         ] ifFalse:[
       
    104             b1 := Character value:((codePoint bitAnd:16r3F) bitOr:2r10000000). "lowest 6 bits as continuation byte 10xxxxxx; emitted last"
       
    105             v := codePoint bitShift:-6.
       
    106             v <= 16r1F ifTrue:[ "fits in 11 bits: 2-byte sequence"
       
    107                 s nextPut:(Character value:(v bitOr:2r11000000)).
       
    108                 s nextPut:b1.
       
    109             ] ifFalse:[
       
    110                 b2 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
       
    111                 v := v bitShift:-6.
       
    112                 v <= 16r0F ifTrue:[ "fits in 16 bits: 3-byte sequence"
       
    113                     s nextPut:(Character value:(v bitOr:2r11100000)).
       
    114                     s nextPut:b2; nextPut:b1.
       
    115                 ] ifFalse:[
       
    116                     b3 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
       
    117                     v := v bitShift:-6.
       
    118                     v <= 16r07 ifTrue:[ "fits in 21 bits: 4-byte sequence"
       
    119                         s nextPut:(Character value:(v bitOr:2r11110000)).
       
    120                         s nextPut:b3; nextPut:b2; nextPut:b1.
       
    121                     ] ifFalse:[
       
    122                         b4 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
       
    123                         v := v bitShift:-6.
       
    124                         v <= 16r03 ifTrue:[ "fits in 26 bits: 5-byte sequence"
       
    125                             s nextPut:(Character value:(v bitOr:2r11111000)).
       
    126                             s nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
       
    127                         ] ifFalse:[
       
    128                             b5 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
       
    129                             v := v bitShift:-6.
       
    130                             v <= 16r01 ifTrue:[ "fits in 31 bits: 6-byte sequence"
       
    131                                 s nextPut:(Character value:(v bitOr:2r11111100)).
       
    132                                 s nextPut:b5; nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
       
    133                             ] ifFalse:[
       
    134                                 "/ cannot happen - we only support up to 30 bit characters
       
    135                                 self error:'ascii value > 31bit in utf8Encode'.
       
    136                             ]
       
    137                         ].
       
    138                     ].
       
    139                 ].
       
    140             ].
       
    141         ].
       
    142     ].
       
    143 
       
    144     ^ s contents
       
    145 
       
    146     "
       
    147      (self encodeString:'hello') asByteArray                             #[104 101 108 108 111]
       
    148      (self encodeString:(Character value:16r40) asString) asByteArray    #[64]
       
    149      (self encodeString:(Character value:16r7F) asString) asByteArray    #[127]
       
    150      (self encodeString:(Character value:16r80) asString) asByteArray    #[194 128]
       
    151      (self encodeString:(Character value:16rFF) asString) asByteArray    #[195 191]
       
    152      (self encodeString:(Character value:16r100) asString) asByteArray   #[196 128]
       
    153      (self encodeString:(Character value:16r200) asString) asByteArray   #[200 128]
       
    154      (self encodeString:(Character value:16r400) asString) asByteArray   #[208 128]
       
    155      (self encodeString:(Character value:16r800) asString) asByteArray   #[224 160 128]
       
    156      (self encodeString:(Character value:16r1000) asString) asByteArray  #[225 128 128]
       
    157      (self encodeString:(Character value:16r2000) asString) asByteArray  #[226 128 128]
       
    158      (self encodeString:(Character value:16r4000) asString) asByteArray  #[228 128 128]
       
    159      (self encodeString:(Character value:16r8000) asString) asByteArray  #[232 128 128]
       
    160      (self encodeString:(Character value:16rFFFF) asString) asByteArray  #[63]  (16rFFFF is rejected by isValidXMLunicode: and replaced by the default $?)
       
    161     "
       
    162 
       
    163     "Created: / 30-06-2012 / 20:07:43 / Jan Vrany <jan.vrany@fit.cvut.cz>"
       
    164 ! !
       
   165 
       
   166 !ISO10646_to_XMLUTF8 methodsFor:'queries'!
       
   167 
       
    168 isValidXMLunicode: codePoint
       
    169     "Returns true if the given codePoint (an Integer, not a Character) is
       
    170      a valid XML character: 16r9, 16rA, 16rD, 16r20-16rD7FF, 16rE000-16rFFFD or 16r10000-16r10FFFF; false otherwise."
       
    171 
       
    172     codePoint == 16r0009 ifTrue:[ ^ true ].
       
    173     codePoint == 16r000A ifTrue:[ ^ true ].
       
    174     codePoint == 16r000D ifTrue:[ ^ true ].
       
    175     (codePoint between: 16r0020  and: 16rD7FF  ) ifTrue:[ ^ true ].
       
    176     (codePoint between: 16rE000  and: 16rFFFD  ) ifTrue:[ ^ true ].
       
    177     (codePoint between: 16r10000 and: 16r10FFFF) ifTrue:[ ^ true ].
       
    178 
       
    179     ^false.
       
    180 
       
    181     "Created: / 30-06-2012 / 20:11:16 / Jan Vrany <jan.vrany@fit.cvut.cz>"
       
    182 ! !
       
   183 
       
   184 !ISO10646_to_XMLUTF8 class methodsFor:'documentation'!
       
   185 
       
   186 version
       
   187     ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_XMLUTF8.st,v 1.1 2014-02-05 17:11:46 cg Exp $'
       
   188 !
       
   189 
       
   190 version_CVS
       
   191     ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_XMLUTF8.st,v 1.1 2014-02-05 17:11:46 cg Exp $'
       
   192 ! !
       
   193 
       
   194 
       
   195 ISO10646_to_XMLUTF8 initialize!