CharacterEncoderImplementations__ISO10646_to_UTF8.st
changeset 22474 f42c97c037ed
parent 22429 48389a135c35
child 25271 3b763ce09c7e
equal deleted inserted replaced
22473:35fd10859181 22474:f42c97c037ed
    13 "
    13 "
    14 "{ Package: 'stx:libbasic' }"
    14 "{ Package: 'stx:libbasic' }"
    15 
    15 
    16 "{ NameSpace: CharacterEncoderImplementations }"
    16 "{ NameSpace: CharacterEncoderImplementations }"
    17 
    17 
    18 TwoByteEncoder subclass:#ISO10646_to_UTF8
    18 VariableBytesEncoder subclass:#ISO10646_to_UTF8
    19 	instanceVariableNames:''
    19 	instanceVariableNames:''
    20 	classVariableNames:''
    20 	classVariableNames:''
    21 	poolDictionaries:''
    21 	poolDictionaries:''
    22 	category:'Collections-Text-Encodings'
    22 	category:'Collections-Text-Encodings'
    23 !
    23 !
    45 "
    45 "
    46 !
    46 !
    47 
    47 
    48 documentation
    48 documentation
    49 "
    49 "
    50     I can encode characters into/from UTF8
    50     I can encode unicode characters into utf-8 and
       
    51     decode utf-8 characters into unicode.
    51     
    52     
    52     Notice the naming (many are confused):
    53     Notice the naming (many are confused):
    53         Unicode is the set of number-to-glyph assignments
    54         Unicode is the set of number-to-glyph assignments
    54     whereas:
    55     whereas:
    55         UTF8 is a concrete way of transmitting Unicode codePoints (numbers).
    56         UTF8 is a concrete way of transmitting Unicode codePoints (numbers).
   120     "Modified: / 10-01-2018 / 22:59:20 / stefan"
   121     "Modified: / 10-01-2018 / 22:59:20 / stefan"
   121 ! !
   122 ! !
   122 
   123 
   123 !ISO10646_to_UTF8 methodsFor:'encoding & decoding'!
   124 !ISO10646_to_UTF8 methodsFor:'encoding & decoding'!
   124 
   125 
   125 decode:aCode
       
   126     "given an integer in my encoding, return a unicode codePoint for it"
       
   127 
       
   128     self shouldNotImplement "/ no single byte conversion possible
       
   129 
       
   130     "Modified (comment): / 03-01-2018 / 23:15:37 / stefan"
       
   131 !
       
   132 
       
   133 decodeString:aStringOrByteCollection
   126 decodeString:aStringOrByteCollection
   134     "given a string in UTF8 encoding,
   127     "given a string in UTF8 encoding,
   135      return a new string containing the same characters, in Unicode encoding.
   128      return a new string containing the same characters, in Unicode encoding.
   136      Returns either a normal String, a Unicode16String or a Unicode32String instance.
   129      Returns either a normal String, a Unicode16String or a Unicode32String instance.
   137      This is only useful, when reading from external sources or communicating with
   130      This is only useful, when reading from external sources or communicating with
   140      This only handles up to 30-bit characters."
   133      This only handles up to 30-bit characters."
   141 
   134 
   142     ^ CharacterArray decodeFromUTF8:aStringOrByteCollection.
   135     ^ CharacterArray decodeFromUTF8:aStringOrByteCollection.
   143 !
   136 !
   144 
   137 
   145 encode:aCode
       
   146     "given a codePoint in unicode, return a byte in my encoding for it"
       
   147 
       
   148     self shouldNotImplement "/ no conversion to a single byte is possible
       
   149 
       
   150     "Modified (comment): / 03-01-2018 / 23:13:58 / stefan"
       
   151 !
       
   152 
       
   153 encodeCharacter:aUnicodeCharacter on:aStream
       
   154     "given a character in unicode, encode it onto aStream."
       
   155 
       
   156     aStream nextPutUtf8:aUnicodeCharacter.
       
   157 
       
   158     "Created: / 16-02-2017 / 16:20:57 / stefan"
       
   159 !
       
   160 
       
   161 encodeString:aUnicodeString
   138 encodeString:aUnicodeString
   162     "return the UTF-8 representation of a Unicode string.
   139     "return the UTF-8 representation of a Unicode string.
   163      The resulting string is only useful to be stored on some external file,
   140      The resulting string is only useful to be stored on some external file,
   164      not for being used inside ST/X."
   141      not for being used inside ST/X."
   165 
   142 
   166     ^ aUnicodeString utf8Encoded.
   143     ^ aUnicodeString utf8Encoded.
       
   144 ! !
       
   145 
       
   146 !ISO10646_to_UTF8 methodsFor:'queries'!
       
   147 
       
   148 characterSize:charOrCodePoint
       
   149     "return the number of bytes required to encode codePoint"
       
   150 
       
   151     ^ charOrCodePoint asCharacter utf8BytesPerCharacter.
       
   152 
       
   153     "Created: / 15-06-2005 / 15:16:22 / janfrog"
       
   154     "Modified: / 03-01-2018 / 23:05:59 / stefan"
       
   155 !
       
   156 
       
   157 nameOfEncoding
       
   158     ^ #utf8
       
   159 ! !
       
   160 
       
   161 !ISO10646_to_UTF8 methodsFor:'stream support'!
       
   162 
       
   163 encodeCharacter:aUnicodeCharacter on:aStream
       
   164     "given a character in unicode, encode it onto aStream."
       
   165 
       
   166     aStream nextPutUtf8:aUnicodeCharacter.
       
   167 
       
   168     "Created: / 16-02-2017 / 16:20:57 / stefan"
   167 !
   169 !
   168 
   170 
   169 encodeString:aUnicodeString on:aStream
   171 encodeString:aUnicodeString on:aStream
   170     "given a string in unicode, encode it onto aStream."
   172     "given a string in unicode, encode it onto aStream."
   171 
   173 
   172      aStream nextPutAllUtf8:aUnicodeString.
   174      aStream nextPutAllUtf8:aUnicodeString.
   173 
   175 
   174     "Created: / 16-02-2017 / 16:27:31 / stefan"
   176     "Created: / 16-02-2017 / 16:27:31 / stefan"
   175 ! !
   177 !
   176 
   178 
   177 !ISO10646_to_UTF8 methodsFor:'queries'!
   179 readNext:charactersToReadArg charactersFrom:aStream
   178 
   180     "decode the next charactersToRead on aStream from utf-8 to unicode"
   179 characterSize:charOrCodePoint
   181 
   180     "return the number of bytes required to encode codePoint"
   182     |s c cp hasUtf8 charactersToRead "{ Class:SmallInteger }"|
   181 
   183 
   182     ^ charOrCodePoint asCharacter utf8BytesPerCharacter.
   184     charactersToRead := charactersToReadArg.
   183 
       
   184     "Created: / 15-06-2005 / 15:16:22 / janfrog"
       
   185     "Modified: / 03-01-2018 / 23:05:59 / stefan"
       
   186 !
       
   187 
       
   188 nameOfEncoding
       
   189     ^ #utf8
       
   190 ! !
       
   191 
       
   192 !ISO10646_to_UTF8 methodsFor:'stream support'!
       
   193 
       
   194 readNext:charactersToRead charactersFrom:stream
       
   195     | s c cp hasUtf8|
       
   196 
       
   197     hasUtf8 := false.
   185     hasUtf8 := false.
   198     "stream may be both text or bytes"
   186     "stream may be both text or bytes"
   199     s := (stream contentsSpecies new:charactersToRead) writeStream.
   187     s := (aStream contentsSpecies new:charactersToRead) writeStream.
   200     charactersToRead timesRepeat:[
   188     charactersToRead timesRepeat:[
   201         c := stream next.
   189         c := aStream next.
   202         s nextPut:c.
   190         s nextPut:c.
   203         cp := c codePoint.
   191         cp := c codePoint.
   204         (cp bitTest:16r80) ifTrue:[
   192         (cp bitTest:16r80) ifTrue:[
   205             hasUtf8 := true.
   193             hasUtf8 := true.
   206             s nextPutAll:(stream next:(self class bytesToReadFor:cp)-1).
   194             s nextPutAll:(aStream next:(self class bytesToReadFor:cp)-1).
   207         ].
   195         ].
   208     ].
   196     ].
   209     hasUtf8 ifTrue:[
   197     hasUtf8 ifTrue:[
   210         ^ self decodeString:s contents.
   198         ^ self decodeString:s contents.
   211     ].
   199     ].
   212     ^ s contents asString
   200     ^ s contents asString
   213 
   201 
   214     "Created: / 16-06-2005 / 11:45:14 / masca"
   202     "Created: / 16-06-2005 / 11:45:14 / masca"
   215     "Modified: / 10-01-2018 / 22:28:39 / stefan"
   203     "Modified (comment): / 17-01-2018 / 13:24:42 / stefan"
   216 !
   204 !
   217 
   205 
   218 readNextCharacterFrom:aStream 
   206 readNextCharacterFrom:aStream
       
   207     "decode the next character or byte on aStream from utf-8 to unicode"
       
   208 
   219     ^ Character utf8DecodeFrom:aStream.
   209     ^ Character utf8DecodeFrom:aStream.
   220 
   210 
   221     "Created: / 14-06-2005 / 17:03:59 / janfrog"
   211     "Created: / 14-06-2005 / 17:03:59 / janfrog"
   222     "Modified: / 10-01-2018 / 17:35:40 / stefan"
   212     "Modified: / 10-01-2018 / 17:35:40 / stefan"
       
   213     "Modified (comment): / 17-01-2018 / 13:24:08 / stefan"
   223 ! !
   214 ! !
   224 
   215 
   225 !ISO10646_to_UTF8 class methodsFor:'documentation'!
   216 !ISO10646_to_UTF8 class methodsFor:'documentation'!
   226 
   217 
   227 version
   218 version