"
COPYRIGHT (c) 2006 by eXept Software AG
All Rights Reserved
This software is furnished under a license and may be used
only in accordance with the terms of that license and with the
inclusion of the above copyright notice. This software may not
be provided or otherwise made available to, or used by, any
other person. No title to or ownership of the software is
hereby transferred.
"
"{ Package: 'stx:libbasic' }"
"{ NameSpace: CharacterEncoderImplementations }"
ISO10646_to_UTF8 subclass:#ISO10646_to_XMLUTF8
instanceVariableNames:''
classVariableNames:'ReplacementCharacter'
poolDictionaries:''
category:'Collections-Text-Encodings'
!
!ISO10646_to_XMLUTF8 class methodsFor:'documentation'!
copyright
"
COPYRIGHT (c) 2006 by eXept Software AG
All Rights Reserved
This software is furnished under a license and may be used
only in accordance with the terms of that license and with the
inclusion of the above copyright notice. This software may not
be provided or otherwise made available to, or used by, any
other person. No title to or ownership of the software is
hereby transferred.
"
!
documentation
"
This encoder encodes characters into utf8 characters that may
occur in XML document.
Not all UTF characters are valid in XML, whatever encoding
is used. For a reference, see
https://www.w3.org/TR/xml/#NT-Char
Invalid characters are replaced by ReplacementCharacter
with $? as default.
[author:]
Jan Vrany <jan.vrany@fit.cvut.cz>
[instance variables:]
[class variables:]
[see also:]
https://www.w3.org/TR/xml/#NT-Char
"
! !
!ISO10646_to_XMLUTF8 class methodsFor:'initialization'!
initialize
"Invoked at system start or when the class is dynamically loaded."
ReplacementCharacter := $?.
"Modified: / 30-06-2012 / 19:55:00 / Jan Vrany <jan.vrany@fit.cvut.cz>"
! !
!ISO10646_to_XMLUTF8 class methodsFor:'queries'!
isValidXMLunicode: codePoint
"Returns true, if given codePoint (Integer!!!!!!) is
valid XML unicode."
codePoint == 16r0009 ifTrue:[ ^ true ].
codePoint == 16r000A ifTrue:[ ^ true ].
codePoint == 16r000D ifTrue:[ ^ true ].
(codePoint between: 16r0020 and: 16rD7FF ) ifTrue:[ ^ true ].
(codePoint between: 16rE000 and: 16rFFFD ) ifTrue:[ ^ true ].
(codePoint between: 16r10000 and: 16r10FFFF) ifTrue:[ ^ true ].
^false.
"Created: / 30-06-2012 / 20:11:16 / Jan Vrany <jan.vrany@fit.cvut.cz>"
! !
!ISO10646_to_XMLUTF8 methodsFor:'encoding & decoding'!
encodeCharacter:aUnicodeCharacter on:aStream
"return the UTF-8 representation of a aUnicodeCharacter.
The resulting string contains only valid XML unicode
characters. Invalid characters are replaced by a
ReplacementCharacter. For details, please see
https://www.w3.org/TR/xml/#NT-Char"
|codePoint|
codePoint := aUnicodeCharacter codePoint.
(self class isValidXMLunicode:codePoint) ifFalse:[
codePoint := ReplacementCharacter codePoint.
].
aStream nextPutUtf8:codePoint.
"Created: / 16-02-2017 / 17:29:24 / stefan"
"Modified: / 04-01-2018 / 01:10:28 / stefan"
!
encodeString:aUnicodeString
"return the UTF-8 representation of a aUnicodeString.
The resulting string contains only valid XML unicode
characters. Invalid characters are replaced by a
ReplacementCharacter. For details, please see
https://www.w3.org/TR/xml/#NT-Char"
|s|
s := WriteStream on:(String uninitializedNew:aUnicodeString size).
self encodeString:aUnicodeString on:s.
^ s contents
"
(self encodeString:'hello') asByteArray #[104 101 108 108 111]
(self encodeString:(Character value:16r40) asString) asByteArray #[64]
(self encodeString:(Character value:16r7F) asString) asByteArray #[127]
(self encodeString:(Character value:16r80) asString) asByteArray #[194 128]
(self encodeString:(Character value:16rFF) asString) asByteArray #[195 191]
(self encodeString:(Character value:16r100) asString) asByteArray #[196 128]
(self encodeString:(Character value:16r200) asString) asByteArray #[200 128]
(self encodeString:(Character value:16r400) asString) asByteArray #[208 128]
(self encodeString:(Character value:16r800) asString) asByteArray #[224 160 128]
(self encodeString:(Character value:16r1000) asString) asByteArray #[225 128 128]
(self encodeString:(Character value:16r2000) asString) asByteArray #[226 128 128]
(self encodeString:(Character value:16r4000) asString) asByteArray #[228 128 128]
(self encodeString:(Character value:16r8000) asString) asByteArray #[232 128 128]
(self encodeString:(Character value:16rFFFF) asString) '?'
"
"Created: / 30-06-2012 / 20:07:43 / Jan Vrany <jan.vrany@fit.cvut.cz>"
"Modified (comment): / 04-01-2018 / 01:08:11 / stefan"
!
encodeString:aUnicodeString on:aStream
"return the UTF-8 representation of a aUnicodeString.
The resulting string contains only valid XML unicode
characters. Invalid characters are replaced by a
ReplacementCharacter. For details, please see
https://www.w3.org/TR/xml/#NT-Char"
|sz "{Class: SmallInteger}"|
sz := aUnicodeString size.
1 to:sz do:[:idx|
self encodeCharacter:(aUnicodeString at:idx) on:aStream
].
"Created: / 16-02-2017 / 17:18:47 / stefan"
"Modified (comment): / 04-01-2018 / 01:07:32 / stefan"
! !
!ISO10646_to_XMLUTF8 methodsFor:'queries'!
nameOfEncoding
^ #'utf8-XML'
! !
!ISO10646_to_XMLUTF8 class methodsFor:'documentation'!
version
^ '$Header$'
!
version_CVS
^ '$Header$'
! !
ISO10646_to_XMLUTF8 initialize!