CharacterEncoderImplementations__JIS0208_to_SJIS.st
changeset 22482 763385e4fdeb
parent 21623 0fd2de531f9a
equal deleted inserted replaced
22481:bb21f2349d1c 22482:763385e4fdeb
    13 "
    13 "
    14 "{ Package: 'stx:libbasic' }"
    14 "{ Package: 'stx:libbasic' }"
    15 
    15 
    16 "{ NameSpace: CharacterEncoderImplementations }"
    16 "{ NameSpace: CharacterEncoderImplementations }"
    17 
    17 
    18 TwoByteEncoder subclass:#JIS0208_to_SJIS
    18 VariableBytesEncoder subclass:#JIS0208_to_SJIS
    19 	instanceVariableNames:''
    19 	instanceVariableNames:''
    20 	classVariableNames:''
    20 	classVariableNames:''
    21 	poolDictionaries:''
    21 	poolDictionaries:''
    22 	category:'Collections-Text-Encodings'
    22 	category:'Collections-Text-Encodings'
    23 !
    23 !
    33  only in accordance with the terms of that license and with the
    33  only in accordance with the terms of that license and with the
    34  inclusion of the above copyright notice.   This software may not
    34  inclusion of the above copyright notice.   This software may not
    35  be provided or otherwise made available to, or used by, any
    35  be provided or otherwise made available to, or used by, any
    36  other person.  No title to or ownership of the software is
    36  other person.  No title to or ownership of the software is
    37  hereby transferred.
    37  hereby transferred.
       
    38 "
       
    39 !
       
    40 
       
    41 documentation
       
    42 "
       
    43     documentation to be added.
       
    44 
       
    45     [author:]
       
    46         stefan
       
    47 
       
    48     [instance variables:]
       
    49 
       
    50     [class variables:]
       
    51 
       
    52     [see also:]
       
    53 
    38 "
    54 "
    39 ! !
    55 ! !
    40 
    56 
    41 !JIS0208_to_SJIS class methodsFor:'mapping'!
    57 !JIS0208_to_SJIS class methodsFor:'mapping'!
    42 
    58 
    79     romans := CharacterEncoderImplementations::JIS0208 romanTable.
    95     romans := CharacterEncoderImplementations::JIS0208 romanTable.
    80 
    96 
    81 %{
    97 %{
    82     if (__isStringLike(aString)
    98     if (__isStringLike(aString)
    83      && (__Class(newString) == @global(TwoByteString))) {
    99      && (__Class(newString) == @global(TwoByteString))) {
    84 	INT _dstIdx = 0, _srcIdx = 0;
   100         INT _dstIdx = 0, _srcIdx = 0;
    85 	int _sz = __intVal(sz);
   101         int _sz = __intVal(sz);
    86 	unsigned char *_cp = __stringVal(aString);
   102         unsigned char *_cp = __stringVal(aString);
    87 	unsigned char _c1, _c2;
   103         unsigned char _c1, _c2;
    88 	unsigned short *_jcp = (unsigned short *)__stringVal(newString);
   104         unsigned short *_jcp = (unsigned short *)__stringVal(newString);
    89 
   105 
    90 	while (_srcIdx < _sz) {
   106         while (_srcIdx < _sz) {
    91 	    int _val;
   107             int _val;
    92 
   108 
    93 	    _c1 = _cp[_srcIdx];
   109             _c1 = _cp[_srcIdx];
    94 	    _srcIdx++;
   110             _srcIdx++;
    95 
   111 
    96 	    if ((_srcIdx < _sz)
   112             if ((_srcIdx < _sz)
    97 	     && (((_c1 >= 129) && (_c1 <= 159))
   113              && (((_c1 >= 129) && (_c1 <= 159))
    98 		 || ((_c1 >= 224) && (_c1 <= 239)))) {
   114                  || ((_c1 >= 224) && (_c1 <= 239)))) {
    99 		_c2 = _cp[_srcIdx];
   115                 _c2 = _cp[_srcIdx];
   100 		_srcIdx++;
   116                 _srcIdx++;
   101 		if ((_c2 >= 64) && (_c2 <= 252)) {
   117                 if ((_c2 >= 64) && (_c2 <= 252)) {
   102 		    int _adjust, _rowOffs, _cellOffs;
   118                     int _adjust, _rowOffs, _cellOffs;
   103 		    int _b1, _b2;
   119                     int _b1, _b2;
   104 
   120 
   105 		    _adjust = (_c2 < 159) ? 1 : 0;
   121                     _adjust = (_c2 < 159) ? 1 : 0;
   106 		    _rowOffs = (_c1 < 160) ? 112 : 176;
   122                     _rowOffs = (_c1 < 160) ? 112 : 176;
   107 		    if (_adjust) {
   123                     if (_adjust) {
   108 			_cellOffs = 31 + ((_c2 > 127) ? 1 : 0);
   124                         _cellOffs = 31 + ((_c2 > 127) ? 1 : 0);
   109 		    } else {
   125                     } else {
   110 			_cellOffs = 126;
   126                         _cellOffs = 126;
   111 		    }
   127                     }
   112 		    _b1 = ((_c1 - _rowOffs) << 1) - _adjust;
   128                     _b1 = ((_c1 - _rowOffs) << 1) - _adjust;
   113 		    _b2 = (_c2 - _cellOffs);
   129                     _b2 = (_c2 - _cellOffs);
   114 		    _val = (_b1<<8) + _b2;
   130                     _val = (_b1<<8) + _b2;
   115 		    if (_val <= 0) {
   131                     if (_val <= 0) {
   116 			/* decoder error - let smalltalk handle that */
   132                         /* decoder error - let smalltalk handle that */
   117 			_srcIdx -= 2;
   133                         _srcIdx -= 2;
   118 			goto getOutOfHere;
   134                         goto getOutOfHere;
   119 		    }
   135                     }
   120 		    if (_val > 0xFF) any16bit = true;
   136                     if (_val > 0xFF) any16bit = true;
   121 		    _jcp[_dstIdx] = _val;
   137                     _jcp[_dstIdx] = _val;
   122 		} else {
   138                 } else {
    123 		    /* second byte outside 64..252 - append both bytes untranslated */
    139                     /* second byte outside 64..252 - append both bytes untranslated */
   124 
   140 
   125 		    _jcp[_dstIdx] = _c1;
   141                     _jcp[_dstIdx] = _c1;
   126 		    _dstIdx++;
   142                     _dstIdx++;
   127 		    _jcp[_dstIdx] = _c2;
   143                     _jcp[_dstIdx] = _c2;
   128 		}
   144                 }
   129 	    } else {
   145             } else {
   130 		if ((_c1 >= 0xA1 /* 161 */) && (_c1 <= 0xDF /* 223 */)) {
   146                 if ((_c1 >= 0xA1 /* 161 */) && (_c1 <= 0xDF /* 223 */)) {
   131 		    /* HALFWIDTH KATAKANA
   147                     /* HALFWIDTH KATAKANA
   132 		     * map half-width katakana to 8E:xx
   148                      * map half-width katakana to 8E:xx
   133 		     */
   149                      */
   134 		    _val = _c1 - 128;
   150                     _val = _c1 - 128;
   135 		    _val = _val + 0x8E00;
   151                     _val = _val + 0x8E00;
   136 		    any16bit = true;
   152                     any16bit = true;
   137 		    _jcp[_dstIdx] = _val;
   153                     _jcp[_dstIdx] = _val;
   138 		} else {
   154                 } else {
   139 		    /* roman characters are translated as per romanTable */
   155                     /* roman characters are translated as per romanTable */
   140 		    _jcp[_dstIdx] = _c1;
   156                     _jcp[_dstIdx] = _c1;
   141 		    if ((romans != nil)
   157                     if ((romans != nil)
   142 		     && (__isArrayLike(romans))
   158                      && (__isArrayLike(romans))
   143 		     && ((_c1 - 0x20) < __arraySize(romans))) {
   159                      && ((_c1 - 0x20) < __arraySize(romans))) {
   144 			any16bit = true;
   160                         any16bit = true;
   145 			_jcp[_dstIdx] = __intVal(__ArrayInstPtr(romans)->a_element[(_c1 - 0x20)]);
   161                         _jcp[_dstIdx] = __intVal(__ArrayInstPtr(romans)->a_element[(_c1 - 0x20)]);
   146 		    }
   162                     }
   147 		}
   163                 }
   148 	    }
   164             }
   149 	    _dstIdx++;
   165             _dstIdx++;
   150 	}
   166         }
   151     getOutOfHere: ;
   167     getOutOfHere: ;
   152 	dstIdx = __mkSmallInteger(_dstIdx+1);
   168         dstIdx = __mkSmallInteger(_dstIdx+1);
   153 	srcIdx = __mkSmallInteger(_srcIdx+1);
   169         srcIdx = __mkSmallInteger(_srcIdx+1);
   154     }
   170     }
   155 %}.
   171 %}.
   156 
   172 
   157     [srcIdx <= sz] whileTrue:[
   173     [srcIdx <= sz] whileTrue:[
   158 	"/
   174         "/
   159 	"/ scan for next character in 129..159 or 224..239
   175         "/ scan for next character in 129..159 or 224..239
   160 	"/
   176         "/
   161 	char1 := aString at:srcIdx.
   177         char1 := aString at:srcIdx.
   162 	srcIdx := srcIdx + 1.
   178         srcIdx := srcIdx + 1.
   163 	b1 := char1 codePoint.
   179         b1 := char1 codePoint.
   164 
   180 
   165 	((srcIdx <= sz)
   181         ((srcIdx <= sz)
   166 	and:[(b1 >= 16r81"129" and:[b1 <= 16r9F"159"])                 "/ SJIS1 81 .. 9F
   182         and:[(b1 >= 16r81"129" and:[b1 <= 16r9F"159"])                 "/ SJIS1 81 .. 9F
   167 	     or:[b1 >= 16rE0"224" and:[b1 <= 16rEF"239"]]]) ifTrue:[   "/       E0 .. EF
   183              or:[b1 >= 16rE0"224" and:[b1 <= 16rEF"239"]]]) ifTrue:[   "/       E0 .. EF
   168 	    char2 := aString at:srcIdx.
   184             char2 := aString at:srcIdx.
   169 	    srcIdx := srcIdx + 1.
   185             srcIdx := srcIdx + 1.
   170 	    b2 := char2 codePoint.
   186             b2 := char2 codePoint.
   171 	    (b2 >= 16r40"64" and:[b2 <= 16rFC"252"]) ifTrue:[          "/ SJIS2 40 .. FC
   187             (b2 >= 16r40"64" and:[b2 <= 16rFC"252"]) ifTrue:[          "/ SJIS2 40 .. FC
   172 		|adjust rowOffs cellOffs|
   188                 |adjust rowOffs cellOffs|
   173 
   189 
   174 		adjust := (b2 < 16r9F"159") ifTrue:[1] ifFalse:[0].
   190                 adjust := (b2 < 16r9F"159") ifTrue:[1] ifFalse:[0].
   175 		rowOffs := b1 < 16rA0"160" ifTrue:[112] ifFalse:[176].
   191                 rowOffs := b1 < 16rA0"160" ifTrue:[112] ifFalse:[176].
   176 		adjust == 1 ifTrue:[
   192                 adjust == 1 ifTrue:[
   177 		    cellOffs := 31 + (b2 > 127 ifTrue:[1] ifFalse:[0]).
   193                     cellOffs := 31 + (b2 > 127 ifTrue:[1] ifFalse:[0]).
   178 		] ifFalse:[
   194                 ] ifFalse:[
   179 		    cellOffs := 126.
   195                     cellOffs := 126.
   180 		].
   196                 ].
   181 		b1 := ((b1 - rowOffs) bitShift:1) - adjust.
   197                 b1 := ((b1 - rowOffs) bitShift:1) - adjust.
   182 		b2 := (b2 - cellOffs).
   198                 b2 := (b2 - cellOffs).
   183 		val := (b1 bitShift:8) + b2.
   199                 val := (b1 bitShift:8) + b2.
   184 		val <= 0 ifTrue:[
   200                 val <= 0 ifTrue:[
   185 		    DecodingError
   201                     DecodingError
   186 			    raiseWith:aString
   202                             raiseWith:aString
   187 			    errorString:'SJIS decoding failed (not SJIS encoded ?)'.
   203                             errorString:'SJIS decoding failed (not SJIS encoded ?)'.
   188 		    newString at:dstIdx put:char1.
   204                     newString at:dstIdx put:char1.
   189 		    dstIdx := dstIdx + 1.
   205                     dstIdx := dstIdx + 1.
   190 		    newString at:dstIdx put:char2.
   206                     newString at:dstIdx put:char2.
   191 		] ifFalse:[
   207                 ] ifFalse:[
   192 		    val > 16rFF ifTrue:[any16bit := true].
   208                     val > 16rFF ifTrue:[any16bit := true].
   193 		    newString at:dstIdx put:(Character value:val).
   209                     newString at:dstIdx put:(Character value:val).
   194 		]
   210                 ]
   195 	    ] ifFalse:[
   211             ] ifFalse:[
    196 		"/ second byte outside 16r40..16rFC - append both bytes untranslated
    212                 "/ second byte outside 16r40..16rFC - append both bytes untranslated
   197 
   213 
   198 		newString at:dstIdx put:char1.
   214                 newString at:dstIdx put:char1.
   199 		dstIdx := dstIdx + 1.
   215                 dstIdx := dstIdx + 1.
   200 		newString at:dstIdx put:char2.
   216                 newString at:dstIdx put:char2.
   201 	    ]
   217             ]
   202 	] ifFalse:[
   218         ] ifFalse:[
   203 	    (b1 >= 16rA1 "161" and:[b1 <= 16rDF "223"]) ifTrue:[     "/ HALFWIDTH KATAKANA
   219             (b1 >= 16rA1 "161" and:[b1 <= 16rDF "223"]) ifTrue:[     "/ HALFWIDTH KATAKANA
    204 		"/ map half-width katakana to 8E:xx
    220                 "/ map half-width katakana to 8E:xx
   205 		val := b1 - 128.
   221                 val := b1 - 128.
   206 		val := val + (16r8E"142" bitShift:8).
   222                 val := val + (16r8E"142" bitShift:8).
   207 		any16bit := true.
   223                 any16bit := true.
   208 		newString at:dstIdx put:(Character value:val).
   224                 newString at:dstIdx put:(Character value:val).
   209 	    ] ifFalse:[
   225             ] ifFalse:[
   210 		"/ roman characters translated as per romanTable
   226                 "/ roman characters translated as per romanTable
   211 		newString at:dstIdx put:char1
   227                 newString at:dstIdx put:char1.
   212 		romans isArray ifTrue:[
   228                 romans isArray ifTrue:[
   213 		    char1 codePoint < romans size ifTrue:[
   229                     char1 codePoint < romans size ifTrue:[
   214 			any16bit := true.
   230                         any16bit := true.
   215 			newString at:dstIdx put:(Character value:(romans at:char1 codePoint-32+1)).
   231                         newString at:dstIdx put:(Character value:(romans at:char1 codePoint-32+1)).
   216 		    ]
   232                     ]
   217 		]
   233                 ]
   218 	    ]
   234             ]
   219 	].
   235         ].
   220 	dstIdx := dstIdx + 1.
   236         dstIdx := dstIdx + 1.
   221     ].
   237     ].
   222     any16bit ifFalse:[
   238     any16bit ifFalse:[
   223 	newString := String fromString:newString
   239         newString := String fromString:newString
   224     ].
   240     ].
   225 
   241 
   226     (dstIdx-1) ~~ sz ifTrue:[
   242     (dstIdx-1) ~~ sz ifTrue:[
   227 	newString := newString copyTo:dstIdx - 1.
   243         newString := newString copyTo:dstIdx - 1.
   228     ].
   244     ].
   229 
   245 
   230     ^ newString
   246     ^ newString
   231 
   247 
   232     "simple:
   248     "simple:
   233 
   249 
   234      CharacterEncoderImplementations::JIS0208_to_SJIS decodeString:'hello'
   250      CharacterEncoderImplementations::JIS0208_to_SJIS decodeString:'hello'
   235      (CharacterEncoder encoderFor:#sjis) decodeString:'hello'
   251      (CharacterEncoder encoderFor:#sjis) decodeString:'hello'
   236 
   252 
   237      CharacterEncoderImplementations::JIS0208_to_SJIS decodeString:('../../doc/online/japanese/TOP.html' asFilename contents asString)
   253      CharacterEncoderImplementations::JIS0208_to_SJIS decodeString:('../../doc/online/japanese/TOP.html' asFilename contentsAsString)
   238 
   254 
   239      '../../doc/online/japanese/TOP.html' asFilename contents asString
   255      '../../doc/online/japanese/TOP.html' asFilename contentsAsString
   240 		decodeFrom:#sjis
   256                 decodeFrom:#jis208
   241     "
   257     "
       
   258 
       
   259     "Modified (comment): / 17-01-2018 / 17:48:08 / stefan"
   242 !
   260 !
   243 
   261 
   244 encodeString:aJISString
   262 encodeString:aJISString
   245     "return a new string with aJISString's characters as SJIS encoded 8bit string.
   263     "return a new string with aJISString's characters as SJIS encoded 8bit string.
   246      The resulting string is only useful to be stored on some external file,
   264      The resulting string is only useful to be stored on some external file,
   345 !
   363 !
   346 
   364 
   347 version_CVS
   365 version_CVS
   348     ^ '$Header$'
   366     ^ '$Header$'
   349 ! !
   367 ! !
       
   368