Character.st
author Claus Gittinger <cg@exept.de>
Sat, 11 Jan 1997 13:14:19 +0100
changeset 2145 d243ffafeae3
parent 2124 6238280f6120
child 2214 2c73e6907cbc
permissions -rw-r--r--
more docu

"
 COPYRIGHT (c) 1988 by Claus Gittinger
	      All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"

Magnitude subclass:#Character
	instanceVariableNames:'asciivalue'
	classVariableNames:'CharacterTable'
	poolDictionaries:''
	category:'Magnitude-General'
!

!Character class methodsFor:'documentation'!

copyright
"
 COPYRIGHT (c) 1988 by Claus Gittinger
	      All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
!

documentation
"
    This class represents characters.
    Notice, that actual character objects are not used when characters
    are stored in strings, symbols or twoByteStrings; these only store
    a characters asciiValue for a more compact representation.
    The word 'asciiValue' is a historic leftover - actually, any integer
    code is allowed (i.e. characters are not limited to 8bit).

    Single byte Characters are unique; 
    i.e. for every asciiValue (0..255) there exists exactly one instance of 
    Character, which is shared (Character value:xxx checks for this, and returns 
    a reference to an existing instance).
    Other characters (i.e. asciivalue > 255) are not shared; i.e. these
    are created as required.

    This means: you may compare characters using #== ONLY IFF you are certain,
    that the characters ranges is 0..255. Otherwise, you HAVE TO compare
    using #=. (if in doubt, always compare using #=).
    Sorry for this inconvenience, but it is (practically) impossible to keep
    the possible maximum of 2^32 characters (Unicode) around, for that
    convenience alone.


    Methods marked as (JS) come from the manchester Character goody
    (CharacterComparing) by Jan Steinman, which allow Characters to be used as
    Interval elements (i.e. ($a to:$z) do:[...] ); 
    They are not a big deal, but convenient add-ons.
    Some of these have been modified a bit.

    WARNING: characters are known by compiler and runtime system -
             do not change the instance layout. 

    Also, although you can create subclasses of Character, the compiler always
    creates instances of Character for literals ...
    ... and other classes are hard-wired to always return instances of characters
    in some cases (i.e. String>>at:, Symbol>>at: etc.).
    Therefore, it may not make sense to create a character-subclass.

    [author:]
        Claus Gittinger

    [see also:]
        String TwoByteString
        StringCollection
"
! !

!Character class methodsFor:'instance creation'!

basicNew
    "catch new - Characters cannot be created with new"

    ^ self error:'Characters cannot be created with new'
!

digitValue:anInteger
    "return a character that corresponds to anInteger.
     0-9 map to $0-$9, 10-35 map to $A-$Z"

    |val "{ Class: SmallInteger }" |

    val := anInteger.
    (val between:0 and:9) ifTrue:[
	^ Character value:(val + ($0 asciiValue))
    ].
    (val between:10 and:35) ifTrue:[
	^ Character value:(val + ($A asciiValue - 10))
    ].
    ^self error:'value not in range 0 to 35'
!

value:anInteger
    "return a character with asciivalue anInteger"

%{  /* NOCONTEXT */

    INT ascii;

    if (__isSmallInteger(anInteger)) {
	ascii = __intVal(anInteger);
	if ((ascii & ~0xFF) == 0 /* (ascii >= 0) && (ascii <= 255) */)
	    RETURN ( __MKCHARACTER(ascii) );
    }
%}.
    (anInteger between:0 and:16rFF) ifTrue:[
	^ CharacterTable at:(anInteger + 1)
    ].
    (anInteger between:16r100 and:16rFFFF) ifTrue:[
	^ super basicNew setAsciiValue:anInteger
    ].
    "
     a characters ascii-code must be 0..16rFFFF.
     (i.e. only single-byte and twoByte characters are allowed.)
    "
    self error:'invalid ascii code for character'
! !

!Character class methodsFor:'constants'!

backspace
    "return the backspace character"

    ^ Character value:8
!

bell
    "return the bell character"

    ^ Character value:7
!

cr
    "return the lineEnd character 
     - actually (in unix) this is a newline character"

    ^ Character value:10
!

del 
    "return the delete character"

    ^ Character value:16r7F 
!

doubleQuote
    "return the double-quote character"

    ^ Character value:34
!

esc
    "return the escape character"

    ^ Character value:27
!

excla
    "return the exclamation-mark character"
    ^ $!!
!

ff
    "return the form-feed character"

    ^ Character value:12
!

lf
    "return the newline/linefeed character"

    ^ Character value:10
!

maxValue 
    "return the maximum asciiValue a character can have"

    ^ 16rFFFF
!

newPage
    "return the form-feed character"

    ^ Character value:12
!

nl
    "return the newline character"

    ^ Character value:10
!

quote
    "return the single-quote character"

    ^ Character value:39
!

return 
    "return the return character.
     In ST/X, this is different from cr - for Unix reasons."

    ^ Character value:13 
!

space
    "return the blank character"

    ^ Character value:32
!

tab
    "return the tabulator character"

    ^ Character value:9
! !

!Character class methodsFor:'primitive input'!

fromUser
    "return a character from the keyboard (C's standard input stream)
     - this should only be used for emergency evaluators and the like."

%{  /* NOCONTEXT */
    int c;

    c = getchar();
    if (c < 0) {
	RETURN (nil);
    }
    RETURN ( __MKCHARACTER(c & 0xFF) );
%}
! !

!Character class methodsFor:'queries'!

isBuiltInClass
    "return true if this class is known by the run-time-system.
     Here, true is returned for myself, false for subclasses."

    ^ self == Character

    "Modified: 23.4.1996 / 15:56:39 / cg"
! !

!Character methodsFor:'accessing'!

asciiValue
    "return the asciivalue of myself.
     The name 'asciiValue' is a historic leftover - characters are not
     limited to 8bit characters.
     PP has removed this methhod with 4.1 and providing
     asInteger instead."

    ^asciivalue

    "Modified: 27.6.1996 / 12:34:34 / cg"
!

instVarAt:index put:anObject
    "catch instvar access - asciivalue may not be changed"

    self error:'Characters may not be modified'
! !

!Character methodsFor:'arithmetic'!

+ aMagnitude
    "Return the Character that is <aMagnitude> higher than the receiver. 
     Wrap if the resulting value is not a legal Character value. (JS)"

    ^ Character value:(asciivalue + aMagnitude asInteger \\ 256)

    "
     $A + 5
    "

    "Modified: 27.6.1996 / 12:34:51 / cg"
!

- aMagnitude
    "Return the Character that is <aMagnitude> lower than the receiver.  
     Wrap if the resulting value is not a legal Character value. (JS)
     claus: 
        modified to return the difference as integer, if the argument
        is another character. If the argument is a number, a character is
        returned."

    aMagnitude isCharacter ifTrue:[
        ^ self asciiValue - aMagnitude asciiValue
    ].
    ^ Character value:(asciivalue - aMagnitude asInteger \\ 256)

    "
     $z - $a  
     $d - 3
    "

    "Modified: 27.6.1996 / 12:35:34 / cg"
!

// aMagnitude
    "Return the Character who's value is the receiver divided by <aMagnitude>. 
     Wrap if the resulting value is not a legal Character value. (JS)"

    ^ Character value:(asciivalue // aMagnitude asInteger \\ 256)
!

\\ aMagnitude
    "Return the Character who's value is the receiver modulo <aMagnitude>.  
     Wrap if the resulting value is not a legal Character value. (JS)"

    ^ Character value:(asciivalue \\ aMagnitude asInteger \\ 256)
! !

!Character methodsFor:'binary storage'!

hasSpecialBinaryRepresentation
    "return true, if the receiver has a special binary representation"

    ^ true
!

storeBinaryOn:stream manager:manager
    "store a binary representation of the receiver on stream;
     redefined, since single-byte characters are stored more compact
     with a special type-code followed by the asciiValue."

    (asciivalue < 256) ifTrue:[
	stream nextPut:manager codeForCharacter.
	stream nextPut:asciivalue
    ] ifFalse:[
	stream nextPut:manager codeForTwoByteCharacter.
	stream nextPutShort:asciivalue MSB:true
    ]
! !

!Character methodsFor:'comparing'!

< aCharacter
    "return true, if the arguments asciiValue is greater than mine"

    ^ (asciivalue < aCharacter asciiValue)
!

<= aCharacter
    "return true, if the arguments asciiValue is greater or equal to mine"

    ^ (asciivalue <= aCharacter asciiValue)
!

= aCharacter
    "return true, if the argument, aCharacter is the same character
     Redefined to take care of 16bit characters."

    self == aCharacter ifTrue:[^ true].
    aCharacter isCharacter ifFalse:[^ false].
    ^ (asciivalue = aCharacter asciiValue)
!

> aCharacter
    "return true, if the arguments asciiValue is less than mine"

    ^ (asciivalue > aCharacter asciiValue)
!

>= aCharacter
    "return true, if the arguments asciiValue is less or equal to mine"

    ^ (asciivalue >= aCharacter asciiValue)
!

identityHash
    "return an integer useful for hashing on identity"

    asciivalue <= 255 ifTrue:[
	^ 4096 + asciivalue
    ].
    ^ super identityHash
!

sameAs:aCharacter
    "return true, if the argument, aCharacter is the same character,
     ignoring case differences."

    self == aCharacter ifTrue:[^ true].
    ^ self asLowercase = aCharacter asLowercase
!

~= aCharacter
    "return true, if the argument, aCharacter is not the same character
     Redefined to take care of 16bit characters."

    self == aCharacter ifTrue:[^ false].
    aCharacter isCharacter ifFalse:[^ true].
    ^ (asciivalue ~~ aCharacter asciiValue)
! !

!Character methodsFor:'converting'!

asCharacter
    "usually sent to integers, but redefined here to allow integers
     and characters to be used commonly without a need for a test."

    ^ self

    "
     32 asCharacter  
    "
!

asInteger
    "return an Integer with my ascii-value.
     OWST4.2 compatibility (sigh)"

    ^ asciivalue
!

asLowercase
    "return a character with same letter as the receiver,
     but lowercase (the receiver if its lowercase or nonLetter)"

    |code "{Class: SmallInteger }"|

"/ the old code:
"/    self isUppercase ifFalse:[^ self].
"/    ^ Character value:(asciivalue + 32)

    code := asciivalue.

    "/ ISO Latin-1
    ((code >= $A asciiValue) and:[code <= $Z asciiValue]) ifTrue:[
        ^ Character value:(code + ($a asciiValue - $A asciiValue))
    ].
    code < 16r00C0 ifTrue:[^ self].
    code < 16r0100 ifTrue:[
        code >= 16r00DF ifTrue:[^ self].
        code == 16r00D7 ifTrue:[^ self].
        ^ Character value:(code + 16r20)
    ].

    "/ mhmh - in which encoding is this character.
    "/ here, assume Unicode

    'Character [info]: Unicode support is under construction' infoPrintCR.
    ^ self

    "
     $A asLowercase 
     $1 asLowercase  
    "

    "Modified: 10.1.1997 / 15:35:44 / cg"
!

asString
    "return a string of len 1 with myself as contents"

%{  /* NOCONTEXT */
    char buffer[2];
    OBJ s;
    unsigned val;

    val = __intVal(_characterVal(self));
    if (val <= 0xFF) {
        buffer[0] = (char) val;
        buffer[1] = '\0';
        s = __MKSTRING_L(buffer, 1 COMMA_SND);
        if (s != nil) {
            RETURN (s);
        }
    }
%}.
    asciivalue > 255 ifTrue:[
        ^ (TwoByteString new:1) at:1 put:self; yourself
    ].

"/
"/    ^ (String new:1) at:1 put:self; yourself.
"/

    "
     memory allocation (for the new string) failed.
     When we arrive here, there was no memory, even after a garbage collect.
     This means, that the VM wanted to get some more memory from the
     OS, which was not kind enough to give it.
     Bad luck - you should increase the swap space on your machine.
    "

    ^ ObjectMemory allocationFailureSignal raise.
!

asSymbol
    "return a unique symbol which prints like I print"

    ^ Symbol internCharacter:self
!

asUppercase
    "return a character with same letter as the receiver,
     but uppercase (the receiver if its uppercase or nonLetter)"

    |code "{Class: SmallInteger }"|

"/ the old code:
"/    self isLowercase ifFalse:[^ self].
"/    ^ Character value:(asciivalue - 32)

    code := asciivalue.

    "/ ISO Latin-1
    ((code >= $a asciiValue) and:[code <= $z asciiValue]) ifTrue:[
        ^ Character value:(code + ($A asciiValue - $a asciiValue))
    ].
    code < 16r00E0 ifTrue:[^ self].
    code < 16r0100 ifTrue:[
        code == 16r00F7 ifTrue:[^ self]. "/ division
"/        code == 16r00FF ifTrue:[^ Character value:16r0178].  "/ y diaresis (no uppercase equivalent in ISO-latin 1
        code == 16r00FF ifTrue:[^ self]. 

        ^ Character value:(code - 16r20)
    ].

    "/ mhmh - in which encoding is this character.
    "/ here, assume Unicode

    'Character [info]: Unicode support is under construction' infoPrintCR.
    ^ self

    "
     $A asLowercase 
     $1 asLowercase  
    "

    "Modified: 10.1.1997 / 15:35:51 / cg"
!

digitValue
    "return my digitValue for any base"

    |ascii "{ Class: SmallInteger }" |

    ascii := asciivalue.
    (ascii between:($0 asciiValue) and:($9 asciiValue)) ifTrue:[
	^ ascii - $0 asciiValue
    ].
    (ascii between:($a asciiValue) and:($z asciiValue)) ifTrue:[
	^ ascii + (10 - $a asciiValue)
    ]. 
    (ascii between:($A asciiValue) and:($Z asciiValue)) ifTrue:[
	^ ascii + (10 - $A asciiValue)
    ]. 

"remove error below for X3J20 conformance ... "
    self error:'bad character'.
" "
    ^ -1
!

to:aMagnitude
    "Return an Interval over the characters from the receiver to <aMagnitude>. 
     Wrap <aMagnitude> if it is not a legal Character value. (JS)"

    ^ Interval from:self to:(aMagnitude \\ 256)
! !

!Character methodsFor:'copying'!

copy
    "return a copy of myself
     reimplemented since characters are unique"

     ^ self
!

deepCopy
    "return a deep copy of myself
     reimplemented since characters are unique"

     ^ self
!

deepCopyUsing:aDictionary
    "return a deep copy of myself
     reimplemented since characters are unique"

     ^ self
!

shallowCopy
    "return a shallow copy of myself
     reimplemented since characters are unique"

     ^ self
!

simpleDeepCopy
    "return a deep copy of myself
     reimplemented since characters are unique"

     ^ self
! !

!Character methodsFor:'enumerating'!

to:stopCharacter do:aBlock
    "evaluate aBlock for each character in self .. stopCharacter.
     This is somewhat stupid, since it depends on the ascii encoding
     (370-users watch out :-)"

    |runChar|

    runChar := self.
    [runChar <= stopCharacter] whileTrue:[
	aBlock value:runChar.
	runChar := runChar + 1
    ]

    "
     ($a to:$z) do:[:char | char printNL]
     $a to:$z do:[:char | char printNL].
    "
! !

!Character methodsFor:'national testing'!

isNationalAlphaNumeric
    "return true, if the receiver is a letter in the
     current language (Language variable)"

    "stupid - should be configurable from a table ...
     ... good thing is, that iso8859 puts all national
	 characters above 16rC0"

    self isLetterOrDigit ifTrue:[^ true].
    ^ self asciiValue between:16rC0 and:16rFF
!

isNationalLetter
    "return true, if the receiver is a letter in the
     current language (Language variable)"

    "stupid - should be configurable from a table ...
     ... good thing is, that iso8859 puts all national
	 characters above 16rC0"

    self isLetter ifTrue:[^ true].
    ^ self asciiValue between:16rC0 and:16rFF
! !

!Character methodsFor:'printing & storing'!

displayString
    "return a string used when the receiver is to be displayed
     in an inspector kind-of-thing"

    ^ self storeString
!

isLiteral
    "return true, if the receiver can be used as a literal
     (i.e. can be used in constant arrays)"

    ^ true
!

print
    "print myself on stdout"

%{  /* NOCONTEXT */

    putchar(__intVal(__INST(asciivalue)));
%}
!

printOn:aStream
    "print myself on aStream"

    aStream nextPut:self
!

printString
    "return a string to print me"

    ^ self asString
!

storeOn:aStream
    "store myself on aStream"

    |special|

    (asciivalue between:33 and:127) ifFalse:[
        (self == Character space) ifTrue:[
            special := 'space'
        ] ifFalse:[
            (self == Character cr) ifTrue:[
                special := 'cr'.
            ] ifFalse:[
                (self == Character tab) ifTrue:[
                    special := 'tab'.
                ] ifFalse:[
                    (self == Character esc) ifTrue:[
                        special := 'esc'.
                    ]
                ]
            ]
        ].
        special notNil ifTrue:[
            aStream nextPutAll:'(Character '; nextPutAll:special; nextPutAll:')'.
            ^ self
        ].
        aStream nextPutAll:'(Character value:16r'; 
                nextPutAll:(asciivalue printStringRadix:16); nextPutAll:')'
    ] ifTrue:[
        aStream nextPut:$$; nextPut:self
    ]

    "Modified: 23.2.1996 / 23:27:32 / cg"
! !

!Character methodsFor:'private accessing'!

setAsciiValue:anInteger
    "very private - set the ascii value. 
     - use this only for characters with codes > 16rFF.
     DANGER alert: funny things happen, if this is applied to
     one of the fixed-characters 0..255."

    asciivalue := anInteger
! !

!Character methodsFor:'testing'!

isAlphaNumeric
    "return true, if I am a letter or a digit
     - same as isAlphaNumeric for compatibility reasons."

    ^ self isLetterOrDigit
!

isCharacter
    "return true,  if the receiver is some kind of character"

    ^ true
!

isDigit
    "return true, if I am a digit (i.e. $0 .. $9)"

    ^ asciivalue between:($0 asciiValue) and:($9 asciiValue)
!

isDigitRadix:r
    "return true, if I am a digit of a base r number"

    (asciivalue < $0 asciiValue) ifTrue:[^ false]. 
    (r > 10) ifTrue:[
	(asciivalue between:($0 asciiValue) and:($9 asciiValue)) ifTrue:[
	    ^ true
	].
	((asciivalue - $a asciiValue) between:0 and:(r - 10)) ifTrue:[
	    ^ true
	].
	^ (asciivalue - $A asciiValue) between:0 and:(r - 10)
    ].
    (asciivalue - $0 asciiValue) < r ifTrue:[^ true].
    ^ false
!

isEndOfLineCharacter
    "return true if I am a line delimitting character"

%{  /* NOCONTEXT */

    REGISTER int val;

    val = __intVal(__INST(asciivalue));
    if ((val == '\n')
     || (val == '\r')
     || (val == '\f')) {
	RETURN ( true );
    }
%}
.
    ^ false
!

isLetter
    "return true, if I am a letter 
     - use isNationalLetter, if you are interested in those."

%{  /*NOCONTEXT */

    REGISTER int val;

    val = __intVal(__INST(asciivalue));
    RETURN ( (((val >= 'a') && (val <= 'z')) ||
	      ((val >= 'A') && (val <= 'Z'))) ? true : false );
%}
!

isLetterOrDigit
    "return true, if I am a letter or a digit
     - use isNationalAlphaNumeric, if you are interested in those."

%{  /* NOCONTEXT */

    REGISTER int val;

    val = __intVal(__INST(asciivalue));
    if ((val >= 'a') && (val <= 'z')) {
	RETURN ( true );
    }
    if ((val >= 'A') && (val <= 'Z')) {
	RETURN ( true );
    }
    if ((val >= '0') && (val <= '9')) {
	RETURN ( true );
    }
    RETURN ( false );
%}
!

isLowercase
    "return true, if I am a lower-case letter"

%{  /* NOCONTEXT */

    REGISTER int val;

    val = __intVal(__INST(asciivalue));
#ifndef OLD
    /* iso8859 puts national lower case characters at e0 .. ff */
    if ((val >= 0xE0) && (val <= 0xFF)) {
	RETURN(true);
    }
#endif
    RETURN ( ((val >= 'a') && (val <= 'z')) ? true : false );
%}
!

isPrintable
    "return true, if the receiver is a useful printable character
     (see fileBrowsers showFile:-method on how it can be used)"

    (asciivalue between:32 and:127) ifTrue:[^ true].
    asciivalue == 13 ifTrue:[^ true].
    asciivalue == 9 ifTrue:[^ true].
    asciivalue == 10 ifTrue:[^ true].
    ^ self isNationalLetter
!

isSeparator
    "return true if I am a space, cr, tab, nl, or newPage"

%{  /* NOCONTEXT */

    REGISTER int val;

    val = __intVal(__INST(asciivalue));
#ifndef NON_ASCII       /* i.e. EBCDIC ;-) */
    if (val <= ' ')
#endif
	if ((val == ' ')
	 || (val == '\n') 
	 || (val == '\t')
	 || (val == '\r')
	 || (val == '\f')) {
	    RETURN ( true );
	}
%}
.
    ^ false
!

isUppercase
    "return true, if I am an upper-case letter"

%{  /* NOCONTEXT */

    REGISTER int val;

    val = __intVal(__INST(asciivalue));
#ifndef OLD
    /* iso8859 puts national upper case characters at c0 .. df */
    if ((val >= 0xC0) && (val <= 0xDF)) {
	RETURN(true);
    }
#endif
    RETURN ( ((val >= 'A') && (val <= 'Z')) ? true : false );
%}
!

isVowel
    "return true, if I am a vowel (lower- or uppercase)"

    (self == $a) ifTrue:[^ true].
    (self == $e) ifTrue:[^ true].
    (self == $i) ifTrue:[^ true].
    (self == $o) ifTrue:[^ true].
    (self == $u) ifTrue:[^ true].
    (self == $A) ifTrue:[^ true].
    (self == $E) ifTrue:[^ true].
    (self == $I) ifTrue:[^ true].
    (self == $O) ifTrue:[^ true].
    (self == $U) ifTrue:[^ true].
    ^ false
! !

!Character class methodsFor:'documentation'!

version
    ^ '$Header: /cvs/stx/stx/libbasic/Character.st,v 1.40 1997-01-10 15:03:41 cg Exp $'
! !