TokenizedStream.st
changeset 171 ce00d8aab37c
child 172 99b850002359
equal deleted inserted replaced
170:7194ef17f5b8 171:ce00d8aab37c
       
     1 'From Smalltalk/X, Version:2.10.8 on 1-feb-1996 at 6:09:27 pm'                  !
       
     2 
       
     3 ReadStream subclass:#TokenizedStream
       
     4 	instanceVariableNames:'source token tokenType tokenPosition tokenName tokenLineNr
       
     5 		tokenValue tokenRadix hereChar peekChar peekChar2
       
     6 		beginCommentCharacter endCommentCharacter eolCommentCharacter
       
     7 		eolCharacter outStream outCol actions types eolIsSignificant'
       
     8 	classVariableNames:'DefaultActions DefaultTypes'
       
     9 	poolDictionaries:''
       
    10 	category:'Streams'
       
       	"source is the character stream tokens are read from.
       
       	 tokenType, tokenName and tokenValue hold the result of the most recent read;
       
       	 actions and types are the dispatch tables, indexed by ascii code.
       
       	 DefaultActions and DefaultTypes are the class-wide tables built by the
       
       	 class-side initialize method."
       
    11 !
       
    12 
       
    13 !TokenizedStream class methodsFor:'documentation'!
       
    14 
       
    15 documentation
       
    16 "
       
    17     a first version of a tokenStream.
       
    18     This is still being constructed and will finally help a simplified
       
    19     Scanner class.
       
    20     For now, it may be useful when textual input files are to be read and
       
    21     parsed. For example, ascii data files are often in a simple free form format
       
    22     which requires some little processing.
       
    23 
       
    24     operation:
       
    25 
       
    26         a TokenizedStream reads characters from its real input stream
       
    27         and dispatches to a token reading method by the help of an actionArray,
       
    28         which is indexed by the ascii code of the character.
       
    29         By default, the table is setup to only read numbers (integers)
       
    30         and identifiers. Whitespace is ignored, and all other characters return
       
    31         themselves.
       
    32 
       
    33     The returned tokens are either symbols (#Identifier / #Integer) or
       
    34     characters ($+ $, etc.)
       
    35     If it is an Identifier, the name is found in tokenName (there is an access method for that).
       
    36     If it is an Integer, the value is found in tokenValue.
       
    37 
       
    38     EndOfLine is returned as #EOL (only if eolIsSignificant is true; it defaults to false and line ends are then skipped); end of input as #EOF.
       
    39     Unrecognized input leads to #Error to be returned.
       
    40 "
       
    41 !
       
    42 
       
    43 examples
       
    44 "
       
    45     simple example:
       
    46 
       
    47         |s|
       
    48 
       
    49         s := TokenizedStream on:'hello world, how much is 3 + 2'.
       
    50         [s atEnd] whileFalse:[
       
    51             Transcript showCr:(s next).
       
    52         ].
       
    53 
       
    54 
       
    55     simple example2:
       
    56 
       
    57         |s token|
       
    58 
       
    59         s := TokenizedStream on:'foo bar baz  3 + 2'.
       
    60         [s atEnd] whileFalse:[
       
    61             token := s next.
       
    62             token == #Identifier ifTrue:[
       
    63                 Transcript showCr:(token , ' name=' , s tokenName).
       
    64             ] ifFalse:[
       
    65                 token == #Integer ifTrue:[
       
    66                     Transcript showCr:(token , ' value=' , s tokenValue printString).
       
    67                 ] ifFalse:[
       
    68                     Transcript showCr:token.
       
    69                 ]
       
    70             ]
       
    71         ].
       
    72 
       
    73 
       
    74     reading expressions:
       
    75 
       
    76         |s num1 num2|
       
    77 
       
    78         s := TokenizedStream on:'
       
    79 3 + 2
       
    80 4 + 6
       
    81 1 + 2
       
    82 '.
       
    83         [s atEnd] whileFalse:[
       
    84             s next == #Integer ifTrue:[
       
    85                 num1 := s tokenValue.
       
    86                 s next == $+ ifTrue:[
       
    87                     s next == #Integer ifTrue:[
       
    88                         num2 := s tokenValue.
       
    89                         Transcript showCr:num1 printString 
       
    90                                           , ' + ' 
       
    91                                           , num2 printString 
       
    92                                           , ' => ' 
       
    93                                           , (num1 + num2) printString.
       
    94                     ]
       
    95                 ]
       
    96             ]
       
    97         ].
       
    98 
       
    99 
       
   100     with eol-comments:
       
   101 
       
   102         |s num1 num2|
       
   103 
       
   104         s := TokenizedStream on:'
       
   105 3 + 2
       
   106 ; this is a comment
       
   107 4 + 6
       
   108 1 + 2
       
   109 '.
       
   110         s eolCommentCharacter:$;.
       
   111 
       
   112         [s atEnd] whileFalse:[
       
   113             s next == #Integer ifTrue:[
       
   114                 num1 := s tokenValue.
       
   115                 s next == $+ ifTrue:[
       
   116                     s next == #Integer ifTrue:[
       
   117                         num2 := s tokenValue.
       
   118                         Transcript showCr:num1 printString 
       
   119                                           , ' + ' 
       
   120                                           , num2 printString 
       
   121                                           , ' => ' 
       
   122                                           , (num1 + num2) printString.
       
   123                     ]
       
   124                 ]
       
   125             ]
       
   126         ].
       
   127 
       
   128 
       
   129     scan /etc/services file:
       
   130 
       
   131         |s t service port protocol|
       
   132 
       
   133         s := TokenizedStream on:'/etc/services' asFilename readStream.
       
   134         s eolCommentCharacter:$#.
       
   135         s typeTable at:($- asciiValue) put:#letter.
       
   136 
       
   137         [s atEnd] whileFalse:[
       
   138             t := s next.
       
   139             t == #Identifier ifTrue:[
       
   140                 service := s tokenName.
       
   141                 t := s next.
       
   142                 t == #Integer ifTrue:[
       
   143                     port := s tokenValue.
       
   144                     s next == $/ ifTrue:[
       
   145                         t := s next.
       
   146                         t == #Identifier ifTrue:[
       
   147                             protocol := s tokenName.
       
   148                             Transcript showCr:(service , ' is ' , protocol , ' port=' , port printString).
       
   149                         ]
       
   150                     ]
       
   151                 ]
       
   152             ].
       
   153             s skipToEol
       
   154         ]
       
   155 "
       
   156 ! !
       
   157 
       
   158 !TokenizedStream class methodsFor:'initialization'!
       
   159 
       
   160 initialize
       
   161     "set up the class-wide default tables, both indexed by ascii code:
       
   162      digits dispatch to #nextInteger and letters to #nextIdentifier;
       
   163      every other code from 2 to 255 has the character itself as its type.
       
   164      To rebuild the tables, evaluate: TokenizedStream initialize"
       
   165 
       
   166     |digitAction letterAction markLetters|
       
   167 
       
   168     DefaultActions := Array new:256.
       
   169     DefaultTypes := Array new:256.
       
   170 
       
   171     "preset: the type of a character is the character itself"
       
   172     2 to:255 do:[:asciiCode |
       
   173         DefaultTypes at:asciiCode put:(Character value:asciiCode).
       
   174     ].
       
   175 
       
   176     digitAction := [:s :char | s nextInteger].
       
   177     ($0 asciiValue) to:($9 asciiValue) do:[:asciiCode |
       
   178         DefaultActions at:asciiCode put:digitAction.
       
   179         DefaultTypes at:asciiCode put:#digit
       
   180     ].
       
   181 
       
   182     letterAction := [:s :char | s nextIdentifier].
       
   183     markLetters := [:first :last |
       
   184         (first asciiValue) to:(last asciiValue) do:[:asciiCode |
       
   185             DefaultActions at:asciiCode put:letterAction.
       
   186             DefaultTypes at:asciiCode put:#letter
       
   187         ]
       
   188     ].
       
   189     markLetters value:$a value:$z.
       
           markLetters value:$A value:$Z.
       
   190 ! !
       
   191 
       
   192 !TokenizedStream class methodsFor:'instance creation'!
       
   193 
       
   194 on:aStream
       
           "create a new tokenizer and let it read from aStream
       
            (the instance-side on: accepts a stream or a string)"
       
       
       
           |tokenizer|
       
       
       
           tokenizer := self basicNew.
       
   195     ^ tokenizer on:aStream
       
   196 ! !
       
   197 
       
   198 !TokenizedStream methodsFor:'accessing'!
       
   199 
       
   200 actionTable
       
           "return the table in which nextToken looks up the action block
       
            for a character; it is indexed by the ascii code of the character"
       
       
       
   201     ^ actions
       
   202 
       
   203     "Created: 1.2.1996 / 17:42:00 / cg"
       
   204 !
       
   205 
       
   206 beginCommentCharacter:aCharacter
       
           "set the character which starts a comment; nextToken calls
       
            skipComment when it finds this character. Initially nil."
       
       
       
   207     beginCommentCharacter := aCharacter
       
   208 
       
   209     "Created: 1.2.1996 / 17:38:01 / cg"
       
   210 !
       
   211 
       
   212 endCommentCharacter:aCharacter
       
           "set the character at which skipComment stops skipping. Initially nil."
       
       
       
   213     endCommentCharacter := aCharacter
       
   214 
       
   215     "Created: 1.2.1996 / 17:38:06 / cg"
       
   216 !
       
   217 
       
   218 eolCommentCharacter:aCharacter
       
           "set the character which starts a comment extending to the end of
       
            the line; nextToken calls skipEolComment when it finds this
       
            character. Initially nil."
       
       
       
   219     eolCommentCharacter := aCharacter
       
   220 
       
   221     "Created: 1.2.1996 / 17:37:51 / cg"
       
   222 !
       
   223 
       
   224 tokenName
       
           "return the name of the last identifier read (set by nextIdentifier)"
       
       
       
   225     ^ tokenName
       
   226 
       
   227     "Created: 1.2.1996 / 17:46:48 / cg"
       
   228 !
       
   229 
       
   230 tokenType
       
           "return the type of the most recently read token, as stored
       
            by the token reading methods"
       
       
       
   231     ^ tokenType
       
   232 
       
   233     "Created: 1.2.1996 / 17:26:24 / cg"
       
   234 !
       
   235 
       
   236 tokenValue
       
           "return the value of the last token; this is set by nextInteger
       
            (the integer read) and by nextString: (the string contents)"
       
       
       
   237     ^ tokenValue
       
   238 
       
   239     "Created: 1.2.1996 / 17:26:30 / cg"
       
   240 !
       
   241 
       
   242 typeTable
       
           "return the table which maps the ascii code of a character to its
       
            type. nextIdentifier accepts characters typed #letter or #digit;
       
            for characters without an action block, nextToken returns the
       
            table entry as the token."
       
       
       
   243     ^ types
       
   244 
       
   245     "Created: 1.2.1996 / 17:41:54 / cg"
       
   246 ! !
       
   247 
       
   248 !TokenizedStream methodsFor:'initialization'!
       
   249 
       
   250 initialize
       
           "reset the scanner state: line counting starts at 1, no comment
       
            characters are defined, the line end is Character cr and line ends
       
            are not reported as tokens."
       
       
       
   251     tokenLineNr := 1.
       
   252     eolCommentCharacter := beginCommentCharacter := endCommentCharacter := nil.
       
   253     eolCharacter := Character cr.
       
   254     eolIsSignificant := false.
       
   255 
       
           "take private copies of the default tables: they are handed out by
       
            actionTable / typeTable for customization, and a change made for
       
            one stream must not alter the class-wide defaults used by all others"
       
   256     actions := DefaultActions copy.
       
   257     types := DefaultTypes copy.
       
   258 
       
   259     "Modified: 1.2.1996 / 17:36:56 / cg"
       
   260 ! !
       
   261 
       
   262 !TokenizedStream methodsFor:'private'!
       
   263 
       
   264 on:aStringOrStream
       
   265     "initialize the receiver and arrange for tokens to be read from
       
   266      aStringOrStream; an argument which is not a stream gets wrapped
       
   267      into a ReadStream"
       
   268 
       
   269     self initialize.
       
   270     source := aStringOrStream isStream
       
   271                 ifTrue:[aStringOrStream]
       
   272                 ifFalse:[ReadStream on:aStringOrStream].
       
   273     "Created: 1.2.1996 / 16:18:34 / cg"
       
   274     "Modified: 1.2.1996 / 16:18:47 / cg"
       
   275 ! !
       
   276 
       
   277 !TokenizedStream methodsFor:'reading'!
       
   278 
       
   279 next
       
           "return the next token; this simply forwards to nextToken"
       
       
       
   280     ^ self nextToken
       
   281 
       
   282     "Created: 1.2.1996 / 17:21:47 / cg"
       
   283 !
       
   284 
       
   285 nextIdentifier
       
   286     "collect characters from the source for as long as their entry in
       
   287      the type table is #letter or #digit. The collected string is stored
       
   288      in tokenName; tokenType is set to #Identifier, which is also returned."
       
   289 
       
   290     |ch charType collected|
       
   291 
       
   292     collected := WriteStream on:(String new).
       
   293     ch := source peek.
       
   294     [ch notNil
       
   295      and:[charType := types at:(ch asciiValue).
       
   296           charType == #letter or:[charType == #digit]]] whileTrue:[
       
   297         collected nextPut:ch.
       
   298         ch := source nextPeek
       
   299     ].
       
   300 
       
   316     tokenType := #Identifier.
       
   317     tokenName := collected contents.
       
   318     ^ tokenType
       
   319 
       
   320     "Created: 1.2.1996 / 16:35:53 / cg"
       
   321     "Modified: 1.2.1996 / 17:51:59 / cg"
       
   322 !
       
   323 
       
   324 nextInteger
       
           "read a decimal integer from the source into tokenValue, remember
       
            the radix in tokenRadix and return the token type #Integer"
       
       
       
           |radix|
       
       
       
           radix := 10.
       
   325     tokenValue := Integer readFrom:source radix:radix.
       
   326     tokenRadix := radix.
       
   327     ^ tokenType := #Integer
       
   329 
       
   330     "Created: 1.2.1996 / 16:37:03 / cg"
       
   331     "Modified: 1.2.1996 / 16:37:28 / cg"
       
   332 !
       
   333 
       
   334 nextString:separator
       
           "read a string which is delimited by the separator character.
       
            The source must be positioned at the opening separator. Inside the
       
            string, a doubled separator stands for a single separator character.
       
            The contents are stored in tokenValue and #String is returned.
       
            If the input ends before the closing separator, an error is
       
            reported and #EOF is returned."
       
       
       
   335     |nextChar buffer inString|
       
   339 
       
   340     buffer := WriteStream on:(String new).
       
   344     source next.                "skip the opening separator"
       
   345     nextChar := source next.
       
   346     inString := true.
       
   347 
       
   348     [inString] whileTrue:[
       
   349         nextChar isNil ifTrue:[
       
   350             self error:'unexpected end-of-input in String'.
       
   351             tokenType := #EOF.
       
   352             ^ tokenType
       
   353         ].
       
               "count lines with the configured line end, as nextToken does"
       
   354         (nextChar == eolCharacter) ifTrue:[
       
   355             tokenLineNr := tokenLineNr + 1
       
   356         ].
       
   357         (nextChar == separator) ifTrue:[
       
   358             (source peek == separator) ifTrue:[
       
                   "doubled separator: drop one, keep the other as contents"
       
   359                 source next
       
   360             ] ifFalse:[
       
   361                 inString := false
       
   362             ]
       
   363         ].
       
   364         inString ifTrue:[
       
   365             buffer nextPut:nextChar.
       
   371             nextChar := source next
       
   372         ]
       
   373     ].
       
   374     tokenValue := buffer contents.
       
   375     tokenType := #String.
       
   376     ^ tokenType
       
   377 
       
   378     "Created: 1.2.1996 / 16:39:48 / cg"
       
   379 !
       
   380 
       
   381 nextToken
       
   382     "return the next token from the source-stream.
       
            Whitespace, line ends (unless eolIsSignificant) and comments are
       
            skipped first. The token is then read by the action block found
       
            in the action table for the current character; without an action
       
            block, the character is consumed and its type table entry is
       
            returned. Returns #EOF at the end of the input and #Error if
       
            neither table has an entry."
       
   383 
       
   384     |skipping actionBlock|
       
   385 
       
           "a pending lookahead character is used up first
       
            (NOTE(review): no method in this file ever sets peekChar)"
       
   386     peekChar notNil ifTrue:[
       
   387         hereChar := peekChar.
       
   388         peekChar := peekChar2.
       
   389         peekChar2 := nil
       
   390     ] ifFalse:[
       
   391         skipping := true.
       
   392         [skipping] whileTrue:[
       
                   "with an outStream, blanks are skipped one by one and
       
                    echoed to it; otherwise the source skips the separators"
       
   393             outStream notNil ifTrue:[
       
   394                 [(hereChar := source peek) == Character space] whileTrue:[
       
   395                     source next.
       
   396                     outStream space. 
       
   397                     outCol := outCol + 1.
       
   398                 ]
       
   399             ] ifFalse:[
       
                       "presumably answers the next character without consuming it - verify"
       
   400                 hereChar := source skipSeparatorsExceptCR.
       
   401             ].
       
   402             hereChar isNil ifTrue:[
       
   403                 tokenType := #EOF.
       
   404                 ^ tokenType
       
   405             ].
       
   406             hereChar == eolCharacter ifTrue:[
       
                       "count and consume the line end; it is only reported
       
                        as a token if eolIsSignificant"
       
   407                 tokenLineNr := tokenLineNr + 1.
       
   408                 source next.
       
   409                 outStream notNil ifTrue:[
       
   410                     outStream cr.
       
   411                     outCol := 1
       
   412                 ].
       
   413                 eolIsSignificant ifTrue:[
       
   414                     tokenType := #EOL.
       
   415                     ^ tokenType
       
   416                 ]
       
   417             ] ifFalse:[
       
   418                 hereChar == beginCommentCharacter ifTrue:[
       
   419                     "start of a comment"
       
   420 
       
   421                     self skipComment.
       
   422                     hereChar := source peek.
       
   423                 ] ifFalse:[
       
   424                     hereChar == eolCommentCharacter ifTrue:[
       
   425                         "start of an eol comment"
       
   426 
       
   427                         self skipEolComment.
       
   428                         hereChar := source peek.
       
   429                     ] ifFalse:[
       
                               "neither line end nor comment: a token starts here"
       
   430                         skipping := false
       
   431                     ]
       
   432                 ]
       
   433             ]
       
   434         ].
       
   435         hereChar isNil ifTrue:[
       
   436             tokenType := #EOF.
       
   437             ^ tokenType
       
   438         ]
       
   439     ].
       
   440     tokenPosition := source position.
       
   441 
       
           "first choice: an action block; it gets the receiver and the
       
            current character and its value is returned as the token"
       
   442     actions notNil ifTrue:[
       
   443         actionBlock := actions at:(hereChar asciiValue).
       
   444         actionBlock notNil ifTrue:[
       
   445             ^ actionBlock value:self value:hereChar
       
   446         ]
       
   447     ].
       
   448 
       
           "no action block: consume the character and return its type table entry"
       
   449     types notNil ifTrue:[
       
   450         source next.
       
   451         tokenType := types at:(hereChar asciiValue).
       
   452         tokenType notNil ifTrue:[
       
   453             ^ tokenType
       
   454         ]
       
   455     ].
       
   456 
       
   457     tokenType := #Error.
       
   458     ^ #Error
       
   459 
       
   460     "Modified: 1.2.1996 / 17:39:20 / cg"
       
   461 !
       
   462 
       
   463 skipComment
       
           "skip a comment. The source must be positioned at the
       
            beginCommentCharacter; everything up to and including the
       
            endCommentCharacter is consumed. Line ends inside the comment are
       
            counted, and the comment text is copied to outStream if there is one.
       
            Stops at the end of the input if the comment is not closed."
       
       
       
   464     source next.
       
   465     hereChar := source peek.
       
   466 
       
   467     [hereChar notNil and:[hereChar ~~ endCommentCharacter]] whileTrue:[
       
   468         hereChar == eolCharacter ifTrue:[
       
   469             tokenLineNr := tokenLineNr + 1.
       
   470         ].
       
   471         outStream notNil ifTrue:[
       
   472             outStream nextPut:hereChar.
       
   473             outCol := outCol + 1
       
   474         ].
       
   475         hereChar := source nextPeek
       
   476     ].
       
   477 
       
           "also consume the closing character; if it were left in the source,
       
            nextToken would return it as a token or - when begin and end
       
            character are the same - take it for the start of another comment"
       
           hereChar notNil ifTrue:[
       
               source next
       
           ].
       
       
       
   478     "Created: 1.2.1996 / 17:35:24 / cg"
       
   479     "Modified: 1.2.1996 / 17:37:21 / cg"
       
   480 !
       
   481 
       
   482 skipEolComment
       
           "skip an end-of-line comment: consume the character which started
       
            it, then skip the rest of the line via skipToEol"
       
       
       
   483     source next.
       
   484     self skipToEol
       
   485 
       
   486     "Created: 1.2.1996 / 17:34:17 / cg"
       
   487     "Modified: 1.2.1996 / 18:06:33 / cg"
       
   488 !
       
   489 
       
   490 skipToEol
       
           "skip the rest of the current line, copying the skipped characters
       
            to outStream if there is one. The eolCharacter itself is left in
       
            the source. The line is therefore not counted here: nextToken
       
            increments tokenLineNr when it consumes the line end, and counting
       
            here as well would count every skipped line twice."
       
       
       
   491     hereChar := source peek.
       
   492 
       
   493     [hereChar notNil and:[hereChar ~~ eolCharacter]] whileTrue:[
       
   494         outStream notNil ifTrue:[
       
   495             outStream nextPut:hereChar.
       
   496             outCol := outCol + 1
       
   497         ].
       
   498         hereChar := source nextPeek.
       
   499     ].
       
   501 
       
   502     "Created: 1.2.1996 / 18:06:09 / cg"
       
   503     "Modified: 1.2.1996 / 18:06:36 / cg"
       
   504 ! !
       
   505 
       
   506 !TokenizedStream methodsFor:'testing'!
       
   507 
       
   508 atEnd
       
           "return true if no more tokens can be read: either the source is
       
            exhausted, or the last token read was #EOF or #Error"
       
       
       
   509     source atEnd ifTrue:[^ true].
       
           ^ (tokenType == #EOF) or:[tokenType == #Error]
       
   510 
       
   511     "Created: 1.2.1996 / 17:21:28 / cg"
       
   512     "Modified: 1.2.1996 / 17:30:25 / cg"
       
   513 ! !
       
   514 
       
   515 !TokenizedStream class methodsFor:'documentation'!
       
   516 
       
   517 version
       
           "return the revision header string of the source file of this class"
       
       
       
   518     ^ '$Header: /cvs/stx/stx/libbasic2/TokenizedStream.st,v 1.1 1996-02-01 17:10:11 cg Exp $'
       
   519 ! !
       
   520 TokenizedStream initialize!