parsers/java/PPJavaLexicon.st
changeset 435 3bc08fb90133
child 436 e1c44b571db9
equal deleted inserted replaced
434:840942b96eea 435:3bc08fb90133
       
     1 "{ Package: 'stx:goodies/petitparser/parsers/java' }"
       
     2 
       
     3 "{ NameSpace: Smalltalk }"
       
     4 
       
     5 PPCompositeParser subclass:#PPJavaLexicon
       
     6 	instanceVariableNames:'unicodeEscape rawInputCharacter unicodeMarker hexDigit
       
     7 		lineTerminator unicodeInputCharacter inputElements sub
       
     8 		inputElement whiteSpace comment javaToken keyword literal
       
     9 		separator operator identifier traditionalComment endOfLineComment
       
    10 		commentTail charactersInLine commentTailStar notStar
       
    11 		notStarNotSlash inputCharacter booleanLiteral nullLiteral
       
    12 		identifierChars javaLetter javaLetterOrDigit keywords
       
    13 		floatingPointLiteral integerLiteral characterLiteral
       
    14 		stringLiteral hexIntegerLiteral octalIntegerLiteral
       
    15 		decimalIntegerLiteral decimalNumeral integerTypeSuffix hexNumeral
       
    16 		octalNumeral nonZeroDigit digits hexDigits octalDigits octalDigit
       
    17 		hexadecimalFloatingPointLiteral decimalFloatingPointLiteral
       
    18 		exponentPart floatTypeSuffix exponentIndicator signedInteger sign
       
    19 		hexSignificand binaryExponent binaryExponentIndicator
       
    20 		escapeSequence singleCharacter stringCharacters stringCharacter
       
    21 		octalEscape zeroToThree input operators separators trueToken
       
    22 		falseToken nullToken'
       
    23 	classVariableNames:''
       
    24 	poolDictionaries:''
       
    25 	category:'PetitJava-Core'
       
    26 !
       
    27 
       
    28 PPJavaLexicon comment:'A parser with a definitions for some basic Java gramar parts
       
    29 
       
    30 Grammar rules follow as closely as possible the specification found in "The Java Language Specification Third Edition"
       
    31 
       
    32 URL = '
       
    33 !
       
    34 
       
    35 !PPJavaLexicon class methodsFor:'accessing'!
       
    36 
       
    37 ignoredNames
       
    38 	"Answer a collection of instance-variables that should not be automatically initialized with productions, but that are used internal to the composite parser."
       
    39 
       
    40 	| newArray |	
       
    41 	newArray := Array new: ((self namesToIgnore size) + (super ignoredNames size)).
       
    42 	newArray
       
    43 		replaceFrom: 1
       
    44 		to: self namesToIgnore size
       
    45 		with: self namesToIgnore.
       
    46 	newArray
       
    47 		replaceFrom: (self namesToIgnore size + 1)
       
    48 		to: newArray size
       
    49 		with: super ignoredNames.	
       
    50 	^newArray
       
    51 !
       
    52 
       
    53 namesToIgnore
       
    54 
       
    55 	^#('keywords' 'operators' 'separators')
       
    56 ! !
       
    57 
       
    58 !PPJavaLexicon methodsFor:'accessing'!
       
    59 
       
    60 start
       
    61 	"Default start production."
       
    62 
       
    63 	^ input end
       
    64 ! !
       
    65 
       
    66 !PPJavaLexicon methodsFor:'grammar-comments'!
       
    67 
       
    68 charactersInLine   
       
    69 
       
    70 	^ inputCharacter plus
       
    71 !
       
    72 
       
    73 comment
       
    74 	"traditional -> /*
       
    75 	 endOfLine -> //"
       
    76 	^ traditionalComment / endOfLineComment
       
    77 !
       
    78 
       
    79 commentTail
       
    80 
       
    81 	^ 	('*' asParser , commentTailStar ) /
       
    82 		(notStar , commentTail)
       
    83 !
       
    84 
       
    85 commentTailStar 
       
    86 
       
    87 	^ ('/' asParser ) /
       
    88 	  ('*' asParser , commentTailStar ) /
       
    89 	  (notStarNotSlash , commentTail )
       
    90 !
       
    91 
       
    92 endOfLineComment 
       
    93 
       
    94 	^ '//' asParser , charactersInLine optional
       
    95 !
       
    96 
       
    97 notStar
       
    98 
       
    99 	^  ('*' asParser not , inputCharacter)/lineTerminator
       
   100 !
       
   101 
       
   102 notStarNotSlash  
       
   103 
       
   104 	^ lineTerminator / ((PPPredicateObjectParser anyOf: '*/') not , inputCharacter )
       
   105 !
       
   106 
       
   107 traditionalComment
       
   108 
       
   109 	^ '/*' asParser , commentTail
       
   110 ! !
       
   111 
       
   112 !PPJavaLexicon methodsFor:'grammar-identifiers'!
       
   113 
       
   114 identifier 
       
   115 
       
   116 	^  self asToken: (((keyword not) , (booleanLiteral not) , (nullLiteral not) , identifierChars ))
       
   117 !
       
   118 
       
   119 identifierChars
       
   120 	
       
   121 	^ javaLetter plus , javaLetterOrDigit star
       
   122 !
       
   123 
       
   124 javaLetter
       
   125 
       
   126 	^ (#letter asParser) / (PPPredicateObjectParser anyOf: '_$')
       
   127 !
       
   128 
       
   129 javaLetterOrDigit
       
   130 
       
   131 	^ javaLetter / (#digit asParser)
       
   132 ! !
       
   133 
       
   134 !PPJavaLexicon methodsFor:'grammar-input'!
       
   135 
       
   136 input
       
   137 
       
   138 	^ (inputElements optional) , (sub optional)
       
   139 !
       
   140 
       
   141 inputElement
       
   142 
       
   143 	^ whiteSpace / comment / javaToken
       
   144 !
       
   145 
       
   146 inputElements
       
   147 
       
   148 	^ inputElement plus
       
   149 !
       
   150 
       
   151 javaToken
       
   152 
       
   153 
       
   154 	^ identifier / keyword / literal / separator / operator
       
   155 !
       
   156 
       
   157 sub
       
   158 
       
   159 	^ (Character value: 26) asParser 
       
   160 ! !
       
   161 
       
   162 !PPJavaLexicon methodsFor:'grammar-keywords'!
       
   163 
       
   164 keyword
       
   165 
       
   166 	| keywordParsers |
       
   167 	
       
   168 	keywordParsers := keywords keysSortedSafely 
       
   169 								collect: [:eachKey | keywords at: eachKey ].
       
   170 	^ self asToken: ( (keywordParsers reduce: [ :a :b | a / b ]) )
       
   171 ! !
       
   172 
       
   173 !PPJavaLexicon methodsFor:'grammar-lineTerminators'!
       
   174 
       
   175 inputCharacter 
       
   176 
       
   177 	^(lineTerminator not) , unicodeInputCharacter ==> #second
       
   178 !
       
   179 
       
   180 lineTerminator
       
   181 
       
   182 	^ (Character lf asParser) / (Character cr asParser , (Character lf asParser ) optional )
       
   183 ! !
       
   184 
       
   185 !PPJavaLexicon methodsFor:'grammar-literals'!
       
   186 
       
   187 literal
       
   188 	"a literal must be a single token. Whitespaces are not allowed inside the literal"
       
   189 	
       
   190 	^ nullLiteral / booleanLiteral / floatingPointLiteral / integerLiteral / characterLiteral / stringLiteral
       
   191 ! !
       
   192 
       
   193 !PPJavaLexicon methodsFor:'grammar-literals-boolean'!
       
   194 
       
   195 booleanLiteral 
       
   196 
       
   197  ^ trueToken / falseToken
       
   198 !
       
   199 
       
   200 falseToken
       
   201 	^ ('false' asParser , #word asParser not) javaToken
       
   202 !
       
   203 
       
   204 nullToken
       
   205 	^ ('null' asParser , #word asParser not) javaToken
       
   206 !
       
   207 
       
   208 trueToken
       
   209 	^ ('true' asParser , #word asParser not) javaToken
       
   210 ! !
       
   211 
       
   212 !PPJavaLexicon methodsFor:'grammar-literals-character'!
       
   213 
       
   214 characterLiteral 
       
   215 
       
   216  ^ ($' asParser , ( escapeSequence / singleCharacter ), $' asParser) javaToken
       
   217 !
       
   218 
       
   219 singleCharacter 	
       
   220 
       
   221 	^( PPPredicateObjectParser anyOf: '''\') not , inputCharacter ==> #second
       
   222 ! !
       
   223 
       
   224 !PPJavaLexicon methodsFor:'grammar-literals-escape'!
       
   225 
       
   226 escapeSequence 
       
   227 
       
   228 	^ ($\ asParser , (PPPredicateObjectParser anyOf: 'btnfr""''\' ) ) /
       
   229 	   octalEscape 
       
   230 !
       
   231 
       
   232 octalEscape
       
   233 
       
   234 	^ $\ asParser , ( (zeroToThree , octalDigit , octalDigit) / (octalDigit , octalDigit optional) )
       
   235 !
       
   236 
       
   237 zeroToThree
       
   238 
       
   239 	^PPPredicateObjectParser anyOf: '0123'
       
   240 ! !
       
   241 
       
   242 !PPJavaLexicon methodsFor:'grammar-literals-floating'!
       
   243 
       
   244 binaryExponent
       
   245 
       
   246  ^ binaryExponentIndicator , signedInteger
       
   247 !
       
   248 
       
   249 binaryExponentIndicator
       
   250 
       
   251   ^ PPPredicateObjectParser anyOf: 'pP'
       
   252 !
       
   253 
       
   254 decimalFloatingPointLiteral
       
   255 
       
   256 	|dot|
       
   257 	dot := $. asParser.
       
   258 
       
   259  ^ ( ( (dot , digits) 
       
   260         / 
       
   261         (digits , dot , digits optional)) , 
       
   262 			exponentPart optional , floatTypeSuffix optional ) 
       
   263   	/ 
       
   264   	(digits , 
       
   265 		( (exponentPart , floatTypeSuffix optional) 
       
   266 		  /
       
   267 		  (exponentPart optional , floatTypeSuffix) ))
       
   268 !
       
   269 
       
   270 exponentIndicator
       
   271 
       
   272   ^ PPPredicateObjectParser anyOf: 'eE'
       
   273 !
       
   274 
       
   275 exponentPart
       
   276 
       
   277  ^ exponentIndicator , signedInteger
       
   278 !
       
   279 
       
   280 floatTypeSuffix
       
   281 
       
   282 	^ PPPredicateObjectParser anyOf: 'fFdD'
       
   283 !
       
   284 
       
   285 floatingPointLiteral
       
   286 
       
   287   ^ (hexadecimalFloatingPointLiteral / decimalFloatingPointLiteral) javaToken
       
   288 !
       
   289 
       
   290 hexSignificand 
       
   291 	|dot|
       
   292 	dot := $. asParser.
       
   293 
       
   294  ^  (hexNumeral , dot optional) /
       
   295     ($0 asParser , (PPPredicateObjectParser anyOf: 'xX') , hexDigits optional , dot , hexDigits )
       
   296 !
       
   297 
       
   298 hexadecimalFloatingPointLiteral
       
   299 
       
   300  ^ hexSignificand , binaryExponent , floatTypeSuffix optional
       
   301 !
       
   302 
       
   303 sign
       
   304 
       
   305   ^PPPredicateObjectParser anyOf: '-+'
       
   306 !
       
   307 
       
   308 signedInteger
       
   309 
       
   310   ^ sign optional , digits
       
   311 ! !
       
   312 
       
   313 !PPJavaLexicon methodsFor:'grammar-literals-integer'!
       
   314 
       
   315 decimalIntegerLiteral
       
   316 
       
   317  ^ decimalNumeral , (integerTypeSuffix optional)
       
   318 !
       
   319 
       
   320 decimalNumeral 
       
   321 
       
   322 	^($0 asParser) / (nonZeroDigit , digits optional) 
       
   323 !
       
   324 
       
   325 digits 
       
   326 	"digit is already defined, no need to redefine it"
       
   327 	^#digit asParser plus
       
   328 !
       
   329 
       
   330 hexDigits 
       
   331 
       
   332 	^hexDigit plus
       
   333 !
       
   334 
       
   335 hexIntegerLiteral 
       
   336 
       
   337   ^ hexNumeral , (integerTypeSuffix optional)
       
   338 !
       
   339 
       
   340 hexNumeral 
       
   341 
       
   342 	^$0 asParser, (PPPredicateObjectParser anyOf: 'xX' ), hexDigits
       
   343 !
       
   344 
       
   345 integerLiteral
       
   346 
       
   347   ^ (hexIntegerLiteral / octalIntegerLiteral / decimalIntegerLiteral) javaToken
       
   348 !
       
   349 
       
   350 integerTypeSuffix
       
   351 
       
   352 	^ PPPredicateObjectParser anyOf: 'lL'
       
   353 !
       
   354 
       
   355 nonZeroDigit 
       
   356 
       
   357 	^PPPredicateObjectParser anyOf: '123456789'.
       
   358 !
       
   359 
       
   360 octalDigit 
       
   361 
       
   362 	^PPPredicateObjectParser anyOf: '01234567'
       
   363 !
       
   364 
       
   365 octalDigits
       
   366 
       
   367 	^ octalDigit plus
       
   368 !
       
   369 
       
   370 octalIntegerLiteral 
       
   371 
       
   372  ^ octalNumeral , (integerTypeSuffix optional)
       
   373 !
       
   374 
       
   375 octalNumeral 
       
   376 
       
   377 	^($0 asParser) , octalDigits
       
   378 ! !
       
   379 
       
   380 !PPJavaLexicon methodsFor:'grammar-literals-null'!
       
   381 
       
   382 nullLiteral 
       
   383 
       
   384  ^ nullToken
       
   385 ! !
       
   386 
       
   387 !PPJavaLexicon methodsFor:'grammar-literals-string'!
       
   388 
       
   389 stringCharacter
       
   390 		
       
   391 	^ ( ( PPPredicateObjectParser anyOf: '"\') not , inputCharacter ==> #second ) /
       
   392 	   escapeSequence 
       
   393 !
       
   394 
       
   395 stringCharacters
       
   396 
       
   397 	^ stringCharacter plus
       
   398 !
       
   399 
       
   400 stringLiteral 
       
   401 
       
   402  ^ ($" asParser , stringCharacters optional , $" asParser) javaToken
       
   403 ! !
       
   404 
       
   405 !PPJavaLexicon methodsFor:'grammar-operators'!
       
   406 
       
   407 operator
       
   408 	| operatorParsers |
       
   409 	
       
   410 	operatorParsers := operators keysSortedSafely 
       
   411 								collect: [:eachKey | operators at: eachKey ].
       
   412 						
       
   413 	^self asToken:  (operatorParsers reduce: [ :a :b | a / b ]) 
       
   414 ! !
       
   415 
       
   416 !PPJavaLexicon methodsFor:'grammar-separators'!
       
   417 
       
   418 separator	
       
   419 	^self asToken: (PPPredicateObjectParser anyOf: '(){}[];,.' )
       
   420 ! !
       
   421 
       
   422 !PPJavaLexicon methodsFor:'grammar-unicode-escapes'!
       
   423 
       
   424 hexDigit 
       
   425 
       
   426 	^#hex asParser
       
   427 !
       
   428 
       
   429 rawInputCharacter
       
   430 
       
   431 	^#any asParser
       
   432 !
       
   433 
       
   434 unicodeEscape
       
   435 
       
   436 	^ $\ asParser , unicodeMarker , hexDigit , hexDigit , hexDigit , hexDigit
       
   437 !
       
   438 
       
   439 unicodeInputCharacter
       
   440 	 ^ unicodeEscape / rawInputCharacter
       
   441 !
       
   442 
       
   443 unicodeMarker
       
   444 
       
   445 	^$u asParser plus
       
   446 ! !
       
   447 
       
   448 !PPJavaLexicon methodsFor:'grammar-whiteSpace'!
       
   449 
       
   450 whiteSpace
       
   451 
       
   452 	^ (Character space asParser ) /
       
   453 	  (Character tab asParser ) /
       
   454 	  ((Character value: 12) asParser ) /
       
   455 		lineTerminator 
       
   456 ! !
       
   457 
       
   458 !PPJavaLexicon methodsFor:'initialization'!
       
   459 
       
   460 initialize
       
   461 
       
   462 	super initialize.
       
   463 	
       
   464 	self initializeKeywords.
       
   465 	self initializeOperators.
       
   466 	self initializeSeparators.
       
   467 !
       
   468 
       
   469 initializeKeywords
       
   470 
       
   471 	| values |
       
   472 	keywords := Dictionary new.
       
   473 	values := #('abstract' 'assert' 'boolean' 'break' 'byte' 'case'  'catch' 'char' 'class' 'const'
       
   474 	   'continue' 'default' 'do' 'double' 'else' 'enum' 'extends' 'final'  'finally' 'float'
       
   475 	   'for' 'if' 'goto' 'implements' 'import' 'instanceof' 'int' 'interface' 'long' 'native'
       
   476 	   'new' 'package' 'private' 'protected' 'public' 'return' 'short' 'static' 'strictfp' 'super'
       
   477 	   'switch' 'synchronized' 'this' 'throw' 'throws' 'transient' 'try' 'void' 'volatile' 'while').
       
   478 	
       
   479 	values do: [:eachKeyword |
       
   480 		keywords at: eachKeyword 
       
   481 			put: (PPUnresolvedParser named: ('keyword', eachKeyword first asUppercase asString , eachKeyword allButFirst))		
       
   482 		].
       
   483 	
       
   484 	keywords keysAndValuesDo:  [:key :value |
       
   485 		(keywords at: key) def: (key asParser ,  #word asParser not)]
       
   486 !
       
   487 
       
   488 initializeOperators
       
   489 
       
   490 	| values |
       
   491 	operators := Dictionary new.
       
   492 	values := #(	'>>>=' '>>>' '>>=' '>>' '>=' '>'	'<<=' '<<' '<=' '<'	'++' '+=' '+'	'--' '-=' '-'	'&&' '&=' '&'
       
   493 					'||' '|=' '|'	'*=' '*'	'%=' '%'	'/=' '/'	'^=' '^'	'!!=' '!!'	'==' '='	'~'	'?'	':'	'@' ).
       
   494 	" @ ? perhaps for annotation but not in the doc "
       
   495 	values do: [:eachOperator |
       
   496 		operators at: eachOperator 
       
   497 			put: (PPUnresolvedParser named: ('operator', eachOperator asString))		
       
   498 		].
       
   499 	
       
   500 	operators  keysAndValuesDo:  [:key :value |
       
   501 		(operators at: key) def: (key asParser)]
       
   502 !
       
   503 
       
   504 initializeSeparators
       
   505 
       
   506 	| values |
       
   507 	separators := Dictionary new.
       
   508 	values := #( '(' ')' '{' '}' '[' ']' ';' ',' '.' ).
       
   509 	
       
   510 	values do: [:eachSeparator |
       
   511 		separators at: eachSeparator 
       
   512 			put: (PPUnresolvedParser named: ('separator', eachSeparator asString))		
       
   513 		].
       
   514 	
       
   515 	separators  keysAndValuesDo:  [:key :value |
       
   516 		(separators at: key) def: (key asParser)]
       
   517 ! !
       
   518 
       
   519 !PPJavaLexicon methodsFor:'utility'!
       
   520 
       
   521 asToken: aParser
       
   522 
       
   523 	^aParser javaToken
       
   524 !
       
   525 
       
   526 emptySquaredParenthesis
       
   527 
       
   528 	^ self asToken: (((self tokenFor: '['), (self tokenFor: ']')))
       
   529 !
       
   530 
       
   531 tokenFor: aString
       
   532 
       
   533 	^self asToken: (keywords at: aString 
       
   534 						ifAbsent: [separators at: aString 
       
   535 							ifAbsent: [operators at: aString] ])
       
   536 ! !
       
   537