KeywordInContextIndexBuilder.st
author Claus Gittinger <cg@exept.de>
Mon, 14 Feb 2011 18:39:30 +0100
changeset 2536 8907a20de2dc
parent 1375 e034d3e027f2
child 3184 27271594c7d8
permissions -rw-r--r--
changed: #examples

"
 COPYRIGHT (c) 2003 by eXept Software AG
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
"{ Package: 'stx:libbasic2' }"

Object subclass:#KeywordInContextIndexBuilder
	instanceVariableNames:'keywordToLinesMapping excluded separatorAlgorithm'
	classVariableNames:''
	poolDictionaries:''
	category:'Collections-Support'
!

!KeywordInContextIndexBuilder class methodsFor:'documentation'!

copyright
"
 COPYRIGHT (c) 2003 by eXept Software AG
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
!

documentation
"
    A support class for building a KWIC (Keyword in Context) indices.
    (for example, to build a KWIC index on html pages or class documentation).

    [author:]
        Claus Gittinger (cg@alan)

    [instance variables:]

    [class variables:]

    [see also:]

"
!

examples
"
                                                                [exBegin]
    |kwic|

    kwic := KeywordInContextIndexBuilder new.
    kwic excluded:#('the' 'and' 'a' 'an').

    kwic addLine:'bla bla bla' reference:1.
    kwic addLine:'one two three' reference:2.
    kwic addLine:'a cat and a dog' reference:3.
    kwic addLine:'the man in the middle' reference:4.
    kwic addLine:'the man with the dog' reference:5.

    kwic 
        entriesDo:[:word :left :right :ref |
            Transcript 
                show:((left contractTo:20) leftPaddedTo:20);
                space;
                show:((word contractTo:10) leftPaddedTo:10);
                space;
                show:((right contractTo:20) leftPaddedTo:20);
                space;
                show:'['; show:ref; show:']';
                cr    
        ].
                                                                [exEnd]


  KWIC index over method selector components:
                                                                [exBegin]
    |kwic|

    kwic := KeywordInContextIndexBuilder new.
    Smalltalk allClassesDo:[:eachClass |
        eachClass instAndClassSelectorsAndMethodsDo:[:sel :mthd |
            kwic addLine:sel reference:mthd.
        ]
    ].
    kwic 
                                                                [exEnd]

  KWIC index over method selector components, with word separation:
                                                                [exBegin]
    |kwic|

    kwic := KeywordInContextIndexBuilder forMethodSelectorIndex.

    Smalltalk allClassesDo:[:eachClass |
        eachClass instAndClassSelectorsAndMethodsDo:[:sel :mthd |
            kwic addLine:sel reference:mthd.
        ]
    ].
    kwic
                                                                [exEnd]

  KWIC index over method comments:
                                                                [exBegin]
    |kwic|

    kwic := KeywordInContextIndexBuilder forMethodComments.

    Smalltalk allClassesDo:[:eachClass |
        eachClass instAndClassSelectorsAndMethodsDo:[:sel :mthd |
            |comment|

            (sel == #documentation) ifTrue:[
                comment := mthd comment.
                comment notNil ifTrue:[
                    kwic addLine:comment reference:mthd mclass ignoreCase:true.
                ]
            ] ifFalse:[
                (sel ~~ #examples
                and:[ sel ~~ #copyright
                and:[ sel ~~ #version]]) ifTrue:[
                    comment := mthd comment.
                    comment notNil ifTrue:[
                        kwic addLine:comment reference:mthd ignoreCase:true.
                    ]
                ]
            ]
        ]
    ].
    kwic
                                                                [exEnd]

  KWIC index over class comments:
                                                                [exBegin]
    |kwic|

    kwic := KeywordInContextIndexBuilder forMethodComments.

    Smalltalk allClassesDo:[:eachClass |
        |mthd comment|

        mthd := eachClass theMetaclass compiledMethodAt:#documentation.
        mthd notNil ifTrue:[
            comment := mthd comment.
            comment notNil ifTrue:[
                kwic addLine:comment reference:eachClass theNonMetaclass ignoreCase:true.
            ]
        ]
    ].
    kwic
                                                                [exEnd]
"
! !

!KeywordInContextIndexBuilder class methodsFor:'instance creation'!

forMethodComments
    "return an indexer for method comments"

    |sepChars sep kwic|

    sepChars := '.,;:_ !![]()''"#?<>|' , Character return, Character lf, Character tab.

    sep := [:lines | lines asString asCollectionOfSubstringsSeparatedByAny:sepChars].

    kwic := self new.
    kwic separatorAlgorithm:sep.
    kwic excluded:#('the' 'and' 'a' 'an' 'for' 'with' 'no').
    ^ kwic
!

forMethodSelectorIndex
    "return an indexer for method selector components, with word separation at case boundaries"

    |sep kwic sepUCWords|

    sepUCWords := [:word :keyWords| 
                    |s w c lastC last2C frag|

                    word asLowercase = word ifTrue:[
                        keyWords add:word.
                    ] ifFalse:[
                        s := word readStream.
                        w := '' writeStream.
                        [s atEnd] whileFalse:[
                            c := s next.
                            (c isUppercase) ifTrue:[
                                (lastC notNil and:[lastC isUppercase not]) ifTrue:[
                                    keyWords add:w contents.
                                    w := '' writeStream.
                                ].
                            ] ifFalse:[
                                (last2C notNil and:[last2C isUppercase and:[lastC isUppercase]]) ifTrue:[
                                    c isLetter ifTrue:[
                                        frag := w contents.
                                        w := '' writeStream.
                                        w nextPut:(frag last).
                                        keyWords add:(frag allButLast).
                                    ] ifFalse:[
                                       ' frag := w contents.
                                        w := '' writeStream.
                                        keyWords add:frag. '.
                                    ].
                                ].
                            ].
                            w nextPut:c.
                            last2C := lastC.
                            lastC := c.
                        ].
                    ].
                  ].

    sep := [:line | 
                |words keyWords|

                words := line asCollectionOfSubstringsSeparatedByAny:'.,;:_ '.
                keyWords := OrderedCollection new.
                words do:[:eachWord | sepUCWords value:eachWord value:keyWords].
                keyWords
            ].

    kwic := self new.
    kwic separatorAlgorithm:sep.
    ^ kwic
!

new
    ^ self basicNew initialize
! !

!KeywordInContextIndexBuilder methodsFor:'accessing'!

excluded:something
    excluded := something asSet.
!

separatorAlgorithm:something
    separatorAlgorithm := something.
! !

!KeywordInContextIndexBuilder methodsFor:'building'!

addLine:aLine reference:opaqueReference
    self addLine:aLine reference:opaqueReference ignoreCase:false
!

addLine:aLine reference:opaqueReference ignoreCase:ignoreCase
    (separatorAlgorithm value:aLine) do:[:eachWord |
        |set word|

        ignoreCase ifTrue:[
            word := eachWord asLowercase.
        ] ifFalse:[
            word := eachWord asLowercase.
        ].
        (excluded includes:word) ifFalse:[
            set := keywordToLinesMapping at:word ifAbsent:nil.
            set isNil ifTrue:[
                set := Set new.
                keywordToLinesMapping at:word put:set
            ].
            set add:(aLine -> opaqueReference).
        ]
    ].
! !

!KeywordInContextIndexBuilder methodsFor:'enumerating'!

entriesDo:aBlock
    keywordToLinesMapping keys asSortedCollection do:[:eachKey |
        |setOfMatches lcKey|

        setOfMatches := keywordToLinesMapping at:eachKey.
        lcKey := eachKey asLowercase.
        setOfMatches do:[:eachAssoc |
            |text ref lines idx lIdx context left right word prevLine nextLine|

            text := eachAssoc key.
            ref := eachAssoc value.

            lines := text asCollectionOfLines.
            idx := lines findFirst:[:line | line asLowercase includesString:lcKey].
            idx ~~ 0 ifTrue:[
                context := lines at:idx.
                idx > 1 ifTrue:[
                    prevLine := (lines at:idx-1).
                    context := prevLine , ' ' , context.
                ].
                idx < lines size ifTrue:[
                    nextLine := (lines at:idx+1).
                    context :=  context , ' ' , nextLine.
                ].
                lIdx := context asLowercase findString:lcKey.
                left := (context copyTo:lIdx - 1) withoutSeparators.
                right := (context copyFrom:lIdx + lcKey size) withoutSeparators.
                word := (context copyFrom:lIdx to:lIdx + lcKey size - 1) withoutSeparators.
                aBlock value:word value:left value:right value:ref.
            ].
        ]
    ]
! !

!KeywordInContextIndexBuilder methodsFor:'initialization'!

initialize
    keywordToLinesMapping := Dictionary new.
    excluded := Set new.
    separatorAlgorithm := [:line | line asCollectionOfSubstringsSeparatedByAny:' .:,;-'].
! !

!KeywordInContextIndexBuilder class methodsFor:'documentation'!

version
    ^ '$Header: /cvs/stx/stx/libbasic2/KeywordInContextIndexBuilder.st,v 1.2 2011-02-14 17:39:30 cg Exp $'
!

version_CVS
    ^ '$Header: /cvs/stx/stx/libbasic2/KeywordInContextIndexBuilder.st,v 1.2 2011-02-14 17:39:30 cg Exp $'
! !