KeywordInContextIndexBuilder.st
changeset 1375 e034d3e027f2
child 2536 8907a20de2dc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/KeywordInContextIndexBuilder.st	Mon Dec 08 17:03:24 2003 +0100
@@ -0,0 +1,321 @@
+"
+ COPYRIGHT (c) 2003 by eXept Software AG
+              All Rights Reserved
+
+ This software is furnished under a license and may be used
+ only in accordance with the terms of that license and with the
+ inclusion of the above copyright notice.   This software may not
+ be provided or otherwise made available to, or used by, any
+ other person.  No title to or ownership of the software is
+ hereby transferred.
+"
+
+"{ Package: 'stx:libbasic2' }"
+
+Object subclass:#KeywordInContextIndexBuilder
+	instanceVariableNames:'keywordToLinesMapping excluded separatorAlgorithm'
+	classVariableNames:''
+	poolDictionaries:''
+	category:'Collections-Support'
+!
+
+!KeywordInContextIndexBuilder class methodsFor:'documentation'!
+
+copyright
+"
+ COPYRIGHT (c) 2003 by eXept Software AG
+              All Rights Reserved
+
+ This software is furnished under a license and may be used
+ only in accordance with the terms of that license and with the
+ inclusion of the above copyright notice.   This software may not
+ be provided or otherwise made available to, or used by, any
+ other person.  No title to or ownership of the software is
+ hereby transferred.
+"
+!
+
+documentation
+"
+    A support class for building a KWIC (Keyword in Context) indices.
+    (for example, to build a KWIC index on html pages or class documentation).
+
+    [author:]
+        Claus Gittinger (cg@alan)
+
+    [instance variables:]
+
+    [class variables:]
+
+    [see also:]
+
+"
+!
+
+examples
+"
+                                                                [exBegin]
+    |kwic|
+
+    kwic := KeywordInContextIndexBuilder new.
+    kwic excluded:#('the' 'and' 'a' 'an').
+
+    kwic addLine:'bla bla bla' reference:1.
+    kwic addLine:'one two three' reference:2.
+    kwic addLine:'a cat and a dog' reference:3.
+    kwic addLine:'the man in the middle' reference:4.
+    kwic addLine:'the man with the dog' reference:5.
+
+    kwic 
+        entriesDo:[:word :left :right :ref |
+            Transcript 
+                show:((left contractTo:20) leftPaddedTo:20);
+                space;
+                show:((word contractTo:10) leftPaddedTo:10);
+                space;
+                show:((right contractTo:20) leftPaddedTo:20);
+                cr    
+        ].
+                                                                [exEnd]
+
+
+  KWIC index over method selector components:
+                                                                [exBegin]
+    |kwic|
+
+    kwic := KeywordInContextIndexBuilder new.
+    Smalltalk allClassesDo:[:eachClass |
+        eachClass instAndClassSelectorsAndMethodsDo:[:sel :mthd |
+            kwic addLine:sel reference:mthd.
+        ]
+    ].
+    kwic 
+                                                                [exEnd]
+
+  KWIC index over method selector components, with word separation:
+                                                                [exBegin]
+    |kwic|
+
+    kwic := KeywordInContextIndexBuilder forMethodSelectorIndex.
+
+    Smalltalk allClassesDo:[:eachClass |
+        eachClass instAndClassSelectorsAndMethodsDo:[:sel :mthd |
+            kwic addLine:sel reference:mthd.
+        ]
+    ].
+    kwic
+                                                                [exEnd]
+
+  KWIC index over method comments:
+                                                                [exBegin]
+    |kwic|
+
+    kwic := KeywordInContextIndexBuilder forMethodComments.
+
+    Smalltalk allClassesDo:[:eachClass |
+        eachClass instAndClassSelectorsAndMethodsDo:[:sel :mthd |
+            |comment|
+
+            (sel == #documentation) ifTrue:[
+                comment := mthd comment.
+                comment notNil ifTrue:[
+                    kwic addLine:comment reference:mthd mclass ignoreCase:true.
+                ]
+            ] ifFalse:[
+                (sel ~~ #examples
+                and:[ sel ~~ #copyright
+                and:[ sel ~~ #version]]) ifTrue:[
+                    comment := mthd comment.
+                    comment notNil ifTrue:[
+                        kwic addLine:comment reference:mthd ignoreCase:true.
+                    ]
+                ]
+            ]
+        ]
+    ].
+    kwic
+                                                                [exEnd]
+
+  KWIC index over class comments:
+                                                                [exBegin]
+    |kwic|
+
+    kwic := KeywordInContextIndexBuilder forMethodComments.
+
+    Smalltalk allClassesDo:[:eachClass |
+        |mthd comment|
+
+        mthd := eachClass theMetaclass compiledMethodAt:#documentation.
+        mthd notNil ifTrue:[
+            comment := mthd comment.
+            comment notNil ifTrue:[
+                kwic addLine:comment reference:eachClass theNonMetaclass ignoreCase:true.
+            ]
+        ]
+    ].
+    kwic
+                                                                [exEnd]
+"
+! !
+
+!KeywordInContextIndexBuilder class methodsFor:'instance creation'!
+
+forMethodComments
+    "return an indexer for method comments"
+
+    |sepChars sep kwic|
+
+    sepChars := '.,;:_ !![]()''"#?<>|' , Character return, Character lf, Character tab.
+
+    sep := [:lines | lines asString asCollectionOfSubstringsSeparatedByAny:sepChars].
+
+    kwic := self new.
+    kwic separatorAlgorithm:sep.
+    kwic excluded:#('the' 'and' 'a' 'an' 'for' 'with' 'no').
+    ^ kwic
+!
+
+forMethodSelectorIndex
+    "return an indexer for method selector components, with word separation at case boundaries"
+
+    |sep kwic sepUCWords|
+
+    sepUCWords := [:word :keyWords| 
+                    |s w c lastC last2C frag|
+
+                    word asLowercase = word ifTrue:[
+                        keyWords add:word.
+                    ] ifFalse:[
+                        s := word readStream.
+                        w := '' writeStream.
+                        [s atEnd] whileFalse:[
+                            c := s next.
+                            (c isUppercase) ifTrue:[
+                                (lastC notNil and:[lastC isUppercase not]) ifTrue:[
+                                    keyWords add:w contents.
+                                    w := '' writeStream.
+                                ].
+                            ] ifFalse:[
+                                (last2C notNil and:[last2C isUppercase and:[lastC isUppercase]]) ifTrue:[
+                                    c isLetter ifTrue:[
+                                        frag := w contents.
+                                        w := '' writeStream.
+                                        w nextPut:(frag last).
+                                        keyWords add:(frag allButLast).
+                                    ] ifFalse:[
+                                       ' frag := w contents.
+                                        w := '' writeStream.
+                                        keyWords add:frag. '.
+                                    ].
+                                ].
+                            ].
+                            w nextPut:c.
+                            last2C := lastC.
+                            lastC := c.
+                        ].
+                    ].
+                  ].
+
+    sep := [:line | 
+                |words keyWords|
+
+                words := line asCollectionOfSubstringsSeparatedByAny:'.,;:_ '.
+                keyWords := OrderedCollection new.
+                words do:[:eachWord | sepUCWords value:eachWord value:keyWords].
+                keyWords
+            ].
+
+    kwic := self new.
+    kwic separatorAlgorithm:sep.
+    ^ kwic
+!
+
+new
+    ^ self basicNew initialize
+! !
+
+!KeywordInContextIndexBuilder methodsFor:'accessing'!
+
+excluded:something
+    excluded := something asSet.
+!
+
+separatorAlgorithm:something
+    separatorAlgorithm := something.
+! !
+
+!KeywordInContextIndexBuilder methodsFor:'building'!
+
+addLine:aLine reference:opaqueReference
+    self addLine:aLine reference:opaqueReference ignoreCase:false
+!
+
+addLine:aLine reference:opaqueReference ignoreCase:ignoreCase
+    (separatorAlgorithm value:aLine) do:[:eachWord |
+        |set word|
+
+        ignoreCase ifTrue:[
+            word := eachWord asLowercase.
+        ] ifFalse:[
+            word := eachWord asLowercase.
+        ].
+        (excluded includes:word) ifFalse:[
+            set := keywordToLinesMapping at:word ifAbsent:nil.
+            set isNil ifTrue:[
+                set := Set new.
+                keywordToLinesMapping at:word put:set
+            ].
+            set add:(aLine -> opaqueReference).
+        ]
+    ].
+! !
+
+!KeywordInContextIndexBuilder methodsFor:'enumerating'!
+
+entriesDo:aBlock
+    keywordToLinesMapping keys asSortedCollection do:[:eachKey |
+        |setOfMatches lcKey|
+
+        setOfMatches := keywordToLinesMapping at:eachKey.
+        lcKey := eachKey asLowercase.
+        setOfMatches do:[:eachAssoc |
+            |text ref lines idx lIdx context left right word prevLine nextLine|
+
+            text := eachAssoc key.
+            ref := eachAssoc value.
+
+            lines := text asCollectionOfLines.
+            idx := lines findFirst:[:line | line asLowercase includesString:lcKey].
+            idx ~~ 0 ifTrue:[
+                context := lines at:idx.
+                idx > 1 ifTrue:[
+                    prevLine := (lines at:idx-1).
+                    context := prevLine , ' ' , context.
+                ].
+                idx < lines size ifTrue:[
+                    nextLine := (lines at:idx+1).
+                    context :=  context , ' ' , nextLine.
+                ].
+                lIdx := context asLowercase findString:lcKey.
+                left := (context copyTo:lIdx - 1) withoutSeparators.
+                right := (context copyFrom:lIdx + lcKey size) withoutSeparators.
+                word := (context copyFrom:lIdx to:lIdx + lcKey size - 1) withoutSeparators.
+                aBlock value:word value:left value:right value:ref.
+            ].
+        ]
+    ]
+! !
+
+!KeywordInContextIndexBuilder methodsFor:'initialization'!
+
+initialize
+    keywordToLinesMapping := Dictionary new.
+    excluded := Set new.
+    separatorAlgorithm := [:line | line asCollectionOfSubstringsSeparatedByAny:' .:,;-'].
+! !
+
+!KeywordInContextIndexBuilder class methodsFor:'documentation'!
+
+version
+    ^ '$Header: /cvs/stx/stx/libbasic2/KeywordInContextIndexBuilder.st,v 1.1 2003-12-08 16:03:24 cg Exp $'
+! !