1375
|
1 |
"
|
|
2 |
COPYRIGHT (c) 2003 by eXept Software AG
|
|
3 |
All Rights Reserved
|
|
4 |
|
|
5 |
This software is furnished under a license and may be used
|
|
6 |
only in accordance with the terms of that license and with the
|
|
7 |
inclusion of the above copyright notice. This software may not
|
|
8 |
be provided or otherwise made available to, or used by, any
|
|
9 |
other person. No title to or ownership of the software is
|
|
10 |
hereby transferred.
|
|
11 |
"
|
|
12 |
"{ Package: 'stx:libbasic2' }"
|
|
13 |
|
4108
|
14 |
"{ NameSpace: Smalltalk }"
|
|
15 |
|
1375
|
16 |
"Class definition: an Object subclass holding the keyword->entries dictionary,
 the exclusion set, the splitter / unquoter / filter blocks and the optional match sort block."
Object subclass:#KeywordInContextIndexBuilder
    instanceVariableNames:'keywordToLinesMapping excluded separatorAlgorithm
        unquoteAlgorithm exclusionFilter matchSorter'
    classVariableNames:''
    poolDictionaries:''
    category:'Collections-Support'
|
|
22 |
!
|
|
23 |
|
|
24 |
!KeywordInContextIndexBuilder class methodsFor:'documentation'!
|
|
25 |
|
|
26 |
copyright
|
|
27 |
"
|
|
28 |
COPYRIGHT (c) 2003 by eXept Software AG
|
|
29 |
All Rights Reserved
|
|
30 |
|
|
31 |
This software is furnished under a license and may be used
|
|
32 |
only in accordance with the terms of that license and with the
|
|
33 |
inclusion of the above copyright notice. This software may not
|
|
34 |
be provided or otherwise made available to, or used by, any
|
|
35 |
other person. No title to or ownership of the software is
|
|
36 |
hereby transferred.
|
|
37 |
"
|
|
38 |
!
|
|
39 |
|
|
40 |
documentation
"
    A support class for building KWIC (Keyword in Context) or KWOC (Keyword out of Context) indexes
    (for example, to build such indexes on html pages or class documentation).

    To generate a kwic, add each line together with a reference (a page number, or whatever),
    using #addLine:reference: or #addLine:reference:ignoreCase:.
    Then, when finished, enumerate the kwic with #entriesDo: and print the entries as kwic or kwoc.

    To ignore fill words (such as 'and', 'the', 'in', etc.),
    define those with the #excluded: message.
    More complicated patterns can be rejected by a block given to #exclusionFilter:.

    The keyword handling is configurable by providing actions/lists for:
        separatorAlgorithm      a block which separates lines into individual words;
                                gets a line (and optionally the keyword dictionary);
                                delivers a collection of words

        excluded                a collection of words which are to be ignored

        exclusionFilter         an optional block, asked after the excluded list;
                                gets the word, delivers true if the word is to be ignored

        unquoteAlgorithm        a block to remove quotes around words;
                                gets the word as argument, delivers the unquoted word

        keywordMappingAlgorithm (given as argument to #remapKeywordsWith:, after the lines have been added)
                                maps keywords; for example, can be used to map 'startsWith'
                                to 'start', so they appear in the same section.
                                Gets the word and the set-of-all-words as arguments,
                                delivers the key into which the word's entries should be placed

        matchSorter             an optional sort block which determines the order in which
                                the matches of each keyword are enumerated
                                (the keywords themselves are always enumerated in sorted order)

    [author:]
        Claus Gittinger (cg@alan)

    [examples:]
        see examples method

    [see also:]
        https://en.wikipedia.org/wiki/Key_Word_in_Context    (english)
        https://de.wikipedia.org/wiki/Permutiertes_Register  (german)
"
|
|
80 |
!
|
|
81 |
|
|
82 |
examples
"
  building a kwic; print as kwic and kwoc
                                                                [exBegin]
    |kwic|

    kwic := KeywordInContextIndexBuilder new.
    kwic excluded:#('the' 'and' 'a' 'an' 'in').

    kwic addLine:'bla bla bla' reference:1.
    kwic addLine:'foo, bar. baz' reference:2.
    kwic addLine:'one two three' reference:3.
    kwic addLine:'a cat and a dog' reference:4.
    kwic addLine:'the man in the middle' reference:5.
    kwic addLine:'the man with the dog' reference:6.

    Transcript showCR:'Printed as KWIC:'.
    kwic
        entriesDo:[:word :left :right :ref |
            Transcript
                show:((left contractTo:20) leftPaddedTo:20);
                space;
                show:((word contractTo:10) leftPaddedTo:10) allBold;
                space;
                show:((right contractTo:20) leftPaddedTo:20);
                space;
                show:'['; show:ref; show:']';
                cr
        ].

    Transcript cr.
    Transcript showCR:'Printed as KWOC:'.
    kwic
        entriesDo:[:word :left :right :ref :fullText :context |
            Transcript
                show:((word contractTo:10) paddedTo:10) allBold;
                space;
                show:((context contractTo:60) paddedTo:60);
                space;
                show:'['; show:ref; show:']';
                cr
        ].
                                                                [exEnd]


  KWIC index over method selector components; build a little browser window:
                                                                [exBegin]
    |kwic v s c list refs|

    kwic := KeywordInContextIndexBuilder new.
    Smalltalk allClassesDo:[:eachClass |
        eachClass instAndClassSelectorsAndMethodsDo:[:sel :mthd |
            kwic addLine:sel reference:mthd.
        ]
    ].

    v := StandardSystemView new.
    v addComponent:(s := HVScrollableView for:SelectionInListView).
    s origin:0.0@0.0 corner:1.0@0.5.
    v addComponent:(c := HVScrollableView for:CodeView).
    c origin:0.0@0.5 corner:1.0@1.0.

    refs := OrderedCollection new.
    list := OrderedCollection new.
    kwic
        entriesDo:[:word :left :right :ref |
            list add:(word,' ',left,' ',word allBold,' ',right,' (',ref mclass name,')').
            refs add:ref].
    s list:list.
    s action:[:lNr | c contents:(refs at:lNr) source].
    v open.
                                                                [exEnd]

  KWIC index over method selector components, with word separation
  (the last expression answers the kwic itself):
                                                                [exBegin]
    |kwic|

    kwic := KeywordInContextIndexBuilder forMethodSelectorIndex.

    Smalltalk allClassesDo:[:eachClass |
        eachClass instAndClassSelectorsAndMethodsDo:[:sel :mthd |
            kwic addLine:sel reference:mthd.
        ]
    ].
    kwic
                                                                [exEnd]

  KWIC index over method comments
  (documentation methods are entered with their class as reference;
   the examples, copyright and version methods are skipped):
                                                                [exBegin]
    |kwic|

    kwic := KeywordInContextIndexBuilder forMethodComments.

    Smalltalk allClassesDo:[:eachClass |
        eachClass instAndClassSelectorsAndMethodsDo:[:sel :mthd |
            |comment|

            (sel == #documentation) ifTrue:[
                comment := mthd comment.
                comment notNil ifTrue:[
                    kwic addLine:comment reference:mthd mclass ignoreCase:true.
                ]
            ] ifFalse:[
                (sel ~~ #examples
                 and:[ sel ~~ #copyright
                 and:[ sel ~~ #version]]) ifTrue:[
                    comment := mthd comment.
                    comment notNil ifTrue:[
                        kwic addLine:comment reference:mthd ignoreCase:true.
                    ]
                ]
            ]
        ]
    ].
    kwic.
                                                                [exEnd]

  KWIC index over class comments:
                                                                [exBegin]
    |kwic|

    kwic := KeywordInContextIndexBuilder forMethodComments.

    Smalltalk allClassesDo:[:eachClass |
        |mthd comment|

        mthd := eachClass theMetaclass compiledMethodAt:#documentation.
        mthd notNil ifTrue:[
            comment := mthd comment.
            comment notNil ifTrue:[
                kwic addLine:comment reference:eachClass theNonMetaclass ignoreCase:true.
            ]
        ]
    ].
    kwic
                                                                [exEnd]
"
|
|
219 |
! !
|
|
220 |
|
|
221 |
!KeywordInContextIndexBuilder class methodsFor:'instance creation'!
|
|
222 |
|
|
223 |
forMethodComments
    "return an indexer for method comments.
     The returned builder splits its input (converted via asString) at punctuation,
     brackets, quotes and whitespace (including return, linefeed and tab),
     and ignores a small set of english fill words."

    |separators|

    separators := '.,;:_ !![]()''"#?<>|' , Character return, Character lf, Character tab.

    ^ self new
        separatorAlgorithm:[:lines | lines asString asCollectionOfSubstringsSeparatedByAny:separators];
        excluded:#('the' 'and' 'a' 'an' 'for' 'with' 'no');
        yourself
|
|
236 |
!
|
|
237 |
|
|
238 |
forMethodSelectorIndex
    "return an indexer for method selector components, with word separation at case boundaries.
     A line is first split at any of the characters '.,;:_ ' ; each resulting word is then
     split further where the letter case changes:
        - an all-lowercase word is taken as is
        - a lowercase-to-uppercase transition starts a new fragment ('startsWith' -> 'starts' 'With')
        - after a run of uppercase characters followed by a lowercase letter, the last uppercase
          character starts the new fragment ('HTMLParser' -> 'HTML' 'Parser')
     The fragment which remains when the end of the word is reached is added as well
     (it was previously dropped, so the last component of every mixed-case selector was missing)."

    |sep kwic sepUCWords|

    sepUCWords := [:word :keyWords|
        |s w c lastC last2C frag|

        word asLowercase = word ifTrue:[
            "/ no uppercase character at all - nothing to split
            keyWords add:word.
        ] ifFalse:[
            s := word readStream.
            w := '' writeStream.
            [s atEnd] whileFalse:[
                c := s next.
                (c isUppercase) ifTrue:[
                    "/ lower -> upper transition: the collected fragment is complete
                    (lastC notNil and:[lastC isUppercase not]) ifTrue:[
                        keyWords add:w contents.
                        w := '' writeStream.
                    ].
                ] ifFalse:[
                    "/ two uppercase characters followed by a lowercase letter:
                    "/ the second uppercase character belongs to the new fragment.
                    "/ (non-letters, such as digits, do not end the uppercase run)
                    (last2C notNil and:[last2C isUppercase and:[lastC isUppercase]]) ifTrue:[
                        c isLetter ifTrue:[
                            frag := w contents.
                            w := '' writeStream.
                            w nextPut:(frag last).
                            keyWords add:(frag allButLast).
                        ].
                    ].
                ].
                w nextPut:c.
                last2C := lastC.
                lastC := c.
            ].
            "/ flush the fragment collected after the last case boundary
            frag := w contents.
            frag notEmpty ifTrue:[
                keyWords add:frag.
            ].
        ].
    ].

    sep := [:line |
        |words keyWords|

        words := line asCollectionOfSubstringsSeparatedByAny:'.,;:_ '.
        keyWords := OrderedCollection new.
        words do:[:eachWord | sepUCWords value:eachWord value:keyWords].
        keyWords
    ].

    kwic := self new.
    kwic separatorAlgorithm:sep.
    ^ kwic
|
|
291 |
!
|
|
292 |
|
|
293 |
new
    "return a new instance, after sending it #initialize"

    |newBuilder|

    newBuilder := self basicNew.
    ^ newBuilder initialize
|
|
295 |
! !
|
|
296 |
|
|
297 |
!KeywordInContextIndexBuilder methodsFor:'accessing'!
|
|
298 |
|
4128
|
299 |
excluded:aListOfExcludedWords
    "set the collection of words which are not to be entered into the index
     (typically fill words, such as 'and', 'the', 'in', etc.).
     The argument is converted to a Set."

    excluded := aListOfExcludedWords asSet.
|
1375
|
304 |
!
|
|
305 |
|
4130
|
306 |
exclusionFilter:aBlock
    "set an additional (optional) filter block, to exclude more complicated patterns.
     When a line is added, the block is asked after the word has passed the exclusion list;
     it gets the word as argument and should return true, if the word is to be excluded.
     A nil filter disables this check."

    exclusionFilter := aBlock.
|
|
312 |
!
|
|
313 |
|
4132
|
314 |
matchSorter:aSortBlock
    "set the (optional) sort block for the matches of a keyword.
     If set, the entries found for each keyword are enumerated in that sort order;
     the elements compared by the block are associations (line -> reference)."

    matchSorter := aSortBlock.
|
|
318 |
!
|
|
319 |
|
4128
|
320 |
separatorAlgorithm:aBlock
    "set the block which splits a given line into a collection of words.
     When a line is added, the block is evaluated with the line
     (and, if it takes a second argument, the keyword dictionary).
     The default, as set up by #initialize, splits at punctuation and whitespace."

    separatorAlgorithm := aBlock.
|
4130
|
326 |
!
|
|
327 |
|
|
328 |
unquoteAlgorithm:aBlock
    "set the block which removes quotes around a word.
     It gets the word as argument and delivers the unquoted word.
     The default, as set up by #initialize, unquotes double and single quotes."

    unquoteAlgorithm := aBlock.
|
1375
|
334 |
! !
|
|
335 |
|
|
336 |
!KeywordInContextIndexBuilder methodsFor:'building'!
|
|
337 |
|
|
338 |
addLine:aLine reference:opaqueReference
    "enter a text line into the kwic, ignoring case
     (same as #addLine:reference:ignoreCase: with true as last argument).
     The line is split into words; the reference argument is stored
     as 'value' of the generated entries and can be any object."

    self
        addLine:aLine
        reference:opaqueReference
        ignoreCase:true
|
1375
|
344 |
!
|
|
345 |
|
|
346 |
addLine:aLine reference:opaqueReference ignoreCase:ignoreCase
    "enter a line into the kwic.
     The line is split into words by the separatorAlgorithm; every word is unquoted
     and (if ignoreCase is true) converted to lowercase.
     For each remaining word which is neither in the excluded list (checked both before
     and after unquoting/lowercasing) nor rejected by the exclusionFilter,
     the association (aLine -> opaqueReference) is remembered under that word.
     The reference argument is stored as 'value' of the generated entries;
     it can be any object."

    |candidateWords|

    candidateWords := separatorAlgorithm value:aLine optionalArgument:keywordToLinesMapping.
    candidateWords do:[:rawWord |
        |keyword|

        (excluded includes:rawWord) ifFalse:[
            keyword := unquoteAlgorithm value:rawWord.
            ignoreCase ifTrue:[
                keyword := keyword asLowercase.
            ].
            ((excluded includes:keyword) not
             and:[ exclusionFilter isNil or:[ (exclusionFilter value:keyword) not ] ]) ifTrue:[
                (keywordToLinesMapping at:keyword ifAbsentPut:[Set new])
                    add:(aLine -> opaqueReference).
            ]
        ]
    ].
|
4188
|
369 |
!
|
|
370 |
|
|
371 |
remapKeywordsWith:keywordMappingAlgorithm
    "apply an additional mapper to the keywords (after the kwic has been constructed).
     This can map multiple different words to the same keyword.
     The mapper is evaluated for each known keyword; it is given the word and
     (optionally) the collection of keywords known when the remapping started, and
     delivers the keyword under which the word's entries are to be placed.
     If that differs from the word, the entries are moved and the old keyword is removed.
     For example, the mapper may figure out that a word with a long common prefix is already
     in the list ('starts' vs. 'startWith') and decide to bring both into the same bucket."

    |originalKeys|

    "/ iterate over a copy, because the dictionary is modified inside the loop
    originalKeys := keywordToLinesMapping keys copy.
    originalKeys do:[:eachKeyword |
        |targetKeyword sourceEntries|

        targetKeyword := keywordMappingAlgorithm value:eachKeyword optionalArgument:originalKeys.
        targetKeyword = eachKeyword ifFalse:[
            "/ may already have been moved away by an earlier remapping step
            sourceEntries := keywordToLinesMapping at:eachKeyword ifAbsent:[nil].
            sourceEntries notNil ifTrue:[
                (keywordToLinesMapping at:targetKeyword ifAbsentPut:[Set new])
                    addAll:sourceEntries.
                keywordToLinesMapping removeKey:eachKeyword.
            ]
        ]
    ].
|
1375
|
397 |
! !
|
|
398 |
|
|
399 |
!KeywordInContextIndexBuilder methodsFor:'enumerating'!
|
|
400 |
|
4128
|
401 |
entriesDo:aFourToSixArgBlock
    "enumerate the index: evaluate the argument for each entry,
     keyword by keyword in sorted keyword order; the entries of a keyword are
     enumerated in matchSorter order, if a matchSorter has been defined.
     A 4-arg block is called with:
        the kwic-word (as found in the text),
        the text to the left of it,
        the text to the right of it,
        and the reference.
     A 5-arg block gets the original (full) text as additional argument.
     A 6-arg block gets the original text and the context as additional arguments.
     The context is the first line of the text which contains the keyword (compared
     case-insensitive), together with its preceding and following line, if present.
     Entries whose text does not contain the keyword are skipped."

    |wantsOnlyFourArgs|

    wantsOnlyFourArgs := aFourToSixArgBlock numArgs == 4.

    keywordToLinesMapping keys asSortedCollection do:[:keyword |
        |matches keywordLC|

        matches := keywordToLinesMapping at:keyword.
        matchSorter notNil ifTrue:[
            matches := matches asSortedCollection:matchSorter
        ].
        keywordLC := keyword asLowercase.

        matches do:[:entry |
            |fullText reference textLines lineNr context hitStart hitEnd leftText rightText hitWord|

            fullText := entry key.
            reference := entry value.

            textLines := fullText asCollectionOfLines.
            lineNr := textLines findFirst:[:l | l asLowercase includesString:keywordLC].
            lineNr == 0 ifFalse:[
                "/ the context is the matching line plus its neighbours
                context := textLines at:lineNr.
                lineNr > 1 ifTrue:[
                    context := (textLines at:lineNr - 1) , ' ' , context.
                ].
                lineNr < textLines size ifTrue:[
                    context := context , ' ' , (textLines at:lineNr + 1).
                ].

                "/ cut the context into left part, matched word and right part
                hitStart := context asLowercase findString:keywordLC.
                hitEnd := hitStart + keywordLC size - 1.
                leftText := (context copyTo:hitStart - 1) withoutSeparators.
                rightText := (context copyFrom:hitEnd + 1) withoutSeparators.
                hitWord := (context copyFrom:hitStart to:hitEnd) withoutSeparators.

                wantsOnlyFourArgs ifTrue:[
                    aFourToSixArgBlock value:hitWord value:leftText value:rightText value:reference.
                ] ifFalse:[
                    aFourToSixArgBlock
                        value:hitWord
                        optionalArgument:leftText
                        and:rightText
                        and:reference
                        and:fullText
                        and:context
                ].
            ].
        ]
    ]
|
|
455 |
! !
|
|
456 |
|
|
457 |
!KeywordInContextIndexBuilder methodsFor:'initialization'!
|
|
458 |
|
|
459 |
initialize
    "set up an empty index with the default configuration:
     nothing excluded, no exclusion filter,
     words separated at space, period, colon, comma, semicolon and dash,
     double and single quotes removed around words.
     No matchSorter is defined here."

    keywordToLinesMapping := Dictionary new.
    self
        excluded:(Set new);
        exclusionFilter:nil;
        separatorAlgorithm:[:line | line asCollectionOfSubstringsSeparatedByAny:' .:,;-'];
        unquoteAlgorithm:[:word | (word unquote:$") unquote:$' ].
|
1375
|
465 |
! !
|
|
466 |
|
|
467 |
!KeywordInContextIndexBuilder class methodsFor:'documentation'!
|
|
468 |
|
|
469 |
version
    "return the version string of this class
     (here the literal Header keyword string)"

    ^ '$Header$'
|
2536
|
471 |
!
|
|
472 |
|
|
473 |
version_CVS
    "return the CVS version string of this class
     (here the literal Header keyword string)"

    ^ '$Header$'
|
1375
|
475 |
! !
|
3184
|
476 |
|