"{ Package: 'stx:libbasic2' }"
"{ NameSpace: Smalltalk }"
Object subclass:#TextClassifier
instanceVariableNames:'wordBag sentences docCounts wordCounts wordFrequencyCounts
categories vocabulary'
classVariableNames:''
poolDictionaries:''
category:'Collections-Text-Support'
!
!TextClassifier class methodsFor:'documentation'!
documentation
"
an initial experiment in bayes text classification.
see BayesClassifierTest
This is possibly unfinished and may need more work.
[author:]
cg
[instance variables:]
[class variables:]
[see also:]
"
! !
!TextClassifier class methodsFor:'instance creation'!
new
"return an initialized instance"
^ self basicNew initialize.
! !
!TextClassifier methodsFor:'initialization'!
initialize
"Invoked when a new instance is created."
wordBag := Bag new.
"/ sentences := nil.
docCounts := Dictionary new.
wordCounts := Dictionary new.
wordFrequencyCounts := Dictionary new.
categories := Set new.
vocabulary := Set new.
"/ super initialize. -- commented since inherited method does nothing
!
initializeCategory:categoryName
(categories includes:categoryName) ifFalse:[
docCounts at:categoryName put:0.
wordCounts at:categoryName put:0.
wordFrequencyCounts at:categoryName put:(Dictionary new).
categories add:categoryName
].
! !
!TextClassifier methodsFor:'text handling'!
classify:string
"assume that it is a regular text.
split first into lines..."
|tokens frequencyTable maxProbability chosenCategory|
maxProbability := Infinity negative.
tokens := self tokenize:string.
frequencyTable := tokens asBag.
categories do:[:categoryName |
|categoryProbability logProbability|
categoryProbability := (docCounts at:categoryName) / docCounts size.
logProbability := categoryProbability log.
frequencyTable valuesAndCountsDo:[:token :frequencyInText |
| tokenProbability|
tokenProbability := self tokenProbabilityOf:token inCategory:categoryName.
logProbability := logProbability + (frequencyInText * tokenProbability log).
].
Transcript show:'P(',categoryName,') = '; showCR:logProbability.
logProbability > maxProbability ifTrue:[
maxProbability := logProbability.
chosenCategory := categoryName.
].
].
^ chosenCategory
!
classify:string asCategory:categoryName
|tokens frequencyTable sumWordCount|
self initializeCategory:categoryName.
docCounts incrementAt:categoryName.
tokens := self tokenize:string.
frequencyTable := tokens asBag.
sumWordCount := 0.
frequencyTable valuesAndCountsDo:[:token :count |
vocabulary add:token.
(wordFrequencyCounts at:categoryName) incrementAt:token by:count.
sumWordCount := sumWordCount + count.
].
wordCounts incrementAt:categoryName by:sumWordCount
!
collectWords:lines
"computes words from lines"
|words|
words := lines collectAll:[:l |
l asCollectionOfSubCollectionsSeparatedByAnyForWhich:[:ch | ch isLetterOrDigit not]
].
^ words
!
dehyphenate:linesCollection
"join hypens"
|lines partialLine|
lines := OrderedCollection new.
linesCollection do:[:eachLine |
|l isHyphenated|
l := eachLine withoutSeparators.
l notEmptyOrNil ifTrue:[
isHyphenated := (l endsWith:'-')
and:[ l size > 1
and:[ (l at:(l size-1)) isLetter ]].
isHyphenated ifFalse:[
partialLine := (partialLine ? '') , l.
lines add:partialLine.
partialLine := nil.
] ifTrue:[
l := l copyButLast.
partialLine := (partialLine ? '') , l.
].
].
].
partialLine notEmptyOrNil ifTrue:[
lines add:partialLine
].
^ lines
!
tokenProbabilityOf:token inCategory:category
"Calculate probability that a `token` belongs to a `category`"
|wordFrequencyCount wordCount prob|
wordFrequencyCount := (wordFrequencyCounts at:category) at:token ifAbsent:0.
wordCount := wordCounts at:category.
"/use laplace Add-1 Smoothing equation
prob :=( wordFrequencyCount + 1 ) / ( wordCount + vocabulary size ).
prob := prob asFloat.
Transcript showCR:(' P(%1, %2) = %3' bindWith:token with:category with:prob).
^ prob
!
tokenize:string
|rawLines lines allWords|
rawLines := string asCollectionOfLines.
lines := self dehyphenate:rawLines.
allWords := self collectWords:lines.
^ allWords
! !
!TextClassifier class methodsFor:'documentation'!
version
^ '$Header$'
!
version_CVS
^ '$Header$'
! !