diff -r 1629a0dc2875 -r 1e4fc7b0ebca BayesClassifier.st --- a/BayesClassifier.st Wed Jan 06 13:09:50 2016 +0100 +++ b/BayesClassifier.st Wed Jan 06 13:10:00 2016 +0100 @@ -9,6 +9,103 @@ category:'Collections-Text-Support' ! +!BayesClassifier class methodsFor:'documentation'! + +documentation +" + an initial experiment in bayes text classification. + see BayesClassifierTest + This is possibly unfinished and may need more work. + + [author:] + cg +" +! + +examples +" + |b| + + b := BayesClassifier new. + 'teach it positive phrases'. + b classify:'amazing, awesome movie!!!! Yeah!!!!' asCategory: 'positive'. + b classify:'Sweet, this is incredibly, amazing, perfect, great!!!!' asCategory: 'positive'. + + 'teach it a negative phrase'. + b classify:'terrible, shitty thing. Damn. Sucks!!!!' asCategory: 'negative'. + + 'teach it a neutral phrase'. + b classify:'I dont really know what to make of this.' asCategory: 'neutral'. + + 'now test it to see that it correctly categorizes a new document'. + self assert:(b classify:'awesome, cool, amazing!!!! Yay.')= 'positive'. +" +! ! + +!BayesClassifier methodsFor:'text handling'! + +classify:string + "assume that it is a regular text. + split first into lines..." + + |tokens frequencyTable maxProbability chosenCategory| + + maxProbability := Infinity negative. + + tokens := self tokenize:string. + frequencyTable := tokens asBag. + + categories do:[:categoryName | + |categoryProbability logProbability| + + categoryProbability := (docCounts at:categoryName) / docCounts size. + logProbability := categoryProbability log. + frequencyTable valuesAndCountsDo:[:token :frequencyInText | + | tokenProbability| + + tokenProbability := self tokenProbabilityOf:token inCategory:categoryName. + logProbability := logProbability + (frequencyInText * tokenProbability log). + ]. + Transcript show:'P(',categoryName,') = '; showCR:logProbability. + + logProbability > maxProbability ifTrue:[ + maxProbability := logProbability. + chosenCategory := categoryName. + ]. + ]. + ^ chosenCategory +! + +classify:string asCategory:categoryName + |tokens frequencyTable sumWordCount| + + self initializeCategory:categoryName. + docCounts incrementAt:categoryName. + tokens := self tokenize:string. + frequencyTable := tokens asBag. + sumWordCount := 0. + frequencyTable valuesAndCountsDo:[:token :count | + vocabulary add:token. + (wordFrequencyCounts at:categoryName) incrementAt:token by:count. + sumWordCount := sumWordCount + count. + ]. + wordCounts incrementAt:categoryName by:sumWordCount +! + +tokenProbabilityOf:token inCategory:category + "Calculate probability that a `token` belongs to a `category`" + + |wordFrequencyCount wordCount prob| + + wordFrequencyCount := (wordFrequencyCounts at:category) at:token ifAbsent:0. + wordCount := wordCounts at:category. + + "/use laplace Add-1 Smoothing equation + prob :=( wordFrequencyCount + 1 ) / ( wordCount + vocabulary size ). + prob := prob asFloat. + Transcript showCR:(' P(%1, %2) = %3' bindWith:token with:category with:prob). + ^ prob +! ! !BayesClassifier class methodsFor:'documentation'!