--- a/BayesClassifier.st Wed Jan 06 13:09:50 2016 +0100
+++ b/BayesClassifier.st Wed Jan 06 13:10:00 2016 +0100
@@ -9,6 +9,103 @@
category:'Collections-Text-Support'
!
+!BayesClassifier class methodsFor:'documentation'!
+
+documentation
+"
+    an initial experiment in naive bayes text classification (laplace add-one smoothing).
+    train with #classify:asCategory:, then query with #classify:; see BayesClassifierTest.
+    This is possibly unfinished and may need more work.
+
+    [author:]
+        cg
+"
+!
+
+examples
+"  train with a few labelled phrases, then classify an unseen one (evaluate in a workspace)
+    |b|
+
+    b := BayesClassifier new.
+    'teach it positive phrases'.
+    b classify:'amazing, awesome movie!!!! Yeah!!!!' asCategory: 'positive'.
+    b classify:'Sweet, this is incredibly, amazing, perfect, great!!!!' asCategory: 'positive'.
+
+    'teach it a negative phrase'.
+    b classify:'terrible, shitty thing. Damn. Sucks!!!!' asCategory: 'negative'.
+
+    'teach it a neutral phrase'.
+    b classify:'I dont really know what to make of this.' asCategory: 'neutral'.
+
+    'now test it to see that it correctly categorizes a new document'.
+    self assert:(b classify:'awesome, cool, amazing!!!! Yay.')= 'positive'.
+"
+! !
+
+!BayesClassifier methodsFor:'text handling'!
+
+classify:string
+    "Answer the name of the trained category that most likely produced string,
+     or nil if there are no categories. For each category the score is
+     (prior) log plus, per distinct token, count * (token probability) log;
+     every score is traced on the Transcript."
+
+    |tokenCounts totalDocCount bestScore bestCategory|
+
+    bestScore := Infinity negative.
+    tokenCounts := (self tokenize:string) asBag.
+
+    "the prior is documents-in-category / all trained documents;
+     docCounts size would be the number of categories instead"
+    totalDocCount := docCounts inject:0 into:[:sum :each | sum + each].
+
+    categories do:[:categoryName |
+        |logProbability|
+
+        logProbability := ((docCounts at:categoryName) / totalDocCount) log.
+        tokenCounts valuesAndCountsDo:[:token :frequencyInText |
+            logProbability := logProbability
+                + (frequencyInText * (self tokenProbabilityOf:token inCategory:categoryName) log).
+        ].
+        Transcript show:'P(',categoryName,') = '; showCR:logProbability.
+        logProbability > bestScore ifTrue:[
+            bestScore := logProbability.
+            bestCategory := categoryName.
+        ].
+    ].
+    ^ bestCategory
+!
+
+classify:string asCategory:categoryName
+    "Train the receiver: count string as one more document of categoryName and
+     add its tokens to the vocabulary and to that category's word counters."
+
+    |tokenCounts addedWords|
+    self initializeCategory:categoryName.
+    docCounts incrementAt:categoryName.
+    tokenCounts := (self tokenize:string) asBag.
+    addedWords := 0.
+    tokenCounts valuesAndCountsDo:[:token :count |
+        vocabulary add:token.
+        (wordFrequencyCounts at:categoryName) incrementAt:token by:count.
+        addedWords := addedWords + count ].
+    wordCounts incrementAt:categoryName by:addedWords
+!
+
+tokenProbabilityOf:token inCategory:category
+    "Answer, as a float, the smoothed estimate of P(token | category);
+     the value is also traced on the Transcript."
+
+    |occurrences categoryTotal smoothed|
+
+    occurrences := (wordFrequencyCounts at:category) at:token ifAbsent:[0].
+    categoryTotal := wordCounts at:category.
+
+    "laplace add-one smoothing: unseen tokens still get a non-zero probability"
+    smoothed := ((occurrences + 1) / (categoryTotal + vocabulary size)) asFloat.
+    Transcript showCR:(' P(%1, %2) = %3' bindWith:token with:category with:smoothed).
+    ^ smoothed
+! !
!BayesClassifier class methodsFor:'documentation'!