BayesClassifier.st
changeset 3683 1e4fc7b0ebca
parent 3679 b451fd09c975
child 4274 947603fffad1
--- a/BayesClassifier.st	Wed Jan 06 13:09:50 2016 +0100
+++ b/BayesClassifier.st	Wed Jan 06 13:10:00 2016 +0100
@@ -9,6 +9,103 @@
 	category:'Collections-Text-Support'
 !
 
+!BayesClassifier class methodsFor:'documentation'!
+
+documentation
+"
+    An initial experiment in Bayes text classification.
+    See BayesClassifierTest.
+    This is possibly unfinished and may need more work.
+
+    Usage:
+        train with #classify:asCategory: (one call per sample text),
+        then ask for the most probable category of a new text with #classify:.
+
+    Token probabilities are computed with Laplace (add-1) smoothing;
+    see #tokenProbabilityOf:inCategory:.
+    The computed probabilities are also written to the Transcript.
+
+    [author:]
+        cg
+"
+!
+
+examples
+"
+    Train the classifier with a few sample phrases, then classify a new phrase:
+
+    |b|
+
+    b := BayesClassifier new.
+    'teach it positive phrases'.
+    b classify:'amazing, awesome movie!!!! Yeah!!!!' asCategory: 'positive'.
+    b classify:'Sweet, this is incredibly, amazing, perfect, great!!!!' asCategory: 'positive'.
+
+    'teach it a negative phrase'.
+    b classify:'terrible, shitty thing. Damn. Sucks!!!!' asCategory: 'negative'.
+
+    'teach it a neutral phrase'.
+    b classify:'I dont really know what to make of this.' asCategory: 'neutral'.
+
+    'now test it to see that it correctly categorizes a new document'.
+    self assert:(b classify:'awesome, cool, amazing!!!! Yay.') = 'positive'.
+"
+! !
+
+!BayesClassifier methodsFor:'text handling'!
+
+classify:string
+    "Answer the name of the category which the text in string most probably
+     belongs to, based on the texts learned via #classify:asCategory:.
+     Answers nil if categories is empty.
+
+     For each category, the score is
+        log(P(category)) + sum over tokens of (frequencyInText * log(P(token | category)))
+     and the category with the highest score wins.
+     Logarithms are summed instead of multiplying the probabilities,
+     to avoid floating point underflow with many tokens.
+     The scores are also written to the Transcript."
+
+    |tokens frequencyTable totalDocCount maxProbability chosenCategory|
+
+    maxProbability := Infinity negative.
+
+    tokens := self tokenize:string.
+    frequencyTable := tokens asBag.
+
+    "/ the prior P(category) is relative to the number of learned documents,
+    "/ not to the number of categories (which is what 'docCounts size' answers).
+    "/ assumes docCounts is Dictionary-like, so that enumerating it yields the per-category counts
+    totalDocCount := docCounts inject:0 into:[:sum :eachCount | sum + eachCount].
+
+    categories do:[:categoryName |
+        |categoryProbability logProbability|
+
+        categoryProbability := (docCounts at:categoryName) / totalDocCount.
+        logProbability := categoryProbability log.
+        frequencyTable valuesAndCountsDo:[:token :frequencyInText |
+            |tokenProbability|
+
+            tokenProbability := self tokenProbabilityOf:token inCategory:categoryName.
+            "/ a token occurring n times contributes n times
+            logProbability := logProbability + (frequencyInText * tokenProbability log).
+        ].
+        Transcript show:'P(',categoryName,') = '; showCR:logProbability.
+
+        "/ strict comparison: on equal scores, the first enumerated category is kept
+        logProbability > maxProbability ifTrue:[
+            maxProbability := logProbability.
+            chosenCategory := categoryName.
+        ].
+    ].
+    ^ chosenCategory
+!
+
+classify:string asCategory:categoryName
+    "Learn the text in string as a sample of the category named categoryName.
+     Updates the per-category document count, the vocabulary,
+     the per-category token frequencies and the per-category token total."
+
+    |tokenBag totalTokens|
+
+    self initializeCategory:categoryName.
+    docCounts incrementAt:categoryName.
+
+    "/ count how often each token occurs in this text
+    tokenBag := (self tokenize:string) asBag.
+
+    totalTokens := 0.
+    tokenBag valuesAndCountsDo:[:eachToken :occurrences |
+        vocabulary add:eachToken.
+        (wordFrequencyCounts at:categoryName) incrementAt:eachToken by:occurrences.
+        totalTokens := totalTokens + occurrences.
+    ].
+
+    "/ remember how many tokens were seen in this category overall
+    wordCounts incrementAt:categoryName by:totalTokens
+!
+
+tokenProbabilityOf:token inCategory:category
+    "Answer the smoothed probability (as a Float) of token occurring in category.
+     The value is also written to the Transcript."
+
+    |occurrencesInCategory tokensInCategory smoothed|
+
+    "/ tokens never learned for this category count as zero occurrences
+    occurrencesInCategory := (wordFrequencyCounts at:category) at:token ifAbsent:[0].
+    tokensInCategory := wordCounts at:category.
+
+    "/ Laplace add-1 smoothing: an unseen token still gets a non-zero probability
+    smoothed := ((occurrencesInCategory + 1) / (tokensInCategory + vocabulary size)) asFloat.
+    Transcript showCR:('  P(%1, %2) = %3' bindWith:token with:category with:smoothed).
+    ^ smoothed
+! !
 
 !BayesClassifier class methodsFor:'documentation'!