TextClassifier.st
changeset 3678 a03fb375c047
child 3682 1629a0dc2875
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TextClassifier.st	Wed Jan 06 01:42:18 2016 +0100
@@ -0,0 +1,190 @@
+"{ Package: 'stx:libbasic2' }"
+
+"{ NameSpace: Smalltalk }"
+
+Object subclass:#TextClassifier
+	instanceVariableNames:'wordBag sentences docCounts wordCounts wordFrequencyCounts
+		categories vocabulary'
+	classVariableNames:''
+	poolDictionaries:''
+	category:'Collections-Text-Support'
+!
+
+!TextClassifier class methodsFor:'documentation'!
+
+documentation
+"
+    an initial experiment in bayes text classification.
+    see BayesClassifierTest
+    This is possibly unfinished and may need more work.
+    
+    [author:]
+        cg
+
+    [instance variables:]
+
+    [class variables:]
+
+    [see also:]
+
+"
+! !
+
+!TextClassifier class methodsFor:'instance creation'!
+
+new
+    "return an initialized instance"
+
+    ^ self basicNew initialize.
+! !
+
+!TextClassifier methodsFor:'initialization'!
+
+initialize
+    "Invoked when a new instance is created."
+
+    wordBag := Bag new.
+    "/ sentences := nil.
+    docCounts := Dictionary new.
+    wordCounts := Dictionary new.
+    wordFrequencyCounts := Dictionary new.
+    categories := Set new.
+    vocabulary := Set new.
+
+    "/ super initialize.   -- commented since inherited method does nothing
+!
+
+initializeCategory:categoryName
+    (categories includes:categoryName) ifFalse:[
+        docCounts at:categoryName put:0.
+        wordCounts at:categoryName put:0.
+        wordFrequencyCounts at:categoryName put:(Dictionary new).
+        categories add:categoryName
+    ].
+! !
+
+!TextClassifier methodsFor:'text handling'!
+
+classify:string
+    "assume that it is a regular text.
+     split first into lines..."
+
+    |tokens frequencyTable maxProbability chosenCategory|
+
+    maxProbability := Infinity negative.
+
+    tokens := self tokenize:string.
+    frequencyTable := tokens asBag.
+
+    categories do:[:categoryName |
+        |categoryProbability logProbability|
+
+        categoryProbability := (docCounts at:categoryName) / docCounts size.
+        logProbability := categoryProbability log.
+        frequencyTable valuesAndCountsDo:[:token :frequencyInText |
+            | tokenProbability|
+            
+            tokenProbability := self tokenProbabilityOf:token inCategory:categoryName.
+            logProbability := logProbability + (frequencyInText * tokenProbability log).
+        ].
+        Transcript show:'P(',categoryName,') = '; showCR:logProbability.
+        
+        logProbability > maxProbability ifTrue:[
+            maxProbability := logProbability.
+            chosenCategory := categoryName.
+        ].
+    ].
+    ^ chosenCategory
+!
+
+classify:string asCategory:categoryName
+    |tokens frequencyTable sumWordCount|
+    
+    self initializeCategory:categoryName.
+    docCounts incrementAt:categoryName.
+    tokens := self tokenize:string.
+    frequencyTable := tokens asBag.
+    sumWordCount := 0.
+    frequencyTable valuesAndCountsDo:[:token :count |
+        vocabulary add:token.
+        (wordFrequencyCounts at:categoryName) incrementAt:token by:count.
+        sumWordCount := sumWordCount + count.
+    ].
+    wordCounts incrementAt:categoryName by:sumWordCount
+!
+
+collectWords:lines
+    "computes words from lines"
+
+    |words|
+
+    words := lines collectAll:[:l | 
+                l asCollectionOfSubCollectionsSeparatedByAnyForWhich:[:ch | ch isLetterOrDigit not]
+             ].
+    ^ words
+!
+
+dehyphenate:linesCollection
+    "join hypens"
+
+    |lines partialLine|
+
+    lines := OrderedCollection new.
+    linesCollection do:[:eachLine |
+        |l isHyphenated|
+        
+        l := eachLine withoutSeparators.
+        l notEmptyOrNil ifTrue:[
+            isHyphenated := (l endsWith:'-')
+                            and:[ l size > 1 
+                            and:[ (l at:(l size-1)) isLetter ]].
+            isHyphenated ifFalse:[
+                partialLine := (partialLine ? '') , l.
+                lines add:partialLine.
+                partialLine := nil.
+            ] ifTrue:[
+                l := l copyButLast.
+                partialLine := (partialLine ? '') , l.
+            ].    
+        ].
+    ].
+    partialLine notEmptyOrNil ifTrue:[
+        lines add:partialLine
+    ].
+    ^ lines
+!
+
+tokenProbabilityOf:token inCategory:category
+    "Calculate probability that a `token` belongs to a `category`"
+
+    |wordFrequencyCount wordCount prob|
+
+    wordFrequencyCount := (wordFrequencyCounts at:category) at:token ifAbsent:0.
+    wordCount := wordCounts at:category.
+
+    "/use laplace Add-1 Smoothing equation
+    prob :=( wordFrequencyCount + 1 ) / ( wordCount + vocabulary size ).
+    prob := prob asFloat.
+    Transcript showCR:('  P(%1, %2) = %3' bindWith:token with:category with:prob).
+    ^ prob
+!
+
+tokenize:string
+    |rawLines lines allWords|
+
+    rawLines := string asCollectionOfLines.
+    lines := self dehyphenate:rawLines.
+    allWords := self collectWords:lines.
+    ^ allWords
+! !
+
+!TextClassifier class methodsFor:'documentation'!
+
+version
+    ^ '$Header$'
+!
+
+version_CVS
+    ^ '$Header$'
+! !
+