--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/TextClassifier.st Wed Jan 06 01:42:18 2016 +0100
@@ -0,0 +1,190 @@
+"{ Package: 'stx:libbasic2' }"
+
+"{ NameSpace: Smalltalk }"
+
+Object subclass:#TextClassifier
+ instanceVariableNames:'wordBag sentences docCounts wordCounts wordFrequencyCounts
+ categories vocabulary'
+ classVariableNames:''
+ poolDictionaries:''
+ category:'Collections-Text-Support'
+!
+
+!TextClassifier class methodsFor:'documentation'!
+
+documentation
+"
+ an initial experiment in bayes text classification.
+ see BayesClassifierTest
+ This is possibly unfinished and may need more work.
+
+ [author:]
+ cg
+
+ [instance variables:]
+
+ [class variables:]
+
+ [see also:]
+
+"
+! !
+
+!TextClassifier class methodsFor:'instance creation'!
+
+new
+ "return an initialized instance"
+
+ ^ self basicNew initialize.
+! !
+
+!TextClassifier methodsFor:'initialization'!
+
+initialize
+ "Invoked when a new instance is created."
+
+ wordBag := Bag new.
+ "/ sentences := nil.
+ docCounts := Dictionary new.
+ wordCounts := Dictionary new.
+ wordFrequencyCounts := Dictionary new.
+ categories := Set new.
+ vocabulary := Set new.
+
+ "/ super initialize. -- commented since inherited method does nothing
+!
+
+initializeCategory:categoryName
+ (categories includes:categoryName) ifFalse:[
+ docCounts at:categoryName put:0.
+ wordCounts at:categoryName put:0.
+ wordFrequencyCounts at:categoryName put:(Dictionary new).
+ categories add:categoryName
+ ].
+! !
+
+!TextClassifier methodsFor:'text handling'!
+
+classify:string
+ "assume that it is a regular text.
+ split first into lines..."
+
+ |tokens frequencyTable maxProbability chosenCategory|
+
+ maxProbability := Infinity negative.
+
+ tokens := self tokenize:string.
+ frequencyTable := tokens asBag.
+
+ categories do:[:categoryName |
+ |categoryProbability logProbability|
+
+ categoryProbability := (docCounts at:categoryName) / docCounts size.
+ logProbability := categoryProbability log.
+ frequencyTable valuesAndCountsDo:[:token :frequencyInText |
+ | tokenProbability|
+
+ tokenProbability := self tokenProbabilityOf:token inCategory:categoryName.
+ logProbability := logProbability + (frequencyInText * tokenProbability log).
+ ].
+ Transcript show:'P(',categoryName,') = '; showCR:logProbability.
+
+ logProbability > maxProbability ifTrue:[
+ maxProbability := logProbability.
+ chosenCategory := categoryName.
+ ].
+ ].
+ ^ chosenCategory
+!
+
+classify:string asCategory:categoryName
+ |tokens frequencyTable sumWordCount|
+
+ self initializeCategory:categoryName.
+ docCounts incrementAt:categoryName.
+ tokens := self tokenize:string.
+ frequencyTable := tokens asBag.
+ sumWordCount := 0.
+ frequencyTable valuesAndCountsDo:[:token :count |
+ vocabulary add:token.
+ (wordFrequencyCounts at:categoryName) incrementAt:token by:count.
+ sumWordCount := sumWordCount + count.
+ ].
+ wordCounts incrementAt:categoryName by:sumWordCount
+!
+
+collectWords:lines
+ "computes words from lines"
+
+ |words|
+
+ words := lines collectAll:[:l |
+ l asCollectionOfSubCollectionsSeparatedByAnyForWhich:[:ch | ch isLetterOrDigit not]
+ ].
+ ^ words
+!
+
+dehyphenate:linesCollection
+ "join hypens"
+
+ |lines partialLine|
+
+ lines := OrderedCollection new.
+ linesCollection do:[:eachLine |
+ |l isHyphenated|
+
+ l := eachLine withoutSeparators.
+ l notEmptyOrNil ifTrue:[
+ isHyphenated := (l endsWith:'-')
+ and:[ l size > 1
+ and:[ (l at:(l size-1)) isLetter ]].
+ isHyphenated ifFalse:[
+ partialLine := (partialLine ? '') , l.
+ lines add:partialLine.
+ partialLine := nil.
+ ] ifTrue:[
+ l := l copyButLast.
+ partialLine := (partialLine ? '') , l.
+ ].
+ ].
+ ].
+ partialLine notEmptyOrNil ifTrue:[
+ lines add:partialLine
+ ].
+ ^ lines
+!
+
+tokenProbabilityOf:token inCategory:category
+ "Calculate probability that a `token` belongs to a `category`"
+
+ |wordFrequencyCount wordCount prob|
+
+ wordFrequencyCount := (wordFrequencyCounts at:category) at:token ifAbsent:0.
+ wordCount := wordCounts at:category.
+
+ "/use laplace Add-1 Smoothing equation
+ prob :=( wordFrequencyCount + 1 ) / ( wordCount + vocabulary size ).
+ prob := prob asFloat.
+ Transcript showCR:(' P(%1, %2) = %3' bindWith:token with:category with:prob).
+ ^ prob
+!
+
+tokenize:string
+ |rawLines lines allWords|
+
+ rawLines := string asCollectionOfLines.
+ lines := self dehyphenate:rawLines.
+ allWords := self collectWords:lines.
+ ^ allWords
+! !
+
+!TextClassifier class methodsFor:'documentation'!
+
+version
+ ^ '$Header$'
+!
+
+version_CVS
+ ^ '$Header$'
+! !
+