BayesClassifier.st
author Claus Gittinger <cg@exept.de>
Thu, 09 Jun 2016 17:01:18 +0200
changeset 3918 c565bdb0c8c4
parent 3683 1e4fc7b0ebca
child 4274 947603fffad1
permissions -rw-r--r--
#OTHER by cg class: LinkedList changed: #at:ifAbsent:

"{ Package: 'stx:libbasic2' }"

"{ NameSpace: Smalltalk }"

"BayesClassifier is defined as a subclass of TextClassifier.
 It declares no instance variables, class variables or pool dictionaries
 of its own; all state used by its methods is inherited."
TextClassifier subclass:#BayesClassifier
	instanceVariableNames:''
	classVariableNames:''
	poolDictionaries:''
	category:'Collections-Text-Support'
!

!BayesClassifier class methodsFor:'documentation'!

documentation
"
    An initial experiment in naive Bayes text classification.

    Training texts are fed in with #classify:asCategory:, which updates the
    per-category document count, the per-category token counts and the
    vocabulary.
    #classify: then answers the category with the highest log-probability
    score for a given text. Per-token probabilities are computed with
    Laplace (add-1) smoothing in #tokenProbabilityOf:inCategory:.

    Both #classify: and #tokenProbabilityOf:inCategory: write the values
    they compute to the Transcript.

    This is possibly unfinished and may need more work.

    [author:]
        cg

    [see also:]
        BayesClassifierTest
"
!

examples
"
    Train a classifier with a few labelled phrases, then let it categorize
    a phrase it has not seen before.
    The computed probabilities are written to the Transcript.
                                                                [exBegin]
    |b|

    b := BayesClassifier new.
    'teach it positive phrases'.
    b classify:'amazing, awesome movie!!!! Yeah!!!!' asCategory: 'positive'.
    b classify:'Sweet, this is incredibly, amazing, perfect, great!!!!' asCategory: 'positive'.

    'teach it a negative phrase'.
    b classify:'terrible, shitty thing. Damn. Sucks!!!!' asCategory: 'negative'.

    'teach it a neutral phrase'.
    b classify:'I dont really know what to make of this.' asCategory: 'neutral'.

    'now test it to see that it correctly categorizes a new document'.
    self assert:(b classify:'awesome, cool, amazing!!!! Yay.')= 'positive'.
                                                                [exEnd]
"
! !

!BayesClassifier methodsFor:'text handling'!

classify:string
    "Answer the name of the category which scores highest for the given text
     (nil, if there is no category to iterate over).

     The text is tokenized and its tokens are counted. For every known category
     a score is computed as:
        log(prior of category) + sum over tokens (count in text * log(P(token, category)))
     and the category with the greatest score wins.

     As a side effect, the score of every category is written to the Transcript."

    |bagOfTokens bestCategory bestScore|

    bagOfTokens := (self tokenize:string) asBag.
    bestScore := Infinity negative.

    categories do:[:eachCategory |
        |score|

        "start with the log of the category's prior.
         NOTE(review): the divisor is the size of docCounts; if docCounts is keyed
         by category, that is the number of categories, not the number of documents.
         Being the same for every category, it would not change the winner,
         only the printed values - to be confirmed against TextClassifier."
        score := ((docCounts at:eachCategory) / docCounts size) log.

        "add the weighted log-probability of each distinct token of the text"
        bagOfTokens valuesAndCountsDo:[:eachToken :occurrences |
            score := score
                     + (occurrences
                        * (self tokenProbabilityOf:eachToken inCategory:eachCategory) log).
        ].
        Transcript show:'P(',eachCategory,') = '; showCR:score.

        "strictly greater: on equal scores the category seen first is kept"
        score > bestScore ifTrue:[
            bestScore := score.
            bestCategory := eachCategory.
        ].
    ].
    ^ bestCategory
!

classify:string asCategory:categoryName
    "Train the receiver: record the given text as one more document
     of the category named categoryName.

     The document count of the category is incremented, every distinct token
     of the text is added to the vocabulary, and both the per-token count and
     the total token count of the category are increased by the number of
     occurrences found in the text.
     (initializeCategory: is defined elsewhere; presumably it sets up the
      counters of a category which has not been seen before - to be confirmed)"

    |bagOfTokens tokensInText|

    self initializeCategory:categoryName.
    docCounts incrementAt:categoryName.

    bagOfTokens := (self tokenize:string) asBag.
    tokensInText := 0.
    bagOfTokens valuesAndCountsDo:[:eachToken :occurrences |
        vocabulary add:eachToken.
        (wordFrequencyCounts at:categoryName) incrementAt:eachToken by:occurrences.
        tokensInText := tokensInText + occurrences.
    ].
    wordCounts incrementAt:categoryName by:tokensInText
!

tokenProbabilityOf:token inCategory:category
    "Answer an estimate (as Float) of the probability of token within category:
        (occurrences of token in category + 1) / (tokens in category + vocabulary size)
     The add-1 (Laplace) smoothing keeps a token which was never seen
     in the category from getting a probability of zero.

     As a side effect, the computed value is written to the Transcript."

    |hits total smoothed|

    "a token which is unknown to the category counts as zero occurrences"
    hits := (wordFrequencyCounts at:category) at:token ifAbsent:[0].
    total := wordCounts at:category.

    smoothed := ((hits + 1) / (total + vocabulary size)) asFloat.
    Transcript showCR:('  P(%1, %2) = %3' bindWith:token with:category with:smoothed).
    ^ smoothed
! !

!BayesClassifier class methodsFor:'documentation'!

version
    "Answer the version string of this class.
     The string holds an unexpanded Header keyword placeholder; presumably it is
     filled in by the source code management system (to be confirmed)."

    ^ '$Header$'
!

version_CVS
    "Answer the CVS version string of this class.
     The string holds an unexpanded Header keyword placeholder; presumably it is
     filled in by CVS keyword substitution (to be confirmed)."

    ^ '$Header$'
! !