BayesClassifier.st
author Claus Gittinger <cg@exept.de>
Thu, 09 Jun 2016 12:36:55 +0200
changeset 3898 c90424dba938
parent 3683 1e4fc7b0ebca
child 4274 947603fffad1
permissions -rw-r--r--
#UI_ENHANCEMENT by cg class: HTMLPrinterStream changed: #initialize lazy package dependency
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
3679
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     1
"{ Package: 'stx:libbasic2' }"
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     2
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     3
"{ NameSpace: Smalltalk }"
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     4
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     5
TextClassifier subclass:#BayesClassifier
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     6
	instanceVariableNames:''
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     7
	classVariableNames:''
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     8
	poolDictionaries:''
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     9
	category:'Collections-Text-Support'
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    10
!
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    11
3683
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    12
!BayesClassifier class methodsFor:'documentation'!
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    13
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    14
documentation
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    15
"
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    16
    an initial experiment in bayes text classification.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    17
    see BayesClassifierTest
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    18
    This is possibly unfinished and may need more work.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    19
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    20
    [author:]
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    21
        cg
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    22
"
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    23
!
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    24
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    25
examples
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    26
"
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    27
    |b|
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    28
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    29
    b := BayesClassifier new.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    30
    'teach it positive phrases'.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    31
    b classify:'amazing, awesome movie!!!! Yeah!!!!' asCategory: 'positive'.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    32
    b classify:'Sweet, this is incredibly, amazing, perfect, great!!!!' asCategory: 'positive'.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    33
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    34
    'teach it a negative phrase'.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    35
    b classify:'terrible, shitty thing. Damn. Sucks!!!!' asCategory: 'negative'.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    36
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    37
    'teach it a neutral phrase'.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    38
    b classify:'I dont really know what to make of this.' asCategory: 'neutral'.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    39
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    40
    'now test it to see that it correctly categorizes a new document'.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    41
    self assert:(b classify:'awesome, cool, amazing!!!! Yay.')= 'positive'.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    42
"
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    43
! !
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    44
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    45
!BayesClassifier methodsFor:'text handling'!
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    46
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    47
classify:string
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    48
    "assume that it is a regular text.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    49
     split first into lines..."
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    50
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    51
    |tokens frequencyTable maxProbability chosenCategory|
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    52
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    53
    maxProbability := Infinity negative.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    54
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    55
    tokens := self tokenize:string.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    56
    frequencyTable := tokens asBag.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    57
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    58
    categories do:[:categoryName |
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    59
        |categoryProbability logProbability|
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    60
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    61
        categoryProbability := (docCounts at:categoryName) / docCounts size.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    62
        logProbability := categoryProbability log.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    63
        frequencyTable valuesAndCountsDo:[:token :frequencyInText |
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    64
            | tokenProbability|
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    65
            
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    66
            tokenProbability := self tokenProbabilityOf:token inCategory:categoryName.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    67
            logProbability := logProbability + (frequencyInText * tokenProbability log).
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    68
        ].
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    69
        Transcript show:'P(',categoryName,') = '; showCR:logProbability.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    70
        
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    71
        logProbability > maxProbability ifTrue:[
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    72
            maxProbability := logProbability.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    73
            chosenCategory := categoryName.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    74
        ].
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    75
    ].
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    76
    ^ chosenCategory
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    77
!
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    78
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    79
classify:string asCategory:categoryName
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    80
    |tokens frequencyTable sumWordCount|
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    81
    
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    82
    self initializeCategory:categoryName.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    83
    docCounts incrementAt:categoryName.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    84
    tokens := self tokenize:string.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    85
    frequencyTable := tokens asBag.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    86
    sumWordCount := 0.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    87
    frequencyTable valuesAndCountsDo:[:token :count |
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    88
        vocabulary add:token.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    89
        (wordFrequencyCounts at:categoryName) incrementAt:token by:count.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    90
        sumWordCount := sumWordCount + count.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    91
    ].
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    92
    wordCounts incrementAt:categoryName by:sumWordCount
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    93
!
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    94
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    95
tokenProbabilityOf:token inCategory:category
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    96
    "Calculate probability that a `token` belongs to a `category`"
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    97
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    98
    |wordFrequencyCount wordCount prob|
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    99
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   100
    wordFrequencyCount := (wordFrequencyCounts at:category) at:token ifAbsent:0.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   101
    wordCount := wordCounts at:category.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   102
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   103
    "/use laplace Add-1 Smoothing equation
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   104
    prob :=( wordFrequencyCount + 1 ) / ( wordCount + vocabulary size ).
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   105
    prob := prob asFloat.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   106
    Transcript showCR:('  P(%1, %2) = %3' bindWith:token with:category with:prob).
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   107
    ^ prob
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   108
! !
3679
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   109
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   110
!BayesClassifier class methodsFor:'documentation'!
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   111
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   112
version
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   113
    ^ '$Header$'
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   114
!
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   115
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   116
version_CVS
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   117
    ^ '$Header$'
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   118
! !
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   119