BayesClassifier.st
author Claus Gittinger <cg@exept.de>
Sat, 02 May 2020 21:40:13 +0200
changeset 5476 7355a4b11cb6
parent 4274 947603fffad1
permissions -rw-r--r--
#FEATURE by cg class: Socket class added: #newTCPclientToHost:port:domain:domainOrder:withTimeout: changed: #newTCPclientToHost:port:domain:withTimeout:
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
4274
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
     1
"
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
     2
 COPYRIGHT (c) 2016 by eXept Software AG
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
     3
              All Rights Reserved
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
     4
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
     5
 This software is furnished under a license and may be used
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
     6
 only in accordance with the terms of that license and with the
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
     7
 inclusion of the above copyright notice. This software may not
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
     8
 be provided or otherwise made available to, or used by, any
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
     9
 other person. No title to or ownership of the software is
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
    10
 hereby transferred.
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
    11
"
3679
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    12
"{ Package: 'stx:libbasic2' }"
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    13
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    14
"{ NameSpace: Smalltalk }"
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    15
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    16
TextClassifier subclass:#BayesClassifier
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    17
	instanceVariableNames:''
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    18
	classVariableNames:''
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    19
	poolDictionaries:''
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    20
	category:'Collections-Text-Support'
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    21
!
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    22
3683
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    23
!BayesClassifier class methodsFor:'documentation'!
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    24
4274
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
    25
copyright
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
    26
"
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
    27
 COPYRIGHT (c) 2016 by eXept Software AG
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
    28
              All Rights Reserved
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
    29
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
    30
 This software is furnished under a license and may be used
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
    31
 only in accordance with the terms of that license and with the
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
    32
 inclusion of the above copyright notice. This software may not
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
    33
 be provided or otherwise made available to, or used by, any
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
    34
 other person. No title to or ownership of the software is
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
    35
 hereby transferred.
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
    36
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
    37
"
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
    38
!
947603fffad1 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3683
diff changeset
    39
3683
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    40
documentation
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    41
"
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    42
    an initial experiment in bayes text classification.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    43
    see BayesClassifierTest
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    44
    This is possibly unfinished and may need more work.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    45
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    46
    [author:]
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    47
        cg
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    48
"
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    49
!
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    50
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    51
examples
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    52
"
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    53
    |b|
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    54
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    55
    b := BayesClassifier new.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    56
    'teach it positive phrases'.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    57
    b classify:'amazing, awesome movie!!!! Yeah!!!!' asCategory: 'positive'.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    58
    b classify:'Sweet, this is incredibly, amazing, perfect, great!!!!' asCategory: 'positive'.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    59
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    60
    'teach it a negative phrase'.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    61
    b classify:'terrible, shitty thing. Damn. Sucks!!!!' asCategory: 'negative'.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    62
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    63
    'teach it a neutral phrase'.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    64
    b classify:'I dont really know what to make of this.' asCategory: 'neutral'.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    65
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    66
    'now test it to see that it correctly categorizes a new document'.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    67
    self assert:(b classify:'awesome, cool, amazing!!!! Yay.')= 'positive'.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    68
"
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    69
! !
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    70
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    71
!BayesClassifier methodsFor:'text handling'!
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    72
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    73
classify:string
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    74
    "assume that it is a regular text.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    75
     split first into lines..."
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    76
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    77
    |tokens frequencyTable maxProbability chosenCategory|
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    78
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    79
    maxProbability := Infinity negative.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    80
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    81
    tokens := self tokenize:string.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    82
    frequencyTable := tokens asBag.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    83
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    84
    categories do:[:categoryName |
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    85
        |categoryProbability logProbability|
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    86
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    87
        categoryProbability := (docCounts at:categoryName) / docCounts size.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    88
        logProbability := categoryProbability log.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    89
        frequencyTable valuesAndCountsDo:[:token :frequencyInText |
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    90
            | tokenProbability|
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    91
            
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    92
            tokenProbability := self tokenProbabilityOf:token inCategory:categoryName.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    93
            logProbability := logProbability + (frequencyInText * tokenProbability log).
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    94
        ].
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    95
        Transcript show:'P(',categoryName,') = '; showCR:logProbability.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    96
        
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    97
        logProbability > maxProbability ifTrue:[
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    98
            maxProbability := logProbability.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
    99
            chosenCategory := categoryName.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   100
        ].
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   101
    ].
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   102
    ^ chosenCategory
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   103
!
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   104
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   105
classify:string asCategory:categoryName
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   106
    |tokens frequencyTable sumWordCount|
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   107
    
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   108
    self initializeCategory:categoryName.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   109
    docCounts incrementAt:categoryName.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   110
    tokens := self tokenize:string.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   111
    frequencyTable := tokens asBag.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   112
    sumWordCount := 0.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   113
    frequencyTable valuesAndCountsDo:[:token :count |
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   114
        vocabulary add:token.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   115
        (wordFrequencyCounts at:categoryName) incrementAt:token by:count.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   116
        sumWordCount := sumWordCount + count.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   117
    ].
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   118
    wordCounts incrementAt:categoryName by:sumWordCount
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   119
!
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   120
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   121
tokenProbabilityOf:token inCategory:category
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   122
    "Calculate probability that a `token` belongs to a `category`"
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   123
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   124
    |wordFrequencyCount wordCount prob|
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   125
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   126
    wordFrequencyCount := (wordFrequencyCounts at:category) at:token ifAbsent:0.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   127
    wordCount := wordCounts at:category.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   128
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   129
    "/use laplace Add-1 Smoothing equation
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   130
    prob :=( wordFrequencyCount + 1 ) / ( wordCount + vocabulary size ).
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   131
    prob := prob asFloat.
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   132
    Transcript showCR:('  P(%1, %2) = %3' bindWith:token with:category with:prob).
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   133
    ^ prob
1e4fc7b0ebca #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3679
diff changeset
   134
! !
3679
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   135
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   136
!BayesClassifier class methodsFor:'documentation'!
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   137
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   138
version
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   139
    ^ '$Header$'
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   140
!
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   141
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   142
version_CVS
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   143
    ^ '$Header$'
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   144
! !
b451fd09c975 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   145