TextClassifier.st
author Claus Gittinger <cg@exept.de>
Wed, 06 Jan 2016 01:42:18 +0100
changeset 3678 a03fb375c047
child 3682 1629a0dc2875
permissions -rw-r--r--
initial checkin

"{ Package: 'stx:libbasic2' }"

"{ NameSpace: Smalltalk }"

Object subclass:#TextClassifier
	instanceVariableNames:'wordBag sentences docCounts wordCounts wordFrequencyCounts
		categories vocabulary'
	classVariableNames:''
	poolDictionaries:''
	category:'Collections-Text-Support'
!

!TextClassifier class methodsFor:'documentation'!

documentation
"
    an initial experiment in bayes text classification.
    see BayesClassifierTest
    This is possibly unfinished and may need more work.
    
    [author:]
        cg

    [instance variables:]

    [class variables:]

    [see also:]

"
! !

!TextClassifier class methodsFor:'instance creation'!

new
    "return an initialized instance"

    ^ self basicNew initialize.
! !

!TextClassifier methodsFor:'initialization'!

initialize
    "Invoked when a new instance is created."

    wordBag := Bag new.
    "/ sentences := nil.
    docCounts := Dictionary new.
    wordCounts := Dictionary new.
    wordFrequencyCounts := Dictionary new.
    categories := Set new.
    vocabulary := Set new.

    "/ super initialize.   -- commented since inherited method does nothing
!

initializeCategory:categoryName
    (categories includes:categoryName) ifFalse:[
        docCounts at:categoryName put:0.
        wordCounts at:categoryName put:0.
        wordFrequencyCounts at:categoryName put:(Dictionary new).
        categories add:categoryName
    ].
! !

!TextClassifier methodsFor:'text handling'!

classify:string
    "assume that it is a regular text.
     split first into lines..."

    |tokens frequencyTable maxProbability chosenCategory|

    maxProbability := Infinity negative.

    tokens := self tokenize:string.
    frequencyTable := tokens asBag.

    categories do:[:categoryName |
        |categoryProbability logProbability|

        categoryProbability := (docCounts at:categoryName) / docCounts size.
        logProbability := categoryProbability log.
        frequencyTable valuesAndCountsDo:[:token :frequencyInText |
            | tokenProbability|
            
            tokenProbability := self tokenProbabilityOf:token inCategory:categoryName.
            logProbability := logProbability + (frequencyInText * tokenProbability log).
        ].
        Transcript show:'P(',categoryName,') = '; showCR:logProbability.
        
        logProbability > maxProbability ifTrue:[
            maxProbability := logProbability.
            chosenCategory := categoryName.
        ].
    ].
    ^ chosenCategory
!

classify:string asCategory:categoryName
    |tokens frequencyTable sumWordCount|
    
    self initializeCategory:categoryName.
    docCounts incrementAt:categoryName.
    tokens := self tokenize:string.
    frequencyTable := tokens asBag.
    sumWordCount := 0.
    frequencyTable valuesAndCountsDo:[:token :count |
        vocabulary add:token.
        (wordFrequencyCounts at:categoryName) incrementAt:token by:count.
        sumWordCount := sumWordCount + count.
    ].
    wordCounts incrementAt:categoryName by:sumWordCount
!

collectWords:lines
    "computes words from lines"

    |words|

    words := lines collectAll:[:l | 
                l asCollectionOfSubCollectionsSeparatedByAnyForWhich:[:ch | ch isLetterOrDigit not]
             ].
    ^ words
!

dehyphenate:linesCollection
    "join hypens"

    |lines partialLine|

    lines := OrderedCollection new.
    linesCollection do:[:eachLine |
        |l isHyphenated|
        
        l := eachLine withoutSeparators.
        l notEmptyOrNil ifTrue:[
            isHyphenated := (l endsWith:'-')
                            and:[ l size > 1 
                            and:[ (l at:(l size-1)) isLetter ]].
            isHyphenated ifFalse:[
                partialLine := (partialLine ? '') , l.
                lines add:partialLine.
                partialLine := nil.
            ] ifTrue:[
                l := l copyButLast.
                partialLine := (partialLine ? '') , l.
            ].    
        ].
    ].
    partialLine notEmptyOrNil ifTrue:[
        lines add:partialLine
    ].
    ^ lines
!

tokenProbabilityOf:token inCategory:category
    "Calculate probability that a `token` belongs to a `category`"

    |wordFrequencyCount wordCount prob|

    wordFrequencyCount := (wordFrequencyCounts at:category) at:token ifAbsent:0.
    wordCount := wordCounts at:category.

    "/use laplace Add-1 Smoothing equation
    prob :=( wordFrequencyCount + 1 ) / ( wordCount + vocabulary size ).
    prob := prob asFloat.
    Transcript showCR:('  P(%1, %2) = %3' bindWith:token with:category with:prob).
    ^ prob
!

tokenize:string
    |rawLines lines allWords|

    rawLines := string asCollectionOfLines.
    lines := self dehyphenate:rawLines.
    allWords := self collectWords:lines.
    ^ allWords
! !

!TextClassifier class methodsFor:'documentation'!

version
    ^ '$Header$'
!

version_CVS
    ^ '$Header$'
! !