TextClassifier.st
author Claus Gittinger <cg@exept.de>
Thu, 09 Jun 2016 17:48:53 +0200
changeset 3928 d1133788cbba
parent 3682 1629a0dc2875
child 4273 6b1c8dd09469
permissions -rw-r--r--
#OTHER by cg bz2 for windows
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
3678
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     1
"{ Package: 'stx:libbasic2' }"
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     2
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     3
"{ NameSpace: Smalltalk }"
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     4
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     5
"TextClassifier is a direct subclass of Object.
 Its instance variables hold the bookkeeping state of one classifier;
 all of them except 'sentences' are assigned in #initialize."
Object subclass:#TextClassifier
	instanceVariableNames:'wordBag sentences docCounts wordCounts wordFrequencyCounts
		categories vocabulary'
	classVariableNames:''
	poolDictionaries:''
	category:'Collections-Text-Support'
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    11
!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    12
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    13
!TextClassifier class methodsFor:'documentation'!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    14
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    15
documentation
"
    An initial experiment in text classification.
    See BayesClassifierTest.

    Instances keep a word bag, a set of known categories, a vocabulary set
    and three dictionaries (docCounts, wordCounts, wordFrequencyCounts)
    which get one entry per category in #initializeCategory:.

    The 'text handling' protocol turns a string into words:
        #tokenize:      splits into lines, then applies the two steps below
        #dehyphenate:   joins lines which end in a hyphenated word
        #collectWords:  cuts lines at every non-alphanumeric character

    [author:]
        cg
"
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    23
! !
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    24
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    25
!TextClassifier class methodsFor:'instance creation'!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    26
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    27
new
    "Answer a new instance after sending it #initialize.
     The answer is whatever #initialize returns."

    |classifier|

    classifier := self basicNew.
    ^ classifier initialize
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    31
! !
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    32
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    33
!TextClassifier methodsFor:'initialization'!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    34
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    35
initialize
    "Set up the empty state of a freshly created classifier:
     an empty word bag, three empty dictionaries and two empty sets.
     The 'sentences' instance variable is not assigned here and stays nil."

    "/ known category names and known words
    categories := Set new.
    vocabulary := Set new.

    "/ these dictionaries receive one entry per category in #initializeCategory:
    docCounts := Dictionary new.
    wordCounts := Dictionary new.
    wordFrequencyCounts := Dictionary new.

    wordBag := Bag new.

    "/ super initialize is not called, since the inherited method does nothing
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    47
!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    48
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    49
initializeCategory:categoryName
    "Make categoryName known to the receiver.
     For a category seen for the first time, its document count and word count
     start at 0 and it gets an empty dictionary in wordFrequencyCounts.
     Nothing is changed if the category is already known."

    (categories includes:categoryName) ifTrue:[^ self].

    docCounts at:categoryName put:0.
    wordCounts at:categoryName put:0.
    wordFrequencyCounts at:categoryName put:Dictionary new.
    categories add:categoryName.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    56
! !
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    57
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    58
!TextClassifier methodsFor:'text handling'!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    59
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    60
collectWords:lines
    "Answer the words contained in lines.
     Every line is cut at each character which is neither a letter nor a digit;
     the per-line results are combined by #collectAll:."

    |isSeparator|

    isSeparator := [:ch | ch isLetterOrDigit not].
    ^ lines collectAll:[:eachLine |
        eachLine asCollectionOfSubCollectionsSeparatedByAnyForWhich:isSeparator
    ]
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    69
!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    70
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    71
dehyphenate:linesCollection
    "Join lines which were split by a hyphen at the end of the line.
     Each line is first passed through #withoutSeparators; lines which are
     empty afterwards are skipped.
     A line counts as hyphenated if it ends with '-', is longer than one
     character, and the character before the '-' is a letter. Such a line
     loses its trailing '-' and is glued to the front of the following text.
     Answers an OrderedCollection of the joined lines; a hyphenated rest
     left over at the end is added as the last line."

    |joined pending|

    joined := OrderedCollection new.
    linesCollection do:[:rawLine |
        |trimmed endsInSplitWord|

        trimmed := rawLine withoutSeparators.
        trimmed notEmptyOrNil ifTrue:[
            endsInSplitWord :=
                (trimmed endsWith:'-')
                and:[ trimmed size > 1
                and:[ (trimmed at:trimmed size - 1) isLetter ]].
            endsInSplitWord ifTrue:[
                "/ drop the hyphen and wait for the continuation line
                pending := (pending ? '') , trimmed copyButLast.
            ] ifFalse:[
                "/ complete line: emit it together with anything pending
                joined add:(pending ? '') , trimmed.
                pending := nil.
            ].
        ].
    ].
    pending notEmptyOrNil ifTrue:[
        joined add:pending
    ].
    ^ joined
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    99
!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   100
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   101
tokenize:string
    "Answer the words of string.
     The string is split into lines, the lines are passed through
     #dehyphenate: and the result is handed to #collectWords:."

    ^ self collectWords:(self dehyphenate:string asCollectionOfLines)
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   108
! !
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   109
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   110
!TextClassifier class methodsFor:'documentation'!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   111
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   112
version
    "Answer the version string of this class source (a version control keyword string)."

    ^ '$Header$'
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   114
!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   115
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   116
version_CVS
    "Answer the version string of this class source (a version control keyword string)."

    ^ '$Header$'
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   118
! !
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   119