TextClassifier.st
author Claus Gittinger <cg@exept.de>
Sat, 02 May 2020 21:40:13 +0200
changeset 5476 7355a4b11cb6
parent 4273 6b1c8dd09469
permissions -rw-r--r--
#FEATURE by cg class: Socket class added: #newTCPclientToHost:port:domain:domainOrder:withTimeout: changed: #newTCPclientToHost:port:domain:withTimeout:
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
4273
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
     1
"
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
     2
 COPYRIGHT (c) 2016 by eXept Software AG
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
     3
              All Rights Reserved
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
     4
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
     5
 This software is furnished under a license and may be used
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
     6
 only in accordance with the terms of that license and with the
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
     7
 inclusion of the above copyright notice. This software may not
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
     8
 be provided or otherwise made available to, or used by, any
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
     9
 other person. No title to or ownership of the software is
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
    10
 hereby transferred.
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
    11
"
3678
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    12
"{ Package: 'stx:libbasic2' }"
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    13
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    14
"{ NameSpace: Smalltalk }"
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    15
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    16
Object subclass:#TextClassifier
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    17
	instanceVariableNames:'wordBag sentences docCounts wordCounts wordFrequencyCounts
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    18
		categories vocabulary'
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    19
	classVariableNames:''
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    20
	poolDictionaries:''
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    21
	category:'Collections-Text-Support'
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    22
!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    23
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    24
!TextClassifier class methodsFor:'documentation'!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    25
4273
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
    26
copyright
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
    27
"
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
    28
 COPYRIGHT (c) 2016 by eXept Software AG
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
    29
              All Rights Reserved
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
    30
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
    31
 This software is furnished under a license and may be used
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
    32
 only in accordance with the terms of that license and with the
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
    33
 inclusion of the above copyright notice. This software may not
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
    34
 be provided or otherwise made available to, or used by, any
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
    35
 other person. No title to or ownership of the software is
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
    36
 hereby transferred.
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
    37
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
    38
"
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
    39
!
6b1c8dd09469 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 3682
diff changeset
    40
3678
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    41
documentation
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    42
"
3682
1629a0dc2875 #DOCUMENTATION
Claus Gittinger <cg@exept.de>
parents: 3678
diff changeset
    43
    an initial experiment in text classification.
3678
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    44
    see BayesClassifierTest
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    45
    
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    46
    [author:]
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    47
        cg
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    48
"
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    49
! !
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    50
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    51
!TextClassifier class methodsFor:'instance creation'!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    52
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    53
new
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    54
    "return an initialized instance"
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    55
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    56
    ^ self basicNew initialize.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    57
! !
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    58
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    59
!TextClassifier methodsFor:'initialization'!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    60
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    61
initialize
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    62
    "Invoked when a new instance is created."
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    63
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    64
    wordBag := Bag new.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    65
    "/ sentences := nil.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    66
    docCounts := Dictionary new.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    67
    wordCounts := Dictionary new.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    68
    wordFrequencyCounts := Dictionary new.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    69
    categories := Set new.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    70
    vocabulary := Set new.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    71
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    72
    "/ super initialize.   -- commented since inherited method does nothing
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    73
!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    74
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    75
initializeCategory:categoryName
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    76
    (categories includes:categoryName) ifFalse:[
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    77
        docCounts at:categoryName put:0.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    78
        wordCounts at:categoryName put:0.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    79
        wordFrequencyCounts at:categoryName put:(Dictionary new).
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    80
        categories add:categoryName
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    81
    ].
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    82
! !
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    83
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    84
!TextClassifier methodsFor:'text handling'!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    85
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    86
collectWords:lines
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    87
    "split each line into words, treating every character that is not a letter or digit as a separator"
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    88
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    89
    |words|
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    90
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    91
    words := lines collectAll:[:l | 
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    92
                l asCollectionOfSubCollectionsSeparatedByAnyForWhich:[:ch | ch isLetterOrDigit not]
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    93
             ].
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    94
    ^ words
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    95
!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    96
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    97
dehyphenate:linesCollection
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
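    98
    "join each line that ends in a hyphenated word with the line(s) following it, dropping the hyphen; blank lines are skipped"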
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    99
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   100
    |lines partialLine|
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   101
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   102
    lines := OrderedCollection new.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   103
    linesCollection do:[:eachLine |
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   104
        |l isHyphenated|
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   105
        
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   106
        l := eachLine withoutSeparators.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   107
        l notEmptyOrNil ifTrue:[
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   108
            isHyphenated := (l endsWith:'-')
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   109
                            and:[ l size > 1 
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   110
                            and:[ (l at:(l size-1)) isLetter ]].
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   111
            isHyphenated ifFalse:[
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   112
                partialLine := (partialLine ? '') , l.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   113
                lines add:partialLine.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   114
                partialLine := nil.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   115
            ] ifTrue:[
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   116
                l := l copyButLast.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   117
                partialLine := (partialLine ? '') , l.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   118
            ].    
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   119
        ].
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   120
    ].
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   121
    partialLine notEmptyOrNil ifTrue:[
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   122
        lines add:partialLine
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   123
    ].
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   124
    ^ lines
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   125
!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   126
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   127
tokenize:string
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   128
    |rawLines lines allWords|
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   129
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   130
    rawLines := string asCollectionOfLines.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   131
    lines := self dehyphenate:rawLines.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   132
    allWords := self collectWords:lines.
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   133
    ^ allWords
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   134
! !
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   135
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   136
!TextClassifier class methodsFor:'documentation'!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   137
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   138
version
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   139
    ^ '$Header$'
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   140
!
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   141
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   142
version_CVS
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   143
    ^ '$Header$'
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   144
! !
a03fb375c047 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   145