1 # /* vim: set filetype=awk : */ -*- awk -*-
2 ###############################################################
3 # gains.awK : scores attributes by the gain if we split on them
4 # (c) 2007 Tim Menzies tim@menzies.us
5 #
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License
8 # as published by the Free Software Foundation; version 3.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc.,51 Franklin Street, Fifth Floor, Boston,
18 # MA 02110-1301, USA.
19 ###############################################################
20
21 # url: http://unbox.org/wisp/branches/tims-our/minerc.lib/gains.awk
22 # usage: gawk -f gains.awk [Debug=1] [Klass=N] data.arff
23
24 ## warning: hastily written code! use with great care!
25 # has known bugs
26 # 1) if an attribute name has spaces, this won't work
27 # 2) the infogain calculation does not include gain ratios
28 # 3) the gain on numeric and symbolic columns are different
29 # units, and this code offers no suggestions on how
30 # to rank one over the other
31 # 4) on some data sets (autos.arff) i get this bug:
32 #
33 # gawk: gains.awk:(FILENAME=- FNR=333) warning: sqrt: called
34 # with negative argument -7.27596e-12
35 # gawk: gains.awk: (FILENAME=- FNR=333) warning: sqrt: called
36 # with negative argument -1.81899e-12
37
38 BEGIN {
39 #### command line options
40 Klass= -1 # Position of class attribute. if negative,
41 # counts in from the right.
42 # e.g "-2" means second most right column
43 # e.g "3" means third most left column
44 Debug = 0 # Set to one to get LOTS of internal info
45
46 #### internal stuff
47 OFS="," # Simplifies printing output
48 IGNORECASE=1 # make case comparison case insenstive
49 Relation="any" # Name of this relation
50 Attr=0 # Number of attributes
51 array(Name) # Name of attribute "i"
52 Instances # Number of instances
53 array(N) # N[k] is the number of class "k" instances
54 array(Num) # If Num[i] then attribute "i" is a numeric
55
56 # stuff to keep information from symbolic columns
57 array(Count) # Frequency counts of symbols in a column
58 array(Uniques) # Each column "c" has all=Uniques[c,0] symbols,
59 # store at positions 1<=i<=all Uniques[c,i]
60
61 # stuff to keep information from numeric columns
62 array(Sum) # Sum of the values in a column
63 array(SumSq) # Sum of the squres of the values in a column
64 }
65
66 #### generic stuff to read an arff file
67 # (should adapt to CSV real easy)
68 { sub(/#.*/,"") }
69 /^[ \t]*$/ { next }
70 /@relation/ { Relation=$2}
71 /@attribute/ { Attr++; Num[Attr]=0; Name[Attr]=$2;}
72 /@attribute/ && $3 ~ /numeric|real|integer/ {
73 Num[Attr]=1; }
74 /@data/ { In=1; FS=","; Klass= Klass>0 ? Klass : Attr+1+Klass}
75 /@/ { next; }
76 In { instance() }
77 END { debug(); report() }
78
79 #### worker functions
80 function instance( i) {
81 Instances++
82 N[$Klass]++
83 for(i=1;i<=Attr;i++)
84 if (i != Klass)
85 if ($i !~ /\?/)
86 Num[i] ? number(i,$i,$Klass) : symbol(i,$i,$Klass)
87 }
88 function report ( i) {
89 for(i=1;i<=Attr;i++)
90 if (i != Klass)
91 print (Num[i] ? "numeric" : "symbolic"),
92 Name[i],
93 sprintf("%5.5f",Num[i] ? sdGain(i) : infoGain(i))
94 }
95 function debug() {
96 if (Debug) {
97 print Relation
98 print Attr
99 saya("N",N)
100 saya("Num",Num)
101 saya("Name",Name)
102 saya("Sum",Sum)
103 saya("SumSq",SumSq)
104 saya("Uniques",Uniques)
105 saya("Count",Count)
106 }
107 }
108 #### record keeping
109 function number(col,value,klass) {
110 Sum[col,klass] += value;
111 SumSq[col,klass] += value*value;
112 Sum[col] += value;
113 SumSq[col] += value*value;
114 }
115 function symbol(col,value,klass) {
116 if (++Count[col,value] == 1)
117 Uniques[col,++Uniques[col,0]]=value;
118 Count[klass,col,value]++;
119 }
120
121 #### asessing the gain on numeric columns
122 function sdGain(col, klass,before,after,probKlass) {
123 before = sd(SumSq[col],Sum[col],Instances);
124 for(klass in N) {
125 probKlass = N[klass]/Instances
126 after += probKlass * sd(SumSq[col,klass],Sum[col,klass],N[klass])
127 }
128 return before - after
129 }
130
131 #### assessing the gain on symbolic columns
132 function infoGain(col, all,klass,before,after,probKlass) {
133 all = Uniques[col,0]
134 before = bitsInColumn(all,col);
135 for(klass in N) {
136 probKlass = N[klass]/Instances
137 after += probKlass * bitsInKlass(all,col,klass)
138 }
139 return before - after;
140 }
141 function bitsInKlass(all,col,klass, i,one,f) {
142 for(i=1;i<=all;i++) {
143 one = Uniques[col,i];
144 f[i]= Count[klass,col,one];
145 }
146 return entropy(f,N[klass])
147 }
148 function bitsInColumn(all,col, one,f,i) {
149 for(i=1;i<=all;i++) {
150 one = Uniques[col,i];
151 f[i]= Count[col,one];
152 };
153 return entropy(f,Instances)
154 }
155
156 #### low-level stuff
157 function entropy(f,n, out,i,p) {
158 for(i in f)
159 if (p=f[i]/n)
160 out += -1 * p * log2(p);
161 return out
162 }
163 function log2(n) { return log(n)/log(2) }
164 function sd(sumSq,sumX,n) { return sqrt((sumSq-((sumX*sumX)/n))/(n-1)); }
165 function array(a) { split("",a) }
166
167 #### debugging stuff
168 function saya(str,a, com,i,j) {
169 com="sort #" rand()
170 for(i in a) {
171 j=i;
172 gsub(SUBSEP,",",j)
173 print str "[" j "]=" a[i] | com;
174 }
175 close(com)
176 }