1 # /* vim: set filetype=awk : */ -*- awk -*-
  2 ###############################################################
  3 # gains.awK : scores attributes by the gain if we split on them
  4 # (c) 2007 Tim Menzies tim@menzies.us
  5 #
  6 # This program is free software; you can redistribute it and/or
  7 # modify it under the terms of the GNU General Public License
  8 # as published by the Free Software Foundation; version 3.
  9 #
 10 # This program is distributed in the hope that it will be useful,
 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 13 # GNU General Public License for more details.
 14 #
 15 # You should have received a copy of the GNU General Public License
 16 # along with this program; if not, write to the Free Software
 17 # Foundation, Inc.,51 Franklin Street, Fifth Floor, Boston,
 18 # MA 02110-1301, USA.
 19 ###############################################################
 20 
 21 # url: http://unbox.org/wisp/branches/tims-our/minerc.lib/gains.awk
 22 # usage: gawk -f gains.awk [Debug=1] [Klass=N] data.arff
 23 
 24 ## warning: hastily written code! use with great care!
 25 # has known bugs
 26 # 1) if an attribute name has spaces, this won't work
 27 # 2) the infogain calculation does not include gain ratios
 28 # 3) the gain on numeric and symbolic columns are different
 29 #    units, and this code offers no suggestions on how
 30 #    to rank one over the other
 31 # 4) on some data sets (autos.arff) i get this bug:
 32 #
 33 # gawk: gains.awk:(FILENAME=- FNR=333) warning: sqrt: called 
 34 #       with negative argument -7.27596e-12
 35 # gawk: gains.awk: (FILENAME=- FNR=333) warning: sqrt: called 
 36 #       with negative argument -1.81899e-12
 37 
 38 BEGIN  {
 39         #### command line options 
 40         Klass= -1       # Position of class attribute. if negative,
 41                         #    counts in from the right. 
 42                         #    e.g "-2" means second most right column
 43                         #    e.g "3" means third most left column
 44         Debug = 0       # Set to one to get LOTS of internal info
 45 
 46         #### internal stuff
 47         OFS=","         # Simplifies printing output
 48         IGNORECASE=1    # make case comparison case insenstive
 49         Relation="any"  # Name of this relation
 50         Attr=0          # Number of attributes
 51         array(Name)     # Name of attribute "i"
 52         Instances       # Number of instances
 53         array(N)        # N[k] is the number of class "k" instances
 54         array(Num)      # If Num[i] then attribute "i" is a numeric
 55 
 56         # stuff to keep information from symbolic columns
 57         array(Count)    # Frequency counts of symbols in a column
 58         array(Uniques)  # Each column "c" has all=Uniques[c,0] symbols, 
 59                         #     store at positions 1<=i<=all Uniques[c,i]
 60 
 61         # stuff to keep information from numeric columns
 62         array(Sum)      # Sum of the values in a column
 63         array(SumSq)    # Sum of the squres of the values in a column
 64         }
 65 
 66 #### generic stuff to read an arff file 
 67 # (should adapt to CSV real easy)
 68              { sub(/#.*/,"") }
 69 /^[ \t]*$/   { next }
 70 /@relation/  { Relation=$2}
 71 /@attribute/ { Attr++; Num[Attr]=0; Name[Attr]=$2;}
 72 /@attribute/ && $3 ~ /numeric|real|integer/  {
 73                Num[Attr]=1; }
 74 /@data/      { In=1; FS=","; Klass= Klass>0 ? Klass : Attr+1+Klass}
 75 /@/          { next; }
 76 In           { instance() }
 77 END          { debug(); report() }
 78 
 79 #### worker functions
 80 function instance(   i) {
 81     Instances++
 82     N[$Klass]++
 83     for(i=1;i<=Attr;i++)
 84       if (i != Klass)
 85         if ($i !~ /\?/)
 86           Num[i] ? number(i,$i,$Klass) : symbol(i,$i,$Klass)
 87 }
 88 function report (     i) {
 89     for(i=1;i<=Attr;i++)
 90         if (i != Klass)
 91             print (Num[i] ? "numeric" : "symbolic"),
 92                    Name[i],
 93                    sprintf("%5.5f",Num[i] ? sdGain(i) : infoGain(i))
 94 }
 95 function debug() {
 96     if (Debug) {
 97         print Relation
 98         print Attr
 99         saya("N",N)
100         saya("Num",Num)
101         saya("Name",Name)
102         saya("Sum",Sum)
103         saya("SumSq",SumSq)
104         saya("Uniques",Uniques)
105         saya("Count",Count)
106     }
107 }
108 #### record keeping
109 function number(col,value,klass) {
110     Sum[col,klass]   += value;
111     SumSq[col,klass] += value*value;
112     Sum[col]         += value;
113     SumSq[col]       += value*value;
114 }
115 function symbol(col,value,klass) {
116     if (++Count[col,value] == 1)
117         Uniques[col,++Uniques[col,0]]=value;
118     Count[klass,col,value]++;
119 }
120 
121 #### asessing the gain on numeric columns
122 function sdGain(col,     klass,before,after,probKlass) {
123     before = sd(SumSq[col],Sum[col],Instances);
124     for(klass in N) {
125         probKlass = N[klass]/Instances
126         after    += probKlass * sd(SumSq[col,klass],Sum[col,klass],N[klass])
127     }
128     return before - after
129 }
130 
131 #### assessing the gain on symbolic columns
132 function infoGain(col,   all,klass,before,after,probKlass) {
133     all    = Uniques[col,0]
134     before = bitsInColumn(all,col);
135     for(klass in N) {
136         probKlass = N[klass]/Instances
137         after    += probKlass * bitsInKlass(all,col,klass)
138     }
139     return before - after;
140 }
141 function bitsInKlass(all,col,klass,   i,one,f) {
142     for(i=1;i<=all;i++) {
143         one = Uniques[col,i];
144         f[i]= Count[klass,col,one];
145     }
146     return entropy(f,N[klass])
147 }
148 function bitsInColumn(all,col,      one,f,i) {
149     for(i=1;i<=all;i++) {
150         one = Uniques[col,i];
151         f[i]= Count[col,one];
152     };
153     return entropy(f,Instances)
154 }
155 
156 #### low-level stuff
157 function entropy(f,n,     out,i,p) {
158     for(i in f)
159         if (p=f[i]/n)
160             out +=  -1 * p * log2(p);
161     return out
162 }
163 function log2(n)          { return log(n)/log(2) }
164 function sd(sumSq,sumX,n) { return sqrt((sumSq-((sumX*sumX)/n))/(n-1)); }
165 function array(a)         { split("",a) }
166 
167 #### debugging stuff
168 function saya(str,a,   com,i,j) {
169     com="sort #" rand()
170     for(i in a) {
171         j=i;
172         gsub(SUBSEP,",",j)
173         print str "[" j "]=" a[i] | com;
174     }
175     close(com)
176 }