# /* vim: set filetype=awk : */ -*- awk -*- ############################################################### # gains.awK : scores attributes by the gain if we split on them # (c) 2007 Tim Menzies tim@menzies.us # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; version 3. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc.,51 Franklin Street, Fifth Floor, Boston, # MA 02110-1301, USA. ############################################################### # url: http://unbox.org/wisp/branches/tims-our/minerc.lib/gains.awk # usage: gawk -f gains.awk [Debug=1] [Klass=N] data.arff ## warning: hastily written code! use with great care! # has known bugs # 1) if an attribute name has spaces, this won't work # 2) the infogain calculation does not include gain ratios # 3) the gain on numeric and symbolic columns are different # units, and this code offers no suggestions on how # to rank one over the other # 4) on some data sets (autos.arff) i get this bug: # # gawk: gains.awk:(FILENAME=- FNR=333) warning: sqrt: called # with negative argument -7.27596e-12 # gawk: gains.awk: (FILENAME=- FNR=333) warning: sqrt: called # with negative argument -1.81899e-12 BEGIN { #### command line options Klass= -1 # Position of class attribute. if negative, # counts in from the right. # e.g "-2" means second most right column # e.g "3" means third most left column Debug = 0 # Set to one to get LOTS of internal info #### internal stuff OFS="," # Simplifies printing output IGNORECASE=1 # make case comparison case insenstive Relation="any" # Name of this relation Attr=0 # Number of attributes array(Name) # Name of attribute "i" Instances # Number of instances array(N) # N[k] is the number of class "k" instances array(Num) # If Num[i] then attribute "i" is a numeric # stuff to keep information from symbolic columns array(Count) # Frequency counts of symbols in a column array(Uniques) # Each column "c" has all=Uniques[c,0] symbols, # store at positions 1<=i<=all Uniques[c,i] # stuff to keep information from numeric columns array(Sum) # Sum of the values in a column array(SumSq) # Sum of the squres of the values in a column } #### generic stuff to read an arff file # (should adapt to CSV real easy) { sub(/%.*/,"") } /^[ \t]*$/ { next } /@relation/ { Relation=$2} /@attribute/ { Attr++; Num[Attr]=0; Name[Attr]=$2;} /@attribute/ && $3 ~ /numeric|real|integer/ { Num[Attr]=1; } /@data/ { In=1; FS=","; Klass= Klass>0 ? Klass : Attr+1+Klass} /@/ { next; } In { instance() } END { debug(); report() } #### worker functions function instance( i) { Instances++ N[$Klass]++ for(i=1;i<=Attr;i++) if (i != Klass) if ($i !~ /\?/) Num[i] ? number(i,$i,$Klass) : symbol(i,$i,$Klass) } function report ( i) { for(i=1;i<=Attr;i++) if (i != Klass) print (Num[i] ? "numeric" : "symbolic"), Name[i], sprintf("%5.5f",Num[i] ? sdGain(i) : infoGain(i)) } function debug() { if (Debug) { print Relation print Attr saya("N",N) saya("Num",Num) saya("Name",Name) saya("Sum",Sum) saya("SumSq",SumSq) saya("Uniques",Uniques) saya("Count",Count) } } #### record keeping function number(col,value,klass) { Sum[col,klass] += value; SumSq[col,klass] += value*value; Sum[col] += value; SumSq[col] += value*value; } function symbol(col,value,klass) { if (++Count[col,value] == 1) Uniques[col,++Uniques[col,0]]=value; Count[klass,col,value]++; } #### asessing the gain on numeric columns function sdGain(col, klass,before,after,probKlass) { before = sd(SumSq[col],Sum[col],Instances); for(klass in N) { probKlass = N[klass]/Instances after += probKlass * sd(SumSq[col,klass],Sum[col,klass],N[klass]) } return before - after } #### assessing the gain on symbolic columns function infoGain(col, all,klass,before,after,probKlass) { all = Uniques[col,0] before = bitsInColumn(all,col); for(klass in N) { probKlass = N[klass]/Instances after += probKlass * bitsInKlass(all,col,klass) } return before - after; } function bitsInKlass(all,col,klass, i,one,f) { for(i=1;i<=all;i++) { one = Uniques[col,i]; f[i]= Count[klass,col,one]; } return entropy(f,N[klass]) } function bitsInColumn(all,col, one,f,i) { for(i=1;i<=all;i++) { one = Uniques[col,i]; f[i]= Count[col,one]; }; return entropy(f,Instances) } #### low-level stuff function entropy(f,n, out,i,p) { for(i in f) if (p=f[i]/n) out += -1 * p * log2(p); return out } function log2(n) { return log(n)/log(2) } function sd(sumSq,sumX,n) { return sqrt((sumSq-((sumX*sumX)/n))/(n-1)); } function array(a) { split("",a) } #### debugging stuff function saya(str,a, com,i,j) { com="sort #" rand() for(i in a) { j=i; gsub(SUBSEP,",",j) print str "[" j "]=" a[i] | com; } close(com) }