1 # /* vim: set filetype=awk : */ -*- awk -*- 2 ############################################################### 3 # gains.awK : scores attributes by the gain if we split on them 4 # (c) 2007 Tim Menzies tim@menzies.us 5 # 6 # This program is free software; you can redistribute it and/or 7 # modify it under the terms of the GNU General Public License 8 # as published by the Free Software Foundation; version 3. 9 # 10 # This program is distributed in the hope that it will be useful, 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 # GNU General Public License for more details. 14 # 15 # You should have received a copy of the GNU General Public License 16 # along with this program; if not, write to the Free Software 17 # Foundation, Inc.,51 Franklin Street, Fifth Floor, Boston, 18 # MA 02110-1301, USA. 19 ############################################################### 20 21 # url: http://unbox.org/wisp/branches/tims-our/minerc.lib/gains.awk 22 # usage: gawk -f gains.awk [Debug=1] [Klass=N] data.arff 23 24 ## warning: hastily written code! use with great care! 25 # has known bugs 26 # 1) if an attribute name has spaces, this won't work 27 # 2) the infogain calculation does not include gain ratios 28 # 3) the gain on numeric and symbolic columns are different 29 # units, and this code offers no suggestions on how 30 # to rank one over the other 31 # 4) on some data sets (autos.arff) i get this bug: 32 # 33 # gawk: gains.awk:(FILENAME=- FNR=333) warning: sqrt: called 34 # with negative argument -7.27596e-12 35 # gawk: gains.awk: (FILENAME=- FNR=333) warning: sqrt: called 36 # with negative argument -1.81899e-12 37 38 BEGIN { 39 #### command line options 40 Klass= -1 # Position of class attribute. if negative, 41 # counts in from the right. 42 # e.g "-2" means second most right column 43 # e.g "3" means third most left column 44 Debug = 0 # Set to one to get LOTS of internal info 45 46 #### internal stuff 47 OFS="," # Simplifies printing output 48 IGNORECASE=1 # make case comparison case insenstive 49 Relation="any" # Name of this relation 50 Attr=0 # Number of attributes 51 array(Name) # Name of attribute "i" 52 Instances # Number of instances 53 array(N) # N[k] is the number of class "k" instances 54 array(Num) # If Num[i] then attribute "i" is a numeric 55 56 # stuff to keep information from symbolic columns 57 array(Count) # Frequency counts of symbols in a column 58 array(Uniques) # Each column "c" has all=Uniques[c,0] symbols, 59 # store at positions 1<=i<=all Uniques[c,i] 60 61 # stuff to keep information from numeric columns 62 array(Sum) # Sum of the values in a column 63 array(SumSq) # Sum of the squres of the values in a column 64 } 65 66 #### generic stuff to read an arff file 67 # (should adapt to CSV real easy) 68 { sub(/#.*/,"") } 69 /^[ \t]*$/ { next } 70 /@relation/ { Relation=$2} 71 /@attribute/ { Attr++; Num[Attr]=0; Name[Attr]=$2;} 72 /@attribute/ && $3 ~ /numeric|real|integer/ { 73 Num[Attr]=1; } 74 /@data/ { In=1; FS=","; Klass= Klass>0 ? Klass : Attr+1+Klass} 75 /@/ { next; } 76 In { instance() } 77 END { debug(); report() } 78 79 #### worker functions 80 function instance( i) { 81 Instances++ 82 N[$Klass]++ 83 for(i=1;i<=Attr;i++) 84 if (i != Klass) 85 if ($i !~ /\?/) 86 Num[i] ? number(i,$i,$Klass) : symbol(i,$i,$Klass) 87 } 88 function report ( i) { 89 for(i=1;i<=Attr;i++) 90 if (i != Klass) 91 print (Num[i] ? "numeric" : "symbolic"), 92 Name[i], 93 sprintf("%5.5f",Num[i] ? sdGain(i) : infoGain(i)) 94 } 95 function debug() { 96 if (Debug) { 97 print Relation 98 print Attr 99 saya("N",N) 100 saya("Num",Num) 101 saya("Name",Name) 102 saya("Sum",Sum) 103 saya("SumSq",SumSq) 104 saya("Uniques",Uniques) 105 saya("Count",Count) 106 } 107 } 108 #### record keeping 109 function number(col,value,klass) { 110 Sum[col,klass] += value; 111 SumSq[col,klass] += value*value; 112 Sum[col] += value; 113 SumSq[col] += value*value; 114 } 115 function symbol(col,value,klass) { 116 if (++Count[col,value] == 1) 117 Uniques[col,++Uniques[col,0]]=value; 118 Count[klass,col,value]++; 119 } 120 121 #### asessing the gain on numeric columns 122 function sdGain(col, klass,before,after,probKlass) { 123 before = sd(SumSq[col],Sum[col],Instances); 124 for(klass in N) { 125 probKlass = N[klass]/Instances 126 after += probKlass * sd(SumSq[col,klass],Sum[col,klass],N[klass]) 127 } 128 return before - after 129 } 130 131 #### assessing the gain on symbolic columns 132 function infoGain(col, all,klass,before,after,probKlass) { 133 all = Uniques[col,0] 134 before = bitsInColumn(all,col); 135 for(klass in N) { 136 probKlass = N[klass]/Instances 137 after += probKlass * bitsInKlass(all,col,klass) 138 } 139 return before - after; 140 } 141 function bitsInKlass(all,col,klass, i,one,f) { 142 for(i=1;i<=all;i++) { 143 one = Uniques[col,i]; 144 f[i]= Count[klass,col,one]; 145 } 146 return entropy(f,N[klass]) 147 } 148 function bitsInColumn(all,col, one,f,i) { 149 for(i=1;i<=all;i++) { 150 one = Uniques[col,i]; 151 f[i]= Count[col,one]; 152 }; 153 return entropy(f,Instances) 154 } 155 156 #### low-level stuff 157 function entropy(f,n, out,i,p) { 158 for(i in f) 159 if (p=f[i]/n) 160 out += -1 * p * log2(p); 161 return out 162 } 163 function log2(n) { return log(n)/log(2) } 164 function sd(sumSq,sumX,n) { return sqrt((sumSq-((sumX*sumX)/n))/(n-1)); } 165 function array(a) { split("",a) } 166 167 #### debugging stuff 168 function saya(str,a, com,i,j) { 169 com="sort #" rand() 170 for(i in a) { 171 j=i; 172 gsub(SUBSEP,",",j) 173 print str "[" j "]=" a[i] | com; 174 } 175 close(com) 176 }