# /* vim: set filetype=awk : */ -*- awk -*-
###############################################################
# nbd.awk : naive bayes classifier for discrete data
# (c) 2007 Tim Menzies tim@menzies.us
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
###############################################################

# url:   http://unbox.org/wisp/branches/tims-our/minerc.lib/nbd.awk
# usage: gawk -f nbd.awk Pass=1 train.arff Pass=2 test.arff
#
# Pass=1 records are used for training; Pass=2 records are classified.
# Output per classified record:
#   Brief=0 : <log score, 2 decimals>,<predicted class>,<the input record>
#   Brief=1 : <predicted class>,<actual class>
#
## warning: hastily written code! use with great care!

BEGIN {
    #### command line options (override with name=value on the command line)
    M     = 2    # M-estimate
    K     = 1    # Laplace estimate
    Klass = -1   # Position of class attribute. If negative,
                 # counts in from the right.
                 #   e.g. "-2" means second most right column
                 #   e.g. "3"  means third most left column
    Debug = 0    # Set to "1" to get a verbose dump of data
    Brief = 0    # Set to "1" to just get got,want

    #### internal stuff
    OFS        = ","    # Simplifies printing output
    IGNORECASE = 1      # Make case comparison case insensitive (gawk)
    Relation   = "any"  # Name of this relation
    Attr       = 0      # Number of attributes
    Instances  = 0      # Number of training instances
    Klasses    = 0      # Number of distinct class symbols seen in training
    array(Name)         # Name[i] is the name of attribute "i"
    array(N)            # N[k] is the number of class "k" instances
    array(Num)          # If Num[i] then attribute "i" is numeric
                        # (declared but not used by this script)

    # stuff to keep information from symbolic columns
    array(Count)        # Count[k,c,v]: how often value "v" was seen in
                        # column "c" for class "k"
    array(Uniques)      # Each column "c" has all=Uniques[c,0] symbols,
                        # stored at positions 1<=i<=all Uniques[c,i]
                        # (declared but not used by this script)
}

#### generic stuff to read an arff file
# (should adapt to CSV real easy)

# Strip ARFF comments: everything from the first "%" to end of line.
{ sub(/%.*/, "") }

# Skip lines that are blank (possibly only after comment stripping).
/^[ \t]*$/ { next }

# Header directives are matched only at the start of the line, so a data
# row that merely contains "@" (e.g. an e-mail address) is not mistaken
# for a header and silently dropped.
/^[ \t]*@relation/ && Pass == 1 {
    In = 0
    Relation = $2
}

/^[ \t]*@attribute/ && Pass == 1 {
    Attr++
    Name[Attr] = $2
}

/^[ \t]*@data/ && Pass == 1 {
    In = 1
    FS = ","   # data rows are comma separated; applies from the next record
    # Resolve a negative class position now that the column count is known.
    Klass = (Klass > 0) ? Klass : Attr + 1 + Klass
}

# Any other directive line (and all headers while Pass != 1) is ignored.
/^[ \t]*@/ { next }

In && Pass == 1 { train() }
In && Pass == 2 { classify() }

END { debug() }

#### debug stuff

# Dump the learned state to stdout when Debug is set; otherwise do nothing.
function debug() {
    if (!Debug)
        return
    print "Attr=" Attr
    print "Klass=" Klass
    print "Klasses=" Klasses
    print "Instances=" Instances
    saya("Uniques", Uniques)
    saya("Count", Count)
    saya("N", N)
}

#### worker functions

# Classify the current record and print the result.
#   best  : (local) predicted class returned by likelihood()
#   score : (local) array filled by likelihood(); score[k] is class k's log score
function classify(    best, score) {
    best = likelihood(score)
    if (Brief) {
        print best, $Klass
    } else {
        # OFS is passed as an argument rather than spliced into the format
        # string, so a separator containing "%" cannot corrupt the output.
        printf("%.2f%s%s%s", score[best], OFS, best, OFS)
        print $0
    }
}

# Update the frequency tables from the current (training) record.
# Fields containing "?" (missing values) are not counted.
function train(    i) {
    Instances++
    if (++N[$Klass] == 1)
        Klasses++            # first time this class symbol has been seen
    for (i = 1; i <= Attr; i++) {
        if (i == Klass)
            continue
        if ($i ~ /\?/)
            continue
        symbol(i, $i, $Klass)
    }
}

#### record keeping

# Count one occurrence of "value" in column "col" for class "klass".
function symbol(col, value, klass) {
    Count[klass, col, value]++
}

#### scoring

# Score the current record against every class seen in training.
#   l       : array (output); l[k] receives the log score of class k
#   returns : the class with the highest log score. On ties the class visited
#             later in the "for (klass in N)" order wins. Returns "" when no
#             class has been trained.
# For each class:
#   prior = (N[k] + K) / (Instances + K * Klasses)
#   score = log(prior) + sum over non-class, non-missing attributes i of
#           log((Count[k,i,$i] + M * prior) / (N[k] + M))
function likelihood(l,    klass, i, inc, temp, prior, what, like, seen, started) {
    started = 0
    for (klass in N) {
        prior = (N[klass] + K) / (Instances + (K * Klasses))
        temp  = log(prior)
        for (i = 1; i <= Attr; i++) {
            if (i == Klass)
                continue
            if ($i ~ /\?/)
                continue
            # Use a membership test: merely referencing Count[...] would
            # create an empty element for every unseen combination.
            seen = ((klass, i, $i) in Count) ? Count[klass, i, $i] : 0
            temp += log((seen + M * prior) / (N[klass] + M))
        }
        l[klass] = temp
        # The first class seeds the running best, so no magic "smaller than
        # any log" sentinel is needed.
        if (!started || temp >= like) {
            like    = temp
            what    = klass
            started = 1
        }
    }
    return what
}

#### dull utils

# Make "a" an empty array.
function array(a) {
    split("", a)
}

# Print every element of array "a" as  str[sub1,sub2,...]=value , sorted by
# piping the lines through the external "sort" command.
function saya(str, a,    com, i, j) {
    # The "#" makes the shell ignore the appended random number; it only
    # serves to give each call its own command string.
    com = "sort #" rand()
    for (i in a) {
        j = i
        gsub(SUBSEP, ",", j)
        print str "[" j "]=" a[i] | com
    }
    close(com)
}