# /* vim: set filetype=awk : */ -*- awk -*-
###############################################################
# nbd.awk : naive bayes classifier for discrete data
# (c) 2007 Tim Menzies tim@menzies.us
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc.,51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
###############################################################

# url: http://unbox.org/wisp/branches/tims-our/minerc.lib/nbd.awk
# usage: gawk -f nbd.awk Pass=1 train.arff Pass=2 test.arff

## warning: hastily written code! use with great care!
# Set the default options and declare the global counters and tables before
# any input is read. "var=value" operands placed between the file names on the
# command line are applied after BEGIN has run, so they override these defaults.
BEGIN {
	#### command line options
	M     = 2       # M-estimate
	K     = 1       # Laplace estimate
	Klass = -1      # Position of class attribute. If negative,
	                #    counts in from the right.
	                #    e.g. "-2" means second most right column
	                #    e.g. "3" means third most left column
	Debug = 0       # Set to "1" to get a verbose dump of data
	Brief = 0       # Set to "1" to just get gotwant

	#### internal stuff
	OFS = ","           # Simplifies printing output
	IGNORECASE = 1      # Make case comparison case insensitive (gawk)
	Relation = "any"    # Name of this relation
	Attr = 0            # Number of attributes
	array(Name)         # Name of attribute "i"
	Instances = 0       # Number of instances
	Klasses = 0         # Number of distinct class values seen so far
	array(N)            # N[k] is the number of class "k" instances
	array(Num)          # If Num[i] then attribute "i" is a numeric

	# stuff to keep information from symbolic columns
	array(Count)        # Frequency counts of symbols in a column
	array(Uniques)      # Each column "c" has all=Uniques[c,0] symbols,
	                    #     stored at positions 1<=i<=all Uniques[c,i]
}

#### generic stuff to read an arff file
# (should adapt to CSV real easy)

# Strip comments: everything from "%" to the end of the line.
{ sub(/%.*/, "") }

# Nothing (left) on the line: skip it.
/^[ \t]*$/ { next }

# Header declarations are only acted on while training (Pass==1). The
# patterns are anchored to the start of the line so that a data row which
# merely contains "@" (e.g. an e-mail address) is not mistaken for a header
# line and silently dropped.
/^[ \t]*@relation/  && Pass == 1 { In = 0; Relation = $2 }
/^[ \t]*@attribute/ && Pass == 1 { Attr++; Name[Attr] = $2 }
/^[ \t]*@data/      && Pass == 1 {
	In = 1
	FS = ","       # takes effect from the next record, i.e. the data rows
	# A non-positive Klass counts in from the right-hand column.
	Klass = Klass > 0 ? Klass : Attr + 1 + Klass
}

# Any other header line (including every header line seen when Pass!=1)
# carries no data.
/^[ \t]*@/ { next }

# Data rows: learn from them in pass 1, label them in pass 2.
In && Pass == 1 { train() }
In && Pass == 2 { classify() }

END { debug() }

#### debug stuff
# When Debug is set, print the scalar counters followed by a sorted listing
# of the Uniques, Count and N tables. Does nothing otherwise.
function debug() {
	if (!Debug)
		return
	print "Attr=" Attr
	print "Klass=" Klass
	print "Klasses=" Klasses
	print "Instances=" Instances
	saya("Uniques", Uniques)
	saya("Count", Count)
	saya("N", N)
}
#### worker functions
# Label the current record with its most likely class.
# In Brief mode prints "predicted,actual"; otherwise prints the winning
# score (two decimals), the predicted class and then the whole record.
function classify(   winner, scores) {
	winner = likelihood(scores)
	if (Brief) {
		print winner, $Klass
		return
	}
	printf("%.2f" OFS "%s" OFS, scores[winner], winner)
	print $0
}
# Learn from the current record: bump the instance and per-class totals,
# then tally every non-class field whose value does not contain "?".
function train(   col) {
	Instances++
	if (++N[$Klass] == 1)
		Klasses++          # first time this class value has been seen
	for (col = 1; col <= Attr; col++) {
		if (col == Klass)
			continue       # the class column is not a feature
		if ($col ~ /\?/)
			continue       # a value containing "?" is skipped
		symbol(col, $col, $Klass)
	}
}
#### record keeping
# Record one more sighting of "value" in column "col" for class "klass".
function symbol(col, value, klass) {
	Count[klass, col, value] += 1
}
#### classify: score each class for the current record
# Score every known class against the current record and return the best one.
#   l       : array, filled on return with l[klass] = log-likelihood of klass
#   returns : the class with the highest score (on a tie, the class visited
#             last in the "for (klass in N)" traversal wins); the
#             uninitialised value if no class has been seen in training.
# The class prior is smoothed with K and each attribute term with M.
# Fields containing "?" and the class column itself are left out of the sum.
function likelihood(l,         klass,i,temp,prior,what,like,seen,hits) {
	seen = 0
	for (klass in N) {
		prior = (N[klass] + K) / (Instances + (K * Klasses))
		temp = log(prior)
		for (i = 1; i <= Attr; i++) {
			if (i == Klass || $i ~ /\?/)
				continue
			# Test membership first: merely referencing Count[...] would
			# create an empty element for every unseen combination and make
			# the table grow while classifying.
			hits = ((klass, i, $i) in Count) ? Count[klass, i, $i] : 0
			temp += log((hits + M * prior) / (N[klass] + M))
		}
		l[klass] = temp
		# Seed the running best from the first class instead of a magic
		# "very small" number, so a class is returned however low the scores.
		if (!seen || temp >= like) {
			like = temp
			what = klass
			seen = 1
		}
	}
	return what
}
#### dull utils
# Turn "a" into an empty array: splitting the empty string into it removes
# every existing element.
function array(a) {
	split("", a)
}
# Print every element of array "a" as  str[subscripts]=value , piped through
# sort. Multi-dimensional subscripts are shown comma-separated.
#   str : label printed in front of each element
#   a   : the array to dump
function saya(str,a,   cmd,key,shown) {
	# The "#<number>" tail only varies the command string; presumably it is
	# there so each call gets its own pipe -- TODO confirm it is still needed.
	cmd = "sort #" rand()
	for (key in a) {
		shown = key
		gsub(SUBSEP, ",", shown)   # make the subscript separator readable
		print str "[" shown "]=" a[key] | cmd
	}
	close(cmd)                     # lets sort finish and emit its output
}