# nbc.awk -- naive Bayes classifier; assumes discretized data.
#
# usage:
#   gawk -f nbc.awk Pass=1 train.csv Pass=2 test.csv
#
# gather accuracy stats (example):
#   gawk -f nbc.awk Pass=1 diabetesD.dvs Pass=2 diabetesD.dvs |
#   gawk -F, '{print $2==$NF}' | sort -n | uniq -c
#
# to find outliers (example):
#   gawk -f nbc.awk Classify=0 Pass=1 diabetesD.dvs Pass=2 diabetesD.dvs |
#   cut -d, -f 1 | bars -t 1 -w 0.00001
#
# to see the effects of outliers:
#   gawk -f nbc.awk Classify=0 Pass=1 diabetesD.dvs Pass=2 diabetesD.dvs |
#   gawk -F, '$2 != $NF {print $1,"\t 1"; next} {print $1,"\t 0"}'|
#   sort -t, -n +0 > /tmp/x.dat
#   gnuplot diabetesD.plt
#   epstopdf diabetesD.eps
#   xpdf diabetesD.pdf

BEGIN {
    OFS = FS = ","
    Total = 0                       # count of all training instances
    # Classes[c]    : frequency of each class c (class = last field)
    # Freq[c,i,v]   : count of value v in attribute i within class c
    # Seen[i,v]     : count of value v in attribute i, over all classes
    # Attributes[i] : number of distinct values seen for attribute i
    Ee  = 848456353 / 312129649     # an approximation to "e" (currently unused)
    All = "__all"                   # pseudo-class: counts pooled over all classes
}

# Strip trailing comments and all whitespace from every record.
{ sub(/#.*/, ""); gsub(/[ \t]*/, "") }

# Skip blank lines and each file's header line.
/^$/     { next }
FNR == 1 { next }

Pass == 1 { train() }
Pass == 2 { Best = likelihood(LS)
            printf("%.30f%s\n", LS[Best], OFS Best OFS $0) }

# Update the frequency tables from the current (training) record.
# The class label is the last field; "?" marks a missing value.
function train(    i, c) {
    Total++
    c = $NF
    Classes[c]++
    for (i = 1; i <= NF; i++) {
        if ($i == "?") continue
        Freq[c, i, $i]++
        Freq[All, i, $i]++
        if (++Seen[i, $i] == 1) Attributes[i]++
    }
}

# Score the current (test) record against every class.
# Fills ls[c] with the score of class c and returns the best class.
# m, k are the usual m-estimate / Laplace smoothing constants.
function likelihood(ls,    best, tmp, i, a, b, priora, priorb, what, c, m, k, cs) {
    m  = 2; k = 1
    cs = Attributes[NF]             # number of classes
    best = -10000000000             # smaller than any log
    for (c in Classes) {
        # Smoothed priors for "c" and for "not c".
        priora = (Classes[c] + k)           / (Total + (k * cs))
        priorb = ((Total - Classes[c]) + k) / (Total + (k * cs))
        a = priora
        b = priorb
        # NOTE(review): the original inner loop was garbled in the source
        # (everything between "<" and ">" was stripped, leaving the invalid
        # "for(i=1;i= best )").  Reconstructed below as the standard
        # m-estimate naive Bayes contrast of "c" versus "not c", using the
        # pooled Freq[All,...] counts the original maintains -- confirm
        # against the upstream nbc.awk before relying on exact scores.
        for (i = 1; i < NF; i++) {
            if ($i == "?") continue
            a *= (Freq[c, i, $i] + m * priora) / (Classes[c] + m)
            b *= ((Freq[All, i, $i] - Freq[c, i, $i]) + m * priorb) / \
                 ((Total - Classes[c]) + m)
        }
        tmp = a / b                 # odds of class c versus not-c
        ls[c] = tmp
        if (tmp >= best) { best = tmp; what = c }
    }
    return what
}