# nbc.awk : Naive Bayes classifier with an m-estimate correction.
# Assumes discretized data.
#
# usage:
#   gawk -f nbc.awk Pass=1 train.csv Pass=2 test.csv
#
# gather accuracy stats (example):
#   gawk -f nbc.awk Pass=1 diabetesD.dvs Pass=2 diabetesD.dvs |
#   gawk -F, '{print $2==$NF}' | sort -n | uniq -c
#
# to find outliers (example):
#   gawk -f nbc.awk Classify=0 Pass=1 diabetesD.dvs Pass=2 diabetesD.dvs |
#   cut -d, -f 1 | bars -t 1 -w 0.00001
#
# to see the effects of outliers:
#   gawk -f nbc.awk Classify=0 Pass=1 diabetesD.dvs Pass=2 diabetesD.dvs |
#   gawk -F, '$2 != $NF {print $1,"\t 1"; next} {print $1,"\t 0"}' |
#   sort -t, -n +0 > /tmp/x.dat
#   gnuplot diabetesD.plt
#   epstopdf diabetesD.eps
#   xpdf diabetesD.pdf

BEGIN { OFS = FS = ","
        Total = 0      # count of all instances seen in Pass 1
        # Classes      # table: class name -> frequency
        # Freq         # table: counters for values in attributes in classes
        # Seen         # table: counters for values in attributes
        # Attributes   # table: number of distinct values per attribute
        Ee = 848456353 / 312129649;  # an approximation to "e"
        Classify = 1   # if false (Classify=0), run as an anomaly detector:
                       # every training row gets class "ALL", so Pass 2
                       # reports likelihood of membership in the whole data
}

# strip comments and white space from every record
{ sub(/#.*/, ""); gsub(/[ \t]*/, "") }

# skip blank lines and each file's header line
/^$/   { next }
FNR==1 { next }

# Pass 1: training. Pass 2: classification — print the (scaled) likelihood
# of the best class, the best class, then the original record.
Pass==1 { $NF = Classify ? $NF : "ALL"; train() }
Pass==2 { Best = likelihood(L)
          printf("%.30f%s\n", Ee^L[Best], OFS Best OFS $0) }

# train: update the frequency tables from the current record.
# The last field ($NF) is the class; "?" marks a missing value.
function train(    i,c) {
  Total++
  c = $NF
  Classes[c]++
  for(i=1; i<=NF; i++) {
    if ($i == "?") continue
    Freq[c,i,$i]++
    # first time this attribute/value pair is seen: one more distinct
    # value for attribute i
    if (++Seen[i,$i] == 1) Attributes[i]++
  }
}

# likelihood: fill l[] with the log-likelihood of each class for the
# current record and return the most likely class.
# Uses an m-estimate (m=2) for conditional probabilities and a Laplace-style
# correction (k=1) for the priors; works in log space to stop numeric errors.
function likelihood(l,    i,temp,prior,what,like,c,m,k,cs) {
  m = 2; k = 1
  cs = Attributes[NF]      # number of classes
  like = -10000000000      # smaller than any log-likelihood
  for(c in Classes) {
    prior = (Classes[c] + k) / (Total + k*cs)
    temp  = log(prior)     # logs stop numeric underflow of tiny products
    for(i=1; i<NF; i++) {  # all attributes except the class column
      if ($i == "?") continue
      temp += log((Freq[c,i,$i] + m*prior) / (Classes[c] + m))
    }
    l[c] = temp
    if (temp >= like) { like = temp; what = c }
  }
  return what
}