#!/usr/bin/gawk -f #/* vim: set filteype=awk : */ -*- awk -*- #Design for RBST-based Naive Bayes Classifier #DJ Boland #Changelog #05/16/07 - Initial Implementation #------------------------------------------------------------------------------ BEGIN{ FS=OFS=" *, *"; #Fields seperated by a comma and its preceeding # and trailing spaces. SUBSEP="#" #An underscore will be used as the default seperator IGNORECASE=1; #Set interpreter to be case-insensitive Val="val"; #Substitute the string "val" where Val used Left="left"; #Substitute the string "left" where Left used Right="right"; #Substitute the string "right" where Right used Here="here"; #Substitute the string "here" where Here used All="all"; #Substitute the string "all" where All used Min="min"; #Substitute the string "min" where Min used Max="max"; #Substitute teh string "max" where Max used srand(); #Randomize the seed used to generate random numbers BuildTree=0; #By default, don't add records to the trees #0 - don't add; 1 - add. CurrentRecord=0; #Record currently being accessed Pass=0; #Contains the current pass being made through data LastLine=""; #Contains last line seen from the file being read MaxSize=15; #The maximum size to allow the tree to grow to ReqRecs=7; #The minimum number of records that must be seen #between garbage collections Csize=0; #Size of the tree currently being manipulate Mode=0; Total=0; Count="count"; # Special #If special equals 1, print Trees on run finish } # gsub("'", "", $0) Mode==1{ if(FNR==1) { # print "Reading File Info"; getinfo(); next; } # print "Training"; sub(/\%.*/,"") #Substitute comments with blanks gsub(/[\'\"]/,"",$0) #Remove any single or double quotes from this line gsub("'", "", $0) train(); } Mode==2{ if(FNR==1) { checkinfo(); for (t in Trees) { if(CurrentSize[t] > MaxSize) { CurrentSize[t]=garbageCollect(CurrentRoot[t], Forest, MaxSize) NewRecords[t]=0; } } next; } sub(/\%.*/,"") #Substitute comments with blanks gsub(/[\'\"]/,"",$0) #Remove any single or double quotes from this line gsub("'", "", $0) if (NF == NumAttrs) { guess = classify(); print "Actual: " $NumAttrs "| Predicted: " guess; if($NumAttrs == guess) { Correct++ } } } END{ print "Accuracy: " (Correct/(Total+.00000001))*100 "%"; if(Special==1) { for (I in Trees) { if(Trees[I]==1) { print "" allvals = "" for(J in Classes) { if(allvals==""){allvals=J} else{allvals = allvals " " J} } print "Classes are: [" allvals "]" printtree(CurrentRoot[I], Forest,"","", Classes) print "" } } } } function getinfo( t) { #print "Getting File Info" #Process data file if it matches the appropriate type if(FILENAME~/.arff$/) { FS=" "; t=1; #Skip everything before the @relation symbol while (($0!~/^ *@r/) && ($0!~/^ *@R/)) getline; #Process data file until the @data symbol while (($0!~/^ *@d/) && ($0!~/^ *@D/)) { gsub(/[\'\"]/,"",$0) #Remove any single or double quotes from this line gsub("'", "", $0) #If the current line contains an attribute, store its information if (($0~/^ *@a/) || ($0~/^ *@A/)) { Names[t]=$2; Trees[t]=1; t++ lastline=$0; } getline; } NumAttrs=t-1; Trees[NumAttrs]=0; for(t=1; t= like) { like = score; what = c; } } return what; } function addrecord( croot, csize, returnval, vals) { for(t in Trees) { if(Trees[t]==1) { field=$t; # print "t is: " t " and field is: " field csize=CurrentSize[t]; croot=CurrentRoot[t]; # print "Current Size is: " csize " and Current Root is: " croot; if(field~/?/) { continue; } else { # print "inserting: " field " from starting root: " croot returnval=rbstinsert(croot, Forest, field, Names[t], CurrentRecord, $NF, Classes, csize); split(returnval, retvals, SUBSEP); CurrentRoot[t]=retvals[1] SUBSEP retvals[2]; CurrentSize[t]=retvals[3]; NewRecords[t]++; if((CurrentSize[t] > MaxSize) && (NewRecords[t]>=ReqRecs)) { CurrentSize[t]=garbageCollect(CurrentRoot[t], Forest, MaxSize) # print CurrentSize[t]; NewRecords[t]=0; } } } } CurrentRecord++; } #****************************************************************************** #bst.awk #DJ's Binary Search Tree Functions #Insert a a new node at the root of the tree function insertR(RootInstance, Tree, Value, Attr, Rec, Class, Classes, CurrentSize) { if (RootInstance == "?") { # print "inserting a new node" return newtree(Attr SUBSEP Rec, Tree, Value, Class, Classes, CurrentSize) } Tree[RootInstance,All]++ Tree[RootInstance,"A" SUBSEP Class]++ if (Value == Tree[RootInstance,Val]) { # print "inserting at an existing node" Tree[RootInstance,Here]++ Tree[RootInstance,"h" SUBSEP Class]++ return RootInstance SUBSEP CurrentSize } if (Value < Tree[RootInstance,Val]) { # print "inserting on left with a new root" returnvals=insertR(Tree[RootInstance,Left], Tree, Value, Attr, Rec, Class, Classes, CurrentSize) split(returnvals, vals, SUBSEP) Tree[RootInstance,Left]=vals[1] SUBSEP vals[2] CurrentSize=vals[3] RootInstance = rotateR(RootInstance,Tree,Classes) } else { # print "inserting on right with a new root, where " Value " is greater than "Tree[RootInstance,Val]; returnvals=insertR(Tree[RootInstance,Right], Tree, Value, Attr, Rec,Class, Classes, CurrentSize) split(returnvals,vals, SUBSEP) Tree[RootInstance,Right]=vals[1] SUBSEP vals[2] CurrentSize=vals[3] RootInstance = rotateL(RootInstance,Tree,Classes) } return RootInstance SUBSEP CurrentSize } #Rotate the current root's left subchild to right to make it become the current root function rotateR(RootInstance, Tree, Classes, temp) { temp = Tree[RootInstance,Left] Tree[RootInstance,Left] = Tree[temp,Right] Tree[temp,Right] = RootInstance Tree[temp,All] = Tree[RootInstance,All] for(M in Classes) { Tree[temp, "A" SUBSEP Classes[M]] = Tree[RootInstance, "A" SUBSEP Classes[M]] } if((Tree[RootInstance,Left] != "?") && (Tree[RootInstance,Right] != "?")) { Tree[RootInstance,All]=Tree[Tree[RootInstance,Left],All]+Tree[Tree[RootInstance,Right],All]+Tree[RootInstance,Here] for(M in Classes) { Tree[RootInstance,"A" SUBSEP Classes[M]]=Tree[Tree[RootInstance,Left],"A" SUBSEP Classes[M]]+Tree[Tree[RootInstance,Right],"A" SUBSEP Classes[M]]+Tree[RootInstance,"h" SUBSEP Classes[M]] } } else { if((Tree[RootInstance,Left] != "?")) { Tree[RootInstance,All] = Tree[RootInstance,Here]+Tree[Tree[RootInstance,Left],All] for(M in Classes) { Tree[RootInstance, "A" SUBSEP Classes[M]] = Tree[RootInstance,"h" SUBSEP Classes[M]]+Tree[Tree[RootInstance,Left],"A" SUBSEP Classes[M]] } } else { if((Tree[RootInstance,Right] != "?")) { Tree[RootInstance,All] = Tree[RootInstance,Here]+Tree[Tree[RootInstance,Right],All] for(M in Classes) { Tree[RootInstance,"A" SUBSEP Classes[M]]= Tree[RootInstance,"h" SUBSEP Classes[M]]+Tree[Tree[RootInstance,Right],"A" SUBSEP Classes[M]] } } else { Tree[RootInstance,All] = Tree[RootInstance,Here] for(M in Classes) { Tree[RootInstance,"A" SUBSEP Classes[M]]=Tree[RootInstance,"h" SUBSEP Classes[M]] } } } } return temp } #Rotate the current roots right subchild up to the root (by rotating it to the left) function rotateL(RootInstance, Tree, Classes, temp) { temp = Tree[RootInstance,Right] Tree[RootInstance,Right] = Tree[temp,Left] Tree[temp,Left] = RootInstance Tree[temp,All] = Tree[RootInstance,All] for(M in Classes) { Tree[temp,"A" SUBSEP Classes[M]]=Tree[RootInstance,"A" SUBSEP Classes[M]] } if((Tree[RootInstance,Left] != "?") && (Tree[RootInstance,Right] != "?")) { Tree[RootInstance,All]=Tree[Tree[RootInstance,Left],All]+Tree[Tree[RootInstance,Right],All]+Tree[RootInstance,Here] for(M in Classes) { Tree[RootInstance,"A" SUBSEP Classes[M]] = Tree[Tree[RootInstance,Left],"A" SUBSEP Classes[M]]+Tree[Tree[RootInstance,Right],"A" SUBSEP Classes[M]]+Tree[RootInstance,"h" SUBSEP Classes[M]] } } else { if((Tree[RootInstance,Left] != "?")) { Tree[RootInstance,All] = Tree[RootInstance,Here]+Tree[Tree[RootInstance,Left],All] for(M in Classes) { Tree[RootInstance,"A" SUBSEP Classes[M]]=Tree[RootInstance, "h" SUBSEP Classes[M]] + Tree[Tree[RootInstance,Left], "A" SUBSEP Classes[M]] } } else { if((Tree[RootInstance,Right] != "?")) { Tree[RootInstance,All] = Tree[RootInstance,Here]+Tree[Tree[RootInstance,Right],All] for(M in Classes) { Tree[RootInstance,"A" SUBSEP Classes[M]]=Tree[RootInstance, "h" SUBSEP Classes[M]] + Tree[Tree[RootInstance,Right],"A" SUBSEP Classes[M]] } } else { Tree[RootInstance,All] = Tree[RootInstance,Here] for(M in Classes) { Tree[RootInstance, "A" SUBSEP Classes[M]] = Tree[RootInstance, "h" SUBSEP Classes[M]] } } } } return temp } #Function to insert a new node into the tree function newtree(RootInstance, Tree, Value, Class, Classes, CurrentSize) { # print Value " is being inserted as a new node"; Tree[RootInstance,Left]="?"; Tree[RootInstance,Right]="?"; Tree[RootInstance,Val]=Value; Tree[RootInstance,All]=1; Tree[RootInstance,Here]=1; #Initialize the here and All class counts for each class for(M in Classes) { Tree[RootInstance,"h" SUBSEP Classes[M]]=0; Tree[RootInstance,"A" SUBSEP Classes[M]]=0; } # print "New Node has value: " Tree[RootInstance,Val] " right child " Tree[RootInstance,Right] " and left child " Tree[RootInstance,Left]; Tree[RootInstance,"A" SUBSEP Class]=1 Tree[RootInstance,"h" SUBSEP Class]=1 CurrentSize++ return RootInstance SUBSEP CurrentSize } #Function to print the current tree function printtree(RootInstance, Tree, b4, indent, Classes, allclasses, treeclasses) { allclasses="" treeclasses="" for(M in Classes) { if(allclasses==""){allclasses=Tree[RootInstance,"h" SUBSEP Classes[M]]} else allclasses=allclasses " " Tree[RootInstance,"h" SUBSEP Classes[M]] if(treeclasses==""){treeclasses=Tree[RootInstance,"A" SUBSEP Classes[M]]} else treeclasses=treeclasses " " Tree[RootInstance,"A" SUBSEP Classes[M]] } print indent b4 RootInstance " = " Tree[RootInstance,Val] " [here=" Tree[RootInstance,Here] "] [all=" Tree[RootInstance,All] "]" print indent b4 RootInstance " Classes here are: [" allclasses "], all: [" treeclasses "]" if(Tree[RootInstance,Left] != "?") printtree(Tree[RootInstance,Left], Tree, "< ", indent "| ",Classes,x) if(Tree[RootInstance,Right] != "?") printtree(Tree[RootInstance,Right], Tree, "> ", indent "| ",Classes,x) } #Function for inserting a value into the tree. If the value is not at the root, #this function will, with a 1/n chance, insert the node at the current root. function rbstinsert(RootInstance, Tree, Value, Attr, Rec, Class, Classes, CurrentSize) { if (RootInstance == "?") { # print "Inserting a new node" return newtree(Attr SUBSEP Rec, Tree, Value, Class, Classes, CurrentSize) } if (Value == Tree[RootInstance,Val]) { # print "Inserting in an existing root node"; Tree[RootInstance,Here]++ Tree[RootInstance,All]++ Tree[RootInstance, "A" SUBSEP Class]++ Tree[RootInstance,"h" SUBSEP Class]++ return RootInstance SUBSEP CurrentSize } else { if ((rand()*Tree[RootInstance,All]) < 1.0) { # print "inserting a root node" return insertR(RootInstance,Tree,Value, Attr, Rec, Class, Classes, CurrentSize) } if (Value < Tree[RootInstance,Val]) { # print "inserting at a lower level on left" returnvals=rbstinsert(Tree[RootInstance,Left], Tree, Value, Attr, Rec, Class, Classes, CurrentSize) split(returnvals, vals, SUBSEP) Tree[RootInstance,Left]=vals[1] SUBSEP vals[2] CurrentSize=vals[3] } else { # print "inserting at a lower level on right" returnvals=rbstinsert(Tree[RootInstance,Right], Tree, Value, Attr, Rec, Class, Classes, CurrentSize) split(returnvals, vals, SUBSEP) Tree[RootInstance,Right]=vals[1] SUBSEP vals[2] CurrentSize=vals[3] } Tree[RootInstance,All]++ Tree[RootInstance,"A" SUBSEP Class]++ return RootInstance SUBSEP CurrentSize } } #Function that determines which tree node to substitute for a given value #For a tree node to be chosen as a viable substitute, it must have at least #minSize nodes at in itself and/or in its children. function subwith(minSize, Value, Tree, RootInstance) { if(((Value == Tree[RootInstance,Val]) || (Value ~ Tree[RootInstance,Val])) && (Tree[RootInstance,All] >= minSize)) { return RootInstance } if(Value < Tree[RootInstance,Val]) { if (Tree[RootInstance,Left]!= "?") { if(Tree[Tree[RootInstance,Left],All] >= minSize) { return subwith(minSize, Value, Tree, Tree[RootInstance,Left]); } else return RootInstance } else return RootInstance } if (Value > Tree[RootInstance,Val]) { if (Tree[RootInstance,Right] != "?") { if(Tree[Tree[RootInstance,Right],All] >= minSize) { return subwith(minSize, Value, Tree, Tree[RootInstance,Right]) } else return RootInstance } else return RootInstance } else { if(int(2*rand()) > 1) { if (Tree[RootInstance,Right] != "?") { if(Tree[Tree[RootInstance,Right],All] >= minSize) { return subwith(minSize, Value, Tree, Tree[RootInstance,Right]) } else return RootInstance } else return RootInstance } else { if (Tree[RootInstance,Left]!= "?") { if(Tree[Tree[RootInstance,Left],All] >= minSize) { return subwith(minSize,Value, Tree, Tree[RootInstance,Left]) } else return RootInstance } else return RootInstance } } } #Function to obtain the lists of discretized classes #To be a part of this list, the given node must #be of at least minSize function discClasses(Tree, RootInstance, minSize, myClasses) { if(Tree[RootInstance,Left] != "?") { myClasses=discClasses(Tree, Tree[RootInstance,Left], minSize, myClasses) } if(Tree[RootInstance,All] >= minSize) { if(myClasses != "") { myClasses = myClasses " , " RootInstance } else { myClasses = RootInstance } } if(Tree[RootInstance,Right] != "?") { myClasses=discClasses(Tree, Tree[RootInstance,Right], minSize, myClasses) } return myClasses } #Function to prune tree to an appropriate size/keep it from #exceeding memory availability function garbageCollect(RootInstance, Tree, MaxSize, low, high, size) { #Set up a set to track the tree nodes that have been #"touched# by the breadth first search size=0; high=1; low=0; NodeSet[0]=RootInstance #Function uses a breadth-first approach to span the tree, #adding nodes to the set left to right across each level #and then moving down to the following one. When the max #size of the tree is reached, the nodes remaining in the #set are removed, along with any of their children that #were not discovered. while (high != low) { if (Tree[NodeSet[low],Left] != "?") { NodeSet[high]= Tree[NodeSet[low],Left]; high++; } if (Tree[NodeSet[low],Right] != "?") { NodeSet[high]= Tree[NodeSet[low],Right]; high++; } size++; low++; if (size == MaxSize) { high = high-1; while(low <= high) { removeVal(RootInstance, Tree, pruneNode(NodeSet[high], Tree)) high = high - 1; } } } return size; } #Function to remove a given node and any of #its children from the tree function pruneNode(RootInstance, Tree) { #If this node has a left child, remove that #left child before this node if(Tree[RootInstance,Left] != "?") { pruneNode(Tree[RootInstance,Left], Tree) } #If this node has a right child, remove that #right child before this node if(Tree[RootInstance,Right] != "?") { pruneNode(Tree[RootInstance,Right],Tree) } #Set all relevant values of this tree to zero #and disconnect the childrens nodes Tree[RootInstance,Left]="?" Tree[RootInstance,Right]="?" Tree[RootInstance,All]=0 Tree[RootInstance,Here]=0 return Tree[RootInstance,Val] } #Function to remove a node from the tree based on its value #Starting root node will not be removed using this method function removeVal(RootInstance, Tree, myVal) { if(Tree[RootInstance,Val] > myVal) { if(Tree[Tree[RootInstance, Left], Val] == myVal) { Tree[RootInstance, Left]="?" } else { removeVal(Tree[RootInstance,Left], Tree, myVal) } } else { if(Tree[Tree[RootInstance, Right], Val] == myVal) { Tree[RootInstance, Right]="?" } else { removeVal(Tree[RootInstance, Right], Tree, myVal) } } } function getclasscount(Class) { # print "Root is: " CurrentRoot[1]; # print "Value at that root is: " Forest[CurrentRoot[1], Val]; return Forest[CurrentRoot[1], "A" SUBSEP Class]; } function getcount(minSize, Value, Tree, RootInstance, Class) { if(((Value == Tree[RootInstance,Val]) || (Value ~ Tree[RootInstance,Val])) && (Tree[RootInstance,All] >= minSize)) { return Tree[RootInstance, "h" SUBSEP Class]; # else return (Tree[Tree[RootInstance,Left],"A" SUBSEP Class]); # else return (Tree[Tree[RootInstance,Left],"A" SUBSEP Class]+Tree[RootInstance, "h" SUBSEP Class]); # else return (Tree[RootInstance, "A" SUBSEP Class]); } if(Value < Tree[RootInstance,Val]) { if (Tree[RootInstance,Left]!= "?") { if(Tree[Tree[RootInstance,Left],All] >= minSize) { return getcount(minSize,Value, Tree, Tree[RootInstance,Left], Class) } # else return (Tree[RootInstance, "h" SUBSEP Class]); # else return (Tree[Tree[RootInstance,Left],"A" SUBSEP Class]); # else return (Tree[Tree[RootInstance,Left],"A" SUBSEP Class]+Tree[RootInstance, "h" SUBSEP Class]); else return (Tree[RootInstance, "A" SUBSEP Class]); } # else return Tree[RootInstance, "h" SUBSEP Class]; else return (Tree[RootInstance, "A" SUBSEP Class]); } if (Value > Tree[RootInstance,Val]) { if (Tree[RootInstance,Right] != "?") { if(Tree[Tree[RootInstance,Right],All] >= minSize) { return getcount(minSize, Value, Tree, Tree[RootInstance,Right], Class) } # else return (Tree[RootInstance, "h" SUBSEP Class]); # else return (Tree[Tree[RootInstance,Left],"A" SUBSEP Class]); # else return (Tree[Tree[RootInstance,Left],"A" SUBSEP Class]+Tree[RootInstance, "h" SUBSEP Class]); else return (Tree[RootInstance, "A" SUBSEP Class]); } # else return Tree[RootInstance, "h" SUBSEP Class]; else return (Tree[RootInstance, "A" SUBSEP Class]); } else { if(int(2*rand()) > 1) { if (Tree[RootInstance,Right] != "?") { if(Tree[Tree[RootInstance,Right],All] >= minSize) { return getcount(minSize, Value, Tree, Tree[RootInstance,Right], Class) } # else return (Tree[RootInstance, "h" SUBSEP Class]); # else return (Tree[Tree[RootInstance,Left],"A" SUBSEP Class]); # else return (Tree[Tree[RootInstance,Left],"A" SUBSEP Class]+Tree[RootInstance, "h" SUBSEP Class]); else return (Tree[RootInstance, "A" SUBSEP Class]); } # else return Tree[RootInstance, "h" SUBSEP Class]; else return (Tree[RootInstance, "A" SUBSEP Class]); } else { if (Tree[RootInstance,Left]!= "?") { if(Tree[Tree[RootInstance,Left],All] >= minSize) { return getcount(minSize,Value, Tree, Tree[RootInstance,Left], Class) } # else return (Tree[RootInstance, "h" SUBSEP Class]); # else return (Tree[Tree[RootInstance,Left],"A" SUBSEP Class]); # else return (Tree[Tree[RootInstance,Left],"A" SUBSEP Class]+Tree[RootInstance, "h" SUBSEP Class]); else return (Tree[RootInstance, "A" SUBSEP Class]); } # else return Tree[RootInstance, "h" SUBSEP Class]; else return (Tree[RootInstance, "A" SUBSEP Class]); } } }