#!/usr/bin/gawk -f #/* vim: set filteype=awk : */ -*- awk -*- # A Design for A Randomized Binary # Search Tree Discretization Method # By DJ Boland # Multipurpose implementation. Allows dynamic tree sizing, tree size, and # variable node size requirement for substitution #------------------------------------------------------------------------------ BEGIN{ FS=" *, *"; #Fields seperated by a comma and its preceeding OFS=","; # and trailing spaces. SUBSEP="#" #An pound sign will be used as the default seperator IGNORECASE=1; #Set interpreter to be case-insensitive Val="val"; #Substitute the string "val" where Val used Left="left"; #Substitute the string "left" where Left used Right="right"; #Substitute the string "right" where Right used Here="here"; #Substitute the string "here" where Here used All="all"; #Substitute the string "all" where All used Min="min"; #Substitute the string "min" where Min used Max="max"; #Substitute teh string "max" where Max used srand(); #Randomize the seed used to generate random numbers BuildTree=0; #By default, don't add records to the trees #0 - don't add; 1 - add. CurrentRecord=0; #Record currently being accessed Pass=0; #Contains the current pass being made through data LastLine=""; #Contains last line seen from the file being read MaxSize=7; #The maximum size to allow the tree to grow to ReqRecs=35; #The minimum number of records that must be seen #between garbage collections Csize=0; #Size fo the tree currently being manipulate Mode=0; RecordsIn=0; #Number of records seen thus far IncSize=75; #Number of records required before tree size grows GrowthMult=1; # Special #If special equals 1, print Trees on discretization finish DynTree=0; #DynTree = 1 allows dynamic tree size; else static SizeChoice=0; #SizeChoice = 1 use size req of n/3; else sqrt(n) } #Skip blank and comment lines {sub(/\%.*/,"")} #Substitute comments with blanks {gsub(/[\'\"\`]/,"",$0)} #Remove any single or double quotes from this line {gsub("'", "", $0)} #Remove any single quotes /^[ \t]*$/ {next} #When blank line observed, go to the next line #PROCESS RELATION LINES /@RELATION/{sub(/@RELATION/, "@relation")} /@relation/{ BuildTree=0; #Set flag for parsing lines to remember (add to tree) to false Attr=0; #Set attribute counter to zero LastLine=""; #Set the LastLine seen to nothing RecordsIn=0; if(Pass==2){ for (I in Trees) { if (CurrentSize[I] > MaxSize) { garbageCollect(CurrentRoot[I], Rbst, MaxSize) } } print $0 } #COMMENTED CODE can be used to verify that the first pass created the BSTs for the numeric #attributes and that it maintained class counts. It prints the tree to the screen for review #if (Pass==2){ # for (I in Numeric) # { # allvalues="" # for (J in Classes){allvalues = allvalues " " Classes[J]} # print "Classes are: [" allvalues "]" # printtree(CurrentRoot[I], Rbst,"","", Classes) # print "" # } #} } #PROCESS ATTRIBUTE LINES /@ATTRIBUTE/ {sub(/@ATTRIBUTE/, "@attribute")} /@attribute/ { #On the first pass, determine which attributes should be put into trees #while preventing the class attribute from being put into the tree. if(Pass==1) { if (LastLine != "") { Attr++; #For each new attribute, increase attribute counter # Obtain the name of each attribute and # store to the Names array split(LastLine,a," ") Names[Attr]=a[2] #Create a list of the fields that should be put into trees Attrs[Attr]=1 # if ((match(LastLine, /numeric/) > 0) || (match(LastLine, /real/) > 0) || (match(LastLine,/integer/) > 0)) { Trees[Attr]=1 } } LastLine=$0 #Maintain a copy of the last attribute read, as if the @data #symbol is seen then the LastLine attribute is the class #attribute next } #On the second pass, replace the original attribute values with an #accurate list of the discretized classes if(Pass==2) { if(LastLine != "") { Attr++ if (Attr in Trees) { outclasses = "" if (SizeChoice==1) { outclasses = discClasses(Rbst, CurrentRoot[Attr], int((Rbst[CurrentRoot[Attr], All])/3), outclasses) } else { outclasses = discClasses(Rbst, CurrentRoot[Attr], int(sqrt(Rbst[CurrentRoot[Attr], All])), outclasses) } sub(LastLine, "@attribute " Names[Attr] " {"outclasses"}", LastLine) } print LastLine } LastLine=$0 next } } #PREPARE TO PROCESS DATA /@DATA/{sub(/@DATA/, "@data")} /@data/{ BuildTree=1; #The point has been reached where data should be put in tree CurrentRecord=1; #The next nonblank line read will be the first record #On first pass, determine the the classes and set up the trees if (Pass==1) { #Parse last line to obtain classess sub(/\}$/, "", LastLine) #Prune closing bracket from class sub(/@attribute /,"", LastLine) #Prune @attribute sybmol from class sub(/^[[:alnum:][:blank:]]*\{/, "", LastLine) #Prune class name and opening bracket from classes gsub(/[ \t\n]*/, "", LastLine) #Prune any extraneous spaces from classes split(LastLine, myClasses, ",") #Split remaining information into different classes # COMMENTED CODE prints each class and creates a list of all the classes and prints it # for (J in myClasses){print myClasses[J]} # allvals = "" # for(J in myClasses){if(allvals==""){allvals=myClasses[J]}else{allvals = allvals " " myClasses[J]}} # print allvals #Initialize the roots of each tree to sybolize an empty tree, and new records for each tree to 0 for(I in Trees){CurrentRoot[I] = "?"; CurrentSize[I]=0; NewRecords[I]=0} } #When outputting pruned dataset, ensure keeping the @data line if (Pass==2) {print LastLine; print $0} #Do not want to proceed to put the @data symbol in the tree, so go to the next line next } #PROCESS DATA / BUILD TREES FROM DATA #PASS ONE creates the binary search tree for each (numeric) attribute Pass==1 && BuildTree && NF >1{ for(I in Trees) #For each non-class attribute { NewRecords[I]++ #Add one to the number of records seen since last garbage collect field=$I; #field set to the value of the attribute in the row currently being processed CSize=CurrentSize[I] #set CSize to the size of the current tree #If missing data, decrease new record count and go to next field if(field ~ /?/) {NewRecords[I]--; continue} #Otherwise, data is valid, insert into the tree else returnvalues=rbstinsert(CurrentRoot[I], Rbst, field, Names[I], CurrentRecord,$NF, myClasses, CSize) #Split the returned value into the current root and size of the tree #that was just processed split (returnvalues, vals, SUBSEP) CurrentRoot[I]=vals[1] SUBSEP vals[2] CurrentSize[I]=vals[3] #If the tree exceeds the Maximum size and the minumum numer of records #between garbage collections has been seen, Garbage Collect! if ((CurrentSize[I] > MaxSize) && (NewRecords[I] >=ReqRecs)) { garbageCollect(CurrentRoot[I], Rbst, MaxSize) NewRecords[I]=0 } RecordsIn++; if(DynTree==1) { if(RecordsIn >= (IncSize * GrowthMult)) { GrowthMult++; MaxSize= MaxSize + 2^(2+GrowthMulti) } } } #Increase the counter of the number of records that have been processed CurrentRecord++; } #PASS TWO substitutes the discretized value from the search tree for each value/attribute Pass==2 && BuildTree && NF >1{ for(I in Trees) #for each non-class attribute { field=$I #field set to the value of the attribute in the row currently being processed RootInstance=CurrentRoot[I] #Set RootInstance to the root instance of the attribute currently being processed if(field ~ /?/) continue #If missing data (symbolized by ?), go to the next field #Otherwise, data is valid, determine what discrete value to substitute with else { if(SizeChoice==1) { $I=subwith(int((Rbst[RootInstance,All])/3),field, Rbst,CurrentRoot[I]) } else { $I=subwith(int(sqrt(Rbst[RootInstance,All])),field, Rbst,CurrentRoot[I]) } } } #Print the parsed lines print $0 } END{ if (Special==1) { for (I in Trees) { print "" allvals = "" for(J in myClasses){if(allvals==""){allvals=myClasses[J]}else{allvals = allvals " " myClasses[J]}} print "Classes are: [" allvals "]" printtree(CurrentRoot[I], Rbst,"","", myClasses) print "" } } } #****************************************************************************** #bst.awk #DJ's Binary Search Tree Functions #Insert a a new node at the root of the tree function insertR(RootInstance, Tree, Value, Attr, Rec, Class, Classes, CurrentSize) { if (RootInstance == "?") { return newtree(Attr SUBSEP Rec, Tree, Value, Class, Classes, CurrentSize) } Tree[RootInstance,All]++ Tree[RootInstance,"A" SUBSEP Class]++ if (Value == Tree[RootInstance,Val]) { Tree[RootInstance,Here]++ Tree[RootInstance,"h" SUBSEP Class]++ return RootInstance SUBSEP CurrentSize } if (Value < Tree[RootInstance,Val]) { returnvals=insertR(Tree[RootInstance,Left], Tree, Value, Attr, Rec, Class, Classes, CurrentSize) split(returnvals, vals, SUBSEP) Tree[RootInstance,Left]=vals[1] SUBSEP vals[2] CurrentSize=vals[3] RootInstance = rotateR(RootInstance,Tree,Classes) } else { returnvals=insertR(Tree[RootInstance,Right], Tree, Value, Attr, Rec,Class, Classes, CurrentSize) split(returnvals,vals, SUBSEP) Tree[RootInstance,Right]=vals[1] SUBSEP vals[2] CurrentSize=vals[3] RootInstance = rotateL(RootInstance,Tree,Classes) } return RootInstance SUBSEP CurrentSize } #Rotate the current root's left subchild to right to make it become the current root function rotateR(RootInstance, Tree, Classes, temp) { temp = Tree[RootInstance,Left] Tree[RootInstance,Left] = Tree[temp,Right] Tree[temp,Right] = RootInstance Tree[temp,All] = Tree[RootInstance,All] for(M in Classes) { Tree[temp, "A" SUBSEP Classes[M]] = Tree[RootInstance, "A" SUBSEP Classes[M]] } if((Tree[RootInstance,Left] != "?") && (Tree[RootInstance,Right] != "?")) { Tree[RootInstance,All]=Tree[Tree[RootInstance,Left],All]+Tree[Tree[RootInstance,Right],All]+Tree[RootInstance,Here] for(M in Classes) { Tree[RootInstance,"A" SUBSEP Classes[M]]=Tree[Tree[RootInstance,Left],"A" SUBSEP Classes[M]]+Tree[Tree[RootInstance,Right],"A" SUBSEP Classes[M]]+Tree[RootInstance,"h" SUBSEP Classes[M]] } } else { if((Tree[RootInstance,Left] != "?")) { Tree[RootInstance,All] = Tree[RootInstance,Here]+Tree[Tree[RootInstance,Left],All] for(M in Classes) { Tree[RootInstance, "A" SUBSEP Classes[M]] = Tree[RootInstance,"h" SUBSEP Classes[M]]+Tree[Tree[RootInstance,Left],"A" SUBSEP Classes[M]] } } else { if((Tree[RootInstance,Right] != "?")) { Tree[RootInstance,All] = Tree[RootInstance,Here]+Tree[Tree[RootInstance,Right],All] for(M in Classes) { Tree[RootInstance,"A" SUBSEP Classes[M]]= Tree[RootInstance,"h" SUBSEP Classes[M]]+Tree[Tree[RootInstance,Right],"A" SUBSEP Classes[M]] } } else { Tree[RootInstance,All] = Tree[RootInstance,Here] for(M in Classes) { Tree[RootInstance,"A" SUBSEP Classes[M]]=Tree[RootInstance,"h" SUBSEP Classes[M]] } } } } return temp } #Rotate the current roots right subchild up to the root (by rotating it to the left) function rotateL(RootInstance, Tree, Classes, temp) { temp = Tree[RootInstance,Right] Tree[RootInstance,Right] = Tree[temp,Left] Tree[temp,Left] = RootInstance Tree[temp,All] = Tree[RootInstance,All] for(M in Classes) { Tree[temp,"A" SUBSEP Classes[M]]=Tree[RootInstance,"A" SUBSEP Classes[M]] } if((Tree[RootInstance,Left] != "?") && (Tree[RootInstance,Right] != "?")) { Tree[RootInstance,All]=Tree[Tree[RootInstance,Left],All]+Tree[Tree[RootInstance,Right],All]+Tree[RootInstance,Here] for(M in Classes) { Tree[RootInstance,"A" SUBSEP Classes[M]] = Tree[Tree[RootInstance,Left],"A" SUBSEP Classes[M]]+Tree[Tree[RootInstance,Right],"A" SUBSEP Classes[M]]+Tree[RootInstance,"h" SUBSEP Classes[M]] } } else { if((Tree[RootInstance,Left] != "?")) { Tree[RootInstance,All] = Tree[RootInstance,Here]+Tree[Tree[RootInstance,Left],All] for(M in Classes) { Tree[RootInstance,"A" SUBSEP Classes[M]]=Tree[RootInstance, "h" SUBSEP Classes[M]] + Tree[Tree[RootInstance,Left], "A" SUBSEP Classes[M]] } } else { if((Tree[RootInstance,Right] != "?")) { Tree[RootInstance,All] = Tree[RootInstance,Here]+Tree[Tree[RootInstance,Right],All] for(M in Classes) { Tree[RootInstance,"A" SUBSEP Classes[M]]=Tree[RootInstance, "h" SUBSEP Classes[M]] + Tree[Tree[RootInstance,Right],"A" SUBSEP Classes[M]] } } else { Tree[RootInstance,All] = Tree[RootInstance,Here] for(M in Classes) { Tree[RootInstance, "A" SUBSEP Classes[M]] = Tree[RootInstance, "h" SUBSEP Classes[M]] } } } } return temp } #Function to insert a new node into the tree function newtree(RootInstance, Tree, Value, Class, Classes, CurrentSize) { Tree[RootInstance,Left]="?" Tree[RootInstance,Right]="?" Tree[RootInstance,Val]=Value Tree[RootInstance,All]=1 Tree[RootInstance,Here]=1 #Initialize the here and All class counts for each class for(M in Classes) { Tree[RootInstance,"h" SUBSEP Classes[M]]=0 Tree[RootInstance,"A" SUBSEP Classes[M]]=0 } Tree[RootInstance,"A" SUBSEP Class]=1 Tree[RootInstance,"h" SUBSEP Class]=1 CurrentSize++ return RootInstance SUBSEP CurrentSize } #Function to print the current tree function printtree(RootInstance, Tree, b4, indent, Classes, allclasses, treeclasses) { allclasses="" treeclasses="" for(M in Classes) { if(allclasses==""){allclasses=Tree[RootInstance,"h" SUBSEP Classes[M]]} else allclasses=allclasses " " Tree[RootInstance,"h" SUBSEP Classes[M]] if(treeclasses==""){treeclasses=Tree[RootInstance,"A" SUBSEP Classes[M]]} else treeclasses=treeclasses " " Tree[RootInstance,"A" SUBSEP Classes[M]] } print indent b4 RootInstance " = " Tree[RootInstance,Val] " [here=" Tree[RootInstance,Here] "] [all=" Tree[RootInstance,All] "]" print indent b4 RootInstance " Classes here are: [" allclasses "], all: [" treeclasses "]" if(Tree[RootInstance,Left] != "?") printtree(Tree[RootInstance,Left], Tree, "< ", indent "| ",Classes,x) if(Tree[RootInstance,Right] != "?") printtree(Tree[RootInstance,Right], Tree, "> ", indent "| ",Classes,x) } #Function for inserting a value into the tree. If the value is not at the root, #this function will, with a 1/n chance, insert the node at the current root. function rbstinsert(RootInstance, Tree, Value, Attr, Rec, Class, Classes, CurrentSize) { if (RootInstance == "?") { return newtree(Attr SUBSEP Rec, Tree, Value, Class, Classes, CurrentSize) } if (Value == Tree[RootInstance,Val]) { Tree[RootInstance,Here]++ Tree[RootInstance,All]++ Tree[RootInstance, "A" SUBSEP Class]++ Tree[RootInstance,"h" SUBSEP Class]++ return RootInstance SUBSEP CurrentSize } else { if ((rand()*Tree[RootInstance,All]) < 1.0) { return insertR(RootInstance,Tree,Value, Attr, Rec, Class, Classes, CurrentSize) } if (Value < Tree[RootInstance,Val]) { returnvals=rbstinsert(Tree[RootInstance,Left], Tree, Value, Attr, Rec, Class, Classes, CurrentSize) split(returnvals, vals, SUBSEP) Tree[RootInstance,Left]=vals[1] SUBSEP vals[2] CurrentSize=vals[3] } else { returnvals=rbstinsert(Tree[RootInstance,Right], Tree, Value, Attr, Rec, Class, Classes, CurrentSize) split(returnvals, vals, SUBSEP) Tree[RootInstance,Right]=vals[1] SUBSEP vals[2] CurrentSize=vals[3] } Tree[RootInstance,All]++ Tree[RootInstance,"A" SUBSEP Class]++ return RootInstance SUBSEP CurrentSize } } #Function that determines which tree node to substitute for a given value #For a tree node to be chosen as a viable substitute, it must have at least #minSize nodes at in itself and/or in its children. function subwith(minSize, Value, Tree, RootInstance) { # print "This instance is: " Value " and it is being compared to: " Tree[RootInstance, Val] if(((Value == Tree[RootInstance,Val]) || (Value ~ Tree[RootInstance,Val])) && (Tree[RootInstance,All] >= minSize)) { # print "They are equal" return RootInstance } # print "not equal" if(Value < Tree[RootInstance,Val]) { if (Tree[RootInstance,Left]!= "?") { if(Tree[Tree[RootInstance,Left],All] >= minSize) { return subwith(minSize,Value, Tree, Tree[RootInstance,Left]) } else return RootInstance } else return RootInstance } if (Value > Tree[RootInstance,Val]) { if (Tree[RootInstance,Right] != "?") { if(Tree[Tree[RootInstance,Right],All] >= minSize) { return subwith(minSize, Value, Tree, Tree[RootInstance,Right]) } else return RootInstance } else return RootInstance } else { if(int(2*rand()) > 1) { if (Tree[RootInstance,Right] != "?") { if(Tree[Tree[RootInstance,Right],All] >= minSize) { return subwith(minSize, Value, Tree, Tree[RootInstance,Right]) } else return RootInstance } else return RootInstance } else { if (Tree[RootInstance,Left]!= "?") { if(Tree[Tree[RootInstance,Left],All] >= minSize) { return subwith(minSize,Value, Tree, Tree[RootInstance,Left]) } else return RootInstance } else return RootInstance } } } #Function to obtain the lists of discretized classes #To be a part of this list, the given node must #be of at least minSize function discClasses(Tree, RootInstance, minSize, myClasses) { if(Tree[RootInstance,Left] != "?") { myClasses=discClasses(Tree, Tree[RootInstance,Left], minSize, myClasses) } if(Tree[RootInstance,All] >= minSize) { if(myClasses != "") { myClasses = myClasses " , " RootInstance } else { myClasses = RootInstance } } if(Tree[RootInstance,Right] != "?") { myClasses=discClasses(Tree, Tree[RootInstance,Right], minSize, myClasses) } return myClasses } #Function to prune tree to an appropriate size/keep it from exceeding memory availability function garbageCollect(RootInstance, Tree, MaxSize, low, high, size) { #Set up a set to track the tree nodes that have been "touched by the breadth first search size=0; high=1; low=0; NodeSet[0]=RootInstance #Function uses a breadth-first approach to span the tree, adding nodes to the set left to right #across each level and then moving down to the following one. When the max size of the tree is reached, #the nodes remaining in the set are removed, along with any of their children that were not discovered. while (high != low) { if (Tree[NodeSet[low],Left] != "?") { NodeSet[high]= Tree[NodeSet[low],Left]; high++; } if (Tree[NodeSet[low],Right] != "?") { NodeSet[high]= Tree[NodeSet[low],Right]; high++; } size++; low++; if (size == MaxSize) { high = high-1; while(low <= high) { removeVal(RootInstance, Tree, pruneNode(NodeSet[high], Tree)) high = high - 1; } } } } #Function to remove a given node and any of its children from the tree function pruneNode(RootInstance, Tree) { #If this node has a left child, remove that left child before this node if(Tree[RootInstance,Left] != "?") { pruneNode(Tree[RootInstance,Left], Tree) } #If this node has a right child, remove that right child before this node if(Tree[RootInstance,Right] != "?") { pruneNode(Tree[RootInstance,Right],Tree) } #Set all relevant values of this tree to zero and disconnect the childrens nodes Tree[RootInstance,Left]="?" Tree[RootInstance,Right]="?" Tree[RootInstance,All]=0 Tree[RootInstance,Here]=0 return Tree[RootInstance,Val] } #Function to remove a node from the tree based on its value #Starting root node will not be removed using this method function removeVal(RootInstance, Tree, myVal) { if(Tree[RootInstance,Val] > myVal) { if(Tree[Tree[RootInstance, Left], Val] == myVal) { Tree[RootInstance, Left]="?" } else { removeVal(Tree[RootInstance,Left], Tree, myVal) } } else { if(Tree[Tree[RootInstance, Right], Val] == myVal) { Tree[RootInstance, Right]="?" } else { removeVal(Tree[RootInstance, Right], Tree, myVal) } } }