#!/usr/bin/gawk -f #/* vim: set filteype=awk : */ -*- awk -*- #Design for RBST discretization implementation #DJ Boland 10/01/06 #------------------------------------------------------------------------------ BEGIN{ FS=OFS=","; #Fields seperated by a comma and its preceeding # and trailing spaces. SUBSEP="_" IGNORECASE=1; Val="val"; # Left="left"; Right="right"; Here="here"; All="all"; Min="min"; Max="max"; srand(); BuildTree=0; CurrentRecord=0; #Record currently being accessed Pass=0; LastLine=""; MaxSize=15; ReqRecs=10; } #Skip blank and comment lines {sub(/\%.*/,"")} #substitute comments with blanks {gsub(/[\'\"]/,"",$0)} /^[ \t]*$/ {next} #when blank line observed, go to the next line #PROCESS RELATION LINES /@RELATION/{sub(/@RELATION/, "@relation")} /@relation/{ BuildTree=0; #Set flag for parsing lines to remember (add to tree) to false Attr=0; #Set attribute counter to zero if (Pass==2){ #COMMENTED CODE can be used to verify that the first pass created the BSTs for the numeric #attributes and that it maintained class counts. It prints the tree to the screen for review # for (I in Numeric) # { # allvalues="" # for (J in Classes){allvalues = allvalues " " Classes[J]} # print "Classes are: [" allvalues "]" # printtree(CurrentRoot[I], Rbst,"","", Classes) # print "" # } } } #PROCESS ATTRIBUTE LINES /@ATTRIBUTE/ {sub(/@ATTRIBUTE/, "@attribute")} /@attribute/ { Attr++; #For each new attribute, increase attribute counter LastLine=$0; #Maintain a copy of the last attribute read, as if the @data symbol is seen #then the LastLine attribute is the class attribute } #Check if the attribute is a numeric value; if so, then set a flag/create an array value that says it is Pass==1 && /@attribute/ && (/numeric/ || /real/ || /integer/){Numeric[Attr]=1;} /@attribute/{ #On first pass, obtain the name of each attribute, store to the Names array if(Pass==1){split($1,a," "); Names[Attr]=a[2]; next} #On the second pass, replace the /numeric/, /real/, /integer/ with an accurate list of the discretized classes if((Pass==2) && (Attr in Numeric)) { outclasses = "" outclasses = discClasses(Rbst, CurrentRoot[Attr], int(sqrt(Rbst[CurrentRoot[Attr], All])), outclasses) sub(/integer|numeric|real/,"{"outclasses"}") } } #PREPARE TO PROCESS DATA /@DATA/{sub(/@DATA/, "@data")} /@data/{ BuildTree=1; #Point has been reached where data should be put in tree CurrentRecord=1;#The next nonblank line read will be the first record if (Pass==1) { sub(/\}$/, "", LastLine) #Prune closing bracket from class sub(/@attribute /,"", LastLine) #Prune @attribute sybmol from class sub(/^[[:alnum:][:blank:]]*\{/, "", LastLine) #Prune class name and opening bracket from classes gsub(/[ \t\n]*/, "", LastLine) #Prune any extraneous spaces from classes split(LastLine, myClasses, ",") #Split remaining information into different classes # COMMENTED CODE prints each class and creates a list of all the classes and prints it # for (J in myClasses){print myClasses[J]} # allvals = "" # for(J in myClasses){if(allvals==""){allvals=myClasses[J]}else{allvals = allvals " " myClasses[J]}} # print allvals for(I in Numeric){CurrentRoot[I] = "?"; CurrentSize[I]=0} #Initialize the roots of each tree to sybolize an empty tree } #When outputting pruned dataset, ensure keeping the @data line if (Pass==2) {print $0} NewRecords=0; #Do not want to proceed to put the @data symbol in the tree, so go to the next line next } #PROCESS DATA / BUILD TREES FROM DATA #PASS ONE creates the binary search tree for each (numeric) attribute Pass==1 && BuildTree && NF >1{ NewRecords++ for (I in Numeric) #for each numeric attribute of a dataset { field=$I; #field set to the value of the attribute in the row currently being processed CSize=CurrentSize[I] if(field ~ /?/) continue #if missing data (symbolized by ?), go to next numeric field #Otherwise, data is valid, insert into the tree else returnvalues=rbstinsert(CurrentRoot[I], Rbst, field, Names[I], CurrentRecord,$NF, myClasses, CSize) split (returnvalues, vals, SUBSEP) CurrentRoot[I]=vals[1] SUBSEP vals[2] CurrentSize[I]=vals[3] if ((CurrentSize[I] > MaxSize) && (NewRecords>=ReqRecs)) { # print "Garbage Collect!" garbageCollect(CurrentRoot[I], Rbst, MaxSize) } # print "tree " Names[I] " has size: " CurrentSize[I]; } if (NewRecords >= MaxSize) { NewRecords=0; } #Increase the counter of the number of records that have been processed CurrentRecord++; } #PASS TWO substitutes the discretized value from the search tree for each (numeric) value/attribute Pass==2 && BuildTree && NF >1{ for (I in Numeric) #for each numeric attribute of a dataset { field=$I #field set to the value of the attribute in the row currently being processed RootInstance=CurrentRoot[I] #Set RootInstance to the root instance of the attribute currently being processed if(field ~ /?/) continue #If missing data (symbolized by ?), go to the next numeric field #Otherwise, data is valid, determine what discrete value to substitute with else{$I=subwith(int(sqrt(Rbst[RootInstance,All])),field, Rbst,CurrentRoot[I])} } } #Print the parsed lines Pass==2{print $0} END{ #COMMENTED CODE outputs the tree created for the numeric datasets # for (I in Numeric) # { # print "" # allvals = "" # for(J in myClasses){if(allvals==""){allvals=myClasses[J]}else{allvals = allvals " " myClasses[J]}} # print "Classes are: [" allvals "]" # printtree(CurrentRoot[I], Rbst,"","", myClasses) # print "" # } } #****************************************************************************** #bst.awk #DJ's Binary Search Tree Functions #Insert a new root node function insertR(RootInstance, Tree, Value, Attr, Rec, Class, Classes, CurrentSize) { if (RootInstance == "?") { return newtree(Attr SUBSEP Rec, Tree, Value, Class, Classes, CurrentSize) } Tree[RootInstance,All]++ if (Value == Tree[RootInstance,Val]) { Tree[RootInstance,Here]++ Tree[RootInstance,"h" SUBSEP Class]++ return RootInstance SUBSEP CurrentSize } if (Value < Tree[RootInstance,Val]) { returnvals=insertR(Tree[RootInstance,Left], Tree, Value, Attr, Rec, Class, Classes, CurrentSize) # print returnvals split(returnvals, vals, SUBSEP) Tree[RootInstance,Left]=vals[1] SUBSEP vals[2] CurrentSize=vals[3] RootInstance = rotateR(RootInstance,Tree) } else { returnvals=insertR(Tree[RootInstance,Right], Tree, Value, Attr, Rec,Class, Classes, CurrentSize) # print returnvals split(returnvals,vals, SUBSEP) Tree[RootInstance,Right]=vals[1] SUBSEP vals[2] CurrentSize=vals[3] RootInstance = rotateL(RootInstance,Tree) } return RootInstance SUBSEP CurrentSize } function rotateR(RootInstance, Tree, temp) { temp = Tree[RootInstance,Left] Tree[RootInstance,Left] = Tree[temp,Right] Tree[temp,Right] = RootInstance Tree[temp,All] = Tree[RootInstance,All] if((Tree[RootInstance,Left] != "?") && (Tree[RootInstance,Right] != "?")) { Tree[RootInstance,All]=Tree[Tree[RootInstance,Left],All]+Tree[Tree[RootInstance,Right],All]+Tree[RootInstance,Here] } else { if((Tree[RootInstance,Left] != "?")) { Tree[RootInstance,All] = Tree[RootInstance,Here]+Tree[Tree[RootInstance,Left],All] } else { if((Tree[RootInstance,Right] != "?")) { Tree[RootInstance,All] = Tree[RootInstance,Here]+Tree[Tree[RootInstance,Right],All] } else { Tree[RootInstance,All] = Tree[RootInstance,Here] } } } return temp } function rotateL(RootInstance, Tree, temp) { temp = Tree[RootInstance,Right] Tree[RootInstance,Right] = Tree[temp,Left] Tree[temp,Left] = RootInstance Tree[temp,All] = Tree[RootInstance,All] if((Tree[RootInstance,Left] != "?") && (Tree[RootInstance,Right] != "?")) { Tree[RootInstance,All]=Tree[Tree[RootInstance,Left],All]+Tree[Tree[RootInstance,Right],All]+Tree[RootInstance,Here] } else { if((Tree[RootInstance,Left] != "?")) { Tree[RootInstance,All] = Tree[RootInstance,Here]+Tree[Tree[RootInstance,Left],All] } else { if((Tree[RootInstance,Right] != "?")) { Tree[RootInstance,All] = Tree[RootInstance,Here]+Tree[Tree[RootInstance,Right],All] } else { Tree[RootInstance,All] = Tree[RootInstance,Here] } } } return temp } function newtree(RootInstance, Tree, Value, Class, Classes, CurrentSize) { Tree[RootInstance,Left]="?" Tree[RootInstance,Right]="?" Tree[RootInstance,Val]=Value Tree[RootInstance,All]=1 Tree[RootInstance,Here]=1 for(M in Classes) {Tree[RootInstance,"h" SUBSEP Classes[M]]=0} Tree[RootInstance,"h" SUBSEP Class]=1 CurrentSize++ # print "Added a new item, current size is now: " CurrentSize return RootInstance SUBSEP CurrentSize } function printtree(RootInstance, Tree, b4, indent, Classes, x, allclasses) { #Current code obtained and slightly modified from Timm's code's src website x--; if (x==0) return; allclasses="" for(M in Classes) { if(allclasses==""){allclasses=Tree[RootInstance,"h" SUBSEP Classes[M]]} else allclasse=allclasses=allclasses " " Tree[RootInstance,"h" SUBSEP Classes[M]] } print indent b4 RootInstance " = " Tree[RootInstance,Val] " [here=" Tree[RootInstance,Here] "] [all =" Tree[RootInstance,All] "]" print indent b4 RootInstance " Classes are: [" allclasses "]" if(Tree[RootInstance,Left] != "?") printtree(Tree[RootInstance,Left], Tree, "< ", indent "| ",Classes,x) if(Tree[RootInstance,Right] != "?") printtree(Tree[RootInstance,Right], Tree, "> ", indent "| ",Classes,x) } function rbstinsert(RootInstance, Tree, Value, Attr, Rec, Class, Classes, CurrentSize) { if (RootInstance == "?") { return newtree(Attr SUBSEP Rec, Tree, Value, Class, Classes, CurrentSize) } if (Value == Tree[RootInstance,Val]) { Tree[RootInstance,Here]++ Tree[RootInstance,All]++ Tree[RootInstance,"h" SUBSEP Class]++ return RootInstance SUBSEP CurrentSize } else { if ((rand()*Tree[RootInstance,All]) < 1.0) { return insertR(RootInstance,Tree,Value, Attr, Rec, Class, Classes, CurrentSize) } if (Value < Tree[RootInstance,Val]) { returnvals=rbstinsert(Tree[RootInstance,Left], Tree, Value, Attr, Rec, Class, Classes, CurrentSize) # print returnvals split(returnvals, vals, SUBSEP) Tree[RootInstance,Left]=vals[1] SUBSEP vals[2] CurrentSize=vals[3] } else { returnvals=rbstinsert(Tree[RootInstance,Right], Tree, Value, Attr, Rec, Class, Classes, CurrentSize) # print returnvals split(returnvals, vals, SUBSEP) Tree[RootInstance,Right]=vals[1] SUBSEP vals[2] CurrentSize=vals[3] } Tree[RootInstance,All]++ return RootInstance SUBSEP CurrentSize } } function subwith(minSize, Value, Tree, RootInstance) { if(Value < Tree[RootInstance,Val]) { if (Tree[RootInstance,Left]!= "?") { if(Tree[Tree[RootInstance,Left],All] >= minSize) { return subwith(minSize,Value, Tree, Tree[RootInstance,Left]) } else return RootInstance } else return RootInstance } if (Value > Tree[RootInstance,Val]) { if (Tree[RootInstance,Right] != "?") { if(Tree[Tree[RootInstance,Right],All] >= minSize) { return subwith(minSize, Value, Tree, Tree[RootInstance,Right]) } else return RootInstance } else return RootInstance } else { if(int(2*rand()) > 1) { if (Tree[RootInstance,Right] != "?") { if(Tree[Tree[RootInstance,Right],All] >= minSize) { return subwith(minSize, Value, Tree, Tree[RootInstance,Right]) } else return RootInstance } else return RootInstance } else { if (Tree[RootInstance,Left]!= "?") { if(Tree[Tree[RootInstance,Left],All] >= minSize) { return subwith(minSize,Value, Tree, Tree[RootInstance,Left]) } else return RootInstance } else return RootInstance } } } #function to obtain the lists of discretized classes function discClasses(Tree, RootInstance, minSize, myClasses) { if(Tree[RootInstance,Left] != "?") { myClasses=discClasses(Tree, Tree[RootInstance,Left], minSize, myClasses) } if(Tree[RootInstance,All] >= minSize) { if(myClasses != "") { myClasses = myClasses " , " RootInstance } else { myClasses = RootInstance } } if(Tree[RootInstance,Right] != "?") { myClasses=discClasses(Tree, Tree[RootInstance,Right], minSize, myClasses) } return myClasses } #function to classify new/testing instances function classify(Tree,RootInstance,minSize,Class) { #For Each record #For Each Attribute in record #Go down tree to node where the item would belong #Ask node what class it would assign the item (using some rule i.e. FI, max class, etc) #Take majority decision of decisions from each attribute } #Function to prune tree to an appropriate size/keep it from exceeding memory availability function garbageCollect(RootInstance, Tree, MaxSize, low, high, size) { size=0; high=1; low=0; NodeSet[0]=RootInstance while (high != low) { if (Tree[NodeSet[low],Left] != "?") { NodeSet[high]= Tree[NodeSet[low],Left]; high++; } if (Tree[NodeSet[low],Right] != "?") { NodeSet[high]= Tree[NodeSet[low],Right]; high++; } size++; low++; if (size == MaxSize) { # nodeSetString="" # for(i=low; i < high; i++) # { # nodeSetString= nodeSetString NodeSet[i] " " # } # print nodeSetString high = high-1; # print "low: " low " high: " high while(low <= high) { # print "Trying to prune node: " NodeSet[high] removeVal(RootInstance, Tree, pruneNode(NodeSet[high], Tree)) high = high - 1; } } } } #If Tree has exceeded maximum size specifications, proceed to prune #{ #If Current Node is high enough to avoid pruning, check if its left child is #{ #If its left child is not high enough, check if left child is interesting #{ #If left child is intersting, keep left child #Else, examine left subtree and ensure collapsing it would not remove an interesting effect #If no interesting effect found, collapse the values into a range #If interesting effect found, perform magic to keep that interesting effect #} #Else, move to left child #Ditto for right child #} #} #Else, do nothing function pruneNode(RootInstance, Tree) { if(Tree[RootInstance,Left] != "?") { pruneNode(Tree[RootInstance,Left], Tree) } if(Tree[RootInstance,Right] != "?") { pruneNode(Tree[RootInstance,Right],Tree) } Tree[RootInstance,Left]="?" Tree[RootInstance,Right]="?" Tree[RootInstance,All]=0 Tree[RootInstance,Here]=0 return Tree[RootInstance,Val] } function removeVal(RootInstance, Tree, myVal) { # print "Trying to remove value: " myVal if(Tree[RootInstance,Val] > myVal) { if(Tree[Tree[RootInstance, Left], Val] == myVal) { Tree[RootInstance, Left]="?" } else { removeVal(Tree[RootInstance,Left], Tree, myVal) } } else { if(Tree[Tree[RootInstance, Right], Val] == myVal) { Tree[RootInstance, Right]="?" } else { removeVal(Tree[RootInstance, Right], Tree, myVal) } } }