#!/usr/bin/gawk -f #/* vim: set filteype=awk : */ -*- awk -*- #Design for RBST discretization implementation #DJ Boland 10/01/06 #------------------------------------------------------------------------------ BEGIN{ FS=OFS=","; #Fields seperated by a comma and its preceeding # and trailing spaces. SUBSEP="_" IGNORECASE=1; Val="val"; # Left="left"; Right="right"; Here="here"; All="all"; Min="min"; Max="max"; srand(); BuildTree=0; CurrentRecord=0; #Record currently being accessed Pass=0; LastLine=""; } #Skip blank and comment lines {sub(/\%.*/,"")} #substitute comments with blanks {gsub(/[\'\"]/,"",$0)} /^[ \t]*$/ {next} #when blank line observed, go to the next line #PROCESS RELATION LINES /@RELATION/{sub(/@RELATION/, "@relation")} /@relation/{ BuildTree=0; #Set flag for parsing lines to remember (add to tree) to false Attr=0; #Set attribute counter to zero if (Pass==2){ #COMMENTED CODE can be used to verify that the first pass created the BSTs for the numeric #attributes and that it maintained class counts. It prints the tree to the screen for review # for (I in Numeric) # { # allvalues="" # for (J in Classes){allvalues = allvalues " " Classes[J]} # print "Classes are: [" allvalues "]" # printtree(CurrentRoot[I], Rbst,"","", Classes) # print "" # } } } #PROCESS ATTRIBUTE LINES /@ATTRIBUTE/ {sub(/@ATTRIBUTE/, "@attribute")} /@attribute/ { Attr++; #For each new attribute, increase attribute counter LastLine=$0; #Maintain a copy of the last attribute read, as if the @data symbol is seen #then the LastLine attribute is the class attribute } #Check if the attribute is a numeric value; if so, then set a flag/create an array value that says it is Pass==1 && /@attribute/ && (/numeric/ || /real/ || /integer/){Numeric[Attr]=1;} /@attribute/{ #On first pass, obtain the name of each attribute, store to the Names array if(Pass==1){split($1,a," "); Names[Attr]=a[2]; next} #On the second pass, replace the /numeric/, /real/, /integer/ with an accurate list of the discretized classes if((Pass==2) && (Attr in Numeric)) { outclasses = "" outclasses = discClasses(Rbst, CurrentRoot[Attr], int(sqrt(Rbst[CurrentRoot[Attr], All])), outclasses) sub(/integer|numeric|real/,"{"outclasses"}") } } #PREPARE TO PROCESS DATA /@DATA/{sub(/@DATA/, "@data")} /@data/{ BuildTree=1; #Point has been reached where data should be put in tree CurrentRecord=1;#The next nonblank line read will be the first record if (Pass==1) { sub(/\}$/, "", LastLine) #Prune closing bracket from class sub(/@attribute /,"", LastLine) #Prune @attribute sybmol from class sub(/^[[:alnum:][:blank:]]*\{/, "", LastLine) #Prune class name and opening bracket from classes gsub(/[ \t\n]*/, "", LastLine) #Prune any extraneous spaces from classes split(LastLine, myClasses, ",") #Split remaining information into different classes # COMMENTED CODE prints each class and creates a list of all the classes and prints it # for (J in myClasses){print myClasses[J]} # allvals = "" # for(J in myClasses){if(allvals==""){allvals=myClasses[J]}else{allvals = allvals " " myClasses[J]}} # print allvals for(I in Numeric){CurrentRoot[I] = "?"} #Initialize the roots of each tree to sybolize an empty tree } #When outputting pruned dataset, ensure keeping the @data line if (Pass==2) {print $0} #Do not want to proceed to put the @data symbol in the tree, so go to the next line next } #PROCESS DATA / BUILD TREES FROM DATA #PASS ONE creates the binary search tree for each (numeric) attribute Pass==1 && BuildTree && NF >1{ for (I in Numeric) #for each numeric attribute of a dataset { field=$I; #field set to the value of the attribute in the row currently being processed if(field ~ /?/) continue #if missing data (symbolized by ?), go to next numeric field #Otherwise, data is valid, insert into the tree else CurrentRoot[I]=rbstinsert(CurrentRoot[I], Rbst, field, Names[I], CurrentRecord,$NF, myClasses) } #Increase the counter of the number of records that have been processed CurrentRecord++; } #PASS TWO substitutes the discretized value from the search tree for each (numeric) value/attribute Pass==2 && BuildTree && NF >1{ for (I in Numeric) #for each numeric attribute of a dataset { field=$I #field set to the value of the attribute in the row currently being processed RootInstance=CurrentRoot[I] #Set RootInstance to the root instance of the attribute currently being processed if(field ~ /?/) continue #If missing data (symbolized by ?), go to the next numeric field #Otherwise, data is valid, determine what discrete value to substitute with else{$I=subwith(int(sqrt(Rbst[RootInstance,All])),field, Rbst,CurrentRoot[I])} } } #Print the parsed lines Pass==2{print $0} END{ #COMMENTED CODE outputs the tree created for the numeric datasets for (I in Numeric) { print "" allvals = "" for(J in myClasses){if(allvals==""){allvals=myClasses[J]}else{allvals = allvals " " myClasses[J]}} print "Classes are: [" allvals "]" printtree(CurrentRoot[I], Rbst,"","", myClasses) print "" } } #****************************************************************************** #bst.awk #DJ's Binary Search Tree Functions #Insert a new root node function insertR(RootInstance, Tree, Value, Attr, Rec, Class, Classes) { if (RootInstance == "?") { return newtree(Attr SUBSEP Rec, Tree, Value, Class, Classes) } Tree[RootInstance,All]++ if (Value == Tree[RootInstance,Val]) { Tree[RootInstance,Here]++ Tree[RootInstance,"h" SUBSEP Class]++ return RootInstance } if (Value < Tree[RootInstance,Val]) { Tree[RootInstance,Left]=insertR(Tree[RootInstance,Left], Tree, Value, Attr, Rec, Class, Classes) RootInstance = rotateR(RootInstance,Tree) } else { Tree[RootInstance,Right]=insertR(Tree[RootInstance,Right], Tree, Value, Attr, Rec,Class, Classes) RootInstance = rotateL(RootInstance,Tree) } return RootInstance } function rotateR(RootInstance, Tree, temp) { temp = Tree[RootInstance,Left] Tree[RootInstance,Left] = Tree[temp,Right] Tree[temp,Right] = RootInstance Tree[temp,All] = Tree[RootInstance,All] if((Tree[RootInstance,Left] != "?") && (Tree[RootInstance,Right] != "?")) { Tree[RootInstance,All]=Tree[Tree[RootInstance,Left],All]+Tree[Tree[RootInstance,Right],All]+Tree[RootInstance,Here] } else { if((Tree[RootInstance,Left] != "?")) { Tree[RootInstance,All] = Tree[RootInstance,Here]+Tree[Tree[RootInstance,Left],All] } else { if((Tree[RootInstance,Right] != "?")) { Tree[RootInstance,All] = Tree[RootInstance,Here]+Tree[Tree[RootInstance,Right],All] } else { Tree[RootInstance,All] = Tree[RootInstance,Here] } } } return temp } function rotateL(RootInstance, Tree, temp) { temp = Tree[RootInstance,Right] Tree[RootInstance,Right] = Tree[temp,Left] Tree[temp,Left] = RootInstance Tree[temp,All] = Tree[RootInstance,All] if((Tree[RootInstance,Left] != "?") && (Tree[RootInstance,Right] != "?")) { Tree[RootInstance,All]=Tree[Tree[RootInstance,Left],All]+Tree[Tree[RootInstance,Right],All]+Tree[RootInstance,Here] } else { if((Tree[RootInstance,Left] != "?")) { Tree[RootInstance,All] = Tree[RootInstance,Here]+Tree[Tree[RootInstance,Left],All] } else { if((Tree[RootInstance,Right] != "?")) { Tree[RootInstance,All] = Tree[RootInstance,Here]+Tree[Tree[RootInstance,Right],All] } else { Tree[RootInstance,All] = Tree[RootInstance,Here] } } } return temp } function newtree(RootInstance, Tree, Value, Class, Classes) { Tree[RootInstance,Left]="?" Tree[RootInstance,Right]="?" Tree[RootInstance,Val]=Value Tree[RootInstance,All]=1 Tree[RootInstance,Here]=1 for(M in Classes) {Tree[RootInstance,"h" SUBSEP Classes[M]]=0} Tree[RootInstance,"h" SUBSEP Class]=1 return RootInstance } function printtree(RootInstance, Tree, b4, indent, Classes, x, allclasses) { #Current code obtained and slightly modified from Timm's code's src website x--; if (x==0) return; allclasses="" for(M in Classes) { if(allclasses==""){allclasses=Tree[RootInstance,"h" SUBSEP Classes[M]]} else allclasse=allclasses=allclasses " " Tree[RootInstance,"h" SUBSEP Classes[M]] } print indent b4 RootInstance " = " Tree[RootInstance,Val] " [here=" Tree[RootInstance,Here] "] [all =" Tree[RootInstance,All] "]" print indent b4 RootInstance " Classes are: [" allclasses "]" if(Tree[RootInstance,Left] != "?") printtree(Tree[RootInstance,Left], Tree, "< ", indent "| ",Classes,x) if(Tree[RootInstance,Right] != "?") printtree(Tree[RootInstance,Right], Tree, "> ", indent "| ",Classes,x) } function rbstinsert(RootInstance, Tree, Value, Attr, Rec, Class, Classes) { if (RootInstance == "?") { return newtree(Attr SUBSEP Rec, Tree, Value, Class, Classes) } if (Value == Tree[RootInstance,Val]) { Tree[RootInstance,Here]++ Tree[RootInstance,All]++ Tree[RootInstance,"h" SUBSEP Class]++ return RootInstance } else { if ((rand()*Tree[RootInstance,All]) < 1.0) { return insertR(RootInstance,Tree,Value, Attr, Rec, Class, Classes) } if (Value < Tree[RootInstance,Val]) { Tree[RootInstance,Left]=rbstinsert(Tree[RootInstance,Left], Tree, Value, Attr, Rec, Class, Classes) } else { Tree[RootInstance,Right]=rbstinsert(Tree[RootInstance,Right], Tree, Value, Attr, Rec, Class, Classes) } Tree[RootInstance,All]++ return RootInstance } } function subwith(minSize, Value, Tree, RootInstance) { if(Value < Tree[RootInstance,Val]) { if (Tree[RootInstance,Left]!= "?") { if(Tree[Tree[RootInstance,Left],All] >= minSize) { return subwith(minSize,Value, Tree, Tree[RootInstance,Left]) } else return RootInstance } else return RootInstance } if (Value > Tree[RootInstance,Val]) { if (Tree[RootInstance,Right] != "?") { if(Tree[Tree[RootInstance,Right],All] >= minSize) { return subwith(minSize, Value, Tree, Tree[RootInstance,Right]) } else return RootInstance } else return RootInstance } else { if(int(2*rand()) > 1) { if (Tree[RootInstance,Right] != "?") { if(Tree[Tree[RootInstance,Right],All] >= minSize) { return subwith(minSize, Value, Tree, Tree[RootInstance,Right]) } else return RootInstance } else return RootInstance } else { if (Tree[RootInstance,Left]!= "?") { if(Tree[Tree[RootInstance,Left],All] >= minSize) { return subwith(minSize,Value, Tree, Tree[RootInstance,Left]) } else return RootInstance } else return RootInstance } } } #function to obtain the lists of discretized classes function discClasses(Tree, RootInstance, minSize, myClasses) { if(Tree[RootInstance,Left] != "?") { myClasses=discClasses(Tree, Tree[RootInstance,Left], minSize, myClasses) } if(Tree[RootInstance,All] >= minSize) { if(myClasses != "") { myClasses = myClasses " , " RootInstance } else { myClasses = RootInstance } } if(Tree[RootInstance,Right] != "?") { myClasses=discClasses(Tree, Tree[RootInstance,Right], minSize, myClasses) } return myClasses } #function to classify new/testing instances function classify(Tree,RootInstance,minSize,Class) { #For Each record #For Each Attribute in record #Go down tree to node where the item would belong #Ask node what class it would assign the item (using some rule i.e. FI, max class, etc) #Take majority decision of decisions from each attribute } #Function to prune tree to an appropriate size/keep it from exceeding memory availability function garbageCollect(Tree, RootInstance, others) { #If Tree has exceeded maximum size specifications, proceed to prune #{ #If Current Node is high enough to avoid pruning, check if its left child is #{ #If its left child is not high enough, check if left child is interesting #{ #If left child is intersting, keep left child #Else, examine left subtree and ensure collapsing it would not remove an interesting effect #If no interesting effect found, collapse the values into a range #If interesting effect found, perform magic to keep that interesting effect #} #Else, move to left child #Ditto for right child #} #} #Else, do nothing }