#
#Sawtooth.awk
#©copyleft 2004
#by Andres Orrego and Tim Menzies.
#See also bin/sawtooth.
#
#Initialization
#{
BEGIN {
#Command line arguments:
# Verbose # for verbose mode
# Brief # for "brief mode"
# SUBBINS # Number of subdivisions per new value seen (discretization)
# ERA # Cache size (expressed in number of instances)
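# zThresh # z-test threshold used in Pass 2 (never assigned in this script, so it is expected from the command line; e.g. -2.326 for Z(0.01))
#Example invocation (an illustrative sketch only; the real driver is bin/sawtooth
#and the variable values shown here are arbitrary):
# gawk -f sawtooth.awk SUBBINS=5 ERA=100 Pass=1 train.arff Pass=2 test.arff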
FS=" *, *"; # Field separator is a comma and surrounding blank-spaces
#Internal globals:
Total=0; # count of all instances
Classified=0;# count of currently classified instances
Correct=0; # count of currently correctly classified instances
totClsfd=0; # count of all classified instances
totCorrect=0;# count of all correctly classified instances
stblClsfd=0; # count of classified insts. since stable
stblCrrct=0; # count of correctly classified insts. since stable
guess=""; # Stores the current class prediction
MaxAtt=0; # count of number of attributes
StTime = 0; # Start time
TrTime = 0; # Training Time
TsTime = 0; # Test Time
TtTime = 0; # Total Time
MININST=0; # minimum number of instances per band split
MAXINST=0; # max number of instances
MINBANDS=0; # minimum number of bands per attribute
MAXBANDS=0; # max number of bands
trained=0; # Flag set once the first ERA has been learned.
stDevAcc=0.0;# Standard deviation of the accuracies
meanAcc=0.0; # Mean of the accuracies
zTest=0.0; # ZTest results
# MaxBands # table of counters for attribute ranges
# NumAtt # table of index for numeric attributes
# Classes # table of class names/frequencies
# Freq # table of counters for values in attributes in classes
# Cache # temporary store of the instances in the current era
# Seen # table of counters for values in attributes
# Attributes # table of number of values per attribute
StTime = systime(); # Start timer.
Timer=0;
} # END Initialization
#}
#============================================================================
#Training
#{
Pass==1 {
# Processing header of train file.
if(FNR==1)
{
init(); # Checks for file type and attribute characteristics.
next;
}
do
{
if ($0~/^%/ || NF != MaxAtt) { if (getline <= 0) break; continue; } # skip comments and malformed rows
# Classification and testing.
guess = classify();
if ( $NF == guess ) { Correct++; }
++Classified;
# Caching instances
for ( a=1; a<=NF; a++ )
{
Cache[Classified,a] = $a;
}
if ( Classified >= ERA ) { break; }
if ( getline <= 0 ) { break; }
} while(Pass==1)
if(!trained){
trained = 1;
}
else{
stblClsfd += Classified; # Update counters
stblCrrct += Correct;
totClsfd += Classified;
totCorrect += Correct;
stDevAcc = StDev(Classified,Correct); # calculate test parameters.
newMeanAcc = Correct; #n*p=classified*(correct/classified)=correct
oldMeanAcc = stblCrrct*Classified/stblClsfd; #mu_0=n*p_0
# Standardized test statistics Z(0.01) = 2.326
zTest=ZTest(newMeanAcc,oldMeanAcc,stDevAcc,Classified);
if (zTest < -2.326){ # Not stable
Stable = -1;
stblCrrct=Correct; # Reset counters of Stable
stblClsfd=Classified;
}
else{ # Stable
Stable++;
}
}
if(Stable < 3){ # IF not yet stable for 3 consecutive eras
train(Cache,Classified); # THEN train on the cache.
}
delete Cache; # Reset Cache
Correct = Classified = 0; # Reset counters of ERA
if(NumDS){ # IF dataset has numeric attributes
updateBands(); # THEN Update discretization table after each ERA.
}
} # END Training
#}
#
#============================================================================
#Testing
#{
Pass==2 { # Classification time!
# Processing header of test file.
if ( FNR == 1 )
{
init(); # Checks for file type and attribute characteristics.
test(); # Checks for matching file type and attributes in init().
# update bands for numeric attributes
if ( NumDS ) { updateBands() }
TrTime = systime(); # training is over once the test file is reached
# initialize counters
instanceIndex = 0; # current instance number
sampleCount = 0; # number of instances in era
totalCount = 0; # total number of instances since stable
sampleSumMaxLikelihood = 0; # max likelihood sum for era
totalSumMaxLikelihood = 0; # max likelihood sum since stable
eraNum = 1; # current era number
Stable = 3; # number of stable eras
detectedClass = 0; # number of the detected class
# print header for the data
print "instance,era,index,max Likelihood,class,guess";
next;
}
do
{
if ($0~/^%/ || NF != MaxAtt) { if (getline <= 0) break; continue; } # skip comments and malformed rows
# classify instance; in addition to returning the guess, this sets the
# global maxLikelihood variable
guess = classify();
# adjust the max likelihood
maxLikelihood = log( maxLikelihood )
# keep track of the instance number
++instanceIndex;
# sum and count the max likelihoods
++sampleCount;
sampleSumMaxLikelihood += maxLikelihood;
# store the max likelihoods
sampleMaxLikelihoods[ sampleCount ] = maxLikelihood;
# set class of instance to be detected class if found
if ( detectedClass > 0 )
{
$NF = "'__" detectedClass "__'";
}
# output a record of information
print instanceIndex "," eraNum "," sampleCount "," \
exp( maxLikelihood ) "," $NF "," guess
# cache instances
for( a = 1; a <= NF; ++a )
{
Cache[sampleCount,a] = $a;
}
# stopping points
if ( sampleCount >= ERA ) { break; }
if ( getline <= 0 ) { break; }
} while ( Pass==2 )
# calculate average max likelihood for the sample
sampleAvgMaxLikelihood = sampleSumMaxLikelihood / sampleCount;
# set max likelihood array
meanMaxLikelihood[ eraNum ] = exp( sampleAvgMaxLikelihood );
# calculate the standard deviation for the sample
diffSqrSum = 0;
for ( arrayIndex in sampleMaxLikelihoods )
{
# sum the square of the difference between each instance and the mean
diff = sampleMaxLikelihoods[arrayIndex] - sampleAvgMaxLikelihood;
diffSqrSum += diff * diff;
}
# calculate the final standard deviation
if ( sampleCount > 1 )
{
sampleStdDev = sqrt( diffSqrSum / ( sampleCount - 1 ));
}
# if there is only one sample, then there is no deviation
else
{
sampleStdDev = 0;
}
# handle the first era
if ( eraNum == 1 )
{
totalAvgMaxLikelihood = sampleAvgMaxLikelihood;
}
# standardized z-test
zTest = ZTest( sampleAvgMaxLikelihood, totalAvgMaxLikelihood, sampleStdDev,
sampleCount );
# store the zTest value for the era
eraZtest[ eraNum ] = zTest;
# if change has occurred then mark as unstable
status[eraNum] = zTest < zThresh;
if ( status[eraNum] )
{
# only count as a new class if we are currently stable; otherwise a double
# count could result
changed[eraNum] = Stable >= 3;
if ( Stable >= 3 )
{
# store an entry for the era marking the unstable point
newClasEras[ eraNum ] = zTest;
# create a new class number
++detectedClass;
# set the class of all instances of current era to new class
for ( arrayIndex = 1; arrayIndex <= sampleCount; ++arrayIndex )
{
Cache[arrayIndex,NF] = "'__" detectedClass "__'";
}
}
# mark unstable
Stable = 0;
}
# otherwise, keep tracking the mean
else
{
# keep track of the sums
totalCount += sampleCount;
totalSumMaxLikelihood += sampleSumMaxLikelihood;
# calculate the total average
totalAvgMaxLikelihood = totalSumMaxLikelihood / totalCount;
}
# train if recently unstable
if ( Stable < 3 )
{
# train from the data
train( Cache, sampleCount );
# reset counters for the mean
totalCount = sampleCount;
totalSumMaxLikelihood = sampleSumMaxLikelihood;
totalAvgMaxLikelihood = sampleAvgMaxLikelihood;
}
# count the eras
++eraNum;
# count stable eras
++Stable;
# delete the sample arrays
delete Cache; # Reset Cache
delete sampleMaxLikelihoods;
# reset sample vars to 0
sampleCount = 0;
sampleSumMaxLikelihood = 0;
# update bands
if(NumDS){ # IF dataset has numeric attributes
updateBands(); # THEN Update discretization table after each ERA.
}
} # END Testing
#}
#============================================================================
#Post-Processing
#{
END {
#REDTAG: output any final stats if needed
# display max likelihood info
print "\nindex,mean max likelihood";
for ( arrayIndex in meanMaxLikelihood )
{
print arrayIndex "," meanMaxLikelihood[arrayIndex];
} # display max likelihood info
# display zTest value
print "\nera,Z-Test";
for ( arrayIndex in eraZtest )
{
print arrayIndex "," eraZtest[arrayIndex] "," status[arrayIndex] "," changed[arrayIndex];
} # display zTest value
# store the zTest value for the era
eraZtest[ eraNum ] = zTest;
# display eras where new classes were detected
print "\nera,ztest";
for ( arrayIndex in newClasEras )
{
print arrayIndex "," newClasEras[arrayIndex];
} # display eras where new classes were detected
TsTime = systime();
if (Verbose) { # When in verbose mode.
print "\n\nDISCRETE VALUES AFTER TESTING";
printDVals(); # Prints the discretized attributes.
printTable(); # Prints the bayesian table.
print "Number of Classes:",Attributes[MaxAtt];
print "Correctly Classified Instances : " totCorrect;
print "Total Number of Instances Classified : " totClsfd;
printf "accuracy: "
}
#if(!Brief) print totCorrect*100/totClsfd;
TtTime = systime();
if (Timer){ # When in timing mode, times are displayed.
printf "\nTraining Time: %-2.2dm %-2.2ds\n",
(TrTime-StTime)/60,(TrTime-StTime)%60;
printf "Testing Time : %-2.2dm %-2.2ds\n",
(TsTime-TrTime)/60,(TsTime-TrTime)%60;
printf "Total Time : %-2.2dm %-2.2ds\n",
(TtTime-StTime)/60,(TtTime-StTime)%60;
}
}
#}
#User Defined functions
#Standardized Test Statistic
#{
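#z = (newMean - oldMean) / (stDev / sqrt(totInst)); if stDev is 0 the raw
#difference is returned. For example, ZTest(80, 90, 5, 25) =
#(80 - 90)/(5/sqrt(25)) = -10, well below the -2.326 cutoff used in training,
#so that era would be flagged as unstable.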
function ZTest(newMean,oldMean,stDev,totInst){
if(stDev == 0) return newMean - oldMean;
return (newMean-oldMean)/(stDev/sqrt(totInst));
}
#}
#Standard Deviation
#{
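#Each classification is treated as a Bernoulli trial: with n instances and
#p = corr/n, the standard deviation of the number correct is sqrt(n*p*(1-p)).
#For example, StDev(100, 80) = sqrt(100*0.8*0.2) = 4.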
function StDev(n,corr, p){
if(n > 1){
p=corr/n;
return sqrt(n*p*(1-p));
}
return 0;
}
#}
#Data File Initialization
#{
function init( i){
if (FILENAME~/\.mbc$/){ # .MBC files.
for(i=1;i<=NF;i++){
header[i]=$i; # Stores the attribute value name i.
if ($i ~ /^\*/) {
NumAtt[i]=1; # Determines that the field i is numeric.
NumDS=1;
Min[i,1]=1e+32; # Minimum number in the field i.
Max[i,1]=-1e+32; # Maximum number in the field i.
}
else NumAtt[i]=0; # field i is discrete.
}
MaxAtt=NF; # Total number of attributes.
}
if (FILENAME~/\.arff$/){ # .ARFF files (WEKA).
FS = " ";
i=1;
while($0!~/^ *@r/ && $0!~/^ *@R/) getline;
while($0!~/^ *@d/ && $0!~/^ *@D/){
if ($0~/^ *@a/ || $0~/^ *@A/) {
header[i] = $2;
if ($3!~/^\{/){
NumAtt[i]=1;
NumDS=1;
Min[i,1]=1e+32;
Max[i,1]=-1e+32;
}
i++;
}
getline;
}
MaxAtt = i-1;
FS=" *, *";
}
}
#}
#Validating the Test Datafile
#{
function test( i){
if (FILENAME~/\.mbc$/){
for(i=1;i<=NF;i++){
if(header[i]!=$i){ # Matches atts names with train file.
print "Error. Invalid testing file. Exiting...";
exit 1;
}
}
}
if (FILENAME~/\.arff$/){
while($0!~/^ *@d/ && $0!~/^ *@D/){ # Skips file header.
getline;
}
}
}
#}
#Naive Bayes
#Bayesian Training Function
#{
function train(array,size, a,val,i,c) {
for(a=1;a<=size;a++){
Total++;
c=array[a,MaxAtt];
Classes[c]++;
for(i=1;i<=MaxAtt;i++) {
if (array[a,i] ~ /\?/) continue;
if (NumAtt[i]){
val=discretize(i,array[a,i]+0);
}
else val=array[a,i];
Freq[c,i,val]++
if (++Seen[i,val]==1) Attributes[i]++;
}
}
}
#}
#Bayesian Classification Function
#{
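#A sketch of the score computed below (m-estimate Naive Bayes, m=2):
# score(c) = prior(c) * PRODUCT over attributes i of
# (Freq[c,i,v_i] + m*prior(c)) / (Classes[c] + m)
# where prior(c) = (Classes[c] + 1) / (Total + number of classes).
#The class with the highest score wins and its score is kept in maxLikelihood.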
function classify( i,temp,what,like,c,prior,m,inst) {
what="?";
m=2;
like = 0;
for(c in Classes) { # for each candidate class
prior=(Classes[c]+1)/(Total+Attributes[MaxAtt]); # Laplace-corrected class prior
temp=prior;
for(i=1;i<MaxAtt;i++) { # for each non-class attribute
if ($i ~ /\?/) continue; # skip missing values
# numeric values are mapped to their discretized band
if (NumAtt[i]) inst=discretize(i,$i+0);
else inst=$i;
temp *= (Freq[c,i,inst]+(m*prior))/(Classes[c]+m); # m-estimate likelihood
}
if ( temp >= like ) {like = temp; what=c}
}
maxLikelihood = like;
return what;
}
#}
#SPADE
#SPADE's Bin Creation Function
#{
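#Bands are half-open intervals (lower,upper]. A sketch of how they grow,
#assuming SUBBINS=2: the first value 10 for an attribute creates band 0 =
#(10,10]; a later value 20, above the current max, adds the two new bands
#(10,15] and (15,20]; values already inside [min,max] are simply looked up.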
function discretize(fld,item, i,j,k,subdiv){
# numeric value lies between min max seen so far.
if (item>=Min[fld,1] && item<=Max[fld,1]) {
# search for the band who contains it and return its position.
return find(fld,item);
}
# creating first Band. It is of the form (item,item]
if (item<Min[fld,1] && item>Max[fld,1]){
i=0; # The index for the first band is 0
Band[fld,i,1]=item; # Band is the array where "fld" is the attribute,
Band[fld,i,2]=item; # " i" is the position of the band, "1" is the
# lower limit and "2" is the upper limit.
Min[fld,1]=item; # Min is an array to store the overall Min value
Min[fld,2]=i; # and its position, "1," and "2" respectively.
Max[fld,1]=item; # Max is analogous to Min.
Max[fld,2]=i;
MaxBands[fld]++; # The number of bands is now 1
return i;
}
# If the numeric value is less than the min seen so far.
if (item<Min[fld,1]){
subdiv = ((Band[fld,Min[fld,2],1] - item) / SUBBINS);
i=Min[fld,2]-1; # grow SUBBINS new bands downward from the old minimum
for(j=1; j<=SUBBINS; j++){
Band[fld,i,2]=Band[fld,i+1,1];
Band[fld,i,1]=Band[fld,i+1,1]-subdiv;
MaxBands[fld]++;
i--;
}
Min[fld,1]=item; # the new overall minimum and the index
Min[fld,2]=i+1; # of the band that holds it
return i+1;
}
# If the numeric value is greater than the max seen so far.
if (item>Max[fld,1]){
subdiv = ((item - Band[fld,Max[fld,2],2]) / SUBBINS);
i=Max[fld,2]+1; # grow SUBBINS new bands upward from the old maximum
for(j=1; j<=SUBBINS; j++){
Band[fld,i,1]=Band[fld,i-1,2];
Band[fld,i,2]=Band[fld,i-1,2]+subdiv;
MaxBands[fld]++;
i++;
}
Max[fld,1]=item; # the new overall maximum and the index
Max[fld,2]=i-1; # of the band that holds it
return i-1;
}
}
#}
#Updating Bins of a Single Numeric Attribute
#{
function updateBand(i, j,k,l,m,n,p,old,busy,temp){
n=Max[i,2]; # index of the last band
old=Min[i,2]; # index of the band currently being merged into
for(j=old+1; j<=n; j++){
# Merge band j into band "old" while "old" is still under-populated
# and the pooled count stays below MAXINST.
if(Seen[i,j]<(MAXINST-Seen[i,old]) && Seen[i,old]<MININST){
Band[i,old,2]=Band[i,j,2]; # widen the surviving band
Seen[i,old]+=Seen[i,j]; # pool the value counts
for(k in Classes) Freq[k,i,old]+=Freq[k,i,j];
MaxBands[i]--;
}
else{ # "old" is big enough: band j becomes the next surviving band
old++;
Band[i,old,1]=Band[i,j,1];
Band[i,old,2]=Band[i,j,2];
Seen[i,old]=Seen[i,j];
for(k in Classes) Freq[k,i,old]=Freq[k,i,j];
}
if(old<j){ # clear the slot band j used to occupy
delete Seen[i,j];
for(k in Classes) delete Freq[k,i,j];
}
}
Max[i,2]=old; # index of the last surviving band
}
#}
#Updating the Bins of All Numeric Attributes
#{
function updateBands( j){
MININST = sqrt(Total); # Sets variables needed for updating
MAXINST = MININST * 2; # the values of the current attribute
MAXBANDS = MININST; #* SUBBINS;
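#For example, after Total=400 training instances: MININST=20, MAXINST=40
#and MAXBANDS=20, so any numeric attribute holding more than 20 bands is
#passed to updateBand below.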
for(j=1; j<=MaxAtt; j++){
if(NumAtt[j]){
if (MaxBands[j]>MAXBANDS){
updateBand(j);
}
}
}
}
#}
#Displaying the Predictive Model
#{
function printTable( i,j,k){
printf "\n| %17s B A Y E S I A N T A B L E %17s|", " ", " ";
printf "\n| %-15.15s | %-15.15s | %-9.9s | %-6.6s | %-6.6s |\n",
"ATTRIBUTE","VALUE","CLASS","FREQ","SEEN";
for(i=1;i<=MaxAtt;i++){
print Attributes[i];
for(j in Freq){
split(j,k,SUBSEP);
if (k[2]==i && Seen[k[2],k[3]] > 0){
if (NumAtt[i])
printf "| %-15.15s | (%2.4f,%-2.4f] | %-9.9s | %6d | %6d |\n",
header[i], Band[i,k[3],1],Band[i,k[3],2], k[1],
Freq[k[1],k[2],k[3]], Seen[k[2],k[3]];
else
printf "| %-15.15s | %-15.15s | %-9.9s | %6d | %6d |\n",
header[i], k[3], k[1], Freq[k[1],k[2],k[3]],
Seen[k[2],k[3]];
}
}
}
}
#}
#Displaying SPADE's Bins
#{
function printDVal(j, i,k){
print "Att Name:", header[j] ,
"\tNumber of Bands:",MaxBands[j],
"\tMin Val:",Min[j,1],
"\t\tMax Val:",Max[j,1];
for(i=Min[j,2]; i<=Max[j,2]; i++){
printf "( %s , %s ]=%d ",Band[j,i,1],Band[j,i,2],Seen[j,i];
}
print "\n";
}
#}
#Displaying All Attribute Conversions
#{
function printDVals( i){
for(i=1; i<=MaxAtt; i++)
if (NumAtt[i]) printDVal(i);
}
#}
#Finding a Number's Bin
#{
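#Binary search over the bands of attribute fld. For example, given the
#hypothetical layout bands 0=(10,10], 1=(10,15], 2=(15,20] with Min=10 and
#Max=20, find(fld,17) returns 2 and find(fld,10) takes the Min shortcut
#and returns 0.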
function find(fld,item, left,mid,right){
if (item == Min[fld,1]) {
return Min[fld,2];
}
left = Min[fld,2];
right = Max[fld,2];
while (left < right){
mid = int((left+right)/2);
if (item > Band[fld,mid,2]) {left=mid+1}
else {
if (item == Band[fld,mid,2] || item > Band[fld,mid,1]) {
return mid;
}
right=mid-1;
}
}
return left;
}
#}