#
#Sawtooth.awk
#©copyleft 2004
#by Andres Orrego and Tim Menzies.
#See also bin/sawtooth.
#
#Initialization
#{
BEGIN {
#Command line arguments:
# Verbose # for verbose mode
# Brief # for "brief mode"
# SUBBINS # Number of subdivisions per new value seen (discretization)
# ERA # Cache size (expressed in number of instances)
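# zThresh # z-test threshold used in Pass 2 (never assigned in this script, so it is expected from the command line; e.g. -2.326 for Z(0.01))
#Example invocation (an illustrative sketch only; the real driver is bin/sawtooth
#and the variable values shown here are arbitrary):
# gawk -f sawtooth.awk SUBBINS=5 ERA=100 Pass=1 train.arff Pass=2 test.arff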
FS=" *, *"; # Field separator is a comma and surrounding blank-spaces
#Internal globals:
Total=0; # count of all instances
Classified=0;# count of currently classified instances
Correct=0; # count of currently correctly classified instances
totClsfd=0; # count of all classified instances
totCorrect=0;# count of all correctly classified instances
stblClsfd=0; # count of classified insts. since stable
stblCrrct=0; # count of correctly classified insts. since stable
guess=""; # Stores the current class prediction
MaxAtt=0; # count of number of attributes
StTime = 0; # Start time
TrTime = 0; # Training Time
TsTime = 0; # Test Time
TtTime = 0; # Total Time
MININST=0; # minimum number of instances per band split
MAXINST=0; # max number of instances
MINBANDS=0; # minimum number of bands per attribute
MAXBANDS=0; # max number of bands
trained=0; # Flag set once the first ERA has been learned.
stDevAcc=0.0;# Standard deviation of the accuracies
meanAcc=0.0; # Mean of the accuracies
zTest=0.0; # ZTest results
# MaxBands # table of counters for attribute ranges
# NumAtt # table of index for numeric attributes
# Classes # table of class names/frequencies
# Freq # table of counters for values in attributes in classes
# Cache # temporary store of the instances in the current era
# Seen # table of counters for values in attributes
# Attributes # table of number of values per attribute
StTime = systime(); # Start timer.
Timer=0;
} # END Initialization
#}
#============================================================================
#Training
#{
Pass==1 {
# Processing header of train file.
if(FNR==1)
{
init(); # Checks for file type and attribute characteristics.
next;
}
do
{
if ($0~/^%/ || NF != MaxAtt) { if (getline <= 0) break; continue; } # skip comments and malformed rows
# Classification and testing.
guess = classify();
if ( $NF == guess ) { Correct++; }
++Classified;
# Caching instances
for ( a=1; a<=NF; a++ )
{
Cache[Classified,a] = $a;
}
if ( Classified >= ERA ) { break; }
if ( getline <= 0 ) { break; }
} while(Pass==1)
if(!trained){
trained = 1;
}
else{
stblClsfd += Classified; # Update counters
stblCrrct += Correct;
totClsfd += Classified;
totCorrect += Correct;
stDevAcc = StDev(Classified,Correct); # calculate test parameters.
newMeanAcc = Correct; #n*p=classified*(correct/classified)=correct
oldMeanAcc = stblCrrct*Classified/stblClsfd; #mu_0=n*p_0
# Standardized test statistics Z(0.01) = 2.326
zTest=ZTest(newMeanAcc,oldMeanAcc,stDevAcc,Classified);
if (zTest < -2.326){ # Not stable
Stable = -1;
stblCrrct=Correct; # Reset counters of Stable
stblClsfd=Classified;
}
else{ # Stable
Stable++;
}
}
if(Stable < 3){ # IF not yet stable for 3 consecutive eras
train(Cache,Classified); # THEN train on the cache.
}
delete Cache; # Reset Cache
Correct = Classified = 0; # Reset counters of ERA
if(NumDS){ # IF dataset has numeric attributes
updateBands(); # THEN Update discretization table after each ERA.
}
} # END Training
#}
#
#============================================================================
#Testing
#{
Pass==2 { # Classification time!
# Processing header of test file.
if ( FNR == 1 )
{
init(); # Checks for file type and attribute characteristics.
test(); # Checks for matching file type and attributes in init().
# update bands for numeric attributes
if ( NumDS ) { updateBands() }
TrTime = systime(); # training is over once the test file is reached
# initialize counters
instanceIndex = 0; # current instance number
sampleCount = 0; # number of instances in era
totalCount = 0; # total number of instances since stable
sampleSumMaxLikelihood = 0; # max likelihood sum for era
totalSumMaxLikelihood = 0; # max likelihood sum since stable
eraNum = 1; # current era number
Stable = 3; # number of stable eras
detectedClass = 0; # number of the detected class
# print header for the data
print "instance,era,index,max Likelihood,class,guess";
next;
}
do
{
if ($0~/^%/ || NF != MaxAtt) { if (getline <= 0) break; continue; } # skip comments and malformed rows
# classify instance; in addition to returning the guess, this sets the
# global maxLikelihood variable
guess = classify();
# adjust the max likelihood
maxLikelihood = log( maxLikelihood )
# keep track of the instance number
++instanceIndex;
# sum and count the max likelihoods
++sampleCount;
sampleSumMaxLikelihood += maxLikelihood;
# store the max likelihoods
sampleMaxLikelihoods[ sampleCount ] = maxLikelihood;
# set class of instance to be detected class if found
if ( detectedClass > 0 )
{
$NF = "'__" detectedClass "__'";
}
# output a record of information
print instanceIndex "," eraNum "," sampleCount "," \
exp( maxLikelihood ) "," $NF "," guess
# cache instances
for( a = 1; a <= NF; ++a )
{
Cache[sampleCount,a] = $a;
}
# stopping points
if ( sampleCount >= ERA ) { break; }
if ( getline <= 0 ) { break; }
} while ( Pass==2 )
# calculate average max likelihood for the sample
sampleAvgMaxLikelihood = sampleSumMaxLikelihood / sampleCount;
# set max likelihood array
meanMaxLikelihood[ eraNum ] = exp( sampleAvgMaxLikelihood );
# calculate the standard deviation for the sample
diffSqrSum = 0;
for ( arrayIndex in sampleMaxLikelihoods )
{
# sum the square of the difference between each instance and the mean
diff = sampleMaxLikelihoods[arrayIndex] - sampleAvgMaxLikelihood;
diffSqrSum += diff * diff;
}
# calculate the final standard deviation
if ( sampleCount > 1 )
{
sampleStdDev = sqrt( diffSqrSum / ( sampleCount - 1 ));
}
# if there is only one sample, then there is no deviation
else
{
sampleStdDev = 0;
}
# handle the first era
if ( eraNum == 1 )
{
totalAvgMaxLikelihood = sampleAvgMaxLikelihood;
}
# standardized z-test
zTest = ZTest( sampleAvgMaxLikelihood, totalAvgMaxLikelihood, sampleStdDev,
sampleCount );
# store the zTest value for the era
eraZtest[ eraNum ] = zTest;
# if change has occurred then mark as unstable
status[eraNum] = zTest < zThresh;
if ( status[eraNum] )
{
# only count as a new class if we are currently stable; otherwise a double
# count could result
changed[eraNum] = Stable >= 3;
if ( Stable >= 3 )
{
# store an entry for the era marking the unstable point
newClasEras[ eraNum ] = zTest;
# create a new class number
++detectedClass;
# set the class of all instances of current era to new class
for ( arrayIndex = 1; arrayIndex <= sampleCount; ++arrayIndex )
{
Cache[arrayIndex,NF] = "'__" detectedClass "__'";
}
}
# mark unstable
Stable = 0;
}
# otherwise, keep tracking the mean
else
{
# keep track of the sums
totalCount += sampleCount;
totalSumMaxLikelihood += sampleSumMaxLikelihood;
# calculate the total average
totalAvgMaxLikelihood = totalSumMaxLikelihood / totalCount;
}
# train if recently unstable
if ( Stable < 3 )
{
# train from the data
train( Cache, sampleCount );
# reset counters for the mean
totalCount = sampleCount;
totalSumMaxLikelihood = sampleSumMaxLikelihood;
totalAvgMaxLikelihood = sampleAvgMaxLikelihood;
}
# count the eras
++eraNum;
# count stable eras
++Stable;
# delete the sample arrays
delete Cache; # Reset Cache
delete sampleMaxLikelihoods;
# reset sample vars to 0
sampleCount = 0;
sampleSumMaxLikelihood = 0;
# update bands
if(NumDS){ # IF dataset has numeric attributes
updateBands(); # THEN Update discretization table after each ERA.
}
} # END Testing
#}
#============================================================================
#Post-Processing
#{
END {
#REDTAG: output any final stats if needed
# display max likelihood info
print "\nindex,mean max likelihood";
for ( arrayIndex in meanMaxLikelihood )
{
print arrayIndex "," meanMaxLikelihood[arrayIndex];
} # display max likelihood info
# display zTest value
print "\nera,Z-Test";
for ( arrayIndex in eraZtest )
{
print arrayIndex "," eraZtest[arrayIndex] "," status[arrayIndex] "," changed[arrayIndex];
} # display zTest value
# store the zTest value for the era
eraZtest[ eraNum ] = zTest;
# display eras where new classes were detected
print "\nera,ztest";
for ( arrayIndex in newClasEras )
{
print arrayIndex "," newClasEras[arrayIndex];
} # display eras where new classes were detected
TsTime = systime();
if (Verbose) { # When in verbose mode.
print "\n\nDISCRETE VALUES AFTER TESTING";
printDVals(); # Prints the discretized attributes.
printTable(); # Prints the bayesian table.
print "Number of Classes:",Attributes[MaxAtt];
print "Correctly Classified Instances : " totCorrect;
print "Total Number of Instances Classified : " totClsfd;
printf "accuracy: "
}
#if(!Brief) print totCorrect*100/totClsfd;
TtTime = systime();
if (Timer){ # When in timing mode, times are displayed.
printf "\nTraining Time: %-2.2dm %-2.2ds\n",
(TrTime-StTime)/60,(TrTime-StTime)%60;
printf "Testing Time : %-2.2dm %-2.2ds\n",
(TsTime-TrTime)/60,(TsTime-TrTime)%60;
printf "Total Time : %-2.2dm %-2.2ds\n",
(TtTime-StTime)/60,(TtTime-StTime)%60;
}
}
#}
#User Defined functions
#Standardized Test Statistic
#{
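#z = (newMean - oldMean) / (stDev / sqrt(totInst)); if stDev is 0 the raw
#difference is returned. For example, ZTest(80, 90, 5, 25) =
#(80 - 90)/(5/sqrt(25)) = -10, well below the -2.326 cutoff used in training,
#so that era would be flagged as unstable.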
function ZTest(newMean,oldMean,stDev,totInst){
if(stDev == 0) return newMean - oldMean;
return (newMean-oldMean)/(stDev/sqrt(totInst));
}
#}
#Standard Deviation
#{
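#Each classification is treated as a Bernoulli trial: with n instances and
#p = corr/n, the standard deviation of the number correct is sqrt(n*p*(1-p)).
#For example, StDev(100, 80) = sqrt(100*0.8*0.2) = 4.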
function StDev(n,corr, p){
if(n > 1){
p=corr/n;
return sqrt(n*p*(1-p));
}
return 0;
}
#}
#Data File Initialization
#{
function init( i){
if (FILENAME~/\.mbc$/){ # .MBC files.
for(i=1;i<=NF;i++){
header[i]=$i; # Stores the attribute value name i.
if ($i ~ /^\*/) {
NumAtt[i]=1; # Determines that the field i is numeric.
NumDS=1;
Min[i,1]=1e+32; # Minimum number in the field i.
Max[i,1]=-1e+32; # Maximum number in the field i.
}
else NumAtt[i]=0; # field i is discrete.
}
MaxAtt=NF; # Total number of attributes.
}
if (FILENAME~/\.arff$/){ # .ARFF files (WEKA).
FS = " ";
i=1;
while($0!~/^ *@r/ && $0!~/^ *@R/) getline;
while($0!~/^ *@d/ && $0!~/^ *@D/){
if ($0~/^ *@a/ || $0~/^ *@A/) {
header[i] = $2;
if ($3!~/^\{/){
NumAtt[i]=1;
NumDS=1;
Min[i,1]=1e+32;
Max[i,1]=-1e+32;
}
i++;
}
getline;
}
MaxAtt = i-1;
FS=" *, *";
}
}
#}
#Validating the Test Datafile
#{
function test( i){
if (FILENAME~/\.mbc$/){
for(i=1;i<=NF;i++){
if(header[i]!=$i){ # Matches atts names with train file.
print "Error. Invalid testing file. Exiting...";
exit 1;
}
}
}
if (FILENAME~/\.arff$/){
while($0!~/^ *@d/ && $0!~/^ *@D/){ # Skips file header.
getline;
}
}
}
#}
#Naive Bayes
#Bayesian Training Function
#{
function train(array,size, a,val,i,c) {
for(a=1;a<=size;a++){
Total++;
c=array[a,MaxAtt];
Classes[c]++;
for(i=1;i<=MaxAtt;i++) {
if (array[a,i] ~ /\?/) continue;
if (NumAtt[i]){
val=discretize(i,array[a,i]+0);
}
else val=array[a,i];
Freq[c,i,val]++
if (++Seen[i,val]==1) Attributes[i]++;
}
}
}
#}
#Bayesian Classification Function
#{
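#A sketch of the score computed below (m-estimate Naive Bayes, m=2):
# score(c) = prior(c) * PRODUCT over attributes i of
# (Freq[c,i,v_i] + m*prior(c)) / (Classes[c] + m)
# where prior(c) = (Classes[c] + 1) / (Total + number of classes).
#The class with the highest score wins and its score is kept in maxLikelihood.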
function classify( i,temp,what,like,c,prior,m,inst) {
what="?";
m=2;
like = 0;
for(c in Classes) { # for each candidate class
prior=(Classes[c]+1)/(Total+Attributes[MaxAtt]); # Laplace-corrected class prior
temp=prior;
for(i=1;i<MaxAtt;i++) { # for each non-class attribute
if ($i ~ /\?/) continue; # skip missing values
# numeric values are mapped to their discretized band
if (NumAtt[i]) inst=discretize(i,$i+0);
else inst=$i;
temp *= (Freq[c,i,inst]+(m*prior))/(Classes[c]+m); # m-estimate likelihood
}
if ( temp >= like ) {like = temp; what=c}
}
maxLikelihood = like;
return what;
}
#}
#SPADE
#SPADE's Bin Creation Function
#{
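#Bands are half-open intervals (lower,upper]. A sketch of how they grow,
#assuming SUBBINS=2: the first value 10 for an attribute creates band 0 =
#(10,10]; a later value 20, above the current max, adds the two new bands
#(10,15] and (15,20]; values already inside [min,max] are simply looked up.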
function discretize(fld,item, i,j,k,subdiv){
# numeric value lies between min max seen so far.
if (item>=Min[fld,1] && item<=Max[fld,1]) {
# search for the band who contains it and return its position.
return find(fld,item);
}
# creating first Band. It is of the form (item,item]
if (item<Min[fld,1] && item>Max[fld,1]){
i=0; # The index for the first band is 0
Band[fld,i,1]=item; # Band is the array where "fld" is the attribute,
Band[fld,i,2]=item; # " i" is the position of the band, "1" is the
# lower limit and "2" is the upper limit.
Min[fld,1]=item; # Min is an array to store the overall Min value
Min[fld,2]=i; # and its position, "1," and "2" respectively.
Max[fld,1]=item; # Max is analogous to Min.
Max[fld,2]=i;
MaxBands[fld]++; # The number of bands is now 1
return i;
}
# If the numeric value is less than the min seen so far.
if (item<Min[fld,1]){
subdiv = ((Band[fld,Min[fld,2],1] - item) / SUBBINS);
i=Min[fld,2]-1; # grow SUBBINS new bands downward from the old minimum
for(j=1; j<=SUBBINS; j++){
Band[fld,i,2]=Band[fld,i+1,1];
Band[fld,i,1]=Band[fld,i+1,1]-subdiv;
MaxBands[fld]++;
i--;
}
Min[fld,1]=item; # the new overall minimum and the index
Min[fld,2]=i+1; # of the band that holds it
return i+1;
}
# If the numeric value is greater than the max seen so far.
if (item>Max[fld,1]){
subdiv = ((item - Band[fld,Max[fld,2],2]) / SUBBINS);
i=Max[fld,2]+1; # grow SUBBINS new bands upward from the old maximum
for(j=1; j<=SUBBINS; j++){
Band[fld,i,1]=Band[fld,i-1,2];
Band[fld,i,2]=Band[fld,i-1,2]+subdiv;
MaxBands[fld]++;
i++;
}
Max[fld,1]=item; # the new overall maximum and the index
Max[fld,2]=i-1; # of the band that holds it
return i-1;
}
}
#}
#Updating Bins of a Single Numeric Attribute
#{
function updateBand(i, j,k,l,m,n,p,old,busy,temp){
n=Max[i,2]; # index of the last band
old=Min[i,2]; # index of the band currently being merged into
for(j=old+1; j<=n; j++){
# Merge band j into band "old" while "old" is still under-populated
# and the pooled count stays below MAXINST.
if(Seen[i,j]<(MAXINST-Seen[i,old]) && Seen[i,old]<MININST){
Band[i,old,2]=Band[i,j,2]; # widen the surviving band
Seen[i,old]+=Seen[i,j]; # pool the value counts
for(k in Classes) Freq[k,i,old]+=Freq[k,i,j];
MaxBands[i]--;
}
else{ # "old" is big enough: band j becomes the next surviving band
old++;
Band[i,old,1]=Band[i,j,1];
Band[i,old,2]=Band[i,j,2];
Seen[i,old]=Seen[i,j];
for(k in Classes) Freq[k,i,old]=Freq[k,i,j];
}
if(old<j){ # clear the slot band j used to occupy
delete Seen[i,j];
for(k in Classes) delete Freq[k,i,j];
}
}
Max[i,2]=old; # index of the last surviving band
}
#}
#Updating the Bins of All Numeric Attributes
#{
function updateBands( j){
MININST = sqrt(Total); # Sets variables needed for updating
MAXINST = MININST * 2; # the values of the current attribute
MAXBANDS = MININST; #* SUBBINS;
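#For example, after Total=400 training instances: MININST=20, MAXINST=40
#and MAXBANDS=20, so any numeric attribute holding more than 20 bands is
#passed to updateBand below.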
for(j=1; j<=MaxAtt; j++){
if(NumAtt[j]){
if (MaxBands[j]>MAXBANDS){
updateBand(j);
}
}
}
}
#}
#Displaying the Predictive Model
#{
function printTable( i,j,k){
printf "\n| %17s B A Y E S I A N T A B L E %17s|", " ", " ";
printf "\n| %-15.15s | %-15.15s | %-9.9s | %-6.6s | %-6.6s |\n",
"ATTRIBUTE","VALUE","CLASS","FREQ","SEEN";
for(i=1;i<=MaxAtt;i++){
print Attributes[i];
for(j in Freq){
split(j,k,SUBSEP);
if (k[2]==i && Seen[k[2],k[3]] > 0){
if (NumAtt[i])
printf "| %-15.15s | (%2.4f,%-2.4f] | %-9.9s | %6d | %6d |\n",
header[i], Band[i,k[3],1],Band[i,k[3],2], k[1],
Freq[k[1],k[2],k[3]], Seen[k[2],k[3]];
else
printf "| %-15.15s | %-15.15s | %-9.9s | %6d | %6d |\n",
header[i], k[3], k[1], Freq[k[1],k[2],k[3]],
Seen[k[2],k[3]];
}
}
}
}
#}
#Displaying SPADE's Bins
#{
function printDVal(j, i,k){
print "Att Name:", header[j] ,
"\tNumber of Bands:",MaxBands[j],
"\tMin Val:",Min[j,1],
"\t\tMax Val:",Max[j,1];
for(i=Min[j,2]; i<=Max[j,2]; i++){
printf "( %s , %s ]=%d ",Band[j,i,1],Band[j,i,2],Seen[j,i];
}
print "\n";
}
#}
#Displaying All Attribute Conversions
#{
function printDVals( i){
for(i=1; i<=MaxAtt; i++)
if (NumAtt[i]) printDVal(i);
}
#}
#Finding a Number's Bin
#{
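#Binary search over the bands of attribute fld. For example, given the
#hypothetical layout bands 0=(10,10], 1=(10,15], 2=(15,20] with Min=10 and
#Max=20, find(fld,17) returns 2 and find(fld,10) takes the Min shortcut
#and returns 0.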
function find(fld,item, left,mid,right){
if (item == Min[fld,1]) {
return Min[fld,2];
}
left = Min[fld,2];
right = Max[fld,2];
while (left < right){
mid = int((left+right)/2);
if (item > Band[fld,mid,2]) {left=mid+1}
else {
if (item == Band[fld,mid,2] || item > Band[fld,mid,1]) {
return mid;
}
right=mid-1;
}
}
return left;
}
#}