# Local Calibration using gawk
# Dan Baker, Summer 2006. Adapted from the work of Tim Menzies & Scott Chen, 2005
#
# Fits the A and B terms of Boehm's COCOMO effort equation to local project
# data by least squares in log space, then scores the fit on test records:
#
#     log(Effort) = A + (B + ScaleFactor) * log(Kloc) + log(EffortMultiplier)
#
# Input records are comma separated and laid out as:
#   - 0 or more exponential scale factors in the left columns
#     (ScaleFactorCount of them)...
#   - ...followed by linear effort multipliers...
#   - ...followed by the program's size in thousands of lines of code...
#   - ...and finally the project's actual effort in work-months.
#
# Control variables:
#   Pass is never assigned by this script; the caller must set it to 1 for
#   training records and to 2 for testing records.
#   Every setting assigned in BEGIN is assigned unconditionally, so a value
#   given with "-v name=value" is overwritten (-v assignments happen before
#   BEGIN runs). Override settings with operand assignments placed before the
#   file they apply to, for example:
#
#     gawk -f THIS_SCRIPT Skip=1 Pass=1 train.csv Pass=2 test.csv
#
#   A and/or B may be preset the same way; a preset non-zero value is used
#   as-is instead of being calibrated (A is in log space).
#
# Output (one line, comma separated, only if at least one test record was
# scored):  exp(A), B, mean MRE in percent, percent of test records whose MRE
# is below Pred percent.
#
# Exit status: 1 if calibration was required but impossible (see Pass==2).

BEGIN {
    IGNORECASE = 1;        # Ignore case in string comparisons and regex matches (gawk).
    FS = ",";              # Input field separator.
    OFS = ",";             # Output field separator.
    Pred = 30;             # MRE threshold, in percent, for the Pred statistic.
    E = exp(1);            # Euler's number. The rules below call exp() directly;
                           # E is kept only in case something else reads it.
    ScaleFactorCount = 0;  # Number of scale-factor columns. Zero for COCOMO-I.
    Skip = 2;              # Number of leading records of EACH input file to ignore
                           # (compared against FNR), e.g. CSV header lines.
    PredN = 0;             # Test records whose MRE fell below Pred percent.
    TestingCount = 0;      # Number of test records scored.
    TrainingCount = 0;     # Number of training records accumulated.
    Seed = 1;              # Default seed handed to srand().
    Sum1 = 0;              # Sum of log(Kloc)
    Sum2 = 0;              # Sum of (log(Kloc))^2
    Sum3 = 0;              # Sum of log(ActualEffort/EffortMultiplier)
    Sum4 = 0;              # Sum of log(Kloc)*log(ActualEffort/EffortMultiplier)
    SumMre = 0;            # Sum of the magnitude of relative error over test records.
    IgnoreEM = 0;          # Set to 1 to ignore the effort multipliers.
    CalibrationFailed = 0; # Set when A/B cannot be derived; suppresses the report.
}

# Seed the random number generator. No rule in this script calls rand(); the
# call is kept so the script's observable setup is unchanged.
NR == 1 { srand(Seed); }

# Strip a '%' comment (from the '%' to the end of the record). A bare '%' is
# used because "\%" is not a regexp operator and gawk warns about it.
{ sub(/%.*/, "") }

# Ignore records that are empty (or became empty after comment stripping).
/^[ \t]*$/ { next }

# Ignore everything from a record matching @relation through one matching @data.
/@relation/,/@data/ { next }

# Ignore the first Skip records of each input file.
FNR <= Skip { next }

# Decode the current record into ScaleFactor, EffortMultiplier, Kloc and
# ActualEffort. Records that cannot be used are reported and skipped.
{
    # Need at least the scale-factor columns plus Kloc and ActualEffort.
    if (NF < ScaleFactorCount + 2) {
        printf("warning: %s:%d: skipping record with too few fields (%d)\n",
               FILENAME, FNR, NF) > "/dev/stderr";
        next;
    }

    # The first fields hold factors that affect the effort exponentially;
    # each contributes 0.01 * value to the exponent term B.
    ScaleFactor = 0;
    for (I = 1; I <= ScaleFactorCount; I++)
        ScaleFactor += (0.01 * $I);

    # The fields after the scale factors and before the last two are
    # multiplied together into one linear effort multiplier.
    EffortMultiplier = 1;
    if (IgnoreEM == 0) {
        for (I = ScaleFactorCount + 1; I <= (NF - 2); I++)
            EffortMultiplier *= $I;
    }

    # "+ 0" forces a numeric value so the range checks below compare numbers
    # even when a field holds non-numeric text (which then evaluates to 0).
    Kloc = $(NF - 1) + 0;    # Size in thousands of lines of code.
    ActualEffort = $NF + 0;  # Recorded effort in work-months.

    # Every later use takes log() of, or divides by, these values, so they
    # must be strictly positive. Otherwise the run would abort on a division
    # by zero or the sums would be corrupted by -inf/NaN.
    if (Kloc <= 0 || ActualEffort <= 0 || EffortMultiplier <= 0) {
        printf("warning: %s:%d: skipping record with non-positive size, effort or effort multiplier\n",
               FILENAME, FNR) > "/dev/stderr";
        next;
    }
}

# In the first pass, accumulate the sums needed to locally calibrate the
# A and B values in Boehm's Cocomo equation.
Pass == 1 {
    LogKloc = log(Kloc);
    LogAdjustedEffort = log(ActualEffort / EffortMultiplier);

    TrainingCount++;
    Sum1 += LogKloc;
    Sum2 += (LogKloc * LogKloc);
    Sum3 += LogAdjustedEffort;
    Sum4 += (LogKloc * LogAdjustedEffort);
}

# In the second pass, derive A and B (once they are needed and not preset)
# from the Pass 1 sums, then score the current test record.
Pass == 2 {
    if (TrainingCount && (!A || !B)) {
        # Shared denominator of the least-squares intercept and slope.
        Denominator = TrainingCount * Sum2 - Sum1 * Sum1;

        # It is exactly zero with a single training record or when every
        # log(Kloc) is 0; dividing by it would abort gawk with a fatal error.
        if (Denominator == 0) {
            print "error: cannot calibrate A and B: the training records do not provide at least two distinct sizes" > "/dev/stderr";
            CalibrationFailed = 1;
            exit 1;
        }

        if (!A) A = (Sum2 * Sum3 - Sum1 * Sum4) / Denominator;
        if (!B) B = (TrainingCount * Sum4 - Sum1 * Sum3) / Denominator;
    }

    TestingCount++;

    # Estimate the effort with Boehm's Cocomo formula (computed in log space,
    # then exponentiated) and accumulate the error statistics.
    EstimatedEffort = exp(((B + ScaleFactor) * log(Kloc)) + A + log(EffortMultiplier));
    Re = (EstimatedEffort - ActualEffort) / ActualEffort;  # Relative error.
    Mre = Re < 0 ? -1 * Re : Re;                           # Its magnitude.
    SumMre += Mre;
    if (Mre < (Pred / 100))
        PredN++;
}

END {
    # A failed calibration has already been reported; do not print a report
    # built from unusable coefficients.
    if (CalibrationFailed)
        exit 1;

    if (TestingCount > 0)
        print exp(A), B, 100 * SumMre / TestingCount, 100 * PredN / TestingCount;
}