# Local Calibration using gawk
# Dan Baker, Summer 2006. Adapted from the work of Tim Menzies & Scott Chen, 2005
#
# Purpose:
#   Fits the A and B constants of Boehm's COCOMO effort equation to local data
#   with a least-squares regression in log space (pass 1), then prints an
#   effort estimate next to the actual effort for every test record (pass 2).
#
# Input layout (comma-separated, one project per record):
#   - 0 or more exponential scale factors in the left columns
#     (how many is given by ScaleFactorCount)...
#   - ...followed by linear effort multipliers...
#   - ...followed by the program's size in thousands of lines of code...
#   - ...and finally the project's actual effort in work-months.
#   Text after a '%' is treated as a comment; blank records and everything from
#   an "@relation" line through an "@data" line are ignored.
#
# Variables read but never assigned by this script: Pass (1 = training record,
# 2 = testing record) and, optionally, A and B (a non-zero value is kept rather
# than calibrated). They are presumably supplied as command-line assignment
# operands placed between the file names -- TODO confirm against the caller.
# Note that BEGIN assigns Seed, Skip, ScaleFactorCount and TargetEMs, so a
# "-v" setting of those would be overwritten; only operand assignments, which
# awk processes after BEGIN, can change them.
#
# Output (pass 2 only): "EstimatedEffort,ActualEffort", one line per record.
#
# Calibration needs at least two training records with differing sizes.
# Otherwise the regression denominator is zero; the script then reports the
# problem on stderr and exits with status 1.

BEGIN {
    IGNORECASE = 1;          # Ignore case in string comparisons and regex matches.
    FS = ",";                # Input field separator.
    OFS = ",";               # Output field separator.
    Pred = 30;               # Range for Pred (not used by the rules in this script).
    E = 2.71828182846;       # Approximation of e; kept for compatibility, exp() is used instead.
    ScaleFactorCount = 0;    # Number of scale-factor columns. Zero for COCOMO-I.
    Skip = 0;                # Set to 1 or 2 to skip that many header lines of each file.
    PredN = 0;               # Number of records within Pred range (not used here).
    TestingCount = 0;        # Number of testing records seen.
    TrainingCount = 0;       # Number of training records accepted.
    Seed = 1;                # Default seed for the random number generator.
    Sum1 = 0;                # Sum of log(Kloc)
    Sum2 = 0;                # Sum of (log(Kloc))^2
    Sum3 = 0;                # Sum of log(ActualEffort/EffortMultiplier)
    Sum4 = 0;                # Sum of log(Kloc)*log(ActualEffort/EffortMultiplier)
    TargetEMs = "Set this to equal -a-b-c- where a, b, and c are the column numbers of effort multipliers you want to use.";
}

# Seed the random number generator once, on the very first record.
NR == 1 { srand(Seed); }

# Strip a trailing '%' comment from the current record. '%' is not a regex
# metacharacter, so it needs no backslash.
{ sub(/%.*/, "") }

# Ignore empty (or now-empty) records.
/^[ \t]*$/ { next }

# Ignore everything between @relation and @data, inclusive.
/@relation/,/@data/ { next }

# Ignore the first Skip records of each input file.
FNR <= Skip { next }

# For every remaining record, derive the quantities used by both passes.
{
    # Scale factors affect the effort exponentially: each selected column adds
    # 0.01 * value to the exponent term B.
    ScaleFactor = 0;
    for (I = 1; I <= ScaleFactorCount; I++) {
        target = "-" I "-";
        if (index(TargetEMs, target) != 0) {   # Column I is selected in TargetEMs.
            ScaleFactor += (0.01 * $I);
        }
    }

    # Effort multipliers affect the effort linearly: the product of the selected
    # columns between the scale factors and the last two fields.
    EffortMultiplier = 1;
    for (I = ScaleFactorCount + 1; I <= (NF - 2); I++) {
        target = "-" I "-";
        if (index(TargetEMs, target) != 0) {
            EffortMultiplier *= $I;
        }
    }

    Kloc = $(NF-1);        # Size in thousands of lines of code.
    ActualEffort = $NF;    # Actual recorded effort in work-months.
}

# Pass 1: accumulate the sums needed to locally calibrate A and B.
Pass == 1 {
    # A record whose size or multiplier-adjusted effort is not strictly positive
    # has no finite logarithm and would turn every running sum into inf/NaN, and
    # a zero multiplier product would be a fatal division by zero. Skip such
    # records instead. "Kloc + 0" forces a numeric comparison even when the
    # field is non-numeric text.
    if ((Kloc + 0) <= 0 || EffortMultiplier == 0 || (ActualEffort / EffortMultiplier) <= 0) {
        print "warning: " FILENAME ":" FNR ": training record skipped (size and adjusted effort must be positive)" > "/dev/stderr";
        next;
    }

    LogKloc = log(Kloc);
    LogAdjustedEffort = log(ActualEffort / EffortMultiplier);

    TrainingCount++;
    Sum1 += LogKloc;
    Sum2 += (LogKloc * LogKloc);
    Sum3 += LogAdjustedEffort;
    Sum4 += (LogKloc * LogAdjustedEffort);
}

# Pass 2: calibrate on first need, then estimate the effort of each test record.
Pass == 2 {
    TestingCount++;

    # Derive A and/or B from the pass-1 sums unless a non-zero value is already
    # present (for example one supplied by the caller).
    if (TrainingCount && (!A || !B)) {
        Denominator = TrainingCount * Sum2 - Sum1 * Sum1;
        if (Denominator == 0) {
            # Happens with a single training record or identical sizes.
            print "error: cannot calibrate A and B: need at least two training records with different sizes" > "/dev/stderr";
            exit 1;
        }
        if (!A) A = (Sum2 * Sum3 - Sum1 * Sum4) / Denominator;
        if (!B) B = (TrainingCount * Sum4 - Sum1 * Sum3) / Denominator;
    }

    # Boehm's COCOMO formula evaluated in log space:
    #   ln(effort) = (B + ScaleFactor) * ln(Kloc) + A + ln(EffortMultiplier)
    EstimatedEffort = ((B + ScaleFactor) * log(Kloc)) + A + log(EffortMultiplier);
    EstimatedEffort = exp(EstimatedEffort);

    print EstimatedEffort, ActualEffort;
}