# Local Calibration using gawk
# Dan Baker, Summer 2006.  Adapted from the work of Tim Menzies & Scott Chen, 2005
# Assumes that 0 or more exponential scale factors are in the left columns...
# ...followed by linear effort multipliers...
# ...followed by the program's size in thousands of lines of code...
# ...and finally the actual project's effort in work-months.
# test

# One-time setup, run before any input is read: input/output format, tunable
# settings, and the accumulators used by the two passes.
# NOTE(review): Pass is never assigned in this file, so it is presumably set
# by command-line operand assignments (e.g. `Pass=1 train.csv Pass=2 test.csv`);
# such operands are processed after BEGIN and can therefore override any of
# the defaults below -- confirm against the calling script.
BEGIN {
	# --- Input / output format ---
	IGNORECASE = 1;			# Case-insensitive regex and string comparisons (gawk extension).
	FS = ",";				# Input records are comma-separated.
	OFS = ",";				# The result line is comma-separated as well.

	# --- Tunable settings ---
	Pred = 30;				# Pred threshold in percent: a record is a "hit" when its MRE is below Pred/100.
	ScaleFactorCount = 0;	# Number of leading exponential scale-factor columns.  Zero for cocomo-I.
	Skip = 2;				# Records with FNR <= Skip are dropped from every input file (e.g. csv header lines).
	Seed = 1;				# Default seed handed to srand() when the first record is read.
	IgnoreEM = 0;			# Set to 1 to ignore the effort multipliers (treat their product as 1).

	# --- Constants ---
	E = exp(1);				# Euler's number at full double precision (was a truncated literal).

	# --- Pass 1 accumulators (training data) ---
	TrainingCount = 0;		# Number of training records.
	Sum1 = 0;				# Sum of log(Kloc)
	Sum2 = 0;				# Sum of (log(Kloc))^2
	Sum3 = 0;				# Sum of log(ActualEffort/EffortMultiplier)
	Sum4 = 0;				# Sum of log(Kloc)*log(ActualEffort/EffortMultiplier)

	# --- Pass 2 accumulators (testing data) ---
	TestingCount = 0;		# Number of testing records.
	PredN = 0;				# Number of testing records whose MRE is within the Pred range.
}

# Record pre-filters.  They run in this order on every input record, before
# the record is decoded; any rule that executes `next` discards the record.

# Seed the random number generator when the very first record arrives.  Doing
# this here rather than in BEGIN means a Seed given as a command-line operand
# before the first file is already in effect.
NR==1					{ srand(Seed); }

# Strip a `%` comment: everything from the first `%` to the end of the record.
# `%` is not a regexp metacharacter, so it is matched unescaped; the former
# `\%` escape is reported by gawk as an unknown regexp operator.
						{ sub(/%.*/,"") }

# Drop records that are empty or whitespace-only (including stripped comment lines).
/^[ \t]*$/				{ next }

# Drop the header section running from a line matching @relation through the
# next line matching @data, inclusive.
/@relation/,/@data/		{ next }

# Drop the first Skip records of each input file (FNR restarts for every file).
FNR <= Skip				{ next }

# Decode the current record into the globals used by both passes:
#   Kloc             - second-to-last field
#   ActualEffort     - last field
#   ScaleFactor      - 0.01 times the sum of the first ScaleFactorCount fields
#   EffortMultiplier - product of the fields between the scale factors and the
#                      last two fields, or 1 when IgnoreEM is non-zero
{
	# The two trailing columns are read directly.
	Kloc = $(NF-1);
	ActualEffort = $NF;

	# Leading columns: accumulate the exponent adjustment that is later added to B.
	ScaleFactor = 0;
	I = 1;
	while (I <= ScaleFactorCount)
	{
		ScaleFactor += (0.01 * $I);
		I++;
	}

	# Middle columns: fold the linear effort multipliers into a single product.
	EffortMultiplier = 1;
	if (IgnoreEM == 0)
	{
		I = ScaleFactorCount + 1;
		while (I <= (NF-2))
		{
			EffortMultiplier *= $I;
			I++;
		}
	}
}

# First pass (Pass==1): accumulate the sums needed to locally calibrate the A
# and B values in Boehm's Cocomo equation.  The second pass turns these sums
# into a least-squares fit of
#     log(ActualEffort/EffortMultiplier) = A + B*log(Kloc)
#
# A record is only usable when Kloc, ActualEffort and EffortMultiplier are all
# positive numbers: a zero (or non-numeric) effort multiplier would make the
# division below a fatal error, and a non-positive Kloc or effort would put
# -inf/NaN into the sums.  Unusable records are reported on stderr and left
# out of TrainingCount.
Pass==1 {
	# The `+ 0` forces a numeric comparison even when the field is not a number.
	if ((Kloc + 0) > 0 && (ActualEffort + 0) > 0 && EffortMultiplier > 0) {
		LogKloc = log(Kloc);
		LogEffort = log(ActualEffort / EffortMultiplier);	# Effort with the multipliers factored out.

		TrainingCount++;
		Sum1 += LogKloc;
		Sum2 += (LogKloc * LogKloc);
		Sum3 += LogEffort;
		Sum4 += (LogKloc * LogEffort);
	} else {
		printf("%s:%d: warning: training record skipped (size, effort and effort multiplier must all be > 0)\n", FILENAME, FNR) > "/dev/stderr";
	}
}

# Second pass (Pass==2): estimate the effort of each testing record with the
# calibrated model and accumulate the error statistics reported in END.
#
#   estimate = E ^ ( (B + ScaleFactor)*log(Kloc) + A + log(EffortMultiplier) )
#   Mre      = |estimate - ActualEffort| / ActualEffort
#
# A and B are derived from the pass-1 sums unless they already hold a non-zero
# value (for example one assigned on the command line).
Pass==2 {
	# The relative error divides by the actual effort, so a zero or non-numeric
	# value would be a fatal division by zero.  Such records are reported and
	# excluded from TestingCount so the averages in END stay meaningful.
	if ((ActualEffort + 0) == 0) {
		printf("%s:%d: warning: testing record skipped (actual effort is zero or not a number)\n", FILENAME, FNR) > "/dev/stderr";
		next;
	}

	# On entering pass 2, calculate the A and B values from the information
	# retained from pass 1 (ordinary least squares on the log-log data).
	if (TrainingCount && (!A || !B)) {
		Denominator = TrainingCount*Sum2 - Sum1*Sum1;
		if (Denominator == 0) {
			# Happens e.g. with a single training record: the slope is undefined
			# and the divisions below would abort gawk with a bare
			# "division by zero" error.  END still runs after this exit.
			printf("error: cannot calibrate A and B from %d training record(s): all log(Kloc) values coincide\n", TrainingCount) > "/dev/stderr";
			exit 1;
		}
		if (!A)  A = (Sum2*Sum3 - Sum1*Sum4) / Denominator;
		if (!B)  B = (TrainingCount*Sum4 - Sum1*Sum3) / Denominator;
	}

	TestingCount++;

	# Calculate the estimated effort using Boehm's Cocomo formula (evaluated in
	# log space, then exponentiated) and then calculate some statistics.
	EstimatedEffort = ((B + ScaleFactor) * log(Kloc)) + A + log(EffortMultiplier);
	EstimatedEffort = E ^ EstimatedEffort;
	Re = (EstimatedEffort - ActualEffort) / ActualEffort;	# Signed relative error.
	Mre = Re < 0 ? -Re : Re;								# Magnitude of the relative error.
	SumMre += Mre;
	if (Mre < (Pred/100)) PredN++;							# Count records inside the Pred range.
}

# Final report.  When at least one testing record was scored, print a single
# OFS-separated line:
#   E^A, B, mean MRE in percent, percentage of testing records counted in PredN
# Nothing is printed when no testing record was scored.
END {
	if (TestingCount > 0) {
		CalibratedA = E ^ A;								# A is held in log space; report it as the multiplicative constant.
		MeanMrePercent = (100 * SumMre) / TestingCount;
		PredPercent = (100 * PredN) / TestingCount;
		print CalibratedA, B, MeanMrePercent, PredPercent;
	}
}