#!/usr/bin/gawk -f
#
# Fit the two free parameters (A, B) of a log-linear effort model by ordinary
# least squares, then print predicted versus actual effort for each test row.
#
# Input is comma-separated, one project per row:
#   columns 1 .. ScaleFactors         scale factors
#   columns ScaleFactors+1 .. NF-2    effort multipliers
#   column  NF-1                      program size (in Kloc)
#   column  NF                        actual effort
# Text from a '%' to the end of a line is removed; blank lines are skipped.
#
# Model, in log space (this is exactly what the rules below compute):
#   log(effort) = A + B*log(Kloc)
#                 + sum_i 0.01 * ScaleFactor_i * log(Kloc)
#                 + sum_j log(EffortMultiplier_j)
#
# The rules are keyed on a variable named Pass that this script never assigns:
#   Pass==1   rows are accumulated into the training sums
#   Pass==2   rows are scored with the fitted A and B
# NOTE(review): presumably Pass is set by command-line assignments placed
# between the file operands (e.g. "Pass=1 train.csv Pass=2 test.csv");
# confirm against the caller.
#
# Output: a header line "LC,STAR", then one "predicted,actual" line per scored
# row (repeated Repeats times).

BEGIN {
    Repeats      = 1;             # how many times each scored row is printed
    IGNORECASE   = 1;
    FS = OFS     = ",";
    E            = 2.71828182846; # base used to undo the log transform
    ScaleFactors = 5;             # number of leading scale-factor columns
    # set to 1 if this is a csv file and we should skip the header line
    Skip         = 1;
    Tests = Trains = 0;
    Seed         = 1;
}

# Very first record of the run: emit the output header and seed the RNG.
NR == 1 {
    print "LC,STAR";
    srand(Seed);
}

# Strip '%' comments. A bare '%' is already literal in a regexp, so no
# backslash is needed (newer gawk warns about the unknown escape "\%").
{ sub(/%.*/, ""); }

# Ignore lines that are empty once comments have been removed.
/^[ \t]*$/ { next; }

# Skip the first Skip lines of every input file (the csv header).
FNR <= Skip { next; }

# Transform the row into log space and collect its fixed contribution (Eaf).
{
    # Effort multipliers, Kloc and effort become natural logs; a zero or
    # empty field is mapped to 0 instead of being passed to log().
    for (I = ScaleFactors + 1; I <= NF; I++)
        $I = $I ? log($I) : 0;

    Kloc = $(NF - 1);             # now log(Kloc)
    Pm   = $NF;                   # now log(actual effort)

    # Each scale factor contributes 0.01 * value * log(Kloc).
    for (I = 1; I <= ScaleFactors; I++)
        $I = 0.01 * ($I) * Kloc;

    # Eaf: sum of every column except the last two (Kloc and effort).
    Eaf = 0;
    for (I = 1; I <= (NF - 2); I++)
        Eaf += $I;
}

# Training pass: accumulate the sums needed for a least-squares line through
# the points (x = log Kloc, y = log effort - Eaf).
Pass == 1 {
    Trains++;
    Sum1 += Kloc;                 # sum of x
    Sum2 += Kloc * Kloc;          # sum of x^2
    Sum3 += Pm - Eaf;             # sum of y
    Sum4 += (Pm - Eaf) * Kloc;    # sum of x*y
}

# Scoring pass: fit A and B on first use, then print predicted,actual effort.
Pass == 2 {
    for (R = 1; R <= Repeats; R++) {
        Tests++;
        if (Trains) {
            # A and B are only filled in while they are still zero/unset, so
            # values that are already non-zero are left untouched.
            Denom = Trains * Sum2 - Sum1 * Sum1;
            if (Denom != 0) {
                if (!A) A = (Sum2 * Sum3 - Sum1 * Sum4) / Denom;
                if (!B) B = (Trains * Sum4 - Sum1 * Sum3) / Denom;
            } else {
                # Degenerate training data (a single row, or identical sizes):
                # the slope cannot be determined and dividing by Denom would
                # abort gawk with a fatal "division by zero". Keep B as it is
                # and use the least-squares intercept for that fixed slope.
                if (!Warned) {
                    print "warning: size has no variance in the training rows; " \
                          "slope left at B=" (B + 0) > "/dev/stderr";
                    Warned = 1;
                }
                if (!A) A = (Sum3 - B * Sum1) / Trains;
            }
        }
        Got  = A + Eaf + B * Kloc;   # predicted log(effort)
        Got  = E ^ Got;              # back to the original effort scale
        Want = E ^ Pm;
        print Got, Want
    }
}

END { }