#!/usr/bin/gawk -f ##Wrapper Function ##K-fold evaluation, (k-1)/kth as training set, and the other as test set, ##Each time to select the best attribute subset ##Run k time to get how often the attribute are selected as its ranking ##COCOMO local calibration as target learner ##Select the best attribute subsest to minimize Root Mean-Square Error - RMSE of the test set ## ##Sep. 2005 ##Input the linear dataset (must be ARFF format), the attributes must folow the order: ## AEXP,DATA,PCAP,VEXP,MODP,RELY,TOOL,TURN,CPLX,LEXP,SCED,VIRT,ACAP,STOR,TIME,SIZE,Acutal Effort ## If you want to change the order, you can update the variables of AttsString ## Another better way is to write a function to get the order string of attributes from ARFF header ## ##Output the ranking of the attributes BEGIN{ # command line options FS=","; #added: semi-colon. tells emacs how to indent IGNORECASE=1; DataFlag=0; # Switch header to data, when first time this script reach data, set dataflag=1 AttrString = "AEXP,DATA,PCAP,VEXP,MODP,RELY,TOOL,TURN,CPLX,LEXP,SCED,VIRT,ACAP,STOR,TIME"; # Show the order of the attribtes for the data file FileName="xps01"; TotalPrj=0; # Caculate how many projects (rows) in this data file TotalAtt=0; # Caculate how many attributes in this data file, not including SIZE and PM Folds=10; # k-fold evaluation Out="ranking.txt"; # Print out the ranking for the attributes of this dataset Stop=5; # Stale: adding parameters has not improved the performance. Stop: after a MAX STALE number of times in the continue steps delete StoreData; # Two-dimension array, used to store the orginal lineared data, initilized here delete Ranking; # Store the ranking of the attributes, initilized here delete AttIndex; # Use array to store the index of the attributes, sorted by ranking, initilized here delete AttName; # Use array to store the name of the attributes from AttrString, initilized here delete BestAtt; # Store the best attribute at that time, at the first time, it should be empty delete RestAtt; # Store the un-selected attribute at that time, at the first time, it should include all attributes delete SelectedAtt; # Store the selected attribute at that time, at the first time, it should be empty. # Combine the attributes from BestAtt and SelectedAtt to evalute whether this will get better performance than those only from BestAtt delete Training; # Indidcate which project is used in training set delete Testing; # Indidcate which project is used in test set Ee=271801/99990; # Used in logarithm Inf = 10**32; # The largest number } { sub(/\%.*/,"") } # Delete comments /^[ \t]*$/ { next } # Skip /@attribute/ { split($0,Words," "); Attribute[++Attrs]=Words[2]; next } # Skip header - attributes information DataFlag { TotalPrj=store(StoreData,TotalPrj); } # Save the data for futher processes /@data/ { DataFlag=1; next } # switch header to data ##=========== store the data for futher processes ================================== ## datarr is two-dimension array, store the orginal lineared data ## Caculate how many projects (rows) in this data file ## i is local variable function store(datarr,n, i) { n++; for(i=1;i<=NF;i++) { datarr[n,i]= $i; } return n; } ##=========== Setup - initialize some variables ================================== ## ranking - Store the ranking of the attributes, sorted by ranking ## attIndex - Use array to store the index of the attributes, sorted by ranking ## attName - Use array to store the name of the attributes from AttrString ## i, j are local variables function setup(ranking,attName,attIndex,attrString, i,total,j) { total=NF-2; for(i=1;i<=total;i++){ ## Initialized ranking and attIndex ranking[i]=0; attIndex[i]=i; } split(attrString,attName,","); ##Change the attrString to array return total; } ##=========== Wrapper Function ================================== ## Get the ranking of the attributes based on how often the attributes are ## selected for the best attributes by randomly generating the training set and test set ## function wrapper(storeData,totalAtt,totalPrj,folds,ranking,attIndex,attName,best,rest,selected,training,testing,stop, i,j,rmse) { for (i=1;i<=folds;i++){ # How many times we should do the selections for the best attributes by ramdonly generating training set and test set initial(totalAtt,totalPrj,folds,best,selected,rest,training,testing,i); rmse=pickBestAtt(storeData,training,testing,totalAtt,totalPrj,stop,best,selected,rest); # Obtain the selected attributes for the best performance for(j=1;j<=totalAtt;j++){ # For each attribute if(best[j]>0){ # If the attribute is selected as the best attribute ranking[j]++; } } # increase the ranking by 1 at each time the attribute is selected } for(j in ranking) print j " " Attribute[j] " " ranking[j] | "sort -r -n -k 3" #quickSort(ranking,attIndex,totalAtt); # Sort the ranking, largest first #for (j=1;j<=totalAtt;j++){ # Print the ranking #print attIndex[j] " " attName[attIndex[j]] " " ranking[j] #>>Out; #} } ##=========== Output experience scripts ================================== ## function outputXP(fName,totalAtt,attIndex,attName,ranking, fileName,groups,i,j,tmp,tmp1,k) { fileName=fName; print "#-*- sh -*-" > fileName; print "Classifier=\"cl\"" >> fileName; print "Methods=\"jump\"" >> fileName; print "TimeNum=1" >> fileName; print "TimesubNum=1 # n repeats" >> fileName; print "Time=\"R01\"" >> fileName; print "Timesub=\"sub01\"" >> fileName; print "Half=15" >> fileName; print "N=30 # n repeats" >> fileName; print "N1=29 # n repeats" >> fileName; print "Some=1 # use all*Some of the data" >> fileName; print "F=3 # 1/F test, rest for train" >> fileName; print "Learners=\"lsr\" # run these learners" >> fileName; print "DataDirOriginal=\"$HOME/toyz/data\" # over data in this directory" >> fileName; print "Preds=\"30\" # pred levels" >> fileName; print "Logging=1 #true if data and actual efforts have been linearized" >> fileName; print "Zeroing=1" >> fileName; print "Dataln=\"" fName "\" " "#The original data" >> fileName; print "Dirs=\"1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30\"" >> fileName; groups=0; tmp=-1; for (j=totalAtt;j>=1;j--){ # How many group for method jump if(tmp!=ranking[j]){ tmp=ranking[j]; groups++; } } printf "Groupingsjumpcl=\"" >> fileName; for (j=0;j<=groups;j++){ # print the group number printf j " " >> fileName; } print "\"" >> fileName; printf "GroupingsFSSjumpcllsr=\"All " >> fileName; for (j=1;j<=groups;j++){ # print the fss name if(j<10){ printf "FS0" j " " >> fileName; }else{ printf "FS" j " " >> fileName; } } print "\"" >> fileName; print "Groupjumpcl[0]=\"20\"" >> fileName; tmp=-1; tmp1=-1; k=0; for (j=totalAtt;j>=1;j--){ # How many group for method jump if(tmp!=ranking[j]){ tmp=ranking[j]; k++; printf "Groupjumpcl[" k "]=\"" >> fileName; for(i=totalAtt;i>j;i--){ printf attIndex[i] "," >> fileName; } printf attIndex[j] >> fileName; if(j>1){ tmp1=ranking[j-1]; } else{ tmp1=-1; } if(tmp==tmp1){ ##the same group printf "," >> fileName; } else{ print "\"" >> fileName; } } else{ printf attIndex[j] >> fileName; if(j>1){ tmp1=ranking[j-1]; } else{ tmp1=-1; } if(tmp==tmp1){ ##the same group printf "," >> fileName; } else{ print "\"" >> fileName; } } } print "Datasjumpcl[0]=\"All\"" >> fileName; for (j=1;j<=groups;j++){ # print the fss name printf "Datasjumpcl[" j "]=\"" >> fileName; if(j<10){ printf "FS0" j >> fileName; } else{ printf "FS" j >> fileName; } print "\"" >> fileName; } } ##=========== initialize some variables ================================== ## function initial(totalAtt,totalPrj,folds,best,selected,rest,training,testing,timeth, i) { for (i=1; i<=totalAtt; i++){ ## initialize best, selected, rest best[i]=0; selected[i]=0; rest[i]=1; } for (i=1; i<=totalPrj; i++){ ## initialize training set, and test set training[i]=0; testing[i]=0; } i=1; ## Generating k-fold training set, and test set while (i<=totalPrj/folds*(timeth-1) && i<=totalPrj){ training[i]=1; testing[i]=1; i++; } while (i<=totalPrj/folds*timeth && i<=totalPrj){ training[i]=0; testing[i]=0; i++; } while (i<=totalPrj){ training[i]=1; testing[i]=1; i++; } } #################################################### # generic stuff ##==================pickBestAtt===================== ## Select the best attributes #Helper function pickBestAtt(storeData,training,testing,totalAtt,totalPrj,stop,best,selected,rest, bestnum,usednum,step,rmse) { bestnum=0; usednum=0; step=0; rmse=Inf; rmse=pickNext(storeData,training,testing,totalAtt,totalPrj,stop,rmse,best,selected,rest,step,bestnum,usednum); return rmse; } ##==================pickNext===================== ## Select the best attributes # worker function pickNext(storeData,training,testing,totalAtt,totalPrj,stop,lowestRMSE,best,selected,rest,step,bestnum,usednum, ni,min,which) { if ( unempty(rest,totalAtt) ) { # If something left to do # Find out the next best attribute from the rest list nextBest(storeData,training,testing,totalAtt,totalPrj,selected,rest,ni); min = ni["min"]; # The new error which = ni["which"]; # The index of the next best attribute if (min <= lowestRMSE) { # if improvement step=0; # Not in Stale: adding parameters has not improved the performance. # step should be reset to ZERO bestnum++; # How many best attributes usednum=bestnum; # How many attributes are used to try to get a better results. add(selected,which); # The ith attribute is selected as one of the best attributes copy(selected,best,totalAtt); # Copy the selected attributes that make the best performance minus(rest,which); # Remove that attribute from the rest list lowestRMSE=min; return pickNext(storeData,training,testing,totalAtt,totalPrj,stop,lowestRMSE,best,selected,rest,step,bestnum,usednum); # recurse } else{ # If Not improvement usednum++; # How many attributes are used to try to get a better results step++; # In Stale: adding parameters has not improved the performance. # step should be plus one if (step<=stop && usednum < totalAtt){ # if not reach the stop criteria add(selected,which); minus(rest,which); return pickNext(storeData,training,testing,totalAtt,totalPrj,stop,lowestRMSE,best,selected,rest,step,bestnum,usednum); # recurse } } } return lowestRMSE; # else, print last value } ##==================nextBest===================== ## Find the next best attribute that it make the best performance with the selected attributes function nextBest(storeData,training,testing,totalAtt,totalPrj,selected,rest,ni, selection,attError,i,j,new,min,which) { for(i=1; i<=totalAtt; i++){ if(rest[i]>0){ attError[i] =0; } } for(i=1; i<=totalAtt; i++){ if(rest[i]>0){ for (j=1; j<=totalAtt; j++){ selection[j]=0; } selection[i]=1; attError[i] =score(storeData,training,testing,selected,selection,totalPrj,totalAtt); } } min = Inf; which=0; for(j=1; j<=totalAtt; j++){ if(rest[j]>0){ new = attError[j]; if (new < min) { min = new; which = j; } } } ni["which"] = which; ni["min"] = min; } ##==================score===================== ## function score(storeData,training,testing,selected,selection,totalPrj,totalAtt, rmse,sum1,sum2,sum3,sum4,eaf,est,a,b,i,j,k,kloc,pm,tmp1,tmp2) { rmse=0; #Root Mean-Square Error (RMSE),select the parameters that minimize RMSE between the predicted and actual values over all the data. sum1=0; sum2=0; sum3=0; sum4=0; j=0; #How many projects are used in training set for(i=1; i<=totalPrj; i++){ eaf[i]=0; #The sum of linear value of the selection attributes for(k=1; k<=totalAtt; k++){ if(selection[k]>0 || selected[k]>0){ eaf[i] += storeData[i,k]; } } if(training[i]>0){ j++; #How many projects are used in training set kloc=storeData[i,totalAtt+1]; pm=storeData[i,totalAtt+2]; sum1 += kloc; sum2 += kloc*kloc; sum3 += pm - eaf[i]; sum4 +=(pm - eaf[i])*kloc; } } a=(sum2*sum3-sum1*sum4)/(j*sum2-sum1*sum1); b=(j*sum4-sum1*sum3)/(j*sum2-sum1*sum1); k=0; #How many projects are used in test set for(i=1;i<=totalPrj;i++) { if(testing[i]>0){ k++; est[i]=a+eaf[i]+b*storeData[i,totalAtt+1]; pm=storeData[i,totalAtt+2]; tmp1=Ee^(est[i]); tmp2=Ee^(pm); rmse +=abs((tmp1-tmp2)*(tmp1-tmp2)); } } rmse=sqrt(rmse/k); return rmse; } ##==================quickSort===================== ## function quickSort(numbers,numIndex,array_size) { q_sort(numbers,numIndex,1,array_size); } function q_sort(numbers,numIndex,left,right, pivot,l_hold,r_hold,localIndex) { l_hold = left; r_hold = right; pivot = numbers[left]; localIndex = numIndex[left]; while (left < right) { while ((numbers[right] <= pivot) && (left < right)) right--; if (left != right) { numbers[left] = numbers[right]; numIndex[left] = numIndex[right]; left++; } while ((numbers[left] >= pivot) && (left < right)) left++; if (left != right) { numbers[right] = numbers[left]; numIndex[right] = numIndex[left]; right--; } } numbers[left] = pivot; numIndex[left] = localIndex; pivot = left; left = l_hold; right = r_hold; if (left < pivot) q_sort(numbers,numIndex,left,pivot-1); if (right > pivot) q_sort(numbers,numIndex,pivot+1,right); } # set stuff function empty(a,totalAtt, i){ for(i=1;i<=totalAtt;i++) a[i]=0; } function unempty(a,totalAtt, i){ for(i=1;i<=totalAtt;i++){ if(a[i]>0) return 1; } return 0; } function minus(a,x){ a[x]=0; } function add(a,x){ a[x]=1; } function copy(a,b,totalAtt, i){ for(i=0;i<=totalAtt;i++){ b[i]=a[i]; } } function swapnum(numArray,left,right, tmp){ tmp=numArray[left]; numArray[left]=numArray[right]; numArray[right]=numArray[left]; } function abs(n) { if (n<0) {return -1*n} else {return n} } END { TotalAtt=setup(Ranking,AttName,AttIndex,AttrString); wrapper(StoreData,TotalAtt,TotalPrj,Folds,Ranking,AttIndex,AttName,BestAtt,RestAtt,SelectedAtt,Training,Testing,Stop); #outputXP(FileName,TotalAtt,AttIndex,AttName,Ranking); }