#!/bin/bash
# by Dan Baker, WVU, 2006
# This script creates a range of estimates using the cocomost learner.
# This is meant to be used on a single test record to create
# an actual prediction: i.e. it's not meant for an experiment (no actual, mre's listed)
#
# Usage: <this script> Train Test
#   Train - file of training records
#   Test  - file holding exactly one test record
#
# The input must be all numeric without any comments, like the output of ../baker_lc/convertcsv.awk
#
# The output is in the form
#Estimate,a,b,CostDrivers,Seed
# There will be N+1 estimates made. There are N estimates using X% of Train, and one estimate that uses
# all of Train and can be noted by a NA value in the Seed column.
#
# External programs used: gawk, ./cocomostFSS, ../baker_lc/lc.awk, sort, malign.
# The relative paths mean the script has to be started from its own directory.

#Set up variables
if (( $# < 2 )); then
  echo "Usage: ${0##*/} Train Test" >&2
  exit 2
fi

Train=$1
Test=$2

# Both inputs are read more than once, so they have to be readable files.
for InFile in "$Train" "$Test"; do
  if [[ ! -r "$InFile" ]]; then
    echo "Error: cannot read input file: $InFile" >&2
    exit 1
  fi
done

# Scratch file that holds the random subset of Train for each iteration.
# A private mktemp file replaces the fixed /tmp/$USER/vtrain.tmp path, which
# required that directory to exist already and collided between parallel runs.
vTrain=$(mktemp "${TMPDIR:-/tmp}/vtrain.XXXXXX") || {
  echo "Error: could not create a temporary file." >&2
  exit 1
}
trap 'rm -f -- "$vTrain"' EXIT

N=10    # number of sub-sampled estimates
X=0.9   # fraction of Train used for each sub-sampled estimate

# Number of training records in each subset: X * (records in Train), rounded.
XNum=$(gawk -v X="$X" 'END{print int((NR*X)+0.5)}' "$Train")

#Assumes only one test record
TestCount=$(gawk 'END{print NR}' "$Test")
if (( TestCount > 1 )); then
  echo "Error: This script is meant for only one test record." >&2
  exit 1
fi

# Prints one estimate line (everything except the Seed column) for the
# training file given as $1, evaluated against $Test.
# ./cocomostFSS supplies the cost-driver list, which is handed to both passes
# of lc.awk as TargetEMs.
RunEstimate() {
  local TrainFile=$1
  local CostDrivers
  CostDrivers=$(./cocomostFSS "$TrainFile")
  gawk -f ../baker_lc/lc.awk \
    Pass=1 TargetEMs="$CostDrivers" "$TrainFile" \
    Pass=2 Mode=5 TargetEMs="$CostDrivers" "$Test"
}

{
  #Header Line
  echo "#Estimate,a,b,CostDrivers,Seed"

  #Run cocomost with all the training data
  Results=$(RunEstimate "$Train")
  printf '%s,NA\n' "$Results"

  for (( I = 1; I <= N; I++ )); do
    # Create a random X% of the training data
    Seed=$RANDOM
    # Every row is stored under a random key; traversing the array then yields
    # the rows in an arbitrary order and the first Xn of them form the subset.
    # CONVFMT is widened because numeric subscripts are turned into strings
    # with it, and the default "%.6g" would make distinct rand() values collide.
    # NOTE(review): the row-selection statement was truncated in the source
    # ("if (c>TFile}"); it is reconstructed here as "keep the first Xn rows" --
    # confirm against the original script.
    # The shell redirect rewrites the scratch file from empty on every pass.
    gawk -v Seed="$Seed" -v Xn="$XNum" '
      BEGIN { CONVFMT = "%.20g" }
      NR == 1 { srand(Seed) }
      { Dat[rand()] = $0 }
      END {
        c = 0
        for (I in Dat) {
          if (c < Xn) print Dat[I]
          c++
        }
      }' "$Train" > "$vTrain"

    Results=$(RunEstimate "$vTrain")
    printf '%s,%s\n' "$Results" "$Seed"
  done
} | sort | malign