#!/usr/bin/gawk -f
#
# Reads comma-separated records, computes one error value per record for each
# combination of (field 2, field 4, field 6), compares every pair of
# combinations with analyze(), and writes per-combination tie/win/loss
# counters to the file named by RanksFileName.
#
# Settings (override with "-v NAME=value" or a NAME=value operand):
#   Method         error measure: MRE, MER, AR, BRE or IBRE   (default "MRE")
#   RanksFileName  name of the output file                    (default "ranks")

BEGIN {
    FS = OFS = ","

    # -v assignments are performed before BEGIN runs, so only fill in a
    # default when the caller did not supply a value.
    if (RanksFileName == "")
        RanksFileName = "ranks"
    if (Method == "")
        Method = "MRE"

    # Labels for the three key columns. Set once here so the header line of
    # the output is complete even when no input record has 10 fields.
    header = "wrapper,numeric,learner"
}

# Only records with exactly 10 fields are scored.
NF == 10 {
    key = $2 FS $4 FS $6
    Prediction = $(NF-1)
    Actual = $(NF)

    # Zero is replaced by a tiny value so the divisions below cannot divide
    # by zero; the literal text "inf" is replaced by a large finite number.
    if (Actual == 0) Actual = 10^(-20)
    if (Prediction == 0) Prediction = 10^(-20)
    if (Actual == "inf") Actual = 10^(20)
    if (Prediction == "inf") Prediction = 10^(20)

    # An unrecognised Method leaves Performance untouched, as before.
    if (Method == "MRE") {
        # |Actual - Prediction| / Actual
        Performance = (Actual - Prediction) / Actual
        if (Performance < 0) Performance = -1 * Performance
    } else if (Method == "MER") {
        # |Actual - Prediction| / Prediction
        Performance = (Actual - Prediction) / Prediction
        if (Performance < 0) Performance = -1 * Performance
    } else if (Method == "AR") {
        # |Actual - Prediction|
        Performance = Actual - Prediction
        if (Performance < 0) Performance = -1 * Performance
    } else if (Method == "BRE") {
        # Over-estimates are divided by Actual, under-estimates by Prediction.
        # NOTE(review): unlike MRE/MER/AR the result keeps its sign - confirm
        # this is intended for a "lower is better" ranking.
        if (Prediction - Actual >= 0)
            Performance = (Prediction - Actual) / Actual
        else
            Performance = (Prediction - Actual) / Prediction
    } else if (Method == "IBRE") {
        # Under-estimates are divided by Actual, over-estimates by Prediction.
        # NOTE(review): signed, same remark as for BRE.
        if (Prediction - Actual < 0)
            Performance = (Prediction - Actual) / Actual
        else
            Performance = (Prediction - Actual) / Prediction
    }

    # N[key] counts the values stored for this key; values are 1-indexed.
    PerformanceArray[key, ++N[key]] = Performance
}

# Sorts PerformanceArray[key, 1..N[key]] in place (ascending, via asort).
#   key        the key whose values are sorted
#   tempArray  scratch array; callers omit it
# i and n are locals.
function sortArray(key, tempArray,    i, n) {
    n = N[key]

    # Copy the values for this key into a plain 1..n array ...
    for (i = 1; i <= n; i++)
        tempArray[i] = PerformanceArray[key, i]

    # ... sort it ...
    asort(tempArray)

    # ... and write the sorted values back.
    for (i = 1; i <= n; i++)
        PerformanceArray[key, i] = tempArray[i]
}

END {
    for (key in N)
        sortArray(key)

    # Compare every unordered pair of distinct keys exactly once.
    for (firstKey in N) {
        for (secondKey in N) {
            if (firstKey == secondKey)
                continue

            newKey = firstKey FS secondKey
            newKeyReverse = secondKey FS firstKey

            # Skip the pair if it was already analysed in either order.
            # "in" is used so the lookup does not create empty entries.
            if ((newKey in tempKeyArray) || (newKeyReverse in tempKeyArray))
                continue

            keyCounter++
            tempKeyArray[newKey] = 1
            analyze(firstKey, secondKey)
        }
    }

    # Blank line on standard output after the pair comparisons.
    print ""

    # The output file is opened once by the first ">" and stays open, so the
    # same operator is used for every line written to it.
    print header, "ties,wins,losses" > RanksFileName
    for (key in N)
        print key, tie[key]+0, winMedianRank[key]+0, lossMedianRank[key]+0 > RanksFileName
    close(RanksFileName)
}
# Updates a running median-rank value while walking a group's values in
# ascending order.
#   current  median value accumulated so far
#   rank     rank of the group's idx-th value in the merged sample
#   idx      1-based position of that value inside its own group
#   n        size of the group
# Odd n: returns the rank of the middle value. Even n: returns the mean of
# the ranks of the two middle values once both have been seen.
function medianRankStep(current, rank, idx, n) {
    if (n % 2 == 1)
        return (idx == (n + 1) / 2) ? rank : current
    if (idx == n / 2)
        return rank
    if (idx == n / 2 + 1)
        return (current + rank) / 2
    return current
}

# Compares the two sorted value lists PerformanceArray[firstKey, 1..N[firstKey]]
# and PerformanceArray[secondKey, 1..N[secondKey]] with a rank-sum (U) statistic
# and a normal approximation, then records the outcome:
#   - |Z| <= 1.96: tie[] is incremented for both keys;
#   - otherwise the key whose median rank is lower gets winMedianRank[]++ and
#     the other lossMedianRank[]++ (equal medians count as a tie).
# Also prints the global keyCounter followed by a space on standard output.
#
#   firstKey, secondKey            keys of the two groups to compare
#   MergedArray, MergedRanksArray  scratch arrays; callers omit them
# Everything after MergedRanksArray is a local variable. Declaring them as
# extra parameters keeps them out of the global namespace, where they would
# otherwise be shared with the rest of the program.
function analyze(firstKey, secondKey, MergedArray, MergedRanksArray,
                 n1, n2, total, i, j, tieEnd, rankTotal, sharedRank,
                 firstIdx, secondIdx, firstRankSum, secondRankSum,
                 firstMedianRank, secondMedianRank,
                 firstKeyU, secondKeyU, meanU, sdU,
                 firstKeyZ, secondKeyZ, Z) {
    n1 = N[firstKey]
    n2 = N[secondKey]
    total = n1 + n2

    # Merge both groups into one array and sort it.
    for (i = 1; i <= n1; i++)
        MergedArray[i] = PerformanceArray[firstKey, i]
    for (j = 1; j <= n2; j++)
        MergedArray[n1 + j] = PerformanceArray[secondKey, j]
    asort(MergedArray)

    # Assign ranks; a run of equal values shares the mean of its positions.
    for (i = 1; i <= total; i = tieEnd + 1) {
        tieEnd = i
        rankTotal = i
        while (tieEnd < total && MergedArray[i] == MergedArray[tieEnd + 1]) {
            tieEnd++
            rankTotal += tieEnd
        }
        sharedRank = rankTotal / (tieEnd - i + 1)
        for (j = i; j <= tieEnd; j++)
            MergedRanksArray[j] = sharedRank
    }

    # Walk the merged array once, attributing each position to the group
    # whose next unconsumed value matches, and accumulate rank sums and
    # median ranks. Values tied across groups carry the same rank, so giving
    # the first group priority does not change the sums.
    firstIdx = 1
    secondIdx = 1
    firstRankSum = 0.0
    secondRankSum = 0.0
    firstMedianRank = 0
    secondMedianRank = 0
    for (i = 1; i <= total; i++) {
        if (firstIdx <= n1 && MergedArray[i] == PerformanceArray[firstKey, firstIdx]) {
            firstRankSum += MergedRanksArray[i]
            firstMedianRank = medianRankStep(firstMedianRank, MergedRanksArray[i], firstIdx, n1)
            firstIdx++
        } else if (secondIdx <= n2 && MergedArray[i] == PerformanceArray[secondKey, secondIdx]) {
            secondRankSum += MergedRanksArray[i]
            secondMedianRank = medianRankStep(secondMedianRank, MergedRanksArray[i], secondIdx, n2)
            secondIdx++
        }
    }

    # U statistics and their normal approximation.
    firstKeyU = firstRankSum - n1 * (n1 + 1) / 2
    secondKeyU = secondRankSum - n2 * (n2 + 1) / 2
    meanU = n1 * n2 / 2
    sdU = (n1 * n2 * (total + 1) / 12) ^ (0.5)
    firstKeyZ = (firstKeyU - meanU) / sdU
    secondKeyZ = (secondKeyU - meanU) / sdU

    printf("%d ", keyCounter)

    # The two Z values are equal in magnitude and opposite in sign, so the
    # larger one is compared to the critical value of 1.96 (95% confidence).
    Z = (firstKeyZ > secondKeyZ) ? firstKeyZ : secondKeyZ

    if (Z >= 0 && Z <= 1.96) {
        tie[firstKey]++
        tie[secondKey]++
        return
    }

    # Significant difference: decide win vs. loss on the median rank. The
    # values are error measures, so the lower median rank wins.
    if (firstMedianRank < secondMedianRank) {
        winMedianRank[firstKey]++
        lossMedianRank[secondKey]++
    } else if (firstMedianRank > secondMedianRank) {
        lossMedianRank[firstKey]++
        winMedianRank[secondKey]++
    } else {
        tie[firstKey]++
        tie[secondKey]++
    }
}