#!/usr/bin/gawk -f
#
# Pairwise rank-sum (Mann-Whitney U) comparison of error samples.
#
# Input : comma-separated records with exactly three fields
#             key,Got,Want
#         Records with a different field count are ignored.
# Per record the absolute percentage error |Got - Want| / Want * 100 is
# stored under its key.
#
# Output:
#   rawResults  - one line per unordered key pair with both sample sizes
#                 and both z-scores of the U statistic.
#   rankResults - per key: number of ties, wins and losses over all pairs.
#   stdout      - the running pair counter (progress indicator).
#
# Requires gawk (uses asort()).

BEGIN {
    FS = OFS = ","
    RawResultsFileName = "rawResults"
    RanksFileName = "rankResults"
}

# Collect one absolute percentage error per well-formed record.
NF == 3 {
    key = $1
    Got = $2
    Want = $3

    # A zero (or non-numeric, e.g. header row) expected value would make
    # gawk abort with a fatal division by zero; skip such records instead.
    if (Want + 0 == 0) {
        printf("%s:%d: skipping record, expected value is zero or not numeric: %s\n", \
               (FILENAME == "" ? "stdin" : FILENAME), FNR, $0) > "/dev/stderr"
        next
    }

    Mre = (Got - Want) / Want
    Mre = (Mre < 0 ? -Mre : Mre) * 100

    # N[key] is the sample size; MreArray[key,1..N[key]] holds the sample.
    # MreArray[key,0] mirrors the count, as in the original layout.
    N[key]++
    MreArray[key, 0] = N[key]
    MreArray[key, N[key]] = Mre
}

END {
    print "keyCounter, firstKey, N[firstKey], secondKey, N[secondKey], firstKeyZ, secondKeyZ" > RawResultsFileName

    for (firstKey in N) {
        for (secondKey in N) {
            if (firstKey == secondKey)
                continue

            # Make sure each unordered pair is analyzed only once.
            newKey = firstKey FS secondKey
            newKeyReverse = secondKey FS firstKey
            if (!(newKey in tempKeyArray) && !(newKeyReverse in tempKeyArray)) {
                keyCounter++
                tempKeyArray[newKey] = 1
                analyze(firstKey, secondKey)
            }
        }
    }

    # Terminate the progress line written by analyze().
    print ""

    print "key,ties,wins,losses,comparison" > RanksFileName
    for (key in N)
        print key, tie[key] + 0, winMedianRank[key] + 0, lossMedianRank[key] + 0, "median" > RanksFileName
}

# Index of the (lower) median in a sorted 1-based array of `count` items:
# (count+1)/2 for odd counts, count/2 for even counts.
function lowerMedianIndex(count)
{
    return int((count + 1) / 2)
}

# Rank of `value` in the pooled sample.
#   pooled[1..total] - pooled observations, sorted ascending
#   ranks[1..total]  - rank of pooled[i]; tied values share one rank
# Returns 0 if the value is not present (cannot happen for values taken
# from one of the pooled groups).
function pooledRankOf(value, pooled, ranks, total,    i)
{
    for (i = 1; i <= total; i++)
        if (pooled[i] == value)
            return ranks[i]
    return 0
}

# Sum of the pooled ranks of one group's observations.
#   sorted[1..count] - the group's observations, sorted ascending
#   pooled, ranks, total - as for pooledRankOf()
# Each observation contributes exactly once. Because both arrays are
# sorted, a single forward walk over `pooled` is sufficient; tied values
# all carry the same rank, so the walk does not need to advance past a
# match.
function pooledRankSum(sorted, count, pooled, ranks, total,    i, p, sum)
{
    p = 1
    sum = 0
    for (i = 1; i <= count; i++) {
        while (p < total && pooled[p] < sorted[i])
            p++
        sum += ranks[p]
    }
    return sum
}

# Compare the error samples of two keys with a Mann-Whitney U test
# (normal approximation, critical value 1.96) and update the global
# tallies tie[], winMedianRank[] and lossMedianRank[].
#
# Callers pass only firstKey and secondKey; every further parameter is a
# local scratch variable.
#
# Side effects: appends one line to RawResultsFileName and writes the
# global keyCounter to stdout.
function analyze(firstKey, secondKey, tempArray, tempRanksArray, tempFirstArray, tempSecondArray,
                 m, n, total, i, j, tieEnd, avgRank,
                 firstKeySum, secondKeySum, firstKeyU, secondKeyU,
                 meanU, sdU, firstKeyZ, secondKeyZ, Z,
                 firstMedianRank, secondMedianRank)
{
    m = N[firstKey]
    n = N[secondKey]
    total = m + n

    # Pool both samples and keep a separate copy of each group.
    for (i = 1; i <= m; i++)
        tempArray[i] = tempFirstArray[i] = MreArray[firstKey, i]
    for (i = 1; i <= n; i++)
        tempArray[m + i] = tempSecondArray[i] = MreArray[secondKey, i]

    asort(tempArray)
    asort(tempFirstArray)
    asort(tempSecondArray)

    # Assign ranks; a run of equal values i..tieEnd shares the average of
    # its positions. The run may extend to the very last pooled element.
    for (i = 1; i <= total; i = tieEnd + 1) {
        tieEnd = i
        while (tieEnd < total && tempArray[tieEnd + 1] == tempArray[i])
            tieEnd++
        avgRank = (i + tieEnd) / 2
        for (j = i; j <= tieEnd; j++)
            tempRanksArray[j] = avgRank
    }

    # Rank sum of the first group; the second follows from the fact that
    # all ranks add up to total*(total+1)/2.
    firstKeySum = pooledRankSum(tempFirstArray, m, tempArray, tempRanksArray, total)
    secondKeySum = total * (total + 1) / 2 - firstKeySum

    firstKeyU = firstKeySum - m * (m + 1) / 2
    secondKeyU = secondKeySum - n * (n + 1) / 2

    meanU = m * n / 2
    sdU = sqrt(m * n * (m + n + 1) / 12)

    firstKeyZ = (firstKeyU - meanU) / sdU
    secondKeyZ = (secondKeyU - meanU) / sdU

    print keyCounter, firstKey, m, secondKey, n, firstKeyZ, secondKeyZ > RawResultsFileName
    printf("%d ", keyCounter)

    # The two z-scores are equal in magnitude and opposite in sign, so
    # the larger one is compared to the critical value 1.96 (95%).
    Z = (firstKeyZ > secondKeyZ) ? firstKeyZ : secondKeyZ
    if (Z >= 0 && Z <= 1.96) {
        tie[firstKey]++
        tie[secondKey]++
        return
    }

    # Significant difference: decide win/loss by the pooled rank of each
    # group's (lower) median.
    firstMedianRank = pooledRankOf(tempFirstArray[lowerMedianIndex(m)], tempArray, tempRanksArray, total)
    secondMedianRank = pooledRankOf(tempSecondArray[lowerMedianIndex(n)], tempArray, tempRanksArray, total)

    # Ranks are over errors, so the lower median rank wins.
    if (firstMedianRank < secondMedianRank) {
        winMedianRank[firstKey]++
        lossMedianRank[secondKey]++
    } else if (firstMedianRank > secondMedianRank) {
        lossMedianRank[firstKey]++
        winMedianRank[secondKey]++
    } else {
        # Equal median ranks are counted as a tie for both keys.
        tie[firstKey]++
        tie[secondKey]++
    }
}