#!/usr/bin/gawk -f
#
# Per-group error statistics for a comma-separated file of predictions.
#
# Only records with exactly 11 fields are used:
#   - fields 3,4,5,6,7 and 9 (joined with OFS) form the grouping key,
#   - field 10 ($(NF-1)) is the predicted value ("Got"),
#   - field 11 ($NF)     is the actual value    ("Want").
#
# For every key, END prints one comma-separated line:
#   key, percentage of records whose relative error is <= Pred (truncated),
#   record count, mean relative error (%), its standard deviation,
#   standard deviation / mean, and the correlation between Got and Want.
# Keys whose correlation cannot be computed (corr() returns ErrorValue)
# are not printed.

BEGIN {
    FS = OFS = ","
    Pred = 30                       # relative-error threshold, in percent
    ErrorValue = "ZeroSpAndSpa"     # sentinel returned by corr() when undefined
}

NF == 11 {
    Key  = $3 OFS $4 OFS $5 OFS $6 OFS $7 OFS $9
    Got  = $(NF-1)
    Want = $(NF)

    # Negative predictions are replaced by a very small positive number.
    # NOTE(review): the original comment spoke of replacing "0", but the
    # test has always been `< 0`; behaviour is kept as-is - confirm intent.
    Got = Got < 0 ? 10^(-20) : Got

    if (Want + 0 == 0) {
        # The relative error divides by Want; gawk aborts on division by
        # zero, so a zero or non-numeric actual value (e.g. a header row)
        # is skipped and counted instead of killing the whole run.
        Skipped++
    } else {
        push(Got,  Key, Gots)
        push(Want, Key, Wants)

        # Magnitude of the relative error, as a percentage.
        Mre = (Got - Want) / Want
        Mre = (Mre < 0 ? -1 * Mre : Mre) * 100

        PredN[Key] += (Mre <= Pred)
        Sum[Key]   += Mre
        SumSq[Key] += Mre * Mre
        N[Key]++
    }
}

# push(x, k, a): append x to the list stored under key k in array a.
# a[k,0] holds the list length; the items live in a[k,1] .. a[k,a[k,0]].
function push(x, k, a) {
    a[k, ++a[k, 0]] = x
}

END {
    for (I in PredN) {
        Mean = Sum[I] / N[I]
        Sd   = sd(SumSq[I], Sum[I], N[I])
        CorrValue = corr(I, Gots, Wants)    # computed once, reused below
        if (CorrValue != ErrorValue)
            # Mean is 0 only when every relative error is 0 (then Sd is 0
            # as well); print 0 for Sd/Mean rather than dividing by zero.
            print I,
                  int(PredN[I] / N[I] * 100),
                  N[I],
                  Mean,
                  Sd,
                  (Mean ? Sd / Mean : 0),
                  CorrValue
    }
    if (Skipped)
        printf("%d record(s) skipped: actual value is zero or non-numeric\n",
               Skipped) > "/dev/stderr"
}

# corr(k, predicts, actuals): sample correlation coefficient between the two
# lists stored under key k (layout as produced by push()).
# Returns ErrorValue when it is undefined: fewer than two samples, or zero
# variance in either list.
# The names after the extra spaces in the parameter list are locals.
function corr(k, predicts, actuals,    i, n, sump, suma, meanp, meana, spa, sp, sa) {
    n = predicts[k, 0]
    if (n < 2)
        return ErrorValue       # (n-1) below would be a fatal division by zero

    sump = suma = sp = spa = sa = 0
    for (i = 1; i <= n; i++) {
        sump += predicts[k, i]
        suma += actuals[k, i]
    }
    meanp = sump / n
    meana = suma / n

    for (i = 1; i <= n; i++) {
        sp  += (predicts[k, i] - meanp)^2
        sa  += ( actuals[k, i] - meana)^2
        spa += (predicts[k, i] - meanp) * (actuals[k, i] - meana)
    }
    sp  = sp  / (n - 1)
    sa  = sa  / (n - 1)
    spa = spa / (n - 1)

    return sp * sa ? spa / sqrt(sp * sa) : ErrorValue
}

# sd(sumSq, sum, n): sample standard deviation from the running sum of
# squares, the running sum and the count. Returns 0 when the computed
# numerator is not positive (which includes n == 1 and rounding noise).
function sd(sumSq, sum, n,    ss) {
    ss = sumSq - ((sum * sum) / n)
    if (ss <= 0)
        return 0
    return sqrt(ss / (n - 1))
}