# /* vim: set filetype=awk : */ -*- awk -*-
#
# near: read a delimiter-separated file of numerics and print the rows that
# lie closest to a query point. The query point is given on the command line
# as one or more "-z name=value" flags, where name is a column name taken from
# the header line. Distances are Euclidean over the named columns, with each
# column's difference scaled by that column's observed range (Max - Min).
#
# Helpers such as about(), prints(), inits(), getOpts(), globals(), push(),
# bad(), trim(), columnNumbers(), copyleft() and imPatient(), and the globals
# Ccomment, Cheader, Cnothing, Inf and NegInf, are not defined here; they are
# expected to come from the files named in the "requirez" lines below.

BEGIN {
  Who  = "Tim Menzies";
  What = "near";
  When = "2006";
  # Option letters accepted by this tool; "i:" is listed because set()
  # handles -i and usage() documents it.
  How  = "cd:D:hi:n:l:M:p:z:";
  Why  = "return the n nearest neighbours";
}

# Print the banner and the flag summary.
function usage() {
  about(Who,What,When,Why);
  prints("Usage: "What" [FLAGS] FILE"," ",
         "FILE is a DVS file of all numerics"," ",
         " -c show copyright",
         " -n NUM show the NUM nearest neighbours; default=[" Want "]",
         " -d CHAR input fields seperator; default=[" FS "]",
         " -p NUM patience=NUM; i.e. number of tolerated errors; default=[" Patience "]",
         " -D CHAR output fields seperator; default=[" OFS "]",
         " -i STR ignore fields list in a comma-seperated string STR",
         " -l STR apply a log transform to fields listed in a comma-seperated string STR",
         " -M NUM minimum value for each numeric; default=[" TheMin "]",
         " -z a=b define zero distance at point a=b;",
         " (and the command line can have several -z entries)");
}

# requirez(dvs.awk)
# requirez(bad.awk)
# requirez(trim.awk)
# requirez(stack.awk)
# requirez(globals.awk)
# requirez(commandLine.awk)
# requirez(columnNumbers.awk)

# Install the default flag values by handing them to inits().
function defaults() {
  inits(How,"-d , -D , -n 5 -M 0.000001 -p 5 ");
}

# Apply one command-line flag.
#   x   : flag letter (without the leading "-")
#   y   : the flag's argument, if it takes one
#   tmp : local scratch array
# A "-z a=b" flag pushes a onto X and b onto Y, so X and Y hold the query
# point as parallel lists. Unknown flags are reported via bad() and the
# program exits.
function set(x,y,   tmp) {
  if (x == "d") {return FS=y };
  if (x == "c") {copyleft(); exit};
  if (x == "D") {return OFS=y };
  if (x == "h") {usage(); exit};
  if (x == "i") {return IgnoreStr=y};
  if (x == "l") {return LogStr=y};
  if (x == "M") {return TheMin=y};
  if (x == "n") {return Want=y };
  if (x == "p") {return Patience=y };
  if (x == "z") {
    split(y,tmp,"=");
    push(tmp[1],X);
    return push(tmp[2],Y);
  }
  bad(x "? usage: " What " " How);
  exit;
}

BEGIN {
  globals();
  defaults();
  getOpts(How);
}

# Every record: remove anything from the comment marker to end of line and
# trim each field. Assigning to $I rebuilds $0 using OFS.
{
  gsub(Ccomment ".*$","");
  for(I=1;I<=NF;I++) $I=trim($I);
}

# Skip records that are blank after comment stripping.
/^[ \t]*$/ { next }

# Header record: recognised by removing the Cheader prefix from the first
# field. Records the column position of every name, resets the per-column
# range trackers, resolves the -i and -l column lists, and echoes the header.
sub("^" Cheader,"",$1) {
  for(I=1;I<=NF;I++) {
    Name[$I] = I;
    Max[I]   = NegInf;
    Min[I]   = Inf;
  }
  columnNumbers(IgnoreStr,Ignore,NF);
  columnNumbers(LogStr,   Logs,  NF);
  # The -z targets receive the same minimum/log treatment as the data cells;
  # that is done in distance(), once the Logs set built here is available.
  print Cheader $0;
  next;
}

# Data record: keep the raw line in Thing and the prepared numeric cells in
# Data. Ignored columns and cells equal to Cnothing are never stored, so
# "(row,col) in Data" is the test for "this cell has a usable value".
{
  Thing[++Rows]=$0
  for(Col=1;Col<=NF;Col++) {
    Value = $Col;
    if (Col in Ignore) continue;
    if (Value == Cnothing) continue;
    if (Value < TheMin) Value = TheMin
    if (Col in Logs) Value = log(Value)
    if (Value > Max[Col]) Max[Col] = Value;
    if (Value < Min[Col]) Min[Col] = Value;
    Data[Rows,Col]=Value
  }
}

# requirez(saya.awk)

# Emit "distance,row" for every row into a pipeline that sorts numerically on
# the first field, keeps the first Want+1 lines, and cuts the distance off.
# X[0] is passed as the number of -z entries -- presumably push() keeps the
# list size in element 0; confirm against stack.awk.
END {
  # -k1,1 replaces the obsolete "+0" key syntax, and "head -n" replaces
  # "head -NUM"; the selected rows are the same.
  Com = "sort -t" OFS " -n -k1,1 | head -n " (Want + 1) " | cut -d" OFS " -f 2-";
  for(Row in Thing) {
    print distance(X[0],Row),Thing[Row] | Com;
    imPatient();
  }
  close(Com)
}

# Distance from data row i to the query point held in X (names) and Y (values).
#   xmax : number of query entries to visit (X[1..xmax], Y[1..xmax])
#   i    : row index into Data
#   j, x, want, dist : locals
# Returns the square root of the summed squared, range-scaled differences.
# Query names missing from the header are reported through bad(). Cells that
# were not stored for this row (ignored column or missing value) contribute
# nothing to the sum.
function distance(xmax,i,   j,x,want,dist) {
  for(x=1;x<=xmax;x++) {
    if (!(X[x] in Name)) {
      bad(X[x] " not known ");
      continue;
    }
    j = Name[X[x]];
    # Membership test instead of reading Data[i,j]: a read would create the
    # element and make a missing cell look like the value 0.
    if (!((i,j) in Data)) continue;
    # Put the target on the same scale as the stored cell: force it numeric,
    # clamp it to the minimum, and log it when the column is a -l column.
    want = Y[x] + 0;
    if (want < TheMin + 0) want = TheMin + 0;
    if (j in Logs) want = log(want);
    # The small constant keeps the divisor non-zero for constant columns.
    dist += ((Data[i,j] - want)/(Max[j]-Min[j]+0.00001))^2;
  }
  return sqrt(dist);
}