#!/usr/bin/gawk -f
#
# Oversampling filter for comma-separated, ARFF-style input.
#
# Every input line is echoed unchanged.  Rows that follow the @data marker
# and contain more than one field are counted per class, where the class is
# the value of column number Attr (the count of @attribute declarations,
# i.e. the last declared column).  At end of input, if the largest class is
# more than RATIO times the size of the smallest, the most recently seen row
# of the smallest class is appended (MAX - MIN) times so both classes end up
# with the same number of rows.
#
# Usage:  gawk -f <this-script> [-v RATIO=30] input.arff > balanced.arff
#
# Notes from the original author / remaining work:
#   - first filter the data
#   - try to figure out the unique classes
#   - oversample the minority class
#   - undersample the majority class (not implemented: no rows are removed)
#   - start with oversampling, then add undersampling

BEGIN {
    FS = ","
    # Imbalance threshold that triggers oversampling.  Defaults to the
    # historical value of 30; override with `-v RATIO=<n>`.
    if (RATIO == "")
        RATIO = 30
}

# Each attribute declaration adds one column, so after the header Attr is
# the index of the last column.  ARFF keywords are case-insensitive.
tolower($0) ~ /@attribute/ { Attr++ }

# Everything after this marker is treated as data.
tolower($0) ~ /@data/ { Start = 1 }

# Data row: bump the per-class counter and remember the row, so that a
# genuine member of the minority class is available for duplication later.
Start && NF > 1 {
    Seen[$Attr]++
    LastRow[$Attr] = $0
}

# Pass every line (header and data) through untouched.
{ print $0 }

END {
    # Find the largest and smallest classes once, from the final counts.
    # max/min hold the class labels, MAX/MIN the corresponding row counts.
    MAX = 0
    MIN = 0
    first = 1
    for (cls in Seen) {
        if (first || Seen[cls] >= MAX) {
            MAX = Seen[cls]
            max = cls
        }
        if (first || Seen[cls] <= MIN) {
            MIN = Seen[cls]
            min = cls
        }
        first = 0
    }

    # MIN > 0 guards the division when no data rows were seen at all.
    if (MIN > 0 && MAX / MIN > RATIO) {
        for (i = 0; i < MAX - MIN; i++)
            print LastRow[min]
    }
}