#!/usr/bin/gawk -f
# Echo an ARFF-style file (@attribute / @data header followed by
# comma-separated rows) unchanged, then append extra copies of a
# minority-class row when the classes are badly imbalanced.
#
# The class label is read from the column whose index equals the number of
# @attribute declarations seen in the header, i.e. the last declared column.
#
# Variables:
#   Ratio  - imbalance threshold (majority count / minority count); rows are
#            appended only when the ratio exceeds it. Defaults to 30 and may
#            be overridden from the command line with -v Ratio=N.
#   Attr   - number of @attribute lines seen in the header.
#   Start  - set to 1 once the @data line has been read.
#   Seen   - Seen[label] = number of data rows carrying that class label.
#   LastRow- LastRow[label] = most recent data row carrying that label.
BEGIN {
    FS = ","
    if (Ratio == "")
        Ratio = 30
}

# Every input line is passed through to the output untouched.
{
    print $0
}

# Header section: count attribute declarations and detect the start of the
# data. Keywords are matched case-insensitively and only at the start of
# the line so that comments or values mentioning them are not miscounted.
!Start {
    Header = tolower($0)
    if (Header ~ /^[ \t]*@attribute/)
        Attr++
    else if (Header ~ /^[ \t]*@data/)
        Start = 1
    next
}

# Data section: tally the class label of each row and remember the latest
# row of every class. Lines starting with % (comments) and lines without a
# field separator are ignored.
NF > 1 && $0 !~ /^[ \t]*%/ {
    # Without any @attribute declaration fall back to the last field
    # instead of $0, which would make every row its own class.
    Col = (Attr > 0) ? Attr : NF
    Seen[$Col]++
    LastRow[$Col] = $0
}

END {
    # Counts are always >= 1, so 0 doubles as "not set yet".
    MAX = 0
    MIN = 0
    Minority = ""
    for (Label in Seen) {
        if (MAX == 0 || MAX < Seen[Label])
            MAX = Seen[Label]
        # Note down the attribute value (label), not just the instance count.
        if (MIN == 0 || MIN >= Seen[Label]) {
            MIN = Seen[Label]
            Minority = Label
        }
    }

    # MIN == 0 means no data rows were seen; nothing to oversample.
    if (MIN > 0 && MAX / MIN > Ratio) {
        # Append MAX-MIN copies of a row that really belongs to the
        # minority class, bringing its count up to the majority count.
        for (i = 0; i < (MAX - MIN); i++)
            print LastRow[Minority]
    }
}

# Plan:
#   1. Filter the data.
#   2. Work out the unique classes.
#   3. Oversample the minority class.
#   4. Undersample the majority class.
# Start with oversampling, then add undersampling.