# Two-class digit experiment: compares k-nearest-neighbour averaging with
# least-squares regression for telling digit code 2 from digit code 3.
# Expects train.csv and test.csv (no header row, digit code in column 1,
# features in the remaining columns) in the working directory.

#' k-nearest-neighbour estimate for a single query point.
#'
#' @param x0 Query point: a numeric vector, one-row matrix or one-row data
#'   frame holding one value per column of `x`.
#' @param x Training features (matrix or data frame), one row per observation.
#' @param y Training responses, one per row of `x`.
#' @param k Number of nearest neighbours to average over.
#' @return The mean of `y` over the `k` rows of `x` that are closest to `x0`
#'   in squared Euclidean distance.
knn <- function(x0, x, y, k) {
  x <- as.matrix(x)
  # Flatten the query to a plain numeric vector. A one-row data frame must not
  # reach the arithmetic below: data-frame arithmetic does not broadcast a
  # single cell against a length-n column, so the per-row distances would be
  # lost.
  x0 <- as.numeric(unlist(x0, use.names = FALSE))
  stopifnot(length(x0) == ncol(x), k >= 1, k <= nrow(x))

  # t(x) is features-by-rows, so x0 recycles down each column, i.e. it is
  # subtracted feature-by-feature from every training row.
  dis <- colSums((t(x) - x0)^2)
  ind <- order(dis)[seq_len(k)]
  mean(y[ind])
}

#' Least-squares prediction via the normal equations.
#'
#' No intercept column is added; the fit uses the columns of `x` as given.
#'
#' @param x0 Query rows (matrix or data frame), one row per prediction, with
#'   the same columns as `x`.
#' @param x Training features (matrix or data frame).
#' @param y Training responses, one per row of `x`.
#' @return A matrix with one row per row of `x0` holding the fitted values.
bls <- function(x0, x, y) {
  x0 <- as.matrix(x0)
  x <- as.matrix(x)
  y <- as.matrix(y)
  # crossprod(x) is t(x) %*% x and crossprod(x, y) is t(x) %*% y.
  x0 %*% solve(crossprod(x), crossprod(x, y))
}

# Turn a real-valued score into a digit label: below 2.5 is 2, otherwise 3.
scoreToClass <- function(score) {
  ifelse(as.vector(score) < 2.5, 2, 3)
}

# Percentage of predictions that match the actual labels.
accuracyPercent <- function(predicted, actual) {
  sum(predicted == actual, na.rm = TRUE) * 100 / length(actual)
}

# k-NN class label for every row of `queries` (a numeric matrix).
knnClasses <- function(queries, x, y, k) {
  scores <- vapply(
    seq_len(nrow(queries)),
    function(i) knn(queries[i, ], x, y, k),
    numeric(1)
  )
  scoreToClass(scores)
}

# read the files
trainAll <- read.csv(file = "train.csv", header = FALSE, sep = ",")
testAll <- read.csv(file = "test.csv", header = FALSE, sep = ",")

# choose instances with code 2 and 3 only (all 2s first, then all 3s)
train <- rbind(
  trainAll[which(trainAll[, 1] == 2), ],
  trainAll[which(trainAll[, 1] == 3), ]
)
test <- rbind(
  testAll[which(testAll[, 1] == 2), ],
  testAll[which(testAll[, 1] == 3), ]
)

# define possible k values we want to try
kValues <- c(1, 3, 5, 7, 15)

# actual classes and feature matrices (column 1 is the class code)
actualsTrain <- train[, 1]
actualsTest <- test[, 1]
trainX <- as.matrix(train[, -1, drop = FALSE])
testX <- as.matrix(test[, -1, drop = FALSE])

# The regression fit does not depend on k, so it is computed once, for all
# rows at once, instead of being re-solved for every row of every k.
prdRegTrain <- scoreToClass(bls(trainX, trainX, actualsTrain))
prdRegTest <- scoreToClass(bls(testX, trainX, actualsTrain))
regTrainAccuracy <- accuracyPercent(prdRegTrain, actualsTrain)
regTestAccuracy <- accuracyPercent(prdRegTest, actualsTest)

# calculate and print out the accuracy of knn and linear regression for each
# k value
for (k in kValues) {
  # training accuracy (each training point counts among its own neighbours)
  prdKnnTrain <- knnClasses(trainX, trainX, actualsTrain, k)
  knnTrainAccuracy <- accuracyPercent(prdKnnTrain, actualsTrain)
  cat(
    "k = ", k, "; k-NN Train Accuracy: ", knnTrainAccuracy,
    " Percent; Regression Train Acccuracy: ", regTrainAccuracy,
    " Percent\n"
  )

  # testing accuracy; neighbour means are thresholded to a class label just
  # like the training predictions, otherwise k > 1 could never match a label
  prdKnnTest <- knnClasses(testX, trainX, actualsTrain, k)
  knnTestAccuracy <- accuracyPercent(prdKnnTest, actualsTest)
  cat(
    "k = ", k, "; k-NN Test Accuracy: ", knnTestAccuracy,
    "Percent; Regression Test Acccuracy: ", regTestAccuracy,
    "Percent\n"
  )
}