# define the dependent and independent variables y = airquality[,1] x = airquality[,-1] # firstly get rid of the instances with NA values yClean = y[-which(is.na(y) == TRUE)] xClean = x[-which(is.na(y) == TRUE),] yClean = yClean[-which(is.na(xClean[,1]) == TRUE)] xClean = xClean[-which(is.na(xClean[,1]) == TRUE),] # we can have 6 orderings of 3 attributes: 3*2*1 attr1 = xClean[,1] attr2 = xClean[,2] attr3 = xClean[,3] myLength = length(xClean[,1]) # first find best divisions for each attribute separately division1 = attr1[1] part1 = yClean[which(attr1<=division1)] part2 = yClean[which(attr1>division1)] error1 = sum((part1-mean(part1))^2) + sum((part2-mean(part2))^2) index1 = 1 for(i in 2:myLength){ tmpdivision1 = attr1[i] part1 = yClean[which(attr1<=tmpdivision1)] part2 = yClean[which(attr1>tmpdivision1)] tmperror1 = sum((part1-mean(part1))^2) + sum((part2-mean(part2))^2) if(tmperror1 < error1){ error1 = tmperror1 division1 = tmpdivision1 index1 = i } } # now for the second attribute division2 = attr2[1] part1 = yClean[which(attr2<=division2)] part2 = yClean[which(attr2>division2)] error2 = sum((part1-mean(part1))^2) + sum((part2-mean(part2))^2) index2 = 1 for(i in 2:myLength){ tmpdivision2 = attr2[i] part1 = yClean[which(attr2<=tmpdivision2)] part2 = yClean[which(attr2>tmpdivision2)] tmperror2 = sum((part1-mean(part1))^2) + sum((part2-mean(part2))^2) if(tmperror2 < error2){ error2 = tmperror2 division2 = tmpdivision2 index2 = i } } # and now for the third attribute # now for the second attribute division3 = attr3[1] part1 = yClean[which(attr3<=division3)] part2 = yClean[which(attr3>division3)] error3 = sum((part1-mean(part1))^2) + sum((part2-mean(part2))^2) index3 = 1 for(i in 2:myLength){ tmpdivision3 = attr3[i] part1 = yClean[which(attr3<=tmpdivision3)] part2 = yClean[which(attr3>tmpdivision3)] tmperror3 = sum((part1-mean(part1))^2) + sum((part2-mean(part2))^2) if(tmperror3 < error3){ error3 = tmperror3 division3 = tmpdivision3 index3 = i } } findMyDivision <- function(myX, myY){ division = myX[1] part1 = myY[which(myX<=division)] part2 = myY[which(myX>division)] error = sum((part1-mean(part1))^2) + sum((part2-mean(part2))^2) for(i in 2:length(myY)){ tmpdivision = myX[i] part1 = myY[which(myX<=tmpdivision)] part2 = myY[which(myX>tmpdivision)] tmperror = sum((part1-mean(part1))^2) + sum((part2-mean(part2))^2) if(tmperror < error){ error = tmperror division = tmpdivision } } return(division) } myTree <- function(myY, myX, myAttributes){ if(length(myAttributes)==1){ myAttribute = myAttributes[1] myDivision = findMyDivision(myX[,myAttribute], myY) myLeft = myY[which(myX[,myAttribute]<=myDivision)] myRight = myY[which(myX[,myAttribute]>myDivision)] # get rid of NA's if there are any #myLeft = myLeft[-which(is.na(myLeft) == TRUE)] #myRight = myRight[-which(is.na(myRight) == TRUE)] sprintf("Level %d, Attribute %d, Division is %.2f\n", (3-length(myAttributes)), myAttribute, myDivision) return(sum((myLeft - mean(myLeft))^2) + sum((myRight + mean(myRight))^2)) } else{ myAttribute = myAttributes[1] myDivision = findMyDivision(myX[,myAttribute], myY) myLeftY = myY[which(myX[,myAttribute]<=myDivision)] myRightY = myY[which(myX[,myAttribute]>myDivision)] myLeftX = myX[which(myX[,myAttribute]<=myDivision),] myRightX = myX[which(myX[,myAttribute]>myDivision),] sprintf("Level %d, Attribute %d, Division is %.2f\n", (3-length(myAttributes)), myAttribute, myDivision) return(myTree(myLeftY, myLeftX, myAttributes[-1]) + myTree(myRightY, myRightX, myAttributes[-1])) } } err1 = myTree(yClean, xClean, cbind(1,2,3)) err2 = myTree(yClean, xClean, cbind(1,3,2)) err3 = myTree(yClean, xClean, cbind(2,1,3)) err4 = myTree(yClean, xClean, cbind(2,3,1)) err5 = myTree(yClean, xClean, cbind(3,1,2)) err6 = myTree(yClean, xClean, cbind(3,2,1)) # now start building the tree # do it for all orderings of 3 levels errorRate = Inf; counter = 0; order1 = 0; order2 = 0; order3 = 0; for(i in 1:3){ if(i == 1){ part1 = yClean[which(attr1<=division1)]; x1 = xClean[which(attr1<=division1)] part2 = yClean[which(attr1>division1)]; x2 = xClean[which(attr1>division1)]; } else if(i == 2){ part1 = yClean[which(attr2<=division2)]; x1 = yClean[which(attr2<=division2)]; part2 = yClean[which(attr2>division2)]; x2 = yClean[which(attr2<=division2)] } else{ part1 = yClean[which(attr3<=division3)]; x1 = xClean[which(attr3<=division3)] part2 = yClean[which(attr3>division3)]; x2 = xClean[which(attr3>division3)]; } for(j in 1:3){ if(i != j){ if(i == 1){ part11 = part1[which(attr1<=division1)] part2 = yClean[which(attr1>division1)] } else if(i == 2){ part1 = yClean[which(attr2<=division2)] part2 = yClean[which(attr2>division2)] } else{ part1 = yClean[which(attr3<=division3)] part2 = yClean[which(attr3>division3)] } for(k in 1:3){ if((k != j)&&(k != i)){ }} } } }