# Greedy regression-tree demo on the `airquality` data set: the first column
# is the response, the remaining columns are the candidate predictors. The
# script prints the split chosen at every node down to a fixed depth.

# load dataset
data(airquality)

# define the dependent and independent variables
y <- airquality[, 1]
x <- airquality[, -1]

# get variable names
myNames <- names(airquality)

# Get rid of the instances with NA values. A logical mask is used instead of
# `-which(...)`: when nothing is missing, `-which(...)` is `-integer(0)`, which
# selects *no* rows and would silently throw the whole data set away.
keepRow <- complete.cases(y, x)
yClean <- y[keepRow]
xClean <- x[keepRow, , drop = FALSE]

#' Find the best single binary split of a response by squared error.
#'
#' Every observed value of every considered predictor is tried as a threshold
#' `t`; observations with `x <= t` go left, those with `x > t` go right, and
#' the split is scored by the sum of the within-part squared deviations from
#' the part means. The first threshold reaching the minimum wins (ties are
#' not replaced).
#'
#' @param myXAll Data frame or matrix of predictors, one row per observation.
#' @param myY Numeric response, one element per row of `myXAll`.
#' @param maxFeatures Number of leading columns of `myXAll` to consider.
#'   Defaults to 3, the value that used to be hard-coded; it is capped at
#'   `ncol(myXAll)` so narrower inputs no longer fail.
#' @return A 1 x 2 matrix with columns `feature` (column index into
#'   `myXAll`) and `division` (threshold). Both are `NA` when there is no
#'   candidate threshold at all (e.g. empty input).
findMyDivision <- function(myXAll, myY, maxFeatures = 3) {
  # Sum of squared deviations from the mean; 0 for an empty part.
  sse <- function(v) sum((v - mean(v))^2)

  error <- Inf
  feature <- NA_integer_
  division <- NA_real_

  for (featureCounter in seq_len(min(maxFeatures, ncol(myXAll)))) {
    myX <- myXAll[, featureCounter]
    # Each distinct value only needs to be scored once (a repeated value
    # yields the identical partition), and NA cannot act as a threshold.
    # `unique()` keeps first-occurrence order, so tie-breaking is unchanged.
    candidates <- unique(myX[!is.na(myX)])
    for (tmpdivision in candidates) {
      goesLeft <- myX <= tmpdivision
      # `which()` drops NA comparisons, so rows with a missing predictor
      # value are left out of both parts.
      part1 <- myY[which(goesLeft)]
      part2 <- myY[which(!goesLeft)]
      tmperror <- sse(part1) + sse(part2)
      if (tmperror < error) {
        error <- tmperror
        division <- tmpdivision
        feature <- featureCounter
      }
    }
  }

  cbind(feature, division)
}

#' Recursively split the data and print the chosen split of every node.
#'
#' Nodes are visited in pre-order (node, left subtree, right subtree). For
#' each node one line of the form
#' `Level:  <level> , Attribute:  <name> , Division:  <threshold>` is printed.
#'
#' @param myY Numeric response for the observations in this node.
#' @param myX Predictors for the same observations (data frame or matrix).
#'   The printed attribute name is taken from `colnames(myX)`; the column
#'   index is printed when there are no column names.
#' @param myLevel Depth of this node; the root call passes 0.
#' @param maxLevel Deepest level at which a split is still computed and
#'   printed. Defaults to 3, the depth that used to be hard-coded.
#' @return `NULL`, invisibly. The function is called for its printed output.
myTree <- function(myY, myX, myLevel, maxLevel = 3) {
  # A node with fewer than two observations cannot be split; stop here
  # instead of printing a meaningless (or NA) division for it.
  if (length(myY) < 2L) {
    return(invisible(NULL))
  }

  tmpValues <- findMyDivision(myX, myY)
  myAttribute <- tmpValues[1]
  myDivision <- tmpValues[2]
  if (is.na(myAttribute)) {
    # No usable threshold (e.g. every predictor value is missing).
    return(invisible(NULL))
  }

  featureNames <- colnames(myX)
  attributeLabel <- if (is.null(featureNames)) {
    myAttribute
  } else {
    featureNames[myAttribute]
  }
  cat(
    "Level: ", myLevel, ", Attribute: ", attributeLabel,
    ", Division: ", myDivision, "\n"
  )

  # `<` rather than `==` so a call that starts below the limit always
  # terminates, whatever the starting level.
  if (myLevel < maxLevel) {
    leftRows <- which(myX[, myAttribute] <= myDivision)
    rightRows <- which(myX[, myAttribute] > myDivision)
    myTree(myY[leftRows], myX[leftRows, , drop = FALSE], myLevel + 1, maxLevel)
    myTree(
      myY[rightRows], myX[rightRows, , drop = FALSE], myLevel + 1, maxLevel
    )
  }

  invisible(NULL)
}

myLevel <- 0
myTree(yClean, xClean, myLevel)