(defun stdev (n sum sumSq) (sqrt (/ (- sumSq (/ (* sum sum) (if n n 1))) (if (<= n 1) 1 (- n 1))))) ;----------------------------------------------------------------------- (defun findVariances (variances) (let ((myVariances (make-list (length variances))) temp temp2) ;return (dotimes (x (length variances)) (dotimes (y (length (nth x variances))) (let (sum n sumSq) (setf temp2 (nth y (nth x variances))) (setf sum (second temp2) n (third temp2) sumSq (fourth temp2)) (setf temp (append temp (list (list (first temp2) (stdev n sum sumSq))))))) (setf (nth x myVariances) (append (nth x myVariances) temp)) (setf temp nil)) myVariances)) ;----------------------------------------------------------------------- ;FIND VARIANCES BY SPLITTING WITH ONE WITH LEAST VARIANCE, THEN ;CONTINUING INTO EACH SPLIT AND FINDING IF ANY SPLIT IN ThOSE ;YEILD A LOWER VARIANCE THAN THE UNSPLIT VARIANCE. (defun supDisc (tbl columnNum Columns &optional maxSize range (Var 536870911) variances) (let (;1st item= split(x,y), 2nd=sum, 3rd=totalitems(n),4th=sumSQ ;then after that is done, its: 1st=split, 2nd=variance ;need current splits (x, y) from x to y, starts with 0,length of Rows ;When it's finished it'll transform to one variance currentSplits temp BestVar start finish ) ;------------------------------ ;so the user doesn't have to enter too much information, it's calculated here (if (not maxSize) (setf maxSize (if (< 4 (length (table-rows tbl))) 4 (length (table-rows tbl))))) (if range (setf start (first range) finish (second range)) (setf start 0 finish (1- (length (table-rows tbl))))) ;------------------------------ ;find split (dotimes (x (- (length (table-rows tbl)) (* 2 maxSize))) ;clip off on both sides (let (var1 var2 temp2 (n 0) (sum 0) (sumSq 0)) (setf temp (+ x maxSize start)) ;now find variance of the current split, fun fun (if (member (list start temp) variances :key #'first :test #'equal) (setf var1 (second (nth (position (list start temp) variances :key #'first :test #'equal) variances))) (progn (dotimes (y (- temp start)) ;actual effort estimation value form tbl (setf temp2 (first (last (row-cells (nth (first (nth (+ y start) (nth columnNum Columns))) (table-rows tbl)))))) (incf n) (incf sum temp2) (incf sumSq (* temp2 temp2))) (setf var1 (stdev n sum sumSq)) (setf variances (append variances (list (list (list start x) var1)))))) (setf n 0 sum 0 sumSq 0) (if (member (list temp finish) variances :key #'first :test #'equal) (setf var2 (second (nth (position (list temp finish) variances :key #'first :test #'equal) variances))) (progn (dotimes (y (- finish temp)) ;actual effort estimation value form tbl (setf temp2 (first (last (row-cells (nth (first (nth (+ y temp) (nth columnNum Columns))) (table-rows tbl)))))) (incf n) (incf sum temp2) (incf sumSq (* temp2 temp2))) (setf var2 (stdev n sum sumSq)) (setf variances (append variances (list (list (list temp finish) var2)))))) ;now get the combined variance, and compare to the current best(least) variance ;also make sure to record the split (if (or (not (second bestVar)) (> (second bestVar) (+ var1 var2))) (setf bestVar (list temp (+ var1 var2)))) )) ;now compare the bestVar to the unsplit Var (if (> Var (second bestVar)) (progn ;(print "--------------------------------RECURSING----------------------------------") (push (supDisc tbl columnNum Columns maxSize (list start (1- (first bestVar))) (second bestVar)) currentSplits) (push (supDisc tbl columnNum Columns maxSize (list (first bestVar) finish) (second bestVar)) currentSplits)) (push (list (list start finish) Var) currentSplits)) currentSplits)) ;----------------------------------------------------------------------- (defun findDiscVar (tbl columnNums Columns) (let (;contains lists, 1st item = column #, second is list of: ;1st item= discrete value, 2nd=sum, 3rd=count(n),4th=sumSQ ;though when finished, 1st=discrete, 2nd=variance, 3rd=list of rows with 1st ;(variances (make-list (length Columns))) (variances (make-list (1- (length Columns)))) tempRow temp2 temp3 ) (dotimes (x (length (table-rows tbl))) (setf tempRow (row-cells (nth x (table-rows tbl)))) (dolist (y columnNums) (setf temp3 (first (last tempRow))) ;(print (first tempRow)) (if (member (nth y tempRow) (nth y variances) :key #'first) (progn (setf temp2 (position (nth y tempRow) (nth y variances) :key #'first)) (incf (second (nth temp2 (nth y variances))) temp3) ;sum (incf (third (nth temp2 (nth y variances)))) ;count (incf (fourth (nth temp2 (nth y variances))) ;sumSq (* temp3 temp3))) (push (list (nth y tempRow) temp3 1 (* temp3 temp3)) (nth y variances))))) (setf variances (findVariances variances)) (dotimes (x (length variances)) (setf (nth x variances) (first (sort (nth x variances) #'> :key #'second)))) ;record rows that have to do with those variances ;replace the discrete value with the rows that contain it, we don't need the actual value any more (dotimes (x (length (first Columns))) (dolist (y columnNums) (if (not (third (nth y variances))) (push nil (nth y variances))) (if (equal (second (nth x (nth y Columns))) (second (nth y variances))) (setf (first (nth y variances)) (append (first (nth y variances)) (list (first (nth x (nth y Columns))))))))) (dolist (x columnNums) (setf (nth x variances) (list (first (nth x variances)) (third (nth x variances))))) variances)) ;----------------------------------------------------------------------- (defun call_pp (input output) (readARFF input) (data "dummyData.lisp") (BAMBOO_pp)) (defun BAMBOO_pp (&optional (tbl (thetable)) (output "rr_dummyData.arff")) ;----------------------------------------- ;PREPROCESSOR: Row Removal ;----------------------------------------- (let (DiscrCols NumCols ;each item is a list of two, 1st= rows pertaining to that range, second = range Ranges temp temp2 finished ;last entry is the Klass, of course (Columns (make-list (length (table-cols tbl)))) ;max size of a split for the numeric splitting ;calculated here so it's not done over and over in it ;if I decide to make it recursive, probably will (maxSize (if (< 4 (length (table-rows tbl))) 4 (length (table-rows tbl)))) ) ;go through each attribute/column number ;and record which is numeric and which is symbolic (dotimes (x (1- (length (table-cols tbl)))) (if (sym-p (nth x (table-cols tbl))) (setf DiscrCols(append DiscrCols (list x))) (setf NumCols (append NumCols (list x))))) ;record the columns now, along with their orginial location (dotimes (x (length (table-rows tbl))) (let ((row (row-cells (nth x (table-rows tbl))))) (setf (first (last Columns)) (append (first (last Columns)) (last row))) (dotimes (y (1- (length row))) ;column = original #, column value ;can use original # to get Klass value (setf (nth y Columns) (append (nth y Columns) (list (list x (if (and (num-p (nth y (table-cols tbl))) (not (= (nth y row) 0))) (log (nth y row)) (nth y row))))))))) ;now sort the numeric columns from least to most (dolist (x NumCols) (setf (nth x Columns) (sort (nth x Columns) #'< :key #'second))) ;discrete (setf Ranges (findDiscVar tbl DiscrCols Columns)) ;numeric (dolist (x numCols) (let (temp temp2) (setf temp (first (sort (supDisc tbl x Columns) #'> :key #'second))) (dotimes (y (- (length (table-rows tbl)) (- (length (table-rows tbl)) 1 (second (first temp)))(first (first temp)))) (setf temp2 (append temp2 (list (+ y (first (first temp))))))) (setf (nth x Ranges) (list temp2 (second temp))))) ;now we can sort them because they already contain the row #'s, so we ;don't need to know what column they belong to now. (setf Ranges (sort Ranges #'> :key #'second)) ;the actual row removal, removes until rows are reduces to squar(n) ;if a range would remove to many rows, then it is removed and the next ;worst range tries removing not too many rows. (setf temp (sqrt (length (table-rows tbl)))) (setf temp2 (mapcar #'(lambda (x) (first x)) (first Columns))) (loop while (not finished) do (let (newrows) (dotimes (x (length temp2)) (if (not (member x (first (first Ranges)))) (push x newrows))) (setf Ranges (rest Ranges)) (if (= 0 (length Ranges)) (setf finished t) (if (> (length newrows) temp) (setf temp2 newrows)) ))) ;now to turn the row numbers into actual rows, and replace the former ones ;and output to an arff file? (setf temp nil) (dolist (x temp2) (push (nth x (table-rows tbl)) temp)) (setf (table-rows tbl) temp) ;now output the new table with its new rows to an output arff file (table2ARFF output) nil)) ;----------------------------------------------------------------------- (defun call_l (input_all input_loo output) (readARFF input_all) (data "dummyData.lisp") (BAMBOO_l 5 input_loo)) (defun BAMBOO_l (k testCaseFile &optional (tbl (thetable))) ;----------------------------------------- ;LEARNER: K means ;----------------------------------------- (let (nearestNeighbors rows testPoint) (readARFF testCaseFile) (data "dummyData.lisp") (setf testPoint (first (table-rows (thetable)))) (dotimes (x (length (table-rows tbl))) (setf rows (append rows (list (list x (eucliddist testPoint (nth x (table-rows tbl)) (table-cols tbl) tbl)))))) (setf rows (sort rows #'< :key #'second)) ;find the median 10 times, then at the end find the median of those medians. ;but once all the medians are found, they'll needed to be sorted by eucleadian distance. (dotimes (x 10) (let ((n 0) (sum 0) (sumSq 0) (Var 0) (NewVar 0) pointList (i 0) temp finished) (loop while (not finished) do (setf temp (first (last (row-cells (nth (first (nth i rows)) (table-rows tbl)))))) (if (< (random 100) 33) (progn (incf n) (incf sum temp) (incf sumSq (* temp temp)) (push (first (nth i rows)) pointList) (if (> n 1) (setf NewVar (stdev n sum sumSq))))) (if (or (>= (length pointList) k) (and (>= (length pointList) k) (< NewVar Var))) (setf finished t)) (setf Var NewVar) (incf i)) (push (nth (floor (/ (length pointList) 2)) pointList) nearestNeighbors))) (dotimes (x (length nearestNeighbors)) (setf (nth x nearestNeighbors) (nth (position (nth x nearestNeighbors) rows :key #'first) rows))) (setf nearestNeighbors (sort nearestNeighbors #'< :key #'second)) (setf nearestNeighbors (first (last (row-cells (nth (first (nth (floor (/ (length nearestNeighbors) 2)) nearestNeighbors)) (table-rows tbl)))))) nearestNeighbors)) ;----------------------------------------------------------------------- ;reads a given table and outputs it to an arff file (defun table2ARFF (&optional (outputFile "rr_dummyData.arff") (tbl (thetable))) ;now write to dummy lisp file dummyData.lisp, or whetever the output file is (with-open-file (out (make-pathname :name outputFile) :direction :output :if-exists :supersede) (format out "@relation ~a~%" (table-name tbl)) (dolist (x (table-arffAttr tbl)) (format out "@attribute ~a~%" x)) (format out "@data~%") (dolist (x (table-rows tbl)) (format out "~a" (first (row-cells x))) (dolist (y (rest (row-cells x))) (format out ",~a" y)) (format out "~%")))) ;reads a given arff file and outputs a lisp file where desired. (defun readARFF (filename &optional (outputFile "dummyData.lisp")) (let (line file splitfile (header "(deftable ") rows arffCols data (split #\Space)) (with-open-file (in (make-pathname :name filename) :direction :input) (setf line (string (read-line in nil 'eof))) (loop while (not (equal line "EOF")) do (setf file (append file (list line))) (setf line (string (read-line in nil 'eof))))) (dolist (x file) (let ((temp (breakString x split))) (if (equal (string-downcase (first temp)) "@data") (setf split #\,)) (setf splitfile (append splitfile (list temp))))) (setf splitfile (remove "" splitfile :key #'first)) ;(print splitfile) (dotimes (y (length splitfile)) (let ((spacing " ") (x (nth y splitfile)) row) (if (and (not (equal x '(""))) (not (eql (char (first (nth y splitfile)) 0) #\%))) (if data (let ((row "(!")) (dolist(z x) (setf row (concatenate 'string row " " z))) (setf rows (append rows (list (concatenate 'string row ")"))))) (progn (if (equal (string-downcase (first x)) "@relation") (setf header (concatenate 'string header (second x)))) (if (equal (string-downcase (first x)) "@attribute") (progn ;(print x) (setf arffCols (append arffCols (list (let (blah) (dolist (z (rest x)) (setf blah (concatenate 'string blah " " z))) blah)))) (if (equal (third x) "real") (setf spacing (concatenate 'string spacing "$"))) (if (not (equal (string-downcase(first (nth (1+ y) splitfile))) "@attribute")) (progn (setf spacing (concatenate 'string spacing "!")) (setf data t))) (setf header (concatenate 'string header spacing (second x))) (if data (progn (setf header (concatenate 'string header ")")) (incf y)))))))) )) ;now write to dummy lisp file dummyData.lisp, or whetever the output file is (with-open-file (out (make-pathname :name outputFile) :direction :output :if-exists :supersede) (format out "~a~%" header) (dolist (x arffCols) (format out "(arff \"~a\")~%" x)) (dolist (x rows) (format out "~a~%" x))) nil)) ;----------------------------------------------------------------------- ;splits a string up into a list by a given demoinator (defun breakString (str &optional (char #\Space)) (let (split spacPos (rest (string-trim (list #\Space #\Tab char) str)) whitespace) (if (setf whitespace (if (eql char #\Space) t)) (setf spacPos (position #\Tab rest))) (if (not (setf spacPos (if (not (and whitespace spacPos)) (position char rest) spacPos))) (setf split (list str))) (loop while spacPos do (setf split (append split (list (subseq rest 0 spacPos)))) (setf rest (string-trim (list #\Space #\Tab char) (subseq rest (1+ spacPos)))) (if whitespace (setf spacPos (position #\Tab rest))) (setf spacPos (if (not (and whitespace spacPos)) (position char rest) spacPos)) (if (not spacPos) (setf split (append split (list rest))))) split)) ;preprocessor (defun demoBAMBOO_pp () (readARFF "breastTumor.arff") (data "dummyData.lisp") (format t "~%NUM OF ROWS BEFORE: ~a" (length (therows))) (BAMBOO_pp) (format t "~%NUM OF ROWS AFTER: ~a" (length (therows))) ) ;learner (defun demoBAMBOO_l () (readARFF "rr_dummyData.arff") (data "dummyData.lisp") (BAMBOO_l 3 "breastTumor_loo.arff")) (defun memberp (item list) (if (member item list) t nil))