;;;;compiled under: ;;;;slime 1:20070927-3 ;;;;SBCL 1:1.0.11.0-1 ;;;;Peter Santiago, June 2nd, 2008 ;;;;Program note: the input data array is assumed to be AxB, where each row represents the B dimensions of one point and there are A points (load "utests.lisp") ;;;;;;;;;;;;;;;TEST FUNCTIONS;;;;;;;;;;;;;;;;;;; (deftest test-all() "Combines the main and ancillary tests." (combine-results (test-main) (test-ancillary))) (deftest test-main() "Tests the main functions of project1:fastmap and k-means." (combine-results (test-fastmap) (test-k-means))) (deftest test-ancillary() "Tests the ancillary functions necessary for the completion of fastmap and k-means." (combine-results (test-quicksort) (test-set-distance-matrix) (test-set-distance-matrix2) (test-choose-distance-objects))) (deftest test-fastmap() "Note:the values are entirely different depending on which point is picked to be a and which is b" (check (let ((x (fastmap 1 (make-array '(4 2) :initial-contents '((0 0) (5 3) (10 24) (60 70)))))) ;;fastmap chooses o.a to be 3, and o.b to be 0 (and (between 92 (aref x 0 0) 93) (between 86 (aref x 1 0) 87) (between 67 (aref x 2 0) 68) (between 0 (aref x 3 0) 0))))) (deftest test-k-means() (check (let ((x (k-means 3 (make-array '(4 2) :initial-contents '((0 0) (1 1) (500 500) (600 600)))))) (and (= (aref x 0 2) (aref x 1 2)) (= (aref x 2 2) (aref x 3 2)))))) (deftest test-choose-distance-objects() (check (let ((x (choose-distance-objects (set-distance-matrix (make-array '(5 3) :initial-contents '((1 2 3) (4 5 6) (7 8 9) (-500 -500 -500) (500 500 500))))))) (and (or (= (first x) 3) (= (first x) 4)) (or (= (second x) 3) (= (second x) 4)))))) (deftest test-set-distance-matrix2() (check (let ((x (set-distance-matrix2 (make-array '(2 2) :initial-contents '((0 0) (0 1))) (make-array '(2 2) :initial-contents '((1 0) (1 1))))) (z (make-array '(2 2) :initial-contents '((1 1.4142135) (1.4142135 1))))) (equalp x z)))) (deftest test-set-distance-matrix() (check (let ((x (set-distance-matrix (make-array '(4 2) :initial-contents '((0 0) (0 1) (1 0) (1 1))))) (z (make-array '(4 4) :initial-contents '((0 1 1 1.4142135) (1 0 1.4142135 1) (1 1.4142135 0 1) (1.4142135 1 1 0))))) (equalp x z)))) (deftest test-quicksort() (check (let ((x (quicksort-array (make-array '(5 2) :initial-contents '((1 5) (2 4) (3 3) (4 2) (5 1))))) (veracity t)) (dotimes (i (- (array-dimension x 0) 1) veracity) (cond ((< (aref x i (- (array-dimension x 1) 1)) (aref x (+ i 1) (- (array-dimension x 1) 1))) (setf veracity nil))))))) ;;;;;;;;;;;;;;;;;;;;END TEST FUNCTIONS;;;;;;;;;;;;;;;;;;;;;;;;;; (defun fastk-main (inputfile outputfile fastmap-dimensions-requested clusters-sought) "takes in the parameters and spits out gnuplot-able files" (let ((fastmatrix 0)) (setf fastmatrix (fastmap fastmap-dimensions-requested (make-usable (load-item inputfile)))) (print (stress fastmatrix (load-item inputfile))) (make-plottable-initial outputfile fastmap-dimensions-requested clusters-sought) (make-plottable fastmatrix outputfile) (make-plottable-clusters (quicksort-array (k-means clusters-sought fastmatrix)) outputfile))) (defun quicksort-array (data) "quicksort--array performs a quicksort on the last element in each row, such that the matrix is eventually sorted by the class characteristic." (let ((pivotvector (make-array (array-dimension data 1) :initial-element nil)) (lesser (make-array (list (array-dimension data 0) (array-dimension data 1)) :initial-element nil)) (greater (make-array (list (array-dimension data 0) (array-dimension data 1)) :initial-element nil)) (rows.lesser 0) (rows.greater 0) (x (array-dimension data 0)) (y (array-dimension data 1))) (if (<= x 1) (return-from quicksort-array data)) ;(print x) ;;set pivot (dotimes (j y) (setf (aref pivotvector j) (aref data (- x 1) j))) (setf data (adjust-array data (list (- x 1) y))) (setf x (array-dimension data 0)) ;;iteration! (dotimes (i x) (cond ((<= (aref data i (- y 1)) (aref pivotvector (- y 1))) (dotimes (j y) (setf (aref lesser rows.lesser j) (aref data i j))) (setf rows.lesser (+ rows.lesser 1))) ((> (aref data i (- y 1)) (aref pivotvector (- y 1))) (dotimes (j y) (setf (aref greater rows.greater j) (aref data i j))) (setf rows.greater (+ rows.greater 1))))) ;;adjust matrices (setf lesser (adjust-array lesser (list rows.lesser y))) (setf greater (adjust-array greater (list rows.greater y))) ;;recursion (array-concat (quicksort-array greater) pivotvector (quicksort-array lesser)))) (defun array-concat (a1 vector a3) "array-append: returns a new array with all elements of a1 vector a3 in it. Note: the row size must be the same for all arrays" (let ((arr (make-array (list (+ (array-dimension a1 0) 1 (array-dimension a3 0)) (array-dimension a1 1)) :initial-element nil)) (currow -1)) (dotimes (i (array-dimension a1 0)) (setf currow (+ currow 1)) (dotimes (j (array-dimension a1 1)) (setf (aref arr currow j) (aref a1 i j)))) (setf currow (+ currow 1)) (dotimes (j (array-dimension vector 0)) (setf (aref arr currow j) (aref vector j))) (dotimes (i (array-dimension a3 0)) (setf currow (+ currow 1)) (dotimes (j (array-dimension a3 1)) (setf (aref arr currow j) (aref a3 i j)))) arr)) (defun make-usable (data) "make-usable: runs quicksort & cuts off the unneccessary class column" (adjust-array (quicksort-array data) (list (array-dimension data 0) (- (array-dimension data 1) 1)))) (defun set-distance-matrix(datarray) "set-distance-matrix: Creates and returns an array representing the distances between all objects" (let ((distancearray (make-array (list (array-dimension datarray 0) (array-dimension datarray 0)) :initial-element 0)) (sum 0)) (dotimes (i (array-dimension datarray 0) distancearray) (dotimes (j (array-dimension datarray 0)) (setf sum 0) (dotimes (k (array-dimension datarray 1)) (setf sum (+ sum (expt (- (aref datarray i k) (aref datarray j k)) 2)))) (setf (aref distancearray i j) (sqrt sum)))))) ;end set-distance-matrix (defun set-distance-matrix2 (data1 data2) "set-distance-matrix2: Creates and returns an array representing the distances between objects of two sets" (let ((distancearray (make-array (list (array-dimension data1 0) (array-dimension data2 0)) :initial-element 0)) (sum 0)) (dotimes (i (array-dimension distancearray 0) distancearray) (dotimes (j (array-dimension distancearray 1)) (setf sum 0) (dotimes (k (array-dimension data1 1)) (setf sum (+ sum (expt (- (aref data1 i k) (aref data2 j k)) 2)))) (setf (aref distancearray i j) (sqrt sum)))))) ;end set-distance-matrix2 (defun reduce-dist(oldist xmatrix column) "reduce-dist:reduces the distancematrix using another magic equation" (let ((reducedist (make-array (list (array-dimension oldist 0) (array-dimension oldist 1)) :initial-element 0))) (dotimes (i (array-dimension oldist 0) reducedist) (dotimes (j (array-dimension oldist 1)) (setf (aref reducedist i j) (sqrt (- (expt (aref oldist i j) 2) (expt (- (aref xmatrix i column) (aref xmatrix j column)) 2)))) (if (complexp (aref reducedist i j)) (setf (aref reducedist i j) 0)))))) ;end reduce-dist (defun dist(distmatrix a) "returns the number of the object furthest from a, according to the distmatrix" (let ((b 0) (temp 0)) (dotimes (i (array-dimension distmatrix 0) b) (cond ((> (aref distmatrix a i) temp) (setf temp (aref distmatrix a i)) (setf b i)))))) ;end dist (defun choose-distance-objects (dist) "choose-distant-objects: chooses the two longest away object thingies through ancient magic" (let ((o.b 0) (o.a 0) (vlist 0)) (dotimes (i 5) ;5 is the number of iterations suggested by Faloutsos & Lin (setf o.a (dist dist o.b)) (setf o.b (dist dist o.a))) (setf vlist (list o.a o.b)))) ;end choose-distant-objects (defun fastmap (k datamatrix &optional (distancematrix (set-distance-matrix datamatrix)) (x (make-array (list (array-dimension datamatrix 0) k) :initial-element nil)) (pa (make-array (list 2 k) :initial-element nil)) (cnum -1)) ;k is the desired number of dimensions, datamatrix if the data; all the rest are optional and generally computed by fastmap for subsequent passes "fastmap: does important things; generally awesome." (let ((templist 0) ;for storing the output of choose-distance-objects (dprimematrix 0) ;for storing the reduced distancematrix (o.a 0) ;stores object a (o.b 0) ;stores object b (x.i 0)) ;stores the calculated value of x.i (a bit extraneous, but I think it helps with readability) ;;1 base case conditional (if (<= k 0) (return-from fastmap x)) ;returns the new coordinates of objects (setf cnum (+ cnum 1)) ;This is here because that's how faloutsos and lin organized their algorithm ;;2 choose pivot objects (setf templist (choose-distance-objects distancematrix)) (setf o.a (first templist)) (setf o.b (second templist)) ;;3 record the ids of the pivot objects (setf (aref pa 0 cnum) o.a) (setf (aref pa 1 cnum) o.b) ;;4 distance something (if (= (aref distancematrix o.a o.b) 0) (dotimes (i (array-dimension x 0)) (setf (aref x i cnum) 0))) ;;5 project the objects on the line(o.a,o.b) (dotimes (i (array-dimension datamatrix 0)) (setf x.i (/ (- (+ (expt (aref distancematrix o.a i) 2) (expt (aref distancematrix o.a o.b) 2)) (expt (aref distancematrix o.b i) 2)) (* 2 (aref distancematrix o.a o.b)))) (setf (aref x i cnum) x.i)) ;;6 consider the projections of the objects on a hyper-plaine perpendicular to the line (o.a,o.b); find new distance function (setf dprimematrix (reduce-dist distancematrix x cnum)) (fastmap (- k 1) datamatrix dprimematrix x pa cnum))) ;end fastmap (defun stress (data1 data2) "stress: checks the difference between two nxn matrices. The primary purpose of this function is to ensure distance between the original and 'fastmapped' objects are relatively maintained" (let ((dist1 (set-distance-matrix data1)) (dist2 (set-distance-matrix data2)) (computed (make-array (list (array-dimension data1 0) (array-dimension data1 1)) :initial-element nil))) (dotimes (i (array-dimension computed 0) computed) (dotimes (j (array-dimension computed 1)) (setf (aref computed i j) (- (aref dist1 i j) (aref dist2 i j))))))) ;end stress (that seems unlikely) (defun k-means (k data) "k-means: performs a k-means clustering." (let ((memberof (make-array (array-dimension data 0) :initial-element nil)) ;holds current cluster membership (oldmemberof (make-array (array-dimension data 0) :initial-element nil)) ;holds old cluster membership (num.in.cluster (make-array k :initial-element 0)) ;holds number of objects per cluster, to facilitate centroid calculation (centroids (make-array (list k (array-dimension data 1)) :initial-element 0)) ;holds centroid data (distances (make-array (list (array-dimension data 0) k) :initial-element nil)) ;holds distance matrix (tempmin nil) ;holds temporary minimum value. Used when recalculating cluster membership (finaldata (adjust-array data (list (array-dimension data 0) (+ (array-dimension data 1) 1))))) ;final thingie to be returned ;;initial-partition (dotimes (i (array-dimension data 0)) (setf (aref memberof i) (mod i k)) (setf (aref num.in.cluster (mod i k)) (+ (aref num.in.cluster (mod i k)) 1))) ;;initial-centroids (dotimes (i (array-dimension data 0)) (dotimes (j (array-dimension data 1)) (setf (aref centroids (aref memberof i) j) (+ (/ (aref data i j) (aref num.in.cluster (aref memberof i))) (aref centroids (aref memberof i) j))))) ;;iteration: run at a max # of points. Exit once membership remains constant (dotimes (iteration (array-dimension data 0) memberof) ;(format t "Iteration number: ~a~%" iteration) (if (equalp oldmemberof memberof) (return memberof)) ;;set the oldmemberof (dotimes (i (array-dimension memberof 0)) (setf (aref oldmemberof i) (aref memberof i))) ;;calculate distances, determine nearest centroid & membership (setf distances (set-distance-matrix2 data centroids)) (dotimes (i (array-dimension distances 0)) (setf tempmin (aref distances i 0)) (dotimes (j (array-dimension distances 1)) (cond ((< (aref distances i j) tempmin) (setf tempmin (aref distances i j)) (setf (aref memberof i) j))))) ;;calculate number of items per cluster (setf num.in.cluster (make-array k :initial-element 0)) (dotimes (i (array-dimension memberof 0)) (setf (aref num.in.cluster (aref memberof i)) (+ (aref num.in.cluster (aref memberof i)) 1))) ;;calculate mean (setf centroids (make-array (list k (array-dimension data 1)) :initial-element 0)) (dotimes (i (array-dimension data 0)) (dotimes (j (array-dimension data 1)) (setf (aref centroids (aref memberof i) j) (+ (/ (aref data i j) (aref num.in.cluster (aref memberof i))) (aref centroids (aref memberof i) j)))))) (dotimes (i (array-dimension data 0)) (setf (aref finaldata i (- (array-dimension finaldata 1) 1)) (aref memberof i))) finaldata)) ;end k-means (defun save-item (filename item) "save-item: allows the variable item to be stored in the file at filename as a lisp object" (with-open-file (out filename :direction :output :if-exists :supersede) (with-standard-io-syntax (print item out)))) ;end save-item (defun load-item (filename) "load-item: allows the file at filename to be parsed and stored" (let ((x 0)) (with-open-file (in filename) (with-standard-io-syntax (setf x (read in)) x)))) ;end load-item (defun make-plottable-initial (filename dimensions clusters) (let ((command 0)) (cond ((= dimensions 2) (setf command "plot")) ((= dimensions 3) (setf command "splot"))) (with-open-file (out filename :direction :output :if-exists :supersede) (with-standard-io-syntax (format out "###COLOR BY VALUE###~%# ~a '~a' index 0, '~a' index 1, '~a' index 2~%" command filename filename filename) (format out "###COLOR BY CLUSTER###~%# ~a '~a' index 3" command filename) (dotimes (i (- clusters 1)) (format out ", '~a' index ~a" filename (+ i 4))) (format out "~%###COLOR BY VALUE, LINES BY CLUSTER###~%# ~a '~a' index 0, '~a' index 1,'~a' index 2, '~a' index 3:~a with lines~%" command filename filename filename filename (+ clusters 2)))))) ;end make-plottable-initial (defun make-plottable-clusters (matrix filename) "make-plottable-clusters: formats the given sorted data matrix and throws it into the specified file so that is is easily readable by gnuplot. Splits the data into as many classes as in the matrix." (let ((temp (aref matrix 0 (- (array-dimension matrix 1) 1))) (indexcount 4)) (with-open-file (out filename :direction :output :if-exists :append) (with-standard-io-syntax (format out "~%~%###cluster indices###~%###index 3###~%") (dotimes (i (array-dimension matrix 0)) (cond ((/= temp (aref matrix i (- (array-dimension matrix 1) 1))) (setf temp (aref matrix i (- (array-dimension matrix 1) 1))) (format out "~%~%###index ~a###~%" indexcount) (setf indexcount (+ indexcount 1)))) (dotimes (j (- (array-dimension matrix 1) 1)) (format out "~a " (aref matrix i j))) (format out "~%")))) 'done)) ;end make-plottable-clusters (defun make-plottable (matrix filename) "make-plottable: formats the given data matrix and throws it into the specified file so that it is easily readable by gnuplot. Splits the data into three classes: top 10%, middle, bottom 10%" (let ((partition 0)) (setf partition (ceiling (array-dimension matrix 0) 10)) (with-open-file (out filename :direction :output :if-exists :append) (with-standard-io-syntax (format out "###value indices###~%") (format out "###index 0###~%") (dotimes (i partition) (dotimes (j (array-dimension matrix 1)) (format out "~a " (aref matrix i j))) (format out "~%")) (format out "~%~%") (format out "###index 1###~%") (dotimes (i (- (array-dimension matrix 0) partition)) (dotimes (j (array-dimension matrix 1)) (format out "~a " (aref matrix (+ i partition) j))) (format out "~%")) (format out "~%~%") (format out "###index 2###~%") (dotimes (i partition) (dotimes (j (array-dimension matrix 1)) (format out "~a " (aref matrix (+ (- (array-dimension matrix 0) partition) i) j))) (format out "~%")))) 'done)) ;end make-plottable