; tips ; accessors for complex nested acessing ; losf (lots of small functions) ; todo ; prism. ; prism + infogain ; prism + nomograms ; egs stored twice. once in tbl and once in xindex. reduce! ; bonus marks. bugs in code. ; 1 mark write a test that clear demonstrates an error ; 2 marks for fixing the bug and demonstrating to be the before/after ; behaviour (broken and fixed) ; traps ; sort ; (let* ((x '(2 3 2 3 5 2 1 4)) (y (stable-sort x #'<))) (o x y)) ; [X]=[(2 2 2 3 3 4 5)] [Y]=[(1 2 2 2 3 3 4 5)] ; note how the x list is damaged by the sort ; better idiom ; (let ((x '(2 3 2 3 5 2 1 4))) (setf x (sort x #'<)) x) ; (1 2 2 2 3 3 4 5) #+SBCL (DECLAIM (SB-EXT:MUFFLE-CONDITIONS CL:STYLE-WARNING)) ;;;; a generic learner ;;;; examples have classes; features ranges cross-index ;;;; config (let ((dir "/Users/timm/svns/wisp/branches/keys/") (system '("learn"))) (defun file2path (f) (format nil "~a~a.lisp" dir (string-downcase f))) (defun make () (make1 system))) ;;;; utilitites ;;; unit tests (adapted from peter seiblel) (defvar *test-name* nil) (defmacro deftest (name parameters &body body) "Define a test function. Within a test function we can call other test functions or use 'check' to run individual test cases." `(defun ,name ,parameters (let ((*test-name* (append *test-name* (list ',name)))) ,@body))) (defmacro check (&body forms) "Run each expression in 'forms' as a test case." `(combine-results ,@(loop for f in forms collect `(report-result ,f ',f)))) (defmacro with-gensyms ((&rest names) &body body) `(let ,(loop for n in names collect `(,n (make-symbol ,(string n)))) ,@body)) (defmacro combine-results (&body forms) "Combine the results (as booleans) of evaluating 'forms' in order." (with-gensyms (result) `(let ((,result t)) ,@(loop for f in forms collect `(unless ,f (setf ,result nil))) ,result))) (let ((passes 0) (fails 0)) (defun tests-reset() (setf passes 0) (setf fails 0)) (defun tests-report () (format t "~%PASSES: ~a (~a %)~%FAILS : ~a~%" passes (* 100 (/ passes (+ passes fails))) fails)) (defun report-result (result form) "Report the results of a single test case, but succinctly if it passes." (if result (and (incf passes) (format t ".")) (and (incf fails) (format t "~%fail ... ~a: ~a~%" *test-name* form))) result) ) ;;; Our test suite (deftest test-all () (tests-reset) (test-remove-nths) (test-do-array) (test-align) (test-stats) (test-ttest) (test-mann-whitney) (test-wilcoxon) (test-distogram) (test-random) (test-caution) (test-line) (test-data) (test-discretize) (test-xindex-f) (test-xindex-majority-class) (test-index) (test-folds) (test-minusr) (test-zeror) (test-prism) (test-nb) (test-abcd) (test-info-splitter) (test-id-trainer) (tests-report) ) ;;; misc ;; track a counted set (defstruct counted all (n 0) sorted) (defun counted+ (one c) (incf (counted-n c)) (push one (counted-all c)) (setf (counted-sorted c) nil) c) (defun counted-sort (c pred key) (unless (counted-sorted c) (setf (counted-all c) (sort (counted-all c) pred :key key)) (setf (counted-sorted c) t)) c) ;; debugging macros (defmacro o (&rest l) "print list of named values" `(progn ,@(mapcar #'(lambda(x) `(oprim ,x)) l) (terpri))) (defmacro oprim (x) "print a named value; e.g. (let ((a 22)) (o a)) ==> [a]=[22]" `(progn (format t "[~a]=[~a] " (quote ,x) ,x) ,x)) (defun p1 () (print 1)) (defun p2 () (print 2)) (defun p3 () (print 3)) (defun p4 () (print 4)) (defun p5 () (print 5)) (defun p6 () (print 6)) (defun p7 () (print 7)) (defun p8 () (print 8)) (defun p9 () (print 9)) (defun p10 () (print 10)) (defun p11 () (print 11)) (defun p12 () (print 12)) (defun p13 () (print 13)) (defun p14 () (print 14)) (defun p15 () (print 15)) (defun p16 () (print 16)) (defun p17 () (print 17)) (defun p18 () (print 18)) (defun p19 () (print 19)) (defun p20 () (print 20)) (defun p21 () (print 21)) (defun p22 () (print 22)) (defun p23 () (print 23)) (defun p24 () (print 24)) ;; string stiff (defun longest-string (l) (let ((max -1) tmp) (dolist (x l max) (let ((tmp (length (format nil "~a" x)))) (if (> tmp max) (setf max tmp)))))) (defmacro samep (form string) `(string= (whiteout (format nil "~a" ,form)) (whiteout ,string))) (defun whitespacep (char) "Returns true if the Lisp character CHAR is whitespace" (member char '(#\Space #\Tab #\Newline) :test #'char=)) (defun whiteout (seq) (remove-if #'whitespacep seq)) ;; sorting anything (defun lt (x y) "this has to be the saddest/slowes way to order two arbitrary things" (if (string-lessp (format nil "~a" x) (format nil "~a" y)) t nil)) ;; matrix stuff (defun transpose (x) (apply #'mapcar (cons #'list x))) ;; string stuff (defmacro list2string ((item list stream) &body body) `(with-output-to-string (,stream) (dolist (,item ,list) ,@body))) (defun nchars (n &optional (char #\Space)) (with-output-to-string (s) (dotimes (i n) (format s "~a" char )))) (defun align (lines &key (stream t) (fs " |") (rs #\Newline) (underline #\-) (before #\Newline) (after "")) (let* ((header (first lines)) (data (rest lines)) (cols (transpose lines)) (widths (mapcar #'longest-string cols)) (sep "") (fmt (with-output-to-string (s) (dolist (width widths) (format s "~a~~~a<~a~~>" sep (1+ width) "~a") (setf sep fs)) (format s "~a" rs)))) (format stream "~a" before) (apply #'format `(,stream ,fmt ,@header)) (setf sep "") (dolist (w widths) (format stream "~a ~a" sep (nchars w underline)) (setf sep fs)) (format stream "~a" rs) (dolist (line data) (apply #'format `(,stream ,fmt ,@line))) (format stream "~a" after))) (deftest test-align () (let ((tmp (with-output-to-string (s) (align '((peace love understanding) (34.1 33.4 324.2) (334 222222 33)) :stream s)))) (samep tmp " PEACE | LOVE | UNDERSTANDING ----- | ------ | ------------- 34.1 | 33.4 | 324.2 334 | 222222 | 33 "))) ;; maths (defconstant e (exp 1)) (defun sum (l) (let ((n 0)) (dolist (one l n) (incf n one)))) (defun mean (l) (let ((n 0) (sum 0)) (dolist (one l (/ sum n)) (incf sum one) (incf n)))) (defun harmonic-mean (x y) (/ (* 2 x y) (+ x y))) (defun weighted-sum (l &optional total0) (labels ((sum-first (l) (sum (mapcar #'(lambda (x) (first x)) l)))) (let ((total (if total0 total0 (sum-first l)))) (sum (mapcar #'(lambda(one) (* (/ (first one) total) (rest one))) l))))) (defun stdev (n sum sumSq) "Compute the mean and standard deviation." (sqrt (/ (- sumSq(/ (* sum sum) n)) (- n 1)))) (defun list2stdev (l) "Return the standard deviation of a list of numbers." (let ((n 0) (sum 0) (sumSq 0)) (dolist (x l (stdev n sum sumSq)) (incf n) (incf sum x) (incf sumSq (* x x ))))) (defun median (nums) "Return 50% and (75-50)% values, the 25% value" (labels ((mean (x y) (/ (+ x y ) 2))) (let* ((n1 (sort nums #'<)) (l (length n1)) (mid (floor (/ l 2))) (midval (nth mid n1)) (25percent (nth (floor (* l 0.25)) n1)) (75percent (nth (floor (* l 0.75)) n1)) (50percent (if (oddp l) midval (mean midval (nth (- mid 1) n1))))) (values 50percent (- 75percent 50percent) 75percent 25percent)))) (deftest test-stats () (let ((nums '(1 1 1 1 2 2 4 100))) (check (= 112 (sum nums)) (= 3/2 (median nums)) (= 14 (mean nums))))) ;; list stuff (defun array2list (a &optional (n 0)) (let (l) (dotimes (i (length a) (reverse l)) (push (aref a i) l)))) (defmacro doitems ((one n list &optional out) &body body ) `(let ((,n -1)) (dolist (,one ,list ,out) (incf ,n) ,@body))) (defun remove-nths (doomeds l &optional (n 0)) (if (null doomeds) l (remove-nths1 doomeds (first l) (rest l) n))) (defun remove-nths1 (doomeds first rest n) (if (= n (first doomeds)) (remove-nths (rest doomeds) rest (1+ n)) (cons first (remove-nths doomeds rest (1+ n))))) (deftest test-remove-nths () (check (equal (remove-nths '(0 1 5) '(a b c d e f g)) '(c d e g)) (equal (remove-nths '(0 1 5 6) '(a b c d e f g)) '(c d e)) (equal (remove-nths '() '(a b c d e f g)) '(a b c d e f g)))) (defun geta (key list &optional (default nil)) "Return a value from an assocation 'list'' of, if absent, some 'default' value." (or (cdr (assoc key list)) default)) ;; array stuff (defmacro doarrayi ((one n array &optional out) &body body) (let ((i (gensym))) `(let ((,i (length ,array))) (dotimes (,n ,i ,out) (let ((,one (aref ,array ,n))) ,@body))))) (defmacro doarray ((one array &optional out) &body body) (let ((j (gensym))) `(doarrayi (,one ,j ,array ,out) ,@body))) (deftest test-do-array () (let* ((count 0) (r1 (doarrayi (x i #(0 1 2) count) (incf count x) (incf count i))) (r2 (doarray (x #(0 1 2) count) (incf count x)))) (check (= r1 6) (= r2 9)))) ;; comparing normal populations (defun ttest-from-lists (one two &optional (conf 95)) "Given two lists of numbers 'a' and 'b', return the ttest result." (let ((as 0) (asq 0) (an 0)(bs 0) (bsq 0) (bn 0)) (dolist (a one) (incf an) (incf as a) (incf asq (* a a))) (dolist (b two) (incf bn) (incf bs b) (incf bsq (* b b))) (ttest as asq an bs bsq bn :conf conf))) (defun ttest (as asq an bs bsq bn &key (conf 95)) "Compares means of two indpendent samples a,b of sizes an,nb, with sums as,bs and sumsquared asq,bsq at either a 95 or 99% confidence" (labels ((less () (< (/ as an) (/ bs bn))) (same () (let* ((tcrit (tcritical (+ an bn -2) conf)) (ssa (- asq (/ (* as as) an))) (ssb (- bsq (/ (* bs bs) bn))) (pooled (/ (+ ssa ssb) (+ bn an -2))) (sxasb (sqrt (* pooled (+ (/ 1 an) (/ 1 bn))))) (tvalue (abs (/ (- (/ bs bn) (/ as an)) sxasb)))) ;(o tcrit tvalue) (> tcrit tvalue)))) (cond ((same) 0) ; H0 : mean of a same as mean of b ((less) -1) ; H1a : mean of a < mean of b (t 1)))) ; H1b : mean of a > mean of b (let ((ttable '((95 . (( 1 . 12.70 ) ( 3 . 3.1820 ) ( 5 . 2.5710 ) ( 10 . 2.2280 ) ( 20 . 2.0860 ) ( 80 . 1.99 ) ( 320 . 1.97 ))) (99 . (( 1 . 63.6570 ) ( 3 . 5.8410 ) ( 5 . 4.0320 ) ( 10 . 3.1690 ) ( 20 . 2.8450 ) ( 80 . 2.64 ) ( 320 . 2.58 )))))) (defun tcritical (n conf) "Returns the t-test critical values. Keeps those values as a piecewise set of lines and intermediary values are interpolated between the points." (interpolates n (geta conf ttable))) ) (defun ttest-demo (&optional (fudge 1)) "implements the demo described at http://www.cas.buffalo.edu/classes/psy/segal/2072001/ttests/t-tests1.html" (let ((one '(105 112 96 124 103 92 97 108 105 110)) (two '( 98 108 114 106 117 118 126 116 122 108))) (setf one (mapcar #'(lambda (x) (* x fudge)) one)) (ttest-from-lists one two))) (deftest test-ttest () (check (= 0 (ttest-demo)) (= 0 (ttest-demo 1.1)) (= 1 (ttest-demo 1.2)))) ;; comparing ranked independent populations (defun rank (l &key (ranks (make-hash-table)) (n 0) no-zeros) "Returns a hash of the ranks in a sorted list. All numbers in a run of repeated entries get the average rank of that run." (if (null l) ranks (let (repeats sum now) (labels ((walk () (incf n) (pop l)) (new () (setf repeats 1) (setf sum n)) (same () (incf sum n) (incf repeats)) (spin () (when (eql now (car l)) (walk) (same) (spin)))) (setf now (walk)) (new) (spin) ; (if (and no-zeros (zerop now)) ; (setf repeats 1 sum 0 n 0)) (setf (gethash now ranks) (/ sum repeats)) (rank l :ranks ranks :n n :no-zeros no-zeros))))) (defun mann-whitney (a b &optional (conf 95)) "The 'method 1' of http://faculty.vassar.edu/lowry/ch11a.html." (labels ((as-ranks (l r) (mapcar #'(lambda (x) (gethash x r)) l)) (sum (l) (let ((s 0)) (dolist (x l s) (incf s x)))) (median (l n) (let ((sorted (sort l #'<)) (midv (floor (/ n 2)))) (if (oddp n) (nth midv sorted) (/ (+ (nth (1- midv) sorted) (nth midv sorted)) 2))))) (let* ((all (sort (copy-list (append a b)) #'<)) (ranks (rank all)) (ranksa (as-ranks a ranks)) (ranksb (as-ranks b ranks)) (na (length a)) (nb (length b)) (n (+ na nb)) (tcrict (tcritical n conf)) (suma (* 1.0 (sum ranksa))) (ta (/ (* na (+ n 1)) 2.0)) (sigma (sqrt (/ (* na nb (+ n 1)) 12.0))) (za (/ (+ (- suma ta) 0.5) sigma))) ; (o suma ta sigma tcrict) ;debug line (cond ((< (abs za) tcrict) 0) ((< (median ranksa na) (median ranksb nb)) -1) (t 1)) ))) (defun mann-whitney-demo-small () (mann-whitney '(4.6 4.7 4.9 5.1 5.2 5.5 5.8 6.1 6.5 6.5 7.2) '(5.2 5.3 5.4 5.6 6.2 6.3 6.8 7.7 8.0 8.1))) (defun mann-whitney-demo-big (&optional (fudge 1)) "Generate two lists of 10000 random floats. Multiple the second list by 'fudge'. Check if the median ranks of list one is the same as list two (return '0'), smaller than list two (return '-1'), or larger than list two (return '1')." (labels ((big (n s) (let (out) (dotimes (i n out) (push (* s (round (random 100))) out))))) (let ((one (big 10000 1)) (two (big 10000 fudge))) (mann-whitney one two)))) (deftest test-mann-whitney () (check (= 0 (mann-whitney-demo-small)) (= 0 (mann-whitney-demo-big)) (= 1 (mann-whitney-demo-big -2)) (= -1 (mann-whitney-demo-big 2)) )) ;; comparing ranked dependent populations (defun wilcoxon (x.a x.b &optional (conf 95)) (labels ((combine (a &rest elements) `(,a ,@elements)) (as-ranks (l r) (mapcar #'(lambda (x) (gethash x r)) l)) (simple-quad (c) (/ (+ -1 (sqrt (- 1 (* -1 8 c)))) 2))) (let* ((x.dif (mapcar #'- x.a x.b)) ;x.a-x.b (x.abs (mapcar #'abs x.dif)) ;absolute value of x.dif (x (sort (mapcar #'combine x.abs x.dif x.b x.a) #'< :key #'first)) ;sorts by absolute value (ranks (as-ranks (mapcar #'first x) (rank (mapcar #'first x) :no-zeros t))) ;gets the ranks (x (mapcar #'cons ranks x)) ;adds absolute ranks to x (x (mapcar #'cons (mapcar #'* (mapcar #'first x) (mapcar #'signum (mapcar #'third x))) x)) ;adds signed ranks (n (simple-quad (loop for item in ranks summing item))) ;number of ranks used (w (loop for item in (mapcar #'first x) summing item)) ;total weight of all signed ranks (z (/ (- w .5) (sqrt (/ (* n (+ n 1) (+ (* 2 n) 1)) 6)))) (tcrict (tcritical (1+ (length x.a)) conf))) (cond ((< (abs z) tcrict) 0) ((< (median (sort x.a #'<)) (median (sort x.b #'<))) -1) (t 1))))) (deftest test-wilcoxon () "runs wilcoxon on data with known results." (check (= 0 (wilcoxon '(72 90 40 84 22 78 50 64 50 30 52 64 45 64 24 78) '(32 58 20 68 36 68 40 56 44 25 56 68 48 62 24 78))))) ;; historgram, collect, display (defstruct dist (min most-positive-double-float) (max most-negative-double-float) (bars '()) (sum 0) (sumsq 0) (n 0) (sorted t) (fuzz 1) (fat nil) (all '()) ) (defun dist-compare (d1 d2 &key (conf 95)) "compares means of two independent samples d1 d2" (ttest (dist-sum d1) (dist-sumsq d1) (dust-n d1) (dist-sum d2) (dist-sumsq d2) (dust-n d2) :conf conf)) (defun as-dist (l) "Return a dist with all the numbers of l" (if (eq 'dist (type-of l)) l (dist-adds l))) (defun dist-adds (l &optional (d (make-dist))) "Add the numbers in the list 'l' to a dist." (dolist (x l d) (dist-add x d))) (defun dist-mean (d) (/ (dist-sum d) (dist-n d))) (defun dist-sd (d) (stdev (dist-n d) (dist-sum d) (dist-sumsq d))) (defun dist-add (num &optional (d (make-dist))) (incf (dist-n d)) (incf (dist-sum d) num) (incf (dist-sumsq d) (* num num)) (if (dist-fat d) (push num (dist-all d))) (if (< num (dist-min d)) (setf (dist-min d) num)) (if (> num (dist-max d)) (setf (dist-max d) num)) (let* ((fuzz (dist-fuzz d)) (num1 (* fuzz (round (/ num fuzz)))) (val (cdr (assoc num1 (dist-bars d))))) (if val (setf (cdr (assoc num1 (dist-bars d))) (1+ val)) (push (cons num1 1) (dist-bars d))) (setf (dist-sorted d) nil)) d) (defun dist-sort (d) (labels ((car< (a b) (< (car a ) (car b)))) (unless (dist-sorted d) (setf (dist-bars d) (sort (dist-bars d) #'car<) (dist-sorted d) t)) d)) (defun distogram (d0 &key header (lwidth 5) (decimals 2) (rwidth 5) (shrink 1) (on "-") (pad 4) (str t)) "Prints the dist 'd0' on stream 'str', shrinking the right hand side bars by 'shrink'. 'Header' is some text to show on top. The other variables control details of how each line is printed." (let ((d (as-dist d0))) (unless (dist-sorted d) (dist-sort d)) (let ((sum (dist-sum d)) (bars (dist-bars d)) (min (dist-min d)) (max (dist-max d)) (n (dist-n d)) (fuzz (dist-fuzz d)) (percentSum 0) (fmt (format nil " ~~~a,~af * ~~~ad = ~~3d% ~~a " lwidth decimals rwidth))) (if header (format t "~a~%" header)) (dolist (bar bars t) (let* ((key (car bar)) (value (cdr bar)) (percent (floor (* 100 (/ value n)))) (stars (round (/ value shrink))) (halfp (< percentSum 50))) (incf percentSum percent) (format str fmt (+ key (/ fuzz 2)) value percent (if halfp "<" ">") ) (format str "~a~%" (nchars stars "*"))))))) (defun demo-distogram () (let ((log (make-dist :fuzz 2))) (reset-seed) (dotimes (i 1000) (dist-add (sqrt (my-random-int 100)) log)) (with-output-to-string (s) (distogram log :shrink 10 :str s)) )) (deftest test-distogram () (check (samep (demo-distogram) " 1.00 * 12 = 1% < * 3.00 * 68 = 6% < ******* 5.00 * 163 = 16% < **************** 7.00 * 222 = 22% < ********************** 9.00 * 327 = 32% < ********************************* 11.00 * 208 = 20% > *********************"))) ;;;; run sample data files ;;; utilities ;; random stuff (defparameter *seed0* 10013) (defparameter *seed* *seed0*) (defun reset-seed () (setf *seed* *seed0*)) (defun park-miller-randomizer () "The Park-Miller multiplicative congruential randomizer (CACM, October 88, Page 1195). Creates pseudo random floating point numbers in the range 0.0 < x <= 1.0." (let ((multiplier 16807.0d0);16807 is (expt 7 5) (modulus 2147483647.0d0)) ;2147483647 is (- (expt 2 31) 1) (let ((temp (* multiplier *seed*))) (setf *seed* (mod temp modulus)) (/ *seed* modulus)))) (defun my-random (n) "Returns a pseudo random floating-point number in range 0.0 <= number < n" (let ((random-number (park-miller-randomizer))) ;; We subtract the randomly generated number from 1.0 ;; before scaling so that we end up in the range ;; 0.0 <= x < 1.0, not 0.0 < x <= 1.0 (* n (- 1.0d0 random-number)))) (defun my-random-int (n) "Returns a pseudo-random integer in the range 0 <= n-1." (let ((random-number (/ (my-random 1000.0) 1000))) (floor (* n random-number)))) (defun random-demo () (let (counts out) (labels ((sorter (x y) (< (car x) (car y))) (zap () (setf out nil) (reset-seed) (setf counts (make-hash-table))) (inc (n) (setf (gethash n counts) (1+ (gethash n counts 0)))) (cache (k v) (push (list k v) out))) (zap) (dotimes (i 10000) (inc (my-random-int 5))) (maphash #'cache counts) (sort out #'sorter)))) (deftest test-random () (check (equalp (random-demo) (random-demo)))) ;; list utils (defun allbut (l n) "assumes list is indexed 0 1 2 3 4..." (if (zerop n) (values (rest l) (first l)) (cons (first l) (allbut (rest l) (1- n))))) (defun shuffle (l) (dotimes (i (length l) l) (rotatef (elt l i) (elt l (random (length l)))))) (defun tally (l &key (key #'eql)) (let ((h (make-hash-table :test key))) (dolist (one l h) (incf (gethash one h 0))))) ;; hash stuff (defmacro dohash ((key value hash &optional end) &body body) `(progn (maphash #'(lambda (,key ,value) ,@body) ,hash) ,end)) (defun showh (h &key (indent 0) (stream t) (before "") (after #\NewLine) (if-empty "empty") (show #'(lambda (x) (format stream "~a~a = ~a~%" (nchars indent) (first x) (rest x)))) (lt #'lt)) "Prints hash table entries sorted by key." (if (zerop (hash-table-count h)) (format stream "~a~a~a" before if-empty after) (let (l) (format stream "~a" before) (maphash #'(lambda (k v) (push (cons k v) l)) h) (mapc show (sort l #'(lambda (a b) (funcall lt (car a) (car b))))) (format stream "~a" after) h))) ;; tree stuff (defstruct node key kids) ;; line stuff (defstruct line "A 'line' runs between two points 'x1@y1' to 'x2@y2' with gradient 'm' and y-intercept 'b'. Also, some runs are 'verticalp'; i.e. run vertically." x1 y1 x2 y2 m b verticalp) (defun point-to-line (x1 y1 x2 y2) "Create a line from two points." (if (> x1 x2) (point-to-line x2 y2 x1 y1) (let* ((rise (- y2 y1)) (run (- x2 x1)) (verticalp (zerop run)) m b) (unless verticalp (setf m (/ rise run) b (- y2 (* m x2)))) (make-line :x1 x1 :y1 y1 :x2 x2 :y2 y2 :m m :b b :verticalp verticalp)))) (defun line-y (x l) "Return the 'y' point associated with 'x' on line 'l'." (if (line-verticalp l) (line-y1 l) (+ (* (line-m l) x) (line-b l)))) (defun interpolate (x x1 y1 x2 y2 &optional too-big ) "interpolate between the points x1@y2 and x2@y2; returns 'too-big' if x > x2 and returns x1 if x new max) (setf max new majority class)))))) ;;;; header stuff (defstruct header name numericp classp) (defun columns-new (cols klass) (let ((counted (make-counted))) (doitems (col i cols) (counted+ (make-header :name col :numericp (numericp col) :classp (= i klass)) counted)) (setf (counted-all counted) (reverse (counted-all counted))) counted)) (defun columns-header (cols) (mapcar #'header-name (counted-all cols))) (defun numericp (x) (equal (char (symbol-name x) 0) #\$)) (defstruct eg features class) (defun class-index (klass width) (if (< klass 0) (+ klass width) klass)) ;; read tables (defun table-copy (tbl &optional (new (table-egs tbl))) (data :egs new :name (table-name tbl) :klass (table-class tbl) :columns (columns-header (table-columns tbl)))) (defun data (&key name columns egs change (klass -1) shufflep) (let* ((tbl (make-table :name name :columns (columns-new columns (class-index klass (length columns))))) (egs1 (if shufflep (shuffle egs) egs))) (setf (table-class tbl) (class-index klass (table-width tbl))) (dolist (eg egs1 tbl) (data1 eg change klass tbl)))) (defun data1 (one change klass tbl) (when (ok (= (table-width tbl) (length one)) (table-cautions tbl) "~a wrong size" one) (if change (setf one (sublis change one))) (counted+ (make-eg :class (isa one tbl) :features one) (table-all tbl)))) (deftest test-data () (let ((tmp (test-data1))) (check (samep tmp " #(TABLE :NAME WEATHER :COLUMNS #S(COUNTED :ALL (#S(HEADER :NAME FORECAST :NUMERICP NIL :CLASSP NIL) #S(HEADER :NAME TEMP :NUMERICP NIL :CLASSP NIL) #S(HEADER :NAME HUMIDTY :NUMERICP NIL :CLASSP NIL) #S(HEADER :NAME $WINDY :NUMERICP T :CLASSP NIL) #S(HEADER :NAME PLAY :NUMERICP NIL :CLASSP T)) :N 5 :SORTED NIL) :CLASS 4 :CAUTIONS#(CAUTION :ALL ((SUNNY HOT HIGH YES) wrong size) :PATIENCE 19) :ALL #S(COUNTED :ALL (#S(EG :FEATURES (SUNNY HOT HIGH TRUE 2) :CLASS 2) #S(EG :FEATURES (SUNNY HOT HIGH FALSE 1) :CLASS 1)) :N 2 :SORTED NIL) )")))) (defun test-data1 () (data :name 'weather :columns '(forecast temp humidty $windy play) :egs '((sunny hot high FALSE no) (sunny hot high TRUE yes) (sunny hot high yes) ) :change '((yes . 2) (no . 1)))) ;;;; discretization (defun discretize (tbl n &key (using #'ewd) (cautions (make-caution))) (labels ((col1 (col) (if (header-numericp (first col)) (funcall using (rest col) n cautions) (rest col)))) (let* ((headers (counted-all (table-columns tbl))) (egs (table-egs tbl)) (data (cons headers egs)) (columns (transpose data)) (discretized (mapcar #'col1 columns))) (table-copy tbl (reverse (transpose discretized)))))) (defun ewd (data n &optional (cautions (make-caution))) (equal-width-discretization data n cautions)) (defun efd (data n &optional (cautions (make-caution))) (equal-frequency-discretization data n cautions)) (defun equal-width-discretization (data n &optional (cautions (make-caution))) (let* ((numbers (sane-numbers data cautions)) (min (first numbers)) (max (first (last numbers))) (width (/ (- max min) n))) (labels ((datum (cell) (cond ((unknownp cell) cell) ((numberp cell) (discretize cell)) (t *unknown*))) (discretize (m) (float (min max ; don't blow max (+ min ; convert to a number in the same range as before (* width (min n ; don't blow max (1+ (floor (/ (- m min) width)))))))))) (mapcar #'datum data)))) (defun equal-frequency-discretization (data n &optional (cautions (make-caution))) (let* ((numbers (sane-numbers data cautions)) (want (round (/ (length numbers) n))) (break-at (breaks numbers want)) (new (mapcar #'(lambda(datum) (efd1 datum break-at)) data))) new)) (defun efd1 (x break-at) (if (unknownp x) x (dolist (b break-at b) ;(o x b (<= x b)) (if (<= x b ) (return-from efd1 b))))) (defun breaks (l max-size) (let (out) (reverse (breaks1 (first l) (rest l) nil out 1 max-size)))) (defun breaks1 (x l b4 out n max-size) (unless (eql x b4) (when (>= n max-size) (setf n 0) (push x out))) (cond (l (breaks1 (first l) (rest l) x out (1+ n) max-size)) (t (pushnew x out) out))) (deftest test-discretize () (labels ((nums (n tbl) (mapcar #'(lambda (x) (nth n x)) (table-egs tbl)))) (or (fboundp 'weather-numerics) (loaddata 'weather-numerics)) (let ((tmp (with-output-to-string (s) (dotimes (i 10) (format s "~a = ~a~%" i (efd1 i '(4 8 9)))))) (raw (weather-numerics)) (ewd (discretize (weather-numerics) 3 :using #'ewd)) (efd (discretize (weather-numerics) 3 :using #'efd))) (check (equalp '(1 1 1 1 1 2 2 3 3 4 34 34) (sane-numbers '(1 1 3 1 3 2 34 1 1 ? 2 34 4))) (samep tmp " 0 = 4 1 = 4 2 = 4 3 = 4 4 = 4 5 = 8 6 = 8 7 = 8 8 = 8 9 = 9") (equalp (breaks '(1 2 3 4 5 6 6 6 6 6 6 6 6 6 6 6 6 6 7 8 9 10 11 12) 3) '(3 6 7 10 12)) (equalp '(2 2 2 2 6 6 6 6 9 9 9) (equal-frequency-discretization '(1 1 1 2 3 4 5 6 7 8 9) 3 ))) (equalp (nums 2 ewd) '(96.0 75.333336 96.0 75.333336 85.666664 75.333336 96.0 75.333336 75.333336 85.666664 96.0 96.0 96.0 85.666664)) (equalp (nums 2 efd) '(96 75 90 75 90 75 96 75 75 90 96 90 90 90))))) (defun sane-numbers (data &optional (cautions (make-caution)) (sortp t)) (let (numbers) (dolist (datum data (if sortp (sort (copy-list numbers) #'<) numbers)) (unless (unknownp datum) (if (ok (numberp datum) cautions "~a not a number" datum) (push datum numbers)))))) ;;;; cross-indexing (defstruct (xindex (:print-function xindex-print)) table all (n 0) ranges (counts (make-hash-table :test #'equalp)) (class-counts (make-hash-table)) (uniques (make-hash-table :test #'equal))) (defun xindex-print (x s depth) (declare (ignore depth)) (labels ((show (x y) (showh x :before y :stream s :indent 5 :after ""))) (format s "#(XINDEX~%~T:TABLE ~%~T :ALL ~a~%~T :N ~a~%~T :RANGES ~a~%~T" (xindex-all x) (xindex-n x) (xindex-ranges x)) (show (xindex-uniques x) " :UNIQUES ") (show (xindex-class-counts x) " :CLASS-COUNTS ") (show (xindex-counts x) " :COUNTS " )) (format s "~T)")) (defun xindex-new (tbl) (make-xindex :table tbl :all (make-array (table-height tbl) :initial-contents (table-rows tbl)) :n (table-height tbl) :ranges (make-array (table-width tbl)))) (defun f (x class &optional (index (table-class (xindex-table x))) (range class) (access #'counted-n) (if-missing 0)) (let ((tmp (gethash `(,class ,index ,range) (xindex-counts x)))) (if tmp (funcall access tmp) if-missing))) (defun uses (x class index range) (f x class index range #'counted-all nil)) (deftest test-xindex-f () (unless (fboundp 'weather2) (loaddata 'weather2)) (let ((x (cross-index (weather2)))) (check (= (f x 'no) 5) (= (f x 'no 0 'rainy) 2)))) (defun xindex-classi (x) (table-class (xindex-table x))) (defun xindex-classes-n (x) (counted-n (aref (xindex-ranges x) (table-class (xindex-table x))))) (defun xindex-features (x) (xindex-ranges x)) (defun xindex-feature (i x) (counted-all (aref (xindex-ranges x) i))) (defun xindex-feature-names (x) (table-feature-names (xindex-table x))) (defun xindex-n (x) (counted-n (table-all (xindex-table x)))) (defun xindex-classes-all (x) (counted-all (aref (xindex-ranges x) (table-class (xindex-table x))))) (defun xindex-unique-n (x feature range) (gethash `(,feature ,range) (xindex-uniques x) 0)) (defun xindex-width (x) (table-width (xindex-table x))) (defun xindex-majority-class (x) (let* ((classes (xindex-classes-all x)) (max -1) most-frequent) (dolist (class classes most-frequent) (let* ((count (f x class))) (if (> count max) (setf max count most-frequent class)))))) (deftest test-xindex-majority-class () (or (fboundp 'weather) (loaddata 'weather)) (check (= 2 (xindex-majority-class (cross-index (weather)))))) ;;; generate a cross-index (defun cross-index (tbl) (let* ((x (xindex-new tbl)) (uniques (xindex-uniques x)) (class-counts (xindex-class-counts x))) (dotimes (i (table-width tbl)) (setf (aref (xindex-ranges x) i) (make-counted))) (dotimes (j (xindex-n x) x) (cross-index1 j (aref (xindex-all x) j) x uniques class-counts)))) (defun cross-index1 (j eg x uniques class-counts) (let* ((i -1) (class (eg-class eg))) (incf (gethash class class-counts 0)) (dolist (range (eg-features eg)) (incf i) (unless (unknownp range) (let* ((xkey `(,class ,i ,range)) (ukey `(,i ,range)) (counted (gethash xkey (xindex-counts x) (make-counted)))) (setf (gethash xkey (xindex-counts x)) (counted+ j counted)) (when (= 1 (incf (gethash ukey uniques 0))) (setf (aref (xindex-ranges x) i) (counted+ range (aref (xindex-ranges x) i))))))))) (deftest test-index () (let ((tmp (cross-index (test-data2)))) (check (samep tmp " #(XINDEX :TABLE
:ALL #(#S(EG :FEATURES (SUNNY YES) :CLASS YES) #S(EG :FEATURES (RAINY YES) :CLASS YES) #S(EG :FEATURES (SUNNY YES) :CLASS YES)) :N 3 :RANGES #(#S(COUNTED :ALL (RAINY SUNNY) :N 2 :SORTED NIL) #S(COUNTED :ALL (YES) :N 1 :SORTED NIL)) :UNIQUES (0 RAINY) = 1 (0 SUNNY) = 2 (1 YES) = 3 :CLASS-COUNTS YES = 3 :COUNTS (YES 0 RAINY) = #S(COUNTED :ALL (1) :N 1 :SORTED NIL) (YES 0 SUNNY) = #S(COUNTED :ALL (2 0) :N 2 :SORTED NIL) (YES 1 YES) = #S(COUNTED :ALL (2 1 0) :N 3 :SORTED NIL) )")))) (defun test-data2 () (data :name 'weather :columns '($forecast $play) :egs '((sunny yes) (rainy yes) (sunny yes)))) ;;;; abcd (defstruct (abcd (:print-function abcd-print)) for (a 0) (b 0) (c 0) (d 0)) (defun abcd-print (x s d) (declare (ignore x)) (format s "#(ABCD :FOR ~a :A ~a :B ~a :C ~a :D ~a :ACC ~2,2f :PD ~2,2f :PF ~2,2f :PREC ~2,2f :F ~2,2f :BAL ~2,2f)" (abcd-for x) (abcd-a x) (abcd-b x) (abcd-c x) (abcd-d x) (accuracy x) (pd x) (pf x) (precision x) (f-measure x) (balance x))) (defun abcd-matrix (pairs classes h) (let ((n 0) renames (syms '((0 . a)(1 . b)(2 . c) (3 . d) (4 . e) (5 . f)(6 . g)(7 . h) (8 . i) (9 . j) (10 . k)(11 . l)(12 . m)(13 . n)(14 . o) (15 . p)(16 . q)(16 . r)(18 . s)(19 . t) (20 . u)(21 . v)(22 . w)(23 . x)(24 . y) (25 . z)(26 . aa)(27 . ab) (28 . ac)(29 . ad) (30 . ae)(31 . af)(32 . ag)(33 . ah)(34 . ai) (35 . aj)(36 . ak)(37 . al)(38 . am)(39 . an) (40 . ao)(41 . ap)(42 . aq)(43 . ar)(44 . as) (45 . at)(46 . au)(47 . av)(48 . aw)(49 . ax) (50 . ay)(51 . az)))) (showh h) (terpri) (doitems (class i classes) (format t "~5<~(~a~)~>" (cdr (assoc i syms))) (push (cons class (cdr (assoc i syms ))) renames)) (format t " <--- classified as~%") (setf n -1) (doitems (class1 i1 classes) (incf n) (doitems (class2 i2 classes) (format t "~5d" (gethash (cons class1 class2) h 0))) (format t " | ~(~a~) = ~a~%" (cdr (assoc i1 syms)) class1)))) (defun abcd-stats (pairs &key (verbose t)) "stats from cons of (want . got)" (let* ((h (make-hash-table :test #'equal)) classes out) (dolist (pair pairs) (unless (member (first pair) classes) (push (first pair) classes)) (unless (member (rest pair) classes) (push (rest pair) classes)) (incf (gethash pair h 0))); while here, collect abcd-stats (if verbose (abcd-matrix pairs classes h)) (dolist (class classes out) (unless (null class) (let ((abcd (make-abcd :for class))) (maphash ; for each item in the hash counts, do #'(lambda (pair count) (abcd-stat (first pair) (rest pair) count class h abcd)) h) (push abcd out)))))) (defun abcd-stat (want got count goal h abcd) (if (eql got goal) (if (eql want goal) (incf (abcd-d abcd) count) (incf (abcd-c abcd) count)) (if (eql want goal) (incf (abcd-b abcd) count) (incf (abcd-a abcd) count)))) (deftest test-abcd () (let* ((tmp (abcd-stats '((a . a) (a . a) (a . b) (a . c) (a . c) (b . a) (b . a) (b . a) (b . b) (c . b) (c . b) (c . a)) :verbose nil)) (b (second tmp))) (check (samep tmp "( #(ABCD :FOR A :A 3 :B 3 :C 4 :D 2 :ACC .42 :PD .40 :PF .57 :PREC .33 :F .47 :BAL .41) #(ABCD :FOR B :A 5 :B 3 :C 3 :D 1 :ACC .50 :PD .25 :PF .38 :PREC .25 :F .30 :BAL .41) #(ABCD :FOR C :A 7 :B 3 :C 2 :D 0 :ACC .58 :PD .00 :PF .22 :PREC .00 :F .00 :BAL .28))") (= (accuracy b) (/ 6 12)) (= (all b) 12) (= (pd b) (/ 1 4)) (= (pf b) (/ 3 8)) (= (precision b) (/ 1 4)) (= (f-measure b) (/ 3 10)) (= (balance b) 0.40707296) ))) (labels ((a (x) (abcd-a x)) (b (x) (abcd-b x)) (c (x) (abcd-c x)) (d (x) (abcd-d x))) (defun recall (x) (pd x)) (defun pd (x) (if (zerop (d x)) 0 (/ (d x) (+ (b x) (d x))))) (defun pf (x) (if (zerop (c x)) 0 (/ (c x) (+ (a x) (c x))))) (defun all (x) (+ (a x) (b x) (c x) (d x))) (defun precision (x) (if (zerop (d x)) 0 (/ (d x) (+ (c x) (d x))))) (defun accuracy (x) (if (and (zerop (a x)) (zerop (d x))) 0 (/ (+ (a x) (d x)) (all x)))) (defun f-measure (x) (if (or (zerop (a x)) (zerop (d x))) 0 (harmonic-mean (pd x) (pf x)))) (defun balance (x &optional (goalpd 1) (goalpf 0)) (- 1 (/ (sqrt (+ (expt (- goalpf (pf x)) 2) (expt (- goalpd (pd x)) 2))) (sqrt 2))))) ;;;; generating folds (defstruct fold index (train (make-counted)) (test (make-counted))) (defun folds (l0 nfolds &key (pred #'<) (key #'identity) ) (let (folds (i nfolds) (l (sort l0 pred :key key))) (dotimes (j nfolds) (push (make-fold :index j) folds)) ; initialize the folds (if (= nfolds 1) (dolist (x l) (counted+ x (fold-test (nth 0 folds))) (counted+ x (fold-train (nth 0 folds)))) (dolist (x l) (decf i) ; go to the next fold (dotimes (j nfolds) (if (= i j) (counted+ x (fold-test (nth j folds))) ; add 1/j to "test" (counted+ x (fold-train (nth j folds))))) ; add rest to "train" (if (zerop i) (setf i nfolds)))) folds)) ; go back to the start (deftest test-folds () (let ((tmp3 (folds '(1 2 30 4 5 6 70 8 9 10 11) 3)) (tmp1 (folds '(1 2 30 4 5 6 70 8 9 10 11) 1))) (check tmp3 " (#S(FOLD :INDEX 2 :TRAIN #S(COUNTED :ALL (70 30 10 9 6 5 2 1) :N 8 :SORTED NIL) :TEST #S(COUNTED :ALL (11 8 4) :N 3 :SORTED NIL)) #S(FOLD :INDEX 1 :TRAIN #S(COUNTED :ALL (30 11 9 8 5 4 1) :N 7 :SORTED NIL) :TEST #S(COUNTED :ALL (70 10 6 2) :N 4 :SORTED NIL)) #S(FOLD :INDEX 0 :TRAIN #S(COUNTED :ALL (70 11 10 8 6 4 2) :N 7 :SORTED NIL) :TEST #S(COUNTED :ALL (30 9 5 1) :N 4 :SORTED NIL)))"))) ;;;; simple learner ;;;; n-way cross val (defun default-reporter (tbl cautions test-result) (declare (ignore tbl cautions)) test-result) (defun default-finale (tbl cautions reports verbose) (declare (ignore tbl cautions)) (abcd-stats reports :verbose verbose)) (defun n-way (n tbl &key trainer tester (reporter #'default-reporter) (finale #'default-finale) (verbose t)) (let ((folds (folds (table-egs tbl) n :pred 'lt :key #'(lambda (x) (isa x tbl)))) reports cautions (make-caution)) ; call (oops cautions) to call a ; global halt (dolist (fold folds) (let* ((training (counted-all (fold-train fold))) (tests (counted-all (fold-train fold))) (tbl1 (table-copy tbl training)) (model (funcall trainer tbl1 cautions training))) (dolist (test tests) (let* ((tested (funcall tester tbl1 cautions model test)) (reported (funcall reporter tbl1 cautions tested))) (push reported reports))))) (funcall finale tbl cautions reports verbose))) ;;;; minusr : learners don't get dumber that this. score 1 if ;;;; the majority class is "wanted" (defun minusr (wanted n tbl &key (verbose t)) (labels ((reporter (tbl cautions test-result) (if test-result 1 0)) (finale (tbl cautions reports verbose) (mean reports)) (trainer (tbl cautions egs) (table-majority-class tbl egs)) (make-tester (wanted) #'(lambda (tbl cautions expected one) (eql (isa one tbl) wanted)))) (n-way n tbl :trainer #'trainer :tester (make-tester wanted) :reporter #'reporter :finale #'finale :verbose verbose))) (deftest test-minusr () (unless (fboundp 'weather) (loaddata 'weather)) (check (= (minusr 2 3 (weather)) (/ 9 14)))) ;;;; minusr : zeror (defun zeror (n tbl &key (verbose t)) (labels ((trainer (tbl cautions egs) (table-majority-class tbl egs)) (tester (tbl cautions majority one) (cons majority (isa one tbl)))) (n-way n tbl :verbose verbose :trainer #'trainer :tester #'tester))) (deftest test-zeror () (unless (fboundp 'weather) (loaddata 'weather)) (let ((tmp (zeror 3 (weather) :verbose nil))) (check (samep tmp "( #(ABCD :FOR 2 :A 0 :B 10 :C 0 :D 18 :ACC .64 :PD .64 :PF .00 :PREC 1.00 :F .00 :BAL .75) #(ABCD :FOR 1 :A 18 :B 0 :C 10 :D 0 :ACC .64 :PD .00 :PF .36 :PREC .00 :F .00 :BAL .25))")))) ;;; not-so naive bayes (defun naiveBayes (n tbl &key (verbose t)) (labels ((trainer (tbl cautions egs) (cross-index tbl)) (tester (tbl cautions xindex one) (cons (isa one tbl) (bayes-classify one xindex)))) (n-way n tbl :verbose verbose :trainer #'trainer :tester #'tester))) (defun bayes-classify (one x &optional (m 2) (k 1)) (let* ((classes (xindex-classes-all x)) (nclasses (xindex-classes-n x)) (n (xindex-n x)) (classi (xindex-classi x)) (like most-negative-fixnum) (classification (first classes))) (dolist (class classes) (let* ((prior (/ (+ (f x class) k) (+ n (* k nclasses)))) (tmp (log prior))) (doitems (feature i one) (unless (= classi i) (unless (unknownp feature) (let ((delta (/ (+ (f x class i feature) (* m prior)) (+ (f x class) m)))) (incf tmp (log delta)))))) (when (> tmp like) (setf like tmp classification class)))) classification)) (deftest test-nb () (unless (fboundp 'weather2) (loaddata 'weather2)) (let ((tmp (naiveBayes 1 (weather2) :verbose nil))) (check (samep tmp " (#(ABCD :FOR NO :A 9 :B 2 :C 0 :D 3 :ACC .86 :PD .60 :PF .00 :PREC 1.00 :F .00 :BAL .72) #(ABCD :FOR YES :A 3 :B 0 :C 2 :D 9 :ACC .86 :PD 1.00 :PF .40 :PREC .82 :F .57 :BAL .72))")))) ;;;; (defun prism (n tbl &key (verbose t)) (labels ((trainer (tbl cautions egs) (prism-trainer tbl verbose)) (tester (tbl cautions model one) (cons (isa one tbl) (prism-tester one model)))) (n-way n tbl :verbose verbose :trainer #'trainer :tester #'tester))) (defstruct rule if then ) (defstruct (rule-info (:include rule)) missed) (defmacro n (class feature range x) `(gethash `(,class ,feature ,range) ,x 0)) (defun prism-tester (eg rules &optional (not-found #\?)) (dolist (rule rules not-found) (let ((tmp t)) (doitems (test i (rule-if rule)) (setf tmp (and tmp (eql (nth (first test) eg) (rest test))))) (if tmp (return-from prism-tester (rule-then rule)))))) (defun prism-trainer (tbl &optional rules last-missed verbose) (let* ((r (prism-rule tbl)) (missed (rule-info-missed r)) (nmissed (length missed))) (cond ( (null missed) (reverse rules)) (t (setf (rule-info-missed r) nmissed) (push r rules) (if verbose (format t "if ~a then ~a (~a)~%" (rule-if r) (rule-then r) nmissed)) (if last-missed (if (< nmissed last-missed) (prism-trainer (table-copy tbl missed) rules nmissed) (reverse rules)) (prism-trainer (table-copy tbl missed) rules nmissed)))))) (defun prism-rule (tbl &optional (rule (make-rule-info))) (multiple-value-bind (class feature range pt) (prism-next-best tbl rule) (if feature (prism-rule1 tbl rule class feature range pt) rule))) (defun prism-rule1 (tbl rule class feature range pt) (let (covered (ncovered 0) (b4 (length (table-egs tbl)))) (push (cons feature range) (rule-if rule)) (setf (rule-then rule) class) (dolist (eg (table-egs tbl)) (if (eql (nth feature eg) range) (push eg covered) (push eg (rule-info-missed rule)))) (cond ((= pt 1) rule) ((null covered) rule) ((< (length covered) b4) (prism-rule (table-copy tbl covered) rule)) (t rule)))) (defun prism-next-best (tbl rule) (let* ((classi (table-class tbl)) (x (make-hash-table :test 'equal)) (ranges (make-hash-table)) (classes (make-hash-table)) best-class best-feature best-range (best-pt -1)) (prism-index tbl x ranges classes) (dotimes (feature (table-width tbl)) (unless (= feature classi) (unless (assoc feature (rule-if rule)) (let ((all (gethash feature ranges))) (dohash (class tee classes) (dolist (range all) (let* ((pee (n class feature range x)) (pt (/ pee tee))) (when (> pt best-pt) (setf best-pt pt best-feature feature best-range range best-class class ))))))))) (values best-class best-feature best-range best-pt))) (defun prism-index (tbl &optional (x (make-hash-table :test 'equal)) (ranges (make-hash-table)) (classes (make-hash-table))) (let ((classi (table-class tbl))) (dolist (eg (table-egs tbl)) (let ((class (nth classi eg))) (incf (gethash class classes 0)) (doitems (range feature eg) (incf (n class feature range x)) (let ((old (gethash feature ranges))) (unless (member range old) (push range (gethash feature ranges))))))))) (deftest test-prism () (check (samep (prism 3 (contact-lens) :verbose nil) "(#(ABCD :FOR HARD :A 25 :B 0 :C 15 :D 8 :ACC .69 :PD 1.00 :PF .38 :PREC .35 :F .55 :BAL .73) #(ABCD :FOR NONE :A 18 :B 30 :C 0 :D 0 :ACC .38 :PD .00 :PF .00 :PREC .00 :F .00 :BAL .29) #(ABCD :FOR ? :A 23 :B 0 :C 25 :D 0 :ACC .48 :PD .00 :PF .52 :PREC .00 :F .00 :BAL .20) #(ABCD :FOR SOFT :A 38 :B 10 :C 0 :D 0 :ACC .79 :PD .00 :PF .00 :PREC .00 :F .00 :BAL .29))"))) ;;;; (defun iterative-dichotomize (n tbl &key splitter) (labels ((trainer (tbl cautions egs) (id-trainer tbl :splitter splitter)) (tester (tbl cautions model one) (cons (isa one tbl) (id-tester one model)))) (n-way n tbl :trainer #'trainer :tester #'tester))) (defun id-tester (eg tree) (id-tester1 eg (node-kids tree))) (defun id-tester1 (eg trees) (cond ((null trees) nil) ((node-p trees) (rest (node-key trees))) (t (let* ((tree (first trees)) (pair (node-key tree)) (kids (node-kids tree)) (feature (first pair)) (range (rest pair))) (if kids (if (eql (nth feature eg) range) (id-tester1 eg (node-kids tree)) (id-tester1 eg (rest trees))) range))))) (defun id-trainer (tbl &key splitter (min 4) (condense #'table-majority-class)) (make-node :key 'root :kids (id-trainer1 tbl splitter min condense))) (defun id-trainer1 (tbl splitter min condense) (let ((before (table-height tbl))) (if (< before min) (make-node :key (cons 'then (funcall condense tbl))) (let* ((x (cross-index tbl)) (split (funcall splitter x)) (egs (table-egs tbl)) (names (xindex-feature-names x))) (mapcar #'(lambda (range) (id-trainer2 range x egs split splitter min tbl before condense)) (xindex-feature split x)))))) (defun id-trainer2 (range x egs split splitter min tbl before condense) (let (some) (dolist (eg egs) (if (eql range (nth split eg)) (push eg some))) (if (< (1+ (length some)) before) (make-node :key (cons split range) :kids (id-trainer1 (table-copy tbl some) splitter min condense)) (make-node :key (cons 'then (funcall condense tbl)))))) (defstruct range-details feature range size details) (defun splitter (x &key baseline detail debug (compare #'-) (overall #'range-details-weighted-sum)) (let (best (max -1) (classi (xindex-classi x)) (before (funcall baseline x))) (dotimes (feature (xindex-width x) best) (unless (= feature classi) (let* ((after (range-details-all feature x :detail detail :overall overall)) (delta (funcall compare before after))) (if debug (o feature delta)) (if (> delta max) (setf max delta best feature ))))))) (defun range-details-all (feature x &key detail overall) (let ((details (mapcar #'(lambda(range) (range-details-one feature range x detail)) (xindex-feature feature x)))) (funcall overall details feature x))) (defun range-details-one (feature range x detail) "collect information on one range" (let ((n (xindex-unique-n x feature range ))) (make-range-details :feature feature :range range :size n :details (funcall detail feature range n x)))) (defun range-details-weighted-sum (details feature x) "return the weighted sum of the details weighted sum" (let* ((total 0) (weights (mapcar #'(lambda (d) (let ((tmp (range-details-size d))) (incf total tmp) (cons tmp (range-details-details d)))) details))) (weighted-sum weights total))) ;;;; info gain stuff (defun id3 (n tbl) (iterative-dichotomize n tbl :splitter #'info-splitter)) (defun info-splitter (x) (splitter x :debug nil :baseline #'info-baseline :detail #'info-detail)) (defun info (l &optional total) (let ((some (sum l)) (out 0)) (dolist (x l out) (unless (zerop x) (let ((p (/ x some))) (incf out (* (if total (/ some total) 1) -1 p (log p 2)))))))) (defun info-detail (feature range n x) "collect information content detais of one range" (labels ((class2freq (class) (f x class feature range))) (let* ((classes (xindex-classes-all x)) (class-frequencies (mapcar #'class2freq classes))) (info class-frequencies n)))) (defun info-baseline (x) (labels ((class2freq (class) (f x class))) (let* ((classes (xindex-classes-all x)) (class-frequencies (mapcar #'class2freq classes))) (info class-frequencies (xindex-n x))))) (deftest test-info-splitter () (or (fboundp 'weather2) (loaddata 'weather2)) (check (= 0 ; outlook (info-splitter (cross-index (weather2)))))) (deftest test-id-trainer () (let ((tmp (id-trainer (weather2) :splitter #'info-splitter))) (check (samep tmp " #S(NODE :KEY ROOT :KIDS (#S(NODE :KEY (0 . RAINY) :KIDS (#S(NODE :KEY (3 . TRUE) :KIDS #S(NODE :KEY (THEN . NO) :KIDS NIL)) #S(NODE :KEY (3 . FALSE) :KIDS #S(NODE :KEY (THEN . YES) :KIDS NIL)))) #S(NODE :KEY (0 . SUNNY) :KIDS (#S(NODE :KEY (2 . HIGH) :KIDS #S(NODE :KEY (THEN . NO) :KIDS NIL)) #S(NODE :KEY (2 . NORMAL) :KIDS #S(NODE :KEY (THEN . YES) :KIDS NIL)))) #S(NODE :KEY (0 . OVERCAST) :KIDS (#S(NODE :KEY (THEN . YES) :KIDS NIL))))) ") (eq (id-tester '(sunny nil normal false) tmp) 'yes)))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; test suit ; weather ; breast-cancer ; cholesterol ; mushroom