;;;; the kokako algorithm ;;;; note: things can become best then rest bu ;;;; once something is rest it remains rest for ever. so ;;;; we can do the rest counts in one pass through the data. (defstruct wme best (b 0) ; list of best egs, number of best egs rest (r 0) ; list of rest egs, number of rest egs (enough 30) ; maximum number of bests (too-few 4) ; stop if too few examples (sufficient 1.5) ; number of rests must be sufficient*b (besth ; hash table for attribute range counts in best (make-hash-table :test #'equalp)) (resth ; hash table for attribute range counts in rest (make-hash-table :test #'equalp))) ;; selectors for the items in best, rest lists (defun score-columns (score columns) (cons score columns)) (defun score (eg) (first eg)) (defun columns (eg) (rest eg)) (defun inc (eg h &optional before) "increment a hash table count, unless we've already used it" (labels ((old (key value) (and before (member value (gethash key before))))) (let ((i -1)) (dolist (x (columns eg)) (incf i) (unless (old i x) (incf (gethash `(,i ,x) h 0))))))) (defun jump (w) "move things from best to rest" (push (pop (wme-best w)) (wme-rest w)) (decf (wme-b w)) (incf (wme-r w))) (defun rest+ (new w) "insert into rest, and update the resth index" (incf (wme-r w)) (push new (wme-rest w)) (inc new (wme-resth w))) (defun best+ (new w) "insert into best, keeping the order in best. don't change the besth index since best could change if better bests arrive in the future" (labels ((insert (cons list) (cond ((null list) (list cons)) ((<= (first cons) (first (first list))) (cons cons list)) (t (cons (first list) (insert cons (rest list))))))) (incf (wme-b w)) (setf (wme-best w) (insert new (wme-best w))))) (defun train (input score &optional (w (make-wme))) "add new input into best or rest" (labels ((betterp () (> score (first (first (wme-best w))))) (enoughp () (>= (wme-b w) (wme-enough w)))) (let ((new (score-columns score input))) (if (enoughp) (if (betterp) (progn (jump w) (best+ new w)) (rest+ new w)) (best+ new w)) w))) (defun b^2/b+r (w before) "return attribute ranges ranked by their b^2/(b+r) score" (let ((best (wme-besth w)) (rest (wme-resth w)) out) (labels ((rank (b r) (/ (* b b) (+ b r))) (result (k v) (cons (float (rank v (gethash k rest 0))) k))) (sufficient w) (dolist (eg (wme-best w)) (inc eg best before)) (maphash #'(lambda (k v) (push (result k v) out)) best) (sort out #'> :key #'car)))) (defun sufficient (w) "returns a working memory with sufficient items in rest" (when (< (/ (wme-r w) (wme-b w)) (wme-sufficient w)) (jump w) (sufficient w))) (defun run (source &optional (enough 30) (want (make-hash-table)) (runs 0) baseline ) "main function" (let (nums (n 0) (w (make-wme :enough enough))) (dolist (eg (funcall source)) (when (wantedp want eg) ; select the relevant subset of examples (incf n) (let ((num (first (last eg)))) (push num nums) (train (butlast eg) num w)))) ; train on the relevant examples (when (< n (wme-too-few w)) (return-from run n)) ; early termination (unless (zerop n) (let* ((sorted (sort nums #'<)) (pair (rest (first (b^2/b+r w want)))) ;; "pair" is now the next best thing to do (key (first pair)) (value (second pair)) (median (median sorted n)) (spread (spread median sorted n))) (when (zerop runs) (format t "~%~a~%" source) (setf baseline sorted)) ; store the baseline (used in ; reporting) (format t "found ~a egs with median ~a [~2,0F%] (min=~a spread= ~a max=~a). recommend ~a = ~a~%" n median (if baseline (* 100 (/ (position median baseline) (length baseline))) 50) (first sorted) spread (first (last sorted)) key value) (wanted key value want) ; add "key value" to the wanted lise (run source enough want (1+ runs) baseline))))) ;;; support code for the run ;; some maths (defun median (l n) (nth (round (* 0.5 n)) l)) (defun spread (median l n) (- (nth (round (* 0.75 n)) l) median)) ;;; the query system (defun wantedp (wanted eg) "check if an example 'eg' satisfies all the constraints in 'wanted'" (maphash #'(lambda (key values) (unless (wantedp1 key values eg) (return-from wantedp nil))) wanted) t) (defun wantedp1 (key values eg) "checks that the example contains at least one value from key" (if (null values) ; bad news! nil (let ((value (first values))) (or (eql (nth key eg) value) (wantedp1 key (rest values) eg))))) (defun wanted (key value queries) (let* ((values (gethash key queries))) (if values (or (member value values) (push value values)) (push value values)) (setf (gethash key queries) values) queries)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; test suit (defun tests () (test1) (test2) (test3) (test4)) (defun test1 () (load "/Users/timm/tmp/weather.lisp") (run 'weather-nominal)) (defun test2 () (load "/Users/timm/tmp/breast-cancer.lisp") (run 'breast-cancer)) (defun test3 () ;(load "/Users/timm/tmp/cholesterol.lisp") (run 'cholesterol)) (defun test4 () (load "/Users/timm/tmp/mushroom.lisp") (run 'mushroom))