; -*- lisp -*- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; This file is relation.lisp system. ; Copyright, 2008, Tim Menzies (tim@menzies.us) ; ; This is free software: you can redistribute it and/or modify ; it under the terms of the GNU General Public License as published by ; the Free Software Foundation, either version 3 of the License, or ; (at your option) any later version. ; ; This code is distributed in the hope that it will be useful, ; but WITHOUT ANY WARRANTY; without even the implied warranty of ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ; GNU General Public License for more details. ; You should have received a copy of the GNU General Public License ; a long with with code. If not, see . ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; store as vectors on for depenendent, one for independents ; need varlues nindependetnes nedpendents ; need a seperate class variable. if only one depednent then that is it ; else need a combiner ; need a classDiscretizer ; need bore. min/max ; need a weight for each class (for normailization) ; need a bestRest discretizer #+SBCL (DECLAIM (SB-EXT:MUFFLE-CONDITIONS CL:STYLE-WARNING)) ;;;; demos (defun demo0 () (pima-diabetes) ) (defun demo1 () (pima-diabetes :filter #'bore) ) (defun demo10 (&optional (dull 0.01) (sort #'identity)) (let* ((r (diabetes)) (all (rel-height r))) (best (used-samples r :sort sort) all dull))) ;;;; util stuff (defmacro inc> (new old) `(if (> ,new ,old) (setf ,old ,new))) (defmacro inc< (new old) `(if (< ,new ,old) (setf ,old ,new))) (defmacro negativep (x) `(< ,x 0)) (defmacro o (x) `(progn (format t "[~a]=[~a] " (quote ,x) ,x) ,x)) (defmacro oo (&rest l) `(progn ,@(mapcar #'(lambda(x) `(o ,x)) l) (terpri))) ;;;; relation stuff (defparameter *the-relation* nil) (defun new-relation (&optional (r (make-rel))) "Create a fresh 'the-relation'." (setf *the-relation* r)) ;;; structs (defstruct row (n 0) (raw (make-eg)) (cooked (make-eg)) (repeats 1) ) (defstruct eg dependent independent ) (defmacro row-klass (row) `(eg-dependent (row-cooked ,row))) (defstruct rel "A relation stores rows of data and meta-knowledge about each column and some derived data." (name "relation") dims (diagonal 0) (bins 10) dependents independents columns (width 0) ; width = number of columns rows (height 0) ; height = number of rows ) (defstruct feature "Features are either continuous or discrete" name values (utility 1) dependentp) (defstruct (continuous (:include feature)) "Continuous features are characterized by the min and max value" (max most-negative-double-float) ; so anything else becomes max (min most-positive-double-float) ; so anything else becomes min ) (defstruct (discrete (:include feature)) "Discrete features are characterized by their range of symbols." range) ;;; relation definiton and access stuff (defmacro defrelation (name &body body) "Return a discretiezed relation, with all the ranges converted to integer indexes 1...N" `(defun ,name (&key (*the-relation* *the-relation*) (filter #'identity)) (relation ,name) ,@body (funcall filter *the-relation*) *the-relation* )) (defmacro relation (x) "define a new relation. make it _the_ relation." `(progn (new-relation) (setf (rel-name *the-relation*) ',x))) (defun defattr (new) "Add a new attribute to the current relation'." (macrolet ((add-last (x y) `(setf ,x (append ,x (list ,y))))) (or *the-relation* (new-relation)) (incf (rel-width *the-relation*)) (if (feature-dependentp new) (add-last (rel-dependents *the-relation*) new) (add-last (rel-independents *the-relation*) new)) ; here's some redundant info- but makes the data reading easier (add-last (rel-columns *the-relation*) new))) ; define discrete columns (defmacro one (n &rest l) `(defone ',n nil 1 ',l )) ;; independent (defmacro one! (n &rest l) `(defone ',n t 1 ',l )) ;; dependent (defmacro one!! (n u &rest l) `(defone ',n t ,u ',l )) ;; weighted dependent ; define continuous columns (defmacro num (n) `(defnum ',n nil 1 )) ;; independent (defmacro num> (n) `(defnum ',n 1 1 )) ;; dependent, to maximize (defmacro num< (n) `(defnum ',n -1 1 )) ;; dependent, to minimize (defmacro num>! (n u) `(defnum ',n 1 ,u )) ;; weighted dependent, to maximize (defmacro num datum (continuous-max column)) (inc< datum (continuous-min column))) (if (feature-dependentp column) (push datum dependent) (push datum independent)))) (unless (eql width (length row0)) (warn "wrong width ~a~%" row) (return-from data1)) (mapcar #'cell row0 (rel-columns *the-relation*)) (setf (eg-dependent (row-raw row)) (reverse dependent)) (setf (eg-independent (row-raw row)) (reverse independent)) (push row (rel-rows *the-relation*)) (incf (rel-height *the-relation*))))) (defun dist2heaven (l cols) "untested" (let ((alpha 0) (dim 0)) (mapcar #'(lambda(i col) (let* ((min (continuous-min col)) (max (continuous-max col)) (utility (abs (feature-utility col))) (norm (/ (- i min) (- max min (/ -1 most-positive-double-float))))) (incf dim (expt utility 2)) (if (hatep col) (setf norm (- 1 norm))) (setf norm (* utility norm)) (incf alpha (expt (- norm utility) 2)))) l cols) (- 1 ;; smaller distance to good, the better (/ (sqrt alpha) (sqrt dim)) ;; distance normalized to 0..1 ))) (defun bore (&optional (*the-relation* *the-relation*) (best 0.2) (score #'dist2heaven)) "If row's dependent variabels are in within 'best' of the highest scoring variables, then set (eg-dependent (row-cooked row)) to t" (let* ((rows (rel-rows *the-relation*)) (cols (rel-dependents *the-relation*)) (n (round (* best (rel-height *the-relation*)))) (dists (mapcar #'(lambda (row) (funcall score (eg-dependent (row-raw row)) cols)) rows)) (border (elt (sort (copy-list dists) #'>) n))) (mapc #'(lambda (row dist) (setf (eg-dependent (row-cooked row)) (>= dist border))) rows dists) *the-relation*)) (defun equal-width (&optional (*the-relation* *the-relation*)) "Convert all numerics to a range 1..10 for (min to max)/bins." (let ((bins (rel-bins *the-relation*))) (labels ((fudge (x col) "need to move 0..N-1 to N. Can't blow number of bins" (min bins (1+ (binned1 x col)))) (binned1 (x col) (floor (/ (- x (continuous-min col)) (/ (- (continuous-max col) (continuous-min col)) bins)))) (binned (x col) (if (numericp col) (fudge x col) (1+ (position x (discrete-range col)))))) (dolist (one (rel-buffer *the-relation*) *the-relation*) (let* ((old (rel-columns *the-relation*)) (new (mapcar #'binned one old))) (push new (rel-rows *the-relation*))))))) (defun ranges (&optional (*the-relation* *the-relation*)) (let (dims (bins (rel-bins *the-relation*)) (cols (rel-columns *the-relation*))) (labels ((counts (col) (if (numericp col) bins (length (discrete-range col))))) (let ((dims (mapcar #'counts (butlast cols)))) (setf (rel-dims *the-relation*) dims (rel-diagonal *the-relation*) (sqrt (sum dims)))))) *the-relation*) (defrelation pima-diabetes (num preg) (num plas) (num pres) (num>! skin 2) (num insu) (num mass) (num< pedi) (num age) (one class tested_negative tested_positive) ; preg;plas;pres;skin;insu;mass;pedi; age;class (data 1 89 66 23 94 28.1 0.167 21 tested_negative) (data 1 85 66 29 0 26.6 0.351 31 tested_negative) (data 5 116 74 0 0 25.6 0.201 30 tested_negative) (data 6 148 72 35 0 33.6 0.627 50 tested_positive) (data 8 183 64 0 0 23.3 0.672 32 tested_positive) (data 0 137 40 35 168 43.1 2.288 33 tested_positive) (data 3 78 50 32 88 31 0.248 26 tested_positive) )