;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This file is part of "NOVA":  NOVA = search + COCOMO tools
; Copyright, 2008, Tim Menzies tim@menzies.us
;
; NOVA is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version.
;
; NOVA is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.
; You should have received a copy of the GNU General Public License
; a long with NOVA.  If not, see <http://www.gnu.org/licenses/>.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;; copyright and usage stuff.

(defun gpl3-string (what when who email)
  (format nil "This file is part of ~a:  NOVA = search + COCOMO tools
Copyright, ~a, ~a ~a.

~a is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

~a is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY, without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
a long with ~a.  If not, see <http://www.gnu.org/licenses/>."  
    what when who email what what what))

(defun about-string (what when who email)
  (format nil "~a
Copyright (C) ~a ~a ~a 

This program comes with ABSOLUTELY NO WARRANTY- 
for details type (copyright). This is free software, 
and you are welcome to redistribute it
under certain conditions; type (copyright)," 
	  what when who email))

;;;; random stuff
(defparameter *seed0* 10013)
(defparameter *seed*  *seed0*)

(defun reset-seed () (setf *seed* *seed0*))

(defun park-miller-randomizer ()
  "The Park-Miller multiplicative congruential randomizer
  (CACM, October 88, Page 1195). Creates pseudo random floating
  point numbers in the range 0.0 < x <= 1.0."
  (let ((multiplier 
	 16807.0d0);16807 is (expt 7 5)
	(modulus 
	 2147483647.0d0)) ;2147483647 is (- (expt 2 31) 1)
    (let ((temp (* multiplier *seed*)))
      (setf *seed* (mod temp modulus))
      (/ *seed* modulus))))

(defun my-random (n)
  "Returns a pseudo random floating-point number
  in range 0.0 <= number < n"
  (let ((random-number (park-miller-randomizer)))
    ;; We subtract the randomly generated number from 1.0
    ;; before scaling so that we end up in the range
    ;; 0.0 <= x < 1.0, not 0.0 < x <= 1.0
    (* n (- 1.0d0 random-number))))

(defun my-random-int (n)
  "Returns a pseudo-random integer in the range 0 <= n-1."
  (let ((random-number (/ (my-random 1000.0) 1000)))
    (floor (* n random-number))))

(defun random-demo ()
  (let (counts out)
    (labels 
	((sorter (x y) (< (car x) (car y)))
         (zap    ()    (setf out nil)
	               (reset-seed) 
	               (setf counts (make-hash-table)))
	 (inc    (n)   (setf (gethash n counts) 
			     (1+  (gethash n counts 0)))) 
	 (cache  (k v) (push (list k v) out)))
      (dotimes (i 2 t)
	(zap)
	(dotimes (i 10000) 
	  (inc  (my-random-int 5)))
	(maphash #'cache counts)
	(print (sort out #'sorter))))))

(egs :my-random
     (eg '(dotimes (i 5) 
	   (format t "~a~%" (my-random -23))))
     (eg '(random-demo)))

;;;; line stuff 
(defstruct line 
  "A 'line' runs between two points 'x1@y1' to 'x2@y2'
with gradient 'm' and y-intercept 'b'. Also, some runs
are 'verticalp'; i.e. run vertically."
  x1 y1 x2 y2 m b verticalp)

(defun point-to-line (x1 y1 x2 y2) 
  "Create a line from two points."
  (if (> x1 x2)
      (point-to-line x2 y2 x1 y1)
      (let* ((rise      (- y2 y1))
	     (run       (- x2 x1))
	     (verticalp (zerop run))
	     m 
	     b)
	(unless verticalp
	  (setf m    (/ rise run)
		b    (- y2 (* m x2))))
	(make-line :x1 x1 :y1  y1 :x2 x2 :y2 y2 
		   :m m :b b :verticalp verticalp))))

(defun line-y (x l)
  "Return the 'y' point associated with 'x' on line 'l'."
  (if (line-verticalp l)
      (line-y1 l)
      (+ (* (line-m l)  x) 
	 (line-b l))))

(defun interpolate (x x1 y1 x2 y2 &optional too-big )
  "interpolate between the points x1@y2 and x2@y2;
  returns 'too-big' if x > x2 and returns x1 if x<x1"
  (cond ((<   x x1)   y1)
	((eql x x1)   y1)
	((eql x x2)   y2)
	((< x1 x x2)  (line-y x
			      (point-to-line x1 y1 x2 y2)))
	(t            too-big)))

(defun interpolates (x l)
  "return a y value for x by interpolating over (x . y) pairs in l"
  (let* ((one  (pop l))
	 (two  (first l)))
    (or (if (null l) (rest one))
	(interpolate  x (first one) (rest one) 
		        (first two) (rest two))
	(interpolates x l))))

(egs :line
     (eg '(point-to-line 0 10 1 0)
	 :of "making a new line")
     (eg '(line-y 0.5 (point-to-line 0 10 1 0))
	 :of "accessing point"
	 )
     (eg  '(interpolates 25 '((0 . 0) (5 . 5) (30 . 30)))
	 :out 25)
)

;;;; list stuff
(defun as-list (x)
  "Ensure that x is a list."
  (if (listp x)
      x
      (list x)))

(defun geta (key list &optional (default nil))  
  "Return a value from an assocation 'list'' of, if absent, 
  some 'default' value."
  (or 
    (cdr (assoc key list)) 
    default))

(egs :assoc
     (eg '(geta 'fred '((fred  . 1) (jane . 2))))
     (eg '(geta 'fred '((frodo . 1) (jane . 2)) 23)))

;;;; misc stuff


(defun my-command-line ()
  "Access the command-line."
  (or   
   #+SBCL *posix-argv*  
   #+LISPWORKS system:*line-arguments-list*
   #+CMU extensions:*command-line-words*
   nil))

(defun my-getenv (name &optional default)
  "Return a variable setting from the environment 
outside of LISP."
  #+CMU
  (let ((x (assoc name ext:*environment-list* 
		  :test #'string=))) 
    (if x (cdr x)  default)) 
  #-CMU 
  (or #+Allegro   (sys:getenv name)
      #+CLISP     (ext:getenv name) 
      #+ECL       (si:getenv name) 
      #+SBCL      (sb-unix::posix-getenv name) 
      #+LISPWORKS (lispworks:environment-variable name)
      default))

(defun euclidean (&rest l)
  "Returns the distance of a point in n-dimensional
  space from the orgin. 
  Example1: (euclidean  1 1) => 1.4142135
  Example2: (euclidean 10 10 10 10) => 20.0"
  (let ((sumsq 0))
    (dolist (x l (sqrt sumsq)) 
      (incf sumsq (* x x)))))

(defun chars (n &optional (c "*") (str t))
  "print a string of characters 'c' to stream 'str'
  (and the default stream is standard output)."
  (if (> n 0)
      (dotimes (i n) (princ c str))))

(let (last-loaded)
  (defmacro l (&optional (f last-loaded))
    `(progn 
      (setf last-loaded ',f)
      (load (string-downcase (format nil "~a.lisp" ',f)))
      ',f)))

(defmacro sym (&rest args) `(sym-prim ',args))

(defun sym-prim (l) (intern (format nil "~a~{-~a~}" (car l) (cdr l))))

;;;; stats stuff

(defun list2stdev (l)
  "Return the standard deviation of a list of numbers."
  (let ((n 0) (sum 0) (sumSq 0))
    (dolist (x l (stdev n sum sumSq))
      (incf n)
      (incf sum x)
      (incf sumSq (* x x )))))

(defun stdev (n sum sumSq) 
  "Compute the mean and standard deviation."
  (sqrt (/ (- sumSq(/ (* sum sum) n)) (- n 1))))

(defun sum (l)
  "Sum a list of numbers."
  (let ((sum 0))
    (dolist (x l sum)
      (incf sum x))))  

(defun mean (l)
  "Return the mean and sum of list of numbers."
  (let ((sum 0)
	(n   0))
    (dolist (one l (/ sum n))
      (incf n)
      (incf sum one))))

(defun mean-sd (l)
  "Return the mean and standard deviation of a list of numbers"
  (let ((sum 0)
	(n   0)
	(sumSq 0))
    (labels ((mean () (/ sum n))
	     (sd   () (sqrt (/ (- sumSq(/ (* sum sum) n)) (- n 1) ))))
      (dolist (x l)
	(incf n)
	(incf sum   x)
	(incf sumSq (* x x)))
      (values 
       (mean) 
       (sd)))))

(defun median (nums) 
  "Return 50% and (75-50)% values, the 25% value"
  (labels ((mean (x y) (/ (+ x y ) 2)))
    (let* ((n1         (sort nums #'<))
	   (l          (length n1))
	   (mid        (floor (/ l 2)))
	   (midval     (nth mid  n1))
	   (25percent  (nth (floor (* l 0.25)) n1))
	   (75percent  (nth (floor (* l 0.75)) n1))
	   (50percent  (if (oddp l) 
			   midval
			   (mean midval (nth (- mid 1) n1)))))	 
      (values  
       50percent
       (- 75percent 
	  50percent)
       75percent
       25percent))))

(egs :average
     (eg '(sum    '(1 1 1 1 2 2 4 100)))
     (eg '(median '(1 1 1 1 2 2 4 100)))
     (eg '(mean   '(1 1 1 1 2 2 4 100))))

;;;; ttests

(defun ttest-from-lists (one two &optional (conf 95)) 
  "Given two lists of numbers 'a' and 'b', return the ttest result."
  (let ((as 0) (asq 0) (an 0)(bs 0) (bsq 0) (bn 0))
    (dolist (a one) 
      (incf an) (incf as a) (incf asq (* a a)))
    (dolist (b two) 
      (incf bn) (incf bs b) (incf bsq (* b b)))
    (ttest as asq an bs bsq bn :conf conf)))
    
(defun ttest (as asq an bs bsq bn &key (conf 95))
  "Compares means of two indpendent samples a,b 
  of sizes an,nb, with sums as,bs and sumsquared asq,bsq
  at either a 95 or 99% confidence"
  (labels 
      ((less ()  (< (/ as an) (/ bs bn)))
       (same ()  (let* ((tcrit    (tcritical (+ an bn -2) conf))
			(ssa      (- asq (/ (* as as) an)))
			(ssb      (- bsq (/ (* bs bs) bn)))
			(pooled   (/ (+ ssa ssb) (+ bn  an -2)))
			(sxasb    (sqrt (* pooled (+ (/ 1 an) (/ 1 bn)))))
			(tvalue   (abs (/ (- (/ bs bn) (/ as an)) sxasb))))
		   (oo tcrit tvalue)
		   (> tcrit tvalue))))
    (cond ((same)  0)    ; H0  : mean of a same as mean of b
	  ((less) -1)    ; H1a : mean of a <       mean of b 
	  (t       1)))) ; H1b : mean of a >       mean of b

(let ((ttable '((95 . ((   1  . 12.70   )
		       (   3  .  3.1820 )
		       (   5  .  2.5710 )
		       (  10  .  2.2280 )
		       (  20  .  2.0860 )
		       (  80  .  1.99   )
		       ( 320  .  1.97   )))
		(99 . ((   1  . 63.6570 )
		       (   3  .  5.8410 )
		       (   5  .  4.0320 )
		       (  10  .  3.1690 )
		       (  20  .  2.8450 )
		       (  80  .  2.64   )
		       ( 320  .  2.58   ))))))

 (defun tcritical (n conf)
   "Returns the t-test critical values. Keeps those
   values as a piecewise set of lines and intermediary
   values are interpolated between the points."
   (interpolates n (geta conf ttable)))
)


(defun ttest-demo (&optional (fudge 1))
  "implements the demo described at 
   http://www.cas.buffalo.edu/classes/psy/segal/2072001/ttests/t-tests1.html"
  (let ((one '(105 112  96 124 103  92 97  108 105 110))
	(two '( 98 108 114 106 117 118 126 116 122 108)))
    (setf one (mapcar #'(lambda (x) (* x fudge)) one))
    (ttest-from-lists one two)))

(egs :ttests
     (eg '(ttest-demo )
	 :out 0)
     (eg '(ttest-demo 1.1)
	 :out 0)
     (eg '(ttest-demo 1.2)
	 :out 1)
     )

;;;; report stuff
(defstruct dist 
  (min    most-positive-double-float) 
  (max    most-negative-double-float) 
  (bars   '()) 
  (sum    0)
  (sumsq  0)
  (n      0)
  (sorted t)
  (fuzz   1)
  (fat   nil)
  (all    '())
)

(defun dist-compare (d1 d2 &key (conf 95))
  "compares means of two independent samples d1 d2"
  (ttest (dist-sum d1) (dist-sumsq d1) (dust-n d1)
	 (dist-sum d2) (dist-sumsq d2) (dust-n d2)
	 :conf conf))

(defun as-dist (l)
  "Return a dist with all the numbers of l"
  (if (eq 'dist (type-of l))
      l
      (dist-adds l)))

(defun dist-adds (l &optional (d (make-dist)))
  "Add the numbers in the list 'l' to a dist."
  (dolist (x l d)
    (dist-add x d)))

(defun dist-mean (d)
  (/ (dist-sum d) (dist-n d)))

(defun dist-sd (d)
  (stdev (dist-n d) (dist-sum d) (dist-sumsq d)))

(defun dist-add (num &optional (d (make-dist)))
  (incf (dist-n      d))
  (incf (dist-sum    d) num)
  (incf (dist-sumsq  d) (* num num))
  (if (dist-fat d) (push num (dist-all d)))
  (if (< num (dist-min d)) (setf (dist-min d) num))
  (if (> num (dist-max d)) (setf (dist-max d) num))
  (let* ((fuzz (dist-fuzz d))
	 (num1 (* fuzz (round (/ num fuzz))))
	 (val  (cdr (assoc num1 (dist-bars d)))))
    (if val
	(setf (cdr (assoc num1 (dist-bars d))) (1+ val))
	(push (cons num1 1) (dist-bars d)))
    (setf (dist-sorted d) nil))
  d)

(defun dist-sort (d)
  (labels ((car< (a b) (< (car a ) (car b))))
    (unless (dist-sorted d) 
      (setf (dist-bars d) (sort (dist-bars d) #'car<)
	    (dist-sorted d) t))
    d))

(defun distogram (d0 &key 
		  header
		  (lwidth 5) (decimals 2) (rwidth 5)
		  (shrink 1)  (on "-") (pad 4) (str t))
  "Prints the dist 'd0' on stream 'str', shrinking the right hand side
  bars by 'shrink'. 'Header' is some text to show on top. The other 
  variables control details of how each line is printed."
  (let ((d (as-dist d0)))
    (unless (dist-sorted d)
      (dist-sort d))
    (let ((sum   (dist-sum  d))
	  (bars  (dist-bars d))
	  (min   (dist-min  d))
	  (max   (dist-max  d))
	  (n     (dist-n    d))
	  (fuzz  (dist-fuzz d))
	  (percentSum 0)
	  (fmt   
	   (format nil " ~~~a,~af * ~~~ad = ~~3d% ~~a " 
		   lwidth decimals rwidth)))
      (if header
	  (format t "~a~%" header))
      (dolist (bar bars t)
	(let* ((key     (car bar))
	       (value   (cdr bar))
	       (percent (floor (* 100 (/ value n))))
	       (stars   (round (/ value shrink)))
	       (halfp   (< percentSum 50)))
	  (incf percentSum percent)
	  (format str fmt
		  (+ key (/ fuzz 2)) 
		  value 
		  percent
		(if halfp "<" ">")
		)
	  (chars stars on)
	  (terpri str))))))
    
(defun demo-distogram ()
  (let ((log (make-dist :fuzz 2)))
    (dotimes (i 1000)
      (dist-add (sqrt (random 100)) log))
    (distogram log :shrink 10)
    ))

(egs :distogram
     (eg '(demo-distogram)))

;;;; ranking statistics

(defun showh (tmp)
  (let (out)
    (maphash #'(lambda (k v) (push (cons k v) out)) tmp)
    (mapc 'print (sort out #'(lambda (a b) 
			       (< (car a) (car b)))))))

(defun rank (l  &optional (ranks (make-hash-table)) (n 0))
  "Return a hash of the ranks in the sorted list of numbers 'l'.
  All numbers in a run of repeated entries get the average rank of that run."
    (if (null l)
      ranks
      (let (repeats sum now)
	(labels ((walk () (incf n) (pop l))
		 (new  () (setf repeats 1) (setf sum     n))
		 (same () (incf sum n) (incf repeats))
		 (spin () (when (eql now (car l)) 
			    (walk) (same) (spin))))
	  (setf now (walk))
	  (new)
	  (spin)
	  (setf (gethash now ranks) 
		(/ sum repeats))
	  (rank l ranks n)))))

(defun mann-whitney-demo ()
  (mann-whitney '(4.6   4.7   4.9  5.1   5.2   5.5  
		  5.8   6.1   6.5  6.5   7.2)
		'(5.2   5.3   5.4  5.6   6.2   6.3  
		  6.8   7.7   8.0  8.1)))
	
(defun mann-whitney-demo-big (&optional (fudge 1)) 
  "Generate two lists of 10000 random floats. Multiple the
  second  list by 'fudge'. Check if the median ranks of list one
  is the same as list two (return '0'), smaller than list two
  (return '-1'), or larger than list two (return '1')."
  (labels ((big (n s) (let (out) 
			(dotimes (i n out) 
			  (push (* s (random 100)) out)))))
    (let ((one (big 10000 1))
	  (two (big 10000 fudge)))
      (time (mann-whitney one two)))))

(defun mann-whitney (a b &optional (conf 95))
  "The 'method 1' of http://faculty.vassar.edu/lowry/ch11a.html."
  (labels ((as-ranks (l r) (mapcar #'(lambda (x) (gethash x r)) l))
	   (sum      (l)   (let ((s 0)) (dolist (x l s) (incf s x))))
	   (median   (l n) (let ((sorted (sort l #'<))
				 (midv (floor (/ n 2))))
			     (if (oddp n)
				 (nth midv sorted)
				 (/ (+ (nth (1- midv) sorted) 
				       (nth  midv     sorted)) 2)))))
    (let* ((all    (sort (copy-list (append a  b)) #'<))
	   (ranks  (rank all))
	   (ranksa (as-ranks a ranks))
	   (ranksb (as-ranks b ranks))
	   (na     (length a))
	   (nb     (length b))
	   (n      (+ na nb))	   
	   (tcrict (tcritical n conf))
	   (suma   (* 1.0 (sum ranksa)))
	   (ta     (/ (* na (+ n 1)) 2.0))
	   (sigma  (sqrt (/  (* na nb (+ n 1)) 12.0)))
	   (za     (/ (+ (- suma ta) 0.5) sigma)))
      ;(oo suma ta sigma tcrict) ;debug line
      (cond  ((< (abs za) tcrict)                        0)
	     ((< (median ranksa na) (median ranksb nb)) -1)
	     (t                                          1)))))