(ns code.utils.utils (:use (code.utils utils)) (:use (incanter core stats))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; GENERAL FUNCTIONS ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defn sane-numbers [data] "Ensures that no noise is used in equal frequency or width binning" (sort (filter #(number? %) data))) (defn dummy [d nm] (-> (nrow d) (repeat nm) (matrix) (bind-columns d))) (defn member1? [item lst] "Uses recursion and give stackoverflow error for large datasets" (if (empty? lst) 'nil (if (= item (first lst)) true (member1? item (rest lst))))) (defn member? [item lst] (loop [l lst] (if (empty? l) 'nil (if (= item (first l)) true (recur (rest l)))))) (defn file-result [filename results] (binding [*out* (java.io.FileWriter. filename)] (loop [rlt results ans ""] (if (empty? rlt) (println "") (recur (rest rlt) (println (first rlt))))) (flush))) (defn Transpose [data] (apply vector (apply #'map (cons #'vector data)))) (defn get-median [col] (nth (quantile col) 2)) (defn get-spread [col] (- (nth (quantile col) 3) (nth (quantile col) 1))) (defn FindPos [val lst counter] "Recursive method counter starts at -1" (if (= (not-empty lst) nil) counter (if (= val (first lst)) (inc counter) (FindPos val (rest lst) (inc counter))))) (defn FindPos1 [val lst counter] "Loop used to avoid stackoverflow: if val is not in lst there is an infinite loop. counter starts at 0" (loop [l lst cnum counter] (if (= val (first l)) cnum (recur (rest l) (inc cnum))))) (defn extract-unique [lst] (loop [l lst result []] (if (empty? l) (matrix (remove #(= 'none %) result)) (recur (rest l) (conj result (if (member? (first l) result) 'none (first l))))))) (defn add-labels [clusters] (loop [i 0 c clusters result []] (if (empty? c) (matrix (apply concat result)) (recur (inc i) (rest c) (conj result (let [ans (first c)] (bind-columns ans (matrix (repeat (nrow ans) i))))))))) (defn notX [X data] "X represents the attribute numbers of attributes to keep or get rid of. Its a vector. NotX are the others" (loop [d data result []] (if (empty? d) (apply vector (remove #(= nil %) result)) (recur (rest d) (conj result (if (not (member? (first d) X)) (first d) nil)))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; COMPRESS FUNCTIONS ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defn n-elts [elt n] (if (> n 1) (list n elt) (list 1 elt))) (defn compr1 [elt n lst] "Uses recursion and give stackoverflow error for large datasets" (if (empty? lst) (list (n-elts elt n)) (let [after (first lst)] (if (= after elt) (compr1 elt (+ n 1) (rest lst)) (cons (n-elts elt n) (compr1 after 1 (rest lst))))))) (defn compr [lst] (let [l (group-by second (sort-by second (map #(vector 1 %) lst)))] (map #(list (count (second %)) (first %)) l))) (defn compress [lst] "Accepts list of sorted values and returns unique values along with the number of times they occur in the list" (if (or (= (count lst) 0) (= (count lst) 1)) lst (compr lst))) (defn compress1 [lst] "Accepts list of sorted values and returns unique values along with the number of times they occur in the list" (if (or (= (count lst) 0) (= (count lst) 1)) lst (compr1 (first lst) 1 (rest lst)))) (defn unique-compress [coll] "Accepts a compressed list and returns only the unique values" (map #(second %) coll)) (defn uc [lst] (unique-compress (compress (sort lst)))) (defn k-majority [col] "Accepts a list of values and returns the one which occurs most often" (second (first (reverse (sort-by first (compress (sort col))))))) (defn same? [col] (let [mysame?(if (= 1 (nrow (compress (sort col)))) true false)] mysame?)) (defn mymode [coll] (last (last (sort-by first (compress (sort coll)))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; DISTANCE FUNCTIONS ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;Distance for discrete data (defn EuclideanDistance [[Oi Oj _]] (let [k (if (= (last Oi) (last Oj)) 0 1)] (Math/sqrt (apply + (map #(Math/pow % 2) (conj (map - (butlast Oi) (butlast Oj)) k )))))) (defn EuclideanDistance2 [[Oi Oj _]] (Math/sqrt (apply + (map #(if (= %1 %2) 0 1) Oi Oj)))) (def EuclideanDistance (memoize EuclideanDistance)) ;Distance for continuous data (defn EuclideanDistance1 [[Oi Oj _]] ; (let [k (if (= (last Oi) (last Oj)) 0 1)] (Math/sqrt (apply + (map #(Math/pow % 2) (map - Oi Oj) )))) ;Euclidean Distance between projections (defn ProjectionED [[Oi Oj xi xj]] (Math/sqrt (- (Math/pow (EuclideanDistance [Oi Oj]) 2) (Math/pow (- xi xj) 2)))) (defn numeric-euclidean-distance! [one two] (if (= (ncol one) (ncol two)) (let [on1 (last one) tw2 (last two) c1 (Math/pow (if (= on1 tw2) 0 1) 2)] (loop [di (transient []) i 0] (if (> (- (count one) 1) i) (recur (conj! di (Math/pow (- (nth one i) (nth two i)) 2)) (inc i)) (Math/sqrt (apply + (conj (persistent! di) c1)))))) (loop [di (transient []) i 0] (if (< i (min (count one) (count two))) (recur (conj! di (Math/pow (- (nth one i) (nth two i)) 2)) (inc i)) (Math/sqrt (apply + (persistent! di))))))) (def numeric numeric-euclidean-distance!) (defn cat-euclidean-distance [one two] (loop [i 0 result (transient [])] (if (>= i (min (count one) (count two))) (Math/sqrt (apply + (persistent! result))) (recur (inc i) (conj! result (if (= (nth one i) (nth two i)) 0 1)))))) (def cat cat-euclidean-distance) (defn get-nearest-distance [one data distance] "Returns distance of nearest like or unlike neighbour" (second (second ;first (sort-by second (map #(vector % (distance one %)) data))))) (defn get-farthest-distance [one data distance] "Returns distance of nearest like or unlike neighbour" (second (last (sort-by second (map #(vector % (distance one %)) data))))) (defn get-nearest [one data distance] ; (let [data1 (if (= (nrow data) 1) [data] data)] (first (sort-by second (map #(vector % (distance one %)) data))));) (defn get-farthest [one data distance] (last (sort-by second (map #(vector % (distance one %)) data)))) (defn get-nearest-k [one data distance k] (take k (sort-by second (map #(vector % (distance one %)) data)))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; shuffle1 FUNCTIONS ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defn shuffle10 [X] (loop [i 0 result (transient [])] (if (= i (nrow X)) (persistent! result) (recur (inc i) (conj! result (rand-int (nrow X))))))) (defn shuffle1 [X] (let [idx (shuffle10 X) bcols (bind-columns idx X) scols (matrix (sort-by first (filter #(> (ncol %) 0) bcols))) Xcols (sel scols :cols (range 1 (ncol scols)))] Xcols)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; SWAP FUNCTIONS ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defn myswap [l Dl] (let [unique-l (uc (to-vect Dl)) swap-l (fn [item] (let [new-unique-l (remove #(= % item) unique-l) new-item (if (= (count new-unique-l) 1) (first new-unique-l) (first (shuffle1 new-unique-l)))] new-item))] (matrix (map #(swap-l %) l)))) (defn swap-class [num D] "Takes the entire data set and only swaps the last column" (let [att (sel D :cols (range 0 (- (ncol D) 1))) klass (sel D :cols (- (ncol D) 1)) klass10 (matrix (take (* num (count klass)) klass)) klass90 (matrix (drop (* num (count klass)) klass)) klass-new (bind-rows (myswap klass10 klass) klass90)] (bind-columns att klass-new))) (defn swap-attributes1 [num1 D] "selects random attribute value for replacement" (let [independent (sel D :cols (range (dec (ncol D)))) klass (sel D :cols (- (ncol D) 1)) change-x (fn [x one] ;x is value to be changed (let [notx0 (filter #(not= x %) (to-vect one)) notx (if (empty? notx0) x (first (shuffle notx0)))] notx)) swap-one (fn [one] ;column (let [idx (range (count one)) nump (Math/ceil (* num1 (count one))) get-idx (sample idx :size nump :replacement false)] (loop [i 0 result []] (if (= i (count one)) result (recur (inc i) (conj result (if (member? i get-idx) (change-x (nth one i) one) (nth one i))))))))] (bind-columns (trans (map #(swap-one %) (trans independent))) klass))) (defn swap-attributes [num1 D] "selects mode of remaining attribute values for replacement" (let [independent (sel D :cols (range (dec (ncol D)))) klass (sel D :cols (- (ncol D) 1)) change-x (fn [x one] ;x is value to be changed (let [notx0 (filter #(not= x %) (to-vect one)) notx (if (empty? notx0) x (mymode notx0))] notx)) swap-one (fn [one] ;column (let [idx (range (count one)) nump (Math/ceil (* num1 (count one))) get-idx (sample idx :size nump :replacement false)] (loop [i 0 result []] (if (= i (count one)) result (recur (inc i) (conj result (if (member? i get-idx) (change-x (nth one i) one) (nth one i))))))))] (bind-columns (trans (map #(swap-one %) (trans independent))) klass))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; RANDOM FUNCTIONS ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defn initial-egs [data] "Returns a random eg for each class" (loop [d (shuffle1 data) goals (uc (to-vect (sel data :cols (- (ncol data) 1)))) result []] (if (empty? goals) result (recur d (rest goals) (conj result (first (filter #(= (last %) (first goals)) d))))))) (defn make-random-data [D n] "Make n random datasets from D" (loop [i 0 results (transient [])] (if (= i n) (persistent! results) (recur (inc i) (conj! results (shuffle1 D)))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; CLUSTER FUNCTIONS ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defn centroid [cluster] (if (= (nrow cluster) 1) cluster (let [t-cluster (to-vect (trans cluster)) C (matrix cluster) C-noclass (sel C :cols (range 0 (- (ncol C) 1))) klass (matrix [(k-majority (last t-cluster))]) ans1 (bind-columns (div (reduce plus C-noclass) (count cluster)) klass) ans2 (if (= (count ans1) 1) (first ans1) ans1)] ans2))) (defn mysummary [cluster distance] (let [clus (sel cluster :cols (range 0 (dec (ncol cluster)))) cent (centroid clus) piv1 (get-farthest cent (if (= (nrow clus) 1) [clus] clus) distance) piv2 (get-farthest (first piv1) (if (= (nrow clus) 1) [clus] clus) distance) dis (get-nearest-distance cent (if (= (nrow clus) 1) [clus] clus) distance)] ;(min (second piv1) (second piv2))] ;(if (apply = (map last cluster)) ;(get-farthest cent (if (= (nrow clus) 1) [clus] clus) distance)] ;(get-farthest-distance cent (filter #(not= (last cent) (last %)) (if (= (nrow cluster) 1) [cluster] cluster)) distance))] [cent dis])) (defn mygroup [data] (let [goals (uc (to-vect (sel data :cols (- (ncol data) 1))))] (loop [g goals result (transient [])] (if (empty? g) (persistent! result) (recur (rest g) (conj! result (apply vector (filter #(= (last %) (first g)) (to-vect data))))))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;Getting queries from csv files ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defn myqueries [data] (apply vector (map #(to-vect (matrix (partition 2 %))) data))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;PR ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defn mu-mle [data n] (div (apply plus data) n)) (defn sig-mle [data n] (let [mu (mu-mle data n) one (fn [x] (mult (minus x mu) (minus x mu)))] (loop [d data result []] (if (empty? d) (div (apply plus result) n) (recur (rest d) (conj result (one (first d)))))))) (defn db1 [data c1 c2 c3 c4 c] (let [mydb (fn [one] (let [x1 (first one) x2 (second one)] (+ (* c1 (pow x1 2)) (* c2 (pow x2 2)) (* c3 x1) (* c4 x2) c))) klass (fn [one] (if (> (mydb one) 0) 1 2))] (loop [d data result []] (if (empty? d) ; (compress (sort result)) result (recur (rest d) (conj result (klass (first d)))))))) (defn dem02 [lst] (loop [l lst fcount 0 scount 0] (if (empty? l) [fcount scount] (recur (rest l) (if (= (first l) 1) (inc fcount) fcount) (if (= (first l) 2) (inc scount) scount))))) ;specific to imox_data (defn section [data] (loop [d data result []] (if (empty? d) result (recur (matrix (drop 48 d)) (conj result (matrix (take 48 d))))))) (defn imox-train-test-data [data] (let [imox4 (section data) imox-train (loop [im imox4 result []] (if (empty? im) result (recur (rest im) (conj result (sel (sel (first im) :rows (range 0 24)) :cols (range 0 8)))))) tests-w-class (loop [im imox4 result []] (if (empty? im) result (recur (rest im) (conj result (sel (sel (first im) :rows (range 24 48)) :cols (range 0 9)))))) imox-test (loop [twc tests-w-class result []] (if (empty? twc) (matrix (apply concat result)) (recur (rest twc) (conj result (sel (first twc) :cols (range 0 8))))))] [imox-train imox-test (matrix (apply concat tests-w-class))])) (defn sigs [train-data] (loop [td train-data result []] (if (empty? td) result (recur (rest td) (conj result (covariance (first td))))))) (defn mus [train-data] (loop [td train-data result []] (if (empty? td) result (recur (rest td) (conj result (mu-mle (first td) 24)))))) (defn likelihood [x mu sig] (* (/ 1 (pow (* 2 Math/PI) 4)) (/ 1 (sqrt (det sig))) (exp (* (/ -1 2) (mmult (minus x mu) (solve sig) (trans (minus x mu))))))) (defn imox-trainer [x MU SIG] (loop [mu MU sig SIG result []] (if (empty? mu) result (recur (rest mu) (rest sig) (conj result (likelihood x (first mu) (first sig))))))) (defn imox-classifier [x MU SIG] (let [likes (imox-trainer x MU SIG) px (* 0.25 (apply + likes)) posteriors (map #(/ (* 0.25 %1) px) likes) classify (second (first (reverse (sort-by first (map #(vector %1 %2) posteriors [1 2 3 4])))))] classify)) (defn demo01 [D] (let [data (imox-train-test-data D) MU (mus (first data)) SIG (sigs (first data)) X (second data) got (loop [x X result []] (if (empty? x) result (recur (rest x) (conj result (imox-classifier (first x) MU SIG))))) want (trans (sel (nth data 2) :cols (- (ncol D) 1)))] [got want]))