(ns code.utils.preprocess
  (:use (code.utils utils))
  (:use (incanter core stats)))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; CROSS-VALIDATION FUNCTIONS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; FOLD FUNCTIONS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(defn folds
  "Returns data separated into n bins.

  The result is a vector of n maps, one per fold, with the keys
    :index  the fold number (0-based)
    :test   the rows of that fold
    :train  a matrix built from the rows of every other fold
  Each fold first receives floor(nrow / n) consecutive rows; when those
  chunks do not account for every row, the leftover rows are appended one
  per fold, starting with the first fold."
  [n data]
  (let [break (Math/floor (/ (nrow data) (* 1.0 n)))
        ;; Cuts data into n consecutive chunks of `break` rows.
        folds0 (fn []
                 (loop [d data
                        result []]
                   (if (= (nrow result) n)
                     (if (= (apply + (map nrow result)) (nrow data))
                       result
                       ;; Rows are left over in d: hand one to each leading fold.
                       ;; NOTE(review): a single remaining row is wrapped in a
                       ;; vector, presumably so it is walked as one row rather
                       ;; than element by element -- confirm against incanter.
                       (loop [dat (if (= (nrow d) 1) [d] d)
                              r result
                              ans []]
                         (if (empty? dat)
                           (apply vector (concat ans r))
                           (recur (rest dat)
                                  (rest r)
                                  (conj ans (conj (first r) (first dat)))))))
                     (recur (if (<= (nrow d) break) d (matrix (drop break d)))
                            (conj result
                                  (if (<= (nrow d) break)
                                    d
                                    (matrix (take break d))))))))
        ;; Pairs every fold with the concatenation of all the other folds.
        folds1 (fn []
                 (let [nfold (folds0)]
                   (vec
                     (map (fn [i]
                            {:index i
                             :test  (nth nfold i)
                             ;; Exclude the test fold by position, not by
                             ;; value, so that another fold with identical
                             ;; contents is not dropped from the training set.
                             :train (matrix
                                      (apply concat
                                             (keep-indexed
                                               (fn [j fold] (when (not= j i) fold))
                                               nfold)))})
                          (range n)))))]
    (folds1)))

;(def folds (memoize folds))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; FILTER FUNCTIONS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(defn k-matrix
  "Returns matrix where each row has a majority k class where k is 1, 3, or 5.

  A row of data is kept when its last element equals the value that
  k-majority (defined elsewhere) reports for the last elements of its k
  nearest rows. Nearness is `distance` applied to two rows with their last
  element removed. Prints an error message and returns nil when the
  resulting matrix is nil."
  [k data distance]
  (let [;; The k [row distance] pairs closest to `one`.
        k-nearest0 (fn [one]
                     (take k
                           (sort-by second
                                    (map #(vector % (distance (butlast one) (butlast %)))
                                         (to-vect data)))))
        ;; Last-column values of the nearest rows; a lone neighbour is
        ;; repeated three times before the column is extracted.
        k-nearest1 (fn [one]
                     (let [pt1 (k-nearest0 one)
                           pt (if (= (nrow pt1) 1) (repeat 3 (first pt1)) pt1)
                           nearest-k (last (trans (map first pt)))]
                       (to-vect nearest-k)))
        agrees? (fn [row]
                  (= (last row) (k-majority (k-nearest1 row))))
        ;; Built once (the original constructed this matrix twice).
        result (matrix (filter agrees? data))]
    (if (not= nil result)
      result
      (println "Error: k-matrix fn returned nil, need to reduce value or k"))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; DATASETS FUNCTIONS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(defn sift
  "Selects rows of data by comparing their last element with the last
  element of `one`. When type is the symbol nuns the rows whose last element
  differs are returned; for any other type the rows whose last element is
  equal are returned. nil rows are always dropped. Returns a lazy seq."
  [one data type]
  (let [klass (last one)
        wanted? (if (= type 'nuns)
                  #(not= klass (last %))
                  #(= klass (last %)))]
    (filter wanted? (remove nil? data))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; EQUAL FREQUENCY BINNING FUNCTIONS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Scratch state shared by breaks10 / breaks1 / breaks.
;; OUT collects the break values found so far; N counts the items seen since
;; the last recorded break. Both are re-defined while the functions run, so
;; these functions must not be used from several threads at once.
(def OUT [])
(def N 1)

(defn breaks10
  "Recursive break finder. x is the current item, l the items still to
  visit and b4 the previous item. Whenever x differs from b4 and at least
  max-size items have been counted, x is appended to OUT and N restarts at
  0. The last item is appended to OUT unless member? reports it is already
  there. Works purely through the OUT and N vars."
  [x l b4 max-size]
  (when-not (= x b4)
    (when (>= N max-size)
      (def N 0)
      (def OUT (conj OUT x))))
  (cond
    (not-empty l) (do
                    (def N (inc N))
                    (breaks10 (first l) (rest l) x max-size))
    :else (when-not (member? x OUT)
            (def OUT (conj OUT x)))))

(defn breaks1
  "Loop form of breaks10 with the same arguments and the same effect on OUT
  and N: x is the current item, l the items still to visit, b4 the previous
  item and max-size the number of items wanted per bin."
  [x l b4 max-size]
  ;; Start from l itself and from the supplied b4. The previous version began
  ;; at (rest l) and forced b4 to nil, so the first element of l (the second
  ;; element of the caller's list) was never examined or counted.
  (loop [x x
         lx l
         b4 b4]
    (when-not (= x b4)
      (when (>= N max-size)
        (def N 0)
        (def OUT (conj OUT x))))
    (cond
      (not-empty lx) (do
                       (def N (inc N))
                       (recur (first lx) (rest lx) x))
      :else (when-not (member? x OUT)
              (def OUT (conj OUT x))))))

(defn efb10
  "Recursive bin lookup: returns the first value of break-at that is >= x.
  The symbol ? is returned unchanged. break-at must contain a value >= x;
  otherwise x ends up being compared with nil."
  [x break-at]
  (if (= x '?)
    x
    (if (<= x (first break-at))
      (first break-at)
      (efb10 x (rest break-at)))))

(defn efb1
  "Loop form of efb10: returns the first value of break-at that is >= x.
  The symbol ? is returned unchanged. break-at must contain a value >= x;
  otherwise x ends up being compared with nil."
  [x break-at]
  (if (= x '?)
    x
    (loop [bk break-at]
      (if (<= x (first bk))
        (first bk)
        (recur (rest bk))))))

(defn breaks
  "Returns the vector of break values that breaks1 collects for the items
  of l with max-size items per bin. OUT and N are put back to their initial
  values afterwards, also when breaks1 throws."
  [l max-size]
  (try
    (breaks1 (first l) (rest l) nil max-size)
    OUT
    (finally
      (def OUT []) ;reset
      (def N 1)))) ;reset

(defn efb
  "Equal frequency binning of one column. Every datum is replaced by the
  break value returned by efb1; the breaks are computed from
  (sane-numbers data), which is defined elsewhere. Returns a lazy seq.
  nb = no. of bins (default 10)."
  ([data] (efb data 10))
  ([data nb]
   (let [numbers (sane-numbers data)
         want (Math/round (/ (count numbers) (* nb 1.0)))
         break-at (breaks numbers want)]
     (map (fn [datum] (efb1 datum break-at)) data))))

(defn efb2
  "Applies efb with n bins (default 10) to every element of
  (to-vect (trans data)) and returns the results in a vector."
  ([data] (efb2 data 10))
  ([data n]
   (vec (map #(efb % n) (to-vect (trans data))))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; DISCRETIZE FUNCTIONS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;send in all the columns that need to be discretized, with no. of bins and the dataset
;if kols is empty then user wants the entire dataset discretized except for class col
(defn discretize
  "Bins the columns of data whose index is in kols into n bins with efb and
  leaves the other columns untouched. With empty kols every column except
  the last one is binned. Returns (trans ...) of the resulting columns."
  [data kols n]
  (let [;; Computed once instead of once per column.
        targets (if (empty? kols)
                  (range 0 (- (ncol data) 1))
                  kols)
        columns (map-indexed
                  (fn [i kol]
                    (if (member? i targets)
                      (apply vector (efb kol n))
                      kol))
                  (to-vect (trans data)))]
    (trans (vec columns))))

(defn for-numeric-cliff
  "Walks binned and original in step and returns a matrix of the items of
  original whose counterpart in binned equals one."
  [one binned original]
  (matrix
    (remove nil?
            (map (fn [b o] (when (= one b) o))
                 binned
                 original))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; BORE FUNCTIONS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(defn my-best-rest
  "data = binned version = a.

  For every group returned by (mygroup data) (defined elsewhere) produces
  the pair [group others], where others is a matrix of the rows of all
  groups that are not equal to it. Returns a vector of those pairs."
  [data]
  (let [group-it (mygroup data)]
    (vec
      (map (fn [g]
             (vector g
                     (matrix (apply concat (filter #(not= g %) group-it)))))
           group-it))))

(defn bin-rank
  "D = binned data set.

  Scores the value val of column col. With
    like(best) = freq(val in best) * nrow(best)/nrow(D)
    like(rest) = freq(val in rest) * nrow(rest)/nrow(D)
  the rank is like(best)^2 / (like(best) + like(rest)).
  Returns [val rank col]. The rank is 0.0 when val occurs in neither best
  nor rest, where the formula would otherwise divide by zero."
  [D val col best rest]
  (let [pbest (/ (nrow best) (nrow D))
        prest (/ (nrow rest) (nrow D))
        ;; Fraction of rows whose col-th entry equals val (0 without a hit).
        freq (fn [rows]
               (let [hits (count (filter #(= (nth % col) val) rows))]
                 (if (zero? hits)
                   0
                   (/ hits (nrow rows)))))
        likebestE (* (freq best) pbest)
        likerestE (* (freq rest) prest)
        total (+ likebestE likerestE)
        rank (if (zero? total)
               0.0
               (/ (Math/pow likebestE 2) total))]
    [val rank col]))

(defn get-ranks-only
  "For every column of D except the last one, returns the bin-rank triples
  of the values that uc (defined elsewhere) yields for that column.
  Returns one seq of triples per column."
  [D best rest]
  (map (fn [col]
         (map #(bin-rank D % col best rest)
              (uc (to-vect (sel D :cols col)))))
       (range 0 (- (ncol D) 1))))

(defn rank-vals
  "Returns the top-ranked [val rank col] triple of every column of D except
  the last one, ordered from the highest rank to the lowest."
  [D best rest]
  (let [per-column (get-ranks-only D best rest)
        best-first (map #(reverse (sort-by second %)) per-column)]
    (reverse (sort-by second (map first best-first)))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; FASTMAP FUNCTIONS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; State shared by FastMap, ChooseObjects and ProjectVals. FastMap re-defines
;; these vars while it runs and restores the values below when it finishes,
;; so FastMap must not be run from several threads at once.
(def *col* -1)
(def *projection-matrix* [])
(def *pivots* [])

(declare ChooseObjects ProjectVals)

(defn FastMap
  "Projects dataset onto k dimensions. Each step picks two pivots with
  ChooseObjects, projects every row with ProjectVals and stores the result
  as a new entry of *projection-matrix*. The last column of dataset is left
  out of the distance computations and appended to the projected columns
  before the result is passed through Transpose (defined elsewhere).
  fnc is the distance function of the first step (default
  EuclideanDistance1); later steps always use ProjectionED."
  ;[dataset k fnc]
  ([dataset k]
   (FastMap dataset k EuclideanDistance1))
  ([dataset k fnc]
   (let [rand-data dataset
         ;; everything but the last column
         optdata (Transpose (reverse (pop (reverse (Transpose rand-data)))))
         ;; the last column
         score (peek (reverse (Transpose rand-data)))]
     (if (<= k 0)
       (let [result *projection-matrix*]
         (def *projection-matrix* [])
         (def *pivots* [])
         (def *col* -1)
         ;; previously: (Transpose result)
         (Transpose (conj result score)))
       (do
         (def *col* (inc *col*))
         (let [pivots (ChooseObjects optdata fnc)
               proj-vals (ProjectVals pivots optdata fnc)]
           (def *pivots* (conj *pivots* pivots))
           (def *projection-matrix* (conj *projection-matrix* proj-vals))
           (FastMap rand-data (- k 1) ProjectionED)))))))

; ChooseObjects
(defn FarPoint
  "Returns (position row) for the row of dataset that is farthest from Oi
  according to fnc. xi is the value paired with Oi and col holds the value
  paired with each row of dataset; fnc is called with [Oi row xi x].
  position comes from FindPos (defined elsewhere)."
  [Oi xi dataset fnc col]
  (let [scored (map (fn [row x]
                      (list (FindPos row dataset -1) row (fnc [Oi row xi x])))
                    dataset
                    col)]
    (take 2 (last (sort-by last scored)))))

(defn ChooseObjects
  "Picks two pivots: the row farthest from a randomly chosen row, and the
  row farthest from that first pivot. Each pivot is a (position row) pair
  as returned by FarPoint. The most recent entry of *projection-matrix*
  (all zeros when it is still empty) supplies the value paired with each
  row."
  [dataset fnc]
  (let [length (count dataset)
        counter (Math/round (rand (- length 1))) ; is now k
        col (if (empty? *projection-matrix*)
              (repeat length 0)
              (nth *projection-matrix* (dec *col*)))
        pivot1 (FarPoint (nth dataset counter) (nth col counter) dataset fnc col)
        pivot2 (FarPoint (second pivot1) (nth col (first pivot1)) dataset fnc col)]
    [pivot1 pivot2]))

; Get xi from eqn
(defn ProjectVal
  "Projection of Oi onto the line through the two pivots:
    (d(a,Oi)^2 + d(a,b)^2 - d(b,Oi)^2) / (2 * d(a,b)^2)
  where a and b are the pivot rows and d is fnc. proj-pivots holds the
  values paired with the two pivots and xi the value paired with Oi.
  Returns 0.0 when the pivots are at distance 0 from each other, where the
  formula would otherwise yield NaN or an infinity."
  [pivots Oi proj-pivots xi fnc]
  (let [row-a (second (first pivots))
        row-b (second (second pivots))
        x-a (first proj-pivots)
        x-b (second proj-pivots)
        distance-a (Math/pow (fnc [row-a Oi x-a xi]) 2)
        distance-b (Math/pow (fnc [row-b Oi x-b xi]) 2)
        distance-c (Math/pow (fnc [row-a row-b x-a x-b]) 2)]
    (if (zero? distance-c)
      0.0
      (/ (- (+ distance-a distance-c) distance-b)
         (* 2 distance-c)))))

(defn ProjectVals
  "Applies ProjectVal to every row of dataset, pairing each row with its
  value in the most recent entry of *projection-matrix* (all zeros when it
  is still empty). Returns a lazy seq."
  [pivots dataset fnc]
  (let [col (if (empty? *projection-matrix*)
              (repeat (count dataset) 0)
              (nth *projection-matrix* (dec *col*)))
        proj-pivots (list (nth col (first (first pivots)))
                          (nth col (first (second pivots))))]
    (map #(ProjectVal pivots %1 proj-pivots %2 fnc) dataset col)))

(defn for-weka-numeric0
  "x = value. Returns 0.000001 for zero and x * 1.000001 for every other
  value for which x/x is 1; anything else is returned unchanged."
  [x]
  ;; zero? and == compare numerically. The previous (= x 0) and
  ;; (= (/ x x) 1) are false for every double, which made this function
  ;; return doubles unchanged.
  (cond
    (zero? x) 0.000001
    (== (/ x x) 1) (* x 1.000001)
    :else x))

(defn for-weka-numeric1
  "x1 = instance. Applies for-weka-numeric0 to every element except the
  last one, puts the last element back unchanged and returns (trans ...)
  of the result."
  [x1]
  (let [klass (last x1)
        indy (butlast x1)]
    (trans (conj (apply vector (map for-weka-numeric0 indy)) klass))))

(defn pre-data
  "Runs every row of data through for-weka-numeric1 and returns a matrix of
  the results. A row whose last element is zero keeps that element; in
  every other row the last element is replaced by 1."
  [data]
  (matrix
    (vec
      (map (fn [row]
             (let [klass (last row)
                   adjusted (for-weka-numeric1 row)]
               ;; Numeric test: (= klass 0) is false for the double 0.0, so
               ;; rows with a double class value were always relabelled 1.
               (if (and (number? klass) (zero? klass))
                 adjusted
                 (conj (apply vector (butlast adjusted)) 1))))
           data))))