(ns code.privacy.datafly
  (:use (code.utils utils preprocess))
  (:use (incanter core stats)))

;; Frequency information for one column `qi`.
;; freq-tbl is (compress (sort (to-vect qi))); per the original author's note
;; each entry is [count value] (compress/to-vect come from code.utils).
;; Returns [n new-freq-tbl] where
;;   n            - number of entries in freq-tbl
;;   new-freq-tbl - for every element of qi, in order, the first freq-tbl
;;                  entry whose second item equals that element.
(defn datafly-freq [qi]
  (let [freq-tbl (compress (sort (to-vect qi)))
        freq-count (count freq-tbl)
        freq-map (fn [x] (first (filter #(= x (second %)) freq-tbl)))
        ;; first is the count, second the value
        new-freq-tbl (loop [q qi result []]
                       (if (empty? q)
                         result
                         (recur (rest q) (conj result (freq-map (first q))))))]
    [freq-count new-freq-tbl]))

;; Picks the column indices to generalise.
;; Walks the rows of (trans qis), i.e. the columns of qis, and records
;; [column-index freq-count] for each. The pairs are ordered by freq-count,
;; largest first, indices contained in `sav` (tested with member?) are
;; dropped, and at most `q` indices are returned as a vector.
(defn datafly-qis [qis q sav]
  (loop [qi (trans qis) i 0 result []]
    (if (empty? qi)
      (apply vector
             (map first
                  (take q
                        (filter #(not (member? (first %) sav))
                                (reverse (sort-by second result))))))
      (recur (rest qi)
             (inc i)
             (conj result (vector i (first (datafly-freq (first qi)))))))))

;; Pairs each index in qis-idx with the corresponding item of
;; (efb2 selected-qids), giving a seq of [index item] vectors.
;; NOTE(review): presumably efb2 yields one generalised column per selected
;; column - confirm against code.utils.
(defn datafly-hier [selected-qids qis-idx]
  (map #(vector %1 %2) qis-idx (efb2 selected-qids)))

;; Handles the first selected column, (first qis-idx).
;; Original author's notes: k=[2 4]; check that each value's count is
;; divisible by k and delete the surplus; returns data with removed rows and
;; an anonymised first column.
;;
;; Step 1 (one-iteration): every element whose freq-tbl count is below k is
;;   replaced by the element at the same position of the first hierarchy
;;   column, (second (first hier)); other elements keep their value.
;; Step 2 (remove-vals1): [count-mod-k value] pairs for the step-1 column,
;;   ordered by value.
;; Result:
;;   type = 'other   -> data with the column replaced, where every value
;;                      whose count is not a multiple of k becomes -10000.
;;   any other type  -> a vector with one seq of rows per distinct step-1
;;                      value, each with its first (count mod k) rows dropped.
;; Only the requested variant is built.
(defn datafly-first-qi [qis-idx k data hier type] ;from top
  (let [qi-col (first qis-idx)
        idx-first (sel data :cols qi-col)
        new-freq-tbl (second (datafly-freq idx-first))
        one-iteration (loop [nft new-freq-tbl i 0 result []]
                        (if (empty? nft)
                          result
                          (recur (rest nft)
                                 (inc i)
                                 (conj result
                                       (if (< (first (first nft)) k)
                                         (nth (second (first hier)) i)
                                         (second (first nft)))))))
        remove-vals1 (sort-by second
                              (map #(vector (mod (first %) k) (second %))
                                   (compress (sort one-iteration))))
        ;; data with column qi-col swapped for `col`
        with-first-col (fn [col]
                         (bind-columns (sel data :cols (range qi-col))
                                       (matrix col)
                                       (sel data :cols (range (inc qi-col) (ncol data)))))]
    (if (= type 'other)
      (let [two-iteration (loop [i1 one-iteration result3 []]
                            (if (empty? i1)
                              result3
                              (recur (rest i1)
                                     (conj result3
                                           (if (> (first (first (filter #(= (first i1) (second %))
                                                                        remove-vals1)))
                                                  0)
                                             -10000
                                             (first i1))))))]
        (with-first-col two-iteration))
      (let [new-data (with-first-col one-iteration)
            remove-vals2 (group-by #(nth % qi-col) new-data)]
        ;; groups are visited in key order, matching remove-vals1's value order
        (loop [rv2 (sort remove-vals2) rv1 remove-vals1 result2 []]
          (if (empty? rv2)
            result2
            (recur (rest rv2)
                   (rest rv1)
                   (conj result2 (drop (first (first rv1)) (second (first rv2)))))))))))

;; Generalised column for one partition.
;;   lst  - the partition's values in column idx
;;   hier - seq of [index column] pairs (see datafly-hier)
;;   idx  - column index whose hierarchy entry is used
;;   idxr - positions to read from the hierarchy columns
;; Returns lst unchanged when all its values are equal. Otherwise tries the
;; hierarchy column h1, then (efb h1), then (efb (efb h1)), returning the
;; first whose values at idxr are all equal; if none qualifies, returns a
;; column of -10000 with as many entries as lst.
;; NOTE(review): efb is an opaque code.utils helper - presumably a coarser
;; binning step; confirm.
(defn datafly-general [lst hier idx idxr]
  (if (apply = lst)
    lst
    (let [h1 (second (first (filter #(= (first %) idx) hier)))
          h2 (efb h1)
          h3 (efb h2)
          generalize1 (matrix (map #(nth h1 %) idxr))
          generalize2 (matrix (map #(nth h2 %) idxr))
          generalize3 (matrix (map #(nth h3 %) idxr))]
      (cond
        (apply = generalize1) generalize1
        (apply = generalize2) generalize2
        (apply = generalize3) generalize3
        :else (matrix (repeat (count lst) -10000))))))

;; Generalised column idx for all partitions.
;;   one - data after the first iteration: a collection of row groups
;; Each group is sorted by column idx, the groups are concatenated into one
;; matrix, and the rows are cut into consecutive partitions of k rows
;; (partition discards an incomplete trailing chunk). Returns a vector with
;; one datafly-general result per partition.
;; NOTE(review): the positions (range i (+ i k)) index the hierarchy column,
;; which datafly builds from the data before rows are dropped or sorted -
;; confirm that this alignment is intended.
(defn datafly-partition [one k hier idx] ;one = data after first iter
  (let [part0 (loop [x one result []]
                (if (empty? x)
                  (matrix (apply concat result))
                  (recur (rest x)
                         (conj result (sort-by #(nth % idx) (first x))))))
        part (partition k part0)]
    (loop [p part i 0 result []]
      (if (empty? p)
        result
        (recur (rest p)
               (+ i k)
               (conj result
                     (datafly-general (sel (matrix (first p)) :cols idx)
                                      hier
                                      idx
                                      (range i (+ i k)))))))))

;; Entry point.
;;   data1 - input matrix; columns 0-20 are read, and column 10 is passed
;;           through efb before any further processing
;;   k     - partition size (original note on datafly-first-qi: k=[2 4])
;;   q     - number of columns to generalise (original note: q=2 and 3; the
;;           wrappers in this file pass 4)
;; Columns 10 and 20 are never selected for generalisation.
;; The first-column step is run with type 'others, which is not 'other and
;; therefore selects the grouped-rows result of datafly-first-qi.
;; Returns the transposed result of walking the columns of the flattened
;; grouped rows, substituting the generalised column wherever the column
;; index is in qis-idx.
;; NOTE(review): datafly-partition sorts each group by the column being
;; generalised, while the untouched columns keep the unsorted group order -
;; confirm rows still line up.
(defn datafly [data1 k q] ;q=2 and 3
  (let [data (bind-columns (sel data1 :cols (range 10))
                           (matrix (efb (sel data1 :cols 10)))
                           (sel data1 :cols (range 11 21)))
        qis-idx (datafly-qis data q [10 20])
        hier (datafly-hier (sel data :cols qis-idx) qis-idx)
        ;; computed once; the original evaluated the identical call twice
        one (datafly-first-qi qis-idx k data hier 'others)
        one0 (matrix (apply concat one))
        ans0 (map #(datafly-partition one k hier %) qis-idx)
        ans1 (matrix (map #(apply concat %) ans0))
        ans2 (map #(vector %1 %2) qis-idx ans1)
        ;; generalised columns ordered by ascending column index
        ans3 (matrix (map second (sort-by first ans2)))]
    (loop [dat (trans one0) i 0 a3 ans3 result []]
      (if (empty? dat)
        (trans result)
        (recur (rest dat)
               (inc i)
               (if (member? i qis-idx) (rest a3) a3)
               (conj result (if (member? i qis-idx) (first a3) (first dat))))))))

;; Seven-argument wrapper: only the second argument (the data) is used.
;; Runs datafly with k=2, q=4.
(defn datafly2 [_ data _ _ _ _ _]
  (datafly data 2 4))

;; Seven-argument wrapper: only the second argument (the data) is used.
;; Runs datafly with k=4, q=4.
(defn datafly4 [_ data _ _ _ _ _]
  (datafly data 4 4))