/timm's /charming /python /tricks
Download
stats.py.
Read more on How to be Charming (in Python).
001: import sys,math 002: sys.dont_write_bytecode=True # don't make .pyc files 003: from lib import * 004: from about import * 005: from a12 import * 006: 007: def scottknott(data,small=3,b=250, conf=0.05): 008: """Recursively split data, maximizing delta of 009: the expected value of the mean before and 010: after the splits. 011: Reject splits with under 3 items""" 012: def theSame(one, two): 013: if a12small(two, one): return True 014: return not bootstrap(one, two, b=b, conf=conf) 015: all = reduce(lambda x,y:x+y,data) 016: same = lambda l,r: theSame(l.saw(), r.saw()) 017: big = lambda n: n > small 018: return rdiv(data,all,minMu,big,same) 019: 020: def rdiv(data, # a list of class Nums 021: all, # all the data combined into one num 022: div, # function: find the best split 023: big, # function: rejects small splits 024: same): # function: rejects similar splits 025: """Looks for ways to split sorted data, 026: Recurses into each split. Assigns a 'rank' number 027: to all the leaf splits found in this way. 028: """ 029: def recurse(parts,all,rank=0): 030: "Split, then recurse on each part." 031: cut,left,right = div(parts,all,big,same) 032: if cut: 033: # if cut, rank "right" higher than "left" 034: rank = recurse(parts[:cut],left,rank) + 1 035: rank = recurse(parts[cut:],right,rank) 036: else: 037: # if no cut, then all get same rank 038: for part in parts: 039: part.rank = rank 040: return rank 041: recurse(sorted(data),all) 042: return data 043: 044: def minMu(parts,all,big,same): 045: """Find a cut in the parts that maximizes 046: the expected value of the difference in 047: the mean before and after the cut. 048: Reject splits that are insignificantly 049: different or that generate very small subsets. 050: """ 051: cut,left,right = None,None,None 052: before, mu = 0, all.mu 053: for i,l,r in leftRight(parts): 054: if big(l.n) and big(r.n): 055: if not same(l,r): 056: n = all.n * 1.0 057: x = l.n/n*(mu - l.mu)**2 + \ 058: r.n/n*(mu - r.mu)**2 059: if x > before: 060: before,cut,left,right = x,i,l,r 061: return cut,left,right 062: 063: def leftRight(parts): 064: """Iterator. For all items in 'parts', 065: return everything to the left and everything 066: from here to the end. For reasons of 067: efficiency, take a first pass over the data 068: to pre-compute and cache right-hand-sides 069: """ 070: rights = {} 071: n = j = len(parts) - 1 072: while j > 0: 073: rights[j] = parts[j] 074: if j < n: rights[j] += rights[j+1] 075: j -=1 076: left = parts[0] 077: for i,one in enumerate(parts): 078: if i> 0: 079: yield i,left,rights[i] 080: left += one 081: 082: def bootstrap(y0,z0,conf=0.05,b=1000): 083: """The bootstrap hypothesis test from 084: p220 to 223 of Efron's book 'An 085: introduction to the bootstrap.""" 086: class total(): 087: "quick and dirty data collector" 088: def __init__(i,some=[]): 089: i.sum = i.n = i.mu = 0 ; i.all=[] 090: for one in some: i.put(one) 091: def put(i,x): 092: i.all.append(x); 093: i.sum +=x; i.n += 1; i.mu = float(i.sum)/i.n 094: def __add__(i1,i2): return total(i1.all + i2.all) 095: def testStatistic(y,z): 096: """Checks if two means are different, tempered 097: by the sample size of 'y' and 'z'""" 098: tmp1 = tmp2 = 0 099: for y1 in y.all: tmp1 += (y1 - y.mu)**2 100: for z1 in z.all: tmp2 += (z1 - z.mu)**2 101: s1 = float(tmp1)/(y.n - 1) 102: s2 = float(tmp2)/(z.n - 1) 103: delta = z.mu - y.mu 104: if s1+s2: 105: delta = delta/((s1/y.n + s2/z.n)**0.5) 106: return delta 107: def one(lst): return lst[ int(any(len(lst))) ] 108: def any(n) : return random.uniform(0,n) 109: y, z = total(y0), total(z0) 110: x = y + z 111: tobs = testStatistic(y,z) 112: yhat = [y1 - y.mu + x.mu for y1 in y.all] 113: zhat = [z1 - z.mu + x.mu for z1 in z.all] 114: bigger = 0.0 115: for i in range(b): 116: if testStatistic(total([one(yhat) for _ in yhat]), 117: total([one(zhat) for _ in zhat])) > tobs: 118: bigger += 1 119: return bigger / b < conf 120: 121: def bootstrapd(): 122: def worker(n=30,mu1=10,sigma1=1,mu2=10.2,sigma2=1): 123: def g(mu,sigma) : return random.gauss(mu,sigma) 124: x = [g(mu1,sigma1) for i in range(n)] 125: y = [g(mu2,sigma2) for i in range(n)] 126: return n,mu1,sigma1,mu2,sigma2,\ 127: 'different' if bootstrap(x,y) else 'same' 128: print worker(30, 10.1, 1, 10.2, 1) 129: print worker(30, 10.1, 1, 10.8, 1) 130: print worker(30, 10.1, 10, 10.8, 1) 131: print msecs(lambda : 132: worker(1000, 10.1, 1, 10.2, 1)) 133: 134: if __name__ == '__main__': 135: eval(cmd('bootstrapd()')) 136: 137:
This file is part of Timm's charming Python tricks.
© 2014, Tim Menzies:
tim.menzies@gmail.com,
http://menzies.us.
Timm's charming Python tricks are free software: you can redistribute it and/or modify it under the terms of the GNU Lesser Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
Timm's charming Python tricks are distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU Lesser Public License along with Foobar. If not, see http://www.gnu.org/licenses.