/timm's /charming /python /tricks

stats.py

Download stats.py.
Read more on How to be Charming (in Python).

001: import sys,math
002: sys.dont_write_bytecode=True # don't make .pyc files
003: from lib import *
004: from about import *
005: from a12 import *
006: 
007: def scottknott(data,small=3,b=250, conf=0.05):
008:   """Recursively split data, maximizing delta of
009:   the expected value of the mean before and 
010:   after the splits. 
011:   Reject splits with under 3 items"""
012:   def theSame(one, two):
013:     if a12small(two, one): return True
014:     return  not bootstrap(one, two, b=b, conf=conf)
015:   all  = reduce(lambda x,y:x+y,data)
016:   same = lambda l,r: theSame(l.saw(), r.saw())
017:   big  = lambda    n: n > small    
018:   return rdiv(data,all,minMu,big,same)
019: 
020: def rdiv(data,  # a list of class Nums
021:          all,   # all the data combined into one num
022:          div,   # function: find the best split
023:          big,   # function: rejects small splits
024:          same): # function: rejects similar splits
025:   """Looks for ways to split sorted data, 
026:   Recurses into each split. Assigns a 'rank' number
027:   to all the leaf splits found in this way. 
028:   """
029:   def recurse(parts,all,rank=0):
030:     "Split, then recurse on each part."
031:     cut,left,right = div(parts,all,big,same)
032:     if cut: 
033:       # if cut, rank "right" higher than "left"
034:       rank = recurse(parts[:cut],left,rank) + 1
035:       rank = recurse(parts[cut:],right,rank)
036:     else: 
037:       # if no cut, then all get same rank
038:       for part in parts: 
039:         part.rank = rank
040:     return rank
041:   recurse(sorted(data),all)
042:   return data
043: 
044: def minMu(parts,all,big,same):
045:   """Find a cut in the parts that maximizes
046:   the expected value of the difference in
047:   the mean before and after the cut.
048:   Reject splits that are insignificantly
049:   different or that generate very small subsets.
050:   """
051:   cut,left,right = None,None,None
052:   before, mu     =  0, all.mu
053:   for i,l,r in leftRight(parts):
054:     if big(l.n) and big(r.n):
055:       if not same(l,r):
056:         n = all.n * 1.0
057:         x = l.n/n*(mu - l.mu)**2 + \
058:             r.n/n*(mu - r.mu)**2  
059:         if x > before:
060:           before,cut,left,right = x,i,l,r
061:   return cut,left,right
062: 
063: def leftRight(parts):
064:   """Iterator. For all items in 'parts',
065:   return everything to the left and everything
066:   from here to the end. For reasons of
067:   efficiency, take a first pass over the data
068:   to pre-compute and cache right-hand-sides
069:   """
070:   rights = {}
071:   n = j = len(parts) - 1
072:   while j > 0:
073:     rights[j] = parts[j]
074:     if j < n: rights[j] += rights[j+1]
075:     j -=1
076:   left = parts[0]
077:   for i,one in enumerate(parts):
078:     if i> 0: 
079:       yield i,left,rights[i]
080:       left += one
081: 
082: def bootstrap(y0,z0,conf=0.05,b=1000):
083:   """The bootstrap hypothesis test from
084:      p220 to 223 of Efron's book 'An
085:     introduction to the bootstrap."""
086:   class total():
087:     "quick and dirty data collector"
088:     def __init__(i,some=[]):
089:       i.sum = i.n = i.mu = 0 ; i.all=[]
090:       for one in some: i.put(one)
091:     def put(i,x):
092:       i.all.append(x);
093:       i.sum +=x; i.n += 1; i.mu = float(i.sum)/i.n
094:     def __add__(i1,i2): return total(i1.all + i2.all)
095:   def testStatistic(y,z): 
096:     """Checks if two means are different, tempered
097:      by the sample size of 'y' and 'z'"""
098:     tmp1 = tmp2 = 0
099:     for y1 in y.all: tmp1 += (y1 - y.mu)**2 
100:     for z1 in z.all: tmp2 += (z1 - z.mu)**2
101:     s1    = float(tmp1)/(y.n - 1)
102:     s2    = float(tmp2)/(z.n - 1)
103:     delta = z.mu - y.mu
104:     if s1+s2:
105:       delta =  delta/((s1/y.n + s2/z.n)**0.5)
106:     return delta
107:   def one(lst): return lst[ int(any(len(lst))) ]
108:   def any(n)  : return random.uniform(0,n)
109:   y, z   = total(y0), total(z0)
110:   x      = y + z
111:   tobs   = testStatistic(y,z)
112:   yhat   = [y1 - y.mu + x.mu for y1 in y.all]
113:   zhat   = [z1 - z.mu + x.mu for z1 in z.all]
114:   bigger = 0.0
115:   for i in range(b):
116:     if testStatistic(total([one(yhat) for _ in yhat]),
117:                      total([one(zhat) for _ in zhat])) > tobs:
118:       bigger += 1
119:   return bigger / b < conf
120: 
121: def bootstrapd(): 
122:   def worker(n=30,mu1=10,sigma1=1,mu2=10.2,sigma2=1):
123:     def g(mu,sigma) : return random.gauss(mu,sigma)
124:     x = [g(mu1,sigma1) for i in range(n)]
125:     y = [g(mu2,sigma2) for i in range(n)]
126:     return n,mu1,sigma1,mu2,sigma2,\
127:         'different' if bootstrap(x,y) else 'same'
128:   print worker(30, 10.1, 1, 10.2, 1)
129:   print worker(30, 10.1, 1, 10.8, 1)
130:   print worker(30, 10.1, 10, 10.8, 1)
131:   print msecs(lambda : 
132:               worker(1000, 10.1, 1, 10.2, 1))
133: 
134: if __name__ == '__main__':
135:   eval(cmd('bootstrapd()'))
136: 
137:

This file is part of Timm's charming Python tricks.
© 2014, Tim Menzies: tim.menzies@gmail.com, http://menzies.us.

Timm's charming Python tricks are free software: you can redistribute it and/or modify it under the terms of the GNU Lesser Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

Timm's charming Python tricks are distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU Lesser Public License along with Foobar. If not, see http://www.gnu.org/licenses.