/timm's /charming /python /tricks
Download
stats.py.
Read more on How to be Charming (in Python).
001: import sys,math
002: sys.dont_write_bytecode=True # don't make .pyc files
003: from lib import *
004: from about import *
005: from a12 import *
006:
007: def scottknott(data,small=3,b=250, conf=0.05):
008: """Recursively split data, maximizing delta of
009: the expected value of the mean before and
010: after the splits.
011: Reject splits with under 3 items"""
012: def theSame(one, two):
013: if a12small(two, one): return True
014: return not bootstrap(one, two, b=b, conf=conf)
015: all = reduce(lambda x,y:x+y,data)
016: same = lambda l,r: theSame(l.saw(), r.saw())
017: big = lambda n: n > small
018: return rdiv(data,all,minMu,big,same)
019:
020: def rdiv(data, # a list of class Nums
021: all, # all the data combined into one num
022: div, # function: find the best split
023: big, # function: rejects small splits
024: same): # function: rejects similar splits
025: """Looks for ways to split sorted data,
026: Recurses into each split. Assigns a 'rank' number
027: to all the leaf splits found in this way.
028: """
029: def recurse(parts,all,rank=0):
030: "Split, then recurse on each part."
031: cut,left,right = div(parts,all,big,same)
032: if cut:
033: # if cut, rank "right" higher than "left"
034: rank = recurse(parts[:cut],left,rank) + 1
035: rank = recurse(parts[cut:],right,rank)
036: else:
037: # if no cut, then all get same rank
038: for part in parts:
039: part.rank = rank
040: return rank
041: recurse(sorted(data),all)
042: return data
043:
044: def minMu(parts,all,big,same):
045: """Find a cut in the parts that maximizes
046: the expected value of the difference in
047: the mean before and after the cut.
048: Reject splits that are insignificantly
049: different or that generate very small subsets.
050: """
051: cut,left,right = None,None,None
052: before, mu = 0, all.mu
053: for i,l,r in leftRight(parts):
054: if big(l.n) and big(r.n):
055: if not same(l,r):
056: n = all.n * 1.0
057: x = l.n/n*(mu - l.mu)**2 + \
058: r.n/n*(mu - r.mu)**2
059: if x > before:
060: before,cut,left,right = x,i,l,r
061: return cut,left,right
062:
063: def leftRight(parts):
064: """Iterator. For all items in 'parts',
065: return everything to the left and everything
066: from here to the end. For reasons of
067: efficiency, take a first pass over the data
068: to pre-compute and cache right-hand-sides
069: """
070: rights = {}
071: n = j = len(parts) - 1
072: while j > 0:
073: rights[j] = parts[j]
074: if j < n: rights[j] += rights[j+1]
075: j -=1
076: left = parts[0]
077: for i,one in enumerate(parts):
078: if i> 0:
079: yield i,left,rights[i]
080: left += one
081:
082: def bootstrap(y0,z0,conf=0.05,b=1000):
083: """The bootstrap hypothesis test from
084: p220 to 223 of Efron's book 'An
085: introduction to the bootstrap."""
086: class total():
087: "quick and dirty data collector"
088: def __init__(i,some=[]):
089: i.sum = i.n = i.mu = 0 ; i.all=[]
090: for one in some: i.put(one)
091: def put(i,x):
092: i.all.append(x);
093: i.sum +=x; i.n += 1; i.mu = float(i.sum)/i.n
094: def __add__(i1,i2): return total(i1.all + i2.all)
095: def testStatistic(y,z):
096: """Checks if two means are different, tempered
097: by the sample size of 'y' and 'z'"""
098: tmp1 = tmp2 = 0
099: for y1 in y.all: tmp1 += (y1 - y.mu)**2
100: for z1 in z.all: tmp2 += (z1 - z.mu)**2
101: s1 = float(tmp1)/(y.n - 1)
102: s2 = float(tmp2)/(z.n - 1)
103: delta = z.mu - y.mu
104: if s1+s2:
105: delta = delta/((s1/y.n + s2/z.n)**0.5)
106: return delta
107: def one(lst): return lst[ int(any(len(lst))) ]
108: def any(n) : return random.uniform(0,n)
109: y, z = total(y0), total(z0)
110: x = y + z
111: tobs = testStatistic(y,z)
112: yhat = [y1 - y.mu + x.mu for y1 in y.all]
113: zhat = [z1 - z.mu + x.mu for z1 in z.all]
114: bigger = 0.0
115: for i in range(b):
116: if testStatistic(total([one(yhat) for _ in yhat]),
117: total([one(zhat) for _ in zhat])) > tobs:
118: bigger += 1
119: return bigger / b < conf
120:
121: def bootstrapd():
122: def worker(n=30,mu1=10,sigma1=1,mu2=10.2,sigma2=1):
123: def g(mu,sigma) : return random.gauss(mu,sigma)
124: x = [g(mu1,sigma1) for i in range(n)]
125: y = [g(mu2,sigma2) for i in range(n)]
126: return n,mu1,sigma1,mu2,sigma2,\
127: 'different' if bootstrap(x,y) else 'same'
128: print worker(30, 10.1, 1, 10.2, 1)
129: print worker(30, 10.1, 1, 10.8, 1)
130: print worker(30, 10.1, 10, 10.8, 1)
131: print msecs(lambda :
132: worker(1000, 10.1, 1, 10.2, 1))
133:
134: if __name__ == '__main__':
135: eval(cmd('bootstrapd()'))
136:
137:
This file is part of Timm's charming Python tricks.
© 2014, Tim Menzies:
tim.menzies@gmail.com,
http://menzies.us.
Timm's charming Python tricks are free software: you can redistribute it and/or modify it under the terms of the GNU Lesser Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
Timm's charming Python tricks are distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU Lesser Public License along with Foobar. If not, see http://www.gnu.org/licenses.