Here are some references to various papers I discussed:

(Text outside the @entries is ignored by BibTeX and serves as commentary.)

-----------------------------------------------------
Stochastic clustering (with GENIC):

@inproceedings{Gupta04,
  author    = {Gupta, C. and Grossman, R.},
  title     = {{GENIC}: A single pass generalized incremental algorithm for clustering},
  booktitle = {SIAM Int. Conf. on Data Mining},
  year      = {2004},
  pages     = {22--24},
  publisher = {SIAM}
}

-----------------------------------------------------
Stochastic clustering (with FASTMAP)

source (lisp): http://iccle.googlecode.com/svn/trunk/lib/tools/pre/fastmap.lisp

@inproceedings{223812,
  author    = {Faloutsos, Christos and Lin, King-Ip},
  title     = {{FastMap}: a fast algorithm for indexing, data-mining and visualization of traditional and multimedia datasets},
  booktitle = {{SIGMOD} '95: Proceedings of the 1995 {ACM} {SIGMOD} international conference on Management of data},
  year      = {1995},
  isbn      = {0-89791-731-6},
  pages     = {163--174},
  location  = {San Jose, California, United States},
  doi       = {10.1145/223784.223812},
  publisher = {ACM},
  address   = {New York, NY, USA},
}

Note that, formally, FastMap is an example of a Nystrom approximation of the
eigenvectors and eigenvalues of a matrix:
http://research.microsoft.com/apps/pubs/?id=69185

-----------------------------------------------------
Not-so-Naive Naive Bayes

Why does NB work so well?

Pedro Domingos, Michael J. Pazzani: On the Optimality of the Simple Bayesian Classifier under Zero-One Loss.
Machine Learning 29(2-3): 103-130 (1997)

Discretization helps Naive Bayes:

@inproceedings{dou95,
  author    = {Dougherty, James and Kohavi, Ron and Sahami, Mehran},
  title     = {Supervised and Unsupervised Discretization of Continuous Features},
  booktitle = {International Conference on Machine Learning},
  year      = {1995},
  pages     = {194--202},
  note      = {Available from \url{http://www.cs.pdx.edu/~timm/dm/dougherty95supervised.pdf}}
}

After years and years of research, discretization in naive Bayes is still very simple:

@article{1487463,
  author    = {Yang, Ying and Webb, Geoffrey I.},
  title     = {Discretization for naive-{Bayes} learning: managing discretization bias and variance},
  journal   = {Machine Learning},
  volume    = {74},
  number    = {1},
  year      = {2009},
  issn      = {0885-6125},
  pages     = {39--74},
  doi       = {10.1007/s10994-008-5083-5},
  publisher = {Kluwer Academic Publishers},
  address   = {Hingham, MA, USA},
}

-----------------------------------------------------
Text mining

TF*IDF (and text mining tutorial):
http://unbox.org/wisp/var/08/textmine/tellingmore/report4.pdf
(shorter form appeared at ICSM'08: http://menzies.us/pdf/08severis.pdf)

For the (very messy) shell scripts that implement this text mining,

-----------------------------------------------------
Time series mining (with SAX)

Visually mining and monitoring massive time series (2004)
by Jessica Lin, Eamonn Keogh, Stefano Lonardi, Jeffrey P. Lankford, Donna M. Nystrom
In Proceedings of the 10th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.78.1766