Here are some references to various papers I discussed.
(Text outside the entries is ignored by BibTeX and serves as commentary.)

-----------------------------------------------------
Stochastic clustering (with GENIC):

@inproceedings{Gupta04,
  author    = {Gupta, C. and Grossman, R.},
  title     = {Genic: A single pass generalized incremental algorithm for clustering},
  booktitle = {{SIAM} International Conference on Data Mining},
  year      = {2004},
  pages     = {22--24},
  publisher = {SIAM},
}

-----------------------------------------------------
Stochastic clustering (with FASTMAP) source (lisp):

@inproceedings{223812,
  author    = {Faloutsos, Christos and Lin, King-Ip},
  title     = {{FastMap}: a fast algorithm for indexing, data-mining and visualization of traditional and multimedia datasets},
  booktitle = {SIGMOD '95: Proceedings of the 1995 ACM SIGMOD international conference on Management of data},
  year      = {1995},
  isbn      = {0-89791-731-6},
  pages     = {163--174},
  venue     = {San Jose, California, United States},
  publisher = {ACM},
  address   = {New York, NY, USA},
}

Note that, formally, FastMap is an example of a Nystrom approximation of the
eigenvectors and eigenvalues of a matrix:

-----------------------------------------------------
Not-so-Naive Naive Bayes

Why does NB work so well?

Pedro Domingos, Michael J. Pazzani: On the Optimality of the Simple Bayesian
Classifier under Zero-One Loss. Machine Learning 29(2-3): 103-130 (1997)

Discretization helps Naive Bayes:

@inproceedings{dou95,
  author    = {Dougherty, James and Kohavi, Ron and Sahami, Mehran},
  title     = {Supervised and Unsupervised Discretization of Continuous Features},
  booktitle = {International Conference on Machine Learning},
  year      = {1995},
  pages     = {194--202},
}

After years and years of research, discretization in Naive Bayes is still very simple:

@article{1487463,
  author    = {Yang, Ying and Webb, Geoffrey I.},
  title     = {Discretization for naive-{Bayes} learning: managing discretization bias and variance},
  journal   = {Machine Learning},
  volume    = {74},
  number    = {1},
  year      = {2009},
  issn      = {0885-6125},
  pages     = {39--74},
  publisher = {Kluwer Academic Publishers},
  address   = {Hingham, MA, USA},
}

-----------------------------------------------------
Text mining

TF*IDF (and text mining tutorial):

(shorter form appeared at ICSM'08:

For the (very messy) shell scripts that implement this text mining,

-----------------------------------------------------
Time series mining (with SAX)

Visually mining and monitoring massive time series (2004)
by Jessica Lin, Eamonn Keogh, Stefano Lonardi, Jeffrey P. Lankford, Donna M. Nystrom
In Proceedings of the 10th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining