\begin{thebibliography}{} \bibitem[{Machine Learning Group at UCD}, 2009]{mlgUcd} {Machine Learning Group at UCD} (2009). \newblock {MLG} datasets. \newblock \texttt{http://mlg.ucd.ie/datasets}. \bibitem[Bay et~al., 2000]{bay00uci} Bay, S.~D., Kibler, D.~F., Pazzani, M.~J., and Smyth, P. (2000). \newblock The {UCI} {KDD} archive of large data sets for data mining research and experimentation. \newblock {\em SIGKDD Explorations}, 2(2):81--85. \bibitem[Ding and He, 2004]{ding04} Ding, C. and He, X. (2004). \newblock K-means clustering via principal component analysis. \newblock In {\em ICML '04: Proceedings of the twenty-first international conference on Machine learning}, page~29, New York, NY, USA. ACM. \bibitem[Faloutsos and Lin, 1995]{fastmap95} Faloutsos, C. and Lin, K.-I. (1995). \newblock {FastMap}: a fast algorithm for indexing, data-mining and visualization of traditional and multimedia datasets. \newblock In Carey, M. and Schneider, D., editors, {\em Proceedings of the 1995 ACM SIGMOD International Conference on Management of Data}, pages 163--174. ACM Press. \bibitem[Grimes, 2008]{grimes80rule08} Grimes, S. (2008). \newblock Unstructured data and the 80 percent rule. \newblock Experts Corner: Seth Grimes, Clarabridge Bridgepoints, Issue 3, 2008. \newblock White Paper. \bibitem[Grootjen et~al., 2003]{grootjenHeapsLaw03} Grootjen, F., van Leijenhorst, D., and van~der Weide, T.~P. (2003). \newblock A formal derivation of {H}eaps' law. \bibitem[Gupta and Grossman, 2004]{Gupta04} Gupta, C. and Grossman, R. (2004). \newblock {GenIc}: A single pass generalized incremental algorithm for clustering. \newblock In {\em SIAM Int. Conf. on Data Mining}. SIAM. \bibitem[Jolliffe, 2002]{jolliffe} Jolliffe, I. (2002). \newblock {\em {Principal component analysis}}. \newblock Springer, 2nd edition. \bibitem[Jones, 1993]{jones93} Jones, K.~S. (1993). \newblock A statistical interpretation of term specificity and its application in retrieval. \newblock {\em Journal of Documentation}, 28:11--21.
\bibitem[Kanungo et~al., 2000]{simpleKmeans00} Kanungo, T., Mount, D., Netanyahu, N., Piatko, C., Silverman, R., and Wu, A. (2000). \newblock The analysis of a simple k-means clustering algorithm. \newblock In {\em UMD}. \bibitem[Lang, 1995]{Lang95} Lang, K. (1995). \newblock {NewsWeeder}: Learning to filter netnews. \newblock In {\em Proceedings of the Twelfth International Conference on Machine Learning}, pages 331--339. \bibitem[McCallum et~al., 2000]{canopies00} McCallum, A., Nigam, K., and Ungar, L.~H. (2000). \newblock Efficient clustering of high-dimensional data sets with application to reference matching. \newblock In {\em KDD '00: Proceedings of the sixth ACM SIGKDD international conference on Knowledge discovery and data mining}, pages 169--178, New York, NY, USA. ACM. \bibitem[Porter, 1980]{Porter80} Porter, M.~F. (1980). \newblock {An Algorithm for Suffix Stripping}. \newblock {\em Program}, 14(3):130--137. \bibitem[Ramos, 2003]{Ramos_usingtf-idf} Ramos, J. (2003). \newblock Using tf-idf to determine word relevance in document queries. \bibitem[Salton, 1991]{salton.SMART.91} Salton, G. (1991). \newblock The {SMART} document retrieval project. \newblock In {\em SIGIR '91: Proceedings of the 14th annual international ACM SIGIR conference on Research and development in information retrieval}, pages 356--358, New York, NY, USA. ACM. \end{thebibliography}