\begin{thebibliography}{}

\bibitem[{Machine Learning Group at UCD}, 2009]{mlgUcd}
{Machine Learning Group at UCD} (2009).
\newblock {MLG} datasets.
\newblock \texttt{http://mlg.ucd.ie/datasets}.

\bibitem[Bay et~al., 2000]{bay00uci}
Bay, S.~D., Kibler, D.~F., Pazzani, M.~J., and Smyth, P. (2000).
\newblock The {UCI} {KDD} archive of large data sets for data mining research
  and experimentation.
\newblock {\em SIGKDD Explorations}, 2(2):81--85.

\bibitem[Ding and He, 2004]{ding04}
Ding, C. and He, X. (2004).
\newblock K-means clustering via principal component analysis.
\newblock In {\em ICML '04: Proceedings of the twenty-first international
  conference on Machine learning}, page~29, New York, NY, USA. ACM.

\bibitem[Faloutsos and Lin, 1995]{fastmap95}
Faloutsos, C. and Lin, K.-I. (1995).
\newblock {FastMap}: a fast algorithm for indexing, data-mining and visualization
  of traditional and multimedia datasets.
\newblock In Carey, M. and Schneider, D., editors, {\em Proceedings of the 1995
  ACM SIGMOD International Conference on Management of Data}, pages 163--174.
  ACM Press.

\bibitem[Grimes, 2008]{grimes80rule08}
Grimes, S. (2008).
\newblock Unstructured data and the 80 percent rule.
\newblock Experts Corner: Seth Grimes, Clarabridge Bridgepoints, Issue 3, 2008.
\newblock White Paper.

\bibitem[Grootjen et~al., 2003]{grootjenHeapsLaw03}
Grootjen, F., van Leijenhorst, D., and van~der Weide, T.~P. (2003).
\newblock A formal derivation of {H}eaps' law.

\bibitem[Gupta and Grossman, 2004]{Gupta04}
Gupta, C. and Grossman, R. (2004).
\newblock {GENIC}: A single pass generalized incremental algorithm for
  clustering.
\newblock In {\em SIAM Int. Conf. on Data Mining}. SIAM.

\bibitem[Jolliffe, 2002]{jolliffe}
Jolliffe, I. (2002).
\newblock {\em Principal Component Analysis}.
\newblock Springer, 2nd edition.

\bibitem[Jones, 1972]{jones93}
Jones, K.~S. (1972).
\newblock A statistical interpretation of term specificity and its application
  in retrieval.
\newblock {\em Journal of Documentation}, 28:11--21.

\bibitem[Kanungo et~al., 2000]{simpleKmeans00}
Kanungo, T., Mount, D., Netanyahu, N., Piatko, C., Silverman, R., and Wu, A.
  (2000).
\newblock The analysis of a simple k-means clustering algorithm.
\newblock In {\em UMD}.

\bibitem[Lang, 1995]{Lang95}
Lang, K. (1995).
\newblock {NewsWeeder}: Learning to filter netnews.
\newblock In {\em Proceedings of the Twelfth International Conference on
  Machine Learning}, pages 331--339.

\bibitem[McCallum et~al., 2000]{canopies00}
McCallum, A., Nigam, K., and Ungar, L.~H. (2000).
\newblock Efficient clustering of high-dimensional data sets with application
  to reference matching.
\newblock In {\em KDD '00: Proceedings of the sixth ACM SIGKDD international
  conference on Knowledge discovery and data mining}, pages 169--178, New York,
  NY, USA. ACM.

\bibitem[Porter, 1980]{Porter80}
Porter, M.~F. (1980).
\newblock {An Algorithm for Suffix Stripping}.
\newblock {\em Program}, 14(3):130--137.

\bibitem[Ramos, 2003]{Ramos_usingtf-idf}
Ramos, J. (2003).
\newblock Using {TF-IDF} to determine word relevance in document queries.

\bibitem[Salton, 1991]{salton.SMART.91}
Salton, G. (1991).
\newblock The {SMART} document retrieval project.
\newblock In {\em SIGIR '91: Proceedings of the 14th annual international ACM
  SIGIR conference on Research and development in information retrieval}, pages
  356--358, New York, NY, USA. ACM.

\end{thebibliography}