\relax \ifx\hyper@anchor\@undefined \global \let \oldcontentsline\contentsline \gdef \contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}} \global \let \oldnewlabel\newlabel \gdef \newlabel#1#2{\newlabelxx{#1}#2} \gdef \newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}} \AtEndDocument{\let \contentsline\oldcontentsline \let \newlabel\oldnewlabel} \else \global \let \hyper@last\relax \fi \@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}} \citation{ritthoff01} \citation{mie06} \@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces The WEKA toolkit running the J48 decision tree learner.}}{2}{figure.1}} \newlabel{fig:weka}{{1}{2}{Introduction\relax }{figure.1}{}} \citation{me00v} \@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Orange's visual programming environment.}}{3}{figure.2}} \newlabel{fig:orange}{{2}{3}{Introduction\relax }{figure.2}{}} \citation{ramey94} \citation{awkbook} \@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Rapid-I's operator trees.\nobreakspace {}\cite {mie06}. Internally, this tree is a nested XML expression that is traversed top-down to complete an experiment.}}{4}{figure.3}} \newlabel{fig:yale}{{3}{4}{Introduction\relax }{figure.3}{}} \citation{gay09} \@writefile{toc}{\contentsline {section}{\numberline {2}OURMINE}{5}{section.2}} \@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Built-in Data and Functions}{5}{subsection.2.1}} \citation{ramos03} \@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces An OURMINE function to clean text documents and collect the results. $Tokes$ is a tokenizer; $caps$ sends all words to lower case; $stops$ removes the stop works listed in "\$Lists/stops.txt"; and $stems$ performs Porter's stemming algorithm (removes confusing suffixes). }}{6}{figure.4}} \newlabel{fig:clean}{{4}{6}{OURMINE\relax }{figure.4}{}} \@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces A GAWK implementation of TF-IDF.}}{6}{figure.5}} \newlabel{fig:tfidf}{{5}{6}{Built-in Data and Functions\relax }{figure.5}{}} \citation{Eisenstein04} \citation{orrego04} \citation{gawkai} \@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Learning and Teaching with OURMINE}{7}{subsection.2.2}} \@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces A demo OURMINE experiment. This worker function begins by being called by the top level function {\em demo004} on lines 1-4. Noteworthy sections of the demo code are at: line 19, where training sets and test sets are built from 90\% and 10\% of the data respectively, lines 25-27 in which values such as {\em pd,pf} and {\em balance} are computed via the {\em abcd} function that computes values from the confusion matrix, and line 34 in which a {\em Wilcoxon} test is performed on each learner in the experiment using {\em pd} as the performance measure.}}{8}{figure.6}} \newlabel{fig:demo004}{{6}{8}{Learning and Teaching with OURMINE\relax }{figure.6}{}} \@writefile{lof}{\contentsline {figure}{\numberline {7}{\ignorespaces A Naive Bayes classifier for a CSV file, where the class label is found in the last column.}}{9}{figure.7}} \newlabel{fig:nbc}{{7}{9}{Learning and Teaching with OURMINE\relax }{figure.7}{}} \@writefile{toc}{\contentsline {section}{\numberline {3}Using Ourmine for Industrial and Research Purposes }{9}{section.3}} \citation{me05c,gay09,me05d,me06d,me07b,turhan08} \citation{turhan08} \citation{kitch07} \@writefile{lof}{\contentsline {figure}{\numberline {8}{\ignorespaces Four Introductory OURMINE programming exercises.}}{10}{figure.8}} \newlabel{fig:ourmine101}{{8}{10}{Learning and Teaching with OURMINE\relax }{figure.8}{}} \@writefile{lof}{\contentsline {figure}{\numberline {9}{\ignorespaces Function help in OURMINE.}}{10}{figure.9}} \newlabel{fig:help}{{9}{10}{Learning and Teaching with OURMINE\relax }{figure.9}{}} \citation{matheny09} \citation{me07i} \@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Experiment I: Commissioning a Learner}{11}{subsection.3.1}} \newlabel{sec:samp}{{3.1}{11}{Experiment I: Commissioning a Learner\relax }{subsection.3.1}{}} \citation{Freund99thealternating} \@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.1}Results}{12}{subsubsection.3.1.1}} \@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Experiment II: Within vs Cross-Company Data}{12}{subsection.3.2}} \citation{gay09} \citation{turhan08} \citation{lessmann08} \@writefile{toc}{\contentsline {subsubsection}{\numberline {3.2.1}Building the Experiment}{13}{subsubsection.3.2.1}} \@writefile{toc}{\contentsline {subsubsection}{\numberline {3.2.2}Results}{13}{subsubsection.3.2.2}} \@writefile{lof}{\contentsline {figure}{\numberline {10}{\ignorespaces The OURMINE script used in conducting the WC vs. CC experiment.}}{14}{figure.10}} \newlabel{fig:promiseExp}{{10}{14}{Building the Experiment\relax }{figure.10}{}} \citation{genic04} \citation{canopies00} \@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Experiment III: Scaling Up Text Miners}{15}{subsection.3.3}} \newlabel{sec:tmine}{{3.3}{15}{Experiment III: Scaling Up Text Miners\relax }{subsection.3.3}{}} \@writefile{toc}{\contentsline {subsubsection}{\numberline {3.3.1}Classes of Methods }{15}{subsubsection.3.3.1}} \@writefile{toc}{\contentsline {subsubsection}{\numberline {3.3.2}The Algorithms}{16}{subsubsection.3.3.2}} \@writefile{lof}{\contentsline {figure}{\numberline {11}{\ignorespaces A PCA dimension feature.}}{17}{figure.11}} \newlabel{fig:pca}{{11}{17}{The Algorithms\relax }{figure.11}{}} \@writefile{toc}{\contentsline {subsubsection}{\numberline {3.3.3}Building the Experiment}{17}{subsubsection.3.3.3}} \@writefile{toc}{\contentsline {subsection}{\numberline {3.4}Results}{18}{subsection.3.4}} \@writefile{toc}{\contentsline {subsubsection}{\numberline {3.4.1}Similarities}{18}{subsubsection.3.4.1}} \@writefile{lof}{\contentsline {figure}{\numberline {12}{\ignorespaces An OURMINE worker function to cluster data using the K-means algorithm. Note that experiments using other clustering methods (such as GenIc and Canopy), could be conducted by calling line 16 above in much the same way, but with varying flags to represent the clusterer.}}{19}{figure.12}} \newlabel{fig:clusterworker}{{12}{19}{Building the Experiment\relax }{figure.12}{}} \@writefile{lof}{\contentsline {figure}{\numberline {13}{\ignorespaces An OURMINE worker function to reduce the data using TF-IDF.}}{20}{figure.13}} \newlabel{fig:tfidfworker}{{13}{20}{Building the Experiment\relax }{figure.13}{}} \@writefile{toc}{\contentsline {section}{\numberline {4}Discussion}{20}{section.4}} \@writefile{toc}{\contentsline {section}{\numberline {5}Conclusions}{21}{section.5}} \bibstyle{plain} \citation{ferri09} \bibdata{refs} \bibcite{awkbook}{1} \bibcite{me05c}{2} \bibcite{Eisenstein04}{3} \bibcite{ferri09}{4} \bibcite{Freund99thealternating}{5} \bibcite{gay09}{6} \bibcite{genic04}{7} \bibcite{kitch07}{8} \bibcite{lessmann08}{9} \bibcite{gawkai}{10} \bibcite{matheny09}{11} \bibcite{canopies00}{12} \bibcite{me00v}{13} \bibcite{me05d}{14} \bibcite{me07i}{15} \bibcite{me06d}{16} \bibcite{me07b}{17} \bibcite{mie06}{18} \bibcite{orrego04}{19} \bibcite{ramey94}{20} \bibcite{ramos03}{21} \bibcite{ritthoff01}{22} \bibcite{turhan08}{23} \@writefile{lof}{\contentsline {figure}{\numberline {14}{\ignorespaces Experiment \#1 - Part A - (Learner tuning). Probability of Detection (PD) results, sorted by rank then median values. }}{29}{figure.14}} \newlabel{fig:pds2}{{14}{29}{Reviewer \#2\relax }{figure.14}{}} \@writefile{lof}{\contentsline {figure}{\numberline {15}{\ignorespaces Experiment \#1 - Part A - (Learner tuning). Probability of False Alarm (PF) results, sorted by rank then median values. }}{29}{figure.15}} \newlabel{fig:pfs2}{{15}{29}{Reviewer \#2\relax }{figure.15}{}} \@writefile{lof}{\contentsline {figure}{\numberline {16}{\ignorespaces Experiment \#1 - Part B - (Random Sampling). Probability of Detection (PD) results, sorted by rank then median values. }}{30}{figure.16}} \newlabel{fig:pds3}{{16}{30}{Reviewer \#2\relax }{figure.16}{}} \@writefile{lof}{\contentsline {figure}{\numberline {17}{\ignorespaces Experiment \#1 - Part B - (Random Sampling). Probability of False Alarm (PF) results, sorted by rank then median values. }}{30}{figure.17}} \newlabel{fig:pfs3}{{17}{30}{Reviewer \#2\relax }{figure.17}{}} \@writefile{lof}{\contentsline {figure}{\numberline {18}{\ignorespaces Experiment \#2 (WC vs. CC). Probability of Detection (PD) results, sorted by rank then median values. }}{31}{figure.18}} \newlabel{fig:pds}{{18}{31}{Reviewer \#2\relax }{figure.18}{}} \@writefile{lof}{\contentsline {figure}{\numberline {19}{\ignorespaces Experiment \#2 (WC vs. CC). Probability of False Alarm (PF) results, sorted by rank then median values. }}{31}{figure.19}} \newlabel{fig:pfs}{{19}{31}{Reviewer \#2\relax }{figure.19}{}} \@writefile{lof}{\contentsline {figure}{\numberline {20}{\ignorespaces Experiment \#3 (Text mining). Similarity values normalized according to the combination of most rigorous reducer and clusterer. Note that $Gain$ is a value representing the difference in cluster intrasimilarity and intersimilarity.}}{31}{figure.20}} \newlabel{fig:sims}{{20}{31}{Reviewer \#2\relax }{figure.20}{}}