\relax
\citation{Menzies2009b}
\citation{Menzies2009b}
\citation{Menzies2007c}
\citation{Boehm1981}
\citation{Fenton2007c}
\citation{Fenton2007b}
\citation{Kaariainen2006,Dasgupta2008,Hassan2010}
\citation{Kaariainen2006}
\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}}
\newlabel{sect:introduction}{{1}{1}}
\citation{Hassan2010}
\citation{Menzies2009b,Menzies2007c,Boehm1981,Fenton2007c,Fenton2007b}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.1}Contributions}{2}}
\@writefile{toc}{\contentsline {section}{\numberline {2}Motivation}{2}}
\newlabel{sect:motivation}{{2}{2}}
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
\newlabel{fig:expected}{{1a}{2}}
\newlabel{sub@fig:expected}{{(a)}{a}}
\newlabel{fig:actual}{{1b}{2}}
\newlabel{sub@fig:actual}{{(b)}{b}}
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Expected and actual topologies for the purpose of demonstration. Assumption-all assumes that \textit {all} instances are used in estimation, hence the topology would look like (a). Assumption-pop states that only the \textit {popular} instances (filled squares) are used for estimation.\relax }}{2}}
\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Expected}}}{2}}
\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Actual}}}{2}}
\newlabel{fig:topologies}{{1}{2}}
\@writefile{toc}{\contentsline {section}{\numberline {3}Background}{2}}
\newlabel{sect:background}{{3}{2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Software Effort Estimation}{2}}
\citation{Li2009}
\citation{Jor2004e}
\citation{Jor2005b}
\citation{shepperd96}
\citation{Mendes2003,Li2009,Kadoda2000}
\citation{Keung2011}
\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Ratio of the instances used for prediction in a closest-neighbor setting to the dataset size. Note that the median percentage value is $25\%$, meaning that only a limited number of instances are the closest neighbors of other instances and are useful in estimation.\relax }}{3}}
\newlabel{fig:selec-perc}{{2}{3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Active Learning}{3}}
\@writefile{toc}{\contentsline {section}{\numberline {4}Methodology}{3}}
\newlabel{sect:methodology}{{4}{3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Algorithms}{3}}
\newlabel{equation:normalization}{{1}{3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Building a Guidance System}{4}}
\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.2.1}Toy Example}{4}}
\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces The projects of the toy example. Our hypothetical dataset consists of 3 projects described by 1 independent variable (KLOC) and 1 dependent variable (effort in man-months).\relax }}{4}}
\newlabel{fig:toy-dataset}{{3}{4}}
\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Visualization of the projects on a linear scale, where the axis shows KLOC values.\relax }}{4}}
\newlabel{fig:linear-scale}{{4}{4}}
\@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces The distance matrix of the projects $P_1$, $P_2$ and $P_3$.\relax }}{4}}
\newlabel{fig:toy-distance-matrix}{{5}{4}}
\citation{shepperd97}
\citation{foss03}
\citation{foss03}
\citation{shepperd97}
\citation{foss03}
\citation{Foss}
\@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces The $E(k)$ matrix resulting from the distance matrix of Figure\nobreakspace {}5\hbox {}. The cells with a value of $na$ mean that the ordering for that cell is \textit {not applicable}.\relax }}{5}}
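% A minimal sketch of the normalization referenced by the label
% `equation:normalization' above (Eq. 1, Sect. 4.1). This file records only
% the label, so the min-max form below is an assumption based on common
% practice in this literature, not a verbatim copy of the paper's Eq. 1:
%   \begin{equation}
%     normalized(x_i) = \frac{x_i - \min(x)}{\max(x) - \min(x)}
%     \label{equation:normalization}
%   \end{equation}
% i.e., each independent variable is mapped onto the interval $[0,1]$ so that
% no single attribute dominates the distance calculations of Figure 5.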
\newlabel{fig:ek-matrix}{{6}{5}}
\@writefile{lof}{\contentsline {figure}{\numberline {7}{\ignorespaces The $E(1)$ matrix and the popularity indices of the toy example. Note that the popularity index is the sum of the columns of the $E(1)$ matrix.\relax }}{5}}
\newlabel{fig:toy-pop-index}{{7}{5}}
\@writefile{lof}{\contentsline {figure}{\numberline {8}{\ignorespaces The change of the active pool for the toy example. Note that in an actual setting the transition from $Round_i$ to $Round_{i+1}$ is governed by the stopping rules.\relax }}{5}}
\newlabel{fig:active-pool}{{8}{5}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.3}Performance Measures}{5}}
\newlabel{ar}{{2}{5}}
\newlabel{one}{{3}{5}}
\newlabel{onex}{{4}{5}}
\newlabel{one}{{5}{5}}
\citation{keung2008c,shepperd97}
\citation{keung2008c,shepperd97,Finnie1997}
\citation{Menzies2006,Lum2008,Kocaguneli2010}
\citation{Kocaguneli2010}
\citation{Menzies2006,Lum2008,Kocaguneli2010}
\citation{Kocaguneli2010}
\citation{keung2008c,Li2009,Li2009a,shepperd97,Shepperd1996,Finnie1997}
\citation{Menzies2006,Bakir2009,Kocaguneli2010}
\citation{Menzies2006,Lum2008,Kocaguneli2010}
\citation{Kocaguneli2010}
\citation{Keung2008,keung2008c,keung2008b,Kocaguneli2010,shepperd97,Li2008,Kadoda2000,Kirsopp2002,Li2009,Li2009a}
\citation{Li2009a,Sentas2005}
\citation{Kultur2008,Turhan2007}
\citation{Menzies2006,Lum2008,Kocaguneli2010}
\citation{Miyazaki1994}
\citation{Menzies2006,Lum2008,Kocaguneli2010}
\citation{Briand1999,shepperd97}
\citation{Menzies2006,Lum2008,Kocaguneli2010,Boehm1981}
\citation{Menzies2006,Lum2008,Kocaguneli2010}
\citation{Bakir2009}
\citation{Boehm1981}
\@writefile{lof}{\contentsline {figure}{\numberline {9}{\ignorespaces The 699 projects used in this study come from 20 data sets. Indentation in column one denotes that the indented dataset is a subset of its non-indented parent.\relax }}{6}}
\newlabel{fig:datasets}{{9}{6}}
\@writefile{lof}{\contentsline {figure}{\numberline {10}{\ignorespaces Comparing algorithms (\textit {i},\textit {j}) on performance measures ($P_i$,$P_j$). The ``better'' predicate changes according to $P$: for error measures like MRE, ``better'' means lower medians, whereas for PRED(25) it means higher medians.\relax }}{6}}
\newlabel{fig:pseudocode-wtl}{{10}{6}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.4}Experiments}{6}}
\@writefile{lof}{\contentsline {figure}{\numberline {11}{\ignorespaces A sample of effort estimation papers that use the data sets explored in this paper.\relax }}{6}}
\newlabel{fig:dataset-paper}{{11}{6}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.5}Datasets}{6}}
\citation{Boehm1981}
\citation{Keung2011}
\citation{Keung2011}
\@writefile{toc}{\contentsline {section}{\numberline {5}Results}{7}}
\newlabel{sect:results}{{5}{7}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Performance}{7}}
\@writefile{lof}{\contentsline {figure}{\numberline {12}{\ignorespaces The distribution of the datasets into result-categories. The last column shows the number of datasets in each category. Note that $12$ out of $19$ datasets fall into the \textit {Pro-Active} category, meaning that for $63\%$ of the datasets \textit {activeNN} is a substitute for \textit {passiveNN}.\relax }}{7}}
\newlabel{fig:dataset-to-category}{{12}{7}}
\@writefile{lof}{\contentsline {figure}{\numberline {13}{\ignorespaces The $win-loss$ values. The datasets where \textit {activeNN} has lost to another learner (i.e., a negative $win-loss$ value) are highlighted for convenience.\relax }}{7}}
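% A minimal sketch of the performance measures behind the labels `ar', `one'
% and `onex' above (Eqs. 2-5, Sect. 4.3) and of the ``better'' predicate of
% Figure 10. Only MRE and PRED(25) are named in this file, so the exact forms
% below are assumptions based on their standard definitions in the
% effort-estimation literature, with $x_i$ the actual and $\hat{x}_i$ the
% predicted effort:
%   AR_i = | x_i - \hat{x}_i |                       (absolute residual)
%   MRE_i = \frac{| x_i - \hat{x}_i |}{x_i}          (magnitude of relative error)
%   PRED(25) = \frac{100}{N} \sum_{i=1}^{N} [\, MRE_i \le 0.25 \,]
% Under these measures, ``better'' in the win-tie-loss comparison of Figure 10
% means a lower median for error measures such as MRE, and a higher median
% for accuracy measures such as PRED(25); the $win-loss$ values of Figure 13
% then tally, per learner, wins minus losses over all such comparisons.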
\newlabel{fig:win-tie-loss}{{13}{7}}
\citation{Keung2011}
\citation{Keung2011}
\newlabel{fig:cat-active}{{14a}{8}}
\newlabel{sub@fig:cat-active}{{(a)}{a}}
\newlabel{fig:cat-cart}{{14b}{8}}
\newlabel{sub@fig:cat-cart}{{(b)}{b}}
\newlabel{fig:cat-con-active}{{14c}{8}}
\newlabel{sub@fig:cat-con-active}{{(c)}{c}}
\newlabel{fig:cat-con-cart}{{14d}{8}}
\newlabel{sub@fig:cat-con-cart}{{(d)}{d}}
\@writefile{lof}{\contentsline {figure}{\numberline {14}{\ignorespaces Sample plots for the different categories of results. The line parallel to the y-axis indicates the stopping point.\relax }}{8}}
\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Pro-Active: desharnais}}}{8}}
\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Pro-CART: albrecht}}}{8}}
\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Con-Active: maxwell}}}{8}}
\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Con-CART: cocomo81s}}}{8}}
\newlabel{fig:category-plots}{{14}{8}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Labeling Effort Reduction}{8}}
\citation{Alpaydin2004}
\citation{Milic2004}
\citation{Robson2002}
\citation{Kitchenham2001}
\@writefile{lof}{\contentsline {figure}{\numberline {15}{\ignorespaces The percentage of instances that are labeled at the stopping point. The median percentage value is $38.8\%$. The implication of this table is that it is possible to reduce the effort of labeling activities substantially.\relax }}{9}}
\newlabel{fig:stopping-percentage}{{15}{9}}
\@writefile{toc}{\contentsline {section}{\numberline {6}Threats To Validity}{9}}
\newlabel{sect:threats-to-validity}{{6}{9}}
\@writefile{toc}{\contentsline {section}{\numberline {7}Conclusions}{9}}
\newlabel{sect:conclusions}{{7}{9}}
\bibstyle{IEEEtran}
\bibdata{library}
\bibcite{Menzies2009b}{1}
\bibcite{Menzies2007c}{2}
\bibcite{Boehm1981}{3}
\bibcite{Fenton2007c}{4}
\bibcite{Fenton2007b}{5}
\bibcite{Kaariainen2006}{6}
\bibcite{Dasgupta2008}{7}
\bibcite{Hassan2010}{8}
\bibcite{Li2009}{9}
\bibcite{Mendes2003}{10}
\bibcite{Kadoda2000}{11}
\bibcite{Keung2011}{12}
\bibcite{shepperd97}{13}
\bibcite{Foss}{14}
\bibcite{keung2008c}{15}
\bibcite{Finnie1997}{16}
\bibcite{Menzies2006}{17}
\bibcite{Lum2008}{18}
\bibcite{Kocaguneli2010}{19}
\bibcite{Li2009a}{20}
\bibcite{Shepperd1996}{21}
\bibcite{Bakir2009}{22}
\bibcite{Keung2008}{23}
\bibcite{keung2008b}{24}
\bibcite{Li2008}{25}
\bibcite{Kirsopp2002}{26}
\bibcite{Sentas2005}{27}
\bibcite{Kultur2008}{28}
\bibcite{Turhan2007}{29}
\@writefile{toc}{\contentsline {section}{\numberline {8}Future Work}{10}}
\@writefile{toc}{\contentsline {section}{References}{10}}
\bibcite{Miyazaki1994}{30}
\bibcite{Briand1999}{31}
\bibcite{Alpaydin2004}{32}
\bibcite{Milic2004}{33}
\bibcite{Robson2002}{34}
\bibcite{Kitchenham2001}{35}
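% A worked sketch of the toy example of Figures 3-8. The KLOC values below
% are hypothetical (the actual values live in the paper, not in this file)
% and are chosen only to make the arithmetic easy to follow:
%   Assume $P_1 = 2$ KLOC, $P_2 = 3$ KLOC, $P_3 = 7$ KLOC, giving the
%   pairwise distances of Figure 5:
%     d(P_1,P_2) = 1,  d(P_1,P_3) = 5,  d(P_2,P_3) = 4.
%   The $E(1)$ matrix (Figure 6) marks each project's single closest neighbor:
%     closest(P_1) = P_2,  closest(P_2) = P_1,  closest(P_3) = P_2.
%   Summing the columns of $E(1)$ gives the popularity indices (Figure 7):
%     pop(P_1) = 1,  pop(P_2) = 2,  pop(P_3) = 0,
%   so $P_2$ is the most popular instance and enters the active pool first
%   (Figure 8); labeling then proceeds round by round until the stopping
%   rules fire.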