\relax \citation{me06d} \citation{kitch07} \@writefile{toc}{\contentsline {section}{\numberline {I}Introduction}{1}} \@writefile{toc}{\contentsline {subsection}{\numberline {I-A}Are the defect predictors learned from CC data beneficial for organizations?}{1}} \@writefile{toc}{\contentsline {subsection}{\numberline {I-B}Can we still make use of CC data for defect prediction?}{1}} \@writefile{toc}{\contentsline {subsection}{\numberline {I-C}How much data do organizations need for constructing a local model for defect prediction?}{1}} \citation{kitch07} \citation{mendes07} \citation{abrahamsson07} \citation{macdonell07} \citation{kitch07} \citation{kitch07} \citation{Premraj:2007p1626} \citation{boehm00a} \citation{macdonell07,kitch07} \citation{me07b} \citation{mccabe76} \citation{halstead77} \@writefile{toc}{\contentsline {subsection}{\numberline {I-D}Can our theories and results be generalized?}{2}} \@writefile{toc}{\contentsline {section}{\numberline {II}Related Work}{2}} \@writefile{toc}{\contentsline {subsection}{\numberline {II-A}Effort Estimation}{2}} \@writefile{toc}{\contentsline {subsection}{\numberline {II-B}Defect Prediction}{2}} \newlabel{sec:dp}{{II-B}{2}} \citation{shu02} \citation{fagan86} \citation{shull00a} \citation{fagan76} \citation{conf/icse/NagappanB05a} \citation{me02f} \citation{halstead77,mccabe76,chapman02,me04g,polyspace,hall00,nikora03,conf/icse/NagappanB05a,khoshgoftaar01,Khoshgoftaar:2004p1877,conf/ictai/TangK04,journals/ese/KhoshgoftaarS03,me03a,me02e,me03k,me03q,porter90,tiang95,khoshgoftaar99,srinivasan95} \citation{rakitin01} \citation{graves00} \citation{Blake+Merz:1998} \citation{Blake+Merz:1998} \citation{fenton96} \citation{fenton96} \citation{shepperd94} \citation{Blake+Merz:1998} \@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Some representative $pd$s and $pf$s for prediction problems from the UC Irvine machine learning database\nobreakspace {}\cite {Blake+Merz:1998}. These values were generated using the standard settings of a state-of-art decision tree learner (J48). For each data set, ten experiments were conducted, where a decision tree was learned on 90\% of the data, then tests are done of the remaining 10\%. The numbers shown here are the average results across ten such experiments.}}{3}} \newlabel{fig:uci}{{1}{3}} \citation{brooks95} \citation{me07b} \citation{nikora03} \citation{nach08} \citation{jiang07} \citation{jiang07} \citation{me07c} \citation{nach08} \citation{kim08} \citation{halstead77} \citation{mccabe76} \citation{fenton96} \citation{fenton96} \citation{fenton96} \@writefile{toc}{\contentsline {section}{\numberline {III}Data}{4}} \@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Features used in this study. The Halstead features are explained in Figure\nobreakspace {}4\hbox {} and the Mccabe features are explained in Figure\nobreakspace {}5\hbox {}.}}{4}} \newlabel{fig:attr}{{3}{4}} \citation{me07b} \citation{basili02} \citation{me07b} \citation{me07b} \citation{drummond03} \citation{cohen95r} \citation{quinlan92} \citation{holte93} \citation{brieman96} \citation{FreSch97} \@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Notes on the Halstead features}}{5}} \newlabel{fig:halstead}{{4}{5}} \@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces Notes on the McCabe features}}{5}} \newlabel{fig:mccabe}{{5}{5}} \@writefile{toc}{\contentsline {section}{\numberline {IV}Experiment \#1: WC-vs-CC}{5}} \@writefile{toc}{\contentsline {subsection}{\numberline {IV-A}Design}{5}} \citation{lessmann09} \citation{lessmann09} \citation{lessmann09} \citation{witten05} \citation{turhan07qsic} \citation{me07b} \citation{domingos97optimality} \citation{domingos97optimality} \citation{quinlan92} \citation{me07b,me07e} \citation{me07b,me07e} \citation{mann47} \citation{demsar06} \@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces Range of ranks seen in 19 learners building defects predictors when, 10 times, a random 66\% selection of the data are used for training and the remaining data are used for testing. In ranked data, values from one method are replaced by their rank in space of all sorted values (so smaller ranks mean better performance). In this case, the performance value was area under the false positive vs true positive curve (and larger values are better). Vertical lines divide the results into regions where the results are statistically similar. For example, all the methods whose top ranks are 4 to 12 are statistically insignificantly different. From\nobreakspace {}\cite {lessmann09}. }}{6}} \newlabel{fig:lessmann}{{6}{6}} \newlabel{eq:bal}{{4}{6}} \newlabel{eq:bayes}{{1}{6}} \@writefile{lof}{\contentsline {figure}{\numberline {7}{\ignorespaces About Bayes classifiers.}}{6}} \newlabel{fig:bayes}{{7}{6}} \citation{me07b} \citation{hayes06} \citation{kitch07} \citation{Premraj:2007p1626} \@writefile{lof}{\contentsline {figure}{\numberline {8}{\ignorespaces Experiment \#1 results averaged over seven NASA tables. Numeric results on left; quartile charts on right. ``Q1'' and ``Q3'' denote the 25\% and 75\% percentile points (respectively). The upper quartile of the first row is not visible since it runs from 100\% to 100\%; i.e. it has zero length.}}{7}} \newlabel{fig:ccwsnum}{{8}{7}} \@writefile{lof}{\contentsline {figure}{\numberline {9}{\ignorespaces Summary of U-test results (95\% confidence): moving from WC to CC. For full results, see\nobreakspace {}Figure\nobreakspace {}10\hbox {}. }}{7}} \newlabel{fig:ccwc}{{9}{7}} \@writefile{toc}{\contentsline {subsection}{\numberline {IV-B}Results from Experiment \#1}{7}} \@writefile{toc}{\contentsline {subsection}{\numberline {IV-C}Sanity Checks on Experiment \#1}{7}} \@writefile{toc}{\contentsline {subsection}{\numberline {IV-D}Discussion of Experiment \#1}{7}} \@writefile{toc}{\contentsline {section}{\numberline {V}Experiment \#2: NN-Filtered CC }{7}} \@writefile{lof}{\contentsline {figure}{\numberline {11}{\ignorespaces Experiment \#2 PD results where $NN_{pd} \ge WC_{pd}$. Rankings computed via Mann-Whitney (95\% confidence) comparing each row to all other rows.}}{8}} \newlabel{fig:pd1}{{11}{8}} \@writefile{toc}{\contentsline {subsection}{\numberline {V-A}Design}{8}} \@writefile{toc}{\contentsline {subsection}{\numberline {V-B}Results}{8}} \@writefile{lof}{\contentsline {figure}{\numberline {12}{\ignorespaces Experiment \#2 PD results where $NN_{pd} < WC_{pd}$. }}{8}} \newlabel{fig:pd2}{{12}{8}} \@writefile{lof}{\contentsline {figure}{\numberline {13}{\ignorespaces Experiments \#2 PF results where $NN_{pf} \le WC_{pf}$.}}{8}} \newlabel{fig:pf1}{{13}{8}} \@writefile{lof}{\contentsline {figure}{\numberline {14}{\ignorespaces Experiment \#2 PF results where $NN_{pf} > WC_{pf}$. }}{8}} \newlabel{fig:pf2}{{14}{8}} \citation{fenton97} \citation{me08d} \citation{orrego04} \citation{quinlan92} \citation{quinlan92b} \citation{Blake+Merz:1998} \citation{orrego04} \citation{quinlan92} \citation{quinlan92b} \citation{Blake+Merz:1998} \@writefile{toc}{\contentsline {subsection}{\numberline {V-C}Discussions}{9}} \@writefile{toc}{\contentsline {section}{\numberline {VI}Experiment \#3: Incremental WC }{9}} \@writefile{toc}{\contentsline {subsection}{\numberline {VI-A}Design}{9}} \@writefile{toc}{\contentsline {subsection}{\numberline {VI-B}Results from Experiment \#3}{9}} \@writefile{toc}{\contentsline {subsection}{\numberline {VI-C}Sanity Checks on Experiment \#3}{9}} \citation{john95} \citation{orrego04} \citation{boehm00a} \@writefile{lof}{\contentsline {figure}{\numberline {16}{\ignorespaces Y-axis shows plateau point after learning from data sets that have up to $X$ examples (from\nobreakspace {}\cite {orrego04}). The left plot shows results from using Naive Bayes (nbk) or a decision tree learner (j48)\nobreakspace {}\cite {quinlan92} to predict for discrete classes. Right plot shows results from using linear regression (lsr) or model trees (m5)\nobreakspace {}\cite {quinlan92b} to learn predictors for continuous classes. In this study, data sets were drawn from the UC Irvine data repository\nobreakspace {}\cite {Blake+Merz:1998}.}}{10}} \newlabel{fig:plateau}{{16}{10}} \@writefile{toc}{\contentsline {subsection}{\numberline {VI-D}Discussion of Experiment \#3}{10}} \@writefile{toc}{\contentsline {section}{\numberline {VII}Experiment Replication}{10}} \newlabel{sec:exp4}{{VII}{10}} \@writefile{lof}{\contentsline {figure}{\numberline {17}{\ignorespaces An estimate of the effort required to build and test 100 modules.}}{10}} \newlabel{fig:estimate}{{17}{10}} \@writefile{lof}{\contentsline {figure}{\numberline {18}{\ignorespaces Experiment \#1 results for the SOFTLAB tables. Averaged and individual results are shown respectively.}}{11}} \newlabel{fig:ccwcturkey}{{18}{11}} \@writefile{lof}{\contentsline {figure}{\numberline {19}{\ignorespaces Experiment \#2 PD results for the SOFTLAB tables where $NN_{pd} \ge WC_{pd}$. Rankings computed via Mann-Whitney (95\% confidence) comparing each row to all other rows.}}{11}} \newlabel{fig:arpd1}{{19}{11}} \@writefile{lof}{\contentsline {figure}{\numberline {20}{\ignorespaces Experiments \#2 PF results for the SOFTLAB tables where $NN_{pf} \le WC_{pf}$.}}{11}} \newlabel{fig:arpf1}{{20}{11}} \@writefile{lof}{\contentsline {figure}{\numberline {21}{\ignorespaces Experiment \#2 PF results for the SOFTLAB tables where $NN_{pf} > WC_{pf}$. }}{11}} \newlabel{fig:arpf2}{{21}{11}} \@writefile{toc}{\contentsline {section}{\numberline {VIII}Conclusion}{11}} \bibdata{refs} \bibcite{me06d}{1} \bibcite{kitch07}{2} \bibcite{mendes07}{3} \bibcite{abrahamsson07}{4} \bibcite{macdonell07}{5} \bibcite{Premraj:2007p1626}{6} \bibcite{boehm00a}{7} \bibcite{me07b}{8} \bibcite{mccabe76}{9} \bibcite{halstead77}{10} \bibcite{shu02}{11} \bibcite{fagan86}{12} \bibcite{shull00a}{13} \bibcite{fagan76}{14} \bibcite{conf/icse/NagappanB05a}{15} \bibcite{me02f}{16} \bibcite{chapman02}{17} \bibcite{me04g}{18} \bibcite{polyspace}{19} \bibcite{hall00}{20} \bibcite{nikora03}{21} \bibcite{khoshgoftaar01}{22} \@writefile{toc}{\contentsline {section}{References}{12}} \bibcite{Khoshgoftaar:2004p1877}{23} \bibcite{conf/ictai/TangK04}{24} \bibcite{journals/ese/KhoshgoftaarS03}{25} \bibcite{me03a}{26} \bibcite{me02e}{27} \bibcite{me03k}{28} \bibcite{me03q}{29} \bibcite{porter90}{30} \bibcite{tiang95}{31} \bibcite{khoshgoftaar99}{32} \bibcite{srinivasan95}{33} \bibcite{rakitin01}{34} \bibcite{graves00}{35} \bibcite{Blake+Merz:1998}{36} \bibcite{fenton96}{37} \bibcite{shepperd94}{38} \bibcite{brooks95}{39} \bibcite{nach08}{40} \bibcite{jiang07}{41} \bibcite{me07c}{42} \bibcite{kim08}{43} \bibcite{basili02}{44} \bibcite{drummond03}{45} \bibcite{cohen95r}{46} \bibcite{quinlan92}{47} \bibcite{holte93}{48} \bibcite{brieman96}{49} \bibcite{FreSch97}{50} \bibcite{lessmann09}{51} \bibcite{witten05}{52} \bibcite{turhan07qsic}{53} \bibcite{domingos97optimality}{54} \bibcite{me07e}{55} \bibcite{mann47}{56} \bibcite{demsar06}{57} \bibcite{hayes06}{58} \bibcite{fenton97}{59} \bibcite{me08d}{60} \bibcite{orrego04}{61} \bibcite{quinlan92b}{62} \bibcite{john95}{63} \bibstyle{IEEEtran} \@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Ten tables of data, sorted in order of number of examples. The rows labeled ``NASA'' come from NASA aerospace projects while the rows labeled ``SOFTLAB'' come from a Turkish software company writing applications for domestic appliances. For details on the features used in each data set, see Figure\nobreakspace {}3\hbox {}. }}{14}} \newlabel{fig:data}{{2}{14}} \@writefile{lof}{\contentsline {figure}{\numberline {10}{\ignorespaces Project-wise Experiment \#1 results for NASA tables.}}{14}} \newlabel{fig:ccwsprojs}{{10}{14}} \@writefile{lof}{\contentsline {figure}{\numberline {15}{\ignorespaces Results from experiment \#3. Training set size grows in units of 100 examples, moving left to right over the x-axis. The MC2 results only appear at the maximum x-value since MC2 has less than 200 examples. }}{15}} \newlabel{fig:inc}{{15}{15}}