\relax \citation{me07c} \citation{Arisholm:2006p516,Bell:2006p517,Ostrand:2007p519,me07e,chen05,dekhtyar04,jiang07,nagappan05} \citation{marcus08} \@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{2}} \@writefile{toc}{\contentsline {section}{\numberline {2}Motivation}{3}} \@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Why Study WC vs CC?}{3}} \citation{musa87} \citation{boehm88} \@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Why Study Defect Predictors?}{4}} \citation{goseva07} \@writefile{toc}{\contentsline {section}{\numberline {3}Methodology}{5}} \@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Data}{5}} \@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Descriptions of the ten software projects used in this paper. The rows labeled ``NASA'' come from NASA aerospace projects, while the rows labeled ``SOFTLAB'' come from a Turkish software company writing applications for domestic appliances.}}{5}} \newlabel{tbl:data1}{{1}{5}} \citation{me07b} \citation{dou95} \@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces Summary of data from the ten software projects of Table\nobreakspace {}1\hbox {}, sorted by number of functional units.}}{6}} \newlabel{tbl:data2}{{2}{6}} \newlabel{eq:ent}{{1}{6}} \citation{me07b} \citation{me07b} \citation{me07b} \@writefile{lot}{\contentsline {table}{\numberline {3}{\ignorespaces Static code features available in the Table\nobreakspace {}2\hbox {} projects.}}{7}} \newlabel{tbl:attr1}{{3}{7}} \newlabel{eq:infogain}{{2}{7}} \@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Best defect predictors learned in\nobreakspace {}\cite {me07b}. Mean results from Naive Bayes after 10 repeats of (i)\nobreakspace {}randomizing the order of the data; (ii)\nobreakspace {}dividing that data into ten 90\%:10\% training:test splits. Prior to learning, all numerics were replaced with their logarithms. InfoGain was then used to select the best two or three attributes shown in the right-hand column (if ``three'' performed as well as ``two'', this figure shows the results using ``two'').}}{8}} \newlabel{fig:best}{{1}{8}} \@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Attributes used in Figure\nobreakspace {}1\hbox {}.}}{8}} \newlabel{fig:attrs}{{2}{8}} \@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces InfoGain for KC3 attributes, calculated from Equation\nobreakspace {}2\hbox {}. Lines show means and t-bars show standard deviations after 10 trials on 90\% of the training data (randomly selected).}}{8}} \newlabel{fig:sorted}{{3}{8}}
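For reference, the entropy and InfoGain measures behind the labels eq:ent and eq:infogain above (and plotted in Figure 3) can be reconstructed as follows. This is a sketch assuming the standard Shannon formulation used in defect-prediction work such as \cite {me07b}; Equations 1 and 2 of the main text are authoritative.
\[
  H(C) \;=\; -\sum_{c \in C} p(c)\,\log_2 p(c)
\]
\[
  \mathit{InfoGain}(A) \;=\; H(C) \;-\; \sum_{a \in A} p(a)\,H(C \mid a)
\]
Here $C$ is the class attribute (defective vs. defect-free) and $a$ ranges over the values of static code feature $A$; features with higher InfoGain are more informative about defects.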
\citation{shepperd94} \@writefile{lot}{\contentsline {table}{\numberline {4}{\ignorespaces Static code features shared by the NASA and SOFTLAB projects.}}{9}} \newlabel{tbl:attr2}{{4}{9}} \citation{me07b} \citation{lessmann09} \citation{duda76} \citation{domingos97optimality} \citation{domingos97optimality} \citation{yang02} \@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Naive Bayes Classifier}{10}} \newlabel{eq:nb}{{3}{10}} \citation{yang02} \citation{witten05} \citation{me07b} \citation{me07b,me07e} \newlabel{eq:b}{{4}{11}} \newlabel{eq:laplace}{{5}{11}} \newlabel{eq:normal}{{6}{11}} \citation{me07b,me07e} \citation{mann47} \citation{demsar06} \@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Performance Evaluation}{12}} \newlabel{eq:bal}{{9}{12}} \@writefile{toc}{\contentsline {section}{\numberline {4}Analysis \#1: Are CC data ever useful for organizations?}{12}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Design}{12}} \@writefile{lot}{\contentsline {table}{\numberline {5}{\ignorespaces Pseudocode for Analysis 1}}{13}} \newlabel{tbl:exp1}{{5}{13}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Results from Analysis \#1}{13}} \citation{koru08} \@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Analysis \#1 results averaged over seven NASA tables. Numeric results on the left; quartile charts on the right. ``Q1'' and ``Q3'' denote the 25th and 75th percentiles, respectively. The upper quartile of the first row is not visible since it runs from 100\% to 100\%; i.e., it has zero length.}}{14}} \newlabel{fig:ccwsnum}{{4}{14}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.3}Checking the Analysis \#1 Results}{14}} \citation{hayes06} \@writefile{lot}{\contentsline {table}{\numberline {6}{\ignorespaces Summary of Mann-Whitney U-test results (95\% confidence) for moving from WC to CC. For all projects' results, see\nobreakspace {}Figure\nobreakspace {}5\hbox {}.}}{15}} \newlabel{tbl:ccwc}{{6}{15}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.4}Discussion of Analysis \#1}{15}} \@writefile{toc}{\contentsline {section}{\numberline {5}Analysis \#2: How can companies filter CC data for local tuning?}{15}} \@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Design}{15}} \@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces Project-wise Analysis \#1 results for the NASA projects.}}{16}} \newlabel{fig:ccwsprojs}{{5}{16}} \citation{baker07} \@writefile{lot}{\contentsline {table}{\numberline {7}{\ignorespaces Pseudocode for Analysis 2}}{17}} \newlabel{tbl:exp2}{{7}{17}} \@writefile{toc}{\contentsline {subsubsection}{\numberline {5.1.1}Nearest Neighbor (NN) Filtering}{17}} \@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces Analysis \#2 PD results where $NN_{pd} \ge WC_{pd}$. Rankings computed via Mann-Whitney (95\% confidence), comparing each row to all other rows.}}{18}} \newlabel{fig:pd1}{{6}{18}} \@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Results}{18}} \@writefile{lof}{\contentsline {figure}{\numberline {7}{\ignorespaces Analysis \#2 PD results where $NN_{pd} < WC_{pd}$.}}{19}} \newlabel{fig:pd2}{{7}{19}}
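A hedged reconstruction of the formulas behind the labels eq:nb, eq:normal, and eq:bal above, assuming the standard Naive Bayes formulation of \cite {duda76} and the pd/pf-based balance measure of \cite {me07b}; the main text's Equations 3 through 9 are authoritative.
\[
  P(H \mid E) \;=\; \frac{P(H)}{P(E)} \prod_i P(E_i \mid H)
  \qquad\text{with Gaussian likelihoods}\qquad
  f(x) \;=\; \frac{1}{\sigma\sqrt{2\pi}}\,e^{-\frac{(x-\mu)^2}{2\sigma^2}}
\]
\[
  \mathit{balance} \;=\; 1 \;-\; \frac{\sqrt{(0-\mathit{pf})^2 + (1-\mathit{pd})^2}}{\sqrt{2}}
\]
Balance rewards predictors that land near the ideal point of $\mathit{pd}=1$, $\mathit{pf}=0$.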
\@writefile{lof}{\contentsline {figure}{\numberline {8}{\ignorespaces Analysis \#2 PF results where $NN_{pf} \le WC_{pf}$.}}{19}} \newlabel{fig:pf1}{{8}{19}} \@writefile{lof}{\contentsline {figure}{\numberline {9}{\ignorespaces Analysis \#2 PF results where $NN_{pf} > WC_{pf}$.}}{19}} \newlabel{fig:pf2}{{9}{19}} \@writefile{toc}{\contentsline {subsection}{\numberline {5.3}Discussion of Analysis \#2}{20}} \@writefile{toc}{\contentsline {section}{\numberline {6}Analysis \#3: What is the smallest amount of local data needed for constructing a model?}{20}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.1}Design}{20}} \@writefile{lof}{\contentsline {figure}{\numberline {10}{\ignorespaces Results from Analysis \#3. Training set size grows in units of 100 examples, moving left to right over the x-axis. The MC2 results only appear at the maximum x-value since MC2 has fewer than 200 examples.}}{21}} \newlabel{fig:inc}{{10}{21}} \@writefile{lot}{\contentsline {table}{\numberline {8}{\ignorespaces Pseudocode for Analysis 3}}{21}} \newlabel{tbl:exp3}{{8}{21}} \citation{me08d} \citation{orrego04} \citation{quinlan92} \citation{quinlan92b} \citation{Blake+Merz:1998} \citation{orrego04} \citation{quinlan92} \citation{quinlan92b} \citation{Blake+Merz:1998} \@writefile{lof}{\contentsline {figure}{\numberline {11}{\ignorespaces The y-axis shows the plateau point after learning from data sets with up to $X$ examples (from\nobreakspace {}\cite {orrego04}). The left plot shows results from using Naive Bayes (nbk) or a decision tree learner (j48)\nobreakspace {}\cite {quinlan92} to predict discrete classes. The right plot shows results from using linear regression (lsr) or model trees (m5)\nobreakspace {}\cite {quinlan92b} to learn predictors for continuous classes. In this study, data sets were drawn from the UC Irvine data repository\nobreakspace {}\cite {Blake+Merz:1998}.}}{22}} \newlabel{fig:plateau}{{11}{22}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.2}Results from Analysis \#3}{22}} \citation{john95} \citation{orrego04} \citation{boehm00a} \@writefile{lot}{\contentsline {table}{\numberline {9}{\ignorespaces An estimate of the effort required to build and test 100 modules.}}{23}} \newlabel{tbl:estimate}{{9}{23}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.3}Checking the Analysis \#3 Results}{23}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.4}Discussion of Analysis \#3}{23}} \@writefile{toc}{\contentsline {section}{\numberline {7}Replication: Can our results be generalized?}{23}} \newlabel{sec:exp4}{{7}{23}} \@writefile{lof}{\contentsline {figure}{\numberline {12}{\ignorespaces Analysis \#1 results for the SOFTLAB projects. Both overall and individual project results are shown.}}{24}} \newlabel{fig:ccwcturkey}{{12}{24}} \@writefile{lof}{\contentsline {figure}{\numberline {13}{\ignorespaces Analysis \#2 PD results for the SOFTLAB projects where $NN_{pd} \ge WC_{pd}$. Rankings computed via Mann-Whitney (95\% confidence), comparing each row to all other rows.}}{25}} \newlabel{fig:arpd1}{{13}{25}} \@writefile{lof}{\contentsline {figure}{\numberline {14}{\ignorespaces Analysis \#2 PF results for the SOFTLAB projects where $NN_{pf} \le WC_{pf}$.}}{25}} \newlabel{fig:arpf1}{{14}{25}} \@writefile{lof}{\contentsline {figure}{\numberline {15}{\ignorespaces Analysis \#2 PF results for the SOFTLAB projects where $NN_{pf} > WC_{pf}$.}}{25}} \newlabel{fig:arpf2}{{15}{25}}
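A minimal illustrative sketch of the nearest-neighbor (NN) filtering named in Section 5.1.1: for each within-company (WC) instance, keep the $k$ nearest cross-company (CC) instances by Euclidean distance over the shared static code features of Table 4, then train only on the retained CC rows. The helper name \texttt{nn\_filter}, the choice $k=10$, and the NumPy data layout are assumptions for illustration; Table 7 gives the authoritative pseudocode.
\begin{verbatim}
import numpy as np

def nn_filter(cc_X, wc_X, k=10):
    # For every within-company (WC) row, find the k nearest
    # cross-company (CC) rows by Euclidean distance, then return
    # the union of their indices (duplicates collapsed).
    keep = set()
    for wc_row in wc_X:
        dists = np.linalg.norm(cc_X - wc_row, axis=1)
        keep.update(np.argsort(dists)[:k].tolist())
    return sorted(keep)

# Usage: train the defect predictor on cc_X[nn_filter(cc_X, wc_X)]
# rather than on all CC rows.
\end{verbatim}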
\citation{lessmann09} \citation{shepperd97} \citation{khoshgoftaar03} \citation{moser08} \citation{shu02} \citation{fagan86} \citation{shull00a} \citation{fagan76} \@writefile{toc}{\contentsline {section}{\numberline {8}Related Work}{26}} \@writefile{toc}{\contentsline {subsection}{\numberline {8.1}Neighborhood Reasoning}{26}} \@writefile{toc}{\contentsline {subsection}{\numberline {8.2}On Defect Prediction Using Static Code Features}{26}} \newlabel{sec:dp}{{8.2}{26}} \citation{moser08} \citation{conf/icse/NagappanB05a} \citation{me02f} \citation{halstead77,mccabe76,chapman02,me04g,polyspace,hall00,nikora03,conf/icse/NagappanB05a,Khoshgoftaar:2004p1877,conf/ictai/TangK04,porter90,tiang95,srinivasan95} \citation{rakitin01} \citation{Blake+Merz:1998} \citation{Blake+Merz:1998} \citation{fenton96} \citation{fenton96} \citation{shepperd94} \@writefile{lot}{\contentsline {table}{\numberline {10}{\ignorespaces Some representative $pd$s and $pf$s for prediction problems from the UC Irvine machine learning database\nobreakspace {}\cite {Blake+Merz:1998}. These values were generated using the standard settings of a state-of-the-art decision tree learner (J48). For each data set, ten experiments were conducted in which a decision tree was learned on 90\% of the data, then tested on the remaining 10\%. The numbers shown here are the average results across those ten experiments.}}{27}} \newlabel{tbl:uci}{{10}{27}} \citation{Blake+Merz:1998} \citation{arisholm06} \citation{brooks95} \@writefile{toc}{\contentsline {section}{\numberline {9}Practical Implications}{28}} \citation{kitch07} \citation{me07b} \citation{lessmann09} \citation{lessmann09} \citation{lessmann09} \citation{graves00} \citation{me07b} \citation{basili02} \@writefile{lof}{\contentsline {figure}{\numberline {16}{\ignorespaces The x-axis shows the rank of the data miners listed on the y-axis. The methods whose top ranks span 4 to 12 are not statistically significantly different. From\nobreakspace {}\cite {lessmann09}.}}{30}} \newlabel{fig:lessmann}{{16}{30}}
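The repeated 90\%:10\% holdout described in the Figure 1 and Table 10 captions above can be sketched as follows. The definitions $pd = TP/(TP+FN)$ and $pf = FP/(FP+TN)$ are the usual ones; the \texttt{learner} callable and the \texttt{(features, label)} row layout are illustrative assumptions, not the paper's code.
\begin{verbatim}
import random

def pd_pf(actual, predicted):
    # pd = probability of detection, pf = probability of false alarm,
    # computed from parallel lists of 0/1 defect labels.
    tp = sum(1 for a, p in zip(actual, predicted) if a and p)
    fn = sum(1 for a, p in zip(actual, predicted) if a and not p)
    fp = sum(1 for a, p in zip(actual, predicted) if not a and p)
    tn = sum(1 for a, p in zip(actual, predicted) if not a and not p)
    return tp / max(tp + fn, 1), fp / max(fp + tn, 1)

def holdout(data, learner, repeats=10, train_frac=0.9):
    # Repeat: shuffle the rows; train on 90%; test on the rest.
    results = []
    for _ in range(repeats):
        rows = list(data)
        random.shuffle(rows)
        cut = int(train_frac * len(rows))
        model = learner(rows[:cut])          # returns a predict function
        actual = [y for _, y in rows[cut:]]
        predicted = [model(x) for x, _ in rows[cut:]]
        results.append(pd_pf(actual, predicted))
    return results                           # list of (pd, pf) pairs
\end{verbatim}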
\@writefile{toc}{\contentsline {section}{\numberline {10}Threats to validity}{30}} \@writefile{toc}{\contentsline {section}{\numberline {11}Conclusion}{31}} \bibcite{me07c}{1} \bibcite{Arisholm:2006p516}{2} \bibcite{Bell:2006p517}{3} \bibcite{Ostrand:2007p519}{4} \bibcite{me07e}{5} \bibcite{chen05}{6} \bibcite{dekhtyar04}{7} \bibcite{jiang07}{8} \bibcite{nagappan05}{9} \bibcite{marcus08}{10} \bibcite{musa87}{11} \bibcite{boehm88}{12} \bibcite{goseva07}{13} \bibcite{me07b}{14} \bibcite{dou95}{15} \bibcite{shepperd94}{16} \bibcite{lessmann09}{17} \bibcite{duda76}{18} \bibcite{domingos97optimality}{19} \bibcite{yang02}{20} \bibcite{witten05}{21} \bibcite{mann47}{22} \bibcite{demsar06}{23} \bibcite{koru08}{24} \bibcite{hayes06}{25} \bibcite{baker07}{26} \bibcite{me08d}{27} \bibcite{orrego04}{28} \bibcite{quinlan92}{29} \bibcite{quinlan92b}{30} \bibcite{Blake+Merz:1998}{31} \bibcite{john95}{32} \bibcite{boehm00a}{33} \bibcite{shepperd97}{34} \bibcite{khoshgoftaar03}{35} \bibcite{moser08}{36} \bibcite{shu02}{37} \bibcite{fagan86}{38} \bibcite{shull00a}{39} \bibcite{fagan76}{40} \bibcite{conf/icse/NagappanB05a}{41} \bibcite{me02f}{42} \bibcite{halstead77}{43} \bibcite{mccabe76}{44} \bibcite{chapman02}{45} \bibcite{me04g}{46} \bibcite{polyspace}{47} \bibcite{hall00}{48} \bibcite{nikora03}{49} \bibcite{Khoshgoftaar:2004p1877}{50} \bibcite{conf/ictai/TangK04}{51} \bibcite{porter90}{52} \bibcite{tiang95}{53} \bibcite{srinivasan95}{54} \bibcite{rakitin01}{55} \bibcite{fenton96}{56} \bibcite{arisholm06}{57} \bibcite{brooks95}{58} \bibcite{kitch07}{59} \bibcite{graves00}{60} \bibcite{basili02}{61}