Getting the Most Out of Ensemble Selection


Rich Caruana, Art Munson, Alexandru Niculescu-Mizil
Department of Computer Science
Cornell University
Technical Report 2006-2045
{caruana, mmunson, alexn}@cs.cornell.edu
Abstract

We investigate four previously unexplored aspects of ensemble selection, a procedure for building ensembles of classifiers. First we test whether adjusting model predictions to put them on a canonical scale makes the ensembles more effective. Second, we explore the performance of ensemble selection when different amounts of data are available for ensemble hillclimbing. Third, we quantify the benefit of ensemble selection's ability to optimize to arbitrary metrics. Fourth, we study the performance impact of pruning the number of models available for ensemble selection. Based on our results we present improved ensemble selection methods that double the benefit of the original method.

1 Introduction

An ensemble is a collection of classifiers whose predictions are combined with the goal of achieving better performance than the constituent classifiers. A large body of research now exists showing that ensemble learning often increases performance (e.g. bagging [3], boosting [21], stacking [25]). Recently, ensemble selection [7] was proposed as a technique for building ensembles from large collections of diverse classifiers. Ensemble selection employs greedy forward selection to select models to add to the ensemble, a method categorized in the literature as overproduce and choose [20]. Compared to previous work, ensemble selection uses many more classifiers, allows optimizing to arbitrary performance metrics, and includes refinements to prevent overfitting to the ensemble's training data, a larger problem when selecting from more classifiers.

In this paper we analyze four previously unexplored aspects of ensemble selection. First, we evaluate ensemble selection's performance when all the models are calibrated to place their predictions on a canonical scale. Making calibrated models available to ensemble selection provides significant improvement on probability measures such as squared error and cross-entropy. It appears, however, that calibration does not make ensemble selection itself more effective; most of the benefit results from improvements in the base-level models and not from better ensemble building.

Second, we explore how ensemble selection behaves with varying amounts of training data available for the critical forward selection step. Despite previous refinements to avoid overfitting the data used for ensemble hillclimbing [7], our experiments show that ensemble selection is still prone to overfitting when the hillclimb set is small. This is especially true if their model bagging procedure is not used. Surprisingly, although ensemble selection overfits with small data, reliably picking a single good model is even harder, making ensemble selection more valuable. With enough hillclimbing data (around 5k points), overfitting becomes negligible. Motivated by these results, we present a method for embedding cross-validation inside ensemble selection to maximize the amount of hillclimbing data.1 Cross-validation boosts the performance of ensemble selection, doubling its previously reported benefit. While adding cross-validation to ensemble selection is computationally expensive, it is valuable for domains that require the best possible performance, and for domains in which labeled data is scarce.

Ensemble selection's ability to optimize to any performance metric is an attractive capability of the method that is particularly useful in domains which use non-traditional performance measures such as natural language processing [14]. Because of this, the third aspect we investigate is what benefit, if any, comes from being able to optimize to any metric. Our experiments reinforce the intuition that it is best to optimize to the target performance metric; however, they also show that minimizing squared error or cross-entropy frequently yields ensembles with competitive performance, seemingly regardless of the metric.

Fourth, we test ensemble selection's performance when only the best X% of models are available for selection. These experiments confirm our intuition that the potential for overfitting increases with more models. Using only the top 10-20% of the models yields performance better than or equivalent to ensemble selection without this model pruning.

This technical report is an expanded version of a paper accepted at the 2006 International Conference on Data Mining.

1 This is different from wrapping cross-validation around ensemble selection, which would not increase the data available for hillclimbing.

2 Background

In this section we briefly review the ensemble selection procedure first proposed by Caruana et al. [7]. Ensemble selection is an overproduce and select ensemble method carried to an extreme where thousands of models are trained using many different learning methods, and overfitting is moderated by applying several techniques.

In ensemble selection, models are trained using as many learning methods and control parameters as can be applied to the problem. Little or no attempt is made to optimize the performance of the individual models; all models, no matter what their performance, are added to the model library for the problem. The expectation is that some of the models will yield good performance on the problem, either in isolation or in combination with other models, for any reasonable performance metric.

Once the model library is collected, an ensemble is built by selecting from the library the subset of models that yield the best performance on the target optimization metric. Models are selected for inclusion in the ensemble using greedy forward stepwise model selection. The performance of adding a potential model to the ensemble is estimated using a hillclimbing set containing data not used to train the models. At each step ensemble selection adds to the ensemble the model in the library that maximizes the performance of the ensemble on this held-aside hillclimbing data.

When there are thousands of models to select from, the chances of overfitting increase dramatically. Caruana et al. describe two methods to combat overfitting. The first controls how ensembles are initialized. The second performs model bagging (analogous to feature bagging [1, 5]) to reduce the variance of the selection process.

Ensemble Initialization: Instead of starting with an empty ensemble, Caruana et al. suggest initializing ensembles with the N models that have the best uni-model performance on the hillclimb set and performance metric.

Bagged Ensemble Selection: It is well known that feature subset selection (e.g. forward stepwise feature selection) is unstable when there are many relevant features [2]. Ensemble selection is like feature selection, where models are features and model subsets are found by forward stepwise selection. Because of this, ensemble selection also has high variance. Ensemble selection uses bagging over models to reduce this variance. Multiple ensembles are built from random subsets of the models, and then averaged together. This is analogous to the feature bagging methods proposed by Bay [1] and Bryll et al. [5] and used in random forests [4].

Another technique used in the original paper is allowing models to be added to the ensemble more than once. This provides two benefits. First, models added multiple times get more weight in the ensemble average. Second, when models are added without replacement, ensemble performance deteriorates quickly after the best models have been exhausted because poorer models must then be added. This makes deciding when to stop adding models to the ensemble critical because overshooting the optimal stopping point can yield much worse performance. Selection with replacement allows selection to continue adding copies of good models instead of being forced to add inferior models. This, in turn, makes deciding when to stop adding models far less critical. All of the experiments in this paper use these three techniques.
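Taken together, the procedure is simple enough to sketch in a few lines. The following Python sketch is our own illustration, not the authors' implementation: it assumes each library model is represented by a cached numpy array of its predictions on the hillclimb set, that metric(preds, y) returns a score to maximize, and that n_init, n_steps, n_bags, and bag_frac are illustrative placeholder values. For simplicity it runs a fixed number of forward steps; in practice one would keep the ensemble size that scored best on the hillclimb set.

    import random

    def ensemble_selection(predictions, y, metric,
                           n_init=5, n_steps=100, n_bags=20, bag_frac=0.5):
        """Greedy forward ensemble selection with sorted initialization,
        selection with replacement, and bagging over the model library."""
        names = list(predictions)          # predictions: name -> numpy array
        counts = dict.fromkeys(names, 0)   # how often each model is selected
        for _ in range(n_bags):
            # model bagging: run selection on a random subset of the library
            bag = random.sample(names, max(1, int(bag_frac * len(names))))
            # sorted initialization: seed the ensemble with the best single models
            bag.sort(key=lambda m: metric(predictions[m], y), reverse=True)
            chosen = bag[:n_init]
            total = sum(predictions[m] for m in chosen)  # running prediction sum
            for _ in range(n_steps):
                # selection WITH replacement: a model may be re-added, which
                # effectively raises its weight in the ensemble average
                best = max(bag, key=lambda m: metric(
                    (total + predictions[m]) / (len(chosen) + 1), y))
                total = total + predictions[best]
                chosen.append(best)
            for m in chosen:
                counts[m] += 1
        return counts

The final ensemble prediction on new data is then the counts-weighted average of the individual models' predictions.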

3 Methodology
We use all of the learning methods and data sets used by
Caruana et al. [7], and all of the performance metrics except
CAL (a probability calibration metric) and SAR (a metric
that combines accuracy, squared error, and ROC area). In
addition, we also train models with logistic regression (LOGREG), naive Bayes (NB), and random forests (RF) [4], and
experiment with four additional data sets: MG, CALHOUS,
COD, and BACT. All of the data sets are binary classification problems. The learning methods and data sets are
described in Appendix A and B, respectively. The performance metrics we study are described in the following
subsection.

3.1 Performance Metrics


The eight performance metrics we use can be divided
into three groups: threshold metrics, ordering/rank metrics
and probability metrics.
The threshold metrics are accuracy (ACC), F-score
(FSC) and lift (LFT). For thresholded metrics, it is not important how close a prediction is to a threshold, only if it is
above or below threshold. See Giudici [10] for a description
of Lift Curves. Usually ACC and FSC have a fixed threshold (we use 0.5). For lift, often a fixed percent, p, of cases
are predicted as positive and the rest as negative (we use
p = 25%).
The ordering/rank metrics depend only on the ordering
of the cases, not the actual predicted values. As long as
ordering is preserved, it makes no difference if predicted
values fall between 0 and 1 or 0.89 and 0.90. These metrics measure how well the positive cases are ordered before negative cases and can be viewed as a summary of model performance across all possible thresholds. The rank metrics we use are area under the ROC curve (ROC), average precision (APR), and precision/recall break even point (BEP). See Provost and Fawcett [19] for a discussion of ROC from a machine learning perspective.

The probability metrics are minimized (in expectation) when the predicted value for each case coincides with the true conditional probability of that case being in the positive class. The probability metrics are squared error (RMS) and cross-entropy (MXE).

Table 1. Performance with and without model calibration. The best score in each column is bolded.

                  ACC    FSC    LFT    ROC    APR    BEP    RMS    MXE    MEAN
ES-BOTH          0.920  0.888  0.967  0.982  0.972  0.964  0.932  0.944  0.946
ES-PREV          0.922  0.893  0.967  0.981  0.966  0.965  0.919  0.932  0.943
ES-NOCAL         0.919  0.897  0.967  0.982  0.970  0.965  0.912  0.925  0.942
ES-CAL           0.912  0.847  0.969  0.981  0.969  0.966  0.935  0.940  0.940
BAYESAVG-BOTH    0.893  0.814  0.964  0.978  0.963  0.956  0.918  0.934  0.928
BAYESAVG-CAL     0.889  0.820  0.962  0.977  0.960  0.955  0.912  0.925  0.925
MODSEL-BOTH      0.871  0.861  0.939  0.973  0.948  0.938  0.901  0.916  0.918
BAYESAVG-PREV    0.881  0.789  0.956  0.970  0.956  0.947  0.893  0.911  0.913
MODSEL-PREV      0.872  0.860  0.939  0.973  0.948  0.938  0.879  0.892  0.913
MODSEL-CAL       0.870  0.819  0.943  0.973  0.948  0.940  0.892  0.910  0.912
MODSEL-NOCAL     0.871  0.858  0.939  0.973  0.948  0.938  0.861  0.871  0.907
BAYESAVG-NOCAL   0.875  0.784  0.955  0.968  0.953  0.941  0.874  0.892  0.905
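For reference, these two metrics have their standard definitions (not spelled out in the paper): for predictions p_i in [0, 1] and labels y_i in {0, 1} over N cases,

    RMS = sqrt( (1/N) * sum_i (p_i - y_i)^2 )
    MXE = -(1/N) * sum_i [ y_i * log(p_i) + (1 - y_i) * log(1 - p_i) ]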

3.2 Comparing Across Performance Metrics

To permit averaging across metrics and problems, performances must be placed on comparable scales. Following Caruana et al. [7] we scale performance for each problem and metric from 0 to 1, where 0 is baseline performance and 1 is the best performance achieved by any model or ensemble. We use the following baseline model: predict p for every case, where p is the percent of positives in the data. One disadvantage of normalized scores is that recovering a raw performance requires knowing what performances define the top and bottom of the scale, and as new best models are found the top of the scale may change. Note that the normalized scores presented here differ from those reported in Caruana et al. [7] because we are finding better models that shift the top of the scales. The numbers defining the normalized scales can be found in Appendix C.
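As a concrete illustration of the rescaling (a sketch with made-up numbers, not values from Appendix C):

    def normalized_score(raw, baseline, best):
        # 0 = baseline performance, 1 = best observed performance for this
        # problem/metric; the top of the scale shifts if a better model appears
        return (raw - baseline) / (best - baseline)

    # e.g. baseline accuracy 0.75, best observed 0.95, model at 0.90:
    # normalized_score(0.90, 0.75, 0.95) -> 0.75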

4 Ensembles of Calibrated Models

Models trained by different learning algorithms do not necessarily speak the same language. A prediction of 0.14 from a neural net does not necessarily mean the same thing as a prediction of 0.14 from a boosted tree or SVM. Predictions from neural nets often are well-calibrated posterior probabilities, but predictions from SVMs are just normalized distances to the decision surface. Averaging predictions from models that are not on commensurate scales may hurt ensemble performance.

In this section we evaluate the performance of ensemble selection after translating all model predictions to the common language of well-calibrated posterior probabilities. Learning algorithms such as boosted trees and stumps, SVMs, or naive Bayes have poorly calibrated predictions [15]. A number of methods have been proposed for mapping predictions to posterior probabilities. In this paper we adopt the method Platt developed for SVMs [18], but which also works well for other learning algorithms [15]. Platt's method transforms predictions by passing them through a sigmoid whose parameters are learned on an independent calibration set. In this paper, the ensemble selection hillclimb set is used for calibration as well.
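A minimal sketch of the sigmoid fit (our illustration, assuming raw scores and 0/1 labels from the hillclimb set; Platt's actual method also smooths the 0/1 targets to reduce overfitting, which is omitted here, and this version requires scipy):

    import numpy as np
    from scipy.optimize import minimize

    def fit_platt_sigmoid(scores, labels):
        """Fit p(y=1|s) = 1 / (1 + exp(A*s + B)) by minimizing negative
        log-likelihood on the calibration (here, hillclimb) set."""
        def nll(params):
            A, B = params
            p = 1.0 / (1.0 + np.exp(A * scores + B))
            p = np.clip(p, 1e-12, 1.0 - 1e-12)   # guard the logs
            return -np.sum(labels * np.log(p) + (1 - labels) * np.log(1 - p))
        A, B = minimize(nll, x0=np.array([-1.0, 0.0])).x
        return lambda s: 1.0 / (1.0 + np.exp(A * s + B))

Each calibrated model then reports its raw outputs mapped through its fitted sigmoid.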

Table 1 shows the performance of ensemble selection (ES), model selection (MODSEL),2 and Bayesian model averaging (BAYESAVG) [8], with and without calibrated models. Results are shown for four different model libraries: 1) only uncalibrated models (NOCAL), 2) only calibrated models (CAL), 3) both calibrated and uncalibrated models (BOTH), and 4) only SVMs are calibrated, to mimic prior experiments [7] (PREV). Each entry is the average of five folds on each of the eleven problems. The last column shows the mean performance over all eight metrics. Rows are sorted by mean performance.

2 Model selection chooses the best single model using the hillclimb set.

Comparing results for ensemble selection with and without calibration (ES-CAL and ES-NOCAL), we see that calibrating models improves RMS and MXE (significant at .05) but hurts FSC. There is little difference for LFT, ROC, APR and BEP. For model selection we see the same trends: calibrated models yield better RMS and MXE and worse FSC. The magnitudes of the differences suggest that most if not all of the improvement in RMS and MXE for ensemble selection with calibrated models is due to having better models in the library rather than from ensemble selection taking advantage of the common scale of the calibrated models. We are not sure why calibration makes FSC performance worse for both MODSEL and ES, but again suspect that the differences between ES-CAL and ES-NOCAL are due to differences in the performance of the base-level models.

Having both calibrated and uncalibrated models in the library (ES-BOTH and MODSEL-BOTH) gives the best of both worlds: it alleviates the problem with FSC while retaining the RMS and MXE improvements. For the rest of the experiments in this paper we use libraries containing both calibrated and uncalibrated models.

Unlike with ensemble selection, using calibrated models for Bayesian model averaging improves performance on all metrics, not just RMS and MXE (significant at .05). With calibrated models, Bayesian averaging outperforms model selection but is still not as good as ensemble selection.

5 Analysis of Training Size

The original ensemble selection paper demonstrated the method's effectiveness using relatively small hillclimbing sets containing 1000 data points. Since the data used for hillclimbing is data taken away from training the individual models, keeping the hillclimb set small is important. Smaller hillclimb sets, however, are easier to overfit to, particularly when there are many models from which to select.

To explore ensemble selection's sensitivity to the size of the hillclimb set, we ran ensemble selection with hillclimb sets containing 100, 250, 500, 1000, 2500, 5000, and 10000 data points. In each run we randomly selected the points for the hillclimb set and used the remainder for the test set. The hyperspectral and medis data sets contained too few points to leave sufficient test sets when using a 10K hillclimbing set and were omitted. Due to time constraints and the cost of generating the learning curves, we only used one random sample at each size and did not repeat the experiment.

Figure 1 shows learning curves for our eight performance measures and their mean. Each graph is an average over 9 problems. The x-axis uses a log scale to better show what happens with small hillclimbing sets. Normalized performance scores are plotted on the y-axis. For comparison, the graphs include the performance achieved by picking the single best model (MODSEL).

Unsurprisingly, the performance achieved with both ensemble selection and model selection using only 100 points for hillclimbing is quite bad. As data increases, both methods do better as they overfit less. Interestingly, ensemble selection is hurt less by a small hillclimbing set than model selection, suggesting that it is less prone to overfitting than model selection. Because of this, the benefit of ensemble selection over the best models appears to be strongest when training data is scarce, a regime [7] did not examine. (They used 5k training data with 1k points held aside for ensemble stepwise selection.) As the size of the hillclimbing sets goes from 1k to 10k, ensemble selection maintains its edge over model selection.

With small hillclimb sets, using bagging with ensemble selection is crucial to getting good performance; without it, mean performance using a 100 point hillclimb set drops from 0.888 to 0.817. In contrast, bagging provides very little if any benefit when a very large hillclimb set is used (more than 5000 points with our data sets).

6 Cross-Validated Ensemble Selection

It is clear from the results in Section 5 that the larger the hillclimb set, the better ensemble selection's performance will be. To maximize the amount of available data, we apply cross-validation to ensemble selection. Simply wrapping cross-validation around ensemble selection, however, will not help because the algorithm will still have just a fraction of the training data available for hillclimbing. Instead, we embed cross-validation within ensemble selection so that all of the training data can be used for the critical ensemble hillclimbing step. Conceptually, the procedure makes cross-validated models, then runs ensemble selection the usual way on a library of cross-validated base-level models.

A cross-validated model is created by training a model for each fold with the same model parameters. If there are 5 folds, there will be 5 individual models (each trained on 4000 points) that are siblings; these siblings should only differ based on variance due to their different training samples. To make a prediction for a test point, a cross-validated model simply averages the predictions made by each of the sibling models. The prediction for a training point (that subsequently will be used for ensemble hillclimbing), however, only comes from the individual model that did not see the point during training. In essence, the cross-validated model delegates the prediction responsibility for a point that will be used for hillclimbing to the one sibling model that is not biased for that point.

Selecting a cross-validated model, whether during model selection or ensemble selection, means choosing all of the sibling models as a unit. If 5-fold cross-validation is used, selection chooses groups containing 5 sibling models at a time. In this case, when selection adds a cross-validated model to a growing ensemble, it really adds 5 different models of the same model type to the ensemble, each of which receives the same weight in the ensemble average.
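The delegation logic just described can be made concrete with a small sketch (an assumed scikit-learn-style predict interface, not the authors' code):

    import numpy as np

    class CrossValidatedModel:
        def __init__(self, siblings, fold_of):
            self.siblings = siblings   # one fitted model per fold, same parameters
            self.fold_of = fold_of     # training-point index -> fold that held it out

        def predict_test(self, X):
            # test points: average the predictions of all sibling models
            return np.mean([m.predict(X) for m in self.siblings], axis=0)

        def predict_hillclimb(self, X_train):
            # training points reused for hillclimbing: each point is predicted
            # only by the sibling that never saw it during training
            return np.array([self.siblings[self.fold_of[i]].predict(X_train[i:i+1])[0]
                             for i in range(len(X_train))])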

[Figure 1: nine panels, one per metric (ACC, FSC, LFT, ROC, APR, BEP, RMS, MXE) plus MEAN, each plotting normalized performance (y-axis) against hillclimb data size from 100 to 10000 points on a log scale (x-axis), with curves for ensemble selection with bagging, ensemble selection without bagging, and model selection (modsel).]

Figure 1. Learning curves for ensemble selection with and without bagging, and for picking the best
single model (modsel).
Table 2. Performance with and without cross-validation for ensemble selection and model selection.

                  ACC    FSC    LFT    ROC    APR    BEP    RMS    MXE    MEAN
ES-BOTH-CV       0.935  0.926  0.982  0.996  0.992  0.977  0.984  0.989  0.973
MODSEL-BOTH-CV   0.907  0.923  0.971  0.985  0.968  0.963  0.945  0.961  0.953
ES-BOTH          0.920  0.888  0.967  0.982  0.972  0.964  0.932  0.944  0.946
MODSEL-BOTH      0.871  0.861  0.939  0.973  0.948  0.938  0.901  0.916  0.918
We ran ensemble selection with 5-fold cross-validation; this is analogous to normal ensemble selection with a 5000 point hillclimb set. Table 2 shows the results averaged over all the problems. Not only does cross-validation greatly improve ensemble selection performance, it also provides the same benefit to model selection. Five-fold cross-validated model selection actually outperforms non-cross-validated ensemble selection by a small but noticeable amount. However, ensemble selection with embedded cross-validation continues to outperform model selection.

Table 3 provides a different way to look at the results. The numbers in the table (except for the last row) are the percent reduction in loss of cross-validated ensemble selection, relative to non-cross-validated model selection, the baseline used in Caruana et al. [7]. For example, if model selection achieves a raw accuracy score of 90%, and cross-validated ensemble selection achieves 95% accuracy, then the percent reduction in loss is 50%: the loss has been reduced by half. The MEAN row is the average improvement for each metric, across datasets. For comparison, the PREV row is the performance of the original non-cross-validated ensemble selection method (i.e. no cross-validation and only SVMs are calibrated).

Embedding cross-validation within ensemble selection doubles its benefit over simple model selection (from 6.90% to 12.77%). This is somewhat of an unfair comparison; if a cross-validated model library is available, it is just as easy to do cross-validated model selection as it is to do cross-validated ensemble selection. The last row in Table 3 shows the percent loss reduction of cross-validated ensemble selection compared to cross-validated model selection.
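The quantity tabulated below is simply the relative shrinkage of loss (a small sketch restating the example above):

    def percent_loss_reduction(baseline_score, improved_score):
        # loss = 1 - score; e.g. accuracy 0.90 corresponds to loss 0.10
        base_loss = 1.0 - baseline_score
        new_loss = 1.0 - improved_score
        return 100.0 * (base_loss - new_loss) / base_loss

    # the example above: percent_loss_reduction(0.90, 0.95) -> 50.0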

Table 3. Percent loss reduction by dataset.

             ACC    FSC     LFT    ROC    APR    BEP    RMS    MXE   MEAN
ADULT        2.77   5.89    8.72   7.45   6.70   7.58   2.26   4.08   5.68
BACT         2.08   3.83   16.42   4.13   5.49   1.76   1.42   4.15   4.91
CALHOUS      7.95   9.49   48.00   8.69   8.81   6.15   7.17  12.74  13.63
COD          5.73   7.46   14.33   9.14  10.52   7.11   2.39   3.79   7.56
COVTYPE      6.68   7.26   12.35  11.34  14.99   7.64   7.80  12.92  10.12
HS          13.66  16.36   12.32  37.53  37.78  16.77  12.65  27.43  21.81
LETTER.p2   15.21  14.50  100.00  32.84  33.05  15.85  17.13  29.47  32.26
LETTER.p1   21.55  25.66    0.29  69.10  45.29  19.25  19.59  34.58  29.41
MEDIS        2.77  -0.05    2.08   6.33   7.28   4.62   1.40   2.70   3.39
MG           4.45   1.98    4.25  11.84  12.65   6.04   2.57   6.10   6.23
SLAC         2.49   3.27   13.65   6.92   9.62   2.73   1.66   3.33   5.46
MEAN         7.76   8.70   21.13  18.67  17.47   8.68   6.91  12.84  12.77
PREV         4.96   4.56   16.22   8.43   6.24   5.15   3.27   6.39   6.90
MEANcv       2.89   3.07   10.82   9.97   9.37   2.84   2.54   4.22   5.71

Comparing PREV and MEANcv, we see that after embedding cross-validation, ensemble selection provides slightly less benefit over model selection than un-cross-validated ensemble selection did over un-cross-validated model selection. While training five times as many models is computationally expensive, it may be useful for domains where the best possible performance is needed. Potentially more interesting, in domains where labeled data is scarce, cross-validated ensemble selection is attractive because a) it does not require sacrificing part of the training data for hillclimbing, b) it maximizes the size of the hillclimbing set (which Figure 1 shows is critical when hillclimb data is small), and c) training the cross-validated models is much more feasible with smaller training data.

Table 4. Performance of ensemble selection


when forced to optimize to one set metric.
ES-BOTH-CV
ES-BOTH

RMS
0.969
0.935

MXE
0.968
0.936

OPTMETRIC
0.973
0.946

significantly smaller.
The scatter plots in Figure 2 plot the performance of optimizing to RMS against the performance of optimizing to
OPTMETRIC, with one graph per target metric. Again, we
can see that ensemble selection performs somewhat better
with OPTMETRIC. Always optimizing RMS is frequently
very competitive, especially when performance gets close
to a normalized score of 1. This is why the benefit of direct
metric optimization is so small for cross-validated ensemble selection. These results suggest that optimizing RMS
(or MXE) may be a good alternative if the target metric is
too expensive to use for hillclimbing.

8 Model Library Pruning


Including a large number of base-level models, with a
wide variety of parameter settings, in the model library
helps ensure that at least some of the models will have
good performance regardless of the metric optimized. At
the same time, increasing the number of available models
also increases the risk of overfitting the hillclimb set. Moreover, some of the models have such poor performance that
they are unlikely to be useful for any metric one would want
to optimize. Eliminating these models should not hurt performance, and might help.
In this section we investigate ensemble selection's performance when employing varying levels of library pruning. The pruning works as follows: the models are sorted by their performance on the target metric (with respect to the hillclimb set), and only the top X% of the models are used for ensemble selection. Note that this pruning is different from work on ensemble pruning [12, 22, 23, 26, 13]. This is a pre-processing method, while ensemble pruning post-processes an existing ensemble.
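A minimal sketch of this pruning step, assuming each model's hillclimb-set predictions can be scored on the target metric (names and data structures are ours; the paper gives no code):

```python
def prune_library(models, score_fn, hillclimb_set, keep_fraction=0.2):
    """Keep only the top fraction of models, ranked by their hillclimb-set
    performance on the target metric, before running ensemble selection."""
    ranked = sorted(models,
                    key=lambda m: score_fn(m, hillclimb_set),
                    reverse=True)                  # best model first
    n_keep = max(1, int(len(ranked) * keep_fraction))
    return ranked[:n_keep]
```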

[Figure 2: scatter-plot panels for ACC, FSC, LFT, ROC, APR, BEP, RMS, MXE, and MEAN; numeric tick labels omitted here.]
Figure 2. Scatter plots of ensemble selection performance when RMS is optimized (x-axis) vs when
the target metric is optimized (y-axis). Points above the line indicate better performance by optimizing
to the target metric (e.g. accuracy) than when optimizing RMS. Each point represents a different data
set; circles are averages for a problem over 5 folds, and Xs are performances using cross-validation.
Each metric (and the mean across metrics) is plotted separately for clarity.

Figure 3 shows the effect of pruning for each performance metric, averaged across the 11 data sets and 5
folds using non-cross-validated ensemble selection with
and without bagging. For comparison, flat lines illustrate
the performance achieved by model selection (modsel) and
non-pruned ensemble selection (es-both). The legend is
shown in the ACC graph.
The figure clearly shows that pruning usually does not hurt ensemble selection performance, and often improves it. For ACC, LFT, and BEP, pruned ensemble selection (the line with boxes) seems to yield the same performance as non-pruned ensemble selection. For the other metrics, pruning yields superior performance. Indeed, when using more than 50% of the models, performance decreases. Interestingly, library pruning reduces the need for bagging, presumably by reducing the potential for overfitting.³

The graphs in Figure 3 show the average behavior across our 11 data sets. Ensemble selection's behavior under pruning may in fact vary when each data set is considered individually. Averaging across problems could hide different peak points. Figure 4 shows RMS performance for each of the problems.
Although performance starts to decline at different pruning levels for the different problems, it is clear that larger model libraries increase the risk of overfitting the hillclimb set. Using 100% of the models is never worthwhile. At best, using the full library can match the performance of using only a small subset. In the worst case, ensemble selection overfits. This is particularly evident for the COD data set, where model selection outperforms ensemble selection unless pruning is employed.
While further work is needed to develop good heuristics for automatically choosing an appropriate pruning level for a data set, simply using the top 10-20% of models seems to be a good rule of thumb. An open problem is finding a better pruning method. For example, taking into account model diversity (see for example [11, 17]) might find better pruned sets.

³ The bagging line at 100% does not always match the es-both line, even though these should be equivalent configurations. This is particularly evident for FSC, the highest variance metric. The sorting performed before pruning alters ensemble selection's model sampling, resulting in additional variance.

[Figure 3: panels for ACC, FSC, LFT, ROC, APR, BEP, RMS, MXE, and MEAN, each plotting normalized performance (y-axis) against % models retained (x-axis); legend: bagging, no bagging, es-both, modsel-both.]
Figure 3. Pruned ensemble selection performance.

9 Discussion

In this section we further analyze the benefit of embedding cross-validation within ensemble selection and also briefly describe other work we are doing to make ensemble selection models smaller and faster.

9.1 Benefits of Cross-Validation

The results in Section 6 show that embedding cross-validation within ensemble selection significantly increases the performance of ensemble selection. There are two factors that could explain this increase in performance. First, the bigger hillclimbing set could make selecting models to add to the ensemble more reliable and thus make overfitting harder. Second, averaging the predictions of the sibling models could provide a bagging-like effect that improves the performance of the base-level models. To tease apart the benefit due to each of these factors we perform two additional experiments.
In one experiment, we use the same hillclimbing set as cross-validated ensemble selection, but instead of averaging the predictions of the sibling models, we use only the predictions of one of the siblings. Using this procedure we construct five ensemble models, one for each fold, and report their mean performance. This provides a measure of the benefit due to the increase in the size of the hillclimb set (from cross-validation) while eliminating the bagging-like effect due to sibling model averaging.
In the other experiment, we use the smaller hillclimb sets used by un-cross-validated ensemble selection, but we do average the predictions of the sibling models. We again construct five ensemble models, one for each fold, and report their mean performance. This allows us to identify the performance increase due to the bagging-like effect of averaging the predictions of the sibling models.
Table 5 shows the results of these experiments. Entries in the table show the improvement provided by using a larger hillclimb set (ES-HILL) and by averaging the sibling models (ES-AVG) as a percentage of the total benefit of cross-validated ensemble selection. For example, looking at the ACC column, increasing the size of the hillclimb set from 1k to 5k yields a benefit equal to 32.9% of the total benefit provided by cross-validated ensemble selection, and averaging the sibling models yields a benefit equal to 80.5%.
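The two experiments thus differ only in which hillclimb set is used and in whether sibling predictions are averaged. A minimal sketch of the two prediction rules (the array layout and function names are our assumptions):

```python
import numpy as np

# preds: array of shape (n_folds, n_examples) holding the predictions of
# the five sibling models (same model type and parameters, one per fold).

def sibling_average(preds):
    """ES-AVG ingredient: bagging-like averaging of the sibling models."""
    return preds.mean(axis=0)

def single_sibling(preds, fold):
    """ES-HILL ingredient: one sibling only, isolating the effect of the
    larger hillclimb set."""
    return preds[fold]
```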


[Figure 4: panels for ADULT, BACT, CALHOUS, COD, COVTYPE, HS, LETTER.p1, LETTER.p2, MEDIS, MG, and SLAC (all RMS), each plotting normalized performance (y-axis) against % models retained (x-axis); legend: bagging, no bagging, es-both, modsel-both.]
Figure 4. RMS performance for pruned ensemble selection.



The third row in the table is the sum of the first two rows. If the sum is lower than 100%, the effects from ES-HILL and ES-AVG are super-additive, i.e. combining the two effects provides more benefit than the sum of the individual improvements. If the sum is higher than 100%, the two effects are sub-additive. For ACC, the sum is 113.4%, indicating that the effects of these two factors are sub-additive: the total performance is slightly less than would be expected if the factors were independent. Except for the high variance metrics, FSC and ACC, the sums are close to 100%, indicating that the two effects are nearly independent.
The learning curves in Figure 1 suggest that increasing the size of the hillclimb set from 1k to 5k would explain almost all of the benefit of cross-validation. These results, however, show that on average across the eight metrics the benefits from ES-HILL and ES-AVG are roughly equal. About half of the benefit from embedding cross-validation within ensemble selection appears to result from the increase in the size of the hillclimb set, and the other half appears to result from averaging the sibling models. Increasing the size of the hillclimb set via cross-validation (as opposed to having more data available for hillclimbing) provides less benefit in practice because there is a mismatch between the base-level models used to make predictions on the hillclimbing set and the sibling-averaged models that will be used in the ensemble. In other words, ensemble selection is hillclimbing using slightly different models than the ones it actually adds to the ensemble.

Table 5. Breakdown of improvement from cross-validation.

            ACC      FSC      LFT      ROC      APR      BEP      RMS      MXE      MEAN
ES-HILL    32.9%    37.2%    48.0%    38.8%    40.8%    19.4%    55.1%    56.7%    41.1%
ES-AVG     80.5%    13.6%    54.0%    59.0%    55.7%    77.4%    46.8%    51.8%    54.9%
SUM       113.4%    50.8%   102.0%    97.8%    96.5%    96.8%   101.9%   108.5%    96.0%


9.2 Model Compression


While very accurate, the ensembles built by ensemble selection are exceptionally complex. On average, storing the
learned ensemble requires 550 MB, and classifying a single
test case takes about 0.5 seconds. This prohibits their use
in applications where storage space is at a premium (e.g.
PDAs), where test sets are large (e.g. Google), or where
computational power is limited (e.g. hearing aids). In a separate paper we address these issues by using a model compression [6] method to obtain models that perform as well
as the ensembles built by ensemble selection, but which are
faster and more compact.
The main idea behind model compression is to train a
fast and compact model to approximate the function learned
by a slow, large, but high-performing model. Unlike the true function, which is unknown, the function learned by the high-performing model is available and can be used to label large
amounts of synthetic data. A fast, compact and expressive
model trained on enough synthetic data will not overfit and
will closely approximate the function learned by the original model. This allows a slow, complex model such as a
massive ensemble to be compressed into a fast, compact
model with little loss in performance.
In the model compression paper, we use neural networks
to compress ensembles produced by ensemble selection. On
average the compressed models retain more than 90% of the
improvement provided by ensemble selection (over model
selection), while being more than 1000 times smaller and
1000 times faster.
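As a rough illustration of the compression recipe, the sketch below uses scikit-learn's MLPRegressor as the student; this is our substitution (the paper's experiments use its own neural-net code), and the synthetic-data generator is left abstract:

```python
from sklearn.neural_network import MLPRegressor

def compress(ensemble_predict, sample_synthetic, n_synth=100_000):
    """Train a small, fast student model to mimic a large ensemble.

    ensemble_predict: maps an (n, d) array to the ensemble's predicted scores.
    sample_synthetic: returns an (n, d) array of unlabeled synthetic points
                      (e.g. perturbations of the training data).
    """
    X = sample_synthetic(n_synth)
    y = ensemble_predict(X)               # the ensemble labels the synthetic data
    student = MLPRegressor(hidden_layer_sizes=(128,))
    student.fit(X, y)                     # the student fits the ensemble's function
    return student
```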

10 Conclusions

Embedding cross-validation inside ensemble selection greatly increases its performance. Half of this benefit is due to having more data for hillclimbing; the other half is due to a bagging effect that results from the way cross-validation is embedded within ensemble selection. Unsurprisingly, reducing the amount of hillclimbing data hurts performance because ensemble selection can overfit this data more easily. In comparison to model selection, however, ensemble selection seems much more resistant to overfitting when data is scarce. Further experiments varying the amount of training data provided to the base-level models are needed to see if ensemble selection is truly able to outperform model selection by such a significant amount on small data sets.
Counter to our and others' intuition [9], calibrating models to put all predictions on the same scale before averaging them did not improve ensemble selection's effectiveness. Most of calibration's improvement comes from the superior base-level models.
Our experiments show that directly optimizing to a target metric is better than always optimizing to some predetermined metric. That said, always optimizing to RMS or MXE was surprisingly competitive. These metrics may be good optimization proxies if the target metric is too expensive to compute repeatedly during hillclimbing.
Finally, pruning the number of available models reduces the risk of overfitting during hillclimbing while also yielding faster ensemble building. In our experiments pruning rarely hurt performance and frequently improved it.

Acknowledgments

We thank Lars Backstrom for help with exploring alternative model calibration methods and the anonymous reviewers for helpful comments on paper drafts. This work was supported by NSF Award 0412930.

References

[1] S. D. Bay. Combining nearest neighbor classifiers through multiple feature subsets. In ICML, pages 37-45, 1998.
[2] L. Breiman. Heuristics of instability in model selection. Technical report, Statistics Department, University of California at Berkeley, 1994.
[3] L. Breiman. Bagging predictors. Machine Learning, 24(2):123-140, 1996.
[4] L. Breiman. Random forests. Machine Learning, 45(1):5-32, 2001.
[5] R. K. Bryll, R. Gutierrez-Osuna, and F. K. H. Quek. Attribute bagging: Improving accuracy of classifier ensembles by using random feature subsets. Pattern Recognition, 36(6):1291-1302, 2003.


[6] C. Bucila, R. Caruana, and A. Niculescu-Mizil. Model compression: Making big, slow models practical. In Proc. of the 12th International Conf. on Knowledge Discovery and Data Mining (KDD'06), 2006.
[7] R. Caruana, A. Niculescu-Mizil, G. Crew, and A. Ksikes. Ensemble selection from libraries of models. In ICML, 2004.
[8] P. Domingos. Bayesian averaging of classifiers and the overfitting problem. In ICML, pages 223-230. Morgan Kaufmann, San Francisco, CA, 2000.
[9] R. P. W. Duin. The combining classifier: To train or not to train? In ICPR (2), pages 765-770, 2002.
[10] P. Giudici. Applied Data Mining. John Wiley and Sons, New York, 2003.
[11] L. I. Kuncheva and C. J. Whitaker. Measures of diversity in classifier ensembles and their relationship with the ensemble accuracy. Machine Learning, 51(2):181-207, 2003.
[12] D. D. Margineantu and T. G. Dietterich. Pruning adaptive boosting. In ICML, pages 211-218. Morgan Kaufmann, 1997.
[13] G. Martínez-Muñoz and A. Suárez. Pruning in ordered bagging ensembles. In ICML, pages 609-616, New York, NY, USA, 2006. ACM Press.
[14] A. Munson, C. Cardie, and R. Caruana. Optimizing to arbitrary NLP metrics using ensemble selection. In HLT-EMNLP, pages 539-546, 2005.
[15] A. Niculescu-Mizil and R. Caruana. Predicting good probabilities with supervised learning. In ICML'05, 2005.
[16] C. Perlich, F. Provost, and J. S. Simonoff. Tree induction vs. logistic regression: A learning-curve analysis. J. Mach. Learn. Res., 4:211-255, 2003.
[17] A. H. Peterson and T. R. Martinez. Estimating the potential for combining learning models. In Proc. of the ICML Workshop on Meta-Learning, pages 68-75, 2005.
[18] J. Platt. Probabilistic outputs for support vector machines and comparison to regularized likelihood methods. In Adv. in Large Margin Classifiers, 1999.
[19] F. J. Provost and T. Fawcett. Analysis and visualization of classifier performance: Comparison under imprecise class and cost distributions. In Knowledge Discovery and Data Mining, pages 43-48, 1997.
[20] F. Roli, G. Giacinto, and G. Vernazza. Methods for designing multiple classifier systems. In Multiple Classifier Systems, pages 78-87, 2001.
[21] R. Schapire. The boosting approach to machine learning: An overview. In MSRI Workshop on Nonlinear Estimation and Classification, 2001.
[22] W. N. Street and Y.-H. Kim. A streaming ensemble algorithm (SEA) for large-scale classification. In KDD, pages 377-382, 2001.
[23] G. Tsoumakas, L. Angelis, and I. Vlahavas. Selective fusion of heterogeneous classifiers. Intelligent Data Analysis, 9(6):511-525, 2005.
[24] I. H. Witten and E. Frank. Data Mining: Practical Machine Learning Tools and Techniques. Morgan Kaufmann, San Francisco, second edition, 2005.
[25] D. H. Wolpert. Stacked generalization. Neural Networks, 5:241-259, 1992.
[26] Y. Zhang, S. Burer, and W. N. Street. Ensemble pruning via semi-definite programming. Journal of Machine Learning Research, 7:1315-1338, 2006.

A Learning Methods

In addition to the learning methods used by Caruana et al. [7] (decision trees, bagged trees, boosted trees and stumps, KNN, neural nets, and SVMs), we use three more model types: logistic regression, naïve Bayes, and random forests. These are trained as follows:
Logistic Regression (LOGREG): we train both unregularized and regularized models, varying the ridge parameter by factors of 10 from 10^-8 to 10^4. Attributes are scaled to mean 0 and standard deviation 1.
Random Forests (RF): we use the Weka implementation [24]. The forests have 1024 trees, and the size of the feature set to consider at each split is 1, 2, 4, 6, 8, 12, 16, or 20.
Naïve Bayes (NB): we use the Weka implementation and try all three of the Weka options for handling continuous attributes: modeling them as a single normal, modeling them with kernel estimation, or discretizing them using supervised discretization.
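For readers who want to approximate these grids outside Weka, a rough scikit-learn translation is sketched below; scikit-learn is our substitution, the mapping of Weka's options is approximate, and LogisticRegression(penalty=None) requires scikit-learn 1.2+:

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Ridge (L2) strength varied by factors of 10; C is the inverse penalty,
# so ridge 1e-8 .. 1e4 becomes C = 1e8 .. 1e-4.
logregs = [LogisticRegression(penalty=None)] + \
          [LogisticRegression(C=1.0 / 10**k) for k in range(-8, 5)]

# 1024-tree forests, varying the feature-subset size tried at each split.
forests = [RandomForestClassifier(n_estimators=1024, max_features=f)
           for f in (1, 2, 4, 6, 8, 12, 16, 20)]

# scikit-learn has no switch for Weka's kernel-estimation or supervised
# discretization options, so only the single-Gaussian variant appears here.
nb = GaussianNB()
```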
In total, around 2,500 models are trained for each data set. When calibrated models are included for ensemble selection, the number doubles to 5,000.

B Data Sets

We experiment with 11 binary classification problems. ADULT, COVTYPE, HS, LETTER.P1, LETTER.P2, MEDIS, and SLAC were used by Caruana et al. [7]. The four new data sets we use are BACT, COD, CALHOUS, and MG. COD, BACT, and CALHOUS are three of the datasets used in Perlich et al. [16]. MG is a medical data set. See Table 6 for characteristics of the 11 problems.
Table 6. Description of problems

PROBLEM     # ATTR   TRAIN   TEST    % POZ
ADULT       14/104   4000    35222   25%
BACT        11/170   4000    34262   69%
COD         15/60    4000    14000   50%
CALHOUS     9        4000    14640   52%
COVTYPE     54       4000    25000   36%
HS          200      4000    4366    24%
LETTER.P1   16       4000    14000   3%
LETTER.P2   16       4000    14000   53%
MEDIS       63       4000    8199    11%
MG          124      4000    12807   17%
SLAC        59       4000    25000   50%

C Performance Scales

Table 7 lists the performance numbers that determine the normalized scores. Each entry contains the baseline performance (bottom of the scale) and the best performance achieved by any model or ensemble (top of the scale).

Table 7. Scales used to compute normalized scores. Each entry shows bottom / top for the scale.

            ACC            FSC            LFT            ROC            APR            BEP            RMS            MXE
ADULT       0.752 / 0.859  0.398 / 0.705  1.000 / 2.842  0.500 / 0.915  0.248 / 0.808  0.248 / 0.708  0.432 / 0.312  0.808 / 0.442
BACT        0.692 / 0.780  0.818 / 0.855  1.000 / 1.345  0.500 / 0.794  0.692 / 0.891  0.692 / 0.824  0.462 / 0.398  0.891 / 0.697
CALHOUS     0.517 / 0.889  0.681 / 0.893  1.000 / 1.941  0.500 / 0.959  0.517 / 0.964  0.517 / 0.895  0.500 / 0.283  0.999 / 0.380
COD         0.501 / 0.784  0.666 / 0.796  1.000 / 1.808  0.500 / 0.866  0.499 / 0.864  0.499 / 0.782  0.500 / 0.387  1.000 / 0.663
COVTYPE     0.639 / 0.859  0.531 / 0.804  1.000 / 2.487  0.500 / 0.926  0.362 / 0.879  0.361 / 0.805  0.480 / 0.320  0.944 / 0.478
HS          0.759 / 0.949  0.389 / 0.894  1.000 / 3.656  0.500 / 0.985  0.243 / 0.962  0.241 / 0.898  0.428 / 0.198  0.797 / 0.195
LETTER.p1   0.965 / 0.994  0.067 / 0.917  1.000 / 4.001  0.500 / 0.999  0.036 / 0.975  0.035 / 0.917  0.184 / 0.067  0.219 / 0.025
LETTER.p2   0.533 / 0.968  0.696 / 0.970  1.000 / 1.887  0.500 / 0.996  0.534 / 0.997  0.533 / 0.970  0.499 / 0.157  0.997 / 0.125
MEDIS       0.893 / 0.905  0.193 / 0.447  1.000 / 2.917  0.500 / 0.853  0.108 / 0.462  0.107 / 0.469  0.309 / 0.272  0.491 / 0.365
MG          0.831 / 0.900  0.290 / 0.663  1.000 / 3.210  0.500 / 0.911  0.170 / 0.740  0.169 / 0.686  0.375 / 0.278  0.656 / 0.373
SLAC        0.501 / 0.726  0.667 / 0.751  1.000 / 1.727  0.500 / 0.813  0.501 / 0.816  0.501 / 0.727  0.500 / 0.420  1.000 / 0.755
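The normalized score implied by these scales is a linear rescaling from baseline to best; a minimal sketch (the function name is ours):

```python
def normalized_score(perf, bottom, top):
    """Map raw performance to [0, 1]: 0 = baseline, 1 = best observed.
    Also works for loss metrics such as RMS, where bottom > top."""
    return (perf - bottom) / (top - bottom)

# Example on Table 7's ADULT/ACC scale (baseline 0.752, best 0.859):
print(normalized_score(0.806, 0.752, 0.859))  # ~0.50
```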


