Model Selection in Gaussian process regression

Broadly, model selection in Gaussian process regression can be categorized into two tasks: one is the discrete selection of a proper kernel function, and the other is the continuous optimization of the 'hyper-parameters' of the selected kernel function. In this post, we will focus on the latter.

The hyper-parameters can be optimized by maximizing either the marginal likelihood or the leave-one-out cross-validation (LOO-CV) likelihood.

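1. Marginal Likelihood and its derivative with respect to a hyperparameter

With mean-subtracted training outputs $\mathbf{y} \in \mathbb{R}^{n}$ and kernel matrix $K$ (including the noise term), the log marginal likelihood is

$$\log p(\mathbf{y} \mid X, \theta) = -\tfrac{1}{2}\,\mathbf{y}^{\top} K^{-1} \mathbf{y} \;-\; \tfrac{1}{2}\log\lvert K\rvert \;-\; \tfrac{n}{2}\log 2\pi ,$$

and its derivative with respect to a hyperparameter $\theta_j$ is

$$\frac{\partial}{\partial \theta_j}\log p(\mathbf{y} \mid X, \theta) = \tfrac{1}{2}\,\mathrm{tr}\!\left( \left( \boldsymbol{\alpha}\boldsymbol{\alpha}^{\top} - K^{-1} \right) \frac{\partial K}{\partial \theta_j} \right), \qquad \boldsymbol{\alpha} = K^{-1}\mathbf{y}.$$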

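2. Leave-one-out Likelihood and its derivative

The leave-one-out log likelihood sums the log predictive probability of each training output given all the others:

$$L_{\mathrm{LOO}}(X, \mathbf{y}, \theta) = \sum_{i=1}^{n} \log p(y_i \mid X, \mathbf{y}_{-i}, \theta), \qquad \log p(y_i \mid X, \mathbf{y}_{-i}, \theta) = -\tfrac{1}{2}\log \sigma_i^{2} - \frac{(y_i - \mu_i)^{2}}{2\sigma_i^{2}} - \tfrac{1}{2}\log 2\pi ,$$

where the leave-one-out predictive mean and variance are available in closed form from the full inverse kernel matrix:

$$\mu_i = y_i - \frac{[K^{-1}\mathbf{y}]_i}{[K^{-1}]_{ii}}, \qquad \sigma_i^{2} = \frac{1}{[K^{-1}]_{ii}} .$$

Its derivative with respect to $\theta_j$ follows from the chain rule using $\partial K^{-1} / \partial \theta_j = -K^{-1}\,(\partial K / \partial \theta_j)\,K^{-1}$.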

The optimization itself can be done with a gradient-based algorithm or an exhaustive search.

Result - marginal likelihood (exhaustive search)

Result - marginal likelihood (gradient descent)

Code - main (marginal likelihood - exhaustive search)

% Exhaustive (grid) search over the kernel hyperparameter l2, followed by
% GP regression with the selected value and a comparison plot.
%
% Relies on helpers that are not defined in this script: get_Lmarginal,
% get_Lloo, kernel_se, kernel_lev_cos, init_lgpr, lgpr, init_legend,
% add_legend and smart_legend.
% NOTE(review): gp_ref is never assigned here; it is presumably provided by
% the loaded .mat file -- confirm.
load('mats/gp_data_toy.mat');
% load('gp_data_rlz.mat');

% Objective selection: opt_lmarg is tested first, so opt_loo only takes
% effect when opt_lmarg is 0.
opt_lmarg = 1;
opt_loo = 0;

% 1. Training data
% Pick nr_data evenly spaced samples out of the reference data.
nr_data = 20;
train_idx = 1+round(linspace(0, gp_ref.nr_data-1, nr_data))';
input = gp_ref.input(train_idx);
output = gp_ref.output(train_idx);
gp_train = struct('nr_data', nr_data, 'input', input, 'output', output);

% 2. Greedy-wise hyperparameter optimization
% NOTE: while step 2.2 below stays commented out, opt_g2 never changes, so
% every epoch repeats the same l2 search and prints the same result.
max_epoch = 1E1;
opt_g2 = 1;
for epoch = 1:max_epoch
    % 2.1. Optimize l2 (g2 held fixed at opt_g2).
    nr_greedy = 200;
    min_l2 = 1E-2; max_l2 = 1E1;
    l2_list = linspace(min_l2, max_l2, nr_greedy)';
    Ls_l2 = zeros(nr_greedy, 1);
    for l2_idx = 1:length(l2_list)
        hyp = struct('g2', opt_g2, 'l2', l2_list(l2_idx), 'w2', 1E-8);
        if opt_lmarg
            Ls_l2(l2_idx) = get_Lmarginal(gp_train, @kernel_se, hyp);
        elseif opt_loo
            Ls_l2(l2_idx) = get_Lloo(gp_train, @kernel_se, hyp);
        end
    end
    % Keep the grid point with the highest log likelihood.
    [max_ll, opt_l2_idx] = max(Ls_l2);
    opt_l2 = l2_list(opt_l2_idx);

    % 2.2. Optimize g2 (disabled; l2 held fixed at opt_l2).
    % nr_greedy = 200;
    % min_g2 = 1E-2; max_g2 = 1E2;
    % g2_list = linspace(min_g2, max_g2, nr_greedy)';
    % Ls_g2 = zeros(nr_greedy, 1);
    % for g2_idx = 1:length(g2_list)
    %     hyp = struct('g2', g2_list(g2_idx), 'l2', opt_l2, 'w2', 1E-8);
    %     if opt_lmarg
    %         Ls_g2(g2_idx) = get_Lmarginal(gp_train, @kernel_se, hyp);
    %     elseif opt_loo
    %         Ls_g2(g2_idx) = get_Lloo(gp_train, @kernel_se, hyp);
    %     end
    % end
    % [max_ll, opt_g2_idx] = max(Ls_g2);
    % opt_g2 = g2_list(opt_g2_idx);

    % Show the result of this epoch.
    fprintf('[%03d/%03d] l2: %.1e / g2: %.1e / LL: %.1e\n' ...
        , epoch, max_epoch, opt_l2, opt_g2, max_ll);

    % Plot log likelihood (set plot_ll to 1 to enable).
    plot_ll = 0;
    if plot_ll
        figure(1); clf; hold on;
        hl2 = plot(l2_list, Ls_l2, 'r-');
        plot(l2_list(opt_l2_idx), Ls_l2(opt_l2_idx), 'ro');
        %hg2 = plot(g2_list, Ls_g2, 'b-');
        %plot(g2_list(opt_g2_idx), Ls_g2(opt_g2_idx), 'bo');
        title(sprintf('[%d/%d] Log Likelihood: %.1f', epoch, max_epoch, Ls_l2(opt_l2_idx)));
        drawnow;
    end
end

% 3. Do GPR
% NOTE(review): the search above scores @kernel_se with w2 = 1E-8, while the
% regression below uses @kernel_lev_cos with w2 = 1E-2 -- confirm this
% mismatch is intended.
hyp_opt = struct('g2', opt_g2, 'l2', opt_l2, 'w2', 1E-2);
sgpr_opt_struct = init_lgpr(input, output, @kernel_lev_cos, hyp_opt);
sgpr_opt_result = lgpr(sgpr_opt_struct, gp_ref.input);

% 4. Plot
fig2 = figure(2); clf;
scrsz = get(0,'ScreenSize');
set(fig2,'Position', [0.1*scrsz(3),0.2*scrsz(4),0.4*scrsz(3),0.4*scrsz(4)]);
hold on; set(gca,'FontSize', 12);
leg_struct = init_legend();

href = plot(gp_ref.input, gp_ref.output, 'LineWidth', 2);
leg_struct = add_legend(leg_struct, href, 'Reference data');
htrain = plot(gp_train.input, gp_train.output, 'o', 'Color', 'k', 'LineWidth', 2, 'MarkerSize', 15);
leg_struct = add_legend(leg_struct, htrain, 'Training data');

% Overlay the regression mean for nr_temp values of l2 sampled evenly from
% the search grid (g2 fixed at 5E0 for these curves).
nr_temp = 10;
colors = summer(nr_temp);
for i = 1:nr_temp
    idx = 1+floor(length(l2_list)*(i-1)/nr_temp);
    hyp_temp = struct('g2', 5E0, 'l2', l2_list(idx), 'w2', 1E-2);
    sgpr_struct = init_lgpr(input, output, @kernel_lev_cos, hyp_temp);
    sgpr_result = lgpr(sgpr_struct, gp_ref.input);
    htemp = plot(gp_ref.input, sgpr_result.mean, '-', 'Color', colors(i, :), 'LineWidth', 1);
    leg_struct = add_legend(leg_struct, htemp, sprintf('l2: %.2f', l2_list(idx)));
end

% Regression mean for the selected l2.
hopt = plot(gp_ref.input, sgpr_opt_result.mean, '--', 'Color', 'r', 'LineWidth', 3);
leg_struct = add_legend(leg_struct, hopt, sprintf('opt l2: %.2f', opt_l2));
smart_legend(leg_struct.hs(1:leg_struct.nr_data), leg_struct.strs(1:leg_struct.nr_data));
grid on;

Code - main (marginal likelihood - gradient descent)

% Gradient-sign based optimization of a single kernel hyperparameter
% (selected by `param`) against the log marginal likelihood, followed by GP
% regression with the initial and the optimized hyperparameters.
%
% Relies on helpers that are not defined in this script: kernel_se,
% der_kernel_se, der_marginal_likelihood, kernel_lev_cos, init_lgpr, lgpr,
% init_legend, add_legend and smart_legend.
% NOTE(review): gp_ref is never assigned here; it is presumably provided by
% the loaded .mat file -- confirm.
load('mats/gp_data_toy.mat');

% 1. Training data
% Pick nr_data evenly spaced samples out of the reference data.
nr_data = 20;
train_idx = 1+round(linspace(0, gp_ref.nr_data-1, nr_data))';
input = gp_ref.input(train_idx);
output = gp_ref.output(train_idx);
gp_train = struct('nr_data', nr_data, 'input', input, 'output', output);

% 2. Gradient based optimization
l2_init = 100;
hyp_init = struct('g2', 1E+1, 'l2', l2_init, 'w2', 1E-10);
% Kernel matrix on the training inputs and its derivative with respect to
% the hyperparameter named by param_in.
kfun = @(hyp_in)(kernel_se(input, input, hyp_in));
dkfun = @(hyp_in, param_in)(der_kernel_se(input, input, hyp_in, param_in));
param = 'l2';
hyp_opt = hyp_init;

max_iter = 1E4; step_size = 1E-2;
ll_prev = -inf;
ll_list = zeros(max_iter, 1);
ll_diff__prev = 0;
for iter = 1:max_iter
    [der, ll] = der_marginal_likelihood(gp_train, kfun, dkfun, hyp_opt, param);
    val = hyp_opt.(param);
    % Fixed-size step in the direction of the derivative's sign; the
    % derivative's magnitude is ignored.
    hyp_opt.(param) = val + step_size*sign(der);

    % Record before the termination check so the value of the final
    % iteration is kept as well.
    ll_list(iter) = ll;

    % Termination condition: after a warm-up of 100 iterations, stop when
    % two consecutive log-likelihood changes sum to (almost) zero, i.e. the
    % fixed-step update has plateaued or is bouncing back and forth.
    ll_diff = ll-ll_prev;
    ll_prev = ll;
    ll_diff_sum = abs(ll_diff__prev + ll_diff);
    if (ll_diff_sum < 1E-5) && (iter > 1E2)
        break;
    end
    ll_diff__prev = ll_diff;

    fprintf('[%03d/%03d] %s: %.1e / LogLik: %.1e (%.3e / %.3e) \n' ...
        , iter, max_iter, param, val, ll, ll_diff, ll_diff_sum);
end
% Drop the preallocated entries of iterations that never ran.
ll_list = ll_list(1:iter);

% 3. GPR
% NOTE(review): the optimization uses kernel_se while the regression below
% uses @kernel_lev_cos -- confirm this mismatch is intended.
sgpr_init_struct = init_lgpr(input, output, @kernel_lev_cos, hyp_init);
sgpr_init_result = lgpr(sgpr_init_struct, gp_ref.input);
sgpr_opt_struct = init_lgpr(input, output, @kernel_lev_cos, hyp_opt);
sgpr_opt_result = lgpr(sgpr_opt_struct, gp_ref.input);

% 4. Plot
% fig3 = figure(3); clf; scrsz = get(0,'ScreenSize');
% set(fig3,'Position', [0.2*scrsz(3),0.2*scrsz(4),0.3*scrsz(3),0.4*scrsz(4)]);
% plot(ll_list, 'LineWidth', 2); title('Log Likelihood'); grid on;
% xlabel('epoch'); set(gca,'FontSize', 16);

fig4 = figure(4); clf; scrsz = get(0,'ScreenSize');
set(fig4,'Position', [0.4*scrsz(3),0.2*scrsz(4),0.4*scrsz(3),0.4*scrsz(4)]);
hold on; set(gca,'FontSize', 12);
leg_struct = init_legend();

href = plot(gp_ref.input, gp_ref.output, 'Color', 'b', 'LineWidth', 3);
leg_struct = add_legend(leg_struct, href, 'Reference data');
htrain = plot(gp_train.input, gp_train.output, 'o', 'Color', 'k', 'LineWidth', 2, 'MarkerSize', 15);
leg_struct = add_legend(leg_struct, htrain, 'Training data');
% Regression mean before and after optimization.
hinit = plot(gp_ref.input, sgpr_init_result.mean, '-', 'Color', 'g', 'LineWidth', 3);
leg_struct = add_legend(leg_struct, hinit, sprintf('init l2: %.2f', hyp_init.l2));
hopt = plot(gp_ref.input, sgpr_opt_result.mean, '--', 'Color', 'r', 'LineWidth', 3);
leg_struct = add_legend(leg_struct, hopt, sprintf('opt l2: %.2f', hyp_opt.l2));
smart_legend(leg_struct.hs(1:leg_struct.nr_data), leg_struct.strs(1:leg_struct.nr_data));
grid on; set(gca,'FontSize', 16);

Code - get_Lmarginal

function Lmarginal = get_Lmarginal(gp_train, kfun, hyp)
% GET_LMARGINAL Log marginal likelihood of the mean-subtracted training outputs.
%
%   Lmarginal = get_Lmarginal(gp_train, kfun, hyp)
%
%   gp_train : struct with fields 'input', 'output' and 'nr_data'.
%   kfun     : kernel function handle, called as kfun(input, input, hyp) to
%              build the kernel matrix K of the training inputs.
%   hyp      : hyperparameters, passed through to kfun unchanged.
%
%   Returns
%       -0.5*y'*inv(K)*y - 0.5*log(det(K)) - nr_data/2*log(2*pi)
%   where y is the output with its mean removed.

input = gp_train.input;
output = gp_train.output;
nr_data = gp_train.nr_data;

% Remove the empirical mean of the outputs.
mean_output = mean(output);
mz_output = output - repmat(mean_output, nr_data, 1);

K = kfun(input, input, hyp);

% det(K) under/overflows easily (e.g. a smooth kernel with a tiny noise term
% makes det(K) round to 0, so log(det(K)) becomes -Inf). Use the Cholesky
% factor K = L*L' instead: log(det(K)) = 2*sum(log(diag(L))).
[L, not_pd] = chol(K, 'lower');
if not_pd == 0
    alpha = L'\(L\mz_output);
    data_fit = mz_output'*alpha;
    log_det_K = 2*sum(log(diag(L)));
else
    % K is not numerically positive definite; fall back to the direct
    % formula so the result is never worse than the previous behaviour.
    data_fit = mz_output'/K*mz_output;
    log_det_K = log(det(K));
end

Lmarginal = -0.5*data_fit - 0.5*log_det_K - nr_data/2*log(2*pi);

Full code and dataset

gp_model_selecttion.zip

'Enginius > Machine Learning' 카테고리의 다른 글

Research goals in 2015 (0)	2015.04.22
Image filtering + Hybrid image (0)	2015.04.22
Spline Curve Fitting (0)	2014.12.30
[Compressive Sensing] Armijo's rule, trust region method, proof of Taylor theorem, Schur's lemma (4)	2014.12.11
Gaussian process realization + measurements using interp2 (0)	2014.11.30

Mad for Simplicity

Model Selection in Gaussian process regression

'Enginius > Machine Learning' 카테고리의 다른 글

티스토리툴바

Model Selection in Gaussian process regression

'Enginius > Machine Learning' 카테고리의 다른 글

'Enginius/Machine Learning' Related Articles

티스토리툴바