% inference.tex (c) David J C MacKay %
\documentstyle{article}
\oddsidemargin=0in\textwidth=6in\topmargin=-0.80in\textheight=10in
\renewcommand{\textfraction}{0.1}
\pagestyle{empty}\begin{document}\title{}\author{David J.C. MacKay }\date{}
\noindent
\input{psfig.tex}
\input{/home/mackay/tex/newcommands1}
\begin{center}
{\large \bf
Computational Methods in Physics
\\ \medskip
}
\end{center}
%
\newcommand{\bb}{{\bf b}}
\subsection*{Model Fitting and Model Comparison}
In Bayesian statistical inference we use probabilities to
quantify how plausible hypotheses are, given data and other assumptions.
There are two rules
of probability: the product rule, $P(A,B) = P(A|B)P(B)$,
and the sum rule, $P(A) = \sum_B P(A,B)$.
\paragraph{(a)}
\newcommand{\FIGS}{/home/mackay/book/FIGS}
%
% \begin{quotation}
Unstable particles are emitted from a source and are assumed
by model $\H_1$ to decay at a distance $x$ that has an
exponential probability distribution with characteristic
length $\lambda$. Decay events can only be observed if they
occur in a window extending from $x_{\min}=1$\,cm to $x_{\max}=20$\,cm. $N$
decays are observed at locations $\{x_1 \ldots x_N\}$. What is
$\lambda$?
% \end{quotation}
\begin{center}
\mbox{\psfig{figure=\FIGS/decay.ps,width=3in,angle=90,%
bbllx=154mm,bblly=147mm,bbury=257mm,bburx=175mm}}\\
\end{center}
%\subsubsection*{Part b}
\paragraph{(b)}
An alternative model, $\H_2$,
states that the sample contains two equal strength sources with
different decay lengths $\lambda_a$ and $\lambda_b$. Assuming $\H_2$ to be true,
% and given one of the data sets below,
what are $\lambda_a$ and $\lambda_b$, given a data set $\{x_1
\ldots x_N\}$?
\paragraph{(c)}
%\subsubsection*{Part c}
How probable is model $\H_1$
% (which states that there is only a single source with decay length $\l$)
relative to model $\H_2$, given the data?
% as a model for data set 1; and for data set 2?
\medskip
\noindent
Write a program which, given a data file, answers the above three questions.
% The tasks that follow involve the numerical evaluation
% and maximization of functions
% of several variables, and numerical integrations.
\subsubsection*{Model fitting --- how to infer a model's parameters
(parts a and b)}
% One might think of many ways of trying
% to answer this question, such as binning the data and
% fitting an exponential curve to the resulting histogram. But there
% is a unique optimal procedure for answering such questions, which we
% will now review.
%
% Fit model $\H_1$
Consider the first model.
If we knew $\lambda$ then we could write down the probability
distribution of one decay location $x$, given $\lambda$, thus:
\beq
P(x|\lambda,\H_1) =\left\{ \begin{array}{ll}
{\textstyle \frac{1}{\l}}
e^{-x/\lambda } / Z(\lambda) & 1 < x < 20 \\
0 & {\rm otherwise }
\end{array} \right.
\label{basic.likelihood}
\eeq
where
\beq
Z(\l) = \int_1^{20} dx \: \frac{1}{\l} e^{-x/\lambda } = \left(e^{-1/\l} - e^{-20 /\l} \right).
\label{basic.likelihood.Z}
\eeq
The probability distribution of $N$ data points $\{x_1 \ldots x_N\}$,
given $\l$, is a separable distribution:
\beq
P(\{x\}|\lambda,\H_1) = \prod_{n=1}^N P(x_n|\lambda,\H_1)
\label{likelihood}
\eeq
Now, given a set of data $\{x_1 \ldots x_N\}$, {\bf Bayes' theorem}
[$P(A|B) = P(B|A)P(A)/P(B)$] tells us how to turn this probability
distribution round to obtain the probability distribution of $\lambda$
given the data:
\beqan
\label{bayes.theorem}
% \begin{array}{l}
&& P(\l|\{ x \},\H_1) =
\frac{P(\{x\}|\lambda,\H_1) P(\l|\H_1)}{P(\{x\}|\H_1) } \\
&& \hspace{0.5in} \propto \frac{1}{\left( \l Z(\l) \right)^N}
\exp \left( \textstyle - \sum_{n=1}^{N} x_n / \l \right) P(\l|\H_1)
.
% \end{array}
\label{basic.posterior}
\eeqan
Bayes' theorem combines our initial knowledge
about the alternative hypotheses $\l$, represented by the {\em prior\/}
$P(\l|\H_1)$, with what the data tells us about $\l$, as described by the
{\em likelihood function\/} $P(\{x\}|\lambda,\H_1)$.
The {\em posterior probability\/} $P(\l|\{ x \},\H_1)$ is proportional
to the product of these two functions. The normalizing constant,
$P( \{x\}| \H_1)$ --- which is independent of $\lambda$ and so
is unimportant when we are performing parameter fitting ---
is given (from the sum rule) by
\beq
P( \{x\}| \H_1) = \int P(\{x\}|\lambda,\H_1) P(\l|\H_1) d \lambda\:.
\eeq
Use $l \equiv \log \lambda$ as the parameter (as a general rule,
where a parameter can only
take positive values, it is natural to work with its logarithm).
Let the prior probability distribution for each unknown decay length
be uniform over $\log \lambda$
from $\l_{\min}=0.3$ to $\l_{\max}=30$, that is,
\beq
P(\log \l | \H_1 )
=\left\{ \begin{array}{cc} 1/\log(\l_{\max}/\l_{\min}) &
% {\rm const.} &
%1/(\l_{\max}-\l_{\min})$
\l\in[\l_{\min},\l_{\max}] \\
0 & {\rm otherwise} \end{array} \right. .
\eeq
% To solve part a
% you should simply
For the first model, evaluate the product
$P(\{x\}|\lambda,\H_1) P(\log \l|\H_1)$
(the unnormalized posterior probability)
as a function of $\log \l$.
In the case of the second model, work out the likelihood function
$P(\{x\} | \lambda_a,\lambda_b,\H_2)$, assuming that the sources have
equal strength, \ie, half of the emitted particles come from each (of
course this doesn't mean that half of the {\em observed\/} particles
come from each---some decay events may happen outside the window);
check your likelihood function with a demonstrator then plot the
unnormalized posterior probability as a function of $\log \lambda_a$ and
$\log \lambda_b$. In both cases it may be wise to work with the logarithm
of the probability rather than the probability itself.
\subsubsection*{Model comparison --- part c}
To compare the models we must compute the normalizing constants,
\beq
P(\{x\}|\H_1) = \int P(\{x\}|\lambda,\H_1) P(\log \l|\H_1) d (\log \l)
\eeq
and
\beq
P(\{x\}|\H_2) = \int \! \int P(\{x\}|\lambda_a,\lambda_b,\H_2) P( \log \lambda_a, \log \lambda_b|\H_2)
\, d (\log \lambda_a) \, d (\log \lambda_b)
.
\eeq
These quantities are called
the evidence for models 1 and 2, respectively.
By Bayes' theorem, the posterior
probabilities of models 1 and 2 are given by:
\beq
P( \H_1 | \{x\} ) = \frac{ P(\{x\}|\H_1) P( \H_1 ) }
{ P (\{x\}) }
\eeq
and
\beq
P( \H_2 | \{x\} ) = \frac{ P(\{x\}|\H_2) P( \H_2 ) }
{ P (\{x\}) } .
\eeq
The prior probabilities of the models will be taken to be equal,
$P( \H_1 ) = P( \H_2 ) = 1/2$. The normalizing constant is
$P (\{x\}) = P(\{x\}|\H_1) P( \H_1 ) + P(\{x\}|\H_2) P( \H_2 )$.
Evaluate the evidence for each model by doing the integrals over $\l$
(\ie, sum the numbers you evaluated in parts a and b).
Your program should also make a note of the maximum value
% (or maxima)
of the log likelihood for each model.
% Does the model with more parameters
% achieve a greater likelihood than the simpler model?
Apply your program to the first $N=1, 2, 5, \ldots, 50, 100$ samples
from the two data sets in \verb+~djcm1/data/decay+.
\medskip
\hfill DJCM \today
\end{document}