\documentclass[10pt]{article}
\usepackage{amssymb,amsmath}
\usepackage{graphicx}
\usepackage{color}


\addtolength{\textheight}{1.5in}
\addtolength{\topmargin}{-0.8in}
\addtolength{\oddsidemargin}{-0.5in}
\addtolength{\evensidemargin}{-0.5in}
\addtolength{\textwidth}{1.in}

\newcommand{\comment}[1]{}
\newcommand{\mmp}[1]{\textcolor{blue}{MMP: #1}}
\newcommand{\beq}{\begin{equation}}
\newcommand{\eeq}{\end{equation}}
\newcommand{\beqa}{\begin{eqnarray}}
\newcommand{\eeqa}{\end{eqnarray}}
\newcommand{\bit}{\begin{itemize}\setlength{\itemsep}{0cm}\setlength{\topsep}{0cm}}
\newcommand{\eit}{\end{itemize}}
\newcommand{\benum}{\begin{enumerate}\setlength{\itemsep}{0cm}\setlength{\parsep}{0cm}}
\newcommand{\eenum}{\end{enumerate}}
\newcommand{\bprogram}{\begin{tabbing}mm\=mm\=mm\=mm\= \kill\\}
\newcommand{\eprogram}{\end{tabbing}}

% math mode commands

\newcommand{\bigOO}{{\cal O}}
\newcommand{\bbone}{{\mathbf 1}}
\newcommand {\argmax}[2]{\mbox{\raisebox{-1.7ex}{$\stackrel{\textstyle{\rm #1}}{\scriptstyle #2}$}}\,}
\newcommand{\dataset}{{\cal D}}
\newcommand{\datasett}{\dataset=\{x^1,\ldots x^n\}}
\newcommand{\tilDD}{\tilde{\dataset}}

\newcommand{\rrr}{{\mathbb R}}
\newcommand{\fracpartial}[2]{\frac{\partial #1}{\partial #2}}

\newcommand{\setxx}[1]{\{x\,|\,{#1} \}}
\newcommand{\tilN}{\tilde{N}}
\newcommand{\tilx}{\tilde{x}}
\newcommand{\hatl}{\hat{L}}
\newcommand{\betahat}{\hat{\beta}}
\newcommand{\llogit}{_{\rm logit}}
\newcommand{\supind}[1]{^{(#1)}}
\newcommand{\ssig}{\sigma^2}
\newcommand{\N}{\mathcal{N}}
\newcommand{\tils}{\tilde{\sigma}^2}

\newlength{\picwi}
\begin{document}
\setlength{\parindent}{0em}
\setlength{\parskip}{1.3em}

\begin{large}
\centerline{CS 480/680 Homework 8}  % assignment number here

\centerline{Released April 6, 2026}       % out date here
\centerline{NOT GRADED}
\end{large}

%\centerline{\bf } % title here

\centerline{\large \copyright Marina Meil\u{a} and Haochen Sun}
\centerline{\large mmp@uwaterloo.ca}

\vspace{2em}
\begin{small}
\end{small}

%{\em All plots must be included in the .pdf writeup}

{\bf Problem 1 -- Selecting K}\\
{\bf 1.a} 
We will consider the finite Mixture of Gaussians model defined by 
%
\beq \label{eq:mixture}
f(x)\;=\;\frac{1}{K}\sum_{k=1}^K\N(x;\mu_k,\Sigma)\,, 
\eeq
%
namely a model where all clusters have the same covariance matrix. Calculate the number of parameters in this model, first as a literal expression, then for $d=5$, $K=6$.


{\bf 1.b.} Now we want to use BIC to select $K$. You have run the EM algorithm for $K=1,2,\ldots,12$ for a dataset $\dataset$ of size $n=1000$ and obtained the following log-likelihoods.
\\
\begin{tabular}{c|cccccccccccc}
$K$&  1&2&3&4&5&6&7&8&9&10&11&12\\
  \hline
$l$ & -412.5 & -361.0 & -113.2 & 428.4 & 485.15 & 641.7 & 685.5 & 754.2 & 760.1 & 764.5 & 772.2 & 776.3\\
\end{tabular}
%
Plot the BIC values and select the optimal value $\hat{K}$ according to the BIC,
\beq
      \mathrm{BIC}(\theta_K)\,=\,\sum_{i=1}^n\ln f(x^i;\theta_K)-\frac{\#\theta_K}{2}\ln n,
      \eeq
      %
where $\theta_K$ is the set of all parameters of a mixture defined by \eqref{eq:mixture} with $K$ clusters.
 
{\bf Problem 2 -- PCA}
The following figures represent the Principal Component Analysis of a data set $\dataset$.
\setlength{\picwi}{.44\textwidth}
\begin{tabular}{cc}
  \includegraphics[width=\picwi]{fig-PCA-hw8-xx.png}
  &
  \includegraphics[width=\picwi]{fig-PCA-hw8-sigmas.png}
\\
  \includegraphics[width=\picwi]{fig-PCA-hw8-xx-pca.png}
  &
  \includegraphics[width=\picwi]{fig-PCA-hw8-var-expl.png}
\end{tabular}

{\bf 2.a} What is the dimension $D$ of the data points? 

{\bf 2.b} Does the data live in a lower dimension $D'<D$? This means that by a rotation of the entire data set, one can bring the data into $D'$ dimensions, with the remaining $D-D'$ dimensions being 0. If so, what is $D'$?

{\bf 2.c} Select a $d$ so that the variance explained by $d$-PCA is at least $75\%$ of the total variance. 

{\bf 2.d} Assume that we have a $\dataset'=\{ 3x^i+1,\;x^i\in \dataset \}$. What will be $\tils_1$ for the data $\dataset'$? What $d$ should be chosen to explain at least $75\%$ of the total variance?

{\bf Problem 3 -- Cross-validation}

{\bf 3.a} Return to Problem 2 from Hw1, and run $m$-fold CV with $m=6$. What is the value $\hat{K}$ you obtain?

{\bf 3.b} Can you think of a way to do CV for clustering? Can any of the methods for selecting $K$ for clustering be described as a CV method?

{\bf Problem 4 -- Sparse Regression}

{\bf 4.a} Show that for any $\beta\in \rrr^d$, $\|\beta\|_1\geq \|\beta\|$ where $\|\cdot\|$ represents the Euclidean norm. Assume $\|\beta\|=1$. For what values of $\beta$ do we have equality?

{\bf 4.b} Let $\dataset$ be a data set with $n$ pairs $(x^i,y^i)$, and let $\betahat$ be the unique minimum value of
\beq \label{eq:l1-J}
J(\beta)\;=\;\frac{1}{n}\sum_{i=1}^n(y^i-\beta^Tx^i)^2+ \lambda \|\beta\|_1,\quad \lambda>0
\eeq
%
Assume that the data satisfy $y^i=\beta_*^Tx^i$ for $i=1:n$, where $\beta_*$ is $s$-sparse. What is the sample size needed to ensure that $\hat{\beta}$ is also $s$-sparse and a good approximation to $\beta_*$? (Note that if $s'<s$, any $s'$-sparse vector is also $s$-sparse); answer with $\bigOO$ notation.

{\bf 4.c} Assume $n\geq 2$; is $\betahat=\beta_*$?

{\bf 4.d} Assume now $y^i=\beta_*^Tx^i+\epsilon_i$ for $i=1:n$, and let $\beta_{LS}\neq 0$ be the minimizer of the unregularized least-squares loss $L_{LS}$. Show that $\|\beta_{LS}\|_1>\|\betahat\|_1$ for any $\lambda>0$. 

{\bf 4.e} When $d=1$, calculate the smallest value $\lambda_{\rm max}$ of $\lambda$ for which $\betahat=0$ in \eqref{eq:l1-J}.


\end{document}
