% Measures.tex
\section{Probability measures}
{\tiny [[Ryan: \textbf{The quickest and clearest way to marshal all the requisite facts about equal-slices involves describing it as a uniform mixture of product distributions. However, this has the slight downside of introducing non-discrete probability distributions into an otherwise entirely finitary paper. In particular, an extremely pedantic mathematician might not enjoy the casual treatment I employ in dealing with continuous distributions. However, I think getting hyper-formal would badly obscure things for no real gain.}]]}
\subsection{Comparing product distributions}
We begin by recording some simple facts about distances between probability distributions. Throughout this subsection we assume $\pi$ and $\nu$ are probability distributions on the same finite set $\Omega$.\noteryan{Perhaps change $\Omega$ to $[k]$ throughout.}
\begin{definition} Given a distribution $\pi$ on $\Omega$, write $\min(\pi) = \min_{x \in \Omega} \pi[x]$. \end{definition}
\begin{definition} The \emph{total variation distance} $\dtv{\pi}{\nu}$ is defined by \[ \dtv{\pi}{\nu} = \frac12 \sum_{x \in \Omega} |\pi[x] - \nu[x]| = \max_{A \subseteq \Omega} |\pi[A] - \nu[A]|. \] \end{definition} \begin{fact} \label{fact:tv-mix} Let $(\nu_\kappa)_{\kappa \in K}$ be a family of distributions on $\Omega$, let $\lambda$ be a distribution on $K$, and let $\mu$ be the associated mixture distribution, given by drawing $\kappa \sim \lambda$ and then drawing from $\nu_\kappa$. Then\noteryan{need to prove? It's just the triangle inequality} \[ \dtv{\pi}{\mu} \leq \Ex_{\kappa \sim \lambda}[\dtv{\pi}{\nu_\kappa}]. \] \end{fact} \begin{definition} The \emph{$\chi^2$ distance} $\dchi{\pi}{\nu}$ is defined by\noteryan{I assume the reader is savvy enough to realize we mean to take the positive square root\dots} \[ \dchi{\pi}{\nu}^2 = \Varx_{x \sim \pi}\left[\frac{\nu[x]}{\pi[x]}\right]. \] Note that $\dchi{\pi}{\nu}$ is \emph{not} symmetric in $\pi$ and $\nu$. \end{definition}
The $\chi^2$ distance is introduced to help us prove the following fact: \begin{proposition} \label{prop:mix-distance} Assume $\min(\pi) \neq 0$; i.e., $\pi$ has full support.\noteryan{A bit of a drag to bring this up, but technically, it's needed for the first deduction in the proof.} Suppose $\pi$ is slightly mixed with $\nu$, forming $\hat{\pi}$; specifically, $\hat{\pi} = (1-\eps) \pi + \eps \nu$. Then the associated product distributions $\pi^{\otimes n}$, $\hat{\pi}^{\otimes n}$ on $\Omega^{n}$ satisfy \[ \dtv{\pi^{\otimes n}}{\hat{\pi}^{\otimes n}} \leq \dchi{\pi}{\nu} \cdot \eps \sqrt{n}. \] \end{proposition} \begin{proof} It is a straightforward consequence of the Cauchy--Schwarz inequality (see, e.g.~\cite[p.\ 101]{Rei89}) that \[ \dtv{\pi^{\otimes n}}{\hat{\pi}^{\otimes n}} \leq \dchi{\pi}{\hat{\pi}} \cdot \sqrt{n}, \] and the identity $\dchi{\pi}{\hat{\pi}} = \eps \dchi{\pi}{\nu}$ follows easily from the definitions. \end{proof} This can be bounded independently of $\nu$, as follows: \begin{corollary} \label{cor:mix-distance} In the setting of Proposition~\ref{prop:mix-distance}, \[ \dtv{\pi^{\otimes n}}{\hat{\pi}^{\otimes n}} \leq \sqrt{{\textstyle \frac{1}{\min(\pi)}} - 1} \cdot \eps \sqrt{n}. \] \end{corollary} \begin{proof} It is easy to check that the distribution $\nu$ maximizing $\dchi{\pi}{\nu}$ is the one putting all its mass on the $x$ minimizing $\pi[x]$. In this case one calculates $\dchi{\pi}{\nu} = \sqrt{\frac{1}{\min(\pi)} - 1}$. \end{proof}
\subsection{Equal-slices}
\begin{definition} We say that $\nu$ is a \emph{random distribution} on $[k]$ if the vector $(\nu[1], \dots, \nu[k])$ is chosen uniformly from the simplex $\{(p_1, \dots, p_k) : \sum p_i = 1, p_i \geq 0\ \forall i\}$. \end{definition}
\begin{remark} It is well known (see, e.g.~\cite[Ch.\ 1]{Rei89}) that a random distribution $\nu$ can equivalently be defined by choosing $k-1$ points in $[0,1]$ uniformly and independently and letting $\nu[i]$ be the length of the $i$th spacing thus formed, $i = 1 \dots k$.\noteryan{The reader may take this to be the definition, if they desire.} \end{remark}
\begin{definition} The \emph{equal-slices} distribution on $[k]^{n}$, denoted $\eqs{k}^{n}$, is defined as follows: First, choose a random distribution $\nu$ on $[k]$; then draw from the product distribution $\nu^{\otimes n}$ on $[k]^{n}$. We sometimes write $\eqs{k}$ in place of $\eqs{k}^n$ when $n$ is clear from context. \end{definition}
The terminology ``equal-slices'' comes from the following:\noteryan{So far we don't actually need the observations that follow} \begin{definition} Given a vector $C \in \N^k$ whose components sum to $n$, the associated \emph{slice} of $[k]^{n}$ is the set \[ \{x \in [k]^{n} : x \text{ has $C_1$ many $1$'s, $C_2$ many $2$'s, \dots, $C_k$ many $k$'s}\}. \] We abuse terminology by also referring to $C$ as the slice. A slice is \emph{nondegenerate} when $C_i \neq 0$ for all $i$. There are $\binom{n+k-1}{k-1}$ different slices, of which $\binom{n-1}{k-1}$ are nondegenerate.\noteryan{Justify with bars and dots?} \end{definition} \begin{proposition} The equal-slices distribution $\eqs{k}^{n}$ is equivalent to first choosing a slice uniformly from the $\binom{n+k-1}{k-1}$ many possibilities, then drawing a string uniformly from that slice. \noteryan{Include proof? If so, it's by throwing $n+k-1$ points into the interval and looking at the result in two ways.} \end{proposition}
The following observation is also crucial: \begin{lemma} Let $x \sim \eqs{k}^n$, and let $a \sim [k-1]$ be uniformly random. Then $\chg{x}{k}{a}$ is distributed as $\eqs{k-1}^n$. \end{lemma} \begin{proof} \noteryan{needs to be filled in} \end{proof}
\subsection{Random restrictions} \begin{definition} A \emph{restriction} on $[k]^n$ is a pair $(J, x_\barJ)$, where $J \subseteq [n]$ and $x_\barJ \in [k]^{\barJ}$, where $\barJ = [n] \setminus J$. It is thought of as a ``partial string'', where the $\barJ$-coordinates are ``fixed'' according to $x_\barJ$ and the $J$-coordinates are still ``free''. Given a fixing $y_J \in [k]^J$, we write $(x_\barJ, y_J)$ for the complete composite string in $[k]^n$.\noteryan{Some may say this is bad notation, but I believe it is very useful notation. We can address this point.} \end{definition}
\begin{definition} \label{def:restriction} Let $0 \leq \eps \leq 1$ and let $(\pi^m)$ be a family of probability distributions, where $\pi^m$ is a distribution on $[k]^m$. We say that $(J, x_\barJ)$ is an \emph{$(\eps,\pi)$-random restriction} if $J$ is chosen by including each coordinate of $[n]$ independently with probability $\eps$, and then $x_\barJ$ is drawn from $[k]^{\barJ}$ according to $\pi^{|\barJ|}$. \end{definition} In the above definition, $\pi^m$ is typically either equal-slices $\eqs{k}^m$ or a product distribution $\nu^{\otimes m}$ based on some distribution $\nu$ on $[k]$.
The following standard large-deviation bound will prove useful: \begin{fact} \label{fact:large-dev} If $(J,x_\barJ)$ is an $(\eps,\pi)$-random restriction, then $(\eps/2)n \leq |J| \leq 2\eps n$ except with probability at most $2\exp(-\eps n/8)$. \end{fact}
\begin{definition} In the setting of Definition~\ref{def:restriction}, suppose that $(\nu^m)$ is another family of distributions. The \emph{$(\eps, \pi, \nu)$-composite distribution} generates an $(\eps,\pi)$-random restriction $(J, x_\barJ)$ and then generates $y_J \in [k]^J$ according to $\nu^{|J|}$. We sometimes think of this distribution as just being a distribution on composite strings $z = (x_\barJ, y_J) \in [k]^n$. \end{definition}
\begin{fact} If $\pi$ and $\nu$ are distributions on $[k]$, thought of also as product distribution families, then the $(\eps, \pi, \nu)$-composite distribution on $[k]^n$ is precisely the product distribution $\hat{\pi}^{\otimes n}$, where $\hat{\pi}$ is the mixture distribution $(1-\eps)\pi + \eps \nu$ on $[k]$. \end{fact}
Because of this, we can use Corollary~\ref{cor:mix-distance} to bound the total variation distance between $\pi^{\otimes n}$ and a composite distribution. For example, we conclude:\noteryan{this proposition lets us pass from uniform to eq-slices at the very beginning} \begin{proposition} \label{prop:unif-composite} Let $\mu_k$ denote the uniform distribution on $[k]$, and let $\nu$ be any distribution on $[k]$. Writing $\mu_k^{(\nu)}$ for the $(\eps,\mu_k,\nu)$-composite distribution on strings in $[k]^n$, we have \[ \dtv{\mu_k^{\otimes n}}{\mu_k^{(\nu)}} \leq \sqrt{k-1} \cdot \eps \sqrt{n}. \] \end{proposition} Since the equal-slices distribution $\eqs{k}^{m}$ on $m$ coordinates is a mixture of product distributions $\nu^{\otimes m}$, we conclude from Proposition~\ref{prop:unif-composite} and Fact~\ref{fact:tv-mix}: \begin{proposition} Let $\mu_k$ denote the uniform distribution on $[k]$. Writing $\mu_k'$ for the $(\eps,\mu_k,\eqs{k})$-composite distribution on strings in $[k]^n$, we have \[ \dtv{\mu_k^{\otimes n}}{\mu_k'} \leq \sqrt{k-1} \cdot \eps \sqrt{n}. \] \end{proposition}
Using Corollary~\ref{cor:mix-distance} more generally, we obtain:\noteryan{this lets us pass from an arbitrary product distribution to equal-slices} \begin{proposition} \label{prop:prod-composite} Let $\nu$ be any distribution on $[k]$. Writing $\nu'$ for the $(\eps,\nu,\eqs{k})$-composite distribution on strings in $[k]^n$, we have \[ \dtv{\nu^{\otimes n}}{\nu'} \leq {\textstyle \sqrt{\frac{1}{\min(\nu)}-1}} \cdot \eps \sqrt{n}. \] \end{proposition} Hence:\noteryan{this proposition lets us do restrictions under equal slices} \begin{proposition} \label{prop:eqs-compos} Let $\eqsc{k}^{(\eps)}$ denote the $(\eps, \eqs{k}, \eqs{k})$-composite distribution on strings in $[k]^n$. Then \[ \dtv{\eqs{k}^n}{\eqsc{k}^{(\eps)}} \leq 2k \eps \sqrt{n}. \] \end{proposition} \begin{proof} Recalling again that $\eqs{k}^m$ is the mixture of product distributions $\nu^{\otimes m}$, where $\nu$ is a random distribution on $[k]$, we apply Fact~\ref{fact:tv-mix} and Proposition~\ref{prop:prod-composite} to conclude \[ \dtv{\eqs{k}^n}{\eqsc{k}^{(\eps)}} \leq \Ex_{\nu}\left[{\textstyle \sqrt{\frac{1}{\min(\nu)}-1}}\right] \cdot \eps \sqrt{n}. \] We can upper-bound the expectation by \[ \Ex_{\nu}\left[{\textstyle \sqrt{\frac{1}{\min(\nu)}}}\right] = \int_{0}^\infty \Pr_{\nu}\left[{\textstyle \sqrt{\frac{1}{\min(\nu)}}} \geq t\right]\,dt = \int_{0}^\infty \Pr_{\nu}\left[\min(\nu) \leq \frac{1}{t^2}\right]\,dt \leq k + \int_{k}^\infty \Pr_{\nu}\left[\min(\nu) \leq \frac{1}{t^2}\right]\,dt. \label{eqn:eq-dist-integral} \] \ignore{ \begin{eqnarray} \Ex_{\nu}\left[{\textstyle \sqrt{\frac{1}{\min(\nu)}}}\right] & = & \int_{0}^\infty \Pr_{\nu}\left[{\textstyle \sqrt{\frac{1}{\min(\nu)}}} \geq t\right]\,dt\nonumber\\ & = & \int_{0}^\infty \Pr_{\nu}[\min(\nu) \leq 1/t^2]\,dt\nonumber\\ & \leq & k + \int_{k}^\infty \Pr_{\nu}[\min(\nu) \leq 1/t^2]\,dt. \label{eqn:eq-dist-integral} \end{eqnarray} } Think of $\nu$ as being given by the spacings formed when $k-1$ random points are chosen from $[0,1]$ (the spacings being delimited by these points together with the endpoints $0$ and $1$). 
By a union bound, $\Pr_{\nu}[\min(\nu) \leq 1/t^2]$ is at most $\binom{k}{2}$ times the probability that two random points in $[0,1]$ are within distance $1/t^2$. This is clearly at most $2/t^2$. Hence the integral above is bounded by \[ \int_{k}^\infty \binom{k}{2} \cdot \frac{2}{t^2}\,dt \leq \int_{k}^\infty \frac{k^2}{t^2}\,dt = k, \] completing the proof. \end{proof}