%\documentstyle[10pt,twoside]{article}
%\documentstyle[twoside]{article}
\documentclass[twoside]{article}
% Page geometry: horizontal dimensions first, then vertical ones.
\setlength{\textwidth}{6.7in}
\setlength{\oddsidemargin}{0in}
\setlength{\evensidemargin}{0in}
\setlength{\textheight}{8.5in}
\setlength{\topmargin}{-0.6in}
\setlength{\headsep}{0.75in}
% Paragraphs: no first-line indent; vertical space separates them instead.
\setlength{\parindent}{0in}
\setlength{\parskip}{0.1in}
% Packages: AMS math and symbols, list/conditional helpers, TikZ,
% float handling, and algorithm typesetting.
% (amsmath was listed twice in the original list; once is enough.)
\usepackage{amsmath,amssymb,enumerate,ifthen,tikz,floatrow,algorithm,algorithmic}

\usetikzlibrary{positioning}% To get more advanced positioning options
\usetikzlibrary{arrows}% To get more arrow heads
% Lecture-number counter (lecnum). It is set by \lecture, and the
% printed forms of the page, section, equation, figure and table
% counters below are all prefixed with it.
%
\newcounter{lecnum}
% Sections, equations, figures and tables print as <lecture>.<number>.
\renewcommand\thesection{\thelecnum.\arabic{section}}
\renewcommand\theequation{\thelecnum.\arabic{equation}}
\renewcommand\thefigure{\thelecnum.\arabic{figure}}
\renewcommand\thetable{\thelecnum.\arabic{table}}
% Pages print as <lecture>-<page>.
\renewcommand\thepage{\thelecnum-\arabic{page}}

% Theorem-like environments. None of them shares a counter with another,
% so each kind is numbered independently (declaration order is irrelevant).
\newtheorem{claim}{Claim}
\newtheorem{conjecture}{Conjecture}
\newtheorem{corollary}{Corollary}
\newtheorem{definition}{Definition}
\newtheorem{example}{Example}
\newtheorem{lemma}{Lemma}
\newtheorem{prob}{Problem}
\newtheorem{proposition}{Proposition}
\newtheorem{question}{Question}
\newtheorem{theorem}{Theorem}
% remarka is the raw theorem-style Remark; the remark environment
% defined further down wraps it.
\newtheorem{remarka}{Remark}

% Complexity-class names (and polylog) typeset as upright math operators;
% \nolimits keeps sub/superscripts beside the name rather than above/below.
% \P already exists in LaTeX (the paragraph sign), so it is overridden
% explicitly with \renewcommand; the other names are new, and \newcommand
% will report an error if one of them ever clashes with a package.
\renewcommand{\P}{\mathop{\mathrm{P}}\nolimits}
\newcommand{\NP}{\mathop{\mathrm{NP}}\nolimits}
\newcommand{\DTIME}{\mathop{\mathrm{DTIME}}\nolimits}
\newcommand{\BPTIME}{\mathop{\mathrm{BPTIME}}\nolimits}
\newcommand{\ZPTIME}{\mathop{\mathrm{ZPTIME}}\nolimits}
\newcommand{\polylog}{\mathop{\mathrm{polylog}}\nolimits}

% remark: a numbered Remark (via remarka) whose body is reset to the
% normal upright roman font.
\newenvironment{remark}%
  {\begin{remarka}\normalfont\rmfamily}%
  {\end{remarka}}
% proof: bold "Proof." lead-in, closed by a 2mm filled square pushed to
% the right margin.
\newenvironment{proof}%
  {{\normalfont\bfseries Proof.}}%
  {\hfill\rule{2mm}{2mm}}
% pproof: like proof, but names the statement being proved (#1) and
% suppresses paragraph indentation.
\newenvironment{pproof}[1]%
  {\noindent\textbf{Proof of #1.}}%
  {\hfill\rule{2mm}{2mm}}
% Notation shortcuts (math mode).
% Calligraphic letters.
\newcommand{\calI}{{\mathcal{I}}}
\newcommand{\calT}{{\mathcal{T}}}
\newcommand{\calP}{{\mathcal{P}}}
% Expectation and variance, set upright.
\newcommand{\E}{{\mathrm{E}}}
\newcommand{\Var}{{\mathrm{Var}}}
% \Pr is left alone: LaTeX already provides it as a math operator.
%\newcommand{\Pr}{{\rm Pr}}
% Optimum value, in small capitals.
\newcommand{\opt}{\mbox{\normalfont\scshape opt}}
\newcommand{\OPT}{\mbox{\normalfont\scshape OPT}}
% Number sets in blackboard bold.
\newcommand{\QQ}{\mathbb{Q}}
\newcommand{\RR}{\mathbb{R}}
\newcommand{\ZZ}{\mathbb{Z}}


%
% \lecture generates the framed header at the top of the notes.
%
% Usage: \lecture{NUMBER}{DATE}{TITLE}{LECTURER}{SCRIBE}
%   #1  lecture number; stored in the lecnum counter and printed in the
%       header and in the running heads
%   #2  printed in parentheses after the lecture number (the date)
%   #3  lecture title; printed in the header and in the running heads
%   #4  printed after "Lecturer:"
%   #5  printed after "Scribe:"
%
% Besides drawing the box, the macro selects the myheadings page style
% (plain on the current page), issues \newpage, and resets the page
% counter to 1.
%
\newcommand{\lecture}[5]{
   \pagestyle{myheadings}
   \thispagestyle{plain}
   \newpage
   \setcounter{lecnum}{#1}
   \setcounter{page}{1}
   \noindent
   \begin{center}
   \framebox{
      \vbox{\vspace{2mm}
    \hbox to 6.28in { {\bf CMPUT 675: Algorithms for Streaming and Big Data
                        \hfill Fall 2019} }
       \vspace{4mm}
       \hbox to 6.28in { {\Large \hfill Lecture #1 (#2): #3 \hfill} }
       \vspace{2mm}
       \hbox to 6.28in { {\it Lecturer: #4 \hfill Scribe: #5} }
      \vspace{2mm}}
   }
   \end{center}
   % Same running head on even and odd pages.
   \markboth{Lecture #1: #3}{Lecture #1: #3}
   \vspace*{4mm}
}

%
% Convention for citations is authors' initials followed by the year.
% For example, to cite a paper by Leighton and Maggs you would type
% \cite{LM89}, and to cite a paper by Strassen you would type \cite{S69}.
% (To avoid bibliography problems, for now we redefine the \cite command.)
% The redefinition takes one mandatory argument and prints it literally
% in square brackets: \cite{LM89} prints [LM89]. No bibliography lookup
% is performed, so the key must match the label typed by hand in the
% References list.
%
\renewcommand{\cite}[1]{[#1]}

\input{epsf}

%Use this command for a figure; it puts a figure in wherever you want it.
%usage: \fig{NUMBER}{FIGURE-SIZE}{CAPTION}{FILENAME}
%  #1 NUMBER       printed after the lecture number in the caption line
%                  ("Figure <lecnum>.<NUMBER>"); it is typed by hand, the
%                  figure counter is not used
%  #2 FIGURE-SIZE  assigned to \epsfxsize before the file is included
%  #3 CAPTION      caption text, printed centered below the figure
%  #4 FILENAME     file handed to \epsfbox (from the epsf macros loaded above)
%The figure is not placed in a float environment: it is typeset at the
%point where \fig is called.
\newcommand{\fig}[4]{
			\vspace{0.2 in}
			\setlength{\epsfxsize}{#2}
			\centerline{\epsfbox{#4}}
			\begin{center}
			Figure \thelecnum.#1:~#3
			\end{center}
	}

% Use these for theorems, lemmas, proofs, etc.

% Some useful equation alignment commands, borrowed from TeX
% All three take a single argument: alignment rows whose cells are
% separated by & and whose rows end with \cr.
% They use internal names (\m@th, \displ@y, \@centering, \z@skip) that are
% not defined in this file, hence the \makeatletter ... \makeatother pair.
\makeatletter
% \eqalign: two-column alignment (first column flush right, second flush
% left, both in display style) wrapped in a \vcenter box.
\def\eqalign#1{\,\vcenter{\openup\jot\m@th
  \ialign{\strut\hfil$\displaystyle{##}$&$\displaystyle{{}##}$\hfil
      \crcr#1\crcr}}\,}
% \eqalignno: the same two columns spread over \displaywidth, plus a third
% column that is \llap-ed (it sticks out to the left of the right edge).
\def\eqalignno#1{\displ@y \tabskip\@centering
  \halign to\displaywidth{\hfil$\displaystyle{##}$\tabskip\z@skip
    &$\displaystyle{{}##}$\hfil\tabskip\@centering
    &\llap{$##$}\tabskip\z@skip\crcr
    #1\crcr}}
% \leqalignno: as \eqalignno, but the third column is moved back by
% \displaywidth and \rlap-ed, which places it at the left edge.
\def\leqalignno#1{\displ@y \tabskip\@centering
  \halign to\displaywidth{\hfil$\displaystyle{##}$\tabskip\z@skip
    &$\displaystyle{{}##}$\hfil\tabskip\@centering
    &\kern-\displaywidth\rlap{$##$}\tabskip\displaywidth\crcr
    #1\crcr}}
\makeatother

% **** IF YOU WANT TO DEFINE ADDITIONAL MACROS FOR YOURSELF, PUT THEM HERE:

\begin{document}
%FILL IN THE RIGHT INFO.
%\lecture{**LECTURE-NUMBER**}{**DATE**}{**TITLE**}{**LECTURER**}{**SCRIBE**}
\lecture{1}{Sep 4, 2019}{Introduction, background}{Mohammad R. Salavatipour}{Mohammad R. Salavatipour}

% **** YOUR NOTES GO HERE:

% Some general latex examples and examples making use of the
% macros follow.  
%**** IN GENERAL, BE BRIEF. LONG SCRIBE NOTES, NO MATTER HOW WELL WRITTEN,
%**** ARE NEVER READ BY ANYBODY.
%This lecture's notes illustrate some uses of
%various \LaTeX\ macros.  
%Take a look at this and imitate.
%
%\section{Some theorems and stuff} % Don't be this informal in your notes!
%
%We now delve right into the proof.
%
%\begin{lemma}
%This is the first lemma of the lecture.
%\end{lemma}
%
%\begin{proof}
%The proof is by induction on \ldots.
%For fun, we throw in a figure.
%%%%NOTE USAGE !
%\fig{1}{1in}{A Fun Figure}{funfig.eps}
%
%This is the end of the proof, which is marked with a little box.
%\end{proof}
%
%\subsection{A few items of note}
%
%Here is an itemized list:
%\begin{itemize}
%\item this is the first item;
%\item this is the second item.
%\end{itemize}
%
%Here is an enumerated list:
%\begin{enumerate}
%\item this is the first item;
%\item this is the second item.
%\end{enumerate}
%
%Here is an exercise:
%
%{\bf Exercise:}  Show that ${\rm P}\ne{\rm NP}$.
%
%Here is how to define things in the proper mathematical style.
%Let $f_k$ be the $AND-OR$ function, defined by
%
%\[ f_k(x_1, x_2, \ldots, x_{2^k}) = \left\{ \begin{array}{ll}
%
%	x_1 & \mbox{if $k = 0$;} \\
%
%	AND(f_{k-1}(x_1, \ldots, x_{2^{k-1}}),
%	   f_{k-1}(x_{2^{k-1} + 1}, \ldots, x_{2^k}))
%	 & \mbox{if $k$ is even;} \\
%
%	OR(f_{k-1}(x_1, \ldots, x_{2^{k-1}}),
%	   f_{k-1}(x_{2^{k-1} + 1}, \ldots, x_{2^k}))	
%	& \mbox{otherwise.} 
%	\end{array}
%	\right. \]
%
%\begin{theorem}
%This is the first theorem.
%\end{theorem}
%
%\begin{proof}
%This is the proof of the first theorem. We show how to write pseudo-code now.
%%*** USE PSEUDO-CODE ONLY IF IT IS CLEARER THAN AN ENGLISH DESCRIPTION
%
%Consider a comparison between $x$ and~$y$:
%\begin{tabbing}
%\hspace*{.25in} \= \hspace*{.25in} \= \hspace*{.25in} \= \hspace*{.25in} \= \hspace*{.25in} \=\kill
%\>{\bf if} $x$ or $y$ or both are in $S$ {\bf then } \\
%\>\> answer accordingly \\
%\>{\bf else} \\
%\>\>    Make the element with the larger score (say $x$) win the comparison \\
%\>\> {\bf if} $F(x) + F(y) < \frac{n}{t-1}$ {\bf then} \\%
%\>\>\> $F(x) \leftarrow F(x) + F(y)$ \\
%\>\>\> $F(y) \leftarrow 0$ \\
%\>\> {\bf else}  \\
%\>\>\> $S \leftarrow S \cup \{ x \} $ \\
%\>\>\> $r \leftarrow r+1$ \\
%\>\> {\bf endif} \\
%\>{\bf endif} 
%\end{tabbing}
%
%This concludes the proof.
%\end{proof}
%
%
%\section{Next topic}
%
%Here is a citation, just for fun \cite{CW87}.
%

% **** THIS ENDS THE EXAMPLES. DON'T DELETE THE FOLLOWING LINE:
\section{Introduction }
There is huge growth in data gathering and hence more demand for processing the data to extract useful information from the raw
data in various applications.
The goal of this course is to learn some of the tools and techniques to design efficient and fast algorithms that can extract
useful information from raw data. Typically, these algorithms have to run fast (sometimes in sublinear time) and often work with
much less space than is needed to store the data. Below is a short list of topics we try to cover in this course:

\begin{description}

\item{\bf Streaming:} There are situations when the data comes as a stream which is too large to be stored or processed later. 
We have
only one pass (or sometimes a few passes) over the data and have to make decisions as the data comes.

\item{\bf Sketching/sampling:} The goal is to have a compressed form of the data from which we can still answer queries and extract
 useful information.

\item{\bf Dimensionality reduction:} In many applications data comes with very high dimensions (e.g. medical applications, 
spam filtering, etc). Designing algorithms to manage high-dimensional data is difficult. One goal is to reduce the dimension (e.g. via
projection) while preserving (approximately) relevant structure/geometry of the problem.

\item{\bf Property testing:} Checking quickly with sufficiently high probability whether a given object has certain properties
(e.g. if the result of a matrix computation is correct, if a large graph has certain graph properties, if a proof is valid, etc).

\item{\bf Sparse Fourier transform:} There are old algorithms to compute discrete Fourier transform of a sequence of 
length $n$ in time
$O(n\log n)$. This has various applications (in signal processing, multiplication of large integers, etc). Sparse Fourier Transform
is an algorithm to compute the DFT when the output is $k$-sparse in time $O(k\log n)$. 

\item{\bf Approximate counting:} Counting the number of objects with certain properties among a very large set (e.g. number 
of solutions
to an equation, or the number of distinct items in a sequence, etc).

\end{description}


Below we do a quick overview of the basic probability background we use throughout the course.

\section{Background on Probability}
Most of the algorithms we discuss are randomized and heavily rely on basic tools from probability theory for their analysis.
Let $\Omega$ be a discrete probability space. A probability function $\Pr:\Omega\rightarrow[0,1]$ has the property that 
$\sum_{x\in\Omega}\Pr(x)=1$. A subset $A\subset\Omega$ is called an event. We define $\Pr(A)=\sum_{x\in A}\Pr(x)$.
We say two events $A,B\subseteq \Omega$ are independent if $\Pr(A\cap B)=\Pr(A)\cdot \Pr(B)$. A random variable is a function
$X:\Omega\rightarrow \RR$. The expected value of $X$, denoted by $\E[X]$, is defined as $\E[X]=\sum_{i} i\cdot \Pr(X=i)$, where the sum ranges over all values $i$ that $X$ can take.

\begin{lemma}
$\E[X+Y]=\E[X]+\E[Y]$ and for any constant $c$: $\E[cX]=c\E[X]$.
\end{lemma}

Two random variables $X,Y$ are independent if $\forall x,y\in\RR: \Pr[X=x \wedge Y=y]=\Pr[X=x]\cdot \Pr[Y=y]$.

\begin{lemma}
If $X,Y$ are independent then $\E[XY]=\E[X]\E[Y]$.
\end{lemma}

Variance of a random variable $X$ is defined as $\Var[X]=\E[(X-\E[X])^2]=\sigma^2_X$ and $\sigma_X$ is called standard deviation.

\begin{theorem}[Markov's inequality]
Let $X$ be a non-negative random variable. Then for all $a>0$: $\Pr[X\geq a]\leq\frac{\E[X]}{a}$.
Alternatively $\Pr[X\geq a\E[X]]\leq\frac{1}{a}$.
\end{theorem}

Using Markov's inequality to bound deviation from the mean is called first moment method.

\begin{theorem}[Chebyshev's inequality]
Let $X$ be a random variable and $t>0$. Then $\Pr[|X-\E[X]|>t]\leq\frac{\Var[X]}{t^2}$.
Alternatively $\Pr[|X-\E[X]|>t\sigma_X]\leq\frac{1}{t^2}$.
\end{theorem}

As an example, consider a random walk on the integers that starts at the origin (zero) and at every step,
independently of the previous steps, moves one step to the left or to the right, each with probability $\frac{1}{2}$.
After $n$ steps, how far from 0 have we traveled?
Let 
\[
X_i=\left\{
\begin{array}{ll}
1 & \mbox{moved right at step $i$}\\
-1 & \mbox{moved left at step $i$}\\
\end{array}
\right.
\]
Let $Y_n$ be the position at step $n$. Then $Y_n=\sum_{i=1}^n X_i$, $\E[Y_n]=0$ and $\Var[Y_n]=n$.
Therefore, using Chebyshev's inequality $\Pr[|Y_n|\geq t\sqrt{n}]\leq\frac{1}{t^2}$.

{\bf Chernoff-Hoeffding:} Chernoff bound is a very powerful bound giving exponentially decreasing bound on the tails
of distributions. It can be applied to bound deviation from the mean for ``independent'' random variables. It can be derived
using Markov's inequality.

\begin{theorem}[Chernoff]
Let $X_1,\ldots,X_n$ be independent 0/1 random variables (Poisson trials) where $\Pr[X_i=1]=p_i$ and let $X=\sum_i X_i$ and $\mu=\E[X]$.
Then 
\begin{itemize}
\item For any $\delta>0$: $\Pr[X\geq(1+\delta)\mu] < \left(\frac{e^\delta}{(1+\delta)^{(1+\delta)}}\right)^\mu$.
\item For any $0<\delta\leq 1$: $\Pr[X\geq(1+\delta)\mu]\leq e^{-\mu\delta^2/3}$.
\item For $R\geq 6\mu$: $\Pr[X\geq R]\leq 2^{-R}$.
\end{itemize}
\end{theorem}

Although Chernoff bound gives very powerful bounds it has limited applications to independent variables. There are stronger tools
that can be applied to show concentration for settings with limited dependencies.

{\bf Azuma's inequality} Let $X$ be a random variable determined by $n$ trials $X_1,\ldots,X_n$ such that for all $i$ and any
two possible sequences of outcomes $x_1,\ldots,x_i$ and $x_1,\ldots,x_{i-1},x'_i$:

$$|\E[X|X_1=x_1,\ldots,X_i=x_i]-\E[X|X_1=x_1,\ldots,X_{i-1}=x_{i-1},X_i=x'_i]|\leq c_i$$
for constants $c_i$. Then $\Pr[|X-\E[X]|>t]\leq 2e^{-t^2/(2\sum_i c^2_i)}$.

\section{Approximate counting of events}
We start with a simple problem of (approximately) counting the number of events. Suppose we want to design an algorithm
that monitors a long sequence of events and the goal is to have an approximate number of the events at any given time.
Clearly if we have had $n$ events we can keep a counter using $O(\log n)$ bits. It's not difficult to show that any deterministic
exact algorithm needs this much space.

We can however keep an approximate count using much less space, as little as $O(\log\log n)$ bits.
To be more precise, suppose we want to have an estimate $\tilde{n}$ for $n$ such that $\tilde{n}=(1\pm\epsilon)n$ and
$\Pr[|\tilde{n}-n|>\epsilon n]<\delta$ for a given $\delta$. We call this an $(\epsilon,\delta)$-estimator. Here we describe Morris
algorithm, that keeps a counter for $\log n$ instead of $n$; so only $O(\log\log n)$ bits of space are required.

\fbox{\parbox{\textwidth}{
%\begin{algorithm}
{\bf Morris approximate counting}\\
\begin{enumerate}
\item $X\leftarrow 0$
\item For each new event increment $X$ with probability $\frac{1}{2^X}$
\item return $\tilde{n}=2^X-1$.
\end{enumerate} 
%\end{algorithm}
}}


Let $X_n$ be the random variable representing the value of $X$ after $n$ steps and let $Y_n=2^{X_n}$.

\begin{lemma}
$\E[Y_n]=n+1$
\end{lemma}
\begin{proof}
We use induction on $n$. The base case of $n=0$ is easy. For induction step, assume it is true for $Y_n$. Then:

\begin{eqnarray*}
\E[Y_{n+1}] &=& \sum_{i=0} \Pr[X_n=i]\E[2^{X_{n+1}}|X_n=i]\\
           &=& \sum_{i=0} \Pr[X_n=i]\left(2^i(1-\frac{1}{2^i})+\frac{1}{2^i}\cdot2^{i+1}\right)\\
           &=& \sum_{i=0} \Pr[X_n=i]2^i + \sum_{i=0} \Pr[X_n=i]\\
           &=& \E[Y_n]+1\\
           &=& (n+1)+1.
\end{eqnarray*}
\end{proof}

Thus, the output $\tilde{n}=2^X-1$ is an estimate for $n$ (in expectation). Also, since $\E[Y_n]=n+1$, Jensen's inequality (applied to the concave function $\log_2$) implies that
$\E[X_n]\leq\log_2(n+1)$ and so the expected number of bits used after $n$ steps is $O(\log\log n)$.

\begin{lemma}
$\E[Y^2_n]=\frac{3}{2}n^2+\frac{3}{2}n+1$ and $\Var[Y_n]=\frac{n(n-1)}{2}$.
\end{lemma}
\begin{proof}
Again we use induction on $n$. Base case of $n=0$ is easy to check. For induction step, assuming that the statement is true
for $Y_n$:

\begin{eqnarray*}
\E[Y^2_{n+1}] &=& \sum_{i=0} 2^{2i}\Pr[X_{n+1}=i]\\
              &=& \sum_{i=0} 2^{2i}\left(\Pr[X_n=i](1-\frac{1}{2^i})+\Pr[X_n=i-1]\frac{1}{2^{i-1}}\right)\\
              &=& \sum_{i=0} 2^{2i}\Pr[X_n=i] + \sum_{i=0}(-2^i\Pr[X_n=i]+4\times2^{i-1}\Pr[X_n=i-1])\\
              &=& \E[Y^2_n]+3\E[Y_n]\\
              &=& \frac{3}{2}n^2+\frac{3}{2}n+1 + 3(n+1)\\
              &=& \frac{3}{2}(n+1)^2+\frac{3}{2}(n+1)+1.
\end{eqnarray*}

Also $\Var[Y_n]=\E[Y^2_n]-\E[Y_n]^2=\frac{n(n-1)}{2}$.
\end{proof}

Thus, using this lemma and Chebyshev's inequality:

$$\Pr[|\tilde{n}-n|>\epsilon n] < \frac{1}{(\epsilon n)^2}\cdot \frac{n(n-1)}{2}\simeq\frac{1}{2\epsilon^2}$$
But this is useless for small values of $\epsilon$. So we need to boost the success probability.
We would like, given $\epsilon,\delta>0$, to have a bound of the form $\Pr[|\tilde{n}-n|>\epsilon n]\leq \delta$ using $O(\log\log n)$
bits.

\subsection{Morris+: Using average to boost probability}
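Suppose we run $r$ independent parallel copies of Morris algorithm and find values $\tilde{n}_i$ for $1\leq i\leq r$ and then let
$\tilde{n}=\frac{1}{r}\sum_{i=1}^r \tilde{n}_i$. Since the $\tilde{n}_i$ are independent, each with expectation $n$ and variance $\frac{n(n-1)}{2}$, we have $\E[\tilde{n}]=n$ and $\Var[\tilde{n}]=\frac{n(n-1)}{2r}$, so by Chebyshev's inequality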

$$\Pr[|\tilde{n}-n|>\epsilon n] \leq \frac{1}{2r\epsilon^2} <\delta$$

if we choose $r>\frac{1}{2\epsilon^2\delta}$. The amount of space used will be $O(\log\log n/(\epsilon^2\delta))$.
In particular, if we choose $r>\frac{2}{\epsilon^2}$ then we get $\Pr[|\tilde{n}-n|>\epsilon n]<\frac{1}{4}$.

\subsection{Morris++: Using Median to boost probability}
We can do even better to boost success probability: instead of using average we use the median.
More specifically, run $\ell=c\log\frac{1}{\delta}$ parallel copies of Morris+, for some large constant $c$. Suppose we get
estimators $Z_1,\ldots,Z_\ell$ and let $\tilde{n}$ be the {\em median} of them. Note that by the arguments for Morris+
$\Pr[|Z_i-n|>\epsilon n] <\frac{1}{4}$ for each $i$. Thus, if we define a 0/1 random variable $Y_i$ with $Y_i=1$ if and only if $|Z_i-n|>\epsilon n$,
then the $Y_i$'s are independent, $\Pr[Y_i=1]<\frac{1}{4}$ and $\E[\sum_i Y_i]<\ell/4$. We will have $|\tilde{n}-n|>\epsilon n$
only if at least $\frac{\ell}{2}$ of the $Z_i$'s deviate from $n$ by more than $\epsilon n$, i.e.\ $\sum_i Y_i\geq\frac{\ell}{2}$. Using Chernoff bound:

$$\Pr[|\tilde{n}-n|>\epsilon n] \leq \Pr[|\sum_i Y_i - \E[\sum_i Y_i]|>\frac{\ell}{4}] < (\frac{e}{4})^{\ell/4} < \delta$$

for $\ell=c\log\frac{1}{\delta}$ for large constant $c$.
Also, the space complexity will be $O(\epsilon^{-2}\log\frac{1}{\delta}\log\log(\frac{n}{\epsilon\delta}))$.


% If you need to add references, use the following format:

\section*{References}

\begin{itemize}
\item[Mor78] {\sc R. Morris},
Counting large numbers of events in small registers. {\em Commun. ACM}, 21(10):840--842, 1978.
%Matrix multiplication via arithmetic progressions, % stray title left over from the template's example entry

%
%\item[S69] {\sc V.~Strassen}, Gaussian Elimination Is Not Optimal,
%{\it Numerische Mathematik\/~\bf13}, 1969, pp.~354--356.
%
%\item[P84] {\sc V.~Pan}, {\it How To Multiply Matrices Faster},
%Springer-Verlag, Lecture Notes in Computer Science Vol.~179, 1984.
%
\end{itemize}

\end{document}