\chapter{Basic tools}
In this chapter we introduce some notions of similarity already existing in the literature and then extend them to define similarity among subgraphs in labeled graphs.
In the last sections we present two different techniques that we will later use to estimate such similarity.
\section{Similarity indices}
\begin{definizione}\label{def:jaccard}
Given two sets $A$ and $B$ we define the \textbf{Jaccard} index as the size of the intersection divided by the size of the union between the two sets
\begin{equation}
J(A,B) = \frac{|A \cap B|}{|A \cup B|}
\end{equation}
\end{definizione}
\begin{definizione}\label{def:bray}
Given two sets $A$ and $B$ we define the \textbf{Bray-Curtis} index as
\begin{equation}
BC(A,B) = \frac{2 \times |A \cap B|}{|A| + |B|}
\end{equation}
\end{definizione}
\begin{esempio}
Given $A = \{1, 3, 4, 5, 7, 8\}$ and $B = \{1, 2, 4, 6, 8\}$ we have
\begin{equation*}
J(A,B) = \frac{|\{1, 4, 8\}|}{|\{1, 2, 3, 4, 5, 6, 7, 8\}|} = \frac{3}{8}
\end{equation*}
\begin{equation*}
BC(A,B) = \frac{2 \times |\{1, 4, 8\}|}{|\{1, 3, 4, 5, 7, 8\}| + |\{1, 2, 4, 6, 8\}|} = \frac{6}{11}
\end{equation*}
\end{esempio}
Note that when $A = B$ we have $J(A,B) = BC(A,B) = 1$ and when $A \cap B = \emptyset$ we have $J(A,B) = BC(A,B) = 0$.
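Both indices are straightforward to compute; as a quick illustration (a minimal sketch of ours, not part of the definitions above), here they are over Python sets, using the values of the previous example:
\begin{verbatim}
# Jaccard and Bray-Curtis over plain Python sets.
def jaccard(A, B):
    return len(A & B) / len(A | B)

def bray_curtis(A, B):
    return 2 * len(A & B) / (len(A) + len(B))

A = {1, 3, 4, 5, 7, 8}
B = {1, 2, 4, 6, 8}
print(jaccard(A, B))      # 0.375     (= 3/8)
print(bray_curtis(A, B))  # 0.5454... (= 6/11)
\end{verbatim}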
Using sets may be limiting, as repeated values are counted only once. Thus we extend the two previous definitions to multisets.\medskip
\begin{definizione}
A \textbf{multiset} is a generalization of a set that allows multiple instances of its elements.
\end{definizione}
To avoid confusion, in the following we use square brackets $[\ ]$ to denote multisets and curly brackets $\{\ \}$ to denote sets.\medskip
A multiset can also be seen as an array of frequencies of its objects: for instance, we can denote by $A = (2, 0, 3)$ the multiset with $2$ elements of the first type, $0$ elements of the second type and $3$ elements of the third type; this notation is equivalent to writing $A = [1, 1, 3, 3, 3]$.\medskip
Given two multisets $A = (a_{1}, \ldots, a_{n}) $ and $B = (b_{1}, \ldots, b_{n})$ we define the following operations.
\begin{itemize}
\item intersection $C = A \cap B = (c_{1}, \ldots, c_{n})$ where $c_{i} = \min(a_{i}, b_{i})$
\item union $C = A \cup B = (c_{1}, \ldots, c_{n})$ where $c_{i} = \max(a_{i}, b_{i})$
\item disjoint union $C = A \uplus B = (c_{1}, \ldots, c_{n})$ where $c_{i} = a_{i} + b_{i}$
\end{itemize}
\begin{definizione}\label{def:wjaccard}
Given two multisets $A = (a_{1}, \ldots, a_{n})$ and $B = (b_{1}, \ldots, b_{n})$ we define the \textbf{Frequency Jaccard} index as
\begin{equation}
FJ(A,B) = \frac{\sum\limits_{i=1}^n \min(a_{i}, b_{i}) }{\sum\limits_{i=1}^n \max(a_{i}, b_{i})}
\end{equation}
\end{definizione}
\begin{definizione}\label{def:wbray}
Given two multisets $A = (a_{1}, \ldots, a_{n}) $ and $B = (b_{1}, \ldots, b_{n})$ we define the \textbf{Bray-Curtis} index as
\begin{equation}
BC(A,B) = \frac{ 2 \times \sum\limits_{i=1}^n \min(a_{i}, b_{i}) }{\sum\limits_{i=1}^n (a_{i} + b_{i})}
\end{equation}
\end{definizione}
As a side note, Bray-Curtis is a relevant index for multisets, and is also known as Steinhaus similarity, Pielou's similarity, Sorensen's quantitative index, and Czekanowski's similarity~\cite{legendre1998numerical}.
\begin{esempio}
Given $A = (0, 2, 3, 1, 0, 3) $ and $B = (2, 0, 1, 3, 1, 2)$ we have
\begin{equation}
FJ(A,B) = \frac{0 + 0 + 1 + 1 + 0 + 2}{2 + 2 + 3 + 3 + 1 + 3} = \frac{2}{7}
\end{equation}
\begin{equation}
BC(A,B) = \frac{2 \times (0 + 0 + 1 + 1 + 0 + 2) }{2 + 2 + 4 + 4 + 1 + 5} = \frac{4}{9}
\end{equation}
\end{esempio}
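These multiset operations and both indices map naturally onto Python's \texttt{collections.Counter}, whose operators \texttt{\&}, \texttt{|} and \texttt{+} implement exactly the elementwise $\min$, $\max$ and sum defined above. A minimal sketch of ours, using the frequency vectors of the previous example:
\begin{verbatim}
from collections import Counter

# Frequency vectors of the example (element type -> multiplicity).
A = Counter({1: 0, 2: 2, 3: 3, 4: 1, 5: 0, 6: 3})
B = Counter({1: 2, 2: 0, 3: 1, 4: 3, 5: 1, 6: 2})

def size(m):
    # Number of elements of a multiset, counted with multiplicity.
    return sum(m.values())

fj = size(A & B) / size(A | B)      # 4/14 = 2/7
bc = 2 * size(A & B) / size(A + B)  # 8/18 = 4/9
print(fj, bc)
\end{verbatim}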
Both indices are widely used in practical applications. The Bray-Curtis index gives a larger weight to the intersection; on the other hand, the Jaccard distance $1 - J$ is a metric, so Jaccard may be preferred to Bray-Curtis, whose associated dissimilarity is only a semi-metric (it does not satisfy the triangle inequality).
\begin{definizione}
A function $f : A \times A \rightarrow \mathbb{R}^{+} $ is called a \textbf{metric} (or simply a \textbf{distance}) if it satisfies, for all $x, y, z \in A$, the following conditions.
\begin{itemize}
\item Non-negativity: $f(x,y) \geq 0$ and $f(x,y) = 0$ iff $x = y$
\item Symmetry: $f(x, y) = f(y, x)$
\item Triangle inequality: $f(x, z) \leq f(x, y) + f(y, z)$
\end{itemize}
\end{definizione}
\section{Document similarity}
Document similarity is a hot topic in Information Retrieval, as it can be seen as the problem of duplicate detection~\cite{Broder2000} or, from another point of view, plagiarism detection.\bigskip
To define document similarity we need the notion of \textit{$q$-gram}:
\begin{definizione}
A \textbf{$q$-gram} is a contiguous subsequence of $q$ items from a sequence.
\end{definizione}
In this case the sequence is a document and the items can be words, characters or even syllables. If the items are words, $q$-grams are also called shingles.\medskip
\begin{esempio}
Given the document \textit{"I live and study in Pisa"} all the possible $3$-grams, where items are words, are the following ones:
\textit{"I live and"}, \textit{"live and study"}, \textit{"and study in"} and \textit{"study in Pisa"}.
\end{esempio}
Note that a document with $n$ words has exactly $n-q+1$ possible $q$-grams.\medskip
It is easy to see that the set, or multiset, of all possible $q$-grams of two documents can be used to calculate their similarity through the Jaccard or Bray-Curtis index.\medskip
Considering that the number of $q$-grams in a document is linear in its number of words, document similarity is not a hard problem: we only have to compute unions and intersections of sets, or multisets.\medskip
\begin{esempio}
Given the documents
\begin{equation*}
A = \textit{I live, work and study in Pisa}
\end{equation*}
\begin{equation*}
B = \textit{You work and study in Livorno}
\end{equation*}
The sets of their $2$-grams are:
\begin{equation*}
S_{A} = \{\textit{I live}, \textit{live work}, \textit{\textbf{work and}}, \textit{\textbf{and study}}, \textit{\textbf{study in}}, \textit{in Pisa}\}
\end{equation*}
\begin{equation*}
S_{B} = \{\textit{You work}, \textit{\textbf{work and}}, \textit{\textbf{and study}}, \textit{\textbf{study in}}, \textit{in Livorno}\}
\end{equation*}
The similarities according to Jaccard and Bray-Curtis are\medskip
\begin{equation*}
J(S_{A},S_{B}) = \frac{|S_{A} \cap S_{B} |}{|S_{A} \cup S_{B} |} = \frac{3}{8}
\end{equation*}
\begin{equation*}
BC(S_{A},S_{B}) = \frac{2 \times |S_{A} \cap S_{B} |}{|S_{A}| +|S_{B}|} = \frac{6}{11}
\end{equation*}
\end{esempio}
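The whole pipeline fits in a few lines. The following is a minimal sketch of ours (the function name is hypothetical) that reproduces the values above after stripping punctuation:
\begin{verbatim}
def shingles(text, q=2):
    # Set of word q-grams (shingles) of a document.
    words = text.replace(",", "").split()
    return {tuple(words[i:i + q]) for i in range(len(words) - q + 1)}

SA = shingles("I live, work and study in Pisa")
SB = shingles("You work and study in Livorno")

print(len(SA & SB) / len(SA | SB))             # Jaccard:     3/8
print(2 * len(SA & SB) / (len(SA) + len(SB)))  # Bray-Curtis: 6/11
\end{verbatim}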
\section{Graph similarity}
The definition of similarity between graphs is more complex, as we have to introduce the concept of graph isomorphism.
\begin{definizione}
A \textbf{graph isomorphism} between two graphs $G=(V_{G}, E_{G})$ and $H=(V_{H}, E_{H})$ is a bijection from the vertices of $G$ to the vertices of $H$ that preserves the edge structure, i.e.
\begin{equation}
f : V_{G} \rightarrow V_{H} \text{ s.t. } (v, u) \in E_{G} \iff (f(v), f(u)) \in E_{H}
\end{equation}
\end{definizione}
If such an isomorphism exists, the graphs are called isomorphic and we write $G \simeq H$.\medskip
With the notion of graph isomorphism we can define a similarity between two graphs~\cite{Bunke:1998:GDM:289720.289729}.\medskip
\begin{definizione}
The similarity between two graphs $G=(V_{G}, E_{G})$ and $H=(V_{H}, E_{H})$ is the size of the largest graph isomorphism between a subgraph $G' \subseteq G$ and a subgraph $H' \subseteq H$, measured as the number of vertices of $G'$.
\end{definizione}
Unfortunately, subgraph isomorphism is an NP-complete problem~\cite{GareyJohnson:1979}, so in practice it must be tackled with heuristic or approximation methods.
\begin{esempio}
\begin{figure}[ht]
\centering
\begin{minipage}[t]{.45\textwidth}
\centering
\includegraphics[width=5cm,height=2.92cm]{figure/figure-2-3}
\caption{Graph $G=(V_G, E_G)$}
\label{fig:graph-2-1}
\end{minipage}\hfill
\begin{minipage}[t]{.45\textwidth}
\centering
\includegraphics[width=4cm,height=4cm]{figure/figure-2-4}
\caption{Graph $H=(V_H, E_H)$}
\label{fig:graph-2-2}
\end{minipage}
\end{figure}
The two graphs in figures \ref{fig:graph-2-1} and \ref{fig:graph-2-2} are isomorphic with the function $f : V_{G} \rightarrow V_{H}$ s.t. $f(0) = 3$, $f(1) = 4$, $f(2) = 2$, $f(3) = 0$, $f(4) = 5$ and $f(5) = 1$.
\end{esempio}
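To make the notion concrete, an isomorphism can be found by brute force by trying all $|V|!$ bijections, which also illustrates why the problem is hard. A minimal sketch of ours on a hypothetical pair of $4$-cycles (the graphs of the figures are not reproduced here):
\begin{verbatim}
from itertools import permutations

def isomorphic(VG, EG, VH, EH):
    # Try every bijection V_G -> V_H; feasible only for tiny graphs.
    if len(VG) != len(VH) or len(EG) != len(EH):
        return None
    edges_H = {frozenset(e) for e in EH}
    for perm in permutations(VH):
        f = dict(zip(VG, perm))
        if all(frozenset((f[u], f[v])) in edges_H for (u, v) in EG):
            return f  # an isomorphism, as a dictionary
    return None

G = ([0, 1, 2, 3], [(0, 1), (1, 2), (2, 3), (3, 0)])
H = ([0, 1, 2, 3], [(0, 2), (2, 1), (1, 3), (3, 0)])
print(isomorphic(*G, *H))  # {0: 0, 1: 2, 2: 1, 3: 3}
\end{verbatim}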
\section{Subgraph similarity}
After discussing the existing notions of similarity, we are ready to extend them to define similarity in a labeled complex network.\medskip
Consider a labeled graph $G = (V, E, L)$ over an alphabet $\Sigma$, where $L : V \rightarrow \Sigma$ is the node labeling, so that each node $u \in V$ has a label $L(u) \in \Sigma$\footnote{Alternatively we can label the edges in $E$ instead of the nodes in $V$ without many changes to the following definitions, but for the sake of simplicity we consider the graph labeled on its nodes.}. We are interested in analyzing $G$ using the sequences of labels found along its paths.\medskip
For a fixed integer $q > 0$, consider an arbitrary simple path $\pi = u_{1}, \ldots, u_{q}$. We call the orientation $u_{1} \rightarrow \ldots \rightarrow u_{q}$ of $\pi$ a $q$-path leading to $u_{q}$, and $L(\pi) = L(u_{1}) \cdots L(u_{q}) \in \Sigma^{q}$ its $q$-gram, obtained by concatenating the labels of its nodes.\footnote{Note that in an undirected graph a single simple path yields two possible $q$-paths, one for each orientation: one leading to $u_{q}$ from $u_{1}$ and one leading to $u_{1}$ from $u_{q}$.}\medskip
For a set of nodes $A \subseteq V$, we define $L(A)$ as the corresponding multiset of $q$-grams of all $q$-paths $\pi$ leading to a node $u \in A$:
\begin{equation}
L(A) = [x \in \Sigma^{q} : \exists \text{ $q$-path } \pi \text{ leading to } u \in A \text{ with } L(\pi) = x]
\end{equation}
In this way, for each $q$-path $\pi = u_{1}, \ldots, u_{q}$ leading to $u_{q} \in A$, we have that $u_{i} \in A \cup N^{<q}(A)$ for all $1 \leq i < q$.\medskip
This is a good definition because, as mentioned before, it takes into account both the internal structure of $A$ and its neighborhood $N^{<q}(A)$.
Note that we explicitly exclude all the $q$-paths that begin and end outside $A$, as we do not consider them influential in defining the similarity.\medskip
Given a single $q$-gram $x$ we are interested in its frequency within the multiset $L(A)$ so we define
\begin{equation}
f_{A}[x] = |\{ \pi : \pi \text{ is a $q$-path leading to } u \in A \text{ and } L(\pi) = x \}|
\end{equation}
Note the property that $f_{A}[x] = \sum_{u \in A}{f_{\{u\}}[x]}$.
\begin{definizione}
Given an undirected labeled graph $G = (V,E,L)$ over an alphabet $\Sigma$ and an integer $q > 0$, the Bray-Curtis similarity index between two sets of nodes $A, B \subset V$ is:
\begin{equation}\label{bray-sub}
BC(A,B) = \frac{ 2 \times \sum_{x \in \Sigma^{q}} \min(f_{A}[x], f_{B}[x]) }{ \sum_{x \in \Sigma^{q}} \left( f_{A}[x] + f_{B}[x] \right) }
\end{equation}
\end{definizione}
\begin{definizione}
Given an undirected labeled graph $G = (V,E,L)$ over an alphabet $\Sigma$ and an integer $q > 0$, the Frequency Jaccard similarity index between two sets of nodes $A, B \subset V$ is:
\begin{equation}\label{jaccard-sub}
FJ(A,B) = \frac{ \sum_{x \in \Sigma^{q}} \min(f_{A}[x], f_{B}[x]) }{ \sum_{x \in \Sigma^{q}} f_{A \cup B}[x] }
\end{equation}
\end{definizione}
Let $\mathcal{L} = \{ x \in \Sigma^{q} : x \in L(V) \} \subseteq \Sigma^{q}$ be the set of all distinct $q$-grams found in the $q$-paths of $G$.
Note that ranging $x$ over $\mathcal{L}$, instead of $\Sigma^{q}$, is sufficient in both formulas, for any $A$ and $B$.\medskip
In general we have that $BC(A,B) \geq FJ(A,B)$.
When $A \cap B = \emptyset$ we have that $f_{A \cup B}[x] = f_{A}[x] + f_{B}[x]$, and hence $BC(A,B) = 2 \times FJ(A,B)$.\medskip
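For small graphs both indices can be computed exactly by enumerating all simple $q$-paths. The following brute-force sketch of ours (the helper name and the toy labeled graph are hypothetical) is exponential in general but useful to check the definitions:
\begin{verbatim}
from collections import Counter

def q_grams_into(adj, labels, A, q):
    # Multiset f_A: q-grams of all q-paths leading to a node in A.
    freq = Counter()
    def extend(path):
        if len(path) == q:
            if path[-1] in A:
                freq["".join(labels[u] for u in path)] += 1
            return
        for w in adj[path[-1]]:
            if w not in path:  # keep the path simple
                extend(path + [w])
    for v in adj:
        extend([v])
    return freq

# Hypothetical labeled graph: adjacency lists plus node labels.
adj = {0: [1, 2], 1: [0, 2], 2: [0, 1, 3], 3: [2]}
labels = {0: "a", 1: "b", 2: "c", 3: "a"}
A, B = {0, 1}, {1, 3}
fA, fB = q_grams_into(adj, labels, A, 3), q_grams_into(adj, labels, B, 3)
fAB = q_grams_into(adj, labels, A | B, 3)

inter = sum((fA & fB).values())
BC = 2 * inter / (sum(fA.values()) + sum(fB.values()))
FJ = inter / sum(fAB.values())
print(BC, FJ)
\end{verbatim}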
Now we present a little example to better understand our notions.\medskip
\begin{minipage}{0.60\textwidth}\raggedright
\begin{esempio}
Given the graph in figure \ref{pic:graph-cc}, we want to calculate the similarity between the two sets $S_{1} = \{0,1\}$ and $S_{2} = \{1, 3\}$ according to their multisets of $3$-grams:
\begin{equation*}
L(S_{1}) = [aab, acb, baa, bca, bca, bcb, cab, cba]
\end{equation*}
\begin{equation*}
L(S_{2}) = [baa, baa, bca, bca, caa, cba, cba]
\end{equation*}
So we have that:
\begin{equation*}
FJ(S_{1}, S_{2}) = \frac{4}{11} \text{ and } BC(S_{1}, S_{2}) = \frac{8}{15}
\end{equation*}
\end{esempio}
\end{minipage}
\begin{minipage}{0.30\textwidth}
\includegraphics[width=\linewidth]{figure/figure-2-1}
\label{pic:graph-cc}
\end{minipage}
\section{Sketches}
We have seen that computing the exact similarity between two documents is an easy problem,
as we only have to compute the union and the intersection of the two sets of shingles.\medskip
This task becomes much harder when we have to consider thousands or millions of documents
(e.g. the set of Internet web pages), each of which contains thousands of shingles.\medskip
At this scale it is no longer possible to handle all the shingles of all the documents;
instead we can keep, for each of them, a relatively small, fixed-size \textit{sketch}~\cite{Broder2000}, i.e. a fingerprint of the document.
The computation of the sketches is linear in the size of the documents, and the similarity can then be calculated in time linear in the size of the sketches only.\medskip
In the next chapter, we will apply sketches to the set of $q$-grams of a labeled graph to quickly compute the similarity between two subgraphs.\medskip
Usually sketches are used with explicit documents; we will use them to avoid generating all the $q$-grams.\bigskip
We now present two different existing techniques to compute the sketch of a document.
For simplicity we assume that our document is composed of all the numbers between $1$ and $n$; in practice a ranking function is used to define an ordering of the items in the documents.
\subsection*{Min-wise permutation}
The first approach to compute a sketch of a document is the min-wise permutation~\cite{Broder:1998:MIP:276698.276781}.\bigskip
Given a document $A = \{1, \ldots, n\}$, to calculate its sketch $S_{A}$ of size $k$
we choose $k$ independent random permutations\footnote{We define the minimum of a permutation as the element with the lowest mapped index.} $\pi_{i} : \{1, \ldots, n\} \rightarrow \{1, \ldots, n\}$ and define:
\begin{equation}
S_{A} = \{ \min(\pi_{1}(A)), \ldots, \min(\pi_{k}(A)) \}
\end{equation}
\medskip
Note that with min-wise permutations the same number can appear multiple times in the sketch, as the permutations are chosen independently and uniformly.
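A minimal sketch of ours of the min-wise approach, representing each permutation explicitly as a shuffled list: the minimum of $\pi_{i}(A)$ is then simply the first element of the $i$th list that belongs to the document.
\begin{verbatim}
import random

def minwise_sketch(doc, perms):
    # One minimum per permutation; repeats across permutations can occur.
    return [next(x for x in perm if x in doc) for perm in perms]

n, k = 10, 4
universe = list(range(1, n + 1))
perms = [random.sample(universe, n) for _ in range(k)]  # k permutations
doc = {2, 3, 5, 7}  # a document over {1, ..., n}
print(minwise_sketch(doc, perms))
\end{verbatim}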
\subsection*{Bottom-k sketches}
Another approach, which performs better, is bottom-$k$ sketches~\cite{Cohen:2007:SDU:1281100.1281133}.\medskip
Instead of using $k$ random permutations $\pi_{1}, \ldots, \pi_{k}$ and taking the minimum from each of them,
as in the min-wise approach, we choose only one random permutation $\pi$ and take the bottom $k$ elements of the permutation, where $\min_{x}$ indicates the $x$th smallest element in the permutation.
\begin{equation}
S_{A} = \{ \min{\!}_{1}(\pi(A)), \ldots, \min{\!}_{k}(\pi(A)) \}
\end{equation}
\medskip
Note that, unlike min-wise permutations, a bottom-$k$ sketch contains no repeated numbers, as all the elements are taken from a single permutation.
\begin{esempio}
Consider the document $A=\{1, \ldots, 10\}$ and the following $4$ permutations ($k = 4$)
\begin{equation*}
\pi_{1} = \{3, 5, 8, 2, 4, 9, 1, 10, 7, 6\}
\end{equation*}
\begin{equation*}
\pi_{2} = \{7, 10, 2, 1, 8, 5, 9, 6, 4, 3\}
\end{equation*}
\begin{equation*}
\pi_{3} = \{3, 4, 6, 2, 8, 5, 1, 10, 7, 9\}
\end{equation*}
\begin{equation*}
\pi_{4} = \{9, 1, 3, 5, 4, 10, 7, 8, 2, 6\}
\end{equation*}
With the min-wise permutation approach we have that
\begin{equation*}
S_{A} = \{ \min(\pi_{1}(A)), \min(\pi_{2}(A)), \min(\pi_{3}(A)), \min(\pi_{4}(A)) \} = \{3, 7, 3, 9\}
\end{equation*}
Instead, with the bottom-$k$ sketch using $\pi_{4}$ as the permutation,
\begin{equation*}
S_{A} = \{ \min{\!}_{1}(\pi_{4}(A)), \min{\!}_{2}(\pi_{4}(A)), \min{\!}_{3}(\pi_{4}(A)), \min{\!}_{4}(\pi_{4}(A)) \} = \{9, 1, 3, 5\}
\end{equation*}
\end{esempio}
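In code, with the permutation written as an ordered list as above, a bottom-$k$ sketch is just the first $k$ elements of the list that belong to the document. This minimal sketch of ours reproduces the result for $\pi_{4}$:
\begin{verbatim}
def bottom_k_sketch(doc, perm, k):
    # First k elements of the single permutation that belong to doc.
    out = []
    for x in perm:
        if x in doc:
            out.append(x)
            if len(out) == k:
                break
    return out

A = set(range(1, 11))
pi4 = [9, 1, 3, 5, 4, 10, 7, 8, 2, 6]
print(bottom_k_sketch(A, pi4, 4))  # [9, 1, 3, 5], as in the example
\end{verbatim}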
\section{Color Coding}
Color coding is a method proposed in 1994 by Alon et al.~\cite{Alon:1995:COL:210332.210337} that efficiently finds simple paths, cycles and many other small subgraphs using a probabilistic and parameterized algorithm. We will focus only on finding simple $q$-paths.\medskip
The idea behind this method, which gives it its name, is to randomly color each node of $V$ with one of $q$ possible colors.
We restrict our attention to $q = O(\log |V|)$ and denote with $\chi : V \rightarrow [q]$ the coloring function\footnote{Here $[q] \equiv \{1, \ldots, q\}$ is the set of all possible colors.}, where each node $u \in V$ has a color $\chi(u) \in [q]$.\medskip
After assigning a color to each node, we focus on finding only the colorful $q$-paths. We say that a $q$-path $u_{1}, \ldots, u_{q}$ is colorful iff $\chi(u_{i}) \neq \chi(u_{j})$ for all $1 \leq i < j \leq q$ (i.e. all $q$ colors appear in the $q$-path).\medskip
The main advantage of this method is that it reduces the number of $q$-paths to consider to roughly a fraction $q! / q^{q} \geq 1/e^{q}$ of the total, as a colorful $q$-path can use only $q!$ colorings of its nodes out of the $q^{q}$ possible ones. So the fraction of surviving $q$-paths decreases exponentially as $q$ increases (e.g. when $q = 3$ only ${\sim}22\%$ of the $q$-paths are colorful, and only ${\sim}4\%$ when $q = 5$).\bigskip
As we will show in the next chapter, all the colorful $q$-paths can be easily found using a dynamic programming approach.\medskip
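To anticipate the idea in simplified form (this is a sketch of ours, not the algorithm of the next chapter), the standard dynamic program works over states $(u, S)$, where $S$ is the set of colors already used, encoded as a bitmask; since a colorful path cannot repeat a color, it cannot revisit a node, so no vertex set needs to be stored.
\begin{verbatim}
import random
from collections import defaultdict

def colorful_qpath_counts(adj, q, seed=0):
    # Count colorful q-paths ending at each node.
    rng = random.Random(seed)
    color = {u: rng.randrange(q) for u in adj}  # random coloring chi
    # dp[u][mask] = colorful paths ending at u using the colors in mask
    dp = {u: defaultdict(int) for u in adj}
    for u in adj:
        dp[u][1 << color[u]] = 1                # paths with one node
    for _ in range(q - 1):                      # extend by one hop
        nxt = {u: defaultdict(int) for u in adj}
        for u in adj:
            for mask, cnt in dp[u].items():
                for w in adj[u]:
                    if not mask & (1 << color[w]):
                        nxt[w][mask | (1 << color[w])] += cnt
        dp = nxt
    full = (1 << q) - 1                         # all q colors used
    return {u: dp[u][full] for u in adj}
\end{verbatim}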
An interesting fact is that the color coding method can be derandomized using a $q$-perfect hash family~\cite{Alon:1995:COL:210332.210337}, which enumerates only the $q$-colorings of $V$ that are relevant to the computation (as otherwise there would be $n^{q}$ possible $q$-colorings).
The method remains exponential in the value of $q$: indeed, if we set $q = |V|$, the problem becomes that of finding a Hamiltonian path, which is NP-complete~\cite{GareyJohnson:1979}.\medskip
To improve the success probability we can repeat the random coloring of the nodes $e^{q}\, t$ times,
so that the success probability becomes $\geq 1-e^{-t}$.
Another way is to randomly color the nodes with a larger number of colors $q' > q$ and then, instead of looking for colorful $q'$-paths,
search for the color-diversified $q$-paths (i.e. $q$-paths without color repetitions among the $q'$ colors)~\cite{deshpande2007randomized}.\medskip
In practice, a simple random coloring works quite well on complex networks, so we do not use these optimizations, avoiding their overhead in our algorithms.\medskip
\begin{minipage}{0.55\textwidth}\raggedright
\begin{esempio}
In this $3$-colored graph, out of the $6$ simple $3$-paths starting from node $0$\medskip
($\textsc{0-1-2}$ $\textsc{0-1-3}$ $\textsc{0-2-1}$ $\textsc{0-2-4}$ $\textsc{0-2-6}$ $\textsc{0-5-6}$)\medskip
only $3$ are colorful ($\textsc{0-1-2}$ $\textsc{0-2-1}$ $\textsc{0-2-4}$).
\end{esempio}
\end{minipage}
\begin{minipage}{0.35\textwidth}
\includegraphics[width=\linewidth]{figure/figure-2-6}
\end{minipage}
\noindent
\clearpage