# License: MIT
#
# Implements multi-view spectral clustering algorithm for data with
# multiple views.
import numpy as np
import scipy as sp
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel, polynomial_kernel
from sklearn.neighbors import NearestNeighbors
from ..utils.utils import check_Xs
from .base import BaseCluster
AFFINITY_METRICS = ['rbf', 'nearest_neighbors', 'poly']
[docs]class MultiviewSpectralClustering(BaseCluster):
r'''An implementation of multi-view spectral clustering.
An implementation of multi-view spectral clustering using the
basic co-training framework as described in [#1Clu]_.
Additionally, this can be effective when the dataset naturally
contains features that are of 2 different data types, such as
continuous features and categorical features [#3Clu]_, and then the
original features are separated into two views in this way.
This algorithm can handle 2 or more views of data.
Parameters
----------
n_clusters : int
The number of clusters
random_state : int, optional, default=None
Determines random number generation for k-means.
info_view : int, optional, default=None
The most informative view. Must be between 0 and n_views-1
If given, then the final clustering will be performed on the
designated view alone. Otherwise, the algorithm will concatenate
across all views and cluster on the result.
max_iter : int, optional, default=10
The maximum number of iterations to run the clustering
algorithm.
n_init : int, optional, default=10
The number of random initializations to use for k-means clustering.
affinity : string, optional, default='rbf'
The affinity metric used to construct the affinity matrix. Options
include 'rbf' (radial basis function), 'nearest_neighbors', and
'poly' (polynomial)
gamma : float, optional, default=None
Kernel coefficient for rbf and polynomial kernels. If None then
gamma is computed as 1 / (2 * median(pair_wise_distances(X))^2)
for each data view X.
n_neighbors : int, optional, default=10
Only used if nearest neighbors is selected for affinity. The
number of neighbors to use for the nearest neighbors kernel.
Attributes
----------
labels_ : array-like, shape (n_samples)
Cluster labels for each sample in the fitted data.
embedding_ : array-like, shape (n_samples, n_clusters)
The final spectral representation of the data to be used as input
for the KMeans clustering step.
Notes
-----
Multi-view spectral clustering adapts the spectral clustering algorithm
to applications where more than one view of data is available. This
algorithm relies on the basic assumptions of the co-training, which are:
(a) Sufficiency: each view is sufficient for classification on its own,
(b) Compatibility: the target functions in both views predict the same
labels for co-occurring features with high probability, and (c)
Conditional independence: the views are conditionally independent given
the class labels. In contrast to multi-view k-means clustering,
multi-view spectral clustering performs well on arbitrary shaped clusters,
and can therefore be readily used in applications where clusters are not
expected to be convex. However multi-view spectral clustering tends to be
computationally expensive unless the similarity graph for the data is
sparse.
Multi-view spectral clustering works by using the spectral embedding
from one view to constrain the similarity graph in the other view. By
iteratively applying this procedure, the clustering of the two views
tend to each other. Here we outline the algorithm for the Multi-view
Spectral clustering algorithm for 2 views.
|
*Multi-view Spectral Clustering Algorithm (for 2 views)*
Input: Similarity matrix for both views: :math:`\mathbf{K}_1, \mathbf{K}_2`
Output: Assignments to k clusters
#. Initialize: :math:`\mathbf{L}_v = \mathbf{D}_v^{-1/2}
\mathbf{K}_v\mathbf{D}_v^{-1/2}` for :math:`v = 1, 2`
:math:`\mathbf{U}_v^0` is an :math:`n \times k` matrix with the
top k eigenvectors of :math:`\mathbf{L}_v` for :math:`v = 1, 2`
#. For :math:`i = 1` to iter:
a. :math:`\mathbf{S}_1 = sym(\mathbf{U}_2^{i-1}
{\mathbf{U}_2^{i-1}}^T\mathbf{K}_1)`
b. :math:`\mathbf{S}_2 = sym(\mathbf{U}_1^{i-1}
{\mathbf{U}_1^{i-1}}^T\mathbf{K}_2)`
c. Use :math:`\mathbf{S}_1` and :math:`\mathbf{S}_2` as the new
graph similarities and compute the Laplacians. Solve for the
largest k eigenvectors to obtain :math:`\mathbf{U}_1^i` and
:math:`\mathbf{U}_2^i`.
#. Row-normalize :math:`\mathbf{U}_1^i` and :math:`\mathbf{U}_2^i`.
#. Form matrix :math:`\mathbf{V} = \mathbf{U}_v^i`, where :math:`v` is
believed to be the most informative view a priori. If there is no
prior knowledge on the view informativeness, matrix
:math:`\mathbf{V}` can also be set to the column-wise concatenation
of the two :math:`\mathbf{U}_v^i` s.
#. Assign example j to cluster c if the j-th row of :math:`\mathbf{V}`
is assigned to cluster c by the k-means algorithm.
References
----------
.. [#1Clu] Abhishek Kumar and Hal Daume. "A co-training approach for
multi-view spectral clustering." In Proceedings of the 28th
International Conference on Machine Learning, page 393–400.
Omnipress, 2011.
Examples
--------
>>> from mvlearn.datasets import load_UCImultifeature
>>> from mvlearn.cluster import MultiviewSpectralClustering
>>> from sklearn.metrics import normalized_mutual_info_score as nmi_score
>>> # Get 5-class data
>>> data, labels = load_UCImultifeature(select_labeled = list(range(5)))
>>> mv_data = data[:2] # first 2 views only
>>> mv_spectral = MultiviewSpectralClustering(n_clusters=5,
... random_state=10, n_init=100)
>>> mv_clusters = mv_spectral.fit_predict(mv_data)
>>> nmi = nmi_score(labels, mv_clusters)
>>> print('{0:.3f}'.format(nmi))
0.872
'''
def __init__(self, n_clusters=2, random_state=None,
info_view=None, max_iter=10, n_init=10, affinity='rbf',
gamma=None, n_neighbors=10):
super().__init__()
self.n_clusters = n_clusters
self.random_state = random_state
self.info_view = info_view
self.max_iter = max_iter
self.n_init = n_init
self.affinity = affinity
self.gamma = gamma
self.n_neighbors = n_neighbors
self.labels_ = None
self.embedding_ = None
def _affinity_mat(self, X):
r'''
Computes the affinity matrix based on the selected
kernel type.
Parameters
----------
X : array-like, shape (n_samples, n_features)
The data matrix from which we will compute the
affinity matrix.
Returns
-------
sims : array-like, shape (n_samples, n_samples)
The resulting affinity kernel.
'''
sims = None
# If gamma is None, then compute default gamma value for this view
gamma = self.gamma
if self.gamma is None:
distances = cdist(X, X)
gamma = 1 / (2 * np.median(distances) ** 2)
# Produce the affinity matrix based on the selected kernel type
if (self.affinity == 'rbf'):
sims = rbf_kernel(X, gamma=gamma)
elif(self.affinity == 'nearest_neighbors'):
neighbor = NearestNeighbors(n_neighbors=self.n_neighbors)
neighbor.fit(X)
sims = neighbor.kneighbors_graph(X).toarray()
else:
sims = polynomial_kernel(X, gamma=gamma)
return sims
def _compute_eigs(self, X):
r'''
Computes the top several eigenvectors of the
normalized graph laplacian of a given similarity matrix.
The number of eigenvectors returned is equal to n_clusters.
Parameters
----------
X : array-like, shape(n_samples, n_samples)
The similarity matrix for the data in a single view.
Returns
-------
la_eigs : array-like, shape(n_samples, n_clusters)
The top n_cluster eigenvectors of the normalized graph
laplacian.
'''
# Compute the normalized laplacian
d_mat = np.diag(np.sum(X, axis=1))
# Double check why we take absolute value of d_mat
d_alt = np.sqrt(np.linalg.inv(np.abs(d_mat)))
laplacian = d_alt @ X @ d_alt
# Make the resulting matrix symmetric
laplacian = (laplacian + np.transpose(laplacian)) / 2.0
# Obtain the top n_cluster eigenvectors of the laplacian
u_mat, _, _ = sp.sparse.linalg.svds(laplacian, k=self.n_clusters)
la_eigs = u_mat[:, :self.n_clusters]
return la_eigs
def _param_checks(self, Xs):
r'''
Performs bulk of checks and exception handling for
inputted user parameters.
Parameters
----------
Xs : list of array-likes or numpy.ndarray
- Xs length: n_views
- Xs[i] shape: (n_samples, n_features_i)
This list must be of size n_views, corresponding to the number
of views of data. Each view can have a different number of
features, but they must have the same number of samples.
Returns
-------
Xs : list of array-likes or numpy.ndarray
- Xs length: n_views
- Xs[i] shape: (n_samples, n_features_i)
The data in the appropriate format.
'''
Xs = check_Xs(Xs)
if len(Xs) < 2:
msg = 'Xs must have at least 2 views'
raise ValueError(msg)
self._n_views = len(Xs)
if not (isinstance(self.n_clusters, int) and self.n_clusters > 0):
msg = 'n_clusters must be a positive integer'
raise ValueError(msg)
if self.random_state is not None:
msg = 'random_state must be convertible to 32 bit unsigned integer'
try:
self.random_state = int(self.random_state)
except ValueError:
raise ValueError(msg)
np.random.seed(self.random_state)
if self.info_view is not None:
if not (isinstance(self.info_view, int) and
(self. info_view >= 0 and self.info_view < self._n_views)):
msg = 'info_view must be an integer between 0 and n_clusters-1'
raise ValueError(msg)
if not (isinstance(self.max_iter, int) and (self.max_iter > 0)):
msg = 'max_iter must be a positive integer'
raise ValueError(msg)
if not (isinstance(self.n_init, int) and self.n_init > 0):
msg = 'n_init must be a positive integer'
raise ValueError(msg)
if self.affinity not in AFFINITY_METRICS:
msg = 'affinity must be a valid affinity metric'
raise ValueError(msg)
if self.gamma is not None and not ((isinstance(
self.gamma, float) or isinstance(self.gamma, int))
and self.gamma > 0):
msg = 'gamma must be a positive float'
raise ValueError(msg)
if not (isinstance(self.n_neighbors, int) and self.n_neighbors > 0):
msg = 'n_neighbors must be a positive integer'
raise ValueError(msg)
return Xs
def fit(self, Xs, y=None):
r'''
Performs clustering on the multiple views of data.
Parameters
----------
Xs : list of array-likes or numpy.ndarray
- Xs length: n_views
- Xs[i] shape: (n_samples, n_features_i)
This list must be of size n_views, corresponding to the number
of views of data. Each view can have a different number of
features, but they must have the same number of samples.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
self : returns an instance of self.
'''
# Perform checks on the data and inputted parameters
Xs = self._param_checks(Xs)
# Compute the similarity matrices
sims = [self._affinity_mat(X) for X in Xs]
# Initialize matrices of eigenvectors
U_mats = [self._compute_eigs(sim) for sim in sims]
# Iteratively compute new graph similarities, laplacians,
# and eigenvectors
for iter in range(self.max_iter):
# Compute the sums of the products of the spectral embeddings
# and their transposes
eig_sums = [u_mat @ np.transpose(u_mat) for u_mat in U_mats]
U_sum = np.sum(np.array(eig_sums), axis=0)
new_sims = list()
for view in range(self._n_views):
# Compute new graph similarity representation
mat1 = sims[view] @ (U_sum - eig_sums[view])
mat1 = (mat1 + np.transpose(mat1)) / 2.0
new_sims.append(mat1)
# Recompute eigenvectors
U_mats = [self._compute_eigs(sim)
for sim in new_sims]
# Row normalize
for view in range(self._n_views):
U_norm = np.linalg.norm(U_mats[view], axis=1).reshape((-1, 1))
U_norm[U_norm == 0] = 1
U_mats[view] /= U_norm
# Performing k-means clustering
kmeans = KMeans(n_clusters=self.n_clusters, n_init=self.n_init,
random_state=self.random_state)
if self.info_view is not None:
# Use a single view if one was previously designated
self.embedding_ = U_mats[self.info_view]
self.labels_ = kmeans.fit_predict(self.embedding_)
else:
# Otherwise, perform columwise concatenation across views
# and use result for clustering
self.embedding_ = np.hstack(U_mats)
self.labels_ = kmeans.fit_predict(self.embedding_)