# License: MIT
from scipy.stats import norm
import numpy as np
from scipy.sparse.linalg import svds
def _compute_likelihood(arr):
r"""
Computes the log likelihoods based on normal distribution given
a 1d-array of sorted values. If the input has no variance,
the likelihood will be nan.
References:
-----------
.. [#3EUtils] Code from the https://github.com/neurodata/graspy package,
reproduced and shared with permission.
"""
n_elements = len(arr)
likelihoods = np.zeros(n_elements)
for idx in range(1, n_elements + 1):
# split into two samples
s1 = arr[:idx]
s2 = arr[idx:]
# deal with when input only has 2 elements
if (s1.size == 1) & (s2.size == 1):
likelihoods[idx - 1] = -np.inf
continue
# compute means
mu1 = np.mean(s1)
if s2.size != 0:
mu2 = np.mean(s2)
else:
# Prevent numpy warning for taking mean of empty array
mu2 = -np.inf
# compute pooled variance
variance = (
(np.sum((s1 - mu1) ** 2) + np.sum((s2 - mu2) ** 2))
) / (n_elements - 1 - (idx < n_elements))
std = np.sqrt(variance)
# compute log likelihoods
likelihoods[idx - 1] = np.sum(
norm.logpdf(s1, loc=mu1, scale=std)
) + np.sum(norm.logpdf(s2, loc=mu2, scale=std))
return likelihoods
[docs]def select_dimension(
X,
n_components=None,
n_elbows=2,
threshold=None,
return_likelihoods=False,
):
r"""
Generates profile likelihood from array based on Zhu and Godsie
method [#2EUtils]_. Elbows correspond to the optimal embedding
dimension.
Parameters
----------
X : 1d or 2d array-like
Input array generate profile likelihoods for. If 1d-array, it
should be sorted in decreasing order. If 2d-array, shape should be
(n_samples, n_features).
n_components : int, optional, default: None.
Number of components to embed. If None, ``n_components =
floor(log2(min(n_samples, n_features)))``.
Ignored if X is 1d-array.
n_elbows : int, optional, default: 2.
Number of likelihood elbows to return. Must be > 1.
threshold : float, int, optional, default: None
If given, only consider the singular values that
are > threshold. Must be >= 0.
return_likelihoods : bool, optional, default: False
If True, returns the all likelihoods associated with each elbow.
Returns
-------
elbows : list
Elbows indicate subsequent optimal embedding dimensions. Number of
elbows may be less than n_elbows if there are not enough singular
values.
sing_vals : list
The singular values associated with each elbow.
likelihoods : list of array-like
Array of likelihoods of the corresponding to each elbow. Only
returned if `return_likelihoods` is True.
References
----------
.. [#1EUtils] Code from the https://github.com/neurodata/graspy package,
reproduced and shared with permission.
.. [#2EUtils] Zhu, M. and Ghodsi, A.,
"Automatic dimensionality selection from the scree plot via the
use of profile likelihood. Computational Statistics & Data
Analysis." 51(2):918-930, 2006
"""
# Handle input data
if not isinstance(X, np.ndarray):
msg = "X must be a numpy array, not {}.".format(type(X))
raise ValueError(msg)
if X.ndim > 2:
msg = "X must be a 1d or 2d-array, not {}d array.".format(X.ndim)
raise ValueError(msg)
elif np.min(X.shape) <= 1:
msg = "X must have more than 1 samples or 1 features."
raise ValueError(msg)
# Handle n_elbows
if not isinstance(n_elbows, int):
msg = "n_elbows must be an integer, not {}.".format(type(n_elbows))
raise ValueError(msg)
elif n_elbows < 1:
msg = "number of elbows should be an integer > 1, not {}.".format(
n_elbows
)
raise ValueError(msg)
# Handle threshold
if threshold is not None:
if not isinstance(threshold, (int, float)):
msg = "threshold must be an integer or a float, not \
{}.".format(
type(threshold)
)
raise ValueError(msg)
elif threshold < 0:
msg = "threshold must be >= 0, not {}.".format(threshold)
raise ValueError(msg)
# Handle n_components
if n_components is None:
# per recommendation by Zhu & Godsie
k = int(np.ceil(np.log2(np.min(X.shape))))
elif not isinstance(n_components, int):
msg = "n_components must be an integer, not {}.".format(
type(n_components)
)
raise ValueError(msg)
else:
k = n_components
# Check to see if svd is needed
if X.ndim == 1:
D = np.sort(X)[::-1]
elif X.ndim == 2:
# Singular values in decreasing order
D = svds(
A=X, k=k, return_singular_vectors=False
)
D = np.sort(D)[::-1]
# U, D, V = sklearn.utils.extmath.randomized_svd()
if threshold is not None:
D = D[D > threshold]
if len(D) == 0:
msg = "No values greater than threshold {}."
raise IndexError(msg.format(threshold))
idx = 0
elbows = []
values = []
likelihoods = []
for _ in range(n_elbows):
arr = D[idx:]
lq = _compute_likelihood(arr)
idx += np.argmax(lq) + 1
elbows.append(idx)
values.append(D[idx - 1])
likelihoods.append(lq)
if return_likelihoods:
return elbows, values, likelihoods
else:
return elbows, values