# License: MIT
#
# Implements multi-view co-training regression for 2-view data.
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import random
from ..utils.utils import check_Xs, check_Xs_y_nan_allowed
from .base import BaseCoTrainEstimator
class CTRegressor(BaseCoTrainEstimator):
r"""
This class implements co-training regression for supervised and
semi-supervised learning, following the framework described in [#1CTR]_.
The best use case is when the 2 views of input data are sufficiently
distinct and independent, as detailed in [#1CTR]_. However, it can also
be successful when a single matrix of input data is given as both views
and the two chosen estimators are quite different [#2CTR]_.
In the semi-supervised case, performance can vary greatly, so using a
separate validation set or cross-validation procedure is recommended to
ensure the regression model has fit well, for example as sketched below.
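For instance, a minimal holdout check might look like this (an
illustrative sketch; ``X1_train``, ``X2_train``, ``y_train`` and the
fully labeled validation arrays ``X1_val``, ``X2_val``, ``y_val`` are
hypothetical names, not part of this API)::

    from sklearn.metrics import mean_squared_error

    ctr = CTRegressor().fit([X1_train, X2_train], y_train)
    val_mse = mean_squared_error(y_val, ctr.predict([X1_val, X2_val]))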
Parameters
----------
estimator1: sklearn object, (only supports KNeighborsRegressor)
The regressor object which will be trained on view1 of the data.
estimator2: sklearn object, (only supports KNeighborsRegressor)
The regressor object which will be trained on view2 of the data.
k_neighbors: int, optional (default = 5)
The number of neighbors to be considered for determining the mean
squared error.
unlabeled_pool_size: int, optional (default = 50)
The number of unlabeled samples which will be kept in a
separate pool for regression and selection by the updated
regressor at each training iteration.
num_iter: int, optional (default = 100)
The maximum number of iterations to be performed
random_state: int, optional (default = None)
The seed used by the fit() method and other class operations.
Attributes
----------
estimator1_ : regressor object, used on view1
estimator2_ : regressor object, used on view2
class_name_: string
The name of the class.
k_neighbors_ : int
The number of neighbors to be considered for determining
the mean squared error.
unlabeled_pool_size: int
The number of unlabeled samples which will be kept in a
separate pool for regression and selection by the updated
regressor at each training iteration.
num_iter: int
The maximum number of iterations to be performed
n_views : int
The number of views in the data
Examples
--------
>>> from mvlearn.semi_supervised import CTRegressor
>>> from sklearn.neighbors import KNeighborsRegressor
>>> import numpy as np
>>> # X1 and X2 are the 2 views of the data
>>> X1 = [[0], [1], [2], [3], [4], [5], [6]]
>>> X2 = [[2], [3], [4], [6], [7], [8], [10]]
>>> y = [10, 11, 12, 13, 14, 15, 16]
>>> # Converting some of the labeled values to nan
>>> y_train = [10, np.nan, 12, np.nan, 14, np.nan, 16]
>>> knn1 = KNeighborsRegressor(n_neighbors = 2)
>>> knn2 = KNeighborsRegressor(n_neighbors = 2)
>>> ctr = CTRegressor(knn1, knn2, k_neighbors = 2, random_state = 42)
>>> ctr = ctr.fit([X1, X2], y_train)
>>> pred = ctr.predict([X1, X2])
>>> print("True value\n{}".format(y))
True value
[10, 11, 12, 13, 14, 15, 16]
>>> print("Predicted value\n{}".format(pred))
Predicted value
[10.75 11.25 11.25 13.25 13.25 14.75 15.25]
Notes
-----
Multi-view co-training is most helpful for tasks in semi-supervised
learning where each view offers unique information not seen in the
other. As is shown in the example notebooks for using this algorithm,
multi-view co-training can provide good regression results even
when the number of unlabeled samples far exceeds the number of labeled
samples. This regressor uses 2 sklearn regressors which work individually
on each view but share information, and thus result in improved
performance over fitting the views completely separately.
Both regressors must be ``KNeighborsRegressor``,
as described in [#1CTR]_.
Algorithm: Given
* a set *L1*, *L2* having labeled training
samples of each view respectively
* a set *U* of unlabeled samples
Create a pool *U'* of examples by choosing examples at random
from *U*
* Use *L1* to train a regressor *h1* (``estimator1``) that considers
only the view1 portion of the data (i.e. Xs[0])
* Use *L2* to train a regressor *h2* (``estimator2``) that considers
only the view2 portion of the data (i.e. Xs[1])
Loop for *T* iterations
* for each view *j*
* for each *u* in *U'*
* Calculate the neighbors of *u*
* Predict the value of *u* using *hj* estimator
* create a new estimator *hj'* with same parameters
as that of *hj* and train it on the data (*Lj* union *u*)
* predict the value of neighbors from estimator *hj*
and calculate the mean squared error with respect to
original values
* predict the value of neighbors from the new
estimator *hj'* and calculate the mean squared error
with respect to original values
* calculate the difference between the two errors
* store the error in a list named *deltaj*
* select the index with the maximum positive value from each of
*delta1* and *delta2*
* let the selected indexes be *index1* and *index2*
* Add *index1* to *L2*
* Add *index2* to *L1*
* Remove the selected indexes from *U'* and replenish
it by drawing unlabeled indexes from *U*
* Use *L1* to train the regressor *h1*
* Use *L2* to train the regressor *h2*
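The key quantity in each iteration is the per-candidate error
improvement *deltaj*. A minimal sketch of that computation for a single
view and a single candidate (illustrative only; ``estimator``, ``X``,
``y``, ``L``, ``u`` and ``k`` are hypothetical local names, not part of
this API)::

    from sklearn.metrics import mean_squared_error
    from sklearn.neighbors import KNeighborsRegressor

    # neighbors of u among the labeled points the estimator was fit on
    neigh = estimator.kneighbors(
        [X[u]], n_neighbors=k, return_distance=False)[0]
    # error on those neighbors before pseudo-labeling u
    mse_before = mean_squared_error(
        y[L][neigh], estimator.predict(X[L][neigh]))
    # refit a copy of the estimator with u pseudo-labeled
    new_est = KNeighborsRegressor(**estimator.get_params())
    y_new = y.copy()
    y_new[u] = estimator.predict([X[u]])
    new_est.fit(X[L + [u]], y_new[L + [u]])
    # error on the same neighbors after including u
    mse_after = mean_squared_error(
        y[L][neigh], new_est.predict(X[L][neigh]))
    delta = mse_before - mse_after  # positive means u helps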
References
----------
.. [#1CTR] Zhou, Zhi-Hua and Li, Ming. "Semi-supervised regression with
co-training." In Proceedings of the 19th International Joint
Conference on Artificial Intelligence, pages 908–913, 2005.
.. [#2CTR] Goldman, Sally, and Yan Zhou. "Enhancing supervised
learning with unlabeled data." In Proceedings of the Seventeenth
International Conference on Machine Learning (ICML), 2000.
"""
def __init__(
self,
estimator1=None,
estimator2=None,
k_neighbors=5,
unlabeled_pool_size=50,
num_iter=100,
random_state=None
):
# initialize a BaseCTEstimator object
super().__init__(estimator1, estimator2, random_state)
# If not given, initialize with default KNeighborsRegressor
if estimator1 is None:
estimator1 = KNeighborsRegressor()
if estimator2 is None:
estimator2 = KNeighborsRegressor()
# Initializing the other attributes
self.estimator1_ = estimator1
self.estimator2_ = estimator2
self.k_neighbors_ = k_neighbors
self.unlabeled_pool_size = unlabeled_pool_size
self.num_iter = num_iter
# Used in fit method while selecting a pool of unlabeled samples
random.seed(random_state)
self.n_views = 2
self.class_name_ = "CTRegressor"
# check whether the given parameters are valid
self._check_params()
def _check_params(self):
r"""
Checks that the co-training parameters are valid. Raises an
AttributeError if the estimators are invalid, and a ValueError if any
other parameter is not valid. The checks performed are:
- estimator1 and estimator2 are KNeighborsRegressor
- k_neighbors_ is positive
- unlabeled_pool_size is positive
- num_iter is positive
"""
# Both estimators must be KNeighborsRegressor instances
if not isinstance(self.estimator1_, KNeighborsRegressor) or \
        not isinstance(self.estimator2_, KNeighborsRegressor):
    raise AttributeError(
        "Both estimators need to be KNeighborsRegressor")
if self.k_neighbors_ <= 0:
raise ValueError("k_neighbors must be positive")
if self.unlabeled_pool_size <= 0:
raise ValueError("unlabeled_pool_size must be positive")
if self.num_iter <= 0:
raise ValueError("number of iterations must be positive")
def fit(self, Xs, y):
r"""
Fit the regressor object to the data in Xs, y.
Parameters
----------
Xs : list of array-likes or numpy.ndarray
- Xs length: n_views
- Xs[i] shape: (n_samples, n_features_i)
A list of the different views of data to train on.
y : array, shape (n_samples,)
The target values of the training data. Unlabeled examples
should have label np.nan.
Returns
-------
self : returns an instance of self
"""
# check whether Xs contain NaN and both Xs and y
# are consistent with each other
Xs, y = check_Xs_y_nan_allowed(
Xs, y, multiview=True, enforce_views=self.n_views)
y = np.array(y)
# Xs contains two views
X1 = Xs[0]
X2 = Xs[1]
# Storing the indexes of the unlabeled samples
U = [i for i, label in enumerate(y) if np.isnan(label)]
# Storing the indexes of the labeled samples
L = [i for i, label in enumerate(y) if not np.isnan(label)]
# Make a working copy of the labels for each view
# so that changes do not alter the original labels
y1 = y.copy()
y2 = y.copy()
# contains the indexes of labeled samples
L1 = L.copy()
L2 = L.copy()
# fitting the estimator object on the train data
self.estimator1_.fit(X1[L1], y1[L1])
self.estimator2_.fit(X2[L2], y2[L2])
# variable that keeps track of the number of iterations performed
it = 0
# Randomly select a pool of unlabeled sample indexes
unlabeled_pool = random.sample(
U, min(len(U), self.unlabeled_pool_size))
# Removing the unlabeled samples which were selected earlier
U = [i for i in U if i not in unlabeled_pool]
while it < self.num_iter and unlabeled_pool:
it += 1
# list of k nearest neighbors for all unlabeled samples
neighbors1 = self.estimator1_.kneighbors(
X1[unlabeled_pool],
n_neighbors=self.k_neighbors_,
return_distance=False)
neighbors2 = self.estimator2_.kneighbors(
X2[unlabeled_pool],
n_neighbors=self.k_neighbors_,
return_distance=False)
# Stores the delta value of each view
delta1 = []
delta2 = []
for u, neigh in zip(unlabeled_pool, neighbors1):
# Making a copy of L1 to include the unlabeled index
new_L1 = L1.copy()
new_L1.append(u)
# Predicts the value of unlabeled index
pred = self.estimator1_.predict(np.expand_dims(X1[u], axis=0))
# assigning the predicted value to new y
new_y1 = y1.copy()
new_y1[u] = pred
# predictions for the neighbors before inclusion of the
# unlabeled index
pred_before_inc = self.estimator1_.predict((X1[L1])[neigh])
# new estimator for training a regressor model on new L1
new_estimator = KNeighborsRegressor()
# Setting the same parameters as that of estimator1 object
new_estimator.set_params(**self.estimator1_.get_params())
new_estimator.fit(X1[new_L1], new_y1[new_L1])
# predictions for the neighbors after inclusion of the
# unlabeled index
pred_after_inc = new_estimator.predict((X1[L1])[neigh])
mse_before_inc = mean_squared_error(
(y1[L1])[neigh], pred_before_inc)
mse_after_inc = mean_squared_error(
(y1[L1])[neigh], pred_after_inc)
# appending the calculated value to delta1
delta1.append(mse_before_inc - mse_after_inc)
for u, neigh in zip(unlabeled_pool, neighbors2):
# Making a copy of L2 to include the unlabeled index
new_L2 = L2.copy()
new_L2.append(u)
# Predict the value of the unlabeled index
pred = self.estimator2_.predict(
    np.expand_dims(X2[u], axis=0))
# assigning the predicted value to new y
new_y2 = y2.copy()
new_y2[u] = pred
# prediction array before inclusion of unlabeled index
pred_before_inc = self.estimator2_.predict((X2[L2])[neigh])
# new estimator for training a regressor model on new L2
new_estimator = KNeighborsRegressor()
# Setting the same parameters as that of estimator2 object
new_estimator.set_params(**self.estimator2_.get_params())
new_estimator.fit(X2[new_L2], new_y2[new_L2])
# predictions for the neighbors after inclusion of the
# unlabeled index
pred_after_inc = new_estimator.predict((X2[L2])[neigh])
mse_before_inc = mean_squared_error(
(y2[L2])[neigh], pred_before_inc)
mse_after_inc = mean_squared_error(
(y2[L2])[neigh], pred_after_inc)
# appending the calculated value to delta2
delta2.append(mse_before_inc - mse_after_inc)
delta1_index = np.argsort(delta1)
delta2_index = np.argsort(delta2)
# list containing the indexes to be included
to_include1 = []
to_include2 = []
"""
If the length of both the delta's is equal to 1 then
include the corresponding index whose value is positive and
greater than the other values.
Else selecting the indexes which have postive and maximum
value from each delta's and incase both the indexes are equal
then look at the second best positive value.
The indexes which are selected from delta1
will be added to the labels of the estimator2 object.
Similarly, the indexes which are selected from delta2
will be added to the labels of the estimator1 object.
"""
if delta1_index.shape[0] == 1 and delta2_index.shape[0] == 1:
if delta1[0] > 0 and delta2[0] > 0:
if delta1[0] >= delta2[0]:
L2.append(unlabeled_pool[0])
to_include2.append(0)
else:
L1.append(unlabeled_pool[0])
to_include1.append(0)
elif delta1[0] > 0:
L2.append(unlabeled_pool[0])
to_include2.append(0)
elif delta2[0] > 0:
L1.append(unlabeled_pool[0])
to_include1.append(0)
else:
# Top two indexes from each delta
index1_1, index1_2 = delta1_index[-1], delta1_index[-2]
index2_1, index2_2 = delta2_index[-1], delta2_index[-2]
if index1_1 != index2_1:
if delta1[index1_1] > 0:
L2.append(unlabeled_pool[index1_1])
to_include2.append(index1_1)
if delta2[index2_1] > 0:
L1.append(unlabeled_pool[index2_1])
to_include1.append(index2_1)
else:
if delta1[index1_1] > 0 and delta2[index2_1] > 0:
if delta1[index1_1] >= delta2[index2_1]:
L2.append(unlabeled_pool[index1_1])
to_include2.append(index1_1)
if delta2[index2_2] > 0:
L1.append(unlabeled_pool[index2_2])
to_include1.append(index2_2)
else:
L1.append(unlabeled_pool[index2_1])
to_include1.append(index2_1)
if delta1[index1_2] > 0:
L2.append(unlabeled_pool[index1_2])
to_include2.append(index1_2)
elif delta1[index1_1] > 0:
L2.append(unlabeled_pool[index1_1])
to_include2.append(index1_1)
elif delta2[index2_1] > 0:
L1.append(unlabeled_pool[index2_1])
to_include1.append(index2_1)
# break if both to_include1 and to_include2 are empty
if len(to_include1) == 0 and len(to_include2) == 0:
break
# pseudo-label the indexes added to L1 with estimator2's predictions
for i in to_include1:
pred = self.estimator2_.predict(
np.expand_dims(X2[unlabeled_pool[i]], axis=0))
y1[unlabeled_pool[i]] = pred
# pseudo-label the indexes added to L2 with estimator1's predictions
for i in to_include2:
pred = self.estimator1_.predict(
np.expand_dims(X1[unlabeled_pool[i]], axis=0))
y2[unlabeled_pool[i]] = pred
# to_include1 and to_include2 currently hold positions within
# unlabeled_pool, in the order the samples are stored there.
# Convert them to the sample indexes that unlabeled_pool stores,
# e.g. unlabeled_pool = [10, 15, 17] and to_include = [1, 2]
# becomes to_include = [15, 17]
to_include1 = [unlabeled_pool[i] for i in to_include1]
to_include2 = [unlabeled_pool[i] for i in to_include2]
# removing the selected index
unlabeled_pool = [
u for u in unlabeled_pool
if (u not in to_include1) and (u not in to_include2)]
# replenishing the unlabeled pool
for u in U:
if len(unlabeled_pool) < self.unlabeled_pool_size:
if u not in unlabeled_pool:
unlabeled_pool.append(u)
else:
break
U = [i for i in U if i not in unlabeled_pool]
# fitting the model on new train data
self.estimator1_.fit(X1[L1], y1[L1])
self.estimator2_.fit(X2[L2], y2[L2])
return self
def predict(self, Xs):
r"""
Predict the values of the samples in the two input views.
Parameters
----------
Xs : list of array-likes or numpy.ndarray
- Xs length: n_views
- Xs[i] shape: (n_samples, n_features_i)
A list of the different views of data to predict.
Returns
-------
y_pred : array-like, shape (n_samples,)
The average of the predictions from both estimators.
"""
Xs = check_Xs(Xs, multiview=True, enforce_views=self.n_views)
X1 = Xs[0]
X2 = Xs[1]
# predicting the value of each view
pred1 = self.estimator1_.predict(X1)
pred2 = self.estimator2_.predict(X2)
# Return the average of the two views' predictions
return (pred1 + pred2) * 0.5