# License: MIT
#
# Implements multi-view co-training regression for 2-view data.
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import random
from ..utils.utils import check_Xs, check_Xs_y_nan_allowed
from .base import BaseCoTrainEstimator
class CTRegressor(BaseCoTrainEstimator):
r"""
This class implements co-training regression for supervised and
semi-supervised learning, following the framework described in [#1CTR]_.
The best use case is when the 2 views of input data are sufficiently
distinct and independent, as detailed in [#1CTR]_. However, it can also
be successful when a single matrix of input data is given as both views
and the two chosen estimators are quite different [#2CTR]_.
In the semi-supervised case, performance can vary greatly, so using a
separate validation set or cross-validation procedure is recommended to
ensure the regression model has fit well, for example as sketched below.
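For instance, a minimal holdout check might look like this (an
illustrative sketch; ``X1_train``, ``X2_train``, ``y_train`` and the
fully labeled validation arrays ``X1_val``, ``X2_val``, ``y_val`` are
hypothetical names, not part of this API)::

    from sklearn.metrics import mean_squared_error

    ctr = CTRegressor().fit([X1_train, X2_train], y_train)
    val_mse = mean_squared_error(y_val, ctr.predict([X1_val, X2_val]))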
Parameters
----------
estimator1: sklearn object, (only supports KNeighborsRegressor)
The regressor object which will be trained on view1 of the data.
estimator2: sklearn object, (only supports KNeighborsRegressor)
The regressor object which will be trained on view2 of the data.
k_neighbors: int, optional (default = 5)
The number of neighbors to be considered for determining the mean
squared error.
unlabeled_pool_size: int, optional (default = 50)
The number of unlabeled samples which will be kept in a
separate pool for regression and selection by the updated
regressor at each training iteration.
num_iter: int, optional (default = 100)
The maximum number of iterations to be performed
random_state: int, optional (default = None)
The seed used by the fit() method and other class operations.
Attributes
----------
estimator1_ : regressor object, used on view1
estimator2_ : regressor object, used on view2
class_name_: string
The name of the class.
k_neighbors_ : int
The number of neighbors to be considered for determining
the mean squared error.
unlabeled_pool_size: int
The number of unlabeled samples which will be kept in a
separate pool for regression and selection by the updated
regressor at each training iteration.
num_iter: int
The maximum number of iterations to be performed
n_views : int
The number of views in the data
Examples
--------
>>> from mvlearn.semi_supervised import CTRegressor
>>> from sklearn.neighbors import KNeighborsRegressor
>>> import numpy as np
>>> # X1 and X2 are the 2 views of the data
>>> X1 = [[0], [1], [2], [3], [4], [5], [6]]
>>> X2 = [[2], [3], [4], [6], [7], [8], [10]]
>>> y = [10, 11, 12, 13, 14, 15, 16]
>>> # Converting some of the labeled values to nan
>>> y_train = [10, np.nan, 12, np.nan, 14, np.nan, 16]
>>> knn1 = KNeighborsRegressor(n_neighbors = 2)
>>> knn2 = KNeighborsRegressor(n_neighbors = 2)
>>> ctr = CTRegressor(knn1, knn2, k_neighbors = 2, random_state = 42)
>>> ctr = ctr.fit([X1, X2], y_train)
>>> pred = ctr.predict([X1, X2])
>>> print("True value\n{}".format(y))
True value
[10, 11, 12, 13, 14, 15, 16]
>>> print("Predicted value\n{}".format(pred))
Predicted value
[10.75 11.25 11.25 13.25 13.25 14.75 15.25]
Notes
-----
Multi-view co-training is most helpful for tasks in semi-supervised
learning where each view offers unique information not seen in the
other. As is shown in the example notebooks for using this algorithm,
multi-view co-training can provide good regression results even
when the number of unlabeled samples far exceeds the number of labeled
samples. This regressor uses 2 sklearn regressors which work individually
on each view but share information, and thus result in improved
performance over fitting the views completely separately.
Both regressors must be ``KNeighborsRegressor``,
as described in [#1CTR]_.
Algorithm: Given
* a set *L1*, *L2* having labeled training
samples of each view respectively
* a set *U* of unlabeled samples
Create a pool *U'* of examples by choosing examples at random
from *U*
* Use *L1* to train a regressor *h1* (``estimator1``) that considers
only the view1 portion of the data (i.e. Xs[0])
* Use *L2* to train a regressor *h2* (``estimator2``) that considers
only the view2 portion of the data (i.e. Xs[1])
Loop for *T* iterations
* for each view *j*
* for each *u* in *U'*
* Calculate the neighbors of *u*
* Predict the value of *u* using *hj* estimator
* create a new estimator *hj'* with same parameters
as that of *hj* and train it on the data (*Lj* union *u*)
* predict the value of neighbors from estimator *hj*
and calculate the mean squared error with respect to
original values
* predict the value of neighbors from the new
estimator *hj'* and calculate the mean squared error
with respect to original values
* calculate the difference between the two errors
* store the error in a list named *deltaj*
* select the index with the maximum positive value from each of
*delta1* and *delta2*
* let the selected indexes be *index1* and *index2*
* Add *index1* to *L2*
* Add *index2* to *L1*
* Remove the selected indexes from *U'* and replenish
it by drawing unlabeled indexes from *U*
* Use *L1* to train the regressor *h1*
* Use *L2* to train the regressor *h2*
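The key quantity in each iteration is the per-candidate error
improvement *deltaj*. A minimal sketch of that computation for a single
view and a single candidate (illustrative only; ``estimator``, ``X``,
``y``, ``L``, ``u`` and ``k`` are hypothetical local names, not part of
this API)::

    from sklearn.metrics import mean_squared_error
    from sklearn.neighbors import KNeighborsRegressor

    # neighbors of u among the labeled points the estimator was fit on
    neigh = estimator.kneighbors(
        [X[u]], n_neighbors=k, return_distance=False)[0]
    # error on those neighbors before pseudo-labeling u
    mse_before = mean_squared_error(
        y[L][neigh], estimator.predict(X[L][neigh]))
    # refit a copy of the estimator with u pseudo-labeled
    new_est = KNeighborsRegressor(**estimator.get_params())
    y_new = y.copy()
    y_new[u] = estimator.predict([X[u]])
    new_est.fit(X[L + [u]], y_new[L + [u]])
    # error on the same neighbors after including u
    mse_after = mean_squared_error(
        y[L][neigh], new_est.predict(X[L][neigh]))
    delta = mse_before - mse_after  # positive means u helps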
References
----------
.. [#1CTR] Zhou, Zhi-Hua and Li, Ming. "Semi-supervised regression with
co-training." In Proceedings of the 19th International Joint
Conference on Artificial Intelligence, pages 908–913, 2005.
.. [#2CTR] Goldman, Sally, and Yan Zhou. "Enhancing supervised
learning with unlabeled data." In Proceedings of the Seventeenth
International Conference on Machine Learning (ICML), 2000.
"""
def __init__(
self,
estimator1=None,
estimator2=None,
k_neighbors=5,
unlabeled_pool_size=50,
num_iter=100,
random_state=None
):
# initialize a BaseCTEstimator object
super().__init__(estimator1, estimator2, random_state)
# If not given, initialize with default KNeighborsRegressor
if estimator1 is None:
estimator1 = KNeighborsRegressor()
if estimator2 is None:
estimator2 = KNeighborsRegressor()
# Initializing the other attributes
self.estimator1_ = estimator1
self.estimator2_ = estimator2
self.k_neighbors_ = k_neighbors
self.unlabeled_pool_size = unlabeled_pool_size
self.num_iter = num_iter
# Used in fit method while selecting a pool of unlabeled samples
random.seed(random_state)
self.n_views = 2
self.class_name_ = "CTRegressor"
# check whether the given parameters are valid
self._check_params()
def _check_params(self):
r"""
Checks that the co-training parameters are valid. Raises an
AttributeError if the estimators are invalid, and a ValueError if any
other parameter is not valid. The checks performed are:
- estimator1 and estimator2 are KNeighborsRegressor
- k_neighbors_ is positive
- unlabeled_pool_size is positive
- num_iter is positive
"""
# Both estimators must be KNeighborsRegressor instances
if not isinstance(self.estimator1_, KNeighborsRegressor) or \
        not isinstance(self.estimator2_, KNeighborsRegressor):
    raise AttributeError(
        "Both estimators need to be KNeighborsRegressor")
if self.k_neighbors_ <= 0:
raise ValueError("k_neighbors must be positive")
if self.unlabeled_pool_size <= 0:
raise ValueError("unlabeled_pool_size must be positive")
if self.num_iter <= 0:
raise ValueError("number of iterations must be positive")
def fit(self, Xs, y):
r"""
Fit the regressor object to the data in Xs, y.
Parameters
----------
Xs : list of array-likes or numpy.ndarray
- Xs length: n_views
- Xs[i] shape: (n_samples, n_features_i)
A list of the different views of data to train on.
y : array, shape (n_samples,)
The target values of the training data. Unlabeled examples
should have label np.nan.
Returns
-------
self : returns an instance of self
"""
# check whether Xs contain NaN and both Xs and y
# are consistent with each other
Xs, y = check_Xs_y_nan_allowed(
Xs, y, multiview=True, enforce_views=self.n_views)
y = np.array(y)
# Xs contains two views
X1 = Xs[0]
X2 = Xs[1]
# Storing the indexes of the unlabeled samples
U = [i for i, label in enumerate(y) if np.isnan(label)]
# Storing the indexes of the labeled samples
L = [i for i, label in enumerate(y) if not np.isnan(label)]
# Make a working copy of the labels for each view
# so that changes do not alter the original labels
y1 = y.copy()
y2 = y.copy()
# contains the indexes of labeled samples
L1 = L.copy()
L2 = L.copy()
# fitting the estimator object on the train data
self.estimator1_.fit(X1[L1], y1[L1])
self.estimator2_.fit(X2[L2], y2[L2])
# variable that keeps track of the number of iterations performed
it = 0
# Randomly select a pool of unlabeled sample indexes
unlabeled_pool = random.sample(
U, min(len(U), self.unlabeled_pool_size))
# Removing the unlabeled samples which were selected earlier
U = [i for i in U if i not in unlabeled_pool]
while it < self.num_iter and unlabeled_pool:
it += 1
# list of k nearest neighbors for all unlabeled samples
neighbors1 = self.estimator1_.kneighbors(
X1[unlabeled_pool],
n_neighbors=self.k_neighbors_,
return_distance=False)
neighbors2 = self.estimator2_.kneighbors(
X2[unlabeled_pool],
n_neighbors=self.k_neighbors_,
return_distance=False)
# Stores the delta value of each view
delta1 = []
delta2 = []
for u, neigh in zip(unlabeled_pool, neighbors1):
# Making a copy of L1 to include the unlabeled index
new_L1 = L1.copy()
new_L1.append(u)
# Predicts the value of unlabeled index
pred = self.estimator1_.predict(np.expand_dims(X1[u], axis=0))
# assigning the predicted value to new y
new_y1 = y1.copy()
new_y1[u] = pred
# predictions for the neighbors before inclusion of the
# unlabeled index
pred_before_inc = self.estimator1_.predict((X1[L1])[neigh])
# new estimator for training a regressor model on new L1
new_estimator = KNeighborsRegressor()
# Setting the same parameters as that of estimator1 object
new_estimator.set_params(**self.estimator1_.get_params())
new_estimator.fit(X1[new_L1], new_y1[new_L1])
# predictions for the neighbors after inclusion of the
# unlabeled index
pred_after_inc = new_estimator.predict((X1[L1])[neigh])
mse_before_inc = mean_squared_error(
(y1[L1])[neigh], pred_before_inc)
mse_after_inc = mean_squared_error(
(y1[L1])[neigh], pred_after_inc)
# appending the calculated value to delta1
delta1.append(mse_before_inc - mse_after_inc)
for u, neigh in zip(unlabeled_pool, neighbors2):
# Making a copy of L2 to include the unlabeled index
new_L2 = L2.copy()
new_L2.append(u)
# Predict the value of the unlabeled index
pred = self.estimator2_.predict(
    np.expand_dims(X2[u], axis=0))
# assigning the predicted value to new y
new_y2 = y2.copy()
new_y2[u] = pred
# prediction array before inclusion of unlabeled index
pred_before_inc = self.estimator2_.predict((X2[L2])[neigh])
# new estimator for training a regressor model on new L2
new_estimator = KNeighborsRegressor()
# Setting the same parameters as that of estimator2 object
new_estimator.set_params(**self.estimator2_.get_params())
new_estimator.fit(X2[new_L2], new_y2[new_L2])
# predictions for the neighbors after inclusion of the
# unlabeled index
pred_after_inc = new_estimator.predict((X2[L2])[neigh])
mse_before_inc = mean_squared_error(
(y2[L2])[neigh], pred_before_inc)
mse_after_inc = mean_squared_error(
(y2[L2])[neigh], pred_after_inc)
# appending the calculated value to delta2
delta2.append(mse_before_inc - mse_after_inc)
delta1_index = np.argsort(delta1)
delta2_index = np.argsort(delta2)
# list containing the indexes to be included
to_include1 = []
to_include2 = []
"""
If the length of both the delta's is equal to 1 then
include the corresponding index whose value is positive and
greater than the other values.
Else selecting the indexes which have postive and maximum
value from each delta's and incase both the indexes are equal
then look at the second best positive value.
The indexes which are selected from delta1
will be added to the labels of the estimator2 object.
Similarly, the indexes which are selected from delta2
will be added to the labels of the estimator1 object.
"""
if delta1_index.shape[0] == 1 and delta2_index.shape[0] == 1:
if delta1[0] > 0 and delta2[0] > 0:
if delta1[0] >= delta2[0]:
L2.append(unlabeled_pool[0])
to_include2.append(0)
else:
L1.append(unlabeled_pool[0])
to_include1.append(0)
elif delta1[0] > 0:
L2.append(unlabeled_pool[0])
to_include2.append(0)
elif delta2[0] > 0:
L1.append(unlabeled_pool[0])
to_include1.append(0)
else:
# Top two indexes from each delta
index1_1, index1_2 = delta1_index[-1], delta1_index[-2]
index2_1, index2_2 = delta2_index[-1], delta2_index[-2]
if index1_1 != index2_1:
if delta1[index1_1] > 0:
L2.append(unlabeled_pool[index1_1])
to_include2.append(index1_1)
if delta2[index2_1] > 0:
L1.append(unlabeled_pool[index2_1])
to_include1.append(index2_1)
else:
if delta1[index1_1] > 0 and delta2[index2_1] > 0:
if delta1[index1_1] >= delta2[index2_1]:
L2.append(unlabeled_pool[index1_1])
to_include2.append(index1_1)
if delta2[index2_2] > 0:
L1.append(unlabeled_pool[index2_2])
to_include1.append(index2_2)
else:
L1.append(unlabeled_pool[index2_1])
to_include1.append(index2_1)
if delta1[index1_2] > 0:
L2.append(unlabeled_pool[index1_2])
to_include2.append(index1_2)
elif delta1[index1_1] > 0:
L2.append(unlabeled_pool[index1_1])
to_include2.append(index1_1)
elif delta2[index2_1] > 0:
L1.append(unlabeled_pool[index2_1])
to_include1.append(index2_1)
# break if both to_include1 and to_include2 are empty
if len(to_include1) == 0 and len(to_include2) == 0:
break
# pseudo-label the indexes added to L1 with estimator2's predictions
for i in to_include1:
pred = self.estimator2_.predict(
np.expand_dims(X2[unlabeled_pool[i]], axis=0))
y1[unlabeled_pool[i]] = pred
# pseudo-label the indexes added to L2 with estimator1's predictions
for i in to_include2:
pred = self.estimator1_.predict(
np.expand_dims(X1[unlabeled_pool[i]], axis=0))
y2[unlabeled_pool[i]] = pred
# to_include1 and to_include2 currently hold positions within
# unlabeled_pool, in the order the samples are stored there.
# Convert them to the sample indexes that unlabeled_pool stores,
# e.g. unlabeled_pool = [10, 15, 17] and to_include = [1, 2]
# becomes to_include = [15, 17]
to_include1 = [unlabeled_pool[i] for i in to_include1]
to_include2 = [unlabeled_pool[i] for i in to_include2]
# removing the selected index
unlabeled_pool = [
u for u in unlabeled_pool
if (u not in to_include1) and (u not in to_include2)]
# replenishing the unlabeled pool
for u in U:
if len(unlabeled_pool) < self.unlabeled_pool_size:
if u not in unlabeled_pool:
unlabeled_pool.append(u)
else:
break
U = [i for i in U if i not in unlabeled_pool]
# fitting the model on new train data
self.estimator1_.fit(X1[L1], y1[L1])
self.estimator2_.fit(X2[L2], y2[L2])
return self
def predict(self, Xs):
r"""
Predict the values of the samples in the two input views.
Parameters
----------
Xs : list of array-likes or numpy.ndarray
- Xs length: n_views
- Xs[i] shape: (n_samples, n_features_i)
A list of the different views of data to predict.
Returns
-------
y_pred : array-like, shape (n_samples,)
The average of the predictions from both estimators.
"""
Xs = check_Xs(Xs, multiview=True, enforce_views=self.n_views)
X1 = Xs[0]
X2 = Xs[1]
# predicting the value of each view
pred1 = self.estimator1_.predict(X1)
pred2 = self.estimator2_.predict(X2)
# Return the average of the two views' predictions
return (pred1 + pred2) * 0.5