# Source code for mvlearn.datasets.nutrimouse

"""Nutrimouse dataset loader"""

# Authors: Ronan Perry
#
# License: MIT

from os.path import dirname, join
import numpy as np
from sklearn.utils import Bunch
import csv


def load_nutrimouse(return_Xs_y=False):
    r"""
    Load the Nutrimouse dataset [#1paper]_, a two-view dataset from a nutrition
    study on mice, as available from
    https://CRAN.R-project.org/package=CCA [#2r]_.

    Parameters
    ----------
    return_Xs_y : bool, default=False
        If ``True``, returns an ``(Xs, y)`` tuple of the multiple views and
        the integer-encoded sample labels. Otherwise returns all the data in
        a dictionary-like `sklearn.utils.Bunch` object.

    Returns
    -------
    dataset : `sklearn.utils.Bunch` object following key: value pairs
        (see Notes for details). Returned if ``return_Xs_y`` is False.
        gene : numpy.ndarray, shape (40, 120)
            The gene expressions (1st view).
        lipid : numpy.ndarray, shape (40, 21)
            The fatty acid concentrations (2nd view)
        genotype : numpy.ndarray, shape (40,)
            The genotype label (1st label), encoded as integer indices into
            ``genotype_names``.
        diet : numpy.ndarray, shape (40,)
            The diet label (2nd label), encoded as integer indices into
            ``diet_names``.
        gene_feature_names : list, length 120
            The names of the genes, as parsed from the CSV header.
        lipid_feature_names : list, length 21
            The names of the fatty acids, as parsed from the CSV header.
        genotype_names : numpy.ndarray
            The sorted unique genotype class names.
        diet_names : numpy.ndarray
            The sorted unique diet class names.

    (Xs, y) : 2-tuple of the multiple views and the labels
        Returned if ``return_Xs_y`` is True. ``Xs`` is the list
        ``[gene, lipid]`` and ``y`` is an integer array of shape (40, 2)
        whose columns are the encoded ``genotype`` and ``diet`` labels.

    Notes
    -----
    This data consists of two views from a nutrition study of 40 mice:

    - gene : expressions of 120 potentially relevant genes
    - lipid : concentrations of 21 hepatic fatty acids

    Each mouse has two labels, four mice per pair of labels:

    - genotype (2 classes) : wt, ppar
    - diet (5 classes) : REF, COC, SUN, LIN, FISH

    References
    ----------
    .. [#1paper] P. Martin, H. Guillou, F. Lasserre, S. Déjean, A. Lan, J-M.
            Pascussi, M. San Cristobal, P. Legrand, P. Besse, T. Pineau.
            "Novel aspects of PPARalpha-mediated regulation of lipid and
            xenobiotic metabolism revealed through a nutrigenomic study."
            Hepatology, 2007.

    .. [#2r] González I., Déjean S., Martin P.G.P and Baccini, A. (2008) CCA:
            "An R Package to Extend Canonical Correlation Analysis." Journal
            of Statistical Software, 23(12).

    Examples
    --------
    >>> from mvlearn.datasets import load_nutrimouse
    >>> # Load both views and labels
    >>> Xs, y = load_nutrimouse(return_Xs_y=True)
    >>> print(len(Xs))
    2
    >>> print([X.shape for X in Xs])
    [(40, 120), (40, 21)]
    >>> print(y.shape)
    (40, 2)
    """

    # The CSV files ship alongside this module in the "nutrimouse" folder.
    data_dir = join(dirname(__file__), "nutrimouse")
    Xs_filenames = ["gene", "lipid"]
    y_filenames = ["genotype", "diet"]
    dataset = Bunch()

    for fname in Xs_filenames:
        csv_file = join(data_dir, fname + '.csv')
        # names=True reads the header row as field names and yields a
        # structured array with one field per column.
        X = np.genfromtxt(csv_file, delimiter=',', names=True)
        feature_names = X.dtype.names
        # genfromtxt's default dtype is float, so the record array can be
        # reinterpreted as a plain 2D (n_samples, n_features) float array.
        dataset[fname] = X.view((float, len(feature_names)))
        dataset[f'{fname}_feature_names'] = list(feature_names)

    for fname in y_filenames:
        csv_file = join(data_dir, fname + '.csv')
        with open(csv_file, newline='') as f:
            rows = list(csv.reader(f))
        # Drop the header row and collapse the singleton column dimension.
        raw_labels = np.asarray(rows[1:]).squeeze()
        # Encode string labels as indices into the sorted unique class names.
        class_names, y = np.unique(raw_labels, return_inverse=True)
        dataset[fname] = y
        dataset[f'{fname}_names'] = class_names

    if not return_Xs_y:
        return dataset

    Xs = [dataset[X_key] for X_key in Xs_filenames]
    # One column per label type -> shape (n_samples, n_label_types).
    y = np.vstack([dataset[y_key] for y_key in y_filenames]).T
    return (Xs, y)