# Source code for mvlearn.datasets.nutrimouse

"""Nutrimouse dataset loader"""

# Authors: Ronan Perry
#
# License: MIT

from os.path import dirname, join
import numpy as np
from sklearn.utils import Bunch
import csv


def load_nutrimouse(return_Xs_y=False):
    r"""
    Load the Nutrimouse dataset [#1paper]_, a two-view dataset from a
    nutrition study on mice, as available from
    https://CRAN.R-project.org/package=CCA [#2r]_.

    Parameters
    ----------
    return_Xs_y : bool, default=False
        If ``True``, returns an ``(Xs, y)`` tuple of the multiple views and
        the integer-encoded sample labels. Otherwise returns all the data in
        a dictionary-like `sklearn.utils.Bunch` object.

    Returns
    -------
    dataset : `sklearn.utils.Bunch` object
        Dictionary-like object with the following key: value pairs (see
        Notes for details). Returned if ``return_Xs_y`` is False.

        gene : numpy.ndarray, shape (40, 120)
            The gene expressions (1st view).
        lipid : numpy.ndarray, shape (40, 21)
            The fatty acid concentrations (2nd view).
        genotype : numpy.ndarray, shape (40,)
            The genotype label (1st label), stored as integer indices into
            ``genotype_names``.
        diet : numpy.ndarray, shape (40,)
            The diet label (2nd label), stored as integer indices into
            ``diet_names``.
        gene_feature_names : list, length 120
            The names of the genes.
        lipid_feature_names : list, length 21
            The names of the fatty acids.
        genotype_names : numpy.ndarray
            The unique genotype class names; ``genotype_names[genotype]``
            recovers the original string labels.
        diet_names : numpy.ndarray
            The unique diet class names; ``diet_names[diet]`` recovers the
            original string labels.

    (Xs, y) : 2-tuple
        Returned if ``return_Xs_y`` is True. ``Xs`` is the list
        ``[gene, lipid]`` of views and ``y`` is a numpy.ndarray of shape
        (40, 2) whose columns are the integer-encoded ``genotype`` and
        ``diet`` labels, in that order.

    Notes
    -----
    This data consists of two views from a nutrition study of 40 mice:

    - gene : expressions of 120 potentially relevant genes
    - lipid : concentrations of 21 hepatic fatty acids

    Each mouse has two labels, four mice per pair of labels:

    - genotype (2 classes) : wt, ppar
    - diet (5 classes) : REF, COC, SUN, LIN, FISH

    References
    ----------
    .. [#1paper] P. Martin, H. Guillou, F. Lasserre, S. Déjean, A. Lan,
            J-M. Pascussi, M. San Cristobal, P. Legrand, P. Besse,
            T. Pineau. "Novel aspects of PPARalpha-mediated regulation of
            lipid and xenobiotic metabolism revealed through a
            nutrigenomic study." Hepatology, 2007.

    .. [#2r] González I., Déjean S., Martin P.G.P and Baccini, A. (2008)
            CCA: "An R Package to Extend Canonical Correlation Analysis."
            Journal of Statistical Software, 23(12).

    Examples
    --------
    >>> from mvlearn.datasets import load_nutrimouse
    >>> # Load both views and labels
    >>> Xs, y = load_nutrimouse(return_Xs_y=True)
    >>> print(len(Xs))
    2
    >>> print([X.shape for X in Xs])
    [(40, 120), (40, 21)]
    >>> print(y.shape)
    (40, 2)
    """
    # The CSV files ship alongside this module in a "nutrimouse" folder.
    data_dir = join(dirname(__file__), "nutrimouse")
    Xs_filenames = ["gene", "lipid"]
    y_filenames = ["genotype", "diet"]

    dataset = Bunch()

    for fname in Xs_filenames:
        csv_file = join(data_dir, fname + '.csv')
        # names=True parses the header row into field names, producing a
        # structured array with one named field per column.
        X = np.genfromtxt(csv_file, delimiter=',', names=True)
        # Reinterpret the structured rows as a plain 2-D float array
        # (n_samples, n_features).
        dataset[fname] = X.view((float, len(X.dtype.names)))
        dataset[f'{fname}_feature_names'] = list(X.dtype.names)

    for fname in y_filenames:
        csv_file = join(data_dir, fname + '.csv')
        with open(csv_file, newline='') as f:
            # Skip the header row, then drop singleton dimensions.
            y = np.asarray(list(csv.reader(f))[1:]).squeeze()
        # Encode the string labels as indices into the unique class names.
        class_names, y = np.unique(y, return_inverse=True)
        dataset[fname] = y
        dataset[f'{fname}_names'] = class_names

    if return_Xs_y:
        Xs = [dataset[X_key] for X_key in Xs_filenames]
        # One column per label type: (genotype, diet).
        y = np.vstack([dataset[y_key] for y_key in y_filenames]).T
        return (Xs, y)

    return dataset