Source code for libuplift.datasets.Tamoxifen

"""The Tamoxifen dataset from Melania Pintilie's book "Competing
Risks, A Practical Perspective".

"""

import numpy as np

from .base import _fetch_remote_csv
from .base import RemoteFileMetadata

# Metadata handed to the CSV loader for this dataset.  No file name or
# checksum is recorded; the "local:" URL prefix presumably tells the loader
# to use a copy shipped with the package -- confirm against
# ``_fetch_remote_csv``.
ARCHIVE = RemoteFileMetadata(
    filename=None,
    url="local:Tamoxifen_data",
    checksum=None,
)


[docs]
def fetch_Tamoxifen(data_home=None, download_if_missing=True,
                    random_state=None, shuffle=False,
                    categ_as_strings=False, return_X_y=False,
                    as_frame=False):
    """Load the Tamoxifen randomized trial dataset.

    The data comes from Melania Pintilie's book "Competing Risks, A
    Practical Perspective".  The original study is described in [1]_.

    Uses a local copy of the data.

    **Treatment**

    - treatment: coded ``T`` (tamoxifen) or ``B`` (radiation+tamoxifen)

    **Targets**

    - target_surv_time: survival time
    - target_surv_status: 1=death
    - target_loctime: presumably time to local relapse (left blank in
      the original description -- TODO confirm)
    - target_lcens: 1=local relapse
    - target_axltime: time to axillary relapse
    - target_acens: 1=axillary relapse
    - target_distime: time to distance relapse
    - target_dcens: 1=distance relapse
    - target_maltime: time to second malignancy
    - target_mcens: 1=second malignancy

    **Features**

    - pathsize, hgb, age: declared as float
    - hist: one of DUC, LOB, MED, MIX, MUC, OTH
    - hrlevel: NEG or POS
    - nodediss: N or Y

    Parameters
    ----------
    data_home : string, optional
        Specify another download and cache folder for the datasets. By
        default all scikit-learn data is stored in '~/scikit_learn_data'
        subfolders.
        NOTE(review): this function accepts the argument but does not
        forward it to the loader -- confirm whether that is intended.

    download_if_missing : boolean, default=True
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    random_state : int, RandomState instance or None (default)
        Determines random number generation for dataset shuffling. Pass
        an int for reproducible output across multiple function calls.

    shuffle : bool, default=False
        Whether to shuffle dataset.

    categ_as_strings : bool, default=False
        Whether to return categorical variables as strings.

    return_X_y : boolean, default=False.
        If True, returns a tuple of arrays instead of a Bunch-like
        object.

    as_frame : boolean, default=False
        If True features are returned as pandas DataFrame.  If False
        features are returned as object or float array.  Float array
        is returned if all features are floats.

    Returns
    -------
    dataset : dict-like object with the following attributes:

    dataset.data : numpy array
        Each row corresponds to the features in the dataset.

    dataset.descr : string
        Description of the dataset (the docstring of this module).
        NOTE(review): earlier documentation called this attribute
        ``DESCR``; only lower-case ``descr`` is assigned here.

    tuple if ``return_X_y`` is True
        Exactly what the underlying loader returns in that mode; the
        earlier documentation described it as
        ``(data, target_time, target_status)`` -- TODO confirm against
        ``_fetch_remote_csv``.

    References
    ----------

    .. [1] A.W. Fyles, et al., "Tamoxifen with or without breast
       irradiation in women 50 years of age or older with early breast
       cancer". New England Journal of Medicine, 351(10), 963--970,
       2004 (https://www.nejm.org/doi/10.1056/NEJMoa040595).

    """
    # Attribute specifications handed to the loader.  The optional third
    # tuple element presumably names the source column when it differs
    # from the exposed attribute name -- confirm against
    # ``_fetch_remote_csv``.
    covariate_spec = [
        ("pathsize", float),
        ("hist", ["DUC", "LOB", "MED", "MIX", "MUC", "OTH"]),
        ("hgb", float),
        ("hrlevel", ["NEG", "POS"]),
        ("nodediss", ["N", "Y"]),
        ("age", float),
    ]
    outcome_spec = [
        ("target_surv_time", float, "survtime"),
        ("target_surv_status", np.int32, "stat"),
        ("target_loctime", float, "loctime"),
        ("target_lcens", np.int32, "lcens"),
        ("target_axltime", float, "axltime"),
        ("target_acens", np.int32, "acens"),
        ("target_distime", float, "distime"),
        ("target_dcens", np.int32, "dcens"),
        ("target_maltime", float, "maltime"),
        ("target_mcens", np.int32, "mcens"),
    ]
    arm_labels = {"T": "tamoxifen", "B": "radiation+tamoxifen"}
    arm_spec = [("treatment", arm_labels, "tx")]

    dataset = _fetch_remote_csv(
        ARCHIVE,
        "Tamoxifen",
        feature_attrs=covariate_spec,
        treatment_attrs=arm_spec,
        target_attrs=outcome_spec,
        categ_as_strings=categ_as_strings,
        return_X_y=return_X_y,
        as_frame=as_frame,
        download_if_missing=download_if_missing,
        random_state=random_state,
        shuffle=shuffle,
        # Matches the 1 treatment + 10 target + 6 feature attributes
        # declared above.
        total_attrs=17,
    )

    # In tuple mode there is no container to attach a description to.
    if return_X_y:
        return dataset

    # ``__doc__`` resolves to the module-level docstring here, not to
    # this function's own docstring.
    dataset.descr = __doc__
    return dataset