# Source code for libuplift.datasets.Starbucks

"""The Starbucks dataset.

"""

import numpy as np

from .base import _fetch_remote_csv
from .base import RemoteFileMetadata


# Remote location of the single-table Starbucks training CSV, together with
# the checksum recorded for it (64 hex digits -- presumably SHA256; confirm
# against the download helper in ``.base``).
ARCHIVE = RemoteFileMetadata(
    filename="training.csv",
    url=(
        "https://raw.githubusercontent.com/"
        "01KAT1/Marketing-Promotion-Campaign-Uplift-Modelling-Starbucks-Dataset/"
        "main/training.csv"
    ),
    checksum="2ac9d5c601b134b9b69e742b97d016529c8082f6a611d5b243ab0c5b2ceaf83e",
)

def fetch_Starbucks(data_home=None, download_if_missing=True,
                    random_state=None, shuffle=False,
                    categ_as_strings=False, return_X_y=False,
                    as_frame=False):
    """Load the Starbucks dataset, downloading it if necessary.

    Several versions of this dataset exist.  The one used here is
    https://raw.githubusercontent.com/01KAT1/Marketing-Promotion-Campaign-Uplift-Modelling-Starbucks-Dataset/main/training.csv
    because it is easy to use and has been used in many uplift modeling
    papers.

    An original version consisting of several tables can be found at
    https://github.com/Shuniy/starbucks/tree/main

    See also an online post about analyzing the data:
    https://medium.com/@nesreensada/how-to-build-a-profitable-promotion-strategy-easily-with-uplift-modeling-26b2addc3e46

    Parameters
    ----------
    data_home : string, optional
        Intended to specify another download and cache folder for the
        datasets.  NOTE(review): this argument is currently not forwarded
        to the loader, so it has no effect in this function.

    download_if_missing : boolean, default=True
        Forwarded to the loader.  If False, an IOError is expected when the
        data is not locally available, instead of a download attempt.

    random_state : int, RandomState instance or None (default)
        Forwarded to the loader; controls random number generation for
        dataset shuffling.  Pass an int for reproducible output across
        multiple function calls.

    shuffle : bool, default=False
        Whether to shuffle the dataset.

    categ_as_strings : bool, default=False
        Whether to return categorical variables as strings.

    return_X_y : boolean, default=False
        If True, the loader's result is returned as-is (a tuple rather
        than a dict-like object) and no description is attached.

    as_frame : boolean, default=False
        If True, features are returned as a pandas DataFrame.  If False,
        features are returned as an object or float array.

    Returns
    -------
    dataset : dict-like object
        The object produced by the loader, with these attributes:

        dataset.data : numpy array
            Each row corresponds to the features ``V1`` .. ``V7``.
        dataset.target_purchase : numpy array
            Indicator whether a purchase was made.
        dataset.descr : string
            The module docstring, attached by this function.

    (data, target_purchase) : tuple if ``return_X_y`` is True
        NOTE(review): the exact tuple layout (e.g. whether the treatment
        is included) is decided by the loader -- confirm there.
    """
    # NOTE(review): ``data_home`` is accepted for API symmetry but is not
    # passed on below; confirm whether ``_fetch_remote_csv`` supports it.

    # Feature columns: categorical ones list their levels, numeric ones
    # give a dtype.  An ``id`` entry is deliberately not part of this list.
    features = [
        ("V1", ["0", "1", "2", "3"]),
        ("V2", float),
        ("V3", float),
        ("V4", ["1", "2"]),
        ("V5", ["1", "2", "3", "4"]),
        ("V6", ["1", "2", "3", "4"]),
        ("V7", ["1", "2"]),
    ]
    # Treatment and target descriptors carry a third element, presumably
    # the name of the corresponding column in the CSV file.
    treatment = [("treatment", ["No", "Yes"], "Promotion")]
    target = [("target_purchase", np.int32, "purchase")]

    dataset = _fetch_remote_csv(
        ARCHIVE,
        "Starbucks",
        feature_attrs=features,
        treatment_attrs=treatment,
        target_attrs=target,
        categ_as_strings=categ_as_strings,
        return_X_y=return_X_y,
        as_frame=as_frame,
        download_if_missing=download_if_missing,
        random_state=random_state,
        shuffle=shuffle,
        total_attrs=10,
    )

    # The tuple form has nowhere to hang a description.
    if return_X_y:
        return dataset

    dataset.descr = __doc__
    return dataset