# Source code for libuplift.datasets.Criteo

"""Criteo online advertising dataset.

See https://ailab.criteo.com/criteo-uplift-prediction-dataset/ for details.
"""

import numpy as np

from .base import _fetch_remote_csv
from .base import RemoteFileMetadata


# Location and checksum of the compressed Criteo uplift CSV.
#
# The file was originally published at
#     https://go.criteo.net/criteo-research-uplift-v2.1.csv.gz
# with the same file name and checksum as below.  The copy attached to a
# GitHub release is used instead because it seems faster to download.
ARCHIVE = RemoteFileMetadata(
    filename="criteo-research-uplift-v2.1.csv.gz",
    url=(
        "https://github.com/jszymon/uplift_sklearn_data/"
        "releases/download/Criteo/"
        "criteo-research-uplift-v2.1.csv.gz"
    ),
    checksum=(
        "2716e1bf0fd157a93b5bf86924d9088419dfbac2022c6cd90030220634f616dc"
    ),
)


def fetch_Criteo(data_home=None, download_if_missing=True,
                 random_state=None, shuffle=False,
                 categ_as_strings=False, return_X_y=False,
                 as_frame=False):
    """Load the Criteo dataset.

    Download it if necessary.

    Parameters
    ----------
    data_home : string, optional
        Specify another download and cache folder for the datasets. By
        default all scikit-learn data is stored in '~/scikit_learn_data'
        subfolders.
        Note: this function does not currently forward ``data_home`` to
        the download helper.

    download_if_missing : boolean, default=True
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    random_state : int, RandomState instance or None (default)
        Determines random number generation for dataset shuffling. Pass
        an int for reproducible output across multiple function calls.

    shuffle : bool, default=False
        Whether to shuffle dataset.

    categ_as_strings : bool, default=False
        Whether to return categorical variables as strings.

    return_X_y : boolean, default=False.
        If True, returns ``(data.data, data.target)`` instead of a Bunch
        object.

    as_frame : boolean, default=False
        If True features are returned as pandas DataFrame.  If False
        features are returned as object or float array.  Float array is
        returned if all features are floats.

    Returns
    -------
    dataset : dict-like object with the following attributes:

    dataset.data : numpy array
        Each row corresponds to the features in the dataset.

    dataset.target_visit : numpy array
        Each value is 1 if website visit occurred 0 otherwise.

    dataset.target_conversion : numpy array
        Each value is 1 if purchase occurred 0 otherwise.

    dataset.target_exposure : numpy array
        Whether the user has been exposed to the ad

    dataset.descr : string
        Description of the dataset (this module's docstring).

    (data, target_conversion, target_visit, target_exposure) : tuple
        if ``return_X_y`` is True
    """
    # Attribute descriptions handed to the generic CSV fetcher.
    treatment_descr = [("treatment", np.int32)]
    # The third element of each target entry is presumably the column
    # name used in the CSV file -- confirm against _fetch_remote_csv.
    target_descr = [("target_conversion", np.int32, "conversion"),
                    ("target_visit", np.int32, "visit"),
                    ("target_exposure", np.int32, "exposure")]
    feature_descr = [(f"f{i}", float) for i in range(12)]

    # NOTE(review): ``data_home`` is accepted above but not passed on
    # here; confirm whether _fetch_remote_csv takes a ``data_home``
    # argument before forwarding it.
    # total_attrs=16 matches 12 features + 1 treatment + 3 targets.
    ret = _fetch_remote_csv(ARCHIVE, "Criteo",
                            feature_attrs=feature_descr,
                            treatment_attrs=treatment_descr,
                            target_attrs=target_descr,
                            categ_as_strings=categ_as_strings,
                            return_X_y=return_X_y,
                            as_frame=as_frame,
                            download_if_missing=download_if_missing,
                            random_state=random_state,
                            shuffle=shuffle,
                            total_attrs=16,
                            all_num=True)
    if not return_X_y:
        # ``__doc__`` resolves to the module-level docstring here.
        ret.descr = __doc__
    return ret