# Source code for libuplift.datasets.Lalonde

"""The Lalonde (jobs) datasets.

There are two versions of the dataset, including 1974 earnings
(version B) and excluding 1974 earnings (version A).
"""


from os.path import dirname, exists, join
from os import remove, makedirs
import csv

import numpy as np

from sklearn.datasets import get_data_home
from .base import _fetch_remote_csv
from .base import _prepare_final_data
from .base import RemoteFileMetadata
from sklearn.utils import Bunch
import joblib
from sklearn.utils import check_random_state

# Remote file descriptors for the raw Lalonde data files.
#
# Each version of the dataset is published as two separate files: one with
# the treated group (suffix ``_T``) and one with the control group (suffix
# ``_C``).  The checksums are 64 hex characters long -- presumably SHA-256
# digests used to validate the download; confirm against ``.base``.

# Version A: files without the 1974 earnings column.
ARCHIVE_A_T = RemoteFileMetadata(
    filename="Lalonde_A_T.txt",
    url="http://www.nber.org/~rdehejia/data/nsw_treated.txt",
    checksum=(
        "ab65cd58de17a78b692e66e4d7142192"
        "59ac180428f24c42ddbb928cfb1820fe"
    ),
)
ARCHIVE_A_C = RemoteFileMetadata(
    filename="Lalonde_A_C.txt",
    url="http://www.nber.org/~rdehejia/data/nsw_control.txt",
    checksum=(
        "8fd745ed2c3426bb77e34b395fb84d45"
        "6d346ba545af58a65b4963160b0699fd"
    ),
)

# Version B: files that include the 1974 earnings column.
ARCHIVE_B_T = RemoteFileMetadata(
    filename="Lalonde_B_T.txt",
    url="http://www.nber.org/~rdehejia/data/nswre74_treated.txt",
    checksum=(
        "e7b742fe0ff07a0f45e129b4ff108bb9"
        "611cd83d53604732c48a8a0a3e20eda3"
    ),
)
ARCHIVE_B_C = RemoteFileMetadata(
    filename="Lalonde_B_C.txt",
    url="http://www.nber.org/~rdehejia/data/nswre74_control.txt",
    checksum=(
        "a1364cea459d953dc691a667d99194b4"
        "ad335d6d550354fe23a5d2dc58d729b5"
    ),
)


def fetch_Lalonde(version="A", data_home=None, categ_as_strings=False,
                  download_if_missing=True, random_state=None,
                  shuffle=False, return_X_y=False, as_frame=False):
    """Load the Lalonde datasets (uplift regression).

    Download it if necessary.

    There are two versions of the dataset, including 1974 earnings
    (version B) and excluding 1974 earnings (version A).  Each version is
    distributed as two files (treated and control); they are fetched
    separately and concatenated with the control rows first, followed by
    the treated rows.

    Source: http://users.nber.org/~rdehejia/data/nswdata2.html

    Parameters
    ----------
    version : string, optional
        Specify which dataset to return. 'A' for larger files
        without 1974 earnings, 'B' for smaller files with 1974
        earnings.

    data_home : string, optional
        Specify another download and cache folder for the datasets.
        NOTE(review): this argument is accepted for API compatibility but
        is not forwarded to the download helper by this function, so it
        currently has no effect here.

    categ_as_strings : boolean, default=False
        Forwarded unchanged to the CSV loading helper; presumably controls
        whether categorical attributes are returned as strings (confirm
        against ``.base._fetch_remote_csv``).

    download_if_missing : boolean, default=True
        If False, raise a IOError if the data is not locally available
        instead of trying to download the data from the source site.

    random_state : int, RandomState instance or None (default)
        Determines random number generation for dataset shuffling. Pass an
        int for reproducible output across multiple function calls.
        Forwarded to the CSV loading helper.

    shuffle : bool, default=False
        Whether to shuffle dataset.  The two source files are always
        loaded unshuffled; shuffling is applied only to the combined data.

    return_X_y : boolean, default=False.
        If True, the return value is produced by the final-data helper in
        its tuple form instead of a Bunch-like object.

    as_frame : boolean, default=False
        If True features are returned as pandas DataFrame.  If False
        features are returned as object or float array.  Float array is
        returned if all features are floats.

    Returns
    -------
    dataset : dict-like object assembled from the following attributes
        (the final layout is decided by ``_prepare_final_data``):

        dataset.data : array or DataFrame
            Feature columns ``age``, ``education``, ``Black``,
            ``Hispanic``, ``married``, ``nodegree`` and ``RE75``
            (version A), with ``RE74`` additionally present before
            ``RE75`` in version B.

        dataset.treatment : array
            Treatment indicator: 0 for control rows, 1 for treated rows.

        dataset.target_RE78 : array
            The ``RE78`` column of the source files (regression target).

        dataset.descr : string
            The docstring of this module.  Only set when ``return_X_y``
            is False.

    Raises
    ------
    ValueError
        If ``version`` is neither ``"A"`` nor ``"B"``.
    """
    # Fail fast on an unknown version, before any work is done.
    if version == "A":
        version_suffix = "_A"
        arch_t, arch_c = ARCHIVE_A_T, ARCHIVE_A_C
        earnings_features = ["RE75"]
    elif version == "B":
        version_suffix = "_B"
        arch_t, arch_c = ARCHIVE_B_T, ARCHIVE_B_C
        earnings_features = ["RE74", "RE75"]
    else:
        raise ValueError("Lalonde dataset version must be A or B")

    # Converter for the treatment column; the (array, dtype) return shape
    # is whatever the helpers in ``.base`` expect and is kept unchanged.
    def _float_to_int(x):
        return np.array(x, float), np.int32

    # Attribute descriptions.  Header, feature list and field count are all
    # derived from a single definition so that they cannot get out of sync.
    common_features = ["age", "education", "Black", "Hispanic",
                       "married", "nodegree"]
    feature_names = common_features + earnings_features
    header = ["treatment"] + feature_names + ["RE78"]
    n_fields = len(header)

    treatment_descr = [("treatment", _float_to_int)]
    target_descr = [("target_RE78", float, "RE78")]
    feature_descr = [(name, float) for name in feature_names]

    # The source files are space separated with variable-width padding.
    csv_reader_args = {"delimiter": ' ', "skipinitialspace": True}

    # Arguments shared by the treated and the control download.  Shuffling
    # is disabled here on purpose: it is applied once to the combined data.
    fetch_kwargs = dict(
        feature_attrs=feature_descr,
        treatment_attrs=treatment_descr,
        target_attrs=target_descr,
        categ_as_strings=categ_as_strings,
        return_X_y=False,
        as_frame=as_frame,
        download_if_missing=download_if_missing,
        random_state=random_state,
        shuffle=False,
        total_attrs=n_fields,
        header=header,
        csv_reader_args=csv_reader_args,
    )

    D_T = _fetch_remote_csv(arch_t, "Lalonde_T" + version_suffix,
                            **fetch_kwargs)
    # Sanity check: the treated file must contain only treated cases.
    assert np.all(D_T.treatment == 1), \
        "treated file contains non-treated rows"

    D_C = _fetch_remote_csv(arch_c, "Lalonde_C" + version_suffix,
                            **fetch_kwargs)
    # Sanity check: the control file must contain only control cases.
    assert np.all(D_C.treatment == 0), \
        "control file contains non-control rows"

    # Combine treatment and control datasets.  The control container is
    # reused (and mutated) as the result; control rows come first.
    D = D_C
    if as_frame:
        import pandas
        D.data = pandas.concat([D_C.data, D_T.data], ignore_index=True)
    else:
        D.data = np.concatenate([D_C.data, D_T.data])
    D.treatment = np.concatenate([D_C.treatment, D_T.treatment])
    D.target_RE78 = np.concatenate([D_C.target_RE78, D_T.target_RE78])

    # Prepare final return value.
    ret = _prepare_final_data(D, shuffle, return_X_y)
    if not return_X_y:
        ret.descr = __doc__
    return ret