# Nested cross-validation benchmark: XGBSE survival model on radiomics/clinical data.
# Standard library
import itertools
import time

# Third-party
import numpy as np
import pandas as pd
import sklearn.model_selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from xgbse import XGBSEKaplanNeighbors
# Dataset: Radiomics_Clinical_GMMC_2022_09_22-CT_der_PET-mix.csv
# https://github.com/loft-br/xgboost-survival-embeddings/files/9636594/Radiomics_Clinical_GMMC_2022_09_22-CT_der_PET-mix.csv
# Load the radiomics + clinical table; the 'cohort' column (presumably
# 1 = training/test, 2 = external validation -- confirm against the data
# dictionary) becomes an integer index.
csv_path = "C:/Radiomics_Clinical_GMMC_2022_09_22-CT_der_PET-mix.csv"
df = pd.read_csv(csv_path)
df = df.set_index('cohort', drop=True)
df.index.rename('index', inplace=True)
df.index = df.index.astype(int)

# Feature matrix: everything except the outcome columns and clinical covariates.
_non_feature_cols = ['PFS_binaer_Progress', 'Ereignis_korrigiert_Update_2021_03', 'DFS_M_ED_Update_2021_03',
                     'Pseudonym', 'train_test_mix', 'SUVmax', 'SEX_SPSS',
                     'DIAGECOG_komplett_ueber_1', 'DIAALTER', 'PTNM_T_SPSS_korr_grob_7th',
                     'PTNM_N_SPSS_korr', 'STADIUM_GROB_SPSS_7thEdition',
                     'R_Status', 'PTNM_T_SPSS_korr_7th', 'STADIUM_SPSS_7thEdition',
                     'Histo_Subtyp', 'NEOADJ_CHEMO', 'NEOADJ_BESTR', 'ADJ_CHEMO', 'ADJ_BESTR',
                     'ANY_CHEMO', 'ANY_BESTR', 'ASP_high_19_5', 'ASP', 'ASP_high_33_3']
X = df.drop(columns=_non_feature_cols)

# Survival target: boolean event indicator plus follow-up time.
y_df = pd.DataFrame({
    'event': df['Ereignis_korrigiert_Update_2021_03'].astype(bool),
    'time': df['DFS_M_ED_Update_2021_03'],
})

# Split into training+test cohort (index 1) and validation cohort (index 2).
_train_mask = df.index.isin([1])
_valid_mask = df.index.isin([2])
X_train_test = X.loc[_train_mask]
X_valid = X.loc[_valid_mask]
y_train_test_df = y_df.loc[_train_mask]
y_valid_df = y_df.loc[_valid_mask]

# Convert targets to structured numpy arrays with fields 'event' and 'time'.
s = y_df.dtypes
_record_dtype = list(zip(s.index, s))
y_train_test = np.array([tuple(row) for row in y_train_test_df.values], dtype=_record_dtype)
y_valid = np.array([tuple(row) for row in y_valid_df.values], dtype=_record_dtype)
def score_survival_model(model, X, y):
    """Score a fitted survival model with Harrell's concordance index.

    Fix: the original called ``concordance_index_censored``, which was never
    imported (it lives in ``sksurv.metrics``), so every scoring call raised
    NameError.  The index is implemented inline here with numpy instead,
    avoiding a new dependency.  NOTE(review): pairs with exactly tied event
    times are treated as non-comparable, a minor difference from sksurv's
    tied-time handling -- confirm this is acceptable.

    Parameters
    ----------
    model : estimator with a ``predict`` method; higher predicted score is
        interpreted as higher risk (shorter expected survival).
    X : feature matrix passed straight through to ``model.predict``.
    y : structured array with boolean field ``'event'`` and numeric field
        ``'time'`` (as built from ``y_df`` in this script).

    Returns
    -------
    float
        Concordance index in [0, 1]; 0.5 is chance level, 1.0 is perfect.
        Returns 0.0 if no comparable pair exists.
    """
    prediction = np.asarray(model.predict(X), dtype=float).ravel()
    event = np.asarray(y['event'], dtype=bool)
    time_ = np.asarray(y['time'], dtype=float)
    concordant = 0.0
    comparable = 0
    n = time_.shape[0]
    for i in range(n):
        if not event[i]:
            continue  # a censored subject cannot anchor a comparable pair
        for j in range(n):
            if time_[i] < time_[j]:
                comparable += 1
                if prediction[i] > prediction[j]:
                    concordant += 1.0
                elif prediction[i] == prediction[j]:
                    concordant += 0.5  # prediction ties count as half
    return concordant / comparable if comparable else 0.0
# Feature-selection strategies to benchmark, keyed by a short label.
feature_select_dict = {
    "MIC": SelectKBest(mutual_info_classif, k=30),
}

# Candidate hyperparameters for the inner GridSearchCV loop.  Keys carry the
# "estimator__" prefix so they address the final step of the pipeline.
p_grid_dict = {"xgbse": {"estimator__objective": ["survival:aft"],
                         "estimator__eval_metric": ["aft-nloglik"],
                         "estimator__aft_loss_distribution": ["normal", "logistic"],
                         "estimator__aft_loss_distribution_scale": [0.5, 1.0, 1.5],
                         "estimator__tree_method": ["hist"],
                         "estimator__learning_rate": np.logspace(-2, 0, num=6),
                         "estimator__max_depth": [0, 1, 5, 10, 15, 25, 50],
                         "estimator__booster": ["dart"],
                         "estimator__subsample": [0.1, 0.2, 0.4, 0.6, 0.8, 1.0],
                         "estimator__min_child_weight": [1, 2, 5, 10],
                         # Fix: colsample_bynode must lie in (0, 1]; the
                         # original candidate 2.0 was out of range.
                         "estimator__colsample_bynode": [0.5, 0.75, 1.0]}
               }

models_dict = {
    # Fix: the original passed the whole search grid (lists of candidate
    # values) as xgb_params, but XGBoost parameters must be scalars.  Start
    # from library defaults and let the grid search assign candidates.
    # NOTE(review): whether XGBSE exposes these nested xgb_params through
    # set_params (as GridSearchCV requires) still needs to be confirmed.
    "xgbse": XGBSEKaplanNeighbors(),
}
# 10-fold splitters for the inner (hyperparameter search) and outer
# (generalization estimate) loops of the nested cross-validation.
# Fix: the original referenced sklearn.model_selection.KFold, but the bare
# `sklearn` name was never imported, so these lines raised NameError.
inner_cv = KFold(n_splits=10, shuffle=True, random_state=1)
outer_cv = KFold(n_splits=10, shuffle=True, random_state=1)
def model_scores(feature_select_dict, models_dict, p_grid_dict, X_train, y_train, X_valid, y_valid):
    """Run nested cross-validation for every (feature selector, model) pair.

    For each feature-selection strategy and each model, a pipeline of
    RobustScaler -> feature selection -> estimator is tuned with GridSearchCV
    (inner loop), scored with cross_val_score (outer loop), refit on the full
    training cohort, and finally scored once on the held-out validation data.

    Parameters
    ----------
    feature_select_dict : dict mapping label -> sklearn feature selector.
    models_dict : dict mapping label -> estimator instance.
    p_grid_dict : dict mapping the same model labels -> parameter grid.
    X_train, y_train : training cohort (features, structured survival array).
    X_valid, y_valid : validation cohort in the same formats.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Scores per (selector, model) pair sorted by the nested test mean,
        and the best hyperparameters found for each pair.
    """
    # Scale without centering; prepare containers for results and parameters.
    scaler = RobustScaler(with_centering=False)
    models_df_dict = dict()
    params_df = pd.DataFrame()
    for outerKey in feature_select_dict:
        models = pd.DataFrame()
        feature_select = feature_select_dict[outerKey]
        for innerKey in models_dict:
            # Instantiate model and its search grid for this combination.
            model = models_dict[innerKey]
            p_grid = p_grid_dict[innerKey]
            # Inner loop of nested CV: hyperparameter tuning within the
            # training folds of the outer loop.
            t1 = time.time()
            pipeline = Pipeline([('scaling', scaler), ('feature_selection', feature_select), ('estimator', model)])
            # Fix: use the names imported at the top of the file; the bare
            # `sklearn` module was never imported, so the original
            # sklearn.model_selection.* references raised NameError.
            clf_model = GridSearchCV(estimator=pipeline,
                                     scoring=score_survival_model,
                                     param_grid=p_grid,
                                     cv=inner_cv, refit=True)
            # Outer loop: train on the training folds and score on test folds.
            nested_test_score = cross_val_score(clf_model, scoring=score_survival_model,
                                                X=X_train, y=y_train, cv=outer_cv)
            # Summarize the nested test scores and score the validation set.
            test_mean = nested_test_score.mean()
            test_std = nested_test_score.std()
            clf_model_fit = clf_model.fit(X_train, y_train)
            clf_model_best_parameters = str(clf_model.best_params_)
            valid_score = clf_model.score(X_valid, y_valid)
            test_plus_valid = test_mean + valid_score
            model_time = (time.time() - t1)
            # Record results for this model and its best parameters.
            models[innerKey] = [test_mean, test_std, model_time, valid_score, test_plus_valid]
            df_list = [outerKey, innerKey, clf_model_best_parameters]
            params = pd.DataFrame(df_list)
            params_df = pd.concat([params_df, params], axis=1)
        # Collect the per-model results under this feature-selection key.
        models_df_dict[outerKey] = models
    # Build a (selector, model) MultiIndex, transpose, and sort by the
    # nested test mean; finalize the best-parameters dataframe.
    multiindex = {(outerKey, innerKey): values for outerKey, innerDict in models_df_dict.items() for innerKey, values in innerDict.items()}
    models_df_dict_multiindex = pd.DataFrame(multiindex)
    models_df_dict_multiindex.index = ['nested_test_mean', 'nested_test_SD', 'time', 'valid', 'test_plus_valid']
    models_transpose = models_df_dict_multiindex.transpose()
    models_transpose.index.set_names(['pre', 'model'], inplace=True)
    models_transpose = models_transpose.sort_values(by=['nested_test_mean'], ascending=False)
    params_df = params_df.T
    params_df.columns = ['feature_select', 'model', 'parameters']
    params_df = params_df.sort_values(by=['model', 'feature_select'], ascending=[True, True])
    return models_transpose, params_df
# Execute the full benchmark on the training cohort and report the results
# ranked by nested test performance.
results, params = model_scores(
    feature_select_dict, models_dict, p_grid_dict,
    X_train_test, y_train_test, X_valid, y_valid,
)
# Promote the (pre, model) MultiIndex levels to ordinary columns for display.
results_ready = results.reset_index(level=['pre', 'model'])
print(results_ready)
Using GridSearchCV directly is not possible because XGBSE requires hyperparameters to be unique scalar values passed at model initialization. Furthermore, the parameter values in the parameter dict must not be wrapped in lists ([]), whereas GridSearchCV expects each candidate value inside a list.
XGBSE therefore appears to be incompatible with GridSearchCV.
Furthermore, XGBSE seems to be incompatible with sklearn's Pipeline.
When the pipeline is fitted, the final estimator step (XGBSE) receives X as a plain np.array rather than a DataFrame, so the index is lost. This raises an error because XGBSE's fitting appears to require X.index.