domino.prediction
1import numpy as np 2import xarray as xr 3from sklearn.linear_model import LogisticRegression 4from sklearn.metrics import roc_auc_score 5 6 7def CV_None(predictors,target,**cv_kwargs): 8 """No cross validation: train and test data are the same.""" 9 return [[predictors,target,predictors,target]] 10 11def CV_drop1year(predictors,target,**cv_kwargs): 12 """Cross validation is performed by dropping out single years of data from the training set to use as test data. This is repeated for all years with non-nan data.""" 13 14 X,y=xr.align(predictors.dropna('time',how='all'),target.dropna('time',how='all')) 15 years=np.unique(X['time.year']) 16 iters=[] 17 for t in years: 18 it=[] 19 it.append(X.where(X['time.year']!=t).dropna('time')) 20 it.append(y.where(y['time.year']!=t).dropna('time')) 21 it.append(X.where(X['time.year']==t).dropna('time')) 22 it.append(y.where(y['time.year']==t).dropna('time')) 23 iters.append(it) 24 return iters 25 26def CV_n_chunks(predictors,target,N=2): 27 """The dataset is split into *N* chunks and each is successively held out as test data.""" 28 X,y=xr.align(predictors,target) 29 L=len(y) 30 l=L//N 31 iters=[] 32 for n in range(N): 33 34 test_times=np.arange(n*l,(n+1)*l) 35 train_times=np.arange(L) 36 train_times=np.delete(train_times,test_times) 37 38 X_train=X.isel(time=train_times) 39 y_train=y.isel(time=train_times) 40 X_test=X.isel(time=test_times) 41 y_test=y.isel(time=test_times) 42 43 iters.append([X_train,y_train,X_test,y_test]) 44 return iters 45 46def CV_manual(train_pred,train_targ,test_pred=None,test_targ=None): 47 """Separate training and test datasets are passed manually.""" 48 if test_pred is None or test_targ is None: 49 raise(ValueError('Both test_pred and test_targ must be specified when using CV_manual')) 50 return [[train_pred,train_targ,test_pred,test_targ]] 51 52def log_regression_model(X_train,y_train,X_test,**model_kwargs): 53 """A wrapper around sklearn's *LogisticRegression* class, which accepts the same *model_kwargs*""" 54 LR=LogisticRegression(**model_kwargs) 55 LR=LR.fit(X_train,y_train) 56 57 det_pred=LR.predict(X_test) 58 prob_pred=LR.predict_proba(X_test) 59 return LR,det_pred,prob_pred 60 61def blank_score(det_pred,prob_pred,target,**score_kwargs): 62 """@private""" 63 if det_pred is not None: 64 assert det_pred.shape[0]==target.shape[0] 65 if prob_pred is not None: 66 assert prob_pred.shape[0]==target.shape[0] 67 return len(target) 68 69def ROC_AUC(det_pred,prob_pred,target,**score_kwargs): 70 """Computes the ROC AUC score, as implemented in sklearn, and accepting the same *score_kwargs*""" 71 #No ROC_AUC for a variable with only one value 72 if len(np.unique(target))==1: 73 return np.nan 74 if len(np.unique(target))==2: 75 prob_pred=prob_pred[:,1] 76 77 78 return roc_auc_score(target,prob_pred,**score_kwargs) 79 80class PredictionTest(object): 81 """ 82 Supports quick and simple testing of the predictive power of predictor variables stored in an xarray.Dataset applied to a target variable stored in an xarray.DataArray. 83 Supports different predictive models, cross validation approaches and skill scores within a straightforward API. 84 85 **Arguments** 86 87 *predictors* 88 89 An xarray Dataset of scalar predictors 90 91 *predictand* 92 93 An xarray DataArray with a shared coord to *predictors*. 94 """ 95 def __init__(self,predictors,predictand,tolerate_empty_variables=False): 96 """Initialise a PredictionTest object""" 97 98 self.predictand=self._predictand_to_dataarray(predictand) 99 """@private""" 100 101 self.predictors=self._predictors_to_dataset(predictors) 102 """@private""" 103 104 105 self.predefined_models=dict( 106 logregression=log_regression_model 107 ) 108 """@private""" 109 110 self.predefined_cv_methods=dict( 111 nchunks=CV_n_chunks, 112 drop_year=CV_drop1year, 113 manual=CV_manual 114 ) 115 """@private""" 116 117 self.predefined_scores=dict( 118 sklearn_roc_auc=ROC_AUC, 119 test=blank_score 120 ) 121 """@private""" 122 123 self.computed_scores=None 124 """@private""" 125 126 self.tol_empty=tolerate_empty_variables 127 """@private""" 128 return 129 130 def __repr__(self): 131 return 'A PredictionTest object' 132 def __str__(self): 133 return self.__repr__ 134 135 def _predictand_to_dataarray(self,predictand): 136 return predictand 137 138 def _predictors_to_dataset(self,predictors): 139 return predictors 140 141 def _handle_potentially_empty_variable_list(self,vlist): 142 if len(vlist)>0: 143 return 0 144 else: 145 if self.tol_empty: 146 return 1 147 raise(ValueError('Empty variable list passed, but tolerate_empty_variables=False was set in PredictionTest.init')) 148 149 def categorical_prediction(self,model,score='sklearn_roc_auc',cv_method=None,predictor_variables='univariate',keep_models=False,model_kwargs={},cv_kwargs={},score_kwargs={}): 150 151 """ 152 153 **Arguments** 154 155 *model* 156 157 A function with the following signature: 158 159 Input: Three numpy arrays of shape [T1,N], [T1], [T2,N] (train_predictors, train_target, and test_predictors respectively), and an arbitrary number of keyword arguments. 160 161 Output: *model* (A scikit-learn-like fitted model object),*det_pred* (a numpy array of shape [T2] with deterministic predictions) and *prob_pred* (a numpy array of shape [T2,M] with probabilistic predictions, where M is the number of unique values in *train_target* and each element predicts the probability of a given class). Any output can instead be replaced with None, but at least one of *det_pred* and *prob_pred* must not be None. 162 163 **Optional arguments** 164 165 *score* 166 167 A string specifying a predefined skill score (currently only 'sklearn_roc_auc') or a function with the signature: 168 169 Input: det_pred (a numpy array of shape [T2]), prob_pred (a numpy array of shape [T2,M], a truth series (a numpy array of shape [T2]), and an arbitrary number of keyword arguments. 170 Output: a scalar skill score. 171 172 *cv_method* 173 A string specifying a predefined cross-validation method, a custom cross-validation function with corresponding signature, or None, in which case no cross-validation is used (the training and test datasets are the same). Predefined cross-validation methods are: 174 175 *nchunks* - Divide the dataset into *n* chunks, using each as the test dataset predicted by the other *n*-1 chunks, to produce *n* total skill estimates. *n* defaults to 2, and is specified in *cv_kwargs* 176 177 *drop_year* - Split the dataset by calendar year, using each year as the test dataset predicted by the remaining years. 178 *manual* - Treat *predictors* and *predictand* as training data. Test data must be passed explicitly via *cv_kwargs* as *test_pred* and *test_targ*. 179 180 If a custom function is passed it must have the following signature: 181 Input: predictors (a Dataset), target (a DataArray), and an arbitrary number of keyword arguments. 182 183 Output: A train predictor Dataset, a train target DatArray, a test predictor Dataset, and a test target DataArray. 184 185 *predictor_variables* 186 187 If 'univariate' all variables in *PredictionTest.predictors* are used individually to predict *PredictionTest.predictand*. 188 189 If 'all' all variables in *PredictionTest.predictors* are used collectively to predict *PredictionTest.predictand*. 190 191 If a 1D array of variable names in *PredictionTest.predictors*, each specified variable is used individually to predict *PredictionTest.predictand*. 192 193 If a 2D array of iterables over variable names in *PredictionTest.predictors*, each specified combination of variables is used to predict *PredictionTest.predictand*. 194 195 *keep_models* 196 If True, a dictionary of arrays of fitted models is returned, corresponding to each variable combination and cross validated model that was computed. 197 198 *model_kwargs* 199 200 A dictionary of keyword arguments to pass to *model* 201 *cv_kwargs* 202 203 A dictionary of keyword arguments to pass to *cv_method* 204 205 *score_kwargs* 206 207 A dictionary of keyword arguments to pass to *score* 208 209 **Returns** 210 211 If keep_models is False: 212 213 returns *score_da*, a Dataset of skill scores for each prediction experiment, with a cross_validation coordinate. 214 215 if keep_models is True: 216 217 return (*score_da*,*model_da*) 218 """ 219 if np.ndim(predictor_variables)==1: 220 el_type=[np.ndim(element) for element in predictor_variables] 221 222 if len(np.unique(el_type))==1 and np.unique(el_type)==1:#a list of ragged lists 223 score_labels=[f'predictor_set_{i+1}' for i in range(len(predictor_variables))] 224 225 else:#just a list 226 score_labels=predictor_variables 227 228 elif np.ndim(predictor_variables)==2: 229 score_labels=[f'predictor_set_{i+1}' for i in range(len(predictor_variables))] 230 231 elif predictor_variables =='univariate': 232 predictor_variables = list(self.predictors.data_vars) 233 score_labels=list(self.predictors.data_vars) 234 elif predictor_variables=='all': 235 predictor_variables=[list(self.predictors.data_vars)] 236 score_labels=['all'] 237 else: 238 raise(ValueError('predictor variables must be one of ["univariate","all",1-D array, 2-D array]')) 239 240 if type(score)==str: 241 score=self.predefined_scores[score] 242 243 predictors=self.predictors 244 target=self.predictand 245 predictors,target=xr.align(predictors.dropna('time',how='all'),target.dropna('time',how='all')) 246 247 #train_pred,train_target,test_pred,test_target 248 cv_iterator=self._split_data(predictors,target,cv_method,**cv_kwargs) 249 250 score_da=[] 251 model_da=[] 252 for split_data in cv_iterator: 253 scores={} 254 models={} 255 train_preds,train_target,test_preds,test_target=split_data 256 257 for label,variable in zip(score_labels,predictor_variables): 258 259 train_pred=train_preds[variable] 260 test_pred=test_preds[variable] 261 262 #univariate 263 if type(variable)==str: 264 265 train_pred=np.array(train_pred).reshape([-1,1]) 266 test_pred=np.array(test_pred).reshape([-1,1]) 267 268 #multivariate 269 else: 270 train_pred=np.array(train_pred.to_array('var').transpose(*train_pred.dims,'var')) 271 test_pred=np.array(test_pred.to_array('var').transpose(*test_pred.dims,'var')) 272 273 fitted_model,det_pred,prob_pred=self._fit_model_and_predict(model,train_pred,train_target,test_pred,**model_kwargs) 274 test_score=score(det_pred,prob_pred,test_target,**score_kwargs) 275 scores[label]=test_score 276 if keep_models: 277 models[label]=fitted_model 278 score_da.append(xr.Dataset(scores)) 279 model_da.append(models) 280 score_da=xr.concat(score_da,'cv') 281 self.computed_scores=score_da 282 283 if keep_models: 284 return score_da,model_da 285 else: 286 return score_da 287 288 def _fit_model_and_predict(self,model,train_pred,train_target,test_pred,**kwargs): 289 290 if type(model)==str: 291 model=self.predefined_models[model] 292 293 train_pred=np.array(train_pred) 294 train_target=np.array(train_target) 295 test_pred=np.array(test_pred) 296 self._verify_model_inputs(train_pred,train_target,test_pred) 297 output=model(train_pred,train_target,test_pred,**kwargs) 298 self._verify_model_output(output,test_pred) 299 return output 300 301 def _verify_model_inputs(self, train_pred,train_target,test_pred,**model_kwargs): 302 303 assert train_pred.ndim==2 304 T1,N=train_pred.shape 305 306 assert test_pred.ndim==2 307 T2,M=test_pred.shape 308 assert N==M 309 310 assert train_target.ndim==1 311 T=len(train_target) 312 assert T==T1 313 return 314 315 def _verify_model_output(self,output,test_pred): 316 317 T,N=test_pred.shape 318 assert len(output)==3 319 model,det_pred,prob_pred=output 320 321 assert (det_pred is not None)|(prob_pred is not None) 322 323 if det_pred is not None: 324 assert det_pred.shape[0]==T 325 assert np.ndim(det_pred)==1 326 if prob_pred is not None: 327 assert prob_pred.shape[0]==T 328 assert np.ndim(prob_pred)==2 329 return 330 331 def _split_data(self,predictors,target,cv_method,**cv_kwargs): 332 if cv_method is None: 333 cv_method=CV_None 334 elif type(cv_method)==str: 335 cv_method=self.predefined_cv_methods[cv_method] 336 337 cv_iterator=cv_method(predictors,target,**cv_kwargs) 338 return cv_iterator 339 340 def add_score_to_index_metadata(self,indices,label='score',raise_on_missing_var=False,reduce_func=np.nanmean): 341 342 """ Annotate a Dataset of indices with computed skill scores by adding them to the attributes of each DataArray in the Dataset. 343 344 **Arguments** 345 346 *indices* 347 348 *An xarray.Dataset of indices*. 349 350 **Optional arguments** 351 352 *label* 353 354 A string determining the name of the added attribute key. 355 356 *raise_on_missing_var* 357 358 A boolean, determining if an error is raised if not all variables present in the computed skill scores are present in the indices. 359 360 *reduce_func* 361 362 The function used to reduce the 'cv' vector of skill scores to a single value. Default is the mean average, ignoring nans. To add the entire vector of scores for different cross validations, pass *reduce_func*=lambda x: x 363 364 """ 365 s=self.computed_scores 366 if s is None: 367 raise(ValueError('No scores computed.')) 368 for var in s.data_vars: 369 370 score=s[var].values 371 try: 372 indices[var].attrs[label]=reduce_func(score) 373 except: 374 if raise_on_missing_var: 375 raise(ValueError(f'Key {var} present in self.computed_scores but not in indices.')) 376 pass 377 return 378 379
8def CV_None(predictors,target,**cv_kwargs): 9 """No cross validation: train and test data are the same.""" 10 return [[predictors,target,predictors,target]]
No cross validation: train and test data are the same.
12def CV_drop1year(predictors,target,**cv_kwargs): 13 """Cross validation is performed by dropping out single years of data from the training set to use as test data. This is repeated for all years with non-nan data.""" 14 15 X,y=xr.align(predictors.dropna('time',how='all'),target.dropna('time',how='all')) 16 years=np.unique(X['time.year']) 17 iters=[] 18 for t in years: 19 it=[] 20 it.append(X.where(X['time.year']!=t).dropna('time')) 21 it.append(y.where(y['time.year']!=t).dropna('time')) 22 it.append(X.where(X['time.year']==t).dropna('time')) 23 it.append(y.where(y['time.year']==t).dropna('time')) 24 iters.append(it) 25 return iters
Cross validation is performed by dropping out single years of data from the training set to use as test data. This is repeated for all years with non-nan data.
27def CV_n_chunks(predictors,target,N=2): 28 """The dataset is split into *N* chunks and each is successively held out as test data.""" 29 X,y=xr.align(predictors,target) 30 L=len(y) 31 l=L//N 32 iters=[] 33 for n in range(N): 34 35 test_times=np.arange(n*l,(n+1)*l) 36 train_times=np.arange(L) 37 train_times=np.delete(train_times,test_times) 38 39 X_train=X.isel(time=train_times) 40 y_train=y.isel(time=train_times) 41 X_test=X.isel(time=test_times) 42 y_test=y.isel(time=test_times) 43 44 iters.append([X_train,y_train,X_test,y_test]) 45 return iters
The dataset is split into N chunks and each is successively held out as test data.
47def CV_manual(train_pred,train_targ,test_pred=None,test_targ=None): 48 """Separate training and test datasets are passed manually.""" 49 if test_pred is None or test_targ is None: 50 raise(ValueError('Both test_pred and test_targ must be specified when using CV_manual')) 51 return [[train_pred,train_targ,test_pred,test_targ]]
Separate training and test datasets are passed manually.
53def log_regression_model(X_train,y_train,X_test,**model_kwargs): 54 """A wrapper around sklearn's *LogisticRegression* class, which accepts the same *model_kwargs*""" 55 LR=LogisticRegression(**model_kwargs) 56 LR=LR.fit(X_train,y_train) 57 58 det_pred=LR.predict(X_test) 59 prob_pred=LR.predict_proba(X_test) 60 return LR,det_pred,prob_pred
A wrapper around sklearn's LogisticRegression class, which accepts the same model_kwargs
70def ROC_AUC(det_pred,prob_pred,target,**score_kwargs): 71 """Computes the ROC AUC score, as implemented in sklearn, and accepting the same *score_kwargs*""" 72 #No ROC_AUC for a variable with only one value 73 if len(np.unique(target))==1: 74 return np.nan 75 if len(np.unique(target))==2: 76 prob_pred=prob_pred[:,1] 77 78 79 return roc_auc_score(target,prob_pred,**score_kwargs)
Computes the ROC AUC score, as implemented in sklearn, and accepting the same score_kwargs
81class PredictionTest(object): 82 """ 83 Supports quick and simple testing of the predictive power of predictor variables stored in an xarray.Dataset applied to a target variable stored in an xarray.DataArray. 84 Supports different predictive models, cross validation approaches and skill scores within a straightforward API. 85 86 **Arguments** 87 88 *predictors* 89 90 An xarray Dataset of scalar predictors 91 92 *predictand* 93 94 An xarray DataArray with a shared coord to *predictors*. 95 """ 96 def __init__(self,predictors,predictand,tolerate_empty_variables=False): 97 """Initialise a PredictionTest object""" 98 99 self.predictand=self._predictand_to_dataarray(predictand) 100 """@private""" 101 102 self.predictors=self._predictors_to_dataset(predictors) 103 """@private""" 104 105 106 self.predefined_models=dict( 107 logregression=log_regression_model 108 ) 109 """@private""" 110 111 self.predefined_cv_methods=dict( 112 nchunks=CV_n_chunks, 113 drop_year=CV_drop1year, 114 manual=CV_manual 115 ) 116 """@private""" 117 118 self.predefined_scores=dict( 119 sklearn_roc_auc=ROC_AUC, 120 test=blank_score 121 ) 122 """@private""" 123 124 self.computed_scores=None 125 """@private""" 126 127 self.tol_empty=tolerate_empty_variables 128 """@private""" 129 return 130 131 def __repr__(self): 132 return 'A PredictionTest object' 133 def __str__(self): 134 return self.__repr__ 135 136 def _predictand_to_dataarray(self,predictand): 137 return predictand 138 139 def _predictors_to_dataset(self,predictors): 140 return predictors 141 142 def _handle_potentially_empty_variable_list(self,vlist): 143 if len(vlist)>0: 144 return 0 145 else: 146 if self.tol_empty: 147 return 1 148 raise(ValueError('Empty variable list passed, but tolerate_empty_variables=False was set in PredictionTest.init')) 149 150 def categorical_prediction(self,model,score='sklearn_roc_auc',cv_method=None,predictor_variables='univariate',keep_models=False,model_kwargs={},cv_kwargs={},score_kwargs={}): 151 152 """ 153 154 **Arguments** 155 156 *model* 157 158 A function with the following signature: 159 160 Input: Three numpy arrays of shape [T1,N], [T1], [T2,N] (train_predictors, train_target, and test_predictors respectively), and an arbitrary number of keyword arguments. 161 162 Output: *model* (A scikit-learn-like fitted model object),*det_pred* (a numpy array of shape [T2] with deterministic predictions) and *prob_pred* (a numpy array of shape [T2,M] with probabilistic predictions, where M is the number of unique values in *train_target* and each element predicts the probability of a given class). Any output can instead be replaced with None, but at least one of *det_pred* and *prob_pred* must not be None. 163 164 **Optional arguments** 165 166 *score* 167 168 A string specifying a predefined skill score (currently only 'sklearn_roc_auc') or a function with the signature: 169 170 Input: det_pred (a numpy array of shape [T2]), prob_pred (a numpy array of shape [T2,M], a truth series (a numpy array of shape [T2]), and an arbitrary number of keyword arguments. 171 Output: a scalar skill score. 172 173 *cv_method* 174 A string specifying a predefined cross-validation method, a custom cross-validation function with corresponding signature, or None, in which case no cross-validation is used (the training and test datasets are the same). Predefined cross-validation methods are: 175 176 *nchunks* - Divide the dataset into *n* chunks, using each as the test dataset predicted by the other *n*-1 chunks, to produce *n* total skill estimates. *n* defaults to 2, and is specified in *cv_kwargs* 177 178 *drop_year* - Split the dataset by calendar year, using each year as the test dataset predicted by the remaining years. 179 *manual* - Treat *predictors* and *predictand* as training data. Test data must be passed explicitly via *cv_kwargs* as *test_pred* and *test_targ*. 180 181 If a custom function is passed it must have the following signature: 182 Input: predictors (a Dataset), target (a DataArray), and an arbitrary number of keyword arguments. 183 184 Output: A train predictor Dataset, a train target DatArray, a test predictor Dataset, and a test target DataArray. 185 186 *predictor_variables* 187 188 If 'univariate' all variables in *PredictionTest.predictors* are used individually to predict *PredictionTest.predictand*. 189 190 If 'all' all variables in *PredictionTest.predictors* are used collectively to predict *PredictionTest.predictand*. 191 192 If a 1D array of variable names in *PredictionTest.predictors*, each specified variable is used individually to predict *PredictionTest.predictand*. 193 194 If a 2D array of iterables over variable names in *PredictionTest.predictors*, each specified combination of variables is used to predict *PredictionTest.predictand*. 195 196 *keep_models* 197 If True, a dictionary of arrays of fitted models is returned, corresponding to each variable combination and cross validated model that was computed. 198 199 *model_kwargs* 200 201 A dictionary of keyword arguments to pass to *model* 202 *cv_kwargs* 203 204 A dictionary of keyword arguments to pass to *cv_method* 205 206 *score_kwargs* 207 208 A dictionary of keyword arguments to pass to *score* 209 210 **Returns** 211 212 If keep_models is False: 213 214 returns *score_da*, a Dataset of skill scores for each prediction experiment, with a cross_validation coordinate. 215 216 if keep_models is True: 217 218 return (*score_da*,*model_da*) 219 """ 220 if np.ndim(predictor_variables)==1: 221 el_type=[np.ndim(element) for element in predictor_variables] 222 223 if len(np.unique(el_type))==1 and np.unique(el_type)==1:#a list of ragged lists 224 score_labels=[f'predictor_set_{i+1}' for i in range(len(predictor_variables))] 225 226 else:#just a list 227 score_labels=predictor_variables 228 229 elif np.ndim(predictor_variables)==2: 230 score_labels=[f'predictor_set_{i+1}' for i in range(len(predictor_variables))] 231 232 elif predictor_variables =='univariate': 233 predictor_variables = list(self.predictors.data_vars) 234 score_labels=list(self.predictors.data_vars) 235 elif predictor_variables=='all': 236 predictor_variables=[list(self.predictors.data_vars)] 237 score_labels=['all'] 238 else: 239 raise(ValueError('predictor variables must be one of ["univariate","all",1-D array, 2-D array]')) 240 241 if type(score)==str: 242 score=self.predefined_scores[score] 243 244 predictors=self.predictors 245 target=self.predictand 246 predictors,target=xr.align(predictors.dropna('time',how='all'),target.dropna('time',how='all')) 247 248 #train_pred,train_target,test_pred,test_target 249 cv_iterator=self._split_data(predictors,target,cv_method,**cv_kwargs) 250 251 score_da=[] 252 model_da=[] 253 for split_data in cv_iterator: 254 scores={} 255 models={} 256 train_preds,train_target,test_preds,test_target=split_data 257 258 for label,variable in zip(score_labels,predictor_variables): 259 260 train_pred=train_preds[variable] 261 test_pred=test_preds[variable] 262 263 #univariate 264 if type(variable)==str: 265 266 train_pred=np.array(train_pred).reshape([-1,1]) 267 test_pred=np.array(test_pred).reshape([-1,1]) 268 269 #multivariate 270 else: 271 train_pred=np.array(train_pred.to_array('var').transpose(*train_pred.dims,'var')) 272 test_pred=np.array(test_pred.to_array('var').transpose(*test_pred.dims,'var')) 273 274 fitted_model,det_pred,prob_pred=self._fit_model_and_predict(model,train_pred,train_target,test_pred,**model_kwargs) 275 test_score=score(det_pred,prob_pred,test_target,**score_kwargs) 276 scores[label]=test_score 277 if keep_models: 278 models[label]=fitted_model 279 score_da.append(xr.Dataset(scores)) 280 model_da.append(models) 281 score_da=xr.concat(score_da,'cv') 282 self.computed_scores=score_da 283 284 if keep_models: 285 return score_da,model_da 286 else: 287 return score_da 288 289 def _fit_model_and_predict(self,model,train_pred,train_target,test_pred,**kwargs): 290 291 if type(model)==str: 292 model=self.predefined_models[model] 293 294 train_pred=np.array(train_pred) 295 train_target=np.array(train_target) 296 test_pred=np.array(test_pred) 297 self._verify_model_inputs(train_pred,train_target,test_pred) 298 output=model(train_pred,train_target,test_pred,**kwargs) 299 self._verify_model_output(output,test_pred) 300 return output 301 302 def _verify_model_inputs(self, train_pred,train_target,test_pred,**model_kwargs): 303 304 assert train_pred.ndim==2 305 T1,N=train_pred.shape 306 307 assert test_pred.ndim==2 308 T2,M=test_pred.shape 309 assert N==M 310 311 assert train_target.ndim==1 312 T=len(train_target) 313 assert T==T1 314 return 315 316 def _verify_model_output(self,output,test_pred): 317 318 T,N=test_pred.shape 319 assert len(output)==3 320 model,det_pred,prob_pred=output 321 322 assert (det_pred is not None)|(prob_pred is not None) 323 324 if det_pred is not None: 325 assert det_pred.shape[0]==T 326 assert np.ndim(det_pred)==1 327 if prob_pred is not None: 328 assert prob_pred.shape[0]==T 329 assert np.ndim(prob_pred)==2 330 return 331 332 def _split_data(self,predictors,target,cv_method,**cv_kwargs): 333 if cv_method is None: 334 cv_method=CV_None 335 elif type(cv_method)==str: 336 cv_method=self.predefined_cv_methods[cv_method] 337 338 cv_iterator=cv_method(predictors,target,**cv_kwargs) 339 return cv_iterator 340 341 def add_score_to_index_metadata(self,indices,label='score',raise_on_missing_var=False,reduce_func=np.nanmean): 342 343 """ Annotate a Dataset of indices with computed skill scores by adding them to the attributes of each DataArray in the Dataset. 344 345 **Arguments** 346 347 *indices* 348 349 *An xarray.Dataset of indices*. 350 351 **Optional arguments** 352 353 *label* 354 355 A string determining the name of the added attribute key. 356 357 *raise_on_missing_var* 358 359 A boolean, determining if an error is raised if not all variables present in the computed skill scores are present in the indices. 360 361 *reduce_func* 362 363 The function used to reduce the 'cv' vector of skill scores to a single value. Default is the mean average, ignoring nans. To add the entire vector of scores for different cross validations, pass *reduce_func*=lambda x: x 364 365 """ 366 s=self.computed_scores 367 if s is None: 368 raise(ValueError('No scores computed.')) 369 for var in s.data_vars: 370 371 score=s[var].values 372 try: 373 indices[var].attrs[label]=reduce_func(score) 374 except: 375 if raise_on_missing_var: 376 raise(ValueError(f'Key {var} present in self.computed_scores but not in indices.')) 377 pass 378 return
Supports quick and simple testing of the predictive power of predictor variables stored in an xarray.Dataset applied to a target variable stored in an xarray.DataArray. Supports different predictive models, cross validation approaches and skill scores within a straightforward API.
Arguments
predictors
An xarray Dataset of scalar predictors
predictand
An xarray DataArray with a shared coord to predictors.
96 def __init__(self,predictors,predictand,tolerate_empty_variables=False): 97 """Initialise a PredictionTest object""" 98 99 self.predictand=self._predictand_to_dataarray(predictand) 100 """@private""" 101 102 self.predictors=self._predictors_to_dataset(predictors) 103 """@private""" 104 105 106 self.predefined_models=dict( 107 logregression=log_regression_model 108 ) 109 """@private""" 110 111 self.predefined_cv_methods=dict( 112 nchunks=CV_n_chunks, 113 drop_year=CV_drop1year, 114 manual=CV_manual 115 ) 116 """@private""" 117 118 self.predefined_scores=dict( 119 sklearn_roc_auc=ROC_AUC, 120 test=blank_score 121 ) 122 """@private""" 123 124 self.computed_scores=None 125 """@private""" 126 127 self.tol_empty=tolerate_empty_variables 128 """@private""" 129 return
Initialise a PredictionTest object
150 def categorical_prediction(self,model,score='sklearn_roc_auc',cv_method=None,predictor_variables='univariate',keep_models=False,model_kwargs={},cv_kwargs={},score_kwargs={}): 151 152 """ 153 154 **Arguments** 155 156 *model* 157 158 A function with the following signature: 159 160 Input: Three numpy arrays of shape [T1,N], [T1], [T2,N] (train_predictors, train_target, and test_predictors respectively), and an arbitrary number of keyword arguments. 161 162 Output: *model* (A scikit-learn-like fitted model object),*det_pred* (a numpy array of shape [T2] with deterministic predictions) and *prob_pred* (a numpy array of shape [T2,M] with probabilistic predictions, where M is the number of unique values in *train_target* and each element predicts the probability of a given class). Any output can instead be replaced with None, but at least one of *det_pred* and *prob_pred* must not be None. 163 164 **Optional arguments** 165 166 *score* 167 168 A string specifying a predefined skill score (currently only 'sklearn_roc_auc') or a function with the signature: 169 170 Input: det_pred (a numpy array of shape [T2]), prob_pred (a numpy array of shape [T2,M], a truth series (a numpy array of shape [T2]), and an arbitrary number of keyword arguments. 171 Output: a scalar skill score. 172 173 *cv_method* 174 A string specifying a predefined cross-validation method, a custom cross-validation function with corresponding signature, or None, in which case no cross-validation is used (the training and test datasets are the same). Predefined cross-validation methods are: 175 176 *nchunks* - Divide the dataset into *n* chunks, using each as the test dataset predicted by the other *n*-1 chunks, to produce *n* total skill estimates. *n* defaults to 2, and is specified in *cv_kwargs* 177 178 *drop_year* - Split the dataset by calendar year, using each year as the test dataset predicted by the remaining years. 179 *manual* - Treat *predictors* and *predictand* as training data. Test data must be passed explicitly via *cv_kwargs* as *test_pred* and *test_targ*. 180 181 If a custom function is passed it must have the following signature: 182 Input: predictors (a Dataset), target (a DataArray), and an arbitrary number of keyword arguments. 183 184 Output: A train predictor Dataset, a train target DatArray, a test predictor Dataset, and a test target DataArray. 185 186 *predictor_variables* 187 188 If 'univariate' all variables in *PredictionTest.predictors* are used individually to predict *PredictionTest.predictand*. 189 190 If 'all' all variables in *PredictionTest.predictors* are used collectively to predict *PredictionTest.predictand*. 191 192 If a 1D array of variable names in *PredictionTest.predictors*, each specified variable is used individually to predict *PredictionTest.predictand*. 193 194 If a 2D array of iterables over variable names in *PredictionTest.predictors*, each specified combination of variables is used to predict *PredictionTest.predictand*. 195 196 *keep_models* 197 If True, a dictionary of arrays of fitted models is returned, corresponding to each variable combination and cross validated model that was computed. 198 199 *model_kwargs* 200 201 A dictionary of keyword arguments to pass to *model* 202 *cv_kwargs* 203 204 A dictionary of keyword arguments to pass to *cv_method* 205 206 *score_kwargs* 207 208 A dictionary of keyword arguments to pass to *score* 209 210 **Returns** 211 212 If keep_models is False: 213 214 returns *score_da*, a Dataset of skill scores for each prediction experiment, with a cross_validation coordinate. 215 216 if keep_models is True: 217 218 return (*score_da*,*model_da*) 219 """ 220 if np.ndim(predictor_variables)==1: 221 el_type=[np.ndim(element) for element in predictor_variables] 222 223 if len(np.unique(el_type))==1 and np.unique(el_type)==1:#a list of ragged lists 224 score_labels=[f'predictor_set_{i+1}' for i in range(len(predictor_variables))] 225 226 else:#just a list 227 score_labels=predictor_variables 228 229 elif np.ndim(predictor_variables)==2: 230 score_labels=[f'predictor_set_{i+1}' for i in range(len(predictor_variables))] 231 232 elif predictor_variables =='univariate': 233 predictor_variables = list(self.predictors.data_vars) 234 score_labels=list(self.predictors.data_vars) 235 elif predictor_variables=='all': 236 predictor_variables=[list(self.predictors.data_vars)] 237 score_labels=['all'] 238 else: 239 raise(ValueError('predictor variables must be one of ["univariate","all",1-D array, 2-D array]')) 240 241 if type(score)==str: 242 score=self.predefined_scores[score] 243 244 predictors=self.predictors 245 target=self.predictand 246 predictors,target=xr.align(predictors.dropna('time',how='all'),target.dropna('time',how='all')) 247 248 #train_pred,train_target,test_pred,test_target 249 cv_iterator=self._split_data(predictors,target,cv_method,**cv_kwargs) 250 251 score_da=[] 252 model_da=[] 253 for split_data in cv_iterator: 254 scores={} 255 models={} 256 train_preds,train_target,test_preds,test_target=split_data 257 258 for label,variable in zip(score_labels,predictor_variables): 259 260 train_pred=train_preds[variable] 261 test_pred=test_preds[variable] 262 263 #univariate 264 if type(variable)==str: 265 266 train_pred=np.array(train_pred).reshape([-1,1]) 267 test_pred=np.array(test_pred).reshape([-1,1]) 268 269 #multivariate 270 else: 271 train_pred=np.array(train_pred.to_array('var').transpose(*train_pred.dims,'var')) 272 test_pred=np.array(test_pred.to_array('var').transpose(*test_pred.dims,'var')) 273 274 fitted_model,det_pred,prob_pred=self._fit_model_and_predict(model,train_pred,train_target,test_pred,**model_kwargs) 275 test_score=score(det_pred,prob_pred,test_target,**score_kwargs) 276 scores[label]=test_score 277 if keep_models: 278 models[label]=fitted_model 279 score_da.append(xr.Dataset(scores)) 280 model_da.append(models) 281 score_da=xr.concat(score_da,'cv') 282 self.computed_scores=score_da 283 284 if keep_models: 285 return score_da,model_da 286 else: 287 return score_da
Arguments
model
A function with the following signature:
Input: Three numpy arrays of shape [T1,N], [T1], [T2,N] (train_predictors, train_target, and test_predictors respectively), and an arbitrary number of keyword arguments.
Output: model (A scikit-learn-like fitted model object),det_pred (a numpy array of shape [T2] with deterministic predictions) and prob_pred (a numpy array of shape [T2,M] with probabilistic predictions, where M is the number of unique values in train_target and each element predicts the probability of a given class). Any output can instead be replaced with None, but at least one of det_pred and prob_pred must not be None.
Optional arguments
score
A string specifying a predefined skill score (currently only 'sklearn_roc_auc') or a function with the signature:
Input: det_pred (a numpy array of shape [T2]), prob_pred (a numpy array of shape [T2,M], a truth series (a numpy array of shape [T2]), and an arbitrary number of keyword arguments. Output: a scalar skill score.
cv_method A string specifying a predefined cross-validation method, a custom cross-validation function with corresponding signature, or None, in which case no cross-validation is used (the training and test datasets are the same). Predefined cross-validation methods are:
nchunks - Divide the dataset into n chunks, using each as the test dataset predicted by the other n-1 chunks, to produce n total skill estimates. n defaults to 2, and is specified in cv_kwargs
drop_year - Split the dataset by calendar year, using each year as the test dataset predicted by the remaining years. manual - Treat predictors and predictand as training data. Test data must be passed explicitly via cv_kwargs as test_pred and test_targ.
If a custom function is passed it must have the following signature: Input: predictors (a Dataset), target (a DataArray), and an arbitrary number of keyword arguments.
Output: A train predictor Dataset, a train target DatArray, a test predictor Dataset, and a test target DataArray.
predictor_variables
If 'univariate' all variables in PredictionTest.predictors are used individually to predict PredictionTest.predictand.
If 'all' all variables in PredictionTest.predictors are used collectively to predict PredictionTest.predictand.
If a 1D array of variable names in PredictionTest.predictors, each specified variable is used individually to predict PredictionTest.predictand.
If a 2D array of iterables over variable names in PredictionTest.predictors, each specified combination of variables is used to predict PredictionTest.predictand.
keep_models If True, a dictionary of arrays of fitted models is returned, corresponding to each variable combination and cross validated model that was computed.
model_kwargs
A dictionary of keyword arguments to pass to model cv_kwargs
A dictionary of keyword arguments to pass to cv_method
score_kwargs
A dictionary of keyword arguments to pass to score
Returns
If keep_models is False:
returns score_da, a Dataset of skill scores for each prediction experiment, with a cross_validation coordinate.
if keep_models is True:
return (score_da,model_da)
341 def add_score_to_index_metadata(self,indices,label='score',raise_on_missing_var=False,reduce_func=np.nanmean): 342 343 """ Annotate a Dataset of indices with computed skill scores by adding them to the attributes of each DataArray in the Dataset. 344 345 **Arguments** 346 347 *indices* 348 349 *An xarray.Dataset of indices*. 350 351 **Optional arguments** 352 353 *label* 354 355 A string determining the name of the added attribute key. 356 357 *raise_on_missing_var* 358 359 A boolean, determining if an error is raised if not all variables present in the computed skill scores are present in the indices. 360 361 *reduce_func* 362 363 The function used to reduce the 'cv' vector of skill scores to a single value. Default is the mean average, ignoring nans. To add the entire vector of scores for different cross validations, pass *reduce_func*=lambda x: x 364 365 """ 366 s=self.computed_scores 367 if s is None: 368 raise(ValueError('No scores computed.')) 369 for var in s.data_vars: 370 371 score=s[var].values 372 try: 373 indices[var].attrs[label]=reduce_func(score) 374 except: 375 if raise_on_missing_var: 376 raise(ValueError(f'Key {var} present in self.computed_scores but not in indices.')) 377 pass 378 return
Annotate a Dataset of indices with computed skill scores by adding them to the attributes of each DataArray in the Dataset.
Arguments
indices
An xarray.Dataset of indices.
Optional arguments
label
A string determining the name of the added attribute key.
raise_on_missing_var
A boolean, determining if an error is raised if not all variables present in the computed skill scores are present in the indices.
reduce_func
The function used to reduce the 'cv' vector of skill scores to a single value. Default is the mean average, ignoring nans. To add the entire vector of scores for different cross validations, pass reduce_func=lambda x: x