domino.prediction

  1import numpy as np
  2import xarray as xr
  3from sklearn.linear_model import LogisticRegression
  4from sklearn.metrics import roc_auc_score
  5
  6
  7def CV_None(predictors,target,**cv_kwargs):
  8    """No cross validation: train and test data are the same."""
  9    return [[predictors,target,predictors,target]]
 10
 11def CV_drop1year(predictors,target,**cv_kwargs):
 12    """Cross validation is performed by dropping out single years of data from the training set to use as test data. This is repeated for all years with non-nan data."""
 13
 14    X,y=xr.align(predictors.dropna('time',how='all'),target.dropna('time',how='all'))
 15    years=np.unique(X['time.year'])
 16    iters=[]
 17    for t in years:
 18        it=[]
 19        it.append(X.where(X['time.year']!=t).dropna('time'))
 20        it.append(y.where(y['time.year']!=t).dropna('time'))
 21        it.append(X.where(X['time.year']==t).dropna('time'))
 22        it.append(y.where(y['time.year']==t).dropna('time'))
 23        iters.append(it)
 24    return iters
 25
 26def CV_n_chunks(predictors,target,N=2):
 27    """The dataset is split into *N* chunks and each is successively held out as test data."""
 28    X,y=xr.align(predictors,target)
 29    L=len(y)
 30    l=L//N
 31    iters=[]
 32    for n in range(N):
 33        
 34        test_times=np.arange(n*l,(n+1)*l)
 35        train_times=np.arange(L)
 36        train_times=np.delete(train_times,test_times)
 37        
 38        X_train=X.isel(time=train_times)
 39        y_train=y.isel(time=train_times)                             
 40        X_test=X.isel(time=test_times)
 41        y_test=y.isel(time=test_times)
 42        
 43        iters.append([X_train,y_train,X_test,y_test])
 44    return iters
 45
 46def CV_manual(train_pred,train_targ,test_pred=None,test_targ=None):
 47    """Separate training and test datasets are passed manually."""
 48    if test_pred is None or test_targ is None:
 49        raise(ValueError('Both test_pred and test_targ must be specified when using CV_manual'))
 50    return [[train_pred,train_targ,test_pred,test_targ]]
 51
 52def log_regression_model(X_train,y_train,X_test,**model_kwargs):
 53    """A wrapper around sklearn's *LogisticRegression* class, which accepts the same *model_kwargs*"""
 54    LR=LogisticRegression(**model_kwargs)
 55    LR=LR.fit(X_train,y_train)
 56    
 57    det_pred=LR.predict(X_test)
 58    prob_pred=LR.predict_proba(X_test)
 59    return LR,det_pred,prob_pred
 60
 61def blank_score(det_pred,prob_pred,target,**score_kwargs):
 62    """@private"""
 63    if det_pred is not None:
 64        assert det_pred.shape[0]==target.shape[0]
 65    if prob_pred is not None:
 66        assert prob_pred.shape[0]==target.shape[0]
 67    return len(target)
 68
 69def ROC_AUC(det_pred,prob_pred,target,**score_kwargs):
 70    """Computes the ROC AUC score, as implemented in sklearn, and accepting the same *score_kwargs*"""
 71    #No ROC_AUC for a variable with only one value
 72    if len(np.unique(target))==1:
 73        return np.nan
 74    if len(np.unique(target))==2:
 75        prob_pred=prob_pred[:,1]
 76        
 77    
 78    return roc_auc_score(target,prob_pred,**score_kwargs)
 79
 80class PredictionTest(object):
 81    """
 82    Supports quick and simple testing of the predictive power of predictor variables stored in an xarray.Dataset applied to a target variable stored in an xarray.DataArray.
 83    Supports different predictive models, cross validation approaches and skill scores within a straightforward API.
 84    
 85    **Arguments**
 86    
 87    *predictors*
 88    
 89    An xarray Dataset of scalar predictors
 90    
 91    *predictand*
 92    
 93    An xarray DataArray with a shared coord to *predictors*.
 94    """
 95    def __init__(self,predictors,predictand,tolerate_empty_variables=False):
 96        """Initialise a PredictionTest object"""
 97        
 98        self.predictand=self._predictand_to_dataarray(predictand)
 99        """@private"""
100        
101        self.predictors=self._predictors_to_dataset(predictors)
102        """@private"""
103
104        
105        self.predefined_models=dict(
106            logregression=log_regression_model
107        )
108        """@private"""
109        
110        self.predefined_cv_methods=dict(
111            nchunks=CV_n_chunks,
112            drop_year=CV_drop1year,
113            manual=CV_manual
114        )
115        """@private"""
116        
117        self.predefined_scores=dict(
118            sklearn_roc_auc=ROC_AUC,
119            test=blank_score
120        )
121        """@private"""
122        
123        self.computed_scores=None
124        """@private"""
125        
126        self.tol_empty=tolerate_empty_variables
127        """@private"""
128        return
129    
130    def __repr__(self):
131        return 'A PredictionTest object'
132    def __str__(self):
133        return self.__repr__
134    
135    def _predictand_to_dataarray(self,predictand):
136        return predictand
137    
138    def _predictors_to_dataset(self,predictors):
139        return predictors
140    
141    def _handle_potentially_empty_variable_list(self,vlist):
142        if len(vlist)>0:
143            return 0
144        else:
145            if self.tol_empty:
146                return 1
147            raise(ValueError('Empty variable list passed, but tolerate_empty_variables=False was set in PredictionTest.init'))
148
149    def categorical_prediction(self,model,score='sklearn_roc_auc',cv_method=None,predictor_variables='univariate',keep_models=False,model_kwargs={},cv_kwargs={},score_kwargs={}):
150        
151        """
152        
153        **Arguments**
154        
155        *model*
156        
157        A function with the following signature:
158        
159        Input: Three numpy arrays of shape [T1,N], [T1], [T2,N] (train_predictors, train_target, and test_predictors respectively), and an arbitrary number of keyword arguments.
160        
161        Output: *model* (A scikit-learn-like fitted model object),*det_pred* (a numpy array of shape [T2] with deterministic predictions) and *prob_pred* (a numpy array of shape [T2,M] with probabilistic predictions, where M is the number of unique values in *train_target* and each element predicts the probability of a given class). Any output can instead be replaced with None, but at least one of *det_pred* and *prob_pred* must not be None.
162            
163        **Optional arguments**
164        
165        *score*
166        
167        A string specifying a predefined skill score (currently only 'sklearn_roc_auc') or a function with the signature:
168        
169        Input: det_pred (a numpy array of shape [T2]), prob_pred (a numpy array of shape [T2,M], a truth series (a numpy array of shape [T2]), and an arbitrary number of keyword arguments.
170        Output: a scalar skill score.
171            
172        *cv_method*
173        A string specifying a predefined cross-validation method, a custom cross-validation function with corresponding signature, or None, in which case no cross-validation is used (the training and test datasets are the same). Predefined cross-validation methods are:
174        
175        *nchunks* - Divide the dataset into *n* chunks, using each as the test dataset predicted by the other *n*-1 chunks, to produce *n* total skill estimates. *n* defaults to 2, and is specified in *cv_kwargs*
176        
177        *drop_year* - Split the dataset by calendar year, using each year as the test dataset predicted by the remaining years.
178        *manual* - Treat *predictors* and *predictand* as training data. Test data must be passed explicitly via *cv_kwargs* as *test_pred* and *test_targ*.
179        
180        If a custom function is passed it must have the following signature:
181        Input: predictors (a Dataset), target (a DataArray), and an arbitrary number of keyword arguments.
182        
183        Output: A train predictor Dataset, a train target DatArray, a test predictor Dataset, and a test target DataArray.
184
185        *predictor_variables*
186        
187        If 'univariate' all variables in *PredictionTest.predictors* are used individually to predict *PredictionTest.predictand*.
188        
189        If 'all' all variables in *PredictionTest.predictors* are used collectively to predict *PredictionTest.predictand*.
190        
191        If a 1D array of variable names in *PredictionTest.predictors*, each specified variable is used individually to predict *PredictionTest.predictand*.
192        
193        If a 2D array of iterables over variable names in *PredictionTest.predictors*, each specified combination of variables is used to predict *PredictionTest.predictand*.
194            
195        *keep_models*
196        If True, a dictionary of arrays of fitted models is returned, corresponding to each variable combination and cross validated model that was computed.
197        
198        *model_kwargs*
199        
200        A dictionary of keyword arguments to pass to *model*
201        *cv_kwargs*
202        
203        A dictionary of keyword arguments to pass to *cv_method*
204
205        *score_kwargs*
206        
207        A dictionary of keyword arguments to pass to *score*
208        
209        **Returns**
210        
211        If keep_models is False:
212        
213        returns *score_da*, a Dataset of skill scores for each prediction experiment, with a cross_validation coordinate.
214        
215        if keep_models is True:
216        
217        return (*score_da*,*model_da*)
218        """
219        if np.ndim(predictor_variables)==1:
220            el_type=[np.ndim(element) for element in predictor_variables]
221            
222            if len(np.unique(el_type))==1 and np.unique(el_type)==1:#a list of ragged lists
223                score_labels=[f'predictor_set_{i+1}' for i in range(len(predictor_variables))]
224                
225            else:#just a list
226                score_labels=predictor_variables
227            
228        elif np.ndim(predictor_variables)==2:
229            score_labels=[f'predictor_set_{i+1}' for i in range(len(predictor_variables))]
230
231        elif predictor_variables =='univariate':
232            predictor_variables = list(self.predictors.data_vars)
233            score_labels=list(self.predictors.data_vars)
234        elif predictor_variables=='all':
235            predictor_variables=[list(self.predictors.data_vars)]
236            score_labels=['all']
237        else:
238            raise(ValueError('predictor variables must be one of ["univariate","all",1-D array, 2-D array]'))
239        
240        if type(score)==str:
241            score=self.predefined_scores[score]
242        
243        predictors=self.predictors
244        target=self.predictand
245        predictors,target=xr.align(predictors.dropna('time',how='all'),target.dropna('time',how='all'))
246        
247        #train_pred,train_target,test_pred,test_target
248        cv_iterator=self._split_data(predictors,target,cv_method,**cv_kwargs)
249        
250        score_da=[]
251        model_da=[]
252        for split_data in cv_iterator:
253            scores={}
254            models={}
255            train_preds,train_target,test_preds,test_target=split_data
256            
257            for label,variable in zip(score_labels,predictor_variables):
258                
259                train_pred=train_preds[variable]
260                test_pred=test_preds[variable]
261                
262                #univariate
263                if type(variable)==str:
264                    
265                    train_pred=np.array(train_pred).reshape([-1,1])
266                    test_pred=np.array(test_pred).reshape([-1,1])
267
268                #multivariate
269                else:
270                    train_pred=np.array(train_pred.to_array('var').transpose(*train_pred.dims,'var'))
271                    test_pred=np.array(test_pred.to_array('var').transpose(*test_pred.dims,'var'))
272
273                fitted_model,det_pred,prob_pred=self._fit_model_and_predict(model,train_pred,train_target,test_pred,**model_kwargs)
274                test_score=score(det_pred,prob_pred,test_target,**score_kwargs)
275                scores[label]=test_score
276                if keep_models:
277                    models[label]=fitted_model
278            score_da.append(xr.Dataset(scores))
279            model_da.append(models)
280        score_da=xr.concat(score_da,'cv')
281        self.computed_scores=score_da
282
283        if keep_models:
284            return score_da,model_da
285        else:
286            return score_da
287    
288    def _fit_model_and_predict(self,model,train_pred,train_target,test_pred,**kwargs):
289        
290        if type(model)==str:
291            model=self.predefined_models[model]
292        
293        train_pred=np.array(train_pred)
294        train_target=np.array(train_target)
295        test_pred=np.array(test_pred)
296        self._verify_model_inputs(train_pred,train_target,test_pred)
297        output=model(train_pred,train_target,test_pred,**kwargs)
298        self._verify_model_output(output,test_pred)
299        return output
300    
301    def _verify_model_inputs(self, train_pred,train_target,test_pred,**model_kwargs):
302    
303        assert train_pred.ndim==2
304        T1,N=train_pred.shape
305
306        assert test_pred.ndim==2
307        T2,M=test_pred.shape
308        assert N==M
309
310        assert train_target.ndim==1
311        T=len(train_target)
312        assert T==T1
313        return 
314
315    def _verify_model_output(self,output,test_pred):
316        
317        T,N=test_pred.shape
318        assert len(output)==3
319        model,det_pred,prob_pred=output
320
321        assert (det_pred is not None)|(prob_pred is not None)
322
323        if det_pred is not None:
324            assert det_pred.shape[0]==T
325            assert np.ndim(det_pred)==1
326        if prob_pred is not None:
327            assert prob_pred.shape[0]==T
328            assert np.ndim(prob_pred)==2
329        return
330    
331    def _split_data(self,predictors,target,cv_method,**cv_kwargs):
332        if cv_method is None:
333            cv_method=CV_None
334        elif type(cv_method)==str:
335            cv_method=self.predefined_cv_methods[cv_method]
336        
337        cv_iterator=cv_method(predictors,target,**cv_kwargs)
338        return cv_iterator
339    
340    def add_score_to_index_metadata(self,indices,label='score',raise_on_missing_var=False,reduce_func=np.nanmean):
341        
342        """ Annotate a Dataset of indices with computed skill scores by adding them to the attributes of each DataArray in the Dataset.
343        
344        **Arguments**
345        
346        *indices*
347        
348        *An xarray.Dataset of indices*.
349        
350        **Optional arguments**
351        
352        *label*
353        
354        A string determining the name of the added attribute key.
355        
356        *raise_on_missing_var*
357        
358        A boolean, determining if an error is raised if not all variables present in the computed skill scores are present in the indices.
359        
360        *reduce_func*
361        
362        The function used to reduce the 'cv' vector of skill scores to a single value. Default is the mean average, ignoring nans. To add the entire vector of scores for different cross validations, pass *reduce_func*=lambda x: x
363        
364        """
365        s=self.computed_scores
366        if s is None:
367            raise(ValueError('No scores computed.'))
368        for var in s.data_vars:
369            
370            score=s[var].values
371            try:
372                indices[var].attrs[label]=reduce_func(score)
373            except:
374                if raise_on_missing_var:
375                    raise(ValueError(f'Key {var} present in self.computed_scores but not in indices.'))
376                pass
377        return
378        
379        
def CV_None(predictors, target, **cv_kwargs):
 8def CV_None(predictors,target,**cv_kwargs):
 9    """No cross validation: train and test data are the same."""
10    return [[predictors,target,predictors,target]]

No cross validation: train and test data are the same.

def CV_drop1year(predictors, target, **cv_kwargs):
12def CV_drop1year(predictors,target,**cv_kwargs):
13    """Cross validation is performed by dropping out single years of data from the training set to use as test data. This is repeated for all years with non-nan data."""
14
15    X,y=xr.align(predictors.dropna('time',how='all'),target.dropna('time',how='all'))
16    years=np.unique(X['time.year'])
17    iters=[]
18    for t in years:
19        it=[]
20        it.append(X.where(X['time.year']!=t).dropna('time'))
21        it.append(y.where(y['time.year']!=t).dropna('time'))
22        it.append(X.where(X['time.year']==t).dropna('time'))
23        it.append(y.where(y['time.year']==t).dropna('time'))
24        iters.append(it)
25    return iters

Cross validation is performed by dropping out single years of data from the training set to use as test data. This is repeated for all years with non-nan data.

def CV_n_chunks(predictors, target, N=2):
27def CV_n_chunks(predictors,target,N=2):
28    """The dataset is split into *N* chunks and each is successively held out as test data."""
29    X,y=xr.align(predictors,target)
30    L=len(y)
31    l=L//N
32    iters=[]
33    for n in range(N):
34        
35        test_times=np.arange(n*l,(n+1)*l)
36        train_times=np.arange(L)
37        train_times=np.delete(train_times,test_times)
38        
39        X_train=X.isel(time=train_times)
40        y_train=y.isel(time=train_times)                             
41        X_test=X.isel(time=test_times)
42        y_test=y.isel(time=test_times)
43        
44        iters.append([X_train,y_train,X_test,y_test])
45    return iters

The dataset is split into N chunks and each is successively held out as test data.

def CV_manual(train_pred, train_targ, test_pred=None, test_targ=None):
47def CV_manual(train_pred,train_targ,test_pred=None,test_targ=None):
48    """Separate training and test datasets are passed manually."""
49    if test_pred is None or test_targ is None:
50        raise(ValueError('Both test_pred and test_targ must be specified when using CV_manual'))
51    return [[train_pred,train_targ,test_pred,test_targ]]

Separate training and test datasets are passed manually.

def log_regression_model(X_train, y_train, X_test, **model_kwargs):
53def log_regression_model(X_train,y_train,X_test,**model_kwargs):
54    """A wrapper around sklearn's *LogisticRegression* class, which accepts the same *model_kwargs*"""
55    LR=LogisticRegression(**model_kwargs)
56    LR=LR.fit(X_train,y_train)
57    
58    det_pred=LR.predict(X_test)
59    prob_pred=LR.predict_proba(X_test)
60    return LR,det_pred,prob_pred

A wrapper around sklearn's LogisticRegression class, which accepts the same model_kwargs

def ROC_AUC(det_pred, prob_pred, target, **score_kwargs):
70def ROC_AUC(det_pred,prob_pred,target,**score_kwargs):
71    """Computes the ROC AUC score, as implemented in sklearn, and accepting the same *score_kwargs*"""
72    #No ROC_AUC for a variable with only one value
73    if len(np.unique(target))==1:
74        return np.nan
75    if len(np.unique(target))==2:
76        prob_pred=prob_pred[:,1]
77        
78    
79    return roc_auc_score(target,prob_pred,**score_kwargs)

Computes the ROC AUC score, as implemented in sklearn, and accepting the same score_kwargs

class PredictionTest:
 81class PredictionTest(object):
 82    """
 83    Supports quick and simple testing of the predictive power of predictor variables stored in an xarray.Dataset applied to a target variable stored in an xarray.DataArray.
 84    Supports different predictive models, cross validation approaches and skill scores within a straightforward API.
 85    
 86    **Arguments**
 87    
 88    *predictors*
 89    
 90    An xarray Dataset of scalar predictors
 91    
 92    *predictand*
 93    
 94    An xarray DataArray with a shared coord to *predictors*.
 95    """
 96    def __init__(self,predictors,predictand,tolerate_empty_variables=False):
 97        """Initialise a PredictionTest object"""
 98        
 99        self.predictand=self._predictand_to_dataarray(predictand)
100        """@private"""
101        
102        self.predictors=self._predictors_to_dataset(predictors)
103        """@private"""
104
105        
106        self.predefined_models=dict(
107            logregression=log_regression_model
108        )
109        """@private"""
110        
111        self.predefined_cv_methods=dict(
112            nchunks=CV_n_chunks,
113            drop_year=CV_drop1year,
114            manual=CV_manual
115        )
116        """@private"""
117        
118        self.predefined_scores=dict(
119            sklearn_roc_auc=ROC_AUC,
120            test=blank_score
121        )
122        """@private"""
123        
124        self.computed_scores=None
125        """@private"""
126        
127        self.tol_empty=tolerate_empty_variables
128        """@private"""
129        return
130    
131    def __repr__(self):
132        return 'A PredictionTest object'
133    def __str__(self):
134        return self.__repr__
135    
136    def _predictand_to_dataarray(self,predictand):
137        return predictand
138    
139    def _predictors_to_dataset(self,predictors):
140        return predictors
141    
142    def _handle_potentially_empty_variable_list(self,vlist):
143        if len(vlist)>0:
144            return 0
145        else:
146            if self.tol_empty:
147                return 1
148            raise(ValueError('Empty variable list passed, but tolerate_empty_variables=False was set in PredictionTest.init'))
149
150    def categorical_prediction(self,model,score='sklearn_roc_auc',cv_method=None,predictor_variables='univariate',keep_models=False,model_kwargs={},cv_kwargs={},score_kwargs={}):
151        
152        """
153        
154        **Arguments**
155        
156        *model*
157        
158        A function with the following signature:
159        
160        Input: Three numpy arrays of shape [T1,N], [T1], [T2,N] (train_predictors, train_target, and test_predictors respectively), and an arbitrary number of keyword arguments.
161        
162        Output: *model* (A scikit-learn-like fitted model object),*det_pred* (a numpy array of shape [T2] with deterministic predictions) and *prob_pred* (a numpy array of shape [T2,M] with probabilistic predictions, where M is the number of unique values in *train_target* and each element predicts the probability of a given class). Any output can instead be replaced with None, but at least one of *det_pred* and *prob_pred* must not be None.
163            
164        **Optional arguments**
165        
166        *score*
167        
168        A string specifying a predefined skill score (currently only 'sklearn_roc_auc') or a function with the signature:
169        
170        Input: det_pred (a numpy array of shape [T2]), prob_pred (a numpy array of shape [T2,M], a truth series (a numpy array of shape [T2]), and an arbitrary number of keyword arguments.
171        Output: a scalar skill score.
172            
173        *cv_method*
174        A string specifying a predefined cross-validation method, a custom cross-validation function with corresponding signature, or None, in which case no cross-validation is used (the training and test datasets are the same). Predefined cross-validation methods are:
175        
176        *nchunks* - Divide the dataset into *n* chunks, using each as the test dataset predicted by the other *n*-1 chunks, to produce *n* total skill estimates. *n* defaults to 2, and is specified in *cv_kwargs*
177        
178        *drop_year* - Split the dataset by calendar year, using each year as the test dataset predicted by the remaining years.
179        *manual* - Treat *predictors* and *predictand* as training data. Test data must be passed explicitly via *cv_kwargs* as *test_pred* and *test_targ*.
180        
181        If a custom function is passed it must have the following signature:
182        Input: predictors (a Dataset), target (a DataArray), and an arbitrary number of keyword arguments.
183        
184        Output: A train predictor Dataset, a train target DatArray, a test predictor Dataset, and a test target DataArray.
185
186        *predictor_variables*
187        
188        If 'univariate' all variables in *PredictionTest.predictors* are used individually to predict *PredictionTest.predictand*.
189        
190        If 'all' all variables in *PredictionTest.predictors* are used collectively to predict *PredictionTest.predictand*.
191        
192        If a 1D array of variable names in *PredictionTest.predictors*, each specified variable is used individually to predict *PredictionTest.predictand*.
193        
194        If a 2D array of iterables over variable names in *PredictionTest.predictors*, each specified combination of variables is used to predict *PredictionTest.predictand*.
195            
196        *keep_models*
197        If True, a dictionary of arrays of fitted models is returned, corresponding to each variable combination and cross validated model that was computed.
198        
199        *model_kwargs*
200        
201        A dictionary of keyword arguments to pass to *model*
202        *cv_kwargs*
203        
204        A dictionary of keyword arguments to pass to *cv_method*
205
206        *score_kwargs*
207        
208        A dictionary of keyword arguments to pass to *score*
209        
210        **Returns**
211        
212        If keep_models is False:
213        
214        returns *score_da*, a Dataset of skill scores for each prediction experiment, with a cross_validation coordinate.
215        
216        if keep_models is True:
217        
218        return (*score_da*,*model_da*)
219        """
220        if np.ndim(predictor_variables)==1:
221            el_type=[np.ndim(element) for element in predictor_variables]
222            
223            if len(np.unique(el_type))==1 and np.unique(el_type)==1:#a list of ragged lists
224                score_labels=[f'predictor_set_{i+1}' for i in range(len(predictor_variables))]
225                
226            else:#just a list
227                score_labels=predictor_variables
228            
229        elif np.ndim(predictor_variables)==2:
230            score_labels=[f'predictor_set_{i+1}' for i in range(len(predictor_variables))]
231
232        elif predictor_variables =='univariate':
233            predictor_variables = list(self.predictors.data_vars)
234            score_labels=list(self.predictors.data_vars)
235        elif predictor_variables=='all':
236            predictor_variables=[list(self.predictors.data_vars)]
237            score_labels=['all']
238        else:
239            raise(ValueError('predictor variables must be one of ["univariate","all",1-D array, 2-D array]'))
240        
241        if type(score)==str:
242            score=self.predefined_scores[score]
243        
244        predictors=self.predictors
245        target=self.predictand
246        predictors,target=xr.align(predictors.dropna('time',how='all'),target.dropna('time',how='all'))
247        
248        #train_pred,train_target,test_pred,test_target
249        cv_iterator=self._split_data(predictors,target,cv_method,**cv_kwargs)
250        
251        score_da=[]
252        model_da=[]
253        for split_data in cv_iterator:
254            scores={}
255            models={}
256            train_preds,train_target,test_preds,test_target=split_data
257            
258            for label,variable in zip(score_labels,predictor_variables):
259                
260                train_pred=train_preds[variable]
261                test_pred=test_preds[variable]
262                
263                #univariate
264                if type(variable)==str:
265                    
266                    train_pred=np.array(train_pred).reshape([-1,1])
267                    test_pred=np.array(test_pred).reshape([-1,1])
268
269                #multivariate
270                else:
271                    train_pred=np.array(train_pred.to_array('var').transpose(*train_pred.dims,'var'))
272                    test_pred=np.array(test_pred.to_array('var').transpose(*test_pred.dims,'var'))
273
274                fitted_model,det_pred,prob_pred=self._fit_model_and_predict(model,train_pred,train_target,test_pred,**model_kwargs)
275                test_score=score(det_pred,prob_pred,test_target,**score_kwargs)
276                scores[label]=test_score
277                if keep_models:
278                    models[label]=fitted_model
279            score_da.append(xr.Dataset(scores))
280            model_da.append(models)
281        score_da=xr.concat(score_da,'cv')
282        self.computed_scores=score_da
283
284        if keep_models:
285            return score_da,model_da
286        else:
287            return score_da
288    
289    def _fit_model_and_predict(self,model,train_pred,train_target,test_pred,**kwargs):
290        
291        if type(model)==str:
292            model=self.predefined_models[model]
293        
294        train_pred=np.array(train_pred)
295        train_target=np.array(train_target)
296        test_pred=np.array(test_pred)
297        self._verify_model_inputs(train_pred,train_target,test_pred)
298        output=model(train_pred,train_target,test_pred,**kwargs)
299        self._verify_model_output(output,test_pred)
300        return output
301    
302    def _verify_model_inputs(self, train_pred,train_target,test_pred,**model_kwargs):
303    
304        assert train_pred.ndim==2
305        T1,N=train_pred.shape
306
307        assert test_pred.ndim==2
308        T2,M=test_pred.shape
309        assert N==M
310
311        assert train_target.ndim==1
312        T=len(train_target)
313        assert T==T1
314        return 
315
316    def _verify_model_output(self,output,test_pred):
317        
318        T,N=test_pred.shape
319        assert len(output)==3
320        model,det_pred,prob_pred=output
321
322        assert (det_pred is not None)|(prob_pred is not None)
323
324        if det_pred is not None:
325            assert det_pred.shape[0]==T
326            assert np.ndim(det_pred)==1
327        if prob_pred is not None:
328            assert prob_pred.shape[0]==T
329            assert np.ndim(prob_pred)==2
330        return
331    
332    def _split_data(self,predictors,target,cv_method,**cv_kwargs):
333        if cv_method is None:
334            cv_method=CV_None
335        elif type(cv_method)==str:
336            cv_method=self.predefined_cv_methods[cv_method]
337        
338        cv_iterator=cv_method(predictors,target,**cv_kwargs)
339        return cv_iterator
340    
341    def add_score_to_index_metadata(self,indices,label='score',raise_on_missing_var=False,reduce_func=np.nanmean):
342        
343        """ Annotate a Dataset of indices with computed skill scores by adding them to the attributes of each DataArray in the Dataset.
344        
345        **Arguments**
346        
347        *indices*
348        
349        *An xarray.Dataset of indices*.
350        
351        **Optional arguments**
352        
353        *label*
354        
355        A string determining the name of the added attribute key.
356        
357        *raise_on_missing_var*
358        
359        A boolean, determining if an error is raised if not all variables present in the computed skill scores are present in the indices.
360        
361        *reduce_func*
362        
363        The function used to reduce the 'cv' vector of skill scores to a single value. Default is the mean average, ignoring nans. To add the entire vector of scores for different cross validations, pass *reduce_func*=lambda x: x
364        
365        """
366        s=self.computed_scores
367        if s is None:
368            raise(ValueError('No scores computed.'))
369        for var in s.data_vars:
370            
371            score=s[var].values
372            try:
373                indices[var].attrs[label]=reduce_func(score)
374            except:
375                if raise_on_missing_var:
376                    raise(ValueError(f'Key {var} present in self.computed_scores but not in indices.'))
377                pass
378        return

Supports quick and simple testing of the predictive power of predictor variables stored in an xarray.Dataset applied to a target variable stored in an xarray.DataArray. Supports different predictive models, cross validation approaches and skill scores within a straightforward API.

Arguments

predictors

An xarray Dataset of scalar predictors

predictand

An xarray DataArray with a shared coord to predictors.

PredictionTest(predictors, predictand, tolerate_empty_variables=False)
 96    def __init__(self,predictors,predictand,tolerate_empty_variables=False):
 97        """Initialise a PredictionTest object"""
 98        
 99        self.predictand=self._predictand_to_dataarray(predictand)
100        """@private"""
101        
102        self.predictors=self._predictors_to_dataset(predictors)
103        """@private"""
104
105        
106        self.predefined_models=dict(
107            logregression=log_regression_model
108        )
109        """@private"""
110        
111        self.predefined_cv_methods=dict(
112            nchunks=CV_n_chunks,
113            drop_year=CV_drop1year,
114            manual=CV_manual
115        )
116        """@private"""
117        
118        self.predefined_scores=dict(
119            sklearn_roc_auc=ROC_AUC,
120            test=blank_score
121        )
122        """@private"""
123        
124        self.computed_scores=None
125        """@private"""
126        
127        self.tol_empty=tolerate_empty_variables
128        """@private"""
129        return

Initialise a PredictionTest object

def categorical_prediction( self, model, score='sklearn_roc_auc', cv_method=None, predictor_variables='univariate', keep_models=False, model_kwargs={}, cv_kwargs={}, score_kwargs={}):
150    def categorical_prediction(self,model,score='sklearn_roc_auc',cv_method=None,predictor_variables='univariate',keep_models=False,model_kwargs={},cv_kwargs={},score_kwargs={}):
151        
152        """
153        
154        **Arguments**
155        
156        *model*
157        
158        A function with the following signature:
159        
160        Input: Three numpy arrays of shape [T1,N], [T1], [T2,N] (train_predictors, train_target, and test_predictors respectively), and an arbitrary number of keyword arguments.
161        
162        Output: *model* (A scikit-learn-like fitted model object),*det_pred* (a numpy array of shape [T2] with deterministic predictions) and *prob_pred* (a numpy array of shape [T2,M] with probabilistic predictions, where M is the number of unique values in *train_target* and each element predicts the probability of a given class). Any output can instead be replaced with None, but at least one of *det_pred* and *prob_pred* must not be None.
163            
164        **Optional arguments**
165        
166        *score*
167        
168        A string specifying a predefined skill score (currently only 'sklearn_roc_auc') or a function with the signature:
169        
170        Input: det_pred (a numpy array of shape [T2]), prob_pred (a numpy array of shape [T2,M], a truth series (a numpy array of shape [T2]), and an arbitrary number of keyword arguments.
171        Output: a scalar skill score.
172            
173        *cv_method*
174        A string specifying a predefined cross-validation method, a custom cross-validation function with corresponding signature, or None, in which case no cross-validation is used (the training and test datasets are the same). Predefined cross-validation methods are:
175        
176        *nchunks* - Divide the dataset into *n* chunks, using each as the test dataset predicted by the other *n*-1 chunks, to produce *n* total skill estimates. *n* defaults to 2, and is specified in *cv_kwargs*
177        
178        *drop_year* - Split the dataset by calendar year, using each year as the test dataset predicted by the remaining years.
179        *manual* - Treat *predictors* and *predictand* as training data. Test data must be passed explicitly via *cv_kwargs* as *test_pred* and *test_targ*.
180        
181        If a custom function is passed it must have the following signature:
182        Input: predictors (a Dataset), target (a DataArray), and an arbitrary number of keyword arguments.
183        
184        Output: A train predictor Dataset, a train target DatArray, a test predictor Dataset, and a test target DataArray.
185
186        *predictor_variables*
187        
188        If 'univariate' all variables in *PredictionTest.predictors* are used individually to predict *PredictionTest.predictand*.
189        
190        If 'all' all variables in *PredictionTest.predictors* are used collectively to predict *PredictionTest.predictand*.
191        
192        If a 1D array of variable names in *PredictionTest.predictors*, each specified variable is used individually to predict *PredictionTest.predictand*.
193        
194        If a 2D array of iterables over variable names in *PredictionTest.predictors*, each specified combination of variables is used to predict *PredictionTest.predictand*.
195            
196        *keep_models*
197        If True, a dictionary of arrays of fitted models is returned, corresponding to each variable combination and cross validated model that was computed.
198        
199        *model_kwargs*
200        
201        A dictionary of keyword arguments to pass to *model*
202        *cv_kwargs*
203        
204        A dictionary of keyword arguments to pass to *cv_method*
205
206        *score_kwargs*
207        
208        A dictionary of keyword arguments to pass to *score*
209        
210        **Returns**
211        
212        If keep_models is False:
213        
214        returns *score_da*, a Dataset of skill scores for each prediction experiment, with a cross_validation coordinate.
215        
216        if keep_models is True:
217        
218        return (*score_da*,*model_da*)
219        """
220        if np.ndim(predictor_variables)==1:
221            el_type=[np.ndim(element) for element in predictor_variables]
222            
223            if len(np.unique(el_type))==1 and np.unique(el_type)==1:#a list of ragged lists
224                score_labels=[f'predictor_set_{i+1}' for i in range(len(predictor_variables))]
225                
226            else:#just a list
227                score_labels=predictor_variables
228            
229        elif np.ndim(predictor_variables)==2:
230            score_labels=[f'predictor_set_{i+1}' for i in range(len(predictor_variables))]
231
232        elif predictor_variables =='univariate':
233            predictor_variables = list(self.predictors.data_vars)
234            score_labels=list(self.predictors.data_vars)
235        elif predictor_variables=='all':
236            predictor_variables=[list(self.predictors.data_vars)]
237            score_labels=['all']
238        else:
239            raise(ValueError('predictor variables must be one of ["univariate","all",1-D array, 2-D array]'))
240        
241        if type(score)==str:
242            score=self.predefined_scores[score]
243        
244        predictors=self.predictors
245        target=self.predictand
246        predictors,target=xr.align(predictors.dropna('time',how='all'),target.dropna('time',how='all'))
247        
248        #train_pred,train_target,test_pred,test_target
249        cv_iterator=self._split_data(predictors,target,cv_method,**cv_kwargs)
250        
251        score_da=[]
252        model_da=[]
253        for split_data in cv_iterator:
254            scores={}
255            models={}
256            train_preds,train_target,test_preds,test_target=split_data
257            
258            for label,variable in zip(score_labels,predictor_variables):
259                
260                train_pred=train_preds[variable]
261                test_pred=test_preds[variable]
262                
263                #univariate
264                if type(variable)==str:
265                    
266                    train_pred=np.array(train_pred).reshape([-1,1])
267                    test_pred=np.array(test_pred).reshape([-1,1])
268
269                #multivariate
270                else:
271                    train_pred=np.array(train_pred.to_array('var').transpose(*train_pred.dims,'var'))
272                    test_pred=np.array(test_pred.to_array('var').transpose(*test_pred.dims,'var'))
273
274                fitted_model,det_pred,prob_pred=self._fit_model_and_predict(model,train_pred,train_target,test_pred,**model_kwargs)
275                test_score=score(det_pred,prob_pred,test_target,**score_kwargs)
276                scores[label]=test_score
277                if keep_models:
278                    models[label]=fitted_model
279            score_da.append(xr.Dataset(scores))
280            model_da.append(models)
281        score_da=xr.concat(score_da,'cv')
282        self.computed_scores=score_da
283
284        if keep_models:
285            return score_da,model_da
286        else:
287            return score_da

Arguments

model

A function with the following signature:

Input: Three numpy arrays of shape [T1,N], [T1], [T2,N] (train_predictors, train_target, and test_predictors respectively), and an arbitrary number of keyword arguments.

Output: model (A scikit-learn-like fitted model object),det_pred (a numpy array of shape [T2] with deterministic predictions) and prob_pred (a numpy array of shape [T2,M] with probabilistic predictions, where M is the number of unique values in train_target and each element predicts the probability of a given class). Any output can instead be replaced with None, but at least one of det_pred and prob_pred must not be None.

Optional arguments

score

A string specifying a predefined skill score (currently only 'sklearn_roc_auc') or a function with the signature:

Input: det_pred (a numpy array of shape [T2]), prob_pred (a numpy array of shape [T2,M], a truth series (a numpy array of shape [T2]), and an arbitrary number of keyword arguments. Output: a scalar skill score.

cv_method A string specifying a predefined cross-validation method, a custom cross-validation function with corresponding signature, or None, in which case no cross-validation is used (the training and test datasets are the same). Predefined cross-validation methods are:

nchunks - Divide the dataset into n chunks, using each as the test dataset predicted by the other n-1 chunks, to produce n total skill estimates. n defaults to 2, and is specified in cv_kwargs

drop_year - Split the dataset by calendar year, using each year as the test dataset predicted by the remaining years. manual - Treat predictors and predictand as training data. Test data must be passed explicitly via cv_kwargs as test_pred and test_targ.

If a custom function is passed it must have the following signature: Input: predictors (a Dataset), target (a DataArray), and an arbitrary number of keyword arguments.

Output: A train predictor Dataset, a train target DatArray, a test predictor Dataset, and a test target DataArray.

predictor_variables

If 'univariate' all variables in PredictionTest.predictors are used individually to predict PredictionTest.predictand.

If 'all' all variables in PredictionTest.predictors are used collectively to predict PredictionTest.predictand.

If a 1D array of variable names in PredictionTest.predictors, each specified variable is used individually to predict PredictionTest.predictand.

If a 2D array of iterables over variable names in PredictionTest.predictors, each specified combination of variables is used to predict PredictionTest.predictand.

keep_models If True, a dictionary of arrays of fitted models is returned, corresponding to each variable combination and cross validated model that was computed.

model_kwargs

A dictionary of keyword arguments to pass to model cv_kwargs

A dictionary of keyword arguments to pass to cv_method

score_kwargs

A dictionary of keyword arguments to pass to score

Returns

If keep_models is False:

returns score_da, a Dataset of skill scores for each prediction experiment, with a cross_validation coordinate.

if keep_models is True:

return (score_da,model_da)

def add_score_to_index_metadata( self, indices, label='score', raise_on_missing_var=False, reduce_func=<function nanmean>):
341    def add_score_to_index_metadata(self,indices,label='score',raise_on_missing_var=False,reduce_func=np.nanmean):
342        
343        """ Annotate a Dataset of indices with computed skill scores by adding them to the attributes of each DataArray in the Dataset.
344        
345        **Arguments**
346        
347        *indices*
348        
349        *An xarray.Dataset of indices*.
350        
351        **Optional arguments**
352        
353        *label*
354        
355        A string determining the name of the added attribute key.
356        
357        *raise_on_missing_var*
358        
359        A boolean, determining if an error is raised if not all variables present in the computed skill scores are present in the indices.
360        
361        *reduce_func*
362        
363        The function used to reduce the 'cv' vector of skill scores to a single value. Default is the mean average, ignoring nans. To add the entire vector of scores for different cross validations, pass *reduce_func*=lambda x: x
364        
365        """
366        s=self.computed_scores
367        if s is None:
368            raise(ValueError('No scores computed.'))
369        for var in s.data_vars:
370            
371            score=s[var].values
372            try:
373                indices[var].attrs[label]=reduce_func(score)
374            except:
375                if raise_on_missing_var:
376                    raise(ValueError(f'Key {var} present in self.computed_scores but not in indices.'))
377                pass
378        return

Annotate a Dataset of indices with computed skill scores by adding them to the attributes of each DataArray in the Dataset.

Arguments

indices

An xarray.Dataset of indices.

Optional arguments

label

A string determining the name of the added attribute key.

raise_on_missing_var

A boolean, determining if an error is raised if not all variables present in the computed skill scores are present in the indices.

reduce_func

The function used to reduce the 'cv' vector of skill scores to a single value. Default is the mean average, ignoring nans. To add the entire vector of scores for different cross validations, pass reduce_func=lambda x: x