domino.util

import pandas as pd
import datetime as dt
import numpy as np
import xarray as xr
import itertools
import cftime

def squeeze_da(da):
    """Remove length-1 coords from a DataArray."""
    return da.drop([c for c in da.coords if np.size(da[c])==1])

def drop_scalar_coords(ds):
    """Remove coords without dimensions from a Dataset."""
    for v in [v for v in list(ds.coords) if ds[v].size<2]:
        ds=ds.drop(v)
    return ds

def make_all_dims_coords(da):
    """Convert all dims of a DataArray to coords."""
    return da.assign_coords({v:da[v] for v in da.dims})

def is_time_type(x):
    """Check whether the scalar *x* is a datetime-like type."""
    return isinstance(x,(dt.date,np.datetime64))

def offset_time_dim(da,offset,offset_unit='days',offset_dim='time',deep=False):
    """Shift the time-like *offset_dim* coord of *da* by *offset*, measured in *offset_unit*.

    e.g. offset_time_dim(da,3,'days') adds three days to the time axis of da."""

    time_offset=dt.timedelta(**{offset_unit:offset})

    #Rewritten to handle older pandas versions that don't play nicely with DataArrays
    offset_dim_vals=pd.to_datetime(da[offset_dim].values)+time_offset
    new_da=da.copy(deep=deep)
    new_da=new_da.assign_coords({offset_dim:offset_dim_vals})

    return new_da


def split_to_contiguous(t_arr,x_arr=None,dt=None,max_t=None):
    """Take a time axis *t_arr* and split it into contiguous subarrays.
    Alternatively, split a second axis *x_arr* into subarrays wherever
    *t_arr* is not contiguous. If no *dt* is provided to define contiguous
    segments, the minimum difference between elements of *t_arr* is used.
    Setting *max_t* overrides *dt*, and considers any gap less than *max_t*
    to be contiguous."""

    if x_arr is None:
        x_arr=t_arr.copy()

    #Make sure everything is the right size
    t_arr=np.array(t_arr)
    x_arr=np.array(x_arr)
    assert len(x_arr)==len(t_arr), f'x_arr (len {len(x_arr)}) and t_arr (len {len(t_arr)}) must have equal length'

    #Use the smallest difference in t_arr as dt if none is provided
    if dt is None:
        dt=np.sort(np.unique(t_arr[1:]-t_arr[:-1]))[0]

    #The default contiguous definition
    is_contiguous = lambda arr,dt,max_t: arr[1:]-arr[:-1]==dt
    #The alternate max_t contiguous definition
    if max_t is not None:
        is_contiguous=lambda arr,dt,max_t: arr[1:]-arr[:-1]<max_t

    #Split wherever is_contiguous is false
    return np.split(x_arr[1:],np.atleast_1d(np.squeeze(np.argwhere(np.invert(is_contiguous(t_arr,dt,max_t))))))

def holm_bonferroni_correction(pvals,alpha):
    """Take a list of p-values from multiple hypothesis tests and an *alpha*
    value to test against (i.e. a significance threshold). Returns a boolean
    array saying whether to regard each p-value as significant."""

    p_order=np.argsort(pvals)
    N=len(pvals)

    #Calculate sequentially larger alpha values for each successive test
    alpha_corrected=alpha/(N+1-np.arange(1,N+1))

    #Put the p-values in increasing order
    sorted_p=np.array(pvals)[p_order]

    #Get the first p-value that exceeds its corrected alpha value
    K=np.argwhere(sorted_p>alpha_corrected)
    if len(K)==0:
        K=N
    else:
        K=np.min(K)

    #Accept the first K p-values as significant and reject the rest
    significant=np.array([*np.repeat(True,K),*np.repeat(False,N-K)])

    #Undo the ordering
    significant=significant[np.argsort(p_order)]

    return significant

def event_from_datetimes(events,d1,d2,subset_dict={}):
    """Encode one or more lists of event dates as an integer-valued DataArray
    spanning the date range *d1* to *d2*: 0 where no event occurs, and i+1
    where an event from the i-th list occurs."""

    #Input checking, which also throws an error for ragged lists
    if np.all(np.array([np.ndim(e) for e in events])==0):
        events=[events]
    if not np.all(np.array([np.ndim(e) for e in events])==1):
        raise(ValueError('events must be 1 or 2 dimensional'))

    duplicate_event_dates=np.sum([np.isin(e1,e2).sum()\
        for e1,e2 in itertools.combinations(events,2)])
    if duplicate_event_dates!=0:
        raise(ValueError('2 events on the same day not supported.'))

    #Meat of the function
    daterange=pd.date_range(d1,d2)
    for k,x in subset_dict.items():
        daterange=daterange[np.isin(getattr(daterange,k),x)]
    event_index=np.zeros(len(daterange))
    for i,e in enumerate(events):
        event_index[np.isin(daterange.to_list(),e)]=i+1
    event_index=xr.DataArray(data=event_index,coords={'time':daterange})
    return event_index

def xarr_times_to_ints(time_coord):
    """Convert a datetime64 time coord to floating-point days since the epoch."""
    conversion=(1000*cftime.UNIT_CONVERSION_FACTORS["day"])
    return time_coord.to_numpy().astype(float)/conversion

def restrict(ds,extent_dict):
    """Subset the coords of a Dataset *ds*, using *extent_dict*, a dictionary of the form {coord:[lower_bound,upper_bound],...}."""
    ds=ds.copy()
    for key in extent_dict:
        if key in ds.dims:
            xmin,xmax=extent_dict[key]

            in_range=(ds[key].values>=xmin)&(ds[key].values<=xmax)
            ds=ds.isel({key:in_range})
    return ds

def offset_indices(indices,offsets=None,infer_offset=True,attr_kw=None,offset_unit='days',dim='time'):
    """For a Dataset *indices* and either a dictionary of *offsets* ({data_var:offset,...}) or offsets stored in an attribute *attr_kw*, offset each index along the *dim* coord and take their union."""
    if offsets is None and not infer_offset:
        raise(ValueError('offsets must be provided or infer_offset must be True'))
    if attr_kw is None and infer_offset:
        raise(ValueError('attr_kw must be specified for infer_offset'))

    if infer_offset:
        offsets={v:indices[v].attrs[attr_kw] for v in indices.data_vars}

    da_arr=[]
    for v in indices.data_vars:
        da=indices[v]
        l=offsets[v]
        if isinstance(l,np.integer):
            l=int(l) #datetime.timedelta can't handle numpy integer types
        da_arr.append(offset_time_dim(da,-l,offset_unit,offset_dim=dim))
    ds=xr.merge(da_arr)
    return ds
def squeeze_da(da):

Remove length-1 coords from a DataArray.
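
A minimal usage sketch (the coord values are illustrative):

import numpy as np
import xarray as xr
da = xr.DataArray(np.zeros((1,3)), dims=['lev','lon'],
                  coords={'lev':[500],'lon':[0,10,20]})
squeeze_da(da)  # removes the length-1 'lev' coord variable (the dim itself remains)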

def drop_scalar_coords(ds):

Remove coords without dimensions from a Dataset.
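
A quick sketch, using a hypothetical scalar 'height' coord:

import numpy as np
import pandas as pd
import xarray as xr
ds = xr.Dataset({'x':('time',np.arange(3))},
                coords={'time':pd.date_range('2000-01-01',periods=3),'height':2.0})
drop_scalar_coords(ds)  # drops the dimensionless 'height' coord and keeps 'time'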

def make_all_dims_coords(da):

Convert all dims of a DataArray to coords.
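
A sketch of the intended effect; note that da[dim] only returns a default integer index for coord-less dims on older xarray versions:

import numpy as np
import xarray as xr
da = xr.DataArray(np.zeros((2,3)), dims=['y','x'])
da = make_all_dims_coords(da)  # 'y' and 'x' gain integer coords 0..n-1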

def is_time_type(x):

Check whether the scalar x is a datetime-like type (datetime.date or numpy.datetime64).
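
A quick sketch:

import datetime as dt
import numpy as np
is_time_type(dt.date(2000,1,1))            # True
is_time_type(np.datetime64('2000-01-01'))  # True
is_time_type(5.0)                          # False
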
def offset_time_dim(da, offset, offset_unit='days', offset_dim='time', deep=False):

Shift the time-like offset_dim coord of da by offset, measured in offset_unit.

e.g. offset_time_dim(da,3,'days') adds three days to the time axis of da.
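
A minimal sketch:

import numpy as np
import pandas as pd
import xarray as xr
da = xr.DataArray(np.arange(3), coords={'time':pd.date_range('2000-01-01',periods=3)})
offset_time_dim(da, 3, 'days')  # the time coord becomes 2000-01-04 .. 2000-01-06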

def split_to_contiguous(t_arr, x_arr=None, dt=None, max_t=None):

Take a time axis t_arr and split it into contiguous subarrays. Alternatively, split a second axis x_arr into subarrays wherever t_arr is not contiguous. If no dt is provided to define contiguous segments, the minimum difference between elements of t_arr is used. Setting max_t overrides dt, and considers any gap less than max_t to be contiguous.
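
A quick sketch; note that the first element is consumed by the differencing:

import numpy as np
t = np.array([1, 2, 3, 7, 8, 9])
split_to_contiguous(t)  # -> [array([2, 3]), array([7, 8, 9])]
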
def holm_bonferroni_correction(pvals, alpha):

Take a list of p-values from multiple hypothesis tests and an alpha value to test against (i.e. a significance threshold). Returns a boolean array saying whether to regard each p-value as significant.
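
For example, with four tests at alpha=0.05, the smallest p-value is tested against 0.05/4, the next against 0.05/3, and so on, stopping at the first failure:

holm_bonferroni_correction([0.001, 0.04, 0.03, 0.005], alpha=0.05)
# -> array([ True, False, False,  True])
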
def event_from_datetimes(events, d1, d2, subset_dict={}):

Encode one or more lists of event dates as an integer-valued DataArray spanning the date range d1 to d2: 0 where no event occurs, and i+1 where an event from the i-th list occurs. subset_dict can restrict the date range by DatetimeIndex attributes (e.g. {'month':[12,1,2]}).
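
A minimal sketch with two hypothetical event lists:

import datetime as dt
events = [[dt.datetime(2000,1,3)], [dt.datetime(2000,1,10)]]
event_from_datetimes(events, '2000-01-01', '2000-01-14')
# -> a DataArray over 14 days that is 1 on Jan 3, 2 on Jan 10 and 0 elsewhere
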
def xarr_times_to_ints(time_coord):

Convert a datetime64 time coord to floating-point days since the epoch.
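
A quick sketch, assuming cftime's "day" conversion factor is microseconds per day (so the division yields days):

import numpy as np
import pandas as pd
import xarray as xr
da = xr.DataArray(np.arange(2), coords={'time':pd.date_range('2000-01-01',periods=2)})
xarr_times_to_ints(da['time'])  # -> array([10957., 10958.])
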
def restrict(ds, extent_dict):

Subset the coords of a Dataset ds, using extent_dict, a dictionary of the form {coord:[lower_bound,upper_bound],...}.
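
A minimal sketch with a hypothetical 'lon' dim:

import numpy as np
import xarray as xr
ds = xr.Dataset({'T':('lon',np.arange(6))}, coords={'lon':np.arange(0,60,10)})
restrict(ds, {'lon':[10,30]})  # keeps lon = 10, 20 and 30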

def offset_indices(indices, offsets=None, infer_offset=True, attr_kw=None, offset_unit='days', dim='time'):

For a Dataset indices and either a dictionary of offsets ({data_var:offset,...}) or offsets stored in an attribute attr_kw, offset each index along the dim coord and take their union.
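
A sketch with hypothetical 'nao' and 'jet' indices carrying a hypothetical 'lag' attribute:

import numpy as np
import pandas as pd
import xarray as xr
ds = xr.Dataset({'nao':('time',np.random.rand(10)),
                 'jet':('time',np.random.rand(10))},
                coords={'time':pd.date_range('2000-01-01',periods=10)})
ds['nao'].attrs['lag'] = 2
ds['jet'].attrs['lag'] = 0
offset_indices(ds, attr_kw='lag')  # shifts 'nao' back two days, then merges on 'time'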