domino.util
Module source:

import pandas as pd
import datetime as dt
import numpy as np
import xarray as xr
import itertools
import cftime  # needed by xarr_times_to_ints

def squeeze_da(da):
    """ Remove length 1 coords from a DataArray"""
    return da.drop([c for c in da.coords if np.size(da[c])==1])

def drop_scalar_coords(ds):
    """ Remove coords without dimensions from Dataset"""
    for v in [v for v in list(ds.coords) if ds[v].size<2]:
        ds=ds.drop(v)
    return ds

def make_all_dims_coords(da):
    """Convert all dims to coords"""
    return da.assign_coords({v:da[v] for v in da.dims})

#Takes a scalar input!
def is_time_type(x):
    return (isinstance(x,dt.date) or isinstance(x,np.datetime64))

def offset_time_dim(da,offset,offset_unit='days',offset_dim='time',deep=False):
    """Shifts the time-like *offset_dim* coord of *da* by *offset* units of *offset_unit*.

    e.g. offset_time_dim(da,3,'days') adds three days to the time axis of da."""

    time_offset=dt.timedelta(**{offset_unit:offset})

    #rewritten to handle older pandas versions that don't play nicely with dataarrays
    offset_dim_vals=pd.to_datetime(da[offset_dim].values)+time_offset
    new_da=da.copy(deep=deep)
    new_da=new_da.assign_coords({offset_dim:offset_dim_vals})

    return new_da


#Takes a time axis t_arr, and splits it into contiguous subarrays.
#Alternatively it splits an axis x_arr into subarrays along the same breaks.
#If no dt is provided to define contiguous segments, the minimum difference
#between elements of t_arr is used.
#Alternatively, you can set max_t, which overrides dt and considers any gap
#less than max_t to be contiguous.
def split_to_contiguous(t_arr,x_arr=None,dt=None,max_t=None):

    if x_arr is None:
        x_arr=t_arr.copy()

    #make sure everything is the right size
    t_arr=np.array(t_arr)
    x_arr=np.array(x_arr)
    if len(x_arr)!=len(t_arr):
        raise ValueError(f't_arr (length {len(t_arr)}) and x_arr (length {len(x_arr)}) must be the same length')

    #Use smallest dt if none provided
    if dt is None:
        dt=np.sort(np.unique(t_arr[1:]-t_arr[:-1]))[0]

    #The default contiguous definition
    is_contiguous = lambda arr,dt,max_t: arr[1:]-arr[:-1]==dt
    #The alternate max_t contiguous definition
    if max_t is not None:
        is_contiguous = lambda arr,dt,max_t: arr[1:]-arr[:-1]<max_t

    #Split wherever is_contiguous is false
    return np.split(x_arr[1:],np.atleast_1d(np.squeeze(np.argwhere(np.invert(is_contiguous(t_arr,dt,max_t))))))

#Take a list of pvals from multiple hypothesis tests and an alpha value to
#test against (i.e. a significance threshold).
#Returns a boolean list saying whether to regard each pval as significant.
def holm_bonferroni_correction(pvals,alpha):
    p_order=np.argsort(pvals)
    N=len(pvals)

    #Calculate sequentially larger alpha values for each successive test
    alpha_corrected=alpha/(N+1-np.arange(1,N+1))

    #Put the pvalues in increasing order
    sorted_p=np.array(pvals)[p_order]

    #Get the first pval that exceeds its corrected alpha value
    K=np.argwhere(sorted_p>alpha_corrected)
    if len(K)==0:
        K=N
    else:
        K=np.min(K)

    #Regard the first K sorted pvals as significant and the rest as not
    significant=np.array([*np.repeat(True,K),*np.repeat(False,N-K)])

    #Undo the ordering
    significant=significant[np.argsort(p_order)]

    return significant

def event_from_datetimes(events,d1,d2,subset_dict={}):
    #Input checking, also throws error for ragged lists
    if np.all(np.array([np.ndim(e) for e in events])==0):
        events=[events]
    if not np.all(np.array([np.ndim(e) for e in events])==1):
        raise ValueError('events must be 1 or 2 dimensional')

    duplicate_event_dates=np.sum([np.isin(e1,e2).sum() for e1,e2 in itertools.combinations(events,2)])
    if duplicate_event_dates!=0:
        raise ValueError('2 events on the same day not supported.')

    #Meat of the function
    daterange=pd.date_range(d1,d2)
    for k,x in subset_dict.items():
        daterange=daterange[np.isin(getattr(daterange,k),x)]
    event_index=np.zeros(len(daterange))
    for i,e in enumerate(events):
        event_index[np.isin(daterange.to_list(),e)]=i+1
    event_index=xr.DataArray(data=event_index,coords={'time':daterange})
    return event_index

def xarr_times_to_ints(time_coord):
    conversion=(1000*cftime.UNIT_CONVERSION_FACTORS["day"])
    return time_coord.to_numpy().astype(float)/conversion

def restrict(ds,extent_dict):
    """ Subset the coords of a Dataset *ds*, using *extent_dict*, a dictionary of the form {coord:[lower_bound,upper_bound],...}."""
    ds=ds.copy()
    for key in extent_dict:
        if key in ds.dims:
            xmin,xmax=extent_dict[key]

            in_range=(ds[key].values>=xmin)&(ds[key].values<=xmax)
            ds=ds.isel({key:in_range})
    return ds

def offset_indices(indices,offsets=None,infer_offset=True,attr_kw=None,offset_unit='days',dim='time'):
    """For a Dataset *indices* and either a dictionary of *offsets* ({data_var:offset,...}) or offsets stored in an attribute *attr_kw*, offset each index along the *dim* coord and take their union."""
    if offsets is None and not infer_offset:
        raise ValueError('offsets must be provided or infer_offset must be True')
    if attr_kw is None and infer_offset:
        raise ValueError('attr_kw must be specified for infer_offset')

    if infer_offset:
        offsets={v:indices[v].attrs[attr_kw] for v in indices.data_vars}

    da_arr=[]
    for v in indices.data_vars:
        da=indices[v]
        l=offsets[v]
        if isinstance(l,np.integer):
            l=int(l) #datetime can't handle np.ints, no idea why not.
        da_arr.append(offset_time_dim(da,-l,offset_unit,offset_dim=dim))
    ds=xr.merge(da_arr)
    return ds
def squeeze_da(da):
Remove length-1 coords from a DataArray.
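A minimal usage sketch (the DataArray built here is illustrative, not part of domino):

import numpy as np
import xarray as xr
from domino.util import squeeze_da

# A DataArray with a length-1 'member' dimension coord
da = xr.DataArray(
    np.zeros((1, 3)),
    coords={'member': [0], 'lon': [0.0, 1.0, 2.0]},
    dims=['member', 'lon'],
)
squeezed = squeeze_da(da)
print(list(squeezed.coords))  # ['lon'] -- the length-1 'member' coord is gone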
def drop_scalar_coords(ds):
Remove coords without dimensions from a Dataset.
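A small illustrative sketch (the example Dataset is not from the package):

import numpy as np
import pandas as pd
import xarray as xr
from domino.util import drop_scalar_coords

ds = xr.Dataset(
    {'z500': ('time', np.zeros(4))},
    coords={'time': pd.date_range('2000-01-01', periods=4), 'height': 500.0},
)
ds = drop_scalar_coords(ds)
print(list(ds.coords))  # ['time'] -- the dimensionless 'height' coord is removed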
def make_all_dims_coords(da):
Convert all dims of a DataArray to coords.
def is_time_type(x):
Return True if the scalar x is a datetime.date (including datetime.datetime) or a np.datetime64, and False otherwise.
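For example, on scalar inputs:

import datetime as dt
import numpy as np
from domino.util import is_time_type

# datetime.datetime also returns True, since it subclasses datetime.date
is_time_type(dt.date(2000, 1, 1))          # True
is_time_type(np.datetime64('2000-01-01'))  # True
is_time_type(735.2)                        # False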
def offset_time_dim(da, offset, offset_unit='days', offset_dim='time', deep=False):
Shifts the time-like offset_dim coord of da by offset units of offset_unit.
e.g. offset_time_dim(da, 3, 'days') adds three days to the time axis of da.
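A short usage sketch (illustrative data):

import numpy as np
import pandas as pd
import xarray as xr
from domino.util import offset_time_dim

da = xr.DataArray(
    np.arange(3.0),
    coords={'time': pd.date_range('2000-01-01', periods=3)},
    dims=['time'],
)
lagged = offset_time_dim(da, 3, 'days')   # time now starts at 2000-01-04
led = offset_time_dim(da, -6, 'hours')    # negative offsets shift backwards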
def split_to_contiguous(t_arr, x_arr=None, dt=None, max_t=None):
Split a time axis t_arr into contiguous subarrays, or split a second array x_arr along the same breaks. If dt is not provided to define contiguous segments, the minimum difference between elements of t_arr is used; alternatively, max_t overrides dt and treats any gap smaller than max_t as contiguous.
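A sketch of the calling patterns (arrays are illustrative). Note that the first element of the split array is dropped, because contiguity is defined between consecutive elements:

import numpy as np
from domino.util import split_to_contiguous

t = np.array([0, 1, 2, 5, 6, 7, 8])        # a gap between 2 and 5
x = np.array([10, 11, 12, 15, 16, 17, 18])

# dt defaults to the smallest spacing in t (here 1), so the split
# happens at the jump from 2 to 5
split_to_contiguous(t)             # [array([1, 2]), array([5, 6, 7, 8])]
split_to_contiguous(t, x)          # [array([11, 12]), array([15, 16, 17, 18])]
split_to_contiguous(t, x, max_t=4) # [array([11, 12, 15, 16, 17, 18])] -- gaps < 4 count as contiguous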
def holm_bonferroni_correction(pvals, alpha):
Takes a list of p-values pvals from multiple hypothesis tests and a significance threshold alpha to test against; returns a boolean array saying whether to regard each p-value as significant under the Holm-Bonferroni correction.
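An illustrative call:

import numpy as np
from domino.util import holm_bonferroni_correction

pvals = [0.001, 0.04, 0.03, 0.2]
holm_bonferroni_correction(pvals, alpha=0.05)
# array([ True, False, False, False]) -- only the smallest p-value survives
# the sequential thresholds 0.05/4, 0.05/3, 0.05/2, 0.05/1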
def event_from_datetimes(events, d1, d2, subset_dict={}):
Build an integer event index over the date range from d1 to d2: days contained in the i-th list of events are labelled i+1 and all other days 0. subset_dict restricts the date range by attributes of the pandas DatetimeIndex. Two events on the same day are not supported.
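A usage sketch (the event dates and the 'month' subset are illustrative):

import pandas as pd
from domino.util import event_from_datetimes

heatwaves = pd.to_datetime(['2000-07-03', '2000-07-04'])
cold_spells = pd.to_datetime(['2000-01-10'])

index = event_from_datetimes([heatwaves, cold_spells], '2000-01-01', '2000-12-31')
# index is a DataArray over the full date range: 0 on ordinary days,
# 1 on heatwave days, 2 on cold-spell days.

# subset_dict filters the date range by DatetimeIndex attributes,
# e.g. restricting to winter months:
djf = event_from_datetimes([cold_spells], '2000-01-01', '2000-12-31',
                           subset_dict={'month': [12, 1, 2]})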
def xarr_times_to_ints(time_coord):
Convert a datetime64 time_coord to floats, scaled by cftime's unit conversion factor for days.
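A small sketch; assuming cftime's conversion factor for 'day' is in microseconds, the result is float days since 1970-01-01:

import pandas as pd
import xarray as xr
from domino.util import xarr_times_to_ints

time = xr.DataArray(pd.date_range('2000-01-01', periods=2), dims=['time'])
xarr_times_to_ints(time)   # array([10957., 10958.])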
def restrict(ds, extent_dict):
Subset the coords of a Dataset ds using extent_dict, a dictionary of the form {coord: [lower_bound, upper_bound], ...}. Bounds are inclusive, and keys that are not dims of ds are ignored.
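A usage sketch with an illustrative lat/lon grid:

import numpy as np
import xarray as xr
from domino.util import restrict

ds = xr.Dataset(
    {'z500': (('lat', 'lon'), np.zeros((5, 8)))},
    coords={'lat': np.arange(30, 80, 10), 'lon': np.arange(-40, 40, 10)},
)
subset = restrict(ds, {'lat': [40, 70], 'lon': [-10, 30]})
print(subset['z500'].shape)   # (4, 5) -- lat 40 to 70, lon -10 to 30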
def offset_indices(indices, offsets=None, infer_offset=True, attr_kw=None, offset_unit='days', dim='time'):
For a Dataset indices and either a dictionary of offsets ({data_var: offset, ...}) or offsets stored in an attribute attr_kw of each data variable, shift each index backwards along the dim coord by its offset and take the union of the results (via xr.merge).
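A sketch using infer_offset; the variable names and the 'offset' attribute below are illustrative, not a convention required by the function (any attribute name passed as attr_kw works):

import numpy as np
import pandas as pd
import xarray as xr
from domino.util import offset_indices

time = pd.date_range('2000-01-01', periods=10)
nao = xr.DataArray(np.random.rand(10), coords={'time': time}, dims=['time'],
                   attrs={'offset': 2})
blocking = xr.DataArray(np.random.rand(10), coords={'time': time}, dims=['time'],
                        attrs={'offset': 0})
indices = xr.Dataset({'nao': nao, 'blocking': blocking})

# 'nao' is shifted back by 2 days, 'blocking' is left alone, and the
# shifted indices are merged (outer join) on the common time coord.
aligned = offset_indices(indices, attr_kw='offset')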