Data integration¶
In [3]:
Copied!
import scanpy as sc
import anndata
import os
import omicverse as ov
import Epiverse as ev
import scanpy as sc
import anndata
import os
import omicverse as ov
import Epiverse as ev
2023-08-16 17:43:23.672326: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2023-08-16 17:43:29.057535: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory 2023-08-16 17:43:29.058608: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory 2023-08-16 17:43:29.058641: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
In [4]:
Copied!
[i for i in os.listdir('./') if 'h5ad' in i]
[i for i in os.listdir('./') if 'h5ad' in i]
Out[4]:
['GSM7119498_9pcw_scATAC_filtered_peak_bc_atac.h5ad', 'GSM7119504_18pcw_scATAC_filtered_peak_bc_atac.h5ad', 'GSM7119508_26pcw_n_scATAC_filtered_peak_bc_atac.h5ad', 'GSM7119506_20pcw_scATAC_filtered_peak_bc_atac.h5ad', 'GSM7119502_14pcw_scATAC_filtered_peak_bc_atac.h5ad', 'GSM7119499_10pcw_scATAC_filtered_peak_bc_atac.h5ad', 'GSM7119503_15pcw_scATAC_filtered_peak_bc_atac.h5ad', 'GSM7119500_11pcw_scATAC_filtered_peak_bc_atac.h5ad', 'GSM7119505_19pcw_scATAC_filtered_peak_bc_atac.h5ad', 'GSM7119507_23pcw_scATAC_filtered_peak_bc_atac.h5ad', 'GSM7119501_11pcw_r_scATAC_filtered_peak_bc_atac.h5ad', 'GSM7119509_26pcw_o_scATAC_filtered_peak_bc_atac.h5ad']
In [45]:
Copied!
adata_dict={}
for name in [i for i in os.listdir('./') if 'h5ad' in i]:
adata_test=sc.read(name)
#adata_dict[name.split('_scATAC')[0]]=adata_test[:,adata_test.var['chrom']=='chr20']
adata_dict[name.split('_scATAC')[0]]=adata_test
adata_dict={}
for name in [i for i in os.listdir('./') if 'h5ad' in i]:
adata_test=sc.read(name)
#adata_dict[name.split('_scATAC')[0]]=adata_test[:,adata_test.var['chrom']=='chr20']
adata_dict[name.split('_scATAC')[0]]=adata_test
In [46]:
Copied!
peak_obj=ev.pp.ATAC_peaks(adata_dict,chr=list(set(adata_dict['GSM7119498_9pcw'].var['chrom'])),
ncpus=8,near_bp=500)
peak_obj=ev.pp.ATAC_peaks(adata_dict,chr=list(set(adata_dict['GSM7119498_9pcw'].var['chrom'])),
ncpus=8,near_bp=500)
In [47]:
Copied!
peak_obj.init()
peak_obj.init()
In [13]:
Copied!
%%time
peak_obj.concat(batch=3)
%%time
peak_obj.concat(batch=3)
chr4 chr5 chrY chr17 chr19 chr21 chr12 chr10 chrX chr8 chr9 chr14 chr15 chr3 chr7 chr18 chr2 chr20 chr16 chr13 chr6 chr22 chr11 chr1 CPU times: user 52.6 s, sys: 58 s, total: 1min 50s Wall time: 46min 26s
Out[13]:
| chrom | chromstart | chromend | range | range_s | median | new_peak | new_chrom | new_chromstart | new_chromend | |
|---|---|---|---|---|---|---|---|---|---|---|
| peak | ||||||||||
| chr4:10135-10199 | chr4 | 10135 | 10199 | 64 | 32 | 10167 | chr4:9942-10522 | chr4 | 9942 | 10522 |
| chr4:9967-10447 | chr4 | 9967 | 10447 | 480 | 240 | 10207 | chr4:9942-10522 | chr4 | 9942 | 10522 |
| chr4:9942-10522 | chr4 | 9942 | 10522 | 580 | 290 | 10232 | chr4:9942-10522 | chr4 | 9942 | 10522 |
| chr4:9890-10754 | chr4 | 9890 | 10754 | 864 | 432 | 10322 | chr4:9890-10754 | chr4 | 9890 | 10754 |
| chr4:10336-10868 | chr4 | 10336 | 10868 | 532 | 266 | 10602 | chr4:10336-10868 | chr4 | 10336 | 10868 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| chr1:248945415-248946390 | chr1 | 248945415 | 248946390 | 975 | 487 | 248945902 | chr1:248944244-248946321 | chr1 | 248944244 | 248946321 |
| chr1:248945899-248945964 | chr1 | 248945899 | 248945964 | 65 | 32 | 248945931 | chr1:248944244-248946321 | chr1 | 248944244 | 248946321 |
| chr1:248945627-248946274 | chr1 | 248945627 | 248946274 | 647 | 323 | 248945950 | chr1:248945049-248946340 | chr1 | 248945049 | 248946340 |
| chr1:248945754-248946173 | chr1 | 248945754 | 248946173 | 419 | 209 | 248945963 | chr1:248945385-248946377 | chr1 | 248945385 | 248946377 |
| chr1:248945908-248946075 | chr1 | 248945908 | 248946075 | 167 | 83 | 248945991 | chr1:248945415-248946390 | chr1 | 248945415 | 248946390 |
1859354 rows × 10 columns
In [15]:
Copied!
peak_obj.total_peak.to_csv('total_peaks.csv')
peak_obj.total_peak.to_csv('total_peaks.csv')
In [48]:
Copied!
peak_obj.total_peak=pd.read_csv('total_peaks.csv',index_col=0)
peak_obj.total_peak=pd.read_csv('total_peaks.csv',index_col=0)
In [35]:
Copied!
peak_obj.adata_dict[adata_key]
peak_obj.adata_dict[adata_key]
Out[35]:
AnnData object with n_obs × n_vars = 4218 × 111284
obs: 'sample'
var: 'chrom', 'chromstart', 'chromend'
In [36]:
Copied!
peak_obj.total_peak.loc[peak_obj.adata_dict[adata_key].var.index]['new_peak'].values
peak_obj.total_peak.loc[peak_obj.adata_dict[adata_key].var.index]['new_peak'].values
Out[36]:
array(['chr1:9848-10638', 'chr1:180644-181583', 'chr1:189319-192043', ...,
'chrY:56849301-56849591', 'chrY:56850307-56851083',
'chrY:56873822-56873842'], dtype=object)
In [49]:
Copied!
peak_obj.reindex(s=':-')
peak_obj.reindex(s=':-')
In [ ]:
Copied!
concat_adata=anndata.concat([adata1,adata2],join="outer",fill_value=0)
concat_adata=anndata.concat([adata1,adata2],join="outer",fill_value=0)
In [50]:
Copied!
for adata_key in peak_obj.adata_dict.keys():
peak_obj.adata_dict[adata_key].var_names_make_unique()
#peak_obj.adata_dict[adata_key]=peak_obj.adata_dict[adata_key][:,[i for i in peak_obj.adata_dict[adata_key].var_names if '-' not in i]]
for adata_key in peak_obj.adata_dict.keys():
peak_obj.adata_dict[adata_key].var_names_make_unique()
#peak_obj.adata_dict[adata_key]=peak_obj.adata_dict[adata_key][:,[i for i in peak_obj.adata_dict[adata_key].var_names if '-' not in i]]
In [51]:
Copied!
peak_obj.adata_dict[adata_key]
peak_obj.adata_dict[adata_key]
Out[51]:
AnnData object with n_obs × n_vars = 4218 × 111284
obs: 'sample'
var: 'chrom', 'chromstart', 'chromend', 'old_peak', 'new_chrom', 'new_chromstart', 'new_chromend'
In [52]:
Copied!
concat_adata1=peak_obj.merge(method='inner')
concat_adata1
concat_adata1=peak_obj.merge(method='inner')
concat_adata1
Out[52]:
AnnData object with n_obs × n_vars = 70519 × 8208
obs: 'sample'
var: 'chrom', 'new_chrom', 'new_chromstart', 'new_chromend'
In [53]:
Copied!
concat_adata2=peak_obj.merge(method='outer')
concat_adata2
concat_adata2=peak_obj.merge(method='outer')
concat_adata2
Out[53]:
AnnData object with n_obs × n_vars = 70519 × 680193
obs: 'sample'
In [55]:
Copied!
split = concat_adata2.var.index.str.split(r"[{}]".format(':-'))
concat_adata2.var["chrom"] = split.map(lambda x: x[0])
concat_adata2.var["chromstart"] = split.map(lambda x: x[1]).astype(int)
concat_adata2.var["chromend"] = split.map(lambda x: x[2]).astype(int)
split = concat_adata2.var.index.str.split(r"[{}]".format(':-'))
concat_adata2.var["chrom"] = split.map(lambda x: x[0])
concat_adata2.var["chromstart"] = split.map(lambda x: x[1]).astype(int)
concat_adata2.var["chromend"] = split.map(lambda x: x[2]).astype(int)
In [58]:
Copied!
concat_adata2.var['range']=[i-j for i,j in zip(concat_adata2.var["chromend"],concat_adata2.var["chromstart"]) ]
concat_adata2.var['range']=[i-j for i,j in zip(concat_adata2.var["chromend"],concat_adata2.var["chromstart"]) ]
In [61]:
Copied!
np.mean(concat_adata2.var['range'])
np.mean(concat_adata2.var['range'])
Out[61]:
1351.9430338154025
In [62]:
Copied!
concat_adata2
concat_adata2
Out[62]:
AnnData object with n_obs × n_vars = 70519 × 680193
obs: 'sample'
var: 'chrom', 'chromstart', 'chromend', 'range'
In [63]:
Copied!
concat_adata2.write_h5ad('adata_atac_concat.h5ad',compression='gzip')
concat_adata2.write_h5ad('adata_atac_concat.h5ad',compression='gzip')
In [67]:
Copied!
concat_adata2=sc.read('adata_atac_concat.h5ad')
concat_adata2=sc.read('adata_atac_concat.h5ad')
In [64]:
Copied!
concat_adata2.shape[0]*0.05
concat_adata2.shape[0]*0.05
Out[64]:
3525.9500000000003
In [68]:
Copied!
sc.pp.filter_genes(concat_adata2, min_cells = concat_adata2.shape[0]*0.01)
sc.pp.filter_genes(concat_adata2, min_cells = concat_adata2.shape[0]*0.01)
In [69]:
Copied!
concat_adata2
concat_adata2
Out[69]:
AnnData object with n_obs × n_vars = 70519 × 113778
obs: 'sample'
var: 'chrom', 'chromstart', 'chromend', 'range', 'n_cells'
In [71]:
Copied!
concat_adata2.write_h5ad('retina_atac_raw.h5ad',compression='gzip')
concat_adata2.write_h5ad('retina_atac_raw.h5ad',compression='gzip')
In [ ]:
Copied!