forked from galerp/helbig_lab_hpo_sim
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprecalc_IC.R
125 lines (77 loc) · 3.48 KB
/
precalc_IC.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
library(tidyverse)
library(memoise)
start <- Sys.time()
source("hpo_dist_helpers.R")
message("\n Local_IC file is read. Calculating the similarity_matrix \n \n ")
ic <- local_IC[,c("term","Propagated.local.IC")]
hpo_all <- allHPOs[,c("term","definition")]
ic2 <- merge(ic,hpo_all,by='term',all.x=TRUE,all.y=FALSE)
names(local_IC)[1] = "HPO"
ic2 <- local_IC[,c("HPO","Propagated.local.IC")]
memo_mica <- memoise(mica)
############
#Run Similarity Analysis on entire cohort
############
sim_score <- Compare_Cohort(pat_table_base)
write.csv(sim_score,paste0(input.yaml$output_dir,"/sim_matrix.csv"),row.names = T)
###########
#STEP 4: Create Distribution of Similarity Scores
###########
message("\n Sim score file is written. Permutation Analysis is to be followed \n \n ")
rownames(sim_score) = names(sim_score)
num_iterations = input.yaml$num_of_iterations
n2_100k <- sim_pat_draw(sim_score, 2,num_iterations)
n3_100k <- sim_pat_draw(sim_score, 3,num_iterations)
n4_100k <- sim_pat_draw(sim_score, 4,num_iterations)
n5_100k <- sim_pat_draw(sim_score, 5,num_iterations)
n7_100k <- sim_pat_draw(sim_score, 7,num_iterations)
###########
#STEP 4: Generate P-values for Genes
###########
message("\n Permutation is done. p-value for the genes is calculated \n \n ")
names(variant)[1] <- "famID"
variant <- variant %>% mutate(famID = gsub("-","_", famID))
famIDs_var <- variant$famID %>% unique %>% as.data.frame %>%
dplyr::rename('famID' = '.') %>% dplyr::mutate(var = "variant")
famIDs_sim <- sim_score %>% rownames %>% as.data.frame %>%
dplyr::rename('famID' = '.') %>% dplyr::mutate(sim = "sim_score")
fam_combined <- famIDs_sim %>% dplyr::full_join(famIDs_var)
#Filtering data
##Trios without sim_scores and without variants
no_sim <- as.vector(fam_combined$famID[is.na(fam_combined$sim) == TRUE])
no_var <- as.vector(fam_combined$famID[is.na(fam_combined$var) == TRUE])
#Trios with sim_scores
all_sim <- as.vector(fam_combined$famID[is.na(fam_combined$sim) == FALSE])
variant_sim <- variant %>% filter(famID %in% all_sim)
denovo <- denovo_calc(variant_sim)
#Table of denovos with famID and gene
tab1 <- denovo %>% dplyr::select(famID, Gene.refGene) %>% unique
#list of all genes
all_genes <- tab1 %>% dplyr::count(Gene.refGene) %>%
dplyr::rename(gene = Gene.refGene, Freq = n) %>%
filter(Freq > 1)
#Creating dataframe of similarity comparisons with every combination of patient pairs
##with the same denove gene
#initialize with first gene
pair_corrected <- gene_df(all_genes$gene[1])
#Create for all genes
for (i in 2:nrow(all_genes)){
temp <- gene_df(all_genes$gene[i])
pair_corrected <- pair_corrected %>% bind_rows(temp)
}
#Determine number of pairs per gene
gene_x <- unique(pair_corrected$gene)
gene_count <- as.data.frame(matrix(ncol=9,nrow=length(gene_x)))
names(gene_count) <- c("gene","n_pats","pairs","av_sim","median_sim","mode_sim","p_av","p_median","p_mode")
gene_count <- gene_count %>% mutate(gene = gene_x)
#Finding the average, median, and mode similarity,for each gene in the cohort
#########
#Find the p_value for each gene's similarity
## using the average, median, and mode similarity,for each gene from gene_count table
#########
#loop through each gene for p_av, p_med
gene_stat = gene_compute(gene_count)
write.csv(gene_stat,paste0(input.yaml$output_dir,"/gene_count.csv"),row.names = F)
message("\n The Entire script is now run successfully. Please find the Final output file gene_count.csv in the output directory \n ")
stop = Sys.time()
stop - start