-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathadding_rsIDs_v2.R
More file actions
84 lines (70 loc) · 3.67 KB
/
adding_rsIDs_v2.R
File metadata and controls
84 lines (70 loc) · 3.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
match_rsID <- function(build, input_filename, column_chromosome_number, column_chromosome_position){
#Load bioMart library
library(biomaRt)
if(build == "GRCh37"){
built_conditions <- c("ENSEMBL_MART_SNP", "hsapiens_snp", "https://grch37.ensembl.org",
'refsnp_id', 'chr_name', 'chrom_start', 'allele',
'chr_name', 'start', 'end')
}
if(build == "GRCh38"){
built_conditions <- c("ENSEMBL_MART_SNP", "hsapiens_snp", "https://oct2022.archive.ensembl.org",
'refsnp_id', 'chr_name', 'chrom_start', 'allele',
'chr_name', 'start', 'end')
}
## Use the default ENSEMBL datasets
snp = useMart(biomart = built_conditions[1],
dataset = built_conditions[2], host = built_conditions[3])
print("Loading dataset...", quote = F)
## Load data into the data frame, SNP_M
SNP_M <- read.table(input_filename, header = T)
print("Dataset loaded", quote = F)
print("Fetching rsIDs from ENSEMBL. This might take a while, and will depend on the number of genetic variants...", quote = F)
#Query data
for(i in 1:nrow(SNP_M)){
## Submit the query
output <- getBM(attributes = c(built_conditions[4], built_conditions[5], built_conditions[6], built_conditions[7]),
filters = c(built_conditions[8], built_conditions[9], built_conditions[10]),
values = list(SNP_M[i, column_chromosome_number], SNP_M[i, column_chromosome_position], SNP_M[i, column_chromosome_position]),
mart = snp)
# Create a list of chr:positions not found in GRCh37/38 database, and immediately write to file using append
if(nrow(output) == 0){
output_notfound <- data.frame(chromosome = NA, variant_position = NA)
output_notfound[i, 1] <- SNP_M[i, column_chromosome_number]
output_notfound[i, 2] <- SNP_M[i, column_chromosome_position]
output_notfound <- na.omit(output_notfound)
write.table(output_notfound, "unsuccesful_rsID_matches.txt", append = T, row.names = F, col.names = F)
}
# Immediately write rsID chr:position matches to file using append
else if (nrow(output) == 1){
write.table(output, "succesful_rsID_matches.txt", append = T, row.names = F, col.names = F)
}
# Filter out rsIDs not matching chr:positions and immediately write rsID chr:position matches to file using append
else if(nrow(output) > 1){
j <- 1
temp <- data.frame(refsnp_id = NA, chr_name = NA, chrom_start = NA, allele = NA)
while(j <= nrow(output)){
if(SNP_M[i,column_chromosome_position] == output[j, 3]){
temp <- rbind(temp, list(output[j, 1], output[j, 2], output[j, 3], output[j, 4]))
j <- j + 1
}else{
j <- j + 1
}
}
# Add only rsID chr:position matches to the file
write.table(output, "succesful_rsID_matches.txt", append = T, row.names = F, col.names = F)
}
}
print("Finished fetching rsIDs from ENSEMBL", quote = F)
print("Adding headers and removing duplicates...", quote = F)
success <- read.table("succesful_rsID_matches.txt", header = F)
colnames(success) <- c("refsnp_id", "chr_name", "chrom_start", "alleles")
success<- success[!duplicated(success), ]
success <- na.omit(success)
write.table(success, "succesful_rsID_matches.txt", row.names = F)
fail <- read.table("unsuccesful_rsID_matches.txt", header = F)
colnames(fail) <- c("chr_name", "chrom_start")
fail <- fail[!duplicated(fail), ]
fail <- na.omit(fail)
write.table(fail, "unsuccesful_rsID_matches.txt", row.names = F)
print("Done!", quote = F)
}