@@ -13,32 +13,37 @@ utils::globalVariables(".data")
13
13
# ' @param force Logical; by default (`FALSE`), .mol files will not be downloaded
14
14
# ' if they are found in `dir`. Set this to `TRUE` to download and overwrite
15
15
# ' existing files.
16
- # '
16
+ # '
17
+ # ' @note For additional functionality for interacting with KEGG, try the
18
+ # ' `KEGGREST` package, which this function was inspired by.
19
+ # '
17
20
# ' @returns A tibble with the columns `compound_ids`, `pathway_ids` (if used),
18
21
# ' and `mol_paths` (paths to downloaded .mol files).
22
+ # '
19
23
# ' @export
20
24
# '
21
25
# ' @examples
22
26
# ' \dontrun{
23
27
# ' get_mol_kegg(compound_ids = c("C16181", "C06074"), dir = tempdir())
24
28
# ' get_mol_kegg(pathway_ids = "map00253", dir = tempdir())
25
29
# ' }
26
- get_mol_kegg <- function (compound_ids , pathway_ids , dir , force = FALSE ){
27
-
28
- if (missing(dir )) stop(" `dir` is required" )
29
- if ((missing(compound_ids ) & missing(pathway_ids )) |
30
- ! missing(compound_ids ) & ! missing(pathway_ids )) {
30
+ get_mol_kegg <- function (compound_ids , pathway_ids , dir , force = FALSE ) {
31
+ if (missing(dir )) stop(" `dir` is required" )
32
+ if (
33
+ (missing(compound_ids ) & missing(pathway_ids )) |
34
+ ! missing(compound_ids ) & ! missing(pathway_ids )
35
+ ) {
31
36
stop(" One of `compound_ids` or `pathway_ids` are required" )
32
37
}
33
-
38
+
34
39
# if compounds are provided
35
40
if (! missing(compound_ids )) {
36
41
if (! all(stringr :: str_detect(compound_ids , " ^[C][:digit:]{5}$" ))) {
37
42
stop(" Some compound_ids are not in the correct KEGG format" )
38
43
}
39
44
fs :: dir_create(dir )
40
45
out_tbl <-
41
- tibble :: tibble(compound_id = compound_ids ) %> %
46
+ tibble :: tibble(compound_id = compound_ids ) %> %
42
47
dplyr :: mutate(mol_path = fs :: path(dir , .data $ compound_id , ext = " mol" ))
43
48
}
44
49
# if pathways are provided
@@ -47,30 +52,40 @@ get_mol_kegg <- function(compound_ids, pathway_ids, dir, force = FALSE){
47
52
stop(" Some pathway_ids are not in the correct KEGG format" )
48
53
}
49
54
fs :: dir_create(dir , pathway_ids )
50
- compound_ids_list <- lapply(pathway_ids , keggGetCompounds )
55
+ compound_ids_list <- lapply(pathway_ids , get_compounds_kegg )
51
56
names(compound_ids_list ) <- pathway_ids
52
- out_tbl <-
53
- tibble :: enframe(compound_ids_list , name = " pathway_id" , value = " compound_id" ) %> %
54
- tidyr :: unnest(tidyselect :: everything()) %> %
55
- dplyr :: mutate(mol_path = fs :: path(dir , .data $ pathway_id , .data $ compound_id , ext = " mol" ))
57
+ out_tbl <-
58
+ tibble :: enframe(
59
+ compound_ids_list ,
60
+ name = " pathway_id" ,
61
+ value = " compound_id"
62
+ ) %> %
63
+ tidyr :: unnest(tidyselect :: everything()) %> %
64
+ dplyr :: mutate(
65
+ mol_path = fs :: path(
66
+ dir ,
67
+ .data $ pathway_id ,
68
+ .data $ compound_id ,
69
+ ext = " mol"
70
+ )
71
+ )
56
72
}
57
-
58
- if (isFALSE(force )) {
73
+
74
+ if (isFALSE(force )) {
59
75
to_dl <- out_tbl $ compound_id [! fs :: file_exists(out_tbl $ mol_path )]
60
76
out_paths <- out_tbl $ mol_path [! fs :: file_exists(out_tbl $ mol_path )]
61
77
} else {
62
78
to_dl <- out_tbl $ compound_id
63
79
out_paths <- out_tbl $ mol_path
64
80
}
65
-
81
+
66
82
if (length(to_dl ) == 0 ) {
67
83
# if nothing to download, return early
68
84
return (out_tbl )
69
85
} else {
70
-
71
86
# Download mols
72
87
mols <- dl_mol_kegg(to_dl )
73
-
88
+
74
89
# write mol files
75
90
.write_mol <- function (mol_clean , file_path ) {
76
91
utils :: write.table(
@@ -81,76 +96,98 @@ get_mol_kegg <- function(compound_ids, pathway_ids, dir, force = FALSE){
81
96
quote = FALSE
82
97
)
83
98
}
84
-
99
+
85
100
mapply(.write_mol , mol_clean = mols , file_path = out_paths )
86
-
101
+
87
102
return (out_tbl )
88
103
}
89
104
}
90
105
91
106
92
107
#' Get list of KEGG compound IDs for given KEGG pathway
#'
#' Sends a request to the KEGG REST `link/cpd/<pathway>` endpoint (retrying up
#' to 3 times) and keeps, for every line of the response body that contains
#' `cpd:`, the text that follows that prefix.
#'
#' @param pathway string that is a KEGG identifier for a molecular pathway
#' @returns character vector with the text following `cpd:` on each response
#'   line; lines without `cpd:` are dropped
#' @noRd
get_compounds_kegg <- function(pathway) {
  # build the request one step at a time
  req <- httr2::request("https://rest.kegg.jp/")
  req <- httr2::req_user_agent(
    req,
    "volcalc (https://github.com/Meredith-Lab/volcalc/)"
  )
  req <- httr2::req_url_path(req, "link/cpd/")
  req <- httr2::req_url_path_append(req, pathway)
  req <- httr2::req_retry(req, max_tries = 3)

  body <- httr2::resp_body_string(httr2::req_perform(req))

  # one response line per linked compound; lines with no "cpd:" give NA
  body_lines <- stringr::str_split_1(body, "\n")
  ids <- stringr::str_extract(body_lines, "(?<=cpd:).*")
  ids[!is.na(ids)]
}
115
128
116
#' Get and wrangle mol files for a single API request of up to 10 IDs
#'
#' Performs two KEGG REST requests for the same set of IDs: `get/<ids>` to
#' read each compound's first `NAME` entry, and `get/<ids>/mol` to read the mol
#' records. Each returned string is the compound name, three newlines, and the
#' mol record with everything from the first `>` onwards removed.
#'
#' @param ids character vector of at most 10 KEGG compound IDs
#' @returns character vector with one element per ID
#' @noRd
.dl_mol_kegg <- function(ids) {
  if (length(ids) > 10) {
    stop("Provide 10 or fewer IDs at a time")
  }
  # nothing to ask for; avoid sending a request with an empty ID list
  if (length(ids) == 0) {
    return(character(0))
  }

  req_names <-
    httr2::request("https://rest.kegg.jp/get") %>%
    httr2::req_user_agent(
      "volcalc (https://github.com/Meredith-Lab/volcalc/)"
    ) %>%
    httr2::req_url_path_append(paste(ids, collapse = "+")) %>%
    httr2::req_retry(max_tries = 3)

  resp_names <- httr2::req_perform(req_names) %>%
    httr2::resp_body_string()

  # Only the compound name is needed from the flat-file response. Anchor on a
  # line that *starts* with NAME so that "NAME" appearing inside another field
  # cannot be picked up as an extra compound name.
  name_matches <- stringr::str_match_all(
    resp_names,
    stringr::regex("^NAME[ \\t]+(.+)$", multiline = TRUE)
  )[[1]]
  names <- name_matches[, 2] %>%
    stringr::str_trim() %>%
    stringr::str_remove(";") # first name ends in ";" when there are synonyms

  # get mol file: same request with "/mol" appended
  req_mols <- req_names %>%
    httr2::req_url_path_append("mol")

  resp_mols <- httr2::req_perform(req_mols) %>%
    httr2::resp_body_string()

  # wrangle into valid mol files: records are terminated by "$$$$"
  mols <- resp_mols %>%
    stringr::str_split("(?<=\\${4})", n = length(ids)) %>%
    unlist() %>%
    stringr::str_trim(side = "left")
  # Drop everything from the first ">" to the end of the record. Base R's
  # default regex engine lets "." match newlines, so this removes the trailing
  # multi-line block; stringr's "." stops at a newline, which is why
  # str_remove() with the same pattern does not work here.
  mols <- gsub(">.*", "", mols)

  # Names and mol records are paired purely by position. If KEGG returned
  # fewer of either than requested, paste0() would silently recycle and attach
  # names to the wrong (or empty) mol text, so fail loudly instead.
  if (
    length(names) != length(ids) ||
      length(mols) != length(ids) ||
      any(!nzchar(trimws(mols)))
  ) {
    stop("KEGG did not return a name and a mol file for every requested ID")
  }

  # add compound name in correct place
  paste0(names, "\n\n\n", mols)
}
145
171
172
#' Split vector into list elements of max length
#'
#' @param x vector to split
#' @param max_len largest number of elements wanted in any one list element
#' @returns unnamed list. When `x` has at most `max_len` elements it is
#'   returned as the single element of a list; otherwise `x` is cut into
#'   `ceiling(length(x) / max_len)` consecutive groups.
#' @noRd
split_to_list <- function(x, max_len = 10) {
  # already short enough: wrap and return
  if (length(x) <= max_len) {
    return(list(x))
  }

  n_groups <- ceiling(length(x) / max_len)
  # cut() on the positions assigns consecutive elements to the same group
  grouping <- cut(seq_along(x), breaks = n_groups)
  unname(split(x, f = grouping))
}
183
+
184
#' Get mol files for compound_ids by splitting into groups of 10 and calling
#' .dl_mol_kegg
#'
#' @param compound_ids vector of KEGG compound IDs to download
#' @returns the per-group results of `.dl_mol_kegg()` combined into one vector
#' @noRd
dl_mol_kegg <- function(compound_ids) {
  # groups of at most 10 IDs, one API round per group
  id_batches <- split_to_list(compound_ids, max_len = 10)

  # one result per batch, shown with a "Downloading" progress bar
  per_batch <- purrr::map(id_batches, .dl_mol_kegg, .progress = "Downloading")

  # flatten back to a single vector to simplify downstream wrangling
  purrr::list_c(per_batch)
}
0 commit comments