Skip to content

Commit 343a7bb

Browse files
committed
Merge commit '4244ec84fa90f9805382ba3b5a938c870d1b815f'
2 parents d6976c7 + 4244ec8 commit 343a7bb

File tree

9 files changed

+133
-74
lines changed

9 files changed

+133
-74
lines changed

.github/workflows/R-CMD-check.yaml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,9 @@ jobs:
1818
fail-fast: false
1919
matrix:
2020
config:
21-
- {os: macos-latest, r: 'release'}
22-
- {os: windows-latest, r: 'release'}
21+
- {os: macos-13, r: 'release'} #TODO remove some day when ChemmineOB works on Apple silicon
22+
- {os: macos-latest, r: 'release'}
23+
- {os: windows-latest, r: 'release'}
2324
- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
2425
- {os: ubuntu-latest, r: 'release'}
2526
- {os: ubuntu-latest, r: 'oldrel-1'}
@@ -39,7 +40,7 @@ jobs:
3940
http-user-agent: ${{ matrix.config.http-user-agent }}
4041
use-public-rspm: true
4142
- name: macOS openbabel
42-
if: matrix.config.os == 'macos-latest'
43+
if: contains(matrix.config.os, 'macos')
4344
run: |
4445
brew install open-babel
4546
- name: ubuntu openbabel

.github/workflows/test-coverage.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ jobs:
4949

5050
- name: Upload test results
5151
if: failure()
52-
uses: actions/upload-artifact@v3
52+
uses: actions/upload-artifact@v4
5353
with:
5454
name: coverage-test-failures
5555
path: ${{ runner.temp }}/package

DESCRIPTION

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ Description: Calculate estimated relative volatility index values for
1717
<doi:10.5194/acp-8-2773-2008> or modified SIMPOL.1 method as in
1818
Meredith et al. (2023) <doi:10.5194/acp-8-2773-2008>.
1919
License: MIT + file LICENSE
20-
URL: https://meredith-lab.github.io/volcalc/
20+
URL: https://meredith-lab.github.io/volcalc/, https://cct-datascience.r-universe.dev/volcalc
2121
BugReports: https://github.com/Meredith-Lab/volcalc/issues
2222
Imports:
2323
ChemmineOB,
@@ -26,7 +26,6 @@ Imports:
2626
fs,
2727
glue,
2828
httr2,
29-
KEGGREST,
3029
magrittr,
3130
purrr,
3231
rlang,

NEWS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
* adds a `validate = TRUE` option to `calc_vol()` and `get_fx_groups()` that returns `NA`s when there are suspected errors in parsing SMILES or .mol files. This is unfortunately not available on Windows due to differences in the Windows version of `ChemmineOB`.
44
* adds a dataset, `smarts_simpol1`, describing how functional groups are defined for the SIMPOL.1 and Meredith et al. methods
5+
* `KEGGREST` is no longer a dependency of `volcalc` (previously used in `get_mol_kegg()`)
56

67
# volcalc 2.1.2
78

R/get_mol_kegg.R

Lines changed: 102 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -13,32 +13,37 @@ utils::globalVariables(".data")
1313
#' @param force Logical; by default (`FALSE`), .mol files will not be downloaded
1414
#' if they are found in `dir`. Set this to `TRUE` to download and overwrite
1515
#' existing files.
16-
#'
16+
#'
17+
#' @note For additional functionality for interacting with KEGG, try the
18+
#' `KEGGREST` package, which this function was inspired by.
19+
#'
1720
#' @returns A tibble with the columns `compound_ids`, `pathway_ids` (if used),
1821
#' and `mol_paths` (paths to downloaded .mol files).
22+
#'
1923
#' @export
2024
#'
2125
#' @examples
2226
#' \dontrun{
2327
#' get_mol_kegg(compound_ids = c("C16181", "C06074"), dir = tempdir())
2428
#' get_mol_kegg(pathway_ids = "map00253", dir = tempdir())
2529
#' }
26-
get_mol_kegg <- function(compound_ids, pathway_ids, dir, force = FALSE){
27-
28-
if(missing(dir)) stop("`dir` is required")
29-
if ((missing(compound_ids) & missing(pathway_ids)) |
30-
!missing(compound_ids) & !missing(pathway_ids)) {
30+
get_mol_kegg <- function(compound_ids, pathway_ids, dir, force = FALSE) {
31+
if (missing(dir)) stop("`dir` is required")
32+
if (
33+
(missing(compound_ids) & missing(pathway_ids)) |
34+
!missing(compound_ids) & !missing(pathway_ids)
35+
) {
3136
stop("One of `compound_ids` or `pathway_ids` are required")
3237
}
33-
38+
3439
#if compounds are provided
3540
if (!missing(compound_ids)) {
3641
if (!all(stringr::str_detect(compound_ids, "^[C][:digit:]{5}$"))) {
3742
stop("Some compound_ids are not in the correct KEGG format")
3843
}
3944
fs::dir_create(dir)
4045
out_tbl <-
41-
tibble::tibble(compound_id = compound_ids) %>%
46+
tibble::tibble(compound_id = compound_ids) %>%
4247
dplyr::mutate(mol_path = fs::path(dir, .data$compound_id, ext = "mol"))
4348
}
4449
# if pathways are provided
@@ -47,30 +52,40 @@ get_mol_kegg <- function(compound_ids, pathway_ids, dir, force = FALSE){
4752
stop("Some pathway_ids are not in the correct KEGG format")
4853
}
4954
fs::dir_create(dir, pathway_ids)
50-
compound_ids_list <- lapply(pathway_ids, keggGetCompounds)
55+
compound_ids_list <- lapply(pathway_ids, get_compounds_kegg)
5156
names(compound_ids_list) <- pathway_ids
52-
out_tbl <-
53-
tibble::enframe(compound_ids_list, name = "pathway_id", value = "compound_id") %>%
54-
tidyr::unnest(tidyselect::everything()) %>%
55-
dplyr::mutate(mol_path = fs::path(dir, .data$pathway_id, .data$compound_id, ext = "mol"))
57+
out_tbl <-
58+
tibble::enframe(
59+
compound_ids_list,
60+
name = "pathway_id",
61+
value = "compound_id"
62+
) %>%
63+
tidyr::unnest(tidyselect::everything()) %>%
64+
dplyr::mutate(
65+
mol_path = fs::path(
66+
dir,
67+
.data$pathway_id,
68+
.data$compound_id,
69+
ext = "mol"
70+
)
71+
)
5672
}
57-
58-
if(isFALSE(force)) {
73+
74+
if (isFALSE(force)) {
5975
to_dl <- out_tbl$compound_id[!fs::file_exists(out_tbl$mol_path)]
6076
out_paths <- out_tbl$mol_path[!fs::file_exists(out_tbl$mol_path)]
6177
} else {
6278
to_dl <- out_tbl$compound_id
6379
out_paths <- out_tbl$mol_path
6480
}
65-
81+
6682
if (length(to_dl) == 0) {
6783
#if nothing to download, return early
6884
return(out_tbl)
6985
} else {
70-
7186
# Download mols
7287
mols <- dl_mol_kegg(to_dl)
73-
88+
7489
# write mol files
7590
.write_mol <- function(mol_clean, file_path) {
7691
utils::write.table(
@@ -81,76 +96,98 @@ get_mol_kegg <- function(compound_ids, pathway_ids, dir, force = FALSE){
8196
quote = FALSE
8297
)
8398
}
84-
99+
85100
mapply(.write_mol, mol_clean = mols, file_path = out_paths)
86-
101+
87102
return(out_tbl)
88103
}
89104
}
90105

91106

92107
#' Get list of KEGG compound IDs for given KEGG pathway
93108
#'
94-
#' This is a temporary helper function until this function is improved and
95-
#' pushed into KEGGREST package
96-
#'
97109
#' @param pathway string that is a KEGG identifier for a molecular pathway
98110
#' @noRd
99-
keggGetCompounds <- function(pathway){
100-
101-
resp <-
102-
httr2::request("https://rest.kegg.jp/") %>%
103-
httr2::req_url_path("link/cpd/") %>%
104-
httr2::req_url_path_append(pathway) %>%
105-
httr2::req_retry(max_tries = 3) %>%
111+
get_compounds_kegg <- function(pathway) {
112+
resp <-
113+
httr2::request("https://rest.kegg.jp/") %>%
114+
httr2::req_user_agent(
115+
"volcalc (https://github.com/Meredith-Lab/volcalc/)"
116+
) %>%
117+
httr2::req_url_path("link/cpd/") %>%
118+
httr2::req_url_path_append(pathway) %>%
119+
httr2::req_retry(max_tries = 3) %>%
106120
httr2::req_perform()
107-
108-
out <- resp %>%
109-
httr2::resp_body_string() %>%
110-
stringr::str_split_1("\n") %>%
121+
122+
out <- resp %>%
123+
httr2::resp_body_string() %>%
124+
stringr::str_split_1("\n") %>%
111125
stringr::str_extract("(?<=cpd:).*")
112126
out[!is.na(out)]
113-
114127
}
115128

116-
dl_mol_kegg <- function(compound_ids) {
117-
#balances compound_ids into groups of less than 10 to meet API guidelines
118-
compound_id_list <- split_to_list(compound_ids, max_len = 10)
119-
120-
#maps over list, but returns it to a single character vector to simplify wrangling code
121-
raw <-
122-
purrr::map(compound_id_list, function(x) KEGGREST::keggGet(x, option = "mol")) %>%
123-
purrr::list_c() %>%
124-
glue::glue_collapse()
125-
#split into multiples
126-
mols <- stringr::str_split(raw, "(?<=\\${4})", n = length(compound_ids)) %>%
127-
unlist() %>%
128-
stringr::str_trim(side = "left")
129-
130-
# Adds title to mol file because it is used later on by get_fx_groups()
131-
titles <- purrr::map(compound_id_list, function(x) { #for every group of <10 IDs
132-
KEGGREST::keggGet(x) %>%
133-
purrr::map_chr(function(names) { #for every ID
134-
purrr::pluck(names, "NAME", 1) %>% #get first element of NAME
135-
stringr::str_remove(";")
136-
})
137-
}) %>% unlist()
138-
purrr::map2(mols, titles, function(mol, title) {
139-
paste0(title, "\n\n\n", gsub(">.*", "", mol))
140-
})
141-
142-
}
129+
#' Get and wrangle mol files for a single API request of up to 10 IDs
130+
#' @noRd
131+
.dl_mol_kegg <- function(ids) {
132+
if (length(ids) > 10) {
133+
stop("Provide 10 or fewer IDs at a time")
134+
}
135+
req_names <-
136+
httr2::request("https://rest.kegg.jp/get") %>%
137+
httr2::req_user_agent(
138+
"volcalc (https://github.com/Meredith-Lab/volcalc/)"
139+
) %>%
140+
httr2::req_url_path_append(paste(ids, collapse = "+")) %>%
141+
httr2::req_retry(max_tries = 3)
142+
143+
resp_names <- httr2::req_perform(req_names) %>%
144+
httr2::resp_body_string()
145+
146+
# There's a lot of stuff in the response, but I only care about the compound name
147+
names <- resp_names %>%
148+
stringr::str_extract_all("(?<=NAME).+(?=\\n)") %>%
149+
unlist() %>%
150+
stringr::str_trim() %>%
151+
stringr::str_remove(";")
143152

153+
# get mol file
154+
req_mols <- req_names %>%
155+
httr2::req_url_path_append("mol")
144156

157+
resp_mols <- httr2::req_perform(req_mols) %>%
158+
httr2::resp_body_string()
159+
160+
# wrangle into valid mol files
161+
mols <- resp_mols %>%
162+
stringr::str_split("(?<=\\${4})", n = length(ids)) %>%
163+
unlist() %>%
164+
stringr::str_trim(side = "left")
165+
mols <-
166+
gsub(">.*", "", mols) #for some reason this pattern doesn't work with str_remove()
167+
168+
#add compound name in correct place
169+
paste0(names, "\n\n\n", mols)
170+
}
145171

172+
#' Split vector into list elements of max length
173+
#' @noRd
146174
split_to_list <- function(x, max_len = 10) {
147-
148-
if(length(x) > max_len) {
175+
if (length(x) > max_len) {
149176
n_groups <- ceiling(length(x) / max_len)
150177
split(x, f = cut(seq_along(x), breaks = n_groups)) %>%
151178
purrr::set_names(NULL)
152179
} else {
153180
list(x)
154181
}
155-
182+
}
183+
184+
#' Get mol files for compound_ids by splitting into groups of 10 and calling .dl_mol_kegg
185+
#' @noRd
186+
dl_mol_kegg <- function(compound_ids) {
187+
#balances compound_ids into groups of 10 or fewer to meet API guidelines
188+
compound_id_list <- split_to_list(compound_ids, max_len = 10)
189+
190+
#maps over list, but returns it to a single character vector to simplify wrangling code
191+
purrr::map(compound_id_list, .dl_mol_kegg, .progress = "Downloading") %>%
192+
purrr::list_c()
156193
}

README.Rmd

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,9 @@ For windows, `OpenBabel` is included in the `ChemmineOB` binary and does not nee
7777

7878
For other installation options see the [OpenBabel documentation](https://open-babel.readthedocs.io/en/latest/Installation/install.html) and `ChemmineOB` [install guide](https://github.com/girke-lab/ChemmineOB/blob/master/INSTALL)
7979

80+
> [!NOTE]
81+
> As of Dec 2024, `ChemmineOB` may fail to build on Macs with Apple silicon (https://github.com/girke-lab/ChemmineOB/issues/35), causing installation failure for `volcalc`.
82+
8083
## Basic Usage
8184

8285
This is a basic example which shows you how to get an estimated relative volatility index (`rvi`) for two example compounds *beta-2,3,4,5,6-Pentachlorocyclohexanol*, and *Succinate*.

README.md

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,11 @@ strings as input, and supports downloading .mol files directly from
4040

4141
## Installation
4242

43-
<!--
4443
Install from CRAN with
4544

4645
``` r
4746
install.packages("volcalc")
4847
```
49-
-->
5048

5149
You can install the development version of `volcalc` from GitHub with
5250

@@ -96,6 +94,11 @@ documentation](https://open-babel.readthedocs.io/en/latest/Installation/install.
9694
and `ChemmineOB` [install
9795
guide](https://github.com/girke-lab/ChemmineOB/blob/master/INSTALL)
9896

97+
> \[!NOTE\]
98+
> As of Dec 2024, `ChemmineOB` may fail to build on macs with Apple
99+
> silicon (<https://github.com/girke-lab/ChemmineOB/issues/35>) causing
100+
> installation failure for `volcalc`.
101+
99102
## Basic Usage
100103

101104
This is a basic example which shows you how to get an estimated relative
@@ -118,7 +121,7 @@ calc_vol(files$mol_path)
118121
#> mol_path formula name rvi category
119122
#> <chr> <chr> <chr> <dbl> <fct>
120123
#> 1 /var/folders/wr/by_lst2d2fngf67mknmgf4340000gn/T… C6H7Cl… beta… 6.98 high
121-
#> 2 /var/folders/wr/by_lst2d2fngf67mknmgf4340000gn/T… C4H6O4 Succ2.57 high
124+
#> 2 /var/folders/wr/by_lst2d2fngf67mknmgf4340000gn/T… C6H7Cl… beta6.98 high
122125

123126
#alternatively, supply a SMILES representation
124127
calc_vol(c("C1(C(C(C(C(C1Cl)Cl)Cl)Cl)Cl)O", "C(CC(=O)O)C(=O)O"), from = "smiles")

man/get_mol_kegg.Rd

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-get_mol_kegg.R

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,3 +110,14 @@ test_that("works with pathway modules", {
110110
expect_true(all(file.exists(out$mol_path)))
111111
})
112112

113+
test_that("one compound per mol", {
114+
skip_on_cran()
115+
skip_if_offline()
116+
117+
dir <- withr::local_tempdir()
118+
mols <- get_mol_kegg(c("C16181", "C00042"), dir = dir)
119+
mol1 <- readLines(mols$mol_path[1])
120+
mol2 <- readLines(mols$mol_path[2])
121+
expect_equal(sum(stringr::str_detect(mol1, "END")), 1)
122+
expect_equal(sum(stringr::str_detect(mol2, "END")), 1)
123+
})

0 commit comments

Comments
 (0)