SciLifeLab · MatthiasZepper · Nov 13, 2024 · Nov 13, 2024 · Nov 13, 2024 · Nov 13, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,27 +1,27 @@
 [package]
 name = "umi-transfer"
-version = "1.5.0"
+version = "1.6.0"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-clap = { version = "4.3.11", features = ["derive"] }
-bio = "1.6.0"
-flate2 = "1.0.24"
-itertools = "0.12.1"
-file-format = "0.24.0"
-anyhow = "1.0.82"
+clap = { version = "4.5.28", features = ["derive"] }
+bio = "2.0.3"
+flate2 = "1.0.35"
+itertools = "0.14.0"
+file-format = "0.26.0"
+anyhow = "1.0.95"
 dialoguer = "0.11.0"
-regex = "1.10.4"
-owo-colors = { version = "4.0", features = ["supports-colors"] }
+regex = "1.11.1"
+owo-colors = { version = "4.1", features = ["supports-colors"] }
 gzp = "0.11.3"
 
 [dev-dependencies]
-assert_cmd = "2.0.14"
-assert_fs = "1.1.1"
-predicates = "3.1.0"
-rexpect = "0.5.0"
+assert_cmd = "2.0.16"
+assert_fs = "1.1.2"
+predicates = "3.1.3"
+rexpect = "0.6.0"
 
 [workspace.metadata.marker.lints]
 marker_lints = "0.5.0"
diff --git a/README.md b/README.md
@@ -96,12 +96,12 @@ That should create an executable `target/release/umi-transfer` that can be place
 
 ```shell
 ./target/release/umi-transfer --version
-umi-transfer 1.5.0
+umi-transfer 1.6.0
 ```
 
 ## Usage
 
-The tool requires three FastQ files as input. You can manually specify the names and location of the output files with `--out` and `--out2` or the tool will automatically append a `with_UMI` suffix to your input file names. It additionally accepts to choose a custom UMI delimiter with `--delim` and to set the flags `-f`, `-c` and `-z`.
+The tool requires three FastQ files as input. You can manually specify the names and location of the output files with `--out` and `--out2` or the tool will automatically append a `with_UMI` suffix to your input file names. It additionally allows you to choose a custom UMI delimiter with `--delim` and the position of the integrated UMI with `--position`, and to set the flags `-f`, `-c` and `-z`.
 
 `-c` is used to ensure the canonical `1` and `2` of paired files as read numbers in the output, regardless of the read numbers of the input reads. `-f` / `--force` will overwrite existing output files without prompting the user and `-z` enables the internal compression of the output files. Alternatively, you can also specify an output file name with `.gz` suffix to obtain compressed output.
 
@@ -114,6 +114,10 @@ Integrate UMIs from a separate FastQ file
 Usage: umi-transfer external [OPTIONS] --in <R1_IN> --in2 <R2_IN> --umi <RU_IN>
 
 Options:
+  -p, --position <TARGET_POSITION>
+          Choose the target position for the UMI: 'header' or 'inline'. Defaults to 'header'.
+
+            [default: header] [possible values: header, inline]
   -c, --correct_numbers
           Read numbers will be altered to ensure the canonical read numbers 1 and 2 in output file sequence headers.
 
@@ -127,7 +131,7 @@ Options:
 
 
   -t, --threads <NUM_THREADS>
-          Number of threads to use for processing. Defaults to the number of logical cores available.
+          Maximum number of threads to use for processing. Preferably pick odd numbers, 9 or 11 recommended. Defaults to the maximum number of cores available.
 
 
   -f, --force
@@ -181,7 +185,7 @@ umi-transfer external --in read1.fastq --in2 read1.fastq --umi read2.fastq --out
 ### Benchmarks and parameter recommendations
 
 
-With the release of version 1.5,  `umi-transfer` features internal multi-threaded output compression. As a result,  `umi-transfer` 1.5 now runs approximately 25 times faster than version 1.0 when using internal compression and about twice as fast compared to using an external compression tool. This improvement is enabled by the outstanding [`gzp` crate](https://github.com/sstadick/gzp), which abstracts a lot of the underlying complexity away from the main software.
+Since the release of version 1.5, `umi-transfer` features internal multi-threaded output compression. As a result, `umi-transfer` 1.5 now runs approximately 25 times faster than version 1.0 when using internal compression and about twice as fast compared to using an external compression tool. This improvement is enabled by the outstanding [`gzp` crate](https://github.com/sstadick/gzp), which abstracts a lot of the underlying complexity away from the main software.
 
 ![Benchmark of different tool versions](docs/img/benchmark_umi-transfer-version.svg)
 

diff --git a/src/main.rs b/src/main.rs
@@ -11,6 +11,7 @@ use crate::umi_external::OptsExternal;
 ///use crate::umi_internal::OptsInternal;
 mod auxiliary;
 mod file_io;
+mod read_editing;
 mod umi_errors;
 mod umi_external;
 
@@ -25,7 +26,7 @@ https://github.com/SciLifeLab/umi-transfer
 
 #[derive(clap::Parser)]
 #[clap(
-    version = "1.5.0",
+    version = "1.6.0",
     author = "Written by Matthias Zepper, Judit Hohenthal & Johannes Alneberg",
     about = "A tool for transferring Unique Molecular Identifiers (UMIs).",
     long_about = "Most tools capable of using UMIs to increase the accuracy of quantitative DNA sequencing experiments expect the respective UMI sequence to be embedded into the reads' IDs. You can use `umi-transfer external` to retrieve UMIs from a separate FastQ file and embed them to the IDs of your paired FastQ files."

diff --git a/src/read_editing.rs b/src/read_editing.rs
@@ -0,0 +1,155 @@
+// Destination of the UMI in the output reads, selectable on the command line
+// through the derived `clap::ValueEnum` implementation.
+//
+// Plain `//` comments are used on purpose: clap turns `///` doc comments on
+// value-enum variants into help text, which would alter the `--help` output.
+//
+// `Copy`, `PartialEq` and `Eq` are derived because the enum carries no data,
+// so values can be passed around and compared freely.
+#[derive(clap::ValueEnum, Clone, Copy, Debug, PartialEq, Eq)]
+pub enum UMIDestination {
+    // The UMI is appended to the read ID.
+    Header,
+    // The UMI and its base qualities are prepended to the read sequence and quality.
+    Inline,
+}
+
+/// Appends a UMI to the ID of a FastQ record and optionally rewrites its read number.
+///
+/// The new ID is `<id><delimiter><umi>`; sequence and quality are carried over unchanged.
+///
+/// # Arguments
+/// * `input` - the record to annotate.
+/// * `umi` - the UMI bases; must be valid UTF-8.
+/// * `umi_sep` - delimiter placed between the ID and the UMI, `":"` when `None`.
+/// * `edit_nr` - when `Some(n)`, the first character of the description is replaced by `n`.
+///
+/// # Errors
+/// Returns an error (instead of panicking) when the UMI is not valid UTF-8, or when
+/// `edit_nr` is set but the record has no description or its description does not
+/// start with a single-byte character that can be replaced.
+pub fn umi_to_record_header(
+    input: bio::io::fastq::Record,
+    umi: &[u8],
+    umi_sep: Option<&String>,
+    edit_nr: Option<u8>,
+) -> Result<bio::io::fastq::Record, anyhow::Error> {
+    let delim = umi_sep.map_or(":", |sep| sep.as_str()); // the delimiter for the UMI
+    let umi = std::str::from_utf8(umi).map_err(|err| {
+        anyhow::anyhow!("UMI for read '{}' is not valid UTF-8: {}", input.id(), err)
+    })?;
+    let new_id = [input.id(), delim, umi].concat();
+
+    // Only build a new description when the read number has to be rewritten.
+    let edited_desc = match edit_nr {
+        Some(number) => {
+            let desc = input.desc().ok_or_else(|| {
+                anyhow::anyhow!(
+                    "read '{}' has no description, cannot correct its read number",
+                    input.id()
+                )
+            })?;
+            // `get(1..)` yields `None` for an empty description or a multi-byte first
+            // character, the two cases in which `replace_range(0..1, ..)` would panic.
+            let rest = desc.get(1..).ok_or_else(|| {
+                anyhow::anyhow!(
+                    "description of read '{}' does not start with a replaceable read number",
+                    input.id()
+                )
+            })?;
+            Some(format!("{number}{rest}"))
+        }
+        None => None,
+    };
+    let desc = edited_desc.as_deref().or_else(|| input.desc());
+
+    Ok(bio::io::fastq::Record::with_attrs(
+        &new_id,
+        desc,
+        input.seq(),
+        input.qual(),
+    ))
+}
+
+/// Prepends a UMI and its base qualities to the sequence and quality of a FastQ record.
+///
+/// The ID is left untouched; the description is only changed when `edit_nr` is set.
+///
+/// # Arguments
+/// * `input` - the record to extend.
+/// * `umi` - the UMI bases, placed in front of the read sequence.
+/// * `umi_qual` - the UMI base qualities, placed in front of the read qualities.
+/// * `edit_nr` - when `Some(n)`, the first character of the description is replaced by `n`.
+///
+/// # Errors
+/// Returns an error (instead of panicking or writing a malformed record) when the UMI
+/// and its qualities differ in length or are not valid UTF-8, or when `edit_nr` is set
+/// but the record has no description or its description does not start with a
+/// single-byte character that can be replaced.
+pub fn umi_to_record_seq(
+    input: bio::io::fastq::Record,
+    umi: &[u8],
+    umi_qual: &[u8],
+    edit_nr: Option<u8>,
+) -> Result<bio::io::fastq::Record, anyhow::Error> {
+    if umi.len() != umi_qual.len() {
+        anyhow::bail!(
+            "UMI for read '{}' has {} bases but {} quality values",
+            input.id(),
+            umi.len(),
+            umi_qual.len()
+        );
+    }
+    // `Record::with_attrs` takes byte slices but unwraps a UTF-8 conversion internally
+    // (rust-bio 2.x), so the externally supplied bytes are validated here up front.
+    if std::str::from_utf8(umi).is_err() || std::str::from_utf8(umi_qual).is_err() {
+        anyhow::bail!(
+            "UMI or UMI qualities for read '{}' are not valid UTF-8",
+            input.id()
+        );
+    }
+
+    let concatenated_seq = [umi, input.seq()].concat();
+    let concatenated_qual = [umi_qual, input.qual()].concat();
+
+    // Only build a new description when the read number has to be rewritten.
+    let edited_desc = match edit_nr {
+        Some(number) => {
+            let desc = input.desc().ok_or_else(|| {
+                anyhow::anyhow!(
+                    "read '{}' has no description, cannot correct its read number",
+                    input.id()
+                )
+            })?;
+            // `get(1..)` yields `None` for an empty description or a multi-byte first
+            // character, the two cases in which `replace_range(0..1, ..)` would panic.
+            let rest = desc.get(1..).ok_or_else(|| {
+                anyhow::anyhow!(
+                    "description of read '{}' does not start with a replaceable read number",
+                    input.id()
+                )
+            })?;
+            Some(format!("{number}{rest}"))
+        }
+        None => None,
+    };
+    let desc = edited_desc.as_deref().or_else(|| input.desc());
+
+    Ok(bio::io::fastq::Record::with_attrs(
+        input.id(),
+        desc,
+        &concatenated_seq,
+        &concatenated_qual,
+    ))
+}
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+
+    // Shared fixture: one Illumina-style read and the UMI that gets attached to it.
+    const READ_ID: &str = "@SCILIFELAB:500:NGISTLM:1:1101:2446:1031";
+    const READ_DESC: &str = "1:N:0:GCTTCAGGGT+AAGGTAGCGT";
+    const READ_SEQ: &[u8] = b"TCGTTTTCCGC";
+    const READ_QUAL: &[u8] = b"FFFFFFFFFFF";
+    const UMI: &[u8] = b"ACCAGCTA";
+    const UMI_QUAL: &[u8] = b"########";
+
+    // Builds a fresh copy of the fixture read for every test.
+    fn fixture_read() -> bio::io::fastq::Record {
+        bio::io::fastq::Record::with_attrs(READ_ID, Some(READ_DESC), READ_SEQ, READ_QUAL)
+    }
+
+    // Header mode with a custom delimiter and a rewritten read number.
+    #[test]
+    fn test_umi_to_record_header_with_edits() {
+        let delimiter = String::from("_");
+
+        let annotated =
+            umi_to_record_header(fixture_read(), UMI, Some(&delimiter), Some(5)).unwrap();
+
+        assert_eq!(
+            annotated.id(),
+            "@SCILIFELAB:500:NGISTLM:1:1101:2446:1031_ACCAGCTA"
+        );
+        assert_eq!(annotated.desc(), Some("5:N:0:GCTTCAGGGT+AAGGTAGCGT"));
+        assert_eq!(annotated.seq(), READ_SEQ);
+        assert_eq!(annotated.qual(), READ_QUAL);
+    }
+
+    // Header mode with the ':' delimiter and an untouched description.
+    #[test]
+    fn test_umi_to_record_header_plain() {
+        let delimiter = String::from(":");
+
+        let annotated = umi_to_record_header(fixture_read(), UMI, Some(&delimiter), None).unwrap();
+
+        assert_eq!(
+            annotated.id(),
+            "@SCILIFELAB:500:NGISTLM:1:1101:2446:1031:ACCAGCTA"
+        );
+        assert_eq!(annotated.desc(), Some(READ_DESC));
+        assert_eq!(annotated.seq(), READ_SEQ);
+        assert_eq!(annotated.qual(), READ_QUAL);
+    }
+
+    // Inline mode with a rewritten read number.
+    #[test]
+    fn test_umi_to_record_seq_with_edit_nr() {
+        let extended = umi_to_record_seq(fixture_read(), UMI, UMI_QUAL, Some(5)).unwrap();
+
+        assert_eq!(extended.id(), READ_ID);
+        assert_eq!(extended.desc(), Some("5:N:0:GCTTCAGGGT+AAGGTAGCGT"));
+        assert_eq!(extended.seq(), b"ACCAGCTATCGTTTTCCGC");
+        assert_eq!(extended.qual(), b"########FFFFFFFFFFF");
+    }
+
+    // Inline mode with an untouched description.
+    #[test]
+    fn test_umi_to_record_seq_without_edit_nr() {
+        let extended = umi_to_record_seq(fixture_read(), UMI, UMI_QUAL, None).unwrap();
+
+        assert_eq!(extended.id(), READ_ID);
+        assert_eq!(extended.desc(), Some(READ_DESC));
+        assert_eq!(extended.seq(), b"ACCAGCTATCGTTTTCCGC");
+        assert_eq!(extended.qual(), b"########FFFFFFFFFFF");
+    }
+}
diff --git a/src/umi_external.rs b/src/umi_external.rs
@@ -5,16 +5,26 @@ use std::path::PathBuf;
 
 use super::file_io;
 use crate::auxiliary::{threads_available, threads_per_task};
+use crate::read_editing::{umi_to_record_header, umi_to_record_seq, UMIDestination};
 use crate::umi_errors::RuntimeErrors;
 #[derive(Debug, Parser)]
 pub struct OptsExternal {
+    #[clap(
+        short = 'p',
+        long = "position",
+        help = "Choose the target position for the UMI: 'header' or 'inline'. Defaults to 'header'.
+        \n ",
+        default_value = "header"
+    )]
+    target_position: UMIDestination,
     #[clap(
         short = 'c',
         long = "correct_numbers",
         help = "Read numbers will be altered to ensure the canonical read numbers 1 and 2 in output file sequence headers.
         \n "
     )]
     edit_nr: bool,
+
     #[clap(
         short = 'z',
         long = "gzip",
@@ -183,7 +193,15 @@ pub fn run(args: OptsExternal) -> Result<i32> {
         if r1_rec.id().eq(ru_rec.id()) {
             // Write to Output file
             let read_nr = if edit_nr { Some(1) } else { None };
-            let r1_rec = update_record(r1_rec, ru_rec.seq(), args.delim.as_ref(), read_nr)?;
+
+            let r1_rec = match args.target_position {
+                UMIDestination::Header => {
+                    umi_to_record_header(r1_rec, ru_rec.seq(), args.delim.as_ref(), read_nr)
+                }
+                UMIDestination::Inline => {
+                    umi_to_record_seq(r1_rec, ru_rec.seq(), ru_rec.qual(), read_nr)
+                }
+            }?;
 
             write_output_r1.write_record(r1_rec)?;
         } else {
@@ -193,7 +211,15 @@ pub fn run(args: OptsExternal) -> Result<i32> {
         if r2_rec.id().eq(ru_rec.id()) {
             // Write to Output file
             let read_nr = if edit_nr { Some(2) } else { None };
-            let r2_rec = update_record(r2_rec, ru_rec.seq(), args.delim.as_ref(), read_nr)?;
+
+            let r2_rec = match args.target_position {
+                UMIDestination::Header => {
+                    umi_to_record_header(r2_rec, ru_rec.seq(), args.delim.as_ref(), read_nr)
+                }
+                UMIDestination::Inline => {
+                    umi_to_record_seq(r2_rec, ru_rec.seq(), ru_rec.qual(), read_nr)
+                }
+            }?;
 
             write_output_r2.write_record(r2_rec)?;
         } else {
@@ -203,27 +229,3 @@ pub fn run(args: OptsExternal) -> Result<i32> {
     println!("Processed {:?} records", counter);
     Ok(counter)
 }
-
-// Updates the header and description of the reads accordingly
-fn update_record(
-    input: bio::io::fastq::Record,
-    umi: &[u8],
-    umi_sep: Option<&String>,
-    edit_nr: Option<u8>,
-) -> Result<bio::io::fastq::Record> {
-    let delim = umi_sep.as_ref().map(|s| s.as_str()).unwrap_or(":"); // the delimiter for the UMI
-    if let Some(number) = edit_nr {
-        let new_id = &[input.id(), delim, std::str::from_utf8(umi).unwrap()].concat();
-        let mut new_desc = String::from(input.desc().unwrap());
-        new_desc.replace_range(0..1, &number.to_string());
-        let desc: Option<&str> = Some(&new_desc);
-        let new_record =
-            bio::io::fastq::Record::with_attrs(new_id, desc, input.seq(), input.qual());
-        Ok(new_record)
-    } else {
-        let new_id = &[input.id(), delim, std::str::from_utf8(umi).unwrap()].concat();
-        let new_record =
-            bio::io::fastq::Record::with_attrs(new_id, input.desc(), input.seq(), input.qual());
-        Ok(new_record)
-    }
-}