|
| 1 | +#include "Util.h" |
| 2 | +#include "Parameters.h" |
| 3 | +#include "Matcher.h" |
| 4 | +#include "Debug.h" |
| 5 | +#include "DBReader.h" |
| 6 | +#include "DBWriter.h" |
| 7 | +#include "IndexReader.h" |
| 8 | +#include "MemoryMapped.h" |
| 9 | +#include "NcbiTaxonomy.h" |
| 10 | +#include "MappingReader.h" |
| 11 | + |
| 12 | +#define ZSTD_STATIC_LINKING_ONLY |
| 13 | +#include <zstd.h> |
| 14 | + |
| 15 | +#include <map> |
| 16 | + |
| 17 | +#ifdef OPENMP |
| 18 | +#include <omp.h> |
| 19 | +#endif |
| 20 | + |
| 21 | +int pairaln(int argc, const char **argv, const Command& command) { |
| 22 | + Parameters &par = Parameters::getInstance(); |
| 23 | + par.parseParameters(argc, argv, command, true, 0, 0); |
| 24 | + |
| 25 | + const bool sameDB = par.db1.compare(par.db2) == 0 ? true : false; |
| 26 | + const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); |
| 27 | + |
| 28 | + std::string db2NoIndexName = PrefilteringIndexReader::dbPathWithoutIndex(par.db2); |
| 29 | + MappingReader* mapping = new MappingReader(db2NoIndexName); |
| 30 | + const int dbaccessMode = (DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA); |
| 31 | + IndexReader qDbr(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); |
| 32 | + IndexReader *tDbr; |
| 33 | + if (sameDB) { |
| 34 | + tDbr = &qDbr; |
| 35 | + } else { |
| 36 | + tDbr = new IndexReader(par.db2, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); |
| 37 | + } |
| 38 | + |
| 39 | + DBReader<unsigned int> alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA); |
| 40 | + alnDbr.open(DBReader<unsigned int>::NOSORT); |
| 41 | + if(alnDbr.getSize() % 2 != 0){ |
| 42 | + Debug(Debug::ERROR) << "Alignment database does not seem to be paired\n"; |
| 43 | + EXIT(EXIT_FAILURE); |
| 44 | + } |
| 45 | + size_t localThreads = 1; |
| 46 | +#ifdef OPENMP |
| 47 | + localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); |
| 48 | +#endif |
| 49 | + |
| 50 | + DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), localThreads, par.compressed, Parameters::DBTYPE_ALIGNMENT_RES); |
| 51 | + resultWriter.open(); |
| 52 | + |
| 53 | + Debug::Progress progress(alnDbr.getSize()); |
| 54 | +#pragma omp parallel num_threads(localThreads) |
| 55 | + { |
| 56 | + unsigned int thread_idx = 0; |
| 57 | +#ifdef OPENMP |
| 58 | + thread_idx = static_cast<unsigned int>(omp_get_thread_num()); |
| 59 | +#endif |
| 60 | + char buffer[1024 + 32768*4]; |
| 61 | + std::string result; |
| 62 | + result.reserve(1024 * 1024); |
| 63 | + std::vector<Matcher::result_t> resultA; |
| 64 | + resultA.reserve(100000); |
| 65 | + std::vector<Matcher::result_t> resultB; |
| 66 | + resultB.reserve(100000); |
| 67 | + std::unordered_map<unsigned int, Matcher::result_t *> findPair; |
| 68 | + std::string outputA; |
| 69 | + outputA.reserve(100000); |
| 70 | + std::string outputB; |
| 71 | + outputB.reserve(100000); |
| 72 | +#pragma omp for schedule(static, 2) |
| 73 | + for (size_t i = 0; i < alnDbr.getSize(); i+=2) { |
| 74 | + progress.updateProgress(); |
| 75 | + progress.updateProgress(); |
| 76 | + resultA.clear(); |
| 77 | + resultB.clear(); |
| 78 | + outputA.clear(); |
| 79 | + outputB.clear(); |
| 80 | + Matcher::readAlignmentResults(resultA, alnDbr.getData(i, thread_idx), true); |
| 81 | + Matcher::readAlignmentResults(resultB, alnDbr.getData(i+1, thread_idx), true); |
| 82 | + |
| 83 | + for (size_t resIdx = 0; resIdx < resultA.size(); ++resIdx) { |
| 84 | + unsigned int taxon = mapping->lookup(resultA[resIdx].dbKey); |
| 85 | + if(findPair.find(taxon) == findPair.end()){ |
| 86 | + findPair.insert({taxon, &resultA[resIdx]}); |
| 87 | + } |
| 88 | + } |
| 89 | + // find pairs |
| 90 | + for (size_t resIdx = 0; resIdx < resultB.size(); ++resIdx) { |
| 91 | + unsigned int taxon = mapping->lookup(resultB[resIdx].dbKey); |
| 92 | + // found pair! |
| 93 | + if(findPair.find(taxon) != findPair.end()) { |
| 94 | + bool hasBacktrace = (resultB[resIdx].backtrace.size() > 0); |
| 95 | + size_t len = Matcher::resultToBuffer(buffer, *findPair[taxon], hasBacktrace, false, false); |
| 96 | + outputA.append(buffer, len); |
| 97 | + len = Matcher::resultToBuffer(buffer, resultB[resIdx], hasBacktrace, false, false); |
| 98 | + outputB.append(buffer, len); |
| 99 | + findPair.erase (taxon); |
| 100 | + } |
| 101 | + } |
| 102 | + resultWriter.writeData(outputA.c_str(), outputA.length(), alnDbr.getDbKey(i), thread_idx); |
| 103 | + resultWriter.writeData(outputB.c_str(), outputB.length(), alnDbr.getDbKey(i+1), thread_idx); |
| 104 | + } |
| 105 | + } |
| 106 | + |
| 107 | + resultWriter.close(); |
| 108 | + // clean up |
| 109 | + delete mapping; |
| 110 | + alnDbr.close(); |
| 111 | + if (sameDB == false) { |
| 112 | + delete tDbr; |
| 113 | + } |
| 114 | + |
| 115 | + return 0; |
| 116 | +} |
| 117 | + |
0 commit comments