Skip to content

Commit 9a0df0d

Browse files
Add pairaln
1 parent be11943 commit 9a0df0d

File tree

4 files changed

+128
-2
lines changed

4 files changed

+128
-2
lines changed

src/CommandDeclarations.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ extern int tsv2exprofiledb(int argc, const char **argv, const Command& command);
4646
extern int enrich(int argc, const char **argv, const Command& command);
4747
extern int expandaln(int argc, const char **argv, const Command& command);
4848
extern int expand2profile(int argc, const char **argv, const Command& command);
49+
extern int pairaln(int argc, const char **argv, const Command& command);
4950
extern int countkmer(int argc, const char **argv, const Command& command);
5051
extern int extractalignedregion(int argc, const char **argv, const Command& command);
5152
extern int extractdomains(int argc, const char **argv, const Command& command);

src/MMseqsBase.cpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1134,8 +1134,15 @@ std::vector<Command> baseCommands = {
11341134
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
11351135
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
11361136
{"profileDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::profileDb }}},
1137-
1138-
1137+
{"pairaln", pairaln, &par.threadsandcompression, COMMAND_EXPERT,
1138+
"Pair sequences to match best protein A and B from an species",
1139+
NULL,
1140+
"Martin Steinegger <[email protected]>",
1141+
"<i:queryDB> <i:targetDB> <i:alnDB> <o:alnDB|ca3mDB>",
1142+
CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
1143+
{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_TAXONOMY, &DbValidator::sequenceDb },
1144+
{"alnDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
1145+
{"alnDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
11391146
{"diffseqdbs", diffseqdbs, &par.diff, COMMAND_SPECIAL,
11401147
"Compute diff of two sequence DBs",
11411148
// "It creates 3 filtering files, that can be used in conjunction with \"createsubdb\" tool.\nThe first file contains the keys that has been removed from DBold to DBnew.\nThe second file maps the keys of the kept sequences from DBold to DBnew.\nThe third file contains the keys of the sequences that have been added in DBnew.",

src/util/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ set(util_source_files
3737
util/msa2result.cpp
3838
util/nrtotaxmapping.cpp
3939
util/countkmer.cpp
40+
util/pairaln.cpp
4041
util/prefixid.cpp
4142
util/profile2pssm.cpp
4243
util/profile2seq.cpp

src/util/pairaln.cpp

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
#include "Util.h"
2+
#include "Parameters.h"
3+
#include "Matcher.h"
4+
#include "Debug.h"
5+
#include "DBReader.h"
6+
#include "DBWriter.h"
7+
#include "IndexReader.h"
8+
#include "MemoryMapped.h"
9+
#include "NcbiTaxonomy.h"
10+
#include "MappingReader.h"
11+
12+
#define ZSTD_STATIC_LINKING_ONLY
13+
#include <zstd.h>
14+
15+
#include <map>
16+
17+
#ifdef OPENMP
18+
#include <omp.h>
19+
#endif
20+
21+
int pairaln(int argc, const char **argv, const Command& command) {
22+
Parameters &par = Parameters::getInstance();
23+
par.parseParameters(argc, argv, command, true, 0, 0);
24+
25+
const bool sameDB = par.db1.compare(par.db2) == 0 ? true : false;
26+
const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP);
27+
28+
std::string db2NoIndexName = PrefilteringIndexReader::dbPathWithoutIndex(par.db2);
29+
MappingReader* mapping = new MappingReader(db2NoIndexName);
30+
const int dbaccessMode = (DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
31+
IndexReader qDbr(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode);
32+
IndexReader *tDbr;
33+
if (sameDB) {
34+
tDbr = &qDbr;
35+
} else {
36+
tDbr = new IndexReader(par.db2, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode);
37+
}
38+
39+
DBReader<unsigned int> alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
40+
alnDbr.open(DBReader<unsigned int>::NOSORT);
41+
if(alnDbr.getSize() % 2 != 0){
42+
Debug(Debug::ERROR) << "Alignment database does not seem to be paired\n";
43+
EXIT(EXIT_FAILURE);
44+
}
45+
size_t localThreads = 1;
46+
#ifdef OPENMP
47+
localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1);
48+
#endif
49+
50+
DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), localThreads, par.compressed, Parameters::DBTYPE_ALIGNMENT_RES);
51+
resultWriter.open();
52+
53+
Debug::Progress progress(alnDbr.getSize());
54+
#pragma omp parallel num_threads(localThreads)
55+
{
56+
unsigned int thread_idx = 0;
57+
#ifdef OPENMP
58+
thread_idx = static_cast<unsigned int>(omp_get_thread_num());
59+
#endif
60+
char buffer[1024 + 32768*4];
61+
std::string result;
62+
result.reserve(1024 * 1024);
63+
std::vector<Matcher::result_t> resultA;
64+
resultA.reserve(100000);
65+
std::vector<Matcher::result_t> resultB;
66+
resultB.reserve(100000);
67+
std::unordered_map<unsigned int, Matcher::result_t *> findPair;
68+
std::string outputA;
69+
outputA.reserve(100000);
70+
std::string outputB;
71+
outputB.reserve(100000);
72+
#pragma omp for schedule(static, 2)
73+
for (size_t i = 0; i < alnDbr.getSize(); i+=2) {
74+
progress.updateProgress();
75+
progress.updateProgress();
76+
resultA.clear();
77+
resultB.clear();
78+
outputA.clear();
79+
outputB.clear();
80+
Matcher::readAlignmentResults(resultA, alnDbr.getData(i, thread_idx), true);
81+
Matcher::readAlignmentResults(resultB, alnDbr.getData(i+1, thread_idx), true);
82+
83+
for (size_t resIdx = 0; resIdx < resultA.size(); ++resIdx) {
84+
unsigned int taxon = mapping->lookup(resultA[resIdx].dbKey);
85+
if(findPair.find(taxon) == findPair.end()){
86+
findPair.insert({taxon, &resultA[resIdx]});
87+
}
88+
}
89+
// find pairs
90+
for (size_t resIdx = 0; resIdx < resultB.size(); ++resIdx) {
91+
unsigned int taxon = mapping->lookup(resultB[resIdx].dbKey);
92+
// found pair!
93+
if(findPair.find(taxon) != findPair.end()) {
94+
bool hasBacktrace = (resultB[resIdx].backtrace.size() > 0);
95+
size_t len = Matcher::resultToBuffer(buffer, *findPair[taxon], hasBacktrace, false, false);
96+
outputA.append(buffer, len);
97+
len = Matcher::resultToBuffer(buffer, resultB[resIdx], hasBacktrace, false, false);
98+
outputB.append(buffer, len);
99+
findPair.erase (taxon);
100+
}
101+
}
102+
resultWriter.writeData(outputA.c_str(), outputA.length(), alnDbr.getDbKey(i), thread_idx);
103+
resultWriter.writeData(outputB.c_str(), outputB.length(), alnDbr.getDbKey(i+1), thread_idx);
104+
}
105+
}
106+
107+
resultWriter.close();
108+
// clean up
109+
delete mapping;
110+
alnDbr.close();
111+
if (sameDB == false) {
112+
delete tDbr;
113+
}
114+
115+
return 0;
116+
}
117+

0 commit comments

Comments
 (0)