Skip to content

Commit 488df86

Browse files
Refactoring of gff2db
* Improve gff2db to parse multiple files, and try multi-threading * Add Util:getFieldsOfLine to skip over tabs * Correctly deal with GFF3 column 2 that can contain whitespace * Allow multiple GFF feature types in gff2db * Make identifiers stable in gff2db * Improved log output in gff2db Co-authored-by: Milot Mirdita <[email protected]>
1 parent d822533 commit 488df86

File tree

4 files changed

+216
-96
lines changed

4 files changed

+216
-96
lines changed

src/MMseqsBase.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1146,8 +1146,8 @@ std::vector<Command> baseCommands = {
11461146
"Extract regions from a sequence database based on a GFF3 file",
11471147
NULL,
11481148
"Milot Mirdita <[email protected]>",
1149-
"<i:gff3File> <i:sequenceDB> <o:sequenceDB>",
1150-
CITATION_MMSEQS2, {{"gff3File", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile },
1149+
"<i:gff3File1> ... <i:gff3FileN> <i:sequenceDB> <o:sequenceDB>",
1150+
CITATION_MMSEQS2, {{"gff3File", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile },
11511151
{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
11521152
{"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
11531153
{"maskbygff", maskbygff, &par.gff2db, COMMAND_SPECIAL,

src/commons/Parameters.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ Parameters::Parameters():
189189
PARAM_SEQUENCE_SPLIT_MODE(PARAM_SEQUENCE_SPLIT_MODE_ID, "--sequence-split-mode", "Sequence split mode", "Sequence split mode 0: copy data, 1: soft link data and write new index,", typeid(int), (void *) &sequenceSplitMode, "^[0-1]{1}$"),
190190
PARAM_HEADER_SPLIT_MODE(PARAM_HEADER_SPLIT_MODE_ID, "--headers-split-mode", "Header split mode", "Header split mode: 0: split position, 1: original header", typeid(int), (void *) &headerSplitMode, "^[0-1]{1}$"),
191191
// gff2db
192-
PARAM_GFF_TYPE(PARAM_GFF_TYPE_ID, "--gff-type", "GFF type", "Type in the GFF file to filter by", typeid(std::string), (void *) &gffType, ""),
192+
PARAM_GFF_TYPE(PARAM_GFF_TYPE_ID, "--gff-type", "GFF type", "Comma separated list of feature types in the GFF file to select", typeid(std::string), (void *) &gffType, ""),
193193
// translatenucs
194194
PARAM_TRANSLATION_TABLE(PARAM_TRANSLATION_TABLE_ID, "--translation-table", "Translation table", "1) CANONICAL, 2) VERT_MITOCHONDRIAL, 3) YEAST_MITOCHONDRIAL, 4) MOLD_MITOCHONDRIAL, 5) INVERT_MITOCHONDRIAL, 6) CILIATE\n9) FLATWORM_MITOCHONDRIAL, 10) EUPLOTID, 11) PROKARYOTE, 12) ALT_YEAST, 13) ASCIDIAN_MITOCHONDRIAL, 14) ALT_FLATWORM_MITOCHONDRIAL\n15) BLEPHARISMA, 16) CHLOROPHYCEAN_MITOCHONDRIAL, 21) TREMATODE_MITOCHONDRIAL, 22) SCENEDESMUS_MITOCHONDRIAL\n23) THRAUSTOCHYTRIUM_MITOCHONDRIAL, 24) PTEROBRANCHIA_MITOCHONDRIAL, 25) GRACILIBACTERIA, 26) PACHYSOLEN, 27) KARYORELICT, 28) CONDYLOSTOMA\n 29) MESODINIUM, 30) PERTRICH, 31) BLASTOCRITHIDIA", typeid(int), (void *) &translationTable, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_MISC | MMseqsParameter::COMMAND_EXPERT),
195195
// createseqfiledb
@@ -749,6 +749,7 @@ Parameters::Parameters():
749749
// gff2db
750750
gff2db.push_back(&PARAM_GFF_TYPE);
751751
gff2db.push_back(&PARAM_ID_OFFSET);
752+
gff2db.push_back(&PARAM_THREADS);
752753
gff2db.push_back(&PARAM_V);
753754
754755

src/commons/Util.h

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,9 +168,17 @@ class Util {
168168

169169
static bool getLine(const char* data, size_t dataLength, char* buffer, size_t bufferLength);
170170

171-
static inline size_t skipWhitespace(const char* data){
171+
static inline size_t skipWhitespace(const char* data) {
172172
size_t counter = 0;
173-
while((data[counter] == ' ' || data[counter] == '\t') == true) {
173+
while ((data[counter] == ' ' || data[counter] == '\t') == true) {
174+
counter++;
175+
}
176+
return counter;
177+
}
178+
179+
static inline size_t skipTab(const char* data) {
180+
size_t counter = 0;
181+
while (data[counter] == '\t') {
174182
counter++;
175183
}
176184
return counter;
@@ -205,6 +213,14 @@ class Util {
205213
return counter;
206214
}
207215

216+
static inline size_t skipNonTab(const char * data) {
217+
size_t counter = 0;
218+
while (data[counter] != '\t' && data[counter] != '\n' && data[counter] != '\0') {
219+
counter++;
220+
}
221+
return counter;
222+
}
223+
208224
static std::pair<std::string, std::string> createTmpFileNames(const std::string &db,
209225
const std::string &dbindex, int count){
210226
// TODO take only db and not db and dbindex
@@ -245,6 +261,22 @@ class Util {
245261
return elementCounter;
246262
}
247263

264+
static inline size_t getFieldsOfLine(const char* data, const char** words, size_t maxElement) {
265+
size_t elementCounter = 0;
266+
while (*data != '\n' && *data != '\0') {
267+
data += skipTab(data);
268+
words[elementCounter] = data;
269+
elementCounter++;
270+
if (elementCounter >= maxElement)
271+
return elementCounter;
272+
data += skipNonTab(data);
273+
}
274+
if (elementCounter < maxElement)
275+
words[elementCounter] = data;
276+
277+
return elementCounter;
278+
}
279+
248280

249281
static std::pair<ssize_t,ssize_t> getFastaHeaderPosition(const std::string & header);
250282
static std::string parseFastaHeader(const char * header);

src/util/gff2db.cpp

Lines changed: 178 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -4,118 +4,205 @@
44
#include "Util.h"
55
#include "MemoryMapped.h"
66
#include "Orf.h"
7+
#include "Parameters.h"
8+
9+
#ifdef OPENMP
10+
#include <omp.h>
11+
#endif
712

813
int gff2db(int argc, const char **argv, const Command &command) {
914
Parameters &par = Parameters::getInstance();
10-
par.parseParameters(argc, argv, command, true, 0, 0);
15+
par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0);
1116

12-
MemoryMapped file(par.db1, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
13-
if (!file.isValid()) {
14-
Debug(Debug::ERROR) << "Could not open GFF file " << par.db1 << "\n";
15-
EXIT(EXIT_FAILURE);
16-
}
17-
char *data = (char *) file.getData();
17+
std::string outDb = par.filenames.back();
18+
par.filenames.pop_back();
19+
std::string seqDb = par.filenames.back();
20+
par.filenames.pop_back();
1821

19-
DBReader<unsigned int> reader(par.db2.c_str(), par.db2Index.c_str(), 1, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_LOOKUP_REV);
22+
DBReader<unsigned int> reader(seqDb.c_str(), (seqDb + ".index").c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_LOOKUP_REV);
2023
reader.open(DBReader<unsigned int>::NOSORT);
21-
DBReader<unsigned int> headerReader(par.hdr2.c_str(), par.hdr2Index.c_str(), 1, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
24+
DBReader<unsigned int> headerReader((seqDb + "_h").c_str(), (seqDb + "_h.index").c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
2225
headerReader.open(DBReader<unsigned int>::NOSORT);
2326

24-
DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), 1, par.compressed, Parameters::DBTYPE_NUCLEOTIDES);
27+
std::string outDbIndex = outDb + ".index";
28+
DBWriter writer(outDb.c_str(), outDbIndex.c_str(), par.threads, par.compressed, Parameters::DBTYPE_NUCLEOTIDES);
2529
writer.open();
26-
DBWriter headerWriter(par.hdr3.c_str(), par.hdr3Index.c_str(), 1, par.compressed, Parameters::DBTYPE_GENERIC_DB);
30+
std::string outHdr = outDb + "_h";
31+
std::string outHdrIndex = outDb + "_h.index";
32+
DBWriter headerWriter(outHdr.c_str(), outHdrIndex.c_str(), par.threads, par.compressed, Parameters::DBTYPE_GENERIC_DB);
2733
headerWriter.open();
28-
29-
bool shouldCompareType = par.gffType.length() > 0;
30-
31-
Debug::Progress progress;
32-
unsigned int entries_num = 0;
33-
char buffer[1024];
34-
const char* fields[255];
35-
36-
std::string revStr;
37-
revStr.reserve(par.maxSeqLen + 1);
38-
39-
while (*data != '\0') {
40-
progress.updateProgress();
41-
42-
// line is a comment
43-
if (*data == '#') {
44-
data = Util::skipLine(data);
45-
continue;
46-
}
47-
48-
const size_t columns = Util::getWordsOfLine(data, fields, 255);
49-
data = Util::skipLine(data);
50-
if (columns < 9) {
51-
Debug(Debug::WARNING) << "Not enough columns in GFF file\n";
52-
continue;
34+
std::string outLookup = outDb + ".lookup";
35+
std::string outLookupIndex = outDb + ".lookup.index";
36+
DBWriter lookupWriter(outLookup.c_str(), outLookupIndex.c_str(), par.threads, 0, Parameters::DBTYPE_OMIT_FILE);
37+
lookupWriter.open();
38+
39+
FILE *source = FileUtil::openAndDelete((outDb + ".source").c_str(), "w");
40+
for (size_t i = 0; i < par.filenames.size(); ++i) {
41+
if (fprintf(source, "%zu\t%s\n", i, FileUtil::baseName(par.filenames[i]).c_str()) < 0) {
42+
Debug(Debug::ERROR) << "Cannot write to file " << outDb << ".source\n";
43+
return EXIT_FAILURE;
5344
}
45+
}
46+
if (fclose(source) != 0) {
47+
Debug(Debug::ERROR) << "Cannot close file " << outDb << ".source\n";
48+
return EXIT_FAILURE;
49+
}
5450

55-
if (shouldCompareType) {
56-
std::string type(fields[2], fields[3] - fields[2] - 1);
57-
if (type.compare(par.gffType) != 0) {
58-
continue;
59-
}
60-
}
51+
const std::vector<std::string> features = Util::split(par.gffType, ",");
52+
if (features.empty()) {
53+
Debug(Debug::WARNING) << "No feature types given. All features will be extracted\n";
54+
}
55+
std::vector<size_t> featureCount(features.size(), 0);
6156

62-
size_t start = Util::fast_atoi<size_t>(fields[3]);
63-
size_t end = Util::fast_atoi<size_t>(fields[4]);
64-
if (start == end) {
65-
Debug(Debug::WARNING) << "Invalid sequence length in line " << entries_num << "!\n";
66-
continue;
67-
}
57+
if (par.filenames.size() < reader.getSize()) {
58+
Debug(Debug::WARNING) << "Not enough GFF files are provided. Some results might be omitted\n";
59+
}
6860

69-
std::string name(fields[0], fields[1] - fields[0] - 1);
70-
size_t lookupId = reader.getLookupIdByAccession(name);
71-
if (lookupId == SIZE_MAX) {
72-
Debug(Debug::ERROR) << "GFF entry not found in database lookup: " << name << "!\n";
73-
return EXIT_FAILURE;
61+
unsigned int entries_num = 0;
62+
Debug::Progress progress(par.filenames.size());
63+
#pragma omp parallel
64+
{
65+
int thread_idx = 0;
66+
#ifdef OPENMP
67+
thread_idx = omp_get_thread_num();
68+
#endif
69+
70+
char buffer[32768];
71+
const char* fields[255];
72+
73+
std::string revStr;
74+
revStr.reserve(par.maxSeqLen + 1);
75+
76+
std::vector<size_t> localFeatureCount(features.size(), 0);
77+
78+
#pragma omp for schedule(dynamic, 1) nowait
79+
for (size_t i = 0; i < par.filenames.size(); ++i) {
80+
progress.updateProgress();
81+
MemoryMapped file(par.filenames[i], MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
82+
if (!file.isValid()) {
83+
Debug(Debug::ERROR) << "Could not open GFF file " << par.filenames[i] << "\n";
84+
EXIT(EXIT_FAILURE);
85+
}
86+
char *data = (char *) file.getData();
87+
size_t idx = 0;
88+
while (*data != '\0') {
89+
// line is a comment or empty
90+
if (*data == '#' || *data == '\n') {
91+
data = Util::skipLine(data);
92+
continue;
93+
}
94+
95+
const size_t columns = Util::getFieldsOfLine(data, fields, 255);
96+
data = Util::skipLine(data);
97+
if (columns < 9) {
98+
Debug(Debug::WARNING) << "Not enough columns in GFF file\n";
99+
continue;
100+
}
101+
102+
if (features.empty() == false) {
103+
bool shouldSkip = true;
104+
std::string type(fields[2], fields[3] - fields[2] - 1);
105+
for (size_t i = 0; i < features.size(); ++i) {
106+
if (type.compare(features[i]) == 0) {
107+
localFeatureCount[i]++;
108+
shouldSkip = false;
109+
break;
110+
}
111+
}
112+
if (shouldSkip) {
113+
continue;
114+
}
115+
}
116+
size_t start = Util::fast_atoi<size_t>(fields[3]);
117+
size_t end = Util::fast_atoi<size_t>(fields[4]);
118+
if (start == end) {
119+
Debug(Debug::WARNING) << "Invalid sequence length in line " << idx << "\n";
120+
continue;
121+
}
122+
std::string strand(fields[6], fields[7] - fields[6] - 1);
123+
std::string name(fields[0], fields[1] - fields[0] - 1);
124+
size_t lookupId = reader.getLookupIdByAccession(name);
125+
if (lookupId == SIZE_MAX) {
126+
Debug(Debug::ERROR) << "GFF entry not found in database lookup: " << name << "\n";
127+
EXIT(EXIT_FAILURE);
128+
}
129+
unsigned int lookupKey = reader.getLookupKey(lookupId);
130+
size_t seqId = reader.getId(lookupKey);
131+
if (seqId == UINT_MAX) {
132+
Debug(Debug::ERROR) << "GFF entry not found in sequence database: " << name << "\n";
133+
EXIT(EXIT_FAILURE);
134+
}
135+
136+
unsigned int key = __sync_fetch_and_add(&(entries_num), 1);
137+
size_t bufferLen;
138+
if (strand == "+") {
139+
bufferLen = Orf::writeOrfHeader(buffer, lookupKey, start, end, 0, 0);
140+
} else {
141+
bufferLen = Orf::writeOrfHeader(buffer, lookupKey, end, start, 0, 0);
142+
}
143+
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
144+
145+
char *seq = reader.getData(seqId, thread_idx);
146+
//check the strand instead of end - start
147+
ssize_t length = end - start + 1;
148+
writer.writeStart(thread_idx);
149+
if (strand == "+") {
150+
size_t len = snprintf(buffer, sizeof(buffer), "%u\t%s_%zu_%zu_%zu\t%zu\n", key, name.c_str(), idx, start, end, i);
151+
lookupWriter.writeData(buffer, len, thread_idx, false, false);
152+
writer.writeAdd(seq + start - 1 , length, thread_idx);
153+
} else {
154+
size_t len = snprintf(buffer, sizeof(buffer), "%u\t%s_%zu_%zu_%zu\t%zu\n", key, name.c_str(), idx, end, start, i);
155+
lookupWriter.writeData(buffer, len, thread_idx, false, false);
156+
for (size_t i = end - 1 ; i >= end - length; i--) {
157+
revStr.append(1, Orf::complement(seq[i]));
158+
}
159+
writer.writeAdd(revStr.c_str(), revStr.size(), thread_idx);
160+
revStr.clear();
161+
}
162+
writer.writeAdd("\n", 1, thread_idx);
163+
writer.writeEnd(key, thread_idx);
164+
idx++;
165+
}
166+
file.close();
74167
}
75-
unsigned int key = reader.getLookupKey(lookupId);
76-
77-
size_t headerId = headerReader.getId(key);
78-
if (headerId == UINT_MAX) {
79-
Debug(Debug::ERROR) << "GFF entry not found in header database: " << name << "!\n";
80-
return EXIT_FAILURE;
168+
#pragma omp critical
169+
for (size_t i = 0; i < features.size(); ++i) {
170+
featureCount[i] += localFeatureCount[i];
81171
}
82-
unsigned int id = par.identifierOffset + entries_num;
83-
84-
headerWriter.writeStart(0);
85-
headerWriter.writeAdd(headerReader.getData(headerId, 0), std::max(headerReader.getSeqLen(headerId), (size_t)2) - 2, 0);
86-
int len = snprintf(buffer, 1024, " %zu-%zu\n", start, end);
87-
headerWriter.writeAdd(buffer, len, 0);
88-
headerWriter.writeEnd(id, 0);
89-
90-
size_t seqId = reader.getId(key);
91-
if (seqId == UINT_MAX) {
92-
Debug(Debug::ERROR) << "GFF entry not found in sequence database: " << name << "!\n";
93-
return EXIT_FAILURE;
172+
}
173+
lookupWriter.close(true);
174+
FileUtil::remove(lookupWriter.getIndexFileName());
175+
headerWriter.close(true);
176+
writer.close(true);
177+
headerReader.close();
178+
reader.close();
179+
if (Debug::debugLevel >= Debug::INFO && features.size() > 0) {
180+
Debug(Debug::INFO) << "Found these feature types and counts:\n";
181+
for (size_t i = 0; i < features.size(); ++i) {
182+
Debug(Debug::INFO) << " - " << features[i] << ": " << featureCount[i] << "\n";
94183
}
184+
} else {
185+
Debug(Debug::INFO) << (entries_num + 1) << " features were extracted\n";
186+
}
95187

96-
ssize_t length = end - start;
97-
char *seq = reader.getData(seqId, 0);
98-
99-
writer.writeStart(0);
100-
if (length > 0) {
101-
writer.writeAdd(seq + start, length + 1, 0);
102-
} else {
103-
for (size_t i = start; i >= start + length; i--) {
104-
revStr.append(1, Orf::complement(seq[i]));
188+
if (par.filenames.size() > 1 && par.threads > 1) {
189+
// make identifiers stable
190+
#pragma omp parallel
191+
{
192+
#pragma omp single
193+
{
194+
#pragma omp task
195+
{
196+
DBWriter::createRenumberedDB(outHdr, outHdrIndex, "", "");
197+
}
198+
199+
#pragma omp task
200+
{
201+
DBWriter::createRenumberedDB(outDb, outDbIndex, outDb, outDbIndex);
202+
}
105203
}
106-
writer.writeAdd(revStr.c_str(), revStr.size(), 0);
107-
revStr.clear();
108204
}
109-
writer.writeAdd("\n", 1, 0);
110-
writer.writeEnd(id, 0);
111-
112-
entries_num++;
113205
}
114-
headerWriter.close();
115-
writer.close();
116-
headerReader.close();
117-
reader.close();
118-
file.close();
119206

120207
return EXIT_SUCCESS;
121208
}

0 commit comments

Comments
 (0)