Skip to content

Commit e818a9e

Browse files
author
Ben Fulton
committed
Add a max_size parameter to the load command. Fixes #23
1 parent 00b0896 commit e818a9e

File tree

10 files changed

+102
-72
lines changed

10 files changed

+102
-72
lines changed

cafe/cafe_commands.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -667,6 +667,7 @@ struct load_args get_load_arguments(vector<Argument> pargs)
667667
args.num_threads = 0;
668668
args.num_random_samples = 0;
669669
args.pvalue = -1;
670+
args.max_size = -1;
670671

671672
for (size_t i = 0; i < pargs.size(); i++)
672673
{
@@ -678,6 +679,9 @@ struct load_args get_load_arguments(vector<Argument> pargs)
678679
if (!strcmp(parg->opt, "-r"))
679680
sscanf(parg->argv[0], "%d", &args.num_random_samples);
680681

682+
if (!strcmp(parg->opt, "-max_size"))
683+
sscanf(parg->argv[0], "%d", &args.max_size);
684+
681685
if (!strcmp(parg->opt, "-p"))
682686
sscanf(parg->argv[0], "%lf", &args.pvalue);
683687

@@ -733,6 +737,8 @@ bool endsWith(std::string const &fullString, std::string const &ending) {
733737
\brief Loads families from a family file with a defined format
734738
*
735739
* Takes six arguments: -t, -r, -p, -l, -i, and -filter
740+
*
741+
* -filter ensures that there is at least one copy at the root (using parsimony) for each family.
736742
*/
737743
int cafe_cmd_load(Globals& globals, std::vector<std::string> tokens)
738744
{
@@ -764,7 +770,7 @@ int cafe_cmd_load(Globals& globals, std::vector<std::string> tokens)
764770

765771
param->str_fdata = string_new_with_string(args.family_file_name.c_str());
766772
ifstream ifst(args.family_file_name);
767-
param->pfamily = load_gene_families(ifst, 1, separator);
773+
param->pfamily = load_gene_families(ifst, separator, args.max_size);
768774
if (param->pfamily == NULL)
769775
throw runtime_error("Failed to load file\n");
770776

@@ -1948,7 +1954,7 @@ int cafe_cmd_accuracy(Globals& globals, std::vector<std::string> tokens)
19481954
{
19491955
// read in truth data
19501956
ifstream ifst(truthfile);
1951-
pCafeFamily truthfamily = load_gene_families(ifst, 1, '\t');
1957+
pCafeFamily truthfamily = load_gene_families(ifst, '\t', -1);
19521958
if (truthfamily == NULL) {
19531959
fprintf(stderr, "failed to read in true values %s\n", truthfile.c_str());
19541960
return -1;
@@ -2100,7 +2106,7 @@ int cafe_cmd_cvfamily(Globals& globals, std::vector<std::string> tokens)
21002106

21012107
// read in training data
21022108
ifstream ifst(trainfile);
2103-
pCafeFamily tmpfamily = load_gene_families(ifst, 1, '\t');
2109+
pCafeFamily tmpfamily = load_gene_families(ifst, '\t', -1);
21042110
if (tmpfamily == NULL) {
21052111
fprintf(stderr, "failed to read in training data %s\n", trainfile);
21062112
return -1;
@@ -2185,7 +2191,7 @@ int cafe_cmd_cvspecies(Globals& globals, std::vector<std::string> tokens)
21852191

21862192
// read in training data
21872193
std::ifstream ifst(trainfile);
2188-
pCafeFamily tmpfamily = load_gene_families(ifst, 1, '\t');
2194+
pCafeFamily tmpfamily = load_gene_families(ifst, '\t', -1);
21892195
if (tmpfamily == NULL) {
21902196
fprintf(stderr, "failed to read in training data %s\n", trainfile);
21912197
fprintf(stderr, "did you load the family data with the cross-validation option (load -i <familyfile> -cv)?\n");

cafe/cafe_commands.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ void tree_set_branch_lengths(pCafeTree pcafe, std::vector<int> lengths);
7272
struct load_args {
7373
int num_threads;
7474
int num_random_samples;
75+
int max_size;
7576
double pvalue;
7677
bool filter;
7778
std::string log_file_name;

cafe/cross_validator.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ double cross_validator::validate_by_family(pCafeParam param, const char* queryfi
259259

260260
// read in validation data
261261
std::ifstream ifst(truthfile);
262-
pCafeFamily truthfamily = load_gene_families(ifst, 1, '\t');
262+
pCafeFamily truthfamily = load_gene_families(ifst, '\t', -1);
263263
if (truthfamily == NULL) {
264264
fprintf(stderr, "failed to read in true values %s\n", truthfile);
265265
return -1;

cafe/gene_family.cpp

Lines changed: 22 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,12 @@ void cafe_family_free(pCafeFamily pcf)
9999
pcf = NULL;
100100
}
101101

102-
pCafeFamily load_gene_families(std::istream& ist, int bpatcheck, char separator)
102+
int to_int(string s)
103+
{
104+
return stoi(s);
105+
}
106+
107+
pCafeFamily load_gene_families(std::istream& ist, char separator, int max_size)
103108
{
104109
char buf[STRING_BUF_SIZE];
105110
if (!ist)
@@ -122,18 +127,21 @@ pCafeFamily load_gene_families(std::istream& ist, int bpatcheck, char separator)
122127
if (!ist)
123128
break;
124129

125-
cafe_family_add_item(pcf, string_split(buf, separator));
130+
vector<string> values = string_split(buf, separator);
131+
vector<int> sizes(values.size() - 2);
132+
transform(values.begin() + 2, values.end(), sizes.begin(), to_int);
133+
if (max_size < 0 || *max_element(sizes.begin(), sizes.end()) <= max_size)
134+
cafe_family_add_item(pcf, values[1], values[0], sizes);
126135
}
127-
if (bpatcheck)
128-
{
136+
129137
__cafe_famliy_check_the_pattern(pcf);
130-
}
138+
131139
return pcf;
132140
}
133141

134142
/// Data array is assumes to contain a description, an identifier, and a set of integers
135143
/// giving the family size in species order
136-
void cafe_family_add_item(pCafeFamily pcf, const vector<string>& data)
144+
void cafe_family_add_item(pCafeFamily pcf, std::string id, std::string desc, std::vector<int> sizes)
137145
{
138146
pCafeFamilyItem pitem = (pCafeFamilyItem)memory_new(1, sizeof(CafeFamilyItem));
139147
pitem->count = (int*)calloc(pcf->num_species, sizeof(int));
@@ -143,27 +151,18 @@ void cafe_family_add_item(pCafeFamily pcf, const vector<string>& data)
143151
pitem->mu = NULL;
144152
pitem->holder = 1;
145153

146-
if (data.size() != size_t(pcf->num_species + 2))
154+
if (sizes.size() != size_t(pcf->num_species))
147155
{
148-
std::cerr << "Inconsistency in column count: expected " << pcf->num_species + 2 << ", but found " << data.size();
156+
std::cerr << "Inconsistency in column count: expected " << pcf->num_species + 2 << ", but found " << sizes.size() + 2;
149157
}
150158

151-
pitem->desc = new char[data[0].size()+1];
152-
strcpy(pitem->desc, data[0].c_str());
153-
pitem->id = new char[data[1].size() + 1];
154-
strcpy(pitem->id, data[1].c_str());
159+
pitem->desc = new char[desc.size()+1];
160+
strcpy(pitem->desc, desc.c_str());
161+
pitem->id = new char[id.size() + 1];
162+
strcpy(pitem->id, id.c_str());
163+
164+
copy(sizes.begin(), sizes.end(), pitem->count);
155165

156-
for (int j = 0; j < pcf->num_species; j++)
157-
{
158-
if (data[j + 2].empty())
159-
{
160-
pitem->count[j] = 0;
161-
}
162-
else
163-
{
164-
pitem->count[j] = atoi(data[j + 2].c_str());
165-
}
166-
}
167166
pcf->max_size = max(pcf->max_size, *std::max_element(pitem->count, pitem->count + pcf->num_species));
168167
arraylist_add(pcf->flist, pitem);
169168
}

cafe/gene_family.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@ const int COMMA_AS_WHITESPACE = 1;
1616
std::vector<std::string> tokenize(std::string s, int flags);
1717

1818
pCafeFamily cafe_family_init(const std::vector<std::string>& species_list);
19-
pCafeFamily load_gene_families(std::istream& ist, int bpatcheck, char separator);
19+
pCafeFamily load_gene_families(std::istream& ist, char separator, int max_size);
2020
void cafe_family_free(pCafeFamily pcf);
2121

22-
void cafe_family_add_item(pCafeFamily pcf, const std::vector<std::string>& data);
22+
void cafe_family_add_item(pCafeFamily pcf, std::string id, std::string desc, std::vector<int> sizes);
2323
void cafe_family_item_free(pCafeFamilyItem pitem);
2424

2525
int log_cluster_membership(pCafeFamily pcf, int k_value, double **p_z_membership, std::ostream& log);

tests/command_tests.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -189,12 +189,21 @@ TEST(CommandTests, get_load_arguments)
189189
struct load_args args = get_load_arguments(build_argument_list(command));
190190
LONGS_EQUAL(1, args.num_threads);
191191
LONGS_EQUAL(2, args.num_random_samples);
192+
LONGS_EQUAL(-1, args.max_size);
192193
DOUBLES_EQUAL(.05, args.pvalue, .000001);
193194
STRCMP_EQUAL("log.txt", args.log_file_name.c_str());
194195
STRCMP_EQUAL("fam.txt", args.family_file_name.c_str());
195196
CHECK(!args.filter);
196197
}
197198

199+
TEST(CommandTests, get_load_arguments_filter_and_max_size)
200+
{
201+
vector<string> command = tokenize("load -filter -max_size 100", REGULAR_WHITESPACE);
202+
struct load_args args = get_load_arguments(build_argument_list(command));
203+
LONGS_EQUAL(100, args.max_size);
204+
CHECK(args.filter);
205+
}
206+
198207
TEST(CommandTests, cafe_cmd_load)
199208
{
200209
try
@@ -261,7 +270,7 @@ TEST(CommandTests, cafe_cmd_tree_syncs_family_if_loaded)
261270

262271
std::vector<std::string> species = { "chimp", "human", "mouse", "rat", "dog" };
263272
globals.param.pfamily = cafe_family_init(species);
264-
cafe_family_add_item(globals.param.pfamily, { "description", "id", "3", "5", "7", "11", "13" });
273+
cafe_family_add_item(globals.param.pfamily, "id", "description", {3, 5, 7, 11, 13 });
265274

266275
LONGS_EQUAL(-1, globals.param.pfamily->index[0]);
267276
cafe_cmd_tree(globals, tokens);
@@ -297,7 +306,7 @@ void prepare_viterbi(CafeParam& param)
297306
param.lambda = lambdas;
298307

299308
param.pfamily = cafe_family_init({ "chimp", "human", "mouse", "rat", "dog" });
300-
cafe_family_add_item(param.pfamily, { "description", "id", "3", "5", "7", "11", "13" });
309+
cafe_family_add_item(param.pfamily, "id", "description", { 3, 5, 7, 11, 13 });
301310

302311
cafe_family_set_species_index(param.pfamily, param.pcafe);
303312

tests/family_tests.cpp

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,11 @@ TEST_GROUP(FamilyTests)
4040
TEST(FamilyTests, TestCafeFamily)
4141
{
4242
std::ifstream ifst("Nonexistent.tab");
43-
POINTERS_EQUAL(NULL, load_gene_families(ifst, 1, '\t'));
43+
POINTERS_EQUAL(NULL, load_gene_families(ifst, '\t', -1));
4444

4545
pCafeFamily fam = cafe_family_init({ "Dog", "Chimp", "Human", "Mouse", "Rat" });
4646

47-
cafe_family_add_item(fam, { "OTOPETRIN", "ENSF1", "3", "5", "7", "11", "13" });
47+
cafe_family_add_item(fam, "ENSF1", "OTOPETRIN", { 3, 5, 7, 11, 13 });
4848

4949
LONGS_EQUAL(5, fam->num_species);
5050
LONGS_EQUAL(1, fam->flist->size);
@@ -65,10 +65,10 @@ TEST(FamilyTests, TestCafeFamily)
6565
TEST(FamilyTests, load_gene_families_tab_separated)
6666
{
6767
std::ifstream ifst("Nonexistent.tab");
68-
POINTERS_EQUAL(NULL, load_gene_families(ifst, 1, '\t'));
68+
POINTERS_EQUAL(NULL, load_gene_families(ifst, '\t', -1));
6969

7070
std::istringstream ist("FAMILYDESC\tFAMILY\tA\tB\nOTOPETRIN\tENSF2\t2\t10\n");
71-
pCafeFamily fam = load_gene_families(ist, 0, '\t');
71+
pCafeFamily fam = load_gene_families(ist, '\t', -1);
7272
LONGS_EQUAL(2, fam->num_species);
7373
STRCMP_EQUAL("A", fam->species[0]);
7474
STRCMP_EQUAL("B", fam->species[1]);
@@ -83,10 +83,25 @@ TEST(FamilyTests, load_gene_families_tab_separated)
8383
cafe_family_free(fam);
8484
}
8585

86+
TEST(FamilyTests, filter_families_greater_than_max_size)
87+
{
88+
std::istringstream ist("FAMILYDESC\tFAMILY\tA\tB\ndesc\tid1\t2\t50\ndesc\tid2\t20\t10\n");
89+
pCafeFamily fam = load_gene_families(ist, '\t', 25);
90+
LONGS_EQUAL(2, fam->num_species);
91+
STRCMP_EQUAL("A", fam->species[0]);
92+
STRCMP_EQUAL("B", fam->species[1]);
93+
94+
LONGS_EQUAL(1, fam->flist->size);
95+
96+
CHECK(get_family_item(fam, "id1") == NULL);
97+
CHECK(get_family_item(fam, "id2") != NULL);
98+
cafe_family_free(fam);
99+
}
100+
86101
TEST(FamilyTests, load_gene_families_comma_separated)
87102
{
88103
std::istringstream ist("FAMILY DESC,FAMILY,A,B\nOXYSTEROL BINDING RELATED,ENSF2,2,10\n");
89-
pCafeFamily fam = load_gene_families(ist, 0, ',');
104+
pCafeFamily fam = load_gene_families(ist, ',', -1);
90105
LONGS_EQUAL(2, fam->num_species);
91106
STRCMP_EQUAL("A", fam->species[0]);
92107
STRCMP_EQUAL("B", fam->species[1]);
@@ -106,7 +121,7 @@ TEST(FamilyTests, load_gene_families_is_not_whitespace_separated)
106121
std::istringstream ist("FAMILYDESC FAMILY A B C\n");
107122
try
108123
{
109-
pCafeFamily fam = load_gene_families(ist, 0, '\t');
124+
pCafeFamily fam = load_gene_families(ist, '\t', -1);
110125
FAIL("Exception should have been thrown");
111126
cafe_family_free(fam);
112127
}
@@ -119,7 +134,7 @@ TEST(FamilyTests, load_gene_families_is_not_whitespace_separated)
119134
TEST(FamilyTests, cafe_family_set_size)
120135
{
121136
pCafeFamily pcf = cafe_family_init({"chimp", "human", "mouse", "rat", "dog" });
122-
cafe_family_add_item(pcf, { "description", "id", "3", "5", "7", "11", "13" });
137+
cafe_family_add_item(pcf, "id", "description", { 3, 5, 7, 11, 13 });
123138

124139
pCafeTree ptree = create_tree();
125140
cafe_family_set_species_index(pcf, ptree);
@@ -155,7 +170,7 @@ TEST(FamilyTests, cafe_family_add_item)
155170
{
156171
pCafeFamily pcf = cafe_family_init({"chimp", "human", "mouse", "rat", "dog" });
157172

158-
cafe_family_add_item(pcf, { "description", "id", "3", "5", "7", "11", "13" });
173+
cafe_family_add_item(pcf, "id", "description", { 3, 5, 7, 11, 13 });
159174
LONGS_EQUAL(1, pcf->flist->size);
160175
pCafeFamilyItem pitem = (pCafeFamilyItem)arraylist_get(pcf->flist, 0);
161176
STRCMP_EQUAL("description", pitem->desc);
@@ -203,7 +218,7 @@ TEST(FamilyTests, cafe_family_set_size_with_family_forced)
203218
pCafeFamily pcf = cafe_family_init({"chimp", "human", "mouse", "rat", "dog" });
204219
pCafeTree cafe_tree = create_tree();
205220
cafe_family_set_species_index(pcf, cafe_tree);
206-
cafe_family_add_item(pcf, { "description", "id", "3", "5", "7", "11", "13" });
221+
cafe_family_add_item(pcf, "id", "description", { 3, 5, 7, 11, 13 });
207222

208223
// SUT
209224
cafe_family_set_size_with_family_forced(pcf, 0, cafe_tree);
@@ -218,7 +233,7 @@ TEST(FamilyTests, write_family_gainloss)
218233
{
219234
std::ostringstream ost;
220235
pCafeFamily pcf = cafe_family_init({"chimp", "human", "mouse", "rat", "dog" });
221-
cafe_family_add_item(pcf, { "description", "id", "3", "5", "7", "11", "13" });
236+
cafe_family_add_item(pcf, "id", "description", { 3, 5, 7, 11, 13 });
222237
pCafeTree tree1 = create_tree();
223238
cafe_family_set_species_index(pcf, tree1);
224239
pCafeTree tree2 = cafe_tree_copy(tree1);
@@ -237,11 +252,11 @@ TEST(FamilyTests, write_family)
237252
{
238253
std::ostringstream ost;
239254
pCafeFamily pcf = cafe_family_init({"chimp", "human", "mouse", "rat", "dog" });
240-
cafe_family_add_item(pcf, { "my_description", "my_id", "3", "5", "7", "11", "13" });
255+
cafe_family_add_item(pcf, "id", "description", { 3, 5, 7, 11, 13 });
241256

242257
write_family(ost, pcf);
243258
STRCMP_CONTAINS("Desc\tFamily ID\tchimp\thuman\tmouse\trat\tdog\n", ost.str().c_str())
244-
STRCMP_CONTAINS("my_description\tmy_id\t3\t5\t7\t11\t13\n", ost.str().c_str())
259+
STRCMP_CONTAINS("description\tid\t3\t5\t7\t11\t13\n", ost.str().c_str())
245260
cafe_family_free(pcf);
246261

247262
}
@@ -250,13 +265,13 @@ TEST(FamilyTests, log_cluster_membership)
250265
{
251266
std::ostringstream ost;
252267
pCafeFamily pcf = cafe_family_init({ "chimp", "human", "mouse", "rat", "dog" });
253-
cafe_family_add_item(pcf, { "my_description", "my_id", "3", "5", "7", "11", "13" });
268+
cafe_family_add_item(pcf, "id", "description", { 3, 5, 7, 11, 13 });
254269
double **z = (double **)memory_new_2dim(2, 2, sizeof(double));
255270
z[0][0] = 7.7;
256271
z[0][1] = 11.13;
257272
log_cluster_membership(pcf, 2, z, ost);
258273
STRCMP_CONTAINS("The Number of families : 1\n", ost.str().c_str());
259-
STRCMP_CONTAINS("family my_id: 7.7 11.13", ost.str().c_str());
274+
STRCMP_CONTAINS("family id: 7.7 11.13", ost.str().c_str());
260275

261276
cafe_family_free(pcf);
262277
}

tests/lambda_tests.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ TEST(LambdaTests, TestCmdLambda)
103103

104104
globals.param.pfamily = cafe_family_init({"Dog", "Chimp", "Human", "Mouse", "Rat" });
105105

106-
cafe_family_add_item(globals.param.pfamily, { "OTOPETRIN", "ENSF00000002390", "7", "7", "13", "18", "7" });
106+
cafe_family_add_item(globals.param.pfamily, "ENSF00000002390", "OTOPETRIN", { 7, 7, 13, 18, 7 });
107107

108108
init_family_size(&globals.param.family_size, globals.param.pfamily->max_size);
109109
cafe_tree_set_parameters(globals.param.pcafe, &globals.param.family_size, 0);
@@ -573,10 +573,10 @@ TEST(LambdaTests, best_lambda_by_fminsearch)
573573
globals.param.pcafe = cafe_tree_new(tree, &range, 0, 0);
574574
globals.param.pfamily = cafe_family_init({ "A", "B", "C", "D" });
575575

576-
cafe_family_add_item(globals.param.pfamily, { "1", "ENS01", "5", "10", "2", "6" });
577-
cafe_family_add_item(globals.param.pfamily, { "2", "ENS02", "5", "10", "2", "6" });
578-
cafe_family_add_item(globals.param.pfamily, { "3", "ENS03", "5", "10", "2", "6" });
579-
cafe_family_add_item(globals.param.pfamily, { "4", "ENS04", "5", "10", "2", "6" });
576+
cafe_family_add_item(globals.param.pfamily, "ENS01", "1", { 5, 10, 2, 6 });
577+
cafe_family_add_item(globals.param.pfamily, "ENS02", "2", { 5, 10, 2, 6 });
578+
cafe_family_add_item(globals.param.pfamily, "ENS03", "3", { 5, 10, 2, 6 });
579+
cafe_family_add_item(globals.param.pfamily, "ENS04", "4", { 5, 10, 2, 6 });
580580

581581
init_family_size(&globals.param.family_size, globals.param.pfamily->max_size);
582582
cafe_tree_set_parameters(globals.param.pcafe, &globals.param.family_size, 0);
@@ -638,7 +638,7 @@ TEST(LambdaTests, get_posterior)
638638

639639
param.pfamily = cafe_family_init({ "chimp", "human", "mouse", "rat", "dog" });
640640
cafe_family_set_species_index(param.pfamily, param.pcafe);
641-
cafe_family_add_item(param.pfamily, { "description", "id", "3", "5", "7", "11", "13" });
641+
cafe_family_add_item(param.pfamily, "id", "description", { 3, 5, 7, 11, 13 });
642642

643643
param.ML = (double*)memory_new(15, sizeof(double));
644644
param.MAP = (double*)memory_new(15, sizeof(double));

tests/report_tests.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ TEST(ReportTests, write_families_line)
206206
globals.param.pcafe = tree;
207207
globals.param.pfamily = cafe_family_init({ "chimp", "human", "mouse", "rat", "dog" });
208208
cafe_family_set_species_index(globals.param.pfamily, tree);
209-
cafe_family_add_item(globals.param.pfamily, { "description", "id", "3", "5", "7", "11", "13" });
209+
cafe_family_add_item(globals.param.pfamily, "id", "description", { 3, 5, 7, 11, 13 });
210210

211211
viterbi_parameters* v = globals.viterbi;
212212
v->num_nodes = 6;
@@ -250,7 +250,7 @@ TEST(ReportTests, family_line_item_sets_pvalues)
250250
pCafeTree tree = create_tree(range);
251251
globals.param.pcafe = tree;
252252
cafe_family_set_species_index(globals.param.pfamily, tree);
253-
cafe_family_add_item(globals.param.pfamily, { "description", "id", "3", "5", "7", "11", "13" });
253+
cafe_family_add_item(globals.param.pfamily, "id", "description", { 3, 5, 7, 11, 13 });
254254

255255
viterbi_parameters* v = globals.viterbi;
256256
v->num_nodes = 6;

0 commit comments

Comments
 (0)