sourmash-bio · ctb · Jan 28, 2025 · Jan 19, 2025 · Jan 19, 2025 · Jan 25, 2025
diff --git a/src/utils/mod.rs b/src/utils/mod.rs
@@ -123,11 +123,21 @@ pub fn load_gbassembly_info(input_csv: String) -> Result<(Vec<GBAssemblyData>, u
     // Check column names
     let header = rdr.headers()?;
     let expected_header = vec!["accession", "name"];
-    if header != expected_header {
-        return Err(anyhow!(
-            "Invalid column names in CSV file. Columns should be: {:?}",
-            expected_header
-        ));
+
+    for h in expected_header.iter() {
+        if !header.iter().any(|e| *h == e) {
+            return Err(anyhow!(
+                "Missing column name '{}' in CSV file. Columns should be: {:?}",
+                h,
+                expected_header
+            ));
+        }
+    }
+
+    for h in header.iter() {
+        if !expected_header.iter().any(|e| h == *e) {
+            eprintln!("WARNING: extra column '{}' in CSV file. Ignoring.", h);
+        }
     }
 
     for result in rdr.records() {

diff --git a/tests/test_gbsketch.py b/tests/test_gbsketch.py
@@ -374,6 +374,32 @@ def test_gbsketch_bad_acc(runtmp):
                 assert sig.md5sum() == ss3.md5sum()
 
 
+def test_gbsketch_extra_column(runtmp, capfd):  # an unexpected extra CSV column should be warned about while output is still produced
+    acc_csv = get_test_data('acc.csv')  # source CSV located via the get_test_data helper
+    acc_mod = runtmp.output('acc_mod.csv')  # modified copy of the CSV, written to a runtmp output path
+
+    with open(acc_csv, 'r') as inF, open(acc_mod, 'w') as outF:
+        lines = inF.readlines()
+        for line in lines:
+            outF.write(line.strip() + ',extra\n')  # every line, the first one included, gains a trailing 'extra' field
+
+    output = runtmp.output('simple.zip')
+    failed = runtmp.output('failed.csv')
+    ch_fail = runtmp.output('checksum_dl_failed.csv')
+
+    runtmp.sourmash('scripts', 'gbsketch', acc_mod, '-o', output,
+                    '--failed', failed, '-r', '3', '--checksum-fail', ch_fail,
+                    '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200")  # run gbsketch on the modified CSV
+
+    assert os.path.exists(output)  # the output zip must exist despite the extra column
+    assert not runtmp.last_result.out # stdout should be empty
+    captured = capfd.readouterr()  # capfd is pytest's file-descriptor-level capture of stdout/stderr
+    print(captured.err)  # echoed so the captured stderr shows up in pytest's failure report
+    print(f"looking for path: {output}")
+
+    assert "WARNING: extra column 'extra' in CSV file. Ignoring." in captured.err  # exact warning text expected on stderr for the added column
+
+
 def test_gbsketch_missing_accfile(runtmp, capfd):
     acc_csv = runtmp.output('acc1.csv')
     output = runtmp.output('simple.zip')
@@ -405,7 +431,7 @@ def test_gbsketch_empty_accfile(runtmp, capfd):
 
     captured = capfd.readouterr()
     print(captured.err)
-    assert 'Error: Invalid column names in CSV file. Columns should be: ["accession", "name"]' in captured.err
+    assert "Error: Missing column name 'accession' in CSV file." in captured.err
 
 
 def test_gbsketch_bad_acc_fail(runtmp, capfd):