Skip to content

Commit eaecacf

Browse files
committed
Fix issues with some awks (mawk) printing large ints as floats
1 parent 91462b1 commit eaecacf

File tree

4 files changed

+5
-5
lines changed

4 files changed

+5
-5
lines changed

data/workflow/cascaded_clustering.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -159,8 +159,8 @@ if [ -n "$REASSIGN" ]; then
159159
if notExists "${TMP_PATH}/seq_wrong_assigned_pref.dbtype"; then
160160
if notExists "${TMP_PATH}/seq_seeds.merged.dbtype"; then
161161
# combine seq dbs
162-
MAXOFFSET=$(awk '($2+$3) > max{max=$2+$3}END{print max}' "${TMP_PATH}/seq_seeds.index")
163-
awk -v OFFSET="${MAXOFFSET}" 'FNR==NR{print $0; next}{print $1"\t"$2+OFFSET"\t"$3}' "${TMP_PATH}/seq_seeds.index" \
162+
MAXOFFSET=$(awk '($2+$3) > max { max = $2+$3 } END { printf("%.0f\n", max); }' "${TMP_PATH}/seq_seeds.index")
163+
awk -v OFFSET="${MAXOFFSET}" 'FNR == NR { print $0; next } { printf("%s\t%.0f\t%s\n", $1, $2+OFFSET, $3); }' "${TMP_PATH}/seq_seeds.index" \
164164
"${TMP_PATH}/seq_wrong_assigned.index" > "${TMP_PATH}/seq_seeds.merged.index"
165165
ln -s "$(abspath "${TMP_PATH}/seq_seeds")" "${TMP_PATH}/seq_seeds.merged.0"
166166
ln -s "$(abspath "${TMP_PATH}/seq_wrong_assigned")" "${TMP_PATH}/seq_seeds.merged.1"

data/workflow/databases.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ case "${SELECTION}" in
165165
if notExists "${TMP_PATH}/msa.index"; then
166166
date "+%s" > "${TMP_PATH}/version"
167167
downloadFile "http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/pdb70_from_mmcif_latest.tar.gz" "${TMP_PATH}/pdb70.tar.gz"
168-
tar -xOzf "${TMP_PATH}/pdb70.tar.gz" pdb70_a3m.ffdata | tr -d '\000' | awk -v outfile="${TMP_PATH}/msa" 'function writeEntry() { printf "%s\0", data >> outfile; size = length(data) + 1; data=""; print id"\t"offset"\t"size >> outindex; offset = offset + size; } BEGIN { data = ""; offset = 0; id = 1; if(length(outfile) == 0) { outfile="output"; } outindex = outfile".index"; printf("") > outfile; printf("") > outindex; printf("%c%c%c%c",11,0,0,0) > outfile".dbtype"; } /^>ss_/ { inss = 1; entry = 0; next; } inss == 1 { inss = 0; next; } /^>/ && entry == 0 { if (id > 1) { writeEntry(); } id = id + 1; data = ">"substr($1, 2)"\n"; entry = entry + 1; next; } entry > 0 { data = data""$0"\n"; entry = entry + 1; next; } END { writeEntry(); close(outfile); close(outfile".index"); }'
168+
tar -xOzf "${TMP_PATH}/pdb70.tar.gz" pdb70_a3m.ffdata | tr -d '\000' | awk -v outfile="${TMP_PATH}/msa" 'function writeEntry() { printf "%s\0", data >> outfile; size = length(data) + 1; data=""; printf("%s\t%.0f\t%s\n", id, offset, size) >> outindex; offset = offset + size; } BEGIN { data = ""; offset = 0; id = 1; if(length(outfile) == 0) { outfile="output"; } outindex = outfile".index"; printf("") > outfile; printf("") > outindex; printf("%c%c%c%c",11,0,0,0) > outfile".dbtype"; } /^>ss_/ { inss = 1; entry = 0; next; } inss == 1 { inss = 0; next; } /^>/ && entry == 0 { if (id > 1) { writeEntry(); } id = id + 1; data = ">"substr($1, 2)"\n"; entry = entry + 1; next; } entry > 0 { data = data""$0"\n"; entry = entry + 1; next; } END { writeEntry(); close(outfile); close(outfile".index"); }'
169169
rm -f "${TMP_PATH}/pdb70.tar.gz"
170170
fi
171171
INPUT_TYPE="A3M"

data/workflow/searchslicedtargetprofile.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ while [ "${FIRST_INDEX_LINE}" -le "${TOTAL_NUM_PROFILES}" ]; do
118118
|| fail "result2stats died"
119119
fi
120120
# update the starting point for the next step and the total number of pref results
121-
NUM_PREF_RESULTS_IN_STEP=$(awk '{sum+=$1;} END{print sum;}' "${TMP_PATH}/pref_count.tsv")
121+
NUM_PREF_RESULTS_IN_STEP=$(awk '{sum+=$1;} END { printf("%.0f\n", sum); }' "${TMP_PATH}/pref_count.tsv")
122122
rm -f "${TMP_PATH}/pref_count.tsv"
123123

124124
NUM_PREF_RESULTS_IN_ALL_PREV_STEPS="$((NUM_PREF_RESULTS_IN_ALL_PREV_STEPS+NUM_PREF_RESULTS_IN_STEP))"

data/workflow/update_clustering.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ fi
131131
if notExists "${TMP_PATH}/newMappingSeqs"; then
132132
log "=== Update new sequences with old keys"
133133
MAXID="$(awk '$1 > max { max = $1 } END { print max }' "${OLDDB}.index" "${NEWDB}.index")"
134-
awk -v highest="$MAXID" 'BEGIN { start=highest+1 } { print $1"\t"start; start=start+1; }' \
134+
awk -v highest="$MAXID" 'BEGIN { start=highest+1 } { printf("%s\t%.0f\n", $1, start); start=start+1; }' \
135135
"${TMP_PATH}/newSeqs" > "${TMP_PATH}/newSeqs.mapped"
136136
awk '{ print $2"\t"$1 }' "${TMP_PATH}/mappingSeqs" > "${TMP_PATH}/mappingSeqs.reverse"
137137
cat "${TMP_PATH}/mappingSeqs.reverse" "${TMP_PATH}/newSeqs.mapped" > "${TMP_PATH}/newMappingSeqs"

0 commit comments

Comments (0)