Fix numerical underflow/overflow vulnerabilities

jbrukh · claude · jbrukh · commit 0dd9d42ab446 · 2025-12-07T15:22:27.000-05:00
- Add Laplace smoothing to getPriors to avoid log(0) for zero priors
- Handle division by zero in ProbScores when all scores underflow
- Improve SafeProbScores with log-sum-exp trick for probability recovery
- Add logScoresToProbs helper for numerically stable probability conversion
- Update tests for new numerical behavior

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/bayesian.go b/bayesian.go
@@ -150,22 +150,22 @@ func NewClassifierFromReader(r io.Reader) (c *Classifier, err error) {
 
 // getPriors returns the prior probabilities for the
 // classes provided -- P(C_j).
-//
-// TODO: There is a way to smooth priors, currently
-// not implemented here.
+// Uses Laplace smoothing to ensure no prior is zero:
+// P(C_j) = (count_j + 1) / (total + num_classes)
 func (c *Classifier) getPriors() (priors []float64) {
 	n := len(c.Classes)
-	priors = make([]float64, n, n)
+	priors = make([]float64, n)
 	sum := 0
 	for index, class := range c.Classes {
 		total := c.datas[class].Total
 		priors[index] = float64(total)
 		sum += total
 	}
-	if sum != 0 {
-		for i := 0; i < n; i++ {
-			priors[i] /= float64(sum)
-		}
+	// Apply Laplace smoothing to priors to avoid log(0)
+	floatN := float64(n)
+	floatSum := float64(sum)
+	for i := 0; i < n; i++ {
+		priors[i] = (priors[i] + 1) / (floatSum + floatN)
 	}
 	return
 }
@@ -339,30 +339,41 @@ func (c *Classifier) Classify(document []string) (class Class, scores []float64,
 // never seen before. Depending on the application, this
 // may or may not be a concern. Consider using SafeProbScores()
 // instead.
+//
+// If all scores underflow to zero, returns equal probabilities
+// for all classes (1/n each).
 func (c *Classifier) ProbScores(doc []string) (scores []float64, inx int, strict bool) {
 	if c.tfIdf && !c.DidConvertTfIdf {
 		panic("Using a TF-IDF classifier. Please call ConvertTermsFreqToTfIdf before calling ProbScores.")
 	}
 	n := len(c.Classes)
-	scores = make([]float64, n, n)
+	scores = make([]float64, n)
 	priors := c.getPriors()
 	sum := float64(0)
 	// calculate the score for each class
 	for index, class := range c.Classes {
 		data := c.datas[class]
-		// c is the sum of the logarithms
-		// as outlined in the refresher
 		score := priors[index]
 		for _, word := range doc {
 			score *= data.getWordProb(word)
 		}
 		scores[index] = score
 		sum += score
 	}
-	for i := 0; i < n; i++ {
-		scores[i] /= sum
+	// Handle underflow: if sum is 0, all scores underflowed
+	// Return equal probabilities to avoid NaN
+	if sum == 0 {
+		equal := 1.0 / float64(n)
+		for i := 0; i < n; i++ {
+			scores[i] = equal
+		}
+		strict = false
+	} else {
+		for i := 0; i < n; i++ {
+			scores[i] /= sum
+		}
+		inx, strict = findMax(scores)
 	}
-	inx, strict = findMax(scores)
 	atomic.AddInt32(&c.seen, 1)
 	return scores, inx, strict
 }
@@ -383,26 +394,28 @@ func (c *Classifier) ClassifyProb(document []string) (class Class, scores []floa
 // this method returns an ErrUnderflow, allowing the user to deal with it as
 // necessary. Note that underflow, under certain rare circumstances,
 // may still result in incorrect probabilities being returned,
-// but this method guarantees that all error-less invokations
+// but this method guarantees that all error-less invocations
 // are properly classified.
 //
 // Underflow detection is more costly because it also
 // has to make additional log score calculations.
+//
+// When underflow is detected, the returned scores are computed from
+// log-domain scores using the log-sum-exp trick for numerical stability.
 func (c *Classifier) SafeProbScores(doc []string) (scores []float64, inx int, strict bool, err error) {
 	if c.tfIdf && !c.DidConvertTfIdf {
 		panic("Using a TF-IDF classifier. Please call ConvertTermsFreqToTfIdf before calling SafeProbScores.")
 	}
 
 	n := len(c.Classes)
-	scores = make([]float64, n, n)
-	logScores := make([]float64, n, n)
+	scores = make([]float64, n)
+	logScores := make([]float64, n)
 	priors := c.getPriors()
 	sum := float64(0)
+
 	// calculate the score for each class
 	for index, class := range c.Classes {
 		data := c.datas[class]
-		// c is the sum of the logarithms
-		// as outlined in the refresher
 		score := priors[index]
 		logScore := math.Log(priors[index])
 		for _, word := range doc {
@@ -414,22 +427,64 @@ func (c *Classifier) SafeProbScores(doc []string) (scores []float64, inx int, st
 		logScores[index] = logScore
 		sum += score
 	}
-	for i := 0; i < n; i++ {
-		scores[i] /= sum
-	}
-	inx, strict = findMax(scores)
+
+	// Get the winner from log-domain (always reliable)
 	logInx, logStrict := findMax(logScores)
 
-	// detect underflow -- the size
-	// relation between scores and logScores
-	// must be preserved or something is wrong
-	if inx != logInx || strict != logStrict {
+	// Check for underflow: if sum is 0 or prob-domain disagrees with log-domain
+	if sum == 0 {
+		// Complete underflow - use log-sum-exp to recover probabilities
 		err = ErrUnderflow
+		scores = logScoresToProbs(logScores)
+		inx, strict = logInx, logStrict
+	} else {
+		for i := 0; i < n; i++ {
+			scores[i] /= sum
+		}
+		inx, strict = findMax(scores)
+
+		// Detect partial underflow - when prob and log domains disagree
+		if inx != logInx || strict != logStrict {
+			err = ErrUnderflow
+			// Use log-domain results as they're more reliable
+			scores = logScoresToProbs(logScores)
+			inx, strict = logInx, logStrict
+		}
 	}
+
 	atomic.AddInt32(&c.seen, 1)
 	return scores, inx, strict, err
 }
 
+// logScoresToProbs converts log-domain scores to probabilities
+// using the log-sum-exp trick for numerical stability.
+func logScoresToProbs(logScores []float64) []float64 {
+	n := len(logScores)
+	probs := make([]float64, n)
+
+	// Find max for numerical stability
+	maxLog := logScores[0]
+	for i := 1; i < n; i++ {
+		if logScores[i] > maxLog {
+			maxLog = logScores[i]
+		}
+	}
+
+	// Compute exp(log - max) and sum
+	sum := 0.0
+	for i := 0; i < n; i++ {
+		probs[i] = math.Exp(logScores[i] - maxLog)
+		sum += probs[i]
+	}
+
+	// Normalize
+	for i := 0; i < n; i++ {
+		probs[i] /= sum
+	}
+
+	return probs
+}
+
 // ClassifySafe returns the most likely class for the given document
 // along with the probability scores, whether the classification is strict,
 // and an error if underflow is detected.
diff --git a/bayesian_test.go b/bayesian_test.go
@@ -18,8 +18,10 @@ func Assert(t *testing.T, condition bool, args ...interface{}) {
 func TestEmpty(t *testing.T) {
 	c := NewClassifier("Good", "Bad", "Neutral")
 	priors := c.getPriors()
+	// With Laplace smoothing, empty classifier should have uniform priors
+	expected := 1.0 / float64(len(priors))
 	for _, item := range priors {
-		Assert(t, item == 0)
+		Assert(t, item == expected, "expected uniform prior", expected, "got", item)
 	}
 }
 
@@ -200,10 +202,18 @@ func TestInduceUnderflow(t *testing.T) {
 	for i := 0; i < docSize; i++ {
 		document[i] = "word"
 	}
-	// should induce overflow, because each word
+	// should induce underflow, because each word
 	// will have "defaultProb", which is small
-	scores, _, _, err := c.SafeProbScores(document)
+	scores, inx, _, err := c.SafeProbScores(document)
 	Assert(t, err == ErrUnderflow, "Underflow error not detected")
+	// Verify log-sum-exp recovery produces valid probabilities
+	sum := 0.0
+	for _, s := range scores {
+		Assert(t, s >= 0 && s <= 1, "score out of range [0,1]:", s)
+		sum += s
+	}
+	Assert(t, sum > 0.999 && sum < 1.001, "scores don't sum to 1:", sum)
+	Assert(t, inx >= 0 && inx < len(scores), "index out of range:", inx)
 	println(scores)
 }