|
2 | 2 |
|
3 | 3 | import attr |
4 | 4 | import numpy as np |
| 5 | +import pandas as pd |
5 | 6 |
|
6 | 7 | from . import annotations |
7 | 8 | from .annotations import manage_docstring, Annotation |
@@ -54,16 +55,36 @@ def _e_step(data: annotations.LABELED_DATA, priors: annotations.LABEL_PRIORS, er |
54 | 55 | Given performer's answers, labels' prior probabilities and performer's
55 | 56 | errors probabilities matrix estimates tasks' true labels probabilities. |
56 | 57 | """ |
57 | | - joined = data.join(errors, on=['performer', 'label']) |
| 58 | + |
| 59 | + # We have to multiply lots of probabilities and such products are known to converge |
| 60 | + to zero exponentially fast. To avoid floating-point precision problems we work with
| 61 | + # logs of original values |
| 62 | + joined = data.join(np.log2(errors), on=['performer', 'label']) |
58 | 63 | joined.drop(columns=['performer', 'label'], inplace=True) |
59 | | - probas = priors * joined.groupby('task', sort=False).prod() |
60 | | - return probas.div(probas.sum(axis=1), axis=0) |
| 64 | + log_likelihoods = np.log2(priors) + joined.groupby('task', sort=False).sum() |
| 65 | + |
| 66 | + # Exponentiating log_likelihoods 'as is' may still get us beyond our precision. |
| 67 | + # So we shift every row of log_likelihoods by a constant (which is equivalent to |
| 68 | + # multiplying likelihoods rows by a constant) so that max log_likelihood in each |
| 69 | + # row is equal to 0. This trick ensures proper scaling after exponentiating and |
| 70 | + # does not affect the result of E-step |
| 71 | + scaled_likelihoods = np.exp2(log_likelihoods.sub(log_likelihoods.max(axis=1), axis=0)) |
| 72 | + return scaled_likelihoods.div(scaled_likelihoods.sum(axis=1), axis=0) |
61 | 73 |
|
62 | 74 | @manage_docstring |
63 | 75 | def fit(self, data: annotations.LABELED_DATA) -> Annotation(type='DawidSkene', title='self'): |
64 | 76 |
|
65 | | - # Initialization |
66 | 77 | data = data[['task', 'performer', 'label']] |
| 78 | + |
| 79 | + # Early exit |
| 80 | + if not data.size: |
| 81 | + self.probas_ = pd.DataFrame() |
| 82 | + self.priors_ = pd.Series() |
| 83 | + self.errors_ = pd.DataFrame() |
| 84 | + self.labels_ = pd.Series() |
| 85 | + return self |
| 86 | + |
| 87 | + # Initialization |
67 | 88 | probas = MajorityVote().fit_predict_proba(data) |
68 | 89 | priors = probas.mean() |
69 | 90 | errors = self._m_step(data, probas) |
|
0 commit comments