Skip to content

Commit 07bad1d

Browse files
committed
crowd-kit==0.0.4
1 parent d569426 commit 07bad1d

25 files changed

+440
-270
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
name='crowd-kit',
1010
package_dir={PREFIX: 'src'},
1111
packages=[f'{PREFIX}.{package}' for package in find_packages('src')],
12-
version='0.0.3',
12+
version='0.0.4',
1313
description='Python libraries for crowdsourcing',
1414
license='Apache 2.0',
1515
author='Vladimir Losev',

src/aggregation/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
from .bradley_terry import BradleyTerry
22
from .dawid_skene import DawidSkene
33
from .gold_majority_vote import GoldMajorityVote
4-
from .hrrasa import HRRASA
4+
from .hrrasa import HRRASA, TextHRRASA
55
from .m_msr import MMSR
66
from .majority_vote import MajorityVote
77
from .noisy_bt import NoisyBradleyTerry
8-
from .rasa import RASA
8+
from .rasa import RASA, TextRASA
99
from .wawa import Wawa
1010
from .zero_based_skill import ZeroBasedSkill
1111

1212
__all__ = ['DawidSkene', 'MajorityVote', 'MMSR', 'Wawa', 'GoldMajorityVote', 'ZeroBasedSkill', 'HRRASA', 'RASA',
13-
'BradleyTerry', 'NoisyBradleyTerry']
13+
'BradleyTerry', 'NoisyBradleyTerry', 'TextRASA', 'TextHRRASA']

src/aggregation/annotations.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def manage_docstring(obj):
125125

126126
TASKS_LABEL_PROBAS = Annotation(
127127
type=pd.DataFrame,
128-
title="Tasks' true label probability distributions",
128+
title="Tasks' label probability distributions",
129129
description=textwrap.dedent('''
130130
A pandas.DataFrame indexed by `task` such that `result.loc[task, label]`
131131
is the probability of `task`'s true label to be equal to `label`. Each
@@ -135,7 +135,7 @@ def manage_docstring(obj):
135135

136136
TASKS_LABEL_SCORES = Annotation(
137137
type=pd.DataFrame,
138-
title="Tasks' true label scores",
138+
title="Tasks' label scores",
139139
description=textwrap.dedent('''
140140
A pandas.DataFrame indexed by `task` such that `result.loc[task, label]`
141141
is the score of `label` for `task`.
@@ -187,9 +187,9 @@ def manage_docstring(obj):
187187
)
188188

189189
WEIGHTS = Annotation(
190-
type=pd.Series,
190+
type=pd.DataFrame,
191191
title='Task weights',
192-
description='A pandas.Series indexed by `task` containing task weights'
192+
description='A pandas.DataFrame containing `task`, `performer` and `weight`'
193193
)
194194

195195

src/aggregation/bradley_terry.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def fit(self, data: annotations.PAIRWISE_DATA) -> Annotation(type='BradleyTerry'
3737
M, unique_labels = self._build_win_matrix(data)
3838

3939
if not unique_labels.size:
40-
self.result_ = pd.Series([])
40+
self.result_ = pd.Series([], dtype=np.float64)
4141
return self
4242

4343
T = M.T + M

src/aggregation/closest_to_average.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,19 +21,15 @@ class ClosestToAverage(BaseAggregator):
2121
distance: Callable[[np.array, np.array], float] = attr.ib()
2222

2323
@manage_docstring
24-
def fit(self, data: annotations.EMBEDDED_DATA, skills: annotations.SKILLS = None,
24+
def fit(self, data: annotations.EMBEDDED_DATA, aggregated_embeddings: annotations.TASKS_EMBEDDINGS = None,
2525
true_embeddings: annotations.TASKS_EMBEDDINGS = None) -> Annotation(type='ClosestToAverage', title='self'):
2626

2727
data = data[['task', 'performer', 'output', 'embedding']]
28-
if skills is None:
28+
if aggregated_embeddings is None:
2929
avg_embeddings = data.groupby('task')['embedding'].avg()
30+
avg_embeddings.update(true_embeddings)
3031
else:
31-
data = data.join(skills.rename('skill'), on='performer')
32-
data['weighted_embedding'] = data.skill * data.embedding
33-
group = data.groupby('task')
34-
avg_embeddings = (group.weighted_embedding.apply(np.sum) / group.skill.sum())
35-
36-
avg_embeddings.update(true_embeddings)
32+
avg_embeddings = aggregated_embeddings
3733

3834
# Calculating distances (scores)
3935
data = data.join(avg_embeddings.rename('avg_embedding'), on='task')

src/aggregation/dawid_skene.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import attr
44
import numpy as np
5+
import pandas as pd
56

67
from . import annotations
78
from .annotations import manage_docstring, Annotation
@@ -54,16 +55,36 @@ def _e_step(data: annotations.LABELED_DATA, priors: annotations.LABEL_PRIORS, er
5455
Given performer's answers, labels' prior probabilities and performer's
5556
errors probabilities matrix estimates tasks' true labels probabilities.
5657
"""
57-
joined = data.join(errors, on=['performer', 'label'])
58+
59+
# We have to multiply lots of probabilities and such products are known to converge
60+
# to zero exponentially fast. To avoid floating-point precision problems we work with
61+
# logs of original values
62+
joined = data.join(np.log2(errors), on=['performer', 'label'])
5863
joined.drop(columns=['performer', 'label'], inplace=True)
59-
probas = priors * joined.groupby('task', sort=False).prod()
60-
return probas.div(probas.sum(axis=1), axis=0)
64+
log_likelihoods = np.log2(priors) + joined.groupby('task', sort=False).sum()
65+
66+
# Exponentiating log_likelihoods 'as is' may still get us beyond our precision.
67+
# So we shift every row of log_likelihoods by a constant (which is equivalent to
68+
# multiplying likelihoods rows by a constant) so that max log_likelihood in each
69+
# row is equal to 0. This trick ensures proper scaling after exponentiating and
70+
# does not affect the result of E-step
71+
scaled_likelihoods = np.exp2(log_likelihoods.sub(log_likelihoods.max(axis=1), axis=0))
72+
return scaled_likelihoods.div(scaled_likelihoods.sum(axis=1), axis=0)
6173

6274
@manage_docstring
6375
def fit(self, data: annotations.LABELED_DATA) -> Annotation(type='DawidSkene', title='self'):
6476

65-
# Initialization
6677
data = data[['task', 'performer', 'label']]
78+
79+
# Early exit
80+
if not data.size:
81+
self.probas_ = pd.DataFrame()
82+
self.priors_ = pd.Series()
83+
self.errors_ = pd.DataFrame()
84+
self.labels_ = pd.Series()
85+
return self
86+
87+
# Initialization
6788
probas = MajorityVote().fit_predict_proba(data)
6889
priors = probas.mean()
6990
errors = self._m_step(data, probas)

0 commit comments

Comments
 (0)