Skip to content

Commit 9ddbabb

Browse files
committed
crowd-kit==0.0.2
1 parent 5591e3d commit 9ddbabb

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

70 files changed

+2498
-1557
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
name='crowd-kit',
1010
package_dir={PREFIX: 'src'},
1111
packages=[f'{PREFIX}.{package}' for package in find_packages('src')],
12-
version='0.0.1',
12+
version='0.0.2',
1313
description='Python libraries for crowdsourcing',
1414
license='Apache 2.0',
1515
author='Vladimir Losev',

src/aggregation/__init__.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
1+
from .bradley_terry import BradleyTerry
12
from .dawid_skene import DawidSkene
23
from .gold_majority_vote import GoldMajorityVote
3-
from .majority_vote import MajorityVote
4+
from .hrrasa import HRRASA
45
from .m_msr import MMSR
6+
from .majority_vote import MajorityVote
7+
from .rasa import RASA
58
from .wawa import Wawa
69
from .zero_based_skill import ZeroBasedSkill
7-
from .hrrasa import HRRASA, RASA
810

9-
__all__ = ['DawidSkene', 'MajorityVote', 'MMSR', 'Wawa', 'GoldMajorityVote', 'ZeroBasedSkill', 'HRRASA', 'RASA']
11+
__all__ = ['DawidSkene', 'MajorityVote', 'MMSR', 'Wawa', 'GoldMajorityVote', 'ZeroBasedSkill', 'HRRASA', 'RASA',
12+
'BradleyTerry']

src/aggregation/annotations.py

Lines changed: 99 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import inspect
88
import textwrap
99
from io import StringIO
10-
from typing import ClassVar, Dict, Optional, Type, get_type_hints
10+
from typing import Dict, Optional, Type
1111

1212
import attr
1313
import pandas as pd
@@ -37,7 +37,7 @@ def manage_docstring(obj):
3737
attributes: Dict[str, Annotation] = {}
3838
new_annotations = {}
3939

40-
for key, value in get_type_hints(obj).items():
40+
for key, value in getattr(obj, '__annotations__', {}).items():
4141
if isinstance(value, Annotation):
4242
attributes[key] = value
4343
if value.type is not None:
@@ -64,37 +64,101 @@ def manage_docstring(obj):
6464
return obj
6565

6666

67-
PERFORMERS_SKILLS = Annotation(
68-
type=pd.Series,
69-
title='Predicted skills for each performer',
70-
description=textwrap.dedent("A series of performers' skills indexed by performers"),
67+
# Input data descriptions
68+
69+
70+
EMBEDDED_DATA = Annotation(
71+
type=pd.DataFrame,
72+
title="Performers' outputs with their embeddings",
73+
description='A pandas.DataFrame containing `task`, `performer`, `output` and `embedding` columns.'
7174
)
7275

73-
PROBAS = Annotation(
76+
LABELED_DATA = Annotation(
7477
type=pd.DataFrame,
75-
title='Estimated label probabilities',
78+
title="Performers' labeling results",
79+
description='A pandas.DataFrame containing `task`, `performer` and `label` columns.',
80+
)
81+
82+
83+
PAIRWISE_DATA = Annotation(
84+
type=pd.DataFrame,
85+
title="Performers' pairwise comparison results",
7686
description=textwrap.dedent('''
77-
A frame indexed by `task` and a column for every label id found
78-
in `data` such that `result.loc[task, label]` is the probability of `task`'s
79-
true label to be equal to `label`.
80-
'''),
87+
A pandas.DataFrame containing `performer`, `left`, `right`, and `label` columns.
88+
For each row `label` must be equal to either `left` or `right`.
89+
''')
8190
)
8291

83-
PRIORS = Annotation(
92+
93+
# Commonly used types
94+
95+
LABEL_PRIORS = Annotation(
8496
type=pd.Series,
8597
title='A prior label distribution',
86-
description="A series of labels' probabilities indexed by labels",
98+
description=textwrap.dedent('''
99+
A pandas.Series indexed by labels and holding corresponding label's
100+
probability of occurrence. Each probability is between 0 and 1,
101+
all probabilities should sum up to 1
102+
'''),
103+
)
104+
105+
LABEL_SCORES = Annotation(
106+
type=pd.Series,
107+
title="Labels' scores",
108+
description="A pandas.Series indexed by labels and holding corresponding label's scores",
109+
)
110+
111+
TASKS_EMBEDDINGS = Annotation(
112+
type=pd.DataFrame,
113+
title="Tasks' embeddings",
114+
description=textwrap.dedent("A pandas.DataFrame indexed by `task` with a single column `embedding`."),
87115
)
88116

89117
TASKS_LABELS = Annotation(
90118
type=pd.DataFrame,
91-
title='Estimated labels',
119+
title="Tasks' most likely true labels",
92120
description=textwrap.dedent('''
93-
A pandas.DataFrame indexed by `task` with a single column `label` containing
94-
`tasks`'s most probable label for last fitted data, or None otherwise.
121+
A pandas.Series indexed by `task` such that `labels.loc[task]`
122+
is the task's most likely true label.
95123
'''),
96124
)
97125

126+
TASKS_LABEL_PROBAS = Annotation(
127+
type=pd.DataFrame,
128+
title="Tasks' true label probability distributions",
129+
description=textwrap.dedent('''
130+
A pandas.DataFrame indexed by `task` such that `result.loc[task, label]`
131+
is the probability of `task`'s true label to be equal to `label`. Each
132+
probability is between 0 and 1, all task's probabilities should sum up to 1
133+
'''),
134+
)
135+
136+
TASKS_LABEL_SCORES = Annotation(
137+
type=pd.DataFrame,
138+
title="Tasks' true label scores",
139+
description=textwrap.dedent('''
140+
A pandas.DataFrame indexed by `task` such that `result.loc[task, label]`
141+
is the score of `label` for `task`.
142+
'''),
143+
)
144+
145+
TASKS_TRUE_LABELS = Annotation(
146+
type=pd.Series,
147+
title="Tasks' ground truth labels",
148+
description=textwrap.dedent('''
149+
A pandas.Series indexed by `task` such that `labels.loc[task]`
150+
is the task's ground truth label.
151+
'''),
152+
)
153+
154+
155+
SKILLS = Annotation(
156+
type=pd.Series,
157+
title="Performers' skills",
158+
description="A pandas.Series indexed by performers and holding corresponding performer's skill",
159+
)
160+
161+
98162
ERRORS = Annotation(
99163
type=pd.DataFrame,
100164
title="Performers' error matrices",
@@ -106,19 +170,28 @@ def manage_docstring(obj):
106170
'''),
107171
)
108172

109-
DATA = Annotation(
173+
174+
WEIGHTED_DATA = Annotation(
110175
type=pd.DataFrame,
111176
title='Input data',
112-
description='A pandas.DataFrame containing `task`, `performer` and `label` columns',
177+
description='A pandas.DataFrame containing `task`, `performer`, `label` and optionally `weight` columns',
178+
)
179+
180+
WEIGHTS = Annotation(
181+
type=pd.Series,
182+
title='Task weights',
183+
description='A pandas.Series indexed by `task` containing task weights'
113184
)
114185

115186

116-
def _make_opitonal_classlevel(annotation: Annotation):
117-
return attr.evolve(annotation, type=ClassVar[Optional[annotation.type]])
187+
def _make_opitonal(annotation: Annotation):
188+
return attr.evolve(annotation, type=Optional[annotation.type])
118189

119190

120-
OPTIONAL_CLASSLEVEL_PERFORMERS_SKILLS = _make_opitonal_classlevel(PERFORMERS_SKILLS)
121-
OPTIONAL_CLASSLEVEL_PROBAS = _make_opitonal_classlevel(PROBAS)
122-
OPTIONAL_CLASSLEVEL_PRIORS = _make_opitonal_classlevel(PRIORS)
123-
OPTIONAL_CLASSLEVEL_TASKS_LABELS = _make_opitonal_classlevel(TASKS_LABELS)
124-
OPTIONAL_CLASSLEVEL_ERRORS = _make_opitonal_classlevel(ERRORS)
191+
OPTIONAL_SCORES = _make_opitonal(TASKS_LABEL_SCORES)
192+
OPTIONAL_SKILLS = _make_opitonal(SKILLS)
193+
OPTIONAL_PROBAS = _make_opitonal(TASKS_LABEL_PROBAS)
194+
OPTIONAL_PRIORS = _make_opitonal(LABEL_PRIORS)
195+
OPTIONAL_LABELS = _make_opitonal(TASKS_LABELS)
196+
OPTIONAL_ERRORS = _make_opitonal(ERRORS)
197+
OPTIONAL_WEIGHTS = _make_opitonal(WEIGHTS)

src/aggregation/base_aggregator.py

Lines changed: 0 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1,89 +1,8 @@
11
__all__ = ['BaseAggregator']
22

3-
import random
4-
from typing import Union, Tuple
5-
6-
import attr
7-
import pandas as pd
8-
9-
from . import annotations
103
from .annotations import manage_docstring
114

125

13-
@attr.attrs(auto_attribs=True)
146
@manage_docstring
157
class BaseAggregator:
168
"""Base functions and fields for all aggregators"""
17-
18-
tasks_labels: annotations.OPTIONAL_CLASSLEVEL_TASKS_LABELS = None
19-
probas: annotations.OPTIONAL_CLASSLEVEL_PROBAS = None
20-
performers_skills: annotations.OPTIONAL_CLASSLEVEL_PERFORMERS_SKILLS = None
21-
22-
@staticmethod
23-
def _max_probas_random_on_ties(x: Union[pd.DataFrame, pd.Series]) -> Tuple[str, float]:
24-
"""Chooses max 'proba' value and return 'label' from same rows
25-
If several rows have same 'proba' - choose random
26-
"""
27-
max_proba = x.proba.max()
28-
max_label_index = random.choice(x[x.proba==max_proba].index)
29-
return x.label[max_label_index], max_proba
30-
31-
@manage_docstring
32-
def _calculate_probabilities(self, estimated_answers: pd.DataFrame) -> annotations.PROBAS:
33-
"""Calculate probabilities for each task for each label
34-
35-
Note:
36-
All "score" must be positive.
37-
If the sum of scores for a task is zero, then all probabilities for this task will be NaN.
38-
39-
Args:
40-
estimated_answers(pandas.DataFrame): Frame with "score" for each pair task-label.
41-
Should contain columns 'score', 'task', 'label'
42-
43-
"""
44-
assert (estimated_answers.score >= 0).all(), 'In answers exists some "score" with negative value'
45-
46-
estimated_answers['proba'] = estimated_answers.score / estimated_answers.groupby('task').score.transform('sum')
47-
self.probas = estimated_answers.pivot(index='task', columns='label', values='proba')
48-
return self.probas
49-
50-
@manage_docstring
51-
def _choose_labels(self, labels_probas: annotations.PROBAS) -> annotations.TASKS_LABELS:
52-
"""Selection of the labels with the most probalitities"""
53-
self.tasks_labels = labels_probas.idxmax(axis="columns").reset_index(name='label')
54-
return self.tasks_labels
55-
56-
@manage_docstring
57-
def _calc_performers_skills(self, answers: pd.DataFrame, task_truth: pd.DataFrame) -> annotations.PERFORMERS_SKILLS:
58-
"""Calculates skill for each performer
59-
60-
Note:
61-
There can be only one * correct label *
62-
63-
Args:
64-
answers (pandas.DataFrame): performers answers for tasks
65-
Should contain columns 'task', 'performer', 'label'
66-
task_truth (pandas.DataFrame): label regarding which to count the skill
67-
Should contain columns 'task', 'label'
68-
Could contain column 'weight'
69-
"""
70-
def _agreed_on_task(x):
71-
"""Calculates performers agreed for each based on:
72-
- result label in 'task_truth',
73-
- performer label in 'answers',
74-
- and 'weight' if it's exist
75-
"""
76-
return int(x['label'] == x['label_truth']) * x.get('weight', 1)
77-
78-
answers_with_results = answers.merge(task_truth, on='task', suffixes=('', '_truth'))
79-
answers_with_results['skill'] = answers_with_results.apply(_agreed_on_task, axis=1)
80-
self.performers_skills = answers_with_results.groupby('performer')['skill'].agg('mean').reset_index()
81-
return self.performers_skills
82-
83-
def _answers_base_checks(self, answers: pd.DataFrame) -> None:
84-
"""Checks basic 'answers' dataset requirements"""
85-
if not isinstance(answers, pd.DataFrame):
86-
raise TypeError('Working only with pandas DataFrame')
87-
assert 'task' in answers, 'There is no "task" column in answers'
88-
assert 'performer' in answers, 'There is no "performer" column in answers'
89-
assert 'label' in answers, 'There is no "label" column in answers'
Lines changed: 1 addition & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -1,82 +1,5 @@
1-
from pandas.core.frame import DataFrame
2-
from pandas.core.series import Series
3-
from typing import ClassVar, Tuple, Union, Optional
4-
51
class BaseAggregator:
6-
"""Base functions and fields for all aggregators
7-
Attributes:
8-
tasks_labels (typing.ClassVar[typing.Optional[pandas.core.frame.DataFrame]]): Estimated labels
9-
A pandas.DataFrame indexed by `task` with a single column `label` containing
10-
`tasks`'s most probable label for last fitted data, or None otherwise.
11-
12-
probas (typing.ClassVar[typing.Optional[pandas.core.frame.DataFrame]]): Estimated label probabilities
13-
A frame indexed by `task` and a column for every label id found
14-
in `data` such that `result.loc[task, label]` is the probability of `task`'s
15-
true label to be equal to `label`.
16-
17-
performers_skills (typing.ClassVar[typing.Optional[pandas.core.series.Series]]): Predicted skills for each performer
18-
A series of performers' skills indexed by performers"""
19-
20-
tasks_labels: ClassVar[Optional[DataFrame]]
21-
probas: ClassVar[Optional[DataFrame]]
22-
performers_skills: ClassVar[Optional[Series]]
23-
24-
def __init__(self) -> None:
25-
"""Method generated by attrs for class BaseAggregator."""
26-
...
27-
28-
def _answers_base_checks(self, answers: DataFrame) -> None:
29-
"""Checks basic 'answers' dataset requirements"""
30-
...
31-
32-
def _calc_performers_skills(self, answers: DataFrame, task_truth: DataFrame) -> Series:
33-
"""Calculates skill for each performer
34-
35-
Note:
36-
There can be only one * correct label *
37-
38-
Args:
39-
answers (pandas.DataFrame): performers answers for tasks
40-
Should contain columns 'task', 'performer', 'label'
41-
task_truth (pandas.DataFrame): label regarding which to count the skill
42-
Should contain columns 'task', 'label'
43-
Could contain column 'weight'Returns:
44-
Series: Predicted skills for each performer
45-
A series of performers' skills indexed by performers"""
46-
...
47-
48-
def _calculate_probabilities(self, estimated_answers: DataFrame) -> DataFrame:
49-
"""Calculate probabilities for each task for each label
50-
51-
Note:
52-
All "score" must be positive.
53-
If the sum of scores for a task is zero, then all probabilities for this task will be NaN.
54-
55-
Args:
56-
estimated_answers(pandas.DataFrame): Frame with "score" for each pair task-label.
57-
Should contain columns 'score', 'task', 'label'Returns:
58-
DataFrame: Estimated label probabilities
59-
A frame indexed by `task` and a column for every label id found
60-
in `data` such that `result.loc[task, label]` is the probability of `task`'s
61-
true label to be equal to `label`."""
62-
...
2+
"""Base functions and fields for all aggregators"""
633

64-
def _choose_labels(self, labels_probas: DataFrame) -> DataFrame:
65-
"""Selection of the labels with the most probalitities
66-
Args:
67-
labels_probas (DataFrame): Estimated label probabilities
68-
A frame indexed by `task` and a column for every label id found
69-
in `data` such that `result.loc[task, label]` is the probability of `task`'s
70-
true label to be equal to `label`.
714

72-
Returns:
73-
DataFrame: Estimated labels
74-
A pandas.DataFrame indexed by `task` with a single column `label` containing
75-
`tasks`'s most probable label for last fitted data, or None otherwise."""
76-
...
775

78-
@staticmethod
79-
def _max_probas_random_on_ties(x: Union[DataFrame, Series]) -> Tuple[str, float]:
80-
"""Chooses max 'proba' value and return 'label' from same rows
81-
If several rows have same 'proba' - choose random"""
82-
...

src/aggregation/base_embedding_aggregator.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
__all__ = ['BaseEmbeddingAggregator']
2+
3+
14
from typing import Any, Optional
25

36
import numpy as np

0 commit comments

Comments
 (0)