Toloka
diff --git a/‎setup.py‎
Lines changed: 1 addition & 1 deletion b/‎setup.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/aggregation/__init__.py‎
Lines changed: 6 additions & 3 deletions b/‎src/aggregation/__init__.py‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎src/aggregation/annotations.py‎
Lines changed: 99 additions & 26 deletions b/‎src/aggregation/annotations.py‎
Lines changed: 99 additions & 26 deletions
diff --git a/‎src/aggregation/base_aggregator.py‎
Lines changed: 0 additions & 81 deletions b/‎src/aggregation/base_aggregator.py‎
Lines changed: 0 additions & 81 deletions
diff --git a/‎src/aggregation/base_aggregator.pyi‎
Lines changed: 1 addition & 78 deletions b/‎src/aggregation/base_aggregator.pyi‎
Lines changed: 1 addition & 78 deletions
diff --git a/‎src/aggregation/base_embedding_aggregator.py‎
Lines changed: 3 additions & 0 deletions b/‎src/aggregation/base_embedding_aggregator.py‎
Lines changed: 3 additions & 0 deletions
@@ -9,7 +9,7 @@
     name='crowd-kit',
     package_dir={PREFIX: 'src'},
     packages=[f'{PREFIX}.{package}' for package in find_packages('src')],
-    version='0.0.1',
+    version='0.0.2',
     description='Python libraries for crowdsourcing',
     license='Apache 2.0',
     author='Vladimir Losev',
 
@@ -1,9 +1,12 @@
+from .bradley_terry import BradleyTerry
 from .dawid_skene import DawidSkene
 from .gold_majority_vote import GoldMajorityVote
-from .majority_vote import MajorityVote
+from .hrrasa import HRRASA
 from .m_msr import MMSR
+from .majority_vote import MajorityVote
+from .rasa import RASA
 from .wawa import Wawa
 from .zero_based_skill import ZeroBasedSkill
-from .hrrasa import HRRASA, RASA
 
-__all__ = ['DawidSkene', 'MajorityVote', 'MMSR', 'Wawa', 'GoldMajorityVote', 'ZeroBasedSkill', 'HRRASA', 'RASA']
+__all__ = ['DawidSkene', 'MajorityVote', 'MMSR', 'Wawa', 'GoldMajorityVote', 'ZeroBasedSkill', 'HRRASA', 'RASA',
+           'BradleyTerry']
@@ -7,7 +7,7 @@
 import inspect
 import textwrap
 from io import StringIO
-from typing import ClassVar, Dict, Optional, Type, get_type_hints
+from typing import Dict, Optional, Type
 
 import attr
 import pandas as pd
@@ -37,7 +37,7 @@ def manage_docstring(obj):
     attributes: Dict[str, Annotation] = {}
     new_annotations = {}
 
-    for key, value in get_type_hints(obj).items():
+    for key, value in getattr(obj, '__annotations__', {}).items():
         if isinstance(value, Annotation):
             attributes[key] = value
             if value.type is not None:
@@ -64,37 +64,101 @@ def manage_docstring(obj):
     return obj
 
 
-PERFORMERS_SKILLS = Annotation(
-    type=pd.Series,
-    title='Predicted skills for each performer',
-    description=textwrap.dedent("A series of performers' skills indexed by performers"),
+# Input data descriptions
+
+
+EMBEDDED_DATA = Annotation(
+    type=pd.DataFrame,
+    title="Performers' outputs with their embeddings",
+    description='A pandas.DataFrame containing `task`, `performer`, `output` and `embedding` columns.'
 )
 
-PROBAS = Annotation(
+LABELED_DATA = Annotation(
     type=pd.DataFrame,
-    title='Estimated label probabilities',
+    title="Performers' labeling results",
+    description='A pandas.DataFrame containing `task`, `performer` and `label` columns.',
+)
+
+
+PAIRWISE_DATA = Annotation(
+    type=pd.DataFrame,
+    title="Performers' pairwise comparison results",
     description=textwrap.dedent('''
-        A frame indexed by `task` and a column for every label id found
-        in `data` such that `result.loc[task, label]` is the probability of `task`'s
-        true label to be equal to `label`.
-    '''),
+        A pandas.DataFrame containing `performer`, `left`, `right`, and `label` columns.
+        For each row `label` must be equal to either `left` or `right`.
+    ''')
 )
 
-PRIORS = Annotation(
+
+# Commonly used types
+
+LABEL_PRIORS = Annotation(
     type=pd.Series,
     title='A prior label distribution',
-    description="A series of labels' probabilities indexed by labels",
+    description=textwrap.dedent('''
+        A pandas.Series indexed by labels and holding corresponding label's
+        probability of occurrence. Each probability is between 0 and 1,
+        all probabilities should sum up to 1
+    '''),
+)
+
+LABEL_SCORES = Annotation(
+    type=pd.Series,
+    title="Labels' scores",
+    description="A pandas.Series indexed by labels and holding corresponding label's scores",
+)
+
+TASKS_EMBEDDINGS = Annotation(
+    type=pd.DataFrame,
+    title="Tasks' embeddings",
+    description=textwrap.dedent("A pandas.DataFrame indexed by `task` with a single column `embedding`."),
 )
 
 TASKS_LABELS = Annotation(
     type=pd.DataFrame,
-    title='Estimated labels',
+    title="Tasks' most likely true labels",
     description=textwrap.dedent('''
-        A pandas.DataFrame indexed by `task` with a single column `label` containing
-        `tasks`'s most probable label for last fitted data, or None otherwise.
+        A pandas.Series indexed by `task` such that `labels.loc[task]`
+        is the task's most likely true label.
     '''),
 )
 
+TASKS_LABEL_PROBAS = Annotation(
+    type=pd.DataFrame,
+    title="Tasks' true label probability distributions",
+    description=textwrap.dedent('''
+        A pandas.DataFrame indexed by `task` such that `result.loc[task, label]`
+        is the probability of `task`'s true label to be equal to `label`. Each
+        probability is between 0 and 1, all task's probabilities should sum up to 1
+    '''),
+)
+
+TASKS_LABEL_SCORES = Annotation(
+    type=pd.DataFrame,
+    title="Tasks' true label scores",
+    description=textwrap.dedent('''
+        A pandas.DataFrame indexed by `task` such that `result.loc[task, label]`
+        is the score of `label` for `task`.
+    '''),
+)
+
+TASKS_TRUE_LABELS = Annotation(
+    type=pd.Series,
+    title="Tasks' ground truth labels",
+    description=textwrap.dedent('''
+        A pandas.Series indexed by `task` such that `labels.loc[task]`
+        is the task's ground truth label.
+    '''),
+)
+
+
+SKILLS = Annotation(
+    type=pd.Series,
+    title="Performers' skills",
+    description="A pandas.Series indexed by performers and holding corresponding performer's skill",
+)
+
+
 ERRORS = Annotation(
     type=pd.DataFrame,
     title="Performers' error matrices",
@@ -106,19 +170,28 @@ def manage_docstring(obj):
     '''),
 )
 
-DATA = Annotation(
+
+WEIGHTED_DATA = Annotation(
     type=pd.DataFrame,
     title='Input data',
-    description='A pandas.DataFrame containing `task`, `performer` and `label` columns',
+    description='A pandas.DataFrame containing `task`, `performer`, `label` and optionally `weight` columns',
+)
+
+WEIGHTS = Annotation(
+    type=pd.Series,
+    title='Task weights',
+    description='A pandas.Series indexed by `task` containing task weights'
 )
 
 
-def _make_opitonal_classlevel(annotation: Annotation):
-    return attr.evolve(annotation, type=ClassVar[Optional[annotation.type]])
+def _make_opitonal(annotation: Annotation):
+    return attr.evolve(annotation, type=Optional[annotation.type])
 
 
-OPTIONAL_CLASSLEVEL_PERFORMERS_SKILLS = _make_opitonal_classlevel(PERFORMERS_SKILLS)
-OPTIONAL_CLASSLEVEL_PROBAS = _make_opitonal_classlevel(PROBAS)
-OPTIONAL_CLASSLEVEL_PRIORS = _make_opitonal_classlevel(PRIORS)
-OPTIONAL_CLASSLEVEL_TASKS_LABELS = _make_opitonal_classlevel(TASKS_LABELS)
-OPTIONAL_CLASSLEVEL_ERRORS = _make_opitonal_classlevel(ERRORS)
+OPTIONAL_SCORES = _make_opitonal(TASKS_LABEL_SCORES)
+OPTIONAL_SKILLS = _make_opitonal(SKILLS)
+OPTIONAL_PROBAS = _make_opitonal(TASKS_LABEL_PROBAS)
+OPTIONAL_PRIORS = _make_opitonal(LABEL_PRIORS)
+OPTIONAL_LABELS = _make_opitonal(TASKS_LABELS)
+OPTIONAL_ERRORS = _make_opitonal(ERRORS)
+OPTIONAL_WEIGHTS = _make_opitonal(WEIGHTS)
@@ -1,89 +1,8 @@
 __all__ = ['BaseAggregator']
 
-import random
-from typing import Union, Tuple
-
-import attr
-import pandas as pd
-
-from . import annotations
 from .annotations import manage_docstring
 
 
-@attr.attrs(auto_attribs=True)
 @manage_docstring
 class BaseAggregator:
     """Base functions and fields for all aggregators"""
-
-    tasks_labels: annotations.OPTIONAL_CLASSLEVEL_TASKS_LABELS = None
-    probas: annotations.OPTIONAL_CLASSLEVEL_PROBAS = None
-    performers_skills: annotations.OPTIONAL_CLASSLEVEL_PERFORMERS_SKILLS = None
-
-    @staticmethod
-    def _max_probas_random_on_ties(x: Union[pd.DataFrame, pd.Series]) -> Tuple[str, float]:
-        """Chooses max 'proba' value and return 'label' from same rows
-        If several rows have same 'proba' - choose random
-        """
-        max_proba = x.proba.max()
-        max_label_index = random.choice(x[x.proba==max_proba].index)
-        return x.label[max_label_index], max_proba
-
-    @manage_docstring
-    def _calculate_probabilities(self, estimated_answers: pd.DataFrame) -> annotations.PROBAS:
-        """Calculate probabilities for each task for each label
-
-        Note:
-            All "score" must be positive.
-            If the sum of scores for a task is zero, then all probabilities for this task will be NaN.
-
-        Args:
-            estimated_answers(pandas.DataFrame): Frame with "score" for each pair task-label.
-                Should contain columns 'score', 'task', 'label'
-
-        """
-        assert (estimated_answers.score >= 0).all(), 'In answers exists some "score" with negative value'
-
-        estimated_answers['proba'] = estimated_answers.score / estimated_answers.groupby('task').score.transform('sum')
-        self.probas = estimated_answers.pivot(index='task', columns='label', values='proba')
-        return self.probas
-
-    @manage_docstring
-    def _choose_labels(self, labels_probas: annotations.PROBAS) -> annotations.TASKS_LABELS:
-        """Selection of the labels with the most probalitities"""
-        self.tasks_labels = labels_probas.idxmax(axis="columns").reset_index(name='label')
-        return self.tasks_labels
-
-    @manage_docstring
-    def _calc_performers_skills(self, answers: pd.DataFrame, task_truth: pd.DataFrame) -> annotations.PERFORMERS_SKILLS:
-        """Calculates skill for each performer
-
-        Note:
-            There can be only one * correct label *
-
-        Args:
-            answers (pandas.DataFrame): performers answers for tasks
-                Should contain columns 'task', 'performer', 'label'
-            task_truth (pandas.DataFrame): label regarding which to count the skill
-                Should contain columns 'task', 'label'
-                Could contain column 'weight'
-        """
-        def _agreed_on_task(x):
-            """Calculates performers agreed for each based on:
-            - result label in 'task_truth',
-            - performer label in 'answers',
-            - and 'weight' if it's exist
-            """
-            return int(x['label'] == x['label_truth']) * x.get('weight', 1)
-
-        answers_with_results = answers.merge(task_truth, on='task', suffixes=('', '_truth'))
-        answers_with_results['skill'] = answers_with_results.apply(_agreed_on_task, axis=1)
-        self.performers_skills = answers_with_results.groupby('performer')['skill'].agg('mean').reset_index()
-        return self.performers_skills
-
-    def _answers_base_checks(self, answers: pd.DataFrame) -> None:
-        """Checks basic 'answers' dataset requirements"""
-        if not isinstance(answers, pd.DataFrame):
-            raise TypeError('Working only with pandas DataFrame')
-        assert 'task' in answers, 'There is no "task" column in answers'
-        assert 'performer' in answers, 'There is no "performer" column in answers'
-        assert 'label' in answers, 'There is no "label" column in answers'
@@ -1,82 +1,5 @@
-from pandas.core.frame import DataFrame
-from pandas.core.series import Series
-from typing import ClassVar, Tuple, Union, Optional
-
 class BaseAggregator:
-    """Base functions and fields for all aggregators
-    Attributes:
-        tasks_labels (typing.ClassVar[typing.Optional[pandas.core.frame.DataFrame]]): Estimated labels
-            A pandas.DataFrame indexed by `task` with a single column `label` containing
-            `tasks`'s most probable label for last fitted data, or None otherwise.
-
-        probas (typing.ClassVar[typing.Optional[pandas.core.frame.DataFrame]]): Estimated label probabilities
-            A frame indexed by `task` and a column for every label id found
-            in `data` such that `result.loc[task, label]` is the probability of `task`'s
-            true label to be equal to `label`.
-
-        performers_skills (typing.ClassVar[typing.Optional[pandas.core.series.Series]]): Predicted skills for each performer
-            A series of performers' skills indexed by performers"""
-
-    tasks_labels: ClassVar[Optional[DataFrame]]
-    probas: ClassVar[Optional[DataFrame]]
-    performers_skills: ClassVar[Optional[Series]]
-
-    def __init__(self) -> None:
-        """Method generated by attrs for class BaseAggregator."""
-        ...
-
-    def _answers_base_checks(self, answers: DataFrame) -> None:
-        """Checks basic 'answers' dataset requirements"""
-        ...
-
-    def _calc_performers_skills(self, answers: DataFrame, task_truth: DataFrame) -> Series:
-        """Calculates skill for each performer
-
-        Note:
-            There can be only one * correct label *
-
-        Args:
-            answers (pandas.DataFrame): performers answers for tasks
-                Should contain columns 'task', 'performer', 'label'
-            task_truth (pandas.DataFrame): label regarding which to count the skill
-                Should contain columns 'task', 'label'
-                Could contain column 'weight'Returns:
-            Series: Predicted skills for each performer
-                A series of performers' skills indexed by performers"""
-        ...
-
-    def _calculate_probabilities(self, estimated_answers: DataFrame) -> DataFrame:
-        """Calculate probabilities for each task for each label
-
-        Note:
-            All "score" must be positive.
-            If the sum of scores for a task is zero, then all probabilities for this task will be NaN.
-
-        Args:
-            estimated_answers(pandas.DataFrame): Frame with "score" for each pair task-label.
-                Should contain columns 'score', 'task', 'label'Returns:
-            DataFrame: Estimated label probabilities
-                A frame indexed by `task` and a column for every label id found
-                in `data` such that `result.loc[task, label]` is the probability of `task`'s
-                true label to be equal to `label`."""
-        ...
+    """Base functions and fields for all aggregators"""
 
-    def _choose_labels(self, labels_probas: DataFrame) -> DataFrame:
-        """Selection of the labels with the most probalitities
-        Args:
-            labels_probas (DataFrame): Estimated label probabilities
-                A frame indexed by `task` and a column for every label id found
-                in `data` such that `result.loc[task, label]` is the probability of `task`'s
-                true label to be equal to `label`.
 
-        Returns:
-            DataFrame: Estimated labels
-                A pandas.DataFrame indexed by `task` with a single column `label` containing
-                `tasks`'s most probable label for last fitted data, or None otherwise."""
-        ...
 
-    @staticmethod
-    def _max_probas_random_on_ties(x: Union[DataFrame, Series]) -> Tuple[str, float]:
-        """Chooses max 'proba' value and return 'label' from same rows
-        If several rows have same 'proba' - choose random"""
-        ...
@@ -1,3 +1,6 @@
+__all__ = ['BaseEmbeddingAggregator']
+
+
 from typing import Any, Optional
 
 import numpy as np