Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitleaks.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ description = "Global allow list"
stopwords = [
'''keyEntities''',
'''shiftKey''',
'''duplicated''',
]
paths = [
'''gitleaks\.toml''',
Expand Down
62 changes: 32 additions & 30 deletions label_studio/data_manager/actions/remove_duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,41 +144,43 @@ def restore_storage_links_for_duplicated_tasks(duplicates) -> None:
total_restored_links = 0
for data in list(duplicates):
tasks = duplicates[data]
source = None

def _get_storagelink(task):
    """Return ``(StorageLink class, link id)`` for the first link field present on *task*.

    `classes` (enclosing scope) maps a link field name to its StorageLink model
    class. Returns ``None`` when the task carries no storage link at all.
    """
    match = next(
        (
            (cls, link_id)
            for field, cls in classes.items()
            if (link_id := task.get(field))
        ),
        None,
    )
    return match

# find first task with existing StorageLink
tasks_without_storagelinks = []
tasks_with_storagelinks = []
for task in tasks:
for link in classes:
if link in task and task[link] is not None:
# we don't support case when there are many storage links in duplicated tasks
if source is not None:
source = None
break
source = (
task,
classes[link],
task[link],
) # last arg is a storage link id
if _get_storagelink(task):
tasks_with_storagelinks.append(task)
else:
tasks_without_storagelinks.append(task)

# add storage links to duplicates
if source:
storage_link_class = source[1] # get link name
for task in tasks:
if task['id'] != source[0]['id']:
# get already existing StorageLink
link_instance = storage_link_class.objects.get(id=source[2])

# assign existing StorageLink to other duplicated tasks
link = storage_link_class(
task_id=task['id'],
key=link_instance.key,
row_index=link_instance.row_index,
row_group=link_instance.row_group,
storage=link_instance.storage,
)
link.save()
total_restored_links += 1
logger.info(f"Restored storage link for task {task['id']} from source task {source[0]['id']}")
if tasks_with_storagelinks:
# we don't support the case where duplicated tasks have multiple storage links
storage_link_class, storage_link_id = _get_storagelink(tasks_with_storagelinks[0])
# get already existing StorageLink
link_instance = storage_link_class.objects.get(id=storage_link_id)

for task in tasks_without_storagelinks:
# assign existing StorageLink to other duplicated tasks
link = storage_link_class(
task_id=task['id'],
key=link_instance.key,
row_index=link_instance.row_index,
row_group=link_instance.row_group,
storage=link_instance.storage,
)
link.save()
total_restored_links += 1
logger.info(
f"Restored storage link for task {task['id']} from source task {tasks_with_storagelinks[0]['id']}"
)

logger.info(f'Restored {total_restored_links} storage links for duplicated tasks')

Expand Down
12 changes: 10 additions & 2 deletions label_studio/tests/data_manager/test_api_actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,10 +141,18 @@ def test_action_remove_duplicates(business_client, project_id, storage_model, li
link_model.objects.create(task=task4, key='duplicated.jpg', storage=storage)

# task 5: add a non-duplicated task using the same key, ensuring multiple tasks in the same key don't interfere
task_data = {'data': {'image': 'normal2.jpg'}}
task5 = make_task(task_data, project)
different_task_data = {'data': {'image': 'normal2.jpg'}}
task5 = make_task(different_task_data, project)
link_model.objects.create(task=task5, key='duplicated.jpg', row_index=1, storage=storage)

# task 6: add duplicated task with a different storage link
task6 = make_task(task_data, project)
link_model.objects.create(task=task6, key='duplicated2.jpg', storage=storage)

# task 7: add duplicated task with a different storage link
task7 = make_task(task_data, project)
link_model.objects.create(task=task7, key='duplicated3.jpg', storage=storage)

# call the "remove duplicated tasks" action
status = business_client.post(
f'/api/dm/actions?project={project_id}&id=remove_duplicates',
Expand Down
Loading