feat: DEV-3145: Add command for export project in Opensource (#2824)

guilhermemachado26 · makseq · web-flow · commit f997fc68eefa · 2022-08-25T12:33:29.000+03:00
* Create command to export project

* Implement function to export project

* Add unit tests

* Add extra optional argument to path

* Update unit tests

* Remove expanded drafts from tasks data

* Import EXPORT_DIR directly from base settings

* Use path join to assert filepath

* Update export documentation

* Fixes

* Avoid success message if export fails

* Update unit tests

* Update export documentation

* Max fixes

Co-authored-by: makseq-ubnt <makseq@gmail.com>
diff --git a/docs/source/guide/export.md b/docs/source/guide/export.md
@@ -36,7 +36,20 @@ Use the following steps to export data and annotations from the Label Studio UI.
 
 ### Export timeout in Community Edition
 
-If the export times out, see how to [export snapshots using the SDK](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.export_snapshot_create) or [API](#Export-snapshots-using-the-API).
+If the export times out, see how to [export snapshots using the SDK](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.export_snapshot_create) or [API](#Export-snapshots-using-the-API). You can also use a [console command](#Export-using-console-command) to export your project. For more information, see the following section.
+
+### Export using console command
+
+Use the following command to export data and annotations.
+
+```shell
+label-studio export <project-id> <export-format> --export-path=<output-path>
+```
+
+To enable logs:
+
+```shell
+DEBUG=1 LOG_LEVEL=DEBUG label-studio export <project-id> <export-format> --export-path=<output-path>
+```
 
 ### Export all tasks including tasks without annotations
 
diff --git a/label_studio/core/argparser.py b/label_studio/core/argparser.py
@@ -3,6 +3,7 @@
 import os
 import json
 
+from .settings.base import EXPORT_DIR
 from .utils.io import find_file
 
 
@@ -133,6 +134,18 @@ def project_name(raw_name):
         '--from-scratch', dest='from_scratch', default=False, action='store_true', help='Recalculate from scratch'
     )
 
+    # export_project sub-command parser
+    export_project = subparsers.add_parser('export', help='Export project in a specific format', parents=[root_parser])
+    export_project.add_argument('project_id', help='Project ID')
+    export_project.add_argument('export_format', help='Export format (JSON, JSON_MIN, CSV, etc)')
+    export_project.add_argument('--export-path', help='Export file path or directory', default=EXPORT_DIR)
+    default_params = '{"annotations__completed_by": {"only_id": null}, "interpolate_key_frames": true}'
+    export_project.add_argument(
+        '--export-serializer-context',
+        help=f"Export serializer context, default value: '{default_params}'",
+        default=default_params
+    )
+
     args = parser.parse_args(input_args)
 
     if not hasattr(args, 'label_config'):
diff --git a/label_studio/data_export/mixins.py b/label_studio/data_export/mixins.py
@@ -115,7 +115,8 @@ def _get_filtered_annotations_queryset(self, annotation_filter_options=None):
         q = reduce(lambda x, y: x | y, q_list)
         return queryset.filter(q)
 
-    def _get_export_serializer_option(self, serialization_options):
+    @staticmethod
+    def _get_export_serializer_option(serialization_options):
         options = {'expand': []}
         if isinstance(serialization_options, dict):
             if (
diff --git a/label_studio/server.py b/label_studio/server.py
@@ -306,6 +306,21 @@ def main():
         calculate_stats_all_orgs(input_args.from_scratch, redis=True)
         return
 
+    if input_args.command == 'export':
+        from tasks.functions import export_project
+
+        try:
+            filename = export_project(
+                input_args.project_id, input_args.export_format, input_args.export_path,
+                serializer_context=input_args.export_serializer_context
+            )
+        except Exception as e:
+            logger.exception(f'Failed to export project: {e}')
+        else:
+            logger.info(f'Project exported successfully: {filename}')
+
+        return
+
     # print version
     if input_args.command == 'version' or input_args.version:
         from label_studio import __version__
diff --git a/label_studio/tasks/functions.py b/label_studio/tasks/functions.py
@@ -1,10 +1,19 @@
+import os
 import sys
 import logging
+import json
+
+from django.conf import settings
 
 from core.models import AsyncMigrationStatus
 from core.redis import start_job_async_or_sync
+from core.utils.common import batch
+from data_export.models import DataExport
+from data_export.serializers import ExportDataSerializer
 from organizations.models import Organization
 from projects.models import Project
+from tasks.models import Task
+from data_export.mixins import ExportMixin
 
 
 def calculate_stats_all_orgs(from_scratch, redis):
@@ -63,3 +72,49 @@ def redis_job_for_calculation(org, from_scratch):
             f"End processing counters for project <{project.title}> ({project.id}), "
             f"processed {str(task_count)} tasks"
         )
+
+
+def export_project(project_id, export_format, path, serializer_context=None):
+    """Export all tasks of a project to a file in the requested format.
+
+    Args:
+        project_id: ID of the project to export.
+        export_format: Name of the export format; it is upper-cased before
+            being checked against ``DataExport.get_export_formats(project)``.
+        path: Output file path. If it points to an existing directory, the
+            file name produced by ``DataExport.generate_export_file`` is
+            appended to it.
+        serializer_context: Optional serializer options, either already
+            decoded or as a JSON-encoded string.
+
+    Returns:
+        The path of the file the export was written to.
+
+    Raises:
+        Project.DoesNotExist: If no project has the given ID.
+        ValueError: If ``export_format`` is not supported for the project.
+    """
+    logger = logging.getLogger(__name__)
+
+    project = Project.objects.get(id=project_id)
+
+    export_format = export_format.upper()
+    supported_formats = [s['name'] for s in DataExport.get_export_formats(project)]
+    # Raise explicitly rather than `assert`: assertions are stripped under `python -O`,
+    # which would let an unsupported format slip through to the converter.
+    if export_format not in supported_formats:
+        raise ValueError(f'Export format is not supported, please use {supported_formats}')
+
+    project_tasks = (
+        Task.objects.filter(project=project)
+        .select_related("project")
+        .prefetch_related("annotations", "predictions")
+    )
+
+    logger.debug(f"Start exporting project <{project.title}> ({project.id}) with task count {project_tasks.count()}.")
+
+    # serializer context may be passed as a JSON string (e.g. from the command line)
+    if isinstance(serializer_context, str):
+        serializer_context = json.loads(serializer_context)
+    serializer_options = ExportMixin._get_export_serializer_option(serializer_context)
+
+    # export cycle: serialize the tasks in batches of 1000
+    tasks = []
+    for task_batch in batch(project_tasks, 1000):
+        tasks += ExportDataSerializer(
+            task_batch,
+            many=True,
+            **serializer_options
+        ).data
+
+    # convert to output format; the returned content type is not needed here
+    export_stream, _, filename = DataExport.generate_export_file(
+        project, tasks, export_format, settings.CONVERTER_DOWNLOAD_RESOURCES, {}
+    )
+
+    # write to file: an existing directory gets the generated file name appended,
+    # anything else is treated as the target file path itself
+    filepath = os.path.join(path, filename) if os.path.isdir(path) else path
+    with open(filepath, "wb") as file:
+        file.write(export_stream.read())
+
+    logger.debug(f"End exporting project <{project.title}> ({project.id}) in {export_format} format.")
+
+    return filepath
diff --git a/label_studio/tests/tasks/__init__.py b/label_studio/tests/tasks/__init__.py
diff --git a/label_studio/tests/tasks/test_functions.py b/label_studio/tests/tasks/test_functions.py
@@ -0,0 +1,46 @@
+import io
+import os
+import pytest
+
+from django.conf import settings
+
+from data_export.serializers import ExportDataSerializer
+from tasks.functions import export_project
+
+pytestmark = pytest.mark.django_db
+
+
+class TestExportProject:
+    """Tests for ``tasks.functions.export_project``."""
+
+    @pytest.fixture
+    def generate_export_file(self, mocker):
+        """Patch ``DataExport.generate_export_file`` to return a fixed in-memory export."""
+        return mocker.patch(
+            "tasks.functions.DataExport.generate_export_file",
+            return_value=(io.BytesIO(b"stream"), "application/json", "project.json"),
+        )
+
+    @pytest.fixture
+    def project(self, configured_project):
+        """Alias the ``configured_project`` fixture under a shorter name."""
+        return configured_project
+
+    def test_export_project(self, mocker, generate_export_file, project):
+        """Export writes into EXPORT_DIR and passes the serialized tasks to the converter."""
+        data = ExportDataSerializer(
+            project.tasks.all(),
+            many=True,
+            context={"interpolate_key_frames": settings.INTERPOLATE_KEY_FRAMES},
+        ).data
+
+        # pytest-mock patches are active as soon as `mocker.patch` returns and are
+        # undone at fixture teardown; they must not be used as context managers.
+        mocker.patch("builtins.open")
+        filepath = export_project(project.id, "JSON", settings.EXPORT_DIR)
+
+        assert filepath == os.path.join(settings.EXPORT_DIR, "project.json")
+
+        generate_export_file.assert_called_once_with(
+            project, data, "JSON", settings.CONVERTER_DOWNLOAD_RESOURCES, {}
+        )
+
+    def test_project_does_not_exist(self, mocker, generate_export_file):
+        """Exporting an unknown project ID raises before any export file is generated."""
+        # guard against accidental file writes; no `with` needed (see pytest-mock docs)
+        mocker.patch("builtins.open")
+        # presumably Project.DoesNotExist from `Project.objects.get` — kept broad as in the original
+        with pytest.raises(Exception):
+            export_project(1, "JSON", settings.EXPORT_DIR)
+
+        generate_export_file.assert_not_called()