opensearch-project · IanHoang · Jun 4, 2025 · May 19, 2025
diff --git a/osbenchmark/benchmark.py b/osbenchmark/benchmark.py
@@ -38,6 +38,7 @@
     test_execution_orchestrator, results_publisher, \
         metrics, workload, exceptions, log
 from osbenchmark.builder import provision_config, builder
+from osbenchmark.synthetic_data_generator import synthetic_data_generator
 from osbenchmark.workload_generator import workload_generator
 from osbenchmark.utils import io, convert, process, console, net, opts, versions
 from osbenchmark import aggregator
@@ -157,6 +158,58 @@ def add_workload_source(subparser):
         "--exclude-tasks",
         help="Defines a comma-separated list of tasks not to run. By default all tasks of a test_procedure are run.")
 
+    synthetic_data_generator_parser = subparsers.add_parser("generate-data",
+                                                            help="Generate synthetic data based on existing index mappings or custom module." +
+                                                            "This data can be ported into OSB workloads or ingested into OpenSearch." )
+
+    exclusive_file_inputs = synthetic_data_generator_parser.add_mutually_exclusive_group(required=True)
+    exclusive_file_inputs.add_argument(
+        "--index-mappings",
+        "-i",
+        help="OpenSearch index mappings to generate data from."
+    )
+    exclusive_file_inputs.add_argument(
+        "--custom-module",
+        "-m",
+        help="Custom Python module that defines how to generate documents. " +
+        "It can contain function definitions and even class definitions. " +
+        "This gives users more granular control over how data is generated. " +
+        "This module must contain generate_fake_document() definition."
+    )
+
+    exclusive_params = synthetic_data_generator_parser.add_mutually_exclusive_group(required=True)
+    exclusive_params.add_argument(
+        "--total-size",
+        "-s",
+        type=int,
+        help="Total size in GB of synthetically generated data corpora"
+    )
+    synthetic_data_generator_parser.add_argument(
+        "--index-name",
+        "-n",
+        required=True,
+        help="Index name associated with generated corpora"
+    )
+    synthetic_data_generator_parser.add_argument(
+        "--output-path",
+        "-p",
+        default=os.path.join(os.getcwd(), "generated_corpora"),
+        help="Output path for data corpora. Data corpora will be written in a directory."
+    )
+    synthetic_data_generator_parser.add_argument(
+        "--custom-config",
+        "-c",
+        default=None,
+        help="Optional config where users can specify overrides for mapping synthetic data generator or values that module should use."
+    )
+    synthetic_data_generator_parser.add_argument(
+        "--test-document",
+        "-t",
+        default=False,
+        action="store_true",
+        help="Generates a single synthetic document and displays it to the console so that users can validate generated values and output."
+    )
+
     create_workload_parser = subparsers.add_parser("create-workload", help="Create a OSB workload from existing data")
     create_workload_parser.add_argument(
         "--workload",
@@ -713,7 +766,7 @@ def add_workload_source(subparser):
         default=False)
 
     for p in [list_parser, test_execution_parser, compare_parser, aggregate_parser,
-              download_parser, install_parser, start_parser, stop_parser, info_parser, create_workload_parser]:
+              download_parser, install_parser, start_parser, stop_parser, info_parser, synthetic_data_generator_parser, create_workload_parser]:
         # This option is needed to support a separate configuration for the integration tests on the same machine
         p.add_argument(
             "--configuration-name",
@@ -1089,6 +1142,17 @@ def dispatch_sub_command(arg_parser, args, cfg):
                 execute_test(cfg, args.kill_running_processes)
             else:
                 console.info("Please enter a valid number of test iterations")
+        elif sub_command == "generate-data":
+            cfg.add(config.Scope.applicationOverride, "synthetic_data_generator", "index_name", args.index_name)
+            cfg.add(config.Scope.applicationOverride, "synthetic_data_generator", "index_mappings", args.index_mappings)
+            cfg.add(config.Scope.applicationOverride, "synthetic_data_generator", "custom_module", args.custom_module)
+            cfg.add(config.Scope.applicationOverride, "synthetic_data_generator", "custom_config", args.custom_config)
+            cfg.add(config.Scope.applicationOverride, "synthetic_data_generator", "output_path", args.output_path)
+            cfg.add(config.Scope.applicationOverride, "synthetic_data_generator", "total_size", args.total_size)
+            cfg.add(config.Scope.applicationOverride, "synthetic_data_generator", "test_document", args.test_document)
+
+            synthetic_data_generator.orchestrate_data_generation(cfg)
+
         elif sub_command == "create-workload":
             cfg.add(config.Scope.applicationOverride, "generator", "indices", args.indices)
             cfg.add(config.Scope.applicationOverride, "generator", "number_of_docs", args.number_of_docs)

diff --git a/osbenchmark/exceptions.py b/osbenchmark/exceptions.py
@@ -124,3 +124,10 @@ class ConfigurationError(BenchmarkError):
     Attributes:
         message -- explanation of the error
     """
+
+class MappingsError(BenchmarkError):
+    """Exception raised for errors in OpenSearch mappings provided.
+
+    Attributes:
+        message -- explanation of the error
+    """
diff --git a/osbenchmark/resources/example-basic-mappings.json b/osbenchmark/resources/example-basic-mappings.json
@@ -0,0 +1,39 @@
+{
+    "mappings": {
+      "properties": {
+        "title": {
+          "type": "text",
+          "analyzer": "standard",
+          "fields": {
+            "keyword": {
+              "type": "keyword",
+              "ignore_above": 256
+            }
+          }
+        },
+        "description": {
+          "type": "text"
+        },
+        "price": {
+          "type": "float"
+        },
+        "created_at": {
+          "type": "date",
+          "format": "strict_date_optional_time||epoch_millis"
+        },
+        "is_available": {
+          "type": "boolean"
+        },
+        "category_id": {
+          "type": "integer"
+        },
+        "tags": {
+          "type": "keyword"
+        }
+      }
+    },
+    "settings": {
+      "number_of_shards": 1,
+      "number_of_replicas": 1
+    }
+  }
diff --git a/osbenchmark/resources/example-complex-mappings.json b/osbenchmark/resources/example-complex-mappings.json
@@ -0,0 +1,211 @@
+{
+    "mappings": {
+      "dynamic": "strict",
+      "properties": {
+        "user": {
+          "type": "object",
+          "properties": {
+            "id": {
+              "type": "keyword"
+            },
+            "email": {
+              "type": "keyword"
+            },
+            "name": {
+              "type": "text",
+              "fields": {
+                "keyword": {
+                  "type": "keyword",
+                  "ignore_above": 256
+                },
+                "completion": {
+                  "type": "completion"
+                }
+              },
+              "analyzer": "standard"
+            },
+            "address": {
+              "type": "object",
+              "properties": {
+                "street": {
+                  "type": "text"
+                },
+                "city": {
+                  "type": "keyword"
+                },
+                "state": {
+                  "type": "keyword"
+                },
+                "zip": {
+                  "type": "keyword"
+                },
+                "location": {
+                  "type": "geo_point"
+                }
+              }
+            },
+            "preferences": {
+              "type": "object",
+              "dynamic": true
+            }
+          }
+        },
+        "orders": {
+          "type": "nested",
+          "properties": {
+            "id": {
+              "type": "keyword"
+            },
+            "date": {
+              "type": "date",
+              "format": "strict_date_optional_time||epoch_millis"
+            },
+            "amount": {
+              "type": "float"
+            },
+            "status": {
+              "type": "keyword"
+            },
+            "items": {
+              "type": "nested",
+              "properties": {
+                "product_id": {
+                  "type": "keyword"
+                },
+                "name": {
+                  "type": "text",
+                  "fields": {
+                    "keyword": {
+                      "type": "keyword"
+                    }
+                  }
+                },
+                "quantity": {
+                  "type": "short"
+                },
+                "price": {
+                  "type": "float"
+                },
+                "categories": {
+                  "type": "keyword"
+                }
+              }
+            },
+            "shipping_address": {
+              "type": "object",
+              "properties": {
+                "street": {
+                  "type": "text"
+                },
+                "city": {
+                  "type": "keyword"
+                },
+                "state": {
+                  "type": "keyword"
+                },
+                "zip": {
+                  "type": "keyword"
+                },
+                "location": {
+                  "type": "geo_point"
+                }
+              }
+            }
+          }
+        },
+        "activity_log": {
+          "type": "nested",
+          "properties": {
+            "timestamp": {
+              "type": "date"
+            },
+            "action": {
+              "type": "keyword"
+            },
+            "ip_address": {
+              "type": "ip"
+            },
+            "details": {
+              "type": "object",
+              "enabled": false
+            }
+          }
+        },
+        "metadata": {
+          "type": "object",
+          "properties": {
+            "created_at": {
+              "type": "date"
+            },
+            "updated_at": {
+              "type": "date"
+            },
+            "tags": {
+              "type": "keyword"
+            },
+            "source": {
+              "type": "keyword"
+            },
+            "version": {
+              "type": "integer"
+            }
+          }
+        },
+        "description": {
+          "type": "text",
+          "analyzer": "english",
+          "fields": {
+            "keyword": {
+              "type": "keyword",
+              "ignore_above": 256
+            },
+            "standard": {
+              "type": "text",
+              "analyzer": "standard"
+            }
+          }
+        },
+        "ranking_scores": {
+          "type": "object",
+          "properties": {
+            "popularity": {
+              "type": "float"
+            },
+            "relevance": {
+              "type": "float"
+            },
+            "quality": {
+              "type": "float"
+            }
+          }
+        },
+        "permissions": {
+          "type": "nested",
+          "properties": {
+            "user_id": {
+              "type": "keyword"
+            },
+            "role": {
+              "type": "keyword"
+            },
+            "granted_at": {
+              "type": "date"
+            }
+          }
+        }
+      }
+    },
+    "settings": {
+      "number_of_shards": 3,
+      "number_of_replicas": 2,
+      "analysis": {
+        "analyzer": {
+          "email_analyzer": {
+            "type": "custom",
+            "tokenizer": "uax_url_email",
+            "filter": ["lowercase", "stop"]
+          }
+        }
+      }
+    }
+  }