Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 65 additions & 1 deletion osbenchmark/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
test_execution_orchestrator, results_publisher, \
metrics, workload, exceptions, log
from osbenchmark.builder import provision_config, builder
from osbenchmark.synthetic_data_generator import synthetic_data_generator
from osbenchmark.workload_generator import workload_generator
from osbenchmark.utils import io, convert, process, console, net, opts, versions
from osbenchmark import aggregator
Expand Down Expand Up @@ -157,6 +158,58 @@ def add_workload_source(subparser):
"--exclude-tasks",
help="Defines a comma-separated list of tasks not to run. By default all tasks of a test_procedure are run.")

# "generate-data" subcommand: builds a synthetic data corpus either from OpenSearch index mappings
# or from a user-supplied Python module.
synthetic_data_generator_parser = subparsers.add_parser(
    "generate-data",
    # The first fragment must end with a space; otherwise the two sentences run together in --help output.
    help="Generate synthetic data based on existing index mappings or custom module. " +
         "This data can be ported into OSB workloads or ingested into OpenSearch.")

# Exactly one document source has to be given: index mappings OR a custom module.
exclusive_file_inputs = synthetic_data_generator_parser.add_mutually_exclusive_group(required=True)
exclusive_file_inputs.add_argument(
    "--index-mappings",
    "-i",
    help="OpenSearch index mappings to generate data from."
)
exclusive_file_inputs.add_argument(
    "--custom-module",
    "-m",
    help="Custom Python module that defines how to generate documents. " +
         "It can contain function definitions and even class definitions. " +
         "This gives users more granular control over how data is generated. " +
         "This module must contain generate_fake_document() definition."
)

# Required group that currently has a single member, which makes --total-size mandatory.
exclusive_params = synthetic_data_generator_parser.add_mutually_exclusive_group(required=True)
exclusive_params.add_argument(
    "--total-size",
    "-s",
    type=int,
    help="Total size in GB of synthetically generated data corpora"
)
synthetic_data_generator_parser.add_argument(
    "--index-name",
    "-n",
    required=True,
    help="Index name associated with generated corpora"
)
synthetic_data_generator_parser.add_argument(
    "--output-path",
    "-p",
    # The default is resolved against the working directory at the time the parser is built.
    default=os.path.join(os.getcwd(), "generated_corpora"),
    help="Output path for data corpora. Data corpora will be written in a directory."
)
synthetic_data_generator_parser.add_argument(
    "--custom-config",
    "-c",
    default=None,
    help="Optional config where users can specify overrides for mapping synthetic data generator or values that module should use."
)
synthetic_data_generator_parser.add_argument(
    "--test-document",
    "-t",
    default=False,
    action="store_true",
    help="Generates a single synthetic document and displays it to the console so that users can validate generated values and output."
)

create_workload_parser = subparsers.add_parser("create-workload", help="Create a OSB workload from existing data")
create_workload_parser.add_argument(
"--workload",
Expand Down Expand Up @@ -713,7 +766,7 @@ def add_workload_source(subparser):
default=False)

for p in [list_parser, test_execution_parser, compare_parser, aggregate_parser,
download_parser, install_parser, start_parser, stop_parser, info_parser, create_workload_parser]:
download_parser, install_parser, start_parser, stop_parser, info_parser, synthetic_data_generator_parser, create_workload_parser]:
# This option is needed to support a separate configuration for the integration tests on the same machine
p.add_argument(
"--configuration-name",
Expand Down Expand Up @@ -1089,6 +1142,17 @@ def dispatch_sub_command(arg_parser, args, cfg):
execute_test(cfg, args.kill_running_processes)
else:
console.info("Please enter a valid number of test iterations")
elif sub_command == "generate-data":
# Copy each generate-data CLI option into the "synthetic_data_generator" config section.
# The config key and the argparse attribute share the same name, so one tuple drives both.
for option_name in (
        "index_name",
        "index_mappings",
        "custom_module",
        "custom_config",
        "output_path",
        "total_size",
        "test_document",
):
    cfg.add(config.Scope.applicationOverride, "synthetic_data_generator", option_name, getattr(args, option_name))

# Hand the populated config to the synthetic data generator.
synthetic_data_generator.orchestrate_data_generation(cfg)

elif sub_command == "create-workload":
cfg.add(config.Scope.applicationOverride, "generator", "indices", args.indices)
cfg.add(config.Scope.applicationOverride, "generator", "number_of_docs", args.number_of_docs)
Expand Down
7 changes: 7 additions & 0 deletions osbenchmark/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,3 +124,10 @@ class ConfigurationError(BenchmarkError):
Attributes:
message -- explanation of the error
"""

class MappingsError(BenchmarkError):
    """Raised for errors in the provided OpenSearch mappings.

    Subclasses BenchmarkError, so handlers that catch BenchmarkError also catch this.

    Attributes:
        message -- explanation of the error
    """
39 changes: 39 additions & 0 deletions osbenchmark/resources/example-basic-mappings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
{
"mappings": {
"properties": {
"title": {
"type": "text",
"analyzer": "standard",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"description": {
"type": "text"
},
"price": {
"type": "float"
},
"created_at": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"is_available": {
"type": "boolean"
},
"category_id": {
"type": "integer"
},
"tags": {
"type": "keyword"
}
}
},
"settings": {
"number_of_shards": 1,
"number_of_replicas": 1
}
}
211 changes: 211 additions & 0 deletions osbenchmark/resources/example-complex-mappings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
{
"mappings": {
"dynamic": "strict",
"properties": {
"user": {
"type": "object",
"properties": {
"id": {
"type": "keyword"
},
"email": {
"type": "keyword"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
},
"completion": {
"type": "completion"
}
},
"analyzer": "standard"
},
"address": {
"type": "object",
"properties": {
"street": {
"type": "text"
},
"city": {
"type": "keyword"
},
"state": {
"type": "keyword"
},
"zip": {
"type": "keyword"
},
"location": {
"type": "geo_point"
}
}
},
"preferences": {
"type": "object",
"dynamic": true
}
}
},
"orders": {
"type": "nested",
"properties": {
"id": {
"type": "keyword"
},
"date": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"amount": {
"type": "float"
},
"status": {
"type": "keyword"
},
"items": {
"type": "nested",
"properties": {
"product_id": {
"type": "keyword"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"quantity": {
"type": "short"
},
"price": {
"type": "float"
},
"categories": {
"type": "keyword"
}
}
},
"shipping_address": {
"type": "object",
"properties": {
"street": {
"type": "text"
},
"city": {
"type": "keyword"
},
"state": {
"type": "keyword"
},
"zip": {
"type": "keyword"
},
"location": {
"type": "geo_point"
}
}
}
}
},
"activity_log": {
"type": "nested",
"properties": {
"timestamp": {
"type": "date"
},
"action": {
"type": "keyword"
},
"ip_address": {
"type": "ip"
},
"details": {
"type": "object",
"enabled": false
}
}
},
"metadata": {
"type": "object",
"properties": {
"created_at": {
"type": "date"
},
"updated_at": {
"type": "date"
},
"tags": {
"type": "keyword"
},
"source": {
"type": "keyword"
},
"version": {
"type": "integer"
}
}
},
"description": {
"type": "text",
"analyzer": "english",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
},
"standard": {
"type": "text",
"analyzer": "standard"
}
}
},
"ranking_scores": {
"type": "object",
"properties": {
"popularity": {
"type": "float"
},
"relevance": {
"type": "float"
},
"quality": {
"type": "float"
}
}
},
"permissions": {
"type": "nested",
"properties": {
"user_id": {
"type": "keyword"
},
"role": {
"type": "keyword"
},
"granted_at": {
"type": "date"
}
}
}
}
},
"settings": {
"number_of_shards": 3,
"number_of_replicas": 2,
"analysis": {
"analyzer": {
"email_analyzer": {
"type": "custom",
"tokenizer": "uax_url_email",
"filter": ["lowercase", "stop"]
}
}
}
}
}
Loading
Loading