Commit 7203944

remove usage of SERVING_LOAD_MODELS and OPTION_MODEL_ID in examples/docs/tests

1 parent: b178dc9

8 files changed: +48, -100 lines

.github/workflows/llm_integration.yml

Lines changed: 1 addition & 1 deletion
@@ -562,7 +562,7 @@ jobs:
   working-directory: tests/integration
   run: |
     rm -rf models
-    echo -en "SERVING_LOAD_MODELS=test::MPI=/opt/ml/model\nOPTION_MAX_ROLLING_BATCH_SIZE=2\nOPTION_OUTPUT_FORMATTER=jsonlines\nOPTION_TENSOR_PARALLEL_DEGREE=1\nOPTION_MODEL_ID=gpt2\nOPTION_TASK=text-generation\nOPTION_ROLLING_BATCH=lmi-dist" > docker_env
+    echo -en "OPTION_MAX_ROLLING_BATCH_SIZE=2\nOPTION_OUTPUT_FORMATTER=jsonlines\nTENSOR_PARALLEL_DEGREE=1\nHF_MODEL_ID=gpt2\nOPTION_TASK=text-generation\nOPTION_ROLLING_BATCH=lmi-dist" > docker_env
     ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG nocode lmi
     python3 llm/client.py lmi_dist gpt2
     docker rm -f $(docker ps -aq)
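
The updated `docker_env` written by this step can be sanity-checked with a short Python sketch (illustrative only; the inline string mirrors the `echo` command above):

```python
# Illustrative check of the env file contents written by the workflow step above.
env_text = (
    "OPTION_MAX_ROLLING_BATCH_SIZE=2\n"
    "OPTION_OUTPUT_FORMATTER=jsonlines\n"
    "TENSOR_PARALLEL_DEGREE=1\n"
    "HF_MODEL_ID=gpt2\n"
    "OPTION_TASK=text-generation\n"
    "OPTION_ROLLING_BATCH=lmi-dist\n"
)

env = dict(line.split("=", 1) for line in env_text.splitlines() if line)

# The intent of this commit: the model is selected via HF_MODEL_ID, and the
# legacy SERVING_LOAD_MODELS / OPTION_MODEL_ID keys are no longer used.
assert "SERVING_LOAD_MODELS" not in env and "OPTION_MODEL_ID" not in env
assert env["HF_MODEL_ID"] == "gpt2"
assert env["TENSOR_PARALLEL_DEGREE"] == "1"
```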

engines/python/setup/djl_python/test_model.py

Lines changed: 2 additions & 0 deletions
@@ -219,6 +219,8 @@ def load_properties(properties_dir):
 def update_properties_with_env_vars(kwargs):
     env_vars = os.environ
     for key, value in env_vars.items():
+        if key == "HF_MODEL_ID":
+            kwargs.setdefault("model_id", value)
         if key.startswith("OPTION_"):
             key = key[7:].lower()
             if key == "entrypoint":
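
To show the effect of the added lines in isolation, here is a simplified, self-contained sketch of the environment-variable translation (not the real `djl_python` implementation, which also special-cases keys such as `entrypoint`):

```python
import os


def translate_env_vars(kwargs):
    # Simplified version of update_properties_with_env_vars from the diff above.
    for key, value in os.environ.items():
        if key == "HF_MODEL_ID":
            # New in this commit: HF_MODEL_ID populates model_id when it is not already set.
            kwargs.setdefault("model_id", value)
        if key.startswith("OPTION_"):
            # OPTION_<PROPERTY> maps to the lower-cased <property> key.
            kwargs.setdefault(key[7:].lower(), value)
    return kwargs


os.environ["HF_MODEL_ID"] = "gpt2"
os.environ["OPTION_ROLLING_BATCH"] = "lmi-dist"
props = translate_env_vars({})
print(props["model_id"], props["rolling_batch"])  # gpt2 lmi-dist
```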

engines/python/setup/djl_python/tests/test_test_model.py

Lines changed: 4 additions & 16 deletions
@@ -61,17 +61,14 @@ def test_all_code(self):
 
     def test_with_env(self):
         envs = {
-            "OPTION_MODEL_ID": "NousResearch/Nous-Hermes-Llama2-13b",
-            "SERVING_LOAD_MODELS": "test::MPI=/opt/ml/model",
+            "HF_MODEL_ID": "NousResearch/Nous-Hermes-Llama2-13b",
             "OPTION_ROLLING_BATCH": "auto",
             "OPTION_TGI_COMPAT": "true"
         }
         for key, value in envs.items():
             os.environ[key] = value
         huggingface.get_rolling_batch_class_from_str = override_rolling_batch
         handler = TestHandler(huggingface)
-        self.assertEqual(handler.serving_properties["model_id"],
-                         envs["OPTION_MODEL_ID"])
         self.assertEqual(handler.serving_properties["rolling_batch"],
                          envs["OPTION_ROLLING_BATCH"])
         self.assertEqual(handler.serving_properties["tgi_compat"],
@@ -100,17 +97,14 @@ def test_with_env(self):
 
     def test_with_tgi_compat_env(self):
         envs = {
-            "OPTION_MODEL_ID": "NousResearch/Nous-Hermes-Llama2-13b",
-            "SERVING_LOAD_MODELS": "test::MPI=/opt/ml/model",
+            "HF_MODEL_ID": "NousResearch/Nous-Hermes-Llama2-13b",
             "OPTION_ROLLING_BATCH": "auto",
             "OPTION_TGI_COMPAT": "true"
         }
         for key, value in envs.items():
             os.environ[key] = value
         huggingface.get_rolling_batch_class_from_str = override_rolling_batch
         handler = TestHandler(huggingface)
-        self.assertEqual(handler.serving_properties["model_id"],
-                         envs["OPTION_MODEL_ID"])
         self.assertEqual(handler.serving_properties["rolling_batch"],
                          envs["OPTION_ROLLING_BATCH"])
         self.assertEqual(handler.serving_properties["tgi_compat"],
@@ -162,16 +156,13 @@ def test_all_code_chat(self):
 
     def test_with_env_chat(self):
         envs = {
-            "OPTION_MODEL_ID": "TheBloke/Llama-2-7B-Chat-fp16",
-            "SERVING_LOAD_MODELS": "test::MPI=/opt/ml/model",
+            "HF_MODEL_ID": "TheBloke/Llama-2-7B-Chat-fp16",
             "OPTION_ROLLING_BATCH": "auto"
         }
         for key, value in envs.items():
             os.environ[key] = value
         huggingface.get_rolling_batch_class_from_str = override_rolling_batch
         handler = TestHandler(huggingface)
-        self.assertEqual(handler.serving_properties["model_id"],
-                         envs["OPTION_MODEL_ID"])
         self.assertEqual(handler.serving_properties["rolling_batch"],
                          envs["OPTION_ROLLING_BATCH"])
         inputs = [{
@@ -248,8 +239,7 @@ def test_exception_handling(self):
     @unittest.skip
     def test_profiling(self, logging_method):
         envs = {
-            "OPTION_MODEL_ID": "TheBloke/Llama-2-7B-Chat-fp16",
-            "SERVING_LOAD_MODELS": "test::MPI=/opt/ml/model",
+            "HF_MODEL_ID": "TheBloke/Llama-2-7B-Chat-fp16",
             "OPTION_ROLLING_BATCH": "auto",
             "DJL_PYTHON_PROFILING": "true",
             "DJL_PYTHON_PROFILING_TOP_OBJ": "60"
@@ -259,8 +249,6 @@ def test_profiling(self, logging_method):
             os.environ[key] = value
         huggingface.get_rolling_batch_class_from_str = override_rolling_batch
         handler = TestHandler(huggingface)
-        self.assertEqual(handler.serving_properties["model_id"],
-                         envs["OPTION_MODEL_ID"])
         self.assertEqual(handler.serving_properties["rolling_batch"],
                          envs["OPTION_ROLLING_BATCH"])
         inputs = [{
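
These tests configure the handler by writing to `os.environ`, so the values persist across test cases in the same process. A standalone sketch of scoping such overrides to a single test with the standard library (illustrative only, not part of this test suite):

```python
import os
import unittest
from unittest import mock


class EnvScopedExample(unittest.TestCase):
    """Illustrative only: scope HF_MODEL_ID / OPTION_* overrides to one test."""

    def test_env_is_restored(self):
        envs = {
            "HF_MODEL_ID": "NousResearch/Nous-Hermes-Llama2-13b",
            "OPTION_ROLLING_BATCH": "auto",
        }
        with mock.patch.dict(os.environ, envs):
            # Inside this block, code under test sees the overrides.
            self.assertEqual(os.environ["HF_MODEL_ID"], envs["HF_MODEL_ID"])
        # patch.dict restores os.environ on exit (assuming the key was unset before).
        self.assertNotIn("HF_MODEL_ID", os.environ)


if __name__ == "__main__":
    unittest.main()
```
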
Lines changed: 34 additions & 77 deletions
@@ -1,21 +1,14 @@
 # Container and Model Configurations
 
-The configuration supplied to LMI provides required and optional information that LMI will use to load and serve your model.
-LMI containers accept configurations provided in two formats. In order of priority, these are:
+The configuration supplied to LMI provides information that LMI will use to load and serve your model.
+LMI containers accept configurations provided in two formats.
 
 * `serving.properties` Configuration File (per model configurations)
 * Environment Variables (global configurations)
 
-We recommend using the `serving.properties` configuration file for the following reasons:
-
-* Supports SageMaker Multi Model Endpoints with per model configurations
-* Environment Variables are applied globally to all models hosted by the model server, so they can't be used for model specific configuration
-* Separates model configuration from the SageMaker Model Object (deployment unit)
-* Configurations can be modified and updated independently of the deployment unit/code
-
-Environment Variables are a good option for the proof-of-concept and experimentation phase for a single model.
-You can modify the environment variables as part of your deployment script without having to re-upload configurations to S3.
-This typically leads to a faster iteration loop when modifying and experimenting with configuration values.
+For most use-cases, using environment variables is sufficient.
+If you are deploying LMI to serve multiple models within the same container (SageMaker Multi-Model Endpoint use-case), you should use per model `serving.properties` configuration files.
+Environment Variables are global settings and will apply to all models being served within a single instance of LMI.
 
 While you can mix configurations between `serving.properties` and environment variables, we recommend you choose one and specify all configuration in that format.
 Configurations specified in the `serving.properties` files will override configurations specified in environment variables.
@@ -24,59 +17,50 @@ Both configuration mechanisms offer access to the same set of configurations.
 
 If you know which backend you are going to use, you can find a set of starter configurations in the corresponding [user guide](../user_guides/README.md).
 We recommend using the quick start configurations as a starting point if you have decided on a particular backend.
-The only change required to the starter configurations is specifying `option.model_id` to point to your model artifacts.
 
-We will now cover the components of a minimal starting configuration. This minimal configuration will look like:
+We will now cover the two types of configuration formats
 
-```
-# use standard python engine, or mpi aware python engine
-engine=<Python|MPI>
-# where the model artifacts are stored
-option.model_id=<hf_hub_model_id|s3_uri>
-# which inference library to use
-option.rolling_batch=<auto|vllm|lmi-dist|tensorrtllm>
-# how many gpus to shard the model across with tensor parallelism
-option.tensor_parallel_degree=<max|number between 1 and number of gpus available>
-```
+## serving.properties
 
-There are additional configurations that can be specified.
-We will cover the common configurations (across backends) in [LMI Common Configurations](#lmi-common-configurations)
+### Model Artifact Configuration (required)
 
-## Model Artifact Configuration
+If you are deploying model artifacts directly with the container, LMI will detect the artifacts in the default model store `/opt/ml/model`.
+This is the default location when using SageMaker, and where SageMaker will mount the artifacts when specified via [`ModelDataSource`](https://docs.aws.amazon.com/sagemaker/latest/dg/large-model-inference-uncompressed.html).
+You do not need to set any model artifact configurations when using this mechanism.
 
 If you are deploying a model hosted on the HuggingFace Hub, you must specify the `option.model_id=<hf_hub_model_id>` configuration.
-When using a model directly from the hub, we recommend you also specify the model revision (commit hash) via `option.revision=<commit hash>`.
+When using a model directly from the hub, we recommend you also specify the model revision (commit hash or branch) via `option.revision=<commit hash/branch>`.
 Since model artifacts are downloaded at runtime from the Hub, using a specific revision ensures you are using a model compatible with package versions in the runtime environment.
-Open Source model artifacts on the hub are subject to change at any time, and these changes may cause issues when instantiating the model (the model may require a newer version of transformers than what is available in the container).
-If a model provides custom model (*modeling.py) and custom tokenizer (*tokenizer.py) files, you need to specify `option.trust_remote_code=true` to load and use the model.
+Open Source model artifacts on the hub are subject to change at any time.
+These changes may cause issues when instantiating the model (updated model artifacts may require a newer version of a dependency than what is bundled in the container).
+If a model provides custom model (*modeling.py) and/or custom tokenizer (*tokenizer.py) files, you need to specify `option.trust_remote_code=true` to load and use the model.
 
 If you are deploying a model hosted in S3, `option.model_id=<s3 uri>` should be the s3 object prefix of the model artifacts.
 Alternatively, you can upload the `serving.properties` file to S3 alongside your model artifacts (under the same prefix) and omit the `option.model_id` config from your `serving.properties` file.
 Example code for leveraging uncompressed artifacts in S3 are provided in the [deploying your endpoint](deploying-your-endpoint.md#configuration---servingproperties) section.
 
-## Inference Library Configuration
-
-LMI expects the following two configurations to determine which backend to use:
+### Inference Library Configuration (optional)
 
-* `engine`. The options are `Python` and `MPI`, which dictates how we launch the Python processes
-* `option.rolling_batch`. This represents the inference library to use. The available options depend on the container.
-* `option.entryPoint`. This represents the default inference handler to use. In most cases, this can be auto-detected and does not need to be specified
+Inference library configurations are optional, but allow you to override the default backend for your model.
+To override, or explicitly set the inference backend, you should set `option.rolling_batch`.
+This represents the inference library to use.
+The available options depend on the container.
 
 In the LMI Container:
 
-* to use vLLM, use `engine=Python` and `option.rolling_batch=vllm`
-* to use lmi-dist, use `engine=MPI` and `option.rolling_batch=lmi-dist`
-* to use huggingface accelerate, use `engine=Python` and `option.rolling_batch=auto` for text generation models, or `option.rolling_batch=disable` for non-text generation models.
+* to use vLLM, use `option.rolling_batch=vllm`
+* to use lmi-dist, use `option.rolling_batch=lmi-dist`
+* to use huggingface accelerate, use `option.rolling_batch=auto` for text generation models, or `option.rolling_batch=disable` for non-text generation models.
 
 In the TensorRT-LLM Container:
 
-* use `engine=MPI` and `option.rolling_batch=trtllm` to use TensorRT-LLM
+* use `option.rolling_batch=trtllm` to use TensorRT-LLM (this is the default)
 
 In the Transformers NeuronX Container:
 
-* use `engine=Python` and `option.rolling_batch=auto` to use Transformers NeuronX
+* use `option.rolling_batch=auto` to use Transformers NeuronX (this is the default)
 
-## Tensor Parallelism Configuration
+### Tensor Parallelism Configuration
 
 The `option.tensor_parallel_degree` configuration is used to specify how many GPUs to shard the model across using tensor parallelism.
 This value should be between 1, and the maximum number of GPUs available on an instance.
@@ -87,12 +71,13 @@ Alternatively, if this value is specified as a number, LMI will attempt to maxim
 
 For example, using an instance with 4 gpus and a tensor parallel degree of 2 will result in 2 model copies, each using 2 gpus.
 
-## LMI Common Configurations
+
+### LMI Common Configurations
 
 There are two classes of configurations provided by LMI:
 
 * Model Server level configurations. These configurations do not have a prefix (e.g. `job_queue_size`)
-* Engine/Backend level configurations. These configurations have a `option.` prefix (e.g. `option.model_id`)
+* Engine/Backend level configurations. These configurations have a `option.` prefix (e.g. `option.dtype`)
 
 Since LMI is built using the DJLServing model server, all DJLServing configurations are available in LMI.
 You can find a list of these configurations [here](../../configurations_model.md#python-model-configuration).
@@ -123,48 +108,20 @@ You can find these configurations in the respective [user guides](../user_guides
 
 ## Environment Variable Configurations
 
-All LMI Configuration keys available in the `serving.properties` format can be specified as environment variables.
-
-The translation for `engine` is unique. The configuration `engine=<engine>` is translated to `SERVING_LOAD_MODELS=test::<engine>=/opt/ml/model`.
-For example:
+The core configurations available via environment variables are documented in our [starting guide](../user_guides/starting-guide.md#available-environment-variable-configurations).
 
-* `engine=Python` is translated to environment variable `SERVING_LOAD_MODELS=test::Python=/opt/ml/model`
-* `engine=MPI` is translated to environment variable `SERVING_LOAD_MODELS=test::MPI=/opt/ml/model`
+For other configurations, the `serving.property` configuration can be translated into an equivalent environment variable configuration.
 
-Configuration keys that start with `option.` can be specified as environment variables using the `OPTION_` prefix.
+Keys that start with `option.` can be specified as environment variables using the `OPTION_` prefix.
 The configuration `option.<property>` is translated to environment variable `OPTION_<PROPERTY>`. For example:
 
-* `option.model_id` is translated to environment variable `OPTION_MODEL_ID`
-* `option.tensor_parallel_degree` is translated to environment variable `OPTION_TENSOR_PARALLEL_DEGREE`
+* `option.rolling_batch` is translated to environment variable `OPTION_ROLLING_BATCH`
 
-Configuration keys that do not start with option can be specified as environment variables using the `SERVING_` prefix.
+Configuration keys that do not start with `option` can be specified as environment variables using the `SERVING_` prefix.
 The configuration `<property>` is translated to environment variable `SERVING_<PROPERTY>`. For example:
 
 * `job_queue_size` is translated to environment variable `SERVING_JOB_QUEUE_SIZE`
 
-For a full example, given the following `serving.properties` file:
-
-```
-engine=MPI
-option.model_id=tiiuae/falcon-40b
-option.entryPoint=djl_python.transformersneuronx
-option.trust_remote_code=true
-option.tensor_parallel_degree=4
-option.max_rolling_batch_size=32
-option.rolling_batch=auto
-```
-
-We can translate the configuration to environment variables like this:
-
-```
-HF_MODEL_ID=tiiuae/falcon-40b
-OPTION_ENTRYPOINT=djl_python.transformersneuronx
-HF_MODEL_TRUST_REMOTE_CODE=true
-TENSOR_PARALLEL_DEGREE=4
-OPTION_MAX_ROLLING_BATCH_SIZE=32
-OPTION_ROLLING_BATCH=auto
-```
-
 Next: [Deploying your endpoint](deploying-your-endpoint.md)
 
 Previous: [Backend Selection](backend-selection.md)
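
As a quick illustration of the translation rules described in the updated section, a small helper (illustrative only, not part of LMI; the `option.model_id` to `HF_MODEL_ID` mapping follows the guidance this commit switches to):

```python
# Illustrative translation of serving.properties keys into environment variables.
def to_env_var(prop_key: str) -> str:
    if prop_key == "option.model_id":
        # Per this commit, HF_MODEL_ID is preferred over OPTION_MODEL_ID.
        return "HF_MODEL_ID"
    if prop_key.startswith("option."):
        return "OPTION_" + prop_key[len("option."):].upper()
    return "SERVING_" + prop_key.upper()


for key in ("option.rolling_batch", "option.model_id", "job_queue_size"):
    print(f"{key} -> {to_env_var(key)}")
# option.rolling_batch -> OPTION_ROLLING_BATCH
# option.model_id -> HF_MODEL_ID
# job_queue_size -> SERVING_JOB_QUEUE_SIZE
```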

serving/docs/lmi/deployment_guide/deploying-your-endpoint.md

Lines changed: 2 additions & 1 deletion
@@ -176,7 +176,8 @@ The following options may be added to the `ModelDataSource` field to support unc
 This mechanism is useful when deploying SageMaker endpoints with network isolation.
 Model artifacts will be downloaded by SageMaker and mounted to the container rather than being downloaded by the container at runtime.
 
-If you use this mechanism to deploy the container, you should set `option.model_id=/opt/ml/model` in serving.properties, or `OPTION_MODEL_ID=/opt/ml/model` in environment variables depending on which configuration style you are using.
+If you use this mechanism to deploy the container, you do not need to specify the `option.model_id` or `HF_MODEL_ID` config.
+LMI will load the model artifacts from the model directory by default, which is where SageMaker downloads and mounts the model artifacts from S3.
 
 Follow this link for a detailed overview of this option: https://docs.aws.amazon.com/sagemaker/latest/dg/large-model-inference-uncompressed.html
 
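For context, a hedged boto3 sketch of an uncompressed-artifact deployment where no model id configuration is set (the image URI, role ARN, bucket, and environment values below are placeholders, not taken from the docs):

```python
import boto3

sm = boto3.client("sagemaker")

# Sketch only: placeholder names, ARNs, and URIs. SageMaker mounts the artifacts
# at /opt/ml/model, so neither HF_MODEL_ID nor option.model_id is needed.
sm.create_model(
    ModelName="my-lmi-model",
    ExecutionRoleArn="arn:aws:iam::111122223333:role/MySageMakerRole",
    PrimaryContainer={
        "Image": "<lmi-container-image-uri>",
        "ModelDataSource": {
            "S3DataSource": {
                "S3Uri": "s3://my-bucket/my-model-prefix/",
                "S3DataType": "S3Prefix",
                "CompressionType": "None",
            }
        },
        "Environment": {
            "OPTION_ROLLING_BATCH": "lmi-dist",
            "TENSOR_PARALLEL_DEGREE": "max",
        },
    },
)
```
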
serving/docs/lmi/deployment_guide/testing-custom-script.md

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ from djl_python import huggingface
 from djl_python.test_model import TestHandler
 
 envs = {
-    "OPTION_MODEL_ID": "NousResearch/Nous-Hermes-Llama2-13b",
+    "HF_MODEL_ID": "NousResearch/Nous-Hermes-Llama2-13b",
     "OPTION_MPI_MODE": "true",
     "OPTION_ROLLING_BATCH": "lmi-dist",
     "OPTION_TENSOR_PARALLEL_DEGREE": 4

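Completed for context, the surrounding snippet would now read roughly as follows (a sketch that assumes the `djl_python` package, sufficient GPUs, and network access to fetch the model; `TestHandler(huggingface)` and `serving_properties` mirror the tests changed in this commit):

```python
import os

from djl_python import huggingface
from djl_python.test_model import TestHandler

envs = {
    "HF_MODEL_ID": "NousResearch/Nous-Hermes-Llama2-13b",
    "OPTION_MPI_MODE": "true",
    "OPTION_ROLLING_BATCH": "lmi-dist",
    "OPTION_TENSOR_PARALLEL_DEGREE": "4",  # environment values must be strings
}
for key, value in envs.items():
    os.environ[key] = value

handler = TestHandler(huggingface)
# The model id is now picked up from HF_MODEL_ID rather than OPTION_MODEL_ID.
print(handler.serving_properties["model_id"])
```
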
serving/docs/lmi/tutorials/trtllm_aot_tutorial.md

Lines changed: 3 additions & 3 deletions
@@ -50,7 +50,7 @@ docker pull 763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.27.0-te
 These below configurations helps you configure the inference optimizations parameters. You can check all the configurations of TensorRT-LLM LMI handler [in our docs](../user_guides/trt_llm_user_guide.md#advanced-tensorrt-llm-configurations).
 
 ```
-OPTION_MODEL_ID={{s3url}}
+HF_MODEL_ID={{s3url}}
 OPTION_TENSOR_PARALLEL_DEGREE=8
 OPTION_MAX_ROLLING_BATCH_SIZE=128
 OPTION_DTYPE=fp16
@@ -87,7 +87,7 @@ In the below example, the model artifacts will be saved to `$MODEL_REPO_DIR` cre
 docker run --runtime=nvidia --gpus all --shm-size 12gb \
   -v $MODEL_REPO_DIR:/tmp/trtllm \
   -p 8080:8080 \
-  -e OPTION_MODEL_ID=$OPTION_MODEL_ID \
+  -e HF_MODEL_ID=$HF_MODEL_ID \
   -e OPTION_TENSOR_PARALLEL_DEGREE=$OPTION_TENSOR_PARALLEL_DEGREE \
   -e OPTION_MAX_ROLLING_BATCH_SIZE=$OPTION_MAX_ROLLING_BATCH_SIZE \
   -e OPTION_DTYPE=$OPTION_DTYPE \
@@ -115,7 +115,7 @@ aws s3 cp $MODEL_REPO_DIR s3://YOUR_S3_FOLDER_NAME/ --recursive
 **Note:** After uploading model artifacts to s3, you can just update the model_id(env var or in `serving.properties`) to the newly created s3 url with compiled model artifacts and use the same rest of the environment variables or `serving.properties` when deploying on SageMaker. Here, you can check the [tutorial](https://github.com/deepjavalibrary/djl-demo/blob/master/aws/sagemaker/large-model-inference/sample-llm/trtllm_rollingbatch_deploy_llama_13b.ipynb) on how to run inference using TensorRT-LLM DLC. Below snippet shows example updated model_id.
 
 ```
-OPTION_MODEL_ID=s3://YOUR_S3_FOLDER_NAME
+HF_MODEL_ID=s3://YOUR_S3_FOLDER_NAME
 OPTION_TENSOR_PARALLEL_DEGREE=8
 OPTION_MAX_ROLLING_BATCH_SIZE=128
 OPTION_DTYPE=fp16

serving/docs/lmi/tutorials/trtllm_manual_convert_tutorial.md

Lines changed: 1 addition & 1 deletion
@@ -254,7 +254,7 @@ Finally, you can use one of the following configuration to load your model on Sa
 
 ### 1. Environment variables:
 ```
-OPTION_MODEL_ID=s3://lmi-llm/trtllm/0.5.0/baichuan-13b-tp2/
+HF_MODEL_ID=s3://lmi-llm/trtllm/0.5.0/baichuan-13b-tp2/
 OPTION_TENSOR_PARALLEL_DEGREE=2
 OPTION_MAX_ROLLING_BATCH_SIZE=64
 ```
