Running Stained Glass Proxy via vLLM on Modal¶
This guide explains how to run the Stained Glass Proxy locally via Docker Compose while pointing to a vLLM server with Stained Glass Output Protection hosted on Modal, a serverless compute platform that lets you run Python code in the cloud without managing infrastructure. A Python test script is included to verify that the system is functioning end-to-end.
Prerequisites¶
You need Python 3.10 or higher and a Modal account. Install the Modal CLI:
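pip install modal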
Then authenticate with:
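modal setup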
Deploying vLLM with Output Protection on Modal¶
We will deploy a vLLM server on Modal using a GPU-backed container serving the RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8 model.
%%writefile vllm_modal_inference.py
from typing import Final
import modal
MODEL_NAME: Final[str] = "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
SERVED_MODEL_NAME: Final[str] = "meta-llama/Llama-3.1-8B-Instruct"
MODEL_REVISION: Final[str] = "12fd6884d2585dd4d020373e7f39f74507b31866" # pragma: allowlist secret
VLLM_PORT: Final[int] = 8000
N_GPU: Final[int] = 1
MINUTES: Final[int] = 60
# Use Docker Image from ECR.
container_pull_secret = modal.Secret.from_name("container-secret")
vllm_image = (
    modal.Image.from_aws_ecr(
        "**********.dkr.ecr.us-east-1.amazonaws.com/protopia/stainedglass-inference-server:1.2.1-2e7c344-obfuscated",
        secret=container_pull_secret,
    )
    .env({
        "HF_HUB_ENABLE_HF_TRANSFER": "1",  # faster model transfers
        "SG_REGISTRY_CONNECTION_SECRET": "some-madeup-secret",
    })
    .run_commands("ln -s /usr/bin/python3 /usr/bin/python")
    .entrypoint([])
)
# Configure Cache Volumes
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
app = modal.App("output-protected-vllm-inference")
@app.function(
    image=vllm_image,
    gpu=f"H100:{N_GPU}",
    scaledown_window=15 * MINUTES,  # how long should we stay up with no requests?
    timeout=10 * MINUTES,  # how long should we wait for container start?
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
)
@modal.concurrent(max_inputs=32) # how many requests can one replica handle? tune carefully!
@modal.web_server(port=VLLM_PORT, startup_timeout=10 * MINUTES, requires_proxy_auth=True)
def serve():
    import subprocess

    cmd = [
        "python3", "-m", "stainedglass_output_protection.vllm.entrypoint",
        "--no-enable-chunked-prefill",
        "--enable-prompt-embeds",
        "--model", MODEL_NAME,
        "--revision", MODEL_REVISION,
        "--served-model-name", SERVED_MODEL_NAME,
        "--host", "0.0.0.0",
        "--port", str(VLLM_PORT),
    ]
    # assume multiple GPUs are for splitting up large matrix multiplications
    cmd += ["--tensor-parallel-size", str(N_GPU)]
    print("Launching vLLM with command:")
    print(" ".join(cmd))
    subprocess.Popen(" ".join(cmd), shell=True)
Overwriting vllm_modal_inference.py
%%capture output --no-display
!modal deploy vllm_modal_inference.py
# If you want to see the output from the above command, uncomment the line below.
# print(output.stdout) # pyright: ignore[reportUndefinedVariable]
Test Modal Endpoints¶
With proxy auth enabled on the Modal web server, you will need to create Modal proxy auth tokens and set them as environment variables for use downstream.
import os
os.environ["MODAL_KEY"] = "***********************"
os.environ["MODAL_SECRET"] = "***********************"
Test Endpoints¶
Note: You will need to change LLM_URL below to your Modal deployment URL.
from typing import Final
import openai
LLM_URL: Final[str] = (
"https://protopia--output-protected-vllm-inference-serve.modal.run/v1"
)
API_KEY: Final[str] = "dummy_key"
SERVED_MODEL_NAME: Final[str] = "meta-llama/Llama-3.1-8B-Instruct"
MODAL_KEY: Final[str] = os.environ.get("MODAL_KEY", "unknown")
MODAL_SECRET: Final[str] = os.environ.get("MODAL_SECRET", "unknown")
HEADERS: Final[dict[str, str]] = {
"Modal-Key": MODAL_KEY,
"Modal-Secret": MODAL_SECRET,
}
modal_client_with_auth = openai.OpenAI(
api_key=API_KEY, base_url=LLM_URL, default_headers=HEADERS
)
modal_client_no_auth = openai.OpenAI(api_key=API_KEY, base_url=LLM_URL)
# This should throw an authentication error - missing credentials for proxy authorization
modal_client_no_auth.models.list()
--------------------------------------------------------------------------- AuthenticationError Traceback (most recent call last) Cell In[77], line 2 1 # This should thow an authentication error - missing credentials for proxy authorization ----> 2 client_no_auth.models.list() File ~/.conda/envs/proxy/lib/python3.11/site-packages/openai/resources/models.py:91, in Models.list(self, extra_headers, extra_query, extra_body, timeout) 77 def list( 78 self, 79 *, (...) 85 timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, 86 ) -> SyncPage[Model]: 87 """ 88 Lists the currently available models, and provides basic information about each 89 one such as the owner and availability. 90 """ ---> 91 return self._get_api_list( 92 "/models", 93 page=SyncPage[Model], 94 options=make_request_options( 95 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout 96 ), 97 model=Model, 98 ) File ~/.conda/envs/proxy/lib/python3.11/site-packages/openai/_base_client.py:1308, in SyncAPIClient.get_api_list(self, path, model, page, body, options, method) 1297 def get_api_list( 1298 self, 1299 path: str, (...) 1305 method: str = "get", 1306 ) -> SyncPageT: 1307 opts = FinalRequestOptions.construct(method=method, url=path, json_data=body, **options) -> 1308 return self._request_api_list(model, page, opts) File ~/.conda/envs/proxy/lib/python3.11/site-packages/openai/_base_client.py:1159, in SyncAPIClient._request_api_list(self, model, page, options) 1155 return resp 1157 options.post_parser = _parser -> 1159 return self.request(page, options, stream=False) File ~/.conda/envs/proxy/lib/python3.11/site-packages/openai/_base_client.py:1047, in SyncAPIClient.request(self, cast_to, options, stream, stream_cls) 1044 err.response.read() 1046 log.debug("Re-raising status error") -> 1047 raise self._make_status_error_from_response(err.response) from None 1049 break 1051 assert response is not None, "could not resolve response (should never happen)" AuthenticationError: modal-http: missing credentials for proxy authorization
# Client with auth headers is successful
print(modal_client_with_auth.models.list().model_dump_json(indent=4))
{ "data": [ { "id": "meta-llama/Llama-3.1-8B-Instruct", "created": 1757547295, "object": "model", "owned_by": "vllm", "root": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", "parent": null, "max_model_len": 131072, "permission": [ { "id": "modelperm-27708c79e5fe4251ad7a566fe15cf29b", "object": "model_permission", "created": 1757547295, "allow_create_engine": false, "allow_sampling": true, "allow_logprobs": true, "allow_search_indices": false, "allow_view": true, "allow_fine_tuning": false, "organization": "*", "group": null, "is_blocking": false } ] } ], "object": "list" }
%%writefile docker-compose.modal.yaml
---
services:
  stainedglass:
    image: **********.dkr.ecr.us-east-1.amazonaws.com/protopia/stainedglass-proxy:1.12.1-5feefe2-obfuscated
    environment:
      SGP_INFERENCE_SERVICE_HOST: https://protopia--output-protected-vllm-inference-serve.modal.run
      SGP_SGT_PATH: "/app/sgt_model.sgt"
      SGP_DEVICE: "cuda"
      SGP_MAX_NEW_TOKENS: 1000
      SGP_NUM_SGT_WORKERS: 1
      SGP_OUTPUT_DECRYPTION: "True"
      SGP_USE_AIOHTTP_FOR_UPSTREAM: "True"
      SGP_SGT_TORCH_DTYPE: "torch.bfloat16"
      SGP_SGT_NOISE_LAYER_ATTENTION: "flash_attention_2"
      SGP_RECONSTRUCTION_MAX_SEQUENCE_LENGTH: 2048
      SGP_RECONSTRUCTION_MAX_NUM_EMBEDDINGS: 2048
      SGP_RECONSTRUCTION_MAX_BATCH_SIZE: 512
      SGP_ALLOWED_HEADERS: "Modal-Key,Modal-Secret,x-server-public-key"
    ports:
      - "8666:8600"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['6'] # Add or remove available GPU IDs
              capabilities: [gpu]
    healthcheck:
      test: curl --fail http://localhost:8600/livez || exit 1
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 10s
Overwriting docker-compose.modal.yaml
!docker compose -f docker-compose.modal.yaml up -d
[+] Running 2/2 ✔ Network inference-providers_default Created 0.1s ✔ Container inference-providers-stainedglass-1 Started 0.7s
!docker compose -f docker-compose.modal.yaml logs
stainedglass-1 | INFO 09-10 23:41:09 [__init__.py:241] Automatically detected platform cuda. stainedglass-1 | 2025-09-10 23:41:10 | uvicorn.error | INFO | None | Started server process [1] stainedglass-1 | 2025-09-10 23:41:10 | uvicorn.error | INFO | None | Waiting for application startup. stainedglass-1 | 2025-09-10 23:41:10 | stainedglass_proxy.dependencies | INFO | None | Initializing pre-run lifespan events. stainedglass-1 | 2025-09-10 23:41:10 | stainedglass_proxy.dependencies | INFO | None | Proxy settings: inference_service_host='https://protopia--output-protected-vllm-inference-serve.modal.run' sgt_path='/app/sgt_model.sgt' min_new_tokens=None seed=None temperature=0.3 top_p=0.2 top_k=5000 repetition_penalty=1.0 upstream_keep_alive_timeout=5.0 session_timeout=60.0 api_username=None api_password=None use_aiohttp_for_upstream=True sagemaker_endpoint_name=None device='cuda' num_sgt_workers=1 tensor_parallel_size=None grace_period_seconds=5 worker_ready_timeout_seconds=None sgt_torch_dtype='torch.bfloat16' sgt_noise_layer_attention='flash_attention_2' max_input_tokens=None output_decryption=True ephemeral_key_refresh_time_seconds=900.0 reconstruction_max_batch_size=512 reconstruction_max_sequence_length=2048 reconstruction_max_num_embeddings=2048 profile=False profile_data_folder=PosixPath('profile') logging_config_file='logging.yaml' license_manager_configuration_file=PosixPath('license_manager_configuration.json') allowed_headers={'modal-key', 'modal-secret', 'x-server-public-key'} stainedglass-1 | 2025-09-10 23:41:10 | stainedglass_proxy.upstream.aiohttp | INFO | None | Initializing HTTP session client. stainedglass-1 | 2025-09-10 23:41:10 | stainedglass_proxy.sgt_manager | INFO | None | Starting 1 SGT worker processes stainedglass-1 | 2025-09-10 23:41:10 | stainedglass_proxy.sgt_manager | INFO | None | Loading Stained Glass Transform model layer for inference. stainedglass-1 | INFO 09-10 23:41:14 [__init__.py:241] Automatically detected platform cuda. stainedglass-1 | 2025-09-10 23:41:15 | sgt_worker_0 | INFO | None | Starting SGT worker 0/1 on device cuda, tensor_parallel=False stainedglass-1 | 2025-09-10 23:41:15 | sgt_worker_0 | INFO | None | Loading SGT on device cuda:0 stainedglass-1 | 2025-09-10 23:41:15 | stainedglass_proxy.sgt_manager | INFO | None | Loading Stained Glass Transform model layer for inference. 
stainedglass-1 | 2025-09-10 23:41:29 | stainedglass_proxy.sgt_manager | INFO | None | SGT config: {'noisy_model': {'noisy_model_type': 'stainedglass_core.model.noisy_transformer_masking_model.NoiseMaskedNoisyTransformerModel', 'base_model': {'vocab_size': 128256, 'max_position_embeddings': 8192, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 500000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': 128001, 'sep_token_id': None, 'decoder_start_token_id': None, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, '_name_or_path': '/models/meta-llama/Meta-Llama-3-8B', 'transformers_version': '4.55.4', 'use_bfloat16': False, 'tf_legacy_loss': False, '_attn_implementation_autoset': True, 'model_type': 'llama', 'output_attentions': False}, 'noise_layer': {'noise_layer_type': 'stainedglass_core.noise_layer.transformer_cloak.TransformerCloak', 'scale': (1.0011717677116394e-08, 1.0), 'transformer_type': 'transformers.models.llama.modeling_llama.LlamaModel', 'estimator_config': {'vocab_size': 128256, 'max_position_embeddings': 8192, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 1, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 500000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': 128001, 'sep_token_id': None, 
'decoder_start_token_id': None, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, '_name_or_path': '/models/meta-llama/Meta-Llama-3-8B', 'transformers_version': '4.55.4', 'use_bfloat16': False, 'tf_legacy_loss': False, '_attn_implementation_autoset': True, 'model_type': 'llama', 'output_attentions': False}, 'percent_to_mask': None, 'shallow': 1.0, 'seed': 42, 'rho_init': 0.0, 'std_dropout': 0.1, 'mean_dropout': 0.1, 'directly_learn_stds': True, 'mean_num_experts': 0, 'std_num_experts': 0, 'noise_layer_dtype': 'torch.bfloat16', 'num_hidden_layers': 1, 'noise_layer_attention': 'flash_attention_2'}, 'init_passthrough_kwargs': {'truncated_layer_index': None, 'target_layer': 'model.embed_tokens', 'target_parameter': None}}, 'noise_tokenizer': {'chat_template': None, 'transform_all_tokens': False, 'transform_tools': False}, 'name': 'dark-sound-115', 'include_all_base_model_params': False, 'parameter_names_relative_to_base_model': ['model.embed_tokens.weight'], '_stainedglass_core_version': '1.9.0'} stainedglass-1 | 2025-09-10 23:41:29 | stainedglass_proxy.sgt_manager | INFO | None | Waiting for workers to signal ready... stainedglass-1 | 2025-09-10 23:41:35 | sgt_worker_0 | INFO | None | SGT worker 0 ready and signaled stainedglass-1 | 2025-09-10 23:41:35 | stainedglass_proxy.sgt_manager | INFO | None | Starting SGT queue dispatcher stainedglass-1 | 2025-09-10 23:41:35 | stainedglass_proxy.sgt_manager | INFO | None | Stained Glass Transform Manager fully initialized stainedglass-1 | 2025-09-10 23:41:35 | uvicorn.error | INFO | None | Application startup complete. stainedglass-1 | 2025-09-10 23:41:35 | uvicorn.error | INFO | None | Uvicorn running on http://0.0.0.0:8600 (Press CTRL+C to quit)
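Optionally, confirm the proxy is healthy from the host before sending requests. This quick check assumes the livez endpoint and the 8666:8600 port mapping defined in the compose file above:
!curl --fail http://localhost:8666/livez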
PROXY_URL: Final[str] = "http://localhost:8666/v1"
sgt_client = openai.OpenAI(
    api_key=API_KEY, base_url=PROXY_URL, default_headers=HEADERS
)
response = sgt_client.chat.completions.create(
    model=SERVED_MODEL_NAME,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who won the world series in 2020?"},
        {
            "role": "assistant",
            "content": "The Los Angeles Dodgers won the World Series in 2020.",
        },
        {"role": "user", "content": "Where was it played?"},
    ],
)
print(response.choices[0].message.content)
The 2020 World Series was played at Globe Life Field in Arlington, Texas
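The proxy exposes an OpenAI-compatible API, so the usual client options apply. As a minimal sketch, assuming your proxy version supports streaming responses, a streamed request through the proxy looks like this:
# Sketch only: streams tokens through the Stained Glass Proxy, assuming the
# deployed proxy version supports streaming chat completions.
stream = sgt_client.chat.completions.create(
    model=SERVED_MODEL_NAME,
    messages=[
        {"role": "user", "content": "Summarize the 2020 World Series in one sentence."},
    ],
    stream=True,
)
for chunk in stream:
    # Each chunk carries a delta with zero or more new characters.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()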
!docker compose -f docker-compose.modal.yaml down
[+] Running 2/2 ✔ Container inference-providers-stainedglass-1 Removed 7.3s ✔ Network inference-providers_default Removed 0.3s
Stopping the Modal Server¶
To stop your Modal app:
1. List your deployed apps.
2. Copy the App ID and stop the app.
3. Re-run the list command to verify that it has stopped.
The corresponding CLI commands are shown below.
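# 1. List deployed apps and find "output-protected-vllm-inference"
modal app list

# 2. Stop the app by its App ID
modal app stop <app-id>

# 3. Verify that it has stopped
modal app list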