Description
What happened?
Steps to reproduce
- Deploy a SageMaker endpoint
- Deploy a model that supports the Messages API (e.g. Mistral Small 2501) on the endpoint as an inference component
- Use LiteLLM to call the model, as per the docs:
import os
from litellm import completion

os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""

response = completion(
    model="sagemaker_chat/<your-endpoint-name>",
    model_id="<your-inference-component-name>",
    messages=[{"content": "Hello, how are you?", "role": "user"}],
    temperature=0.2,
    max_tokens=80,
)
- See it fail with the error in the log output below

This is probably an issue specific to the sagemaker_chat provider: the inference component name passed as model_id is not forwarded to the SageMaker Runtime InvokeEndpoint API. In the logged request below, model_id ends up inside the JSON request body instead of being sent as a request header, which matches the INFERENCE_COMPONENT_NAME_MISSING error the endpoint returns.
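As a sanity check that the endpoint itself is fine, the inference component can be invoked directly through boto3, whose invoke_endpoint API accepts the component name via the InferenceComponentName parameter. A minimal sketch (endpoint and component names are placeholders; the payload mirrors the repro above):

import json
import boto3

# Placeholders - substitute your own endpoint / inference component names.
endpoint_name = "<your-endpoint-name>"
inference_component_name = "<your-inference-component-name>"

# Region taken from the log output below.
client = boto3.client("sagemaker-runtime", region_name="us-east-1")

# Unlike the sagemaker_chat call above, invoke_endpoint forwards the
# inference component name to the endpoint explicitly.
response = client.invoke_endpoint(
    EndpointName=endpoint_name,
    InferenceComponentName=inference_component_name,
    ContentType="application/json",
    Body=json.dumps(
        {
            "messages": [{"role": "user", "content": "Hello, how are you?"}],
            "temperature": 0.2,
            "max_tokens": 80,
        }
    ),
)

print(json.loads(response["Body"].read()))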
Relevant log output
09:34:38 - LiteLLM:DEBUG: utils.py:324 -
09:34:38 - LiteLLM:DEBUG: utils.py:324 - Request to litellm:
09:34:38 - LiteLLM:DEBUG: utils.py:324 - litellm.completion(model='sagemaker_chat/hf-llm-mistral-small-24b-instruct-2501-2025-04-07-10-48-04-846', model_id='hf-llm-mistral-small-24b-instruct-2501-2025-04--1744023037-a1d1', messages=[{'content': 'What is the weather like in Bari, Italy?', 'role': 'user'}], temperature=0, max_tokens=1024, tools=[{'type': 'function', 'function': {'name': 'calculate_bmi', 'description': 'Calculate BMI given weight in kg and height in meters', 'parameters': {'properties': {'weight_kg': {'title': 'Weight Kg', 'type': 'number'}, 'height_m': {'title': 'Height M', 'type': 'number'}}, 'required': ['weight_kg', 'height_m'], 'title': 'calculate_bmiArguments', 'type': 'object'}}}, {'type': 'function', 'function': {'name': 'fetch_weather', 'description': 'Fetch current weather for a city', 'parameters': {'properties': {'city': {'title': 'City', 'type': 'string'}}, 'required': ['city'], 'title': 'fetch_weatherArguments', 'type': 'object'}}}], allowed_openai_params=['tools'])
09:34:38 - LiteLLM:DEBUG: utils.py:324 -
09:34:38 - LiteLLM:DEBUG: litellm_logging.py:422 - self.optional_params: {}
09:34:38 - LiteLLM:DEBUG: utils.py:324 - SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
09:34:38 - LiteLLM:INFO: utils.py:3076 -
LiteLLM completion() model= hf-llm-mistral-small-24b-instruct-2501-2025-04-07-10-48-04-846; provider = sagemaker_chat
09:34:38 - LiteLLM:DEBUG: utils.py:3079 -
LiteLLM: Params passed to completion() {'model': 'hf-llm-mistral-small-24b-instruct-2501-2025-04-07-10-48-04-846', 'functions': None, 'function_call': None, 'temperature': 0, 'top_p': None, 'n': None, 'stream': None, 'stream_options': None, 'stop': None, 'max_tokens': 1024, 'max_completion_tokens': None, 'modalities': None, 'prediction': None, 'audio': None, 'presence_penalty': None, 'frequency_penalty': None, 'logit_bias': None, 'user': None, 'custom_llm_provider': 'sagemaker_chat', 'response_format': None, 'seed': None, 'tools': [{'type': 'function', 'function': {'name': 'calculate_bmi', 'description': 'Calculate BMI given weight in kg and height in meters', 'parameters': {'properties': {'weight_kg': {'title': 'Weight Kg', 'type': 'number'}, 'height_m': {'title': 'Height M', 'type': 'number'}}, 'required': ['weight_kg', 'height_m'], 'title': 'calculate_bmiArguments', 'type': 'object'}}}, {'type': 'function', 'function': {'name': 'fetch_weather', 'description': 'Fetch current weather for a city', 'parameters': {'properties': {'city': {'title': 'City', 'type': 'string'}}, 'required': ['city'], 'title': 'fetch_weatherArguments', 'type': 'object'}}}], 'tool_choice': None, 'max_retries': None, 'logprobs': None, 'top_logprobs': None, 'extra_headers': None, 'api_version': None, 'parallel_tool_calls': None, 'drop_params': None, 'allowed_openai_params': ['tools'], 'reasoning_effort': None, 'additional_drop_params': None, 'messages': [{'content': 'What is the weather like in Bari, Italy?', 'role': 'user'}], 'thinking': None, 'model_id': 'hf-llm-mistral-small-24b-instruct-2501-2025-04--1744023037-a1d1'}
09:34:38 - LiteLLM:DEBUG: utils.py:3082 -
LiteLLM: Non-Default params passed to completion() {'temperature': 0, 'max_tokens': 1024, 'tools': [{'type': 'function', 'function': {'name': 'calculate_bmi', 'description': 'Calculate BMI given weight in kg and height in meters', 'parameters': {'properties': {'weight_kg': {'title': 'Weight Kg', 'type': 'number'}, 'height_m': {'title': 'Height M', 'type': 'number'}}, 'required': ['weight_kg', 'height_m'], 'title': 'calculate_bmiArguments', 'type': 'object'}}}, {'type': 'function', 'function': {'name': 'fetch_weather', 'description': 'Fetch current weather for a city', 'parameters': {'properties': {'city': {'title': 'City', 'type': 'string'}}, 'required': ['city'], 'title': 'fetch_weatherArguments', 'type': 'object'}}}]}
09:34:38 - LiteLLM:DEBUG: utils.py:324 - Final returned optional params: {'temperature': 0, 'max_tokens': 1024, 'tools': [{'type': 'function', 'function': {'name': 'calculate_bmi', 'description': 'Calculate BMI given weight in kg and height in meters', 'parameters': {'properties': {'weight_kg': {'title': 'Weight Kg', 'type': 'number'}, 'height_m': {'title': 'Height M', 'type': 'number'}}, 'required': ['weight_kg', 'height_m'], 'title': 'calculate_bmiArguments', 'type': 'object'}}}, {'type': 'function', 'function': {'name': 'fetch_weather', 'description': 'Fetch current weather for a city', 'parameters': {'properties': {'city': {'title': 'City', 'type': 'string'}}, 'required': ['city'], 'title': 'fetch_weatherArguments', 'type': 'object'}}}], 'model_id': 'hf-llm-mistral-small-24b-instruct-2501-2025-04--1744023037-a1d1'}
09:34:38 - LiteLLM:DEBUG: litellm_logging.py:422 - self.optional_params: {'temperature': 0, 'max_tokens': 1024, 'tools': [{'type': 'function', 'function': {'name': 'calculate_bmi', 'description': 'Calculate BMI given weight in kg and height in meters', 'parameters': {'properties': {'weight_kg': {'title': 'Weight Kg', 'type': 'number'}, 'height_m': {'title': 'Height M', 'type': 'number'}}, 'required': ['weight_kg', 'height_m'], 'title': 'calculate_bmiArguments', 'type': 'object'}}}, {'type': 'function', 'function': {'name': 'fetch_weather', 'description': 'Fetch current weather for a city', 'parameters': {'properties': {'city': {'title': 'City', 'type': 'string'}}, 'required': ['city'], 'title': 'fetch_weatherArguments', 'type': 'object'}}}], 'model_id': 'hf-llm-mistral-small-24b-instruct-2501-2025-04--1744023037-a1d1'}
09:34:38 - LiteLLM:DEBUG: base_aws_llm.py:121 - in get credentials
aws_access_key_id=None
aws_secret_access_key=None
aws_session_token=None
aws_region_name=us-east-1
aws_session_name=None
aws_profile_name=None
aws_role_name=None
aws_web_identity_token=None
aws_sts_endpoint=None
09:34:38 - LiteLLM:DEBUG: litellm_logging.py:746 -
POST Request Sent from LiteLLM:
curl -X POST \
https://runtime.sagemaker.us-east-1.amazonaws.com/endpoints/hf-llm-mistral-small-24b-instruct-2501-2025-04-07-10-48-04-846/invocations \
-H 'Content-Type: ap****on' -H 'X-Amz-Date: 20****8Z' -H 'X-Amz-Security-Token: IQ****==' -H 'Authorization: AW****39' -H 'Content-Length: *****' \
-d '{'model': 'hf-llm-mistral-small-24b-instruct-2501-2025-04-07-10-48-04-846', 'messages': [{'content': 'What is the weather like in Bari, Italy?', 'role': 'user'}], 'temperature': 0, 'max_tokens': 1024, 'tools': [{'type': 'function', 'function': {'name': 'calculate_bmi', 'description': 'Calculate BMI given weight in kg and height in meters', 'parameters': {'properties': {'weight_kg': {'title': 'Weight Kg', 'type': 'number'}, 'height_m': {'title': 'Height M', 'type': 'number'}}, 'required': ['weight_kg', 'height_m'], 'title': 'calculate_bmiArguments', 'type': 'object'}}}, {'type': 'function', 'function': {'name': 'fetch_weather', 'description': 'Fetch current weather for a city', 'parameters': {'properties': {'city': {'title': 'City', 'type': 'string'}}, 'required': ['city'], 'title': 'fetch_weatherArguments', 'type': 'object'}}}], 'model_id': 'hf-llm-mistral-small-24b-instruct-2501-2025-04--1744023037-a1d1', 'stream': False}'
09:34:38 - LiteLLM:DEBUG: get_api_base.py:63 - Error occurred in getting api base - litellm.BadRequestError: LLM Provider NOT provided. Pass in the LLM provider you are trying to call. You passed model=hf-llm-mistral-small-24b-instruct-2501-2025-04-07-10-48-04-846
Pass model as E.g. For 'Huggingface' inference endpoints pass in `completion(model='huggingface/starcoder',..)` Learn more: https://docs.litellm.ai/docs/providers
09:34:38 - LiteLLM:DEBUG: exception_mapping_utils.py:2243 - Logging Details: logger_fn - None | callable(logger_fn) - False
09:34:38 - LiteLLM:DEBUG: litellm_logging.py:2018 - Logging Details LiteLLM-Failure Call: []
Give Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.
Provider List: https://docs.litellm.ai/docs/providers
---------------------------------------------------------------------------
HTTPStatusError Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/litellm/llms/openai_like/chat/handler.py:372, in OpenAILikeChatHandler.completion(self, model, messages, api_base, custom_llm_provider, custom_prompt_dict, model_response, print_verbose, encoding, api_key, logging_obj, optional_params, acompletion, litellm_params, logger_fn, headers, timeout, client, custom_endpoint, streaming_decoder, fake_stream)
371 try:
--> 372 response = client.post(
373 url=api_base, headers=headers, data=json.dumps(data)
374 )
375 response.raise_for_status()
File /opt/conda/lib/python3.11/site-packages/litellm/llms/custom_httpx/http_handler.py:576, in HTTPHandler.post(self, url, data, json, params, headers, stream, timeout, files, content, logging_obj)
575 setattr(e, "status_code", e.response.status_code)
--> 576 raise e
577 except Exception as e:
File /opt/conda/lib/python3.11/site-packages/litellm/llms/custom_httpx/http_handler.py:558, in HTTPHandler.post(self, url, data, json, params, headers, stream, timeout, files, content, logging_obj)
557 response = self.client.send(req, stream=stream)
--> 558 response.raise_for_status()
559 return response
File /opt/conda/lib/python3.11/site-packages/httpx/_models.py:829, in Response.raise_for_status(self)
828 message = message.format(self, error_type=error_type)
--> 829 raise HTTPStatusError(message, request=request, response=self)
HTTPStatusError: Client error '400 Bad Request' for url 'https://runtime.sagemaker.us-east-1.amazonaws.com/endpoints/hf-llm-mistral-small-24b-instruct-2501-2025-04-07-10-48-04-846/invocations'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400
During handling of the above exception, another exception occurred:
OpenAILikeError Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/litellm/main.py:2598, in completion(model, messages, timeout, temperature, top_p, n, stream, stream_options, stop, max_completion_tokens, max_tokens, modalities, prediction, audio, presence_penalty, frequency_penalty, logit_bias, user, reasoning_effort, response_format, seed, tools, tool_choice, logprobs, top_logprobs, parallel_tool_calls, deployment_id, extra_headers, functions, function_call, base_url, api_version, api_key, model_list, thinking, **kwargs)
2596 elif custom_llm_provider == "sagemaker_chat":
2597 # boto3 reads keys from .env
-> 2598 model_response = sagemaker_chat_completion.completion(
2599 model=model,
2600 messages=messages,
2601 model_response=model_response,
2602 print_verbose=print_verbose,
2603 optional_params=optional_params,
2604 litellm_params=litellm_params,
2605 timeout=timeout,
2606 custom_prompt_dict=custom_prompt_dict,
2607 logger_fn=logger_fn,
2608 encoding=encoding,
2609 logging_obj=logging,
2610 acompletion=acompletion,
2611 client=client,
2612 )
2614 ## RESPONSE OBJECT
File /opt/conda/lib/python3.11/site-packages/litellm/llms/sagemaker/chat/handler.py:157, in SagemakerChatHandler.completion(self, model, messages, model_response, print_verbose, encoding, logging_obj, optional_params, litellm_params, timeout, custom_prompt_dict, logger_fn, acompletion, headers, client)
155 custom_stream_decoder = AWSEventStreamDecoder(model="", is_messages_api=True)
--> 157 return openai_like_chat_completions.completion(
158 model=model,
159 messages=messages,
160 api_base=prepared_request.url,
161 api_key=None,
162 custom_prompt_dict=custom_prompt_dict,
163 model_response=model_response,
164 print_verbose=print_verbose,
165 logging_obj=logging_obj,
166 optional_params=inference_params,
167 acompletion=acompletion,
168 litellm_params=litellm_params,
169 logger_fn=logger_fn,
170 timeout=timeout,
171 encoding=encoding,
172 headers=prepared_request.headers, # type: ignore
173 custom_endpoint=True,
174 custom_llm_provider="sagemaker_chat",
175 streaming_decoder=custom_stream_decoder, # type: ignore
176 client=client,
177 )
File /opt/conda/lib/python3.11/site-packages/litellm/llms/openai_like/chat/handler.py:378, in OpenAILikeChatHandler.completion(self, model, messages, api_base, custom_llm_provider, custom_prompt_dict, model_response, print_verbose, encoding, api_key, logging_obj, optional_params, acompletion, litellm_params, logger_fn, headers, timeout, client, custom_endpoint, streaming_decoder, fake_stream)
377 except httpx.HTTPStatusError as e:
--> 378 raise OpenAILikeError(
379 status_code=e.response.status_code,
380 message=e.response.text,
381 )
382 except httpx.TimeoutException:
OpenAILikeError: {"ErrorCode":"INFERENCE_COMPONENT_NAME_MISSING","Message":"Inference Component Name header is required for endpoints to which you plan to deploy inference components. Please include Inference Component Name header or consider using SageMaker models."}
During handling of the above exception, another exception occurred:
BadRequestError Traceback (most recent call last)
Cell In[27], line 6
3 endpoint_name = "hf-llm-mistral-small-24b-instruct-2501-2025-04-07-10-48-04-846"
4 inference_component_name = "hf-llm-mistral-small-24b-instruct-2501-2025-04--1744023037-a1d1"
----> 6 response = litellm.completion(
7 model=f"sagemaker_chat/{endpoint_name}",
8 model_id=inference_component_name,
9 messages=[{ "content": "What is the weather like in Bari, Italy?","role": "user"}],
10 temperature=0,
11 max_tokens=1024,
12 tools=litellm_tools,
13 allowed_openai_params=['tools']
14 )
File /opt/conda/lib/python3.11/site-packages/litellm/utils.py:1247, in client.<locals>.wrapper(*args, **kwargs)
1243 if logging_obj:
1244 logging_obj.failure_handler(
1245 e, traceback_exception, start_time, end_time
1246 ) # DO NOT MAKE THREADED - router retry fallback relies on this!
-> 1247 raise e
File /opt/conda/lib/python3.11/site-packages/litellm/utils.py:1125, in client.<locals>.wrapper(*args, **kwargs)
1123 print_verbose(f"Error while checking max token limit: {str(e)}")
1124 # MODEL CALL
-> 1125 result = original_function(*args, **kwargs)
1126 end_time = datetime.datetime.now()
1127 if "stream" in kwargs and kwargs["stream"] is True:
File /opt/conda/lib/python3.11/site-packages/litellm/main.py:3148, in completion(model, messages, timeout, temperature, top_p, n, stream, stream_options, stop, max_completion_tokens, max_tokens, modalities, prediction, audio, presence_penalty, frequency_penalty, logit_bias, user, reasoning_effort, response_format, seed, tools, tool_choice, logprobs, top_logprobs, parallel_tool_calls, deployment_id, extra_headers, functions, function_call, base_url, api_version, api_key, model_list, thinking, **kwargs)
3145 return response
3146 except Exception as e:
3147 ## Map to OpenAI Exception
-> 3148 raise exception_type(
3149 model=model,
3150 custom_llm_provider=custom_llm_provider,
3151 original_exception=e,
3152 completion_kwargs=args,
3153 extra_kwargs=kwargs,
3154 )
File /opt/conda/lib/python3.11/site-packages/litellm/litellm_core_utils/exception_mapping_utils.py:2214, in exception_type(model, original_exception, custom_llm_provider, completion_kwargs, extra_kwargs)
2212 if exception_mapping_worked:
2213 setattr(e, "litellm_response_headers", litellm_response_headers)
-> 2214 raise e
2215 else:
2216 for error_type in litellm.LITELLM_EXCEPTION_TYPES:
File /opt/conda/lib/python3.11/site-packages/litellm/litellm_core_utils/exception_mapping_utils.py:1034, in exception_type(model, original_exception, custom_llm_provider, completion_kwargs, extra_kwargs)
1032 elif original_exception.status_code == 400:
1033 exception_mapping_worked = True
-> 1034 raise BadRequestError(
1035 message=f"SagemakerException - {original_exception.message}",
1036 llm_provider=custom_llm_provider,
1037 model=model,
1038 response=getattr(original_exception, "response", None),
1039 )
1040 elif original_exception.status_code == 404:
1041 exception_mapping_worked = True
BadRequestError: litellm.BadRequestError: SagemakerException - {"ErrorCode":"INFERENCE_COMPONENT_NAME_MISSING","Message":"Inference Component Name header is required for endpoints to which you plan to deploy inference components. Please include Inference Component Name header or consider using SageMaker models."}
Are you an ML Ops Team?
No
What LiteLLM version are you on?
1.65.7
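Additional context
If it helps pinpoint the fix: when InferenceComponentName is passed to boto3's invoke_endpoint, botocore sends it as a request header (X-Amzn-SageMaker-Inference-Component, as far as I can tell), whereas the curl logged above shows LiteLLM putting model_id into the JSON body. A rough diagnostic sketch for inspecting the headers botocore actually sends - the 'before-send' hook usage and the header name are my assumptions, not something from the LiteLLM docs:

import boto3

def print_request_headers(request, **kwargs):
    # 'before-send' fires after SigV4 signing, right before the request goes out,
    # so what is printed here is what actually reaches the SageMaker Runtime API.
    print(request.headers)

client = boto3.client("sagemaker-runtime", region_name="us-east-1")
client.meta.events.register("before-send", print_request_headers)

client.invoke_endpoint(
    EndpointName="<your-endpoint-name>",                        # placeholder
    InferenceComponentName="<your-inference-component-name>",   # placeholder
    ContentType="application/json",
    Body=b'{"messages": [{"role": "user", "content": "ping"}], "max_tokens": 8}',
)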