@@ -207,7 +207,6 @@ def __init__(self,
         self.add_api_route("/v1/embeddings", self._embeddings_style_openai, methods=["POST"])
         self.add_api_route("/v1/chat/completions", self._chat_completions, methods=["POST"])
 
-        self.add_api_route("/v1/models", self._models, methods=["GET"])
         self.add_api_route("/tokenizer/{model_name}", self._tokenizer, methods=["GET"])
 
         self._inference_queue = inference_queue
@@ -483,35 +482,6 @@ async def _embeddings_style_openai(self, post: EmbeddingsStyleOpenAI, authorizat
             "usage": {"prompt_tokens": -1, "total_tokens": -1}
         }
 
-    async def _models(self, authorization: str = Header(None)):
-        await self._account_from_bearer(authorization)
-        try:
-            async with aiohttp.ClientSession() as session:
-                async with session.get("http://127.0.0.1:8001/v1/caps") as resp:
-                    lsp_server_caps = await resp.json()
-        except aiohttp.ClientConnectorError as e:
-            err_msg = f"LSP server is not ready yet: {e}"
-            log(err_msg)
-            raise HTTPException(status_code=401, detail=err_msg)
-        completion_models = set()
-        for model, caps in lsp_server_caps["code_completion_models"].items():
-            completion_models.update({model, *caps["similar_models"]})
-        chat_models = set()
-        for model, caps in lsp_server_caps["code_chat_models"].items():
-            chat_models.update({model, *caps["similar_models"]})
-        data = [
-            {
-                "id": model, "root": model, "object": "model",
-                "created": 0, "owned_by": "", "permission": [], "parent": None,
-                "completion": model in completion_models, "chat": model in chat_models,
-            }
-            for model in lsp_server_caps["running_models"]
-        ]
-        return {
-            "object": "list",
-            "data": data,
-        }
-
     async def _chat_completions(self, post: ChatContext, authorization: str = Header(None)):
         def compose_usage_dict(model_dict, prompt_tokens_n, generated_tokens_n) -> Dict[str, Any]:
             usage_dict = dict()
@@ -543,6 +513,7 @@ def _wrap_output(output: str) -> str:
             return prefix + output + postfix
 
         model_dict = self._model_assigner.models_db_with_passthrough.get(post.model, {})
+        assert model_dict.get('backend') == 'litellm'
 
         async def litellm_streamer():
             generated_tokens_n = 0
@@ -613,47 +584,14 @@ async def litellm_non_streamer():
                 log(err_msg)
                 yield json.dumps(_patch_caps_version({"error": err_msg}))
 
-        async def chat_completion_streamer():
-            post_url = "http://127.0.0.1:8001/v1/chat"
-            payload = {
-                "messages": messages,
-                "stream": True,
-                "model": post.model,
-                "parameters": {
-                    "temperature": post.temperature,
-                    "max_new_tokens": post.actual_max_tokens,
-                }
-            }
-            async with aiohttp.ClientSession() as session:
-                try:
-                    async with session.post(post_url, json=payload) as response:
-                        finish_reason = None
-                        async for data, _ in response.content.iter_chunks():
-                            try:
-                                data = data.decode("utf-8")
-                                data = json.loads(data[len(prefix):-len(postfix)])
-                                finish_reason = data["choices"][0]["finish_reason"]
-                                data["choices"][0]["finish_reason"] = None
-                            except json.JSONDecodeError:
-                                data = {"choices": [{"finish_reason": finish_reason}]}
-                            yield _wrap_output(json.dumps(_patch_caps_version(data)))
-                except aiohttp.ClientConnectorError as e:
-                    err_msg = f"LSP server is not ready yet: {e}"
-                    log(err_msg)
-                    yield _wrap_output(json.dumps(_patch_caps_version({"error": err_msg})))
-
-        if model_dict.get('backend') == 'litellm':
-            model_name = model_dict.get('resolve_as', post.model)
-            if model_name not in litellm.model_list:
-                log(f"warning: requested model {model_name} is not in the litellm.model_list (this might not be the issue for some providers)")
-            log(f"chat/completions: model resolve {post.model} -> {model_name}")
-            prompt_tokens_n = litellm.token_counter(model_name, messages=messages)
-            if post.tools:
-                prompt_tokens_n += litellm.token_counter(model_name, text=json.dumps(post.tools))
-            response_streamer = litellm_streamer() if post.stream else litellm_non_streamer()
-        else:
-            # TODO: unused refact-lsp logic, remove ASAP
-            response_streamer = chat_completion_streamer()
+        model_name = model_dict.get('resolve_as', post.model)
+        if model_name not in litellm.model_list:
+            log(f"warning: requested model {model_name} is not in the litellm.model_list (this might not be the issue for some providers)")
+        log(f"chat/completions: model resolve {post.model} -> {model_name}")
+        prompt_tokens_n = litellm.token_counter(model_name, messages=messages)
+        if post.tools:
+            prompt_tokens_n += litellm.token_counter(model_name, text=json.dumps(post.tools))
+        response_streamer = litellm_streamer() if post.stream else litellm_non_streamer()
 
         return StreamingResponse(response_streamer, media_type="text/event-stream")
 