24:I[523644,["/_next/static/chunks/88b08f7719e5055d.js","/_next/static/chunks/89efc200aacc5b16.js","/_next/static/chunks/2f4fc1e4121ced54.js"],"default"] 25:I[209441,["/_next/static/chunks/88b08f7719e5055d.js","/_next/static/chunks/89efc200aacc5b16.js","/_next/static/chunks/2f4fc1e4121ced54.js"],"default"] 1b:["$","path","vktsd0",{"d":"M12.586 2.586A2 2 0 0 0 11.172 2H4a2 2 0 0 0-2 2v7.172a2 2 0 0 0 .586 1.414l8.704 8.704a2.426 2.426 0 0 0 3.42 0l6.58-6.58a2.426 2.426 0 0 0 0-3.42z"}] 1c:["$","circle","kqv944",{"cx":"7.5","cy":"7.5","r":".5","fill":"currentColor"}] 1d:["$","span",null,{"className":"flex items-center gap-1 text-xs text-gray-500","children":[["$","svg",null,{"ref":"$undefined","xmlns":"http://www.w3.org/2000/svg","width":24,"height":24,"viewBox":"0 0 24 24","fill":"none","stroke":"currentColor","strokeWidth":2,"strokeLinecap":"round","strokeLinejoin":"round","className":"lucide lucide-clock h-3.5 w-3.5","aria-hidden":"true","children":[["$","circle","1mglay",{"cx":"12","cy":"12","r":"10"}],["$","path","mmk7yg",{"d":"M12 6v6l4 2"}],"$undefined"]}],"4个月前"]}] 1e:["$","span",null,{"className":"flex items-center gap-1 text-xs text-gray-500","children":[["$","svg",null,{"ref":"$undefined","xmlns":"http://www.w3.org/2000/svg","width":24,"height":24,"viewBox":"0 0 24 24","fill":"none","stroke":"currentColor","strokeWidth":2,"strokeLinecap":"round","strokeLinejoin":"round","className":"lucide lucide-thumbs-up h-3.5 w-3.5","aria-hidden":"true","children":[["$","path","emmmcr",{"d":"M15 5.88 14 10h5.83a2 2 0 0 1 1.92 2.56l-2.33 8A2 2 0 0 1 17.5 22H4a2 2 0 0 1-2-2v-8a2 2 0 0 1 2-2h2.76a2 2 0 0 0 1.79-1.11L12 2a3.13 3.13 0 0 1 3 3.88Z"}],["$","path","1qc93n",{"d":"M7 10v12"}],"$undefined"]}],"0"," 投票"]}] 1f:["$","h1",null,{"className":"text-2xl font-bold text-gray-900 leading-snug","children":"Efficiently serve dozens of fine-tuned models with vLLM on Amazon SageMaker AI and Amazon Bedrock"}] 20:["$","div",null,{"className":"prose prose-sm prose-gray max-w-none","dangerouslySetInnerHTML":{"__html":"

In this post, we explain how we implemented multi-LoRA inference for Mixture of Experts (MoE) models in vLLM, describe the kernel-level optimizations we performed, and show you how you can benefit from this work. We use GPT-OSS 20B as our primary example throughout this post.

延伸阅读

\n"}}] 21:["$","$L24",null,{"href":"https://aws.amazon.com/blogs/machine-learning/efficiently-serve-dozens-of-fine-tuned-models-with-vllm-on-amazon-sagemaker-ai-and-amazon-bedrock","target":"_blank","rel":"noopener noreferrer","contentType":"news","contentId":"cmm6gaumu0010xp4k0t5338ni","slug":"rss_aws_ml-efficiently-serve-dozens-of-fine-tuned-models-with-vllm-on-a-jly78d","title":"Efficiently serve dozens of fine-tuned models with vLLM on Amazon SageMaker AI and Amazon Bedrock","className":"inline-flex items-center gap-2 rounded-xl bg-gray-900 hover:bg-gray-800 px-5 py-2.5 text-sm font-medium text-white transition-all","children":[["$","svg",null,{"ref":"$undefined","xmlns":"http://www.w3.org/2000/svg","width":24,"height":24,"viewBox":"0 0 24 24","fill":"none","stroke":"currentColor","strokeWidth":2,"strokeLinecap":"round","strokeLinejoin":"round","className":"lucide lucide-external-link h-4 w-4","aria-hidden":"true","children":[["$","path","1q9fwt",{"d":"M15

Efficiently serve dozens of fine-tuned models with vLLM on Amazon SageMaker AI and Amazon Bedrock

延伸阅读

相关资讯