SGLang provides OpenAI-compatible endpoints, making it easy to integrate with existing applications.
# Chat completion against a running SGLang server through its
# OpenAI-compatible endpoint, using the official `openai` client.
from openai import OpenAI

# Create an OpenAI client pointing to the SGLang server.
client = OpenAI(
    base_url="http://127.0.0.1:30000/v1",
    api_key="EMPTY",  # SGLang doesn't require authentication by default
)

# Chat completion: one system message plus one user message.
response = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ],
    temperature=0.7,
    max_tokens=256,
)

# Print the assistant's reply from the first returned choice.
print(response.choices[0].message.content)
Expected Output (the exact wording varies between runs because sampling is enabled with `temperature=0.7`):
The capital of France is Paris. It is one of the most famous and beautiful cities in the world, known for its iconic landmarks like the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral.
The native SGLang API provides a more Pythonic interface with advanced features.
# Offline generation with the native SGLang engine (no HTTP server involved).
import dataclasses

import sglang as sgl
from sglang.srt.server_args import ServerArgs

# The __main__ guard is required: per the SGLang offline-engine docs the
# engine starts its worker processes with "spawn", which re-imports this
# module; without the guard every worker would try to create another engine.
if __name__ == "__main__":
    # Create an offline engine.
    server_args = ServerArgs(
        model_path="meta-llama/Llama-3.1-8B-Instruct",
    )
    llm = sgl.Engine(**dataclasses.asdict(server_args))

    try:
        # Generate responses for a batch of prompts with shared sampling
        # parameters.
        prompts = [
            "Hello, my name is",
            "The president of the United States is",
            "The capital of France is",
            "The future of AI is",
        ]
        sampling_params = {"temperature": 0.8, "top_p": 0.95}
        outputs = llm.generate(prompts, sampling_params)

        # Print outputs; each result is read via its "text" key.
        for prompt, output in zip(prompts, outputs):
            print(f"Prompt: {prompt}")
            print(f"Generated: {output['text']}")
            print("=" * 50)
    finally:
        # Release the engine's worker processes even if generation fails.
        llm.shutdown()
Expected Output (first two prompts shown; the generated text varies between runs because sampling is enabled):
Prompt: Hello, my name is
Generated: John, and I'm excited to share my story with you today. I grew up in a small town in the Midwest
==================================================
Prompt: The president of the United States is
Generated: the head of state and head of government of the United States of America. The president directs the executive branch
==================================================
# Plain text completion (no chat template) through the OpenAI-compatible
# endpoint of a running SGLang server.
from openai import OpenAI

# Created here so the snippet runs on its own; same settings as the
# chat-completion example.
client = OpenAI(
    base_url="http://127.0.0.1:30000/v1",
    api_key="EMPTY",  # SGLang doesn't require authentication by default
)

response = client.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    prompt="Once upon a time",
    max_tokens=100,
    temperature=0.8,
)

# Print the generated continuation from the first returned choice.
print(response.choices[0].text)
# Batch inference with the native SGLang engine: ten prompts in one call.
import dataclasses

import sglang as sgl
from sglang.srt.server_args import ServerArgs

# The __main__ guard is required: per the SGLang offline-engine docs the
# engine starts its worker processes with "spawn", which re-imports this
# module; without the guard every worker would try to create another engine.
if __name__ == "__main__":
    server_args = ServerArgs(model_path="meta-llama/Llama-3.1-8B-Instruct")
    llm = sgl.Engine(**dataclasses.asdict(server_args))

    try:
        prompts = [f"Question {i}: What is 2+{i}?" for i in range(10)]
        # Temperature 0.0 is passed as the only sampling parameter.
        outputs = llm.generate(prompts, {"temperature": 0.0})

        for prompt, output in zip(prompts, outputs):
            print(f"{prompt} -> {output['text']}")
    finally:
        # Release the engine's worker processes even if generation fails.
        llm.shutdown()