Documentation Index
Fetch the complete documentation index at: https://mintlify.com/sgl-project/sglang/llms.txt
Use this file to discover all available pages before exploring further.
Testing
This guide covers testing practices, test structure, and how to write effective tests for SGLang.
Test Structure
SGLang tests are organized in the test/ directory:
test/
├── srt/ # Runtime tests
│ ├── test_engine.py # Engine tests
│ ├── test_models.py # Model tests
│ └── ...
├── lang/ # Frontend language tests
└── utils/ # Test utilities
Running Tests
Run All Tests
# Run all tests
python -m pytest test/
# Run with verbose output
python -m pytest test/ -v
# Run with coverage
python -m pytest test/ --cov=sglang --cov-report=html
Run Specific Tests
# Run specific test file
python -m pytest test/srt/test_engine.py
# Run specific test class
python -m pytest test/srt/test_engine.py::TestEngine
# Run specific test method
python -m pytest test/srt/test_engine.py::TestEngine::test_generate
# Run tests matching pattern
python -m pytest test/ -k "test_batch"
Run Tests in Parallel
# Install pytest-xdist
pip install pytest-xdist
# Run tests in parallel
python -m pytest test/ -n auto
Writing Tests
Basic Test Structure
import unittest
from sglang import Engine
class TestMyFeature(unittest.TestCase):
    """Example test case that shares one Engine across all test methods."""

    @classmethod
    def setUpClass(cls):
        """Create the shared engine once, before any test in the class runs."""
        engine_options = {
            "model_path": "meta-llama/Llama-3.2-1B",
            "trust_remote_code": True,
        }
        cls.engine = Engine(**engine_options)

    @classmethod
    def tearDownClass(cls):
        """Shut the shared engine down after the last test has finished."""
        cls.engine.shutdown()

    def test_basic_generation(self):
        """A single prompt yields a result carrying non-empty text."""
        result = self.engine.generate(
            prompt="Hello",
            sampling_params={"max_new_tokens": 16},
        )
        self.assertIn("text", result)
        self.assertGreater(len(result["text"]), 0)

    def test_batch_generation(self):
        """A list of prompts yields one result per prompt."""
        batch = ["Hello", "Hi there", "Good morning"]
        results = self.engine.generate(
            prompt=batch,
            sampling_params={"max_new_tokens": 16},
        )
        self.assertEqual(len(results), len(batch))
        for item in results:
            self.assertIn("text", item)


if __name__ == "__main__":
    unittest.main()
Testing with HTTP Server
import unittest
import requests
import subprocess
import time
import signal
class TestHTTPServer(unittest.TestCase):
    """Exercise the HTTP API of a server launched as a subprocess."""

    @classmethod
    def setUpClass(cls):
        """Launch the server and block until its health endpoint answers.

        unittest does not call tearDownClass when setUpClass raises, so the
        subprocess is stopped here if the server never becomes ready.
        """
        import sys  # local import: only needed to locate the interpreter

        cls.server = subprocess.Popen([
            # Run the server with the interpreter that runs the tests so
            # both come from the same environment.
            sys.executable, "-m", "sglang.launch_server",
            "--model-path", "meta-llama/Llama-3.2-1B",
            "--host", "127.0.0.1",
            "--port", "30000",
        ])
        try:
            cls._wait_for_server()
        except BaseException:
            cls._stop_server()
            raise

    @classmethod
    def tearDownClass(cls):
        """Shutdown server after tests."""
        cls._stop_server()

    @classmethod
    def _stop_server(cls, timeout=30):
        """Interrupt the server process and kill it if it does not exit.

        Args:
            timeout: Seconds to wait after SIGINT before killing the process.
        """
        if cls.server.poll() is not None:
            return  # already exited, nothing to stop
        cls.server.send_signal(signal.SIGINT)
        try:
            cls.server.wait(timeout=timeout)
        except subprocess.TimeoutExpired:
            # Do not leave a stray server behind for the next test class.
            cls.server.kill()
            cls.server.wait()

    @classmethod
    def _wait_for_server(cls, timeout=60):
        """Poll the health endpoint until it returns HTTP 200.

        Args:
            timeout: Maximum number of seconds to keep polling.

        Raises:
            RuntimeError: The server process exited before becoming ready.
            TimeoutError: The endpoint did not return 200 within ``timeout``.
        """
        url = "http://127.0.0.1:30000/health"
        # monotonic() is unaffected by system clock adjustments.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            if cls.server.poll() is not None:
                # Fail fast instead of polling a dead process until timeout.
                raise RuntimeError(
                    f"Server exited early with code {cls.server.returncode}"
                )
            try:
                # A per-request timeout keeps one hung connection from
                # outliving the overall deadline.
                if requests.get(url, timeout=5).status_code == 200:
                    return
            except (requests.ConnectionError, requests.Timeout):
                pass  # not accepting connections yet; retry
            time.sleep(1)
        raise TimeoutError("Server did not start in time")

    def test_chat_completion(self):
        """Test chat completion endpoint."""
        response = requests.post(
            "http://127.0.0.1:30000/v1/chat/completions",
            json={
                "model": "meta-llama/Llama-3.2-1B",
                "messages": [{"role": "user", "content": "Hello"}],
                "max_completion_tokens": 16,
            },
            timeout=60,  # fail the test rather than hang on a stuck server
        )
        self.assertEqual(response.status_code, 200)
        data = response.json()
        self.assertIn("choices", data)
        self.assertGreater(len(data["choices"]), 0)
Testing Async Code
import unittest
import asyncio
from sglang import Engine
class TestAsyncGeneration(unittest.TestCase):
    """Drive the engine's coroutine API from a synchronous test case."""

    def setUp(self):
        """Build a fresh engine before each test."""
        self.engine = Engine(model_path="meta-llama/Llama-3.2-1B")

    def tearDown(self):
        """Shut the engine down after each test."""
        self.engine.shutdown()

    def test_async_generate(self):
        """Await async_generate and check that the result carries text."""

        async def generate_once():
            result = await self.engine.async_generate(
                prompt="Hello",
                sampling_params={"max_new_tokens": 16},
            )
            self.assertIn("text", result)
            return result

        # asyncio.run drives the coroutine to completion on a new event loop.
        self.assertIsNotNone(asyncio.run(generate_once()))
Test Utilities
Reusable Fixtures
Create shared fixtures in test/test_utils.py:
# test/test_utils.py
from sglang import Engine
DEFAULT_PROMPTS = [
"Once upon a time",
"In a galaxy far far away",
"The quick brown fox",
]
class EngineFixture:
    """Reusable engine fixture."""

    @classmethod
    def create_engine(cls, model_path="meta-llama/Llama-3.2-1B", **kwargs):
        """Create an engine with test-friendly default settings.

        Args:
            model_path: Model to load.
            **kwargs: Extra keyword arguments forwarded to ``Engine``.
                ``trust_remote_code`` and ``log_level`` passed here override
                the defaults instead of raising a duplicate-keyword
                ``TypeError``.

        Returns:
            The object returned by ``Engine(...)``.
        """
        # kwargs is a fresh dict on every call, so filling in defaults here
        # cannot leak into the caller or into later calls.
        kwargs.setdefault("trust_remote_code", True)
        kwargs.setdefault("log_level", "error")
        return Engine(model_path=model_path, **kwargs)
from test.test_utils import EngineFixture, DEFAULT_PROMPTS
class TestWithFixture(unittest.TestCase):
    """Example test case built on the shared EngineFixture helper."""

    @classmethod
    def setUpClass(cls):
        """Create one engine for the whole class via the fixture helper."""
        cls.engine = EngineFixture.create_engine()

    @classmethod
    def tearDownClass(cls):
        """Shut the engine down so it does not outlive this test class."""
        cls.engine.shutdown()

    def test_with_defaults(self):
        """Generating for two default prompts returns two outputs."""
        outputs = self.engine.generate(
            prompt=DEFAULT_PROMPTS[:2],
            sampling_params={"max_new_tokens": 16},
        )
        self.assertEqual(len(outputs), 2)
Assertions
class TestAssertions(unittest.TestCase):
    """Shows structure, type and value assertions on a generation result."""

    def test_output_format(self):
        """Validate the shape of the dict returned for a single prompt."""
        result = self.engine.generate(prompt="Hello")

        # Structure: a dict exposing both expected keys.
        self.assertIsInstance(result, dict)
        self.assertIn("text", result)
        self.assertIn("meta_info", result)

        # Types of the two top-level entries.
        generated_text = result["text"]
        meta = result["meta_info"]
        self.assertIsInstance(generated_text, str)
        self.assertIsInstance(meta, dict)

        # Values: some text was produced and the prompt token count is positive.
        self.assertGreater(len(generated_text), 0)
        self.assertIn("prompt_tokens", meta)
        self.assertGreater(meta["prompt_tokens"], 0)
Performance Testing
Throughput Test
import time
class TestPerformance(unittest.TestCase):
    """Throughput check for batched generation."""

    def test_throughput(self):
        """Test generation throughput."""
        num_requests = 100
        prompts = ["Hello world"] * num_requests

        # perf_counter() is monotonic and high-resolution; time.time() can
        # jump when the system clock is adjusted and skew the measurement.
        start = time.perf_counter()
        outputs = self.engine.generate(
            prompt=prompts,
            sampling_params={"max_new_tokens": 16},
        )
        duration = time.perf_counter() - start

        # Only trust the figure if every request actually produced an output.
        self.assertEqual(len(outputs), num_requests)

        throughput = num_requests / duration
        print(f"Throughput: {throughput:.2f} req/s")

        # Assert minimum throughput
        self.assertGreater(throughput, 10.0)  # At least 10 req/s
Latency Test
class TestLatency(unittest.TestCase):
    """Latency check for single-prompt generation."""

    def test_latency(self):
        """Test generation latency."""
        latencies = []
        for _ in range(10):
            # perf_counter() is monotonic and high-resolution, so short
            # intervals are not distorted by system clock adjustments.
            start = time.perf_counter()
            self.engine.generate(
                prompt="Hello",
                sampling_params={"max_new_tokens": 16},
            )
            latencies.append(time.perf_counter() - start)

        avg_latency = sum(latencies) / len(latencies)
        print(f"Average latency: {avg_latency*1000:.2f}ms")

        # Assert maximum latency
        self.assertLess(avg_latency, 1.0)  # Less than 1 second
Integration Testing
End-to-End Test
class TestEndToEnd(unittest.TestCase):
    """End-to-end integration tests."""

    def test_chat_conversation(self):
        """Run a two-turn conversation and check both answers."""
        # NOTE(review): confirm that Engine.generate accepts a `messages`
        # keyword; the other examples on this page pass `prompt=` instead.
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is 2+2?"},
        ]

        # Turn 1: the reply should mention the sum.
        first_reply = self.engine.generate(
            messages=messages,
            sampling_params={"max_new_tokens": 32},
        )
        self.assertIn("4", first_reply["text"])

        # Turn 2: feed the reply back as history, then ask a follow-up.
        messages.extend([
            {"role": "assistant", "content": first_reply["text"]},
            {"role": "user", "content": "What about 3+3?"},
        ])
        second_reply = self.engine.generate(
            messages=messages,
            sampling_params={"max_new_tokens": 32},
        )
        self.assertIn("6", second_reply["text"])
Accuracy Testing
GSM8K Test
import json
import re
class TestAccuracy(unittest.TestCase):
    """Accuracy check against GSM8K-style examples stored as JSON lines."""

    def test_gsm8k_few_shot(self):
        """Test GSM8K accuracy.

        The test is skipped when the data file is missing or holds no
        examples, so the accuracy ratio is never computed over zero items.
        """
        data_path = "test/data/gsm8k_test.jsonl"
        try:
            with open(data_path, encoding="utf-8") as f:
                # Skip blank lines (e.g. a trailing newline) rather than
                # letting json.loads fail on them.
                examples = [json.loads(line) for line in f if line.strip()][:100]
        except FileNotFoundError:
            self.skipTest(f"GSM8K data not found at {data_path}")
        if not examples:
            self.skipTest(f"No examples in {data_path}")

        correct = 0
        for example in examples:
            question = example["question"]
            answer = example["answer"]

            # Generate response
            output = self.engine.generate(
                prompt=f"Question: {question}\nAnswer:",
                sampling_params={"max_new_tokens": 256, "temperature": 0},
            )

            # Extract predicted answer
            pred = self._extract_answer(output["text"])
            gold = self._extract_answer(answer)

            # When extraction fails on both sides, None == None would be
            # counted as a correct answer; require a real prediction.
            if pred is not None and pred == gold:
                correct += 1

        accuracy = correct / len(examples)
        print(f"GSM8K Accuracy: {accuracy*100:.2f}%")

        # Assert minimum accuracy
        self.assertGreater(accuracy, 0.70)  # At least 70% accuracy

    def _extract_answer(self, text):
        """Extract the numerical answer that follows a ``####`` marker.

        Args:
            text: Text to search.

        Returns:
            The number as a string with thousands separators removed
            (optionally negative and/or with a decimal part), or ``None``
            when no ``####`` marker followed by a number is found.
        """
        match = re.search(r"####\s*(-?[\d,]*\.?\d+)", text)
        if match:
            return match.group(1).replace(",", "")
        return None
Mocking and Fixtures
Mock External Dependencies
from unittest.mock import Mock, patch
class TestWithMocks(unittest.TestCase):
    """Shows how to replace an outgoing HTTP call with a mock."""

    @patch('requests.post')
    def test_api_call(self, mock_post):
        """Call requests.post while it is patched and inspect the reply."""
        # Configure the object that the patched requests.post will return.
        canned_reply = mock_post.return_value
        canned_reply.status_code = 200
        canned_reply.json.return_value = {
            "choices": [{"message": {"content": "Hello"}}]
        }

        # Your test code here
        reply = requests.post("http://example.com")
        self.assertEqual(reply.status_code, 200)
Test Best Practices
Keep Tests Fast
- Reuse server instances across tests (use setUpClass)
- Use small models for testing (e.g., Llama-3.2-1B)
- Set short max_new_tokens for speed
- Split long test files into multiple files
Make Tests Deterministic
def test_deterministic_output(self):
    """Two identical requests with fixed sampling settings give equal text."""
    sampling_params = {
        "max_new_tokens": 16,
        "temperature": 0,  # Deterministic
        "seed": 42,
    }

    # Issue the same request twice and keep only the generated text.
    texts = []
    for _ in range(2):
        result = self.engine.generate(prompt="Hello", sampling_params=sampling_params)
        texts.append(result["text"])

    first_text, second_text = texts
    self.assertEqual(first_text, second_text)
Test Error Handling
def test_invalid_model(self):
    """Constructing an Engine for an unknown model must raise ValueError."""
    # Callable form of assertRaises: invokes Engine(model_path=...) itself.
    self.assertRaises(ValueError, Engine, model_path="nonexistent/model")
def test_invalid_params(self):
    """Generating with a negative temperature must raise ValueError."""
    bad_sampling = {"temperature": -1}  # Invalid
    with self.assertRaises(ValueError):
        self.engine.generate(prompt="Hello", sampling_params=bad_sampling)
Use Descriptive Test Names
# Good: the name states the scenario being tested.
def test_batch_generation_with_different_lengths(self):
    pass


# Bad: the name does not say which batch behavior is checked.
def test_batch(self):
    pass
Document Tests
def test_streaming_with_function_calling(self):
    """Test that streaming works correctly with function calling.

    This test verifies that:
    1. Function calls are properly streamed
    2. Arguments are accumulated correctly
    3. Final message contains complete function call
    """
    # Test code here
CI Integration
GitHub Actions
Tests run automatically in CI. See workflow configuration in .github/workflows/.
Skipping Slow Tests
import os
import unittest


class TestSlow(unittest.TestCase):
    """Example of excluding an expensive test from CI runs."""

    # The decorator argument is evaluated when the class is defined, so `os`
    # must already be imported here; otherwise the module raises NameError.
    @unittest.skipIf(os.getenv("CI") == "true", "Slow test, skip in CI")
    def test_expensive_operation(self):
        """This test is too slow for CI."""
        pass
Debugging Tests
Print Debug Info
def test_with_debug(self):
    """Print the raw result and its completion token count, then assert on it."""
    output = self.engine.generate(prompt="Hello")

    # Debug prints; run pytest with -s to see them.
    print(f"Output: {output}")
    print(f"Tokens: {output['meta_info']['completion_tokens']}")

    completion_tokens = output["meta_info"]["completion_tokens"]
    self.assertGreater(completion_tokens, 0)
Run Single Test with Verbose Output
python -m pytest test/srt/test_engine.py::TestEngine::test_generate -v -s
Use Python Debugger
def test_with_debugger(self):
    """Pause in pdb right after generation so the result can be inspected."""
    output = self.engine.generate(prompt="Hello")

    # Debugger breakpoint: execution stops here until you continue.
    import pdb
    pdb.set_trace()

    self.assertIn("text", output)
Common Patterns
Parameterized Tests
import unittest
from parameterized import parameterized
class TestParameterized(unittest.TestCase):
    """Runs one test body against several output-length limits."""

    @parameterized.expand([
        ("short", 16),
        ("medium", 64),
        ("long", 256),
    ])
    def test_different_lengths(self, name, max_tokens):
        """The completion never exceeds the requested token limit."""
        result = self.engine.generate(
            prompt="Hello",
            sampling_params={"max_new_tokens": max_tokens},
        )
        produced = result["meta_info"]["completion_tokens"]
        self.assertLessEqual(produced, max_tokens)
Temporary Files
import tempfile
import os
class TestWithFiles(unittest.TestCase):
    """Shows how to use a temporary file that is always cleaned up."""

    def test_with_temp_file(self):
        """Test with temporary file."""
        # delete=False keeps the file on disk after it is closed so it can be
        # reopened by path; removal is handled in the finally clause below.
        handle = tempfile.NamedTemporaryFile(
            mode='w', delete=False, encoding="utf-8"
        )
        temp_path = handle.name
        try:
            # The write sits inside the try block so the file is removed
            # even if writing fails.
            with handle:
                handle.write("test data")

            # Use temp file
            with open(temp_path, encoding="utf-8") as f:
                data = f.read()
            self.assertEqual(data, "test data")
        finally:
            # Clean up
            os.unlink(temp_path)
Resources
Next Steps
- Contribution Guide - Submit your changes
- Benchmark and Profiling - Performance testing
- Adding Models - Test new models
