Skip to main content

Documentation Index

Fetch the complete documentation index at: https://mintlify.com/sgl-project/sglang/llms.txt

Use this file to discover all available pages before exploring further.

Testing

This guide covers testing practices, test structure, and how to write effective tests for SGLang.

Test Structure

SGLang tests are organized in the test/ directory:
test/
├── srt/                    # Runtime tests
│   ├── test_engine.py     # Engine tests
│   ├── test_models.py     # Model tests
│   └── ...
├── lang/                   # Frontend language tests
└── utils/                  # Test utilities

Running Tests

Run All Tests

# Run all tests
python -m pytest test/

# Run with verbose output
python -m pytest test/ -v

# Run with coverage
python -m pytest test/ --cov=sglang --cov-report=html

Run Specific Tests

# Run specific test file
python -m pytest test/srt/test_engine.py

# Run specific test class
python -m pytest test/srt/test_engine.py::TestEngine

# Run specific test method
python -m pytest test/srt/test_engine.py::TestEngine::test_generate

# Run tests matching pattern
python -m pytest test/ -k "test_batch"

Run Tests in Parallel

# Install pytest-xdist
pip install pytest-xdist

# Run tests in parallel
python -m pytest test/ -n auto

Writing Tests

Basic Test Structure

import unittest
from sglang import Engine

class TestMyFeature(unittest.TestCase):
    """Smoke tests for single-prompt and batched generation on one shared engine."""

    @classmethod
    def setUpClass(cls):
        """Create the engine once; every test in this class reuses it."""
        engine_options = {
            "model_path": "meta-llama/Llama-3.2-1B",
            "trust_remote_code": True,
        }
        cls.engine = Engine(**engine_options)

    @classmethod
    def tearDownClass(cls):
        """Shut the shared engine down after the last test has run."""
        cls.engine.shutdown()

    def test_basic_generation(self):
        """A single prompt yields a result whose ``text`` entry is non-empty."""
        result = self.engine.generate(
            prompt="Hello",
            sampling_params={"max_new_tokens": 16},
        )
        self.assertIn("text", result)
        self.assertGreater(len(result["text"]), 0)

    def test_batch_generation(self):
        """A list of prompts yields one result per prompt, each with ``text``."""
        batch = ["Hello", "Hi there", "Good morning"]
        results = self.engine.generate(
            prompt=batch,
            sampling_params={"max_new_tokens": 16},
        )
        self.assertEqual(len(results), len(batch))
        for result in results:
            self.assertIn("text", result)


if __name__ == "__main__":
    unittest.main()

Testing with HTTP Server

import unittest
import requests
import subprocess
import time
import signal

class TestHTTPServer(unittest.TestCase):
    """Test the HTTP API of a server launched as a subprocess.

    The server is started once for the whole class in ``setUpClass`` and
    stopped again in ``tearDownClass``.
    """

    HOST = "127.0.0.1"
    PORT = 30000
    MODEL_PATH = "meta-llama/Llama-3.2-1B"
    BASE_URL = f"http://{HOST}:{PORT}"

    @classmethod
    def setUpClass(cls):
        """Launch the server and block until its health check succeeds."""
        cls.server = subprocess.Popen([
            "python", "-m", "sglang.launch_server",
            "--model-path", cls.MODEL_PATH,
            "--host", cls.HOST,
            "--port", str(cls.PORT),
        ])
        try:
            cls._wait_for_server()
        except BaseException:
            # unittest does not run tearDownClass when setUpClass raises, so
            # the child process must be stopped here or it would be leaked.
            cls._stop_server()
            raise

    @classmethod
    def tearDownClass(cls):
        """Shutdown server after tests."""
        cls._stop_server()

    @classmethod
    def _stop_server(cls, grace_period=30):
        """Send SIGINT to the server and kill it if it does not exit in time.

        Args:
            grace_period: Seconds to wait after SIGINT before killing.
        """
        if cls.server.poll() is not None:
            return  # The process has already exited.
        cls.server.send_signal(signal.SIGINT)
        try:
            cls.server.wait(timeout=grace_period)
        except subprocess.TimeoutExpired:
            cls.server.kill()
            cls.server.wait()

    @classmethod
    def _wait_for_server(cls, timeout=60):
        """Poll the ``/health`` endpoint until it answers with HTTP 200.

        Args:
            timeout: Maximum number of seconds to keep polling.

        Raises:
            RuntimeError: If the server process exits while being polled.
            TimeoutError: If no 200 response arrives within ``timeout``.
        """
        url = f"{cls.BASE_URL}/health"
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            # Fail fast instead of polling a process that has already died.
            if cls.server.poll() is not None:
                raise RuntimeError(
                    f"Server exited early with code {cls.server.returncode}"
                )
            try:
                # A per-request timeout keeps one stalled connection from
                # blocking past the overall deadline.
                if requests.get(url, timeout=5).status_code == 200:
                    return
            except requests.RequestException:
                # Not reachable yet (refused connection, timeout, ...): retry.
                pass
            time.sleep(1)
        raise TimeoutError("Server did not start in time")

    def test_chat_completion(self):
        """Test chat completion endpoint."""
        response = requests.post(
            f"{self.BASE_URL}/v1/chat/completions",
            json={
                "model": self.MODEL_PATH,
                "messages": [{"role": "user", "content": "Hello"}],
                "max_completion_tokens": 16,
            },
            timeout=60,  # Never let a hung server block the test run forever.
        )
        self.assertEqual(response.status_code, 200)
        data = response.json()
        self.assertIn("choices", data)
        self.assertGreater(len(data["choices"]), 0)

Testing Async Code

import unittest
import asyncio
from sglang import Engine

class TestAsyncGeneration(unittest.TestCase):
    """Drive the engine's coroutine API from a synchronous test case."""

    def setUp(self):
        """Create a fresh engine for each test."""
        self.engine = Engine(model_path="meta-llama/Llama-3.2-1B")

    def tearDown(self):
        """Shut down the engine created in ``setUp``."""
        self.engine.shutdown()

    async def _generate_and_check(self):
        """Await one generation, check it contains ``text``, and return it."""
        result = await self.engine.async_generate(
            prompt="Hello",
            sampling_params={"max_new_tokens": 16},
        )
        self.assertIn("text", result)
        return result

    def test_async_generate(self):
        """Test async generation."""
        # asyncio.run drives the coroutine to completion on a new event loop.
        result = asyncio.run(self._generate_and_check())
        self.assertIsNotNone(result)

Test Utilities

Reusable Fixtures

Create shared fixtures in test/test_utils.py:
# test/test_utils.py
from sglang import Engine

# Prompts that tests can import when they only need some input text.
DEFAULT_PROMPTS = [
    "Once upon a time",
    "In a galaxy far far away",
    "The quick brown fox",
]

class EngineFixture:
    """Factory for engines that are configured the same way in every test."""

    @classmethod
    def create_engine(cls, model_path="meta-llama/Llama-3.2-1B", **kwargs):
        """Build an ``Engine`` for ``model_path``.

        ``trust_remote_code=True`` and ``log_level="error"`` are always
        passed; any extra keyword arguments are forwarded unchanged.
        """
        engine = Engine(
            log_level="error",
            trust_remote_code=True,
            model_path=model_path,
            **kwargs,
        )
        return engine
Use the fixture in tests:
from test.test_utils import EngineFixture, DEFAULT_PROMPTS

class TestWithFixture(unittest.TestCase):
    """Example test case built on the shared ``EngineFixture`` helper."""

    @classmethod
    def setUpClass(cls):
        """Create one engine for the whole class through the fixture."""
        cls.engine = EngineFixture.create_engine()

    @classmethod
    def tearDownClass(cls):
        """Shut the engine down so it does not outlive this test class."""
        cls.engine.shutdown()

    def test_with_defaults(self):
        """Generating for two default prompts returns two outputs."""
        outputs = self.engine.generate(
            prompt=DEFAULT_PROMPTS[:2],
            sampling_params={"max_new_tokens": 16},
        )
        self.assertEqual(len(outputs), 2)

Assertions

class TestAssertions(unittest.TestCase):
    """Shows how to validate the structure of a generation result."""

    def test_output_format(self):
        """Check the keys, types, and values of a single generation result."""
        # NOTE(review): assumes self.engine is provided by a fixture that is
        # not shown in this snippet.
        result = self.engine.generate(prompt="Hello")

        # Structure: a dict carrying both the text and its metadata.
        self.assertIsInstance(result, dict)
        for key in ("text", "meta_info"):
            self.assertIn(key, result)

        # Types of the two entries.
        text, meta = result["text"], result["meta_info"]
        self.assertIsInstance(text, str)
        self.assertIsInstance(meta, dict)

        # Values: some text was produced and the prompt token count is positive.
        self.assertGreater(len(text), 0)
        self.assertIn("prompt_tokens", meta)
        self.assertGreater(meta["prompt_tokens"], 0)

Performance Testing

Throughput Test

import time

class TestPerformance(unittest.TestCase):
    """Throughput check for batched generation."""

    def test_throughput(self):
        """Test generation throughput."""
        num_requests = 100
        prompts = ["Hello world"] * num_requests

        # perf_counter is monotonic, so a system clock adjustment during the
        # run cannot distort the measured duration the way time.time() can.
        start = time.perf_counter()
        outputs = self.engine.generate(
            prompt=prompts,
            sampling_params={"max_new_tokens": 16},
        )
        duration = time.perf_counter() - start

        # The throughput figure only means something if every request
        # actually produced an output.
        self.assertEqual(len(outputs), num_requests)

        throughput = num_requests / duration
        print(f"Throughput: {throughput:.2f} req/s")

        # Assert minimum throughput
        self.assertGreater(throughput, 10.0)  # At least 10 req/s

Latency Test

class TestLatency(unittest.TestCase):
    """Latency check for single-prompt generation."""

    def test_latency(self):
        """Test generation latency."""
        latencies = []

        for _ in range(10):
            # perf_counter is monotonic and high resolution; time.time() can
            # jump when the system clock is adjusted.
            start = time.perf_counter()
            self.engine.generate(
                prompt="Hello",
                sampling_params={"max_new_tokens": 16},
            )
            latencies.append(time.perf_counter() - start)

        avg_latency = sum(latencies) / len(latencies)
        print(f"Average latency: {avg_latency*1000:.2f}ms")

        # Assert maximum latency
        self.assertLess(avg_latency, 1.0)  # Less than 1 second

Integration Testing

End-to-End Test

class TestEndToEnd(unittest.TestCase):
    """End-to-end integration tests."""

    # Temperature 0 makes generation deterministic, so the substring checks
    # on the answers below do not fail at random.
    SAMPLING_PARAMS = {"max_new_tokens": 32, "temperature": 0}

    def test_chat_conversation(self):
        """Test multi-turn conversation.

        Asks an arithmetic question, appends the reply to the history, asks
        a follow-up, and checks that each reply contains the expected digit.
        """
        # NOTE(review): assumes self.engine is created by a fixture not shown
        # here and that its generate() accepts a `messages` keyword - confirm
        # against the Engine API before copying this snippet.
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is 2+2?"},
        ]

        # First turn
        output1 = self.engine.generate(
            messages=messages,
            sampling_params=dict(self.SAMPLING_PARAMS),
        )
        self.assertIn("4", output1["text"])

        # Second turn: feed the reply back as conversation history.
        messages.append({"role": "assistant", "content": output1["text"]})
        messages.append({"role": "user", "content": "What about 3+3?"})

        output2 = self.engine.generate(
            messages=messages,
            sampling_params=dict(self.SAMPLING_PARAMS),
        )
        self.assertIn("6", output2["text"])

Accuracy Testing

GSM8K Test

import json
import re

class TestAccuracy(unittest.TestCase):
    """Accuracy check on a slice of the GSM8K test set."""

    def test_gsm8k_few_shot(self):
        """Test GSM8K accuracy on the first 100 examples.

        NOTE(review): despite the name, the prompt built below contains no
        in-context examples - confirm whether few-shot prompting is intended.
        """
        # Load GSM8K examples; blank lines are skipped so that a trailing
        # newline in the file does not crash json.loads.
        with open("test/data/gsm8k_test.jsonl", encoding="utf-8") as f:
            examples = [json.loads(line) for line in f if line.strip()][:100]

        # Guard against dividing by zero below when the file has no examples.
        self.assertTrue(examples, "No GSM8K examples were loaded")

        correct = 0
        for example in examples:
            question = example["question"]
            answer = example["answer"]

            # Generate response
            output = self.engine.generate(
                prompt=f"Question: {question}\nAnswer:",
                sampling_params={"max_new_tokens": 256, "temperature": 0},
            )

            # Extract predicted answer
            pred = self._extract_answer(output["text"])
            gold = self._extract_answer(answer)

            # A failed extraction returns None; two failures must not compare
            # equal and be counted as a correct answer.
            if pred is not None and pred == gold:
                correct += 1

        accuracy = correct / len(examples)
        print(f"GSM8K Accuracy: {accuracy*100:.2f}%")

        # Assert minimum accuracy
        self.assertGreater(accuracy, 0.70)  # At least 70% accuracy

    def _extract_answer(self, text):
        """Extract the number that follows a ``####`` marker.

        Args:
            text: Text to search.

        Returns:
            The number as a string with thousands separators removed
            (an optional leading minus sign and decimal part are kept),
            or ``None`` when no ``####`` marker followed by a number exists.
        """
        match = re.search(r"####\s*(-?[\d,]+(?:\.\d+)?)", text)
        if match:
            return match.group(1).replace(",", "")
        return None

Mocking and Fixtures

Mock External Dependencies

from unittest.mock import Mock, patch

class TestWithMocks(unittest.TestCase):
    """Shows how to replace an outgoing HTTP call with a mock."""

    def test_api_call(self):
        """Test with mocked API call."""
        canned_payload = {"choices": [{"message": {"content": "Hello"}}]}

        # While the context manager is active, requests.post is the mock.
        with patch('requests.post') as mock_post:
            fake_response = mock_post.return_value
            fake_response.status_code = 200
            fake_response.json.return_value = canned_payload

            # Your test code here
            response = requests.post("http://example.com")
            self.assertEqual(response.status_code, 200)

Test Best Practices

Keep Tests Fast

  • Reuse server and engine instances across tests (create them once in setUpClass)
  • Use small models for testing (e.g., Llama-3.2-1B)
  • Set a small max_new_tokens value to keep generation fast
  • Split long test files into multiple files

Make Tests Deterministic

def test_deterministic_output(self):
    """Two identical requests at temperature 0 with a fixed seed give equal text."""
    params = dict(max_new_tokens=16, temperature=0, seed=42)

    # Issue the same request twice before comparing anything.
    first, second = (
        self.engine.generate(prompt="Hello", sampling_params=params),
        self.engine.generate(prompt="Hello", sampling_params=params),
    )

    self.assertEqual(first["text"], second["text"])

Test Error Handling

def test_invalid_model(self):
    """Constructing an engine for an unknown model is expected to raise ValueError."""
    # NOTE(review): the exact exception type raised by Engine is not visible
    # here - confirm that it is ValueError.
    self.assertRaises(ValueError, Engine, model_path="nonexistent/model")

def test_invalid_params(self):
    """Generating with an invalid sampling parameter is expected to raise ValueError."""
    bad_params = {"temperature": -1}  # Invalid
    self.assertRaises(
        ValueError,
        self.engine.generate,
        prompt="Hello",
        sampling_params=bad_params,
    )

Use Descriptive Test Names

# Good
def test_batch_generation_with_different_lengths(self):
    """Placeholder body; the name alone states what is being verified."""

# Bad
def test_batch(self):
    """Placeholder body; the name is too vague to tell what is being verified."""

Document Tests

def test_streaming_with_function_calling(self):
    """Test that streaming works correctly with function calling.

    This test verifies that:
    1. Function calls are properly streamed.
    2. Arguments are accumulated correctly.
    3. The final message contains the complete function call.
    """
    # Test code here

CI Integration

GitHub Actions

Tests run automatically in CI. See workflow configuration in .github/workflows/.

Skipping Slow Tests

import os
import unittest

class TestSlow(unittest.TestCase):
    """Holds tests that are skipped when running in CI."""

    # Evaluated once, while the class body executes.
    _RUNNING_IN_CI = os.getenv("CI") == "true"

    @unittest.skipIf(_RUNNING_IN_CI, "Slow test, skip in CI")
    def test_expensive_operation(self):
        """Placeholder for a slow test; skipped when ``CI`` is ``"true"``."""
        return None

Debugging Tests

def test_with_debug(self):
    """Print the raw result and its completion token count, then check the count."""
    result = self.engine.generate(prompt="Hello")

    # Print for debugging
    print(f"Output: {result}")
    completion_tokens = result["meta_info"]["completion_tokens"]
    print(f"Tokens: {completion_tokens}")

    self.assertGreater(completion_tokens, 0)

Run Single Test with Verbose Output

python -m pytest test/srt/test_engine.py::TestEngine::test_generate -v -s

Use Python Debugger

def test_with_debugger(self):
    """Pause in pdb after generating so the result can be inspected."""
    result = self.engine.generate(prompt="Hello")

    # Debugger breakpoint
    import pdb
    pdb.set_trace()

    self.assertIn("text", result)

Common Patterns

Parameterized Tests

import unittest
from parameterized import parameterized

class TestParameterized(unittest.TestCase):
    """Runs one test body against several token limits."""

    @parameterized.expand([("short", 16), ("medium", 64), ("long", 256)])
    def test_different_lengths(self, name, max_tokens):
        """Test with different output lengths."""
        sampling_params = {"max_new_tokens": max_tokens}
        result = self.engine.generate(prompt="Hello", sampling_params=sampling_params)

        # The number of generated tokens must not exceed the requested limit.
        produced = result["meta_info"]["completion_tokens"]
        self.assertLessEqual(produced, max_tokens)

Temporary Files

import tempfile
import os

class TestWithFiles(unittest.TestCase):
    """Shows how to work with a temporary file inside a test."""

    def test_with_temp_file(self):
        """Write a temporary file, read it back, and remove it afterwards."""
        fd, temp_path = tempfile.mkstemp()
        # Registered right away so the file is removed however the test ends.
        self.addCleanup(os.unlink, temp_path)

        with os.fdopen(fd, 'w') as writer:
            writer.write("test data")

        # Use temp file
        with open(temp_path) as reader:
            contents = reader.read()
        self.assertEqual(contents, "test data")

Resources

Next Steps