Testing

This guide covers testing practices, test structure, and how to write effective tests for SGLang.

Test Structure

SGLang tests are organized in the test/ directory:

test/
├── srt/                    # Runtime tests
│   ├── test_engine.py     # Engine tests
│   ├── test_models.py     # Model tests
│   └── ...
├── lang/                   # Frontend language tests
└── utils/                  # Test utilities

Running Tests

Run All Tests

# Run all tests
python -m pytest test/

# Run with verbose output
python -m pytest test/ -v

# Run with coverage
python -m pytest test/ --cov=sglang --cov-report=html

Run Specific Tests

# Run specific test file
python -m pytest test/srt/test_engine.py

# Run specific test class
python -m pytest test/srt/test_engine.py::TestEngine

# Run specific test method
python -m pytest test/srt/test_engine.py::TestEngine::test_generate

# Run tests matching pattern
python -m pytest test/ -k "test_batch"

Run Tests in Parallel

# Install pytest-xdist
pip install pytest-xdist

# Run tests in parallel
python -m pytest test/ -n auto

Writing Tests

Basic Test Structure

import unittest
from sglang import Engine

class TestMyFeature(unittest.TestCase):
    """Example suite that drives one shared ``Engine`` through ``generate``."""

    @classmethod
    def setUpClass(cls):
        """Build the engine a single time; every test in the class reuses it."""
        engine_options = {
            "model_path": "meta-llama/Llama-3.2-1B",
            "trust_remote_code": True,
        }
        cls.engine = Engine(**engine_options)

    @classmethod
    def tearDownClass(cls):
        """Shut the shared engine down after the class has finished."""
        cls.engine.shutdown()

    def test_basic_generation(self):
        """A single prompt yields a result with a non-empty ``text`` entry."""
        params = {"max_new_tokens": 16}
        result = self.engine.generate(prompt="Hello", sampling_params=params)

        self.assertIn("text", result)
        generated = result["text"]
        self.assertGreater(len(generated), 0)

    def test_batch_generation(self):
        """A list of prompts yields one result per prompt, each with ``text``."""
        batch = ["Hello", "Hi there", "Good morning"]
        params = {"max_new_tokens": 16}
        results = self.engine.generate(prompt=batch, sampling_params=params)

        self.assertEqual(len(results), len(batch))
        for item in results:
            self.assertIn("text", item)

if __name__ == "__main__":
    unittest.main()

Testing with HTTP Server

import unittest
import requests
import subprocess
import time
import signal

class TestHTTPServer(unittest.TestCase):
    """Exercise the HTTP API of a server launched as a subprocess.

    One server process is started for the whole class and stopped afterwards.
    """

    @classmethod
    def setUpClass(cls):
        """Launch the server and block until its health endpoint answers.

        Raises:
            RuntimeError: If the server process exits before becoming ready.
            TimeoutError: If the server is not ready within the wait timeout.
        """
        cls.server = subprocess.Popen([
            "python", "-m", "sglang.launch_server",
            "--model-path", "meta-llama/Llama-3.2-1B",
            "--host", "127.0.0.1",
            "--port", "30000",
        ])
        try:
            cls._wait_for_server()
        except BaseException:
            # unittest does not run tearDownClass when setUpClass raises, so
            # the child process must be stopped here or it would be leaked.
            cls._stop_server()
            raise

    @classmethod
    def tearDownClass(cls):
        """Shutdown server after tests."""
        cls._stop_server()

    @classmethod
    def _stop_server(cls, timeout=30):
        """Interrupt the server, killing it if it ignores the interrupt.

        Args:
            timeout: Seconds to wait for a graceful exit after SIGINT.
        """
        if cls.server.poll() is not None:
            return  # already exited; nothing to signal
        cls.server.send_signal(signal.SIGINT)
        try:
            cls.server.wait(timeout=timeout)
        except subprocess.TimeoutExpired:
            # Graceful shutdown did not finish in time; do not leave the
            # process (and the port it holds) behind.
            cls.server.kill()
            cls.server.wait()

    @classmethod
    def _wait_for_server(cls, timeout=60):
        """Poll the health endpoint until it returns HTTP 200.

        Args:
            timeout: Maximum number of seconds to keep polling.

        Raises:
            RuntimeError: If the server process has already exited.
            TimeoutError: If no HTTP 200 was received within ``timeout``.
        """
        url = "http://127.0.0.1:30000/health"
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            # Fail fast instead of polling a process that is already gone.
            exit_code = cls.server.poll()
            if exit_code is not None:
                raise RuntimeError(
                    f"Server exited with code {exit_code} before becoming ready"
                )
            try:
                # A per-request timeout keeps one stuck connection from
                # blocking past the overall deadline.
                response = requests.get(url, timeout=5)
                if response.status_code == 200:
                    return
            except (requests.ConnectionError, requests.Timeout):
                pass  # not accepting connections yet; retry
            time.sleep(1)
        raise TimeoutError("Server did not start in time")

    def test_chat_completion(self):
        """Test chat completion endpoint."""
        response = requests.post(
            "http://127.0.0.1:30000/v1/chat/completions",
            json={
                "model": "meta-llama/Llama-3.2-1B",
                "messages": [{"role": "user", "content": "Hello"}],
                "max_completion_tokens": 16,
            },
            timeout=60,  # fail the test rather than hang on a stuck server
        )
        self.assertEqual(response.status_code, 200)
        data = response.json()
        self.assertIn("choices", data)
        self.assertGreater(len(data["choices"]), 0)

Testing Async Code

import unittest
import asyncio
from sglang import Engine

class TestAsyncGeneration(unittest.TestCase):
    """Drive the engine's coroutine API from a synchronous test method."""

    def setUp(self):
        """Create a fresh engine before each test."""
        self.engine = Engine(model_path="meta-llama/Llama-3.2-1B")

    def tearDown(self):
        """Shut the engine down after each test."""
        self.engine.shutdown()

    def test_async_generate(self):
        """Await ``async_generate`` once and check the result carries ``text``."""

        async def generate_once():
            params = {"max_new_tokens": 16}
            result = await self.engine.async_generate(
                prompt="Hello",
                sampling_params=params,
            )
            self.assertIn("text", result)
            return result

        # asyncio.run executes the coroutine to completion on a new event loop.
        generated = asyncio.run(generate_once())
        self.assertIsNotNone(generated)

Test Utilities

Reusable Fixtures

Create shared fixtures in test/test_utils.py:

# test/test_utils.py
from sglang import Engine

DEFAULT_PROMPTS = [
    "Once upon a time",
    "In a galaxy far far away",
    "The quick brown fox",
]

class EngineFixture:
    """Reusable engine fixture."""

    @classmethod
    def create_engine(cls, model_path="meta-llama/Llama-3.2-1B", **kwargs):
        """Create an ``Engine`` with test-friendly defaults.

        Args:
            model_path: Model identifier forwarded to ``Engine``.
            **kwargs: Extra ``Engine`` options. ``trust_remote_code`` and
                ``log_level`` given here replace the fixture defaults
                (``True`` and ``"error"``).

        Returns:
            The newly constructed ``Engine``.
        """
        # setdefault (instead of fixed keywords) lets a caller override a
        # default; supplying the same keyword twice would raise TypeError.
        kwargs.setdefault("trust_remote_code", True)
        kwargs.setdefault("log_level", "error")
        return Engine(model_path=model_path, **kwargs)

Use in tests:

from test.test_utils import EngineFixture, DEFAULT_PROMPTS

class TestWithFixture(unittest.TestCase):
    """Example suite that obtains its engine from ``EngineFixture``."""

    @classmethod
    def setUpClass(cls):
        """Create one engine for the whole class via the shared fixture."""
        cls.engine = EngineFixture.create_engine()

    @classmethod
    def tearDownClass(cls):
        """Shut the engine down so it does not outlive this test class."""
        cls.engine.shutdown()

    def test_with_defaults(self):
        """Generating for two default prompts returns two outputs."""
        outputs = self.engine.generate(
            prompt=DEFAULT_PROMPTS[:2],
            sampling_params={"max_new_tokens": 16}
        )
        self.assertEqual(len(outputs), 2)

Assertions

class TestAssertions(unittest.TestCase):
    """Layered checks on one ``generate`` result: keys, then types, then values."""

    def test_output_format(self):
        """Validate the structure of the result for a single prompt."""
        result = self.engine.generate(prompt="Hello")

        # Structure: a dict exposing both top-level keys.
        self.assertIsInstance(result, dict)
        for key in ("text", "meta_info"):
            self.assertIn(key, result)

        # Types of the two top-level entries.
        for key, expected_type in (("text", str), ("meta_info", dict)):
            self.assertIsInstance(result[key], expected_type)

        # Values: non-empty text and a positive prompt token count.
        self.assertGreater(len(result["text"]), 0)
        meta = result["meta_info"]
        self.assertIn("prompt_tokens", meta)
        self.assertGreater(meta["prompt_tokens"], 0)

Performance Testing

Throughput Test

import time

class TestPerformance(unittest.TestCase):
    """Coarse throughput check for batched generation."""

    def test_throughput(self):
        """Generate 100 prompts in one batch and require more than 10 req/s."""
        num_requests = 100
        prompts = ["Hello world"] * num_requests

        # perf_counter is the clock intended for measuring short intervals;
        # time.time() follows the wall clock and can jump if it is adjusted.
        start = time.perf_counter()
        outputs = self.engine.generate(
            prompt=prompts,
            sampling_params={"max_new_tokens": 16}
        )
        duration = time.perf_counter() - start

        # Throughput is computed from the request count, so make sure every
        # request actually produced an output before trusting the number.
        self.assertEqual(len(outputs), num_requests)

        throughput = num_requests / duration
        print(f"Throughput: {throughput:.2f} req/s")

        # Assert minimum throughput
        self.assertGreater(throughput, 10.0)  # At least 10 req/s

Latency Test

class TestLatency(unittest.TestCase):
    """Coarse latency check for single-prompt generation."""

    def test_latency(self):
        """Time 10 sequential requests and require a mean under one second."""
        latencies = []

        for _ in range(10):
            # perf_counter is the clock intended for measuring short
            # intervals; time.time() can jump if the wall clock is adjusted.
            start = time.perf_counter()
            self.engine.generate(
                prompt="Hello",
                sampling_params={"max_new_tokens": 16}
            )
            latencies.append(time.perf_counter() - start)

        avg_latency = sum(latencies) / len(latencies)
        print(f"Average latency: {avg_latency*1000:.2f}ms")

        # Assert maximum latency
        self.assertLess(avg_latency, 1.0)  # Less than 1 second

Integration Testing

End-to-End Test

class TestEndToEnd(unittest.TestCase):
    """End-to-end integration tests."""

    def test_chat_conversation(self):
        """Run a two-turn conversation and check each reply for the expected digit."""
        history = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is 2+2?"},
        ]

        def ask():
            # Sends the conversation accumulated so far.
            return self.engine.generate(
                messages=history,
                sampling_params={"max_new_tokens": 32},
            )

        # Turn 1: the reply must mention 4.
        first_reply = ask()
        self.assertIn("4", first_reply["text"])

        # Turn 2: feed the first reply back, then ask the follow-up.
        history.extend([
            {"role": "assistant", "content": first_reply["text"]},
            {"role": "user", "content": "What about 3+3?"},
        ])
        second_reply = ask()
        self.assertIn("6", second_reply["text"])

Accuracy Testing

GSM8K Test

import json
import re

class TestAccuracy(unittest.TestCase):
    """Measure answer accuracy on a sample of GSM8K problems."""

    def test_gsm8k_few_shot(self):
        """Score up to 100 GSM8K problems and require more than 70% correct.

        NOTE(review): despite the name, the prompt carries no few-shot
        exemplars; a prediction only counts when the completion contains a
        ``#### <number>`` marker that ``_extract_answer`` can parse.
        """
        # Load at most the first 100 examples, skipping blank lines so a
        # trailing newline in the file does not crash json.loads.
        examples = []
        with open("test/data/gsm8k_test.jsonl", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                examples.append(json.loads(line))
                if len(examples) == 100:
                    break

        # Fail with a clear message instead of a ZeroDivisionError below.
        self.assertTrue(examples, "GSM8K data file contains no examples")

        correct = 0
        for example in examples:
            question = example["question"]
            answer = example["answer"]

            # Generate response
            output = self.engine.generate(
                prompt=f"Question: {question}\nAnswer:",
                sampling_params={"max_new_tokens": 256, "temperature": 0}
            )

            # Extract predicted answer
            pred = self._extract_answer(output["text"])
            gold = self._extract_answer(answer)

            # A failed extraction returns None on either side; two Nones
            # must not be scored as a correct answer.
            if pred is not None and pred == gold:
                correct += 1

        accuracy = correct / len(examples)
        print(f"GSM8K Accuracy: {accuracy*100:.2f}%")

        # Assert minimum accuracy
        self.assertGreater(accuracy, 0.70)  # At least 70% accuracy

    def _extract_answer(self, text):
        """Extract the number that follows a ``####`` marker.

        Args:
            text: Text to search.

        Returns:
            The digits (with an optional leading minus sign and thousands
            separators removed) as a string, or ``None`` if no marker with a
            number is found.
        """
        match = re.search(r"####\s*(-?\d[\d,]*)", text)
        if match:
            return match.group(1).replace(",", "")
        return None

Mocking and Fixtures

Mock External Dependencies

from unittest.mock import Mock, patch

class TestWithMocks(unittest.TestCase):
    """Replace ``requests.post`` with a mock for the duration of a test."""

    def test_api_call(self):
        """Call the patched ``requests.post`` and read back the canned status."""
        canned_body = {
            "choices": [{"message": {"content": "Hello"}}]
        }
        with patch('requests.post') as mock_post:
            fake_response = mock_post.return_value
            fake_response.status_code = 200
            fake_response.json.return_value = canned_body

            # Your test code here
            response = requests.post("http://example.com")
            self.assertEqual(response.status_code, 200)

Test Best Practices

Keep Tests Fast

- Reuse server instances across tests (use setUpClass)
- Use small models for testing (e.g., Llama-3.2-1B)
- Set short max_new_tokens for speed
- Split long test files into multiple files

Make Tests Deterministic

def test_deterministic_output(self):
    """Two identical requests at temperature 0 with a fixed seed return the same text."""
    sampling_params = dict(
        max_new_tokens=16,
        temperature=0,  # Deterministic
        seed=42,
    )

    # Issue the same request twice, one after the other.
    first, second = (
        self.engine.generate(prompt="Hello", sampling_params=sampling_params)
        for _ in range(2)
    )

    self.assertEqual(first["text"], second["text"])

Test Error Handling

def test_invalid_model(self):
    """Constructing an ``Engine`` for an unknown model path raises ``ValueError``."""
    # Callable form of assertRaises: it invokes Engine(model_path=...) itself.
    self.assertRaises(ValueError, Engine, model_path="nonexistent/model")

def test_invalid_params(self):
    """A negative temperature is rejected by ``generate`` with ``ValueError``."""
    bad_params = {"temperature": -1}  # Invalid
    # Callable form of assertRaises: it invokes generate(...) itself.
    self.assertRaises(
        ValueError,
        self.engine.generate,
        prompt="Hello",
        sampling_params=bad_params,
    )

Use Descriptive Test Names

# Good: the name states the scenario being exercised
def test_batch_generation_with_different_lengths(self):
    pass

# Bad: the name does not say which batch behavior is being checked
def test_batch(self):
    pass

Document Tests

def test_streaming_with_function_calling(self):
    """Test that streaming works correctly with function calling.

    This test verifies that:
    1. Function calls are properly streamed
    2. Arguments are accumulated correctly
    3. The final message contains the complete function call
    """
    # Test code here

CI Integration

GitHub Actions

Tests run automatically in CI. See workflow configuration in .github/workflows/.

Skipping Slow Tests

import os
import unittest

class TestSlow(unittest.TestCase):
    """Example of excluding an expensive test from CI runs."""

    # The CI environment variable is read once, when the class is defined.
    # Accept "true" in any letter case as well as "1", since CI systems do
    # not all spell the flag the same way.
    @unittest.skipIf(
        os.getenv("CI", "").strip().lower() in ("1", "true"),
        "Slow test, skip in CI",
    )
    def test_expensive_operation(self):
        """This test is too slow for CI."""
        pass

Debugging Tests

Print Debug Info

def test_with_debug(self):
    """Print the raw result and its completion-token count, then check the count is positive."""
    result = self.engine.generate(prompt="Hello")

    # Print for debugging
    print(f"Output: {result}")
    token_count = result["meta_info"]["completion_tokens"]
    print(f"Tokens: {token_count}")

    self.assertGreater(result["meta_info"]["completion_tokens"], 0)

Run Single Test with Verbose Output

python -m pytest test/srt/test_engine.py::TestEngine::test_generate -v -s

Use Python Debugger

def test_with_debugger(self):
    """Pause in pdb right after generation so the result can be inspected."""
    import pdb

    result = self.engine.generate(prompt="Hello")

    pdb.set_trace()  # Debugger breakpoint

    self.assertIn("text", result)

Common Patterns

Parameterized Tests

import unittest
from parameterized import parameterized

class TestParameterized(unittest.TestCase):
    """Run one test body against several ``max_new_tokens`` budgets."""

    @parameterized.expand([
        ("short", 16),
        ("medium", 64),
        ("long", 256),
    ])
    def test_different_lengths(self, name, max_tokens):
        """The reported completion token count never exceeds ``max_tokens``.

        Each tuple passed to ``parameterized.expand`` supplies
        ``(name, max_tokens)`` for one case.
        """
        params = {"max_new_tokens": max_tokens}
        result = self.engine.generate(prompt="Hello", sampling_params=params)

        produced = result["meta_info"]["completion_tokens"]
        self.assertLessEqual(produced, max_tokens)

Temporary Files

import tempfile
import os

class TestWithFiles(unittest.TestCase):
    """Round-trip a string through a temporary file on disk."""

    def test_with_temp_file(self):
        """Write a temp file, read it back, and always remove it afterwards."""
        payload = "test data"

        # delete=False keeps the file on disk after the handle is closed so
        # it can be reopened by path below.
        handle = tempfile.NamedTemporaryFile(mode='w', delete=False)
        with handle:
            handle.write(payload)
            temp_path = handle.name

        try:
            # Use temp file
            with open(temp_path) as reader:
                contents = reader.read()
            self.assertEqual(contents, payload)
        finally:
            # Clean up even when the assertion above fails.
            os.unlink(temp_path)

Resources

Next Steps

- Contribution Guide - Submit your changes
- Benchmark and Profiling - Performance testing
- Adding Models - Test new models

Contributing

Architecture

Documentation Index

Testing

Test Structure

Running Tests

Run All Tests

Run Specific Tests

Run Tests in Parallel

Writing Tests

Basic Test Structure

Testing with HTTP Server

Testing Async Code

Test Utilities

Reusable Fixtures

Assertions

Performance Testing

Throughput Test

Latency Test

Integration Testing

End-to-End Test

Accuracy Testing

GSM8K Test

Mocking and Fixtures

Mock External Dependencies

Test Best Practices

Keep Tests Fast

Make Tests Deterministic

Test Error Handling

Use Descriptive Test Names

Document Tests

CI Integration

GitHub Actions

Skipping Slow Tests

Debugging Tests

Print Debug Info

Run Single Test with Verbose Output

Use Python Debugger

Common Patterns

Parameterized Tests

Temporary Files

Resources

Next Steps