Performance

Memory Optimization

Object pooling, arena allocation, and GC tuning for zero-allocation hot paths

Memory Optimization

LX achieves zero heap allocations on the hot path through aggressive pooling, arena allocation, and GC tuning.

Memory Architecture

┌─────────────────────────────────────────────────────────────────────────┐
│                        MEMORY ALLOCATION STRATEGY                        │
├─────────────────────────────────────────────────────────────────────────┤
│                                                                          │
│   Hot Path (Zero Allocation)                                            │
│   ┌─────────────────────────────────────────────────────────────────┐  │
│   │  Order Pool ─→ Trade Pool ─→ Message Pool                       │  │
│   │  Pre-allocated │ Fixed-size │ Recycled                          │  │
│   └─────────────────────────────────────────────────────────────────┘  │
│                                                                          │
│   Warm Path (Pooled Allocation)                                         │
│   ┌─────────────────────────────────────────────────────────────────┐  │
│   │  OrderBook Nodes │ Price Levels │ Ring Buffers                  │  │
│   │  Sync.Pool │ Arena │ Slab Allocator                             │  │
│   └─────────────────────────────────────────────────────────────────┘  │
│                                                                          │
│   Cold Path (Standard Allocation)                                       │
│   ┌─────────────────────────────────────────────────────────────────┐  │
│   │  Configuration │ Logging │ Metrics │ Admin APIs                 │  │
│   │  Normal GC │ Not latency sensitive                              │  │
│   └─────────────────────────────────────────────────────────────────┘  │
│                                                                          │
└─────────────────────────────────────────────────────────────────────────┘

Object Pooling

High-Performance Object Pool

package pool

import (
    "sync"
    "sync/atomic"
    "unsafe"
)

// Pool is a lock-free object pool
type Pool[T any] struct {
    local     []poolLocal
    localSize int
    New       func() T
}

type poolLocal struct {
    _       [64]byte // Cache line padding
    private unsafe.Pointer
    shared  poolChain
    _       [64]byte
}

type poolChain struct {
    head atomic.Pointer[poolChainElt]
    tail atomic.Pointer[poolChainElt]
}

type poolChainElt struct {
    next  atomic.Pointer[poolChainElt]
    value unsafe.Pointer
}

// NewPool creates a new object pool
func NewPool[T any](new func() T) *Pool[T] {
    numCPU := runtime.GOMAXPROCS(0)
    return &Pool[T]{
        local:     make([]poolLocal, numCPU),
        localSize: numCPU,
        New:       new,
    }
}

// Get retrieves an object from the pool
func (p *Pool[T]) Get() T {
    // Try to get from local private first (no atomic)
    l := p.pin()
    ptr := l.private
    if ptr != nil {
        l.private = nil
        return *(*T)(ptr)
    }

    // Try local shared
    obj := l.shared.popHead()
    if obj != nil {
        return *(*T)(obj)
    }

    // Try to steal from other locals
    for i := 0; i < p.localSize; i++ {
        other := &p.local[i]
        obj = other.shared.popTail()
        if obj != nil {
            return *(*T)(obj)
        }
    }

    // Create new
    return p.New()
}

// Put returns an object to the pool
func (p *Pool[T]) Put(x T) {
    l := p.pin()
    if l.private == nil {
        l.private = unsafe.Pointer(&x)
        return
    }
    l.shared.pushHead(unsafe.Pointer(&x))
}

func (p *Pool[T]) pin() *poolLocal {
    pid := runtime_procPin()
    return &p.local[pid%p.localSize]
}

//go:linkname runtime_procPin runtime.procPin
func runtime_procPin() int

Order Pool Implementation

package orderbook

import (
    "sync"
)

const (
    OrderPoolSize = 100000
    TradePoolSize = 10000
)

// OrderPool manages pre-allocated orders
type OrderPool struct {
    orders  []Order
    free    chan int // Indices of free orders
    inUse   []bool
}

func NewOrderPool() *OrderPool {
    p := &OrderPool{
        orders: make([]Order, OrderPoolSize),
        free:   make(chan int, OrderPoolSize),
        inUse:  make([]bool, OrderPoolSize),
    }

    // Pre-populate free list
    for i := 0; i < OrderPoolSize; i++ {
        p.free <- i
    }

    return p
}

// Acquire gets an order from the pool
func (p *OrderPool) Acquire() *Order {
    select {
    case idx := <-p.free:
        p.inUse[idx] = true
        order := &p.orders[idx]
        order.Reset()
        return order
    default:
        // Pool exhausted, allocate new (should be rare)
        return &Order{}
    }
}

// Release returns an order to the pool
func (p *OrderPool) Release(order *Order) {
    // Find index
    idx := int((uintptr(unsafe.Pointer(order)) - uintptr(unsafe.Pointer(&p.orders[0]))) / unsafe.Sizeof(Order{}))

    if idx >= 0 && idx < OrderPoolSize && p.inUse[idx] {
        p.inUse[idx] = false
        select {
        case p.free <- idx:
        default:
            // Channel full, discard
        }
    }
}

// Order reset for reuse
func (o *Order) Reset() {
    o.ID = 0
    o.Symbol = ""
    o.Side = 0
    o.Type = 0
    o.Price = 0
    o.Quantity = 0
    o.FilledQty = 0
    o.Status = 0
    o.Timestamp = 0
}

Arena Allocation

Memory Arena for Batch Processing

package arena

import (
    "sync"
    "unsafe"
)

const (
    ArenaSize     = 1 << 20 // 1MB arenas
    MaxArenas     = 64
)

// Arena is a bump allocator for temporary allocations
type Arena struct {
    current  []byte
    offset   int
    arenas   [][]byte
    mu       sync.Mutex
}

func NewArena() *Arena {
    a := &Arena{
        arenas: make([][]byte, 0, MaxArenas),
    }
    a.grow()
    return a
}

func (a *Arena) grow() {
    block := make([]byte, ArenaSize)
    a.arenas = append(a.arenas, block)
    a.current = block
    a.offset = 0
}

// Alloc allocates n bytes from the arena
func (a *Arena) Alloc(n int) []byte {
    // Align to 8 bytes
    n = (n + 7) &^ 7

    if a.offset+n > len(a.current) {
        a.grow()
    }

    ptr := a.current[a.offset : a.offset+n]
    a.offset += n

    return ptr
}

// AllocOrder allocates an Order from the arena
func (a *Arena) AllocOrder() *Order {
    size := int(unsafe.Sizeof(Order{}))
    ptr := a.Alloc(size)
    return (*Order)(unsafe.Pointer(&ptr[0]))
}

// Reset releases all allocations (bulk free)
func (a *Arena) Reset() {
    // Keep first arena, release others
    for i := 1; i < len(a.arenas); i++ {
        a.arenas[i] = nil
    }
    a.arenas = a.arenas[:1]
    a.current = a.arenas[0]
    a.offset = 0
}

// BatchProcessor uses arena for zero-GC batch processing
type BatchProcessor struct {
    arena *Arena
}

func (bp *BatchProcessor) ProcessBatch(rawOrders [][]byte) []*Order {
    orders := make([]*Order, len(rawOrders))

    for i, raw := range rawOrders {
        order := bp.arena.AllocOrder()
        decodeOrder(raw, order) // No allocation
        orders[i] = order
    }

    return orders
}

func (bp *BatchProcessor) EndBatch() {
    bp.arena.Reset() // Bulk free all orders
}

Slab Allocator

Fixed-Size Slab Allocator

package slab

import (
    "sync"
    "unsafe"
)

// Slab allocates fixed-size objects efficiently
type Slab struct {
    objectSize int
    slabSize   int
    slabs      [][]byte
    freeList   *freeNode
    mu         sync.Mutex
}

type freeNode struct {
    next *freeNode
}

func NewSlab(objectSize, objectsPerSlab int) *Slab {
    // Ensure minimum size for free list pointer
    if objectSize < int(unsafe.Sizeof(freeNode{})) {
        objectSize = int(unsafe.Sizeof(freeNode{}))
    }

    s := &Slab{
        objectSize: objectSize,
        slabSize:   objectSize * objectsPerSlab,
    }
    s.grow()
    return s
}

func (s *Slab) grow() {
    slab := make([]byte, s.slabSize)
    s.slabs = append(s.slabs, slab)

    // Build free list
    for i := 0; i < len(slab); i += s.objectSize {
        node := (*freeNode)(unsafe.Pointer(&slab[i]))
        node.next = s.freeList
        s.freeList = node
    }
}

// Alloc returns a pointer to an object
func (s *Slab) Alloc() unsafe.Pointer {
    s.mu.Lock()
    defer s.mu.Unlock()

    if s.freeList == nil {
        s.grow()
    }

    ptr := unsafe.Pointer(s.freeList)
    s.freeList = s.freeList.next

    // Zero memory
    for i := 0; i < s.objectSize; i++ {
        *(*byte)(unsafe.Add(ptr, i)) = 0
    }

    return ptr
}

// Free returns an object to the slab
func (s *Slab) Free(ptr unsafe.Pointer) {
    s.mu.Lock()
    defer s.mu.Unlock()

    node := (*freeNode)(ptr)
    node.next = s.freeList
    s.freeList = node
}

// Usage example
var orderSlab = NewSlab(int(unsafe.Sizeof(Order{})), 10000)

func AllocOrder() *Order {
    return (*Order)(orderSlab.Alloc())
}

func FreeOrder(o *Order) {
    orderSlab.Free(unsafe.Pointer(o))
}

GC Tuning

Memory Ballast

package main

import (
    "os"
    "runtime"
    "runtime/debug"
)

// Memory ballast reduces GC frequency by making heap appear larger
var ballast []byte

func init() {
    // 10GB ballast - GC runs when heap doubles from this baseline
    ballast = make([]byte, 10<<30)

    // Set GOGC and GOMEMLIMIT
    debug.SetGCPercent(400) // GC at 400% of baseline (less frequent)
    debug.SetMemoryLimit(32 << 30) // 32GB hard limit
}

func main() {
    // Verify settings
    var stats runtime.MemStats
    runtime.ReadMemStats(&stats)

    log.Printf("Heap: %d MB, GC runs: %d",
        stats.HeapAlloc>>20,
        stats.NumGC,
    )
}

GC Pacing Control

package gc

import (
    "runtime"
    "runtime/debug"
    "time"
)

// GCController manages garbage collection timing
type GCController struct {
    targetPause time.Duration
    maxHeap     uint64
}

func NewGCController(targetPause time.Duration, maxHeap uint64) *GCController {
    return &GCController{
        targetPause: targetPause,
        maxHeap:     maxHeap,
    }
}

func (c *GCController) Start() {
    go c.monitor()
}

func (c *GCController) monitor() {
    ticker := time.NewTicker(100 * time.Millisecond)
    defer ticker.Stop()

    for range ticker.C {
        var stats runtime.MemStats
        runtime.ReadMemStats(&stats)

        // Calculate GC overhead
        if stats.NumGC > 0 {
            avgPause := time.Duration(stats.PauseTotalNs / uint64(stats.NumGC))

            if avgPause > c.targetPause {
                // Increase GOGC to reduce frequency
                currentGOGC := debug.SetGCPercent(-1)
                newGOGC := int(float64(currentGOGC) * 1.1)
                if newGOGC > 1000 {
                    newGOGC = 1000
                }
                debug.SetGCPercent(newGOGC)
            }
        }

        // Force GC if approaching limit
        if stats.HeapAlloc > c.maxHeap*90/100 {
            runtime.GC()
        }
    }
}

// TriggerGCDuringLull forces GC during quiet periods
func (c *GCController) TriggerGCDuringLull(activeOrders int) {
    if activeOrders < 100 {
        // Market is quiet, good time for GC
        runtime.GC()
    }
}

GC-Free Data Structures

package gcfree

import "unsafe"

// GCFreeMap avoids GC by using mmap
type GCFreeMap struct {
    data   []byte   // mmap'd memory
    size   int
    count  int
}

func NewGCFreeMap(maxEntries int) (*GCFreeMap, error) {
    size := maxEntries * 128 // 128 bytes per entry

    // mmap anonymous memory
    data, err := syscall.Mmap(
        -1, 0, size,
        syscall.PROT_READ|syscall.PROT_WRITE,
        syscall.MAP_ANON|syscall.MAP_PRIVATE,
    )
    if err != nil {
        return nil, err
    }

    return &GCFreeMap{
        data: data,
        size: size,
    }, nil
}

func (m *GCFreeMap) Set(key uint64, value []byte) {
    offset := (key % uint64(m.size/128)) * 128

    // Write key
    *(*uint64)(unsafe.Pointer(&m.data[offset])) = key

    // Write length
    *(*int)(unsafe.Pointer(&m.data[offset+8])) = len(value)

    // Write value
    copy(m.data[offset+16:offset+128], value)
}

func (m *GCFreeMap) Get(key uint64) ([]byte, bool) {
    offset := (key % uint64(m.size/128)) * 128

    storedKey := *(*uint64)(unsafe.Pointer(&m.data[offset]))
    if storedKey != key {
        return nil, false
    }

    length := *(*int)(unsafe.Pointer(&m.data[offset+8]))
    value := make([]byte, length)
    copy(value, m.data[offset+16:offset+16+length])

    return value, true
}

func (m *GCFreeMap) Close() {
    syscall.Munmap(m.data)
}

Memory Profiling

# Heap profile
go test -memprofile=mem.prof -bench=BenchmarkOrderBook ./pkg/lx/
go tool pprof -http=:8080 mem.prof

# Allocation tracking
go test -memprofilerate=1 -bench=BenchmarkOrderBook ./pkg/lx/

# Live heap profile
curl http://localhost:6060/debug/pprof/heap > heap.prof
go tool pprof -http=:8080 heap.prof

# GC trace
GODEBUG=gctrace=1 ./lxd 2>&1 | head -20
# Output: gc 1 @0.012s 1%: 0.010+0.33+0.010 ms clock, 0.081+0.10/0.33/0+0.085 ms cpu, 4->4->0 MB, 5 MB goal, 8 P

Memory Benchmarks

func BenchmarkOrderAllocation(b *testing.B) {
    b.Run("HeapAlloc", func(b *testing.B) {
        for i := 0; i < b.N; i++ {
            order := &Order{}
            _ = order
        }
    })

    b.Run("PoolAlloc", func(b *testing.B) {
        pool := NewOrderPool()
        b.ResetTimer()
        for i := 0; i < b.N; i++ {
            order := pool.Acquire()
            pool.Release(order)
        }
    })

    b.Run("ArenaAlloc", func(b *testing.B) {
        arena := NewArena()
        b.ResetTimer()
        for i := 0; i < b.N; i++ {
            _ = arena.AllocOrder()
            if i%1000 == 999 {
                arena.Reset()
            }
        }
    })
}

Results

BenchmarkOrderAllocation/HeapAlloc-16      5234567    228 ns/op    128 B/op    1 allocs/op
BenchmarkOrderAllocation/PoolAlloc-16     23456789     51 ns/op      0 B/op    0 allocs/op
BenchmarkOrderAllocation/ArenaAlloc-16    34567890     34 ns/op      0 B/op    0 allocs/op

Memory Usage (100K active orders):
  Heap allocation: 1.2 GB
  Pool allocation: 847 MB
  Arena allocation: 756 MB

GC Pauses:
  Without tuning: P99 = 5.2 ms
  With ballast:   P99 = 0.3 ms
  With GCFreeMap: P99 = 0.1 ms