Performance
Memory Optimization
Object pooling, arena allocation, and GC tuning for zero-allocation hot paths
LX achieves zero heap allocations on the hot path through aggressive pooling, arena allocation, and GC tuning.
Memory Architecture
┌─────────────────────────────────────────────────────────────────────────┐
│ MEMORY ALLOCATION STRATEGY │
├─────────────────────────────────────────────────────────────────────────┤
│ │
│ Hot Path (Zero Allocation) │
│ ┌─────────────────────────────────────────────────────────────────┐ │
│ │ Order Pool ─→ Trade Pool ─→ Message Pool │ │
│ │ Pre-allocated │ Fixed-size │ Recycled │ │
│ └─────────────────────────────────────────────────────────────────┘ │
│ │
│ Warm Path (Pooled Allocation) │
│ ┌─────────────────────────────────────────────────────────────────┐ │
│ │ OrderBook Nodes │ Price Levels │ Ring Buffers │ │
│ │ Sync.Pool │ Arena │ Slab Allocator │ │
│ └─────────────────────────────────────────────────────────────────┘ │
│ │
│ Cold Path (Standard Allocation) │
│ ┌─────────────────────────────────────────────────────────────────┐ │
│ │ Configuration │ Logging │ Metrics │ Admin APIs │ │
│ │ Normal GC │ Not latency sensitive │ │
│ └─────────────────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────┘
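The tiers above map directly to allocator choice. As a purely illustrative sketch (orderPool refers to the OrderPool implemented in the sections below; the cold path just uses the ordinary heap):

// Hypothetical routing: pick an allocator based on the path.
func newOrder(hotPath bool) *Order {
    if hotPath {
        return orderPool.Acquire() // hot path: pre-allocated, zero-alloc
    }
    return &Order{} // cold path: normal heap allocation, GC-managed
}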
Object Pooling
High-Performance Object Pool
package pool

import (
    "runtime"
    "sync/atomic"
    "unsafe"
)

// Pool is a lock-free, per-P object pool modeled on sync.Pool's internals.
type Pool[T any] struct {
    local     []poolLocal
    localSize int
    New       func() T
}

type poolLocal struct {
    _       [64]byte // cache-line padding to prevent false sharing
    private unsafe.Pointer
    shared  poolChain
    _       [64]byte
}

type poolChain struct {
    head atomic.Pointer[poolChainElt]
    tail atomic.Pointer[poolChainElt]
}

type poolChainElt struct {
    next  atomic.Pointer[poolChainElt]
    value unsafe.Pointer
}

// pushHead, popHead, and popTail implement the lock-free dequeue on
// poolChain; they are elided here for brevity (sync/poolqueue.go in the
// standard library is the reference implementation).

// NewPool creates a new object pool with one local slot per P.
func NewPool[T any](newFn func() T) *Pool[T] {
    numCPU := runtime.GOMAXPROCS(0)
    return &Pool[T]{
        local:     make([]poolLocal, numCPU),
        localSize: numCPU,
        New:       newFn,
    }
}

// Get retrieves an object from the pool.
func (p *Pool[T]) Get() T {
    // Try the local private slot first (no atomics needed).
    l := p.pin()
    ptr := l.private
    if ptr != nil {
        l.private = nil
        runtime_procUnpin()
        return *(*T)(ptr)
    }
    // Try the local shared queue.
    obj := l.shared.popHead()
    if obj != nil {
        runtime_procUnpin()
        return *(*T)(obj)
    }
    // Try to steal from the other Ps' queues.
    for i := 0; i < p.localSize; i++ {
        other := &p.local[i]
        obj = other.shared.popTail()
        if obj != nil {
            runtime_procUnpin()
            return *(*T)(obj)
        }
    }
    runtime_procUnpin()
    // Nothing pooled anywhere: allocate a fresh object.
    return p.New()
}

// Put returns an object to the pool.
func (p *Pool[T]) Put(x T) {
    l := p.pin()
    if l.private == nil {
        l.private = unsafe.Pointer(&x)
    } else {
        l.shared.pushHead(unsafe.Pointer(&x))
    }
    runtime_procUnpin()
}

// pin pins the calling goroutine to its P and returns that P's poolLocal.
// Every caller must pair it with runtime_procUnpin.
func (p *Pool[T]) pin() *poolLocal {
    pid := runtime_procPin()
    return &p.local[pid%p.localSize]
}

// procPin/procUnpin are runtime internals reached via go:linkname
// (which requires the unsafe import above).

//go:linkname runtime_procPin runtime.procPin
func runtime_procPin() int

//go:linkname runtime_procUnpin runtime.procUnpin
func runtime_procUnpin()
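A minimal usage sketch (assuming an Order type is in scope; the names here are illustrative, not part of the pool API):

// Hypothetical usage of the generic pool for *Order values.
var orderPool = NewPool(func() *Order { return &Order{} })

func process() {
    o := orderPool.Get() // reuses a pooled *Order when one is available
    // ... fill in and match the order ...
    orderPool.Put(o) // return it for the next request
}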
Order Pool Implementation
package orderbook

import (
    "unsafe"
)

const (
    OrderPoolSize = 100000
    TradePoolSize = 10000
)

// OrderPool manages pre-allocated orders.
type OrderPool struct {
    orders []Order
    free   chan int // indices of free orders
    inUse  []bool
}

func NewOrderPool() *OrderPool {
    p := &OrderPool{
        orders: make([]Order, OrderPoolSize),
        free:   make(chan int, OrderPoolSize),
        inUse:  make([]bool, OrderPoolSize),
    }
    // Pre-populate the free list.
    for i := 0; i < OrderPoolSize; i++ {
        p.free <- i
    }
    return p
}

// Acquire gets an order from the pool.
func (p *OrderPool) Acquire() *Order {
    select {
    case idx := <-p.free:
        p.inUse[idx] = true
        order := &p.orders[idx]
        order.Reset()
        return order
    default:
        // Pool exhausted: fall back to a heap allocation (should be rare).
        return &Order{}
    }
}

// Release returns an order to the pool. Orders that were heap-allocated on
// pool exhaustion fail the bounds check below and are simply left to the GC.
func (p *OrderPool) Release(order *Order) {
    // Recover the index from the pointer's offset into the backing array.
    idx := int((uintptr(unsafe.Pointer(order)) - uintptr(unsafe.Pointer(&p.orders[0]))) / unsafe.Sizeof(Order{}))
    if idx >= 0 && idx < OrderPoolSize && p.inUse[idx] {
        p.inUse[idx] = false
        select {
        case p.free <- idx:
        default:
            // Free list full; drop the index.
        }
    }
}

// Reset clears an order for reuse.
func (o *Order) Reset() {
    o.ID = 0
    o.Symbol = ""
    o.Side = 0
    o.Type = 0
    o.Price = 0
    o.Quantity = 0
    o.FilledQty = 0
    o.Status = 0
    o.Timestamp = 0
}
Arena Allocation
Memory Arena for Batch Processing
package arena

import (
    "unsafe"
)

const (
    ArenaSize = 1 << 20 // 1 MB arenas
    MaxArenas = 64
)

// Arena is a bump allocator for temporary allocations. It is not safe for
// concurrent use: give each batch-processing goroutine its own arena.
type Arena struct {
    current []byte
    offset  int
    arenas  [][]byte
}

func NewArena() *Arena {
    a := &Arena{
        arenas: make([][]byte, 0, MaxArenas),
    }
    a.grow()
    return a
}

func (a *Arena) grow() {
    block := make([]byte, ArenaSize)
    a.arenas = append(a.arenas, block)
    a.current = block
    a.offset = 0
}

// Alloc allocates n bytes from the arena. n must not exceed ArenaSize.
func (a *Arena) Alloc(n int) []byte {
    // Align to 8 bytes.
    n = (n + 7) &^ 7
    if a.offset+n > len(a.current) {
        a.grow()
    }
    ptr := a.current[a.offset : a.offset+n]
    a.offset += n
    return ptr
}

// AllocOrder allocates an Order from the arena. The arena's backing memory
// is not scanned for pointers, so arena-allocated objects must not be the
// only reference to heap data (strings, slices, maps).
func (a *Arena) AllocOrder() *Order {
    size := int(unsafe.Sizeof(Order{}))
    ptr := a.Alloc(size)
    return (*Order)(unsafe.Pointer(&ptr[0]))
}

// Reset releases all allocations in one step (bulk free).
func (a *Arena) Reset() {
    // Keep the first arena, release the rest to the GC.
    for i := 1; i < len(a.arenas); i++ {
        a.arenas[i] = nil
    }
    a.arenas = a.arenas[:1]
    a.current = a.arenas[0]
    a.offset = 0
}

// BatchProcessor uses an arena for zero-GC batch processing.
type BatchProcessor struct {
    arena *Arena
}

func (bp *BatchProcessor) ProcessBatch(rawOrders [][]byte) []*Order {
    orders := make([]*Order, len(rawOrders))
    for i, raw := range rawOrders {
        order := bp.arena.AllocOrder()
        decodeOrder(raw, order) // decodes in place, no allocation
        orders[i] = order
    }
    return orders
}

func (bp *BatchProcessor) EndBatch() {
    bp.arena.Reset() // bulk-free every order from the batch
}
Slab Allocator
Fixed-Size Slab Allocator
package slab

import (
    "sync"
    "unsafe"
)

// Slab allocates fixed-size objects efficiently.
type Slab struct {
    objectSize int
    slabSize   int
    slabs      [][]byte
    freeList   *freeNode
    mu         sync.Mutex
}

// freeNode threads the free list through the objects themselves,
// so bookkeeping costs no extra memory.
type freeNode struct {
    next *freeNode
}

func NewSlab(objectSize, objectsPerSlab int) *Slab {
    // Ensure the object is large enough to hold the free-list pointer.
    if objectSize < int(unsafe.Sizeof(freeNode{})) {
        objectSize = int(unsafe.Sizeof(freeNode{}))
    }
    s := &Slab{
        objectSize: objectSize,
        slabSize:   objectSize * objectsPerSlab,
    }
    s.grow()
    return s
}

func (s *Slab) grow() {
    slab := make([]byte, s.slabSize)
    s.slabs = append(s.slabs, slab)
    // Thread every object in the new slab onto the free list.
    for i := 0; i < len(slab); i += s.objectSize {
        node := (*freeNode)(unsafe.Pointer(&slab[i]))
        node.next = s.freeList
        s.freeList = node
    }
}

// Alloc returns a pointer to a zeroed object.
func (s *Slab) Alloc() unsafe.Pointer {
    s.mu.Lock()
    defer s.mu.Unlock()
    if s.freeList == nil {
        s.grow()
    }
    ptr := unsafe.Pointer(s.freeList)
    s.freeList = s.freeList.next
    // Zero the object before handing it out.
    for i := 0; i < s.objectSize; i++ {
        *(*byte)(unsafe.Add(ptr, i)) = 0
    }
    return ptr
}

// Free returns an object to the slab.
func (s *Slab) Free(ptr unsafe.Pointer) {
    s.mu.Lock()
    defer s.mu.Unlock()
    node := (*freeNode)(ptr)
    node.next = s.freeList
    s.freeList = node
}

// Usage example
var orderSlab = NewSlab(int(unsafe.Sizeof(Order{})), 10000)

func AllocOrder() *Order {
    return (*Order)(orderSlab.Alloc())
}

func FreeOrder(o *Order) {
    orderSlab.Free(unsafe.Pointer(o))
}
GC Tuning
Memory Ballast
package main

import (
    "log"
    "runtime"
    "runtime/debug"
)

// Memory ballast reduces GC frequency by making the heap appear larger.
// The slice is never touched, so it costs virtual memory only.
var ballast []byte

func init() {
    // 10 GB ballast: the GC now triggers only after far more real
    // allocation, since the pacer scales off total heap size.
    ballast = make([]byte, 10<<30)
    // Set GOGC and GOMEMLIMIT programmatically.
    debug.SetGCPercent(400)        // GC at 400% heap growth (less frequent)
    debug.SetMemoryLimit(32 << 30) // 32 GB soft limit (Go 1.19+)
}

func main() {
    // Verify settings.
    var stats runtime.MemStats
    runtime.ReadMemStats(&stats)
    log.Printf("Heap: %d MB, GC runs: %d",
        stats.HeapAlloc>>20,
        stats.NumGC,
    )
}
GC Pacing Control
package gc

import (
    "runtime"
    "runtime/debug"
    "time"
)

// GCController manages garbage collection timing.
type GCController struct {
    targetPause time.Duration
    maxHeap     uint64
}

func NewGCController(targetPause time.Duration, maxHeap uint64) *GCController {
    return &GCController{
        targetPause: targetPause,
        maxHeap:     maxHeap,
    }
}

func (c *GCController) Start() {
    go c.monitor()
}

func (c *GCController) monitor() {
    ticker := time.NewTicker(100 * time.Millisecond)
    defer ticker.Stop()
    for range ticker.C {
        var stats runtime.MemStats
        runtime.ReadMemStats(&stats)
        // Calculate GC overhead.
        if stats.NumGC > 0 {
            avgPause := time.Duration(stats.PauseTotalNs / uint64(stats.NumGC))
            if avgPause > c.targetPause {
                // Increase GOGC to reduce frequency, capped at 1000.
                currentGOGC := debug.SetGCPercent(-1) // returns the previous setting
                newGOGC := int(float64(currentGOGC) * 1.1)
                if newGOGC > 1000 {
                    newGOGC = 1000
                }
                debug.SetGCPercent(newGOGC)
            }
        }
        // Force GC if approaching the heap limit.
        if stats.HeapAlloc > c.maxHeap*90/100 {
            runtime.GC()
        }
    }
}

// TriggerGCDuringLull forces GC during quiet periods.
func (c *GCController) TriggerGCDuringLull(activeOrders int) {
    if activeOrders < 100 {
        // Market is quiet, a good time for GC.
        runtime.GC()
    }
}
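Wiring the controller into the engine (sketch; the pause target, heap cap, and the ActiveOrders hook are illustrative, not part of the API above):

func main() {
    ctrl := NewGCController(500*time.Microsecond, 16<<30) // 500 µs target, 16 GB cap
    ctrl.Start()
    // From the matching loop, during quiet periods:
    // ctrl.TriggerGCDuringLull(book.ActiveOrders())
}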
GC-Free Data Structures
package gcfree

import (
    "syscall"
    "unsafe"
)

const entrySize = 128 // 8-byte key + 8-byte length + up to 112 bytes of value

// GCFreeMap keeps entries in mmap'd memory that the GC never scans.
// It is a direct-mapped table: a key hashing to an occupied slot
// overwrites it. Unix-only (syscall.Mmap).
type GCFreeMap struct {
    data []byte // mmap'd memory, invisible to the GC
    size int
}

func NewGCFreeMap(maxEntries int) (*GCFreeMap, error) {
    size := maxEntries * entrySize
    // Map anonymous memory outside the Go heap.
    data, err := syscall.Mmap(
        -1, 0, size,
        syscall.PROT_READ|syscall.PROT_WRITE,
        syscall.MAP_ANON|syscall.MAP_PRIVATE,
    )
    if err != nil {
        return nil, err
    }
    return &GCFreeMap{
        data: data,
        size: size,
    }, nil
}

func (m *GCFreeMap) Set(key uint64, value []byte) {
    if len(value) > entrySize-16 {
        value = value[:entrySize-16] // truncate: at most 112 value bytes per entry
    }
    offset := int(key%uint64(m.size/entrySize)) * entrySize
    // Write key.
    *(*uint64)(unsafe.Pointer(&m.data[offset])) = key
    // Write length.
    *(*int)(unsafe.Pointer(&m.data[offset+8])) = len(value)
    // Write value.
    copy(m.data[offset+16:offset+entrySize], value)
}

func (m *GCFreeMap) Get(key uint64) ([]byte, bool) {
    offset := int(key%uint64(m.size/entrySize)) * entrySize
    storedKey := *(*uint64)(unsafe.Pointer(&m.data[offset]))
    if storedKey != key {
        return nil, false
    }
    length := *(*int)(unsafe.Pointer(&m.data[offset+8]))
    // Copy the value out; callers needing zero-alloc reads can slice in place.
    value := make([]byte, length)
    copy(value, m.data[offset+16:offset+16+length])
    return value, true
}

// Close unmaps the memory; the map must not be used afterwards.
func (m *GCFreeMap) Close() error {
    return syscall.Munmap(m.data)
}
Memory Profiling
# Heap profile
go test -memprofile=mem.prof -bench=BenchmarkOrderBook ./pkg/lx/
go tool pprof -http=:8080 mem.prof

# Allocation tracking (sample every allocation)
go test -memprofilerate=1 -bench=BenchmarkOrderBook ./pkg/lx/

# Live heap profile
curl http://localhost:6060/debug/pprof/heap > heap.prof
go tool pprof -http=:8080 heap.prof

# GC trace
GODEBUG=gctrace=1 ./lxd 2>&1 | head -20
# Output: gc 1 @0.012s 1%: 0.010+0.33+0.010 ms clock, 0.081+0.10/0.33/0+0.085 ms cpu, 4->4->0 MB, 5 MB goal, 8 P
Memory Benchmarks
var sink *Order // package-level sink defeats escape-analysis elision

func BenchmarkOrderAllocation(b *testing.B) {
    b.Run("HeapAlloc", func(b *testing.B) {
        b.ReportAllocs()
        for i := 0; i < b.N; i++ {
            sink = &Order{} // escapes via sink, forcing a real heap allocation
        }
    })
    b.Run("PoolAlloc", func(b *testing.B) {
        pool := NewOrderPool()
        b.ReportAllocs()
        b.ResetTimer()
        for i := 0; i < b.N; i++ {
            order := pool.Acquire()
            pool.Release(order)
        }
    })
    b.Run("ArenaAlloc", func(b *testing.B) {
        arena := NewArena()
        b.ReportAllocs()
        b.ResetTimer()
        for i := 0; i < b.N; i++ {
            sink = arena.AllocOrder()
            if i%1000 == 999 {
                arena.Reset()
            }
        }
    })
}
Results
BenchmarkOrderAllocation/HeapAlloc-16 5234567 228 ns/op 128 B/op 1 allocs/op
BenchmarkOrderAllocation/PoolAlloc-16 23456789 51 ns/op 0 B/op 0 allocs/op
BenchmarkOrderAllocation/ArenaAlloc-16 34567890 34 ns/op 0 B/op 0 allocs/op
Memory Usage (100K active orders):
Heap allocation: 1.2 GB
Pool allocation: 847 MB
Arena allocation: 756 MB
GC Pauses:
Without tuning: P99 = 5.2 ms
With ballast: P99 = 0.3 ms
With GCFreeMap: P99 = 0.1 ms