This guide covers advanced optimization techniques for Coyote ECS, including SIMD operations, vectorization, and other performance enhancements.
SIMD (Single Instruction, Multiple Data) allows you to process multiple data points in parallel, which can significantly improve performance for certain operations.
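Zig exposes SIMD directly through the built-in @Vector type, and arithmetic on vectors operates on all lanes at once. As a minimal standalone illustration (independent of Coyote ECS):
const std = @import("std");

pub fn main() void {
    const a = @Vector(4, f32){ 1.0, 2.0, 3.0, 4.0 };
    const b = @Vector(4, f32){ 10.0, 20.0, 30.0, 40.0 };
    // A single vector add updates all four lanes at once.
    const sum = a + b;
    // Convert back to an ordinary array for printing.
    const result: [4]f32 = sum;
    std.debug.print("{any}\n", .{result});
}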
Components with multiple similar fields (like Position with x and y) can be vectorized using Zig’s SIMD types:
const std = @import("std");
const Vec2 = @Vector(2, f32);

pub const Components = struct {
    pub const Position = struct {
        data: Vec2 = Vec2{ 0, 0 },

        pub fn init(x: f32, y: f32) Position {
            return Position{ .data = Vec2{ x, y } };
        }

        pub fn add(self: *Position, other: Position) void {
            // Both lanes (x and y) are added in a single vector operation.
            self.data += other.data;
        }

        pub fn scale(self: *Position, factor: f32) void {
            // @splat broadcasts the scalar into every lane of the vector.
            self.data *= @as(Vec2, @splat(factor));
        }
    };
};
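As a quick usage sketch of the vectorized component on its own, outside the ECS:
var a = Components.Position.init(1.0, 2.0);
const b = Components.Position.init(3.0, 4.0);
a.add(b); // a.data is now { 4.0, 6.0 }
a.scale(0.5); // a.data is now { 2.0, 3.0 }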
When iterating over components, SIMD lets you update every field of a component in a single vector operation rather than one field at a time:
pub fn UpdatePositions(world: *World, delta: f32) void {
    var it = world.components.iteratorFilter(Components.Position);
    // Broadcast the scalar delta into both lanes once, outside the loop.
    const delta_vec: Vec2 = @splat(delta);
    while (it.next()) |component| {
        var pos = component.get(Components.Position);
        pos.data += delta_vec;
    }
}
For systems that apply the same update to many components of one type, you can hoist the constant vector out of the loop and add it to each component with a single vector operation (this assumes a Velocity component defined with a Vec2 data field, like Position above):
pub fn UpdateVelocities(world: *World, gravity: f32) void {
    var it = world.components.iteratorFilter(Components.Velocity);
    const gravity_vec = Vec2{ 0, gravity };
    while (it.next()) |component| {
        var vel = component.get(Components.Velocity);
        vel.data += gravity_vec;
    }
}
Coyote ECS provides an iteratorFilterRange function that lets you specify a range of components to iterate over, which makes it straightforward to process components in chunks and hand those chunks to worker threads.
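As a single-threaded warm-up, here is a hypothetical helper that only touches the first hundred Position components (the start/end argument form matches the parallel example below):
pub fn UpdateFirstHundred(world: *World, delta: f32) void {
    // Only visits Position components with indices in [0, 100).
    var it = world.components.iteratorFilterRange(Components.Position, 0, 100);
    const delta_vec: Vec2 = @splat(delta);
    while (it.next()) |component| {
        var pos = component.get(Components.Position);
        pos.data += delta_vec;
    }
}
Combining ranged iteration with std.Thread lets you hand each chunk to its own worker thread: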
pub fn UpdatePositionsParallel(world: *World, delta_time: f32) void {
    const thread_count = std.Thread.getCpuCount() catch 1;
    const component_count = world.components.count(Components.Position);
    // Round up so every component falls into exactly one chunk.
    const chunk_size = (component_count + thread_count - 1) / thread_count;

    const threads = std.heap.page_allocator.alloc(std.Thread, thread_count) catch return;
    defer std.heap.page_allocator.free(threads);

    var spawned: usize = 0;
    while (spawned < thread_count) : (spawned += 1) {
        const start = spawned * chunk_size;
        const end = @min(start + chunk_size, component_count);
        threads[spawned] = std.Thread.spawn(.{}, struct {
            fn updateChunk(w: *World, start_idx: usize, end_idx: usize, dt: f32) void {
                var it = w.components.iteratorFilterRange(Components.Position, start_idx, end_idx);
                const dt_vec: Vec2 = @splat(dt);
                while (it.next()) |component| {
                    var pos = component.get(Components.Position);
                    pos.data += dt_vec;
                }
            }
        }.updateChunk, .{ world, start, end, delta_time }) catch break;
    }

    // Join only the threads that were actually spawned.
    for (threads[0..spawned]) |thread| {
        thread.join();
    }
}
For maximum performance, you can combine SIMD operations with parallel processing:
pub fn UpdatePhysicsParallel(world: *World, delta_time: f32) void {
    const thread_count = std.Thread.getCpuCount() catch 1;
    const component_count = world.components.count(Components.Position);
    const chunk_size = (component_count + thread_count - 1) / thread_count;

    const threads = std.heap.page_allocator.alloc(std.Thread, thread_count) catch return;
    defer std.heap.page_allocator.free(threads);

    var spawned: usize = 0;
    while (spawned < thread_count) : (spawned += 1) {
        const start = spawned * chunk_size;
        const end = @min(start + chunk_size, component_count);
        threads[spawned] = std.Thread.spawn(.{}, struct {
            fn updatePhysicsChunk(w: *World, start_idx: usize, end_idx: usize, dt: f32) void {
                // Process positions with SIMD
                var pos_it = w.components.iteratorFilterRange(Components.Position, start_idx, end_idx);
                const dt_vec: Vec2 = @splat(dt);
                while (pos_it.next()) |component| {
                    var pos = component.get(Components.Position);
                    pos.data += dt_vec;
                }
                // Process velocities with SIMD
                var vel_it = w.components.iteratorFilterRange(Components.Velocity, start_idx, end_idx);
                const gravity_vec = Vec2{ 0, 9.8 * dt };
                while (vel_it.next()) |component| {
                    var vel = component.get(Components.Velocity);
                    vel.data += gravity_vec;
                }
            }
        }.updatePhysicsChunk, .{ world, start, end, delta_time }) catch break;
    }

    // Join only the threads that were actually spawned.
    for (threads[0..spawned]) |thread| {
        thread.join();
    }
}
Coyote ECS currently uses an Array of Structures (AoS) approach for component storage. For SIMD operations, a Structure of Arrays (SoA) approach can be more efficient:
// Current AoS approach
pub const Position = struct {
    x: f32 = 0,
    y: f32 = 0,
};

// SoA approach for SIMD
pub const PositionStorage = struct {
    xs: []f32,
    ys: []f32,

    pub fn init(allocator: std.mem.Allocator, capacity: usize) !PositionStorage {
        return PositionStorage{
            .xs = try allocator.alloc(f32, capacity),
            .ys = try allocator.alloc(f32, capacity),
        };
    }

    pub fn deinit(self: *PositionStorage, allocator: std.mem.Allocator) void {
        allocator.free(self.xs);
        allocator.free(self.ys);
    }

    pub fn updateAll(self: *PositionStorage, count: usize, delta_x: f32, delta_y: f32) void {
        const Vec4 = @Vector(4, f32);
        const delta_x_vec: Vec4 = @splat(delta_x);
        const delta_y_vec: Vec4 = @splat(delta_y);
        var i: usize = 0;
        // Process four positions per iteration by loading them into vectors.
        while (i + 4 <= count) : (i += 4) {
            var x_vec: Vec4 = self.xs[i..][0..4].*;
            var y_vec: Vec4 = self.ys[i..][0..4].*;
            x_vec += delta_x_vec;
            y_vec += delta_y_vec;
            self.xs[i..][0..4].* = x_vec;
            self.ys[i..][0..4].* = y_vec;
        }
        // Handle remaining elements
        while (i < count) : (i += 1) {
            self.xs[i] += delta_x;
            self.ys[i] += delta_y;
        }
    }
};
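A minimal usage sketch of PositionStorage on its own, not wired into Coyote ECS's storage:
test "PositionStorage batch update" {
    const allocator = std.testing.allocator;
    var storage = try PositionStorage.init(allocator, 1024);
    defer storage.deinit(allocator);

    @memset(storage.xs, 0);
    @memset(storage.ys, 0);

    // Move the first 1000 positions by (1.5, -0.5) in batches of four.
    storage.updateAll(1000, 1.5, -0.5);
    try std.testing.expectEqual(@as(f32, 1.5), storage.xs[999]);
}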
To implement SoA in Coyote ECS, you would need to modify the component storage system:
// Example of how SoA could be integrated into Coyote ECS
pub const ComponentStorage = struct {
    // For each component type, store arrays of each field
    position_xs: []f32,
    position_ys: []f32,
    velocity_xs: []f32,
    velocity_ys: []f32,
    // ... other component fields

    pub fn updatePositions(self: *ComponentStorage, count: usize, delta_time: f32) void {
        const Vec4 = @Vector(4, f32);
        const dt_vec: Vec4 = @splat(delta_time);
        var i: usize = 0;
        // Integrate four positions per iteration: position += velocity * dt.
        while (i + 4 <= count) : (i += 4) {
            const vel_x_vec: Vec4 = self.velocity_xs[i..][0..4].*;
            const vel_y_vec: Vec4 = self.velocity_ys[i..][0..4].*;
            var pos_x_vec: Vec4 = self.position_xs[i..][0..4].*;
            var pos_y_vec: Vec4 = self.position_ys[i..][0..4].*;
            pos_x_vec += vel_x_vec * dt_vec;
            pos_y_vec += vel_y_vec * dt_vec;
            self.position_xs[i..][0..4].* = pos_x_vec;
            self.position_ys[i..][0..4].* = pos_y_vec;
        }
        // Handle remaining elements
        while (i < count) : (i += 1) {
            self.position_xs[i] += self.velocity_xs[i] * delta_time;
            self.position_ys[i] += self.velocity_ys[i] * delta_time;
        }
    }
};
For optimal SIMD performance, ensure your component data is properly aligned:
// Ensure 16-byte alignment for SIMD operations
pub const AlignedPosition = struct {
    data: Vec2 align(16) = Vec2{ 0, 0 },
};
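Alignment matters most when you reinterpret raw float storage as vectors in place instead of copying. Here is a rough sketch; the addInPlace name and the align(16) slice parameter are assumptions about how the backing storage would be allocated:
const Vec4 = @Vector(4, f32);

// Adds `delta` to every element of a 16-byte-aligned float slice in place.
pub fn addInPlace(xs: []align(16) f32, delta: f32) void {
    const delta_vec: Vec4 = @splat(delta);
    var i: usize = 0;
    while (i + 4 <= xs.len) : (i += 4) {
        // The slice is 16-byte aligned and i advances in steps of four floats
        // (16 bytes), so each block can be cast to a vector pointer directly.
        const block: *Vec4 = @ptrCast(@alignCast(xs[i..][0..4]));
        block.* += delta_vec;
    }
    // Handle remaining elements
    while (i < xs.len) : (i += 1) xs[i] += delta;
}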
To measure the performance improvement from SIMD, time a scalar update against a vectorized one over the same set of entities:
pub fn benchmarkSimd() void {
    var world = World.create() catch return;
    defer world.deinit();

    // Create test entities
    var i: usize = 0;
    while (i < 1000000) : (i += 1) {
        var entity = world.entities.create() catch continue;
        var position = world.components.create(Components.Position) catch continue;
        entity.attach(position, Components.Position.init(0, 0)) catch continue;
    }

    // Benchmark non-SIMD
    const start1 = std.time.nanoTimestamp();
    UpdatePositionsNonSimd(&world, 0.016);
    const end1 = std.time.nanoTimestamp();
    const non_simd_time = @as(f64, @floatFromInt(end1 - start1)) / 1_000_000.0;

    // Benchmark SIMD
    const start2 = std.time.nanoTimestamp();
    UpdatePositionsSimd(&world, 0.016);
    const end2 = std.time.nanoTimestamp();
    const simd_time = @as(f64, @floatFromInt(end2 - start2)) / 1_000_000.0;

    std.debug.print("Non-SIMD: {d:.2}ms, SIMD: {d:.2}ms, Speedup: {d:.2}x\n", .{ non_simd_time, simd_time, non_simd_time / simd_time });
}
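The two update functions called by the benchmark are not defined earlier in this guide. One way they might look, as a sketch that reuses the same iterator API used throughout: the scalar version updates each lane of the position separately, while the SIMD version adds both lanes in one vector operation.
pub fn UpdatePositionsNonSimd(world: *World, delta: f32) void {
    var it = world.components.iteratorFilter(Components.Position);
    while (it.next()) |component| {
        var pos = component.get(Components.Position);
        // Scalar path: each lane is updated with a separate add.
        pos.data[0] += delta;
        pos.data[1] += delta;
    }
}

pub fn UpdatePositionsSimd(world: *World, delta: f32) void {
    var it = world.components.iteratorFilter(Components.Position);
    const delta_vec: Vec2 = @splat(delta);
    while (it.next()) |component| {
        var pos = component.get(Components.Position);
        // Vector path: both lanes are updated in one operation.
        pos.data += delta_vec;
    }
}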