From 124ea12d2e0d73094e57b8843ce22b0f3f230447 Mon Sep 17 00:00:00 2001
From: Kbz-8 <kbz_8.code@proton.me>
Date: Thu, 14 May 2026 00:23:46 +0200
Subject: [PATCH] fixing slow memory leak

---
 src/soft/SoftPipeline.zig                    | 116 +++++------
 src/soft/SoftQueue.zig                       |   3 +-
 src/soft/device/ComputeDispatcher.zig        |   4 -
 src/soft/device/Device.zig                   |   7 +-
 src/soft/device/Renderer.zig                 |  24 ++-
 src/soft/device/clip.zig                     | 202 +++++++++----------
 src/soft/device/rasterizer.zig               |   2 +-
 src/soft/device/rasterizer/bresenham.zig     |   2 +-
 src/soft/device/rasterizer/common.zig        |  17 +-
 src/soft/device/rasterizer/edge_function.zig |   3 +-
 src/vulkan/lib.zig                           |   1 +
 src/vulkan/utils.zig                         |   4 +
 12 files changed, 185 insertions(+), 200 deletions(-)
 create mode 100644 src/vulkan/utils.zig

diff --git a/src/soft/SoftPipeline.zig b/src/soft/SoftPipeline.zig
index 906b285..3e68d56 100644
--- a/src/soft/SoftPipeline.zig
+++ b/src/soft/SoftPipeline.zig
@@ -54,9 +54,13 @@ pub fn createCompute(device: *base.Device, allocator: std.mem.Allocator, cache:
 
     const device_allocator = soft_device.device_allocator.allocator();
 
-    var runtimes_allocator_arena: std.heap.ArenaAllocator = .init(device_allocator);
-    errdefer runtimes_allocator_arena.deinit();
-    const runtimes_allocator = runtimes_allocator_arena.allocator();
+    self.* = .{
+        .interface = interface,
+        .runtimes_allocator = .init(device_allocator),
+        .stages = std.EnumMap(Stages, Shader).init(.{}),
+    };
+    errdefer self.runtimes_allocator.deinit();
+    const runtimes_allocator = self.runtimes_allocator.allocator();
 
     const instance: *SoftInstance = @alignCast(@fieldParentPtr("interface", device.instance));
     const runtimes_count = switch (instance.threaded.async_limit) {
@@ -68,57 +72,51 @@ pub fn createCompute(device: *base.Device, allocator: std.mem.Allocator, cache:
         },
     };
 
-    self.* = .{
-        .interface = interface,
-        .runtimes_allocator = runtimes_allocator_arena,
-        .stages = std.EnumMap(Stages, Shader).init(.{
-            .compute = blk: {
-                var shader: Shader = undefined;
-                soft_module.ref();
-                shader.module = soft_module;
+    self.stages.put(.compute, blk: {
+        var shader: Shader = undefined;
+        soft_module.ref();
+        shader.module = soft_module;
 
-                const runtimes = runtimes_allocator.alloc(spv.Runtime, runtimes_count) catch return VkError.OutOfDeviceMemory;
+        const runtimes = runtimes_allocator.alloc(spv.Runtime, runtimes_count) catch return VkError.OutOfDeviceMemory;
 
-                for (runtimes) |*runtime| {
-                    runtime.* = spv.Runtime.init(
-                        runtimes_allocator,
-                        &soft_module.module,
-                        .{
-                            .readImageFloat4 = readImageFloat4,
-                            .readImageInt4 = readImageInt4,
-                            .writeImageFloat4 = writeImageFloat4,
-                            .writeImageInt4 = writeImageInt4,
-                        },
-                    ) catch |err| {
-                        std.log.scoped(.SpvRuntimeInit).err("SPIR-V Runtime failed to initialize, {s}", .{@errorName(err)});
-                        return VkError.Unknown;
-                    };
-                    if (info.stage.p_specialization_info) |specialization| {
-                        if (specialization.p_map_entries) |map| {
-                            const data: []const u8 = @as([*]const u8, @ptrCast(@alignCast(specialization.p_data)))[0..specialization.data_size];
-                            for (map[0..], 0..specialization.map_entry_count) |entry, _| {
-                                runtime.addSpecializationInfo(
-                                    runtimes_allocator,
-                                    .{
-                                        .id = @intCast(entry.constant_id),
-                                        .offset = @intCast(entry.offset),
-                                        .size = @intCast(entry.size),
-                                    },
-                                    data,
-                                ) catch return VkError.OutOfDeviceMemory;
-                            }
-                        }
+        for (runtimes) |*runtime| {
+            runtime.* = spv.Runtime.init(
+                runtimes_allocator,
+                &soft_module.module,
+                .{
+                    .readImageFloat4 = readImageFloat4,
+                    .readImageInt4 = readImageInt4,
+                    .writeImageFloat4 = writeImageFloat4,
+                    .writeImageInt4 = writeImageInt4,
+                },
+            ) catch |err| {
+                std.log.scoped(.SpvRuntimeInit).err("SPIR-V Runtime failed to initialize, {s}", .{@errorName(err)});
+                return VkError.Unknown;
+            };
+            if (info.stage.p_specialization_info) |specialization| {
+                if (specialization.p_map_entries) |map| {
+                    const data: []const u8 = @as([*]const u8, @ptrCast(@alignCast(specialization.p_data)))[0..specialization.data_size];
+                    for (map[0..], 0..specialization.map_entry_count) |entry, _| {
+                        runtime.addSpecializationInfo(
+                            runtimes_allocator,
+                            .{
+                                .id = @intCast(entry.constant_id),
+                                .offset = @intCast(entry.offset),
+                                .size = @intCast(entry.size),
+                            },
+                            data,
+                        ) catch return VkError.OutOfDeviceMemory;
                     }
                 }
+            }
+        }
 
-                shader.runtimes = runtimes;
-                shader.entry = runtimes_allocator.dupe(u8, std.mem.span(info.stage.p_name)) catch return VkError.OutOfDeviceMemory;
+        shader.runtimes = runtimes;
+        shader.entry = runtimes_allocator.dupe(u8, std.mem.span(info.stage.p_name)) catch return VkError.OutOfDeviceMemory;
 
-                std.log.scoped(.ComputePipeline).debug("Created {d} runtimes for compute stage", .{runtimes_count});
-                break :blk shader;
-            },
-        }),
-    };
+        std.log.scoped(.ComputePipeline).debug("Created {d} runtimes for compute stage", .{runtimes_count});
+        break :blk shader;
+    });
     return self;
 }
 
@@ -135,9 +133,13 @@ pub fn createGraphics(device: *base.Device, allocator: std.mem.Allocator, cache:
     const soft_device: *SoftDevice = @alignCast(@fieldParentPtr("interface", device));
     const device_allocator = soft_device.device_allocator.allocator();
 
-    var runtimes_allocator_arena: std.heap.ArenaAllocator = .init(device_allocator);
-    errdefer runtimes_allocator_arena.deinit();
-    const runtimes_allocator = runtimes_allocator_arena.allocator();
+    self.* = .{
+        .interface = interface,
+        .runtimes_allocator = .init(device_allocator),
+        .stages = std.EnumMap(Stages, Shader).init(.{}),
+    };
+    errdefer self.runtimes_allocator.deinit();
+    const runtimes_allocator = self.runtimes_allocator.allocator();
 
     const instance: *SoftInstance = @alignCast(@fieldParentPtr("interface", device.instance));
     const runtimes_count = switch (instance.threaded.async_limit) {
@@ -149,12 +151,6 @@ pub fn createGraphics(device: *base.Device, allocator: std.mem.Allocator, cache:
         },
     };
 
-    self.* = .{
-        .interface = interface,
-        .runtimes_allocator = runtimes_allocator_arena,
-        .stages = std.EnumMap(Stages, Shader).init(.{}),
-    };
-
     if (info.p_stages) |stages| {
         for (stages[0..], 0..info.stage_count) |stage, _| {
             var shader: Shader = undefined;
@@ -228,9 +224,15 @@ pub fn createGraphics(device: *base.Device, allocator: std.mem.Allocator, cache:
 
 pub fn destroy(interface: *Interface, allocator: std.mem.Allocator) void {
     const self: *Self = @alignCast(@fieldParentPtr("interface", interface));
+    const soft_device: *SoftDevice = @alignCast(@fieldParentPtr("interface", interface.owner));
+    const device_allocator = soft_device.device_allocator.allocator();
+
     var it = self.stages.iterator();
     while (it.next()) |entry| {
         entry.value.module.unref(allocator);
+        for (entry.value.runtimes) |*rt| {
+            rt.function_stack.clearAndFree(device_allocator); // Hacky to avoid leaks
+        }
     }
     self.runtimes_allocator.deinit();
     allocator.destroy(self);
diff --git a/src/soft/SoftQueue.zig b/src/soft/SoftQueue.zig
index 2e0c47e..80d671c 100644
--- a/src/soft/SoftQueue.zig
+++ b/src/soft/SoftQueue.zig
@@ -95,8 +95,7 @@ fn taskRunner(self: *Self, info: Interface.SubmitInfo, p_fence: ?*base.Fence, ru
     }
 
     var execution_device: ExecutionDevice = undefined;
-    execution_device.init(soft_device);
-    defer execution_device.deinit();
+    execution_device.setup(soft_device);
 
     for (info.command_buffers.items) |command_buffer| {
         const soft_command_buffer: *SoftCommandBuffer = @alignCast(@fieldParentPtr("interface", command_buffer));
diff --git a/src/soft/device/ComputeDispatcher.zig b/src/soft/device/ComputeDispatcher.zig
index 6911fa3..2e9a806 100644
--- a/src/soft/device/ComputeDispatcher.zig
+++ b/src/soft/device/ComputeDispatcher.zig
@@ -45,10 +45,6 @@ pub fn init(device: *SoftDevice, state: *PipelineState) Self {
     };
 }
 
-pub fn deinit(self: *Self) void {
-    _ = self;
-}
-
 pub fn dispatch(self: *Self, group_count_x: u32, group_count_y: u32, group_count_z: u32) VkError!void {
     const group_count: usize = @intCast(group_count_x * group_count_y * group_count_z);
 
diff --git a/src/soft/device/Device.zig b/src/soft/device/Device.zig
index 3e2f0d5..98db38b 100644
--- a/src/soft/device/Device.zig
+++ b/src/soft/device/Device.zig
@@ -39,7 +39,7 @@ pipeline_states: [2]PipelineState,
 
 /// Initializating an execution device and
 /// not creating one to avoid dangling pointers
-pub fn init(self: *Self, device: *SoftDevice) void {
+pub fn setup(self: *Self, device: *SoftDevice) void {
     for (self.pipeline_states[0..], 0..) |*state, i| {
         state.* = .{
             .pipeline = null,
@@ -60,8 +60,3 @@ pub fn init(self: *Self, device: *SoftDevice) void {
     self.compute = .init(device, &self.pipeline_states[@intFromEnum(vk.PipelineBindPoint.compute)]);
     self.renderer = .init(device, &self.pipeline_states[@intFromEnum(vk.PipelineBindPoint.graphics)]);
 }
-
-pub fn deinit(self: *Self) void {
-    self.compute.deinit();
-    self.renderer.deinit();
-}
diff --git a/src/soft/device/Renderer.zig b/src/soft/device/Renderer.zig
index a7d2b39..9242f51 100644
--- a/src/soft/device/Renderer.zig
+++ b/src/soft/device/Renderer.zig
@@ -59,7 +59,7 @@ pub const DrawCall = struct {
     viewport: vk.Viewport,
     scissor: vk.Rect2D,
 
-    pub fn init(allocator: std.mem.Allocator, vertex_count: usize, instance_count: usize, renderer: *Self) VkError!@This() {
+    fn init(allocator: std.mem.Allocator, vertex_count: usize, instance_count: usize, renderer: *Self) VkError!@This() {
         const self: @This() = .{
             .vertices = allocator.alloc(Vertex, vertex_count * instance_count) catch return VkError.OutOfDeviceMemory,
             .renderer = renderer,
@@ -73,6 +73,17 @@ pub const DrawCall = struct {
 
         return self;
     }
+
+    fn deinit(self: *@This(), allocator: std.mem.Allocator) void {
+        for (self.vertices) |*vertex| {
+            for (0..spv.SPIRV_MAX_OUTPUT_LOCATIONS) |location| {
+                if (vertex.outputs[location]) |output| {
+                    allocator.free(output.blob);
+                }
+            }
+        }
+        allocator.free(self.vertices);
+    }
 };
 
 device: *SoftDevice,
@@ -96,10 +107,6 @@ pub fn init(device: *SoftDevice, state: *PipelineState) Self {
     };
 }
 
-pub fn deinit(self: *Self) void {
-    _ = self;
-}
-
 pub fn draw(self: *Self, vertex_count: usize, instance_count: usize, first_vertex: usize, first_instance: usize) VkError!void {
     var bounded_allocator: BoundedAllocator = .init(self.device.device_allocator.allocator(), @"1GiB");
     try self.drawCall(&bounded_allocator, vertex_count, instance_count, first_vertex, first_instance, null);
@@ -119,17 +126,18 @@ fn drawCall(self: *Self, bounded_allocator: *BoundedAllocator, vertex_count: usi
     const allocator = bounded_allocator.allocator();
 
     var draw_call = try DrawCall.init(allocator, vertex_count, instance_count, self);
+    defer draw_call.deinit(allocator);
 
     const timer = std.Io.Timestamp.now(io, .real);
     defer if (comptime base.config.logs != .none) {
         const duration = timer.untilNow(io, .real);
-        const ms = duration.toMicroseconds();
+        const ms: f32 = @floatFromInt(duration.toMicroseconds());
         const memory_footprint = @divTrunc(bounded_allocator.queryFootprint(), 1000);
         const logger = std.log.scoped(.SoftwareRenderer);
         if (memory_footprint > 256_000)
-            logger.warn("Drawcall stats:\n>   Took {d}us\n>   Allocated {d} KB", .{ ms, memory_footprint })
+            logger.warn("Drawcall stats:\n>   Took {d:.3}ms\n>   Allocated {d} KB", .{ ms / 1000, memory_footprint })
         else
-            logger.debug("Drawcall stats:\n>   Took {d}us\n>   Allocated {d} KB", .{ ms, memory_footprint });
+            logger.debug("Drawcall stats:\n>   Took {d:.3}ms\n>   Allocated {d} KB", .{ ms / 1000, memory_footprint });
     };
 
     self.vertexShaderStage(allocator, &draw_call, vertex_count, instance_count, first_vertex, first_instance, indices) catch |err| {
diff --git a/src/soft/device/clip.zig b/src/soft/device/clip.zig
index 7de6999..7457611 100644
--- a/src/soft/device/clip.zig
+++ b/src/soft/device/clip.zig
@@ -2,7 +2,6 @@ const std = @import("std");
 const vk = @import("vulkan");
 const base = @import("base");
 const zm = base.zm;
-const lib = @import("../lib.zig");
 const spv = @import("spv");
 
 pub const F32x4 = zm.F32x4;
@@ -36,114 +35,6 @@ const ClippedPolygon = struct {
     }
 };
 
-fn clipDistance(position: F32x4, plane: ClipPlane) f32 {
-    const x = position[0];
-    const y = position[1];
-    const z = position[2];
-    const w = position[3];
-
-    return switch (plane) {
-        .Left => x + w,
-        .Right => w - x,
-        .Bottom => y + w,
-        .Top => w - y,
-        .Near => z,
-        .Far => w - z,
-    };
-}
-
-fn vertexInsidePlane(vertex: *const Vertex, plane: ClipPlane) bool {
-    return clipDistance(vertex.position, plane) >= 0.0;
-}
-
-fn copyBlob(allocator: std.mem.Allocator, blob: []const u8) VkError![]u8 {
-    const result = allocator.alloc(u8, blob.len) catch return VkError.OutOfDeviceMemory;
-    @memcpy(result, blob);
-    return result;
-}
-
-fn writePacked(comptime T: type, bytes: []u8, value: T) void {
-    const raw: [@sizeOf(T)]u8 = @bitCast(value);
-    @memcpy(bytes[0..@sizeOf(T)], raw[0..]);
-}
-
-fn interpolateBlob(allocator: std.mem.Allocator, a: []const u8, b: []const u8, t: f32) VkError![]u8 {
-    const len = @min(a.len, b.len);
-    const result = allocator.alloc(u8, len) catch return VkError.OutOfDeviceMemory;
-
-    var byte_index: usize = 0;
-    while (byte_index + @sizeOf(F32x4) <= len) : (byte_index += @sizeOf(F32x4)) {
-        const value_a = std.mem.bytesToValue(F32x4, a[byte_index..]);
-        const value_b = std.mem.bytesToValue(F32x4, b[byte_index..]);
-        writePacked(F32x4, result[byte_index..], value_a + ((value_b - value_a) * @as(F32x4, @splat(t))));
-    }
-
-    while (byte_index + @sizeOf(f32) <= len) : (byte_index += @sizeOf(f32)) {
-        const value_a = std.mem.bytesToValue(f32, a[byte_index..]);
-        const value_b = std.mem.bytesToValue(f32, b[byte_index..]);
-        writePacked(f32, result[byte_index..], value_a + ((value_b - value_a) * t));
-    }
-
-    if (byte_index < len)
-        @memcpy(result[byte_index..], a[byte_index..len]);
-
-    return result;
-}
-
-fn interpolateVertexForClipping(allocator: std.mem.Allocator, a: *const Vertex, b: *const Vertex, t: f32) VkError!Vertex {
-    var result: Vertex = .{
-        .position = a.position + ((b.position - a.position) * @as(F32x4, @splat(t))),
-        .outputs = undefined,
-    };
-
-    @memset(result.outputs[0..], null);
-
-    for (0..spv.SPIRV_MAX_OUTPUT_LOCATIONS) |location| {
-        const out_a = a.outputs[location] orelse continue;
-        const out_b = b.outputs[location] orelse continue;
-
-        result.outputs[location] = .{
-            .interpolation_type = out_a.interpolation_type,
-            .blob = if (out_a.interpolation_type == .flat)
-                try copyBlob(allocator, out_a.blob)
-            else
-                try interpolateBlob(allocator, out_a.blob, out_b.blob, t),
-        };
-    }
-
-    return result;
-}
-
-fn clipPolygonAgainstPlane(allocator: std.mem.Allocator, input: *const ClippedPolygon, plane: ClipPlane) VkError!ClippedPolygon {
-    var output: ClippedPolygon = .{};
-
-    if (input.len == 0)
-        return output;
-
-    var previous = input.vertices[input.len - 1];
-    var previous_inside = vertexInsidePlane(&previous, plane);
-    var previous_distance = clipDistance(previous.position, plane);
-
-    for (input.vertices[0..input.len]) |current| {
-        const current_inside = vertexInsidePlane(&current, plane);
-        const current_distance = clipDistance(current.position, plane);
-
-        if (current_inside != previous_inside) {
-            const t = previous_distance / (previous_distance - current_distance);
-            try output.append(try interpolateVertexForClipping(allocator, &previous, &current, t));
-        }
-
-        if (current_inside)
-            try output.append(current);
-
-        previous = current;
-        previous_inside = current_inside;
-        previous_distance = current_distance;
-    }
-
-    return output;
-}
-
 pub fn clipTriangle(allocator: std.mem.Allocator, v0: *const Vertex, v1: *const Vertex, v2: *const Vertex) VkError!ClippedPolygon {
     var polygon: ClippedPolygon = .{};
     try polygon.append(v0.*);
@@ -189,3 +80,96 @@ pub fn viewportTransformVertex(viewport: vk.Viewport, vertex: *Vertex) void {
 
     vertex.position = zm.f32x4(x_screen, y_screen, z_screen, w);
 }
+
+fn clipDistance(position: F32x4, plane: ClipPlane) f32 {
+    const x, const y, const z, const w = position;
+    return switch (plane) {
+        .Left => x + w,
+        .Right => w - x,
+        .Bottom => y + w,
+        .Top => w - y,
+        .Near => z,
+        .Far => w - z,
+    };
+}
+
+fn isVertexInsidePlane(vertex: *const Vertex, plane: ClipPlane) bool {
+    return clipDistance(vertex.position, plane) >= 0.0;
+}
+
+fn interpolateBlob(allocator: std.mem.Allocator, a: []const u8, b: []const u8, t: f32) VkError![]u8 {
+    const len = @min(a.len, b.len);
+    const result = allocator.alloc(u8, len) catch return VkError.OutOfDeviceMemory;
+
+    var byte_index: usize = 0;
+    while (byte_index + @sizeOf(F32x4) <= len) : (byte_index += @sizeOf(F32x4)) {
+        const value_a = std.mem.bytesToValue(F32x4, a[byte_index..]);
+        const value_b = std.mem.bytesToValue(F32x4, b[byte_index..]);
+        base.utils.writePacked(F32x4, result[byte_index..], value_a + ((value_b - value_a) * zm.f32x4s(t)));
+    }
+
+    while (byte_index + @sizeOf(f32) <= len) : (byte_index += @sizeOf(f32)) {
+        const value_a = std.mem.bytesToValue(f32, a[byte_index..]);
+        const value_b = std.mem.bytesToValue(f32, b[byte_index..]);
+        base.utils.writePacked(f32, result[byte_index..], value_a + ((value_b - value_a) * t));
+    }
+
+    if (byte_index < len)
+        @memcpy(result[byte_index..], a[byte_index..len]);
+
+    return result;
+}
+
+fn interpolateVertexForClipping(allocator: std.mem.Allocator, a: *const Vertex, b: *const Vertex, t: f32) VkError!Vertex {
+    var result: Vertex = .{
+        .position = a.position + ((b.position - a.position) * zm.f32x4s(t)),
+        .outputs = undefined,
+    };
+
+    @memset(result.outputs[0..], null);
+
+    for (0..spv.SPIRV_MAX_OUTPUT_LOCATIONS) |location| {
+        const out_a = a.outputs[location] orelse continue;
+        const out_b = b.outputs[location] orelse continue;
+
+        result.outputs[location] = .{
+            .interpolation_type = out_a.interpolation_type,
+            .blob = if (out_a.interpolation_type == .flat)
+                allocator.dupe(u8, out_a.blob) catch return VkError.OutOfDeviceMemory
+            else
+                try interpolateBlob(allocator, out_a.blob, out_b.blob, t),
+        };
+    }
+
+    return result;
+}
+
+fn clipPolygonAgainstPlane(allocator: std.mem.Allocator, input: *const ClippedPolygon, plane: ClipPlane) VkError!ClippedPolygon {
+    var output: ClippedPolygon = .{};
+
+    if (input.len == 0)
+        return output;
+
+    var previous = input.vertices[input.len - 1];
+    var previous_inside = isVertexInsidePlane(&previous, plane);
+    var previous_distance = clipDistance(previous.position, plane);
+
+    for (input.vertices[0..input.len]) |current| {
+        const current_inside = isVertexInsidePlane(&current, plane);
+        const current_distance = clipDistance(current.position, plane);
+
+        if (current_inside != previous_inside) {
+            const t = previous_distance / (previous_distance - current_distance);
+            try output.append(try interpolateVertexForClipping(allocator, &previous, &current, t));
+        }
+
+        if (current_inside)
+            try output.append(current);
+
+        previous = current;
+        previous_inside = current_inside;
+        previous_distance = current_distance;
+    }
+
+    return output;
+}
diff --git a/src/soft/device/rasterizer.zig b/src/soft/device/rasterizer.zig
index c899da8..5577edc 100644
--- a/src/soft/device/rasterizer.zig
+++ b/src/soft/device/rasterizer.zig
@@ -51,7 +51,7 @@ pub fn processThenFragmentStage(renderer: *Renderer, allocator: std.mem.Allocato
     }
 }
 
-fn clipTransformAndRasterizeTriangle(renderer: *Renderer, allocator: std.mem.Allocator, draw_call: *DrawCall, v0: *const Vertex, v1: *const Vertex, v2: *const Vertex) VkError!void {
+fn clipTransformAndRasterizeTriangle(renderer: *Renderer, allocator: std.mem.Allocator, draw_call: *DrawCall, v0: *Vertex, v1: *Vertex, v2: *Vertex) VkError!void {
     const clipped_polygon = try clip.clipTriangle(allocator, v0, v1, v2);
 
     if (clipped_polygon.len < 3)
diff --git a/src/soft/device/rasterizer/bresenham.zig b/src/soft/device/rasterizer/bresenham.zig
index 2ac1a22..3094e52 100644
--- a/src/soft/device/rasterizer/bresenham.zig
+++ b/src/soft/device/rasterizer/bresenham.zig
@@ -100,7 +100,7 @@ pub fn drawLine(allocator: std.mem.Allocator, draw_call: *Renderer.DrawCall, v0:
     wg.await(io) catch return VkError.DeviceLost;
 }
 
-inline fn bresenhamYAtStep(y0: i32, d_x: i32, d_err: i32, y_step: i32, step: usize) i32 {
+fn bresenhamYAtStep(y0: i32, d_x: i32, d_err: i32, y_step: i32, step: usize) i32 {
     if (d_x == 0)
         return y0;
 
diff --git a/src/soft/device/rasterizer/common.zig b/src/soft/device/rasterizer/common.zig
index 3e75a63..7c0336c 100644
--- a/src/soft/device/rasterizer/common.zig
+++ b/src/soft/device/rasterizer/common.zig
@@ -25,15 +25,6 @@ pub fn scissorContainsPixel(scissor: vk.Rect2D, x: i32, y: i32) bool {
         pixel_y < max_y;
 }
 
-fn writePacked(comptime T: type, bytes: []u8, value: T) void {
-    const raw: [@sizeOf(T)]u8 = @bitCast(value);
-    @memcpy(bytes[0..@sizeOf(T)], raw[0..]);
-}
-
-fn interpolateF32x4(value0: F32x4, value1: F32x4, value2: F32x4, b0: f32, b1: f32, b2: f32) F32x4 {
-    return (value0 * @as(F32x4, @splat(b0))) + (value1 * @as(F32x4, @splat(b1))) + (value2 * @as(F32x4, @splat(b2)));
-}
-
 pub fn interpolateVertexOutputs(
     allocator: std.mem.Allocator,
     v0: *const Renderer.Vertex,
@@ -63,14 +54,14 @@ pub fn interpolateVertexOutputs(
             const value0 = std.mem.bytesToValue(F32x4, out0.blob[byte_index..]);
             const value1 = std.mem.bytesToValue(F32x4, out1.blob[byte_index..]);
             const value2 = std.mem.bytesToValue(F32x4, out2.blob[byte_index..]);
-            writePacked(F32x4, input[byte_index..], interpolateF32x4(value0, value1, value2, b0, b1, b2));
+            base.utils.writePacked(F32x4, input[byte_index..], interpolateF32x4(value0, value1, value2, b0, b1, b2));
         }
 
         while (byte_index + @sizeOf(f32) <= len) : (byte_index += @sizeOf(f32)) {
             const value0 = std.mem.bytesToValue(f32, out0.blob[byte_index..]);
             const value1 = std.mem.bytesToValue(f32, out1.blob[byte_index..]);
             const value2 = std.mem.bytesToValue(f32, out2.blob[byte_index..]);
-            writePacked(f32, input[byte_index..], (value0 * b0) + (value1 * b1) + (value2 * b2));
+            base.utils.writePacked(f32, input[byte_index..], (value0 * b0) + (value1 * b1) + (value2 * b2));
         }
 
         if (byte_index < len)
@@ -85,3 +76,7 @@ pub fn interpolateVertexOutputs(
 pub fn interpolateLineOutputs(allocator: std.mem.Allocator, v0: *const Renderer.Vertex, v1: *const Renderer.Vertex, t: f32) VkError![spv.SPIRV_MAX_OUTPUT_LOCATIONS][]u8 {
     return interpolateVertexOutputs(allocator, v0, v1, v0, 1.0 - t, t, 0.0);
 }
+
+inline fn interpolateF32x4(value0: F32x4, value1: F32x4, value2: F32x4, b0: f32, b1: f32, b2: f32) F32x4 {
+    return (value0 * zm.f32x4s(b0)) + (value1 * zm.f32x4s(b1)) + (value2 * zm.f32x4s(b2));
+}
diff --git a/src/soft/device/rasterizer/edge_function.zig b/src/soft/device/rasterizer/edge_function.zig
index 8dc2a9b..1fecfcf 100644
--- a/src/soft/device/rasterizer/edge_function.zig
+++ b/src/soft/device/rasterizer/edge_function.zig
@@ -42,7 +42,6 @@ pub fn drawTriangle(allocator: std.mem.Allocator, draw_call: *Renderer.DrawCall,
 
     const pipeline = draw_call.renderer.state.pipeline orelse return;
 
-    var wg: std.Io.Group = .init;
     const runtimes_count = (pipeline.stages.getPtr(.fragment) orelse return).runtimes.len;
     const grid_size: usize = @intFromFloat(@floor(@sqrt(@as(f32, @floatFromInt(runtimes_count)))));
 
@@ -53,6 +52,8 @@ pub fn drawTriangle(allocator: std.mem.Allocator, draw_call: *Renderer.DrawCall,
     const rows_per_run = @divTrunc(height + grid_size - 1, grid_size);
 
     var batch_id: usize = 0;
+
+    var wg: std.Io.Group = .init;
     for (0..grid_size) |gy| {
         for (0..grid_size) |gx| {
             defer batch_id = @mod(batch_id + 1, runtimes_count);
diff --git a/src/vulkan/lib.zig b/src/vulkan/lib.zig
index 2159b0f..3eb54d1 100644
--- a/src/vulkan/lib.zig
+++ b/src/vulkan/lib.zig
@@ -12,6 +12,7 @@ pub const lib_vulkan = @import("lib_vulkan.zig");
 pub const logger = @import("logger.zig");
 pub const format = @import("format.zig");
 pub const config = @import("config");
+pub const utils = @import("utils.zig");
 
 pub const Dispatchable = @import("Dispatchable.zig").Dispatchable;
 pub const fallback_host_allocator = @import("fallback_host_allocator.zig").fallback_host_allocator;
diff --git a/src/vulkan/utils.zig b/src/vulkan/utils.zig
new file mode 100644
index 0000000..88413e8
--- /dev/null
+++ b/src/vulkan/utils.zig
@@ -0,0 +1,4 @@
+pub fn writePacked(comptime T: type, bytes: []u8, value: T) void {
+    const raw: [@sizeOf(T)]u8 = @bitCast(value);
+    @memcpy(bytes[0..@sizeOf(T)], raw[0..]);
+}