From 124ea12d2e0d73094e57b8843ce22b0f3f230447 Mon Sep 17 00:00:00 2001 From: Kbz-8 Date: Thu, 14 May 2026 00:23:46 +0200 Subject: [PATCH] fixing slow memory leak --- src/soft/SoftPipeline.zig | 116 +++++------ src/soft/SoftQueue.zig | 3 +- src/soft/device/ComputeDispatcher.zig | 4 - src/soft/device/Device.zig | 7 +- src/soft/device/Renderer.zig | 24 ++- src/soft/device/clip.zig | 202 +++++++++---------- src/soft/device/rasterizer.zig | 2 +- src/soft/device/rasterizer/bresenham.zig | 2 +- src/soft/device/rasterizer/common.zig | 17 +- src/soft/device/rasterizer/edge_function.zig | 3 +- src/vulkan/lib.zig | 1 + src/vulkan/utils.zig | 4 + 12 files changed, 185 insertions(+), 200 deletions(-) create mode 100644 src/vulkan/utils.zig diff --git a/src/soft/SoftPipeline.zig b/src/soft/SoftPipeline.zig index 906b285..3e68d56 100644 --- a/src/soft/SoftPipeline.zig +++ b/src/soft/SoftPipeline.zig @@ -54,9 +54,13 @@ pub fn createCompute(device: *base.Device, allocator: std.mem.Allocator, cache: const device_allocator = soft_device.device_allocator.allocator(); - var runtimes_allocator_arena: std.heap.ArenaAllocator = .init(device_allocator); - errdefer runtimes_allocator_arena.deinit(); - const runtimes_allocator = runtimes_allocator_arena.allocator(); + self.* = .{ + .interface = interface, + .runtimes_allocator = .init(device_allocator), + .stages = std.EnumMap(Stages, Shader).init(.{}), + }; + errdefer self.runtimes_allocator.deinit(); + const runtimes_allocator = self.runtimes_allocator.allocator(); const instance: *SoftInstance = @alignCast(@fieldParentPtr("interface", device.instance)); const runtimes_count = switch (instance.threaded.async_limit) { @@ -68,57 +72,51 @@ pub fn createCompute(device: *base.Device, allocator: std.mem.Allocator, cache: }, }; - self.* = .{ - .interface = interface, - .runtimes_allocator = runtimes_allocator_arena, - .stages = std.EnumMap(Stages, Shader).init(.{ - .compute = blk: { - var shader: Shader = undefined; - soft_module.ref(); - 
shader.module = soft_module; + self.stages.put(.compute, blk: { + var shader: Shader = undefined; + soft_module.ref(); + shader.module = soft_module; - const runtimes = runtimes_allocator.alloc(spv.Runtime, runtimes_count) catch return VkError.OutOfDeviceMemory; + const runtimes = runtimes_allocator.alloc(spv.Runtime, runtimes_count) catch return VkError.OutOfDeviceMemory; - for (runtimes) |*runtime| { - runtime.* = spv.Runtime.init( - runtimes_allocator, - &soft_module.module, - .{ - .readImageFloat4 = readImageFloat4, - .readImageInt4 = readImageInt4, - .writeImageFloat4 = writeImageFloat4, - .writeImageInt4 = writeImageInt4, - }, - ) catch |err| { - std.log.scoped(.SpvRuntimeInit).err("SPIR-V Runtime failed to initialize, {s}", .{@errorName(err)}); - return VkError.Unknown; - }; - if (info.stage.p_specialization_info) |specialization| { - if (specialization.p_map_entries) |map| { - const data: []const u8 = @as([*]const u8, @ptrCast(@alignCast(specialization.p_data)))[0..specialization.data_size]; - for (map[0..], 0..specialization.map_entry_count) |entry, _| { - runtime.addSpecializationInfo( - runtimes_allocator, - .{ - .id = @intCast(entry.constant_id), - .offset = @intCast(entry.offset), - .size = @intCast(entry.size), - }, - data, - ) catch return VkError.OutOfDeviceMemory; - } - } + for (runtimes) |*runtime| { + runtime.* = spv.Runtime.init( + runtimes_allocator, + &soft_module.module, + .{ + .readImageFloat4 = readImageFloat4, + .readImageInt4 = readImageInt4, + .writeImageFloat4 = writeImageFloat4, + .writeImageInt4 = writeImageInt4, + }, + ) catch |err| { + std.log.scoped(.SpvRuntimeInit).err("SPIR-V Runtime failed to initialize, {s}", .{@errorName(err)}); + return VkError.Unknown; + }; + if (info.stage.p_specialization_info) |specialization| { + if (specialization.p_map_entries) |map| { + const data: []const u8 = @as([*]const u8, @ptrCast(@alignCast(specialization.p_data)))[0..specialization.data_size]; + for (map[0..], 
0..specialization.map_entry_count) |entry, _| { + runtime.addSpecializationInfo( + runtimes_allocator, + .{ + .id = @intCast(entry.constant_id), + .offset = @intCast(entry.offset), + .size = @intCast(entry.size), + }, + data, + ) catch return VkError.OutOfDeviceMemory; } } + } + } - shader.runtimes = runtimes; - shader.entry = runtimes_allocator.dupe(u8, std.mem.span(info.stage.p_name)) catch return VkError.OutOfDeviceMemory; + shader.runtimes = runtimes; + shader.entry = runtimes_allocator.dupe(u8, std.mem.span(info.stage.p_name)) catch return VkError.OutOfDeviceMemory; - std.log.scoped(.ComputePipeline).debug("Created {d} runtimes for compute stage", .{runtimes_count}); - break :blk shader; - }, - }), - }; + std.log.scoped(.ComputePipeline).debug("Created {d} runtimes for compute stage", .{runtimes_count}); + break :blk shader; + }); return self; } @@ -135,9 +133,13 @@ pub fn createGraphics(device: *base.Device, allocator: std.mem.Allocator, cache: const soft_device: *SoftDevice = @alignCast(@fieldParentPtr("interface", device)); const device_allocator = soft_device.device_allocator.allocator(); - var runtimes_allocator_arena: std.heap.ArenaAllocator = .init(device_allocator); - errdefer runtimes_allocator_arena.deinit(); - const runtimes_allocator = runtimes_allocator_arena.allocator(); + self.* = .{ + .interface = interface, + .runtimes_allocator = .init(device_allocator), + .stages = std.EnumMap(Stages, Shader).init(.{}), + }; + errdefer self.runtimes_allocator.deinit(); + const runtimes_allocator = self.runtimes_allocator.allocator(); const instance: *SoftInstance = @alignCast(@fieldParentPtr("interface", device.instance)); const runtimes_count = switch (instance.threaded.async_limit) { @@ -149,12 +151,6 @@ pub fn createGraphics(device: *base.Device, allocator: std.mem.Allocator, cache: }, }; - self.* = .{ - .interface = interface, - .runtimes_allocator = runtimes_allocator_arena, - .stages = std.EnumMap(Stages, Shader).init(.{}), - }; - if (info.p_stages) 
|stages| { for (stages[0..], 0..info.stage_count) |stage, _| { var shader: Shader = undefined; @@ -228,9 +224,15 @@ pub fn createGraphics(device: *base.Device, allocator: std.mem.Allocator, cache: pub fn destroy(interface: *Interface, allocator: std.mem.Allocator) void { const self: *Self = @alignCast(@fieldParentPtr("interface", interface)); + const soft_device: *SoftDevice = @alignCast(@fieldParentPtr("interface", interface.owner)); + const device_allocator = soft_device.device_allocator.allocator(); + var it = self.stages.iterator(); while (it.next()) |entry| { entry.value.module.unref(allocator); + for (entry.value.runtimes) |*rt| { + rt.function_stack.clearAndFree(device_allocator); // Hacky to avoid leaks + } } self.runtimes_allocator.deinit(); allocator.destroy(self); diff --git a/src/soft/SoftQueue.zig b/src/soft/SoftQueue.zig index 2e0c47e..80d671c 100644 --- a/src/soft/SoftQueue.zig +++ b/src/soft/SoftQueue.zig @@ -95,8 +95,7 @@ fn taskRunner(self: *Self, info: Interface.SubmitInfo, p_fence: ?*base.Fence, ru } var execution_device: ExecutionDevice = undefined; - execution_device.init(soft_device); - defer execution_device.deinit(); + execution_device.setup(soft_device); for (info.command_buffers.items) |command_buffer| { const soft_command_buffer: *SoftCommandBuffer = @alignCast(@fieldParentPtr("interface", command_buffer)); diff --git a/src/soft/device/ComputeDispatcher.zig b/src/soft/device/ComputeDispatcher.zig index 6911fa3..2e9a806 100644 --- a/src/soft/device/ComputeDispatcher.zig +++ b/src/soft/device/ComputeDispatcher.zig @@ -45,10 +45,6 @@ pub fn init(device: *SoftDevice, state: *PipelineState) Self { }; } -pub fn deinit(self: *Self) void { - _ = self; -} - pub fn dispatch(self: *Self, group_count_x: u32, group_count_y: u32, group_count_z: u32) VkError!void { const group_count: usize = @intCast(group_count_x * group_count_y * group_count_z); diff --git a/src/soft/device/Device.zig b/src/soft/device/Device.zig index 3e2f0d5..98db38b 100644 --- 
a/src/soft/device/Device.zig +++ b/src/soft/device/Device.zig @@ -39,7 +39,7 @@ pipeline_states: [2]PipelineState, /// Initializating an execution device and /// not creating one to avoid dangling pointers -pub fn init(self: *Self, device: *SoftDevice) void { +pub fn setup(self: *Self, device: *SoftDevice) void { for (self.pipeline_states[0..], 0..) |*state, i| { state.* = .{ .pipeline = null, @@ -60,8 +60,3 @@ pub fn init(self: *Self, device: *SoftDevice) void { self.compute = .init(device, &self.pipeline_states[@intFromEnum(vk.PipelineBindPoint.compute)]); self.renderer = .init(device, &self.pipeline_states[@intFromEnum(vk.PipelineBindPoint.graphics)]); } - -pub fn deinit(self: *Self) void { - self.compute.deinit(); - self.renderer.deinit(); -} diff --git a/src/soft/device/Renderer.zig b/src/soft/device/Renderer.zig index a7d2b39..9242f51 100644 --- a/src/soft/device/Renderer.zig +++ b/src/soft/device/Renderer.zig @@ -59,7 +59,7 @@ pub const DrawCall = struct { viewport: vk.Viewport, scissor: vk.Rect2D, - pub fn init(allocator: std.mem.Allocator, vertex_count: usize, instance_count: usize, renderer: *Self) VkError!@This() { + fn init(allocator: std.mem.Allocator, vertex_count: usize, instance_count: usize, renderer: *Self) VkError!@This() { const self: @This() = .{ .vertices = allocator.alloc(Vertex, vertex_count * instance_count) catch return VkError.OutOfDeviceMemory, .renderer = renderer, @@ -73,6 +73,17 @@ pub const DrawCall = struct { return self; } + + fn deinit(self: *@This(), allocator: std.mem.Allocator) void { + for (self.vertices) |*vertex| { + for (0..spv.SPIRV_MAX_OUTPUT_LOCATIONS) |location| { + if (vertex.outputs[location]) |output| { + allocator.free(output.blob); + } + } + } + allocator.free(self.vertices); + } }; device: *SoftDevice, @@ -96,10 +107,6 @@ pub fn init(device: *SoftDevice, state: *PipelineState) Self { }; } -pub fn deinit(self: *Self) void { - _ = self; -} - pub fn draw(self: *Self, vertex_count: usize, instance_count: usize, 
first_vertex: usize, first_instance: usize) VkError!void { var bounded_allocator: BoundedAllocator = .init(self.device.device_allocator.allocator(), @"1GiB"); try self.drawCall(&bounded_allocator, vertex_count, instance_count, first_vertex, first_instance, null); @@ -119,17 +126,18 @@ fn drawCall(self: *Self, bounded_allocator: *BoundedAllocator, vertex_count: usi const allocator = bounded_allocator.allocator(); var draw_call = try DrawCall.init(allocator, vertex_count, instance_count, self); + defer draw_call.deinit(allocator); const timer = std.Io.Timestamp.now(io, .real); defer if (comptime base.config.logs != .none) { const duration = timer.untilNow(io, .real); - const ms = duration.toMicroseconds(); + const ms: f32 = @floatFromInt(duration.toMicroseconds()); const memory_footprint = @divTrunc(bounded_allocator.queryFootprint(), 1000); const logger = std.log.scoped(.SoftwareRenderer); if (memory_footprint > 256_000) - logger.warn("Drawcall stats:\n> Took {d}us\n> Allocated {d} KB", .{ ms, memory_footprint }) + logger.warn("Drawcall stats:\n> Took {d:.3}ms\n> Allocated {d} KB", .{ ms / 1000, memory_footprint }) else - logger.debug("Drawcall stats:\n> Took {d}us\n> Allocated {d} KB", .{ ms, memory_footprint }); + logger.debug("Drawcall stats:\n> Took {d:.3}ms\n> Allocated {d} KB", .{ ms / 1000, memory_footprint }); }; self.vertexShaderStage(allocator, &draw_call, vertex_count, instance_count, first_vertex, first_instance, indices) catch |err| { diff --git a/src/soft/device/clip.zig b/src/soft/device/clip.zig index 7de6999..7457611 100644 --- a/src/soft/device/clip.zig +++ b/src/soft/device/clip.zig @@ -2,7 +2,6 @@ const std = @import("std"); const vk = @import("vulkan"); const base = @import("base"); const zm = base.zm; -const lib = @import("../lib.zig"); const spv = @import("spv"); pub const F32x4 = zm.F32x4; @@ -36,114 +35,6 @@ const ClippedPolygon = struct { } }; -fn clipDistance(position: F32x4, plane: ClipPlane) f32 { - const x = position[0]; - const y = 
position[1]; - const z = position[2]; - const w = position[3]; - - return switch (plane) { - .Left => x + w, - .Right => w - x, - .Bottom => y + w, - .Top => w - y, - .Near => z, - .Far => w - z, - }; -} - -fn vertexInsidePlane(vertex: *const Vertex, plane: ClipPlane) bool { - return clipDistance(vertex.position, plane) >= 0.0; -} - -fn copyBlob(allocator: std.mem.Allocator, blob: []const u8) VkError![]u8 { - const result = allocator.alloc(u8, blob.len) catch return VkError.OutOfDeviceMemory; - @memcpy(result, blob); - return result; -} - -fn writePacked(comptime T: type, bytes: []u8, value: T) void { - const raw: [@sizeOf(T)]u8 = @bitCast(value); - @memcpy(bytes[0..@sizeOf(T)], raw[0..]); -} - -fn interpolateBlob(allocator: std.mem.Allocator, a: []const u8, b: []const u8, t: f32) VkError![]u8 { - const len = @min(a.len, b.len); - const result = allocator.alloc(u8, len) catch return VkError.OutOfDeviceMemory; - - var byte_index: usize = 0; - while (byte_index + @sizeOf(F32x4) <= len) : (byte_index += @sizeOf(F32x4)) { - const value_a = std.mem.bytesToValue(F32x4, a[byte_index..]); - const value_b = std.mem.bytesToValue(F32x4, b[byte_index..]); - writePacked(F32x4, result[byte_index..], value_a + ((value_b - value_a) * @as(F32x4, @splat(t)))); - } - - while (byte_index + @sizeOf(f32) <= len) : (byte_index += @sizeOf(f32)) { - const value_a = std.mem.bytesToValue(f32, a[byte_index..]); - const value_b = std.mem.bytesToValue(f32, b[byte_index..]); - writePacked(f32, result[byte_index..], value_a + ((value_b - value_a) * t)); - } - - if (byte_index < len) - @memcpy(result[byte_index..], a[byte_index..len]); - - return result; -} - -fn interpolateVertexForClipping(allocator: std.mem.Allocator, a: *const Vertex, b: *const Vertex, t: f32) VkError!Vertex { - var result: Vertex = .{ - .position = a.position + ((b.position - a.position) * @as(F32x4, @splat(t))), - .outputs = undefined, - }; - - @memset(result.outputs[0..], null); - - for (0..spv.SPIRV_MAX_OUTPUT_LOCATIONS) 
|location| { - const out_a = a.outputs[location] orelse continue; - const out_b = b.outputs[location] orelse continue; - - result.outputs[location] = .{ - .interpolation_type = out_a.interpolation_type, - .blob = if (out_a.interpolation_type == .flat) - try copyBlob(allocator, out_a.blob) - else - try interpolateBlob(allocator, out_a.blob, out_b.blob, t), - }; - } - - return result; -} - -fn clipPolygonAgainstPlane(allocator: std.mem.Allocator, input: *const ClippedPolygon, plane: ClipPlane) VkError!ClippedPolygon { - var output: ClippedPolygon = .{}; - - if (input.len == 0) - return output; - - var previous = input.vertices[input.len - 1]; - var previous_inside = vertexInsidePlane(&previous, plane); - var previous_distance = clipDistance(previous.position, plane); - - for (input.vertices[0..input.len]) |current| { - const current_inside = vertexInsidePlane(&current, plane); - const current_distance = clipDistance(current.position, plane); - - if (current_inside != previous_inside) { - const t = previous_distance / (previous_distance - current_distance); - try output.append(try interpolateVertexForClipping(allocator, &previous, &current, t)); - } - - if (current_inside) - try output.append(current); - - previous = current; - previous_inside = current_inside; - previous_distance = current_distance; - } - - return output; -} - pub fn clipTriangle(allocator: std.mem.Allocator, v0: *const Vertex, v1: *const Vertex, v2: *const Vertex) VkError!ClippedPolygon { var polygon: ClippedPolygon = .{}; try polygon.append(v0.*); @@ -189,3 +80,96 @@ pub fn viewportTransformVertex(viewport: vk.Viewport, vertex: *Vertex) void { vertex.position = zm.f32x4(x_screen, y_screen, z_screen, w); } + +fn clipDistance(position: F32x4, plane: ClipPlane) f32 { + const x, const y, const z, const w = position; + return switch (plane) { + .Left => x + w, + .Right => w - x, + .Bottom => y + w, + .Top => w - y, + .Near => z, + .Far => w - z, + }; +} + +fn isVertexInsidePlane(vertex: *const Vertex, plane: 
ClipPlane) bool { + return clipDistance(vertex.position, plane) >= 0.0; +} + +fn interpolateBlob(allocator: std.mem.Allocator, a: []const u8, b: []const u8, t: f32) VkError![]u8 { + const len = @min(a.len, b.len); + const result = allocator.alloc(u8, len) catch return VkError.OutOfDeviceMemory; + + var byte_index: usize = 0; + while (byte_index + @sizeOf(F32x4) <= len) : (byte_index += @sizeOf(F32x4)) { + const value_a = std.mem.bytesToValue(F32x4, a[byte_index..]); + const value_b = std.mem.bytesToValue(F32x4, b[byte_index..]); + base.utils.writePacked(F32x4, result[byte_index..], value_a + ((value_b - value_a) * zm.f32x4s(t))); + } + + while (byte_index + @sizeOf(f32) <= len) : (byte_index += @sizeOf(f32)) { + const value_a = std.mem.bytesToValue(f32, a[byte_index..]); + const value_b = std.mem.bytesToValue(f32, b[byte_index..]); + base.utils.writePacked(f32, result[byte_index..], value_a + ((value_b - value_a) * t)); + } + + if (byte_index < len) + @memcpy(result[byte_index..], a[byte_index..len]); + + return result; +} + +fn interpolateVertexForClipping(allocator: std.mem.Allocator, a: *const Vertex, b: *const Vertex, t: f32) VkError!Vertex { + var result: Vertex = .{ + .position = a.position + ((b.position - a.position) * zm.f32x4s(t)), + .outputs = undefined, + }; + + @memset(result.outputs[0..], null); + + for (0..spv.SPIRV_MAX_OUTPUT_LOCATIONS) |location| { + const out_a = a.outputs[location] orelse continue; + const out_b = b.outputs[location] orelse continue; + + result.outputs[location] = .{ + .interpolation_type = out_a.interpolation_type, + .blob = if (out_a.interpolation_type == .flat) + allocator.dupe(u8, out_a.blob) catch return VkError.OutOfDeviceMemory + else + try interpolateBlob(allocator, out_a.blob, out_b.blob, t), + }; + } + + return result; +} + +fn clipPolygonAgainstPlane(allocator: std.mem.Allocator, input: *const ClippedPolygon, plane: ClipPlane) VkError!ClippedPolygon { + var output: ClippedPolygon = .{}; + + if (input.len == 0) + return 
output; + + var previous = input.vertices[input.len - 1]; + var previous_inside = isVertexInsidePlane(&previous, plane); + var previous_distance = clipDistance(previous.position, plane); + + for (input.vertices[0..input.len]) |current| { + const current_inside = isVertexInsidePlane(&current, plane); + const current_distance = clipDistance(current.position, plane); + + if (current_inside != previous_inside) { + const t = previous_distance / (previous_distance - current_distance); + try output.append(try interpolateVertexForClipping(allocator, &previous, &current, t)); + } + + if (current_inside) + try output.append(current); + + previous = current; + previous_inside = current_inside; + previous_distance = current_distance; + } + + return output; +} diff --git a/src/soft/device/rasterizer.zig b/src/soft/device/rasterizer.zig index c899da8..5577edc 100644 --- a/src/soft/device/rasterizer.zig +++ b/src/soft/device/rasterizer.zig @@ -51,7 +51,7 @@ pub fn processThenFragmentStage(renderer: *Renderer, allocator: std.mem.Allocato } } -fn clipTransformAndRasterizeTriangle(renderer: *Renderer, allocator: std.mem.Allocator, draw_call: *DrawCall, v0: *const Vertex, v1: *const Vertex, v2: *const Vertex) VkError!void { +fn clipTransformAndRasterizeTriangle(renderer: *Renderer, allocator: std.mem.Allocator, draw_call: *DrawCall, v0: *Vertex, v1: *Vertex, v2: *Vertex) VkError!void { const clipped_polygon = try clip.clipTriangle(allocator, v0, v1, v2); if (clipped_polygon.len < 3) diff --git a/src/soft/device/rasterizer/bresenham.zig b/src/soft/device/rasterizer/bresenham.zig index 2ac1a22..3094e52 100644 --- a/src/soft/device/rasterizer/bresenham.zig +++ b/src/soft/device/rasterizer/bresenham.zig @@ -100,7 +100,7 @@ pub fn drawLine(allocator: std.mem.Allocator, draw_call: *Renderer.DrawCall, v0: wg.await(io) catch return VkError.DeviceLost; } -inline fn bresenhamYAtStep(y0: i32, d_x: i32, d_err: i32, y_step: 
i32, step: usize) i32 { if (d_x == 0) return y0; diff --git a/src/soft/device/rasterizer/common.zig b/src/soft/device/rasterizer/common.zig index 3e75a63..7c0336c 100644 --- a/src/soft/device/rasterizer/common.zig +++ b/src/soft/device/rasterizer/common.zig @@ -25,15 +25,6 @@ pub fn scissorContainsPixel(scissor: vk.Rect2D, x: i32, y: i32) bool { pixel_y < max_y; } -fn writePacked(comptime T: type, bytes: []u8, value: T) void { - const raw: [@sizeOf(T)]u8 = @bitCast(value); - @memcpy(bytes[0..@sizeOf(T)], raw[0..]); -} - -fn interpolateF32x4(value0: F32x4, value1: F32x4, value2: F32x4, b0: f32, b1: f32, b2: f32) F32x4 { - return (value0 * @as(F32x4, @splat(b0))) + (value1 * @as(F32x4, @splat(b1))) + (value2 * @as(F32x4, @splat(b2))); -} - pub fn interpolateVertexOutputs( allocator: std.mem.Allocator, v0: *const Renderer.Vertex, @@ -63,14 +54,14 @@ pub fn interpolateVertexOutputs( const value0 = std.mem.bytesToValue(F32x4, out0.blob[byte_index..]); const value1 = std.mem.bytesToValue(F32x4, out1.blob[byte_index..]); const value2 = std.mem.bytesToValue(F32x4, out2.blob[byte_index..]); - writePacked(F32x4, input[byte_index..], interpolateF32x4(value0, value1, value2, b0, b1, b2)); + base.utils.writePacked(F32x4, input[byte_index..], interpolateF32x4(value0, value1, value2, b0, b1, b2)); } while (byte_index + @sizeOf(f32) <= len) : (byte_index += @sizeOf(f32)) { const value0 = std.mem.bytesToValue(f32, out0.blob[byte_index..]); const value1 = std.mem.bytesToValue(f32, out1.blob[byte_index..]); const value2 = std.mem.bytesToValue(f32, out2.blob[byte_index..]); - writePacked(f32, input[byte_index..], (value0 * b0) + (value1 * b1) + (value2 * b2)); + base.utils.writePacked(f32, input[byte_index..], (value0 * b0) + (value1 * b1) + (value2 * b2)); } if (byte_index < len) @@ -85,3 +76,7 @@ pub fn interpolateVertexOutputs( pub fn interpolateLineOutputs(allocator: std.mem.Allocator, v0: *const Renderer.Vertex, v1: *const Renderer.Vertex, t: f32) 
VkError![spv.SPIRV_MAX_OUTPUT_LOCATIONS][]u8 { return interpolateVertexOutputs(allocator, v0, v1, v0, 1.0 - t, t, 0.0); } + +inline fn interpolateF32x4(value0: F32x4, value1: F32x4, value2: F32x4, b0: f32, b1: f32, b2: f32) F32x4 { + return (value0 * zm.f32x4s(b0)) + (value1 * zm.f32x4s(b1)) + (value2 * zm.f32x4s(b2)); +} diff --git a/src/soft/device/rasterizer/edge_function.zig b/src/soft/device/rasterizer/edge_function.zig index 8dc2a9b..1fecfcf 100644 --- a/src/soft/device/rasterizer/edge_function.zig +++ b/src/soft/device/rasterizer/edge_function.zig @@ -42,7 +42,6 @@ pub fn drawTriangle(allocator: std.mem.Allocator, draw_call: *Renderer.DrawCall, const pipeline = draw_call.renderer.state.pipeline orelse return; - var wg: std.Io.Group = .init; const runtimes_count = (pipeline.stages.getPtr(.fragment) orelse return).runtimes.len; const grid_size: usize = @intFromFloat(@floor(@sqrt(@as(f32, @floatFromInt(runtimes_count))))); @@ -53,6 +52,8 @@ pub fn drawTriangle(allocator: std.mem.Allocator, draw_call: *Renderer.DrawCall, const rows_per_run = @divTrunc(height + grid_size - 1, grid_size); var batch_id: usize = 0; + + var wg: std.Io.Group = .init; for (0..grid_size) |gy| { for (0..grid_size) |gx| { defer batch_id = @mod(batch_id + 1, runtimes_count); diff --git a/src/vulkan/lib.zig b/src/vulkan/lib.zig index 2159b0f..3eb54d1 100644 --- a/src/vulkan/lib.zig +++ b/src/vulkan/lib.zig @@ -12,6 +12,7 @@ pub const lib_vulkan = @import("lib_vulkan.zig"); pub const logger = @import("logger.zig"); pub const format = @import("format.zig"); pub const config = @import("config"); +pub const utils = @import("utils.zig"); pub const Dispatchable = @import("Dispatchable.zig").Dispatchable; pub const fallback_host_allocator = @import("fallback_host_allocator.zig").fallback_host_allocator; diff --git a/src/vulkan/utils.zig b/src/vulkan/utils.zig new file mode 100644 index 0000000..88413e8 --- /dev/null +++ b/src/vulkan/utils.zig @@ -0,0 +1,4 @@ +pub fn writePacked(comptime T: type, 
bytes: []u8, value: T) void { + const raw: [@sizeOf(T)]u8 = @bitCast(value); + @memcpy(bytes[0..@sizeOf(T)], raw[0..]); +}