From d460f22a45fda153cf007b16ec930484001ac3a0 Mon Sep 17 00:00:00 2001 From: Kbz-8 Date: Thu, 14 May 2026 21:44:53 +0200 Subject: [PATCH] improving rasterization performances --- src/soft/SoftImage.zig | 18 ++- src/soft/SoftPipeline.zig | 25 ++-- src/soft/device/BoundedAllocator.zig | 10 -- src/soft/device/ComputeDispatcher.zig | 2 +- src/soft/device/Renderer.zig | 3 + src/soft/device/fragment.zig | 9 +- src/soft/device/rasterizer.zig | 139 +++++++++++++++++-- src/soft/device/rasterizer/common.zig | 8 ++ src/soft/device/rasterizer/edge_function.zig | 110 +++++++-------- src/soft/device/vertex_dispatcher.zig | 15 +- 10 files changed, 227 insertions(+), 112 deletions(-) diff --git a/src/soft/SoftImage.zig b/src/soft/SoftImage.zig index 9b1bd23..21fe5b2 100644 --- a/src/soft/SoftImage.zig +++ b/src/soft/SoftImage.zig @@ -360,7 +360,7 @@ pub fn getTexelMemoryOffset(self: *const Self, offset: vk.Offset3D, subresource: return try self.getSubresourceOffset(subresource.aspect_mask, subresource.mip_level, subresource.array_layer) + self.getTexelMemoryOffsetInSubresource(offset, subresource); } -fn getSubresourceOffset(self: *const Self, aspect_mask: vk.ImageAspectFlags, mip_level: u32, layer: u32) VkError!usize { +pub fn getSubresourceOffset(self: *const Self, aspect_mask: vk.ImageAspectFlags, mip_level: u32, layer: u32) VkError!usize { var offset = try self.getAspectOffset(aspect_mask); for (0..mip_level) |mip| { offset += self.getMultiSampledLevelSize(aspect_mask, @intCast(mip)); @@ -464,18 +464,22 @@ pub fn getMipLevelExtent(self: *const Self, mip_level: u32) vk.Extent3D { pub fn getSliceMemSizeForMipLevel(interface: *const Interface, aspect_mask: vk.ImageAspectFlags, mip_level: u32) usize { const self: *const Self = @alignCast(@fieldParentPtr("interface", interface)); - - const mip_extent = self.getMipLevelExtent(mip_level); - const format = self.interface.formatFromAspect(aspect_mask); - return base.format.sliceMemSize(format, mip_extent.width, mip_extent.height); + 
return self.getSliceMemSizeForMipLevelWithFormat(aspect_mask, mip_level, interface.format); } pub fn getRowPitchMemSizeForMipLevel(interface: *const Interface, aspect_mask: vk.ImageAspectFlags, mip_level: u32) usize { const self: *const Self = @alignCast(@fieldParentPtr("interface", interface)); + return self.getRowPitchMemSizeForMipLevelWithFormat(aspect_mask, mip_level, interface.format); +} +pub fn getSliceMemSizeForMipLevelWithFormat(self: *const Self, aspect_mask: vk.ImageAspectFlags, mip_level: u32, format: vk.Format) usize { const mip_extent = self.getMipLevelExtent(mip_level); - const format = self.interface.formatFromAspect(aspect_mask); - return base.format.pitchMemSize(format, mip_extent.width); + return base.format.sliceMemSize(base.format.fromAspect(format, aspect_mask), mip_extent.width, mip_extent.height); +} + +pub fn getRowPitchMemSizeForMipLevelWithFormat(self: *const Self, aspect_mask: vk.ImageAspectFlags, mip_level: u32, format: vk.Format) usize { + const mip_extent = self.getMipLevelExtent(mip_level); + return base.format.pitchMemSize(base.format.fromAspect(format, aspect_mask), mip_extent.width); } pub inline fn mapAs(self: *const Self, comptime T: type) VkError!*T { diff --git a/src/soft/SoftPipeline.zig b/src/soft/SoftPipeline.zig index 3e68d56..d278b9b 100644 --- a/src/soft/SoftPipeline.zig +++ b/src/soft/SoftPipeline.zig @@ -19,9 +19,14 @@ const SoftShaderModule = @import("SoftShaderModule.zig"); const Self = @This(); pub const Interface = base.Pipeline; +const Runtime = struct { + mutex: std.Io.Mutex, + rt: spv.Runtime, +}; + const Shader = struct { module: *SoftShaderModule, - runtimes: []spv.Runtime, + runtimes: []Runtime, entry: []const u8, }; @@ -77,10 +82,11 @@ pub fn createCompute(device: *base.Device, allocator: std.mem.Allocator, cache: soft_module.ref(); shader.module = soft_module; - const runtimes = runtimes_allocator.alloc(spv.Runtime, runtimes_count) catch return VkError.OutOfDeviceMemory; + const runtimes = 
runtimes_allocator.alloc(Runtime, runtimes_count) catch return VkError.OutOfDeviceMemory; for (runtimes) |*runtime| { - runtime.* = spv.Runtime.init( + runtime.mutex = .init; + runtime.rt = spv.Runtime.init( runtimes_allocator, &soft_module.module, .{ @@ -97,7 +103,7 @@ pub fn createCompute(device: *base.Device, allocator: std.mem.Allocator, cache: if (specialization.p_map_entries) |map| { const data: []const u8 = @as([*]const u8, @ptrCast(@alignCast(specialization.p_data)))[0..specialization.data_size]; for (map[0..], 0..specialization.map_entry_count) |entry, _| { - runtime.addSpecializationInfo( + runtime.rt.addSpecializationInfo( runtimes_allocator, .{ .id = @intCast(entry.constant_id), @@ -160,10 +166,11 @@ pub fn createGraphics(device: *base.Device, allocator: std.mem.Allocator, cache: soft_module.ref(); shader.module = soft_module; - const runtimes = runtimes_allocator.alloc(spv.Runtime, runtimes_count) catch return VkError.OutOfHostMemory; + const runtimes = runtimes_allocator.alloc(Runtime, runtimes_count) catch return VkError.OutOfHostMemory; for (runtimes) |*runtime| { - runtime.* = spv.Runtime.init( + runtime.mutex = .init; + runtime.rt = spv.Runtime.init( runtimes_allocator, &soft_module.module, .{ @@ -180,7 +187,7 @@ pub fn createGraphics(device: *base.Device, allocator: std.mem.Allocator, cache: if (specialization.p_map_entries) |map| { const data: []const u8 = @as([*]const u8, @ptrCast(@alignCast(specialization.p_data)))[0..specialization.data_size]; for (map[0..], 0..specialization.map_entry_count) |entry, _| { - runtime.addSpecializationInfo(runtimes_allocator, .{ + runtime.rt.addSpecializationInfo(runtimes_allocator, .{ .id = @intCast(entry.constant_id), .offset = @intCast(entry.offset), .size = @intCast(entry.size), @@ -230,8 +237,8 @@ pub fn destroy(interface: *Interface, allocator: std.mem.Allocator) void { var it = self.stages.iterator(); while (it.next()) |entry| { entry.value.module.unref(allocator); - for (entry.value.runtimes) |*rt| { - 
rt.function_stack.clearAndFree(device_allocator); // Hacky to avoid leaks + for (entry.value.runtimes) |*runtime| { + runtime.rt.function_stack.clearAndFree(device_allocator); // Hacky to avoid leaks } } self.runtimes_allocator.deinit(); diff --git a/src/soft/device/BoundedAllocator.zig b/src/soft/device/BoundedAllocator.zig index 56c9785..a72408f 100644 --- a/src/soft/device/BoundedAllocator.zig +++ b/src/soft/device/BoundedAllocator.zig @@ -6,7 +6,6 @@ const Self = @This(); const Allocator = std.mem.Allocator; const Alignment = std.mem.Alignment; -mutex: base.SpinMutex, child_allocator: std.mem.Allocator, bound: usize, total_bytes_allocated: std.atomic.Value(usize), @@ -15,7 +14,6 @@ current_bytes_allocated: std.atomic.Value(usize), pub fn init(child_allocator: Allocator, bound: usize) Self { return .{ - .mutex = .{}, .child_allocator = child_allocator, .bound = bound, .total_bytes_allocated = std.atomic.Value(usize).init(0), @@ -46,8 +44,6 @@ pub inline fn queryPeakFootprint(self: *Self) usize { fn alloc(context: *anyopaque, len: usize, alignment: Alignment, ret_addr: usize) ?[*]u8 { const self: *Self = @ptrCast(@alignCast(context)); - self.mutex.lock(); - defer self.mutex.unlock(); if (self.current_bytes_allocated.fetchAdd(len, .monotonic) >= self.bound) return null; _ = self.total_bytes_allocated.fetchAdd(len, .monotonic); @@ -58,8 +54,6 @@ fn alloc(context: *anyopaque, len: usize, alignment: Alignment, ret_addr: usize) fn resize(context: *anyopaque, ptr: []u8, alignment: Alignment, new_len: usize, ret_addr: usize) bool { const self: *Self = @ptrCast(@alignCast(context)); - self.mutex.lock(); - defer self.mutex.unlock(); _ = self.current_bytes_allocated.fetchSub(ptr.len, .monotonic); if (self.current_bytes_allocated.fetchAdd(new_len, .monotonic) >= self.bound) return false; @@ -69,8 +63,6 @@ fn resize(context: *anyopaque, ptr: []u8, alignment: Alignment, new_len: usize, fn remap(context: *anyopaque, ptr: []u8, alignment: Alignment, new_len: usize, ret_addr: 
usize) ?[*]u8 { const self: *Self = @ptrCast(@alignCast(context)); - self.mutex.lock(); - defer self.mutex.unlock(); _ = self.current_bytes_allocated.fetchSub(ptr.len, .monotonic); if (self.current_bytes_allocated.fetchAdd(new_len, .monotonic) >= self.bound) return null; @@ -80,8 +72,6 @@ fn remap(context: *anyopaque, ptr: []u8, alignment: Alignment, new_len: usize, r fn free(context: *anyopaque, ptr: []u8, alignment: Alignment, ret_addr: usize) void { const self: *Self = @ptrCast(@alignCast(context)); - self.mutex.lock(); - defer self.mutex.unlock(); _ = self.current_bytes_allocated.fetchSub(ptr.len, .monotonic); return self.child_allocator.rawFree(ptr, alignment, ret_addr); } diff --git a/src/soft/device/ComputeDispatcher.zig b/src/soft/device/ComputeDispatcher.zig index 2e9a806..78efb99 100644 --- a/src/soft/device/ComputeDispatcher.zig +++ b/src/soft/device/ComputeDispatcher.zig @@ -92,7 +92,7 @@ inline fn run(data: RunData) !void { const io = data.self.device.interface.io(); const shader = data.pipeline.stages.getPtrAssertContains(.compute); - const rt = &shader.runtimes[data.batch_id]; + const rt = &shader.runtimes[data.batch_id].rt; const entry = try rt.getEntryPointByName(shader.entry); diff --git a/src/soft/device/Renderer.zig b/src/soft/device/Renderer.zig index d60b744..1d42d47 100644 --- a/src/soft/device/Renderer.zig +++ b/src/soft/device/Renderer.zig @@ -65,6 +65,8 @@ pub const DrawCall = struct { render_pass: *SoftRenderPass, framebuffer: *SoftFramebuffer, + rasterizer_wait_group: std.Io.Group, + stats: struct { polygons_drawn: usize, }, @@ -82,6 +84,7 @@ pub const DrawCall = struct { .depth_attachment = if (render_pass.interface.subpasses[0].depth_stencil_attachments) |desc| framebuffer.interface.attachments[desc.attachment] else null, .render_pass = render_pass, .framebuffer = framebuffer, + .rasterizer_wait_group = .init, .stats = .{ .polygons_drawn = 0, }, diff --git a/src/soft/device/fragment.zig b/src/soft/device/fragment.zig index 
9e4c0ae..c32df5b 100644 --- a/src/soft/device/fragment.zig +++ b/src/soft/device/fragment.zig @@ -13,11 +13,18 @@ const VkError = base.VkError; const SpvRuntimeError = spv.Runtime.RuntimeError; pub fn shaderInvocation(allocator: std.mem.Allocator, draw_call: *Renderer.DrawCall, batch_id: usize, position: zm.F32x4, inputs: [spv.SPIRV_MAX_OUTPUT_LOCATIONS][]const u8) SpvRuntimeError!zm.F32x4 { + const io = draw_call.renderer.device.interface.io(); + _ = position; const pipeline = draw_call.renderer.state.pipeline orelse return zm.f32x4s(0.0); const shader = pipeline.stages.getPtrAssertContains(.fragment); - const rt = &shader.runtimes[batch_id]; + const runtime = &shader.runtimes[batch_id]; + const mutex = &runtime.mutex; + const rt = &runtime.rt; + + mutex.lock(io) catch return SpvRuntimeError.Unknown; + defer mutex.unlock(io); const entry = try rt.getEntryPointByName(shader.entry); const output_result = try rt.getResultByLocation(0, .output); diff --git a/src/soft/device/rasterizer.zig b/src/soft/device/rasterizer.zig index 8a31751..e59f399 100644 --- a/src/soft/device/rasterizer.zig +++ b/src/soft/device/rasterizer.zig @@ -5,17 +5,67 @@ const clip = @import("clip.zig"); const bresenham = @import("rasterizer/bresenham.zig"); const edge_function = @import("rasterizer/edge_function.zig"); +const common = @import("rasterizer/common.zig"); const Renderer = @import("Renderer.zig"); const Vertex = Renderer.Vertex; const DrawCall = Renderer.DrawCall; +const SoftImage = @import("../SoftImage.zig"); const VkError = base.VkError; pub fn processThenFragmentStage(renderer: *Renderer, allocator: std.mem.Allocator, draw_call: *DrawCall) VkError!void { + const io = draw_call.renderer.device.interface.io(); + const pipeline_data = (renderer.state.pipeline orelse return VkError.InvalidHandleDrv).interface.mode.graphics; const topology = pipeline_data.input_assembly.topology; + const color_attachment = if (draw_call.render_pass.interface.subpasses[0].color_attachments) |attachments| 
attachments[0].attachment else return VkError.InvalidAttachmentDrv; + const render_target_view: *base.ImageView = draw_call.color_attachments[color_attachment]; + const render_target: *SoftImage = @alignCast(@fieldParentPtr("interface", render_target_view.image)); + + const color_range = render_target_view.subresource_range; + const color_format = render_target_view.format; + + const color_attachment_subresource_offset = try render_target.getSubresourceOffset( + color_range.aspect_mask, + color_range.base_mip_level, + color_range.base_array_layer, + ); + const color_attachment_subresource_size = render_target.getLayerSize(color_range.aspect_mask); + const color_attachment_access: common.RenderTargetAccess = .{ + .mutex = undefined, + .base = try render_target.mapAsSliceWithAddedOffset(u8, color_attachment_subresource_offset, color_attachment_subresource_size), + .row_pitch = render_target.getRowPitchMemSizeForMipLevelWithFormat(color_range.aspect_mask, color_range.base_mip_level, color_format), + .texel_size = base.format.texelSize(color_format), + .format = color_format, + }; + + const depth_attachment_view: ?*base.ImageView = if (draw_call.depth_attachment) |view| view else null; + const depth_attachment: ?*SoftImage = if (depth_attachment_view) |view| @alignCast(@fieldParentPtr("interface", view.image)) else null; + + var depth_attachment_access: ?common.RenderTargetAccess = blk: { + if (depth_attachment == null) + break :blk null; + + const depth_range = depth_attachment_view.?.subresource_range; + const depth_format = depth_attachment_view.?.format; + + const attachment_subresource_offset = try depth_attachment.?.getSubresourceOffset( + depth_range.aspect_mask, + depth_range.base_mip_level, + depth_range.base_array_layer, + ); + const attachment_subresource_size = depth_attachment.?.getLayerSize(depth_range.aspect_mask); + break :blk .{ + .mutex = .init, + .base = try depth_attachment.?.mapAsSliceWithAddedOffset(u8, attachment_subresource_offset, 
attachment_subresource_size), + .row_pitch = render_target.getRowPitchMemSizeForMipLevelWithFormat(depth_range.aspect_mask, depth_range.base_mip_level, depth_format), + .texel_size = base.format.texelSize(depth_format), + .format = depth_format, + }; + }; + switch (topology) { .triangle_list => for (0..@divTrunc(draw_call.vertices.len, 3)) |triangle_index| { const first_vertex = triangle_index * 3; @@ -23,7 +73,16 @@ pub fn processThenFragmentStage(renderer: *Renderer, allocator: std.mem.Allocato const v1 = &draw_call.vertices[first_vertex + 1]; const v2 = &draw_call.vertices[first_vertex + 2]; - try clipTransformAndRasterizeTriangle(renderer, allocator, draw_call, v0, v1, v2); + try clipTransformAndRasterizeTriangle( + renderer, + allocator, + draw_call, + v0, + v1, + v2, + &color_attachment_access, + if (depth_attachment_access) |*access| access else null, + ); }, .triangle_fan => if (draw_call.vertices.len >= 3) { const v0 = &draw_call.vertices[0]; @@ -31,7 +90,16 @@ pub fn processThenFragmentStage(renderer: *Renderer, allocator: std.mem.Allocato const v1 = &draw_call.vertices[vertex_index]; const v2 = &draw_call.vertices[vertex_index + 1]; - try clipTransformAndRasterizeTriangle(renderer, allocator, draw_call, v0, v1, v2); + try clipTransformAndRasterizeTriangle( + renderer, + allocator, + draw_call, + v0, + v1, + v2, + &color_attachment_access, + if (depth_attachment_access) |*access| access else null, + ); } }, .triangle_strip => if (draw_call.vertices.len >= 3) { @@ -41,17 +109,46 @@ pub fn processThenFragmentStage(renderer: *Renderer, allocator: std.mem.Allocato const v2 = &draw_call.vertices[vertex_index + 2]; if ((vertex_index & 1) == 0) { - try clipTransformAndRasterizeTriangle(renderer, allocator, draw_call, v0, v1, v2); + try clipTransformAndRasterizeTriangle( + renderer, + allocator, + draw_call, + v0, + v1, + v2, + &color_attachment_access, + if (depth_attachment_access) |*access| access else null, + ); } else { - try 
clipTransformAndRasterizeTriangle(renderer, allocator, draw_call, v1, v0, v2); + try clipTransformAndRasterizeTriangle( + renderer, + allocator, + draw_call, + v1, + v0, + v2, + &color_attachment_access, + if (depth_attachment_access) |*access| access else null, + ); } } }, else => base.unsupported("primitive topology {any}", .{topology}), } + + draw_call.rasterizer_wait_group.await(io) catch return VkError.DeviceLost; } -fn clipTransformAndRasterizeTriangle(renderer: *Renderer, allocator: std.mem.Allocator, draw_call: *DrawCall, v0: *Vertex, v1: *Vertex, v2: *Vertex) VkError!void { +fn clipTransformAndRasterizeTriangle( + renderer: *Renderer, + allocator: std.mem.Allocator, + draw_call: *DrawCall, + v0: *Vertex, + v1: *Vertex, + v2: *Vertex, + color_attachment_access: *const common.RenderTargetAccess, + depth_attachment_access: ?*common.RenderTargetAccess, +) VkError!void { const clipped_polygon = try clip.clipTriangle(allocator, v0, v1, v2); if (clipped_polygon.len < 3) @@ -66,11 +163,29 @@ fn clipTransformAndRasterizeTriangle(renderer: *Renderer, allocator: std.mem.All clip.viewportTransformVertex(draw_call.viewport, &tv1); clip.viewportTransformVertex(draw_call.viewport, &tv2); - try rasterizeTriangle(renderer, allocator, draw_call, &tv0, &tv1, &tv2); + try rasterizeTriangle( + renderer, + allocator, + draw_call, + &tv0, + &tv1, + &tv2, + color_attachment_access, + depth_attachment_access, + ); } } -fn rasterizeTriangle(renderer: *Renderer, allocator: std.mem.Allocator, draw_call: *DrawCall, v0: *Vertex, v1: *Vertex, v2: *Vertex) VkError!void { +fn rasterizeTriangle( + renderer: *Renderer, + allocator: std.mem.Allocator, + draw_call: *DrawCall, + v0: *Vertex, + v1: *Vertex, + v2: *Vertex, + color_attachment_access: *const common.RenderTargetAccess, + depth_attachment_access: ?*common.RenderTargetAccess, +) VkError!void { if (try triangleIsCulled(renderer, v0, v1, v2)) return; @@ -78,7 +193,15 @@ fn rasterizeTriangle(renderer: *Renderer, allocator: 
std.mem.Allocator, draw_cal const pipeline_data = (renderer.state.pipeline orelse return VkError.InvalidHandleDrv).interface.mode.graphics; switch (pipeline_data.rasterization.polygon_mode) { - .fill => try edge_function.drawTriangle(allocator, draw_call, v0, v1, v2), + .fill => try edge_function.drawTriangle( + allocator, + draw_call, + v0, + v1, + v2, + color_attachment_access, + depth_attachment_access, + ), .line => { try bresenham.drawLine(allocator, draw_call, v0, v1); try bresenham.drawLine(allocator, draw_call, v1, v2); diff --git a/src/soft/device/rasterizer/common.zig b/src/soft/device/rasterizer/common.zig index 7c0336c..d3189ed 100644 --- a/src/soft/device/rasterizer/common.zig +++ b/src/soft/device/rasterizer/common.zig @@ -9,6 +9,14 @@ const Renderer = @import("../Renderer.zig"); const VkError = base.VkError; const F32x4 = zm.F32x4; +pub const RenderTargetAccess = struct { + mutex: std.Io.Mutex, + base: []u8, + row_pitch: usize, + texel_size: usize, + format: vk.Format, +}; + pub fn scissorContainsPixel(scissor: vk.Rect2D, x: i32, y: i32) bool { const min_x: i64 = @as(i64, scissor.offset.x); const min_y: i64 = @as(i64, scissor.offset.y); diff --git a/src/soft/device/rasterizer/edge_function.zig b/src/soft/device/rasterizer/edge_function.zig index 22a3145..3e3d715 100644 --- a/src/soft/device/rasterizer/edge_function.zig +++ b/src/soft/device/rasterizer/edge_function.zig @@ -6,9 +6,9 @@ const zm = base.zm; const common = @import("common.zig"); const fragment = @import("../fragment.zig"); +const blitter = @import("../blitter.zig"); const Renderer = @import("../Renderer.zig"); -const SoftImage = @import("../../SoftImage.zig"); const VkError = base.VkError; const SpvRuntimeError = spv.Runtime.RuntimeError; @@ -23,12 +23,22 @@ const RunData = struct { min_y: i32, max_y: i32, area: f32, + v0: Renderer.Vertex, + v1: Renderer.Vertex, + v2: Renderer.Vertex, + color_attachment_access: *const common.RenderTargetAccess, + depth_attachment_access: 
?*common.RenderTargetAccess, +}; + +pub fn drawTriangle( + allocator: std.mem.Allocator, + draw_call: *Renderer.DrawCall, v0: *Renderer.Vertex, v1: *Renderer.Vertex, v2: *Renderer.Vertex, -}; - -pub fn drawTriangle(allocator: std.mem.Allocator, draw_call: *Renderer.DrawCall, v0: *Renderer.Vertex, v1: *Renderer.Vertex, v2: *Renderer.Vertex) VkError!void { + color_attachment_access: *const common.RenderTargetAccess, + depth_attachment_access: ?*common.RenderTargetAccess, +) VkError!void { const io = draw_call.renderer.device.interface.io(); const min_x: i32 = @intFromFloat(@floor(@min(v0.position[0], v1.position[0], v2.position[0]))); @@ -43,7 +53,7 @@ pub fn drawTriangle(allocator: std.mem.Allocator, draw_call: *Renderer.DrawCall, const pipeline = draw_call.renderer.state.pipeline orelse return; const runtimes_count = (pipeline.stages.getPtr(.fragment) orelse return).runtimes.len; - const grid_size: usize = @intFromFloat(@floor(@sqrt(@as(f32, @floatFromInt(runtimes_count))))); + const grid_size: usize = @intFromFloat(@ceil(@sqrt(@as(f32, @floatFromInt(runtimes_count))))); const width: usize = @intCast(max_x - min_x + 1); const height: usize = @intCast(max_y - min_y + 1); @@ -53,7 +63,6 @@ pub fn drawTriangle(allocator: std.mem.Allocator, draw_call: *Renderer.DrawCall, var batch_id: usize = 0; - var wg: std.Io.Group = .init; for (0..grid_size) |gy| { for (0..grid_size) |gx| { defer batch_id = @mod(batch_id + 1, runtimes_count); @@ -78,20 +87,25 @@ pub fn drawTriangle(allocator: std.mem.Allocator, draw_call: *Renderer.DrawCall, .allocator = allocator, .draw_call = draw_call, .batch_id = batch_id, - .v0 = v0, - .v1 = v1, - .v2 = v2, + .v0 = v0.*, + .v1 = v1.*, + .v2 = v2.*, .area = area, .min_x = run_min_x, .max_x = run_max_x, .min_y = run_min_y, .max_y = run_max_y, + .color_attachment_access = color_attachment_access, + .depth_attachment_access = depth_attachment_access, }; - wg.async(io, runWrapper, .{run_data}); + draw_call.rasterizer_wait_group.async(io, 
runWrapper, .{run_data}); } } - wg.await(io) catch return VkError.DeviceLost; + + // Without a depth buffer to sort pixels, wait here to avoid messing up the pixel render order + if (depth_attachment_access == null) + draw_call.rasterizer_wait_group.await(io) catch return VkError.DeviceLost; } inline fn edgeFunction(a: F32x4, b: F32x4, p: F32x4) f32 { @@ -108,12 +122,7 @@ fn runWrapper(data: RunData) void { } inline fn run(data: RunData) !void { - const color_attachment = if (data.draw_call.render_pass.interface.subpasses[0].color_attachments) |attachments| attachments[0].attachment else return VkError.InvalidAttachmentDrv; - const render_target_view: *base.ImageView = data.draw_call.color_attachments[color_attachment]; - const render_target: *SoftImage = @alignCast(@fieldParentPtr("interface", render_target_view.image)); - - const depth_attachment_view: ?*base.ImageView = if (data.draw_call.depth_attachment) |view| view else null; - const depth_attachment: ?*SoftImage = if (depth_attachment_view) |view| @alignCast(@fieldParentPtr("interface", view.image)) else null; + const io = data.draw_call.renderer.device.interface.io(); var y = data.min_y; while (y <= data.max_y) : (y += 1) { @@ -142,38 +151,12 @@ inline fn run(data: RunData) !void { const b2 = w2 / data.area; const z = (b0 * data.v0.position[2]) + (b1 * data.v1.position[2]) + (b2 * data.v2.position[2]); - if (depth_attachment) |depth| { - const depth_value = try depth.readFloat4( - .{ - .x = x, - .y = y, - .z = 0, - }, - .{ - .aspect_mask = depth_attachment_view.?.subresource_range.aspect_mask, - .mip_level = depth_attachment_view.?.subresource_range.base_mip_level, - .array_layer = depth_attachment_view.?.subresource_range.base_array_layer, - }, - depth_attachment_view.?.format, - ); - + // Early depth test to avoid unnecessary computations + if (data.depth_attachment_access) |depth| { + const offset = @as(usize, @intCast(x)) * depth.texel_size + @as(usize, @intCast(y)) * depth.row_pitch; + const depth_value = 
blitter.readFloat4(depth.base[offset..], depth.format); if (z >= depth_value[0]) continue; - - try depth.writeFloat4( - .{ - .x = x, - .y = y, - .z = 0, - }, - .{ - .aspect_mask = depth_attachment_view.?.subresource_range.aspect_mask, - .mip_level = depth_attachment_view.?.subresource_range.base_mip_level, - .array_layer = depth_attachment_view.?.subresource_range.base_array_layer, - }, - depth_attachment_view.?.format, - zm.f32x4s(z), - ); } const pixel = fragment.shaderInvocation( @@ -181,7 +164,7 @@ inline fn run(data: RunData) !void { data.draw_call, data.batch_id, zm.f32x4(@floatFromInt(x), @floatFromInt(y), z, 1.0), - try common.interpolateVertexOutputs(data.allocator, data.v0, data.v1, data.v2, b0, b1, b2), + try common.interpolateVertexOutputs(data.allocator, &data.v0, &data.v1, &data.v2, b0, b1, b2), ) catch |err| { std.log.scoped(.@"Fragment stage").err("catched a '{s}'", .{@errorName(err)}); if (@errorReturnTrace()) |trace| { @@ -190,20 +173,23 @@ inline fn run(data: RunData) !void { return; }; - try render_target.writeFloat4( - .{ - .x = x, - .y = y, - .z = 0, - }, - .{ - .aspect_mask = render_target_view.subresource_range.aspect_mask, - .mip_level = render_target_view.subresource_range.base_mip_level, - .array_layer = render_target_view.subresource_range.base_array_layer, - }, - render_target_view.format, - pixel, - ); + const color_offset = @as(usize, @intCast(x)) * data.color_attachment_access.texel_size + @as(usize, @intCast(y)) * data.color_attachment_access.row_pitch; + + // Late depth test: re-check in case the depth pixel was overwritten during the fragment invocation + if (data.depth_attachment_access) |depth| { + const depth_offset = @as(usize, @intCast(x)) * depth.texel_size + @as(usize, @intCast(y)) * depth.row_pitch; + + depth.mutex.lock(io) catch return VkError.DeviceLost; + defer depth.mutex.unlock(io); + + const depth_value = blitter.readFloat4(depth.base[depth_offset..], depth.format); + if (z >= depth_value[0]) + continue; + 
blitter.writeFloat4(zm.f32x4s(z), depth.base[depth_offset..], depth.format); + blitter.writeFloat4(pixel, data.color_attachment_access.base[color_offset..], data.color_attachment_access.format); + } else { + blitter.writeFloat4(pixel, data.color_attachment_access.base[color_offset..], data.color_attachment_access.format); + } } } } diff --git a/src/soft/device/vertex_dispatcher.zig b/src/soft/device/vertex_dispatcher.zig index 87c355a..69bedf6 100644 --- a/src/soft/device/vertex_dispatcher.zig +++ b/src/soft/device/vertex_dispatcher.zig @@ -35,7 +35,7 @@ pub fn runWrapper(data: RunData) void { inline fn run(data: RunData) !void { const shader = data.pipeline.stages.getPtrAssertContains(.vertex); - const rt = &shader.runtimes[data.batch_id]; + const rt = &shader.runtimes[data.batch_id].rt; try rt.populatePushConstants(data.draw_call.renderer.state.push_constant_blob[0..]); const entry = try rt.getEntryPointByName(shader.entry); @@ -79,19 +79,6 @@ inline fn run(data: RunData) !void { const output: *Renderer.Vertex = &data.draw_call.vertices[(data.instance_index * data.vertex_count) + invocation_index]; try rt.readBuiltIn(std.mem.asBytes(&output.position), .Position); - if (invocation_index == 0) { - const io = data.draw_call.renderer.device.interface.io(); - const file = try std.Io.Dir.cwd().createFile( - io, - "vertex_result_table_dump.txt", - .{ .truncate = true }, - ); - defer file.close(io); - var buffer = [_]u8{0} ** 1024; - var writer = file.writer(io, buffer[0..]); - try rt.dumpResultsTable(data.allocator, &writer.interface); - } - for (0..spv.SPIRV_MAX_OUTPUT_LOCATIONS) |location| { const result_word = rt.getResultByLocation(@intCast(location), .output) catch |err| switch (err) { SpvRuntimeError.NotFound => continue,