adding pipeline dynamic state and vertex output interpolation

2026-04-27 19:39:49 +02:00
parent f35bce907e
commit 02bb54b841
12 changed files with 286 additions and 80 deletions
@@ -3,6 +3,7 @@ const vk = @import("vulkan");
 const base = @import("base");
 const zm = base.zm;
 const lib = @import("../lib.zig");
+const spv = @import("spv");

 pub const F32x4 = zm.F32x4;

@@ -32,18 +33,24 @@ pub const VertexBuffer = struct {
 };

 /// Dynamic pipeline state held by the renderer. Every field is optional and
 /// is set to `null` by `init`.
 pub const DynamicState = struct {
-    viewport: vk.Viewport,
-    scissor: vk.Rect2D,
-    line_width: f32,
    /// Dynamically supplied viewports. `primitiveAssemblyStage` uses element 0
    /// when the pipeline flags its viewport as dynamic and this is non-null;
    /// otherwise it falls back to the pipeline's static viewport state.
+    viewports: ?[]const vk.Viewport,
    // NOTE(review): singular name and mutable slice, unlike `viewports` — confirm this is intentional.
+    scissor: ?[]vk.Rect2D,
+    line_width: ?f32,
+};
+
 /// Result of one vertex-shader invocation.
+pub const Vertex = struct {
    /// Filled from the shader's `Position` built-in, then overwritten with
    /// screen-space coordinates (w set to 1.0) by `primitiveAssemblyStage`.
+    position: F32x4,
    /// Bytes read back from the shader output at each location; `null` for
    /// locations the vertex stage did not fill.
+    outputs: [spv.SPIRV_MAX_OUTPUT_LOCATIONS]?[]u8,
 };

 /// One rasterized fragment handed to the fragment stage.
 pub const Fragment = struct {
    /// (x, y, interpolated z, 1.0) as produced by the rasterizer.
    position: F32x4,
    /// Set to opaque white by the rasterizer, then replaced by the fragment
    /// shader output clamped to [0, 1].
    color: F32x4,
    /// Interpolated vertex outputs, indexed by location; the fragment stage
    /// writes them to the shader's inputs before calling its entry point.
+    inputs: [spv.SPIRV_MAX_OUTPUT_LOCATIONS][]u8,
 };

 /// Working set of a single draw.
 pub const DrawCall = struct {
-    vertices: []F32x4,
    /// One entry per vertex per instance; `draw` allocates
    /// `vertex_count * instance_count` elements.
+    vertices: []Vertex,
    /// Left `undefined` when `draw` builds the struct.
    /// NOTE(review): presumably assigned by the rasterization stage — confirm.
    fragments: []Fragment,
 };

@@ -60,11 +67,17 @@ pub fn init(device: *SoftDevice, state: *PipelineState) Self {
        .state = state,
        .render_pass = null,
        .framebuffer = null,
-        .dynamic_state = undefined,
+        .dynamic_state = .{
+            .viewports = null,
+            .scissor = null,
+            .line_width = null,
+        },
    };
 }

 pub fn draw(self: *Self, vertex_count: usize, instance_count: usize, first_vertex: usize, first_instance: usize) VkError!void {
+    const io = self.device.interface.io();
+
    const render_target_view: *base.ImageView = (self.framebuffer orelse return).interface.attachments[0];
    const render_target: *SoftImage = @alignCast(@fieldParentPtr("interface", render_target_view.image));
    const render_target_memory = if (render_target.interface.memory) |memory| memory else return VkError.InvalidDeviceMemoryDrv;
@@ -73,20 +86,32 @@ pub fn draw(self: *Self, vertex_count: usize, instance_count: usize, first_verte
    defer arena.deinit();
    const allocator = arena.allocator();

+    const timer = std.Io.Timestamp.now(io, .real);
+    defer if (comptime base.config.logs) {
+        const duration = timer.untilNow(io, .real);
+        const ms = duration.toMicroseconds();
+        std.log.scoped(.SoftwareRenderer).debug("Drawcall stats:\n>   Took {d}us\n>   Allocated {d} KB", .{ ms, @divTrunc(arena.queryCapacity(), 1000) });
+    };
+
    var draw_call: DrawCall = .{
-        .vertices = allocator.alloc(F32x4, vertex_count * instance_count) catch return VkError.OutOfDeviceMemory,
+        .vertices = allocator.alloc(Vertex, vertex_count * instance_count) catch return VkError.OutOfDeviceMemory,
        .fragments = undefined,
    };

-    self.vertexShaderStage(&draw_call, vertex_count, instance_count) catch |err| {
+    for (draw_call.vertices) |*vertex| {
+        vertex.outputs = [_]?[]u8{null} ** spv.SPIRV_MAX_OUTPUT_LOCATIONS;
+    }
+
+    self.vertexShaderStage(allocator, &draw_call, vertex_count, instance_count) catch |err| {
        std.log.scoped(.@"Vertex stage").err("catched a '{s}'", .{@errorName(err)});
        if (@errorReturnTrace()) |trace| {
            std.debug.dumpErrorReturnTrace(trace);
        }
    };

-    self.primitiveAssemblyStage(&draw_call);
+    try self.primitiveAssemblyStage(&draw_call);
    try self.rasterizationStage(allocator, &draw_call);
+
    self.fragmentShaderStage(&draw_call) catch |err| {
        std.log.scoped(.@"Fragment stage").err("catched a '{s}'", .{@errorName(err)});
        if (@errorReturnTrace()) |trace| {
@@ -121,7 +146,7 @@ pub fn deinit(self: *Self) void {
    _ = self;
 }

-fn vertexShaderStage(self: *Self, draw_call: *DrawCall, vertex_count: usize, instance_count: usize) !void {
+fn vertexShaderStage(self: *Self, allocator: std.mem.Allocator, draw_call: *DrawCall, vertex_count: usize, instance_count: usize) !void {
    const pipeline = self.state.pipeline orelse return;
    const batch_size = (pipeline.stages.getPtr(.vertex) orelse return).runtimes.len;

@@ -129,6 +154,7 @@ fn vertexShaderStage(self: *Self, draw_call: *DrawCall, vertex_count: usize, ins
    for (0..instance_count) |instance_index| {
        for (0..@min(batch_size, vertex_count)) |batch_id| {
            const run_data: vertex_dispatcher.RunData = .{
+                .allocator = allocator,
                .renderer = self,
                .pipeline = pipeline,
                .batch_id = batch_id,
@@ -144,14 +170,23 @@ fn vertexShaderStage(self: *Self, draw_call: *DrawCall, vertex_count: usize, ins
    wg.await(self.device.interface.io()) catch return VkError.DeviceLost;
 }

-fn primitiveAssemblyStage(self: *Self, draw_call: *DrawCall) void {
-    const viewport = (self.state.pipeline orelse return).interface.mode.graphics.viewport_state.viewports[0];
+fn primitiveAssemblyStage(self: *Self, draw_call: *DrawCall) VkError!void {
+    const viewport = blk: {
+        const pipeline_data = &(self.state.pipeline orelse return VkError.InvalidPipelineDrv).interface.mode.graphics;
+        if (pipeline_data.dynamic_state.viewport) {
+            if (self.dynamic_state.viewports) |viewports|
+                break :blk viewports[0];
+        }
+        if (pipeline_data.viewport_state.viewports) |viewports|
+            break :blk viewports[0];
+        return VkError.Unknown;
+    };

    for (draw_call.vertices) |*vertex| {
-        const x = vertex[0];
-        const y = vertex[1];
-        const z = vertex[2];
-        const w = vertex[3];
+        const x = vertex.position[0];
+        const y = vertex.position[1];
+        const z = vertex.position[2];
+        const w = vertex.position[3];

        // Perspective division.
        const x_ndc = x / w;
@@ -170,7 +205,7 @@ fn primitiveAssemblyStage(self: *Self, draw_call: *DrawCall) void {
        const y_screen = ((p_y / 2.0) * y_ndc) + o_y;
        const z_screen = (p_z * z_ndc) + o_z;

-        vertex.* = zm.f32x4(x_screen, y_screen, z_screen, 1.0);
+        vertex.position = zm.f32x4(x_screen, y_screen, z_screen, 1.0);
    }
 }

@@ -182,9 +217,9 @@ fn rasterizationStage(self: *Self, allocator: std.mem.Allocator, draw_call: *Dra
    switch (topology) {
        .triangle_list => for (0..@divExact(draw_call.vertices.len, 3)) |triangle_index| {
            const first_vertex = triangle_index * 3;
-            const v0 = draw_call.vertices[first_vertex + 0];
-            const v1 = draw_call.vertices[first_vertex + 1];
-            const v2 = draw_call.vertices[first_vertex + 2];
+            const v0 = &draw_call.vertices[first_vertex + 0];
+            const v1 = &draw_call.vertices[first_vertex + 1];
+            const v2 = &draw_call.vertices[first_vertex + 2];

            switch (pipeline_data.rasterization.polygon_mode) {
                .fill => try rasterizer.drawTriangleFilled(allocator, &fragments, v0, v1, v2),
@@ -41,6 +41,18 @@ inline fn run(data: RunData) !void {

    var invocation_index: usize = data.batch_id;
    while (invocation_index < data.fragment_count) : (invocation_index += data.batch_size) {
+        const fragment: *Renderer.Fragment = &data.draw_call.fragments[invocation_index];
+
+        for (0..spv.SPIRV_MAX_OUTPUT_LOCATIONS) |location| {
+            const result_word = rt.getResultByLocation(@intCast(location), .input) catch |err| switch (err) {
+                SpvRuntimeError.NotFound => continue,
+                else => return err,
+            };
+            if (result_word != 0) {
+                try rt.writeInput(fragment.inputs[location], result_word);
+            }
+        }
+
        rt.callEntryPoint(allocator, entry) catch |err| switch (err) {
            // Some errors can be safely ignored
            SpvRuntimeError.OutOfBounds,
@@ -49,8 +61,7 @@ inline fn run(data: RunData) !void {
            else => return err,
        };

-        const output: *F32x4 = &data.draw_call.fragments[invocation_index].color;
-        try rt.readOutput(std.mem.asBytes(output), output_result);
-        output.* = std.math.clamp(output.*, zm.f32x4s(0.0), zm.f32x4s(1.0));
+        try rt.readOutput(std.mem.asBytes(&fragment.color), output_result);
+        fragment.color = std.math.clamp(fragment.color, zm.f32x4s(0.0), zm.f32x4s(1.0));
    }
 }
@@ -8,14 +8,76 @@ const VkError = base.VkError;
 const lib = @import("../lib.zig");

 const Renderer = @import("Renderer.zig");
+const spv = @import("spv");

 pub const F32x4 = zm.F32x4;

-pub fn drawLineBresenham(allocator: std.mem.Allocator, fragments: *std.ArrayList(Renderer.Fragment), v0: F32x4, v1: F32x4) VkError!void {
-    var x0: i32 = @intFromFloat(v0[0]);
-    var y0: i32 = @intFromFloat(v0[1]);
-    var x1: i32 = @intFromFloat(v1[0]);
-    var y1: i32 = @intFromFloat(v1[1]);
+/// Stores the in-memory byte representation of `value` into the first
+/// `@sizeOf(T)` bytes of `bytes`. `bytes` must be at least that long.
+fn writePacked(comptime T: type, bytes: []u8, value: T) void {
+    const size = @sizeOf(T);
+    // Slicing with comptime-known bounds yields a pointer to a fixed-size
+    // array, which can be assigned in one go.
+    const destination: *[size]u8 = bytes[0..size];
+    destination.* = @bitCast(value);
+}
+
+/// Returns the weighted sum `value0 * b0 + value1 * b1 + value2 * b2`, with
+/// each scalar weight broadcast across all four lanes.
+fn interpolateF32x4(value0: F32x4, value1: F32x4, value2: F32x4, b0: f32, b1: f32, b2: f32) F32x4 {
+    const weight0: F32x4 = @splat(b0);
+    const weight1: F32x4 = @splat(b1);
+    const weight2: F32x4 = @splat(b2);
+
+    const term0 = value0 * weight0;
+    const term1 = value1 * weight1;
+    const term2 = value2 * weight2;
+
+    // Summed left to right so rounding matches a plain `a + b + c` chain.
+    return (term0 + term1) + term2;
+}
+
+/// Builds the per-location inputs of one fragment by blending the outputs of
+/// three vertices with the weights `b0`, `b1` and `b2`.
+///
+/// Each location's bytes are treated as tightly packed `f32` values: whole
+/// 16-byte groups are blended as `F32x4`, remaining 4-byte groups as single
+/// `f32`s, and any trailing bytes that do not fill an `f32` are copied from
+/// `v0` unchanged. Only the first `min(len0, len1, len2)` bytes are produced.
+///
+/// A location that is `null` on any of the three vertices yields an empty
+/// slice, so callers indexing the result by location never read
+/// uninitialized memory.
+///
+/// The returned buffers are allocated from `allocator` and are not freed
+/// here. Returns `VkError.OutOfDeviceMemory` when an allocation fails.
+fn interpolateVertexOutputs(
+    allocator: std.mem.Allocator,
+    v0: *const Renderer.Vertex,
+    v1: *const Renderer.Vertex,
+    v2: *const Renderer.Vertex,
+    b0: f32,
+    b1: f32,
+    b2: f32,
+) VkError![spv.SPIRV_MAX_OUTPUT_LOCATIONS][]u8 {
+    // Start from empty slices instead of `undefined`: the `continue`s below
+    // skip locations without data, and those entries are still handed out.
+    var inputs: [spv.SPIRV_MAX_OUTPUT_LOCATIONS][]u8 = [_][]u8{&.{}} ** spv.SPIRV_MAX_OUTPUT_LOCATIONS;
+
+    for (0..spv.SPIRV_MAX_OUTPUT_LOCATIONS) |location| {
+        const out0 = v0.outputs[location] orelse continue;
+        const out1 = v1.outputs[location] orelse continue;
+        const out2 = v2.outputs[location] orelse continue;
+
+        // Nothing to blend; pass the empty buffer through without allocating.
+        if (out0.len == 0) {
+            inputs[location] = out0;
+            continue;
+        }
+
+        const len = @min(out0.len, out1.len, out2.len);
+        const input = allocator.alloc(u8, len) catch return VkError.OutOfDeviceMemory;
+
+        // Four floats at a time while a full vector still fits.
+        var byte_index: usize = 0;
+        while (byte_index + @sizeOf(F32x4) <= len) : (byte_index += @sizeOf(F32x4)) {
+            const value0 = std.mem.bytesToValue(F32x4, out0[byte_index..]);
+            const value1 = std.mem.bytesToValue(F32x4, out1[byte_index..]);
+            const value2 = std.mem.bytesToValue(F32x4, out2[byte_index..]);
+            writePacked(F32x4, input[byte_index..], interpolateF32x4(value0, value1, value2, b0, b1, b2));
+        }
+
+        // Then the remaining whole floats one by one.
+        while (byte_index + @sizeOf(f32) <= len) : (byte_index += @sizeOf(f32)) {
+            const value0 = std.mem.bytesToValue(f32, out0[byte_index..]);
+            const value1 = std.mem.bytesToValue(f32, out1[byte_index..]);
+            const value2 = std.mem.bytesToValue(f32, out2[byte_index..]);
+            writePacked(f32, input[byte_index..], (value0 * b0) + (value1 * b1) + (value2 * b2));
+        }
+
+        // Leftover bytes cannot be interpreted as a float; take them from v0.
+        if (byte_index < len)
+            @memcpy(input[byte_index..], out0[byte_index..len]);
+
+        inputs[location] = input;
+    }
+
+    return inputs;
+}
+
+/// Blends the outputs of `v0` and `v1` for a point at parameter `t` along a
+/// line (weight `1 - t` on `v0`, `t` on `v1`). Reuses the triangle
+/// interpolation, passing `v0` again as a third vertex with weight zero.
+fn interpolateLineOutputs(allocator: std.mem.Allocator, v0: *const Renderer.Vertex, v1: *const Renderer.Vertex, t: f32) VkError![spv.SPIRV_MAX_OUTPUT_LOCATIONS][]u8 {
+    const start_weight = 1.0 - t;
+    const end_weight = t;
+    const unused_weight = 0.0;
+    return interpolateVertexOutputs(allocator, v0, v1, v0, start_weight, end_weight, unused_weight);
+}
+
+pub fn drawLineBresenham(allocator: std.mem.Allocator, fragments: *std.ArrayList(Renderer.Fragment), v0: *Renderer.Vertex, v1: *Renderer.Vertex) VkError!void {
+    var x0: i32 = @intFromFloat(v0.position[0]);
+    var y0: i32 = @intFromFloat(v0.position[1]);
+    var x1: i32 = @intFromFloat(v1.position[0]);
+    var y1: i32 = @intFromFloat(v1.position[1]);

    const steep = blk: {
        if (@abs(y1 - y0) > @abs(x1 - x0)) {
@@ -26,9 +88,12 @@ pub fn drawLineBresenham(allocator: std.mem.Allocator, fragments: *std.ArrayList
        break :blk false;
    };

+    var start_vertex = v0;
+    var end_vertex = v1;
    if (x0 > x1) {
        std.mem.swap(i32, &x0, &x1);
        std.mem.swap(i32, &y0, &y1);
+        std.mem.swap(*Renderer.Vertex, &start_vertex, &end_vertex);
    }

    const d_err = @abs(y1 - y0);
@@ -42,10 +107,14 @@ pub fn drawLineBresenham(allocator: std.mem.Allocator, fragments: *std.ArrayList
    while (x <= x1) : (x += 1) {
        const x_fragment: f32 = @floatFromInt(if (steep) y else x);
        const y_fragment: f32 = @floatFromInt(if (steep) x else y);
+        const t = @as(f32, @floatFromInt(x - x0)) / @as(f32, @floatFromInt(@max(d_x, 1)));
+
+        const z = ((1.0 - t) * start_vertex.position[2]) + (t * end_vertex.position[2]);

        fragments.append(allocator, .{
-            .position = zm.f32x4(x_fragment, y_fragment, 0.0, 1.0),
+            .position = zm.f32x4(x_fragment, y_fragment, z, 1.0),
            .color = zm.f32x4(1.0, 1.0, 1.0, 1.0),
+            .inputs = try interpolateLineOutputs(allocator, start_vertex, end_vertex, t),
        }) catch return VkError.OutOfDeviceMemory;

        err -= @intCast(d_err);
@@ -60,14 +129,15 @@ fn edgeFunction(a: F32x4, b: F32x4, p: F32x4) f32 {
    return ((p[0] - a[0]) * (b[1] - a[1])) - ((p[1] - a[1]) * (b[0] - a[0]));
 }

-pub fn drawTriangleFilled(allocator: std.mem.Allocator, fragments: *std.ArrayList(Renderer.Fragment), v0: F32x4, v1: F32x4, v2: F32x4) VkError!void {
-    const min_x: i32 = @intFromFloat(@floor(@min(v0[0], @min(v1[0], v2[0]))));
-    const max_x: i32 = @intFromFloat(@ceil(@max(v0[0], @max(v1[0], v2[0]))));
-    const min_y: i32 = @intFromFloat(@floor(@min(v0[1], @min(v1[1], v2[1]))));
-    const max_y: i32 = @intFromFloat(@ceil(@max(v0[1], @max(v1[1], v2[1]))));
+pub fn drawTriangleFilled(allocator: std.mem.Allocator, fragments: *std.ArrayList(Renderer.Fragment), v0: *Renderer.Vertex, v1: *Renderer.Vertex, v2: *Renderer.Vertex) VkError!void {
+    const min_x: i32 = @intFromFloat(@floor(@min(v0.position[0], v1.position[0], v2.position[0])));
+    const max_x: i32 = @intFromFloat(@ceil(@max(v0.position[0], v1.position[0], v2.position[0])));
+    const min_y: i32 = @intFromFloat(@floor(@min(v0.position[1], v1.position[1], v2.position[1])));
+    const max_y: i32 = @intFromFloat(@ceil(@max(v0.position[1], v1.position[1], v2.position[1])));

-    const area = edgeFunction(v0, v1, v2);
-    if (area == 0.0) return;
+    const area = edgeFunction(v0.position, v1.position, v2.position);
+    if (area == 0.0)
+        return;

    var y = min_y;
    while (y <= max_y) : (y += 1) {
@@ -75,25 +145,27 @@ pub fn drawTriangleFilled(allocator: std.mem.Allocator, fragments: *std.ArrayLis
        while (x <= max_x) : (x += 1) {
            const p = zm.f32x4(@as(f32, @floatFromInt(x)) + 0.5, @as(f32, @floatFromInt(y)) + 0.5, 0.0, 1.0);

-            const w0 = edgeFunction(v1, v2, p);
-            const w1 = edgeFunction(v2, v0, p);
-            const w2 = edgeFunction(v0, v1, p);
+            const w0 = edgeFunction(v1.position, v2.position, p);
+            const w1 = edgeFunction(v2.position, v0.position, p);
+            const w2 = edgeFunction(v0.position, v1.position, p);

            const inside = if (area > 0.0)
                w0 >= 0.0 and w1 >= 0.0 and w2 >= 0.0
            else
                w0 <= 0.0 and w1 <= 0.0 and w2 <= 0.0;

-            if (!inside) continue;
+            if (!inside)
+                continue;

            const b0 = w0 / area;
            const b1 = w1 / area;
            const b2 = w2 / area;
-            const z = (b0 * v0[2]) + (b1 * v1[2]) + (b2 * v2[2]);
+            const z = (b0 * v0.position[2]) + (b1 * v1.position[2]) + (b2 * v2.position[2]);

            fragments.append(allocator, .{
                .position = zm.f32x4(@floatFromInt(x), @floatFromInt(y), z, 1.0),
                .color = zm.f32x4(1.0, 1.0, 1.0, 1.0),
+                .inputs = try interpolateVertexOutputs(allocator, v0, v1, v2, b0, b1, b2),
            }) catch return VkError.OutOfDeviceMemory;
        }
    }
@@ -12,6 +12,7 @@ const SoftPipeline = @import("../SoftPipeline.zig");
 const VkError = base.VkError;

 pub const RunData = struct {
+    allocator: std.mem.Allocator,
    renderer: *Renderer,
    pipeline: *SoftPipeline,
    batch_id: usize,
@@ -45,20 +46,22 @@ inline fn run(data: RunData) !void {
            else => return err,
        };

-        for (data.pipeline.interface.mode.graphics.input_assembly.attribute_description orelse return) |attribute| {
-            const location_result = try rt.getResultByLocation(attribute.location, .input);
+        if (data.pipeline.interface.mode.graphics.input_assembly.attribute_description) |attributes| {
+            for (attributes) |attribute| {
+                const location_result = try rt.getResultByLocation(attribute.location, .input);

-            const binding_info = (data.pipeline.interface.mode.graphics.input_assembly.binding_description orelse return)[attribute.binding];
+                const binding_info = (data.pipeline.interface.mode.graphics.input_assembly.binding_description orelse return)[attribute.binding];

-            const vertex_buffer = data.renderer.state.data.graphics.vertex_buffers[attribute.binding];
-            const buffer = vertex_buffer.buffer;
-            const buffer_memory_size = base.format.texelSize(attribute.format);
-            const buffer_memory = if (buffer.interface.memory) |memory| memory else return VkError.InvalidDeviceMemoryDrv;
-            const offset = buffer.interface.offset + (binding_info.stride * invocation_index) + attribute.offset;
+                const vertex_buffer = data.renderer.state.data.graphics.vertex_buffers[attribute.binding];
+                const buffer = vertex_buffer.buffer;
+                const buffer_memory_size = base.format.texelSize(attribute.format);
+                const buffer_memory = if (buffer.interface.memory) |memory| memory else return VkError.InvalidDeviceMemoryDrv;
+                const offset = buffer.interface.offset + (binding_info.stride * invocation_index) + attribute.offset;

-            const buffer_memory_map: []u8 = @as([*]u8, @ptrCast(@alignCast(try buffer_memory.map(offset, buffer_memory_size))))[0..buffer_memory_size];
+                const buffer_memory_map: []u8 = @as([*]u8, @ptrCast(@alignCast(try buffer_memory.map(offset, buffer_memory_size))))[0..buffer_memory_size];

-            try rt.writeInput(buffer_memory_map, location_result);
+                try rt.writeInput(buffer_memory_map, location_result);
+            }
        }

        rt.callEntryPoint(allocator, entry) catch |err| switch (err) {
@@ -69,8 +72,21 @@ inline fn run(data: RunData) !void {
            else => return err,
        };

-        const output: *F32x4 = &data.draw_call.vertices[(data.instance_index * data.vertex_count) + invocation_index];
-        try rt.readBuiltIn(std.mem.asBytes(output), .Position);
+        const output: *Renderer.Vertex = &data.draw_call.vertices[(data.instance_index * data.vertex_count) + invocation_index];
+        try rt.readBuiltIn(std.mem.asBytes(&output.position), .Position);
+
+        for (0..spv.SPIRV_MAX_OUTPUT_LOCATIONS) |location| {
+            const result_word = rt.getResultByLocation(@intCast(location), .output) catch |err| switch (err) {
+                SpvRuntimeError.NotFound => continue,
+                else => return err,
+            };
+            if (result_word == 0)
+                continue;
+            const value = rt.results[result_word].getConstValue() catch continue;
+            const needed_size = try value.getPlainMemorySize();
+            output.outputs[location] = data.allocator.alloc(u8, needed_size) catch return VkError.OutOfDeviceMemory;
+            try rt.readOutput(output.outputs[location].?, result_word);
+        }
    }
 }