diff --git a/README.md b/README.md
index 7d03b29..3745c05 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # SPIR-V Interpreter <a href="https://git.kbz8.me/kbz_8/SPIRV-Interpreter/actions?workflows=build.yml"><img src="https://git.kbz8.me/kbz_8/SPIRV-Interpreter/actions/workflows/build.yml/badge.svg"></a> <a href="https://git.kbz8.me/kbz_8/SPIRV-Interpreter/actions?workflows=test.yml"><img src="https://git.kbz8.me/kbz_8/SPIRV-Interpreter/actions/workflows/test.yml/badge.svg"></a>
 
-A small footprint SPIR-V interpreter with zero dependencies to execute SPIR-V shaders on the CPU. It is designed to be used with multiple runtimes concurrently.
+A small-footprint SPIR-V interpreter to execute SPIR-V shaders on the CPU. It is designed to be used with multiple runtimes concurrently.
 
 ```zig
 const std = @import("std");
diff --git a/build.zig b/build.zig
index b5ea803..f96a128 100644
--- a/build.zig
+++ b/build.zig
@@ -12,6 +12,9 @@ pub fn build(b: *std.Build) void {
         .optimize = optimize,
     });
 
+    const zmath = b.dependency("zmath", .{});
+    mod.addImport("zmath", zmath.module("root"));
+
     const pretty = b.dependency("pretty", .{ .target = target, .optimize = optimize });
     mod.addImport("pretty", pretty.module("pretty"));
 
diff --git a/build.zig.zon b/build.zig.zon
index ccc0d95..134bdf4 100644
--- a/build.zig.zon
+++ b/build.zig.zon
@@ -2,6 +2,10 @@
     .name = .SPIRV_Interpreter,
     .version = "0.0.1",
     .dependencies = .{
+        .zmath = .{
+            .url = "git+https://github.com/zig-gamedev/zmath.git#3a5955b2b72cd081563fbb084eff05bffd1e3fbb",
+            .hash = "zmath-0.11.0-dev-wjwivdMsAwD-xaLj76YHUq3t9JDH-X16xuMTmnDzqbu2",
+        },
         .pretty = .{ // For debugging purposes
             .url = "git+https://github.com/Kbz-8/pretty#117674465efd4d07d5ae9d9d8ca59c2c323a65ba",
             .hash = "pretty-0.10.6-Tm65r99UAQDEJMgZysD10qE8dinBHr064fPM6YkxVPfB",
diff --git a/example/main.zig b/example/main.zig
index b30b8bb..886a6d1 100644
--- a/example/main.zig
+++ b/example/main.zig
@@ -4,8 +4,8 @@ const spv = @import("spv");
 
 const shader_source = @embedFile("shader.spv");
 
-const screen_width = 200;
-const screen_height = 200;
+const screen_width = 480;
+const screen_height = 240;
 
 pub fn main() !void {
     {
@@ -50,9 +50,7 @@ pub fn main() !void {
         }
 
         var thread_pool: std.Thread.Pool = undefined;
-        try thread_pool.init(.{
-            .allocator = allocator,
-        });
+        try thread_pool.init(.{ .allocator = allocator });
 
         var timer = try std.time.Timer.start();
 
@@ -73,15 +71,15 @@ pub fn main() !void {
 
                 const pixel_map: [*]u32 = @as([*]u32, @ptrCast(@alignCast((surface.getPixels() orelse return).ptr)));
 
+                const delta: f32 = @as(f32, @floatFromInt(timer.read())) / std.time.ns_per_s;
+
                 var frame_timer = try std.time.Timer.start();
                 defer {
                     const ns = frame_timer.lap();
                     const ms = @as(f32, @floatFromInt(ns)) / std.time.ns_per_s;
-                    std.log.info("Took {d:.3}s - {d:.3}fps to render", .{ ms, 1.0 / ms });
+                    std.log.info("Took {d:.3}s - {d:.3}fps to render {d:.2}", .{ ms, 1.0 / ms, delta });
                 }
 
-                const delta: f32 = @as(f32, @floatFromInt(timer.read())) / std.time.ns_per_s;
-
                 var wait_group: std.Thread.WaitGroup = .{};
                 for (0..screen_height) |y| {
                     const runner = &runner_cache.items[y];
@@ -131,10 +129,11 @@ const Runner = struct {
             try rt.readOutput(f32, output[0..], self.color);
 
+            // Clamp in float space before converting: @intFromFloat traps on NaN or out-of-range input.
             const rgba = self.surface.mapRgba(
-                @truncate(@as(u32, @intFromFloat(output[0] * 255.0))),
-                @truncate(@as(u32, @intFromFloat(output[1] * 255.0))),
-                @truncate(@as(u32, @intFromFloat(output[2] * 255.0))),
-                @truncate(@as(u32, @intFromFloat(output[3] * 255.0))),
+                @intFromFloat(@max(@min(output[0], 1.0), 0.0) * 255.0),
+                @intFromFloat(@max(@min(output[1], 1.0), 0.0) * 255.0),
+                @intFromFloat(@max(@min(output[2], 1.0), 0.0) * 255.0),
+                @intFromFloat(@max(@min(output[3], 1.0), 0.0) * 255.0),
             );
 
             pixel_map[(y * self.surface.getWidth()) + x] = rgba.value;
diff --git a/example/mangohud.conf b/example/mangohud.conf
new file mode 100644
index 0000000..569e46f
--- /dev/null
+++ b/example/mangohud.conf
@@ -0,0 +1,6 @@
+gpu_stats=0
+font_size=16
+resolution
+hud_compact
+background_alpha=0
+width=140
diff --git a/example/shader.nzsl b/example/shader.nzsl
index ba00b3f..62955c1 100644
--- a/example/shader.nzsl
+++ b/example/shader.nzsl
@@ -16,16 +16,16 @@ struct FragOut
 [entry(frag)]
 fn main(input: FragIn) -> FragOut
 {
-    const I: i32 = 32;
+    const I: i32 = 128;
     const A: f32 = 7.5;
-    const MA: f32 = 20.0;
+    const MA: f32 = 100.0;
     const MI: f32 = 0.001;
 
     let uv0 = input.pos / input.res * 2.0 - vec2[f32](1.0, 1.0);
     let uv  = vec2[f32](uv0.x * (input.res.x / input.res.y), uv0.y);
 
-    let col = vec3[f32](0.0, 0.0, 0.0);
-    let ro  = vec3[f32](0.0, 0.0, -2.0);
+    let col = vec3[f32](0.0, 0.0, 0.0);  
+    let ro  = vec3[f32](0.0, 0.0, -2.0); 
     let rd  = vec3[f32](uv.x, uv.y, 1.0);
     let dt  = 0.0;
     let ds  = 0.0;
diff --git a/example/shader.spv b/example/shader.spv
index 984e29b..3801e1e 100644
Binary files a/example/shader.spv and b/example/shader.spv differ
diff --git a/example/shader.spv.txt b/example/shader.spv.txt
index 10493e2..9eecca9 100644
--- a/example/shader.spv.txt
+++ b/example/shader.spv.txt
@@ -55,11 +55,11 @@ Schema: 0
  %29 = OpConstant %3 f32(0.2)
  %30 = OpConstant %3 f32(4)
  %31 = OpTypePointer StorageClass(Function) %6
- %32 = OpConstant %6 i32(32)
+ %32 = OpConstant %6 i32(128)
  %33 = OpTypeBool
  %34 = OpConstant %3 f32(0.001)
  %35 = OpConstant %3 f32(0.35)
- %36 = OpConstant %3 f32(20)
+ %36 = OpConstant %3 f32(100)
  %37 = OpConstant %3 f32(0.15)
  %38 = OpConstant %3 f32(0.05)
  %39 = OpConstant %3 f32(1.15)
diff --git a/sandbox/shader.nzsl b/sandbox/shader.nzsl
index ec4e798..59a606d 100644
--- a/sandbox/shader.nzsl
+++ b/sandbox/shader.nzsl
@@ -16,24 +16,24 @@ struct FragOut
 [entry(frag)]
 fn main(input: FragIn) -> FragOut
 {
-    const I: i32 = 128;
+    const I: i32 = 32;
     const A: f32 = 7.5;
-    const MA: f32 = 100.0;
+    const MA: f32 = 2.0;
     const MI: f32 = 0.001;
 
     let uv0 = input.pos / input.res * 2.0 - vec2[f32](1.0, 1.0);
     let uv  = vec2[f32](uv0.x * (input.res.x / input.res.y), uv0.y);
 
-    let col = vec3[f32](0.0, 0.0, 0.0);  
-    let ro  = vec3[f32](0.0, 0.0, -2.0); 
-    let rd  = vec3[f32](uv.x, uv.y, 1.0);
+    let col = vec4[f32](0.0, 0.0, 0.0, 0.0);
+    let ro  = vec4[f32](0.0, 0.0, -2.0, 0.0);
+    let rd  = vec4[f32](uv.x, uv.y, 1.0, 0.0);
     let dt  = 0.0;
     let ds  = 0.0;
     let dm  = -1.0;
     let p   = ro;
-    let c   = vec3[f32](0.0, 0.0, 0.0);
+    let c   = vec4[f32](0.0, 0.0, 0.0, 0.0);
 
-    let l = vec3[f32](0.0, sin(input.time * 0.2) * 4.0, cos(input.time * 0.2) * 4.0);
+    let l = vec4[f32](0.0, sin(input.time * 0.2) * 4.0, cos(input.time * 0.2) * 4.0, 0.0);
 
     for i in 0 -> I
     {
@@ -46,26 +46,23 @@ fn main(input: FragIn) -> FragOut
 
         if (ds <= MI)
         {
-            let value = max(dot(normalize(c - p), normalize(p - l)) - 0.35, 0.0);
-            col = vec3[f32](value, value, value);
+            let value = max(dot(normalize(c - p), normalize(p - l)), 0.0);
+            col = vec4[f32](value, value, value, 1.0);
             break;
         }
 
         if (ds >= MA)
         {
-            if (dot(normalize(rd), normalize(l - ro)) <= 1.0)
+            if (dot(normalize(rd), normalize(l - ro)) < 1.0)
             {
-                let value = max(dot(normalize(rd), normalize(l - ro)) + 0.15, 0.05)/ 1.15 * (1.0 - dm * A);
-                col = vec3[f32](value, value, value);
+                let value = max(dot(normalize(rd), normalize(l - ro)) + 0.15, 0.0) / 1.15 * max(1.0 - dm * A, 0.0);
+                col = vec4[f32](value, value, value, 1.0);
             }
             break;
         }
     }
 
-   //if (col == vec3[f32](0.0, 0.0, 0.0))
-   //    discard;
-
     let output: FragOut;
-    output.color = vec4[f32](col.x, col.y, col.z, 1.0);
+    output.color = col;
     return output;
 }
diff --git a/src/GLSL_std_450/opcodes.zig b/src/GLSL_std_450/opcodes.zig
index c48b821..4e597e9 100644
--- a/src/GLSL_std_450/opcodes.zig
+++ b/src/GLSL_std_450/opcodes.zig
@@ -2,6 +2,7 @@ const std = @import("std");
 const spv = @import("../spv.zig");
 const ext = @import("GLSL_std_450.zig");
 const opc = @import("../opcodes.zig");
+const zm = @import("zmath");
 
 const Module = @import("../Module.zig");
 const Runtime = @import("../Runtime.zig");
@@ -72,11 +73,23 @@ pub var runtime_dispatcher = [_]?OpCodeExtFunc{null} ** ext.GLSLOpMaxValue;
 
 pub fn initRuntimeDispatcher() void {
     // zig fmt: off
+    runtime_dispatcher[@intFromEnum(ext.GLSLOp.Ceil)]      = MathEngine(.Float, .Ceil).opSingleOperator;
     runtime_dispatcher[@intFromEnum(ext.GLSLOp.Cos)]       = MathEngine(.Float, .Cos).opSingleOperator;
+    runtime_dispatcher[@intFromEnum(ext.GLSLOp.Exp)]       = MathEngine(.Float, .Exp).opSingleOperator;
+    runtime_dispatcher[@intFromEnum(ext.GLSLOp.Exp2)]      = MathEngine(.Float, .Exp2).opSingleOperator;
+    runtime_dispatcher[@intFromEnum(ext.GLSLOp.FAbs)]      = MathEngine(.Float, .FAbs).opSingleOperator;
     runtime_dispatcher[@intFromEnum(ext.GLSLOp.FMax)]      = MathEngine(.Float, .FMax).opDoubleOperators;
+    runtime_dispatcher[@intFromEnum(ext.GLSLOp.Floor)]     = MathEngine(.Float, .Floor).opSingleOperator;
     runtime_dispatcher[@intFromEnum(ext.GLSLOp.Length)]    = opLength;
+    runtime_dispatcher[@intFromEnum(ext.GLSLOp.Log)]       = MathEngine(.Float, .Log).opSingleOperator;
+    runtime_dispatcher[@intFromEnum(ext.GLSLOp.Log2)]      = MathEngine(.Float, .Log2).opSingleOperator;
     runtime_dispatcher[@intFromEnum(ext.GLSLOp.Normalize)] = opNormalize;
+    runtime_dispatcher[@intFromEnum(ext.GLSLOp.Round)]     = MathEngine(.Float, .Round).opSingleOperator;
+    runtime_dispatcher[@intFromEnum(ext.GLSLOp.SAbs)]      = MathEngine(.SInt,  .SAbs).opSingleOperator;
     runtime_dispatcher[@intFromEnum(ext.GLSLOp.Sin)]       = MathEngine(.Float, .Sin).opSingleOperator;
+    runtime_dispatcher[@intFromEnum(ext.GLSLOp.Sqrt)]      = MathEngine(.Float, .Sqrt).opSingleOperator;
+    runtime_dispatcher[@intFromEnum(ext.GLSLOp.Tan)]       = MathEngine(.Float, .Tan).opSingleOperator;
+    runtime_dispatcher[@intFromEnum(ext.GLSLOp.Trunc)]     = MathEngine(.Float, .Trunc).opSingleOperator;
     // zig fmt: on
 }
 
@@ -92,8 +105,20 @@ fn MathEngine(comptime T: ValueType, comptime Op: MathOp) type {
             const operator = struct {
                 fn operation(comptime TT: type, x: TT) RuntimeError!TT {
                     return switch (Op) {
-                        .Sin => @sin(x),
+                        .Ceil => @ceil(x),
                         .Cos => @cos(x),
+                        .Exp => @exp(x),
+                        .Exp2 => @exp2(x),
+                        .FAbs => @abs(x),
+                        .Floor => @floor(x),
+                        .Log => @log(x),
+                        .Log2 => @log2(x),
+                        .Round => @round(x),
+                        .SAbs => if (comptime @typeInfo(TT) == .int) @bitCast(@abs(x)) else return RuntimeError.InvalidSpirV, // @bitCast: |minInt| wraps instead of trapping
+                        .Sin => @sin(x),
+                        .Sqrt => @sqrt(x),
+                        .Tan => @tan(x),
+                        .Trunc => @trunc(x),
                         else => RuntimeError.InvalidSpirV,
                     };
                 }
@@ -166,12 +191,6 @@ fn MathEngine(comptime T: ValueType, comptime Op: MathOp) type {
                         else => return RuntimeError.InvalidSpirV,
                     }
                 }
-
-                inline fn applySIMDVector(comptime ElemT: type, comptime N: usize, d: *@Vector(N, ElemT), l: *const @Vector(N, ElemT), r: *const @Vector(N, ElemT)) RuntimeError!void {
-                    inline for (0..N) |i| {
-                        d[i] = try operation(ElemT, l[i], r[i]);
-                    }
-                }
             };
 
             switch (dst.*) {
@@ -181,17 +200,17 @@ fn MathEngine(comptime T: ValueType, comptime Op: MathOp) type {
                     try operator.applyScalar(lane_bits, d_lane, &l_lane, &r_lane);
                 },
 
-                .Vector4f32 => |*d| try operator.applySIMDVector(f32, 4, d, &lhs.Vector4f32, &rhs.Vector4f32),
-                .Vector3f32 => |*d| try operator.applySIMDVector(f32, 3, d, &lhs.Vector3f32, &rhs.Vector3f32),
-                .Vector2f32 => |*d| try operator.applySIMDVector(f32, 2, d, &lhs.Vector2f32, &rhs.Vector2f32),
+                .Vector4f32 => |*d| d.* = try operator.operation(@Vector(4, f32), lhs.Vector4f32, rhs.Vector4f32),
+                .Vector3f32 => |*d| d.* = try operator.operation(@Vector(3, f32), lhs.Vector3f32, rhs.Vector3f32),
+                .Vector2f32 => |*d| d.* = try operator.operation(@Vector(2, f32), lhs.Vector2f32, rhs.Vector2f32),
 
-                .Vector4i32 => |*d| try operator.applySIMDVector(i32, 4, d, &lhs.Vector4i32, &rhs.Vector4i32),
-                .Vector3i32 => |*d| try operator.applySIMDVector(i32, 3, d, &lhs.Vector3i32, &rhs.Vector3i32),
-                .Vector2i32 => |*d| try operator.applySIMDVector(i32, 2, d, &lhs.Vector2i32, &rhs.Vector2i32),
+                .Vector4i32 => |*d| d.* = try operator.operation(@Vector(4, i32), lhs.Vector4i32, rhs.Vector4i32),
+                .Vector3i32 => |*d| d.* = try operator.operation(@Vector(3, i32), lhs.Vector3i32, rhs.Vector3i32),
+                .Vector2i32 => |*d| d.* = try operator.operation(@Vector(2, i32), lhs.Vector2i32, rhs.Vector2i32),
 
-                .Vector4u32 => |*d| try operator.applySIMDVector(u32, 4, d, &lhs.Vector4u32, &rhs.Vector4u32),
-                .Vector3u32 => |*d| try operator.applySIMDVector(u32, 3, d, &lhs.Vector3u32, &rhs.Vector3u32),
-                .Vector2u32 => |*d| try operator.applySIMDVector(u32, 2, d, &lhs.Vector2u32, &rhs.Vector2u32),
+                .Vector4u32 => |*d| d.* = try operator.operation(@Vector(4, u32), lhs.Vector4u32, rhs.Vector4u32),
+                .Vector3u32 => |*d| d.* = try operator.operation(@Vector(3, u32), lhs.Vector3u32, rhs.Vector3u32),
+                .Vector2u32 => |*d| d.* = try operator.operation(@Vector(2, u32), lhs.Vector2u32, rhs.Vector2u32),
 
                 else => return RuntimeError.InvalidSpirV,
             }
@@ -199,12 +218,6 @@ fn MathEngine(comptime T: ValueType, comptime Op: MathOp) type {
     };
 }
 
-inline fn sumSIMDVector(comptime ElemT: type, comptime N: usize, d: *ElemT, v: *const @Vector(N, ElemT)) void {
-    inline for (0..N) |i| {
-        d.* += v[i];
-    }
-}
-
 fn opLength(_: std.mem.Allocator, target_type_id: SpvWord, id: SpvWord, _: SpvWord, rt: *Runtime) RuntimeError!void {
     const target_type = (try rt.results[target_type_id].getVariant()).Type;
     const dst = try rt.results[id].getValue();
@@ -219,9 +232,18 @@ fn opLength(_: std.mem.Allocator, target_type_id: SpvWord, id: SpvWord, _: SpvWo
 
             if (bits == 32) { // More likely to be SIMD if f32
                 switch (src.*) {
-                    .Vector4f32 => |src_vec| sumSIMDVector(f32, 4, &sum, &src_vec),
-                    .Vector3f32 => |src_vec| sumSIMDVector(f32, 3, &sum, &src_vec),
-                    .Vector2f32 => |src_vec| sumSIMDVector(f32, 2, &sum, &src_vec),
+                    .Vector4f32 => |src_vec| {
+                        d_field.* = zm.length4(src_vec)[0];
+                        return;
+                    },
+                    .Vector3f32 => |src_vec| {
+                        d_field.* = zm.length3(zm.f32x4(src_vec[0], src_vec[1], src_vec[2], 0.0))[0];
+                        return;
+                    },
+                    .Vector2f32 => |src_vec| {
+                        d_field.* = zm.length2(zm.f32x4(src_vec[0], src_vec[1], 0.0, 0.0))[0];
+                        return;
+                    },
                     else => {},
                 }
             }
@@ -237,7 +259,6 @@ fn opLength(_: std.mem.Allocator, target_type_id: SpvWord, id: SpvWord, _: SpvWo
                     const s_field = try getValuePrimitiveField(.Float, bits, s_lane);
                     sum += s_field.*;
                 },
-                .Vector4f32, .Vector3f32, .Vector2f32 => {},
                 else => return RuntimeError.InvalidSpirV,
             }
 
@@ -256,17 +277,31 @@ fn opNormalize(_: std.mem.Allocator, target_type_id: SpvWord, id: SpvWord, _: Sp
 
     switch (lane_bits) {
         inline 16, 32, 64 => |bits| {
-            var sum: std.meta.Float(bits) = 0.0;
-
             if (bits == 32) { // More likely to be SIMD if f32
                 switch (src.*) {
-                    .Vector4f32 => |src_vec| sumSIMDVector(f32, 4, &sum, &src_vec),
-                    .Vector3f32 => |src_vec| sumSIMDVector(f32, 3, &sum, &src_vec),
-                    .Vector2f32 => |src_vec| sumSIMDVector(f32, 2, &sum, &src_vec),
+                    .Vector4f32 => |src_vec| {
+                        dst.Vector4f32 = zm.normalize4(src_vec);
+                        return;
+                    },
+                    .Vector3f32 => |src_vec| {
+                        const normed = zm.normalize3(zm.f32x4(src_vec[0], src_vec[1], src_vec[2], 0.0));
+                        dst.Vector3f32[0] = normed[0];
+                        dst.Vector3f32[1] = normed[1];
+                        dst.Vector3f32[2] = normed[2];
+                        return;
+                    },
+                    .Vector2f32 => |src_vec| {
+                        const normed = zm.normalize2(zm.f32x4(src_vec[0], src_vec[1], 0.0, 0.0));
+                        dst.Vector2f32[0] = normed[0];
+                        dst.Vector2f32[1] = normed[1];
+                        return;
+                    },
                     else => {},
                 }
             }
 
+            var sum: std.meta.Float(bits) = 0.0;
+
             switch (src.*) {
                 .Float => {
                     const s_field = try getValuePrimitiveField(.Float, bits, src);
@@ -276,34 +311,17 @@ fn opNormalize(_: std.mem.Allocator, target_type_id: SpvWord, id: SpvWord, _: Sp
                     const s_field = try getValuePrimitiveField(.Float, bits, s_lane);
                     sum += s_field.*;
                 },
-                .Vector4f32, .Vector3f32, .Vector2f32 => {},
                 else => return RuntimeError.InvalidSpirV,
             }
 
             sum = @sqrt(sum);
 
-            if (bits == 32) {
-                switch (dst.*) {
-                    .Vector4f32 => |*dst_vec| inline for (0..4) |i| {
-                        dst_vec[i] = src.Vector4f32[i] / sum;
-                    },
-                    .Vector3f32 => |*dst_vec| inline for (0..3) |i| {
-                        dst_vec[i] = src.Vector3f32[i] / sum;
-                    },
-                    .Vector2f32 => |*dst_vec| inline for (0..2) |i| {
-                        dst_vec[i] = src.Vector2f32[i] / sum;
-                    },
-                    else => {},
-                }
-            }
-
             switch (dst.*) {
                 .Vector => |dst_vec| for (dst_vec, src.Vector) |*d_lane, *s_lane| {
                     const d_field = try getValuePrimitiveField(.Float, bits, d_lane);
                     const s_field = try getValuePrimitiveField(.Float, bits, s_lane);
                     d_field.* = s_field.* / sum;
                 },
-                .Vector4f32, .Vector3f32, .Vector2f32 => {},
                 else => return RuntimeError.InvalidSpirV,
             }
         },
diff --git a/src/lib.zig b/src/lib.zig
index c5c96cd..1d54d92 100644
--- a/src/lib.zig
+++ b/src/lib.zig
@@ -1,4 +1,4 @@
-//! A small footprint SPIR-V interpreter with zero dependencies to execute SPIR-V shaders on the CPU. It is designed to be used with multiple runtimes concurrently.
+//! A small-footprint SPIR-V interpreter to execute SPIR-V shaders on the CPU. It is designed to be used with multiple runtimes concurrently.
 //!
 //! ```zig
 //! const std = @import("std");
diff --git a/src/opcodes.zig b/src/opcodes.zig
index 0009286..cb0c19a 100644
--- a/src/opcodes.zig
+++ b/src/opcodes.zig
@@ -1,5 +1,6 @@
 const std = @import("std");
 const spv = @import("spv.zig");
+const zm = @import("zmath");
 
 const GLSL_std_450 = @import("GLSL_std_450/opcodes.zig");
 
@@ -1141,14 +1142,14 @@ fn opDot(_: std.mem.Allocator, _: SpvWord, rt: *Runtime) RuntimeError!void {
                 else => return RuntimeError.InvalidSpirV,
             }
         },
-        .Vector4f32 => |*vec| inline for (0..4) |i| {
-            value.Float.float32 += vec[i] * op2_value.Vector4f32[i];
+        .Vector4f32 => |vec| value.Float.float32 = zm.dot4(vec, op2_value.Vector4f32)[0],
+        .Vector3f32 => |vec| {
+            const op2_vec = op2_value.Vector3f32;
+            value.Float.float32 = zm.dot3(zm.f32x4(vec[0], vec[1], vec[2], 0.0), zm.f32x4(op2_vec[0], op2_vec[1], op2_vec[2], 0.0))[0];
         },
-        .Vector3f32 => |*vec| inline for (0..3) |i| {
-            value.Float.float32 += vec[i] * op2_value.Vector3f32[i];
-        },
-        .Vector2f32 => |*vec| inline for (0..2) |i| {
-            value.Float.float32 += vec[i] * op2_value.Vector2f32[i];
+        .Vector2f32 => |vec| {
+            const op2_vec = op2_value.Vector2f32;
+            value.Float.float32 = zm.dot2(zm.f32x4(vec[0], vec[1], 0.0, 0.0), zm.f32x4(op2_vec[0], op2_vec[1], 0.0, 0.0))[0];
         },
         else => return RuntimeError.InvalidSpirV,
     }