Add itruncf/fconverti fast path.

losfair
2019-11-07 01:11:25 +08:00
parent 015491ea77
commit 6135a004a4
3 changed files with 833 additions and 349 deletions
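
For context, the pattern this commit introduces is small: the Emitter trait gains capability probes (arch_has_itruncf, arch_has_fconverti) that default to false, plus per-conversion hooks (arch_emit_i32_trunc_sf32, arch_emit_f64_convert_ui64, and so on) that default to unimplemented!(). For each float/int conversion opcode, the singlepass code generator (X64FunctionCode) checks the probe and either calls the single-instruction hook (on AArch64: fcvtzs/fcvtzu for truncation, scvtf/ucvtf for conversion) or falls back to the existing multi-instruction emulation. The sketch below only illustrates that dispatch shape; the type and method names (Gpr, Xmm, Aarch64Emitter, emit_i32_trunc_sf32_fallback, gen_i32_trunc_sf32) are simplified stand-ins, not the actual wasmer signatures.

// Simplified sketch of the fast-path dispatch added by this commit.
// All names here are illustrative stand-ins, not the real wasmer API.

#[derive(Clone, Copy)]
struct Gpr(u8);
#[derive(Clone, Copy)]
struct Xmm(u8);

trait Emitter {
    // Capability probe: backends without a native truncation instruction keep the default.
    fn arch_has_itruncf(&self) -> bool {
        false
    }
    // Fast-path hook: only called when arch_has_itruncf() returns true.
    fn arch_emit_i32_trunc_sf32(&mut self, _src: Xmm, _dst: Gpr) {
        unimplemented!()
    }
    // Portable multi-instruction fallback (range checks plus a cvttss2si-style sequence on x86).
    fn emit_i32_trunc_sf32_fallback(&mut self, src: Xmm, dst: Gpr);
}

struct Aarch64Emitter {
    asm: Vec<String>, // stand-in for the dynasm instruction buffer
}

impl Emitter for Aarch64Emitter {
    fn arch_has_itruncf(&self) -> bool {
        true // AArch64 truncates float -> int in a single instruction
    }
    fn arch_emit_i32_trunc_sf32(&mut self, src: Xmm, dst: Gpr) {
        self.asm.push(format!("fcvtzs w{}, s{}", dst.0, src.0));
    }
    fn emit_i32_trunc_sf32_fallback(&mut self, src: Xmm, dst: Gpr) {
        self.asm.push(format!("; emulated trunc s{} -> w{}", src.0, dst.0));
    }
}

// Code-generator side: probe once per opcode and take the short path when available.
fn gen_i32_trunc_sf32<E: Emitter>(e: &mut E, src: Xmm, dst: Gpr) {
    if e.arch_has_itruncf() {
        e.arch_emit_i32_trunc_sf32(src, dst);
    } else {
        e.emit_i32_trunc_sf32_fallback(src, dst);
    }
}

fn main() {
    let mut e = Aarch64Emitter { asm: Vec::new() };
    gen_i32_trunc_sf32(&mut e, Xmm(0), Gpr(1));
    assert_eq!(e.asm, vec!["fcvtzs w1, s0".to_string()]);
}

The per-opcode if/else blocks in the first file below repeat this shape for every I32/I64 truncation and F32/F64 conversion operator; the third file supplies the AArch64 implementations of the hooks.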


@@ -3277,9 +3277,32 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
false,
)[0];
self.value_stack.push(ret);
if a.arch_has_itruncf() {
let tmp_out = self.machine.acquire_temp_gpr().unwrap();
let tmp_in = self.machine.acquire_temp_xmm().unwrap();
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S32,
loc,
Location::XMM(tmp_in),
);
a.arch_emit_i32_trunc_uf32(tmp_in, tmp_out);
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S32,
Location::GPR(tmp_out),
ret,
);
self.machine.release_temp_xmm(tmp_in);
self.machine.release_temp_gpr(tmp_out);
} else {
let tmp_out = self.machine.acquire_temp_gpr().unwrap();
let tmp_in = self.machine.acquire_temp_xmm().unwrap();
Self::emit_relaxed_binop(
a,
&mut self.machine,
@@ -3296,6 +3319,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
self.machine.release_temp_xmm(tmp_in);
self.machine.release_temp_gpr(tmp_out);
}
}
Operator::I32TruncSF32 => {
let loc =
@@ -3306,6 +3330,30 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
false,
)[0];
self.value_stack.push(ret);
if a.arch_has_itruncf() {
let tmp_out = self.machine.acquire_temp_gpr().unwrap();
let tmp_in = self.machine.acquire_temp_xmm().unwrap();
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S32,
loc,
Location::XMM(tmp_in),
);
a.arch_emit_i32_trunc_sf32(tmp_in, tmp_out);
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S32,
Location::GPR(tmp_out),
ret,
);
self.machine.release_temp_xmm(tmp_in);
self.machine.release_temp_gpr(tmp_out);
} else {
let tmp_out = self.machine.acquire_temp_gpr().unwrap();
let tmp_in = self.machine.acquire_temp_xmm().unwrap();
@@ -3331,6 +3379,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
self.machine.release_temp_xmm(tmp_in);
self.machine.release_temp_gpr(tmp_out);
}
}
Operator::I64TruncSF32 => {
let loc =
@@ -3341,6 +3390,30 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
false,
)[0];
self.value_stack.push(ret);
if a.arch_has_itruncf() {
let tmp_out = self.machine.acquire_temp_gpr().unwrap();
let tmp_in = self.machine.acquire_temp_xmm().unwrap();
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S32,
loc,
Location::XMM(tmp_in),
);
a.arch_emit_i64_trunc_sf32(tmp_in, tmp_out);
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S64,
Location::GPR(tmp_out),
ret,
);
self.machine.release_temp_xmm(tmp_in);
self.machine.release_temp_gpr(tmp_out);
} else {
let tmp_out = self.machine.acquire_temp_gpr().unwrap();
let tmp_in = self.machine.acquire_temp_xmm().unwrap();
@@ -3365,23 +3438,9 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
self.machine.release_temp_xmm(tmp_in);
self.machine.release_temp_gpr(tmp_out);
}
}
Operator::I64TruncUF32 => {
-/*
-; movq xmm5, r15
-; mov r15d, 1593835520u32 as i32 //float 9.22337203E+18
-; movd xmm1, r15d
-; movd xmm2, Rd(reg as u8)
-; movd xmm3, Rd(reg as u8)
-; subss xmm2, xmm1
-; cvttss2si Rq(reg as u8), xmm2
-; mov r15, QWORD 0x8000000000000000u64 as i64
-; xor r15, Rq(reg as u8)
-; cvttss2si Rq(reg as u8), xmm3
-; ucomiss xmm3, xmm1
-; cmovae Rq(reg as u8), r15
-; movq r15, xmm5
-*/
let loc =
get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
let ret = self.machine.acquire_locations(
@@ -3390,6 +3449,30 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
false,
)[0];
self.value_stack.push(ret);
if a.arch_has_itruncf() {
let tmp_out = self.machine.acquire_temp_gpr().unwrap();
let tmp_in = self.machine.acquire_temp_xmm().unwrap();
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S32,
loc,
Location::XMM(tmp_in),
);
a.arch_emit_i64_trunc_uf32(tmp_in, tmp_out);
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S64,
Location::GPR(tmp_out),
ret,
);
self.machine.release_temp_xmm(tmp_in);
self.machine.release_temp_gpr(tmp_out);
} else {
let tmp_out = self.machine.acquire_temp_gpr().unwrap();
let tmp_in = self.machine.acquire_temp_xmm().unwrap(); // xmm2
@@ -3439,6 +3522,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
self.machine.release_temp_xmm(tmp_in);
self.machine.release_temp_gpr(tmp_out);
}
}
Operator::I32TruncUF64 => {
let loc =
@@ -3449,6 +3533,30 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
false,
)[0];
self.value_stack.push(ret);
if a.arch_has_itruncf() {
let tmp_out = self.machine.acquire_temp_gpr().unwrap();
let tmp_in = self.machine.acquire_temp_xmm().unwrap();
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S64,
loc,
Location::XMM(tmp_in),
);
a.arch_emit_i32_trunc_uf64(tmp_in, tmp_out);
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S32,
Location::GPR(tmp_out),
ret,
);
self.machine.release_temp_xmm(tmp_in);
self.machine.release_temp_gpr(tmp_out);
} else {
let tmp_out = self.machine.acquire_temp_gpr().unwrap();
let tmp_in = self.machine.acquire_temp_xmm().unwrap();
@@ -3468,6 +3576,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
self.machine.release_temp_xmm(tmp_in);
self.machine.release_temp_gpr(tmp_out);
}
}
Operator::I32TruncSF64 => {
let loc =
@@ -3478,6 +3587,30 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
false,
)[0];
self.value_stack.push(ret);
if a.arch_has_itruncf() {
let tmp_out = self.machine.acquire_temp_gpr().unwrap();
let tmp_in = self.machine.acquire_temp_xmm().unwrap();
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S64,
loc,
Location::XMM(tmp_in),
);
a.arch_emit_i32_trunc_sf64(tmp_in, tmp_out);
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S32,
Location::GPR(tmp_out),
ret,
);
self.machine.release_temp_xmm(tmp_in);
self.machine.release_temp_gpr(tmp_out);
} else {
let tmp_out = self.machine.acquire_temp_gpr().unwrap();
let tmp_in = self.machine.acquire_temp_xmm().unwrap();
@@ -3508,6 +3641,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
self.machine.release_temp_xmm(tmp_in);
self.machine.release_temp_gpr(tmp_out);
}
}
Operator::I64TruncSF64 => {
let loc =
@@ -3518,6 +3652,30 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
false,
)[0];
self.value_stack.push(ret);
if a.arch_has_itruncf() {
let tmp_out = self.machine.acquire_temp_gpr().unwrap();
let tmp_in = self.machine.acquire_temp_xmm().unwrap();
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S64,
loc,
Location::XMM(tmp_in),
);
a.arch_emit_i64_trunc_sf64(tmp_in, tmp_out);
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S64,
Location::GPR(tmp_out),
ret,
);
self.machine.release_temp_xmm(tmp_in);
self.machine.release_temp_gpr(tmp_out);
} else {
let tmp_out = self.machine.acquire_temp_gpr().unwrap();
let tmp_in = self.machine.acquire_temp_xmm().unwrap();
@@ -3543,6 +3701,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
self.machine.release_temp_xmm(tmp_in);
self.machine.release_temp_gpr(tmp_out);
}
}
Operator::I64TruncUF64 => {
let loc =
@@ -3553,6 +3712,30 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
false,
)[0];
self.value_stack.push(ret);
if a.arch_has_itruncf() {
let tmp_out = self.machine.acquire_temp_gpr().unwrap();
let tmp_in = self.machine.acquire_temp_xmm().unwrap();
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S64,
loc,
Location::XMM(tmp_in),
);
a.arch_emit_i64_trunc_uf64(tmp_in, tmp_out);
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S64,
Location::GPR(tmp_out),
ret,
);
self.machine.release_temp_xmm(tmp_in);
self.machine.release_temp_gpr(tmp_out);
} else {
let tmp_out = self.machine.acquire_temp_gpr().unwrap();
let tmp_in = self.machine.acquire_temp_xmm().unwrap(); // xmm2
@@ -3602,6 +3785,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
self.machine.release_temp_xmm(tmp_in);
self.machine.release_temp_gpr(tmp_out);
}
}
Operator::F32ConvertSI32 => {
let loc =
@@ -3612,6 +3796,30 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
false,
)[0];
self.value_stack.push(ret);
if a.arch_has_fconverti() {
let tmp_out = self.machine.acquire_temp_xmm().unwrap();
let tmp_in = self.machine.acquire_temp_gpr().unwrap();
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S32,
loc,
Location::GPR(tmp_in),
);
a.arch_emit_f32_convert_si32(tmp_in, tmp_out);
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S32,
Location::XMM(tmp_out),
ret,
);
self.machine.release_temp_gpr(tmp_in);
self.machine.release_temp_xmm(tmp_out);
} else {
let tmp_out = self.machine.acquire_temp_xmm().unwrap();
let tmp_in = self.machine.acquire_temp_gpr().unwrap();
@@ -3622,6 +3830,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
self.machine.release_temp_gpr(tmp_in);
self.machine.release_temp_xmm(tmp_out);
}
}
Operator::F32ConvertUI32 => {
let loc =
get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
@@ -3631,6 +3840,29 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
false,
)[0];
self.value_stack.push(ret);
if a.arch_has_fconverti() {
let tmp_out = self.machine.acquire_temp_xmm().unwrap();
let tmp_in = self.machine.acquire_temp_gpr().unwrap();
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S32,
loc,
Location::GPR(tmp_in),
);
a.arch_emit_f32_convert_ui32(tmp_in, tmp_out);
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S32,
Location::XMM(tmp_out),
ret,
);
self.machine.release_temp_gpr(tmp_in);
self.machine.release_temp_xmm(tmp_out);
} else {
let tmp_out = self.machine.acquire_temp_xmm().unwrap();
let tmp_in = self.machine.acquire_temp_gpr().unwrap();
@@ -3641,6 +3873,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
self.machine.release_temp_gpr(tmp_in);
self.machine.release_temp_xmm(tmp_out);
}
}
Operator::F32ConvertSI64 => {
let loc =
get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
@@ -3650,6 +3883,29 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
false,
)[0];
self.value_stack.push(ret);
if a.arch_has_fconverti() {
let tmp_out = self.machine.acquire_temp_xmm().unwrap();
let tmp_in = self.machine.acquire_temp_gpr().unwrap();
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S64,
loc,
Location::GPR(tmp_in),
);
a.arch_emit_f32_convert_si64(tmp_in, tmp_out);
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S32,
Location::XMM(tmp_out),
ret,
);
self.machine.release_temp_gpr(tmp_in);
self.machine.release_temp_xmm(tmp_out);
} else {
let tmp_out = self.machine.acquire_temp_xmm().unwrap();
let tmp_in = self.machine.acquire_temp_gpr().unwrap();
@@ -3660,6 +3916,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
self.machine.release_temp_gpr(tmp_in);
self.machine.release_temp_xmm(tmp_out);
}
}
Operator::F32ConvertUI64 => {
let loc =
get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
@@ -3669,6 +3926,29 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
false,
)[0];
self.value_stack.push(ret);
if a.arch_has_fconverti() {
let tmp_out = self.machine.acquire_temp_xmm().unwrap();
let tmp_in = self.machine.acquire_temp_gpr().unwrap();
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S64,
loc,
Location::GPR(tmp_in),
);
a.arch_emit_f32_convert_ui64(tmp_in, tmp_out);
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S32,
Location::XMM(tmp_out),
ret,
);
self.machine.release_temp_gpr(tmp_in);
self.machine.release_temp_xmm(tmp_out);
} else {
let tmp_out = self.machine.acquire_temp_xmm().unwrap();
let tmp_in = self.machine.acquire_temp_gpr().unwrap();
let tmp = self.machine.acquire_temp_gpr().unwrap();
@@ -3695,6 +3975,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
self.machine.release_temp_gpr(tmp_in);
self.machine.release_temp_xmm(tmp_out);
}
}
Operator::F64ConvertSI32 => {
let loc =
@@ -3705,6 +3986,30 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
false,
)[0];
self.value_stack.push(ret);
if a.arch_has_fconverti() {
let tmp_out = self.machine.acquire_temp_xmm().unwrap();
let tmp_in = self.machine.acquire_temp_gpr().unwrap();
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S32,
loc,
Location::GPR(tmp_in),
);
a.arch_emit_f64_convert_si32(tmp_in, tmp_out);
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S64,
Location::XMM(tmp_out),
ret,
);
self.machine.release_temp_gpr(tmp_in);
self.machine.release_temp_xmm(tmp_out);
} else {
let tmp_out = self.machine.acquire_temp_xmm().unwrap();
let tmp_in = self.machine.acquire_temp_gpr().unwrap();
@@ -3715,6 +4020,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
self.machine.release_temp_gpr(tmp_in);
self.machine.release_temp_xmm(tmp_out);
}
}
Operator::F64ConvertUI32 => {
let loc =
get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
@@ -3724,6 +4030,30 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
false,
)[0];
self.value_stack.push(ret);
if a.arch_has_fconverti() {
let tmp_out = self.machine.acquire_temp_xmm().unwrap();
let tmp_in = self.machine.acquire_temp_gpr().unwrap();
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S32,
loc,
Location::GPR(tmp_in),
);
a.arch_emit_f64_convert_ui32(tmp_in, tmp_out);
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S64,
Location::XMM(tmp_out),
ret,
);
self.machine.release_temp_gpr(tmp_in);
self.machine.release_temp_xmm(tmp_out);
} else {
let tmp_out = self.machine.acquire_temp_xmm().unwrap();
let tmp_in = self.machine.acquire_temp_gpr().unwrap();
@@ -3734,6 +4064,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
self.machine.release_temp_gpr(tmp_in);
self.machine.release_temp_xmm(tmp_out);
}
}
Operator::F64ConvertSI64 => {
let loc =
get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
@@ -3743,6 +4074,30 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
false,
)[0];
self.value_stack.push(ret);
if a.arch_has_fconverti() {
let tmp_out = self.machine.acquire_temp_xmm().unwrap();
let tmp_in = self.machine.acquire_temp_gpr().unwrap();
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S64,
loc,
Location::GPR(tmp_in),
);
a.arch_emit_f64_convert_si64(tmp_in, tmp_out);
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S64,
Location::XMM(tmp_out),
ret,
);
self.machine.release_temp_gpr(tmp_in);
self.machine.release_temp_xmm(tmp_out);
} else {
let tmp_out = self.machine.acquire_temp_xmm().unwrap();
let tmp_in = self.machine.acquire_temp_gpr().unwrap();
@@ -3753,6 +4108,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
self.machine.release_temp_gpr(tmp_in);
self.machine.release_temp_xmm(tmp_out);
}
}
Operator::F64ConvertUI64 => {
let loc =
get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
@@ -3762,6 +4118,30 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
false,
)[0];
self.value_stack.push(ret);
if a.arch_has_fconverti() {
let tmp_out = self.machine.acquire_temp_xmm().unwrap();
let tmp_in = self.machine.acquire_temp_gpr().unwrap();
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S64,
loc,
Location::GPR(tmp_in),
);
a.arch_emit_f64_convert_ui64(tmp_in, tmp_out);
Self::emit_relaxed_binop(
a,
&mut self.machine,
Assembler::emit_mov,
Size::S64,
Location::XMM(tmp_out),
ret,
);
self.machine.release_temp_gpr(tmp_in);
self.machine.release_temp_xmm(tmp_out);
} else {
let tmp_out = self.machine.acquire_temp_xmm().unwrap();
let tmp_in = self.machine.acquire_temp_gpr().unwrap();
let tmp = self.machine.acquire_temp_gpr().unwrap();
@@ -3788,6 +4168,7 @@ impl FunctionCodeGenerator<CodegenError> for X64FunctionCode {
self.machine.release_temp_gpr(tmp_in);
self.machine.release_temp_xmm(tmp_out);
}
}
Operator::Call { function_index } => {
let function_index = function_index as usize;


@@ -173,6 +173,26 @@ pub trait Emitter {
fn emit_homomorphic_host_redirection(&mut self, target: GPR);
fn emit_inline_breakpoint(&mut self, ty: InlineBreakpointType);
fn arch_has_itruncf(&self) -> bool { false }
fn arch_emit_i32_trunc_sf32(&mut self, _src: XMM, _dst: GPR) { unimplemented!() }
fn arch_emit_i32_trunc_sf64(&mut self, _src: XMM, _dst: GPR) { unimplemented!() }
fn arch_emit_i32_trunc_uf32(&mut self, _src: XMM, _dst: GPR) { unimplemented!() }
fn arch_emit_i32_trunc_uf64(&mut self, _src: XMM, _dst: GPR) { unimplemented!() }
fn arch_emit_i64_trunc_sf32(&mut self, _src: XMM, _dst: GPR) { unimplemented!() }
fn arch_emit_i64_trunc_sf64(&mut self, _src: XMM, _dst: GPR) { unimplemented!() }
fn arch_emit_i64_trunc_uf32(&mut self, _src: XMM, _dst: GPR) { unimplemented!() }
fn arch_emit_i64_trunc_uf64(&mut self, _src: XMM, _dst: GPR) { unimplemented!() }
fn arch_has_fconverti(&self) -> bool { false }
fn arch_emit_f32_convert_si32(&mut self, _src: GPR, _dst: XMM) { unimplemented!() }
fn arch_emit_f32_convert_si64(&mut self, _src: GPR, _dst: XMM) { unimplemented!() }
fn arch_emit_f32_convert_ui32(&mut self, _src: GPR, _dst: XMM) { unimplemented!() }
fn arch_emit_f32_convert_ui64(&mut self, _src: GPR, _dst: XMM) { unimplemented!() }
fn arch_emit_f64_convert_si32(&mut self, _src: GPR, _dst: XMM) { unimplemented!() }
fn arch_emit_f64_convert_si64(&mut self, _src: GPR, _dst: XMM) { unimplemented!() }
fn arch_emit_f64_convert_ui32(&mut self, _src: GPR, _dst: XMM) { unimplemented!() }
fn arch_emit_f64_convert_ui64(&mut self, _src: GPR, _dst: XMM) { unimplemented!() }
}
fn _dummy(a: &mut Assembler) {


@@ -400,6 +400,57 @@ macro_rules! avx_fn {
}
}
macro_rules! avx_fn_bitwise_inv {
($ins:ident, $width:ident, $width_int:ident, $name:ident) => {
fn $name(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) {
match src2 {
XMMOrMemory::XMM(src2) => dynasm!(self ; $ins $width(map_xmm(dst).v()), $width(map_xmm(src1).v()), $width(map_xmm(src2).v())),
XMMOrMemory::Memory(base, disp) => {
if disp >= 0 {
dynasm!(self ; b >after ; disp: ; .dword disp ; after: ; ldr w_tmp3, <disp ; add x_tmp3, x_tmp3, X(map_gpr(base).x()));
} else {
dynasm!(self ; b >after ; disp: ; .dword -disp ; after: ; ldr w_tmp3, <disp ; sub x_tmp3, X(map_gpr(base).x()), x_tmp3);
}
dynasm!(self
; ldr $width_int(X_TMP1), [x_tmp3]
; mov v_tmp1.$width[0], $width_int(X_TMP1)
; $ins $width(map_xmm(dst).v()), $width(map_xmm(src1).v()), $width(V_TMP1)
);
}
}
dynasm!(self
; mov $width_int(X_TMP1), V(map_xmm(dst).v()).$width[0]
; mvn $width_int(X_TMP1), $width_int(X_TMP1)
; mov V(map_xmm(dst).v()).$width[0], $width_int(X_TMP1)
);
}
}
}
macro_rules! avx_fn_reversed {
($ins:ident, $width:ident, $width_int:ident, $name:ident) => {
fn $name(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) {
match src2 {
XMMOrMemory::XMM(src2) => dynasm!(self ; $ins $width(map_xmm(dst).v()), $width(map_xmm(src2).v()), $width(map_xmm(src1).v())),
XMMOrMemory::Memory(base, disp) => {
if disp >= 0 {
dynasm!(self ; b >after ; disp: ; .dword disp ; after: ; ldr w_tmp3, <disp ; add x_tmp3, x_tmp3, X(map_gpr(base).x()));
} else {
dynasm!(self ; b >after ; disp: ; .dword -disp ; after: ; ldr w_tmp3, <disp ; sub x_tmp3, X(map_gpr(base).x()), x_tmp3);
}
dynasm!(self
; ldr $width_int(X_TMP1), [x_tmp3]
; mov v_tmp1.$width[0], $width_int(X_TMP1)
; $ins $width(map_xmm(dst).v()), $width(V_TMP1), $width(map_xmm(src1).v())
);
}
}
}
}
}
macro_rules! avx_fn_unop {
($ins:ident, $width:ident, $name:ident) => {
fn $name(&mut self, src1: XMM, _src2: XMMOrMemory, dst: XMM) {
@@ -1312,20 +1363,6 @@ impl Emitter for Assembler {
}
}
-// TODO: These instructions are only used in FP opcodes. Implement later.
-fn emit_btc_gpr_imm8_32(&mut self, src: u8, dst: GPR) {
-dynasm!(self ; .dword 0 ; .dword 29)
-}
-fn emit_btc_gpr_imm8_64(&mut self, src: u8, dst: GPR) {
-dynasm!(self ; .dword 0 ; .dword 29)
-}
-fn emit_cmovae_gpr_32(&mut self, src: GPR, dst: GPR) {
-dynasm!(self ; .dword 0 ; .dword 29)
-}
-fn emit_cmovae_gpr_64(&mut self, src: GPR, dst: GPR) {
-dynasm!(self ; .dword 0 ; .dword 29)
-}
avx_fn!(fadd, S, W, emit_vaddss);
avx_fn!(fsub, S, W, emit_vsubss);
avx_fn!(fmul, S, W, emit_vmulss);
@@ -1333,8 +1370,11 @@ impl Emitter for Assembler {
avx_fn!(fmax, S, W, emit_vmaxss);
avx_fn!(fmin, S, W, emit_vminss);
avx_fn!(fcmgt, S, W, emit_vcmpgtss);
avx_fn_reversed!(fcmgt, S, W, emit_vcmpltss); // b gt a <=> a lt b
avx_fn!(fcmge, S, W, emit_vcmpgess);
avx_fn_bitwise_inv!(fcmgt, S, W, emit_vcmpless); // a not gt b <=> a le b
avx_fn!(fcmeq, S, W, emit_vcmpeqss);
avx_fn_bitwise_inv!(fcmeq, S, W, emit_vcmpneqss); // a not eq b <=> a neq b
avx_fn_unop!(fsqrt, S, emit_vsqrtss);
avx_fn_unop!(frintn, S, emit_vroundss_nearest); // to nearest with ties to even
avx_fn_unop!(frintm, S, emit_vroundss_floor); // toward minus infinity
@@ -1349,8 +1389,11 @@ impl Emitter for Assembler {
avx_fn!(fmax, D, X, emit_vmaxsd);
avx_fn!(fmin, D, X, emit_vminsd);
avx_fn!(fcmgt, D, X, emit_vcmpgtsd);
avx_fn_reversed!(fcmgt, D, X, emit_vcmpltsd); // b gt a <=> a lt b
avx_fn!(fcmge, D, X, emit_vcmpgesd);
avx_fn_bitwise_inv!(fcmgt, D, X, emit_vcmplesd); // a not gt b <=> a le b
avx_fn!(fcmeq, D, X, emit_vcmpeqsd);
avx_fn_bitwise_inv!(fcmeq, D, X, emit_vcmpneqsd); // a not eq b <=> a neq b
avx_fn_unop!(fsqrt, D, emit_vsqrtsd);
avx_fn_unop!(frintn, D, emit_vroundsd_nearest); // to nearest with ties to even
avx_fn_unop!(frintm, D, emit_vroundsd_floor); // toward minus infinity
@@ -1358,63 +1401,103 @@ impl Emitter for Assembler {
avx_fn_unop!(frintz, D, emit_vroundsd_trunc); // toward zero
avx_fn_cvt!(fcvt, D, S, emit_vcvtsd2ss);
-fn emit_vcmpneqss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) {
-dynasm!(self ; .dword 0 ; .dword 29)
fn arch_has_itruncf(&self) -> bool { true }
fn arch_emit_i32_trunc_sf32(&mut self, src: XMM, dst: GPR) {
dynasm!(self ; fcvtzs W(map_gpr(dst).x()), S(map_xmm(src).v()));
}
-fn emit_vcmpneqsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) {
-dynasm!(self ; .dword 0 ; .dword 29)
fn arch_emit_i32_trunc_sf64(&mut self, src: XMM, dst: GPR) {
dynasm!(self ; fcvtzs W(map_gpr(dst).x()), D(map_xmm(src).v()));
}
fn arch_emit_i32_trunc_uf32(&mut self, src: XMM, dst: GPR) {
dynasm!(self ; fcvtzu W(map_gpr(dst).x()), S(map_xmm(src).v()));
}
fn arch_emit_i32_trunc_uf64(&mut self, src: XMM, dst: GPR) {
dynasm!(self ; fcvtzu W(map_gpr(dst).x()), D(map_xmm(src).v()));
}
fn arch_emit_i64_trunc_sf32(&mut self, src: XMM, dst: GPR) {
dynasm!(self ; fcvtzs X(map_gpr(dst).x()), S(map_xmm(src).v()));
}
fn arch_emit_i64_trunc_sf64(&mut self, src: XMM, dst: GPR) {
dynasm!(self ; fcvtzs X(map_gpr(dst).x()), D(map_xmm(src).v()));
}
fn arch_emit_i64_trunc_uf32(&mut self, src: XMM, dst: GPR) {
dynasm!(self ; fcvtzu X(map_gpr(dst).x()), S(map_xmm(src).v()));
}
fn arch_emit_i64_trunc_uf64(&mut self, src: XMM, dst: GPR) {
dynasm!(self ; fcvtzu X(map_gpr(dst).x()), D(map_xmm(src).v()));
}
-fn emit_vcmpltss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) {
-dynasm!(self ; .dword 0 ; .dword 29)
fn arch_has_fconverti(&self) -> bool { true }
fn arch_emit_f32_convert_si32(&mut self, src: GPR, dst: XMM) {
dynasm!(self ; scvtf S(map_xmm(dst).v()), W(map_gpr(src).x()));
}
-fn emit_vcmpltsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) {
-dynasm!(self ; .dword 0 ; .dword 29)
fn arch_emit_f32_convert_si64(&mut self, src: GPR, dst: XMM) {
dynasm!(self ; scvtf S(map_xmm(dst).v()), X(map_gpr(src).x()));
}
fn arch_emit_f32_convert_ui32(&mut self, src: GPR, dst: XMM) {
dynasm!(self ; ucvtf S(map_xmm(dst).v()), W(map_gpr(src).x()));
}
fn arch_emit_f32_convert_ui64(&mut self, src: GPR, dst: XMM) {
dynasm!(self ; ucvtf S(map_xmm(dst).v()), X(map_gpr(src).x()));
}
fn arch_emit_f64_convert_si32(&mut self, src: GPR, dst: XMM) {
dynasm!(self ; scvtf D(map_xmm(dst).v()), W(map_gpr(src).x()));
}
fn arch_emit_f64_convert_si64(&mut self, src: GPR, dst: XMM) {
dynasm!(self ; scvtf D(map_xmm(dst).v()), X(map_gpr(src).x()));
}
fn arch_emit_f64_convert_ui32(&mut self, src: GPR, dst: XMM) {
dynasm!(self ; ucvtf D(map_xmm(dst).v()), W(map_gpr(src).x()));
}
fn arch_emit_f64_convert_ui64(&mut self, src: GPR, dst: XMM) {
dynasm!(self ; ucvtf D(map_xmm(dst).v()), X(map_gpr(src).x()));
}
-fn emit_vcmpless(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) {
-dynasm!(self ; .dword 0 ; .dword 29)
// These instructions are only used in itruncf-type/fconverti-type opcodes.
fn emit_btc_gpr_imm8_32(&mut self, src: u8, dst: GPR) {
unimplemented!();
}
-fn emit_vcmplesd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) {
-dynasm!(self ; .dword 0 ; .dword 29)
fn emit_btc_gpr_imm8_64(&mut self, src: u8, dst: GPR) {
unimplemented!();
}
fn emit_cmovae_gpr_32(&mut self, src: GPR, dst: GPR) {
unimplemented!();
}
fn emit_cmovae_gpr_64(&mut self, src: GPR, dst: GPR) {
unimplemented!();
}
fn emit_ucomiss(&mut self, src: XMMOrMemory, dst: XMM) {
-dynasm!(self ; .dword 0 ; .dword 29)
unimplemented!();
}
fn emit_ucomisd(&mut self, src: XMMOrMemory, dst: XMM) {
-dynasm!(self ; .dword 0 ; .dword 29)
unimplemented!();
}
fn emit_cvttss2si_32(&mut self, src: XMMOrMemory, dst: GPR) {
-dynasm!(self ; .dword 0 ; .dword 29)
unimplemented!();
}
fn emit_cvttss2si_64(&mut self, src: XMMOrMemory, dst: GPR) {
-dynasm!(self ; .dword 0 ; .dword 29)
unimplemented!();
}
fn emit_cvttsd2si_32(&mut self, src: XMMOrMemory, dst: GPR) {
-dynasm!(self ; .dword 0 ; .dword 29)
unimplemented!();
}
fn emit_cvttsd2si_64(&mut self, src: XMMOrMemory, dst: GPR) {
-dynasm!(self ; .dword 0 ; .dword 29)
unimplemented!();
}
fn emit_vcvtsi2ss_32(&mut self, src1: XMM, src2: GPROrMemory, dst: XMM) {
-dynasm!(self ; .dword 0 ; .dword 29)
unimplemented!();
}
fn emit_vcvtsi2ss_64(&mut self, src1: XMM, src2: GPROrMemory, dst: XMM) {
-dynasm!(self ; .dword 0 ; .dword 29)
unimplemented!();
}
fn emit_vcvtsi2sd_32(&mut self, src1: XMM, src2: GPROrMemory, dst: XMM) {
-dynasm!(self ; .dword 0 ; .dword 29)
unimplemented!();
}
fn emit_vcvtsi2sd_64(&mut self, src1: XMM, src2: GPROrMemory, dst: XMM) {
-dynasm!(self ; .dword 0 ; .dword 29)
unimplemented!();
}
fn emit_test_gpr_64(&mut self, reg: GPR) {
-dynasm!(self ; .dword 0 ; .dword 29)
unimplemented!();
}
fn emit_ud2(&mut self) {