diff --git a/lib/dynasm-backend/src/codegen_x64.rs b/lib/dynasm-backend/src/codegen_x64.rs
index 9ed103991..386f0ce52 100644
--- a/lib/dynasm-backend/src/codegen_x64.rs
+++ b/lib/dynasm-backend/src/codegen_x64.rs
@@ -525,6 +525,7 @@ impl X64FunctionCode {
             (_, Location::Imm32(_)) | (_, Location::Imm64(_)) => RelaxMode::DstToGPR,
             (Location::Imm64(_), Location::Memory(_, _)) => RelaxMode::SrcToGPR,
             (Location::Imm64(_), Location::GPR(_)) if (op as *const u8 != Assembler::emit_mov as *const u8) => RelaxMode::SrcToGPR,
+            (_, Location::XMM(_)) => RelaxMode::SrcToGPR,
             _ if (op as *const u8 == Assembler::emit_imul as *const u8) => RelaxMode::BothToGPR, // TODO: optimize this
             _ => RelaxMode::Direct,
         };
@@ -563,6 +564,75 @@ impl X64FunctionCode {
         }
     }
 
+    fn emit_relaxed_avx(
+        a: &mut Assembler,
+        m: &mut Machine,
+        op: fn(&mut Assembler, XMM, XMMOrMemory, XMM),
+        src1: Location,
+        src2: Location,
+        dst: Location,
+    ) {
+        let tmp1 = m.acquire_temp_xmm().unwrap();
+        let tmp2 = m.acquire_temp_xmm().unwrap();
+        let tmp3 = m.acquire_temp_xmm().unwrap();
+        let tmpg = m.acquire_temp_gpr().unwrap();
+
+        let src1 = match src1 {
+            Location::XMM(x) => x,
+            Location::GPR(_) | Location::Memory(_, _) => {
+                a.emit_mov(Size::S64, src1, Location::XMM(tmp1));
+                tmp1
+            }
+            Location::Imm32(_) => {
+                a.emit_mov(Size::S32, src1, Location::GPR(tmpg));
+                a.emit_mov(Size::S32, Location::GPR(tmpg), Location::XMM(tmp1));
+                tmp1
+            }
+            Location::Imm64(_) => {
+                a.emit_mov(Size::S64, src1, Location::GPR(tmpg));
+                a.emit_mov(Size::S64, Location::GPR(tmpg), Location::XMM(tmp1));
+                tmp1
+            }
+            _ => unreachable!()
+        };
+
+        let src2 = match src2 {
+            Location::XMM(x) => XMMOrMemory::XMM(x),
+            Location::Memory(base, disp) => XMMOrMemory::Memory(base, disp),
+            Location::GPR(_) => {
+                a.emit_mov(Size::S64, src2, Location::XMM(tmp2));
+                XMMOrMemory::XMM(tmp2)
+            }
+            Location::Imm32(_) => {
+                a.emit_mov(Size::S32, src2, Location::GPR(tmpg));
+                a.emit_mov(Size::S32, Location::GPR(tmpg), Location::XMM(tmp2));
+                XMMOrMemory::XMM(tmp2)
+            }
+            Location::Imm64(_) => {
+                a.emit_mov(Size::S64, src2, Location::GPR(tmpg));
+                a.emit_mov(Size::S64, Location::GPR(tmpg), Location::XMM(tmp2));
+                XMMOrMemory::XMM(tmp2)
+            }
+            _ => unreachable!()
+        };
+
+        match dst {
+            Location::XMM(x) => {
+                op(a, src1, src2, x);
+            },
+            Location::Memory(_, _) => {
+                op(a, src1, src2, tmp3);
+                a.emit_mov(Size::S64, Location::XMM(tmp3), dst);
+            },
+            _ => unreachable!(),
+        }
+
+        m.release_temp_gpr(tmpg);
+        m.release_temp_xmm(tmp3);
+        m.release_temp_xmm(tmp2);
+        m.release_temp_xmm(tmp1);
+    }
+
     fn emit_binop_i32(
         a: &mut Assembler,
         m: &mut Machine,
@@ -849,6 +919,33 @@ impl X64FunctionCode {
         value_stack.push((ret, LocalOrTemp::Temp));
     }
 
+    fn emit_fp_binop_avx(
+        a: &mut Assembler,
+        m: &mut Machine,
+        value_stack: &mut Vec<(Location, LocalOrTemp)>,
+        f: fn(&mut Assembler, XMM, XMMOrMemory, XMM),
+    ) {
+        let loc_b = get_location_released(a, m, value_stack.pop().unwrap());
+        let loc_a = get_location_released(a, m, value_stack.pop().unwrap());
+        let ret = m.acquire_locations(a, &[WpType::F64], false)[0];
+        value_stack.push((ret, LocalOrTemp::Temp));
+
+        Self::emit_relaxed_avx(a, m, f, loc_a, loc_b, ret);
+    }
+
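+    /// Unary FP helper: the single operand is passed as both AVX sources, since
+    /// for vsqrt*/vround* the first source only supplies the untouched upper lanes.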
+    fn emit_fp_unop_avx(
+        a: &mut Assembler,
+        m: &mut Machine,
+        value_stack: &mut Vec<(Location, LocalOrTemp)>,
+        f: fn(&mut Assembler, XMM, XMMOrMemory, XMM),
+    ) {
+        let loc = get_location_released(a, m, value_stack.pop().unwrap());
+        let ret = m.acquire_locations(a, &[WpType::F64], false)[0];
+        value_stack.push((ret, LocalOrTemp::Temp));
+
+        Self::emit_relaxed_avx(a, m, f, loc, loc, ret);
+    }
+
     // This function must not use any temporary register before `cb` is called.
     fn emit_call_sysv<I: Iterator<Item = Location>, F: FnOnce(&mut Assembler)>(a: &mut Assembler, m: &mut Machine, cb: F, params: I) {
         let params: Vec<_> = params.collect();
@@ -859,6 +956,15 @@ impl X64FunctionCode {
             a.emit_push(Size::S64, Location::GPR(*r));
         }
 
+        // Save used XMM registers.
+        let used_xmms = m.get_used_xmms();
+        if used_xmms.len() > 0 {
+            a.emit_sub(Size::S64, Location::Imm32((used_xmms.len() * 8) as u32), Location::GPR(GPR::RSP));
+            for (i, r) in used_xmms.iter().enumerate() {
+                a.emit_mov(Size::S64, Location::XMM(*r), Location::Memory(GPR::RSP, (i * 8) as i32));
+            }
+        }
+
         let mut stack_offset: usize = 0;
 
         // Calculate stack offset.
@@ -914,6 +1020,14 @@ impl X64FunctionCode {
             a.emit_add(Size::S64, Location::Imm32(stack_offset as u32), Location::GPR(GPR::RSP));
         }
 
+        // Restore XMMs.
+        if used_xmms.len() > 0 {
+            for (i, r) in used_xmms.iter().enumerate() {
+                a.emit_mov(Size::S64, Location::Memory(GPR::RSP, (i * 8) as i32), Location::XMM(*r));
+            }
+            a.emit_add(Size::S64, Location::Imm32((used_xmms.len() * 8) as u32), Location::GPR(GPR::RSP));
+        }
+
         // Restore GPRs.
         for r in used_gprs.iter().rev() {
             a.emit_pop(Size::S64, Location::GPR(*r));
@@ -1155,7 +1269,7 @@ impl FunctionCodeGenerator for X64FunctionCode {
                 let loc_a = get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
                 let ret = self.machine.acquire_locations(a, &[WpType::I32], false)[0];
                 a.emit_mov(Size::S32, loc_a, Location::GPR(GPR::RAX));
-                a.emit_xor(Size::S32, Location::GPR(GPR::RDX), Location::GPR(GPR::RDX));
+                a.emit_cdq();
                 Self::emit_relaxed_xdiv(a, &mut self.machine, Assembler::emit_idiv, Size::S32, loc_b);
                 a.emit_mov(Size::S32, Location::GPR(GPR::RAX), ret);
                 self.value_stack.push((ret, LocalOrTemp::Temp));
@@ -1232,7 +1346,7 @@ impl FunctionCodeGenerator for X64FunctionCode {
                 let loc_a = get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
                 let ret = self.machine.acquire_locations(a, &[WpType::I64], false)[0];
                 a.emit_mov(Size::S64, loc_a, Location::GPR(GPR::RAX));
-                a.emit_xor(Size::S64, Location::GPR(GPR::RDX), Location::GPR(GPR::RDX));
+                a.emit_cqo();
                 Self::emit_relaxed_xdiv(a, &mut self.machine, Assembler::emit_idiv, Size::S64, loc_b);
                 a.emit_mov(Size::S64, Location::GPR(GPR::RAX), ret);
                 self.value_stack.push((ret, LocalOrTemp::Temp));
@@ -1308,6 +1422,45 @@ impl FunctionCodeGenerator for X64FunctionCode {
                     Size::S32, loc, ret,
                 );
             }
+
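+            // f32/f64 operators, lowered to scalar AVX instructions through the
+            // emit_fp_binop_avx / emit_fp_unop_avx helpers.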
+            Operator::F32Const { value } => self.value_stack.push((Location::Imm32(value.bits()), LocalOrTemp::Temp)),
+            Operator::F32Add => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vaddss),
+            Operator::F32Sub => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vsubss),
+            Operator::F32Mul => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vmulss),
+            Operator::F32Div => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vdivss),
+            Operator::F32Max => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vmaxss),
+            Operator::F32Min => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vminss),
+            Operator::F32Eq => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpeqss),
+            Operator::F32Ne => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpneqss),
+            Operator::F32Lt => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpltss),
+            Operator::F32Le => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpless),
+            Operator::F32Gt => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpgtss),
+            Operator::F32Ge => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpgess),
+            Operator::F32Nearest => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vroundss_nearest),
+            Operator::F32Floor => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vroundss_floor),
+            Operator::F32Ceil => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vroundss_ceil),
+            Operator::F32Trunc => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vroundss_trunc),
+            Operator::F32Sqrt => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vsqrtss),
+
+            Operator::F64Const { value } => self.value_stack.push((Location::Imm64(value.bits()), LocalOrTemp::Temp)),
+            Operator::F64Add => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vaddsd),
+            Operator::F64Sub => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vsubsd),
+            Operator::F64Mul => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vmulsd),
+            Operator::F64Div => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vdivsd),
+            Operator::F64Max => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vmaxsd),
+            Operator::F64Min => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vminsd),
+            Operator::F64Eq => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpeqsd),
+            Operator::F64Ne => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpneqsd),
+            Operator::F64Lt => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpltsd),
+            Operator::F64Le => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmplesd),
+            Operator::F64Gt => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpgtsd),
+            Operator::F64Ge => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpgesd),
+            Operator::F64Nearest => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vroundsd_nearest),
+            Operator::F64Floor => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vroundsd_floor),
+            Operator::F64Ceil => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vroundsd_ceil),
+            Operator::F64Trunc => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vroundsd_trunc),
+            Operator::F64Sqrt => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vsqrtsd),
+
             Operator::Call { function_index } => {
                 let function_index = function_index as usize;
                 let label = self
diff --git a/lib/dynasm-backend/src/emitter_x64.rs b/lib/dynasm-backend/src/emitter_x64.rs
index 3b0763c62..d5a426930 100644
--- a/lib/dynasm-backend/src/emitter_x64.rs
+++ b/lib/dynasm-backend/src/emitter_x64.rs
@@ -69,6 +69,12 @@ pub enum Size {
     S64,
 }
 
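+/// Second-source operand of an AVX instruction: either an XMM register or a
+/// base register + displacement memory reference.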
+#[derive(Copy, Clone, Debug)]
+pub enum XMMOrMemory {
+    XMM(XMM),
+    Memory(GPR, i32),
+}
+
 pub trait Emitter {
     type Label;
     type Offset;
@@ -81,6 +87,8 @@ pub trait Emitter {
     fn emit_mov(&mut self, sz: Size, src: Location, dst: Location);
     fn emit_lea(&mut self, sz: Size, src: Location, dst: Location);
     fn emit_lea_label(&mut self, label: Self::Label, dst: Location);
+    fn emit_cdq(&mut self);
+    fn emit_cqo(&mut self);
     fn emit_xor(&mut self, sz: Size, src: Location, dst: Location);
     fn emit_jmp(&mut self, condition: Condition, label: Self::Label);
     fn emit_jmp_location(&mut self, loc: Location);
@@ -108,6 +116,49 @@ pub trait Emitter {
     fn emit_movzx(&mut self, sz_src: Size, src: Location, sz_dst: Size, dst: Location);
     fn emit_movsx(&mut self, sz_src: Size, src: Location, sz_dst: Size, dst: Location);
 
+    fn emit_vaddss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vaddsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vsubss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vsubsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vmulss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vmulsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vdivss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vdivsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vmaxss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vmaxsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vminss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vminsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+
+    fn emit_vcmpeqss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vcmpeqsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+
+    fn emit_vcmpneqss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vcmpneqsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+
+    fn emit_vcmpltss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vcmpltsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+
+    fn emit_vcmpless(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vcmplesd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+
+    fn emit_vcmpgtss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vcmpgtsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+
+    fn emit_vcmpgess(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vcmpgesd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+
+    fn emit_vsqrtss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vsqrtsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+
+    fn emit_vroundss_nearest(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vroundss_floor(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vroundss_ceil(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vroundss_trunc(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vroundsd_nearest(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vroundsd_floor(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vroundsd_ceil(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+    fn emit_vroundsd_trunc(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
+
     fn emit_ud2(&mut self);
     fn emit_ret(&mut self);
     fn emit_call_label(&mut self, label: Self::Label);
@@ -306,6 +357,28 @@ macro_rules! trap_op {
     }
 }
 
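+// Generates an `Emitter` method for a three-operand scalar AVX instruction,
+// dispatching on whether the second source is an XMM register or memory.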
avx_fn { + ($ins:ident, $name:ident) => { + fn $name(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match src2 { + XMMOrMemory::XMM(x) => dynasm!(self ; $ins Rx((dst as u8)), Rx((src1 as u8)), Rx((x as u8))), + XMMOrMemory::Memory(base, disp) => dynasm!(self ; $ins Rx((dst as u8)), Rx((src1 as u8)), [Rq((base as u8)) + disp]), + } + } + } +} + +macro_rules! avx_round_fn { + ($ins:ident, $name:ident, $mode:expr) => { + fn $name(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match src2 { + XMMOrMemory::XMM(x) => dynasm!(self ; $ins Rx((dst as u8)), Rx((src1 as u8)), Rx((x as u8)), $mode), + XMMOrMemory::Memory(base, disp) => dynasm!(self ; $ins Rx((dst as u8)), Rx((src1 as u8)), [Rq((base as u8)) + disp], $mode), + } + } + } +} + impl Emitter for Assembler { type Label = DynamicLabel; type Offset = AssemblyOffset; @@ -386,6 +459,12 @@ impl Emitter for Assembler { _ => unreachable!(), } } + fn emit_cdq(&mut self) { + dynasm!(self ; cdq); + } + fn emit_cqo(&mut self) { + dynasm!(self ; cqo); + } fn emit_xor(&mut self, sz: Size, src: Location, dst: Location) { binop_all_nofp!(xor, self, sz, src, dst, {unreachable!()}); } @@ -582,6 +661,54 @@ impl Emitter for Assembler { } } + avx_fn!(vaddss, emit_vaddss); + avx_fn!(vaddsd, emit_vaddsd); + + avx_fn!(vsubss, emit_vsubss); + avx_fn!(vsubsd, emit_vsubsd); + + avx_fn!(vmulss, emit_vmulss); + avx_fn!(vmulsd, emit_vmulsd); + + avx_fn!(vdivss, emit_vdivss); + avx_fn!(vdivsd, emit_vdivsd); + + avx_fn!(vmaxss, emit_vmaxss); + avx_fn!(vmaxsd, emit_vmaxsd); + + avx_fn!(vminss, emit_vminss); + avx_fn!(vminsd, emit_vminsd); + + avx_fn!(vcmpeqss, emit_vcmpeqss); + avx_fn!(vcmpeqsd, emit_vcmpeqsd); + + avx_fn!(vcmpneqss, emit_vcmpneqss); + avx_fn!(vcmpneqsd, emit_vcmpneqsd); + + avx_fn!(vcmpltss, emit_vcmpltss); + avx_fn!(vcmpltsd, emit_vcmpltsd); + + avx_fn!(vcmpless, emit_vcmpless); + avx_fn!(vcmplesd, emit_vcmplesd); + + avx_fn!(vcmpgtss, emit_vcmpgtss); + avx_fn!(vcmpgtsd, emit_vcmpgtsd); + + avx_fn!(vcmpgess, emit_vcmpgess); + avx_fn!(vcmpgesd, emit_vcmpgesd); + + avx_fn!(vsqrtss, emit_vsqrtss); + avx_fn!(vsqrtsd, emit_vsqrtsd); + + avx_round_fn!(vroundss, emit_vroundss_nearest, 0); + avx_round_fn!(vroundss, emit_vroundss_floor, 1); + avx_round_fn!(vroundss, emit_vroundss_ceil, 2); + avx_round_fn!(vroundss, emit_vroundss_trunc, 3); + avx_round_fn!(vroundsd, emit_vroundsd_nearest, 0); + avx_round_fn!(vroundsd, emit_vroundsd_floor, 1); + avx_round_fn!(vroundsd, emit_vroundsd_ceil, 2); + avx_round_fn!(vroundsd, emit_vroundsd_trunc, 3); + fn emit_ud2(&mut self) { dynasm!(self ; ud2); } diff --git a/lib/dynasm-backend/src/machine.rs b/lib/dynasm-backend/src/machine.rs index 0a7686d9e..ae7163c56 100644 --- a/lib/dynasm-backend/src/machine.rs +++ b/lib/dynasm-backend/src/machine.rs @@ -186,9 +186,7 @@ impl Machine { for ty in tys { let loc = match *ty { WpType::F32 | WpType::F64 => { - self.pick_xmm().map(Location::XMM).or_else( - || self.pick_gpr().map(Location::GPR) - ) + self.pick_xmm().map(Location::XMM) }, WpType::I32 | WpType::I64 => { self.pick_gpr().map(Location::GPR)