#include #include #include #include #ifdef __GNUC__ #define LIKELY(x) __builtin_expect(!!(x), 1) #define UNLIKELY(x) __builtin_expect(!!(x), 0) #if __has_attribute(cold) && __has_attribute(preserve_most) #define HINT_COLD __attribute__((cold,preserve_most,noinline)) #elif __has_attribute(cold) #define HINT_COLD __attribute__((cold,noinline)) #else #define HINT_COLD #endif #else #define LIKELY(x) (x) #define UNLIKELY(x) (x) #define HINT_COLD #endif #define op_reg_idx(op) (op).idx #define op_reg_gph(op) (((op).idx & ~0x3) == 0x24) #define op_mem_base(mem) op_reg_idx((mem).base) #define op_mem_idx(mem) op_reg_idx((mem).idx) static bool op_imm_n(int64_t imm, unsigned immsz) { if (immsz == 0 && !imm) return true; if (immsz == 1 && (int8_t) imm == imm) return true; if (immsz == 2 && (int16_t) imm == imm) return true; if (immsz == 3 && (imm&0xffffff) == imm) return true; if (immsz == 4 && (int32_t) imm == imm) return true; if (immsz == 8 && (int64_t) imm == imm) return true; return false; } HINT_COLD static unsigned enc_seg67(uint8_t* buf, unsigned flags) { unsigned idx = 0; if (UNLIKELY(flags & FE_SEG_MASK)) { unsigned seg = (0x65643e362e2600 >> (8 * (flags & FE_SEG_MASK))) & 0xff; buf[idx++] = seg; } if (UNLIKELY(flags & FE_ADDR32)) buf[idx++] = 0x67; return idx; } static unsigned enc_rex_mem(FeMem op0, uint64_t op1) { // Essentially just an and+or due to struct layout. uint32_t val = op1 | op0.flags | (op_mem_base(op0) << 8) | ((uint32_t)op_mem_idx(op0) << 24); // Combine REX.RXB using multiplication for branch-less code. uint32_t masked = val & 0x08000808; return masked ? (uint8_t) (masked * (1|(1<<15)|(1<<25)) >> 26) + 0x40 : 0; } static void enc_imm(uint8_t* buf, uint64_t imm, unsigned immsz) { #ifdef __GNUC__ // Clang doesn't fold the loop into a single store. // See: https://github.com/llvm/llvm-project/issues/154696 if (__builtin_constant_p(immsz)) { __builtin_memcpy(buf, &imm, immsz); return; } #endif for (unsigned i = 0; i < immsz; i++) *buf++ = imm >> 8 * i; } static int enc_mem_common(uint8_t* buf, unsigned ripoff, FeMem op0, uint64_t op1, unsigned disp8scale) { int mod = 0, reg = op1 & 7, rm; unsigned sib = 0x20; bool withsib = false; unsigned dispsz = 0; int32_t off = op0.off; if (op_reg_idx(op0.idx) < 0x80) { int scalabs = op0.scale; if (UNLIKELY((unsigned) (op0.scale - 1) >= 8 || (op0.scale & (op0.scale - 1)))) return 0; unsigned scale = (scalabs & 0xA ? 1 : 0) | (scalabs & 0xC ? 2 : 0); sib = scale << 6 | (op_reg_idx(op0.idx) & 7) << 3; withsib = true; } else if (UNLIKELY(op0.scale != 0)) { return 0; } if (UNLIKELY(op0.base.idx >= 0x20)) { if (UNLIKELY(op0.base.idx >= op_reg_idx(FE_NOREG))) { *buf++ = (reg << 3) | 4; *buf++ = sib | 5; enc_imm(buf, off, 4); return ripoff + 6; } else if (LIKELY(op0.base.idx == FE_IP.idx)) { if (withsib) return 0; *buf++ = (reg << 3) | 5; // Adjust offset, caller doesn't know instruction length. enc_imm(buf, off - ripoff - 5, 4); return ripoff + 5; } else { return 0; } } rm = op_reg_idx(op0.base) & 7; if (off) { if (LIKELY(!disp8scale)) { mod = (int8_t) off == off ? 0x40 : 0x80; dispsz = (int8_t) off == off ? 1 : 4; } else { if (!(off & ((1 << disp8scale) - 1)) && op_imm_n(off >> disp8scale, 1)) off >>= disp8scale, mod = 0x40, dispsz = 1; else mod = 0x80, dispsz = 4; } } else if (rm == 5) { dispsz = 1; mod = 0x40; } // Always write four bytes of displacement. The buffer is always large // enough, and we truncate by returning a smaller "written bytes" count. if (withsib || rm == 4) { *buf++ = mod | (reg << 3) | 4; *buf++ = sib | rm; enc_imm(buf, off, 4); return ripoff + 2 + dispsz; } else { *buf++ = mod | (reg << 3) | rm; enc_imm(buf, off, 4); return ripoff + 1 + dispsz; } } static int enc_mem(uint8_t* buf, unsigned ripoff, FeMem op0, uint64_t op1, bool forcesib, unsigned disp8scale) { if (UNLIKELY(op_reg_idx(op0.idx) == 4)) return 0; if (forcesib && op_reg_idx(op0.idx) == op_reg_idx(FE_NOREG)) { op0.scale = 1; op0.idx = FE_GP(4); } return enc_mem_common(buf, ripoff, op0, op1, disp8scale); } static int enc_mem_vsib(uint8_t* buf, unsigned ripoff, FeMemV op0, uint64_t op1, bool forcesib, unsigned disp8scale) { (void) forcesib; FeMem mem = FE_MEM(op0.base, op0.scale, FE_GP(op_reg_idx(op0.idx)), op0.off); return enc_mem_common(buf, ripoff, mem, op1, disp8scale); } // EVEX/VEX "Opcode" format: // // | EVEX byte 4 | P P M M M - - W | Opcode byte | VEX-D VEX-D-FLIPW // 0 8 16 24 enum { FE_OPC_VEX_WPP_SHIFT = 8, FE_OPC_VEX_WPP_MASK = 0x83 << FE_OPC_VEX_WPP_SHIFT, FE_OPC_VEX_MMM_SHIFT = 10, FE_OPC_VEX_MMM_MASK = 0x1f << FE_OPC_VEX_MMM_SHIFT, FE_OPC_VEX_DOWNGRADE_VEX = 1 << 24, FE_OPC_VEX_DOWNGRADE_VEX_FLIPW = 1 << 25, }; static int enc_vex_common(uint8_t* buf, unsigned opcode, unsigned base, unsigned idx, unsigned reg, unsigned vvvv) { if ((base | idx | reg | vvvv) & 0x10) return 0; bool vex3 = ((base | idx) & 0x08) || (opcode & 0xfc00) != 0x0400; if (vex3) { *buf++ = 0xc4; unsigned b1 = (opcode & FE_OPC_VEX_MMM_MASK) >> FE_OPC_VEX_MMM_SHIFT; if (!(reg & 0x08)) b1 |= 0x80; if (!(idx & 0x08)) b1 |= 0x40; if (!(base & 0x08)) b1 |= 0x20; *buf++ = b1; unsigned b2 = (opcode & FE_OPC_VEX_WPP_MASK) >> FE_OPC_VEX_WPP_SHIFT; if (opcode & 0x20) b2 |= 0x04; b2 |= (vvvv ^ 0xf) << 3; *buf++ = b2; } else { *buf++ = 0xc5; unsigned b2 = opcode >> FE_OPC_VEX_WPP_SHIFT & 3; if (opcode & 0x20) b2 |= 0x04; if (!(reg & 0x08)) b2 |= 0x80; b2 |= (vvvv ^ 0xf) << 3; *buf++ = b2; } *buf++ = (opcode & 0xff0000) >> 16; return 3 + vex3; } static int enc_vex_reg(uint8_t* buf, unsigned opcode, uint64_t rm, uint64_t reg, uint64_t vvvv) { unsigned off = enc_vex_common(buf, opcode, rm, 0, reg, vvvv); buf[off] = 0xc0 | (reg << 3 & 0x38) | (rm & 7); return off ? off + 1 : 0; } static int enc_vex_mem(uint8_t* buf, unsigned opcode, FeMem rm, uint64_t reg, uint64_t vvvv, unsigned ripoff, bool forcesib, unsigned disp8scale) { unsigned off = enc_vex_common(buf, opcode, op_reg_idx(rm.base), op_reg_idx(rm.idx), reg, vvvv); unsigned memoff = enc_mem(buf + off, ripoff + off, rm, reg, forcesib, disp8scale); return off && memoff ? memoff : 0; } static int enc_vex_vsib(uint8_t* buf, unsigned opcode, FeMemV rm, uint64_t reg, uint64_t vvvv, unsigned ripoff, bool forcesib, unsigned disp8scale) { unsigned off = enc_vex_common(buf, opcode, op_reg_idx(rm.base), op_reg_idx(rm.idx), reg, vvvv); unsigned memoff = enc_mem_vsib(buf + off, ripoff + off, rm, reg, forcesib, disp8scale); return off && memoff ? memoff : 0; } static int enc_evex_common(uint8_t* buf, unsigned opcode, unsigned base, unsigned idx, unsigned reg, unsigned vvvv) { *buf++ = 0x62; bool evexr3 = reg & 0x08; bool evexr4 = reg & 0x10; bool evexb3 = base & 0x08; bool evexb4 = base & 0x10; // evexb4 is unused in AVX-512 encoding bool evexx3 = idx & 0x08; bool evexx4 = idx & 0x10; bool evexv4 = vvvv & 0x10; unsigned b1 = (opcode & FE_OPC_VEX_MMM_MASK) >> FE_OPC_VEX_MMM_SHIFT; if (!evexr3) b1 |= 0x80; if (!evexx3) b1 |= 0x40; if (!evexb3) b1 |= 0x20; if (!evexr4) b1 |= 0x10; if (evexb4) b1 |= 0x08; *buf++ = b1; unsigned b2 = (opcode & FE_OPC_VEX_WPP_MASK) >> FE_OPC_VEX_WPP_SHIFT; if (!evexx4) b2 |= 0x04; b2 |= (~vvvv & 0xf) << 3; *buf++ = b2; unsigned b3 = opcode & 0xff; if (!evexv4) b3 |= 0x08; *buf++ = b3; *buf++ = (opcode & 0xff0000) >> 16; return 5; } static unsigned enc_evex_to_vex(unsigned opcode) { return opcode & FE_OPC_VEX_DOWNGRADE_VEX_FLIPW ? opcode ^ 0x8000 : opcode; } // Encode AVX-512 EVEX r/m-reg, non-xmm reg, vvvv, prefer vex static int enc_evex_reg(uint8_t* buf, unsigned opcode, unsigned rm, unsigned reg, unsigned vvvv) { unsigned off; if (!((rm | reg | vvvv) & 0x10) && (opcode & FE_OPC_VEX_DOWNGRADE_VEX)) off = enc_vex_common(buf, enc_evex_to_vex(opcode), rm, 0, reg, vvvv); else off = enc_evex_common(buf, opcode, rm, 0, reg, vvvv); buf[off] = 0xc0 | (reg << 3 & 0x38) | (rm & 7); return off + 1; } // Encode AVX-512 EVEX r/m-reg, xmm reg, vvvv, prefer vex static int enc_evex_xmm(uint8_t* buf, unsigned opcode, unsigned rm, unsigned reg, unsigned vvvv) { unsigned off; if (!((rm | reg | vvvv) & 0x10) && (opcode & FE_OPC_VEX_DOWNGRADE_VEX)) off = enc_vex_common(buf, enc_evex_to_vex(opcode), rm, 0, reg, vvvv); else // AVX-512 XMM reg encoding uses X3 instead of B4. off = enc_evex_common(buf, opcode, rm & 0x0f, rm >> 1, reg, vvvv); buf[off] = 0xc0 | (reg << 3 & 0x38) | (rm & 7); return off + 1; } static int enc_evex_mem(uint8_t* buf, unsigned opcode, FeMem rm, uint64_t reg, uint64_t vvvv, unsigned ripoff, bool forcesib, unsigned disp8scale) { unsigned off; if (!((op_reg_idx(rm.base) | op_reg_idx(rm.idx) | reg | vvvv) & 0x10) && (opcode & FE_OPC_VEX_DOWNGRADE_VEX)) { disp8scale = 0; // Only AVX-512 EVEX compresses displacement off = enc_vex_common(buf, enc_evex_to_vex(opcode), op_reg_idx(rm.base), op_reg_idx(rm.idx), reg, vvvv); } else { off = enc_evex_common(buf, opcode, op_reg_idx(rm.base), op_reg_idx(rm.idx), reg, vvvv); } unsigned memoff = enc_mem(buf + off, ripoff + off, rm, reg, forcesib, disp8scale); return off && memoff ? memoff : 0; } static int enc_evex_vsib(uint8_t* buf, unsigned opcode, FeMemV rm, uint64_t reg, uint64_t vvvv, unsigned ripoff, bool forcesib, unsigned disp8scale) { (void) vvvv; // EVEX VSIB requires non-zero mask operand if (!(opcode & 0x7)) return 0; // EVEX.X4 is encoded in EVEX.V4 unsigned idx = op_reg_idx(rm.idx); unsigned off = enc_evex_common(buf, opcode, op_reg_idx(rm.base), idx & 0x0f, reg, idx & 0x10); unsigned memoff = enc_mem_vsib(buf + off, ripoff + off, rm, reg, forcesib, disp8scale); return off && memoff ? memoff : 0; } unsigned fe64_NOP(uint8_t* buf, unsigned flags) { unsigned len = flags ? flags : 1; // Taken from Intel SDM static const uint8_t tbl[] = { 0x90, 0x66, 0x90, 0x0f, 0x1f, 0x00, 0x0f, 0x1f, 0x40, 0x00, 0x0f, 0x1f, 0x44, 0x00, 0x00, 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00, 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, }; unsigned remain = len; for (; remain > 9; remain -= 9) for (unsigned i = 0; i < 9; i++) *(buf++) = tbl[36 + i]; const uint8_t* src = tbl + (remain * (remain - 1)) / 2; for (unsigned i = 0; i < remain; i++) *(buf++) = src[i]; return len; } #include