add support for JIT compilation

2022-12-22 22:50:01 -05:00
parent ca27fb3a98
commit e59f99b440
149 changed files with 81486 additions and 2 deletions
--- a/lib/lepton/asmjit/x86/x86archtraits_p.h
+++ b/lib/lepton/asmjit/x86/x86archtraits_p.h
@ -0,0 +1,148 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See asmjit.h or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#ifndef ASMJIT_X86_X86ARCHTRAITS_P_H_INCLUDED
+#define ASMJIT_X86_X86ARCHTRAITS_P_H_INCLUDED
+
+#include "../core/archtraits.h"
+#include "../core/misc_p.h"
+#include "../x86/x86operand.h"
+
+ASMJIT_BEGIN_SUB_NAMESPACE(x86)
+
+//! \cond INTERNAL
+//! \addtogroup asmjit_x86
+//! \{
+
+//! X86 (32-bit) architecture traits (internal) - a constexpr ArchTraits table, initialized positionally.
+static const constexpr ArchTraits x86ArchTraits = {
+  // SP/FP/LR/PC register ids - SP and BP are used; LR and PC are 0xFF (presumably "no such register" - confirm).
+  Gp::kIdSp, Gp::kIdBp, 0xFF, 0xFF,
+
+  // Reserved (zero-filled).
+  { 0, 0, 0 },
+
+  // HW stack alignment.
+  1,
+
+  // Min/Max stack offset (both set to 0x7FFFFFFF).
+  0x7FFFFFFFu, 0x7FFFFFFFu,
+
+  // ISA features [Gp, Vec, Other0, Other1] - only the Gp group sets hints (register swap and push/pop).
+  {{
+    InstHints::kRegSwap | InstHints::kPushPop,
+    InstHints::kNoHints,
+    InstHints::kNoHints,
+    InstHints::kNoHints
+  }},
+
+  // Register signatures - V(index) yields the signature that x86::RegTraits declares for RegType(index).
+  #define V(index) OperandSignature{x86::RegTraits<RegType(index)>::kSignature}
+  {{ ASMJIT_LOOKUP_TABLE_32(V, 0) }},
+  #undef V
+
+  // RegTypeToTypeId - V(index) yields the TypeId that x86::RegTraits declares for RegType(index).
+  #define V(index) TypeId(x86::RegTraits<RegType(index)>::kTypeId)
+  {{ ASMJIT_LOOKUP_TABLE_32(V, 0) }},
+  #undef V
+
+  // TypeIdToRegType - index is (typeId - TypeId::_kBaseStart); ids not listed (e.g. 64-bit ints) yield RegType::kNone.
+  #define V(index) (index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kInt8)    ? RegType::kX86_GpbLo : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kUInt8)   ? RegType::kX86_GpbLo : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kInt16)   ? RegType::kX86_Gpw   : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kUInt16)  ? RegType::kX86_Gpw   : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kInt32)   ? RegType::kX86_Gpd   : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kUInt32)  ? RegType::kX86_Gpd   : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kIntPtr)  ? RegType::kX86_Gpd   : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kUIntPtr) ? RegType::kX86_Gpd   : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kFloat32) ? RegType::kX86_Xmm   : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kFloat64) ? RegType::kX86_Xmm   : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kMask8)   ? RegType::kX86_KReg  : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kMask16)  ? RegType::kX86_KReg  : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kMask32)  ? RegType::kX86_KReg  : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kMask64)  ? RegType::kX86_KReg  : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kMmx32)   ? RegType::kX86_Mm    : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kMmx64)   ? RegType::kX86_Mm    : RegType::kNone)
+  {{ ASMJIT_LOOKUP_TABLE_32(V, 0) }},
+  #undef V
+
+  // Word names of 8-bit, 16-bit, 32-bit, and 64-bit quantities (DB, DW, DD, DQ).
+  {
+    ArchTypeNameId::kDB,
+    ArchTypeNameId::kDW,
+    ArchTypeNameId::kDD,
+    ArchTypeNameId::kDQ
+  }
+};
+
+//! X64 (64-bit) architecture traits (internal) - a constexpr ArchTraits table, initialized positionally.
+//!
+//! Differs from the 32-bit table only in TypeIdToRegType: 64-bit integers and pointer-sized integers are mapped
+//! to the 64-bit general-purpose register type.
+static const constexpr ArchTraits x64ArchTraits = {
+  // SP/FP/LR/PC register ids - SP and BP are used; LR and PC are 0xFF (presumably "no such register" - confirm).
+  Gp::kIdSp, Gp::kIdBp, 0xFF, 0xFF,
+
+  // Reserved (zero-filled).
+  { 0, 0, 0 },
+
+  // HW stack alignment.
+  1,
+
+  // Min/Max stack offset (both set to 0x7FFFFFFF).
+  0x7FFFFFFFu, 0x7FFFFFFFu,
+
+  // ISA features [Gp, Vec, Other0, Other1] - only the Gp group sets hints (register swap and push/pop).
+  {{
+    InstHints::kRegSwap | InstHints::kPushPop,
+    InstHints::kNoHints,
+    InstHints::kNoHints,
+    InstHints::kNoHints
+  }},
+
+  // Register signatures - V(index) yields the signature that x86::RegTraits declares for RegType(index).
+  #define V(index) OperandSignature{x86::RegTraits<RegType(index)>::kSignature}
+  {{ ASMJIT_LOOKUP_TABLE_32(V, 0) }},
+  #undef V
+
+  // RegTypeToTypeId - V(index) yields the TypeId that x86::RegTraits declares for RegType(index).
+  #define V(index) TypeId(x86::RegTraits<RegType(index)>::kTypeId)
+  {{ ASMJIT_LOOKUP_TABLE_32(V, 0) }},
+  #undef V
+
+  // TypeIdToRegType - index is (typeId - TypeId::_kBaseStart); ids not listed below yield RegType::kNone.
+  //
+  // Pointer-sized integers (kIntPtr/kUIntPtr) are 64 bits wide on X64, so they map to Gpq exactly like
+  // kInt64/kUInt64. Mapping them to Gpd would hand out a register too narrow to hold a pointer.
+  #define V(index) (index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kInt8)    ? RegType::kX86_GpbLo : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kUInt8)   ? RegType::kX86_GpbLo : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kInt16)   ? RegType::kX86_Gpw   : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kUInt16)  ? RegType::kX86_Gpw   : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kInt32)   ? RegType::kX86_Gpd   : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kUInt32)  ? RegType::kX86_Gpd   : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kInt64)   ? RegType::kX86_Gpq   : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kUInt64)  ? RegType::kX86_Gpq   : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kIntPtr)  ? RegType::kX86_Gpq   : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kUIntPtr) ? RegType::kX86_Gpq   : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kFloat32) ? RegType::kX86_Xmm   : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kFloat64) ? RegType::kX86_Xmm   : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kMask8)   ? RegType::kX86_KReg  : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kMask16)  ? RegType::kX86_KReg  : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kMask32)  ? RegType::kX86_KReg  : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kMask64)  ? RegType::kX86_KReg  : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kMmx32)   ? RegType::kX86_Mm    : \
+                    index + uint32_t(TypeId::_kBaseStart) == uint32_t(TypeId::kMmx64)   ? RegType::kX86_Mm    : RegType::kNone)
+  {{ ASMJIT_LOOKUP_TABLE_32(V, 0) }},
+  #undef V
+
+  // Word names of 8-bit, 16-bit, 32-bit, and 64-bit quantities (DB, DW, DD, DQ).
+  {
+    ArchTypeNameId::kDB,
+    ArchTypeNameId::kDW,
+    ArchTypeNameId::kDD,
+    ArchTypeNameId::kDQ
+  }
+};
+
+//! \}
+//! \endcond
+
+ASMJIT_END_SUB_NAMESPACE
+
+#endif // ASMJIT_X86_X86ARCHTRAITS_P_H_INCLUDED
--- a/lib/lepton/asmjit/x86/x86assembler.cpp
+++ b/lib/lepton/asmjit/x86/x86assembler.cpp
--- a/lib/lepton/asmjit/x86/x86assembler.h
+++ b/lib/lepton/asmjit/x86/x86assembler.h
@ -0,0 +1,685 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See asmjit.h or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#ifndef ASMJIT_X86_X86ASSEMBLER_H_INCLUDED
+#define ASMJIT_X86_X86ASSEMBLER_H_INCLUDED
+
+#include "../core/assembler.h"
+#include "../x86/x86emitter.h"
+#include "../x86/x86operand.h"
+
+ASMJIT_BEGIN_SUB_NAMESPACE(x86)
+
+//! \addtogroup asmjit_x86
+//! \{
+
+//! X86/X64 assembler implementation.
+//!
+//! x86::Assembler is a code emitter that emits machine code directly into the \ref CodeBuffer. The assembler is capable
+//! of targeting both 32-bit and 64-bit instruction sets, the instruction set can be configured through \ref CodeHolder.
+//!
+//! ### Basics
+//!
+//! The following example shows a basic use of `x86::Assembler`, how to generate a function that works in both 32-bit
+//! and 64-bit modes, and how to connect \ref JitRuntime, \ref CodeHolder, and `x86::Assembler`.
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//! #include <stdio.h>
+//!
+//! using namespace asmjit;
+//!
+//! // Signature of the generated function.
+//! typedef int (*SumFunc)(const int* arr, size_t count);
+//!
+//! int main() {
+//!   JitRuntime rt;                    // Create a runtime specialized for JIT.
+//!   CodeHolder code;                  // Create a CodeHolder.
+//!
+//!   code.init(rt.environment());      // Initialize code to match the JIT environment.
+//!   x86::Assembler a(&code);          // Create and attach x86::Assembler to code.
+//!
+//!   // Decide between 32-bit CDECL, WIN64, and SysV64 calling conventions:
+//!   //   32-BIT - passes all arguments by stack.
+//!   //   WIN64  - passes first 4 arguments by RCX, RDX, R8, and R9.
+//!   //   UNIX64 - passes first 6 arguments by RDI, RSI, RDX, RCX, R8, and R9.
+//!   x86::Gp arr, cnt;
+//!   x86::Gp sum = x86::eax;           // Use EAX as 'sum' as it's a return register.
+//!
+//!   if (ASMJIT_ARCH_BITS == 64) {
+//!   #if defined(_WIN32)
+//!     arr = x86::rcx;                 // First argument (array ptr).
+//!     cnt = x86::rdx;                 // Second argument (number of elements)
+//!   #else
+//!     arr = x86::rdi;                 // First argument (array ptr).
+//!     cnt = x86::rsi;                 // Second argument (number of elements)
+//!   #endif
+//!   }
+//!   else {
+//!     arr = x86::edx;                 // Use EDX to hold the array pointer.
+//!     cnt = x86::ecx;                 // Use ECX to hold the counter.
+//!     // Fetch first and second arguments from [ESP + 4] and [ESP + 8].
+//!     a.mov(arr, x86::ptr(x86::esp, 4));
+//!     a.mov(cnt, x86::ptr(x86::esp, 8));
+//!   }
+//!
+//!   Label Loop = a.newLabel();        // To construct the loop, we need some labels.
+//!   Label Exit = a.newLabel();
+//!
+//!   a.xor_(sum, sum);                 // Clear 'sum' register (shorter than 'mov').
+//!   a.test(cnt, cnt);                 // Border case:
+//!   a.jz(Exit);                       //   If 'cnt' is zero jump to 'Exit' now.
+//!
+//!   a.bind(Loop);                     // Start of a loop iteration.
+//!   a.add(sum, x86::dword_ptr(arr));  // Add int at [arr] to 'sum'.
+//!   a.add(arr, 4);                    // Increment 'arr' pointer.
+//!   a.dec(cnt);                       // Decrease 'cnt'.
+//!   a.jnz(Loop);                      // If not zero jump to 'Loop'.
+//!
+//!   a.bind(Exit);                     // Exit to handle the border case.
+//!   a.ret();                          // Return from function ('sum' == 'eax').
+//!   // ----> x86::Assembler is no longer needed from here and can be destroyed <----
+//!
+//!   SumFunc fn;
+//!   Error err = rt.add(&fn, &code);   // Add the generated code to the runtime.
+//!
+//!   if (err) return 1;                // Handle a possible error returned by AsmJit.
+//!   // ----> CodeHolder is no longer needed from here and can be destroyed <----
+//!
+//!   static const int array[6] = { 4, 8, 15, 16, 23, 42 };
+//!
+//!   int result = fn(array, 6);        // Execute the generated code.
+//!   printf("%d\n", result);           // Print sum of array (108).
+//!
+//!   rt.release(fn);                   // Explicitly remove the function from the runtime
+//!   return 0;                         // Everything successful...
+//! }
+//! ```
+//!
+//! The example should be self-explanatory. It shows how to work with labels, how to use operands, and how to emit
+//! instructions that can use different registers based on runtime selection. It implements 32-bit CDECL, WIN64,
+//! and SysV64 calling conventions and will work on most X86/X64 environments.
+//!
+//! Although functions prologs / epilogs can be implemented manually, AsmJit provides utilities that can be used
+//! to create function prologs and epilogs automatically, see \ref asmjit_function for more details.
+//!
+//! ### Instruction Validation
+//!
+//! Assembler prefers speed over strictness by default. The implementation checks the type of operands and fails
+//! if the signature of types is invalid, however, it does only basic checks regarding registers and their groups
+//! used in instructions. It's possible to pass operands that don't form any valid signature to the implementation
+//! and succeed. This is usually not a problem as Assembler provides typed API so operand types are normally checked
+//! by C++ compiler at compile time, however, Assembler is fully dynamic and its \ref emit() function can be called
+//! with any instruction id, options, and operands. Moreover, it's also possible to form instructions that will be
+//! accepted by the typed API, for example by calling `mov(x86::eax, x86::al)` - the C++ compiler won't see a problem
+//! as both EAX and AL are \ref Gp registers.
+//!
+//! To help with common mistakes AsmJit allows to activate instruction validation. This feature instruments
+//! the Assembler to call \ref InstAPI::validate() before it attempts to encode any instruction.
+//!
+//! The example below illustrates how validation can be turned on:
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//! #include <stdio.h>
+//!
+//! using namespace asmjit;
+//!
+//! int main(int argc, char* argv[]) {
+//!   JitRuntime rt;                    // Create a runtime specialized for JIT.
+//!   CodeHolder code;                  // Create a CodeHolder.
+//!
+//!   code.init(rt.environment());      // Initialize code to match the JIT environment.
+//!   x86::Assembler a(&code);          // Create and attach x86::Assembler to code.
+//!
+//!   // Enable strict validation.
+//!   a.addDiagnosticOptions(DiagnosticOptions::kValidateAssembler);
+//!
+//!   // Try to encode invalid or ill-formed instructions.
+//!   Error err;
+//!
+//!   // Invalid instruction.
+//!   err = a.mov(x86::eax, x86::al);
+//!   printf("Status: %s\n", DebugUtils::errorAsString(err));
+//!
+//!   // Invalid instruction.
+//!   err = a.emit(x86::Inst::kIdMovss, x86::eax, x86::xmm0);
+//!   printf("Status: %s\n", DebugUtils::errorAsString(err));
+//!
+//!   // Ambiguous operand size - the pointer requires size.
+//!   err = a.inc(x86::ptr(x86::rax));
+//!   printf("Status: %s\n", DebugUtils::errorAsString(err));
+//!
+//!   return 0;
+//! }
+//! ```
+//!
+//! ### Native Registers
+//!
+//! All emitters provide functions to construct machine-size registers depending on the target. This feature is
+//! for users that want to write code targeting both 32-bit and 64-bit architectures at the same time. In AsmJit
+//! terminology such registers have prefix `z`, so for example on X86 architecture the following native registers
+//! are provided:
+//!
+//!   - `zax` - mapped to either `eax` or `rax`
+//!   - `zbx` - mapped to either `ebx` or `rbx`
+//!   - `zcx` - mapped to either `ecx` or `rcx`
+//!   - `zdx` - mapped to either `edx` or `rdx`
+//!   - `zsp` - mapped to either `esp` or `rsp`
+//!   - `zbp` - mapped to either `ebp` or `rbp`
+//!   - `zsi` - mapped to either `esi` or `rsi`
+//!   - `zdi` - mapped to either `edi` or `rdi`
+//!
+//! They are accessible through \ref x86::Assembler, \ref x86::Builder, and \ref x86::Compiler. The example below
+//! illustrates how to use this feature:
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//! #include <stdio.h>
+//!
+//! using namespace asmjit;
+//!
+//! typedef int (*Func)(void);
+//!
+//! int main(int argc, char* argv[]) {
+//!   JitRuntime rt;                    // Create a runtime specialized for JIT.
+//!   CodeHolder code;                  // Create a CodeHolder.
+//!
+//!   code.init(rt.environment());      // Initialize code to match the JIT environment.
+//!   x86::Assembler a(&code);          // Create and attach x86::Assembler to code.
+//!
+//!   // Let's get these registers from x86::Assembler.
+//!   x86::Gp zbp = a.zbp();
+//!   x86::Gp zsp = a.zsp();
+//!
+//!   int stackSize = 32;
+//!
+//!   // Function prolog.
+//!   a.push(zbp);
+//!   a.mov(zbp, zsp);
+//!   a.sub(zsp, stackSize);
+//!
+//!   // ... emit some code (this just sets return value to zero) ...
+//!   a.xor_(x86::eax, x86::eax);
+//!
+//!   // Function epilog and return.
+//!   a.mov(zsp, zbp);
+//!   a.pop(zbp);
+//!   a.ret();
+//!
+//!   // To make the example complete let's call it.
+//!   Func fn;
+//!   Error err = rt.add(&fn, &code);   // Add the generated code to the runtime.
+//!   if (err) return 1;                // Handle a possible error returned by AsmJit.
+//!
+//!   int result = fn();                // Execute the generated code.
+//!   printf("%d\n", result);           // Print the resulting "0".
+//!
+//!   rt.release(fn);                   // Remove the function from the runtime.
+//!   return 0;
+//! }
+//! ```
+//!
+//! The example just returns `0`, but the function generated contains a standard prolog and epilog sequence and the
+//! function itself reserves 32 bytes of local stack. The advantage is clear - a single code-base can handle multiple
+//! targets easily. If you want to create a register of native size dynamically by specifying its id it's also possible:
+//!
+//! ```
+//! void example(x86::Assembler& a) {
+//!   x86::Gp zax = a.gpz(x86::Gp::kIdAx);
+//!   x86::Gp zbx = a.gpz(x86::Gp::kIdBx);
+//!   x86::Gp zcx = a.gpz(x86::Gp::kIdCx);
+//!   x86::Gp zdx = a.gpz(x86::Gp::kIdDx);
+//!
+//!   // You can also change register's id easily.
+//!   x86::Gp zsp = zax;
+//!   zsp.setId(4); // or x86::Gp::kIdSp.
+//! }
+//! ```
+//!
+//! ### Data Embedding
+//!
+//! x86::Assembler extends the standard \ref BaseAssembler with X86/X64 specific conventions that are often used by
+//! assemblers to embed data next to the code. The following functions can be used to embed data:
+//!
+//!   - \ref BaseAssembler::embedInt8() - embeds int8_t (portable naming).
+//!   - \ref BaseAssembler::embedUInt8() - embeds uint8_t (portable naming).
+//!   - \ref BaseAssembler::embedInt16() - embeds int16_t (portable naming).
+//!   - \ref BaseAssembler::embedUInt16() - embeds uint16_t (portable naming).
+//!   - \ref BaseAssembler::embedInt32() - embeds int32_t (portable naming).
+//!   - \ref BaseAssembler::embedUInt32() - embeds uint32_t (portable naming).
+//!   - \ref BaseAssembler::embedInt64() - embeds int64_t (portable naming).
+//!   - \ref BaseAssembler::embedUInt64() - embeds uint64_t (portable naming).
+//!   - \ref BaseAssembler::embedFloat() - embeds float (portable naming).
+//!   - \ref BaseAssembler::embedDouble() - embeds double (portable naming).
+//!
+//!   - \ref x86::Assembler::db() - embeds byte (8 bits) (x86 naming).
+//!   - \ref x86::Assembler::dw() - embeds word (16 bits) (x86 naming).
+//!   - \ref x86::Assembler::dd() - embeds dword (32 bits) (x86 naming).
+//!   - \ref x86::Assembler::dq() - embeds qword (64 bits) (x86 naming).
+//!
+//! The following example illustrates how embed works:
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//! using namespace asmjit;
+//!
+//! void embedData(x86::Assembler& a) {
+//!   a.db(0xFF);         // Embeds 0xFF byte.
+//!   a.dw(0xFF00);       // Embeds 0xFF00 word (little-endian).
+//!   a.dd(0xFF000000);   // Embeds 0xFF000000 dword (little-endian).
+//!   a.embedFloat(0.4f); // Embeds 0.4f (32-bit float, little-endian).
+//! }
+//! ```
+//!
+//! Sometimes it's required to read the data that is embedded after code, for example. This can be done through
+//! \ref Label as shown below:
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//! using namespace asmjit;
+//!
+//! void embedData(x86::Assembler& a, const Label& L_Data) {
+//!   x86::Gp addr = a.zax();  // EAX or RAX.
+//!   x86::Gp val = x86::edi;  // Where to store some value...
+//!
+//!   // Approach 1 - Load the address to register through LEA. This approach
+//!   //              is flexible as the address can be then manipulated, for
+//!   //              example if you have a data array, which would need index.
+//!   a.lea(addr, L_Data);     // Loads the address of the label to EAX or RAX.
+//!   a.mov(val, x86::dword_ptr(addr));
+//!
+//!   // Approach 2 - Load the data directly by using L_Data in address. It's
+//!   //              worth noting that this doesn't work with indexes in X64
+//!   //              mode. It will use absolute address in 32-bit mode and
+//!   //              relative address (RIP) in 64-bit mode.
+//!   a.mov(val, x86::dword_ptr(L_Data));
+//! }
+//! ```
+//!
+//! ### Label Embedding
+//!
+//! It's also possible to embed labels. In general AsmJit provides the following options:
+//!
+//!   - \ref BaseEmitter::embedLabel() - Embeds absolute address of a label. This is target dependent and would
+//!     embed either 32-bit or 64-bit data that embeds absolute label address. This kind of embedding cannot be
+//!     used in a position independent code.
+//!
+//!   - \ref BaseEmitter::embedLabelDelta() - Embeds a difference between two labels. The size of the difference
+//!     can be specified so it's possible to embed 8-bit, 16-bit, 32-bit, and 64-bit difference, which is sufficient
+//!     for most purposes.
+//!
+//! The following example demonstrates how to embed labels and their differences:
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//! using namespace asmjit;
+//!
+//! void embedLabel(x86::Assembler& a, const Label& L_Data) {
+//!   // [1] Embed L_Data - the size of the data will be dependent on the target.
+//!   a.embedLabel(L_Data);
+//!
+//!   // [2] Embed a 32-bit difference of two labels.
+//!   Label L_Here = a.newLabel();
+//!   a.bind(L_Here);
+//!   // Embeds int32_t(L_Data - L_Here).
+//!   a.embedLabelDelta(L_Data, L_Here, 4);
+//! }
+//! ```
+//!
+//! ### Using FuncFrame and FuncDetail with x86::Assembler
+//!
+//! The example below demonstrates how \ref FuncFrame and \ref FuncDetail can be used together with \ref x86::Assembler
+//! to generate a function that will use platform dependent calling conventions automatically depending on the target:
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//! #include <stdio.h>
+//!
+//! using namespace asmjit;
+//!
+//! typedef void (*SumIntsFunc)(int* dst, const int* a, const int* b);
+//!
+//! int main(int argc, char* argv[]) {
+//!   JitRuntime rt;                    // Create JIT Runtime.
+//!   CodeHolder code;                  // Create a CodeHolder.
+//!
+//!   code.init(rt.environment());      // Initialize code to match the JIT environment.
+//!   x86::Assembler a(&code);          // Create and attach x86::Assembler to code.
+//!
+//!   // Decide which registers will be mapped to function arguments. Try changing
+//!   // registers of dst, src_a, and src_b and see what happens in function's
+//!   // prolog and epilog.
+//!   x86::Gp dst   = a.zax();
+//!   x86::Gp src_a = a.zcx();
+//!   x86::Gp src_b = a.zdx();
+//!
+//!   x86::Xmm vec0 = x86::xmm0;
+//!   x86::Xmm vec1 = x86::xmm1;
+//!
+//!   // Create/initialize FuncDetail and FuncFrame.
+//!   FuncDetail func;
+//!   func.init(FuncSignatureT<void, int*, const int*, const int*>(CallConvId::kHost));
+//!
+//!   FuncFrame frame;
+//!   frame.init(func);
+//!
+//!   // Make XMM0 and XMM1 dirty - RegGroup::kVec describes XMM|YMM|ZMM registers.
+//!   frame.setDirtyRegs(RegGroup::kVec, Support::bitMask(0, 1));
+//!
+//!   // Alternatively, if you don't want to use register masks you can pass BaseReg
+//!   // to addDirtyRegs(). The following code would add both xmm0 and xmm1.
+//!   frame.addDirtyRegs(x86::xmm0, x86::xmm1);
+//!
+//!   FuncArgsAssignment args(&func);   // Create arguments assignment context.
+//!   args.assignAll(dst, src_a, src_b);// Assign our registers to arguments.
+//!   args.updateFrameInfo(frame);      // Reflect our args in FuncFrame.
+//!   frame.finalize();                 // Finalize the FuncFrame (updates it).
+//!
+//!   a.emitProlog(frame);              // Emit function prolog.
+//!   a.emitArgsAssignment(frame, args);// Assign arguments to registers.
+//!   a.movdqu(vec0, x86::ptr(src_a));  // Load 4 ints from [src_a] to XMM0.
+//!   a.movdqu(vec1, x86::ptr(src_b));  // Load 4 ints from [src_b] to XMM1.
+//!   a.paddd(vec0, vec1);              // Add 4 ints in XMM1 to XMM0.
+//!   a.movdqu(x86::ptr(dst), vec0);    // Store the result to [dst].
+//!   a.emitEpilog(frame);              // Emit function epilog and return.
+//!
+//!   SumIntsFunc fn;
+//!   Error err = rt.add(&fn, &code);   // Add the generated code to the runtime.
+//!   if (err) return 1;                // Handle a possible error case.
+//!
+//!   // Execute the generated function.
+//!   int inA[4] = { 4, 3, 2, 1 };
+//!   int inB[4] = { 1, 5, 2, 8 };
+//!   int out[4];
+//!   fn(out, inA, inB);
+//!
+//!   // Prints {5 8 4 9}
+//!   printf("{%d %d %d %d}\n", out[0], out[1], out[2], out[3]);
+//!
+//!   rt.release(fn);
+//!   return 0;
+//! }
+//! ```
+//!
+//! ### Using x86::Assembler as Code-Patcher
+//!
+//! This is an advanced topic that is sometimes unavoidable. AsmJit by default appends machine code it generates
+//! into a \ref CodeBuffer, however, it also allows to set the offset in \ref CodeBuffer explicitly and to overwrite
+//! its content. This technique is extremely dangerous as X86 instructions have variable length (see below), so you
+//! should in general only patch code to change instruction's immediate values or some other details not known
+//! at the time the instruction was emitted. A typical scenario that requires code-patching is when you start emitting
+//! function and you don't know how much stack you want to reserve for it.
+//!
+//! Before we go further it's important to introduce instruction options, because they can help with code-patching
+//! (and not only patching, but that will be explained in AVX-512 section):
+//!
+//!   - Many general-purpose instructions (especially arithmetic ones) on X86 have multiple encodings - in AsmJit
+//!     this is usually called 'short form' and 'long form'.
+//!
+//!   - AsmJit always tries to use 'short form' as it makes the resulting machine-code smaller, which is always
+//!     good - this decision is used by majority of assemblers out there.
+//!
+//!   - AsmJit allows to override the default decision by using `short_()` and `long_()` instruction options to force
+//!     short or long form, respectively. The most useful is `long_()` as it basically forces AsmJit to always emit
+//!     the longest form. The `short_()` is not that useful as it's automatic (except jumps to non-bound labels). Note
+//!     that the underscore after each function name avoids collision with built-in C++ types.
+//!
+//! To illustrate what short form and long form means in binary let's assume we want to emit "add esp, 16" instruction,
+//! which has two possible binary encodings:
+//!
+//!   - `83C410` - This is a short form aka `short add esp, 16` - You can see opcode byte (0x83), MOD/RM byte (0xC4)
+//!     and an 8-bit immediate value representing `16`.
+//!
+//!   - `81C410000000` - This is a long form aka `long add esp, 16` - You can see a different opcode byte (0x81), the
+//!     same Mod/RM byte (0xC4) and a 32-bit immediate in little-endian representing `16`.
+//!
+//! It should be obvious that patching an existing instruction into an instruction having a different size may create
+//! various problems. So it's recommended to be careful and to only patch instructions into instructions having the
+//! same size. The example below demonstrates how instruction options can be used to guarantee the size of an
+//! instruction by forcing the assembler to use long-form encoding:
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//! #include <stdio.h>
+//!
+//! using namespace asmjit;
+//!
+//! typedef int (*Func)(void);
+//!
+//! int main(int argc, char* argv[]) {
+//!   JitRuntime rt;                    // Create a runtime specialized for JIT.
+//!   CodeHolder code;                  // Create a CodeHolder.
+//!
+//!   code.init(rt.environment());      // Initialize code to match the JIT environment.
+//!   x86::Assembler a(&code);          // Create and attach x86::Assembler to code.
+//!
+//!   // Let's get these registers from x86::Assembler.
+//!   x86::Gp zbp = a.zbp();
+//!   x86::Gp zsp = a.zsp();
+//!
+//!   // Function prolog.
+//!   a.push(zbp);
+//!   a.mov(zbp, zsp);
+//!
+//!   // This is where we are gonna patch the code later, so let's get the offset
+//!   // (the current location) from the beginning of the code-buffer.
+//!   size_t patchOffset = a.offset();
+//!   // Let's just emit 'sub zsp, 0' for now, but don't forget to use LONG form.
+//!   a.long_().sub(zsp, 0);
+//!
+//!   // ... emit some code (this just sets return value to zero) ...
+//!   a.xor_(x86::eax, x86::eax);
+//!
+//!   // Function epilog and return.
+//!   a.mov(zsp, zbp);
+//!   a.pop(zbp);
+//!   a.ret();
+//!
+//!   // Now we know how much stack size we want to reserve. I have chosen 128
+//!   // bytes on purpose as it's encodable only in long form that we have used.
+//!
+//!   int stackSize = 128;              // Number of bytes to reserve on the stack.
+//!   a.setOffset(patchOffset);         // Move the current cursor to `patchOffset`.
+//!   a.long_().sub(zsp, stackSize);    // Patch the code; don't forget to use LONG form.
+//!
+//!   // Now the code is ready to be called
+//!   Func fn;
+//!   Error err = rt.add(&fn, &code);   // Add the generated code to the runtime.
+//!   if (err) return 1;                // Handle a possible error returned by AsmJit.
+//!
+//!   int result = fn();                // Execute the generated code.
+//!   printf("%d\n", result);           // Print the resulting "0".
+//!
+//!   rt.release(fn);                   // Remove the function from the runtime.
+//!   return 0;
+//! }
+//! ```
+//!
+//! If you run the example it will just work, because both instructions have the same size. As an experiment you can
+//! try removing `long_()` form to see what happens when wrong code is generated.
+//!
+//! ### Code Patching and REX Prefix
+//!
+//! In 64-bit mode there is one more thing to worry about when patching code: REX prefix. It's a single byte prefix
+//! designed to address registers with ids from 8 to 15 and to override the default width of operation from 32 to 64
+//! bits. AsmJit, like other assemblers, only emits REX prefix when it's necessary. If the patched code only changes
+//! the immediate value as shown in the previous example then there is nothing to worry about as it doesn't change
+//! the logic behind emitting REX prefix, however, if the patched code changes register id or overrides the operation
+//! width then it's important to take care of REX prefix as well.
+//!
+//! AsmJit contains another instruction option that controls (forces) REX prefix - `rex()`. If you use it the
+//! instruction emitted will always use REX prefix even when it's encodable without it. The following list contains
+//! some instructions and their binary representations to illustrate when it's emitted:
+//!
+//!   - `__83C410` - `add esp, 16`     - 32-bit operation in 64-bit mode doesn't require REX prefix.
+//!   - `4083C410` - `rex add esp, 16` - 32-bit operation in 64-bit mode with forced REX prefix (0x40).
+//!   - `4883C410` - `add rsp, 16`     - 64-bit operation in 64-bit mode requires REX prefix (0x48).
+//!   - `4183C410` - `add r12d, 16`    - 32-bit operation in 64-bit mode using R12D requires REX prefix (0x41).
+//!   - `4983C410` - `add r12, 16`     - 64-bit operation in 64-bit mode using R12 requires REX prefix (0x49).
+//!
+//! ### More Prefixes
+//!
+//! X86 architecture is known for its prefixes. AsmJit supports all prefixes
+//! that can affect how the instruction is encoded:
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//!
+//! using namespace asmjit;
+//!
+//! void prefixesExample(x86::Assembler& a) {
+//!   // Lock prefix for implementing atomics:
+//!   //   lock add dword ptr [dst], 1
+//!   a.lock().add(x86::dword_ptr(dst), 1);
+//!
+//!   // Similarly, XAcquire/XRelease prefixes are also available:
+//!   //   xacquire add dword ptr [dst], 1
+//!   a.xacquire().add(x86::dword_ptr(dst), 1);
+//!
+//!   // Rep prefix (see also repe/repz and repne/repnz):
+//!   //   rep movs byte ptr [dst], byte ptr [src]
+//!   a.rep().movs(x86::byte_ptr(dst), x86::byte_ptr(src));
+//!
+//!   // Forcing REX prefix in 64-bit mode.
+//!   //   rex mov eax, 1
+//!   a.rex().mov(x86::eax, 1);
+//!
+//!   // AVX instruction without forced prefix uses the shortest encoding:
+//!   //   vaddpd xmm0, xmm1, xmm2 -> [C5|F1|58|C2]
+//!   a.vaddpd(x86::xmm0, x86::xmm1, x86::xmm2);
+//!
+//!   // Forcing VEX3 prefix (AVX):
+//!   //   vex3 vaddpd xmm0, xmm1, xmm2 -> [C4|E1|71|58|C2]
+//!   a.vex3().vaddpd(x86::xmm0, x86::xmm1, x86::xmm2);
+//!
+//!   // Forcing EVEX prefix (AVX512):
+//!   //   evex vaddpd xmm0, xmm1, xmm2 -> [62|F1|F5|08|58|C2]
+//!   a.evex().vaddpd(x86::xmm0, x86::xmm1, x86::xmm2);
+//!
+//!   // Some instructions accept prefixes that were not originally intended for them:
+//!   //   rep ret
+//!   a.rep().ret();
+//! }
+//! ```
+//!
+//! It's important to understand that prefixes are part of instruction options. When a member function that involves
+//! adding a prefix is called the prefix is combined with existing instruction options, which will affect the next
+//! instruction generated.
+//!
+//! ### Generating AVX512 code.
+//!
+//! x86::Assembler can generate AVX512+ code including the use of opmask registers. An opmask can be specified through
+//! the \ref x86::Assembler::k() function, which stores it as an extra register that will be used by the next
+//! instruction. AsmJit uses the same concept for manipulating instruction options as well.
+//!
+//! The following AVX512 features are supported:
+//!
+//!   - Opmask selector {k} and zeroing {z}.
+//!   - Rounding modes {rn|rd|ru|rz} and suppress-all-exceptions {sae} option.
+//!   - AVX512 broadcasts {1toN}.
+//!
+//! The following example demonstrates how AVX512 features can be used:
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//!
+//! using namespace asmjit;
+//!
+//! void generateAVX512Code(x86::Assembler& a) {
+//!   using namespace x86;
+//!
+//!   // Opmask Selectors
+//!   // ----------------
+//!   //
+//!   //   - Opmask / zeroing is part of the instruction options / extraReg.
+//!   //   - k(reg) is like {kreg} in Intel syntax.
+//!   //   - z() is like {z} in Intel syntax.
+//!
+//!   // vaddpd zmm0 {k1} {z}, zmm1, zmm2
+//!   a.k(k1).z().vaddpd(zmm0, zmm1, zmm2);
+//!
+//!   // Memory Broadcasts
+//!   // -----------------
+//!   //
+//!   //   - Broadcast data is part of memory operand.
+//!   //   - Use x86::Mem::_1toN(), which returns a new x86::Mem operand.
+//!
+//!   // vaddpd zmm0 {k1} {z}, zmm1, [rcx] {1to8}
+//!   a.k(k1).z().vaddpd(zmm0, zmm1, x86::mem(rcx)._1to8());
+//!
+//!   // Embedded Rounding & Suppress-All-Exceptions
+//!   // -------------------------------------------
+//!   //
+//!   //   - Rounding mode and {sae} are part of instruction options.
+//!   //   - Use sae() to enable exception suppression.
+//!   //   - Use rn_sae(), rd_sae(), ru_sae(), or rz_sae() to select the embedded rounding mode.
+//!   //   - Embedded rounding implicitly sets {sae} as well, which is why these
+//!   //     functions carry the `_sae` suffix - to make that explicit.
+//!
+//!   // vcmppd k1, zmm1, zmm2, 0x00 {sae}
+//!   a.sae().vcmppd(k1, zmm1, zmm2, 0);
+//!
+//!   // vaddpd zmm0, zmm1, zmm2 {rz}
+//!   a.rz_sae().vaddpd(zmm0, zmm1, zmm2);
+//! }
+//! ```
+class ASMJIT_VIRTAPI Assembler
+  : public BaseAssembler,
+    public EmitterImplicitT<Assembler> {
+public:
+  ASMJIT_NONCOPYABLE(Assembler)
+  //! Shorthand for the base class (\ref BaseAssembler).
+  typedef BaseAssembler Base;
+
+  //! \name Construction & Destruction
+  //! \{
+
+  //! Constructs the assembler; `code` is optional and defaults to `nullptr`.
+  ASMJIT_API explicit Assembler(CodeHolder* code = nullptr) noexcept;
+  //! Destroys the assembler.
+  ASMJIT_API virtual ~Assembler() noexcept;
+
+  //! \}
+
+  //! \cond INTERNAL
+  //! \name Internal
+  //! \{
+
+  // NOTE: x86::Assembler uses _privateData to store 'address-override' bit that is used to decide whether to emit
+  // address-override (67H) prefix based on the memory BASE+INDEX registers. It's either `kX86MemInfo_67H_X86` or
+  // `kX86MemInfo_67H_X64`.
+
+  // Returns the address-override mask, which is simply the current value of `_privateData`.
+  inline uint32_t _addressOverrideMask() const noexcept { return _privateData; }
+  // Stores `m` into `_privateData` as the new address-override mask.
+  inline void _setAddressOverrideMask(uint32_t m) noexcept { _privateData = m; }
+
+  //! \}
+  //! \endcond
+
+  //! \name Emit
+  //! \{
+
+  //! Overrides the base class `_emit()`; the implementation is defined out of line.
+  ASMJIT_API Error _emit(InstId instId, const Operand_& o0, const Operand_& o1, const Operand_& o2, const Operand_* opExt) override;
+
+  //! \}
+  // NOTE(review): the `endcond` command below has no matching `cond` visible in this class (the INTERNAL
+  // section above is already closed) - confirm whether the "Emit" group was meant to be internal.
+  //! \endcond
+
+  //! \name Align
+  //! \{
+
+  //! Overrides the base class `align()`; the implementation is defined out of line.
+  ASMJIT_API Error align(AlignMode alignMode, uint32_t alignment) override;
+
+  //! \}
+
+  //! \name Events
+  //! \{
+
+  //! Attach event handler (overrides the base class handler).
+  ASMJIT_API Error onAttach(CodeHolder* code) noexcept override;
+  //! Detach event handler (overrides the base class handler).
+  ASMJIT_API Error onDetach(CodeHolder* code) noexcept override;
+
+  //! \}
+};
+
+//! \}
+
+ASMJIT_END_SUB_NAMESPACE
+
+#endif // ASMJIT_X86_X86ASSEMBLER_H_INCLUDED
--- a/lib/lepton/asmjit/x86/x86builder.cpp
+++ b/lib/lepton/asmjit/x86/x86builder.cpp
@ -0,0 +1,52 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See asmjit.h or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#include "../core/api-build_p.h"
+#if !defined(ASMJIT_NO_X86) && !defined(ASMJIT_NO_BUILDER)
+
+#include "../x86/x86assembler.h"
+#include "../x86/x86builder.h"
+#include "../x86/x86emithelper_p.h"
+
+ASMJIT_BEGIN_SUB_NAMESPACE(x86)
+
+// x86::Builder - Construction & Destruction
+// =========================================
+
+// Sets up a builder that accepts both X86 and X64 targets and, when `code` is non-null, attaches the new
+// builder to it.
+Builder::Builder(CodeHolder* code) noexcept
+  : BaseBuilder() {
+  // One bit per accepted architecture, positioned by the numeric value of `Arch`.
+  const uint64_t x86Bit = uint64_t(1) << uint32_t(Arch::kX86);
+  const uint64_t x64Bit = uint64_t(1) << uint32_t(Arch::kX64);
+  _archMask = x86Bit | x64Bit;
+
+  assignEmitterFuncs(this);
+
+  if (code != nullptr) {
+    code->attach(this);
+  }
+}
+// The destructor body is intentionally empty.
+Builder::~Builder() noexcept {
+}
+
+// x86::Builder - Events
+// =====================
+
+// Attach event: x86::Builder adds nothing of its own here and returns whatever the base handler reports.
+Error Builder::onAttach(CodeHolder* code) noexcept {
+  const Error baseStatus = Base::onAttach(code);
+  return baseStatus;
+}
+
+// Detach event: x86::Builder adds nothing of its own here and returns whatever the base handler reports.
+Error Builder::onDetach(CodeHolder* code) noexcept {
+  const Error baseStatus = Base::onDetach(code);
+  return baseStatus;
+}
+
+// x86::Builder - Finalize
+// =======================
+
+// Runs all passes and then serializes the builder's content through a temporary x86::Assembler.
+Error Builder::finalize() {
+  // Stop right away if any pass fails.
+  ASMJIT_PROPAGATE(runPasses());
+
+  // The temporary assembler is constructed from `_code` and receives this builder's encoding and
+  // diagnostic options before anything is serialized into it.
+  Assembler assembler(_code);
+  assembler.addEncodingOptions(encodingOptions());
+  assembler.addDiagnosticOptions(diagnosticOptions());
+
+  return serializeTo(&assembler);
+}
+
+ASMJIT_END_SUB_NAMESPACE
+
+#endif // !ASMJIT_NO_X86 && !ASMJIT_NO_BUILDER
--- a/lib/lepton/asmjit/x86/x86builder.h
+++ b/lib/lepton/asmjit/x86/x86builder.h
@ -0,0 +1,351 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See asmjit.h or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#ifndef ASMJIT_X86_X86BUILDER_H_INCLUDED
+#define ASMJIT_X86_X86BUILDER_H_INCLUDED
+
+#include "../core/api-config.h"
+#ifndef ASMJIT_NO_BUILDER
+
+#include "../core/builder.h"
+#include "../x86/x86emitter.h"
+
+ASMJIT_BEGIN_SUB_NAMESPACE(x86)
+
+//! \addtogroup asmjit_x86
+//! \{
+
+//! X86/X64 builder implementation.
+//!
+//! The code representation used by \ref BaseBuilder is compatible with everything AsmJit provides. Each instruction
+//! is stored as \ref InstNode, which contains instruction id, options, and operands. Each instruction emitted will
+//! create a new \ref InstNode instance and add it to the current cursor in the double-linked list of nodes. Since
+//! the instruction stream used by \ref BaseBuilder can be manipulated, we can rewrite the SumInts example from
+//! \ref asmjit_assembler into the following:
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//! #include <stdio.h>
+//!
+//! using namespace asmjit;
+//!
+//! typedef void (*SumIntsFunc)(int* dst, const int* a, const int* b);
+//!
+//! // Small helper function to print the current content of `builder`.
+//! static void dumpCode(BaseBuilder& builder, const char* phase) {
+//!   String sb;
+//!   builder.dump(sb);
+//!   printf("%s:\n%s\n", phase, sb.data());
+//! }
+//!
+//! int main() {
+//!   JitRuntime rt;                    // Create JIT Runtime.
+//!   CodeHolder code;                  // Create a CodeHolder.
+//!
+//!   code.init(rt.environment());      // Initialize code to match the JIT environment.
+//!   x86::Builder cb(&code);           // Create and attach x86::Builder to `code`.
+//!
+//!   // Decide which registers will be mapped to function arguments. Try changing registers
+//!   // of `dst`, `srcA`, and `srcB` and see what happens in function's prolog and epilog.
+//!   x86::Gp dst = cb.zax();
+//!   x86::Gp srcA = cb.zcx();
+//!   x86::Gp srcB = cb.zdx();
+//!
+//!   x86::Xmm vec0 = x86::xmm0;
+//!   x86::Xmm vec1 = x86::xmm1;
+//!
+//!   // Create and initialize `FuncDetail`.
+//!   FuncDetail func;
+//!   func.init(FuncSignatureT<void, int*, const int*, const int*>(CallConvId::kHost));
+//!
+//!   // Remember prolog insertion point.
+//!   BaseNode* prologInsertionPoint = cb.cursor();
+//!
+//!   // Emit function body:
+//!   cb.movdqu(vec0, x86::ptr(srcA));  // Load 4 ints from [srcA] to XMM0.
+//!   cb.movdqu(vec1, x86::ptr(srcB));  // Load 4 ints from [srcB] to XMM1.
+//!   cb.paddd(vec0, vec1);             // Add 4 ints in XMM1 to XMM0.
+//!   cb.movdqu(x86::ptr(dst), vec0);   // Store the result to [dst].
+//!
+//!   // Remember epilog insertion point.
+//!   BaseNode* epilogInsertionPoint = cb.cursor();
+//!
+//!   // Let's see what we have now.
+//!   dumpCode(cb, "Raw Function");
+//!
+//!   // Now, after we emitted the function body, we can insert the prolog, arguments
+//!   // allocation, and epilog. This is not possible with using pure x86::Assembler.
+//!   FuncFrame frame;
+//!   frame.init(func);
+//!
+//!   // Make XMM0 and XMM1 dirty; RegGroup::kVec describes XMM|YMM|ZMM registers.
+//!   frame.setDirtyRegs(RegGroup::kVec, Support::bitMask(0, 1));
+//!
+//!   FuncArgsAssignment args(&func);   // Create arguments assignment context.
+//!   args.assignAll(dst, srcA, srcB);  // Assign our registers to arguments.
+//!   args.updateFrame(frame);          // Reflect our args in FuncFrame.
+//!   frame.finalize();                 // Finalize the FuncFrame (updates it).
+//!
+//!   // Insert function prolog and allocate arguments to registers.
+//!   cb.setCursor(prologInsertionPoint);
+//!   cb.emitProlog(frame);
+//!   cb.emitArgsAssignment(frame, args);
+//!
+//!   // Insert function epilog.
+//!   cb.setCursor(epilogInsertionPoint);
+//!   cb.emitEpilog(frame);
+//!
+//!   // Let's see how the function's prolog and epilog looks.
+//!   dumpCode(cb, "Prolog & Epilog");
+//!
+//!   // IMPORTANT: Builder requires finalize() to be called to serialize its
+//!   // code to the Assembler (it automatically creates one if not attached).
+//!   cb.finalize();
+//!
+//!   SumIntsFunc fn;
+//!   Error err = rt.add(&fn, &code);   // Add the generated code to the runtime.
+//!   if (err) return 1;                // Handle a possible error case.
+//!
+//!   // Execute the generated function.
+//!   int inA[4] = { 4, 3, 2, 1 };
+//!   int inB[4] = { 1, 5, 2, 8 };
+//!   int out[4];
+//!   fn(out, inA, inB);
+//!
+//!   // Prints {5 8 4 9}
+//!   printf("{%d %d %d %d}\n", out[0], out[1], out[2], out[3]);
+//!
+//!   rt.release(fn);                   // Explicitly remove the function from the runtime.
+//!   return 0;
+//! }
+//! ```
+//!
+//! When the example is executed it should output the following (this one using AMD64-SystemV ABI):
+//!
+//! ```
+//! Raw Function:
+//! movdqu xmm0, [rcx]
+//! movdqu xmm1, [rdx]
+//! paddd xmm0, xmm1
+//! movdqu [rax], xmm0
+//!
+//! Prolog & Epilog:
+//! mov rax, rdi
+//! mov rcx, rsi
+//! movdqu xmm0, [rcx]
+//! movdqu xmm1, [rdx]
+//! paddd xmm0, xmm1
+//! movdqu [rax], xmm0
+//! ret
+//!
+//! {5 8 4 9}
+//! ```
+//!
+//! The number of use-cases of \ref BaseBuilder is not limited and highly depends on your creativity and experience.
+//! The previous example can be easily improved to collect all dirty registers inside the function programmatically
+//! and to pass them to \ref FuncFrame::setDirtyRegs().
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//!
+//! using namespace asmjit;
+//!
+//! // NOTE: This function doesn't cover all possible constructs. It ignores instructions that write
+//! // to implicit registers that are not part of the operand list. It also counts read-only registers.
+//! // Real implementation would be a bit more complicated, but still relatively easy to implement.
+//! static void collectDirtyRegs(const BaseNode* first,
+//!                              const BaseNode* last,
+//!                              Support::Array<RegMask, Globals::kNumVirtGroups>& regMask) {
+//!   const BaseNode* node = first;
+//!   while (node) {
+//!     if (node->actsAsInst()) {
+//!       const InstNode* inst = node->as<InstNode>();
+//!       const Operand* opArray = inst->operands();
+//!
+//!       for (uint32_t i = 0, opCount = inst->opCount(); i < opCount; i++) {
+//!         const Operand& op = opArray[i];
+//!         if (op.isReg()) {
+//!           const x86::Reg& reg = op.as<x86::Reg>();
+//!           if (reg.group() <= RegGroup::kMaxVirt) {
+//!             regMask[reg.group()] |= 1u << reg.id();
+//!           }
+//!         }
+//!       }
+//!     }
+//!
+//!     if (node == last)
+//!       break;
+//!     node = node->next();
+//!   }
+//! }
+//!
+//! static void setDirtyRegsOfFuncFrame(const x86::Builder& builder, FuncFrame& frame) {
+//!   Support::Array<RegMask, Globals::kNumVirtGroups> regMask {};
+//!   collectDirtyRegs(builder.firstNode(), builder.lastNode(), regMask);
+//!
+//!   // X86/X64 ABIs only require to save GP/XMM registers:
+//!   frame.setDirtyRegs(RegGroup::kGp, regMask[RegGroup::kGp]);
+//!   frame.setDirtyRegs(RegGroup::kVec, regMask[RegGroup::kVec]);
+//! }
+//! ```
+//!
+//! ### Casting Between Various Emitters
+//!
+//! Even when \ref BaseAssembler and \ref BaseBuilder provide the same interface as defined by \ref BaseEmitter their
+//! platform dependent variants like \ref x86::Assembler and \ref x86::Builder cannot be interchanged or cast to each
+//! other by using a C++ `static_cast<>`. The main reason is the inheritance graph of these classes is different and
+//! cast-incompatible, as illustrated below:
+//!
+//! ```
+//!                                             +--------------+      +=========================+
+//!                    +----------------------->| x86::Emitter |<--+--# x86::EmitterImplicitT<> #<--+
+//!                    |                        +--------------+   |  +=========================+   |
+//!                    |                           (abstract)      |           (mixin)              |
+//!                    |   +--------------+     +~~~~~~~~~~~~~~+   |                                |
+//!                    +-->| BaseAssembler|---->|x86::Assembler|<--+                                |
+//!                    |   +--------------+     +~~~~~~~~~~~~~~+   |                                |
+//!                    |      (abstract)            (final)        |                                |
+//! +===============+  |   +--------------+     +~~~~~~~~~~~~~~+   |                                |
+//! #  BaseEmitter  #--+-->|  BaseBuilder |--+->| x86::Builder |<--+                                |
+//! +===============+      +--------------+  |  +~~~~~~~~~~~~~~+                                    |
+//!    (abstract)             (abstract)     |      (final)                                         |
+//!                    +---------------------+                                                      |
+//!                    |                                                                            |
+//!                    |   +--------------+     +~~~~~~~~~~~~~~+      +=========================+   |
+//!                    +-->| BaseCompiler |---->| x86::Compiler|<-----# x86::EmitterExplicitT<> #---+
+//!                        +--------------+     +~~~~~~~~~~~~~~+      +=========================+
+//!                           (abstract)            (final)                   (mixin)
+//! ```
+//!
+//! The graph basically shows that it's not possible to cast between \ref x86::Assembler and \ref x86::Builder.
+//! However, since both share the base interface (\ref BaseEmitter) it's possible to cast them to a class that
+//! cannot be instantiated, but defines the same interface - the class is called \ref x86::Emitter and was
+//! introduced to make it possible to write a function that can emit to both \ref x86::Assembler and \ref
+//! x86::Builder. Note that \ref x86::Emitter cannot be created, it's abstract and has private constructors and
+//! destructors; it was only designed to be cast to and used as an interface.
+//!
+//! Each architecture-specific emitter implements a member function called
+//! `as<arch::Emitter>()`, which casts the instance to the architecture
+//! specific emitter as illustrated below:
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//!
+//! using namespace asmjit;
+//!
+//! static void emitSomething(x86::Emitter* e) {
+//!   e->mov(x86::eax, x86::ebx);
+//! }
+//!
+//! static void assemble(CodeHolder& code, bool useAsm) {
+//!   if (useAsm) {
+//!     x86::Assembler assembler(&code);
+//!     emitSomething(assembler.as<x86::Emitter>());
+//!   }
+//!   else {
+//!     x86::Builder builder(&code);
+//!     emitSomething(builder.as<x86::Emitter>());
+//!
+//!     // NOTE: Builder requires `finalize()` to be called to serialize its
+//!     // content to Assembler (it automatically creates one if not attached).
+//!     builder.finalize();
+//!   }
+//! }
+//! ```
+//!
+//! The example above shows how to create a function that can emit code to either \ref x86::Assembler or \ref
+//! x86::Builder through \ref x86::Emitter, which provides emitter-neutral functionality. \ref x86::Emitter,
+//! however, doesn't provide any emitter-specific functionality like `setCursor()`.
+//!
+//! ### Code Injection and Manipulation
+//!
+//! \ref BaseBuilder emitter stores its nodes in a double-linked list, which makes it easy to manipulate that
+//! list during the code generation or afterwards. Each node is always emitted next to the current cursor and
+//! the cursor is advanced to that newly emitted node. The cursor can be retrieved and changed by \ref
+//! BaseBuilder::cursor() and \ref BaseBuilder::setCursor(), respectively.
+//!
+//! The example below demonstrates how to remember a node and inject something
+//! next to it.
+//!
+//! ```
+//! static void example(x86::Builder& builder) {
+//!   // Emit something, after it returns the cursor would point at the last
+//!   // emitted node.
+//!   builder.mov(x86::rax, x86::rdx); // [1]
+//!
+//!   // We can retrieve the node.
+//!   BaseNode* node = builder.cursor();
+//!
+//!   // Change the instruction we just emitted, just for fun...
+//!   if (node->isInst()) {
+//!     InstNode* inst = node->as<InstNode>();
+//!     // Changes the operands at index [1] to RCX.
+//!     inst->setOp(1, x86::rcx);
+//!   }
+//!
+//!   // ------------------------- Generate Some Code -------------------------
+//!   builder.add(x86::rax, x86::rdx); // [2]
+//!   builder.shr(x86::rax, 3);        // [3]
+//!   // ----------------------------------------------------------------------
+//!
+//!   // Now, we know where our node is, and we can simply change the cursor
+//!   // and start emitting something after it. The setCursor() function
+//!   // returns the previous cursor, and it's always a good practice to remember
+//!   // it, because you never know if you are not already injecting the code
+//!   // somewhere else...
+//!   BaseNode* oldCursor = builder.setCursor(node);
+//!
+//!   builder.mul(x86::rax, 8);        // [4]
+//!
+//!   // Restore the cursor
+//!   builder.setCursor(oldCursor);
+//! }
+//! ```
+//!
+//! The function above would actually emit the following:
+//!
+//! ```
+//! mov rax, rcx ; [1] Patched at the beginning.
+//! mul rax, 8   ; [4] Injected.
+//! add rax, rdx ; [2] Followed [1] initially.
+//! shr rax, 3   ; [3] Follows [2].
+//! ```
+class ASMJIT_VIRTAPI Builder
+  : public BaseBuilder,
+    public EmitterImplicitT<Builder> {
+public:
+  ASMJIT_NONCOPYABLE(Builder)
+  //! Shorthand for the base class (\ref BaseBuilder).
+  typedef BaseBuilder Base;
+
+  //! \name Construction & Destruction
+  //! \{
+
+  //! Creates the builder and, if `code` is not null, attaches the builder to it.
+  ASMJIT_API explicit Builder(CodeHolder* code = nullptr) noexcept;
+  //! Destroys the builder.
+  ASMJIT_API virtual ~Builder() noexcept;
+
+  //! \}
+
+  //! \name Events
+  //! \{
+
+  //! Attach event handler; forwards to the \ref BaseBuilder handler and returns its result.
+  ASMJIT_API Error onAttach(CodeHolder* code) noexcept override;
+  //! Detach event handler; forwards to the \ref BaseBuilder handler and returns its result.
+  ASMJIT_API Error onDetach(CodeHolder* code) noexcept override;
+
+  //! \}
+
+  //! \name Finalize
+  //! \{
+
+  //! Runs all passes and then serializes the builder's content into a temporary \ref x86::Assembler that
+  //! receives this builder's encoding and diagnostic options. Returns the first error encountered, if any.
+  ASMJIT_API Error finalize() override;
+
+  //! \}
+};
+
+//! \}
+
+ASMJIT_END_SUB_NAMESPACE
+
+#endif // !ASMJIT_NO_BUILDER
+#endif // ASMJIT_X86_X86BUILDER_H_INCLUDED
--- a/lib/lepton/asmjit/x86/x86compiler.cpp
+++ b/lib/lepton/asmjit/x86/x86compiler.cpp
@ -0,0 +1,61 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See asmjit.h or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#include "../core/api-build_p.h"
+#if !defined(ASMJIT_NO_X86) && !defined(ASMJIT_NO_COMPILER)
+
+#include "../x86/x86assembler.h"
+#include "../x86/x86compiler.h"
+#include "../x86/x86instapi_p.h"
+#include "../x86/x86rapass_p.h"
+
+ASMJIT_BEGIN_SUB_NAMESPACE(x86)
+
+// x86::Compiler - Construction & Destruction
+// ==========================================
+
+// Sets up a compiler that accepts both X86 and X64 targets and, when `code` is non-null, attaches the new
+// compiler to it.
+Compiler::Compiler(CodeHolder* code) noexcept
+  : BaseCompiler() {
+  // One bit per accepted architecture, positioned by the numeric value of `Arch`.
+  const uint64_t x86Bit = uint64_t(1) << uint32_t(Arch::kX86);
+  const uint64_t x64Bit = uint64_t(1) << uint32_t(Arch::kX64);
+  _archMask = x86Bit | x64Bit;
+
+  assignEmitterFuncs(this);
+
+  if (code != nullptr) {
+    code->attach(this);
+  }
+}
+// The destructor body is intentionally empty.
+Compiler::~Compiler() noexcept {
+}
+
+// x86::Compiler - Events
+// ======================
+
+// Attach event: runs the base handler first and then adds the `X86RAPass` pass. If adding the pass fails,
+// the detach handler is invoked before the error is returned to the caller.
+Error Compiler::onAttach(CodeHolder* code) noexcept {
+  ASMJIT_PROPAGATE(Base::onAttach(code));
+
+  const Error passStatus = addPassT<X86RAPass>();
+  if (ASMJIT_UNLIKELY(passStatus)) {
+    // The result of onDetach() is deliberately not inspected; the pass error is what gets reported.
+    onDetach(code);
+    return passStatus;
+  }
+
+  return kErrorOk;
+}
+
+// Detach event: x86::Compiler adds nothing of its own here and returns whatever the base handler reports.
+Error Compiler::onDetach(CodeHolder* code) noexcept {
+  const Error baseStatus = Base::onDetach(code);
+  return baseStatus;
+}
+
+// x86::Compiler - Finalize
+// ========================
+
+// Runs all passes and then serializes the compiler's content through a temporary x86::Assembler.
+Error Compiler::finalize() {
+  // Stop right away if any pass fails.
+  ASMJIT_PROPAGATE(runPasses());
+
+  // The temporary assembler is constructed from `_code` and receives this compiler's encoding and
+  // diagnostic options before anything is serialized into it.
+  Assembler assembler(_code);
+  assembler.addEncodingOptions(encodingOptions());
+  assembler.addDiagnosticOptions(diagnosticOptions());
+
+  return serializeTo(&assembler);
+}
+
+ASMJIT_END_SUB_NAMESPACE
+
+#endif // !ASMJIT_NO_X86 && !ASMJIT_NO_COMPILER
--- a/lib/lepton/asmjit/x86/x86compiler.h
+++ b/lib/lepton/asmjit/x86/x86compiler.h
@ -0,0 +1,721 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See asmjit.h or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#ifndef ASMJIT_X86_X86COMPILER_H_INCLUDED
+#define ASMJIT_X86_X86COMPILER_H_INCLUDED
+
+#include "../core/api-config.h"
+#ifndef ASMJIT_NO_COMPILER
+
+#include "../core/compiler.h"
+#include "../core/type.h"
+#include "../x86/x86emitter.h"
+
+ASMJIT_BEGIN_SUB_NAMESPACE(x86)
+
+//! \addtogroup asmjit_x86
+//! \{
+
+//! X86/X64 compiler implementation.
+//!
+//! ### Compiler Basics
+//!
+//! The first \ref x86::Compiler example shows how to generate a function that simply returns an integer value. It's
+//! an analogy to the first Assembler example:
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//! #include <stdio.h>
+//!
+//! using namespace asmjit;
+//!
+//! // Signature of the generated function.
+//! typedef int (*Func)(void);
+//!
+//! int main() {
+//!   JitRuntime rt;                    // Runtime specialized for JIT code execution.
+//!   CodeHolder code;                  // Holds code and relocation information.
+//!
+//!   code.init(rt.environment());      // Initialize code to match the JIT environment.
+//!   x86::Compiler cc(&code);          // Create and attach x86::Compiler to code.
+//!
+//!   cc.addFunc(FuncSignatureT<int>());// Begin a function of `int fn(void)` signature.
+//!
+//!   x86::Gp vReg = cc.newGpd();       // Create a 32-bit general purpose register.
+//!   cc.mov(vReg, 1);                  // Move one to our virtual register `vReg`.
+//!   cc.ret(vReg);                     // Return `vReg` from the function.
+//!
+//!   cc.endFunc();                     // End of the function body.
+//!   cc.finalize();                    // Translate and assemble the whole 'cc' content.
+//!   // ----> x86::Compiler is no longer needed from here and can be destroyed <----
+//!
+//!   Func fn;
+//!   Error err = rt.add(&fn, &code);   // Add the generated code to the runtime.
+//!   if (err) return 1;                // Handle a possible error returned by AsmJit.
+//!   // ----> CodeHolder is no longer needed from here and can be destroyed <----
+//!
+//!   int result = fn();                // Execute the generated code.
+//!   printf("%d\n", result);           // Print the resulting "1".
+//!
+//!   rt.release(fn);                   // Explicitly remove the function from the runtime.
+//!   return 0;
+//! }
+//! ```
+//!
+//! The \ref BaseCompiler::addFunc() and \ref BaseCompiler::endFunc() functions are used to define the function and
+//! its end. Both must be called per function, but the body doesn't have to be generated in sequence. An example of
+//! generating two functions will be shown later. The next example shows more complicated code that contains a loop
+//! and generates a simple memory copy function that uses `uint32_t` items:
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//! #include <stdio.h>
+//!
+//! using namespace asmjit;
+//!
+//! // Signature of the generated function.
+//! typedef void (*MemCpy32)(uint32_t* dst, const uint32_t* src, size_t count);
+//!
+//! int main() {
+//!   JitRuntime rt;                    // Runtime specialized for JIT code execution.
+//!   CodeHolder code;                  // Holds code and relocation information.
+//!
+//!   code.init(rt.environment());      // Initialize code to match the JIT environment.
+//!   x86::Compiler cc(&code);          // Create and attach x86::Compiler to code.
+//!
+//!   FuncNode* funcNode = cc.addFunc(  // Begin the function of the following signature:
+//!     FuncSignatureT<void,            //   Return value - void      (no return value).
+//!       uint32_t*,                    //   1st argument - uint32_t* (machine reg-size).
+//!       const uint32_t*,              //   2nd argument - uint32_t* (machine reg-size).
+//!       size_t>());                   //   3rd argument - size_t    (machine reg-size).
+//!
+//!   Label L_Loop = cc.newLabel();     // Start of the loop.
+//!   Label L_Exit = cc.newLabel();     // Used to exit early.
+//!
+//!   x86::Gp dst = cc.newIntPtr("dst");// Create `dst` register (destination pointer).
+//!   x86::Gp src = cc.newIntPtr("src");// Create `src` register (source pointer).
+//!   x86::Gp i = cc.newUIntPtr("i");   // Create `i` register (loop counter).
+//!
+//!   funcNode->setArg(0, dst);         // Assign `dst` argument.
+//!   funcNode->setArg(1, src);         // Assign `src` argument.
+//!   funcNode->setArg(2, i);           // Assign `i` argument.
+//!
+//!   cc.test(i, i);                    // Early exit if length is zero.
+//!   cc.jz(L_Exit);
+//!
+//!   cc.bind(L_Loop);                  // Bind the beginning of the loop here.
+//!
+//!   x86::Gp tmp = cc.newInt32("tmp"); // Copy a single dword (4 bytes).
+//!   cc.mov(tmp, x86::dword_ptr(src)); // Load DWORD from [src] address.
+//!   cc.mov(x86::dword_ptr(dst), tmp); // Store DWORD to [dst] address.
+//!
+//!   cc.add(src, 4);                   // Increment `src`.
+//!   cc.add(dst, 4);                   // Increment `dst`.
+//!
+//!   cc.dec(i);                        // Loop until `i` is zero.
+//!   cc.jnz(L_Loop);
+//!
+//!   cc.bind(L_Exit);                  // Label used by early exit.
+//!   cc.endFunc();                     // End of the function body.
+//!
+//!   cc.finalize();                    // Translate and assemble the whole 'cc' content.
+//!   // ----> x86::Compiler is no longer needed from here and can be destroyed <----
+//!
+//!   // Add the generated code to the runtime.
+//!   MemCpy32 memcpy32;
+//!   Error err = rt.add(&memcpy32, &code);
+//!
+//!   // Handle a possible error returned by AsmJit.
+//!   if (err)
+//!     return 1;
+//!   // ----> CodeHolder is no longer needed from here and can be destroyed <----
+//!
+//!   // Test the generated code.
+//!   uint32_t input[6] = { 1, 2, 3, 5, 8, 13 };
+//!   uint32_t output[6];
+//!   memcpy32(output, input, 6);
+//!
+//!   for (uint32_t i = 0; i < 6; i++)
+//!     printf("%u\n", output[i]);
+//!
+//!   rt.release(memcpy32);
+//!   return 0;
+//! }
+//! ```
+//!
+//! ### AVX and AVX-512
+//!
+//! AVX and AVX-512 code generation must be explicitly enabled via \ref FuncFrame to work properly. If it's not set up
+//! correctly then Prolog & Epilog would use SSE instead of AVX instructions to work with SIMD registers. In addition,
+//! Compiler requires AVX-512 to be explicitly enabled via \ref FuncFrame in order to use all 32 SIMD registers.
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//! #include <stdio.h>
+//!
+//! using namespace asmjit;
+//!
+//! // Signature of the generated function.
+//! typedef void (*Func)(void*);
+//!
+//! int main() {
+//!   JitRuntime rt;                    // Runtime specialized for JIT code execution.
+//!   CodeHolder code;                  // Holds code and relocation information.
+//!
+//!   code.init(rt.environment());      // Initialize code to match the JIT environment.
+//!   x86::Compiler cc(&code);          // Create and attach x86::Compiler to code.
+//!
+//!   FuncNode* funcNode = cc.addFunc(FuncSignatureT<void, void*>());
+//!
+//!   // Use the following to enable AVX and/or AVX-512.
+//!   funcNode->frame().setAvxEnabled();
+//!   funcNode->frame().setAvx512Enabled();
+//!
+//!   // Do something with the input pointer.
+//!   x86::Gp addr = cc.newIntPtr("addr");
+//!   x86::Zmm vreg = cc.newZmm("vreg");
+//!
+//!   funcNode->setArg(0, addr);
+//!
+//!   cc.vmovdqu32(vreg, x86::ptr(addr));
+//!   cc.vpaddq(vreg, vreg, vreg);
+//!   cc.vmovdqu32(x86::ptr(addr), vreg);
+//!
+//!   cc.endFunc();                     // End of the function body.
+//!   cc.finalize();                    // Translate and assemble the whole 'cc' content.
+//!   // ----> x86::Compiler is no longer needed from here and can be destroyed <----
+//!
+//!   Func fn;
+//!   Error err = rt.add(&fn, &code);   // Add the generated code to the runtime.
+//!   if (err) return 1;                // Handle a possible error returned by AsmJit.
+//!   // ----> CodeHolder is no longer needed from here and can be destroyed <----
+//!
+//!   // Execute the generated code and print some output.
+//!   uint64_t data[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
+//!   fn(data);
+//!   printf("%llu\n", (unsigned long long)data[0]);
+//!
+//!   rt.release(fn);                   // Explicitly remove the function from the runtime.
+//!   return 0;
+//! }
+//! ```
+//!
+//! ### Recursive Functions
+//!
+//! It's possible to create more functions by using the same \ref x86::Compiler instance and make links between them.
+//! In such case it's important to keep the pointer to \ref FuncNode.
+//!
+//! The example below creates a simple Fibonacci function that calls itself recursively:
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//! #include <stdio.h>
+//!
+//! using namespace asmjit;
+//!
+//! // Signature of the generated function.
+//! typedef uint32_t (*Fibonacci)(uint32_t x);
+//!
+//! int main() {
+//!   JitRuntime rt;                    // Runtime specialized for JIT code execution.
+//!   CodeHolder code;                  // Holds code and relocation information.
+//!
+//!   code.init(rt.environment());      // Initialize code to match the JIT environment.
+//!   x86::Compiler cc(&code);          // Create and attach x86::Compiler to code.
+//!
+//!   FuncNode* funcNode = cc.addFunc(  // Begin of the Fibonacci function, addFunc()
+//!     FuncSignatureT<int, int>());    // Returns a pointer to the FuncNode node.
+//!
+//!   Label L_Exit = cc.newLabel();     // Exit label.
+//!   x86::Gp x = cc.newUInt32();       // Function x argument.
+//!   x86::Gp y = cc.newUInt32();       // Temporary.
+//!
+//!   funcNode->setArg(0, x);
+//!
+//!   cc.cmp(x, 3);                     // Return x if less than 3.
+//!   cc.jb(L_Exit);
+//!
+//!   cc.mov(y, x);                     // Make copy of the original x.
+//!   cc.dec(x);                        // Decrease x.
+//!
+//!   InvokeNode* invokeNode;           // Function invocation:
+//!   cc.invoke(&invokeNode,            //   - InvokeNode (output).
+//!     funcNode->label(),              //   - Function address or Label.
+//!     FuncSignatureT<int, int>());    //   - Function signature.
+//!
+//!   invokeNode->setArg(0, x);         // Assign x as the first argument.
+//!   invokeNode->setRet(0, x);         // Assign x as a return value as well.
+//!
+//!   cc.add(x, y);                     // Combine the return value with y.
+//!
+//!   cc.bind(L_Exit);
+//!   cc.ret(x);                        // Return x.
+//!   cc.endFunc();                     // End of the function body.
+//!
+//!   cc.finalize();                    // Translate and assemble the whole 'cc' content.
+//!   // ----> x86::Compiler is no longer needed from here and can be destroyed <----
+//!
+//!   Fibonacci fib;
+//!   Error err = rt.add(&fib, &code);  // Add the generated code to the runtime.
+//!   if (err) return 1;                // Handle a possible error returned by AsmJit.
+//!   // ----> CodeHolder is no longer needed from here and can be destroyed <----
+//!
+//!   // Test the generated code.
+//!   printf("Fib(%u) -> %u\n", 8, fib(8));
+//!
+//!   rt.release(fib);
+//!   return 0;
+//! }
+//! ```
+//!
+//! ### Stack Management
+//!
+//! Function's stack-frame is managed automatically, which is used by the register allocator to spill virtual
+//! registers. It also provides an interface to allocate user-defined block of the stack, which can be used as
+//! a temporary storage by the generated function. In the following example a stack of 256 bytes size is allocated,
+//! filled by bytes starting from 0 to 255 and then iterated again to sum all the values.
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//! #include <stdio.h>
+//!
+//! using namespace asmjit;
+//!
+//! // Signature of the generated function.
+//! typedef int (*Func)(void);
+//!
+//! int main() {
+//!   JitRuntime rt;                    // Runtime specialized for JIT code execution.
+//!   CodeHolder code;                  // Holds code and relocation information.
+//!
+//!   code.init(rt.environment());      // Initialize code to match the JIT environment.
+//!   x86::Compiler cc(&code);          // Create and attach x86::Compiler to code.
+//!
+//!   cc.addFunc(FuncSignatureT<int>());// Create a function that returns int.
+//!
+//!   x86::Gp p = cc.newIntPtr("p");
+//!   x86::Gp i = cc.newIntPtr("i");
+//!
+//!   // Allocate 256 bytes on the stack aligned to 4 bytes.
+//!   x86::Mem stack = cc.newStack(256, 4);
+//!
+//!   x86::Mem stackIdx(stack);         // Copy of stack with i added.
+//!   stackIdx.setIndex(i);             // stackIdx <- stack[i].
+//!   stackIdx.setSize(1);              // stackIdx <- byte ptr stack[i].
+//!
+//!   // Load a stack address to `p`. This step is purely optional and shows
+//!   // that `lea` is useful to load a memory operand's address (even absolute)
+//!   // to a general purpose register.
+//!   cc.lea(p, stack);
+//!
+//!   // Clear i (xor is a C++ keyword, hence 'xor_' is used instead).
+//!   cc.xor_(i, i);
+//!
+//!   Label L1 = cc.newLabel();
+//!   Label L2 = cc.newLabel();
+//!
+//!   cc.bind(L1);                      // First loop, fill the stack.
+//!   cc.mov(stackIdx, i.r8());         // stack[i] = uint8_t(i).
+//!
+//!   cc.inc(i);                        // i++;
+//!   cc.cmp(i, 256);                   // if (i < 256)
+//!   cc.jb(L1);                        //   goto L1;
+//!
+//!   // Second loop, sum all bytes stored in `stack`.
+//!   x86::Gp sum = cc.newInt32("sum");
+//!   x86::Gp val = cc.newInt32("val");
+//!
+//!   cc.xor_(i, i);
+//!   cc.xor_(sum, sum);
+//!
+//!   cc.bind(L2);
+//!
+//!   cc.movzx(val, stackIdx);          // val = uint32_t(stack[i]);
+//!   cc.add(sum, val);                 // sum += val;
+//!
+//!   cc.inc(i);                        // i++;
+//!   cc.cmp(i, 256);                   // if (i < 256)
+//!   cc.jb(L2);                        //   goto L2;
+//!
+//!   cc.ret(sum);                      // Return the `sum` of all values.
+//!   cc.endFunc();                     // End of the function body.
+//!
+//!   cc.finalize();                    // Translate and assemble the whole 'cc' content.
+//!   // ----> x86::Compiler is no longer needed from here and can be destroyed <----
+//!
+//!   Func func;
+//!   Error err = rt.add(&func, &code); // Add the generated code to the runtime.
+//!   if (err) return 1;                // Handle a possible error returned by AsmJit.
+//!   // ----> CodeHolder is no longer needed from here and can be destroyed <----
+//!
+//!   printf("Func() -> %d\n", func()); // Test the generated code.
+//!
+//!   rt.release(func);
+//!   return 0;
+//! }
+//! ```
+//!
+//! ### Constant Pool
+//!
+//! Compiler provides two constant pools for a general purpose code generation:
+//!
+//!   - Local constant pool - Part of \ref FuncNode, can only be used by a single function and is added after the
+//!     function epilog sequence (after the `ret` instruction).
+//!
+//!   - Global constant pool - Part of \ref BaseCompiler, flushed at the end of the generated code by \ref
+//!     BaseEmitter::finalize().
+//!
+//! The example below illustrates how a built-in constant pool can be used:
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//!
+//! using namespace asmjit;
+//!
+//! static void exampleUseOfConstPool(x86::Compiler& cc) {
+//!   cc.addFunc(FuncSignatureT<int>());
+//!
+//!   x86::Gp v0 = cc.newGpd("v0");
+//!   x86::Gp v1 = cc.newGpd("v1");
+//!
+//!   x86::Mem c0 = cc.newInt32Const(ConstPoolScope::kLocal, 200);
+//!   x86::Mem c1 = cc.newInt32Const(ConstPoolScope::kLocal, 33);
+//!
+//!   cc.mov(v0, c0);
+//!   cc.mov(v1, c1);
+//!   cc.add(v0, v1);
+//!
+//!   cc.ret(v0);
+//!   cc.endFunc();
+//! }
+//! ```
+//!
+//! ### Jump Tables
+//!
+//! x86::Compiler supports `jmp` instruction with reg/mem operand, which is a commonly used pattern to implement
+//! indirect jumps within a function, for example to implement a `switch()` statement in a programming language.
+//! By default AsmJit assumes that every basic block can be a possible jump target as it's unable to deduce targets
+//! from instruction's operands. This is a very pessimistic default that should be avoided if possible as it's costly
+//! and very unfriendly to liveness analysis and register allocation.
+//!
+//! Instead of relying on such pessimistic default behavior, let's use \ref JumpAnnotation to annotate a jump where
+//! all targets are known:
+//!
+//! ```
+//! #include <asmjit/x86.h>
+//!
+//! using namespace asmjit;
+//!
+//! static void exampleUseOfIndirectJump(x86::Compiler& cc) {
+//!   FuncNode* funcNode = cc.addFunc(FuncSignatureT<float, float, float, uint32_t>(CallConvId::kHost));
+//!
+//!   // Function arguments
+//!   x86::Xmm a = cc.newXmmSs("a");
+//!   x86::Xmm b = cc.newXmmSs("b");
+//!   x86::Gp op = cc.newUInt32("op");
+//!
+//!   x86::Gp target = cc.newIntPtr("target");
+//!   x86::Gp offset = cc.newIntPtr("offset");
+//!
+//!   Label L_Table = cc.newLabel();
+//!   Label L_Add = cc.newLabel();
+//!   Label L_Sub = cc.newLabel();
+//!   Label L_Mul = cc.newLabel();
+//!   Label L_Div = cc.newLabel();
+//!   Label L_End = cc.newLabel();
+//!
+//!   funcNode->setArg(0, a);
+//!   funcNode->setArg(1, b);
+//!   funcNode->setArg(2, op);
+//!
+//!   // Jump annotation is a building block that allows to annotate all possible targets where `jmp()` can
+//!   // jump. It then drives the CFG construction and liveness analysis, which impacts register allocation.
+//!   JumpAnnotation* annotation = cc.newJumpAnnotation();
+//!   annotation->addLabel(L_Add);
+//!   annotation->addLabel(L_Sub);
+//!   annotation->addLabel(L_Mul);
+//!   annotation->addLabel(L_Div);
+//!
+//!   // Most likely not the common indirect jump approach, but it
+//!   // doesn't really matter how the final address is calculated. The
+//!   // most important part is using JumpAnnotation with `jmp()`.
+//!   cc.lea(offset, x86::ptr(L_Table));
+//!   if (cc.is64Bit())
+//!     cc.movsxd(target, x86::dword_ptr(offset, op.cloneAs(offset), 2));
+//!   else
+//!     cc.mov(target, x86::dword_ptr(offset, op.cloneAs(offset), 2));
+//!   cc.add(target, offset);
+//!   cc.jmp(target, annotation);
+//!
+//!   // Acts like a switch() statement in C.
+//!   cc.bind(L_Add);
+//!   cc.addss(a, b);
+//!   cc.jmp(L_End);
+//!
+//!   cc.bind(L_Sub);
+//!   cc.subss(a, b);
+//!   cc.jmp(L_End);
+//!
+//!   cc.bind(L_Mul);
+//!   cc.mulss(a, b);
+//!   cc.jmp(L_End);
+//!
+//!   cc.bind(L_Div);
+//!   cc.divss(a, b);
+//!
+//!   cc.bind(L_End);
+//!   cc.ret(a);
+//!
+//!   cc.endFunc();
+//!
+//!   // Relative int32_t offsets of `L_XXX - L_Table`.
+//!   cc.bind(L_Table);
+//!   cc.embedLabelDelta(L_Add, L_Table, 4);
+//!   cc.embedLabelDelta(L_Sub, L_Table, 4);
+//!   cc.embedLabelDelta(L_Mul, L_Table, 4);
+//!   cc.embedLabelDelta(L_Div, L_Table, 4);
+//! }
+//! ```
+class ASMJIT_VIRTAPI Compiler
+  : public BaseCompiler,
+    public EmitterExplicitT<Compiler> {
+public:
+  ASMJIT_NONCOPYABLE(Compiler)
+  typedef BaseCompiler Base;
+
+  //! \name Construction & Destruction
+  //! \{
+
+  //! Creates a new `x86::Compiler`; when `code` is provided the compiler is attached to it.
+  ASMJIT_API explicit Compiler(CodeHolder* code = nullptr) noexcept;
+  //! Destroys the `x86::Compiler` instance.
+  ASMJIT_API virtual ~Compiler() noexcept;
+
+  //! \}
+
+  //! \name Virtual Registers
+  //! \{
+
+  // Helper used by the register factories below to create a register whose name is built from a format string.
+  // With logging enabled it forwards to `_newRegFmt()`. Without logging the format string and its arguments are
+  // only marked as unused and the register is created through `_newReg()` without a name.
+#ifndef ASMJIT_NO_LOGGING
+# define ASMJIT_NEW_REG_FMT(OUT, PARAM, FORMAT, ARGS)                         \
+    _newRegFmt(&OUT, PARAM, FORMAT, ARGS)
+#else
+# define ASMJIT_NEW_REG_FMT(OUT, PARAM, FORMAT, ARGS)                         \
+    DebugUtils::unused(FORMAT);                                               \
+    DebugUtils::unused(std::forward<Args>(args)...);                          \
+    _newReg(&OUT, PARAM)
+#endif
+
+  // Defines `REG FUNC(TypeId)` and `REG FUNC(TypeId, fmt, args...)` - factories taking an explicit TypeId.
+#define ASMJIT_NEW_REG_CUSTOM(FUNC, REG)                                      \
+    inline REG FUNC(TypeId typeId) {                                          \
+      REG reg(Globals::NoInit);                                               \
+      _newReg(&reg, typeId);                                                  \
+      return reg;                                                             \
+    }                                                                         \
+                                                                              \
+    template<typename... Args>                                                \
+    inline REG FUNC(TypeId typeId, const char* fmt, Args&&... args) {         \
+      REG reg(Globals::NoInit);                                               \
+      ASMJIT_NEW_REG_FMT(reg, typeId, fmt, std::forward<Args>(args)...);      \
+      return reg;                                                             \
+    }
+
+  // Defines `REG FUNC()` and `REG FUNC(fmt, args...)` - factories bound to the fixed `TYPE_ID`.
+#define ASMJIT_NEW_REG_TYPED(FUNC, REG, TYPE_ID)                              \
+    inline REG FUNC() {                                                       \
+      REG reg(Globals::NoInit);                                               \
+      _newReg(&reg, TYPE_ID);                                                 \
+      return reg;                                                             \
+    }                                                                         \
+                                                                              \
+    template<typename... Args>                                                \
+    inline REG FUNC(const char* fmt, Args&&... args) {                        \
+      REG reg(Globals::NoInit);                                               \
+      ASMJIT_NEW_REG_FMT(reg, TYPE_ID, fmt, std::forward<Args>(args)...);     \
+      return reg;                                                             \
+    }
+
+  //! Creates a new virtual register by passing `ref` as the reference register to `_newReg()`.
+  template<typename RegT>
+  inline RegT newSimilarReg(const RegT& ref) {
+    RegT reg(Globals::NoInit);
+    // The output register is passed by pointer, consistently with every other `_newReg()` call in this class.
+    _newReg(&reg, ref);
+    return reg;
+  }
+
+  //! \overload
+  //!
+  //! The register name is built from `fmt` and `args` (the name is ignored when logging is compiled out).
+  template<typename RegT, typename... Args>
+  inline RegT newSimilarReg(const RegT& ref, const char* fmt, Args&&... args) {
+    RegT reg(Globals::NoInit);
+    ASMJIT_NEW_REG_FMT(reg, ref, fmt, std::forward<Args>(args)...);
+    return reg;
+  }
+
+  // Factories that take the TypeId of the register to create as their first argument.
+  ASMJIT_NEW_REG_CUSTOM(newReg    , Reg )
+  ASMJIT_NEW_REG_CUSTOM(newGp     , Gp  )
+  ASMJIT_NEW_REG_CUSTOM(newVec    , Vec )
+  ASMJIT_NEW_REG_CUSTOM(newK      , KReg)
+
+  // General purpose register factories named after the integer type they represent.
+  ASMJIT_NEW_REG_TYPED(newInt8   , Gp  , TypeId::kInt8)
+  ASMJIT_NEW_REG_TYPED(newUInt8  , Gp  , TypeId::kUInt8)
+  ASMJIT_NEW_REG_TYPED(newInt16  , Gp  , TypeId::kInt16)
+  ASMJIT_NEW_REG_TYPED(newUInt16 , Gp  , TypeId::kUInt16)
+  ASMJIT_NEW_REG_TYPED(newInt32  , Gp  , TypeId::kInt32)
+  ASMJIT_NEW_REG_TYPED(newUInt32 , Gp  , TypeId::kUInt32)
+  ASMJIT_NEW_REG_TYPED(newInt64  , Gp  , TypeId::kInt64)
+  ASMJIT_NEW_REG_TYPED(newUInt64 , Gp  , TypeId::kUInt64)
+  ASMJIT_NEW_REG_TYPED(newIntPtr , Gp  , TypeId::kIntPtr)
+  ASMJIT_NEW_REG_TYPED(newUIntPtr, Gp  , TypeId::kUIntPtr)
+
+  // Factories named after the register kind (GP, XMM, YMM, ZMM, MM, K), each bound to a fixed TypeId.
+  ASMJIT_NEW_REG_TYPED(newGpb    , Gp  , TypeId::kUInt8)
+  ASMJIT_NEW_REG_TYPED(newGpw    , Gp  , TypeId::kUInt16)
+  ASMJIT_NEW_REG_TYPED(newGpd    , Gp  , TypeId::kUInt32)
+  ASMJIT_NEW_REG_TYPED(newGpq    , Gp  , TypeId::kUInt64)
+  ASMJIT_NEW_REG_TYPED(newGpz    , Gp  , TypeId::kUIntPtr)
+  ASMJIT_NEW_REG_TYPED(newXmm    , Xmm , TypeId::kInt32x4)
+  ASMJIT_NEW_REG_TYPED(newXmmSs  , Xmm , TypeId::kFloat32x1)
+  ASMJIT_NEW_REG_TYPED(newXmmSd  , Xmm , TypeId::kFloat64x1)
+  ASMJIT_NEW_REG_TYPED(newXmmPs  , Xmm , TypeId::kFloat32x4)
+  ASMJIT_NEW_REG_TYPED(newXmmPd  , Xmm , TypeId::kFloat64x2)
+  ASMJIT_NEW_REG_TYPED(newYmm    , Ymm , TypeId::kInt32x8)
+  ASMJIT_NEW_REG_TYPED(newYmmPs  , Ymm , TypeId::kFloat32x8)
+  ASMJIT_NEW_REG_TYPED(newYmmPd  , Ymm , TypeId::kFloat64x4)
+  ASMJIT_NEW_REG_TYPED(newZmm    , Zmm , TypeId::kInt32x16)
+  ASMJIT_NEW_REG_TYPED(newZmmPs  , Zmm , TypeId::kFloat32x16)
+  ASMJIT_NEW_REG_TYPED(newZmmPd  , Zmm , TypeId::kFloat64x8)
+  ASMJIT_NEW_REG_TYPED(newMm     , Mm  , TypeId::kMmx64)
+  ASMJIT_NEW_REG_TYPED(newKb     , KReg, TypeId::kMask8)
+  ASMJIT_NEW_REG_TYPED(newKw     , KReg, TypeId::kMask16)
+  ASMJIT_NEW_REG_TYPED(newKd     , KReg, TypeId::kMask32)
+  ASMJIT_NEW_REG_TYPED(newKq     , KReg, TypeId::kMask64)
+
+  // The helper macros are private to this class definition.
+#undef ASMJIT_NEW_REG_TYPED
+#undef ASMJIT_NEW_REG_CUSTOM
+#undef ASMJIT_NEW_REG_FMT
+
+  //! \}
+
+  //! \name Stack
+  //! \{
+
+  //! Creates a new memory chunk allocated on the current function's stack.
+  //!
+  //! The requested `size`, `alignment`, and optional `name` are forwarded to `_newStack()`.
+  inline Mem newStack(uint32_t size, uint32_t alignment, const char* name = nullptr) {
+    Mem m(Globals::NoInit);
+    _newStack(&m, size, alignment, name);
+    return m;
+  }
+
+  //! \}
+
+  //! \name Constants
+  //! \{
+
+  //! Put data to a constant-pool and get a memory reference to it.
+  //!
+  //! `scope` selects the constant pool, `data` points to `size` bytes that describe the constant.
+  inline Mem newConst(ConstPoolScope scope, const void* data, size_t size) {
+    Mem m(Globals::NoInit);
+    _newConst(&m, scope, data, size);
+    return m;
+  }
+
+  //! Put a BYTE `val` to a constant-pool.
+  inline Mem newByteConst(ConstPoolScope scope, uint8_t val) noexcept { return newConst(scope, &val, 1); }
+  //! Put a WORD `val` to a constant-pool.
+  inline Mem newWordConst(ConstPoolScope scope, uint16_t val) noexcept { return newConst(scope, &val, 2); }
+  //! Put a DWORD `val` to a constant-pool.
+  inline Mem newDWordConst(ConstPoolScope scope, uint32_t val) noexcept { return newConst(scope, &val, 4); }
+  //! Put a QWORD `val` to a constant-pool.
+  inline Mem newQWordConst(ConstPoolScope scope, uint64_t val) noexcept { return newConst(scope, &val, 8); }
+
+  //! Put a WORD `val` to a constant-pool.
+  inline Mem newInt16Const(ConstPoolScope scope, int16_t val) noexcept { return newConst(scope, &val, 2); }
+  //! Put a WORD `val` to a constant-pool.
+  inline Mem newUInt16Const(ConstPoolScope scope, uint16_t val) noexcept { return newConst(scope, &val, 2); }
+  //! Put a DWORD `val` to a constant-pool.
+  inline Mem newInt32Const(ConstPoolScope scope, int32_t val) noexcept { return newConst(scope, &val, 4); }
+  //! Put a DWORD `val` to a constant-pool.
+  inline Mem newUInt32Const(ConstPoolScope scope, uint32_t val) noexcept { return newConst(scope, &val, 4); }
+  //! Put a QWORD `val` to a constant-pool.
+  inline Mem newInt64Const(ConstPoolScope scope, int64_t val) noexcept { return newConst(scope, &val, 8); }
+  //! Put a QWORD `val` to a constant-pool.
+  inline Mem newUInt64Const(ConstPoolScope scope, uint64_t val) noexcept { return newConst(scope, &val, 8); }
+
+  //! Put a SP-FP `val` to a constant-pool.
+  inline Mem newFloatConst(ConstPoolScope scope, float val) noexcept { return newConst(scope, &val, 4); }
+  //! Put a DP-FP `val` to a constant-pool.
+  inline Mem newDoubleConst(ConstPoolScope scope, double val) noexcept { return newConst(scope, &val, 8); }
+
+  //! \}
+
+  //! \name Instruction Options
+  //! \{
+
+  //! Force the compiler to not follow the conditional or unconditional jump.
+  inline Compiler& unfollow() noexcept { addInstOptions(InstOptions::kUnfollow); return *this; }
+  //! Tell the compiler that the destination variable will be overwritten.
+  inline Compiler& overwrite() noexcept { addInstOptions(InstOptions::kOverwrite); return *this; }
+
+  //! \}
+
+  //! \name Function Call & Ret Intrinsics
+  //! \{
+
+  //! Invoke a function call without `target` type enforcement.
+  inline Error invoke_(InvokeNode** out, const Operand_& target, const FuncSignature& signature) {
+    return addInvokeNode(out, Inst::kIdCall, target, signature);
+  }
+
+  //! Invoke a function call of the given `target` and `signature` and store the added node to `out`.
+  //!
+  //! Creates a new \ref InvokeNode, initializes all the necessary members to match the given function `signature`,
+  //! adds the node to the compiler, and stores its pointer to `out`. The operation is atomic, if anything fails
+  //! nullptr is stored in `out` and error code is returned.
+  inline Error invoke(InvokeNode** out, const Gp& target, const FuncSignature& signature) { return invoke_(out, target, signature); }
+  //! \overload
+  inline Error invoke(InvokeNode** out, const Mem& target, const FuncSignature& signature) { return invoke_(out, target, signature); }
+  //! \overload
+  inline Error invoke(InvokeNode** out, const Label& target, const FuncSignature& signature) { return invoke_(out, target, signature); }
+  //! \overload
+  inline Error invoke(InvokeNode** out, const Imm& target, const FuncSignature& signature) { return invoke_(out, target, signature); }
+  //! \overload
+  inline Error invoke(InvokeNode** out, uint64_t target, const FuncSignature& signature) { return invoke_(out, Imm(int64_t(target)), signature); }
+
+  //! Return from function.
+  inline Error ret() { return addRet(Operand(), Operand()); }
+  //! \overload
+  inline Error ret(const BaseReg& o0) { return addRet(o0, Operand()); }
+  //! \overload
+  inline Error ret(const BaseReg& o0, const BaseReg& o1) { return addRet(o0, o1); }
+
+  //! \}
+
+  //! \name Jump Tables Support
+  //! \{
+
+  // Keep the `jmp` overloads of the emitter visible next to the annotated overloads declared below.
+  using EmitterExplicitT<Compiler>::jmp;
+
+  //! Adds a jump to the given `target` with the provided jump `annotation`.
+  inline Error jmp(const BaseReg& target, JumpAnnotation* annotation) { return emitAnnotatedJump(Inst::kIdJmp, target, annotation); }
+  //! \overload
+  inline Error jmp(const BaseMem& target, JumpAnnotation* annotation) { return emitAnnotatedJump(Inst::kIdJmp, target, annotation); }
+
+  //! \}
+
+  //! \name Events
+  //! \{
+
+  // Overrides of the attach / detach event handlers; both are implemented out of line.
+  ASMJIT_API Error onAttach(CodeHolder* code) noexcept override;
+  ASMJIT_API Error onDetach(CodeHolder* code) noexcept override;
+
+  //! \}
+
+  //! \name Finalize
+  //! \{
+
+  //! Translates and assembles the whole content of the compiler.
+  ASMJIT_API Error finalize() override;
+
+  //! \}
+};
+
+//! \}
+
+ASMJIT_END_SUB_NAMESPACE
+
+#endif // !ASMJIT_NO_COMPILER
+#endif // ASMJIT_X86_X86COMPILER_H_INCLUDED
--- a/lib/lepton/asmjit/x86/x86emithelper.cpp
+++ b/lib/lepton/asmjit/x86/x86emithelper.cpp
@ -0,0 +1,619 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See asmjit.h or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#include "../core/api-build_p.h"
+#if !defined(ASMJIT_NO_X86)
+
+#include "../core/formatter.h"
+#include "../core/funcargscontext_p.h"
+#include "../core/string.h"
+#include "../core/support.h"
+#include "../core/type.h"
+#include "../core/radefs_p.h"
+#include "../x86/x86emithelper_p.h"
+#include "../x86/x86emitter.h"
+#include "../x86/x86formatter_p.h"
+#include "../x86/x86instapi_p.h"
+
+ASMJIT_BEGIN_SUB_NAMESPACE(x86)
+
+// x86::EmitHelper - Utilities
+// ===========================
+
+// Selects the instruction used to move XMM registers for the given `frame`: the aligned form (movaps) when the
+// frame reports `hasAlignedVecSR()`, the unaligned form (movups) otherwise; the AVX (`v`-prefixed) variant is
+// picked when the frame reports `isAvxEnabled()`.
+static inline uint32_t getXmmMovInst(const FuncFrame& frame) {
+  const bool useAvx = frame.isAvxEnabled();
+
+  if (frame.hasAlignedVecSR())
+    return useAvx ? Inst::kIdVmovaps : Inst::kIdMovaps;
+
+  return useAvx ? Inst::kIdVmovups : Inst::kIdMovups;
+}
+
+//! Maps `size` (1, 2, 4, or 8) to the matching 'kmov?' instruction (kmovb, kmovw, kmovd, or kmovq); any other
+//! `size` yields `Inst::kIdNone`.
+static inline uint32_t kmovInstFromSize(uint32_t size) noexcept {
+  if (size == 1) return Inst::kIdKmovb;
+  if (size == 2) return Inst::kIdKmovw;
+  if (size == 4) return Inst::kIdKmovd;
+  if (size == 8) return Inst::kIdKmovq;
+
+  return Inst::kIdNone;
+}
+
+// Packs a (destination, source) TypeId pair into a single integer so a cast can be compared with one `==`:
+// `dst` is shifted left by 8 bits and OR-ed with `src`.
+static inline uint32_t makeCastOp(TypeId dst, TypeId src) noexcept {
+  const uint32_t dstPart = uint32_t(dst) << 8;
+  const uint32_t srcPart = uint32_t(src);
+  return dstPart | srcPart;
+}
+
+// x86::EmitHelper - Emit Reg Move
+// ===============================
+
+// Emits a single instruction that moves a value of type `typeId` from `src_` to `dst_`.
+//
+// Either operand may be a memory operand. Both operands are copied first so their size / signature can be patched
+// to match the selected instruction. `comment` is set as the inline comment of the emitted instruction.
+//
+// Returns the result of `_emitter->emit()`, or `kErrorInvalidState` if no instruction was selected.
+ASMJIT_FAVOR_SIZE Error EmitHelper::emitRegMove(
+  const Operand_& dst_,
+  const Operand_& src_, TypeId typeId, const char* comment) {
+
+  // Invalid or abstract TypeIds are not allowed.
+  ASMJIT_ASSERT(TypeUtils::isValid(typeId) && !TypeUtils::isAbstract(typeId));
+
+  // Local, patchable copies of the operands.
+  Operand dst(dst_);
+  Operand src(src_);
+
+  InstId instId = Inst::kIdNone;
+  // Combination of `MemFlags` describing which of the two operands is a memory operand.
+  uint32_t memFlags = 0;
+  // When non-zero, the size that memory operands are forced to right before emitting.
+  uint32_t overrideMemSize = 0;
+
+  enum MemFlags : uint32_t {
+    kDstMem = 0x1,
+    kSrcMem = 0x2
+  };
+
+  // Detect memory operands and patch them to have the same size as the register. BaseCompiler always sets memory size
+  // of allocs and spills, so it shouldn't be really necessary, however, after this function was separated from Compiler
+  // it's better to make sure that the size is always specified, as we can use 'movzx' and 'movsx' that rely on it.
+  if (dst.isMem()) { memFlags |= kDstMem; dst.as<Mem>().setSize(src.size()); }
+  if (src.isMem()) { memFlags |= kSrcMem; src.as<Mem>().setSize(dst.size()); }
+
+  switch (typeId) {
+    case TypeId::kInt8:
+    case TypeId::kUInt8:
+    case TypeId::kInt16:
+    case TypeId::kUInt16:
+      // Special case - 'movzx' load.
+      if (memFlags & kSrcMem) {
+        instId = Inst::kIdMovzx;
+        dst.setSignature(Reg::signatureOfT<RegType::kX86_Gpd>());
+        break;
+      }
+
+      if (!memFlags) {
+        // Change both destination and source registers to GPD (safer, no dependencies).
+        dst.setSignature(Reg::signatureOfT<RegType::kX86_Gpd>());
+        src.setSignature(Reg::signatureOfT<RegType::kX86_Gpd>());
+      }
+      // Register-to-register moves and stores to memory continue with a plain 'mov'.
+      ASMJIT_FALLTHROUGH;
+
+    case TypeId::kInt32:
+    case TypeId::kUInt32:
+    case TypeId::kInt64:
+    case TypeId::kUInt64:
+      instId = Inst::kIdMov;
+      break;
+
+    case TypeId::kMmx32:
+      // 'movd' is kept only when a memory operand is involved, otherwise 'movq' is selected below.
+      instId = Inst::kIdMovd;
+      if (memFlags) break;
+      ASMJIT_FALLTHROUGH;
+
+    case TypeId::kMmx64 : instId = Inst::kIdMovq ; break;
+    case TypeId::kMask8 : instId = Inst::kIdKmovb; break;
+    case TypeId::kMask16: instId = Inst::kIdKmovw; break;
+    case TypeId::kMask32: instId = Inst::kIdKmovd; break;
+    case TypeId::kMask64: instId = Inst::kIdKmovq; break;
+
+    default: {
+      // Remaining type ids are handled as vectors; the scalar element type picks the instruction family.
+      TypeId scalarTypeId = TypeUtils::scalarOf(typeId);
+      // 32-bit vector with a memory operand - scalar move with the memory size forced to 4.
+      if (TypeUtils::isVec32(typeId) && memFlags) {
+        overrideMemSize = 4;
+        if (scalarTypeId == TypeId::kFloat32)
+          instId = _avxEnabled ? Inst::kIdVmovss : Inst::kIdMovss;
+        else
+          instId = _avxEnabled ? Inst::kIdVmovd : Inst::kIdMovd;
+        break;
+      }
+
+      // 64-bit vector with a memory operand - scalar move with the memory size forced to 8.
+      if (TypeUtils::isVec64(typeId) && memFlags) {
+        overrideMemSize = 8;
+        if (scalarTypeId == TypeId::kFloat64)
+          instId = _avxEnabled ? Inst::kIdVmovsd : Inst::kIdMovsd;
+        else
+          instId = _avxEnabled ? Inst::kIdVmovq : Inst::kIdMovq;
+        break;
+      }
+
+      // Everything else uses an aligned move selected by element type and by the enabled AVX / AVX-512 support.
+      if (scalarTypeId == TypeId::kFloat32)
+        instId = _avxEnabled ? Inst::kIdVmovaps : Inst::kIdMovaps;
+      else if (scalarTypeId == TypeId::kFloat64)
+        instId = _avxEnabled ? Inst::kIdVmovapd : Inst::kIdMovapd;
+      else if (!_avx512Enabled)
+        instId = _avxEnabled ? Inst::kIdVmovdqa : Inst::kIdMovdqa;
+      else
+        instId = Inst::kIdVmovdqa32;
+      break;
+    }
+  }
+
+  // No instruction selected - report an invalid state instead of emitting.
+  if (!instId)
+    return DebugUtils::errored(kErrorInvalidState);
+
+  if (overrideMemSize) {
+    if (dst.isMem()) dst.as<Mem>().setSize(overrideMemSize);
+    if (src.isMem()) src.as<Mem>().setSize(overrideMemSize);
+  }
+
+  _emitter->setInlineComment(comment);
+  return _emitter->emit(instId, dst, src);
+}
+
+// x86::EmitHelper - Emit Arg Move
+// ===============================
+
+// Emits a single instruction that moves (and, where required, extends or converts) a value of type `srcTypeId`
+// held in `src_` (register or memory) into the register `dst_` of type `dstTypeId`.
+//
+// `dstTypeId` may be `TypeId::kVoid`, in which case it's deduced from the type of `dst_`. `comment` is set as
+// the inline comment of the emitted instruction.
+//
+// Returns the result of `_emitter->emit()`, or `kErrorInvalidState` if no supported combination was matched.
+ASMJIT_FAVOR_SIZE Error EmitHelper::emitArgMove(
+  const BaseReg& dst_, TypeId dstTypeId,
+  const Operand_& src_, TypeId srcTypeId, const char* comment) {
+
+  // Deduce optional `dstTypeId`, which may be `TypeId::kVoid` in some cases.
+  if (dstTypeId == TypeId::kVoid) {
+    const ArchTraits& archTraits = ArchTraits::byArch(_emitter->arch());
+    dstTypeId = archTraits.regTypeToTypeId(dst_.type());
+  }
+
+  // Invalid or abstract TypeIds are not allowed.
+  ASMJIT_ASSERT(TypeUtils::isValid(dstTypeId) && !TypeUtils::isAbstract(dstTypeId));
+  ASMJIT_ASSERT(TypeUtils::isValid(srcTypeId) && !TypeUtils::isAbstract(srcTypeId));
+
+  // Local, patchable copies of the operands.
+  Reg dst(dst_.as<Reg>());
+  Operand src(src_);
+
+  uint32_t dstSize = TypeUtils::sizeOf(dstTypeId);
+  uint32_t srcSize = TypeUtils::sizeOf(srcTypeId);
+
+  InstId instId = Inst::kIdNone;
+
+  // Not a real loop, just 'break' is nicer than 'goto'.
+  // Every supported combination ends with 'break'; reaching the end of the body means nothing matched.
+  for (;;) {
+    // Destination is an integer (general purpose) register.
+    if (TypeUtils::isInt(dstTypeId)) {
+      if (TypeUtils::isInt(srcTypeId)) {
+        instId = Inst::kIdMovsx;
+        uint32_t castOp = makeCastOp(dstTypeId, srcTypeId);
+
+        // Sign extend by using 'movsx'.
+        if (castOp == makeCastOp(TypeId::kInt16, TypeId::kInt8 ) ||
+            castOp == makeCastOp(TypeId::kInt32, TypeId::kInt8 ) ||
+            castOp == makeCastOp(TypeId::kInt32, TypeId::kInt16) ||
+            castOp == makeCastOp(TypeId::kInt64, TypeId::kInt8 ) ||
+            castOp == makeCastOp(TypeId::kInt64, TypeId::kInt16))
+          break;
+
+        // Sign extend by using 'movsxd'.
+        instId = Inst::kIdMovsxd;
+        if (castOp == makeCastOp(TypeId::kInt64, TypeId::kInt32))
+          break;
+      }
+
+      if (TypeUtils::isInt(srcTypeId) || src_.isMem()) {
+        // Zero extend by using 'movzx' or 'mov'.
+        if (dstSize <= 4 && srcSize < 4) {
+          instId = Inst::kIdMovzx;
+          dst.setSignature(Reg::signatureOfT<RegType::kX86_Gpd>());
+        }
+        else {
+          // We should have caught all possibilities where `srcSize` is less than 4, so we don't have to worry
+          // about 'movzx' anymore. Minimum size is enough to determine if we want 32-bit or 64-bit move.
+          instId = Inst::kIdMov;
+          srcSize = Support::min(srcSize, dstSize);
+
+          dst.setSignature(srcSize == 4 ? Reg::signatureOfT<RegType::kX86_Gpd>()
+                                        : Reg::signatureOfT<RegType::kX86_Gpq>());
+          if (src.isReg())
+            src.setSignature(dst.signature());
+        }
+        break;
+      }
+
+      // NOTE: The previous branch caught all memory sources, from here it's always register to register conversion,
+      // so catch the remaining cases.
+      srcSize = Support::min(srcSize, dstSize);
+
+      if (TypeUtils::isMmx(srcTypeId)) {
+        // 64-bit move.
+        instId = Inst::kIdMovq;
+        if (srcSize == 8)
+          break;
+
+        // 32-bit move.
+        instId = Inst::kIdMovd;
+        dst.setSignature(Reg::signatureOfT<RegType::kX86_Gpd>());
+        break;
+      }
+
+      if (TypeUtils::isMask(srcTypeId)) {
+        instId = kmovInstFromSize(srcSize);
+        dst.setSignature(srcSize <= 4 ? Reg::signatureOfT<RegType::kX86_Gpd>()
+                                      : Reg::signatureOfT<RegType::kX86_Gpq>());
+        break;
+      }
+
+      if (TypeUtils::isVec(srcTypeId)) {
+        // 64-bit move.
+        instId = _avxEnabled ? Inst::kIdVmovq : Inst::kIdMovq;
+        if (srcSize == 8)
+          break;
+
+        // 32-bit move.
+        instId = _avxEnabled ? Inst::kIdVmovd : Inst::kIdMovd;
+        dst.setSignature(Reg::signatureOfT<RegType::kX86_Gpd>());
+        break;
+      }
+    }
+
+    // Destination is an MMX register.
+    if (TypeUtils::isMmx(dstTypeId)) {
+      instId = Inst::kIdMovq;
+      srcSize = Support::min(srcSize, dstSize);
+
+      if (TypeUtils::isInt(srcTypeId) || src.isMem()) {
+        // 64-bit move.
+        if (srcSize == 8)
+          break;
+
+        // 32-bit move.
+        instId = Inst::kIdMovd;
+        if (src.isReg())
+          src.setSignature(Reg::signatureOfT<RegType::kX86_Gpd>());
+        break;
+      }
+
+      if (TypeUtils::isMmx(srcTypeId))
+        break;
+
+      // This will hurt if AVX is enabled.
+      instId = Inst::kIdMovdq2q;
+      if (TypeUtils::isVec(srcTypeId))
+        break;
+    }
+
+    // Destination is a mask (K) register.
+    if (TypeUtils::isMask(dstTypeId)) {
+      srcSize = Support::min(srcSize, dstSize);
+
+      if (TypeUtils::isInt(srcTypeId) || TypeUtils::isMask(srcTypeId) || src.isMem()) {
+        instId = kmovInstFromSize(srcSize);
+        if (Reg::isGp(src) && srcSize <= 4)
+          src.setSignature(Reg::signatureOfT<RegType::kX86_Gpd>());
+        break;
+      }
+    }
+
+    // Destination is a vector register.
+    if (TypeUtils::isVec(dstTypeId)) {
+      // By default set destination to XMM, will be set to YMM|ZMM if needed.
+      dst.setSignature(Reg::signatureOfT<RegType::kX86_Xmm>());
+
+      // This will hurt if AVX is enabled.
+      if (Reg::isMm(src)) {
+        // 64-bit move.
+        instId = Inst::kIdMovq2dq;
+        break;
+      }
+
+      // Argument conversion.
+      TypeId dstScalarId = TypeUtils::scalarOf(dstTypeId);
+      TypeId srcScalarId = TypeUtils::scalarOf(srcTypeId);
+
+      // NOTE(review): this branch is selected for a Float32 destination and a Float64 source, yet the chosen
+      // mnemonics (cvtss2sd / cvtps2pd) name the single -> double direction - confirm against the ISA reference
+      // whether the instruction ids of this branch and of the following one are swapped.
+      if (dstScalarId == TypeId::kFloat32 && srcScalarId == TypeId::kFloat64) {
+        srcSize = Support::min(dstSize * 2, srcSize);
+        dstSize = srcSize / 2;
+
+        if (srcSize <= 8)
+          instId = _avxEnabled ? Inst::kIdVcvtss2sd : Inst::kIdCvtss2sd;
+        else
+          instId = _avxEnabled ? Inst::kIdVcvtps2pd : Inst::kIdCvtps2pd;
+
+        if (dstSize == 32)
+          dst.setSignature(Reg::signatureOfT<RegType::kX86_Ymm>());
+        if (src.isReg())
+          src.setSignature(Reg::signatureOfVecBySize(srcSize));
+        break;
+      }
+
+      // NOTE(review): selected for a Float64 destination and a Float32 source while the chosen mnemonics
+      // (cvtsd2ss / cvtpd2ps) name the double -> single direction - see the note on the previous branch.
+      if (dstScalarId == TypeId::kFloat64 && srcScalarId == TypeId::kFloat32) {
+        srcSize = Support::min(dstSize, srcSize * 2) / 2;
+        dstSize = srcSize * 2;
+
+        if (srcSize <= 4)
+          instId = _avxEnabled ? Inst::kIdVcvtsd2ss : Inst::kIdCvtsd2ss;
+        else
+          instId = _avxEnabled ? Inst::kIdVcvtpd2ps : Inst::kIdCvtpd2ps;
+
+        dst.setSignature(Reg::signatureOfVecBySize(dstSize));
+        if (src.isReg() && srcSize >= 32)
+          src.setSignature(Reg::signatureOfT<RegType::kX86_Ymm>());
+        break;
+      }
+
+      // No conversion required - only the smaller of both sizes is moved.
+      srcSize = Support::min(srcSize, dstSize);
+      if (Reg::isGp(src) || src.isMem()) {
+        // 32-bit move.
+        if (srcSize <= 4) {
+          instId = _avxEnabled ? Inst::kIdVmovd : Inst::kIdMovd;
+          if (src.isReg())
+            src.setSignature(Reg::signatureOfT<RegType::kX86_Gpd>());
+          break;
+        }
+
+        // 64-bit move.
+        if (srcSize == 8) {
+          instId = _avxEnabled ? Inst::kIdVmovq : Inst::kIdMovq;
+          break;
+        }
+      }
+
+      if (Reg::isVec(src) || src.isMem()) {
+        instId = _avxEnabled ? Inst::kIdVmovaps : Inst::kIdMovaps;
+
+        // A memory source smaller than the environment's stack alignment is loaded by an unaligned move.
+        if (src.isMem() && srcSize < _emitter->environment().stackAlignment())
+          instId = _avxEnabled ? Inst::kIdVmovups : Inst::kIdMovups;
+
+        OperandSignature signature = Reg::signatureOfVecBySize(srcSize);
+        dst.setSignature(signature);
+        if (src.isReg())
+          src.setSignature(signature);
+        break;
+      }
+    }
+
+    // No supported combination of destination and source matched.
+    return DebugUtils::errored(kErrorInvalidState);
+  }
+
+  // The size of a memory source must match the (possibly adjusted) size that is actually read.
+  if (src.isMem())
+    src.as<Mem>().setSize(srcSize);
+
+  _emitter->setInlineComment(comment);
+  return _emitter->emit(instId, dst, src);
+}
+
+//! Emits an exchange of registers `a` and `b`, attaching `comment` as the inline comment.
+//!
+//! Only a pair of general-purpose registers is handled (emitted as `xchg`); any other combination
+//! is reported as `kErrorInvalidState`.
+Error EmitHelper::emitRegSwap(const BaseReg& a, const BaseReg& b, const char* comment) {
+  const bool bothGp = a.isGp() && b.isGp();
+  if (!bothGp)
+    return DebugUtils::errored(kErrorInvalidState);
+
+  _emitter->setInlineComment(comment);
+  return _emitter->emit(Inst::kIdXchg, a, b);
+}
+
+// x86::EmitHelper - Emit Prolog & Epilog
+// ======================================
+
+//! Selects what is needed to save/restore registers of `group`: a register template (`xReg`), the
+//! move instruction (`xInst`), and the size of one register slot (`xSize`).
+//!
+//! Only the Vec, K, and MM groups are handled; for any other group the outputs are left untouched.
+static inline void X86Internal_setupSaveRestoreInfo(RegGroup group, const FuncFrame& frame, Reg& xReg, uint32_t& xInst, uint32_t& xSize) noexcept {
+  if (group == RegGroup::kVec) {
+    xReg = xmm(0);
+    xInst = getXmmMovInst(frame);
+  }
+  else if (group == RegGroup::kX86_K) {
+    xReg = k(0);
+    xInst = Inst::kIdKmovq;
+  }
+  else if (group == RegGroup::kX86_MM) {
+    xReg = mm(0);
+    xInst = Inst::kIdMovq;
+  }
+  else {
+    // Unhandled group - nothing is written to the outputs.
+    return;
+  }
+
+  // The slot size always follows the size of the selected register template.
+  xSize = xReg.size();
+}
+
+//! Emits the function prolog described by `frame`.
+//!
+//! The emitted sequence is, in order: optional frame-pointer setup, pushes of the saved GP registers,
+//! optional setup of the stack-arguments base register, optional dynamic stack alignment, optional
+//! stack adjustment, an optional store of the stack-arguments base register into its DA slot, and
+//! stores of all saved registers of the remaining register groups.
+//!
+//! Returns the first error reported by the emitter, `kErrorOk` otherwise.
+ASMJIT_FAVOR_SIZE Error EmitHelper::emitProlog(const FuncFrame& frame) {
+  Emitter* e = _emitter->as<Emitter>();
+  uint32_t gpToPush = frame.savedRegs(RegGroup::kGp);
+
+  Gp zsp = e->zsp();         // ESP|RSP register.
+  Gp zbp = e->zbp();         // EBP|RBP register.
+  Gp pushReg = zsp;          // Scratch GP operand, only its id changes between pushes.
+  Gp argBaseReg = zsp;       // Stack-arguments base pointer.
+
+  // Emit: 'push zbp'
+  //       'mov  zbp, zsp'.
+  // ZBP is pushed here, so it's removed from the generic push sequence below.
+  if (frame.hasPreservedFP()) {
+    gpToPush &= ~Support::bitMask(Gp::kIdBp);
+    ASMJIT_PROPAGATE(e->push(zbp));
+    ASMJIT_PROPAGATE(e->mov(zbp, zsp));
+  }
+
+  // Emit: 'push gp' sequence.
+  for (Support::BitWordIterator<RegMask> gpIt(gpToPush); gpIt.hasNext();) {
+    pushReg.setId(gpIt.next());
+    ASMJIT_PROPAGATE(e->push(pushReg));
+  }
+
+  // Emit: 'mov saReg, zsp' (or 'mov saReg, zbp' when the frame pointer is preserved and saReg isn't ZBP).
+  const uint32_t argBaseId = frame.saRegId();
+  if (argBaseId != BaseReg::kIdBad && argBaseId != Gp::kIdSp) {
+    argBaseReg.setId(argBaseId);
+    if (!frame.hasPreservedFP()) {
+      ASMJIT_PROPAGATE(e->mov(argBaseReg, zsp));
+    }
+    else if (argBaseId != Gp::kIdBp) {
+      ASMJIT_PROPAGATE(e->mov(argBaseReg, zbp));
+    }
+  }
+
+  // Emit: 'and zsp, StackAlignment'.
+  if (frame.hasDynamicAlignment()) {
+    ASMJIT_PROPAGATE(e->and_(zsp, -int32_t(frame.finalStackAlignment())));
+  }
+
+  // Emit: 'sub zsp, StackAdjustment'.
+  if (frame.hasStackAdjustment()) {
+    ASMJIT_PROPAGATE(e->sub(zsp, frame.stackAdjustment()));
+  }
+
+  // Emit: 'mov [zsp + DAOffset], saReg'.
+  if (frame.hasDynamicAlignment() && frame.hasDAOffset()) {
+    Mem daSlot = ptr(zsp, int32_t(frame.daOffset()));
+    ASMJIT_PROPAGATE(e->mov(daSlot, argBaseReg));
+  }
+
+  // Emit 'movxxx [zsp + X], {[x|y|z]mm, k}'.
+  {
+    Reg saveReg;
+    Mem slot = ptr(zsp, int32_t(frame.extraRegSaveOffset()));
+
+    uint32_t saveInst;
+    uint32_t slotSize;
+
+    for (RegGroup group : Support::EnumValues<RegGroup, RegGroup(1), RegGroup::kMaxVirt>{}) {
+      Support::BitWordIterator<RegMask> it(frame.savedRegs(group));
+      if (!it.hasNext())
+        continue;
+
+      // Instruction, register template, and slot size are selected once per non-empty group.
+      X86Internal_setupSaveRestoreInfo(group, frame, saveReg, saveInst, slotSize);
+      while (it.hasNext()) {
+        saveReg.setId(it.next());
+        ASMJIT_PROPAGATE(e->emit(saveInst, slot, saveReg));
+        slot.addOffsetLo32(int32_t(slotSize));
+      }
+    }
+  }
+
+  return kErrorOk;
+}
+
+//! Emits the function epilog described by `frame`.
+//!
+//! The emitted sequence is, in order: loads of saved registers of the non-GP groups, optional
+//! 'emms' / 'vzeroupper', restoration of the stack pointer, pops of the saved GP registers (highest
+//! register id first), an optional 'pop zbp', and finally 'ret' or 'ret x'.
+//!
+//! Returns the first error reported by the emitter, `kErrorOk` otherwise.
+ASMJIT_FAVOR_SIZE Error EmitHelper::emitEpilog(const FuncFrame& frame) {
+  Emitter* e = _emitter->as<Emitter>();
+
+  uint32_t registerSize = e->registerSize();
+  uint32_t gpToPop = frame.savedRegs(RegGroup::kGp);
+
+  Gp zsp = e->zsp();         // ESP|RSP register.
+  Gp zbp = e->zbp();         // EBP|RBP register.
+  Gp popReg = e->zsp();      // Scratch GP operand, only its id changes between pops.
+
+  // Don't emit 'pop zbp' in the pop sequence, this case is handled separately.
+  if (frame.hasPreservedFP())
+    gpToPop &= ~Support::bitMask(Gp::kIdBp);
+
+  // Emit 'movxxx {[x|y|z]mm, k}, [zsp + X]'.
+  {
+    Reg loadReg;
+    Mem slot = ptr(zsp, int32_t(frame.extraRegSaveOffset()));
+
+    uint32_t loadInst;
+    uint32_t slotSize;
+
+    for (RegGroup group : Support::EnumValues<RegGroup, RegGroup(1), RegGroup::kMaxVirt>{}) {
+      Support::BitWordIterator<RegMask> it(frame.savedRegs(group));
+      if (!it.hasNext())
+        continue;
+
+      // Instruction, register template, and slot size are selected once per non-empty group.
+      X86Internal_setupSaveRestoreInfo(group, frame, loadReg, loadInst, slotSize);
+      while (it.hasNext()) {
+        loadReg.setId(it.next());
+        ASMJIT_PROPAGATE(e->emit(loadInst, loadReg, slot));
+        slot.addOffsetLo32(int32_t(slotSize));
+      }
+    }
+  }
+
+  // Emit 'emms' and/or 'vzeroupper'.
+  if (frame.hasMmxCleanup()) {
+    ASMJIT_PROPAGATE(e->emms());
+  }
+  if (frame.hasAvxCleanup()) {
+    ASMJIT_PROPAGATE(e->vzeroupper());
+  }
+
+  if (frame.hasPreservedFP()) {
+    // Emit 'mov zsp, zbp' or 'lea zsp, [zbp - x]', where x is the push/pop save area without ZBP itself.
+    int32_t belowFp = int32_t(frame.pushPopSaveSize() - registerSize);
+    if (belowFp == 0) {
+      ASMJIT_PROPAGATE(e->mov(zsp, zbp));
+    }
+    else {
+      ASMJIT_PROPAGATE(e->lea(zsp, ptr(zbp, -belowFp)));
+    }
+  }
+  else if (frame.hasDynamicAlignment() && frame.hasDAOffset()) {
+    // Emit 'mov zsp, [zsp + DsaSlot]'.
+    Mem daSlot = ptr(zsp, int32_t(frame.daOffset()));
+    ASMJIT_PROPAGATE(e->mov(zsp, daSlot));
+  }
+  else if (frame.hasStackAdjustment()) {
+    // Emit 'add zsp, StackAdjustment'.
+    ASMJIT_PROPAGATE(e->add(zsp, int32_t(frame.stackAdjustment())));
+  }
+
+  // Emit 'pop gp' sequence - register ids 15 down to 0.
+  if (gpToPop) {
+    uint32_t regId = 16;
+    while (regId != 0) {
+      regId--;
+      if (gpToPop & (1u << regId)) {
+        popReg.setId(regId);
+        ASMJIT_PROPAGATE(e->pop(popReg));
+      }
+    }
+  }
+
+  // Emit 'pop zbp'.
+  if (frame.hasPreservedFP()) {
+    ASMJIT_PROPAGATE(e->pop(zbp));
+  }
+
+  // Emit 'ret' or 'ret x'.
+  if (frame.hasCalleeStackCleanup()) {
+    ASMJIT_PROPAGATE(e->emit(Inst::kIdRet, int(frame.calleeStackCleanup())));
+  }
+  else {
+    ASMJIT_PROPAGATE(e->emit(Inst::kIdRet));
+  }
+
+  return kErrorOk;
+}
+
+//! Emitter callback: emits the prolog of `frame` through a temporary `EmitHelper` bound to `emitter`.
+static Error ASMJIT_CDECL Emitter_emitProlog(BaseEmitter* emitter, const FuncFrame& frame) {
+  const bool avx = frame.isAvxEnabled();
+  const bool avx512 = frame.isAvx512Enabled();
+  return EmitHelper(emitter, avx, avx512).emitProlog(frame);
+}
+
+//! Emitter callback: emits the epilog of `frame` through a temporary `EmitHelper` bound to `emitter`.
+static Error ASMJIT_CDECL Emitter_emitEpilog(BaseEmitter* emitter, const FuncFrame& frame) {
+  const bool avx = frame.isAvxEnabled();
+  const bool avx512 = frame.isAvx512Enabled();
+  return EmitHelper(emitter, avx, avx512).emitEpilog(frame);
+}
+
+//! Emitter callback: emits the arguments assignment `args` of `frame` through a temporary
+//! `EmitHelper` bound to `emitter`.
+static Error ASMJIT_CDECL Emitter_emitArgsAssignment(BaseEmitter* emitter, const FuncFrame& frame, const FuncArgsAssignment& args) {
+  const bool avx = frame.isAvxEnabled();
+  const bool avx512 = frame.isAvx512Enabled();
+  return EmitHelper(emitter, avx, avx512).emitArgsAssignment(frame, args);
+}
+
+//! Installs the X86 implementations into the function table of `emitter`.
+//!
+//! Prolog, epilog, and args-assignment callbacks are always installed; the instruction formatter and
+//! the validator are installed only when logging / validation are compiled in.
+void assignEmitterFuncs(BaseEmitter* emitter) {
+  auto& funcs = emitter->_funcs;
+
+  funcs.emitProlog = Emitter_emitProlog;
+  funcs.emitEpilog = Emitter_emitEpilog;
+  funcs.emitArgsAssignment = Emitter_emitArgsAssignment;
+
+#ifndef ASMJIT_NO_LOGGING
+  funcs.formatInstruction = FormatterInternal::formatInstruction;
+#endif
+
+#ifndef ASMJIT_NO_VALIDATION
+  funcs.validate = InstInternal::validate;
+#endif
+}
+
+ASMJIT_END_SUB_NAMESPACE
+
+#endif // !ASMJIT_NO_X86
--- a/lib/lepton/asmjit/x86/x86emithelper_p.h
+++ b/lib/lepton/asmjit/x86/x86emithelper_p.h
@ -0,0 +1,60 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See asmjit.h or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#ifndef ASMJIT_X86_X86EMITHELPER_P_H_INCLUDED
+#define ASMJIT_X86_X86EMITHELPER_P_H_INCLUDED
+
+#include "../core/api-config.h"
+
+#include "../core/emithelper_p.h"
+#include "../core/func.h"
+#include "../x86/x86emitter.h"
+#include "../x86/x86operand.h"
+
+ASMJIT_BEGIN_SUB_NAMESPACE(x86)
+
+//! \cond INTERNAL
+//! \addtogroup asmjit_x86
+//! \{
+
+//! Maps a vector `typeId` to a vector register type: XMM for ids up to `TypeId::_kVec128End`, YMM
+//! for ids up to `TypeId::_kVec256End`, and ZMM for everything above.
+static inline RegType vecTypeIdToRegType(TypeId typeId) noexcept {
+  const uint32_t id = uint32_t(typeId);
+
+  if (id <= uint32_t(TypeId::_kVec128End))
+    return RegType::kX86_Xmm;
+
+  if (id <= uint32_t(TypeId::_kVec256End))
+    return RegType::kX86_Ymm;
+
+  return RegType::kX86_Zmm;
+}
+
+//! X86 emit helper - overrides the register move, argument move, and register swap operations of
+//! `BaseEmitHelper` and adds prolog/epilog emission.
+class EmitHelper : public BaseEmitHelper {
+public:
+  //! Whether AVX instruction forms should be selected; also true whenever AVX-512 is enabled (see
+  //! the constructor).
+  bool _avxEnabled;
+  //! Whether AVX-512 was requested at construction.
+  bool _avx512Enabled;
+
+  //! Creates a helper bound to `emitter`. Passing `avx512Enabled` implies `avxEnabled`.
+  inline explicit EmitHelper(BaseEmitter* emitter = nullptr, bool avxEnabled = false, bool avx512Enabled = false) noexcept
+    : BaseEmitHelper(emitter),
+      _avxEnabled(avxEnabled || avx512Enabled),
+      _avx512Enabled(avx512Enabled) {}
+
+  //! Emits a move from `src_` to `dst_` of the given `typeId`, with an optional inline `comment`.
+  Error emitRegMove(
+    const Operand_& dst_,
+    const Operand_& src_, TypeId typeId, const char* comment = nullptr) override;
+
+  //! Emits a move of an argument from `src_` (of `srcTypeId`) into register `dst_` (of `dstTypeId`),
+  //! with an optional inline `comment`.
+  Error emitArgMove(
+    const BaseReg& dst_, TypeId dstTypeId,
+    const Operand_& src_, TypeId srcTypeId, const char* comment = nullptr) override;
+
+  //! Emits a swap of registers `a` and `b`, with an optional inline `comment`.
+  Error emitRegSwap(
+    const BaseReg& a,
+    const BaseReg& b, const char* comment = nullptr) override;
+
+  //! Emits the function prolog described by `frame`.
+  Error emitProlog(const FuncFrame& frame);
+  //! Emits the function epilog described by `frame`.
+  Error emitEpilog(const FuncFrame& frame);
+};
+
+//! Assigns the X86-specific emitter functions to the given `emitter`.
+void assignEmitterFuncs(BaseEmitter* emitter);
+
+//! \}
+//! \endcond
+
+ASMJIT_END_SUB_NAMESPACE
+
+#endif // ASMJIT_X86_X86EMITHELPER_P_H_INCLUDED
--- a/lib/lepton/asmjit/x86/x86emitter.h
+++ b/lib/lepton/asmjit/x86/x86emitter.h
--- a/lib/lepton/asmjit/x86/x86formatter.cpp
+++ b/lib/lepton/asmjit/x86/x86formatter.cpp
@ -0,0 +1,944 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See asmjit.h or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#include "../core/api-build_p.h"
+#ifndef ASMJIT_NO_LOGGING
+
+#include "../core/cpuinfo.h"
+#include "../core/misc_p.h"
+#include "../core/support.h"
+#include "../x86/x86formatter_p.h"
+#include "../x86/x86instapi_p.h"
+#include "../x86/x86instdb_p.h"
+#include "../x86/x86operand.h"
+
+#ifndef ASMJIT_NO_COMPILER
+  #include "../core/compiler.h"
+#endif
+
+ASMJIT_BEGIN_SUB_NAMESPACE(x86)
+
+// x86::FormatterInternal - Constants
+// ==================================
+
+//! Packed lookup data used to format register types and register names.
+//!
+//! Both entry arrays are indexed by `RegType`; the values they hold are byte offsets into the
+//! string blob that follows them.
+struct RegFormatInfo {
+  //! Register type name lookup.
+  struct TypeEntry {
+    //! Offset of the type name in `typeStrings`; zero means the type has no name.
+    uint8_t index;
+  };
+
+  //! Register name lookup.
+  struct NameEntry {
+    //! Register ids below this value are formatted through the format string at `formatIndex`.
+    uint8_t count;
+    //! Offset of a printf-like format string (taking the register id) in `nameStrings`.
+    uint8_t formatIndex;
+    //! Offset of the first fixed register name in `nameStrings`; fixed names occupy 4 bytes each.
+    uint8_t specialIndex;
+    //! Register ids below this value use a fixed name instead of the format string.
+    uint8_t specialCount;
+  };
+
+  TypeEntry typeEntries[uint32_t(RegType::kMaxValue) + 1];
+  //! NUL-separated register type names.
+  char typeStrings[128 - 32];
+
+  NameEntry nameEntries[uint32_t(RegType::kMaxValue) + 1];
+  //! NUL-separated format strings followed by fixed register names.
+  char nameStrings[280];
+};
+
+//! Compile-time `RegFormatInfo` data of register type `X`.
+//!
+//! All values are byte offsets / counts that must stay in sync with the `// #N` markers of the
+//! string blobs in `x86RegFormatInfo`; a register type that is not listed resolves to zero.
+template<uint32_t X>
+struct RegFormatInfo_T {
+  enum {
+    // Offset of the register type name in `RegFormatInfo::typeStrings`.
+    kTypeIndex    = X == uint32_t(RegType::kX86_GpbLo) ? 1   :
+                    X == uint32_t(RegType::kX86_GpbHi) ? 8   :
+                    X == uint32_t(RegType::kX86_Gpw  ) ? 15  :
+                    X == uint32_t(RegType::kX86_Gpd  ) ? 19  :
+                    X == uint32_t(RegType::kX86_Gpq  ) ? 23  :
+                    X == uint32_t(RegType::kX86_Xmm  ) ? 27  :
+                    X == uint32_t(RegType::kX86_Ymm  ) ? 31  :
+                    X == uint32_t(RegType::kX86_Zmm  ) ? 35  :
+                    X == uint32_t(RegType::kX86_Mm   ) ? 50  :
+                    X == uint32_t(RegType::kX86_KReg ) ? 53  :
+                    X == uint32_t(RegType::kX86_SReg ) ? 43  :
+                    X == uint32_t(RegType::kX86_CReg ) ? 59  :
+                    X == uint32_t(RegType::kX86_DReg ) ? 62  :
+                    X == uint32_t(RegType::kX86_St   ) ? 47  :
+                    X == uint32_t(RegType::kX86_Bnd  ) ? 55  :
+                    X == uint32_t(RegType::kX86_Tmm  ) ? 65  :
+                    X == uint32_t(RegType::kX86_Rip  ) ? 39  : 0,
+
+    // Offset of the register name format string in `RegFormatInfo::nameStrings`.
+    kFormatIndex  = X == uint32_t(RegType::kX86_GpbLo) ? 1   :
+                    X == uint32_t(RegType::kX86_GpbHi) ? 6   :
+                    X == uint32_t(RegType::kX86_Gpw  ) ? 11  :
+                    X == uint32_t(RegType::kX86_Gpd  ) ? 16  :
+                    X == uint32_t(RegType::kX86_Gpq  ) ? 21  :
+                    X == uint32_t(RegType::kX86_Xmm  ) ? 25  :
+                    X == uint32_t(RegType::kX86_Ymm  ) ? 31  :
+                    X == uint32_t(RegType::kX86_Zmm  ) ? 37  :
+                    X == uint32_t(RegType::kX86_Mm   ) ? 60  :
+                    X == uint32_t(RegType::kX86_KReg ) ? 65  :
+                    X == uint32_t(RegType::kX86_SReg ) ? 49  :
+                    X == uint32_t(RegType::kX86_CReg ) ? 75  :
+                    X == uint32_t(RegType::kX86_DReg ) ? 80  :
+                    X == uint32_t(RegType::kX86_St   ) ? 55  :
+                    X == uint32_t(RegType::kX86_Bnd  ) ? 69  :
+                    X == uint32_t(RegType::kX86_Tmm  ) ? 89  :
+                    X == uint32_t(RegType::kX86_Rip  ) ? 43  : 0,
+
+    // Offset of the first fixed (4 bytes per entry) register name in `RegFormatInfo::nameStrings`.
+    // NOTE: Gpw (161) points one byte past the Gpd names (160), which selects "ax" out of "eax", etc.
+    kSpecialIndex = X == uint32_t(RegType::kX86_GpbLo) ? 96  :
+                    X == uint32_t(RegType::kX86_GpbHi) ? 128 :
+                    X == uint32_t(RegType::kX86_Gpw  ) ? 161 :
+                    X == uint32_t(RegType::kX86_Gpd  ) ? 160 :
+                    X == uint32_t(RegType::kX86_Gpq  ) ? 192 :
+                    X == uint32_t(RegType::kX86_SReg ) ? 224 :
+                    X == uint32_t(RegType::kX86_Rip  ) ? 85  : 0,
+
+    // Number of register ids (starting at zero) that use a fixed name.
+    kSpecialCount = X == uint32_t(RegType::kX86_GpbLo) ? 8   :
+                    X == uint32_t(RegType::kX86_GpbHi) ? 4   :
+                    X == uint32_t(RegType::kX86_Gpw  ) ? 8   :
+                    X == uint32_t(RegType::kX86_Gpd  ) ? 8   :
+                    X == uint32_t(RegType::kX86_Gpq  ) ? 8   :
+                    X == uint32_t(RegType::kX86_SReg ) ? 7   :
+                    X == uint32_t(RegType::kX86_Rip  ) ? 1   : 0
+  };
+};
+
+// Builds a `RegFormatInfo::TypeEntry` initializer of register type `TYPE`.
+#define ASMJIT_REG_TYPE_ENTRY(TYPE) {   \
+  RegFormatInfo_T<TYPE>::kTypeIndex     \
+}
+
+// Builds a `RegFormatInfo::NameEntry` initializer of register type `TYPE`.
+#define ASMJIT_REG_NAME_ENTRY(TYPE) {   \
+  RegTraits<RegType(TYPE)>::kCount,     \
+  RegFormatInfo_T<TYPE>::kFormatIndex,  \
+  RegFormatInfo_T<TYPE>::kSpecialIndex, \
+  RegFormatInfo_T<TYPE>::kSpecialCount  \
+}
+
+// Register formatting data. The `// #N` markers are byte offsets into the respective string blob and
+// must match the constants of `RegFormatInfo_T`; the padding NULs are part of that layout.
+static const RegFormatInfo x86RegFormatInfo = {
+  // Register type entries and strings.
+  { ASMJIT_LOOKUP_TABLE_32(ASMJIT_REG_TYPE_ENTRY, 0) },
+
+  "\0"             // #0
+  "gpb\0\0\0\0"    // #1
+  "gpb.hi\0"       // #8
+  "gpw\0"          // #15
+  "gpd\0"          // #19
+  "gpq\0"          // #23
+  "xmm\0"          // #27
+  "ymm\0"          // #31
+  "zmm\0"          // #35
+  "rip\0"          // #39
+  "seg\0"          // #43
+  "st\0"           // #47
+  "mm\0"           // #50
+  "k\0"            // #53
+  "bnd\0"          // #55
+  "cr\0"           // #59
+  "dr\0"           // #62
+  "tmm\0"          // #65
+  ,
+
+  // Register name entries and strings.
+  { ASMJIT_LOOKUP_TABLE_32(ASMJIT_REG_NAME_ENTRY, 0) },
+
+  // Format strings - each takes the register id.
+  "\0"
+  "r%ub\0"         // #1
+  "r%uh\0"         // #6
+  "r%uw\0"         // #11
+  "r%ud\0"         // #16
+  "r%u\0"          // #21
+  "xmm%u\0"        // #25
+  "ymm%u\0"        // #31
+  "zmm%u\0"        // #37
+  "rip%u\0"        // #43
+  "seg%u\0"        // #49
+  "st%u\0"         // #55
+  "mm%u\0"         // #60
+  "k%u\0"          // #65
+  "bnd%u\0"        // #69
+  "cr%u\0"         // #75
+  "dr%u\0"         // #80
+
+  "rip\0"          // #85
+  "tmm%u\0"        // #89
+  "\0"             // #95
+
+  // Fixed register names - every name occupies exactly 4 bytes so it can be addressed by `id * 4`.
+  "al\0\0" "cl\0\0" "dl\0\0" "bl\0\0" "spl\0"  "bpl\0"  "sil\0"  "dil\0" // #96
+  "ah\0\0" "ch\0\0" "dh\0\0" "bh\0\0" "n/a\0"  "n/a\0"  "n/a\0"  "n/a\0" // #128
+  "eax\0"  "ecx\0"  "edx\0"  "ebx\0"  "esp\0"  "ebp\0"  "esi\0"  "edi\0" // #160
+  "rax\0"  "rcx\0"  "rdx\0"  "rbx\0"  "rsp\0"  "rbp\0"  "rsi\0"  "rdi\0" // #192
+  "n/a\0"  "es\0\0" "cs\0\0" "ss\0\0" "ds\0\0" "fs\0\0" "gs\0\0" "n/a\0" // #224
+};
+#undef ASMJIT_REG_NAME_ENTRY
+#undef ASMJIT_REG_TYPE_ENTRY
+
+//! Returns the memory operand size prefix (including a trailing space) of the given `size` in bytes,
+//! or an empty string when the size has no prefix.
+static const char* x86GetAddressSizeString(uint32_t size) noexcept {
+  struct SizePrefix {
+    uint32_t size;
+    const char* text;
+  };
+
+  static const SizePrefix kSizePrefixes[] = {
+    { 1 , "byte ptr "    },
+    { 2 , "word ptr "    },
+    { 4 , "dword ptr "   },
+    { 6 , "fword ptr "   },
+    { 8 , "qword ptr "   },
+    { 10, "tbyte ptr "   },
+    { 16, "xmmword ptr " },
+    { 32, "ymmword ptr " },
+    { 64, "zmmword ptr " }
+  };
+
+  for (const SizePrefix& prefix : kSizePrefixes) {
+    if (prefix.size == size)
+      return prefix.text;
+  }
+
+  return "";
+}
+
+// x86::FormatterInternal - Format FeatureId
+// =========================================
+
+//! Appends the name of the X86 CPU feature `featureId` to `sb`.
+//!
+//! Feature ids greater than `CpuFeatures::X86::kMaxValue` are clamped to the entry that follows the
+//! last known feature in the tables below.
+Error FormatterInternal::formatFeature(String& sb, uint32_t featureId) noexcept {
+  // The tables between the markers are generated - `sFeatureIndex[i]` is the offset of the i-th name
+  // in `sFeatureString`.
+  // @EnumStringBegin{"enum": "CpuFeatures::X86", "output": "sFeature", "strip": "k"}@
+  static const char sFeatureString[] =
+    "None\0"
+    "MT\0"
+    "NX\0"
+    "3DNOW\0"
+    "3DNOW2\0"
+    "ADX\0"
+    "AESNI\0"
+    "ALTMOVCR8\0"
+    "AMX_BF16\0"
+    "AMX_INT8\0"
+    "AMX_TILE\0"
+    "AVX\0"
+    "AVX2\0"
+    "AVX512_4FMAPS\0"
+    "AVX512_4VNNIW\0"
+    "AVX512_BF16\0"
+    "AVX512_BITALG\0"
+    "AVX512_BW\0"
+    "AVX512_CDI\0"
+    "AVX512_DQ\0"
+    "AVX512_ERI\0"
+    "AVX512_F\0"
+    "AVX512_FP16\0"
+    "AVX512_IFMA\0"
+    "AVX512_PFI\0"
+    "AVX512_VBMI\0"
+    "AVX512_VBMI2\0"
+    "AVX512_VL\0"
+    "AVX512_VNNI\0"
+    "AVX512_VP2INTERSECT\0"
+    "AVX512_VPOPCNTDQ\0"
+    "AVX_VNNI\0"
+    "BMI\0"
+    "BMI2\0"
+    "CET_IBT\0"
+    "CET_SS\0"
+    "CLDEMOTE\0"
+    "CLFLUSH\0"
+    "CLFLUSHOPT\0"
+    "CLWB\0"
+    "CLZERO\0"
+    "CMOV\0"
+    "CMPXCHG16B\0"
+    "CMPXCHG8B\0"
+    "ENCLV\0"
+    "ENQCMD\0"
+    "ERMS\0"
+    "F16C\0"
+    "FMA\0"
+    "FMA4\0"
+    "FPU\0"
+    "FSGSBASE\0"
+    "FXSR\0"
+    "FXSROPT\0"
+    "GEODE\0"
+    "GFNI\0"
+    "HLE\0"
+    "HRESET\0"
+    "I486\0"
+    "LAHFSAHF\0"
+    "LWP\0"
+    "LZCNT\0"
+    "MCOMMIT\0"
+    "MMX\0"
+    "MMX2\0"
+    "MONITOR\0"
+    "MONITORX\0"
+    "MOVBE\0"
+    "MOVDIR64B\0"
+    "MOVDIRI\0"
+    "MPX\0"
+    "MSR\0"
+    "MSSE\0"
+    "OSXSAVE\0"
+    "OSPKE\0"
+    "PCLMULQDQ\0"
+    "PCONFIG\0"
+    "POPCNT\0"
+    "PREFETCHW\0"
+    "PREFETCHWT1\0"
+    "PTWRITE\0"
+    "RDPID\0"
+    "RDPRU\0"
+    "RDRAND\0"
+    "RDSEED\0"
+    "RDTSC\0"
+    "RDTSCP\0"
+    "RTM\0"
+    "SERIALIZE\0"
+    "SHA\0"
+    "SKINIT\0"
+    "SMAP\0"
+    "SMEP\0"
+    "SMX\0"
+    "SNP\0"
+    "SSE\0"
+    "SSE2\0"
+    "SSE3\0"
+    "SSE4_1\0"
+    "SSE4_2\0"
+    "SSE4A\0"
+    "SSSE3\0"
+    "SVM\0"
+    "TBM\0"
+    "TSX\0"
+    "TSXLDTRK\0"
+    "UINTR\0"
+    "VAES\0"
+    "VMX\0"
+    "VPCLMULQDQ\0"
+    "WAITPKG\0"
+    "WBNOINVD\0"
+    "XOP\0"
+    "XSAVE\0"
+    "XSAVEC\0"
+    "XSAVEOPT\0"
+    "XSAVES\0"
+    "<Unknown>\0";
+
+  static const uint16_t sFeatureIndex[] = {
+    0, 5, 8, 11, 17, 24, 28, 34, 44, 53, 62, 71, 75, 80, 94, 108, 120, 134, 144,
+    155, 165, 176, 185, 197, 209, 220, 232, 245, 255, 267, 287, 304, 313, 317,
+    322, 330, 337, 346, 354, 365, 370, 377, 382, 393, 403, 409, 416, 421, 426,
+    430, 435, 439, 448, 453, 461, 467, 472, 476, 483, 488, 497, 501, 507, 515,
+    519, 524, 532, 541, 547, 557, 565, 569, 573, 578, 586, 592, 602, 610, 617,
+    627, 639, 647, 653, 659, 666, 673, 679, 686, 690, 700, 704, 711, 716, 721,
+    725, 729, 733, 738, 743, 750, 757, 763, 769, 773, 777, 781, 790, 796, 801,
+    805, 816, 824, 833, 837, 843, 850, 859, 866
+  };
+  // @EnumStringEnd@
+
+  // Everything past the last known feature shares a single table slot.
+  const uint32_t fallbackSlot = uint32_t(CpuFeatures::X86::kMaxValue) + 1;
+  const uint32_t slot = Support::min<uint32_t>(featureId, fallbackSlot);
+
+  const char* featureName = sFeatureString + sFeatureIndex[slot];
+  return sb.append(featureName);
+}
+
+// x86::FormatterInternal - Format Register
+// ========================================
+
+//! Appends the textual form of a register of the given `type` and `id` to `sb`.
+//!
+//! A valid virtual register of a Compiler emitter is written by its name (or as `%index` when it has
+//! none), followed by `@type` when `FormatFlags::kRegCasts` is set and `type` differs from the type
+//! of the virtual register. Any other register is written by its fixed name, by a formatted name,
+//! as `type@id`, or as `<Reg-type>?id` - whichever applies first.
+ASMJIT_FAVOR_SIZE Error FormatterInternal::formatRegister(String& sb, FormatFlags formatFlags, const BaseEmitter* emitter, Arch arch, RegType type, uint32_t id) noexcept {
+  DebugUtils::unused(arch);
+  const RegFormatInfo& info = x86RegFormatInfo;
+
+#ifndef ASMJIT_NO_COMPILER
+  const bool isCompilerVirtReg =
+    Operand::isVirtId(id) &&
+    emitter &&
+    emitter->emitterType() == EmitterType::kCompiler &&
+    static_cast<const BaseCompiler*>(emitter)->isVirtIdValid(id);
+
+  if (isCompilerVirtReg) {
+    const BaseCompiler* cc = static_cast<const BaseCompiler*>(emitter);
+    VirtReg* vReg = cc->virtRegById(id);
+    ASMJIT_ASSERT(vReg != nullptr);
+
+    // Named virtual registers use their name, anonymous ones use '%index'.
+    const char* vRegName = vReg->name();
+    const bool hasName = vRegName && vRegName[0] != '\0';
+
+    if (hasName) {
+      ASMJIT_PROPAGATE(sb.append(vRegName));
+    }
+    else {
+      ASMJIT_PROPAGATE(sb.appendFormat("%%%u", unsigned(Operand::virtIdToIndex(id))));
+    }
+
+    // Append '@type' if the register is accessed as a different type than it was created with.
+    if (vReg->type() != type && uint32_t(type) <= uint32_t(RegType::kMaxValue) && Support::test(formatFlags, FormatFlags::kRegCasts)) {
+      const uint8_t typeNameOffset = info.typeEntries[size_t(type)].index;
+      if (typeNameOffset) {
+        ASMJIT_PROPAGATE(sb.appendFormat("@%s", info.typeStrings + typeNameOffset));
+      }
+    }
+
+    return kErrorOk;
+  }
+#else
+  DebugUtils::unused(emitter, formatFlags);
+#endif
+
+  const bool isKnownType = uint32_t(type) <= uint32_t(RegType::kMaxValue);
+  if (isKnownType) {
+    const size_t typeSlot = size_t(type);
+    const RegFormatInfo::NameEntry& names = info.nameEntries[typeSlot];
+
+    // Fixed names are stored in 4-byte slots.
+    if (id < names.specialCount) {
+      const char* fixedName = info.nameStrings + names.specialIndex + id * 4;
+      return sb.append(fixedName);
+    }
+
+    // Names formatted from the register id.
+    if (id < names.count) {
+      const char* nameFormat = info.nameStrings + names.formatIndex;
+      return sb.appendFormat(nameFormat, unsigned(id));
+    }
+
+    // The id is out of range - use 'type@id' when the type has a name.
+    const uint8_t typeNameOffset = info.typeEntries[typeSlot].index;
+    if (typeNameOffset)
+      return sb.appendFormat("%s@%u", info.typeStrings + typeNameOffset, id);
+  }
+
+  return sb.appendFormat("<Reg-%u>?%u", uint32_t(type), id);
+}
+
+// x86::FormatterInternal - Format Operand
+// =======================================
+
+//! Appends the textual form of the operand `op` to `sb`.
+//!
+//! - Registers are formatted by `formatRegister()`.
+//! - Memory operands are written as an optional size prefix, an optional segment override, and a
+//!   bracketed address consisting of "abs " / "rel " (if the address type requests it), a base (a
+//!   label or a register, register homes are prefixed by '&'), an index with an optional "*scale",
+//!   and a signed offset.
+//! - Immediates are written in decimal, or in hex when `FormatFlags::kHexImms` is set and the value
+//!   (taken as unsigned) is greater than 9.
+//! - Labels are formatted by `Formatter::formatLabel()`.
+//! - Any other operand is written as "<None>".
+//!
+//! Returns the first error reported while appending, `kErrorOk` otherwise.
+ASMJIT_FAVOR_SIZE Error FormatterInternal::formatOperand(
+  String& sb,
+  FormatFlags formatFlags,
+  const BaseEmitter* emitter,
+  Arch arch,
+  const Operand_& op) noexcept {
+
+  if (op.isReg())
+    return formatRegister(sb, formatFlags, emitter, arch, op.as<BaseReg>().type(), op.as<BaseReg>().id());
+
+  if (op.isMem()) {
+    const Mem& m = op.as<Mem>();
+    ASMJIT_PROPAGATE(sb.append(x86GetAddressSizeString(m.size())));
+
+    // Segment override prefix.
+    uint32_t seg = m.segmentId();
+    if (seg != SReg::kIdNone && seg < SReg::kIdCount) {
+      // The offset of segment names comes from the name table itself instead of a hard-coded
+      // constant, so it cannot get out of sync with `RegFormatInfo_T`.
+      const RegFormatInfo::NameEntry& segNames = x86RegFormatInfo.nameEntries[size_t(RegType::kX86_SReg)];
+      ASMJIT_PROPAGATE(sb.appendFormat("%s:", x86RegFormatInfo.nameStrings + segNames.specialIndex + size_t(seg) * 4));
+    }
+
+    ASMJIT_PROPAGATE(sb.append('['));
+    switch (m.addrType()) {
+      case Mem::AddrType::kDefault:
+        break;
+      case Mem::AddrType::kAbs:
+        ASMJIT_PROPAGATE(sb.append("abs "));
+        break;
+      case Mem::AddrType::kRel:
+        ASMJIT_PROPAGATE(sb.append("rel "));
+        break;
+    }
+
+    // Sign written before the next address component; stays '\0' until something was written.
+    char opSign = '\0';
+    if (m.hasBase()) {
+      opSign = '+';
+      if (m.hasBaseLabel()) {
+        ASMJIT_PROPAGATE(Formatter::formatLabel(sb, formatFlags, emitter, m.baseId()));
+      }
+      else {
+        FormatFlags modifiedFlags = formatFlags;
+        if (m.isRegHome()) {
+          // Register home - mark it by '&' and never append a register cast to it.
+          ASMJIT_PROPAGATE(sb.append("&"));
+          modifiedFlags &= ~FormatFlags::kRegCasts;
+        }
+        ASMJIT_PROPAGATE(formatRegister(sb, modifiedFlags, emitter, arch, m.baseType(), m.baseId()));
+      }
+    }
+
+    if (m.hasIndex()) {
+      if (opSign)
+        ASMJIT_PROPAGATE(sb.append(opSign));
+
+      opSign = '+';
+      ASMJIT_PROPAGATE(formatRegister(sb, formatFlags, emitter, arch, m.indexType(), m.indexId()));
+
+      // The scale is passed as unsigned so the argument matches the '%u' conversion.
+      if (m.hasShift())
+        ASMJIT_PROPAGATE(sb.appendFormat("*%u", 1u << m.shift()));
+    }
+
+    // The offset is always written when there is neither a base nor an index.
+    uint64_t off = uint64_t(m.offset());
+    if (off || !m.hasBaseOrIndex()) {
+      if (int64_t(off) < 0) {
+        // Write the magnitude after '-' (two's complement negation done on the unsigned value).
+        opSign = '-';
+        off = ~off + 1;
+      }
+
+      if (opSign)
+        ASMJIT_PROPAGATE(sb.append(opSign));
+
+      uint32_t base = 10;
+      if (Support::test(formatFlags, FormatFlags::kHexOffsets) && off > 9) {
+        ASMJIT_PROPAGATE(sb.append("0x", 2));
+        base = 16;
+      }
+
+      ASMJIT_PROPAGATE(sb.appendUInt(off, base));
+    }
+
+    return sb.append(']');
+  }
+
+  if (op.isImm()) {
+    const Imm& i = op.as<Imm>();
+    int64_t val = i.value();
+
+    // Single-digit values read the same in both bases, so they are never prefixed by '0x'.
+    if (Support::test(formatFlags, FormatFlags::kHexImms) && uint64_t(val) > 9) {
+      ASMJIT_PROPAGATE(sb.append("0x", 2));
+      return sb.appendUInt(uint64_t(val), 16);
+    }
+    else {
+      return sb.appendInt(val, 10);
+    }
+  }
+
+  if (op.isLabel()) {
+    return Formatter::formatLabel(sb, formatFlags, emitter, op.id());
+  }
+
+  return sb.append("<None>");
+}
+
+// x86::FormatterInternal - Format Immediate (Extension)
+// =====================================================
+
+// Delimiters of an explained immediate value - the parts are written as '{a|b|...}'.
+static constexpr char kImmCharStart = '{';
+static constexpr char kImmCharEnd   = '}';
+static constexpr char kImmCharOr    = '|';
+
+//! Describes how one bit-field of an 8-bit immediate is explained.
+struct ImmBits {
+  //! How `text` is interpreted.
+  enum Mode : uint32_t {
+    //! `text` holds packed strings and the field value selects one of them.
+    kModeLookup = 0,
+    //! `text` is a printf-like format string that receives the field value.
+    kModeFormat = 1
+  };
+
+  //! Mask that selects the field in the immediate (applied before `shift`).
+  uint8_t mask;
+  //! Right shift applied to the masked value.
+  uint8_t shift;
+  //! One of `Mode` values.
+  uint8_t mode;
+  //! Packed strings or a format string, depending on `mode`.
+  char text[48 - 3];
+};
+
+//! Explains a shuffle-like immediate `u8` as `{v0|v1|...}`.
+//!
+//! The immediate is split into `count` fields of `bits` bits each, starting at the least significant
+//! bit, and every field is written as an unsigned decimal number. Nothing is written when `count`
+//! is zero.
+//!
+//! Returns the first error reported while appending, `kErrorOk` otherwise.
+ASMJIT_FAVOR_SIZE static Error FormatterInternal_formatImmShuf(String& sb, uint32_t u8, uint32_t bits, uint32_t count) noexcept {
+  // Unsigned shift - consistent with FormatterInternal_formatImmText() and free of signed overflow.
+  uint32_t mask = (1u << bits) - 1;
+
+  for (uint32_t i = 0; i < count; i++, u8 >>= bits) {
+    uint32_t value = u8 & mask;
+    ASMJIT_PROPAGATE(sb.append(i == 0 ? kImmCharStart : kImmCharOr));
+    ASMJIT_PROPAGATE(sb.appendUInt(value));
+  }
+
+  // Close the group only when it was opened - no field means no opening delimiter.
+  if (count && kImmCharEnd)
+    ASMJIT_PROPAGATE(sb.append(kImmCharEnd));
+
+  return kErrorOk;
+}
+
+//! Explains the immediate `u8` field by field, as described by `count` entries of `bits`.
+//!
+//! Each entry extracts its field and turns it into text, either by a packed string lookup or through
+//! the entry's format string. Fields whose text is empty are skipped; the remaining ones are written
+//! as `{a|b|...}`. Nothing is written if all fields are skipped.
+//!
+//! Returns `kErrorInvalidState` when an entry has an unknown mode, the first error reported while
+//! appending, or `kErrorOk`.
+ASMJIT_FAVOR_SIZE static Error FormatterInternal_formatImmBits(String& sb, uint32_t u8, const ImmBits* bits, uint32_t count) noexcept {
+  char formatted[64];
+  uint32_t written = 0;
+
+  for (uint32_t specIndex = 0; specIndex < count; specIndex++) {
+    const ImmBits& spec = bits[specIndex];
+    const uint32_t field = (u8 & uint32_t(spec.mask)) >> spec.shift;
+
+    const char* text = nullptr;
+    if (spec.mode == ImmBits::kModeLookup) {
+      text = Support::findPackedString(spec.text, field);
+    }
+    else if (spec.mode == ImmBits::kModeFormat) {
+      snprintf(formatted, sizeof(formatted), spec.text, unsigned(field));
+      text = formatted;
+    }
+    else {
+      return DebugUtils::errored(kErrorInvalidState);
+    }
+
+    // An empty string means this field value has nothing to show.
+    if (text[0] == '\0')
+      continue;
+
+    // The first written part opens the group, all others are separated from the previous part.
+    written++;
+    const char delimiter = (written == 1) ? kImmCharStart : kImmCharOr;
+
+    ASMJIT_PROPAGATE(sb.append(delimiter));
+    ASMJIT_PROPAGATE(sb.append(text));
+  }
+
+  if (written != 0 && kImmCharEnd) {
+    ASMJIT_PROPAGATE(sb.append(kImmCharEnd));
+  }
+
+  return kErrorOk;
+}
+
+//! Explains the immediate `u8` as `{a|b|...}` by using names stored in the packed string `text`.
+//!
+//! The immediate is split into `count` fields of `bits` bits each, starting at the least significant
+//! bit. The name of the i-th field is the packed string at index `field + i * advance`, so every
+//! field can use its own group of names. Nothing is written when `count` is zero.
+//!
+//! Returns the first error reported while appending, `kErrorOk` otherwise.
+ASMJIT_FAVOR_SIZE static Error FormatterInternal_formatImmText(String& sb, uint32_t u8, uint32_t bits, uint32_t advance, const char* text, uint32_t count = 1) noexcept {
+  uint32_t mask = (1u << bits) - 1;
+  uint32_t pos = 0;
+
+  for (uint32_t i = 0; i < count; i++, u8 >>= bits, pos += advance) {
+    uint32_t value = (u8 & mask) + pos;
+    ASMJIT_PROPAGATE(sb.append(i == 0 ? kImmCharStart : kImmCharOr));
+    ASMJIT_PROPAGATE(sb.append(Support::findPackedString(text, value)));
+  }
+
+  // Close the group only when it was opened - no field means no opening delimiter.
+  if (count && kImmCharEnd)
+    ASMJIT_PROPAGATE(sb.append(kImmCharEnd));
+
+  return kErrorOk;
+}
+
+ASMJIT_FAVOR_SIZE static Error FormatterInternal_explainConst(
+  String& sb,
+  FormatFlags formatFlags,
+  InstId instId,
+  uint32_t vecSize,
+  const Imm& imm) noexcept {
+
+  DebugUtils::unused(formatFlags);
+
+  static const char vcmpx[] =
+    "EQ_OQ\0" "LT_OS\0"  "LE_OS\0"  "UNORD_Q\0"  "NEQ_UQ\0" "NLT_US\0" "NLE_US\0" "ORD_Q\0"
+    "EQ_UQ\0" "NGE_US\0" "NGT_US\0" "FALSE_OQ\0" "NEQ_OQ\0" "GE_OS\0"  "GT_OS\0"  "TRUE_UQ\0"
+    "EQ_OS\0" "LT_OQ\0"  "LE_OQ\0"  "UNORD_S\0"  "NEQ_US\0" "NLT_UQ\0" "NLE_UQ\0" "ORD_S\0"
+    "EQ_US\0" "NGE_UQ\0" "NGT_UQ\0" "FALSE_OS\0" "NEQ_OS\0" "GE_OQ\0"  "GT_OQ\0"  "TRUE_US\0";
+
+  // Why to make it compatible...
+  static const char vpcmpx[] = "EQ\0" "LT\0" "LE\0" "FALSE\0" "NEQ\0" "GE\0"  "GT\0"    "TRUE\0";
+  static const char vpcomx[] = "LT\0" "LE\0" "GT\0" "GE\0"    "EQ\0"  "NEQ\0" "FALSE\0" "TRUE\0";
+
+  static const char vshufpd[] = "A0\0A1\0B0\0B1\0A2\0A3\0B2\0B3\0A4\0A5\0B4\0B5\0A6\0A7\0B6\0B7\0";
+  static const char vshufps[] = "A0\0A1\0A2\0A3\0A0\0A1\0A2\0A3\0B0\0B1\0B2\0B3\0B0\0B1\0B2\0B3\0";
+
+  static const ImmBits vfpclassxx[] = {
+    { 0x07u, 0, ImmBits::kModeLookup, "QNAN\0" "+0\0" "-0\0" "+INF\0" "-INF\0" "DENORMAL\0" "-FINITE\0" "SNAN\0" }
+  };
+
+  static const ImmBits vfixupimmxx[] = {
+    { 0x01u, 0, ImmBits::kModeLookup, "\0" "+INF_IE\0" },
+    { 0x02u, 1, ImmBits::kModeLookup, "\0" "-VE_IE\0"  },
+    { 0x04u, 2, ImmBits::kModeLookup, "\0" "-INF_IE\0" },
+    { 0x08u, 3, ImmBits::kModeLookup, "\0" "SNAN_IE\0" },
+    { 0x10u, 4, ImmBits::kModeLookup, "\0" "ONE_IE\0"  },
+    { 0x20u, 5, ImmBits::kModeLookup, "\0" "ONE_ZE\0"  },
+    { 0x40u, 6, ImmBits::kModeLookup, "\0" "ZERO_IE\0" },
+    { 0x80u, 7, ImmBits::kModeLookup, "\0" "ZERO_ZE\0" }
+  };
+
+  static const ImmBits vgetmantxx[] = {
+    { 0x03u, 0, ImmBits::kModeLookup, "[1, 2)\0" "[.5, 2)\0" "[.5, 1)\0" "[.75, 1.5)\0" },
+    { 0x04u, 2, ImmBits::kModeLookup, "\0" "NO_SIGN\0" },
+    { 0x08u, 3, ImmBits::kModeLookup, "\0" "QNAN_IF_SIGN\0" }
+  };
+
+  static const ImmBits vmpsadbw[] = {
+    { 0x04u, 2, ImmBits::kModeLookup, "BLK1[0]\0" "BLK1[1]\0" },
+    { 0x03u, 0, ImmBits::kModeLookup, "BLK2[0]\0" "BLK2[1]\0" "BLK2[2]\0" "BLK2[3]\0" },
+    { 0x40u, 6, ImmBits::kModeLookup, "BLK1[4]\0" "BLK1[5]\0" },
+    { 0x30u, 4, ImmBits::kModeLookup, "BLK2[4]\0" "BLK2[5]\0" "BLK2[6]\0" "BLK2[7]\0" }
+  };
+
+  static const ImmBits vpclmulqdq[] = {
+    { 0x01u, 0, ImmBits::kModeLookup, "LQ\0" "HQ\0" },
+    { 0x10u, 4, ImmBits::kModeLookup, "LQ\0" "HQ\0" }
+  };
+
+  static const ImmBits vperm2x128[] = {
+    { 0x0Bu, 0, ImmBits::kModeLookup, "A0\0" "A1\0" "B0\0" "B1\0" "\0" "\0" "\0" "\0" "0\0" "0\0" "0\0" "0\0" },
+    { 0xB0u, 4, ImmBits::kModeLookup, "A0\0" "A1\0" "B0\0" "B1\0" "\0" "\0" "\0" "\0" "0\0" "0\0" "0\0" "0\0" }
+  };
+
+  static const ImmBits vrangexx[] = {
+    { 0x03u, 0, ImmBits::kModeLookup, "MIN\0" "MAX\0" "MIN_ABS\0" "MAX_ABS\0" },
+    { 0x0Cu, 2, ImmBits::kModeLookup, "SIGN_A\0" "SIGN_B\0" "SIGN_0\0" "SIGN_1\0" }
+  };
+
+  static const ImmBits vreducexx_vrndscalexx[] = {
+    { 0x07u, 0, ImmBits::kModeLookup, "\0" "\0" "\0" "\0" "ROUND\0" "FLOOR\0" "CEIL\0" "TRUNC\0" },
+    { 0x08u, 3, ImmBits::kModeLookup, "\0" "SAE\0" },
+    { 0xF0u, 4, ImmBits::kModeFormat, "LEN=%d" }
+  };
+
+  static const ImmBits vroundxx[] = {
+    { 0x07u, 0, ImmBits::kModeLookup, "ROUND\0" "FLOOR\0" "CEIL\0" "TRUNC\0" "\0" "\0" "\0" "\0" },
+    { 0x08u, 3, ImmBits::kModeLookup, "\0" "INEXACT\0" }
+  };
+
+  uint32_t u8 = imm.valueAs<uint8_t>();
+  switch (instId) {
+    case Inst::kIdVblendpd:
+    case Inst::kIdBlendpd:
+      return FormatterInternal_formatImmShuf(sb, u8, 1, vecSize / 8);
+
+    case Inst::kIdVblendps:
+    case Inst::kIdBlendps:
+      return FormatterInternal_formatImmShuf(sb, u8, 1, vecSize / 4);
+
+    case Inst::kIdVcmppd:
+    case Inst::kIdVcmpps:
+    case Inst::kIdVcmpsd:
+    case Inst::kIdVcmpss:
+      return FormatterInternal_formatImmText(sb, u8, 5, 0, vcmpx);
+
+    case Inst::kIdCmppd:
+    case Inst::kIdCmpps:
+    case Inst::kIdCmpsd:
+    case Inst::kIdCmpss:
+      return FormatterInternal_formatImmText(sb, u8, 3, 0, vcmpx);
+
+    case Inst::kIdVdbpsadbw:
+      return FormatterInternal_formatImmShuf(sb, u8, 2, 4);
+
+    case Inst::kIdVdppd:
+    case Inst::kIdVdpps:
+    case Inst::kIdDppd:
+    case Inst::kIdDpps:
+      return FormatterInternal_formatImmShuf(sb, u8, 1, 8);
+
+    case Inst::kIdVmpsadbw:
+    case Inst::kIdMpsadbw:
+      return FormatterInternal_formatImmBits(sb, u8, vmpsadbw, Support::min<uint32_t>(vecSize / 8, 4));
+
+    case Inst::kIdVpblendw:
+    case Inst::kIdPblendw:
+      return FormatterInternal_formatImmShuf(sb, u8, 1, 8);
+
+    case Inst::kIdVpblendd:
+      return FormatterInternal_formatImmShuf(sb, u8, 1, Support::min<uint32_t>(vecSize / 4, 8));
+
+    case Inst::kIdVpclmulqdq:
+    case Inst::kIdPclmulqdq:
+      return FormatterInternal_formatImmBits(sb, u8, vpclmulqdq, ASMJIT_ARRAY_SIZE(vpclmulqdq));
+
+    case Inst::kIdVroundpd:
+    case Inst::kIdVroundps:
+    case Inst::kIdVroundsd:
+    case Inst::kIdVroundss:
+    case Inst::kIdRoundpd:
+    case Inst::kIdRoundps:
+    case Inst::kIdRoundsd:
+    case Inst::kIdRoundss:
+      return FormatterInternal_formatImmBits(sb, u8, vroundxx, ASMJIT_ARRAY_SIZE(vroundxx));
+
+    case Inst::kIdVshufpd:
+    case Inst::kIdShufpd:
+      return FormatterInternal_formatImmText(sb, u8, 1, 2, vshufpd, Support::min<uint32_t>(vecSize / 8, 8));
+
+    case Inst::kIdVshufps:
+    case Inst::kIdShufps:
+      return FormatterInternal_formatImmText(sb, u8, 2, 4, vshufps, 4);
+
+    case Inst::kIdVcvtps2ph:
+      return FormatterInternal_formatImmBits(sb, u8, vroundxx, 1);
+
+    case Inst::kIdVperm2f128:
+    case Inst::kIdVperm2i128:
+      return FormatterInternal_formatImmBits(sb, u8, vperm2x128, ASMJIT_ARRAY_SIZE(vperm2x128));
+
+    case Inst::kIdVpermilpd:
+      return FormatterInternal_formatImmShuf(sb, u8, 1, vecSize / 8);
+
+    case Inst::kIdVpermilps:
+      return FormatterInternal_formatImmShuf(sb, u8, 2, 4);
+
+    case Inst::kIdVpshufd:
+    case Inst::kIdPshufd:
+      return FormatterInternal_formatImmShuf(sb, u8, 2, 4);
+
+    case Inst::kIdVpshufhw:
+    case Inst::kIdVpshuflw:
+    case Inst::kIdPshufhw:
+    case Inst::kIdPshuflw:
+    case Inst::kIdPshufw:
+      return FormatterInternal_formatImmShuf(sb, u8, 2, 4);
+
+    case Inst::kIdVfixupimmpd:
+    case Inst::kIdVfixupimmps:
+    case Inst::kIdVfixupimmsd:
+    case Inst::kIdVfixupimmss:
+      return FormatterInternal_formatImmBits(sb, u8, vfixupimmxx, ASMJIT_ARRAY_SIZE(vfixupimmxx));
+
+    case Inst::kIdVfpclasspd:
+    case Inst::kIdVfpclassps:
+    case Inst::kIdVfpclasssd:
+    case Inst::kIdVfpclassss:
+      return FormatterInternal_formatImmBits(sb, u8, vfpclassxx, ASMJIT_ARRAY_SIZE(vfpclassxx));
+
+    case Inst::kIdVgetmantpd:
+    case Inst::kIdVgetmantps:
+    case Inst::kIdVgetmantsd:
+    case Inst::kIdVgetmantss:
+      return FormatterInternal_formatImmBits(sb, u8, vgetmantxx, ASMJIT_ARRAY_SIZE(vgetmantxx));
+
+    case Inst::kIdVpcmpb:
+    case Inst::kIdVpcmpd:
+    case Inst::kIdVpcmpq:
+    case Inst::kIdVpcmpw:
+    case Inst::kIdVpcmpub:
+    case Inst::kIdVpcmpud:
+    case Inst::kIdVpcmpuq:
+    case Inst::kIdVpcmpuw:
+      return FormatterInternal_formatImmText(sb, u8, 3, 0, vpcmpx);
+
+    case Inst::kIdVpcomb:
+    case Inst::kIdVpcomd:
+    case Inst::kIdVpcomq:
+    case Inst::kIdVpcomw:
+    case Inst::kIdVpcomub:
+    case Inst::kIdVpcomud:
+    case Inst::kIdVpcomuq:
+    case Inst::kIdVpcomuw:
+      return FormatterInternal_formatImmText(sb, u8, 3, 0, vpcomx);
+
+    case Inst::kIdVpermq:
+    case Inst::kIdVpermpd:
+      return FormatterInternal_formatImmShuf(sb, u8, 2, 4);
+
+    case Inst::kIdVpternlogd:
+    case Inst::kIdVpternlogq:
+      return FormatterInternal_formatImmShuf(sb, u8, 1, 8);
+
+    case Inst::kIdVrangepd:
+    case Inst::kIdVrangeps:
+    case Inst::kIdVrangesd:
+    case Inst::kIdVrangess:
+      return FormatterInternal_formatImmBits(sb, u8, vrangexx, ASMJIT_ARRAY_SIZE(vrangexx));
+
+    case Inst::kIdVreducepd:
+    case Inst::kIdVreduceps:
+    case Inst::kIdVreducesd:
+    case Inst::kIdVreducess:
+    case Inst::kIdVrndscalepd:
+    case Inst::kIdVrndscaleps:
+    case Inst::kIdVrndscalesd:
+    case Inst::kIdVrndscaless:
+      return FormatterInternal_formatImmBits(sb, u8, vreducexx_vrndscalexx, ASMJIT_ARRAY_SIZE(vreducexx_vrndscalexx));
+
+    case Inst::kIdVshuff32x4:
+    case Inst::kIdVshuff64x2:
+    case Inst::kIdVshufi32x4:
+    case Inst::kIdVshufi64x2: {
+      uint32_t count = Support::max<uint32_t>(vecSize / 16, 2u);
+      uint32_t bits = count <= 2 ? 1u : 2u;
+      return FormatterInternal_formatImmShuf(sb, u8, bits, count);
+    }
+
+    default:
+      return kErrorOk;
+  }
+}
+
+// x86::FormatterInternal - Format Instruction
+// ===========================================
+
+// Formats the instruction `inst` together with its `operands` and appends the text to `sb`.
+//
+// The output is built in three steps:
+//   1. Option prefixes (`{vex}`, `lock`, `rep`, `rex`, ...) followed by the mnemonic, or `[InstId=#N]` when the
+//      instruction id is not below `Inst::_kIdCount`.
+//   2. Operands separated by ", " - formatting stops at the first operand for which `isNone()` is true. Immediates
+//      are explained when `FormatFlags::kExplainImms` is set, and AVX-512 `{k}{z}` / `{1toN}` decorations are added.
+//   3. The AVX-512 embedded-rounding or `{sae}` decoration.
+//
+// `emitter` and `arch` are only forwarded to the operand, register, and mnemonic formatters.
+//
+// Returns `kErrorOk` on success, otherwise the first error returned by any append or nested formatting call.
+ASMJIT_FAVOR_SIZE Error FormatterInternal::formatInstruction(
+  String& sb,
+  FormatFlags formatFlags,
+  const BaseEmitter* emitter,
+  Arch arch,
+  const BaseInst& inst, const Operand_* operands, size_t opCount) noexcept {
+
+  InstId instId = inst.id();
+  InstOptions options = inst.options();
+
+  // Format instruction options and instruction mnemonic.
+  if (instId < Inst::_kIdCount) {
+    // VEX|EVEX options.
+    if (Support::test(options, InstOptions::kX86_Vex))
+      ASMJIT_PROPAGATE(sb.append("{vex} "));
+
+    if (Support::test(options, InstOptions::kX86_Vex3))
+      ASMJIT_PROPAGATE(sb.append("{vex3} "));
+
+    if (Support::test(options, InstOptions::kX86_Evex))
+      ASMJIT_PROPAGATE(sb.append("{evex} "));
+
+    // MOD/RM and MOD/MR options
+    if (Support::test(options, InstOptions::kX86_ModRM))
+      ASMJIT_PROPAGATE(sb.append("{modrm} "));
+    else if (Support::test(options, InstOptions::kX86_ModMR))
+      ASMJIT_PROPAGATE(sb.append("{modmr} "));
+
+    // SHORT|LONG options.
+    if (Support::test(options, InstOptions::kShortForm))
+      ASMJIT_PROPAGATE(sb.append("short "));
+
+    if (Support::test(options, InstOptions::kLongForm))
+      ASMJIT_PROPAGATE(sb.append("long "));
+
+    // LOCK|XACQUIRE|XRELEASE options.
+    if (Support::test(options, InstOptions::kX86_XAcquire))
+      ASMJIT_PROPAGATE(sb.append("xacquire "));
+
+    if (Support::test(options, InstOptions::kX86_XRelease))
+      ASMJIT_PROPAGATE(sb.append("xrelease "));
+
+    if (Support::test(options, InstOptions::kX86_Lock))
+      ASMJIT_PROPAGATE(sb.append("lock "));
+
+    // REP|REPNE options.
+    if (Support::test(options, InstOptions::kX86_Rep | InstOptions::kX86_Repne)) {
+      // The append result is propagated like every other append in this function.
+      ASMJIT_PROPAGATE(sb.append(Support::test(options, InstOptions::kX86_Rep) ? "rep " : "repnz "));
+      if (inst.hasExtraReg()) {
+        ASMJIT_PROPAGATE(sb.append("{"));
+        ASMJIT_PROPAGATE(formatOperand(sb, formatFlags, emitter, arch, inst.extraReg().toReg<BaseReg>()));
+        ASMJIT_PROPAGATE(sb.append("} "));
+      }
+    }
+
+    // REX options.
+    if (Support::test(options, InstOptions::kX86_Rex)) {
+      const InstOptions kRXBWMask = InstOptions::kX86_OpCodeR |
+                                    InstOptions::kX86_OpCodeX |
+                                    InstOptions::kX86_OpCodeB |
+                                    InstOptions::kX86_OpCodeW ;
+      if (Support::test(options, kRXBWMask)) {
+        // Emits "rex." followed by one letter per requested R/X/B/W bit, e.g. "rex.rw ".
+        ASMJIT_PROPAGATE(sb.append("rex."));
+        if (Support::test(options, InstOptions::kX86_OpCodeR)) ASMJIT_PROPAGATE(sb.append('r'));
+        if (Support::test(options, InstOptions::kX86_OpCodeX)) ASMJIT_PROPAGATE(sb.append('x'));
+        if (Support::test(options, InstOptions::kX86_OpCodeB)) ASMJIT_PROPAGATE(sb.append('b'));
+        if (Support::test(options, InstOptions::kX86_OpCodeW)) ASMJIT_PROPAGATE(sb.append('w'));
+        ASMJIT_PROPAGATE(sb.append(' '));
+      }
+      else {
+        ASMJIT_PROPAGATE(sb.append("rex "));
+      }
+    }
+
+    ASMJIT_PROPAGATE(InstInternal::instIdToString(arch, instId, sb));
+  }
+  else {
+    ASMJIT_PROPAGATE(sb.appendFormat("[InstId=#%u]", unsigned(instId)));
+  }
+
+  for (uint32_t i = 0; i < opCount; i++) {
+    const Operand_& op = operands[i];
+    if (op.isNone()) break;
+
+    ASMJIT_PROPAGATE(sb.append(i == 0 ? " " : ", "));
+    ASMJIT_PROPAGATE(formatOperand(sb, formatFlags, emitter, arch, op));
+
+    if (op.isImm() && uint32_t(formatFlags & FormatFlags::kExplainImms)) {
+      // The vector size passed to the explainer is the largest register operand size, but never less than 16.
+      uint32_t vecSize = 16;
+      for (uint32_t j = 0; j < opCount; j++)
+        if (operands[j].isReg())
+          vecSize = Support::max<uint32_t>(vecSize, operands[j].size());
+      ASMJIT_PROPAGATE(FormatterInternal_explainConst(sb, formatFlags, instId, vecSize, op.as<Imm>()));
+    }
+
+    // Support AVX-512 masking - {k}{z}.
+    if (i == 0) {
+      if (inst.extraReg().group() == RegGroup::kX86_K) {
+        ASMJIT_PROPAGATE(sb.append(" {"));
+        ASMJIT_PROPAGATE(formatRegister(sb, formatFlags, emitter, arch, inst.extraReg().type(), inst.extraReg().id()));
+        ASMJIT_PROPAGATE(sb.append('}'));
+
+        if (Support::test(options, InstOptions::kX86_ZMask))
+          ASMJIT_PROPAGATE(sb.append("{z}"));
+      }
+      else if (Support::test(options, InstOptions::kX86_ZMask)) {
+        ASMJIT_PROPAGATE(sb.append(" {z}"));
+      }
+    }
+
+    // Support AVX-512 broadcast - {1tox}.
+    if (op.isMem() && op.as<Mem>().hasBroadcast()) {
+      ASMJIT_PROPAGATE(sb.appendFormat(" {1to%u}", Support::bitMask(uint32_t(op.as<Mem>().getBroadcast()))));
+    }
+  }
+
+  // Support AVX-512 embedded rounding and suppress-all-exceptions {sae}.
+  if (inst.hasOption(InstOptions::kX86_ER | InstOptions::kX86_SAE)) {
+    if (inst.hasOption(InstOptions::kX86_ER)) {
+      uint32_t bits = uint32_t(inst.options() & InstOptions::kX86_ERMask) >> Support::ConstCTZ<uint32_t(InstOptions::kX86_ERMask)>::value;
+
+      // Four NUL-separated two-letter names; each entry occupies 3 bytes, hence the `bits * 3` offset.
+      const char roundingModes[] = "rn\0rd\0ru\0rz";
+      ASMJIT_PROPAGATE(sb.appendFormat(", {%s-sae}", roundingModes + bits * 3));
+    }
+    else {
+      ASMJIT_PROPAGATE(sb.append(", {sae}"));
+    }
+  }
+
+  return kErrorOk;
+}
+
+ASMJIT_END_SUB_NAMESPACE
+
+#endif // !ASMJIT_NO_LOGGING
--- a/lib/lepton/asmjit/x86/x86formatter_p.h
+++ b/lib/lepton/asmjit/x86/x86formatter_p.h
@ -0,0 +1,58 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See asmjit.h or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#ifndef ASMJIT_X86_X86FORMATTER_P_H_INCLUDED
+#define ASMJIT_X86_X86FORMATTER_P_H_INCLUDED
+
+#include "../core/api-config.h"
+#ifndef ASMJIT_NO_LOGGING
+
+#include "../core/formatter.h"
+#include "../core/string.h"
+#include "../x86/x86globals.h"
+
+ASMJIT_BEGIN_SUB_NAMESPACE(x86)
+
+//! \cond INTERNAL
+//! \addtogroup asmjit_x86
+//! \{
+
+//! X86-specific formatting functions (internal).
+//!
+//! Every function takes the destination string `sb` as its first parameter and returns an `Error`.
+namespace FormatterInternal {
+
+//! Formats the CPU feature identified by `featureId` into `sb`.
+//!
+//! NOTE(review): presumably writes the feature name - confirm against the definition.
+Error ASMJIT_CDECL formatFeature(
+  String& sb,
+  uint32_t featureId) noexcept;
+
+//! Formats the register described by `regType` and `regId` into `sb`.
+//!
+//! NOTE(review): how `flags` and `emitter` influence the output is not visible here - confirm against the definition.
+Error ASMJIT_CDECL formatRegister(
+  String& sb,
+  FormatFlags flags,
+  const BaseEmitter* emitter,
+  Arch arch,
+  RegType regType,
+  uint32_t regId) noexcept;
+
+//! Formats the operand `op` into `sb`.
+Error ASMJIT_CDECL formatOperand(
+  String& sb,
+  FormatFlags flags,
+  const BaseEmitter* emitter,
+  Arch arch,
+  const Operand_& op) noexcept;
+
+//! Formats the instruction `inst` and up to `opCount` entries of `operands` into `sb`.
+Error ASMJIT_CDECL formatInstruction(
+  String& sb,
+  FormatFlags flags,
+  const BaseEmitter* emitter,
+  Arch arch,
+  const BaseInst& inst, const Operand_* operands, size_t opCount) noexcept;
+
+} // {FormatterInternal}
+
+//! \}
+//! \endcond
+
+ASMJIT_END_SUB_NAMESPACE
+
+#endif // !ASMJIT_NO_LOGGING
+#endif // ASMJIT_X86_X86FORMATTER_P_H_INCLUDED
--- a/lib/lepton/asmjit/x86/x86func.cpp
+++ b/lib/lepton/asmjit/x86/x86func.cpp
@ -0,0 +1,503 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See asmjit.h or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#include "../core/api-build_p.h"
+#if !defined(ASMJIT_NO_X86)
+
+#include "../x86/x86func_p.h"
+#include "../x86/x86emithelper_p.h"
+#include "../x86/x86operand.h"
+
+ASMJIT_BEGIN_SUB_NAMESPACE(x86)
+
+namespace FuncInternal {
+
+// Tells whether `ccId` is one of the calling conventions that get replaced by the platform's default 64-bit
+// convention: cdecl, stdcall, thiscall, fastcall, and the regparm(1..3) family.
+static inline bool shouldThreatAsCDeclIn64BitMode(CallConvId ccId) noexcept {
+  switch (ccId) {
+    case CallConvId::kCDecl:
+    case CallConvId::kStdCall:
+    case CallConvId::kThisCall:
+    case CallConvId::kFastCall:
+    case CallConvId::kRegParm1:
+    case CallConvId::kRegParm2:
+    case CallConvId::kRegParm3:
+      return true;
+
+    default:
+      return false;
+  }
+}
+
+// Initializes `cc` so it describes the calling convention `ccId` for the target `environment`.
+//
+// Save/restore sizes and alignments of vector, MM, and K registers are configured first. After that either the 32-bit
+// or the 64-bit branch sets the GP register size, the order in which registers are used to pass arguments, the set
+// of preserved registers, the convention flags, and the stack properties.
+//
+// Returns `kErrorInvalidArgument` when `ccId` is not handled by the selected branch, otherwise `kErrorOk`. On success
+// the id stored through `cc.setId()` can differ from the `ccId` passed in, because some conventions are remapped.
+ASMJIT_FAVOR_SIZE Error initCallConv(CallConv& cc, CallConvId ccId, const Environment& environment) noexcept {
+  // Short aliases of GP register ids used in the tables below.
+  constexpr uint32_t kZax = Gp::kIdAx;
+  constexpr uint32_t kZbx = Gp::kIdBx;
+  constexpr uint32_t kZcx = Gp::kIdCx;
+  constexpr uint32_t kZdx = Gp::kIdDx;
+  constexpr uint32_t kZsp = Gp::kIdSp;
+  constexpr uint32_t kZbp = Gp::kIdBp;
+  constexpr uint32_t kZsi = Gp::kIdSi;
+  constexpr uint32_t kZdi = Gp::kIdDi;
+
+  // True when either `environment.isPlatformWindows()` or `environment.isMSVC()` reports true.
+  bool winABI = environment.isPlatformWindows() || environment.isMSVC();
+
+  cc.setArch(environment.arch());
+  cc.setSaveRestoreRegSize(RegGroup::kVec, 16);
+  cc.setSaveRestoreRegSize(RegGroup::kX86_MM, 8);
+  cc.setSaveRestoreRegSize(RegGroup::kX86_K, 8);
+  cc.setSaveRestoreAlignment(RegGroup::kVec, 16);
+  cc.setSaveRestoreAlignment(RegGroup::kX86_MM, 8);
+  cc.setSaveRestoreAlignment(RegGroup::kX86_K, 8);
+
+  if (environment.is32Bit()) {
+    // Cleared only by the LightCall conventions, which configure MM and vector argument registers themselves.
+    bool isStandardCallConv = true;
+
+    cc.setSaveRestoreRegSize(RegGroup::kGp, 4);
+    cc.setSaveRestoreAlignment(RegGroup::kGp, 4);
+
+    // Defaults shared by all 32-bit conventions; LightCall overrides both below.
+    cc.setPreservedRegs(RegGroup::kGp, Support::bitMask(Gp::kIdBx, Gp::kIdSp, Gp::kIdBp, Gp::kIdSi, Gp::kIdDi));
+    cc.setNaturalStackAlignment(4);
+
+    switch (ccId) {
+      case CallConvId::kCDecl:
+        break;
+
+      case CallConvId::kStdCall:
+        cc.setFlags(CallConvFlags::kCalleePopsStack);
+        break;
+
+      case CallConvId::kFastCall:
+        cc.setFlags(CallConvFlags::kCalleePopsStack);
+        cc.setPassedOrder(RegGroup::kGp, kZcx, kZdx);
+        break;
+
+      case CallConvId::kVectorCall:
+        cc.setFlags(CallConvFlags::kCalleePopsStack);
+        cc.setPassedOrder(RegGroup::kGp, kZcx, kZdx);
+        cc.setPassedOrder(RegGroup::kVec, 0, 1, 2, 3, 4, 5);
+        break;
+
+      case CallConvId::kThisCall:
+        // NOTE: Even MINGW (starting with GCC 4.7.0) now uses __thiscall on MS Windows, so we won't bail to any
+        // other calling convention if __thiscall was specified.
+        if (winABI) {
+          cc.setFlags(CallConvFlags::kCalleePopsStack);
+          cc.setPassedOrder(RegGroup::kGp, kZcx);
+        }
+        else {
+          // Outside of the Windows ABI the convention is stored as cdecl (and gets `kVarArgCompatible` below).
+          ccId = CallConvId::kCDecl;
+        }
+        break;
+
+      case CallConvId::kRegParm1:
+        cc.setPassedOrder(RegGroup::kGp, kZax);
+        break;
+
+      case CallConvId::kRegParm2:
+        cc.setPassedOrder(RegGroup::kGp, kZax, kZdx);
+        break;
+
+      case CallConvId::kRegParm3:
+        cc.setPassedOrder(RegGroup::kGp, kZax, kZdx, kZcx);
+        break;
+
+      case CallConvId::kLightCall2:
+      case CallConvId::kLightCall3:
+      case CallConvId::kLightCall4: {
+        // `n` is 2 for kLightCall2 and grows with the distance of `ccId` from it.
+        // NOTE(review): presumably 3 and 4 for kLightCall3/4, i.e. the ids are consecutive - confirm in the enum.
+        uint32_t n = uint32_t(ccId) - uint32_t(CallConvId::kLightCall2) + 2;
+
+        cc.setFlags(CallConvFlags::kPassFloatsByVec);
+        cc.setPassedOrder(RegGroup::kGp, kZax, kZdx, kZcx, kZsi, kZdi);
+        cc.setPassedOrder(RegGroup::kVec, 0, 1, 2, 3, 4, 5, 6, 7);
+        cc.setPassedOrder(RegGroup::kX86_K, 0, 1, 2, 3, 4, 5, 6, 7);
+        cc.setPassedOrder(RegGroup::kX86_MM, 0, 1, 2, 3, 4, 5, 6, 7);
+        // All 8 GP registers are preserved; of the 8 vector registers only the lowest `n` are not.
+        cc.setPreservedRegs(RegGroup::kGp, Support::lsbMask<uint32_t>(8));
+        cc.setPreservedRegs(RegGroup::kVec, Support::lsbMask<uint32_t>(8) & ~Support::lsbMask<uint32_t>(n));
+
+        cc.setNaturalStackAlignment(16);
+        isStandardCallConv = false;
+        break;
+      }
+
+      default:
+        return DebugUtils::errored(kErrorInvalidArgument);
+    }
+
+    if (isStandardCallConv) {
+      // MMX arguments are something compiler vendors disagree on. For example GCC and MSVC would pass the first
+      // three via registers and the rest via stack, however Clang passes all via stack. Returning MMX registers is
+      // even more fun, where GCC uses MM0, but Clang uses EAX:EDX pair. I'm not sure it's something we should be
+      // worried about as MMX is deprecated anyway.
+      cc.setPassedOrder(RegGroup::kX86_MM, 0, 1, 2);
+
+      // Vector arguments (XMM|YMM|ZMM) are passed via registers. However, if the function is variadic then they have
+      // to be passed via stack.
+      cc.setPassedOrder(RegGroup::kVec, 0, 1, 2);
+
+      // Functions with variable arguments always use stack for MM and vector arguments.
+      cc.addFlags(CallConvFlags::kPassVecByStackIfVA);
+    }
+
+    // Checked after the switch so that a thiscall that fell back to cdecl is covered as well.
+    if (ccId == CallConvId::kCDecl) {
+      cc.addFlags(CallConvFlags::kVarArgCompatible);
+    }
+  }
+  else {
+    cc.setSaveRestoreRegSize(RegGroup::kGp, 8);
+    cc.setSaveRestoreAlignment(RegGroup::kGp, 8);
+
+    // Preprocess the calling convention into a common id as many conventions are normally ignored even by C/C++
+    // compilers and treated as `__cdecl`.
+    if (shouldThreatAsCDeclIn64BitMode(ccId))
+      ccId = winABI ? CallConvId::kX64Windows : CallConvId::kX64SystemV;
+
+    switch (ccId) {
+      case CallConvId::kX64SystemV: {
+        cc.setFlags(CallConvFlags::kPassFloatsByVec |
+                    CallConvFlags::kPassMmxByXmm    |
+                    CallConvFlags::kVarArgCompatible);
+        cc.setNaturalStackAlignment(16);
+        cc.setRedZoneSize(128);
+        // Register ids 8 and above are given as plain numbers, there are no `kZ*` aliases for them.
+        cc.setPassedOrder(RegGroup::kGp, kZdi, kZsi, kZdx, kZcx, 8, 9);
+        cc.setPassedOrder(RegGroup::kVec, 0, 1, 2, 3, 4, 5, 6, 7);
+        cc.setPreservedRegs(RegGroup::kGp, Support::bitMask(kZbx, kZsp, kZbp, 12, 13, 14, 15));
+        break;
+      }
+
+      case CallConvId::kX64Windows: {
+        cc.setStrategy(CallConvStrategy::kX64Windows);
+        cc.setFlags(CallConvFlags::kPassFloatsByVec |
+                    CallConvFlags::kIndirectVecArgs |
+                    CallConvFlags::kPassMmxByGp     |
+                    CallConvFlags::kVarArgCompatible);
+        cc.setNaturalStackAlignment(16);
+        // Maximum 4 arguments in registers, each adds 8 bytes to the spill zone.
+        cc.setSpillZoneSize(4 * 8);
+        cc.setPassedOrder(RegGroup::kGp, kZcx, kZdx, 8, 9);
+        cc.setPassedOrder(RegGroup::kVec, 0, 1, 2, 3);
+        cc.setPreservedRegs(RegGroup::kGp, Support::bitMask(kZbx, kZsp, kZbp, kZsi, kZdi, 12, 13, 14, 15));
+        cc.setPreservedRegs(RegGroup::kVec, Support::bitMask(6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
+        break;
+      }
+
+      case CallConvId::kVectorCall: {
+        cc.setStrategy(CallConvStrategy::kX64VectorCall);
+        cc.setFlags(CallConvFlags::kPassFloatsByVec |
+                    CallConvFlags::kPassMmxByGp     );
+        cc.setNaturalStackAlignment(16);
+        // Maximum 6 arguments in registers, each adds 8 bytes to the spill zone.
+        cc.setSpillZoneSize(6 * 8);
+        cc.setPassedOrder(RegGroup::kGp, kZcx, kZdx, 8, 9);
+        cc.setPassedOrder(RegGroup::kVec, 0, 1, 2, 3, 4, 5);
+        cc.setPreservedRegs(RegGroup::kGp, Support::bitMask(kZbx, kZsp, kZbp, kZsi, kZdi, 12, 13, 14, 15));
+        cc.setPreservedRegs(RegGroup::kVec, Support::bitMask(6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
+        break;
+      }
+
+      case CallConvId::kLightCall2:
+      case CallConvId::kLightCall3:
+      case CallConvId::kLightCall4: {
+        // Same derivation of `n` as in the 32-bit branch.
+        uint32_t n = uint32_t(ccId) - uint32_t(CallConvId::kLightCall2) + 2;
+
+        cc.setFlags(CallConvFlags::kPassFloatsByVec);
+        cc.setNaturalStackAlignment(16);
+        cc.setPassedOrder(RegGroup::kGp, kZax, kZdx, kZcx, kZsi, kZdi);
+        cc.setPassedOrder(RegGroup::kVec, 0, 1, 2, 3, 4, 5, 6, 7);
+        cc.setPassedOrder(RegGroup::kX86_K, 0, 1, 2, 3, 4, 5, 6, 7);
+        cc.setPassedOrder(RegGroup::kX86_MM, 0, 1, 2, 3, 4, 5, 6, 7);
+
+        // All 16 GP registers are preserved; every vector register except the lowest `n` is preserved.
+        cc.setPreservedRegs(RegGroup::kGp, Support::lsbMask<uint32_t>(16));
+        cc.setPreservedRegs(RegGroup::kVec, ~Support::lsbMask<uint32_t>(n));
+        break;
+      }
+
+      default:
+        return DebugUtils::errored(kErrorInvalidArgument);
+    }
+  }
+
+  // Stores the final id, which may have been remapped above.
+  cc.setId(ccId);
+  return kErrorOk;
+}
+
+// Splits a 64-bit integer value into two pack entries when the calling convention of `func` targets a 32-bit
+// architecture. Packs whose first entry has any other type, and all packs on non-32-bit targets, are left untouched.
+ASMJIT_FAVOR_SIZE void unpackValues(FuncDetail& func, FuncValuePack& pack) noexcept {
+  const TypeId firstTypeId = pack[0].typeId();
+  const bool is64BitInt = firstTypeId == TypeId::kInt64 || firstTypeId == TypeId::kUInt64;
+
+  // Only 64-bit integers need unpacking.
+  if (!is64BitInt)
+    return;
+
+  // The architecture is queried only after a 64-bit integer has been seen.
+  if (!Environment::is32Bit(func.callConv().arch()))
+    return;
+
+  // Convert a 64-bit value to two 32-bit values. The second entry gets the type id two positions below the
+  // original one (NOTE(review): presumably the 32-bit type of the same signedness - confirm in `TypeId`).
+  pack[0].initTypeId(TypeId::kUInt32);
+  pack[1].initTypeId(TypeId(uint32_t(firstTypeId) - 2));
+}
+
+// Assigns registers and stack offsets to the return values and arguments of `func`.
+//
+// The calling convention and the argument count are read from `func` itself (`func.callConv()`, `func.argCount()`).
+// `signature` is only consulted for `hasVarArgs()`. `registerSize` is the minimum number of bytes an integer argument
+// occupies on the stack when the default strategy is used.
+//
+// Stack offsets start at the spill zone size of the calling convention; the final offset is stored to
+// `func._argStackSize`.
+//
+// Returns `kErrorInvalidState` when an integer (or, in 64-bit mode with a non-default strategy, MMX) return value has
+// no GP return register available for its position in the pack, otherwise `kErrorOk`.
+ASMJIT_FAVOR_SIZE Error initFuncDetail(FuncDetail& func, const FuncSignature& signature, uint32_t registerSize) noexcept {
+  const CallConv& cc = func.callConv();
+  Arch arch = cc.arch();
+  uint32_t stackOffset = cc._spillZoneSize;
+  uint32_t argCount = func.argCount();
+
+  // Up to two return values can be returned in GP registers.
+  static const uint8_t gpReturnIndexes[4] = {
+    uint8_t(Gp::kIdAx),
+    uint8_t(Gp::kIdDx),
+    uint8_t(BaseReg::kIdBad),
+    uint8_t(BaseReg::kIdBad)
+  };
+
+  if (func.hasRet()) {
+    unpackValues(func, func._rets);
+    for (uint32_t valueIndex = 0; valueIndex < Globals::kMaxValuePack; valueIndex++) {
+      TypeId typeId = func._rets[valueIndex].typeId();
+
+      // Terminate at the first void type (end of the pack).
+      if (typeId == TypeId::kVoid)
+        break;
+
+      switch (typeId) {
+        case TypeId::kInt64:
+        case TypeId::kUInt64: {
+          if (gpReturnIndexes[valueIndex] != BaseReg::kIdBad)
+            func._rets[valueIndex].initReg(RegType::kX86_Gpq, gpReturnIndexes[valueIndex], typeId);
+          else
+            return DebugUtils::errored(kErrorInvalidState);
+          break;
+        }
+
+        // Signed integers of up to 32 bits are returned in a 32-bit GP register typed as `kInt32`.
+        case TypeId::kInt8:
+        case TypeId::kInt16:
+        case TypeId::kInt32: {
+          if (gpReturnIndexes[valueIndex] != BaseReg::kIdBad)
+            func._rets[valueIndex].initReg(RegType::kX86_Gpd, gpReturnIndexes[valueIndex], TypeId::kInt32);
+          else
+            return DebugUtils::errored(kErrorInvalidState);
+          break;
+        }
+
+        // Unsigned integers of up to 32 bits are returned in a 32-bit GP register typed as `kUInt32`.
+        case TypeId::kUInt8:
+        case TypeId::kUInt16:
+        case TypeId::kUInt32: {
+          if (gpReturnIndexes[valueIndex] != BaseReg::kIdBad)
+            func._rets[valueIndex].initReg(RegType::kX86_Gpd, gpReturnIndexes[valueIndex], TypeId::kUInt32);
+          else
+            return DebugUtils::errored(kErrorInvalidState);
+          break;
+        }
+
+        // Floats are returned in ST registers in 32-bit mode and in XMM registers otherwise.
+        case TypeId::kFloat32:
+        case TypeId::kFloat64: {
+          RegType regType = Environment::is32Bit(arch) ? RegType::kX86_St : RegType::kX86_Xmm;
+          func._rets[valueIndex].initReg(regType, valueIndex, typeId);
+          break;
+        }
+
+        case TypeId::kFloat80: {
+          // 80-bit floats are always returned by FP0.
+          func._rets[valueIndex].initReg(RegType::kX86_St, valueIndex, typeId);
+          break;
+        }
+
+        case TypeId::kMmx32:
+        case TypeId::kMmx64: {
+          // MM registers are returned through XMM (SystemV) or GPQ (Win64).
+          RegType regType = RegType::kX86_Mm;
+          uint32_t regIndex = valueIndex;
+          if (Environment::is64Bit(arch)) {
+            regType = cc.strategy() == CallConvStrategy::kDefault ? RegType::kX86_Xmm : RegType::kX86_Gpq;
+            regIndex = cc.strategy() == CallConvStrategy::kDefault ? valueIndex : gpReturnIndexes[valueIndex];
+
+            if (regIndex == BaseReg::kIdBad)
+              return DebugUtils::errored(kErrorInvalidState);
+          }
+
+          func._rets[valueIndex].initReg(regType, regIndex, typeId);
+          break;
+        }
+
+        // Every remaining type is treated as a vector type and returned in the matching vector register.
+        default: {
+          func._rets[valueIndex].initReg(vecTypeIdToRegType(typeId), valueIndex, typeId);
+          break;
+        }
+      }
+    }
+  }
+
+  switch (cc.strategy()) {
+    case CallConvStrategy::kDefault: {
+      // GP and vector argument registers are consumed independently of each other.
+      uint32_t gpzPos = 0;
+      uint32_t vecPos = 0;
+
+      for (uint32_t argIndex = 0; argIndex < argCount; argIndex++) {
+        unpackValues(func, func._args[argIndex]);
+
+        for (uint32_t valueIndex = 0; valueIndex < Globals::kMaxValuePack; valueIndex++) {
+          FuncValue& arg = func._args[argIndex][valueIndex];
+
+          // Terminate if there are no more arguments in the pack.
+          if (!arg)
+            break;
+
+          TypeId typeId = arg.typeId();
+
+          if (TypeUtils::isInt(typeId)) {
+            uint32_t regId = BaseReg::kIdBad;
+
+            if (gpzPos < CallConv::kMaxRegArgsPerGroup)
+              regId = cc._passedOrder[RegGroup::kGp].id[gpzPos];
+
+            if (regId != BaseReg::kIdBad) {
+              RegType regType = typeId <= TypeId::kUInt32 ? RegType::kX86_Gpd : RegType::kX86_Gpq;
+              arg.assignRegData(regType, regId);
+              func.addUsedRegs(RegGroup::kGp, Support::bitMask(regId));
+              gpzPos++;
+            }
+            else {
+              // A stack slot is never smaller than `registerSize`.
+              uint32_t size = Support::max<uint32_t>(TypeUtils::sizeOf(typeId), registerSize);
+              arg.assignStackOffset(int32_t(stackOffset));
+              stackOffset += size;
+            }
+            continue;
+          }
+
+          if (TypeUtils::isFloat(typeId) || TypeUtils::isVec(typeId)) {
+            uint32_t regId = BaseReg::kIdBad;
+
+            if (vecPos < CallConv::kMaxRegArgsPerGroup)
+              regId = cc._passedOrder[RegGroup::kVec].id[vecPos];
+
+            if (TypeUtils::isFloat(typeId)) {
+              // If this is a float, but `kFlagPassFloatsByVec` is false, we have to use stack instead. This should
+              // be only used by 32-bit calling conventions.
+              if (!cc.hasFlag(CallConvFlags::kPassFloatsByVec))
+                regId = BaseReg::kIdBad;
+            }
+            else {
+              // Pass vector registers via stack if this is a variable arguments function. This should be only used
+              // by 32-bit calling conventions.
+              if (signature.hasVarArgs() && cc.hasFlag(CallConvFlags::kPassVecByStackIfVA))
+                regId = BaseReg::kIdBad;
+            }
+
+            if (regId != BaseReg::kIdBad) {
+              arg.initTypeId(typeId);
+              arg.assignRegData(vecTypeIdToRegType(typeId), regId);
+              func.addUsedRegs(RegGroup::kVec, Support::bitMask(regId));
+              vecPos++;
+            }
+            else {
+              uint32_t size = TypeUtils::sizeOf(typeId);
+              arg.assignStackOffset(int32_t(stackOffset));
+              stackOffset += size;
+            }
+            continue;
+          }
+        }
+      }
+      break;
+    }
+
+    case CallConvStrategy::kX64Windows:
+    case CallConvStrategy::kX64VectorCall: {
+      // Both X64 and VectorCall behave similarly - arguments are indexed from left to right. The position of the
+      // argument determines in which register the argument is allocated, so it's either GP or one of XMM/YMM/ZMM
+      // registers.
+      //
+      //       [       X64       ] [VecCall]
+      // Index: #0   #1   #2   #3   #4   #5
+      //
+      // GP   : RCX  RDX  R8   R9
+      // VEC  : XMM0 XMM1 XMM2 XMM3 XMM4 XMM5
+      //
+      // For example function `f(int a, double b, int c, double d)` will be:
+      //
+      //        (a)  (b)  (c)  (d)
+      //        RCX  XMM1 R8   XMM3
+      //
+      // Unused vector registers are used by HVA.
+      bool isVectorCall = (cc.strategy() == CallConvStrategy::kX64VectorCall);
+
+      for (uint32_t argIndex = 0; argIndex < argCount; argIndex++) {
+        unpackValues(func, func._args[argIndex]);
+
+        for (uint32_t valueIndex = 0; valueIndex < Globals::kMaxValuePack; valueIndex++) {
+          FuncValue& arg = func._args[argIndex][valueIndex];
+
+          // Terminate if there are no more arguments in the pack.
+          if (!arg)
+            break;
+
+          TypeId typeId = arg.typeId();
+          uint32_t size = TypeUtils::sizeOf(typeId);
+
+          if (TypeUtils::isInt(typeId) || TypeUtils::isMmx(typeId)) {
+            uint32_t regId = BaseReg::kIdBad;
+
+            if (argIndex < CallConv::kMaxRegArgsPerGroup)
+              regId = cc._passedOrder[RegGroup::kGp].id[argIndex];
+
+            if (regId != BaseReg::kIdBad) {
+              RegType regType = size <= 4 && !TypeUtils::isMmx(typeId) ? RegType::kX86_Gpd : RegType::kX86_Gpq;
+              arg.assignRegData(regType, regId);
+              func.addUsedRegs(RegGroup::kGp, Support::bitMask(regId));
+            }
+            else {
+              arg.assignStackOffset(int32_t(stackOffset));
+              stackOffset += 8;
+            }
+            continue;
+          }
+
+          if (TypeUtils::isFloat(typeId) || TypeUtils::isVec(typeId)) {
+            uint32_t regId = BaseReg::kIdBad;
+
+            if (argIndex < CallConv::kMaxRegArgsPerGroup)
+              regId = cc._passedOrder[RegGroup::kVec].id[argIndex];
+
+            if (regId != BaseReg::kIdBad) {
+              // X64-ABI doesn't allow vector types (XMM|YMM|ZMM) to be passed via registers, however, VectorCall
+              // was designed for that purpose.
+              if (TypeUtils::isFloat(typeId) || isVectorCall) {
+                RegType regType = vecTypeIdToRegType(typeId);
+                arg.assignRegData(regType, regId);
+                func.addUsedRegs(RegGroup::kVec, Support::bitMask(regId));
+                continue;
+              }
+            }
+
+            // Passed via stack if the argument is float/double or indirectly. The trap is - if the argument is
+            // passed indirectly, the address can be passed via register, if the argument's index has GP one.
+            if (TypeUtils::isFloat(typeId)) {
+              arg.assignStackOffset(int32_t(stackOffset));
+            }
+            else {
+              // Guard the table lookup the same way as the GP and vector lookups above, so that an argument index
+              // beyond the passed-order table is never used to index it.
+              uint32_t gpRegId = BaseReg::kIdBad;
+              if (argIndex < CallConv::kMaxRegArgsPerGroup)
+                gpRegId = cc._passedOrder[RegGroup::kGp].id[argIndex];
+
+              if (gpRegId != BaseReg::kIdBad)
+                arg.assignRegData(RegType::kX86_Gpq, gpRegId);
+              else
+                arg.assignStackOffset(int32_t(stackOffset));
+              arg.addFlags(FuncValue::kFlagIsIndirect);
+            }
+
+            // Always 8 bytes (float/double/pointer).
+            stackOffset += 8;
+            continue;
+          }
+        }
+      }
+      break;
+    }
+  }
+
+  func._argStackSize = stackOffset;
+  return kErrorOk;
+}
+
+} // {FuncInternal}
+
+ASMJIT_END_SUB_NAMESPACE
+
+#endif // !ASMJIT_NO_X86
--- a/lib/lepton/asmjit/x86/x86func_p.h
+++ b/lib/lepton/asmjit/x86/x86func_p.h
@ -0,0 +1,33 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See asmjit.h or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#ifndef ASMJIT_X86_X86FUNC_P_H_INCLUDED
+#define ASMJIT_X86_X86FUNC_P_H_INCLUDED
+
+#include "../core/func.h"
+
+ASMJIT_BEGIN_SUB_NAMESPACE(x86)
+
+//! \cond INTERNAL
+//! \addtogroup asmjit_x86
+//! \{
+
+//! X86-specific function API (calling conventions and other utilities).
+namespace FuncInternal {
+
+//! Initialize `CallConv` structure (X86 specific).
+//!
+//! Fills `cc` for the calling convention `ccId` in the given `environment`. Returns an `Error` code.
+Error initCallConv(CallConv& cc, CallConvId ccId, const Environment& environment) noexcept;
+
+//! Initialize `FuncDetail` (X86 specific).
+//!
+//! Takes the function `signature` and the `registerSize` in addition to the `func` being initialized. Returns an
+//! `Error` code.
+Error initFuncDetail(FuncDetail& func, const FuncSignature& signature, uint32_t registerSize) noexcept;
+
+} // {FuncInternal}
+
+//! \}
+//! \endcond
+
+ASMJIT_END_SUB_NAMESPACE
+
+#endif // ASMJIT_X86_X86FUNC_P_H_INCLUDED
--- a/lib/lepton/asmjit/x86/x86globals.h
+++ b/lib/lepton/asmjit/x86/x86globals.h
--- a/lib/lepton/asmjit/x86/x86instapi.cpp
+++ b/lib/lepton/asmjit/x86/x86instapi.cpp
--- a/lib/lepton/asmjit/x86/x86instapi_p.h
+++ b/lib/lepton/asmjit/x86/x86instapi_p.h
@ -0,0 +1,41 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See asmjit.h or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#ifndef ASMJIT_X86_X86INSTAPI_P_H_INCLUDED
+#define ASMJIT_X86_X86INSTAPI_P_H_INCLUDED
+
+#include "../core/inst.h"
+#include "../core/operand.h"
+
+ASMJIT_BEGIN_SUB_NAMESPACE(x86)
+
+//! \cond INTERNAL
+//! \addtogroup asmjit_x86
+//! \{
+
+//! X86-specific instruction API (internal). Each group of functions is only declared when the corresponding
+//! `ASMJIT_NO_...` macro is not defined.
+namespace InstInternal {
+
+#ifndef ASMJIT_NO_TEXT
+//! Converts the instruction id `instId` to text stored in `output`.
+//!
+//! NOTE(review): presumably the text is the instruction mnemonic and is appended - confirm against the definition.
+Error ASMJIT_CDECL instIdToString(Arch arch, InstId instId, String& output) noexcept;
+//! Converts the string `s` of length `len` to an instruction id.
+//!
+//! NOTE(review): the value returned for an unknown name is not visible here - confirm against the definition.
+InstId ASMJIT_CDECL stringToInstId(Arch arch, const char* s, size_t len) noexcept;
+#endif // !ASMJIT_NO_TEXT
+
+#ifndef ASMJIT_NO_VALIDATION
+//! Validates the instruction `inst` and its `operands` according to `validationFlags`.
+Error ASMJIT_CDECL validate(Arch arch, const BaseInst& inst, const Operand_* operands, size_t opCount, ValidationFlags validationFlags) noexcept;
+#endif // !ASMJIT_NO_VALIDATION
+
+#ifndef ASMJIT_NO_INTROSPECTION
+//! Queries read/write information of the instruction `inst` and its `operands`; the result is written to `out`.
+Error ASMJIT_CDECL queryRWInfo(Arch arch, const BaseInst& inst, const Operand_* operands, size_t opCount, InstRWInfo* out) noexcept;
+//! Queries CPU features of the instruction `inst` and its `operands`; the result is written to `out`.
+//!
+//! NOTE(review): presumably these are the features required to execute the instruction - confirm.
+Error ASMJIT_CDECL queryFeatures(Arch arch, const BaseInst& inst, const Operand_* operands, size_t opCount, CpuFeatures* out) noexcept;
+#endif // !ASMJIT_NO_INTROSPECTION
+
+} // {InstInternal}
+
+//! \}
+//! \endcond
+
+ASMJIT_END_SUB_NAMESPACE
+
+#endif // ASMJIT_X86_X86INSTAPI_P_H_INCLUDED
--- a/lib/lepton/asmjit/x86/x86instdb.cpp
+++ b/lib/lepton/asmjit/x86/x86instdb.cpp
--- a/lib/lepton/asmjit/x86/x86instdb.h
+++ b/lib/lepton/asmjit/x86/x86instdb.h
@ -0,0 +1,563 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See asmjit.h or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#ifndef ASMJIT_X86_X86INSTDB_H_INCLUDED
+#define ASMJIT_X86_X86INSTDB_H_INCLUDED
+
+#include "../x86/x86globals.h"
+
+ASMJIT_BEGIN_SUB_NAMESPACE(x86)
+
+//! \addtogroup asmjit_x86
+//! \{
+
+//! Instruction database (X86).
+namespace InstDB {
+
+//! Operation modes (X86 / X64) in which an instruction is supported; the values combine as bit flags.
+enum class Mode : uint8_t {
+  //! No mode - this is also what \ref modeFromArch() yields for an architecture that is neither X86 nor X64.
+  kNone = 0x00u,
+  //! Instruction is supported in X86 mode.
+  kX86 = 0x01u,
+  //! Instruction is supported in X64 mode.
+  kX64 = 0x02u,
+  //! Instruction is supported in both X86 and X64 modes (`kX86 | kX64`).
+  kAny = kX86 | kX64
+};
+ASMJIT_DEFINE_ENUM_FLAGS(Mode)
+
+//! Maps `arch` to its operation \ref Mode; yields `Mode::kNone` when `arch` is neither `Arch::kX86` nor `Arch::kX64`.
+static constexpr Mode modeFromArch(Arch arch) noexcept {
+  return arch == Arch::kX64 ? Mode::kX64
+       : arch == Arch::kX86 ? Mode::kX86
+       : Mode::kNone;
+}
+
+//! Operand signature flags used by \ref OpSignature (stored in its 56-bit `_flags` field).
+enum class OpFlags : uint64_t {
+  //! No operand flags.
+  kNone = 0u,
+
+  kRegGpbLo        = 0x0000000000000001u, //!< Operand can be low 8-bit GPB register.
+  kRegGpbHi        = 0x0000000000000002u, //!< Operand can be high 8-bit GPB register.
+  kRegGpw          = 0x0000000000000004u, //!< Operand can be 16-bit GPW register.
+  kRegGpd          = 0x0000000000000008u, //!< Operand can be 32-bit GPD register.
+  kRegGpq          = 0x0000000000000010u, //!< Operand can be 64-bit GPQ register.
+  kRegXmm          = 0x0000000000000020u, //!< Operand can be 128-bit XMM register.
+  kRegYmm          = 0x0000000000000040u, //!< Operand can be 256-bit YMM register.
+  kRegZmm          = 0x0000000000000080u, //!< Operand can be 512-bit ZMM register.
+  kRegMm           = 0x0000000000000100u, //!< Operand can be 64-bit MM register.
+  kRegKReg         = 0x0000000000000200u, //!< Operand can be 64-bit K register.
+  kRegSReg         = 0x0000000000000400u, //!< Operand can be SReg (segment register).
+  kRegCReg         = 0x0000000000000800u, //!< Operand can be CReg (control register).
+  kRegDReg         = 0x0000000000001000u, //!< Operand can be DReg (debug register).
+  kRegSt           = 0x0000000000002000u, //!< Operand can be 80-bit ST register (X87).
+  kRegBnd          = 0x0000000000004000u, //!< Operand can be 128-bit BND register.
+  kRegTmm          = 0x0000000000008000u, //!< Operand can be 0..8192-bit TMM register.
+  kRegMask         = 0x000000000000FFFFu, //!< Mask of all possible register types (bits 0..15).
+
+  kMemUnspecified  = 0x0000000000040000u, //!< Operand can be a scalar memory pointer without size.
+  kMem8            = 0x0000000000080000u, //!< Operand can be an 8-bit memory pointer.
+  kMem16           = 0x0000000000100000u, //!< Operand can be a 16-bit memory pointer.
+  kMem32           = 0x0000000000200000u, //!< Operand can be a 32-bit memory pointer.
+  kMem48           = 0x0000000000400000u, //!< Operand can be a 48-bit memory pointer (FAR pointers only).
+  kMem64           = 0x0000000000800000u, //!< Operand can be a 64-bit memory pointer.
+  kMem80           = 0x0000000001000000u, //!< Operand can be an 80-bit memory pointer.
+  kMem128          = 0x0000000002000000u, //!< Operand can be a 128-bit memory pointer.
+  kMem256          = 0x0000000004000000u, //!< Operand can be a 256-bit memory pointer.
+  kMem512          = 0x0000000008000000u, //!< Operand can be a 512-bit memory pointer.
+  kMem1024         = 0x0000000010000000u, //!< Operand can be a 1024-bit memory pointer.
+  kMemMask         = 0x000000001FFC0000u, //!< Mask of all possible scalar memory types (bits 18..28).
+
+  kVm32x           = 0x0000000040000000u, //!< Operand can be a vm32x (vector) pointer.
+  kVm32y           = 0x0000000080000000u, //!< Operand can be a vm32y (vector) pointer.
+  kVm32z           = 0x0000000100000000u, //!< Operand can be a vm32z (vector) pointer.
+  kVm64x           = 0x0000000200000000u, //!< Operand can be a vm64x (vector) pointer.
+  kVm64y           = 0x0000000400000000u, //!< Operand can be a vm64y (vector) pointer.
+  kVm64z           = 0x0000000800000000u, //!< Operand can be a vm64z (vector) pointer.
+  kVmMask          = 0x0000000FC0000000u, //!< Mask of all possible vector memory types (bits 30..35).
+
+  kImmI4           = 0x0000001000000000u, //!< Operand can be signed 4-bit immediate.
+  kImmU4           = 0x0000002000000000u, //!< Operand can be unsigned 4-bit immediate.
+  kImmI8           = 0x0000004000000000u, //!< Operand can be signed 8-bit immediate.
+  kImmU8           = 0x0000008000000000u, //!< Operand can be unsigned 8-bit immediate.
+  kImmI16          = 0x0000010000000000u, //!< Operand can be signed 16-bit immediate.
+  kImmU16          = 0x0000020000000000u, //!< Operand can be unsigned 16-bit immediate.
+  kImmI32          = 0x0000040000000000u, //!< Operand can be signed 32-bit immediate.
+  kImmU32          = 0x0000080000000000u, //!< Operand can be unsigned 32-bit immediate.
+  kImmI64          = 0x0000100000000000u, //!< Operand can be signed 64-bit immediate.
+  kImmU64          = 0x0000200000000000u, //!< Operand can be unsigned 64-bit immediate.
+  kImmMask         = 0x00003FF000000000u, //!< Mask of all immediate types (bits 36..45).
+
+  kRel8            = 0x0000400000000000u, //!< Operand can be relative 8-bit displacement.
+  kRel32           = 0x0000800000000000u, //!< Operand can be relative 32-bit displacement.
+  kRelMask         = 0x0000C00000000000u, //!< Mask of all relative displacement types (bits 46..47).
+
+  kFlagMemBase     = 0x0001000000000000u, //!< Flag: Only memory base is allowed (no index, no offset).
+  kFlagMemDs       = 0x0002000000000000u, //!< Flag: Implicit memory operand's DS segment.
+  kFlagMemEs       = 0x0004000000000000u, //!< Flag: Implicit memory operand's ES segment.
+
+  kFlagMib         = 0x0008000000000000u, //!< Flag: Operand is MIB (base+index) pointer.
+  kFlagTMem        = 0x0010000000000000u, //!< Flag: Operand is TMEM (sib_mem), AMX memory pointer.
+
+  kFlagImplicit    = 0x0080000000000000u, //!< Flag: Operand is implicit.
+  kFlagMask        = 0x009F000000000000u, //!< Mask of all flags (bits 48..52 and bit 55).
+
+  //! Contains mask of all registers, memory operands, immediate operands, and displacement operands (no `kFlag...` bits).
+  kOpMask          = kRegMask | kMemMask | kVmMask | kImmMask | kRelMask
+};
+ASMJIT_DEFINE_ENUM_FLAGS(OpFlags)
+
+//! Operand signature.
+//!
+//! Packs the set of operand kinds accepted at a single operand position (registers, memory sizes, immediates,
+//! displacements, and extra flags, see \ref OpFlags) together with an 8-bit physical register mask.
+struct OpSignature {
+  //! \name Members
+  //! \{
+
+  //! Operand flags, see \ref OpFlags (56 bits).
+  uint64_t _flags : 56;
+  //! Physical register mask (8 bits).
+  uint64_t _regMask : 8;
+
+  //! \}
+
+  //! \name Accessors
+  //! \{
+
+  //! Returns all operand flags of this signature.
+  inline OpFlags flags() const noexcept { return static_cast<OpFlags>(_flags); }
+
+  //! Tests whether at least one bit of `flag` is set in this signature.
+  inline bool hasFlag(OpFlags flag) const noexcept {
+    const uint64_t wanted = static_cast<uint64_t>(flag);
+    return (_flags & wanted) != 0;
+  }
+
+  //! Tests whether any register type is accepted.
+  inline bool hasReg() const noexcept { return hasFlag(OpFlags::kRegMask); }
+  //! Tests whether any scalar memory type is accepted.
+  inline bool hasMem() const noexcept { return hasFlag(OpFlags::kMemMask); }
+  //! Tests whether any vector memory (vm) type is accepted.
+  inline bool hasVm() const noexcept { return hasFlag(OpFlags::kVmMask); }
+  //! Tests whether any immediate type is accepted.
+  inline bool hasImm() const noexcept { return hasFlag(OpFlags::kImmMask); }
+  //! Tests whether any relative displacement type is accepted.
+  inline bool hasRel() const noexcept { return hasFlag(OpFlags::kRelMask); }
+
+  //! Tests whether the operand is implicit (`OpFlags::kFlagImplicit` is set).
+  inline bool isImplicit() const noexcept { return hasFlag(OpFlags::kFlagImplicit); }
+
+  //! Returns the physical register mask.
+  inline RegMask regMask() const noexcept { return static_cast<RegMask>(_regMask); }
+
+  //! \}
+};
+
+ASMJIT_VARAPI const OpSignature _opSignatureTable[]; //!< Operand signature table, indexed by the values of `InstSignature::_opSignatureIndexes`.
+
+//! Instruction signature.
+//!
+//! Describes one accepted combination of operands of an instruction together with the operation modes in which
+//! that combination is valid. The instruction validator consumes this data.
+struct InstSignature {
+  //! \name Members
+  //! \{
+
+  //! Number of operands described by `_opSignatureIndexes` (0..6).
+  uint8_t _opCount : 3;
+  //! Supported operation modes (X86 / X64), see \ref Mode.
+  uint8_t _mode : 2;
+  //! Number of implicit operands.
+  uint8_t _implicitOpCount : 3;
+  //! Reserved for future use.
+  uint8_t _reserved;
+  //! Indexes into \ref _opSignatureTable, one per operand slot.
+  uint8_t _opSignatureIndexes[Globals::kMaxOpCount];
+
+  //! \}
+
+  //! \name Accessors
+  //! \{
+
+  //! Returns the operation modes supported by this signature.
+  inline Mode mode() const noexcept { return static_cast<Mode>(_mode); }
+
+  //! Tests whether at least one of the modes in `mode` is supported by this signature.
+  inline bool supportsMode(Mode mode) const noexcept {
+    const uint32_t supported = static_cast<uint32_t>(_mode);
+    const uint32_t requested = static_cast<uint32_t>(mode);
+    return (supported & requested) != 0u;
+  }
+
+  //! Returns the operand count.
+  inline uint32_t opCount() const noexcept { return _opCount; }
+  //! Returns the implicit operand count.
+  inline uint32_t implicitOpCount() const noexcept { return _implicitOpCount; }
+  //! Tests whether the signature has one or more implicit operands.
+  inline bool hasImplicitOperands() const noexcept { return implicitOpCount() > 0u; }
+
+  //! Returns the array of indexes to \ref _opSignatureTable, one entry per operand slot.
+  //!
+  //! \note The array always has \ref Globals::kMaxOpCount entries, even when the instruction has fewer operands.
+  //! Slots of undefined operands always hold index zero.
+  inline const uint8_t* opSignatureIndexes() const noexcept { return _opSignatureIndexes; }
+
+  //! Returns the index to \ref _opSignatureTable stored for operand slot `index`.
+  inline uint8_t opSignatureIndex(size_t index) const noexcept {
+    ASMJIT_ASSERT(index < Globals::kMaxOpCount);
+    return _opSignatureIndexes[index];
+  }
+
+  //! Returns the \ref OpSignature that describes operand slot `index`.
+  inline const OpSignature& opSignature(size_t index) const noexcept {
+    // The slot is range-checked by `opSignatureIndex()`.
+    const uint8_t tableIndex = opSignatureIndex(index);
+    return _opSignatureTable[tableIndex];
+  }
+
+  //! \}
+};
+
+ASMJIT_VARAPI const InstSignature _instSignatureTable[]; //!< Instruction signature table, addressed through `CommonInfo::signatureData()` / `CommonInfo::signatureEnd()`.
+
+//! Instruction flags.
+//!
+//! Details about instruction encoding, operation, features, and some limitations.
+enum class InstFlags : uint32_t {
+  //! No flags.
+  kNone = 0x00000000u,
+
+  // Instruction Family
+  // ------------------
+  //
+  // Instruction family information.
+
+  //! Instruction that accesses FPU registers.
+  kFpu = 0x00000100u,
+  //! Instruction that accesses MMX registers (including 3DNOW and GEODE) and EMMS.
+  kMmx = 0x00000200u,
+  //! Instruction that accesses XMM registers (SSE, AVX, AVX512).
+  kVec = 0x00000400u,
+
+  // FPU Flags
+  // ---------
+  //
+  // Used to tell the encoder which memory operand sizes are encodable.
+
+  //! FPU instruction can address `word_ptr` (same bit as \ref kFpuM80).
+  kFpuM16 = 0x00000800u,
+  //! FPU instruction can address `dword_ptr`.
+  kFpuM32 = 0x00001000u,
+  //! FPU instruction can address `qword_ptr`.
+  kFpuM64 = 0x00002000u,
+  //! FPU instruction can address `tword_ptr` (same bit as \ref kFpuM16).
+  kFpuM80 = 0x00000800u,
+
+  // Prefixes and Encoding Flags
+  // ---------------------------
+  //
+  // These describe optional X86 prefixes that can be used to change the instruction's operation.
+
+  //! Instruction can be prefixed with the REP (REPE) or REPNE prefix.
+  kRep = 0x00004000u,
+  //! Rep prefix is accepted, but it has no effect other than being emitted with the instruction (as an extra byte).
+  kRepIgnored = 0x00008000u,
+  //! Instruction can be prefixed with the LOCK prefix.
+  kLock = 0x00010000u,
+  //! Instruction can be prefixed with the XACQUIRE prefix.
+  kXAcquire = 0x00020000u,
+  //! Instruction can be prefixed with the XRELEASE prefix.
+  kXRelease = 0x00040000u,
+  //! Instruction uses MIB (BNDLDX|BNDSTX) to encode two registers.
+  kMib = 0x00080000u,
+  //! Instruction uses VSIB instead of legacy SIB.
+  kVsib = 0x00100000u,
+  //! Instruction uses TSIB (or SIB_MEM) encoding (MODRM followed by SIB).
+  kTsib = 0x00200000u,
+
+  // If both `kVex` and `kEvex` flags are specified it means that the instruction can be encoded by either VEX or
+  // EVEX prefix. In that case AsmJit checks global options and also instruction options to decide whether to emit
+  // VEX or EVEX prefix.
+
+  //! Instruction can be encoded by VEX|XOP (AVX|AVX2|BMI|XOP|...).
+  kVex = 0x00400000u,
+  //! Instruction can be encoded by EVEX (AVX512).
+  kEvex = 0x00800000u,
+  //! EVEX encoding is preferred over VEX encoding (AVX512_VNNI vs AVX_VNNI).
+  kPreferEvex = 0x01000000u,
+  //! EVEX and VEX signatures are compatible.
+  kEvexCompat = 0x02000000u,
+  //! EVEX instruction requires K register in the first operand (compare instructions).
+  kEvexKReg = 0x04000000u,
+  //! EVEX instruction requires two operands and K register as a selector (gather instructions).
+  kEvexTwoOp = 0x08000000u,
+  //! VEX instruction that can be transformed to a compatible EVEX instruction.
+  kEvexTransformable = 0x10000000u,
+
+  // Other Flags
+  // -----------
+
+  //! Instruction uses consecutive registers.
+  //!
+  //! Used by V4FMADDPS, V4FMADDSS, V4FNMADDPS, V4FNMADDSS, VP4DPWSSD, VP4DPWSSDS, VP2INTERSECTD, and VP2INTERSECTQ
+  //! instructions.
+  kConsecutiveRegs = 0x20000000u
+};
+ASMJIT_DEFINE_ENUM_FLAGS(InstFlags)
+
+//! AVX-512 flags.
+enum class Avx512Flags : uint32_t {
+  //! No AVX-512 flags.
+  kNone = 0,
+
+  //! Internally used in tables, has no meaning (same value as \ref kNone).
+  k_ = 0x00000000u,
+  //! Supports masking {k1..k7}.
+  kK = 0x00000001u,
+  //! Supports zeroing {z}, must be used together with \ref kK.
+  kZ = 0x00000002u,
+  //! Supports 'embedded-rounding' {er} with implicit {sae}.
+  kER = 0x00000004u,
+  //! Supports 'suppress-all-exceptions' {sae}.
+  kSAE = 0x00000008u,
+  //! Supports 16-bit broadcast 'b16'.
+  kB16 = 0x00000010u,
+  //! Supports 32-bit broadcast 'b32'.
+  kB32 = 0x00000020u,
+  //! Supports 64-bit broadcast 'b64'.
+  kB64 = 0x00000040u,
+  //! Operates on a vector of consecutive registers (AVX512_4FMAPS and AVX512_4VNNIW).
+  kT4X = 0x00000080u,
+
+  //! Implicit zeroing if {k} masking is used. Using {z} is not valid in this case as it's implicit.
+  kImplicitZ = 0x00000100,
+};
+ASMJIT_DEFINE_ENUM_FLAGS(Avx512Flags)
+
+//! Instruction common information.
+//!
+//! A record of flags, signature range, and hints that can be shared by one or more instructions, see
+//! \ref InstInfo::commonInfo().
+struct CommonInfo {
+  //! Instruction flags, see \ref InstFlags.
+  uint32_t _flags;
+  //! AVX-512 flags, see \ref Avx512Flags.
+  uint32_t _avx512Flags : 11;
+  //! Index of the first `InstSignature` entry in \ref _instSignatureTable.
+  uint32_t _iSignatureIndex : 11;
+  //! Number of consecutive `InstSignature` entries that start at `_iSignatureIndex`.
+  uint32_t _iSignatureCount : 5;
+  //! Control flow category of the instruction, see \ref InstControlFlow.
+  uint32_t _controlFlow : 3;
+  //! Hint that describes what happens if all source operands share the same register.
+  uint32_t _sameRegHint : 2;
+
+  //! \name Accessors
+  //! \{
+
+  //! Returns all instruction flags.
+  inline InstFlags flags() const noexcept { return static_cast<InstFlags>(_flags); }
+  //! Tests whether at least one bit of `flag` is set in the instruction flags.
+  inline bool hasFlag(InstFlags flag) const noexcept { return Support::test(_flags, flag); }
+
+  //! Returns all AVX-512 flags.
+  inline Avx512Flags avx512Flags() const noexcept { return static_cast<Avx512Flags>(_avx512Flags); }
+  //! Tests whether at least one bit of `flag` is set in the AVX-512 flags.
+  inline bool hasAvx512Flag(Avx512Flags flag) const noexcept { return Support::test(_avx512Flags, flag); }
+
+  //! Tests whether this is an FPU instruction.
+  inline bool isFpu() const noexcept { return hasFlag(InstFlags::kFpu); }
+  //! Tests whether this is an MMX/3DNOW instruction that accesses MMX registers (includes EMMS and FEMMS).
+  inline bool isMmx() const noexcept { return hasFlag(InstFlags::kMmx); }
+  //! Tests whether this is an SSE|AVX|AVX512 instruction that accesses XMM|YMM|ZMM registers.
+  inline bool isVec() const noexcept { return hasFlag(InstFlags::kVec); }
+  //! Tests whether this is an SSE+ (SSE4.2, AES, SHA included) instruction that accesses XMM registers, which
+  //! means a vector instruction that is encodable by neither VEX nor EVEX.
+  inline bool isSse() const noexcept {
+    const InstFlags encodingMask = InstFlags::kVec | InstFlags::kVex | InstFlags::kEvex;
+    return (flags() & encodingMask) == InstFlags::kVec;
+  }
+  //! Tests whether this is an AVX+ (FMA included) instruction that accesses XMM|YMM|ZMM registers.
+  inline bool isAvx() const noexcept { return isVec() && isVexOrEvex(); }
+
+  //! Tests whether the LOCK prefix can be used with the instruction.
+  inline bool hasLockPrefix() const noexcept { return hasFlag(InstFlags::kLock); }
+  //! Tests whether the REP (REPE|REPZ) prefix can be used with the instruction.
+  inline bool hasRepPrefix() const noexcept { return hasFlag(InstFlags::kRep); }
+  //! Tests whether the XACQUIRE prefix can be used with the instruction.
+  inline bool hasXAcquirePrefix() const noexcept { return hasFlag(InstFlags::kXAcquire); }
+  //! Tests whether the XRELEASE prefix can be used with the instruction.
+  inline bool hasXReleasePrefix() const noexcept { return hasFlag(InstFlags::kXRelease); }
+
+  //! Tests whether the rep prefix is accepted by the instruction, but ignored (has no effect).
+  inline bool isRepIgnored() const noexcept { return hasFlag(InstFlags::kRepIgnored); }
+  //! Tests whether the instruction uses MIB.
+  inline bool isMibOp() const noexcept { return hasFlag(InstFlags::kMib); }
+  //! Tests whether the instruction uses VSIB.
+  inline bool isVsibOp() const noexcept { return hasFlag(InstFlags::kVsib); }
+  //! Tests whether the instruction uses TSIB (AMX, instruction requires MOD+SIB).
+  inline bool isTsibOp() const noexcept { return hasFlag(InstFlags::kTsib); }
+  //! Tests whether the instruction uses VEX (can be set together with EVEX if both are encodable).
+  inline bool isVex() const noexcept { return hasFlag(InstFlags::kVex); }
+  //! Tests whether the instruction uses EVEX (can be set together with VEX if both are encodable).
+  inline bool isEvex() const noexcept { return hasFlag(InstFlags::kEvex); }
+  //! Tests whether the instruction uses VEX or EVEX (true if at least one of them is encodable).
+  inline bool isVexOrEvex() const noexcept { return hasFlag(InstFlags::kVex | InstFlags::kEvex); }
+
+  //! Tests whether EVEX prefix should be preferred over VEX prefix.
+  inline bool preferEvex() const noexcept { return hasFlag(InstFlags::kPreferEvex); }
+
+  //! Tests whether `InstFlags::kEvexCompat` is set.
+  inline bool isEvexCompatible() const noexcept { return hasFlag(InstFlags::kEvexCompat); }
+  //! Tests whether `InstFlags::kEvexKReg` is set.
+  inline bool isEvexKRegOnly() const noexcept { return hasFlag(InstFlags::kEvexKReg); }
+  //! Tests whether `InstFlags::kEvexTwoOp` is set.
+  inline bool isEvexTwoOpOnly() const noexcept { return hasFlag(InstFlags::kEvexTwoOp); }
+  //! Tests whether `InstFlags::kEvexTransformable` is set.
+  inline bool isEvexTransformable() const noexcept { return hasFlag(InstFlags::kEvexTransformable); }
+
+  //! Tests whether AVX512 masking {k} is supported.
+  inline bool hasAvx512K() const noexcept { return hasAvx512Flag(Avx512Flags::kK); }
+  //! Tests whether AVX512 zeroing {k}{z} is supported.
+  inline bool hasAvx512Z() const noexcept { return hasAvx512Flag(Avx512Flags::kZ); }
+  //! Tests whether AVX512 embedded-rounding {er} is supported.
+  inline bool hasAvx512ER() const noexcept { return hasAvx512Flag(Avx512Flags::kER); }
+  //! Tests whether AVX512 suppress-all-exceptions {sae} is supported.
+  inline bool hasAvx512SAE() const noexcept { return hasAvx512Flag(Avx512Flags::kSAE); }
+  //! Tests whether any AVX512 broadcast (16-bit, 32-bit, or 64-bit) is supported.
+  inline bool hasAvx512B() const noexcept { return hasAvx512Flag(Avx512Flags::kB16 | Avx512Flags::kB32 | Avx512Flags::kB64); }
+  //! Tests whether AVX512 broadcast (16-bit) is supported.
+  inline bool hasAvx512B16() const noexcept { return hasAvx512Flag(Avx512Flags::kB16); }
+  //! Tests whether AVX512 broadcast (32-bit) is supported.
+  inline bool hasAvx512B32() const noexcept { return hasAvx512Flag(Avx512Flags::kB32); }
+  //! Tests whether AVX512 broadcast (64-bit) is supported.
+  inline bool hasAvx512B64() const noexcept { return hasAvx512Flag(Avx512Flags::kB64); }
+
+  //! Returns the size of the broadcast - 2, 4, or 8 when exactly one of `kB16`, `kB32`, or `kB64` is set, or 0 if
+  //! broadcast is not supported.
+  inline uint32_t broadcastSize() const noexcept {
+    constexpr uint32_t kShift = Support::ConstCTZ<uint32_t(Avx512Flags::kB16)>::value;
+    const uint32_t broadcastBits = uint32_t(Avx512Flags::kB16 | Avx512Flags::kB32 | Avx512Flags::kB64);
+    // `kB16` sits at bit `kShift`; moving it down to bit 1 turns the three flags into the values 2, 4, and 8.
+    return (uint32_t(_avx512Flags) & broadcastBits) >> (kShift - 1);
+  }
+
+  //! Returns the index of the first signature in \ref _instSignatureTable.
+  inline uint32_t signatureIndex() const noexcept { return _iSignatureIndex; }
+  //! Returns the number of signatures.
+  inline uint32_t signatureCount() const noexcept { return _iSignatureCount; }
+
+  //! Returns a pointer to the first signature in \ref _instSignatureTable.
+  inline const InstSignature* signatureData() const noexcept { return _instSignatureTable + _iSignatureIndex; }
+  //! Returns a pointer one past the last signature.
+  inline const InstSignature* signatureEnd() const noexcept { return signatureData() + _iSignatureCount; }
+
+  //! Returns the control flow category of the instruction.
+  inline InstControlFlow controlFlow() const noexcept { return static_cast<InstControlFlow>(_controlFlow); }
+
+  //! Returns the hint that can be used when both inputs are the same register.
+  inline InstSameRegHint sameRegHint() const noexcept { return static_cast<InstSameRegHint>(_sameRegHint); }
+
+  //! \}
+};
+
+ASMJIT_VARAPI const CommonInfo _commonInfoTable[]; //!< Common information table, indexed by `InstInfo::_commonInfoIndex`.
+
+//! Instruction information.
+//!
+//! Per-instruction record. Data that can be shared by multiple instructions lives in \ref CommonInfo and is reached
+//! through \ref commonInfo(); most accessors below simply forward to it.
+struct InstInfo {
+  //! Index to \ref _nameData.
+  uint32_t _nameDataIndex : 14;
+  //! Index to \ref _commonInfoTable.
+  uint32_t _commonInfoIndex : 10;
+  //! Index to \ref _additionalInfoTable.
+  uint32_t _additionalInfoIndex : 8;
+
+  //! Instruction encoding (internal encoding identifier used by \ref Assembler).
+  uint8_t _encoding;
+  //! Main opcode value (0..255).
+  uint8_t _mainOpcodeValue;
+  //! Index to \ref _mainOpcodeTable that is combined with \ref _mainOpcodeValue to form the final opcode.
+  uint8_t _mainOpcodeIndex;
+  //! Index to \ref _altOpcodeTable that contains a full alternative opcode.
+  uint8_t _altOpcodeIndex;
+
+  //! \name Accessors
+  //! \{
+
+  //! Returns common information, see \ref CommonInfo.
+  inline const CommonInfo& commonInfo() const noexcept { return _commonInfoTable[_commonInfoIndex]; }
+
+  //! Returns instruction flags, see \ref InstFlags.
+  inline InstFlags flags() const noexcept { return commonInfo().flags(); }
+  //! Tests whether the instruction has flag `flag`, see \ref InstFlags.
+  inline bool hasFlag(InstFlags flag) const noexcept { return commonInfo().hasFlag(flag); }
+
+  //! Returns instruction AVX-512 flags, see \ref Avx512Flags.
+  inline Avx512Flags avx512Flags() const noexcept { return commonInfo().avx512Flags(); }
+  //! Tests whether the instruction has an AVX-512 `flag`, see \ref Avx512Flags.
+  inline bool hasAvx512Flag(Avx512Flags flag) const noexcept { return commonInfo().hasAvx512Flag(flag); }
+
+  //! Tests whether the instruction is FPU instruction.
+  inline bool isFpu() const noexcept { return commonInfo().isFpu(); }
+  //! Tests whether the instruction is MMX/3DNOW instruction that accesses MMX registers (includes EMMS and FEMMS).
+  inline bool isMmx() const noexcept { return commonInfo().isMmx(); }
+  //! Tests whether the instruction is SSE|AVX|AVX512 instruction that accesses XMM|YMM|ZMM registers.
+  inline bool isVec() const noexcept { return commonInfo().isVec(); }
+  //! Tests whether the instruction is SSE+ (SSE4.2, AES, SHA included) instruction that accesses XMM registers.
+  inline bool isSse() const noexcept { return commonInfo().isSse(); }
+  //! Tests whether the instruction is AVX+ (FMA included) instruction that accesses XMM|YMM|ZMM registers.
+  inline bool isAvx() const noexcept { return commonInfo().isAvx(); }
+
+  //! Tests whether the instruction can be prefixed with LOCK prefix.
+  inline bool hasLockPrefix() const noexcept { return commonInfo().hasLockPrefix(); }
+  //! Tests whether the instruction can be prefixed with REP (REPE|REPZ) prefix.
+  inline bool hasRepPrefix() const noexcept { return commonInfo().hasRepPrefix(); }
+  //! Tests whether the instruction can be prefixed with XACQUIRE prefix.
+  inline bool hasXAcquirePrefix() const noexcept { return commonInfo().hasXAcquirePrefix(); }
+  //! Tests whether the instruction can be prefixed with XRELEASE prefix.
+  inline bool hasXReleasePrefix() const noexcept { return commonInfo().hasXReleasePrefix(); }
+
+  //! Tests whether the rep prefix is supported by the instruction, but ignored (has no effect).
+  inline bool isRepIgnored() const noexcept { return commonInfo().isRepIgnored(); }
+  //! Tests whether the instruction uses MIB.
+  inline bool isMibOp() const noexcept { return commonInfo().isMibOp(); }
+  //! Tests whether the instruction uses VSIB.
+  inline bool isVsibOp() const noexcept { return commonInfo().isVsibOp(); }
+  //! Tests whether the instruction uses TSIB (AMX, instruction requires MOD+SIB).
+  inline bool isTsibOp() const noexcept { return commonInfo().isTsibOp(); }
+  //! Tests whether the instruction uses VEX (can be set together with EVEX if both are encodable).
+  inline bool isVex() const noexcept { return commonInfo().isVex(); }
+  //! Tests whether the instruction uses EVEX (can be set together with VEX if both are encodable).
+  inline bool isEvex() const noexcept { return commonInfo().isEvex(); }
+  //! Tests whether the instruction uses VEX or EVEX (true if at least one of them is encodable).
+  inline bool isVexOrEvex() const noexcept { return commonInfo().isVexOrEvex(); }
+
+  //! Tests whether the instruction should prefer EVEX prefix instead of VEX prefix.
+  inline bool preferEvex() const noexcept { return commonInfo().preferEvex(); }
+
+  //! Tests whether `InstFlags::kEvexCompat` is set.
+  inline bool isEvexCompatible() const noexcept { return commonInfo().isEvexCompatible(); }
+  //! Tests whether `InstFlags::kEvexKReg` is set.
+  inline bool isEvexKRegOnly() const noexcept { return commonInfo().isEvexKRegOnly(); }
+  //! Tests whether `InstFlags::kEvexTwoOp` is set.
+  inline bool isEvexTwoOpOnly() const noexcept { return commonInfo().isEvexTwoOpOnly(); }
+  //! Tests whether `InstFlags::kEvexTransformable` is set.
+  inline bool isEvexTransformable() const noexcept { return commonInfo().isEvexTransformable(); }
+
+  //! Tests whether the instruction supports AVX512 masking {k}.
+  inline bool hasAvx512K() const noexcept { return commonInfo().hasAvx512K(); }
+  //! Tests whether the instruction supports AVX512 zeroing {k}{z}.
+  inline bool hasAvx512Z() const noexcept { return commonInfo().hasAvx512Z(); }
+  //! Tests whether the instruction supports AVX512 embedded-rounding {er}.
+  inline bool hasAvx512ER() const noexcept { return commonInfo().hasAvx512ER(); }
+  //! Tests whether the instruction supports AVX512 suppress-all-exceptions {sae}.
+  inline bool hasAvx512SAE() const noexcept { return commonInfo().hasAvx512SAE(); }
+  //! Tests whether the instruction supports any AVX512 broadcast (16-bit, 32-bit, or 64-bit).
+  inline bool hasAvx512B() const noexcept { return commonInfo().hasAvx512B(); }
+  //! Tests whether the instruction supports AVX512 broadcast (16-bit).
+  inline bool hasAvx512B16() const noexcept { return commonInfo().hasAvx512B16(); }
+  //! Tests whether the instruction supports AVX512 broadcast (32-bit).
+  inline bool hasAvx512B32() const noexcept { return commonInfo().hasAvx512B32(); }
+  //! Tests whether the instruction supports AVX512 broadcast (64-bit).
+  inline bool hasAvx512B64() const noexcept { return commonInfo().hasAvx512B64(); }
+
+  //! Returns the size of the broadcast as reported by \ref CommonInfo::broadcastSize() (0 if not supported).
+  inline uint32_t broadcastSize() const noexcept { return commonInfo().broadcastSize(); }
+
+  //! Returns a control flow category of the instruction.
+  inline InstControlFlow controlFlow() const noexcept { return commonInfo().controlFlow(); }
+  //! Returns a hint that can be used when both inputs are the same register.
+  inline InstSameRegHint sameRegHint() const noexcept { return commonInfo().sameRegHint(); }
+
+  //! Returns the index of the first signature of this instruction in \ref _instSignatureTable.
+  inline uint32_t signatureIndex() const noexcept { return commonInfo().signatureIndex(); }
+  //! Returns the number of signatures of this instruction.
+  inline uint32_t signatureCount() const noexcept { return commonInfo().signatureCount(); }
+
+  //! Returns a pointer to the first signature of this instruction.
+  inline const InstSignature* signatureData() const noexcept { return commonInfo().signatureData(); }
+  //! Returns a pointer one past the last signature of this instruction.
+  inline const InstSignature* signatureEnd() const noexcept { return commonInfo().signatureEnd(); }
+
+  //! \}
+};
+
+ASMJIT_VARAPI const InstInfo _instInfoTable[]; //!< Instruction information table, indexed by instruction id (see `infoById()`).
+
+//! Returns the \ref InstInfo record of the instruction identified by `instId`.
+//!
+//! \note `instId` must be a defined instruction id; this is only checked by an assertion.
+static inline const InstInfo& infoById(InstId instId) noexcept {
+  ASMJIT_ASSERT(Inst::isDefinedId(instId));
+  const InstInfo* const table = _instInfoTable;
+  return table[instId];
+}
+
+//! \cond INTERNAL
+static_assert(sizeof(OpSignature) == 8, "InstDB::OpSignature must be 8 bytes long"); // `_flags` (56 bits) and `_regMask` (8 bits) must pack into one 64-bit word.
+//! \endcond
+
+} // {InstDB}
+
+//! \}
+
+ASMJIT_END_SUB_NAMESPACE
+
+#endif // ASMJIT_X86_X86INSTDB_H_INCLUDED
--- a/lib/lepton/asmjit/x86/x86instdb_p.h
+++ b/lib/lepton/asmjit/x86/x86instdb_p.h
@ -0,0 +1,311 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See asmjit.h or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#ifndef ASMJIT_X86_X86INSTDB_P_H_INCLUDED
+#define ASMJIT_X86_X86INSTDB_P_H_INCLUDED
+
+#include "../x86/x86instdb.h"
+
+ASMJIT_BEGIN_SUB_NAMESPACE(x86)
+
+//! \cond INTERNAL
+//! \addtogroup asmjit_x86
+//! \{
+
+namespace InstDB {
+
+//! Instruction encoding (X86).
+//!
+//! This is a specific identifier that is used by AsmJit to describe the way each instruction is encoded. Some
+//! encodings are special only for a single instruction as X86 instruction set contains a lot of legacy encodings,
+//! and some encodings describe a group of instructions that share common traits, like MMX, SSE, AVX, AVX512
+//! instructions, etc...
+enum EncodingId : uint32_t {
+  kEncodingNone = 0,                     //!< Never used.
+  kEncodingX86Op,                        //!< X86 [OP].
+  kEncodingX86Op_Mod11RM,                //!< X86 [OP] (opcode with ModRM byte where MOD must be 11b).
+  kEncodingX86Op_Mod11RM_I8,             //!< X86 [OP] (opcode with ModRM byte + 8-bit immediate).
+  kEncodingX86Op_xAddr,                  //!< X86 [OP] (implicit address in the first register operand).
+  kEncodingX86Op_xAX,                    //!< X86 [OP] (implicit or explicit '?AX' form).
+  kEncodingX86Op_xDX_xAX,                //!< X86 [OP] (implicit or explicit '?DX, ?AX' form).
+  kEncodingX86Op_MemZAX,                 //!< X86 [OP] (implicit or explicit '[EAX|RAX]' form).
+  kEncodingX86I_xAX,                     //!< X86 [I] (implicit or explicit '?AX' form).
+  kEncodingX86M,                         //!< X86 [M] (handles 2|4|8-bytes size).
+  kEncodingX86M_NoMemSize,               //!< X86 [M] (handles 2|4|8-bytes size, but doesn't consider memory size).
+  kEncodingX86M_NoSize,                  //!< X86 [M] (doesn't handle any size).
+  kEncodingX86M_GPB,                     //!< X86 [M] (handles single-byte size).
+  kEncodingX86M_GPB_MulDiv,              //!< X86 [M] (like GPB, handles implicit|explicit MUL|DIV|IDIV).
+  kEncodingX86M_Only,                    //!< X86 [M] (restricted to memory operand of any size).
+  kEncodingX86M_Only_EDX_EAX,            //!< X86 [M] (memory operand only, followed by implicit <edx> and <eax>).
+  kEncodingX86M_Nop,                     //!< X86 [M] (special case of NOP instruction).
+  kEncodingX86R_Native,                  //!< X86 [R] (register must be either 32-bit or 64-bit depending on arch).
+  kEncodingX86R_FromM,                   //!< X86 [R] - which specifies memory address.
+  kEncodingX86R32_EDX_EAX,               //!< X86 [R32] followed by implicit EDX and EAX.
+  kEncodingX86Rm,                        //!< X86 [RM] (doesn't handle single-byte size).
+  kEncodingX86Rm_Raw66H,                 //!< X86 [RM] (used by LZCNT, POPCNT, and TZCNT).
+  kEncodingX86Rm_NoSize,                 //!< X86 [RM] (doesn't add REX.W prefix if 64-bit reg is used).
+  kEncodingX86Mr,                        //!< X86 [MR] (doesn't handle single-byte size).
+  kEncodingX86Mr_NoSize,                 //!< X86 [MR] (doesn't handle any size).
+  kEncodingX86Arith,                     //!< X86 adc, add, and, cmp, or, sbb, sub, xor.
+  kEncodingX86Bswap,                     //!< X86 bswap.
+  kEncodingX86Bt,                        //!< X86 bt, btc, btr, bts.
+  kEncodingX86Call,                      //!< X86 call.
+  kEncodingX86Cmpxchg,                   //!< X86 [MR] cmpxchg.
+  kEncodingX86Cmpxchg8b_16b,             //!< X86 [MR] cmpxchg8b, cmpxchg16b.
+  kEncodingX86Crc,                       //!< X86 crc32.
+  kEncodingX86Enter,                     //!< X86 enter.
+  kEncodingX86Imul,                      //!< X86 imul.
+  kEncodingX86In,                        //!< X86 in.
+  kEncodingX86Ins,                       //!< X86 ins[b|w|d].
+  kEncodingX86IncDec,                    //!< X86 inc, dec.
+  kEncodingX86Int,                       //!< X86 int (interrupt).
+  kEncodingX86Jcc,                       //!< X86 jcc.
+  kEncodingX86JecxzLoop,                 //!< X86 jcxz, jecxz, jrcxz, loop, loope, loopne.
+  kEncodingX86Jmp,                       //!< X86 jmp.
+  kEncodingX86JmpRel,                    //!< X86 xbegin.
+  kEncodingX86LcallLjmp,                 //!< X86 lcall/ljmp.
+  kEncodingX86Lea,                       //!< X86 lea.
+  kEncodingX86Mov,                       //!< X86 mov (all possible cases).
+  kEncodingX86Movabs,                    //!< X86 movabs.
+  kEncodingX86MovsxMovzx,                //!< X86 movsx, movzx.
+  kEncodingX86MovntiMovdiri,             //!< X86 movnti/movdiri.
+  kEncodingX86EnqcmdMovdir64b,           //!< X86 enqcmd/enqcmds/movdir64b.
+  kEncodingX86Out,                       //!< X86 out.
+  kEncodingX86Outs,                      //!< X86 outs[b|w|d].
+  kEncodingX86Push,                      //!< X86 push.
+  kEncodingX86Pop,                       //!< X86 pop.
+  kEncodingX86Ret,                       //!< X86 ret.
+  kEncodingX86Rot,                       //!< X86 rcl, rcr, rol, ror, sal, sar, shl, shr.
+  kEncodingX86Set,                       //!< X86 setcc.
+  kEncodingX86ShldShrd,                  //!< X86 shld, shrd.
+  kEncodingX86StrRm,                     //!< X86 lods.
+  kEncodingX86StrMr,                     //!< X86 scas, stos.
+  kEncodingX86StrMm,                     //!< X86 cmps, movs.
+  kEncodingX86Test,                      //!< X86 test.
+  kEncodingX86Xadd,                      //!< X86 xadd.
+  kEncodingX86Xchg,                      //!< X86 xchg.
+  kEncodingX86Fence,                     //!< X86 lfence, mfence, sfence.
+  kEncodingX86Bndmov,                    //!< X86 [RM|MR] (used by BNDMOV).
+  kEncodingFpuOp,                        //!< FPU [OP].
+  kEncodingFpuArith,                     //!< FPU fadd, fdiv, fdivr, fmul, fsub, fsubr.
+  kEncodingFpuCom,                       //!< FPU fcom, fcomp.
+  kEncodingFpuFldFst,                    //!< FPU fld, fst, fstp.
+  kEncodingFpuM,                         //!< FPU fiadd, ficom, ficomp, fidiv, fidivr, fild, fimul, fist, fistp, fisttp, fisub, fisubr.
+  kEncodingFpuR,                         //!< FPU fcmov, fcomi, fcomip, ffree, fucom, fucomi, fucomip, fucomp, fxch.
+  kEncodingFpuRDef,                      //!< FPU faddp, fdivp, fdivrp, fmulp, fsubp, fsubrp.
+  kEncodingFpuStsw,                      //!< FPU fnstsw, fstsw.
+  kEncodingExtRm,                        //!< EXT [RM].
+  kEncodingExtRm_XMM0,                   //!< EXT [RM<XMM0>].
+  kEncodingExtRm_ZDI,                    //!< EXT [RM<ZDI>].
+  kEncodingExtRm_P,                      //!< EXT [RM] (propagates 66H if the instruction uses XMM register).
+  kEncodingExtRm_Wx,                     //!< EXT [RM] (propagates REX.W if GPQ is used or the second operand is GPQ/QWORD_PTR).
+  kEncodingExtRm_Wx_GpqOnly,             //!< EXT [RM] (propagates REX.W if the first operand is GPQ register).
+  kEncodingExtRmRi,                      //!< EXT [RM|RI].
+  kEncodingExtRmRi_P,                    //!< EXT [RM|RI] (propagates 66H if the instruction uses XMM register).
+  kEncodingExtRmi,                       //!< EXT [RMI].
+  kEncodingExtRmi_P,                     //!< EXT [RMI] (propagates 66H if the instruction uses XMM register).
+  kEncodingExtPextrw,                    //!< EXT pextrw.
+  kEncodingExtExtract,                   //!< EXT pextrb, pextrd, pextrq, extractps.
+  kEncodingExtMov,                       //!< EXT mov?? - #1:[MM|XMM, MM|XMM|Mem] #2:[MM|XMM|Mem, MM|XMM].
+  kEncodingExtMovbe,                     //!< EXT movbe.
+  kEncodingExtMovd,                      //!< EXT movd.
+  kEncodingExtMovq,                      //!< EXT movq.
+  kEncodingExtExtrq,                     //!< EXT extrq (SSE4A).
+  kEncodingExtInsertq,                   //!< EXT insertq (SSE4A).
+  kEncodingExt3dNow,                     //!< EXT [RMI] (3DNOW specific).
+  kEncodingVexOp,                        //!< VEX [OP].
+  kEncodingVexOpMod,                     //!< VEX [OP] with MODR/M.
+  kEncodingVexKmov,                      //!< VEX [RM|MR] (used by kmov[b|w|d|q]).
+  kEncodingVexR_Wx,                      //!< VEX|EVEX [R] (propagates VEX.W if GPQ used).
+  kEncodingVexM,                         //!< VEX|EVEX [M].
+  kEncodingVexM_VM,                      //!< VEX|EVEX [M] (propagates VEX|EVEX.L, VSIB support).
+  kEncodingVexMr_Lx,                     //!< VEX|EVEX [MR] (propagates VEX|EVEX.L if YMM used).
+  kEncodingVexMr_VM,                     //!< VEX|EVEX [MR] (VSIB support).
+  kEncodingVexMri,                       //!< VEX|EVEX [MRI].
+  kEncodingVexMri_Lx,                    //!< VEX|EVEX [MRI] (propagates VEX|EVEX.L if YMM used).
+  kEncodingVexMri_Vpextrw,               //!< VEX|EVEX [MRI] (special case required by VPEXTRW instruction).
+  kEncodingVexRm,                        //!< VEX|EVEX [RM].
+  kEncodingVexRm_ZDI,                    //!< VEX|EVEX [RM<ZDI>].
+  kEncodingVexRm_Wx,                     //!< VEX|EVEX [RM] (propagates VEX|EVEX.W if GPQ used).
+  kEncodingVexRm_Lx,                     //!< VEX|EVEX [RM] (propagates VEX|EVEX.L if YMM used).
+  kEncodingVexRm_Lx_Narrow,              //!< VEX|EVEX [RM] (the destination vector size is narrowed).
+  kEncodingVexRm_Lx_Bcst,                //!< VEX|EVEX [RM] (can handle broadcast r32/r64).
+  kEncodingVexRm_VM,                     //!< VEX|EVEX [RM] (propagates VEX|EVEX.L, VSIB support).
+  kEncodingVexRm_T1_4X,                  //!<     EVEX [RM] (used by NN instructions that use RM-T1_4X encoding).
+  kEncodingVexRmi,                       //!< VEX|EVEX [RMI].
+  kEncodingVexRmi_Wx,                    //!< VEX|EVEX [RMI] (propagates VEX|EVEX.W if GPQ used).
+  kEncodingVexRmi_Lx,                    //!< VEX|EVEX [RMI] (propagates VEX|EVEX.L if YMM used).
+  kEncodingVexRvm,                       //!< VEX|EVEX [RVM].
+  kEncodingVexRvm_Wx,                    //!< VEX|EVEX [RVM] (propagates VEX|EVEX.W if GPQ used).
+  kEncodingVexRvm_ZDX_Wx,                //!< VEX|EVEX [RVM<ZDX>] (propagates VEX|EVEX.W if GPQ used).
+  kEncodingVexRvm_Lx,                    //!< VEX|EVEX [RVM] (propagates VEX|EVEX.L if YMM used).
+  kEncodingVexRvm_Lx_KEvex,              //!< VEX|EVEX [RVM] (forces EVEX prefix if K register is used on destination).
+  kEncodingVexRvm_Lx_2xK,                //!< VEX|EVEX [RVM] (vp2intersectd/vp2intersectq).
+  kEncodingVexRvmr,                      //!< VEX|EVEX [RVMR].
+  kEncodingVexRvmr_Lx,                   //!< VEX|EVEX [RVMR] (propagates VEX|EVEX.L if YMM used).
+  kEncodingVexRvmi,                      //!< VEX|EVEX [RVMI].
+  kEncodingVexRvmi_KEvex,                //!< VEX|EVEX [RVMI] (forces EVEX prefix if K register is used on destination).
+  kEncodingVexRvmi_Lx,                   //!< VEX|EVEX [RVMI] (propagates VEX|EVEX.L if YMM used).
+  kEncodingVexRvmi_Lx_KEvex,             //!< VEX|EVEX [RVMI] (forces EVEX prefix if K register is used on destination).
+  kEncodingVexRmv,                       //!< VEX|EVEX [RMV].
+  kEncodingVexRmv_Wx,                    //!< VEX|EVEX [RMV] (propagates VEX|EVEX.W if GPQ used).
+  kEncodingVexRmv_VM,                    //!< VEX|EVEX [RMV] (propagates VEX|EVEX.L, VSIB support).
+  kEncodingVexRmvRm_VM,                  //!< VEX|EVEX [RMV|RM] (propagates VEX|EVEX.L, VSIB support).
+  kEncodingVexRmvi,                      //!< VEX|EVEX [RMVI].
+  kEncodingVexRmMr,                      //!< VEX|EVEX [RM|MR].
+  kEncodingVexRmMr_Lx,                   //!< VEX|EVEX [RM|MR] (propagates VEX|EVEX.L if YMM used).
+  kEncodingVexRvmRmv,                    //!< VEX|EVEX [RVM|RMV].
+  kEncodingVexRvmRmi,                    //!< VEX|EVEX [RVM|RMI].
+  kEncodingVexRvmRmi_Lx,                 //!< VEX|EVEX [RVM|RMI] (propagates VEX|EVEX.L if YMM used).
+  kEncodingVexRvmRmvRmi,                 //!< VEX|EVEX [RVM|RMV|RMI].
+  kEncodingVexRvmMr,                     //!< VEX|EVEX [RVM|MR].
+  kEncodingVexRvmMvr,                    //!< VEX|EVEX [RVM|MVR].
+  kEncodingVexRvmMvr_Lx,                 //!< VEX|EVEX [RVM|MVR] (propagates VEX|EVEX.L if YMM used).
+  kEncodingVexRvmVmi,                    //!< VEX|EVEX [RVM|VMI].
+  kEncodingVexRvmVmi_Lx,                 //!< VEX|EVEX [RVM|VMI] (propagates VEX|EVEX.L if YMM used).
+  kEncodingVexRvmVmi_Lx_MEvex,           //!< VEX|EVEX [RVM|VMI] (propagates EVEX if the second operand is memory).
+  kEncodingVexVm,                        //!< VEX|EVEX [VM].
+  kEncodingVexVm_Wx,                     //!< VEX|EVEX [VM] (propagates VEX|EVEX.W if GPQ used).
+  kEncodingVexVmi,                       //!< VEX|EVEX [VMI].
+  kEncodingVexVmi_Lx,                    //!< VEX|EVEX [VMI] (propagates VEX|EVEX.L if YMM used).
+  kEncodingVexVmi4_Wx,                   //!< VEX|EVEX [VMI] (propagates VEX|EVEX.W if GPQ used, DWORD Immediate).
+  kEncodingVexVmi_Lx_MEvex,              //!< VEX|EVEX [VMI] (force EVEX prefix when the second operand is memory).
+  kEncodingVexRvrmRvmr,                  //!< VEX|EVEX [RVRM|RVMR].
+  kEncodingVexRvrmRvmr_Lx,               //!< VEX|EVEX [RVRM|RVMR] (propagates VEX|EVEX.L if YMM used).
+  kEncodingVexRvrmiRvmri_Lx,             //!< VEX|EVEX [RVRMI|RVMRI] (propagates VEX|EVEX.L if YMM used).
+  kEncodingVexMovdMovq,                  //!< VEX|EVEX vmovd, vmovq.
+  kEncodingVexMovssMovsd,                //!< VEX|EVEX vmovss, vmovsd.
+  kEncodingFma4,                         //!< FMA4 [R, R, R/M, R/M].
+  kEncodingFma4_Lx,                      //!< FMA4 [R, R, R/M, R/M] (propagates AVX.L if YMM used).
+  kEncodingAmxCfg,                       //!< AMX ldtilecfg/sttilecfg.
+  kEncodingAmxR,                         //!< AMX [R] - tilezero.
+  kEncodingAmxRm,                        //!< AMX tileloadd/tileloaddt1.
+  kEncodingAmxMr,                        //!< AMX tilestored.
+  kEncodingAmxRmv,                       //!< AMX instructions that use TMM registers.
+  kEncodingCount                         //!< Count of instruction encodings.
+};
+
+//! Entry of the additional information table - describes CPU extensions (features) required to execute an
+//! instruction and refers to its RW flags.
+struct AdditionalInfo {
+  //! Index to `_instFlagsTable`.
+  uint8_t _instFlagsIndex;
+  //! Index to `_rwFlagsTable`.
+  //!
+  //! NOTE(review): the only similarly named table declared in this header is `_rwFlagsInfoTable` - confirm
+  //! which table this index refers to.
+  uint8_t _rwFlagsIndex;
+  //! Features vector.
+  uint8_t _features[6];
+
+  //! Returns a pointer to the first element of the features vector.
+  inline const uint8_t* featuresBegin() const noexcept { return &_features[0]; }
+  //! Returns the end of the features vector (its begin advanced by `ASMJIT_ARRAY_SIZE(_features)`).
+  inline const uint8_t* featuresEnd() const noexcept { return featuresBegin() + ASMJIT_ARRAY_SIZE(_features); }
+};
+
+// ${NameLimits:Begin}
+// ------------------- Automatically generated, do not edit -------------------
+enum : uint32_t { kMaxNameSize = 17 };
+// ----------------------------------------------------------------------------
+// ${NameLimits:End}
+
+//! A `start`/`end` pair of 16-bit values; the `instNameIndex` table holds 26 such entries.
+//!
+//! NOTE(review): presumably one entry per first letter ('a'..'z') giving the range of instructions whose names
+//! start with that letter - confirm against the code that performs name lookup.
+struct InstNameIndex {
+  uint16_t start;
+  uint16_t end;
+};
+
+//! Read/write information record - element type of the `rwInfoA` and `rwInfoB` tables.
+struct RWInfo {
+  //! Category identifiers.
+  //!
+  //! NOTE(review): presumably stored in `category` to select instruction-specific handling; everything other
+  //! than `kCategoryGeneric` is named after an instruction or instruction group - confirm with the RW query code.
+  enum Category : uint8_t {
+    kCategoryGeneric,
+    kCategoryMov,
+    kCategoryMovabs,
+    kCategoryImul,
+    kCategoryMovh64,
+    kCategoryPunpcklxx,
+    kCategoryVmaskmov,
+    kCategoryVmovddup,
+    kCategoryVmovmskpd,
+    kCategoryVmovmskps,
+    kCategoryVmov1_2,
+    kCategoryVmov1_4,
+    kCategoryVmov1_8,
+    kCategoryVmov2_1,
+    kCategoryVmov4_1,
+    kCategoryVmov8_1
+  };
+
+  //! NOTE(review): presumably one of `Category` values, stored as a plain `uint8_t` - confirm.
+  uint8_t category;
+  //! NOTE(review): presumably an index into `rwInfoRm` - confirm.
+  uint8_t rmInfo;
+  //! NOTE(review): presumably per-operand indexes into `rwInfoOp` (6 slots) - confirm.
+  uint8_t opInfoIndex[6];
+};
+
+//! Per-operand read/write record - element type of the `rwInfoOp` table.
+//!
+//! NOTE(review): field meanings below are inferred from their names only - confirm with the code that consumes
+//! this table.
+struct RWInfoOp {
+  //! Presumably a mask of operand bytes that are read.
+  uint64_t rByteMask;
+  //! Presumably a mask of operand bytes that are written.
+  uint64_t wByteMask;
+  //! Presumably a physical register id associated with the operand.
+  uint8_t physId;
+  //! Presumably a count related to consecutive registers led by this operand.
+  uint8_t consecutiveLeadCount;
+  //! Reserved (2 bytes).
+  uint8_t reserved[2];
+  //! Operand read/write flags.
+  OpRWFlags flags;
+};
+
+//! R/M information.
+//!
+//! This data is used to replace register operand by a memory operand reliably. It is the element type of the
+//! `rwInfoRm` table.
+struct RWInfoRm {
+  //! Category identifiers.
+  //!
+  //! NOTE(review): presumably describes how the memory operand size relates to the register operand size
+  //! (fixed, consistent, half, quarter, eighth) - confirm.
+  enum Category : uint8_t {
+    kCategoryNone = 0,
+    kCategoryFixed,
+    kCategoryConsistent,
+    kCategoryHalf,
+    kCategoryQuarter,
+    kCategoryEighth
+  };
+
+  //! Flag bits (each value is a distinct bit).
+  enum Flags : uint8_t {
+    kFlagAmbiguous = 0x01,
+    //! Special semantics for PEXTRW - memory operand can only be used with SSE4.1 instruction and it's forbidden in MMX.
+    kFlagPextrw = 0x02,
+    //! Special semantics for MOVSS and MOVSD - doesn't zero extend the destination if the operation is a reg to reg move.
+    kFlagMovssMovsd = 0x04,
+    //! Special semantics for AVX shift instructions that do not provide reg/mem in AVX/AVX2 mode (AVX-512 is required).
+    kFlagFeatureIfRMI = 0x08
+  };
+
+  //! NOTE(review): presumably one of `Category` values - confirm.
+  uint8_t category;
+  //! NOTE(review): presumably a bit-mask of operands that can be replaced by a memory operand - confirm.
+  uint8_t rmOpsMask;
+  //! NOTE(review): presumably the memory operand size used with `kCategoryFixed` - confirm.
+  uint8_t fixedSize;
+  //! NOTE(review): presumably a combination of `Flags` bits - confirm.
+  uint8_t flags;
+  //! NOTE(review): presumably a CPU feature id related to `kFlagFeatureIfRMI` - confirm.
+  uint8_t rmFeature;
+};
+
+//! A single record of read/written CPU/FPU flags - element type of `_rwFlagsInfoTable` (despite the `Table`
+//! suffix the struct itself holds one pair of masks).
+struct RWFlagsInfoTable {
+  //! CPU/FPU flags read.
+  uint32_t readFlags;
+  //! CPU/FPU flags written or undefined.
+  uint32_t writeFlags;
+};
+
+extern const uint8_t rwInfoIndexA[Inst::_kIdCount];
+extern const uint8_t rwInfoIndexB[Inst::_kIdCount];
+extern const RWInfo rwInfoA[];
+extern const RWInfo rwInfoB[];
+extern const RWInfoOp rwInfoOp[];
+extern const RWInfoRm rwInfoRm[];
+extern const RWFlagsInfoTable _rwFlagsInfoTable[];
+extern const InstRWFlags _instFlagsTable[];
+
+extern const uint32_t _mainOpcodeTable[];
+extern const uint32_t _altOpcodeTable[];
+
+#ifndef ASMJIT_NO_TEXT
+extern const char _nameData[];
+extern const InstNameIndex instNameIndex[26];
+#endif // !ASMJIT_NO_TEXT
+
+extern const AdditionalInfo _additionalInfoTable[];
+
+} // {InstDB}
+
+//! \}
+//! \endcond
+
+ASMJIT_END_SUB_NAMESPACE
+
+#endif // ASMJIT_X86_X86INSTDB_P_H_INCLUDED
--- a/lib/lepton/asmjit/x86/x86opcode_p.h
+++ b/lib/lepton/asmjit/x86/x86opcode_p.h
@ -0,0 +1,436 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See asmjit.h or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#ifndef ASMJIT_X86_X86OPCODE_P_H_INCLUDED
+#define ASMJIT_X86_X86OPCODE_P_H_INCLUDED
+
+#include "../x86/x86globals.h"
+
+ASMJIT_BEGIN_SUB_NAMESPACE(x86)
+
+//! \cond INTERNAL
+//! \addtogroup asmjit_x86
+//! \{
+
+//! Helper class to store and manipulate X86 opcodes.
+//!
+//! The first 8 least significant bits describe the opcode byte as defined in ISA manuals, all other bits
+//! describe other properties like prefixes, see `Opcode::Bits` for more information.
+struct Opcode {
+  uint32_t v;
+
+  //! Describes a meaning of all bits of AsmJit's 32-bit opcode value.
+  //!
+  //! This schema is AsmJit specific and has been designed to allow encoding of all X86 instructions available. X86,
+  //! MMX, and SSE+ instructions always use `MM` and `PP` fields, which are encoded to corresponding prefixes needed
+  //! by X86 or SIMD instructions. AVX+ instructions embed `MMMMM` and `PP` fields in a VEX prefix, and AVX-512
+  //! instructions embed `MM` and `PP` in EVEX prefix.
+  //!
+  //! The instruction opcode definition uses 1 or 2 bytes as an opcode value. 1 byte is needed by most of the
+  //! instructions, 2 bytes are only used by legacy X87-FPU instructions. This means that a second byte is free to
+  //! be used by instructions encoded by using VEX and/or EVEX prefix.
+  //!
+  //! The fields description:
+  //!
+  //! - `MM` field is used to encode prefixes needed by the instruction or as a part of VEX/EVEX prefix. Described as
+  //!   `mm` and `mmmmm` in instruction manuals.
+  //!
+  //!   NOTE: Since `MM` field is defined as `mmmmm` (5 bits), but only 2 least significant bits are used by VEX and
+  //!   EVEX prefixes, and additional 4th bit is used by XOP prefix, AsmJit uses the 3rd and 5th bit for its own
+  //!   purposes. These bits will probably never be used in future encodings as AVX512 uses only `000mm` from `mmmmm`.
+  //!
+  //! - `PP` field is used to encode prefixes needed by the instruction or as a part of VEX/EVEX prefix. Described as
+  //!   `pp` in instruction manuals.
+  //!
+  //! - `LL` field is used exclusively by AVX+ and AVX512+ instruction sets. It describes vector size, which is `L.128`
+  //!   for XMM register, `L.256` for YMM register, and `L.512` for ZMM register. The `LL` field is omitted in case
+  //!   that instruction supports multiple vector lengths, however, if the instruction requires specific `L` value it
+  //!   must be specified as a part of the opcode.
+  //!
+  //!   NOTE: `LL` having value `11` is not defined yet.
+  //!
+  //! - `W` field is the most complicated. It was added by 64-bit architecture to promote default operation width
+  //!   (instructions that perform 32-bit operation by default require to override the width to 64-bit explicitly).
+  //!   There is nothing wrong with this, however, some instructions introduced implicit `W` override, for example a
+  //!   `cdqe` instruction is basically a `cwde` instruction with overridden `W` (set to 1). There are some others
+  //!   in the base X86 instruction set. More recent instruction sets started using `W` field more often:
+  //!
+  //!   - AVX instructions started using `W` field as an extended opcode for FMA, GATHER, PERM, and other instructions.
+  //!     It also uses `W` field to override the default operation width in instructions like `vmovq`.
+  //!
+  //!   - AVX-512 instructions started using `W` field as an extended opcode for all new instructions. This wouldn't
+  //!     have been an issue if the `W` field of AVX-512 have matched AVX, but this is not always the case.
+  //!
+  //! - `O` field is an extended opcode field (3 bits) embedded in ModR/M BYTE.
+  //!
+  //! - `CDSHL` and `CDTT` fields describe 'compressed-displacement'. `CDSHL` is defined for each instruction that is
+  //!   AVX-512 encodable (EVEX) and contains a base N shift (base shift to perform the calculation). The `CDTT` field
+  //!   is derived from instruction specification and describes additional shift to calculate the final `CDSHL` that
+  //!   will be used in SIB byte.
+  //!
+  //! \note Don't reorder any fields here, the shifts and masks were defined carefully to make encoding of X86
+  //! instructions fast, especially to construct REX, VEX, and EVEX prefixes in the most efficient way. Changing
+  //! values defined by these enums may cause AsmJit to emit invalid binary representations of instructions passed to
+  //! `x86::Assembler::_emit`.
+  enum Bits : uint32_t {
+    // MM & VEX & EVEX & XOP
+    // ---------------------
+    //
+    // Two meanings:
+    //  * Part of a legacy opcode (prefixes emitted before the main opcode byte).
+    //  * `MMMMM` field in VEX|EVEX|XOP instruction.
+    //
+    // AVX reserves 5 bits for `MMMMM` field, however AVX instructions only use 2 bits and XOP 3 bits. AVX-512 shrinks
+    // `MMMMM` field into `MMM` so it's safe to use [4:3] bits of `MMMMM` field for internal payload.
+    //
+    // AsmJit divides MMMMM field into this layout:
+    //
+    // [2:0] - Used to describe 0F, 0F38 and 0F3A legacy prefix bytes and 3 bits of MMMMM field for XOP/AVX/AVX512.
+    // [3]   - Required by XOP instructions, so we use this bit also to indicate that this is a XOP opcode.
+    // [4]   - Used to force EVEX prefix - this bit is not used by any X86 instruction yet, so AsmJit uses it to
+    //         describe EVEX only instructions or sets its bit when user uses InstOptions::kX86_Evex to force EVEX.
+    kMM_Shift      = 8,
+    kMM_Mask       = 0x1Fu << kMM_Shift,
+    kMM_00         = 0x00u << kMM_Shift,
+    kMM_0F         = 0x01u << kMM_Shift,
+    kMM_0F38       = 0x02u << kMM_Shift,
+    kMM_0F3A       = 0x03u << kMM_Shift,   // Described also as XOP.M3 in AMD manuals.
+    kMM_0F01       = 0x04u << kMM_Shift,   // AsmJit way to describe 0F01 (never VEX/EVEX).
+
+    kMM_MAP5       = 0x05u << kMM_Shift,   // EVEX.MAP5.
+    kMM_MAP6       = 0x06u << kMM_Shift,   // EVEX.MAP6.
+
+    // `XOP` field is only used to force XOP prefix instead of VEX3 prefix. We know XOP encodings always use 0b1000
+    // bit of MM field and that no VEX and EVEX instruction use such bit yet, so we can use this bit to force XOP
+    // prefix to be emitted instead of VEX3 prefix. See `x86VEXPrefix` defined in `x86assembler.cpp`.
+    kMM_XOP08      = 0x08u << kMM_Shift,   // XOP.M8.
+    kMM_XOP09      = 0x09u << kMM_Shift,   // XOP.M9.
+    kMM_XOP0A      = 0x0Au << kMM_Shift,   // XOP.MA.
+
+    kMM_IsXOP_Shift= kMM_Shift + 3,
+    kMM_IsXOP      = kMM_XOP08,
+
+    // NOTE: Force VEX3 allows to force to emit VEX3 instead of VEX2 in some cases (similar to forcing REX prefix).
+    // Force EVEX will force emitting EVEX prefix instead of VEX2|VEX3. EVEX-only instructions will have ForceEvex
+    // always set, however. Instructions that can be encoded by either VEX or EVEX prefix should not have ForceEvex
+    // set.
+    kMM_ForceEvex  = 0x10u << kMM_Shift,   // Force 4-BYTE EVEX prefix.
+
+    // FPU_2B - Second-Byte of the Opcode used by FPU
+    // ----------------------------------------------
+    //
+    // Second byte opcode. This BYTE is ONLY used by FPU instructions and collides with 3 bits from `MM` and 5 bits
+    // from 'CDSHL' and 'CDTT'. It's fine as FPU and AVX512 flags are never used at the same time.
+    kFPU_2B_Shift  = 10,
+    kFPU_2B_Mask   = 0xFF << kFPU_2B_Shift,
+
+    // CDSHL & CDTT
+    // ------------
+    //
+    // Compressed displacement bits.
+    //
+    // Each opcode defines the base size (N) shift:
+    //   [0]: BYTE  (1 byte).
+    //   [1]: WORD  (2 bytes).
+    //   [2]: DWORD (4 bytes - float/int32).
+    //   [3]: QWORD (8 bytes - double/int64).
+    //   [4]: OWORD (16 bytes - used by FV|FVM|M128).
+    //
+    // Which is then scaled by the instruction's TT (TupleType) into possible:
+    //   [5]: YWORD (32 bytes)
+    //   [6]: ZWORD (64 bytes)
+    //
+    // These bits are then adjusted before calling EmitModSib or EmitModVSib.
+    kCDSHL_Shift   = 13,
+    kCDSHL_Mask    = 0x7u << kCDSHL_Shift,
+
+    kCDSHL__       = 0x0u << kCDSHL_Shift, // Base element size not used.
+    kCDSHL_0       = 0x0u << kCDSHL_Shift, // N << 0.
+    kCDSHL_1       = 0x1u << kCDSHL_Shift, // N << 1.
+    kCDSHL_2       = 0x2u << kCDSHL_Shift, // N << 2.
+    kCDSHL_3       = 0x3u << kCDSHL_Shift, // N << 3.
+    kCDSHL_4       = 0x4u << kCDSHL_Shift, // N << 4.
+    kCDSHL_5       = 0x5u << kCDSHL_Shift, // N << 5.
+
+    // Compressed displacement tuple-type (specific to AsmJit).
+    //
+    // Since we store the base offset independently of CDTT we can simplify the number of 'TUPLE_TYPE' groups
+    // significantly and just handle special cases.
+    kCDTT_Shift    = 16,
+    kCDTT_Mask     = 0x3u << kCDTT_Shift,
+    kCDTT_None     = 0x0u << kCDTT_Shift,  // Does nothing.
+    kCDTT_ByLL     = 0x1u << kCDTT_Shift,  // Scales by LL (1x 2x 4x).
+    kCDTT_T1W      = 0x2u << kCDTT_Shift,  // Used to add 'W' to the shift.
+    kCDTT_DUP      = 0x3u << kCDTT_Shift,  // Special 'VMOVDDUP' case.
+
+    // Aliases that match names used in instruction manuals.
+    kCDTT__        = kCDTT_None,
+    kCDTT_FV       = kCDTT_ByLL,
+    kCDTT_HV       = kCDTT_ByLL,
+    kCDTT_QV       = kCDTT_ByLL,
+    kCDTT_FVM      = kCDTT_ByLL,
+    kCDTT_T1S      = kCDTT_None,
+    kCDTT_T1F      = kCDTT_None,
+    kCDTT_T1_4X    = kCDTT_None,
+    kCDTT_T4X      = kCDTT_None,           // Alias to have only 3 letters.
+    kCDTT_T2       = kCDTT_None,
+    kCDTT_T4       = kCDTT_None,
+    kCDTT_T8       = kCDTT_None,
+    kCDTT_HVM      = kCDTT_ByLL,
+    kCDTT_QVM      = kCDTT_ByLL,
+    kCDTT_OVM      = kCDTT_ByLL,
+    kCDTT_128      = kCDTT_None,
+
+    // `O` Field in ModR/M (??:xxx:???)
+    // --------------------------------
+
+    kModO_Shift    = 18,
+    kModO_Mask     = 0x7u << kModO_Shift,
+
+    kModO__        = 0x0u,
+    kModO_0        = 0x0u << kModO_Shift,
+    kModO_1        = 0x1u << kModO_Shift,
+    kModO_2        = 0x2u << kModO_Shift,
+    kModO_3        = 0x3u << kModO_Shift,
+    kModO_4        = 0x4u << kModO_Shift,
+    kModO_5        = 0x5u << kModO_Shift,
+    kModO_6        = 0x6u << kModO_Shift,
+    kModO_7        = 0x7u << kModO_Shift,
+
+    // `RM` Field in ModR/M (??:???:xxx)
+    // ---------------------------------
+    //
+    // Second data field used by ModR/M byte. This is only used by few instructions that use OPCODE+MOD/RM where both
+    // values in Mod/RM are part of the opcode.
+
+    kModRM_Shift    = 13,
+    kModRM_Mask     = 0x7u << kModRM_Shift,
+
+    kModRM__        = 0x0u,
+    kModRM_0        = 0x0u << kModRM_Shift,
+    kModRM_1        = 0x1u << kModRM_Shift,
+    kModRM_2        = 0x2u << kModRM_Shift,
+    kModRM_3        = 0x3u << kModRM_Shift,
+    kModRM_4        = 0x4u << kModRM_Shift,
+    kModRM_5        = 0x5u << kModRM_Shift,
+    kModRM_6        = 0x6u << kModRM_Shift,
+    kModRM_7        = 0x7u << kModRM_Shift,
+
+    // `PP` Field
+    // ----------
+    //
+    // These fields are stored deliberately right after each other as it makes it easier to construct VEX prefix from
+    // the opcode value stored in the instruction database.
+    //
+    // Two meanings:
+    //   * "PP" field in AVX/XOP/AVX-512 instruction.
+    //   * Mandatory Prefix in legacy encoding.
+    //
+    // AVX reserves 2 bits for `PP` field, but AsmJit extends the storage by 1 more bit that is used to emit 9B prefix
+    // for some X87-FPU instructions.
+
+    kPP_Shift      = 21,
+    kPP_VEXMask    = 0x03u << kPP_Shift,   // PP field mask used by VEX/EVEX.
+    kPP_FPUMask    = 0x07u << kPP_Shift,   // Mask used by EMIT_PP, also includes '0x9B'.
+    kPP_00         = 0x00u << kPP_Shift,
+    kPP_66         = 0x01u << kPP_Shift,
+    kPP_F3         = 0x02u << kPP_Shift,
+    kPP_F2         = 0x03u << kPP_Shift,
+
+    kPP_9B         = 0x07u << kPP_Shift,   // AsmJit specific to emit FPU's '9B' byte.
+
+    // REX|VEX|EVEX B|X|R|W Bits
+    // -------------------------
+    //
+    // NOTE: REX.[B|X|R] are never stored within the opcode itself, they are reserved by AsmJit and are added
+    // dynamically to the opcode to represent [REX|VEX|EVEX].[B|X|R] bits. REX.W can be stored in DB as it's sometimes
+    // part of the opcode itself.
+
+    // These must be binary compatible with instruction options.
+    kREX_Shift     = 24,
+    kREX_Mask      = 0x0Fu << kREX_Shift,
+    kB             = 0x01u << kREX_Shift,  // Never stored in DB, used by encoder.
+    kX             = 0x02u << kREX_Shift,  // Never stored in DB, used by encoder.
+    kR             = 0x04u << kREX_Shift,  // Never stored in DB, used by encoder.
+    kW             = 0x08u << kREX_Shift,
+    kW_Shift       = kREX_Shift + 3,
+
+    kW__           = 0u << kW_Shift,       // REX.W/VEX.W is unspecified.
+    kW_x           = 0u << kW_Shift,       // REX.W/VEX.W is based on instruction operands.
+    kW_I           = 0u << kW_Shift,       // REX.W/VEX.W is ignored (WIG).
+    kW_0           = 0u << kW_Shift,       // REX.W/VEX.W is 0 (W0).
+    kW_1           = 1u << kW_Shift,       // REX.W/VEX.W is 1 (W1).
+
+    // EVEX.W Field
+    // ------------
+    //
+    // `W` field used by EVEX instruction encoding.
+
+    kEvex_W_Shift  = 28,
+    kEvex_W_Mask   = 1u << kEvex_W_Shift,
+
+    kEvex_W__      = 0u << kEvex_W_Shift,  // EVEX.W is unspecified (not EVEX instruction).
+    kEvex_W_x      = 0u << kEvex_W_Shift,  // EVEX.W is based on instruction operands.
+    kEvex_W_I      = 0u << kEvex_W_Shift,  // EVEX.W is ignored (WIG).
+    kEvex_W_0      = 0u << kEvex_W_Shift,  // EVEX.W is 0 (W0).
+    kEvex_W_1      = 1u << kEvex_W_Shift,  // EVEX.W is 1 (W1).
+
+    // `L` or `LL` field in AVX/XOP/AVX-512
+    // ------------------------------------
+    //
+    // VEX/XOP prefix can only use the first bit `L.128` or `L.256`. EVEX prefix makes it possible to use also
+    // `L.512`. If the instruction set manual describes an instruction by `LIG` it means that the `L` field is ignored
+    // and AsmJit defaults to `0` in such case.
+    kLL_Shift      = 29,
+    kLL_Mask       = 0x3u << kLL_Shift,
+
+    kLL__          = 0x0u << kLL_Shift,    // LL is unspecified.
+    kLL_x          = 0x0u << kLL_Shift,    // LL is based on instruction operands.
+    kLL_I          = 0x0u << kLL_Shift,    // LL is ignored (LIG).
+    kLL_0          = 0x0u << kLL_Shift,    // LL is 0 (L.128).
+    kLL_1          = 0x1u << kLL_Shift,    // LL is 1 (L.256).
+    kLL_2          = 0x2u << kLL_Shift,    // LL is 2 (L.512).
+
+    // Opcode Combinations
+    // -------------------
+
+    k0      = 0,                           // '__' (no prefix, used internally).
+    k000000 = kPP_00 | kMM_00,             // '__' (no prefix, to be the same width as others).
+    k000F00 = kPP_00 | kMM_0F,             // '0F'
+    k000F01 = kPP_00 | kMM_0F01,           // '0F01'
+    k000F0F = kPP_00 | kMM_0F,             // '0F0F' - 3DNOW, equal to 0x0F, must have special encoding to take effect.
+    k000F38 = kPP_00 | kMM_0F38,           // 'NP.0F38'
+    k000F3A = kPP_00 | kMM_0F3A,           // 'NP.0F3A'
+    k00MAP5 = kPP_00 | kMM_MAP5,           // 'NP.MAP5'
+    k00MAP6 = kPP_00 | kMM_MAP6,           // 'NP.MAP6'
+    k660000 = kPP_66 | kMM_00,             // '66'
+    k660F00 = kPP_66 | kMM_0F,             // '66.0F'
+    k660F01 = kPP_66 | kMM_0F01,           // '66.0F01'
+    k660F38 = kPP_66 | kMM_0F38,           // '66.0F38'
+    k660F3A = kPP_66 | kMM_0F3A,           // '66.0F3A'
+    k66MAP5 = kPP_66 | kMM_MAP5,           // '66.MAP5'
+    k66MAP6 = kPP_66 | kMM_MAP6,           // '66.MAP6'
+    kF20000 = kPP_F2 | kMM_00,             // 'F2'
+    kF20F00 = kPP_F2 | kMM_0F,             // 'F2.0F'
+    kF20F01 = kPP_F2 | kMM_0F01,           // 'F2.0F01'
+    kF20F38 = kPP_F2 | kMM_0F38,           // 'F2.0F38'
+    kF20F3A = kPP_F2 | kMM_0F3A,           // 'F2.0F3A'
+    kF2MAP5 = kPP_F2 | kMM_MAP5,           // 'F2.MAP5'
+    kF2MAP6 = kPP_F2 | kMM_MAP6,           // 'F2.MAP6'
+    kF30000 = kPP_F3 | kMM_00,             // 'F3'
+    kF30F00 = kPP_F3 | kMM_0F,             // 'F3.0F'
+    kF30F01 = kPP_F3 | kMM_0F01,           // 'F3.0F01'
+    kF30F38 = kPP_F3 | kMM_0F38,           // 'F3.0F38'
+    kF30F3A = kPP_F3 | kMM_0F3A,           // 'F3.0F3A'
+    kF3MAP5 = kPP_F3 | kMM_MAP5,           // 'F3.MAP5'
+    kF3MAP6 = kPP_F3 | kMM_MAP6,           // 'F3.MAP6'
+    kFPU_00 = kPP_00 | kMM_00,             // '__' (FPU)
+    kFPU_9B = kPP_9B | kMM_00,             // '9B' (FPU)
+    kXOP_M8 = kPP_00 | kMM_XOP08,          // 'M8' (XOP)
+    kXOP_M9 = kPP_00 | kMM_XOP09,          // 'M9' (XOP)
+    kXOP_MA = kPP_00 | kMM_XOP0A           // 'MA' (XOP)
+  };
+
+  // Opcode Builder
+  // --------------
+
+  //! Returns the stored opcode value `v`.
+  inline uint32_t get() const noexcept {
+    return v;
+  }
+
+  //! Tests whether the opcode value has the `kW` bit set.
+  inline bool hasW() const noexcept {
+    const uint32_t wBits = v & kW;
+    return wBits != 0;
+  }
+  //! Tests whether the opcode value shares at least one set bit with `kPP_66`.
+  inline bool has66h() const noexcept {
+    const uint32_t ppBits = v & kPP_66;
+    return ppBits != 0;
+  }
+
+  //! Arithmetically adds `x` to the opcode value (same effect as `operator+=`) and returns `*this`.
+  inline Opcode& add(uint32_t x) noexcept {
+    v += x;
+    return *this;
+  }
+
+  //! ORs `kPP_66` into the opcode value and returns `*this`.
+  inline Opcode& add66h() noexcept {
+    v |= kPP_66;
+    return *this;
+  }
+  //! ORs `uint32_t(exp) << kPP_Shift` into the opcode value and returns `*this`.
+  //!
+  //! NOTE(review): presumably `exp` is a boolean-like 0/1 value so that a true `exp` yields `kPP_66` - confirm.
+  template<typename T>
+  inline Opcode& add66hIf(T exp) noexcept {
+    const uint32_t ppBits = uint32_t(exp) << kPP_Shift;
+    v |= ppBits;
+    return *this;
+  }
+  //! Forwards the result of `size == 2` to `add66hIf()`.
+  template<typename T>
+  inline Opcode& add66hBySize(T size) noexcept {
+    return add66hIf(size == 2);
+  }
+
+  //! ORs `kW` into the opcode value and returns `*this`.
+  inline Opcode& addW() noexcept {
+    v |= kW;
+    return *this;
+  }
+  //! ORs `uint32_t(exp) << kW_Shift` into the opcode value and returns `*this`.
+  //!
+  //! NOTE(review): presumably `exp` is a boolean-like 0/1 value so that a true `exp` yields `kW` - confirm.
+  template<typename T>
+  inline Opcode& addWIf(T exp) noexcept {
+    const uint32_t wBits = uint32_t(exp) << kW_Shift;
+    v |= wBits;
+    return *this;
+  }
+  //! Forwards the result of `size == 8` to `addWIf()`.
+  template<typename T>
+  inline Opcode& addWBySize(T size) noexcept {
+    return addWIf(size == 8);
+  }
+
+  //! ORs the prefix bits selected by `size` into the opcode value and returns `*this`.
+  //!
+  //! Only the low 4 bits of `size` index the table: index 2 selects `kPP_66`, index 8 selects `kW`, and every other
+  //! index selects no bits.
+  template<typename T>
+  inline Opcode& addPrefixBySize(T size) noexcept {
+    static const uint32_t prefixBySize[16] = {
+      0,          // #0
+      0,          // #1 -> nothing (already handled or not possible)
+      kPP_66,     // #2 -> 66H
+      0,          // #3
+      0,          // #4 -> nothing
+      0,          // #5
+      0,          // #6
+      0,          // #7
+      kW,         // #8 -> REX.W
+      0,          // #9
+      0,          // #10
+      0,          // #11
+      0,          // #12
+      0,          // #13
+      0,          // #14
+      0           // #15
+    };
+    const uint32_t selected = prefixBySize[size & 0xF];
+    v |= selected;
+    return *this;
+  }
+
+  //! ORs the arithmetic-form bits selected by `size` into the opcode value and returns `*this`.
+  //!
+  //! Only the low 4 bits of `size` index the table: index 2 selects `1 | kPP_66`, index 4 selects `1`, index 8
+  //! selects `1 | kW`, and every other index selects no bits. The literal `1` is the NOT_BYTE_OP bit.
+  template<typename T>
+  inline Opcode& addArithBySize(T size) noexcept {
+    static const uint32_t arithBySize[16] = {
+      0,          // #0
+      0,          // #1 -> nothing
+      1 | kPP_66, // #2 -> NOT_BYTE_OP(1) and 66H
+      0,          // #3
+      1,          // #4 -> NOT_BYTE_OP(1)
+      0,          // #5
+      0,          // #6
+      0,          // #7
+      1 | kW,     // #8 -> NOT_BYTE_OP(1) and REX.W
+      0,          // #9
+      0,          // #10
+      0,          // #11
+      0,          // #12
+      0,          // #13
+      0,          // #14
+      0           // #15
+    };
+    const uint32_t selected = arithBySize[size & 0xF];
+    v |= selected;
+    return *this;
+  }
+
+  //! ORs `kMM_ForceEvex` into the opcode value and returns `*this`.
+  inline Opcode& forceEvex() noexcept {
+    v |= kMM_ForceEvex;
+    return *this;
+  }
+  //! ORs `uint32_t(exp)` shifted left by `Support::ConstCTZ<kMM_ForceEvex>::value` into the opcode value.
+  //!
+  //! NOTE(review): presumably `ConstCTZ` yields the bit index of `kMM_ForceEvex` and `exp` is a boolean-like 0/1
+  //! value, making this a conditional `forceEvex()` - confirm.
+  template<typename T>
+  inline Opcode& forceEvexIf(T exp) noexcept {
+    const uint32_t evexBits = uint32_t(exp) << Support::ConstCTZ<uint32_t(kMM_ForceEvex)>::value;
+    v |= evexBits;
+    return *this;
+  }
+
+  //! Extracts the 3-bit `O` field (R) from the opcode (specified as /0..7 in instruction manuals).
+  //!
+  //! The field is read from bit position `kModO_Shift` of the opcode value.
+  inline uint32_t extractModO() const noexcept {
+    const uint32_t shifted = v >> kModO_Shift;
+    return shifted & 0x07;
+  }
+
+  //! Extracts the 3-bit `RM` field (RM) from the opcode (usually specified as another opcode value).
+  //!
+  //! The field is read from bit position `kModRM_Shift` of the opcode value.
+  inline uint32_t extractModRM() const noexcept {
+    const uint32_t shifted = v >> kModRM_Shift;
+    return shifted & 0x07;
+  }
+
+  //! Extracts the `REX` prefix from the opcode combined with `options`.
+  //!
+  //! The opcode value is ORed with `options` and the result is shifted right by `kREX_Shift`.
+  inline uint32_t extractRex(InstOptions options) const noexcept {
+    // kREX was designed in a way that when shifted there will be no bits set except REX.[B|X|R|W].
+    // The returned value forms a real REX prefix byte. This case should be unit-tested as well.
+    const uint32_t combined = v | uint32_t(options);
+    return combined >> kREX_Shift;
+  }
+
+  //! Extracts the combined `LL` and `MM` fields of the opcode, merged with the `kX86_Evex` bit of `options`.
+  //!
+  //! The opcode value is masked by `kLL_Mask | kMM_Mask`, ORed with `options & InstOptions::kX86_Evex`, and the
+  //! result is shifted right by `kMM_Shift`.
+  inline uint32_t extractLLMMMMM(InstOptions options) const noexcept {
+    const uint32_t evexOption = uint32_t(options & InstOptions::kX86_Evex);
+    const uint32_t fieldBits = uint32_t(v & (kLL_Mask | kMM_Mask));
+    return (fieldBits | evexOption) >> kMM_Shift;
+  }
+
+  //! Replaces the opcode value with `x`.
+  inline Opcode& operator=(uint32_t x) noexcept {
+    v = x;
+    return *this;
+  }
+  //! Adds `x` to the opcode value.
+  inline Opcode& operator+=(uint32_t x) noexcept {
+    v += x;
+    return *this;
+  }
+  //! Subtracts `x` from the opcode value.
+  inline Opcode& operator-=(uint32_t x) noexcept {
+    v -= x;
+    return *this;
+  }
+  //! ANDs the opcode value with `x`.
+  inline Opcode& operator&=(uint32_t x) noexcept {
+    v &= x;
+    return *this;
+  }
+  //! ORs `x` into the opcode value.
+  inline Opcode& operator|=(uint32_t x) noexcept {
+    v |= x;
+    return *this;
+  }
+  //! XORs the opcode value with `x`.
+  inline Opcode& operator^=(uint32_t x) noexcept {
+    v ^= x;
+    return *this;
+  }
+
+  //! Returns the opcode value ANDed with `x` (the opcode itself is not modified).
+  inline uint32_t operator&(uint32_t x) const noexcept {
+    return v & x;
+  }
+  //! Returns the opcode value ORed with `x` (the opcode itself is not modified).
+  inline uint32_t operator|(uint32_t x) const noexcept {
+    return v | x;
+  }
+  //! Returns the opcode value XORed with `x` (the opcode itself is not modified).
+  inline uint32_t operator^(uint32_t x) const noexcept {
+    return v ^ x;
+  }
+  //! Returns the opcode value shifted left by `x` bits (the opcode itself is not modified).
+  inline uint32_t operator<<(uint32_t x) const noexcept {
+    return v << x;
+  }
+  //! Returns the opcode value shifted right by `x` bits (the opcode itself is not modified).
+  inline uint32_t operator>>(uint32_t x) const noexcept {
+    return v >> x;
+  }
+};
+
+//! \}
+//! \endcond
+
+ASMJIT_END_SUB_NAMESPACE
+
+#endif // ASMJIT_X86_X86OPCODE_P_H_INCLUDED
--- a/lib/lepton/asmjit/x86/x86operand.cpp
+++ b/lib/lepton/asmjit/x86/x86operand.cpp
@ -0,0 +1,231 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See asmjit.h or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#include "../core/api-build_p.h"
+#if !defined(ASMJIT_NO_X86)
+
+#include "../core/misc_p.h"
+#include "../x86/x86operand.h"
+
+ASMJIT_BEGIN_SUB_NAMESPACE(x86)
+
+// x86::Operand - Tests
+// ====================
+
+#if defined(ASMJIT_TEST)
+// Unit test of X86 operands: checks built-in register constants against the register factory functions, then
+// per-register-class properties (id, size, type, group), vector register conversions, default-constructed
+// registers, and finally `x86::Mem` operands built from a label, an absolute address, and base/index registers.
+UNIT(x86_operand) {
+  Label L(1000); // Label with some ID.
+
+  INFO("Checking basic properties of built-in X86 registers");
+  // `gpb()` is expected to produce the same low-byte registers as `gpb_lo()` below.
+  EXPECT(gpb(Gp::kIdAx) == al);
+  EXPECT(gpb(Gp::kIdBx) == bl);
+  EXPECT(gpb(Gp::kIdCx) == cl);
+  EXPECT(gpb(Gp::kIdDx) == dl);
+
+  EXPECT(gpb_lo(Gp::kIdAx) == al);
+  EXPECT(gpb_lo(Gp::kIdBx) == bl);
+  EXPECT(gpb_lo(Gp::kIdCx) == cl);
+  EXPECT(gpb_lo(Gp::kIdDx) == dl);
+
+  EXPECT(gpb_hi(Gp::kIdAx) == ah);
+  EXPECT(gpb_hi(Gp::kIdBx) == bh);
+  EXPECT(gpb_hi(Gp::kIdCx) == ch);
+  EXPECT(gpb_hi(Gp::kIdDx) == dh);
+
+  EXPECT(gpw(Gp::kIdAx) == ax);
+  EXPECT(gpw(Gp::kIdBx) == bx);
+  EXPECT(gpw(Gp::kIdCx) == cx);
+  EXPECT(gpw(Gp::kIdDx) == dx);
+
+  EXPECT(gpd(Gp::kIdAx) == eax);
+  EXPECT(gpd(Gp::kIdBx) == ebx);
+  EXPECT(gpd(Gp::kIdCx) == ecx);
+  EXPECT(gpd(Gp::kIdDx) == edx);
+
+  EXPECT(gpq(Gp::kIdAx) == rax);
+  EXPECT(gpq(Gp::kIdBx) == rbx);
+  EXPECT(gpq(Gp::kIdCx) == rcx);
+  EXPECT(gpq(Gp::kIdDx) == rdx);
+
+  // Negative checks: registers created from one id must not compare equal to a built-in register of another id.
+  EXPECT(gpb(Gp::kIdAx) != dl);
+  EXPECT(gpw(Gp::kIdBx) != cx);
+  EXPECT(gpd(Gp::kIdCx) != ebx);
+  EXPECT(gpq(Gp::kIdDx) != rax);
+
+  INFO("Checking if x86::reg(...) matches built-in IDs");
+  // The same numeric id (5) is used with every register factory and compared with the matching built-in constant.
+  EXPECT(gpb(5) == bpl);
+  EXPECT(gpw(5) == bp);
+  EXPECT(gpd(5) == ebp);
+  EXPECT(gpq(5) == rbp);
+  EXPECT(st(5)  == st5);
+  EXPECT(mm(5)  == mm5);
+  EXPECT(k(5)   == k5);
+  EXPECT(cr(5)  == cr5);
+  EXPECT(dr(5)  == dr5);
+  EXPECT(xmm(5) == xmm5);
+  EXPECT(ymm(5) == ymm5);
+  EXPECT(zmm(5) == zmm5);
+
+  INFO("Checking x86::Gp register properties");
+  // Note that a default-constructed register still reports `isReg() == true` (see the `isValid()` checks below).
+  EXPECT(Gp().isReg() == true);
+  EXPECT(eax.isReg() == true);
+  EXPECT(eax.id() == 0);
+  EXPECT(eax.size() == 4);
+  EXPECT(eax.type() == RegType::kX86_Gpd);
+  EXPECT(eax.group() == RegGroup::kGp);
+
+  INFO("Checking x86::Xmm register properties");
+  EXPECT(Xmm().isReg() == true);
+  EXPECT(xmm4.isReg() == true);
+  EXPECT(xmm4.id() == 4);
+  EXPECT(xmm4.size() == 16);
+  EXPECT(xmm4.type() == RegType::kX86_Xmm);
+  EXPECT(xmm4.group() == RegGroup::kVec);
+  EXPECT(xmm4.isVec());
+
+  INFO("Checking x86::Ymm register properties");
+  EXPECT(Ymm().isReg() == true);
+  EXPECT(ymm5.isReg() == true);
+  EXPECT(ymm5.id() == 5);
+  EXPECT(ymm5.size() == 32);
+  EXPECT(ymm5.type() == RegType::kX86_Ymm);
+  EXPECT(ymm5.group() == RegGroup::kVec);
+  EXPECT(ymm5.isVec());
+
+  INFO("Checking x86::Zmm register properties");
+  EXPECT(Zmm().isReg() == true);
+  EXPECT(zmm6.isReg() == true);
+  EXPECT(zmm6.id() == 6);
+  EXPECT(zmm6.size() == 64);
+  EXPECT(zmm6.type() == RegType::kX86_Zmm);
+  EXPECT(zmm6.group() == RegGroup::kVec);
+  EXPECT(zmm6.isVec());
+
+  INFO("Checking x86::Vec register properties");
+  EXPECT(Vec().isReg() == true);
+  // Converts a VEC register to a type of the passed register, but keeps the ID.
+  EXPECT(xmm4.cloneAs(ymm10) == ymm4);
+  EXPECT(xmm4.cloneAs(zmm11) == zmm4);
+  EXPECT(ymm5.cloneAs(xmm12) == xmm5);
+  EXPECT(ymm5.cloneAs(zmm13) == zmm5);
+  EXPECT(zmm6.cloneAs(xmm14) == xmm6);
+  EXPECT(zmm6.cloneAs(ymm15) == ymm6);
+
+  // `xmm()`, `ymm()` and `zmm()` are expected to yield the register of the requested width with the same id,
+  // regardless of the width of the register they are called on.
+  EXPECT(xmm7.xmm() == xmm7);
+  EXPECT(xmm7.ymm() == ymm7);
+  EXPECT(xmm7.zmm() == zmm7);
+
+  EXPECT(ymm7.xmm() == xmm7);
+  EXPECT(ymm7.ymm() == ymm7);
+  EXPECT(ymm7.zmm() == zmm7);
+
+  EXPECT(zmm7.xmm() == xmm7);
+  EXPECT(zmm7.ymm() == ymm7);
+  EXPECT(zmm7.zmm() == zmm7);
+
+  INFO("Checking x86::Mm register properties");
+  EXPECT(Mm().isReg() == true);
+  EXPECT(mm2.isReg() == true);
+  EXPECT(mm2.id() == 2);
+  EXPECT(mm2.size() == 8);
+  EXPECT(mm2.type() == RegType::kX86_Mm);
+  EXPECT(mm2.group() == RegGroup::kX86_MM);
+
+  INFO("Checking x86::KReg register properties");
+  EXPECT(KReg().isReg() == true);
+  EXPECT(k3.isReg() == true);
+  EXPECT(k3.id() == 3);
+  // Unlike the other register classes checked here, a K register is expected to report a size of 0.
+  EXPECT(k3.size() == 0);
+  EXPECT(k3.type() == RegType::kX86_KReg);
+  EXPECT(k3.group() == RegGroup::kX86_K);
+
+  INFO("Checking x86::St register properties");
+  EXPECT(St().isReg() == true);
+  EXPECT(st1.isReg() == true);
+  EXPECT(st1.id() == 1);
+  EXPECT(st1.size() == 10);
+  EXPECT(st1.type() == RegType::kX86_St);
+  EXPECT(st1.group() == RegGroup::kX86_St);
+
+  INFO("Checking if default constructed regs behave as expected");
+  // Every default-constructed register must be reported as invalid.
+  EXPECT(Reg().isValid() == false);
+  EXPECT(Gp().isValid() == false);
+  EXPECT(Xmm().isValid() == false);
+  EXPECT(Ymm().isValid() == false);
+  EXPECT(Zmm().isValid() == false);
+  EXPECT(Mm().isValid() == false);
+  EXPECT(KReg().isValid() == false);
+  EXPECT(SReg().isValid() == false);
+  EXPECT(CReg().isValid() == false);
+  EXPECT(DReg().isValid() == false);
+  EXPECT(St().isValid() == false);
+  EXPECT(Bnd().isValid() == false);
+
+  INFO("Checking x86::Mem operand");
+  Mem m;
+  EXPECT(m == Mem(), "Two default constructed x86::Mem operands must be equal");
+
+  // Memory operand based on a label: it has a base, but the base is a label and not a register.
+  m = ptr(L);
+  EXPECT(m.hasBase() == true);
+  EXPECT(m.hasBaseReg() == false);
+  EXPECT(m.hasBaseLabel() == true);
+  EXPECT(m.hasOffset() == false);
+  EXPECT(m.isOffset64Bit() == false);
+  EXPECT(m.offset() == 0);
+  EXPECT(m.offsetLo32() == 0);
+
+  // Memory operand built from a 64-bit constant only: no base and no index, the constant is held as a 64-bit offset.
+  m = ptr(0x0123456789ABCDEFu);
+  EXPECT(m.hasBase() == false);
+  EXPECT(m.hasBaseReg() == false);
+  EXPECT(m.hasIndex() == false);
+  EXPECT(m.hasIndexReg() == false);
+  EXPECT(m.hasOffset() == true);
+  EXPECT(m.isOffset64Bit() == true);
+  EXPECT(m.offset() == int64_t(0x0123456789ABCDEFu));
+  EXPECT(m.offsetLo32() == int32_t(0x89ABCDEFu));
+  // Adding to the offset must operate on the full 64-bit value.
+  m.addOffset(1);
+  EXPECT(m.offset() == int64_t(0x0123456789ABCDF0u));
+
+  // Same 64-bit constant combined with an index register (rdi) and a shift of 3; still no base.
+  m = ptr(0x0123456789ABCDEFu, rdi, 3);
+  EXPECT(m.hasSegment() == false);
+  EXPECT(m.hasBase() == false);
+  EXPECT(m.hasBaseReg() == false);
+  EXPECT(m.hasIndex() == true);
+  EXPECT(m.hasIndexReg() == true);
+  EXPECT(m.indexType() == rdi.type());
+  EXPECT(m.indexId() == rdi.id());
+  EXPECT(m.shift() == 3);
+  EXPECT(m.hasOffset() == true);
+  EXPECT(m.isOffset64Bit() == true);
+  EXPECT(m.offset() == int64_t(0x0123456789ABCDEFu));
+  EXPECT(m.offsetLo32() == int32_t(0x89ABCDEFu));
+  // Resetting the index must remove the index register from the operand.
+  m.resetIndex();
+  EXPECT(m.hasIndex() == false);
+  EXPECT(m.hasIndexReg() == false);
+
+  // Memory operand with a base register only: no index (type `kNone`, id 0) and no offset.
+  m = ptr(rax);
+  EXPECT(m.hasBase() == true);
+  EXPECT(m.hasBaseReg() == true);
+  EXPECT(m.baseType() == rax.type());
+  EXPECT(m.baseId() == rax.id());
+  EXPECT(m.hasIndex() == false);
+  EXPECT(m.hasIndexReg() == false);
+  EXPECT(m.indexType() == RegType::kNone);
+  EXPECT(m.indexId() == 0);
+  EXPECT(m.hasOffset() == false);
+  EXPECT(m.isOffset64Bit() == false);
+  EXPECT(m.offset() == 0);
+  EXPECT(m.offsetLo32() == 0);
+  // Setting an index afterwards must be reflected by the index accessors.
+  m.setIndex(rsi);
+  EXPECT(m.hasIndex() == true);
+  EXPECT(m.hasIndexReg() == true);
+  EXPECT(m.indexType() == rsi.type());
+  EXPECT(m.indexId() == rsi.id());
+}
+#endif
+
+ASMJIT_END_SUB_NAMESPACE
+
+#endif // !ASMJIT_NO_X86
--- a/lib/lepton/asmjit/x86/x86operand.h
+++ b/lib/lepton/asmjit/x86/x86operand.h
--- a/lib/lepton/asmjit/x86/x86rapass.cpp
+++ b/lib/lepton/asmjit/x86/x86rapass.cpp
--- a/lib/lepton/asmjit/x86/x86rapass_p.h
+++ b/lib/lepton/asmjit/x86/x86rapass_p.h
@ -0,0 +1,94 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See asmjit.h or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#ifndef ASMJIT_X86_X86RAPASS_P_H_INCLUDED
+#define ASMJIT_X86_X86RAPASS_P_H_INCLUDED
+
+#include "../core/api-config.h"
+#ifndef ASMJIT_NO_COMPILER
+
+#include "../core/compiler.h"
+#include "../core/rabuilders_p.h"
+#include "../core/rapass_p.h"
+#include "../x86/x86assembler.h"
+#include "../x86/x86compiler.h"
+#include "../x86/x86emithelper_p.h"
+
+ASMJIT_BEGIN_SUB_NAMESPACE(x86)
+
+//! \cond INTERNAL
+//! \addtogroup asmjit_x86
+//! \{
+
+//! X86 register allocation pass.
+//!
+//! Takes care of generating function prologs and epilogs, and also performs register allocation.
+class X86RAPass : public BaseRAPass {
+public:
+  ASMJIT_NONCOPYABLE(X86RAPass)
+  using Base = BaseRAPass;
+
+  //! Emit helper owned by this pass; also the source of the AVX / AVX-512 flags exposed below.
+  EmitHelper _emitHelper;
+
+  //! \name Construction & Destruction
+  //! \{
+
+  X86RAPass() noexcept;
+  virtual ~X86RAPass() noexcept;
+
+  //! \}
+
+  //! \name Accessors
+  //! \{
+
+  //! Returns `_cb` casted to `x86::Compiler`.
+  inline Compiler* cc() const noexcept {
+    return static_cast<Compiler*>(_cb);
+  }
+
+  //! Returns a pointer to the emit helper owned by this pass.
+  inline EmitHelper* emitHelper() noexcept {
+    return &_emitHelper;
+  }
+
+  //! Returns the `_avxEnabled` flag of the emit helper.
+  inline bool avxEnabled() const noexcept {
+    return _emitHelper._avxEnabled;
+  }
+
+  //! Returns the `_avx512Enabled` flag of the emit helper.
+  inline bool avx512Enabled() const noexcept {
+    return _emitHelper._avx512Enabled;
+  }
+
+  //! \}
+
+  //! \name Utilities
+  //! \{
+
+  //! Selects between two instruction ids: `avxInstId` when AVX is enabled, `sseInstId` otherwise.
+  inline uint32_t choose(uint32_t sseInstId, uint32_t avxInstId) noexcept {
+    if (avxEnabled())
+      return avxInstId;
+    return sseInstId;
+  }
+
+  //! \}
+
+  //! \name Interface
+  //! \{
+
+  void onInit() noexcept override;
+  void onDone() noexcept override;
+
+  Error buildCFG() noexcept override;
+
+  Error _rewrite(BaseNode* first, BaseNode* stop) noexcept override;
+
+  Error emitMove(uint32_t workId, uint32_t dstPhysId, uint32_t srcPhysId) noexcept override;
+  Error emitSwap(uint32_t aWorkId, uint32_t aPhysId, uint32_t bWorkId, uint32_t bPhysId) noexcept override;
+
+  Error emitLoad(uint32_t workId, uint32_t dstPhysId) noexcept override;
+  Error emitSave(uint32_t workId, uint32_t srcPhysId) noexcept override;
+
+  Error emitJump(const Label& label) noexcept override;
+  Error emitPreCall(InvokeNode* invokeNode) noexcept override;
+
+  //! \}
+};
+
+//! \}
+//! \endcond
+
+ASMJIT_END_SUB_NAMESPACE
+
+#endif // !ASMJIT_NO_COMPILER
+#endif // ASMJIT_X86_X86RAPASS_P_H_INCLUDED