我想要一个非常高分辨率的计时器用于我的 C# 应用程序。我想访问 RDTSC 汇编指令。有办法做到这一点吗?
编辑:我正在移植一些 C++ 代码并试图保留与原始代码相同的功能。我可能会切换到更 .NET 的东西,但想要评估 RDTSC 指令,以便我可以将结果与原始结果进行比较。
最佳答案
这里是你如何做到的:
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Runtime.InteropServices;
public static class Rdtsc
{
[StructLayout(LayoutKind.Sequential)]
private struct SystemInfo
{
public ushort wProcessorArchitecture;
public ushort wReserved;
public uint dwPageSize;
public IntPtr lpMinimumApplicationAddress;
public IntPtr lpMaximumApplicationAddress;
public IntPtr dwActiveProcessorMask;
public uint dwNumberOfProcessors;
public uint dwProcessorType;
public uint dwAllocationGranularity;
public ushort wProcessorLevel;
public ushort wProcessorRevision;
}
[DllImport("kernel32.dll", ExactSpelling = true)]
private static extern void GetNativeSystemInfo(out SystemInfo lpSystemInfo);
[DllImport("kernel32.dll", ExactSpelling = true, SetLastError = true)]
private static extern IntPtr VirtualAlloc(IntPtr lpAddress, IntPtr dwSize, uint flAllocationType, uint flProtect);
[DllImport("kernel32.dll", ExactSpelling = true, SetLastError = true)]
[return: MarshalAs(UnmanagedType.Bool)]
private static extern bool VirtualProtect(IntPtr lpAddress, IntPtr dwSize, uint flAllocationType, out uint lpflOldProtect);
[DllImport("kernel32.dll", ExactSpelling = true, SetLastError = true)]
[return: MarshalAs(UnmanagedType.Bool)]
private static extern bool VirtualFree(IntPtr lpAddress, IntPtr dwSize, uint dwFreeType);
private const uint PAGE_READWRITE = 0x04;
private const uint PAGE_EXECUTE = 0x10;
private const uint MEM_COMMIT = 0x1000;
private const uint MEM_RELEASE = 0x8000;
[SuppressUnmanagedCodeSecurity]
[UnmanagedFunctionPointer(CallingConvention.StdCall)]
public delegate ulong TimestampDelegate();
public static readonly TimestampDelegate Timestamp;
static Rdtsc()
{
SystemInfo systemInfo;
GetNativeSystemInfo(out systemInfo);
if (systemInfo.wProcessorArchitecture != 0 /* PROCESSOR_ARCHITECTURE_INTEL */ &&
systemInfo.wProcessorArchitecture != 9 /* PROCESSOR_ARCHITECTURE_AMD64 */)
{
// Fallback for ARM/IA64/...
Timestamp = StopwatchGetTimestamp;
return;
}
byte[] body;
if (Environment.Is64BitProcess)
{
body = new byte[]
{
0x0f, 0x31, // rdtsc
0x48, 0xc1, 0xe2, 0x20, // shl rdx,20h
0x48, 0x0b, 0xc2, // or rax,rdx
0xc3, // ret
};
}
else
{
body = new byte[]
{
0x0f, 0x31, // rdtsc
0xc3, // ret
};
}
IntPtr buf = IntPtr.Zero;
try
{
// We VirtualAlloc body.Length bytes, with R/W access
// Note that from what I've read, MEM_RESERVE is useless
// if the first parameter is IntPtr.Zero
buf = VirtualAlloc(IntPtr.Zero, (IntPtr)body.Length, MEM_COMMIT, PAGE_READWRITE);
if (buf == IntPtr.Zero)
{
throw new Win32Exception();
}
// Copy our instructions in the buf
Marshal.Copy(body, 0, buf, body.Length);
// Change the access of the allocated memory from R/W to Execute
uint oldProtection;
bool result = VirtualProtect(buf, (IntPtr)body.Length, PAGE_EXECUTE, out oldProtection);
if (!result)
{
throw new Win32Exception();
}
// Create a delegate to the "function"
Timestamp = (TimestampDelegate)Marshal.GetDelegateForFunctionPointer(buf, typeof(TimestampDelegate));
buf = IntPtr.Zero;
}
finally
{
// There was an error!
if (buf != IntPtr.Zero)
{
// Free the allocated memory
bool result = VirtualFree(buf, IntPtr.Zero, MEM_RELEASE);
if (!result)
{
throw new Win32Exception();
}
}
}
}
// Fallback if rdtsc isn't available
private static ulong StopwatchGetTimestamp()
{
return unchecked((ulong)Stopwatch.GetTimestamp());
}
}
一些注意事项:
- 我已经包含了 ARM 处理器的回退 (
Stopwatch.GetTimestamp()
)。 - 我没有使用 CPUID 来阻止 RDTSC 指令被移动 ("cpuid" before "rdtsc")。我不擅长组装,所以我不知道该怎么做。如果您想修改代码,请随意修改并在要使用的“正确”操作码上添加评论
- 我使用的是 RDTSC 而不是 RDTSCP(同样的问题,汇编不是我的语言)
- 它非常慢...假设等效的 Visual C++ 代码在 RDTSC 的 18-24 个滴答中被调用而没有内联,而 C# 版本在初始预热后被调用27-100 个 RDTSC 刻度。
Visual C++ 比较代码:
__declspec(noinline) uint64_t __stdcall Rdtsc(void)
{
return __rdtsc();
}
Aaaa 现在...使用 rdtscp 完全实现
public static class Rdtsc
{
[StructLayout(LayoutKind.Sequential)]
private struct SystemInfo
{
public ushort wProcessorArchitecture;
public ushort wReserved;
public uint dwPageSize;
public IntPtr lpMinimumApplicationAddress;
public IntPtr lpMaximumApplicationAddress;
public IntPtr dwActiveProcessorMask;
public uint dwNumberOfProcessors;
public uint dwProcessorType;
public uint dwAllocationGranularity;
public ushort wProcessorLevel;
public ushort wProcessorRevision;
}
[DllImport("kernel32.dll", ExactSpelling = true)]
private static extern void GetNativeSystemInfo(out SystemInfo lpSystemInfo);
[DllImport("kernel32.dll", ExactSpelling = true, SetLastError = true)]
private static extern IntPtr VirtualAlloc(IntPtr lpAddress, IntPtr dwSize, uint flAllocationType, uint flProtect);
[DllImport("kernel32.dll", ExactSpelling = true, SetLastError = true)]
[return: MarshalAs(UnmanagedType.Bool)]
private static extern bool VirtualProtect(IntPtr lpAddress, IntPtr dwSize, uint flAllocationType, out uint lpflOldProtect);
[DllImport("kernel32.dll", ExactSpelling = true, SetLastError = true)]
[return: MarshalAs(UnmanagedType.Bool)]
private static extern bool VirtualFree(IntPtr lpAddress, IntPtr dwSize, uint dwFreeType);
private const uint PAGE_READWRITE = 0x04;
private const uint PAGE_EXECUTE = 0x10;
private const uint PAGE_EXECUTE_READWRITE = 0x40;
private const uint MEM_COMMIT = 0x1000;
private const uint MEM_RELEASE = 0x8000;
[SuppressUnmanagedCodeSecurity]
[UnmanagedFunctionPointer(CallingConvention.StdCall)]
public delegate ulong FuncUInt64();
/// <summary>
/// Uses rdtsc. On non-Intel uses Stopwatch.GetTimestamp.
/// </summary>
public static readonly FuncUInt64 Timestamp;
/// <summary>
/// Uses rdtscp if present. Otherwise uses cpuid + rdtsc. On
/// non-Intel uses Stopwatch.GetTimestamp.
/// </summary>
public static readonly FuncUInt64 TimestampP;
public static readonly bool IsRdtscSupported;
public static readonly bool IsRdtscPSupported;
static Rdtsc()
{
SystemInfo systemInfo;
GetNativeSystemInfo(out systemInfo);
if (systemInfo.wProcessorArchitecture != 0 /* PROCESSOR_ARCHITECTURE_INTEL */ &&
systemInfo.wProcessorArchitecture != 9 /* PROCESSOR_ARCHITECTURE_AMD64 */)
{
// Fallback for ARM/IA64/...
Timestamp = StopwatchGetTimestamp;
TimestampP = StopwatchGetTimestamp;
IsRdtscSupported = false;
IsRdtscPSupported = false;
return;
}
byte[] cpuid, rdtsc, rdtscp, rdtsccpuid;
IsRdtscSupported = true;
// Assembly generated with https://defuse.ca/online-x86-assembler.htm
if (Environment.Is64BitProcess)
{
/* CPUID x64:
push rbx;
mov eax, 0x80000000;
cpuid;
mov ebx, 0x80000001;
cmp eax, ebx;
jb Error;
mov eax, ebx;
cpuid;
mov eax, ecx;
shl rax, 0x20;
or rax, rdx
jmp End;
Error:
xor rax, rax;
End:
pop rbx;
ret;
0: 53 push rbx
1: b8 00 00 00 80 mov eax,0x80000000
6: 0f a2 cpuid
8: bb 01 00 00 80 mov ebx,0x80000001
d: 39 d8 cmp eax,ebx
f: 72 0f jb 20 <Error>
11: 89 d8 mov eax,ebx
13: 0f a2 cpuid
15: 89 c8 mov eax,ecx
17: 48 c1 e0 20 shl rax,0x20
1b: 48 09 d0 or rax,rdx
1e: eb 03 jmp 23 <End>
0000000000000020 <Error>:
20: 48 31 c0 xor rax,rax
0000000000000023 <End>:
23: 5b pop rbx
24: c3 ret
*/
cpuid = new byte[] { 0x53, 0xB8, 0x00, 0x00, 0x00, 0x80, 0x0F, 0xA2, 0xBB, 0x01, 0x00, 0x00, 0x80, 0x39, 0xD8, 0x72, 0x16, 0x89, 0xD8, 0x48, 0xC7, 0xC2, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0xA2, 0x89, 0xC8, 0x48, 0xC1, 0xE0, 0x20, 0x48, 0x09, 0xD0, 0xEB, 0x03, 0x48, 0x31, 0xC0, 0x5B, 0xC3 };
/* RDTSC x64:
rdtsc;
shl rdx, 0x20;
or rax,rdx;
ret;
0: 0f 31 rdtsc
2: 48 c1 e2 20 shl rdx,0x20
6: 48 09 d0 or rax,rdx
9: c3 ret
*/
rdtsc = new byte[] { 0x0F, 0x31, 0x48, 0xC1, 0xE2, 0x20, 0x48, 0x09, 0xD0, 0xC3 };
/* RDTSCP x64
rdtscp;
shl rdx, 0x20;
or rax, rdx;
ret;
0: 0f 01 f9 rdtscp
3: 48 c1 e2 20 shl rdx,0x20
7: 48 09 d0 or rax,rdx
a: c3 ret
*/
rdtscp = new byte[] { 0x0F, 0x01, 0xF9, 0x48, 0xC1, 0xE2, 0x20, 0x48, 0x09, 0xD0, 0xC3 };
/* RDTSC + CPUID x64
push rbx;
xor eax, eax;
cpuid;
rdtsc;
shl rdx, 0x20;
or rax, rdx;
pop rbx;
ret;
0: 53 push rbx
1: 31 c0 xor eax,eax
3: 0f a2 cpuid
5: 0f 31 rdtsc
7: 48 c1 e2 20 shl rdx,0x20
b: 48 09 d0 or rax,rdx
e: 5b pop rbx
f: c3 ret
*/
rdtsccpuid = new byte[] { 0x53, 0x31, 0xC0, 0x0F, 0xA2, 0x0F, 0x31, 0x48, 0xC1, 0xE2, 0x20, 0x48, 0x09, 0xD0, 0x5B, 0xC3 };
}
else
{
/* CPUID x86:
push ebx;
mov eax, 0x80000000;
cpuid;
mov ebx, 0x80000001;
cmp eax, ebx;
jb Error;
mov eax, ebx;
cpuid;
mov eax, edx;
mov edx, ecx;
jmp End;
Error:
xor eax, eax;
xor edx, edx;
End:
pop ebx;
ret;
0: 53 push ebx
1: b8 00 00 00 80 mov eax,0x80000000
6: 0f a2 cpuid
8: bb 01 00 00 80 mov ebx,0x80000001
d: 39 d8 cmp eax,ebx
f: 72 0a jb 1b <Error>
11: 89 d8 mov eax,ebx
13: 0f a2 cpuid
15: 89 d0 mov eax,edx
17: 89 ca mov edx,ecx
19: eb 04 jmp 1f <End>
0000001b <Error>:
1b: 31 c0 xor eax,eax
1d: 31 d2 xor edx,edx
0000001f <End>:
1f: 5b pop ebx
20: c3 ret
*/
cpuid = new byte[] { 0x53, 0xB8, 0x00, 0x00, 0x00, 0x80, 0x0F, 0xA2, 0xBB, 0x01, 0x00, 0x00, 0x80, 0x39, 0xD8, 0x72, 0x0A, 0x89, 0xD8, 0x0F, 0xA2, 0x89, 0xD0, 0x89, 0xCA, 0xEB, 0x04, 0x31, 0xC0, 0x31, 0xD2, 0x5B, 0xC3 };
/* RDTSC x86:
rdtsc;
ret;
0: 0f 31 rdtsc
2: c3 ret
*/
rdtsc = new byte[] { 0x0F, 0x31, 0xC3 };
/* RDTSCP x86
rdtscp;
ret;
0: 0f 01 f9 rdtscp
3: c3 ret
*/
rdtscp = new byte[] { 0x0F, 0x01, 0xF9, 0xC3 };
/* RDTSC + CPUID x86
push ebx;
xor eax,eax;
cpuid;
rdtsc;
pop ebx;
ret;
0: 53 push ebx
1: 31 c0 xor eax,eax
3: 0f a2 cpuid
5: 0f 31 rdtsc
7: 5b pop ebx
8: c3 ret
*/
rdtsccpuid = new byte[] { 0x53, 0x31, 0xC0, 0x0F, 0xA2, 0x0F, 0x31, 0x5B, 0xC3 };
}
IntPtr buf = IntPtr.Zero;
try
{
// We pad the functions to 64 bytes (the length of a cache
// line on the Intel processors)
int cpuidLength = (cpuid.Length & 63) != 0 ? (cpuid.Length | 63) + 1 : cpuid.Length;
int rdtscLength = (rdtsc.Length & 63) != 0 ? (rdtsc.Length | 63) + 1 : rdtsc.Length;
int rdtscpLength = (rdtscp.Length & 63) != 0 ? (rdtscp.Length | 63) + 1 : rdtscp.Length;
int rdtsccpuidLength = (rdtsccpuid.Length & 63) != 0 ? (rdtsccpuid.Length | 63) + 1 : rdtsccpuid.Length;
// We don't know which one of rdtscp or rdtsccpuid we will
// use, so we calculate space for the biggest one.
// Note that it is very unlikely that we will go over 4096
// bytes (the minimum size of memory allocated by
// VirtualAlloc)
int totalLength = cpuidLength + rdtscLength + Math.Max(rdtscpLength, rdtsccpuidLength);
// We VirtualAlloc totalLength bytes, with R/W access
// Note that from what I've read, MEM_RESERVE is useless
// if the first parameter is IntPtr.Zero
buf = VirtualAlloc(IntPtr.Zero, (IntPtr)totalLength, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
if (buf == IntPtr.Zero)
{
throw new Win32Exception();
}
// Copy cpuid instructions in the buf
Marshal.Copy(cpuid, 0, buf, cpuid.Length);
for (int i = cpuid.Length; i < cpuidLength; i++)
{
Marshal.WriteByte(buf, i, 0x90); // nop
}
// Copy rdtsc instructions in the buf
Marshal.Copy(rdtsc, 0, buf + cpuidLength, rdtsc.Length);
for (int i = rdtsc.Length; i < rdtscLength; i++)
{
Marshal.WriteByte(buf, cpuidLength + i, 0x90); // nop
}
var cpuidFunc = (FuncUInt64)Marshal.GetDelegateForFunctionPointer(buf, typeof(FuncUInt64));
// We use cpuid, EAX=0x80000001 to check for the rdtscp
ulong supportedFeatures = cpuidFunc();
byte[] rdtscpSelected;
int rdtscpSelectedLength;
// Check the rdtscp flag
if ((supportedFeatures & (1L << 27)) != 0)
{
// rdtscp supported
rdtscpSelected = rdtscp;
rdtscpSelectedLength = rdtscpLength;
IsRdtscPSupported = true;
}
else
{
// rdtscp not supported. We use cpuid + rdtsc
rdtscpSelected = rdtsccpuid;
rdtscpSelectedLength = rdtsccpuidLength;
IsRdtscPSupported = false;
}
// Copy rdtscp/rdtsccpuid instructions in the buf
Marshal.Copy(rdtscpSelected, 0, buf + cpuidLength + rdtscLength, rdtscpSelected.Length);
for (int i = rdtscpSelected.Length; i < rdtscpSelectedLength; i++)
{
Marshal.WriteByte(buf, cpuidLength + rdtscLength + i, 0x90); // nop
}
// Change the access of the allocated memory from R/W to Execute
uint oldProtection;
bool result = VirtualProtect(buf, (IntPtr)totalLength, PAGE_EXECUTE, out oldProtection);
if (!result)
{
throw new Win32Exception();
}
// Create a delegate to the "function"
Timestamp = (FuncUInt64)Marshal.GetDelegateForFunctionPointer(buf + cpuidLength, typeof(FuncUInt64));
TimestampP = (FuncUInt64)Marshal.GetDelegateForFunctionPointer(buf + cpuidLength + rdtscLength, typeof(FuncUInt64));
buf = IntPtr.Zero;
}
finally
{
// There was an error!
if (buf != IntPtr.Zero)
{
// Free the allocated memory
bool result = VirtualFree(buf, IntPtr.Zero, MEM_RELEASE);
if (!result)
{
throw new Win32Exception();
}
}
}
}
// Fallback if rdtsc isn't available. We can't use directly
// Stopwatch.GetTimestamp() because the return type is different.
private static ulong StopwatchGetTimestamp()
{
return unchecked((ulong)Stopwatch.GetTimestamp());
}
}
要长很多...有两种方法,
ulong ts1 = Rdtsc.Timestamp();
ulong ts2 = Rdtsc.TimestampP();
第一个使用rdtsc
,第二个使用rdtscp
。 rdtscp
优于 rdtsc
,因为它不会在管道中重新排序。 TimestampP
方法一有一个针对旧处理器的回退,使用 cpuid + rdtsc
,但回退速度相当慢。对于两者,使用 Stopwatch.GetTimestamp()
的非 Intel/Amd 处理器都有后备。该类在内部使用 cpuid
指令来检查是否存在 rdtscp
指令。有两个字段,IsRdtscSupported
和 IsRdtscPSupported
指示处理器是否支持 rdtsc
和 rdtscp
。
关于c# - 有没有办法从 C# 调用 RDTSC 汇编指令?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/29642816/