-
Notifications
You must be signed in to change notification settings - Fork 4.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Speed up GVMLookupForSlot
#85901
Speed up GVMLookupForSlot
#85901
Conversation
BasicMinimalApi spends about 1% of samples in this method.
Tagging subscribers to this area: @agocke, @MichalStrehovsky, @jkotas
Issue Details
BasicMinimalApi spends about 1% of samples in this method. It's the generic virtual method dispatch routine.
Before
00007FF6DB12BCE0 57 push rdi
00007FF6DB12BCE1 56 push rsi
00007FF6DB12BCE2 55 push rbp
00007FF6DB12BCE3 53 push rbx
00007FF6DB12BCE4 48 83 EC 28 sub rsp,28h
00007FF6DB12BCE8 48 8B 05 79 91 12 00 mov rax,qword ptr [S_P_CoreLib_System_Runtime_TypeLoaderExports::__GCSTATICS (07FF6DB254E68h)]
00007FF6DB12BCEF 48 8B 40 08 mov rax,qword ptr [rax+8]
00007FF6DB12BCF3 48 8B 31 mov rsi,qword ptr [rcx]
00007FF6DB12BCF6 48 8B CE mov rcx,rsi
00007FF6DB12BCF9 4C 8B C2 mov r8,rdx
00007FF6DB12BCFC 4C 8B C9 mov r9,rcx
00007FF6DB12BCFF 49 C1 F9 20 sar r9,20h
00007FF6DB12BD03 44 33 C9 xor r9d,ecx
00007FF6DB12BD06 41 C1 F9 04 sar r9d,4
00007FF6DB12BD0A 44 33 CA xor r9d,edx
00007FF6DB12BD0D 49 C1 F8 20 sar r8,20h
00007FF6DB12BD11 45 33 C1 xor r8d,r9d
00007FF6DB12BD14 44 8B 48 08 mov r9d,dword ptr [rax+8]
00007FF6DB12BD18 45 8D 51 FF lea r10d,[r9-1]
00007FF6DB12BD1C 45 23 C2 and r8d,r10d
00007FF6DB12BD1F 45 3B C1 cmp r8d,r9d
00007FF6DB12BD22 0F 83 8A 00 00 00 jae S_P_CoreLib_System_Runtime_TypeLoaderExports__GVMLookupForSlot+0D2h (07FF6DB12BDB2h)
00007FF6DB12BD28 4A 8B 44 C0 10 mov rax,qword ptr [rax+r8*8+10h]
00007FF6DB12BD2D 48 85 C0 test rax,rax
00007FF6DB12BD30 74 23 je S_P_CoreLib_System_Runtime_TypeLoaderExports__GVMLookupForSlot+75h (07FF6DB12BD55h)
00007FF6DB12BD32 0F 1F 80 00 00 00 00 nop dword ptr [rax]
00007FF6DB12BD39 0F 1F 80 00 00 00 00 nop dword ptr [rax]
00007FF6DB12BD40 48 39 48 10 cmp qword ptr [rax+10h],rcx
00007FF6DB12BD44 75 06 jne S_P_CoreLib_System_Runtime_TypeLoaderExports__GVMLookupForSlot+6Ch (07FF6DB12BD4Ch)
00007FF6DB12BD46 48 39 50 18 cmp qword ptr [rax+18h],rdx
00007FF6DB12BD4A 74 09 je S_P_CoreLib_System_Runtime_TypeLoaderExports__GVMLookupForSlot+75h (07FF6DB12BD55h)
00007FF6DB12BD4C 48 8B 40 08 mov rax,qword ptr [rax+8]
00007FF6DB12BD50 48 85 C0 test rax,rax
00007FF6DB12BD53 75 EB jne S_P_CoreLib_System_Runtime_TypeLoaderExports__GVMLookupForSlot+60h (07FF6DB12BD40h)
114: entry ??= CacheMiss((IntPtr)obj.GetMethodTable(), *(IntPtr*)&slot,
00007FF6DB12BD55 48 85 C0 test rax,rax
00007FF6DB12BD58 75 4B jne S_P_CoreLib_System_Runtime_TypeLoaderExports__GVMLookupForSlot+0C5h (07FF6DB12BDA5h)
00007FF6DB12BD5A 48 8B 3D DF 91 12 00 mov rdi,qword ptr [S_P_CoreLib_System_Runtime_TypeLoaderExports___c::__GCSTATICS (07FF6DB254F40h)]
00007FF6DB12BD61 4C 8B 47 18 mov r8,qword ptr [rdi+18h]
00007FF6DB12BD65 48 8B DA mov rbx,rdx
00007FF6DB12BD68 4D 85 C0 test r8,r8
00007FF6DB12BD6B 75 2A jne S_P_CoreLib_System_Runtime_TypeLoaderExports__GVMLookupForSlot+0B7h (07FF6DB12BD97h)
00007FF6DB12BD6D 48 8D 0D 44 60 0B 00 lea rcx,[S_P_CoreLib_System_Runtime_RuntimeObjectFactory::`vftable' (07FF6DB1E1DB8h)]
00007FF6DB12BD74 E8 27 71 F5 FF call RhpNewFast (07FF6DB082EA0h)
00007FF6DB12BD79 48 8B E8 mov rbp,rax
00007FF6DB12BD7C 48 8B 57 08 mov rdx,qword ptr [rdi+8]
00007FF6DB12BD80 48 8B CD mov rcx,rbp
00007FF6DB12BD83 E8 D7 60 F5 FF call __DelegateCtor_S_P_CoreLib_System_Delegate__InitializeClosedInstance__S_P_CoreLib_System_Runtime_TypeLoaderExports___c___GVMLookupForSlot_b__12_0 (07FF6DB081E5Fh)
00007FF6DB12BD88 48 8D 4F 18 lea rcx,[rdi+18h]
00007FF6DB12BD8C 48 8B D5 mov rdx,rbp
00007FF6DB12BD8F E8 1C 73 F5 FF call RhpAssignRefAVLocation (07FF6DB0830B0h)
00007FF6DB12BD94 4C 8B C5 mov r8,rbp
00007FF6DB12BD97 48 8B CE mov rcx,rsi
00007FF6DB12BD9A 48 8B D3 mov rdx,rbx
00007FF6DB12BD9D 45 33 C9 xor r9d,r9d
00007FF6DB12BDA0 E8 1B 00 00 00 call S_P_CoreLib_System_Runtime_TypeLoaderExports__CacheMiss_0 (07FF6DB12BDC0h)
115: (IntPtr context, IntPtr signature, object contextObject, ref IntPtr auxResult)
116: => RuntimeAugments.TypeLoaderCallbacks.ResolveGenericVirtualMethodTarget(new RuntimeTypeHandle(new EETypePtr(context)), *(RuntimeMethodHandle*)&signature));
117: return entry.Result;
00007FF6DB12BDA5 48 8B 40 20 mov rax,qword ptr [rax+20h]
00007FF6DB12BDA9 48 83 C4 28 add rsp,28h
00007FF6DB12BDAD 5B pop rbx
00007FF6DB12BDAE 5D pop rbp
00007FF6DB12BDAF 5E pop rsi
00007FF6DB12BDB0 5F pop rdi
00007FF6DB12BDB1 C3 ret
00007FF6DB12BDB2 E8 49 F4 00 00 call S_P_CoreLib_Internal_Runtime_CompilerHelpers_ThrowHelpers__ThrowIndexOutOfRangeException (07FF6DB13B200h)
00007FF6DB12BDB7 CC int 3
After
00007FF6A5C95A40 48 83 EC 28 sub rsp,28h
00007FF6A5C95A44 48 8B 05 ED F3 10 00 mov rax,qword ptr [S_P_CoreLib_System_Runtime_TypeLoaderExports::__GCSTATICS (07FF6A5DA4E38h)]
00007FF6A5C95A4B 48 8B 40 08 mov rax,qword ptr [rax+8]
00007FF6A5C95A4F 4C 8B 01 mov r8,qword ptr [rcx]
00007FF6A5C95A52 4D 8B C8 mov r9,r8
00007FF6A5C95A55 4C 8B D2 mov r10,rdx
00007FF6A5C95A58 4D 8B D9 mov r11,r9
00007FF6A5C95A5B 49 C1 FB 20 sar r11,20h
00007FF6A5C95A5F 45 33 C3 xor r8d,r11d
00007FF6A5C95A62 41 C1 F8 04 sar r8d,4
00007FF6A5C95A66 44 33 C2 xor r8d,edx
00007FF6A5C95A69 49 C1 FA 20 sar r10,20h
00007FF6A5C95A6D 45 33 C2 xor r8d,r10d
00007FF6A5C95A70 44 8B 50 08 mov r10d,dword ptr [rax+8]
00007FF6A5C95A74 41 FF CA dec r10d
00007FF6A5C95A77 45 23 C2 and r8d,r10d
00007FF6A5C95A7A 4D 63 C0 movsxd r8,r8d
00007FF6A5C95A7D 4A 8B 44 C0 10 mov rax,qword ptr [rax+r8*8+10h]
00007FF6A5C95A82 48 85 C0 test rax,rax
00007FF6A5C95A85 74 15 je S_P_CoreLib_System_Runtime_TypeLoaderExports__GVMLookupForSlot+5Ch (07FF6A5C95A9Ch)
00007FF6A5C95A87 4C 39 48 10 cmp qword ptr [rax+10h],r9
00007FF6A5C95A8B 75 06 jne S_P_CoreLib_System_Runtime_TypeLoaderExports__GVMLookupForSlot+53h (07FF6A5C95A93h)
00007FF6A5C95A8D 48 39 50 18 cmp qword ptr [rax+18h],rdx
00007FF6A5C95A91 74 09 je S_P_CoreLib_System_Runtime_TypeLoaderExports__GVMLookupForSlot+5Ch (07FF6A5C95A9Ch)
00007FF6A5C95A93 48 8B 40 08 mov rax,qword ptr [rax+8]
00007FF6A5C95A97 48 85 C0 test rax,rax
00007FF6A5C95A9A 75 EB jne S_P_CoreLib_System_Runtime_TypeLoaderExports__GVMLookupForSlot+47h (07FF6A5C95A87h)
114: if (entry != null)
00007FF6A5C95A9C 48 85 C0 test rax,rax
00007FF6A5C95A9F 74 09 je S_P_CoreLib_System_Runtime_TypeLoaderExports__GVMLookupForSlot+6Ah (07FF6A5C95AAAh)
115: return entry.Result;
00007FF6A5C95AA1 48 8B 40 20 mov rax,qword ptr [rax+20h]
116:
117: return GVMLookupForSlotSlow(obj, slot);
00007FF6A5C95AA5 48 83 C4 28 add rsp,28h
00007FF6A5C95AA9 C3 ret
00007FF6A5C95AAA E8 11 00 00 00 call S_P_CoreLib_System_Runtime_TypeLoaderExports__GVMLookupForSlotSlow (07FF6A5C95AC0h)
00007FF6A5C95AAF 90 nop
00007FF6A5C95AB0 48 83 C4 28 add rsp,28h
00007FF6A5C95AB4 C3 ret
The slow helper is still not getting tail-called, unfortunately. Still need to work on my "RyuJIT whisperer" skills. Seems to be about 20% faster on a microbenchmark. Cc @dotnet/ilc-contrib
|
@@ -133,7 +142,11 @@ internal static unsafe IntPtr OpenInstanceMethodLookup(IntPtr openResolver, obje | |||
private static Entry LookupInCache(Entry[] cache, IntPtr context, IntPtr signature) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We may consider using a hashtable here similar to the one we use for casting. Both solve the same problem.
cc @VSadov
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, the casting cache might fit better for this scenario.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've logged #85927 to follow up.
That's likely because of the struct field access that in IL looks like it exposes the address of the |
BasicMinimalApi spends about 1% of samples in this method. It's the generic virtual method dispatch routine.
Before
After
The slow helper is still not getting tail called unfortunately. Still need to work on my "RyuJIT whisperer" skills.
Seems to be about 20% faster on a microbenchmark.
Cc @dotnet/ilc-contrib