I just recently upgraded and decided to decompile win64's DLLs to see what makes them tick.
So I started with MSVCRT.DLL in the windows/SystemSxS/AMD64.Microsoft.windows.CPlusPlusRuntime/ directory.
If you want to know how NOT to code in ASM I suggest you decompile this dll when you get a chance.
Here's a gem :|~
.text:000007FF760BA760 ; __int64 __cdecl abs64(__int64)
.text:000007FF760BA760 public _abs64
.text:000007FF760BA760 _abs64 proc near
.text:000007FF760BA760 test rcx, rcx
.text:000007FF760BA763 jns short loc_7FF760BA768
.text:000007FF760BA765 neg rcx
.text:000007FF760BA768
.text:000007FF760BA768 loc_7FF760BA768: ; CODE XREF: _abs64+3j
.text:000007FF760BA768 mov rax, rcx
.text:000007FF760BA76B retn
.text:000007FF760BA76B _abs64 endp
Apparently M$ thinks you need a branch to return the absolute value of an integer.
Anyone else would have probably chosen this method
AND RCX, 7FFFFFFF FFFFFFFFh
MOV RAX, RCX
RETN
Here's how C++ finds the length of a string
.text:000007FF760D2E20 ; size_t __cdecl strlen(const char *)
.text:000007FF760D2E20 public strlen
.text:000007FF760D2E20 strlen proc near
.text:000007FF760D2E20 mov rax, rcx
.text:000007FF760D2E23 neg rcx
.text:000007FF760D2E26 test rax, 7
.text:000007FF760D2E2C jz short loc_7FF760D2E3D
.text:000007FF760D2E2E db 66h
.text:000007FF760D2E2E nop
.text:000007FF760D2E30
.text:000007FF760D2E30 loc_7FF760D2E30: ; CODE XREF: strlen+1Bj
.text:000007FF760D2E30 mov dl, [rax]
.text:000007FF760D2E32 inc rax
.text:000007FF760D2E35 test dl, dl
.text:000007FF760D2E37 jz short loc_7FF760D2E98
.text:000007FF760D2E39 test al, 7
.text:000007FF760D2E3B jnz short loc_7FF760D2E30
.text:000007FF760D2E3D
.text:000007FF760D2E3D loc_7FF760D2E3D: ; CODE XREF: strlen+Cj
.text:000007FF760D2E3D mov r8, 7EFEFEFEFEFEFEFFh
.text:000007FF760D2E47 mov r11, 8101010101010100h
.text:000007FF760D2E51
.text:000007FF760D2E51 loc_7FF760D2E51: ; CODE XREF: strlen+47j
.text:000007FF760D2E51 ; strlen+76j
.text:000007FF760D2E51 mov rdx, [rax]
.text:000007FF760D2E54 mov r9, r8
.text:000007FF760D2E57 add rax, 8
.text:000007FF760D2E5B add r9, rdx
.text:000007FF760D2E5E not rdx
.text:000007FF760D2E61 xor rdx, r9
.text:000007FF760D2E64 and rdx, r11
.text:000007FF760D2E67 jz short loc_7FF760D2E51
.text:000007FF760D2E69 mov rdx, [rax-8]
.text:000007FF760D2E6D test dl, dl
.text:000007FF760D2E6F jz short loc_7FF760D2EC2
.text:000007FF760D2E71 test dh, dh
.text:000007FF760D2E73 jz short loc_7FF760D2EBC
.text:000007FF760D2E75 shr rdx, 10h
.text:000007FF760D2E79 test dl, dl
.text:000007FF760D2E7B jz short loc_7FF760D2EB6
.text:000007FF760D2E7D test dh, dh
.text:000007FF760D2E7F jz short loc_7FF760D2EB0
.text:000007FF760D2E81 shr rdx, 10h
.text:000007FF760D2E85 test dl, dl
.text:000007FF760D2E87 jz short loc_7FF760D2EAA
.text:000007FF760D2E89 test dh, dh
.text:000007FF760D2E8B jz short loc_7FF760D2EA4
.text:000007FF760D2E8D shr edx, 10h
.text:000007FF760D2E90 test dl, dl
.text:000007FF760D2E92 jz short loc_7FF760D2E9E
.text:000007FF760D2E94 test dh, dh
.text:000007FF760D2E96 jnz short loc_7FF760D2E51
.text:000007FF760D2E98
.text:000007FF760D2E98 loc_7FF760D2E98: ; CODE XREF: strlen+17j
.text:000007FF760D2E98 lea rax, [rcx+rax-1]
.text:000007FF760D2E9D retn
.text:000007FF760D2E9E ; ---------------------------------------------------------------------------
.text:000007FF760D2E9E
.text:000007FF760D2E9E loc_7FF760D2E9E: ; CODE XREF: strlen+72j
.text:000007FF760D2E9E lea rax, [rcx+rax-2]
.text:000007FF760D2EA3 retn
.text:000007FF760D2EA4 ; ---------------------------------------------------------------------------
.text:000007FF760D2EA4
.text:000007FF760D2EA4 loc_7FF760D2EA4: ; CODE XREF: strlen+6Bj
.text:000007FF760D2EA4 lea rax, [rcx+rax-3]
.text:000007FF760D2EA9 retn
.text:000007FF760D2EAA ; ---------------------------------------------------------------------------
.text:000007FF760D2EAA
.text:000007FF760D2EAA loc_7FF760D2EAA: ; CODE XREF: strlen+67j
.text:000007FF760D2EAA lea rax, [rcx+rax-4]
.text:000007FF760D2EAF retn
.text:000007FF760D2EB0 ; ---------------------------------------------------------------------------
.text:000007FF760D2EB0
.text:000007FF760D2EB0 loc_7FF760D2EB0: ; CODE XREF: strlen+5Fj
.text:000007FF760D2EB0 lea rax, [rcx+rax-5]
.text:000007FF760D2EB5 retn
.text:000007FF760D2EB6 ; ---------------------------------------------------------------------------
.text:000007FF760D2EB6
.text:000007FF760D2EB6 loc_7FF760D2EB6: ; CODE XREF: strlen+5Bj
.text:000007FF760D2EB6 lea rax, [rcx+rax-6]
.text:000007FF760D2EBB retn
.text:000007FF760D2EBC ; ---------------------------------------------------------------------------
.text:000007FF760D2EBC
.text:000007FF760D2EBC loc_7FF760D2EBC: ; CODE XREF: strlen+53j
.text:000007FF760D2EBC lea rax, [rcx+rax-7]
.text:000007FF760D2EC1 retn
.text:000007FF760D2EC2 ; ---------------------------------------------------------------------------
.text:000007FF760D2EC2
.text:000007FF760D2EC2 loc_7FF760D2EC2: ; CODE XREF: strlen+4Fj
.text:000007FF760D2EC2 lea rax, [rcx+rax-8]
.text:000007FF760D2EC7 retn
.text:000007FF760D2EC7 strlen endp
It's not that bad of a routine. They even used a tidbit from AMD's Optimization manual and aligned the code with the correct 66h and nop sequence.
StrnCpy is less than optimal.
MSVCRT for win64 did get the trig functions right: they are almost entirely copied and pasted from the AMD64 math library.
The sqrtf routine is pretty funny: it runs through about 5 different pointless compares and 15 movs before it gets to SQRTSS xmm0, xmm0.
Hope someone else besides me got a good laugh out of the _abs64 snippet.
So I started with MSVCRT.DLL in the windows/SystemSxS/AMD64.Microsoft.windows.CPlusPlusRuntime/ directory.
If you want to know how NOT to code in ASM I suggest you decompile this dll when you get a chance.
Here's a gem :|~
.text:000007FF760BA760 ; __int64 __cdecl abs64(__int64)
.text:000007FF760BA760 public _abs64
.text:000007FF760BA760 _abs64 proc near
.text:000007FF760BA760 test rcx, rcx
.text:000007FF760BA763 jns short loc_7FF760BA768
.text:000007FF760BA765 neg rcx
.text:000007FF760BA768
.text:000007FF760BA768 loc_7FF760BA768: ; CODE XREF: _abs64+3j
.text:000007FF760BA768 mov rax, rcx
.text:000007FF760BA76B retn
.text:000007FF760BA76B _abs64 endp
Apparently M$ thinks you need a branch to return the absolute value of an integer.
Anyone else would have probably chosen this method
AND RCX, 7FFFFFFF FFFFFFFFh
MOV RAX, RCX
RETN
Here's how C++ finds the length of a string
.text:000007FF760D2E20 ; size_t __cdecl strlen(const char *)
.text:000007FF760D2E20 public strlen
.text:000007FF760D2E20 strlen proc near
.text:000007FF760D2E20 mov rax, rcx
.text:000007FF760D2E23 neg rcx
.text:000007FF760D2E26 test rax, 7
.text:000007FF760D2E2C jz short loc_7FF760D2E3D
.text:000007FF760D2E2E db 66h
.text:000007FF760D2E2E nop
.text:000007FF760D2E30
.text:000007FF760D2E30 loc_7FF760D2E30: ; CODE XREF: strlen+1Bj
.text:000007FF760D2E30 mov dl, [rax]
.text:000007FF760D2E32 inc rax
.text:000007FF760D2E35 test dl, dl
.text:000007FF760D2E37 jz short loc_7FF760D2E98
.text:000007FF760D2E39 test al, 7
.text:000007FF760D2E3B jnz short loc_7FF760D2E30
.text:000007FF760D2E3D
.text:000007FF760D2E3D loc_7FF760D2E3D: ; CODE XREF: strlen+Cj
.text:000007FF760D2E3D mov r8, 7EFEFEFEFEFEFEFFh
.text:000007FF760D2E47 mov r11, 8101010101010100h
.text:000007FF760D2E51
.text:000007FF760D2E51 loc_7FF760D2E51: ; CODE XREF: strlen+47j
.text:000007FF760D2E51 ; strlen+76j
.text:000007FF760D2E51 mov rdx, [rax]
.text:000007FF760D2E54 mov r9, r8
.text:000007FF760D2E57 add rax, 8
.text:000007FF760D2E5B add r9, rdx
.text:000007FF760D2E5E not rdx
.text:000007FF760D2E61 xor rdx, r9
.text:000007FF760D2E64 and rdx, r11
.text:000007FF760D2E67 jz short loc_7FF760D2E51
.text:000007FF760D2E69 mov rdx, [rax-8]
.text:000007FF760D2E6D test dl, dl
.text:000007FF760D2E6F jz short loc_7FF760D2EC2
.text:000007FF760D2E71 test dh, dh
.text:000007FF760D2E73 jz short loc_7FF760D2EBC
.text:000007FF760D2E75 shr rdx, 10h
.text:000007FF760D2E79 test dl, dl
.text:000007FF760D2E7B jz short loc_7FF760D2EB6
.text:000007FF760D2E7D test dh, dh
.text:000007FF760D2E7F jz short loc_7FF760D2EB0
.text:000007FF760D2E81 shr rdx, 10h
.text:000007FF760D2E85 test dl, dl
.text:000007FF760D2E87 jz short loc_7FF760D2EAA
.text:000007FF760D2E89 test dh, dh
.text:000007FF760D2E8B jz short loc_7FF760D2EA4
.text:000007FF760D2E8D shr edx, 10h
.text:000007FF760D2E90 test dl, dl
.text:000007FF760D2E92 jz short loc_7FF760D2E9E
.text:000007FF760D2E94 test dh, dh
.text:000007FF760D2E96 jnz short loc_7FF760D2E51
.text:000007FF760D2E98
.text:000007FF760D2E98 loc_7FF760D2E98: ; CODE XREF: strlen+17j
.text:000007FF760D2E98 lea rax, [rcx+rax-1]
.text:000007FF760D2E9D retn
.text:000007FF760D2E9E ; ---------------------------------------------------------------------------
.text:000007FF760D2E9E
.text:000007FF760D2E9E loc_7FF760D2E9E: ; CODE XREF: strlen+72j
.text:000007FF760D2E9E lea rax, [rcx+rax-2]
.text:000007FF760D2EA3 retn
.text:000007FF760D2EA4 ; ---------------------------------------------------------------------------
.text:000007FF760D2EA4
.text:000007FF760D2EA4 loc_7FF760D2EA4: ; CODE XREF: strlen+6Bj
.text:000007FF760D2EA4 lea rax, [rcx+rax-3]
.text:000007FF760D2EA9 retn
.text:000007FF760D2EAA ; ---------------------------------------------------------------------------
.text:000007FF760D2EAA
.text:000007FF760D2EAA loc_7FF760D2EAA: ; CODE XREF: strlen+67j
.text:000007FF760D2EAA lea rax, [rcx+rax-4]
.text:000007FF760D2EAF retn
.text:000007FF760D2EB0 ; ---------------------------------------------------------------------------
.text:000007FF760D2EB0
.text:000007FF760D2EB0 loc_7FF760D2EB0: ; CODE XREF: strlen+5Fj
.text:000007FF760D2EB0 lea rax, [rcx+rax-5]
.text:000007FF760D2EB5 retn
.text:000007FF760D2EB6 ; ---------------------------------------------------------------------------
.text:000007FF760D2EB6
.text:000007FF760D2EB6 loc_7FF760D2EB6: ; CODE XREF: strlen+5Bj
.text:000007FF760D2EB6 lea rax, [rcx+rax-6]
.text:000007FF760D2EBB retn
.text:000007FF760D2EBC ; ---------------------------------------------------------------------------
.text:000007FF760D2EBC
.text:000007FF760D2EBC loc_7FF760D2EBC: ; CODE XREF: strlen+53j
.text:000007FF760D2EBC lea rax, [rcx+rax-7]
.text:000007FF760D2EC1 retn
.text:000007FF760D2EC2 ; ---------------------------------------------------------------------------
.text:000007FF760D2EC2
.text:000007FF760D2EC2 loc_7FF760D2EC2: ; CODE XREF: strlen+4Fj
.text:000007FF760D2EC2 lea rax, [rcx+rax-8]
.text:000007FF760D2EC7 retn
.text:000007FF760D2EC7 strlen endp
It's not that bad of a routine. They even used a tidbit from AMD's Optimization manual and aligned the code with the correct 66h and nop sequence.
StrnCpy is less than optimal.
MSVCRT for win64 did get the trig functions right: they are almost entirely copied and pasted from the AMD64 math library.
The sqrtf routine is pretty funny: it runs through about 5 different pointless compares and 15 movs before it gets to SQRTSS xmm0, xmm0.
Hope someone else besides me got a good laugh out of the _abs64 snippet.
Generally the MSVCRT DLLs don't seem to have been compiled with very aggressive optimizations turned on... also, abs() is probably implemented in C code, meant to be compiled both for 32 and 64 bit - so bitmasking tricks can't be directly used. This could of course be implemented with #ifdefs, or the code sequence could be added as a peephole optimization. Who does a function call when they need a fast abs(), though? :) (VC++ has intrinsics for abs(), btw).
Btw, *are* the compares in sqrtf pointless, or are they needed to handle different precision settings?
Btw, *are* the compares in sqrtf pointless, or are they needed to handle different precision settings?
AND RCX, 7FFFFFFF FFFFFFFFh
MOV RAX, RCX
RETN
Is this for ABS? In an ABS function, 1 and -1 should both return 1, right? Running your code with 1 (00...0001b) I get 1 (good), but with -1 (11111...111b) I get 2^63 - 1 (011111...1111b), so let Microsoft keep using their code...
he probably meant this code for a floating point value.
If MS meant it to be for a floating point value, then the neg rcx would produce the wrong result!!
I think r22 got ahead of himself with this one.
Raymond
I think r22 got ahead of himself with this one.
Raymond
MS wanted it for fixed value. r22 wanted it for floating point value :P
Maybe r22 is looking for fabs when he stumbled across abs. :P
Yes I was thinking float when it was clearly an integer
The _abs family had 2 functions, one for int64 and one for int32, written in exactly the same coding style.
To SAVE my precious rant... using CMOV is faster for amd64 than branching
test rcx,rcx
jns .noS
neg rcx
.noS:
mov rax,rcx
RUNS SLOWER THAN
(here comes the code I meant to write for int64 _abs(_int64))
mov rax,rcx ;store in return reg
neg rcx ;set the sign flag
cmovns rax,rcx ;mov if no sign flag
Using CMOV runs ~1.6x faster on my x2 3800+
If you don't believe me make sure you use random values when benchmarking to take branch prediction out of it.
Damage control complete.
MSVCRT uses CMOVxx in the SplitPath function why can't they use it in ABS ????? :D
The _abs family had 2 functions, one for int64 and one for int32, written in exactly the same coding style.
To SAVE my precious rant... using CMOV is faster for amd64 than branching
test rcx,rcx
jns .noS
neg rcx
.noS:
mov rax,rcx
RUNS SLOWER THAN
(here comes the code I meant to write for int64 _abs(_int64))
mov rax,rcx ;store in return reg
neg rcx ;set the sign flag
cmovns rax,rcx ;mov if no sign flag
Using CMOV runs ~1.6x faster on my x2 3800+
If you don't believe me make sure you use random values when benchmarking to take branch prediction out of it.
Damage control complete.
MSVCRT uses CMOVxx in the SplitPath function why can't they use it in ABS ????? :D