diff options
author | Denis Vlasenko <vda.linux@googlemail.com> | 2008-04-15 08:27:24 +0000 |
---|---|---|
committer | Denis Vlasenko <vda.linux@googlemail.com> | 2008-04-15 08:27:24 +0000 |
commit | df7958a9606a342e3c3ac5a40fc41f3a79669d62 (patch) | |
tree | f22657788d4ca4bd427f7ff7247dfa353590ac9c /libc/string/x86_64/memset.S | |
parent | 534dfb536f19737f2642ee56dd67a97c5db6a74e (diff) |
amd64 string ops: use alignment more carefully, and comment it.
By capping the maximum padding at the combined size of the next three insns,
we avoid ridiculously big NOPs like this one:
  53: 66 66 66 66 2e 0f 1f    nopw %cs:0x0(%rax,%rax,1)
  5a: 84 00 00 00 00 00
which was bigger than the next three insns combined!
Size changes:
text data bss dec hex filename
102 0 0 102 66 x86_64/memcpy.o
102 0 0 102 66 x86_64.old/memcpy.o
90 0 0 90 5a x86_64/mempcpy.o
102 0 0 102 66 x86_64.old/mempcpy.o
210 0 0 210 d2 x86_64/memset.o
242 0 0 242 f2 x86_64.old/memset.o
213 0 0 213 d5 x86_64/stpcpy.o
220 0 0 220 dc x86_64.old/stpcpy.o
428 0 0 428 1ac x86_64/strcat.o
444 0 0 444 1bc x86_64.old/strcat.o
417 0 0 417 1a1 x86_64/strchr.o
418 0 0 418 1a2 x86_64.old/strchr.o
33 0 0 33 21 x86_64/strcmp.o
33 0 0 33 21 x86_64.old/strcmp.o
213 0 0 213 d5 x86_64/strcpy.o
220 0 0 220 dc x86_64.old/strcpy.o
135 0 0 135 87 x86_64/strcspn.o
151 0 0 151 97 x86_64.old/strcspn.o
225 0 0 225 e1 x86_64/strlen.o
233 0 0 233 e9 x86_64.old/strlen.o
140 0 0 140 8c x86_64/strpbrk.o
156 0 0 156 9c x86_64.old/strpbrk.o
135 0 0 135 87 x86_64/strspn.o
151 0 0 151 97 x86_64.old/strspn.o
Also, a few files got their .text alignment relaxed from 16 to 8 bytes,
which reduces padding at link time.
Diffstat (limited to 'libc/string/x86_64/memset.S')
-rw-r--r-- | libc/string/x86_64/memset.S | 20 |
1 files changed, 13 insertions, 7 deletions
diff --git a/libc/string/x86_64/memset.S b/libc/string/x86_64/memset.S
index 3092e81eb..46751006b 100644
--- a/libc/string/x86_64/memset.S
+++ b/libc/string/x86_64/memset.S
@@ -55,8 +55,10 @@ ENTRY (memset)
 	test $0x7,%edi /* Check for alignment. */
 	jz 2f
 
-	.p2align 4
-1:	/* Align ptr to 8 byte. */
+	/* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+	.p2align 4,,9
+1:
+	/* Align ptr to 8 byte. */
 	mov %sil,(%rcx)
 	dec %rdx
 	inc %rcx
@@ -70,8 +72,10 @@ ENTRY (memset)
 	cmp LARGE, %rdx
 	jae 11f
 
-	.p2align 4
-3:	/* Fill 64 bytes. */
+	/* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+	.p2align 4,,11
+3:
+	/* Fill 64 bytes. */
 	mov %r8,(%rcx)
 	mov %r8,0x8(%rcx)
 	mov %r8,0x10(%rcx)
@@ -114,9 +118,11 @@ ENTRY (memset)
 #endif
 	retq
 
-	.p2align 4
-11:	/* Fill 64 bytes without polluting the cache. */
-	/* We could use movntdq %xmm0,(%rcx) here to further
+	/* Next 3 insns are 14 bytes total, make sure we decode them in one go */
+	.p2align 4,,14
+11:
+	/* Fill 64 bytes without polluting the cache. */
+	/* We could use movntdq %xmm0,(%rcx) here to further
 	   speed up for large cases but let's not use XMM registers. */
 	movnti %r8,(%rcx)
 	movnti %r8,0x8(%rcx)