summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- libc/string/i386/memset.c 65
1 files changed, 56 insertions, 9 deletions
diff --git a/libc/string/i386/memset.c b/libc/string/i386/memset.c
index cfc16983c..9f51f3c60 100644
--- a/libc/string/i386/memset.c
+++ b/libc/string/i386/memset.c
@@ -28,21 +28,68 @@
* More importantly, these should provide a good example for
* others to follow when adding arch specific optimizations.
* -Erik
+ *
+ * 2009-04: modified by Denys Vlasenko <vda.linux@googlemail.com>
+ * Filling byte-by-byte is a bit too slow. I prefer a 46-byte function
+ * that fills 4x faster over the 21-byte one.
*/
#include <string.h>
-/* Experimentally off - libc_hidden_proto(memset) */
#undef memset
void *memset(void *s, int c, size_t count) /* fill count bytes at s with the low byte of c; returns s */
{
- int d0, d1;
- __asm__ __volatile__(
- "rep\n\t"
- "stosb"
- : "=&c" (d0), "=&D" (d1)
- :"a" (c),"1" (s),"0" (count)
- :"memory");
- return s;
+ int reg, edi; /* asm outputs: reg (%1) holds a copy of count, edi receives the advanced destination (unused afterwards) */
+ __asm__ __volatile__(
+
+ /* Most of the time, count is divisible by 4 and nonzero */
+ /* It's better to make this case faster */
+ /* " jecxz 9f\n" - (optional) count == 0: goto ret */
+ " mov %%ecx, %1\n" /* %1 = count, kept for the 0-3 byte tail */
+ " shr $2, %%ecx\n" /* ecx = count / 4 = number of 32-bit words */
+ " jz 1f\n" /* zero words: goto fill_bytes */
+ /* extend 8-bit fill to 32 bits */
+ " movzx %%al, %%eax\n" /* 3 bytes; eax = low byte of c, upper bits cleared */
+ /* or: " and $0xff, %%eax\n" - 5 bytes */
+ " imul $0x01010101, %%eax\n" /* 6 bytes; replicate the fill byte into all four bytes of eax */
+ /* fill full words */
+ " rep; stosl\n" /* store eax at (edi), ecx times, 4 bytes per store */
+ /* fill 0-3 bytes */
+ "1: and $3, %1\n" /* %1 = count & 3 = remaining tail bytes */
+ " jz 9f\n" /* (count & 3) == 0: goto end */
+ "2: stosb\n" /* store one byte from al at (edi) */
+ " dec %1\n"
+ " jnz 2b\n" /* loop until the tail counter reaches zero */
+ /* end */
+ "9:\n"
+
+ : "=&D" (edi), "=&r" (reg) /* outputs: %0 = edi (tied to s below), %1 = scratch register */
+ : "0" (s), "a" (c), "c" (count) /* NOTE(review): eax and ecx are modified above but listed as input-only; GCC's extended-asm rules disallow that - consider tying them to dummy outputs */
+ : "memory" /* the asm writes memory through edi */
+ );
+ return s;
}
libc_hidden_def(memset)
+
+/*
+gcc 4.3.1
+=========
+57 push %edi
+8b 7c 24 08 mov 0x8(%esp),%edi
+8b 4c 24 10 mov 0x10(%esp),%ecx
+8b 44 24 0c mov 0xc(%esp),%eax
+89 ca mov %ecx,%edx
+c1 e9 02 shr $0x2,%ecx
+74 0b je 1f <__GI_memset+0x1f>
+0f b6 c0 movzbl %al,%eax
+69 c0 01 01 01 01 imul $0x1010101,%eax,%eax
+f3 ab rep stos %eax,%es:(%edi)
+83 e2 03 and $0x3,%edx
+74 04 je 28 <__GI_memset+0x28>
+aa stos %al,%es:(%edi)
+4a dec %edx
+75 fc jne 24 <__GI_memset+0x24>
+8b 44 24 08 mov 0x8(%esp),%eax
+5f pop %edi
+c3 ret
+*/