From 170fc95312689d64e44c607165cb25cc9d476374 Mon Sep 17 00:00:00 2001
From: Denis Vlasenko <vda.linux@googlemail.com>
Date: Thu, 23 Apr 2009 11:19:30 +0000
Subject: libc/string/i386/memset.c: memset 1 byte at a time is a bit gross

---
 libc/string/i386/memset.c | 65 ++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 56 insertions(+), 9 deletions(-)

(limited to 'libc/string')

diff --git a/libc/string/i386/memset.c b/libc/string/i386/memset.c
index cfc16983c..9f51f3c60 100644
--- a/libc/string/i386/memset.c
+++ b/libc/string/i386/memset.c
@@ -28,21 +28,68 @@
  * More importantly, these should provide a good example for
  * others to follow when adding arch specific optimizations.
  *  -Erik
+ *
+ * 2009-04: modified by Denys Vlasenko <vda.linux@googlemail.com>
+ * Fill byte-by-byte is a bit too slow. I prefer 46 byte function
+ * which fills x4 faster than 21 bytes one.
  */
 
 #include <string.h>
 
-/* Experimentally off - libc_hidden_proto(memset) */
 #undef memset
 void *memset(void *s, int c, size_t count)
 {
-    int d0, d1;
-    __asm__ __volatile__(
-	    "rep\n\t"
-	    "stosb"
-	    : "=&c" (d0), "=&D" (d1)
-	    :"a" (c),"1" (s),"0" (count)
-	    :"memory");
-    return s;
+	int reg, edi; /* scratch outputs: byte-tail counter and advanced destination pointer */
+	__asm__ __volatile__(
+		/* Fill count bytes at s with the low byte of c: dwords first, then a 0-3 byte tail */
+		/* Most of the time, count is divisible by 4 and nonzero */
+		/* It's better to make this case faster */
+	/*	"	jecxz	9f\n" - (optional) count == 0: goto ret */
+		"	mov	%%ecx, %1\n" /* keep full count; its low 2 bits are the tail length */
+		"	shr	$2, %%ecx\n" /* ecx = number of whole dwords */
+		"	jz	1f\n" /* zero words: goto fill_bytes */
+		/* extend 8-bit fill to 32 bits */
+		"	movzx	%%al, %%eax\n" /* 3 bytes */
+	/* or:	"	and	$0xff, %%eax\n" - 5 bytes */
+		"	imul	$0x01010101, %%eax\n" /* 6 bytes */
+		/* fill full words */
+		"	rep; stosl\n"
+		/* fill 0-3 bytes */
+		"1:	and	$3, %1\n"
+		"	jz	9f\n" /* (count & 3) == 0: goto end */
+		"2:	stosb\n"
+		"	dec	%1\n"
+		"	jnz	2b\n"
+		/* end */
+		"9:\n"
+		/* eax and ecx are modified above, so they are read-write ("+") operands, not inputs */
+		: "=&D" (edi), "=&r" (reg), "+a" (c), "+c" (count)
+		: "0" (s)
+		: "memory"
+	);
+	return s;
+}
 libc_hidden_def(memset)
+
+/*
+gcc 4.3.1
+=========
+57                     push   %edi
+8b 7c 24 08            mov    0x8(%esp),%edi
+8b 4c 24 10            mov    0x10(%esp),%ecx
+8b 44 24 0c            mov    0xc(%esp),%eax
+89 ca                  mov    %ecx,%edx
+c1 e9 02               shr    $0x2,%ecx
+74 0b                  je     1f <__GI_memset+0x1f>
+0f b6 c0               movzbl %al,%eax
+69 c0 01 01 01 01      imul   $0x1010101,%eax,%eax
+f3 ab                  rep stos %eax,%es:(%edi)
+83 e2 03               and    $0x3,%edx
+74 04                  je     28 <__GI_memset+0x28>
+aa                     stos   %al,%es:(%edi)
+4a                     dec    %edx
+75 fc                  jne    24 <__GI_memset+0x24>
+8b 44 24 08            mov    0x8(%esp),%eax
+5f                     pop    %edi
+c3                     ret
+*/
-- 
cgit v1.2.3