summaryrefslogtreecommitdiff
path: root/libc/string/x86_64/memset.S
diff options
context:
space:
mode:
authorMike Frysinger <vapier@gentoo.org>2005-09-21 02:18:29 +0000
committerMike Frysinger <vapier@gentoo.org>2005-09-21 02:18:29 +0000
commitf5c0ac3d4499a11f4581c1b4ff16cef7d8cf4c0b (patch)
tree4f7ce150130560ccff718076cf102fb4d114752c /libc/string/x86_64/memset.S
parent37016e09de57c7145d7dd29cd1166f21f150d2cb (diff)
merge x86_64 optimized string support
Diffstat (limited to 'libc/string/x86_64/memset.S')
-rw-r--r--libc/string/x86_64/memset.S138
1 files changed, 138 insertions, 0 deletions
diff --git a/libc/string/x86_64/memset.S b/libc/string/x86_64/memset.S
new file mode 100644
index 000000000..d74ec8ccb
--- /dev/null
+++ b/libc/string/x86_64/memset.S
@@ -0,0 +1,138 @@
+/* memset/bzero -- set memory area to CH/0
+ Optimized version for x86-64.
+ Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Andreas Jaeger <aj@suse.de>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include "_glibc_inc.h"
+
+/* BZERO_P is 1 when this file is being built as bzero, i.e. when memset
+ has been redefined as `bzero' before this point (so `#ifdef memset' is
+ true), and 0 for the plain memset build.
+ It is given a literal value here instead of expanding to
+ `(defined memset)': a `defined' operator that appears in an #if only
+ through macro expansion is undefined behaviour in ISO C and draws
+ -Wexpansion-to-defined from newer compilers. */
+#ifdef memset
+# define BZERO_P 1
+#else
+# define BZERO_P 0
+#endif
+
+/* This is somewhat experimental and could be made dependent on the cache
+ size. */
+#define LARGE $120000
+
+ .text
+#if !BZERO_P && defined PIC && !defined NOT_IN_libc
+/* Checked entry point, only built for the PIC in-libc memset variant.
+ Compares %rcx against %rdx as unsigned values and jumps to __chk_fail
+ when %rcx is the smaller one.
+ There is no ret or jmp after the branch, so on success control runs off
+ the end of this stub; presumably it lands in memset, which is emitted
+ directly below -- this relies on any padding ENTRY inserts being
+ executable no-ops (TODO confirm against the ENTRY macro).
+ NOTE(review): by the SysV argument order %rdx would be the length and
+ %rcx the destination object size -- confirm against callers. */
+ENTRY (__memset_chk)
+ cmpq %rdx, %rcx /* Flags from %rcx - %rdx. */
+ jb HIDDEN_JUMPTARGET (__chk_fail) /* Unsigned: taken when %rcx < %rdx. */
+END (__memset_chk)
+#endif
+/* memset: fill %rdx bytes at %rdi with the byte in %sil and return the
+ original %rdi in %rax. When built as bzero (BZERO_P) the length
+ arrives in %rsi instead, the fill byte is 0 and %rax is not loaded.
+
+ Register roles:
+ %rdi destination start, never modified (memset returns it)
+ %sil fill byte
+ %rdx bytes still to be stored
+ %rcx current store address
+ %r8 fill byte replicated into all eight bytes of a quadword
+ %rax scratch: chunk counter for the 64-byte and 8-byte loops
+
+ Strategy: lengths of 0..7 go straight to the byte loop. Otherwise
+ store single bytes until %rcx is 8-byte aligned, then 64-byte blocks
+ (non-temporal stores once the remaining length reaches LARGE), then
+ 8-byte words, then the last 0..7 bytes. */
+ENTRY (memset)
+#if BZERO_P
+ mov %rsi,%rdx /* Adjust parameter. */
+ xorl %esi,%esi /* Fill with 0s. */
+#endif
+ cmp $0x7,%rdx /* Check for small length. */
+ mov %rdi,%rcx /* Working pointer; %rdi is kept for the
+ return value. mov leaves the flags
+ from the cmp intact for the jbe. */
+ jbe 7f
+
+#if BZERO_P
+ mov %rsi,%r8 /* Just copy 0. */
+#else
+ /* Populate 8 bit data to full 64-bit: multiplying the zero-extended
+ byte by 0x0101010101010101 replicates it into every byte lane. */
+ movabs $0x0101010101010101,%r8
+ movzbl %sil,%eax
+ imul %rax,%r8
+#endif
+ test $0x7,%edi /* Check for alignment. */
+ je 2f
+
+ .p2align 4
+1: /* Align ptr to 8 byte. The length is at least 8 here, so the at
+ most 7 bytes stored by this loop cannot underflow %rdx. */
+ mov %sil,(%rcx)
+ dec %rdx
+ inc %rcx
+ test $0x7,%ecx
+ jne 1b
+
+2: /* %rax = number of whole 64-byte blocks; none -> go to the tail.
+ Then check for really large regions. */
+ mov %rdx,%rax
+ shr $0x6,%rax
+ je 4f
+ cmp LARGE, %rdx
+ jae 11f
+
+ .p2align 4
+3: /* Store 64 bytes per iteration with ordinary (cached) stores. */
+ mov %r8,(%rcx)
+ mov %r8,0x8(%rcx)
+ mov %r8,0x10(%rcx)
+ mov %r8,0x18(%rcx)
+ mov %r8,0x20(%rcx)
+ mov %r8,0x28(%rcx)
+ mov %r8,0x30(%rcx)
+ mov %r8,0x38(%rcx)
+ add $0x40,%rcx
+ dec %rax
+ jne 3b
+
+4: /* Store the final bytes: fewer than 64 remain. */
+ and $0x3f,%edx
+ mov %rdx,%rax
+ shr $0x3,%rax
+ je 6f
+
+5: /* First in chunks of 8 bytes. */
+ mov %r8,(%rcx)
+ add $0x8,%rcx
+ dec %rax
+ jne 5b
+6:
+ and $0x7,%edx
+7: /* Also the direct entry for lengths 0..7. */
+ test %rdx,%rdx
+ je 9f
+8: /* And finally as bytes (up to 7). */
+ mov %sil,(%rcx)
+ inc %rcx
+ dec %rdx
+ jne 8b
+9:
+#if BZERO_P
+ nop
+#else
+ /* Load result (only if used as memset). */
+ mov %rdi,%rax /* start address of destination is result */
+#endif
+ retq
+
+ .p2align 4
+11: /* Store 64 bytes per iteration without polluting the cache. */
+ /* We could use movntdq %xmm0,(%rcx) here to further
+ speed up for large cases but let's not use XMM registers. */
+ movnti %r8,(%rcx)
+ movnti %r8,0x8(%rcx)
+ movnti %r8,0x10(%rcx)
+ movnti %r8,0x18(%rcx)
+ movnti %r8,0x20(%rcx)
+ movnti %r8,0x28(%rcx)
+ movnti %r8,0x30(%rcx)
+ movnti %r8,0x38(%rcx)
+ add $0x40,%rcx
+ dec %rax
+ jne 11b
+ /* movnti stores are weakly ordered. Fence them so the filled
+ region becomes globally visible before the tail stores below and
+ before any store the caller issues after we return. sfence
+ changes neither registers nor flags. */
+ sfence
+ jmp 4b
+
+END (memset)
+
+#if !BZERO_P && defined PIC && !defined NOT_IN_libc
+/* Same build condition as the __memset_chk stub above, so the alias is
+ only emitted when that symbol exists. Presumably strong_alias makes
+ __memset_zero_constant_len_parameter a second name for __memset_chk
+ -- confirm against the macro's definition. */
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+#endif