From f5c0ac3d4499a11f4581c1b4ff16cef7d8cf4c0b Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 21 Sep 2005 02:18:29 +0000 Subject: merge x86_64 optimized string support --- libc/string/x86_64/memset.S | 138 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 libc/string/x86_64/memset.S (limited to 'libc/string/x86_64/memset.S') diff --git a/libc/string/x86_64/memset.S b/libc/string/x86_64/memset.S new file mode 100644 index 000000000..d74ec8ccb --- /dev/null +++ b/libc/string/x86_64/memset.S @@ -0,0 +1,138 @@ +/* memset/bzero -- set memory area to CH/0 + Optimized version for x86-64. + Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger . + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include "_glibc_inc.h" + +/* BEWARE: `#ifdef memset' means that memset is redefined as `bzero' */ +#define BZERO_P (defined memset) + +/* This is somewhat experimental and could be made dependent on the cache + size.
*/
/* LARGE is spliced in as an AT&T immediate operand (hence the `$').
   Once the write pointer is 8-byte aligned, a remaining length of at
   least LARGE bytes is filled with non-temporal stores (label 11).  */
#define LARGE $120000

	.text
#if !BZERO_P && defined PIC && !defined NOT_IN_libc
/* __memset_chk: takes the same register arguments as memset plus a
   fourth one in %rcx (presumably the size of the destination object --
   confirm against the C prototype).  Jumps to __chk_fail when
   %rcx < %rdx (unsigned compare).  There is no ret or jmp after the
   check, so the non-failing path continues straight into memset below.
   NOTE(review): that relies on whatever ENTRY (memset) emits ahead of
   its label being safe to execute -- confirm against the macro.  */
ENTRY (__memset_chk)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memset_chk)
#endif

/* memset, or bzero when BZERO_P is true.

   In:    %rdi = destination
	  memset: %sil = fill byte, %rdx = byte count
	  bzero:  %rsi = byte count (moved to %rdx, then %esi is zeroed)
   Out:   memset: %rax = the original %rdi
	  bzero:  %rax is left holding a scratch value
   Uses:  %rcx = write cursor
	  %rdx = bytes still to be written
	  %r8  = fill byte replicated into all eight bytes
	  %rax = loop counter (64-byte blocks, then 8-byte words)
   %rdi is never modified and the stack is not touched.  */
ENTRY (memset)
#if BZERO_P
	mov	%rsi,%rdx	/* Adjust parameter.  */
	xorl	%esi,%esi	/* Fill with 0s.  */
#endif
	cmp	$0x7,%rdx	/* Check for small length.  */
	mov	%rdi,%rcx	/* Work on a copy; %rdi stays the result.
				   (mov leaves the cmp flags intact.)  */
	jbe	7f		/* At most 7 bytes: byte loop only.  */

#if BZERO_P
	mov	%rsi,%r8	/* Just copy 0.  */
#else
	/* Populate 8 bit data to full 64-bit: byte * 0x0101...01.  */
	movabs	$0x0101010101010101,%r8
	movzbl	%sil,%eax
	imul	%rax,%r8
#endif
	test	$0x7,%edi	/* Check for alignment.  */
	je	2f

	.p2align 4
1:	/* Align ptr to 8 byte.  At least 8 bytes remain on entry and at
	   most 7 are written here, so %rdx cannot reach zero.  */
	mov	%sil,(%rcx)
	dec	%rdx
	inc	%rcx
	test	$0x7,%ecx
	jne	1b

2:	/* Check for really large regions.  */
	mov	%rdx,%rax
	shr	$0x6,%rax	/* %rax = number of whole 64-byte blocks.  */
	je	4f		/* Fewer than 64 bytes left.  */
	cmp	LARGE, %rdx
	jae	11f

	.p2align 4
3:	/* Copy 64 bytes.  */
	mov	%r8,(%rcx)
	mov	%r8,0x8(%rcx)
	mov	%r8,0x10(%rcx)
	mov	%r8,0x18(%rcx)
	mov	%r8,0x20(%rcx)
	mov	%r8,0x28(%rcx)
	mov	%r8,0x30(%rcx)
	mov	%r8,0x38(%rcx)
	add	$0x40,%rcx
	dec	%rax
	jne	3b

4:	/* Copy final bytes.  */
	and	$0x3f,%edx	/* Remainder below 64; the 32-bit write
				   also clears the upper half of %rdx.  */
	mov	%rdx,%rax
	shr	$0x3,%rax	/* %rax = number of whole 8-byte words.  */
	je	6f

5:	/* First in chunks of 8 bytes.  */
	mov	%r8,(%rcx)
	add	$0x8,%rcx
	dec	%rax
	jne	5b
6:
	and	$0x7,%edx	/* 0..7 trailing bytes.  */
7:
	test	%rdx,%rdx
	je	9f
8:	/* And finally as bytes (up to 7).  */
	mov	%sil,(%rcx)
	inc	%rcx
	dec	%rdx
	jne	8b
9:
#if BZERO_P
	nop
#else
	/* Load result (only if used as memset).  */
	mov	%rdi,%rax	/* start address of destination is result */
#endif
	retq

	.p2align 4
11:	/* Copy 64 bytes without polluting the cache.  */
	/* We could use	movntdq    %xmm0,(%rcx) here to further
	   speed up for large cases but let's not use XMM registers.  */
	movnti	%r8,(%rcx)
	movnti	%r8,0x8(%rcx)
	movnti	%r8,0x10(%rcx)
	movnti	%r8,0x18(%rcx)
	movnti	%r8,0x20(%rcx)
	movnti	%r8,0x28(%rcx)
	movnti	%r8,0x30(%rcx)
	movnti	%r8,0x38(%rcx)
	add	$0x40,%rcx
	dec	%rax
	jne	11b
	/* Non-temporal stores are weakly ordered.  Fence them so that the
	   filled region is globally visible before any store issued after
	   this point (the tail stores below and whatever the caller
	   writes once we return).  */
	sfence
	jmp	4b

END (memset)

#if !BZERO_P && defined PIC && !defined NOT_IN_libc
strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
#endif
-- cgit v1.2.3