From 41c15785a9b620a8c85944649c20cca853f40e84 Mon Sep 17 00:00:00 2001 From: Denis Vlasenko Date: Wed, 17 Dec 2008 01:36:31 +0000 Subject: since gcc -Os hates us and does not inline string ops, implement inline versions of some of them. Enable only those which result roughly in the same code size as using out-of-line versions. None of this affects users, installed headers won't have any trace of it. --- include/libc-string_i386.h | 314 ++++++++++++++++++++++++++++++++++++++++++ include/string.h | 29 ++-- libc/string/generic/memchr.c | 4 +- libc/string/generic/mempcpy.c | 3 +- libc/string/i386/memcpy.c | 2 +- libc/string/i386/memset.c | 1 + libc/string/i386/strcpy.c | 2 +- libc/string/i386/strlen.c | 2 +- libc/string/memchr.c | 1 + libc/string/mempcpy.c | 1 + libc/string/stpcpy.c | 2 +- 11 files changed, 341 insertions(+), 20 deletions(-) create mode 100644 include/libc-string_i386.h diff --git a/include/libc-string_i386.h b/include/libc-string_i386.h new file mode 100644 index 000000000..3ed9c8783 --- /dev/null +++ b/include/libc-string_i386.h @@ -0,0 +1,314 @@ +/* + * Copyright (C) 2008 Denys Vlasenko + * + * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball + */ + +#if !defined _STRING_H +#error "Never use <libc-string_i386.h> directly; include <string.h> instead" +#endif + +#ifndef _LIBC_STRING_i386_H +#define _LIBC_STRING_i386_H 1 + +static __always_inline +void *inlined_memset_const_c_count4(void *s, unsigned eax, unsigned count) +{ + int ecx, edi; + + if (count == 0) + return s; + + /* Very small (2 stores or less) are best done with direct + * mov <const>,<mem> instructions (they do not clobber registers) */ + if (count == 1) { + *(char *)(s + 0) = eax; + return s; + } + + eax *= 0x01010101; /* done at compile time */ + + if (count == 2) { + *(short *)(s + 0) = eax; + return s; + } + if (count == 3) { + *(short *)(s + 0) = eax; + *(char *) (s + 2) = eax; + return s; + } + if (count == 1*4 + 0) { + *(int *)(s + 0) = eax; + return s; + } + if (count == 1*4 + 1) { + *(int *) (s + 0) 
= eax; + *(char *)(s + 4) = eax; + return s; + } + if (count == 1*4 + 2) { + *(int *) (s + 0) = eax; + *(short *)(s + 4) = eax; + return s; + } + + /* Small string stores: don't clobber ecx + * (clobbers only eax and edi) */ +#define small_store(arg) { \ + __asm__ __volatile__( \ + arg \ + : "=&D" (edi) \ + : "a" (eax), "0" (s) \ + : "memory" \ + ); \ + return s; \ +} + if (count == 1*4 + 3) small_store("stosl; stosw; stosb"); + if (count == 2*4 + 0) { + ((int *)s)[0] = eax; + ((int *)s)[1] = eax; + return s; + } + if (count == 2*4 + 1) small_store("stosl; stosl; stosb"); + if (count == 2*4 + 2) small_store("stosl; stosl; stosw"); + if (count == 2*4 + 3) small_store("stosl; stosl; stosw; stosb"); + if (count == 3*4 + 0) small_store("stosl; stosl; stosl"); + if (count == 3*4 + 1) small_store("stosl; stosl; stosl; stosb"); + if (count == 3*4 + 2) small_store("stosl; stosl; stosl; stosw"); + if (count == 3*4 + 3) small_store("stosl; stosl; stosl; stosw; stosb"); + if (count == 4*4 + 0) small_store("stosl; stosl; stosl; stosl"); + if (count == 4*4 + 1) small_store("stosl; stosl; stosl; stosl; stosb"); + /* going over 7 bytes is suboptimal */ + /* stosw is 2-byte insn, so this one takes 6 bytes: */ + if (count == 4*4 + 2) small_store("stosl; stosl; stosl; stosl; stosw"); + /* 7 bytes */ + if (count == 4*4 + 3) small_store("stosl; stosl; stosl; stosl; stosw; stosb"); + /* 5 bytes */ + if (count == 5*4 + 0) small_store("stosl; stosl; stosl; stosl; stosl"); + /* 6 bytes */ + if (count == 5*4 + 1) small_store("stosl; stosl; stosl; stosl; stosl; stosb"); + /* 7 bytes */ + if (count == 5*4 + 2) small_store("stosl; stosl; stosl; stosl; stosl; stosw"); + /* 8 bytes, but oh well... 
*/ + if (count == 5*4 + 3) small_store("stosl; stosl; stosl; stosl; stosl; stosw; stosb"); + /* 6 bytes */ + if (count == 6*4 + 0) small_store("stosl; stosl; stosl; stosl; stosl; stosl"); + /* the rest would be 7+ bytes and is handled below instead */ +#undef small_store + + /* Not small, but multiple-of-4 store. + * "mov <const>,%ecx; rep; stosl" sequence is 7 bytes */ + __asm__ __volatile__( + " rep; stosl\n" + : "=&c" (ecx), "=&D" (edi) + : "a" (eax), "0" (count / 4), "1" (s) + : "memory" + ); + return s; +} +#if 1 /* -51 bytes on shared i386 build with gcc 4.3.0 */ +#define memset(s, c, count) ( \ + ( !(__builtin_constant_p(c) && __builtin_constant_p(count)) \ + || ((count) > (6*4 + 0) && ((count) % 4) != 0) \ + ) \ + ? memset((s), (c), (count)) \ + : inlined_memset_const_c_count4((s), (c), (count)) \ + ) +#endif + + +static __always_inline +void *inlined_mempcpy_const_count4(void *d, const void *s, unsigned count) +{ + int ecx; + char *esi, *edi; + + if (count == 0) + return d; + + if (count == 1) { + *(char *)d = *(char *)s; + return d + 1; + } + if (count == 2) { + *(short *)d = *(short *)s; + return d + 2; + } + /* Small string moves: don't clobber ecx + * (clobbers only esi and edi) */ +#define small_move(arg) { \ + __asm__ __volatile__( \ + arg \ + : "=&S" (esi), "=&D" (edi) \ + : "0" (s), "1" (d) \ + : "memory" \ + ); \ + return edi; \ +} + if (count == 3) small_move("movsw; movsb"); + if (count == 1*4 + 0) { + *(int *)d = *(int *)s; + return d + 4; + } + if (count == 1*4 + 1) small_move("movsl; movsb"); + if (count == 1*4 + 2) small_move("movsl; movsw"); + if (count == 1*4 + 3) small_move("movsl; movsw; movsb"); + if (count == 2*4 + 0) small_move("movsl; movsl"); + if (count == 2*4 + 1) small_move("movsl; movsl; movsb"); + if (count == 2*4 + 2) small_move("movsl; movsl; movsw"); + if (count == 2*4 + 3) small_move("movsl; movsl; movsw; movsb"); + if (count == 3*4 + 0) small_move("movsl; movsl; movsl"); + if (count == 3*4 + 1) small_move("movsl; movsl; movsl; 
movsb"); + if (count == 3*4 + 2) small_move("movsl; movsl; movsl; movsw"); + if (count == 3*4 + 3) small_move("movsl; movsl; movsl; movsw; movsb"); + if (count == 4*4 + 0) small_move("movsl; movsl; movsl; movsl"); + if (count == 4*4 + 1) small_move("movsl; movsl; movsl; movsl; movsb"); + /* going over 7 bytes is suboptimal */ + /* movsw is 2-byte insn, so this one takes 6 bytes: */ + if (count == 4*4 + 2) small_move("movsl; movsl; movsl; movsl; movsw"); + /* 7 bytes */ + if (count == 4*4 + 3) small_move("movsl; movsl; movsl; movsl; movsw; movsb"); + /* 5 bytes */ + if (count == 5*4 + 0) small_move("movsl; movsl; movsl; movsl; movsl"); + /* 6 bytes */ + if (count == 5*4 + 1) small_move("movsl; movsl; movsl; movsl; movsl; movsb"); + /* 7 bytes */ + if (count == 5*4 + 2) small_move("movsl; movsl; movsl; movsl; movsl; movsw"); + /* 8 bytes, but oh well... */ + if (count == 5*4 + 3) small_move("movsl; movsl; movsl; movsl; movsl; movsw; movsb"); + /* 6 bytes */ + if (count == 6*4 + 0) small_move("movsl; movsl; movsl; movsl; movsl; movsl"); + /* the rest would be 7+ bytes and is handled below instead */ +#undef small_move + + /* Not small, but multiple-of-4 move. + * "mov <const>,%ecx; rep; movsl" sequence is 7 bytes */ + __asm__ __volatile__( + " rep; movsl\n" + : "=&c" (ecx), "=&S" (esi), "=&D" (edi) + : "0" (count / 4), "1" (s), "2" (d) + : "memory" + ); + return edi; +} +static __always_inline +void *inlined_memcpy_const_count4(void *d, const void *s, unsigned count) +{ + inlined_mempcpy_const_count4(d, s, count); + return d; +} +#if 1 /* +34 bytes on shared i386 build with gcc 4.3.0 */ +#define mempcpy(d, s, count) ( \ + ( !(__builtin_constant_p(count)) \ + || ((count) > (6*4 + 0) && ((count) % 4) != 0) \ + ) \ + ? mempcpy((d), (s), (count)) \ + : inlined_mempcpy_const_count4((d), (s), (count)) \ + ) +#define memcpy(d, s, count) ( \ + ( !(__builtin_constant_p(count)) \ + || ((count) > (6*4 + 0) && ((count) % 4) != 0) \ + ) \ + ? 
memcpy((d), (s), (count)) \ + : inlined_memcpy_const_count4((d), (s), (count)) \ + ) +#endif + + +static __always_inline +size_t inlined_strlen(const char *s) +{ + int edi; + int ecx; + __asm__ __volatile__( + " repne; scasb\n" + /* " notl %0\n" */ + /* " decl %0\n" */ + : "=c" (ecx), "=&D" (edi) + : "1" (s), "a" (0), "0" (0xffffffffu) + /* : no clobbers */ + ); + return -ecx - 1; +} +#if 0 /* +1108 bytes on shared i386 build with gcc 4.3.0 */ +#define strlen(s) inlined_strlen(s) +#endif + + +static __always_inline +char *inlined_stpcpy(char *dest, const char *src) +{ + char *esi, *edi; + int eax; + __asm__ __volatile__( + "1: lodsb\n" + " stosb\n" + " testb %%al, %%al\n" + " jnz 1b\n" + : "=&S" (esi), "=&D" (edi), "=&a" (eax) + : "0" (src), "1" (dest) + : "memory" + ); + return edi - 1; +} +static __always_inline +char *inlined_strcpy(char *dest, const char *src) +{ + inlined_stpcpy(dest, src); + return dest; +} +#if 0 /* +562 bytes on shared i386 build with gcc 4.3.0 */ +#define stpcpy(dest, src) inlined_stpcpy(dest, src) +#define strcpy(dest, src) inlined_strcpy(dest, src) +#endif + + +static __always_inline +void *inlined_memchr(const void *s, int c, size_t count) +{ + void *edi; + int ecx; + /* Unfortunately, c gets loaded to %eax (wide insn), not %al */ + __asm__ __volatile__( + " jecxz 1f\n" + " repne; scasb\n" + " leal -1(%%edi), %%edi\n" + " je 2f\n" + "1:\n" + " xorl %%edi, %%edi\n" + "2:\n" + : "=&D" (edi), "=&c" (ecx) + : "a" (c), "0" (s), "1" (count) + /* : no clobbers */ + ); + return edi; +} +static __always_inline +void *inlined_memchr_const_c(const void *s, int c, size_t count) +{ + void *edi; + int ecx, eax; + __asm__ __volatile__( + " jecxz 1f\n" + " movb %4, %%al\n" /* const c to %%al */ + " repne; scasb\n" + " leal -1(%%edi), %%edi\n" + " je 2f\n" + "1:\n" + " xorl %%edi, %%edi\n" + "2:\n" + : "=&D" (edi), "=&c" (ecx), "=&a" (eax) + : "0" (s), "i" (c), "1" (count) + /* : no clobbers */ + ); + return edi; +} +#if 1 /* +2 bytes on shared i386 
build with gcc 4.3.0 */ +#define memchr(s, c, count) ( \ + __builtin_constant_p(c) \ + ? inlined_memchr_const_c(s, (c) & 0xff, count) \ + : inlined_memchr(s, c, count) \ + ) +#endif + +#endif /* _LIBC_STRING_i386_H */ diff --git a/include/string.h b/include/string.h index e889dc11a..ab1076565 100644 --- a/include/string.h +++ b/include/string.h @@ -378,7 +378,7 @@ libc_hidden_proto(ffs) /* The following two functions are non-standard but necessary for non-32 bit platforms. */ -#if 0 /*def __USE_GNU*/ +# if 0 /*#ifdef __USE_GNU*/ extern int ffsl (long int __l) __THROW __attribute__ ((__const__)); # ifdef __GNUC__ __extension__ extern int ffsll (long long int __ll) @@ -422,44 +422,44 @@ libc_hidden_proto(strsep) #ifdef __USE_GNU /* Compare S1 and S2 as strings holding name & indices/version numbers. */ -#if 0 +# if 0 extern int strverscmp (__const char *__s1, __const char *__s2) __THROW __attribute_pure__ __nonnull ((1, 2)); libc_hidden_proto(strverscmp) -#endif +# endif /* Return a string describing the meaning of the signal number in SIG. */ extern char *strsignal (int __sig) __THROW; libc_hidden_proto(strsignal) /* Copy SRC to DEST, returning the address of the terminating '\0' in DEST. */ -#if 0 /* uClibc: disabled */ +# if 0 /* uClibc: disabled */ extern char *__stpcpy (char *__restrict __dest, __const char *__restrict __src) __THROW __nonnull ((1, 2)); -#endif +# endif extern char *stpcpy (char *__restrict __dest, __const char *__restrict __src) __THROW __nonnull ((1, 2)); libc_hidden_proto(stpcpy) /* Copy no more than N characters of SRC to DEST, returning the address of the last character written into DEST. 
*/ -#if 0 /* uClibc: disabled */ +# if 0 /* uClibc: disabled */ extern char *__stpncpy (char *__restrict __dest, __const char *__restrict __src, size_t __n) __THROW __nonnull ((1, 2)); -#endif +# endif extern char *stpncpy (char *__restrict __dest, __const char *__restrict __src, size_t __n) __THROW __nonnull ((1, 2)); libc_hidden_proto(stpncpy) -#if 0 /* uClibc does not support strfry or memfrob. */ +# if 0 /* uClibc does not support strfry or memfrob. */ /* Sautee STRING briskly. */ extern char *strfry (char *__string) __THROW __nonnull ((1)); /* Frobnicate N bytes of S. */ extern void *memfrob (void *__s, size_t __n) __THROW __nonnull ((1)); -#endif +# endif # ifndef basename /* Return the file name within directory of FILENAME. We don't @@ -469,7 +469,7 @@ extern void *memfrob (void *__s, size_t __n) __THROW __nonnull ((1)); extern char *basename (__const char *__filename) __THROW __nonnull ((1)); libc_hidden_proto(basename) # endif -#endif +#endif /* __USE_GNU */ #ifdef __USE_BSD @@ -484,4 +484,11 @@ libc_hidden_proto(strlcpy) __END_DECLS -#endif /* string.h */ + +#ifdef UCLIBC_INTERNAL +# if defined __i386__ +# include <libc-string_i386.h> +# endif +#endif + +#endif /* string.h */ diff --git a/libc/string/generic/memchr.c b/libc/string/generic/memchr.c index 8ea3f539a..d5cd0005e 100644 --- a/libc/string/generic/memchr.c +++ b/libc/string/generic/memchr.c @@ -25,14 +25,12 @@ #include #include -/* Experimentally off - libc_hidden_proto(memchr) */ -/* libc_hidden_proto(abort) */ - #include "memcopy.h" #define LONG_MAX_32_BITS 2147483647 /* Search no more than N bytes of S for C. 
*/ +#undef memchr void *memchr (const void * s, int c_in, size_t n) { const unsigned char *char_ptr; diff --git a/libc/string/generic/mempcpy.c b/libc/string/generic/mempcpy.c index 8d7356486..d7fa79ef5 100644 --- a/libc/string/generic/mempcpy.c +++ b/libc/string/generic/mempcpy.c @@ -8,9 +8,8 @@ #include #ifdef __USE_GNU -/* Experimentally off - libc_hidden_proto(mempcpy) */ -/* Experimentally off - libc_hidden_proto(memcpy) */ +# undef mempcpy void *mempcpy (void *dstpp, const void *srcpp, size_t len) { memcpy(dstpp, srcpp, len); diff --git a/libc/string/i386/memcpy.c b/libc/string/i386/memcpy.c index 216ddfd1a..af86cf255 100644 --- a/libc/string/i386/memcpy.c +++ b/libc/string/i386/memcpy.c @@ -32,7 +32,7 @@ #include -/* Experimentally off - libc_hidden_proto(memcpy) */ +#undef memcpy void *memcpy(void * to, const void * from, size_t n) { int d0, d1, d2; diff --git a/libc/string/i386/memset.c b/libc/string/i386/memset.c index bbaa45215..cfc16983c 100644 --- a/libc/string/i386/memset.c +++ b/libc/string/i386/memset.c @@ -33,6 +33,7 @@ #include /* Experimentally off - libc_hidden_proto(memset) */ +#undef memset void *memset(void *s, int c, size_t count) { int d0, d1; diff --git a/libc/string/i386/strcpy.c b/libc/string/i386/strcpy.c index 09065a9b7..fff1bd006 100644 --- a/libc/string/i386/strcpy.c +++ b/libc/string/i386/strcpy.c @@ -32,7 +32,7 @@ #include -/* Experimentally off - libc_hidden_proto(strcpy) */ +#undef strcpy char *strcpy(char * dest, const char * src) { int d0, d1, d2; diff --git a/libc/string/i386/strlen.c b/libc/string/i386/strlen.c index 61a178393..761d27aae 100644 --- a/libc/string/i386/strlen.c +++ b/libc/string/i386/strlen.c @@ -32,7 +32,7 @@ #include -/* Experimentally off - libc_hidden_proto(strlen) */ +#undef strlen size_t strlen(const char *s) { int d0; diff --git a/libc/string/memchr.c b/libc/string/memchr.c index 5e60f6554..438f4fa4a 100644 --- a/libc/string/memchr.c +++ b/libc/string/memchr.c @@ -10,6 +10,7 @@ #ifdef WANT_WIDE # define 
Wmemchr wmemchr #else +# undef memchr # define Wmemchr memchr #endif diff --git a/libc/string/mempcpy.c b/libc/string/mempcpy.c index e7605146a..d79bd1937 100644 --- a/libc/string/mempcpy.c +++ b/libc/string/mempcpy.c @@ -12,6 +12,7 @@ #ifdef WANT_WIDE # define Wmempcpy wmempcpy #else +# undef mempcpy # define Wmempcpy mempcpy #endif diff --git a/libc/string/stpcpy.c b/libc/string/stpcpy.c index 8a487584e..58ace8fc7 100644 --- a/libc/string/stpcpy.c +++ b/libc/string/stpcpy.c @@ -10,7 +10,7 @@ #ifdef WANT_WIDE # define Wstpcpy wcpcpy #else -/* Experimentally off - libc_hidden_proto(stpcpy) */ +# undef stpcpy # define Wstpcpy stpcpy #endif -- cgit v1.2.3