Diffstat (limited to 'include/libc-string_i386.h')
-rw-r--r--   include/libc-string_i386.h   314
1 files changed, 314 insertions, 0 deletions
diff --git a/include/libc-string_i386.h b/include/libc-string_i386.h
new file mode 100644
index 000000000..3ed9c8783
--- /dev/null
+++ b/include/libc-string_i386.h
@@ -0,0 +1,314 @@
+/*
+ * Copyright (C) 2008 Denys Vlasenko <vda.linux@googlemail.com>
+ *
+ * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball
+ */
+
+#if !defined _STRING_H
+#error "Never use <libc-string_i386.h> directly; include <string.h> instead"
+#endif
+
+#ifndef _LIBC_STRING_i386_H
+#define _LIBC_STRING_i386_H 1
+
+static __always_inline
+void *inlined_memset_const_c_count4(void *s, unsigned eax, unsigned count)
+{
+	int ecx, edi;
+
+	if (count == 0)
+		return s;
+
+	/* Very small (2 stores or less) are best done with direct
+	 * mov <const>,<mem> instructions (they do not clobber registers) */
+	if (count == 1) {
+		*(char *)(s + 0) = eax;
+		return s;
+	}
+
+	eax *= 0x01010101; /* done at compile time */
+
+	if (count == 2) {
+		*(short *)(s + 0) = eax;
+		return s;
+	}
+	if (count == 3) {
+		*(short *)(s + 0) = eax;
+		*(char *) (s + 2) = eax;
+		return s;
+	}
+	if (count == 1*4 + 0) {
+		*(int *)(s + 0) = eax;
+		return s;
+	}
+	if (count == 1*4 + 1) {
+		*(int *) (s + 0) = eax;
+		*(char *)(s + 4) = eax;
+		return s;
+	}
+	if (count == 1*4 + 2) {
+		*(int *)  (s + 0) = eax;
+		*(short *)(s + 4) = eax;
+		return s;
+	}
+
+	/* Small string stores: don't clobber ecx
+	 * (clobbers only eax and edi) */
+#define small_store(arg) { \
+	__asm__ __volatile__( \
+		arg \
+		: "=&D" (edi) \
+		: "a" (eax), "0" (s) \
+		: "memory" \
+	); \
+	return s; \
+}
+	if (count == 1*4 + 3) small_store("stosl; stosw; stosb");
+	if (count == 2*4 + 0) {
+		((int *)s)[0] = eax;
+		((int *)s)[1] = eax;
+		return s;
+	}
+	if (count == 2*4 + 1) small_store("stosl; stosl; stosb");
+	if (count == 2*4 + 2) small_store("stosl; stosl; stosw");
+	if (count == 2*4 + 3) small_store("stosl; stosl; stosw; stosb");
+	if (count == 3*4 + 0) small_store("stosl; stosl; stosl");
+	if (count == 3*4 + 1) small_store("stosl; stosl; stosl; stosb");
+	if (count == 3*4 + 2) small_store("stosl; stosl; stosl; stosw");
+	if (count == 3*4 + 3) small_store("stosl; stosl; stosl; stosw; stosb");
+	if (count == 4*4 + 0) small_store("stosl; stosl; stosl; stosl");
+	if (count == 4*4 + 1) small_store("stosl; stosl; stosl; stosl; stosb");
+	/* going over 7 bytes is suboptimal */
+	/* stosw is 2-byte insn, so this one takes 6 bytes: */
+	if (count == 4*4 + 2) small_store("stosl; stosl; stosl; stosl; stosw");
+	/* 7 bytes */
+	if (count == 4*4 + 3) small_store("stosl; stosl; stosl; stosl; stosw; stosb");
+	/* 5 bytes */
+	if (count == 5*4 + 0) small_store("stosl; stosl; stosl; stosl; stosl");
+	/* 6 bytes */
+	if (count == 5*4 + 1) small_store("stosl; stosl; stosl; stosl; stosl; stosb");
+	/* 7 bytes */
+	if (count == 5*4 + 2) small_store("stosl; stosl; stosl; stosl; stosl; stosw");
+	/* 8 bytes, but oh well... */
+	if (count == 5*4 + 3) small_store("stosl; stosl; stosl; stosl; stosl; stosw; stosb");
+	/* 6 bytes */
+	if (count == 6*4 + 0) small_store("stosl; stosl; stosl; stosl; stosl; stosl");
+	/* the rest would be 7+ bytes and is handled below instead */
+#undef small_store
+
+	/* Not small, but multiple-of-4 store.
+	 * "mov <const>,%ecx; rep; stosl" sequence is 7 bytes */
+	__asm__ __volatile__(
+		"	rep; stosl\n"
+		: "=&c" (ecx), "=&D" (edi)
+		: "a" (eax), "0" (count / 4), "1" (s)
+		: "memory"
+	);
+	return s;
+}
+#if 1 /* -51 bytes on shared i386 build with gcc 4.3.0 */
+#define memset(s, c, count) ( \
+	( !(__builtin_constant_p(c) && __builtin_constant_p(count)) \
+	  || ((count) > (6*4 + 0) && ((count) % 4) != 0) \
+	) \
+	? memset((s), (c), (count)) \
+	: inlined_memset_const_c_count4((s), (c), (count)) \
+	)
+#endif
+
+
+static __always_inline
+void *inlined_mempcpy_const_count4(void *d, const void *s, unsigned count)
+{
+	int ecx;
+	char *esi, *edi;
+
+	if (count == 0)
+		return d;
+
+	if (count == 1) {
+		*(char *)d = *(char *)s;
+		return d + 1;
+	}
+	if (count == 2) {
+		*(short *)d = *(short *)s;
+		return d + 2;
+	}
+	/* Small string moves: don't clobber ecx
+	 * (clobbers only esi and edi) */
+#define small_move(arg) { \
+	__asm__ __volatile__( \
+		arg \
+		: "=&S" (esi), "=&D" (edi) \
+		: "0" (s), "1" (d) \
+		: "memory" \
+	); \
+	return edi; \
+}
+	if (count == 3) small_move("movsw; movsb");
+	if (count == 1*4 + 0) {
+		*(int *)d = *(int *)s;
+		return d + 4;
+	}
+	if (count == 1*4 + 1) small_move("movsl; movsb");
+	if (count == 1*4 + 2) small_move("movsl; movsw");
+	if (count == 1*4 + 3) small_move("movsl; movsw; movsb");
+	if (count == 2*4 + 0) small_move("movsl; movsl");
+	if (count == 2*4 + 1) small_move("movsl; movsl; movsb");
+	if (count == 2*4 + 2) small_move("movsl; movsl; movsw");
+	if (count == 2*4 + 3) small_move("movsl; movsl; movsw; movsb");
+	if (count == 3*4 + 0) small_move("movsl; movsl; movsl");
+	if (count == 3*4 + 1) small_move("movsl; movsl; movsl; movsb");
+	if (count == 3*4 + 2) small_move("movsl; movsl; movsl; movsw");
+	if (count == 3*4 + 3) small_move("movsl; movsl; movsl; movsw; movsb");
+	if (count == 4*4 + 0) small_move("movsl; movsl; movsl; movsl");
+	if (count == 4*4 + 1) small_move("movsl; movsl; movsl; movsl; movsb");
+	/* going over 7 bytes is suboptimal */
+	/* movsw is 2-byte insn, so this one takes 6 bytes: */
+	if (count == 4*4 + 2) small_move("movsl; movsl; movsl; movsl; movsw");
+	/* 7 bytes */
+	if (count == 4*4 + 3) small_move("movsl; movsl; movsl; movsl; movsw; movsb");
+	/* 5 bytes */
+	if (count == 5*4 + 0) small_move("movsl; movsl; movsl; movsl; movsl");
+	/* 6 bytes */
+	if (count == 5*4 + 1) small_move("movsl; movsl; movsl; movsl; movsl; movsb");
+	/* 7 bytes */
+	if (count == 5*4 + 2) small_move("movsl; movsl; movsl; movsl; movsl; movsw");
+	/* 8 bytes, but oh well... */
+	if (count == 5*4 + 3) small_move("movsl; movsl; movsl; movsl; movsl; movsw; movsb");
+	/* 6 bytes */
+	if (count == 6*4 + 0) small_move("movsl; movsl; movsl; movsl; movsl; movsl");
+	/* the rest would be 7+ bytes and is handled below instead */
+#undef small_move
+
+	/* Not small, but multiple-of-4 move.
+	 * "mov <const>,%ecx; rep; movsl" sequence is 7 bytes */
+	__asm__ __volatile__(
+		"	rep; movsl\n"
+		: "=&c" (ecx), "=&S" (esi), "=&D" (edi)
+		: "0" (count / 4), "1" (s), "2" (d)
+		: "memory"
+	);
+	return edi;
+}
+static __always_inline
+void *inlined_memcpy_const_count4(void *d, const void *s, unsigned count)
+{
+	inlined_mempcpy_const_count4(d, s, count);
+	return d;
+}
+#if 1 /* +34 bytes on shared i386 build with gcc 4.3.0 */
+#define mempcpy(d, s, count) ( \
+	( !(__builtin_constant_p(count)) \
+	  || ((count) > (6*4 + 0) && ((count) % 4) != 0) \
+	) \
+	? mempcpy((d), (s), (count)) \
+	: inlined_mempcpy_const_count4((d), (s), (count)) \
+	)
+#define memcpy(d, s, count) ( \
+	( !(__builtin_constant_p(count)) \
+	  || ((count) > (6*4 + 0) && ((count) % 4) != 0) \
+	) \
+	? memcpy((d), (s), (count)) \
+	: inlined_memcpy_const_count4((d), (s), (count)) \
+	)
+#endif
+
+
+static __always_inline
+size_t inlined_strlen(const char *s)
+{
+	int edi;
+	int ecx;
+	__asm__ __volatile__(
+		"	repne; scasb\n"
+	/*	"	notl	%0\n" */
+	/*	"	decl	%0\n" */
+		: "=c" (ecx), "=&D" (edi)
+		: "1" (s), "a" (0), "0" (0xffffffffu)
+		/* : no clobbers */
+	);
+	return -ecx - 1;
+}
+#if 0 /* +1108 bytes on shared i386 build with gcc 4.3.0 */
+#define strlen(s) inlined_strlen(s)
+#endif
+
+
+static __always_inline
+char *inlined_stpcpy(char *dest, const char *src)
+{
+	char *esi, *edi;
+	int eax;
+	__asm__ __volatile__(
+		"1:	lodsb\n"
+		"	stosb\n"
+		"	testb	%%al, %%al\n"
+		"	jnz	1b\n"
+		: "=&S" (esi), "=&D" (edi), "=&a" (eax)
+		: "0" (src), "1" (dest)
+		: "memory"
+	);
+	return edi - 1;
+}
+static __always_inline
+char *inlined_strcpy(char *dest, const char *src)
+{
+	inlined_stpcpy(dest, src);
+	return dest;
+}
+#if 0 /* +562 bytes on shared i386 build with gcc 4.3.0 */
+#define stpcpy(dest, src) inlined_stpcpy(dest, src)
+#define strcpy(dest, src) inlined_strcpy(dest, src)
+#endif
+
+
+static __always_inline
+void *inlined_memchr(const void *s, int c, size_t count)
+{
+	void *edi;
+	int ecx;
+	/* Unfortunately, c gets loaded to %eax (wide insn), not %al */
+	__asm__ __volatile__(
+		"	jecxz	1f\n"
+		"	repne; scasb\n"
+		"	leal	-1(%%edi), %%edi\n"
+		"	je	2f\n"
+		"1:\n"
+		"	xorl	%%edi, %%edi\n"
+		"2:\n"
+		: "=&D" (edi), "=&c" (ecx)
+		: "a" (c), "0" (s), "1" (count)
+		/* : no clobbers */
+	);
+	return edi;
+}
+static __always_inline
+void *inlined_memchr_const_c(const void *s, int c, size_t count)
+{
+	void *edi;
+	int ecx, eax;
+	__asm__ __volatile__(
+		"	jecxz	1f\n"
+		"	movb	%4, %%al\n" /* const c to %%al */
+		"	repne; scasb\n"
+		"	leal	-1(%%edi), %%edi\n"
+		"	je	2f\n"
+		"1:\n"
+		"	xorl	%%edi, %%edi\n"
+		"2:\n"
+		: "=&D" (edi), "=&c" (ecx), "=&a" (eax)
+		: "0" (s), "i" (c), "1" (count)
+		/* : no clobbers */
+	);
+	return edi;
+}
+#if 1 /* +2 bytes on shared i386 build with gcc 4.3.0 */
+#define memchr(s, c, count) ( \
+	__builtin_constant_p(c) \
+	? inlined_memchr_const_c(s, (c) & 0xff, count) \
+	: inlined_memchr(s, c, count) \
+	)
+#endif
+
+#endif /* _LIBC_STRING_i386_H  */
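
For illustration only, not part of the commit: a minimal, hypothetical caller sketching how the macros above are meant to behave once this header is pulled in through <string.h> on an i386 build. The function names below are made up; which calls actually take the inlined path depends on the compiler seeing a constant size (and, for memset, a constant fill byte).

#include <string.h>

/* Hypothetical example.  With the header in effect, the memset/memcpy
 * macros check __builtin_constant_p() and the size at compile time and
 * pick the inlined variant where it pays off. */
void fill_and_copy(char *dst, const char *src)
{
	/* constant 12-byte, multiple-of-4 fill: routed to
	 * inlined_memset_const_c_count4(), the "stosl; stosl; stosl" case */
	memset(dst, 0, 12);

	/* constant 5-byte copy: inlined_memcpy_const_count4() handles it
	 * via the "movsl; movsb" small_move case */
	memcpy(dst, src, 5);
}

/* constant search byte: the memchr() macro picks inlined_memchr_const_c() */
void *find_nul(const void *p, size_t n)
{
	return memchr(p, 0, n);
}

/* non-constant size: the macros fall back to the real library call */
void fill_n(char *dst, size_t n)
{
	memset(dst, 0, n);
}

Note that the self-reference inside each macro (e.g. memset() expanding to a call of memset()) is safe: a function-like macro is not re-expanded within its own expansion, so the fallback branch simply calls the regular libc routine.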
