Diffstat (limited to 'libc/string')
-rw-r--r--  libc/string/Makefile.in       |   5
-rw-r--r--  libc/string/sh/memchr.S       |  30
-rw-r--r--  libc/string/sh/sh4/memcpy.S   | 120
-rw-r--r--  libc/string/sh/sh4/memmove.c  | 117
-rw-r--r--  libc/string/sh/sh4/memset.S   | 146
-rw-r--r--  libc/string/sh/sh4/strcpy.S   |  28
-rw-r--r--  libc/string/sh/sh4/strncpy.S  |  43
-rw-r--r--  libc/string/sh/strlen.S       |  75
8 files changed, 553 insertions(+), 11 deletions(-)
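
Editor's note: the first hunk below extends libc/string/Makefile.in so that a subarch directory can contribute C sources (here sh/sh4/memmove.c) alongside assembler files, mirroring the existing *.S collection logic. As a reading aid, a simplified sketch of the same wildcard/patsubst pairing, using hypothetical SUBARCH_* names and paths rather than the real STRING_SUBARCH_* variables from the hunk:

	# Simplified sketch (hypothetical names): collect the per-subarch
	# C sources and map each one to an object file in the build tree.
	SUBARCH_DIR := libc/string/sh/sh4
	SUBARCH_OUT := build/libc/string/sh/sh4

	SUBARCH_CSRC := $(wildcard $(SUBARCH_DIR)/*.c)
	SUBARCH_COBJ := $(patsubst $(SUBARCH_DIR)/%.c,$(SUBARCH_OUT)/%.o,$(SUBARCH_CSRC))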
diff --git a/libc/string/Makefile.in b/libc/string/Makefile.in
index 08a1856b7..67679ece7 100644
--- a/libc/string/Makefile.in
+++ b/libc/string/Makefile.in
@@ -18,7 +18,10 @@ STRING_SUBARCH_OUT := $(top_builddir)libc/string/$(TARGET_ARCH)/$(TARGET_SUBARCH)
 STRING_SUBARCH_SSRC := $(wildcard $(STRING_SUBARCH_OUT)/*.S)
 STRING_SUBARCH_SOBJ := $(patsubst $(STRING_SUBARCH_DIR)/%.S,$(STRING_SUBARCH_OUT)/%.o,$(STRING_SUBARCH_SSRC))
 
-STRING_SUBARCH_OBJS := $(STRING_SUBARCH_SOBJ)
+STRING_SUBARCH_CSRC := $(wildcard $(STRING_SUBARCH_OUT)/*.c)
+STRING_SUBARCH_COBJ := $(patsubst $(STRING_SUBARCH_DIR)/%.c,$(STRING_SUBARCH_OUT)/%.o,$(STRING_SUBARCH_CSRC))
+
+STRING_SUBARCH_OBJS := $(STRING_SUBARCH_SOBJ) $(STRING_SUBARCH_COBJ)
 endif
 
 # Collect the arch specific implementation (asm, c files)
diff --git a/libc/string/sh/memchr.S b/libc/string/sh/memchr.S
new file mode 100644
index 000000000..6b7142f69
--- /dev/null
+++ b/libc/string/sh/memchr.S
@@ -0,0 +1,30 @@
+/* $Id: memchr.S,v 1.1 2000/04/14 16:49:01 mjd Exp $
+ *
+ * "memchr" implementation of SuperH
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ *
+ * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
+ */
+
+/*
+ * void *memchr(const void *s, int c, size_t n);
+ */
+
+#include <sysdep.h>
+
+ENTRY(memchr)
+	tst	r6,r6
+	bt/s	2f
+	 exts.b	r5,r5
+1:	mov.b	@r4,r1
+	cmp/eq	r1,r5
+	bt/s	3f
+	 dt	r6
+	bf/s	1b
+	 add	#1,r4
+2:	mov	#0,r4
+3:	rts
+	 mov	r4,r0
+END(memchr)
+libc_hidden_def (memchr)
diff --git a/libc/string/sh/sh4/memcpy.S b/libc/string/sh/sh4/memcpy.S
index 0954bce85..efdaf8bba 100644
--- a/libc/string/sh/sh4/memcpy.S
+++ b/libc/string/sh/sh4/memcpy.S
@@ -6,6 +6,9 @@
  *   Modified from memcpy.S and micro-optimised for SH4
  *   Stuart Menefy (stuart.menefy@st.com)
  *
+ * Copyright (c) 2009  STMicroelectronics Ltd
+ *   Optimised using prefetching and 64bit data transfer via FPU
+ *   Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
  */
 
 /*
@@ -15,8 +18,25 @@
  * If there is an overlap, then the results are undefined.
  */
 
+#include <sysdep.h>
 #include <endian.h>
 
+#ifdef __LITTLE_ENDIAN__
+#define	MEMCPY_USES_FPU
+/* Use paired single precision load or store mode for 64-bit transfers.
+ * FPSCR.SZ=1, FPSCR.PR=0 is well defined on both SH4-200 and SH4-300.
+ * Currently this is only implemented and tested for little endian mode. */
+.macro FPU_SET_PAIRED_PREC
+	sts	fpscr, r7
+	mov	#0x10, r6	! PR=0 SZ=1
+	shll16	r6
+	lds	r6, fpscr
+.endm
+.macro RESTORE_FPSCR
+	lds	r7, fpscr
+.endm
+#endif
+
 	!
 	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
 	!
@@ -157,12 +177,7 @@
 9:	rts
 	 nop
 
-/* void * memcpy(void *dst, const void *src, size_t len) */
-.text
-.align 4
-.type  memcpy,@function
-.globl memcpy;
-memcpy:
+ENTRY(memcpy)
 
 	! Calculate the invariants which will be used in the remainder
 	! of the code:
@@ -189,9 +204,7 @@ memcpy:
 	mov	r4, r0		!   5 MT (0 cycle latency)
 	add	r6, r0		!  49 EX
 
-	mov	#16, r1		!   6 EX
 	bt/s	.Lcase00	! 111 BR		(aligned)
-
 	 sub	r4, r5		!  75 EX
 
 	! Arguments are not nicely long word aligned or zero len.
@@ -207,6 +220,7 @@
 	! However the penalty for getting it 'wrong' is much higher for long word
 	! aligned data (and this is more common), so use a value of 16.
 
+	mov	#16, r1		!   6 EX
 	cmp/gt	r6,r1		!  56 MT
 	add	#-1,r5		!  50 EX
 
@@ -447,6 +461,92 @@
 	 mov.l	r7, @-r0	!  30 LS
 
+#ifdef MEMCPY_USES_FPU
+	! Copy the cache line aligned blocks by using the FPU registers.
+	! If src and dst are well aligned adopt 64-bit data transfer.
+	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
+	!   r5:	 src (was r0+r5)
+	!   r1:	 dest (was r0)
+1:
+	add	r0, r5
+	mov	r0, r1
+
+	add	#-0x1c, r5
+	mov	r5, r0
+
+	tst	#7, r0		! src is 8byte aligned
+	mov	r5, r3
+
+	add	#-64, r3	! To prefetch ahead
+	bt/s	3f
+
+	 pref	@r3
+
+2:	fmov.s	@r5+, fr0
+	mov	r1, r6
+	fmov.s	@r5+, fr1
+	add	#-32, r6
+	fmov.s	@r5+, fr2
+	fmov.s	@r5+, fr3
+	fmov.s	@r5+, fr4
+	fmov.s	@r5+, fr5
+	fmov.s	@r5+, fr6
+	fmov.s	@r5+, fr7
+	add	#-0x40, r5
+
+	movca.l	r0, @r6		! Cache allocate + store on dst-32.
+
+	fmov.s	fr7, @-r1
+	fmov.s	fr6, @-r1
+	fmov.s	fr5, @-r1
+	fmov.s	fr4, @-r1
+	fmov.s	fr3, @-r1
+	fmov.s	fr2, @-r1
+	fmov.s	fr1, @-r1
+	fmov.s	fr0, @-r1
+
+	add	#-32, r3
+	cmp/eq	r2,r1
+
+	bf/s	2b
+	 pref	@r3		! Prefetch the next cache line.
+
+	bra	5f
+
+3:	FPU_SET_PAIRED_PREC
+
+4:	fmov	@r5+, dr0
+	mov	r1, r6
+	fmov	@r5+, dr2
+	add	#-32, r6
+	fmov	@r5+, dr4
+	fmov	@r5+, dr6
+	add	#-0x40, r5
+
+	movca.l	r0, @r6
+
+	fmov	dr6, @-r1
+	fmov	dr4, @-r1
+	fmov	dr2, @-r1
+	fmov	dr0, @-r1
+	add	#-32, r3
+	cmp/eq	r2,r1
+
+	bf/s	4b
+	 pref	@r3
+
+	RESTORE_FPSCR
+
+5:	mov	r1, r0
+
+	cmp/eq	r4, r0		!  54 MT
+	bf/s	1f		! 109 BR
+	 sub	r1, r5		!  75 EX
+
+	rts
+	 nop
+1:
+#else
 	! Copy the cache line aligned blocks
 	!
 	! In use: r0, r2, r4, r5
@@ -512,6 +612,7 @@
 	rts
 1:	 mov.l	@r15+, r8	!  15 LS
+#endif
 
 	sub	r4, r1		!  75 EX		(len remaining)
 
 	! number of trailing bytes is non-zero
@@ -803,6 +904,5 @@
 	rts
 	 mov.b	r1,@-r0
 
-.size memcpy,.-memcpy;
-
+END(memcpy)
 libc_hidden_def (memcpy)
diff --git a/libc/string/sh/sh4/memmove.c b/libc/string/sh/sh4/memmove.c
new file mode 100644
index 000000000..4d52db2ca
--- /dev/null
+++ b/libc/string/sh/sh4/memmove.c
@@ -0,0 +1,117 @@
+/* memmove implementation for SH4
+ *
+ * Copyright (C) 2009 STMicroelectronics Ltd.
+ *
+ * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
+ *
+ * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
+ */
+
+#include <string.h>
+
+
+#define FPSCR_SR	(1 << 20)
+#define STORE_FPSCR(x)	__asm__ volatile("sts fpscr, %0" : "=r"(x))
+#define LOAD_FPSCR(x)	__asm__ volatile("lds %0, fpscr" : : "r"(x))
+
+static void fpu_optimised_copy_fwd(void *dest, const void *src, size_t len)
+{
+	char *d = (char *)dest;
+	char *s = (char *)src;
+
+	if (len >= 64) {
+		unsigned long fpscr;
+		int *s1;
+		int *d1;
+
+		/* Align the dest to 8 byte boundary. */
+		while ((unsigned)d & 0x7) {
+			*d++ = *s++;
+			len--;
+		}
+
+		s1 = (int *)s;
+		d1 = (int *)d;
+
+		/* check if src is well aligned to use the FPU */
+		if (!((unsigned)s1 & 0x7)) {
+
+			/* Align the dest to cache-line boundary */
+			while ((unsigned)d1 & 0x1c) {
+				*d1++ = *s1++;
+				len -= 4;
+			}
+
+			/* Use paired single precision load or store mode for
+			 * 64-bit transfers. */
+			STORE_FPSCR(fpscr);
+			LOAD_FPSCR(FPSCR_SR);
+
+			while (len >= 32) {
+				__asm__ volatile ("fmov @%0+,dr0":"+r" (s1));
+				__asm__ volatile ("fmov @%0+,dr2":"+r" (s1));
+				__asm__ volatile ("fmov @%0+,dr4":"+r" (s1));
+				__asm__ volatile ("fmov @%0+,dr6":"+r" (s1));
+				__asm__
+				    volatile ("fmov dr0,@%0"::"r"
+					      (d1):"memory");
+				d1 += 2;
+				__asm__
+				    volatile ("fmov dr2,@%0"::"r"
+					      (d1):"memory");
+				d1 += 2;
+				__asm__
+				    volatile ("fmov dr4,@%0"::"r"
+					      (d1):"memory");
+				d1 += 2;
+				__asm__
+				    volatile ("fmov dr6,@%0"::"r"
+					      (d1):"memory");
+				d1 += 2;
+				len -= 32;
+			}
+			LOAD_FPSCR(fpscr);
+		}
+		s = (char *)s1;
+		d = (char *)d1;
+		/* TODO: other subcases could be covered here. */
+	}
+	/* Go to per-byte copy */
+	while (len > 0) {
+		*d++ = *s++;
+		len--;
+	}
+	return;
+}
+
+void *memmove(void *dest, const void *src, size_t len)
+{
+	unsigned long int d = (long int)dest;
+	unsigned long int s = (long int)src;
+	unsigned long int res;
+
+	if (d >= s)
+		res = d - s;
+	else
+		res = s - d;
+	/*
+	 * 1) dest and src do not overlap   ==> memcpy (BWD/FWD)
+	 * 2) dest and src overlap 100%     ==> memcpy (BWD/FWD)
+	 * 3) left-to-right overlap ==>  Copy from the beginning to the end
+	 * 4) right-to-left overlap ==>  Copy from the end to the beginning
+	 */
+
+	if (res == 0)		/* 100% overlap */
+		memcpy(dest, src, len);
+	else if (res >= len)	/* no overlap */
+		memcpy(dest, src, len);
+	else {
+		if (d > s)	/* right-to-left overlap */
+			memcpy(dest, src, len);	/* memcpy is BWD */
+		else		/* cannot use SH4 memcpy for this case */
+			fpu_optimised_copy_fwd(dest, src, len);
+	}
+	return (dest);
+}
+
+libc_hidden_def(memmove)
diff --git a/libc/string/sh/sh4/memset.S b/libc/string/sh/sh4/memset.S
new file mode 100644
index 000000000..1a57cb969
--- /dev/null
+++ b/libc/string/sh/sh4/memset.S
@@ -0,0 +1,146 @@
+/* $Id: memset.S,v 1.1 2000/04/14 16:49:01 mjd Exp $
+ *
+ * "memset" implementation of SuperH
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ *
+ * Copyright (c) 2009  STMicroelectronics Ltd
+ *   Optimised using 64bit data transfer via FPU
+ *   Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
+ *
+ * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
+ */
+
+/*
+ *            void *memset(void *s, int c, size_t n);
+ */
+
+#include <sysdep.h>
+
+#ifdef __LITTLE_ENDIAN__
+#define MEMSET_USES_FPU
+/* Use paired single precision load or store mode for 64-bit transfers.
+ * FPSCR.SZ=1, FPSCR.PR=0 is well defined on both SH4-200 and SH4-300.
+ * Currently this is only implemented and tested for little endian mode. */
+.macro FPU_SET_PAIRED_PREC
+	sts	fpscr, r3
+	mov	#0x10, r0	! PR=0 SZ=1
+	shll16	r0
+	lds	r0, fpscr
+.endm
+.macro RESTORE_FPSCR
+	lds	r3, fpscr
+.endm
+#endif
+
+ENTRY(memset)
+	tst	r6,r6
+	bt/s	5f		! if n=0, do nothing
+	 add	r6,r4
+	mov	#12,r0
+	cmp/gt	r6,r0
+	bt/s	4f		! if it's too small, set a byte at once
+	 mov	r4,r0
+	and	#3,r0
+	cmp/eq	#0,r0
+	bt/s	2f		! It's aligned
+	 sub	r0,r6
+1:
+	dt	r0
+	bf/s	1b
+	 mov.b	r5,@-r4
+2:				! make VVVV
+	extu.b	r5,r5
+	swap.b	r5,r0		!   V0
+	or	r0,r5		!   VV
+	swap.w	r5,r0		! VV00
+	or	r0,r5		! VVVV
+
+	! Enough bytes need to be set
+	mov	#0x40, r0	! (MT)
+	cmp/gt	r6,r0		! (MT)  64 > len => slow loop
+
+	bt/s	22f
+	 mov	r6,r0
+
+	! align the dst to the cache block size if necessary
+	mov	r4, r3
+	mov	#~(0x1f), r1
+
+	and	r3, r1
+	cmp/eq	r3, r1
+
+	bt/s	11f		! dst is already aligned
+	 sub	r1, r3		! r3-r1 -> r3
+	shlr2	r3		! number of loops
+
+10:	mov.l	r5,@-r4
+	dt	r3
+	bf/s	10b
+	 add	#-4, r6
+
+11:	! dst is 32byte aligned
+	mov	r6,r2
+	mov	#-5,r0
+	shld	r0,r2		! number of loops
+
+#ifdef MEMSET_USES_FPU
+	lds	r5, fpul	! (CO)
+	fsts	fpul, fr0	! Dr0 will be 'VVVVVVVV'
+	fsts	fpul, fr1
+
+	FPU_SET_PAIRED_PREC
+12:
+	add	#-0x20, r6	!(MT)
+	fmov	dr0, @-r4
+	fmov	dr0, @-r4
+	fmov	dr0, @-r4
+	dt	r2
+	bf/s	12b		!(BR)
+	 fmov	dr0, @-r4
+
+	RESTORE_FPSCR
+#else
+12:
+	mov.l	r5,@-r4
+	mov.l	r5,@-r4
+	mov.l	r5,@-r4
+	mov.l	r5,@-r4
+	mov.l	r5,@-r4
+	mov.l	r5,@-r4
+	add	#-0x20, r6
+	mov.l	r5,@-r4
+	dt	r2
+	bf/s	12b
+	 mov.l	r5,@-r4
+#endif
+	tst	r6,r6
+	bt/s	5f
+	 mov	#8, r0
+
+	cmp/ge	r0, r6
+	bf/s	4f
+	 mov	r6,r0
+22:
+	shlr2	r0
+	shlr	r0		! r0 = r6 >> 3
+3:
+	dt	r0
+	mov.l	r5,@-r4		! set 8 bytes at once
+	bf/s	3b
+	 mov.l	r5,@-r4
+	!
+	mov	#7,r0
+	and	r0,r6
+	tst	r6,r6
+	bt	5f
+	! fill bytes
+4:
+	dt	r6
+	bf/s	4b
+	 mov.b	r5,@-r4
+5:
+	rts
+	 mov	r4,r0
+END(memset)
+libc_hidden_def (memset)
diff --git a/libc/string/sh/sh4/strcpy.S b/libc/string/sh/sh4/strcpy.S
new file mode 100644
index 000000000..0f8278017
--- /dev/null
+++ b/libc/string/sh/sh4/strcpy.S
@@ -0,0 +1,28 @@
+/* strcpy implementation for SuperH
+ *
+ * Copyright (C) 2009 STMicroelectronics Ltd.
+ *
+ * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
+ *
+ * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
+ */
+
+/*
+	char *strcpy(char *dest, const char *src);
+ */
+
+#include <sysdep.h>
+
+ENTRY(strcpy)
+	mov	r4,r2
+1:
+	mov.b	@r5+,r1
+	tst	r1,r1
+	mov.b	r1,@r2
+	bf/s	1b
+	 add	#1,r2
+
+	rts
+	 mov	r4,r0
+END(strcpy)
+libc_hidden_def (strcpy)
diff --git a/libc/string/sh/sh4/strncpy.S b/libc/string/sh/sh4/strncpy.S
new file mode 100644
index 000000000..8a16f39d4
--- /dev/null
+++ b/libc/string/sh/sh4/strncpy.S
@@ -0,0 +1,43 @@
+/* strncpy implementation for SuperH
+ *
+ * Copyright (C) 2009 STMicroelectronics Ltd.
+ *
+ * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
+ *
+ * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
+ */
+
+/*
+	char *strncpy(char *dest, const char *src, size_t n);
+ */
+
+#include <sysdep.h>
+
+ENTRY(strncpy)
+	mov	#0,r0
+	bra	2f
+	 mov	r4,r2
+1:
+	mov.b	r1,@(r0,r2)
+	add	#1,r0
+2:
+	cmp/hs	r6,r0
+	bt	5f
+	mov.b	@(r0,r5),r1
+	tst	r1,r1
+	bf/s	1b
+	 cmp/hs	r6,r0
+	bra	4f
+	 nop
+3:
+	mov.b	r1,@(r0,r2)
+	add	#1,r0
+	cmp/hs	r6,r0
+4:
+	bf/s	3b
+	 mov	#0,r1
+5:
+	rts
+	 mov	r2,r0
+END(strncpy)
+libc_hidden_def(strncpy)
diff --git a/libc/string/sh/strlen.S b/libc/string/sh/strlen.S
new file mode 100644
index 000000000..1ccecc17b
--- /dev/null
+++ b/libc/string/sh/strlen.S
@@ -0,0 +1,75 @@
+/* $Id: strlen.S,v 1.2 2001/06/29 14:07:15 gniibe Exp $
+ *
+ * "strlen" implementation of SuperH
+ *
+ * Copyright (C) 1999  Kaz Kojima
+ *
+ * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
+ */
+
+/* size_t strlen (const char *s)  */
+
+#include <sysdep.h>
+#include <endian.h>
+
+ENTRY(strlen)
+	mov	r4,r0
+	and	#3,r0
+	tst	r0,r0
+	bt/s	1f
+	 mov	#0,r2
+
+	add	#-1,r0
+	shll2	r0
+	shll	r0
+	braf	r0
+	 nop
+
+	mov.b	@r4+,r1
+	tst	r1,r1
+	bt	8f
+	add	#1,r2
+
+	mov.b	@r4+,r1
+	tst	r1,r1
+	bt	8f
+	add	#1,r2
+
+	mov.b	@r4+,r1
+	tst	r1,r1
+	bt	8f
+	add	#1,r2
+
+1:
+	mov	#0,r3
+2:
+	mov.l	@r4+,r1
+	cmp/str	r3,r1
+	bf/s	2b
+	 add	#4,r2
+
+	add	#-4,r2
+#ifndef __LITTLE_ENDIAN__
+	swap.b	r1,r1
+	swap.w	r1,r1
+	swap.b	r1,r1
+#endif
+	extu.b	r1,r0
+	tst	r0,r0
+	bt/s	8f
+	 shlr8	r1
+	add	#1,r2
+	extu.b	r1,r0
+	tst	r0,r0
+	bt/s	8f
+	 shlr8	r1
+	add	#1,r2
+	extu.b	r1,r0
+	tst	r0,r0
+	bt	8f
+	add	#1,r2
+8:
+	rts
+	 mov	r2,r0
+END(strlen)
+libc_hidden_def (strlen)
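
Editor's note on the dispatch in memmove.c above: the SH4 memcpy in this tree copies backwards (from the end of the buffer towards the beginning), so it is safe when the buffers do not overlap, overlap completely, or overlap right-to-left (dest above src); only the left-to-right case (dest below src) forces the separate forward copy. A minimal C sketch of the same case analysis, with a plain byte loop standing in for fpu_optimised_copy_fwd (my_memmove and copy_fwd are illustrative names, not part of the patch):

	#include <stddef.h>
	#include <string.h>

	/* Illustrative stand-in: any strictly left-to-right copy
	 * is safe for the left-to-right overlap case. */
	static void copy_fwd(char *d, const char *s, size_t len)
	{
		while (len--)
			*d++ = *s++;
	}

	/* Same case analysis as memmove.c: res is the distance between
	 * the two buffers; overlap only matters when res < len. */
	void *my_memmove(void *dest, const void *src, size_t len)
	{
		unsigned long d = (unsigned long)dest;
		unsigned long s = (unsigned long)src;
		unsigned long res = d >= s ? d - s : s - d;

		if (res == 0 || res >= len || d > s)
			/* 100% overlap, no overlap, or right-to-left
			 * overlap: a backward-copying memcpy is safe */
			memcpy(dest, src, len);
		else
			/* left-to-right overlap: must copy forwards */
			copy_fwd(dest, src, len);
		return dest;
	}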

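The FPU_SET_PAIRED_PREC/RESTORE_FPSCR macros in memcpy.S and memset.S and the STORE_FPSCR/LOAD_FPSCR macros in memmove.c all rely on the same SH4 feature: with FPSCR.SZ=1 (and PR=0), a single fmov moves 64 bits through a register pair (dr0 is fr0/fr1, and so on), so a 32-byte cache line takes four transfers. A hedged C sketch of the save/set/restore pattern, assuming an SH-4 GCC target; the function names are illustrative, and like the patch it writes FPSCR wholesale rather than read-modify-write:

	/* FPSCR bit 20 is SZ; writing SZ=1, PR=0 selects 64-bit
	 * (paired single) fmov transfers on SH4-200/SH4-300. */
	#define FPSCR_SZ	(1UL << 20)

	static inline unsigned long enter_paired_fmov(void)
	{
		unsigned long saved;
		__asm__ volatile("sts fpscr, %0" : "=r"(saved));
		__asm__ volatile("lds %0, fpscr" : : "r"(FPSCR_SZ));
		return saved;
	}

	static inline void leave_paired_fmov(unsigned long saved)
	{
		__asm__ volatile("lds %0, fpscr" : : "r"(saved));
	}

memmove.c performs exactly this sequence inline around its 32-byte fmov loop; the assembler versions do the same with r7 (memcpy) or r3 (memset) holding the saved FPSCR.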