From e4f55f33f69fce85099dd5936cc74856aa1b453d Mon Sep 17 00:00:00 2001 From: Carmelo Amoroso Date: Tue, 9 Sep 2008 16:55:27 +0000 Subject: Add optimized memcpy implementation for sh4 (from Stuart Menefy @STMicroelectronics). This implementation is based on 'backward copying'. Signed-off-by: Carmelo Amoroso --- libc/string/generic/_memcpy_fwd.c | 185 +++++++++++++++++++++++++++++++++++++ libc/string/generic/memcopy.h | 3 + libc/string/generic/memcpy.c | 186 -------------------------------------- libc/string/generic/memmove.c | 17 +++- 4 files changed, 201 insertions(+), 190 deletions(-) create mode 100644 libc/string/generic/_memcpy_fwd.c (limited to 'libc/string/generic') diff --git a/libc/string/generic/_memcpy_fwd.c b/libc/string/generic/_memcpy_fwd.c new file mode 100644 index 000000000..470165a57 --- /dev/null +++ b/libc/string/generic/_memcpy_fwd.c @@ -0,0 +1,185 @@ +/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to + block beginning at DSTP with LEN `op_t' words (not LEN bytes!). + Both SRCP and DSTP should be aligned for memory operations on `op_t's. 
*/ + +static void _wordcopy_fwd_aligned (long int dstp, long int srcp, size_t len) +{ + op_t a0, a1; + + switch (len % 8) + { + case 2: + a0 = ((op_t *) srcp)[0]; + srcp -= 6 * OPSIZ; + dstp -= 7 * OPSIZ; + len += 6; + goto do1; + case 3: + a1 = ((op_t *) srcp)[0]; + srcp -= 5 * OPSIZ; + dstp -= 6 * OPSIZ; + len += 5; + goto do2; + case 4: + a0 = ((op_t *) srcp)[0]; + srcp -= 4 * OPSIZ; + dstp -= 5 * OPSIZ; + len += 4; + goto do3; + case 5: + a1 = ((op_t *) srcp)[0]; + srcp -= 3 * OPSIZ; + dstp -= 4 * OPSIZ; + len += 3; + goto do4; + case 6: + a0 = ((op_t *) srcp)[0]; + srcp -= 2 * OPSIZ; + dstp -= 3 * OPSIZ; + len += 2; + goto do5; + case 7: + a1 = ((op_t *) srcp)[0]; + srcp -= 1 * OPSIZ; + dstp -= 2 * OPSIZ; + len += 1; + goto do6; + + case 0: + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + return; + a0 = ((op_t *) srcp)[0]; + srcp -= 0 * OPSIZ; + dstp -= 1 * OPSIZ; + goto do7; + case 1: + a1 = ((op_t *) srcp)[0]; + srcp -=-1 * OPSIZ; + dstp -= 0 * OPSIZ; + len -= 1; + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + goto do0; + goto do8; /* No-op. */ + } + + do + { + do8: + a0 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a1; + do7: + a1 = ((op_t *) srcp)[1]; + ((op_t *) dstp)[1] = a0; + do6: + a0 = ((op_t *) srcp)[2]; + ((op_t *) dstp)[2] = a1; + do5: + a1 = ((op_t *) srcp)[3]; + ((op_t *) dstp)[3] = a0; + do4: + a0 = ((op_t *) srcp)[4]; + ((op_t *) dstp)[4] = a1; + do3: + a1 = ((op_t *) srcp)[5]; + ((op_t *) dstp)[5] = a0; + do2: + a0 = ((op_t *) srcp)[6]; + ((op_t *) dstp)[6] = a1; + do1: + a1 = ((op_t *) srcp)[7]; + ((op_t *) dstp)[7] = a0; + + srcp += 8 * OPSIZ; + dstp += 8 * OPSIZ; + len -= 8; + } + while (len != 0); + + /* This is the right position for do0. Please don't move + it into the loop. */ + do0: + ((op_t *) dstp)[0] = a1; +} + +/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to + block beginning at DSTP with LEN `op_t' words (not LEN bytes!). + DSTP should be aligned for memory operations on `op_t's, but SRCP must + *not* be aligned. 
*/ + +static void _wordcopy_fwd_dest_aligned (long int dstp, long int srcp, size_t len) +{ + op_t a0, a1, a2, a3; + int sh_1, sh_2; + + /* Calculate how to shift a word read at the memory operation + aligned srcp to make it aligned for copy. */ + + sh_1 = 8 * (srcp % OPSIZ); + sh_2 = 8 * OPSIZ - sh_1; + + /* Make SRCP aligned by rounding it down to the beginning of the `op_t' + it points in the middle of. */ + srcp &= -OPSIZ; + + switch (len % 4) + { + case 2: + a1 = ((op_t *) srcp)[0]; + a2 = ((op_t *) srcp)[1]; + srcp -= 1 * OPSIZ; + dstp -= 3 * OPSIZ; + len += 2; + goto do1; + case 3: + a0 = ((op_t *) srcp)[0]; + a1 = ((op_t *) srcp)[1]; + srcp -= 0 * OPSIZ; + dstp -= 2 * OPSIZ; + len += 1; + goto do2; + case 0: + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + return; + a3 = ((op_t *) srcp)[0]; + a0 = ((op_t *) srcp)[1]; + srcp -=-1 * OPSIZ; + dstp -= 1 * OPSIZ; + len += 0; + goto do3; + case 1: + a2 = ((op_t *) srcp)[0]; + a3 = ((op_t *) srcp)[1]; + srcp -=-2 * OPSIZ; + dstp -= 0 * OPSIZ; + len -= 1; + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + goto do0; + goto do4; /* No-op. */ + } + + do + { + do4: + a0 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2); + do3: + a1 = ((op_t *) srcp)[1]; + ((op_t *) dstp)[1] = MERGE (a3, sh_1, a0, sh_2); + do2: + a2 = ((op_t *) srcp)[2]; + ((op_t *) dstp)[2] = MERGE (a0, sh_1, a1, sh_2); + do1: + a3 = ((op_t *) srcp)[3]; + ((op_t *) dstp)[3] = MERGE (a1, sh_1, a2, sh_2); + + srcp += 4 * OPSIZ; + dstp += 4 * OPSIZ; + len -= 4; + } + while (len != 0); + + /* This is the right position for do0. Please don't move + it into the loop. 
*/ + do0: + ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2); +} diff --git a/libc/string/generic/memcopy.h b/libc/string/generic/memcopy.h index df1ba9a97..fab4da764 100644 --- a/libc/string/generic/memcopy.h +++ b/libc/string/generic/memcopy.h @@ -107,6 +107,7 @@ typedef unsigned char byte; } \ } while (0) +#ifdef __ARCH_HAS_BWD_MEMCPY__ /* Copy *up to* NBYTES bytes from SRC_BP to DST_BP, with the assumption that DST_BP is aligned on an OPSIZ multiple. If not all bytes could be easily copied, store remaining number of bytes @@ -125,6 +126,8 @@ typedef unsigned char byte; (nbytes_left) = (nbytes) % OPSIZ; \ } while (0) +#endif + /* Copy *up to* NBYTES_TO_COPY bytes from SRC_END_PTR to DST_END_PTR, beginning at the words (of type op_t) right before the pointers and continuing towards smaller addresses. May take advantage of that diff --git a/libc/string/generic/memcpy.c b/libc/string/generic/memcpy.c index fa6606ceb..4284f2fe5 100644 --- a/libc/string/generic/memcpy.c +++ b/libc/string/generic/memcpy.c @@ -25,192 +25,6 @@ /* Experimentally off - libc_hidden_proto(memcpy) */ -/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to - block beginning at DSTP with LEN `op_t' words (not LEN bytes!). - Both SRCP and DSTP should be aligned for memory operations on `op_t's. 
*/ - -static void _wordcopy_fwd_aligned (long int dstp, long int srcp, size_t len) -{ - op_t a0, a1; - - switch (len % 8) - { - case 2: - a0 = ((op_t *) srcp)[0]; - srcp -= 6 * OPSIZ; - dstp -= 7 * OPSIZ; - len += 6; - goto do1; - case 3: - a1 = ((op_t *) srcp)[0]; - srcp -= 5 * OPSIZ; - dstp -= 6 * OPSIZ; - len += 5; - goto do2; - case 4: - a0 = ((op_t *) srcp)[0]; - srcp -= 4 * OPSIZ; - dstp -= 5 * OPSIZ; - len += 4; - goto do3; - case 5: - a1 = ((op_t *) srcp)[0]; - srcp -= 3 * OPSIZ; - dstp -= 4 * OPSIZ; - len += 3; - goto do4; - case 6: - a0 = ((op_t *) srcp)[0]; - srcp -= 2 * OPSIZ; - dstp -= 3 * OPSIZ; - len += 2; - goto do5; - case 7: - a1 = ((op_t *) srcp)[0]; - srcp -= 1 * OPSIZ; - dstp -= 2 * OPSIZ; - len += 1; - goto do6; - - case 0: - if (OP_T_THRES <= 3 * OPSIZ && len == 0) - return; - a0 = ((op_t *) srcp)[0]; - srcp -= 0 * OPSIZ; - dstp -= 1 * OPSIZ; - goto do7; - case 1: - a1 = ((op_t *) srcp)[0]; - srcp -=-1 * OPSIZ; - dstp -= 0 * OPSIZ; - len -= 1; - if (OP_T_THRES <= 3 * OPSIZ && len == 0) - goto do0; - goto do8; /* No-op. */ - } - - do - { - do8: - a0 = ((op_t *) srcp)[0]; - ((op_t *) dstp)[0] = a1; - do7: - a1 = ((op_t *) srcp)[1]; - ((op_t *) dstp)[1] = a0; - do6: - a0 = ((op_t *) srcp)[2]; - ((op_t *) dstp)[2] = a1; - do5: - a1 = ((op_t *) srcp)[3]; - ((op_t *) dstp)[3] = a0; - do4: - a0 = ((op_t *) srcp)[4]; - ((op_t *) dstp)[4] = a1; - do3: - a1 = ((op_t *) srcp)[5]; - ((op_t *) dstp)[5] = a0; - do2: - a0 = ((op_t *) srcp)[6]; - ((op_t *) dstp)[6] = a1; - do1: - a1 = ((op_t *) srcp)[7]; - ((op_t *) dstp)[7] = a0; - - srcp += 8 * OPSIZ; - dstp += 8 * OPSIZ; - len -= 8; - } - while (len != 0); - - /* This is the right position for do0. Please don't move - it into the loop. */ - do0: - ((op_t *) dstp)[0] = a1; -} - -/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to - block beginning at DSTP with LEN `op_t' words (not LEN bytes!). - DSTP should be aligned for memory operations on `op_t's, but SRCP must - *not* be aligned. 
*/ - -static void _wordcopy_fwd_dest_aligned (long int dstp, long int srcp, size_t len) -{ - op_t a0, a1, a2, a3; - int sh_1, sh_2; - - /* Calculate how to shift a word read at the memory operation - aligned srcp to make it aligned for copy. */ - - sh_1 = 8 * (srcp % OPSIZ); - sh_2 = 8 * OPSIZ - sh_1; - - /* Make SRCP aligned by rounding it down to the beginning of the `op_t' - it points in the middle of. */ - srcp &= -OPSIZ; - - switch (len % 4) - { - case 2: - a1 = ((op_t *) srcp)[0]; - a2 = ((op_t *) srcp)[1]; - srcp -= 1 * OPSIZ; - dstp -= 3 * OPSIZ; - len += 2; - goto do1; - case 3: - a0 = ((op_t *) srcp)[0]; - a1 = ((op_t *) srcp)[1]; - srcp -= 0 * OPSIZ; - dstp -= 2 * OPSIZ; - len += 1; - goto do2; - case 0: - if (OP_T_THRES <= 3 * OPSIZ && len == 0) - return; - a3 = ((op_t *) srcp)[0]; - a0 = ((op_t *) srcp)[1]; - srcp -=-1 * OPSIZ; - dstp -= 1 * OPSIZ; - len += 0; - goto do3; - case 1: - a2 = ((op_t *) srcp)[0]; - a3 = ((op_t *) srcp)[1]; - srcp -=-2 * OPSIZ; - dstp -= 0 * OPSIZ; - len -= 1; - if (OP_T_THRES <= 3 * OPSIZ && len == 0) - goto do0; - goto do4; /* No-op. */ - } - - do - { - do4: - a0 = ((op_t *) srcp)[0]; - ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2); - do3: - a1 = ((op_t *) srcp)[1]; - ((op_t *) dstp)[1] = MERGE (a3, sh_1, a0, sh_2); - do2: - a2 = ((op_t *) srcp)[2]; - ((op_t *) dstp)[2] = MERGE (a0, sh_1, a1, sh_2); - do1: - a3 = ((op_t *) srcp)[3]; - ((op_t *) dstp)[3] = MERGE (a1, sh_1, a2, sh_2); - - srcp += 4 * OPSIZ; - dstp += 4 * OPSIZ; - len -= 4; - } - while (len != 0); - - /* This is the right position for do0. Please don't move - it into the loop. 
*/ - do0: - ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2); -} - void *memcpy (void *dstpp, const void *srcpp, size_t len) { unsigned long int dstp = (long int) dstpp; diff --git a/libc/string/generic/memmove.c b/libc/string/generic/memmove.c index b2a017b16..7f945b150 100644 --- a/libc/string/generic/memmove.c +++ b/libc/string/generic/memmove.c @@ -24,12 +24,18 @@ #include "memcopy.h" #include "pagecopy.h" +#ifdef __ARCH_HAS_BWD_MEMCPY__ +/* generic-opt memmove assumes memcpy does forward copying! */ +#include "_memcpy_fwd.c" +#endif + /* Experimentally off - libc_hidden_proto(memmove) */ /* Experimentally off - libc_hidden_proto(memcpy) */ static void _wordcopy_bwd_aligned (long int dstp, long int srcp, size_t len) { - op_t a0, a1; + op_t a0 = 0; + op_t a1 = 0; switch (len % 8) { @@ -133,7 +139,10 @@ static void _wordcopy_bwd_aligned (long int dstp, long int srcp, size_t len) static void _wordcopy_bwd_dest_aligned (long int dstp, long int srcp, size_t len) { - op_t a0, a1, a2, a3; + op_t a0 = 0; + op_t a1 = 0; + op_t a2 = 0; + op_t a3 = 0; int sh_1, sh_2; /* Calculate how to shift a word read at the memory operation @@ -218,8 +227,8 @@ void *memmove (void *dest, const void *src, size_t len) Reduces the working set. */ if (dstp - srcp >= len) /* *Unsigned* compare! */ { -#if 1 -#warning REMINDER: generic-opt memmove assumes memcpy does forward copying! +#ifndef __ARCH_HAS_BWD_MEMCPY__ + /* memcpy copies forward on this arch, so it is safe to call here; a backward-copying memcpy could not be used */ memcpy(dest, src, len); #else /* Copy from the beginning to the end. */ -- cgit v1.2.3