diff options
-rw-r--r-- | libc/string/sh/sh4/memset.S | 62 |
1 files changed, 34 insertions, 28 deletions
diff --git a/libc/string/sh/sh4/memset.S b/libc/string/sh/sh4/memset.S index 83f874612..eb83355ce 100644 --- a/libc/string/sh/sh4/memset.S +++ b/libc/string/sh/sh4/memset.S @@ -5,7 +5,7 @@ * Copyright (C) 1999 Niibe Yutaka * * Copyright (c) 2009 STMicroelectronics Ltd - * Optimised using 64bit data transfer via FPU + * Optimised using 64bit data transfer (via FPU) and the movca.l inst. * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com> * * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. @@ -24,9 +24,9 @@ * Currenlty it has been only implemented and tested for little endian mode. */ .macro FPU_SET_PAIRED_PREC sts fpscr, r3 - mov #0x10, r0 ! PR=0 SZ=1 - shll16 r0 - lds r0, fpscr + mov #0x10, r1 ! PR=0 SZ=1 + shll16 r1 + lds r1, fpscr .endm .macro RESTORE_FPSCR lds r3, fpscr @@ -34,12 +34,10 @@ #endif ENTRY(memset) - tst r6,r6 - bt/s 5f ! if n=0, do nothing - add r6,r4 mov #12,r0 + add r6,r4 cmp/gt r6,r0 - bt/s 4f ! if it's too small, set a byte at once + bt/s 40f ! if it's too small, set a byte at once mov r4,r0 and #3,r0 cmp/eq #0,r0 @@ -56,7 +54,7 @@ ENTRY(memset) swap.w r5,r0 ! VV00 or r0,r5 ! VVVV - ! Enough bytes need to be copied + ! Check if enough bytes need to be copied to be worth the big loop mov #0x40, r0 ! (MT) cmp/gt r6,r0 ! (MT) 64 > len => slow loop @@ -84,6 +82,9 @@ ENTRY(memset) mov #-5,r0 shld r0,r2 ! number of loops + add #-32, r4 + mov r5, r0 + #ifdef MEMSET_USES_FPU lds r5, fpul ! (CO) fsts fpul, fr0 ! Dr0 will be 'VVVVVVVV' @@ -91,36 +92,40 @@ ENTRY(memset) FPU_SET_PAIRED_PREC 12: - add #-0x20, r6 !(MT) + movca.l r0, @r4 + mov.l r5, @(4, r4) + add #32, r4 fmov dr0, @-r4 fmov dr0, @-r4 + add #-0x20, r6 fmov dr0, @-r4 dt r2 - bf/s 12b !(BR) - fmov dr0, @-r4 + bf/s 12b + add #-40, r4 RESTORE_FPSCR #else 12: - mov.l r5,@-r4 - mov.l r5,@-r4 - mov.l r5,@-r4 - mov.l r5,@-r4 - mov.l r5,@-r4 - mov.l r5,@-r4 + movca.l r0,@r4 + mov.l r5,@(4, r4) + mov.l r5,@(8, r4) + mov.l r5,@(12,r4) + mov.l r5,@(16,r4) + mov.l r5,@(20,r4) add #-0x20, r6 - mov.l r5,@-r4 + mov.l r5,@(24,r4) dt r2 + mov.l r5,@(28,r4) bf/s 12b - mov.l r5,@-r4 -#endif - tst r6,r6 - bt/s 5f - mov #8, r0 + add #-32, r4 +#endif + add #32, r4 + mov #8, r0 cmp/ge r0, r6 - bf/s 4f - mov r6,r0 + bf 40f + + mov r6,r0 22: shlr2 r0 shlr r0 ! r0 = r6 >> 3 @@ -132,9 +137,10 @@ ENTRY(memset) ! mov #7,r0 and r0,r6 - tst r6,r6 + + ! fill bytes (length may be zero) +40: tst r6,r6 bt 5f - ! fill bytes 4: dt r6 bf/s 4b |