summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--libc/string/sh/sh4/memset.S62
1 files changed, 34 insertions, 28 deletions
diff --git a/libc/string/sh/sh4/memset.S b/libc/string/sh/sh4/memset.S
index 83f874612..eb83355ce 100644
--- a/libc/string/sh/sh4/memset.S
+++ b/libc/string/sh/sh4/memset.S
@@ -5,7 +5,7 @@
* Copyright (C) 1999 Niibe Yutaka
*
* Copyright (c) 2009 STMicroelectronics Ltd
- * Optimised using 64bit data transfer via FPU
+ * Optimised using 64bit data transfer (via FPU) and the movca.l inst.
* Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
*
* Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
@@ -24,9 +24,9 @@
* Currently it has only been implemented and tested for little endian mode. */
.macro FPU_SET_PAIRED_PREC
sts fpscr, r3
- mov #0x10, r0 ! PR=0 SZ=1
- shll16 r0
- lds r0, fpscr
+ mov #0x10, r1 ! PR=0 SZ=1
+ shll16 r1
+ lds r1, fpscr
.endm
.macro RESTORE_FPSCR
lds r3, fpscr
@@ -34,12 +34,10 @@
#endif
ENTRY(memset)
- tst r6,r6
- bt/s 5f ! if n=0, do nothing
- add r6,r4
mov #12,r0
+ add r6,r4
cmp/gt r6,r0
- bt/s 4f ! if it's too small, set a byte at once
+ bt/s 40f ! if it's too small, set a byte at once
mov r4,r0
and #3,r0
cmp/eq #0,r0
@@ -56,7 +54,7 @@ ENTRY(memset)
swap.w r5,r0 ! VV00
or r0,r5 ! VVVV
- ! Enough bytes need to be copied
+ ! Check if enough bytes need to be copied to be worth the big loop
mov #0x40, r0 ! (MT)
cmp/gt r6,r0 ! (MT) 64 > len => slow loop
@@ -84,6 +82,9 @@ ENTRY(memset)
mov #-5,r0
shld r0,r2 ! number of loops
+ add #-32, r4
+ mov r5, r0
+
#ifdef MEMSET_USES_FPU
lds r5, fpul ! (CO)
fsts fpul, fr0 ! Dr0 will be 'VVVVVVVV'
@@ -91,36 +92,40 @@ ENTRY(memset)
FPU_SET_PAIRED_PREC
12:
- add #-0x20, r6 !(MT)
+ movca.l r0, @r4
+ mov.l r5, @(4, r4)
+ add #32, r4
fmov dr0, @-r4
fmov dr0, @-r4
+ add #-0x20, r6
fmov dr0, @-r4
dt r2
- bf/s 12b !(BR)
- fmov dr0, @-r4
+ bf/s 12b
+ add #-40, r4
RESTORE_FPSCR
#else
12:
- mov.l r5,@-r4
- mov.l r5,@-r4
- mov.l r5,@-r4
- mov.l r5,@-r4
- mov.l r5,@-r4
- mov.l r5,@-r4
+ movca.l r0,@r4
+ mov.l r5,@(4, r4)
+ mov.l r5,@(8, r4)
+ mov.l r5,@(12,r4)
+ mov.l r5,@(16,r4)
+ mov.l r5,@(20,r4)
add #-0x20, r6
- mov.l r5,@-r4
+ mov.l r5,@(24,r4)
dt r2
+ mov.l r5,@(28,r4)
bf/s 12b
- mov.l r5,@-r4
-#endif
- tst r6,r6
- bt/s 5f
- mov #8, r0
+ add #-32, r4
+#endif
+ add #32, r4
+ mov #8, r0
cmp/ge r0, r6
- bf/s 4f
- mov r6,r0
+ bf 40f
+
+ mov r6,r0
22:
shlr2 r0
shlr r0 ! r0 = r6 >> 3
@@ -132,9 +137,10 @@ ENTRY(memset)
!
mov #7,r0
and r0,r6
- tst r6,r6
+
+ ! fill bytes (length may be zero)
+40: tst r6,r6
bt 5f
- ! fill bytes
4:
dt r6
bf/s 4b