 libc/string/sh/sh4/memcpy.S | 128 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 107 insertions(+), 21 deletions(-)
diff --git a/libc/string/sh/sh4/memcpy.S b/libc/string/sh/sh4/memcpy.S
index 252ef36eb..5be770a59 100644
--- a/libc/string/sh/sh4/memcpy.S
+++ b/libc/string/sh/sh4/memcpy.S
@@ -28,13 +28,20 @@
* Currently it has been only implemented and tested for little endian mode. */
.macro FPU_SET_PAIRED_PREC
sts fpscr, r7
- mov #0x10, r6 ! PR=0 SZ=1
- shll16 r6
- lds r6, fpscr
+ mov #0x10, r0 ! PR=0 SZ=1
+ shll16 r0
+ lds r0, fpscr
.endm
.macro RESTORE_FPSCR
lds r7, fpscr
.endm
+.macro DALLOC
+ ! Cache allocate + store on dst-32.
+ add #-32, r1
+ movca.l r0, @r1
+ add #32, r1
+.endm
+
#endif
!
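For reference (not part of the patch): the first hunk switches the FPU_SET_PAIRED_PREC scratch register from r6 to r0, which frees r6 to serve as the prefetch head used in the second hunk, and adds the DALLOC macro, which uses movca.l to allocate a 32-byte destination cache line without fetching it from memory first. The FPSCR constant the macro loads can be read as follows; a minimal C sketch with illustrative names, assuming the standard SH-4 FPSCR bit layout:

/* Sketch only, not from the patch.  On SH-4, FPSCR bit 19 is PR
 * (precision) and bit 20 is SZ (transfer size).  The macro builds
 * 0x10 << 16 = 0x00100000, i.e. PR=0, SZ=1, so each fmov moves a
 * 64-bit register pair instead of a single 32-bit register. */
#define FPSCR_PR (1u << 19)  /* double-precision arithmetic when set */
#define FPSCR_SZ (1u << 20)  /* paired (64-bit) fmov transfers when set */

static unsigned int paired_prec_fpscr(void)
{
    return FPSCR_SZ;         /* == 0x10 << 16, the constant in the asm */
}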
@@ -471,30 +478,111 @@ ENTRY(memcpy)
add r0, r5
mov r0, r1
- add #-0x1c, r5
- mov r5, r0
+ mov r1, r3 ! MT
+ sub r2, r3 ! EX (r3 - r2 -> r3)
+ mov #-5, r0
+ shld r0, r3 ! number of the cache lines
+ mov #8, r0
+ cmp/ge r0, r3 ! Check if there are many cache lines to copy.
+ bf 45f ! Copy cache line aligned blocks without pref.
+ mov r5, r0
+ add #-0x7c, r0
tst #7, r0 ! src is 8byte aligned
- mov r5, r3
+ bf 45f
+
+ ! Many cache lines have to be copied and the buffers are well aligned.
+ ! Aggressive prefetching and FPU in single paired precision.
+ mov r0, r5
+ mov r5, r6
+ add #-0x80, r6 ! prefetch head
- add #-64, r3 ! To pefetch head
- bt/s 3f
+ FPU_SET_PAIRED_PREC
- pref @r3
+ mov #4, r0
+67:
+ add #-0x20, r6
+ pref @r6
+ add #-0x20, r6
+ pref @r6
+
+ fmov @r5+, dr0
+ fmov @r5+, dr2
+ fmov @r5+, dr4
+ fmov @r5+, dr6
+ fmov @r5+, dr8
+ fmov @r5+, dr10
+ fmov @r5+, dr12
+ fmov @r5+, dr14
+ fmov @r5+, xd0
+ fmov @r5+, xd2
+ fmov @r5+, xd4
+ fmov @r5+, xd6
+ fmov @r5+, xd8
+ fmov @r5+, xd10
+ fmov @r5+, xd12
+ fmov @r5+, xd14
+
+ DALLOC
+ fmov xd14, @-r1
+ fmov xd12, @-r1
+ fmov xd10, @-r1
+ fmov xd8, @-r1
+ DALLOC
+ fmov xd6, @-r1
+ fmov xd4, @-r1
+ fmov xd2, @-r1
+ fmov xd0, @-r1
+ DALLOC
+ fmov dr14, @-r1
+ fmov dr12, @-r1
+ fmov dr10, @-r1
+ fmov dr8, @-r1
+ DALLOC
+ fmov dr6, @-r1
+ add #-0x80, r5
+ fmov dr4, @-r1
+ add #-0x80, r5
+ fmov dr2, @-r1
+ add #-0x20, r6
+ fmov dr0, @-r1
+ add #-4, r3
+ pref @r6
+ add #-0x20, r6
+ cmp/ge r0, r3
+ bt/s 67b
+ pref @r6
+
+ ! Other cache lines could be copied: so use the FPU in single paired
+ ! precision without prefetching. No check for alignment is necessary.
+
+ mov #1, r0
+ cmp/ge r0, r3
+ bt/s 4f
+ add #0x60, r5
+
+ RESTORE_FPSCR
+
+ bra 5f
+ nop
+
+ ! No prefetch and FPU in single precision.
+45:
+ add #-0x1c, r5
+ mov r5, r0
+ tst #7, r0
+ bt 3f
2: fmov.s @r5+, fr0
- mov r1, r6
fmov.s @r5+, fr1
- add #-32, r6
fmov.s @r5+, fr2
fmov.s @r5+, fr3
fmov.s @r5+, fr4
fmov.s @r5+, fr5
fmov.s @r5+, fr6
fmov.s @r5+, fr7
- add #-0x40, r5
- movca.l r0, @r6 ! Cache allocate + store on dst-32.
+ DALLOC
fmov.s fr7, @-r1
fmov.s fr6, @-r1
@@ -505,35 +593,33 @@ ENTRY(memcpy)
fmov.s fr1, @-r1
fmov.s fr0, @-r1
- add #-32, r3
cmp/eq r2,r1
bf/s 2b
- pref @r3 ! Prefetch the next cache line.
+ add #-0x40, r5
bra 5f
+ nop
+
+ ! No prefetch and FPU in single paired precision.
3: FPU_SET_PAIRED_PREC
4: fmov @r5+, dr0
- mov r1, r6
fmov @r5+, dr2
- add #-32, r6
fmov @r5+, dr4
fmov @r5+, dr6
- add #-0x40, r5
- movca.l r0, @r6
+ DALLOC
fmov dr6, @-r1
fmov dr4, @-r1
fmov dr2, @-r1
fmov dr0, @-r1
- add #-32, r3
cmp/eq r2,r1
bf/s 4b
- pref @r3
+ add #-0x40, r5
RESTORE_FPSCR
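Taken together, the new fast path works like this: when the copy still covers at least eight 32-byte cache lines (the (r1 - r2) >> 5 >= 8 test at the top of the second hunk) and the source is suitably 8-byte aligned (the tst #7 check), the loop at 67: moves 128 bytes per iteration through the FPU register file in paired-precision mode, keeping the prefetch head r6 roughly 0x80 bytes ahead of the load pointer and allocating each destination line with movca.l (DALLOC) before storing; shorter copies and the leftover tail fall through to the original per-line loops at 45:, 3: and 4:. A rough C-level sketch of that structure, assuming 32-byte cache lines; the helper name and the use of memcpy and __builtin_prefetch are illustrative stand-ins, not what the assembly does instruction for instruction:

#include <stddef.h>
#include <string.h>

/* Sketch only: copy backwards one 32-byte cache line at a time,
 * taking 128-byte blocks with prefetch while enough lines remain. */
static void copy_lines_backward(unsigned char *dst_end,
                                const unsigned char *src_end,
                                size_t lines)
{
    /* Fast path: four lines (128 bytes) per iteration, prefetching the
     * block that will be loaded next (the asm issues four pref's per
     * iteration from the r6 prefetch head). */
    while (lines >= 4) {
        const unsigned char *next = src_end - 256;
        __builtin_prefetch(next);
        __builtin_prefetch(next + 32);
        __builtin_prefetch(next + 64);
        __builtin_prefetch(next + 96);

        src_end -= 128;
        dst_end -= 128;
        /* The asm loads these 128 bytes into dr0-dr14/xd0-xd14, issues
         * movca.l (DALLOC) on each destination line so it is allocated
         * without being read from memory, then stores it back with
         * paired fmov's. */
        memcpy(dst_end, src_end, 128);
        lines -= 4;
    }

    /* Tail and short copies: one cache line per iteration, no prefetch. */
    while (lines--) {
        src_end -= 32;
        dst_end -= 32;
        memcpy(dst_end, src_end, 32);
    }
}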