diff options
Diffstat (limited to 'libc/string/arm/_memcpy.S')
-rw-r--r-- | libc/string/arm/_memcpy.S | 182 |
1 files changed, 168 insertions, 14 deletions
diff --git a/libc/string/arm/_memcpy.S b/libc/string/arm/_memcpy.S index 3704f96b5..5ef63c45a 100644 --- a/libc/string/arm/_memcpy.S +++ b/libc/string/arm/_memcpy.S @@ -39,7 +39,9 @@ #include <features.h> #include <endian.h> +#include <bits/arm_asm.h> +#if !defined(THUMB1_ONLY) /* * This is one fun bit of code ... * Some easy listening music is suggested while trying to understand this @@ -77,11 +79,36 @@ .type _memcpy,%function .align 4 +/* XXX: The Thumb-2 conditionals can be removed if/when we require an + assembler that supports unified syntax. Both macros load the given register list from [r1] and store it to [r0], with writeback, only when GE holds (ldmia/stmia for copy, ldmdb/stmdb for copydb). In the Thumb-2 variant the ittt opens a three-instruction IT block while the macro supplies only two, so every expansion must be followed by exactly one more GE-conditional instruction, as the subge/subsge after each use visible here is. */ +.macro copy regs +#if defined(__thumb2__) + ittt ge + ldmiage r1!, \regs + stmiage r0!, \regs +#else + ldmgeia r1!, \regs + stmgeia r0!, \regs +#endif +.endm + +.macro copydb regs +#if defined(__thumb2__) + ittt ge + ldmdbge r1!, \regs + stmdbge r0!, \regs +#else + ldmgedb r1!, \regs + stmgedb r0!, \regs +#endif +.endm + _memcpy: /* Determine copy direction */ cmp r1, r0 bcc .Lmemcpy_backwards + IT(tt, eq) moveq r0, #0 /* Quick abort for len=0 */ #if defined(__USE_BX__) bxeq lr @@ -102,7 +129,7 @@ _memcpy: blt .Lmemcpy_fl12 /* less than 12 bytes (4 from above) */ subs r2, r2, #0x14 blt .Lmemcpy_fl32 /* less than 32 bytes (12 from above) */ - stmdb sp!, {r4} /* borrow r4 */ + str r4, [sp, #-4]! 
/* borrow r4 */ /* blat 32 bytes at a time */ /* XXX for really big copies perhaps we should use more registers */ @@ -115,19 +142,22 @@ _memcpy: bge .Lmemcpy_floop32 cmn r2, #0x10 - ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */ - stmgeia r0!, {r3, r4, r12, lr} + /* blat a remaining 16 bytes */ + copy "{r3, r4, r12, lr}" subge r2, r2, #0x10 - ldmia sp!, {r4} /* return r4 */ + ldr r4, [sp], #4 /* restore r4 */ .Lmemcpy_fl32: adds r2, r2, #0x14 /* blat 12 bytes at a time */ .Lmemcpy_floop12: - ldmgeia r1!, {r3, r12, lr} - stmgeia r0!, {r3, r12, lr} + copy "{r3, r12, lr}" +#if defined(__thumb2__) + subsge r2, r2, #0x0c +#else subges r2, r2, #0x0c +#endif bge .Lmemcpy_floop12 .Lmemcpy_fl12: @@ -135,26 +165,48 @@ _memcpy: blt .Lmemcpy_fl4 subs r2, r2, #4 + IT(tt, lt) ldrlt r3, [r1], #4 strlt r3, [r0], #4 - ldmgeia r1!, {r3, r12} - stmgeia r0!, {r3, r12} + copy "{r3, r12}" subge r2, r2, #4 .Lmemcpy_fl4: /* less than 4 bytes to go */ adds r2, r2, #4 +#if defined(__thumb2__) + it eq + popeq {r0, pc} /* done */ +#elif defined(__ARM_ARCH_4T__) + ldmeqia sp!, {r0, r3} /* done */ + bxeq r3 +#else ldmeqia sp!, {r0, pc} /* done */ +#endif /* copy the crud byte at a time */ cmp r2, #2 ldrb r3, [r1], #1 strb r3, [r0], #1 +#if defined(__thumb2__) + itt ge + ldrbge r3, [r1], #1 + strbge r3, [r0], #1 + itt gt + ldrbgt r3, [r1], #1 + strbgt r3, [r0], #1 +#else ldrgeb r3, [r1], #1 strgeb r3, [r0], #1 ldrgtb r3, [r1], #1 strgtb r3, [r0], #1 +#endif +#if defined(__ARM_ARCH_4T__) + ldmia sp!, {r0, r3} + bx r3 +#else ldmia sp!, {r0, pc} +#endif /* erg - unaligned destination */ .Lmemcpy_fdestul: @@ -164,10 +216,19 @@ _memcpy: /* align destination with byte copies */ ldrb r3, [r1], #1 strb r3, [r0], #1 +#if defined(__thumb2__) + itt ge + ldrbge r3, [r1], #1 + strbge r3, [r0], #1 + itt gt + ldrbgt r3, [r1], #1 + strbgt r3, [r0], #1 +#else ldrgeb r3, [r1], #1 strgeb r3, [r0], #1 ldrgtb r3, [r1], #1 strgtb r3, [r0], #1 +#endif subs r2, r2, r12 blt .Lmemcpy_fl4 /* less the 4 
bytes */ @@ -370,12 +431,12 @@ _memcpy: .Lmemcpy_bl32: cmn r2, #0x10 - ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */ - stmgedb r0!, {r3, r4, r12, lr} + /* blat a remaining 16 bytes */ + copydb "{r3, r4, r12, lr}" subge r2, r2, #0x10 adds r2, r2, #0x14 - ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */ - stmgedb r0!, {r3, r12, lr} + /* blat a remaining 12 bytes */ + copydb "{r3, r12, lr}" subge r2, r2, #0x0c ldmia sp!, {r4, lr} @@ -383,15 +444,16 @@ _memcpy: adds r2, r2, #8 blt .Lmemcpy_bl4 subs r2, r2, #4 + IT(tt, lt) ldrlt r3, [r1, #-4]! strlt r3, [r0, #-4]! - ldmgedb r1!, {r3, r12} - stmgedb r0!, {r3, r12} + copydb "{r3, r12}" subge r2, r2, #4 .Lmemcpy_bl4: /* less than 4 bytes to go */ adds r2, r2, #4 + IT(t, eq) #if defined(__USE_BX__) bxeq lr #else @@ -401,10 +463,19 @@ _memcpy: cmp r2, #2 ldrb r3, [r1, #-1]! strb r3, [r0, #-1]! +#ifdef __thumb2__ + itt ge + ldrbge r3, [r1, #-1]! + strbge r3, [r0, #-1]! + itt gt + ldrbgt r3, [r1, #-1]! + strbgt r3, [r0, #-1]! +#else ldrgeb r3, [r1, #-1]! strgeb r3, [r0, #-1]! ldrgtb r3, [r1, #-1]! strgtb r3, [r0, #-1]! +#endif #if defined(__USE_BX__) bx lr #else @@ -417,10 +488,19 @@ _memcpy: /* align destination with byte copies */ ldrb r3, [r1, #-1]! strb r3, [r0, #-1]! +#ifdef __thumb2__ + itt ge + ldrbge r3, [r1, #-1]! + strbge r3, [r0, #-1]! + itt gt + ldrbgt r3, [r1, #-1]! + strbgt r3, [r0, #-1]! +#else ldrgeb r3, [r1, #-1]! strgeb r3, [r0, #-1]! ldrgtb r3, [r1, #-1]! strgtb r3, [r0, #-1]! +#endif subs r2, r2, r12 blt .Lmemcpy_bl4 /* less than 4 bytes to go */ ands r12, r1, #3 @@ -591,3 +671,77 @@ _memcpy: .Lmemcpy_bsrcul1l4: add r1, r1, #1 b .Lmemcpy_bl4 + +#else /* THUMB1_ONLY */ + +/* This is a fairly dumb implementation for when we can't use the 32-bit code + above. 
*/
+.text
+.global _memcpy
+.hidden _memcpy
+.type _memcpy,%function
+.align 4
+.thumb	@ NOTE(review): no .thumb_func precedes the label; GAS documents it as unnecessary only for EABI objects - confirm for other ABIs.
+_memcpy:	@ In: r0 = dst, r1 = src, r2 = byte count. Out: r0 = original dst.
+	push	{r0, r4}		@ save dst (popped back into r0 on exit) and r4 (scratch below)
+	cmp	r2, #0
+	beq	.Lmemcpy_exit		@ zero length: nothing to copy
+	@ Copy backwards only when dst lies strictly inside (src, src + len);
+	@ every other layout is copied forwards.
+	cmp	r0, r1
+	bls	.Lmemcpy_forwards	@ dst <= src
+	add	r4, r1, r2		@ r4 = src + len
+	cmp	r0, r4
+	bcc	.Lmemcpy_backwards	@ src < dst < src + len
+.Lmemcpy_forwards:
+	/* Word copies are used only if dst and src share alignment and len >= 8. */
+	mov	r3, r0
+	eor	r3, r1
+	mov	r4, #3
+	tst	r3, r4			@ low two bits of (dst ^ src)
+	bne	.Lmemcpy_funaligned
+	cmp	r2, #8
+	bcc	.Lmemcpy_funaligned
+.Lmemcpy_falign:	@ Byte-copy until dst reaches a word boundary (r4 is still 3).
+	tst	r0, r4
+	beq	.Lmemcpy_fwords
+	ldrb	r3, [r1]
+	add	r1, r1, #1
+	strb	r3, [r0]
+	add	r0, r0, #1
+	sub	r2, r2, #1
+	b	.Lmemcpy_falign
+.Lmemcpy_fwords:	@ Copy one word per pass while at least 4 bytes remain.
+	ldr	r3, [r1]
+	add	r1, r1, #4
+	str	r3, [r0]
+	add	r0, r0, #4
+	sub	r2, r2, #4
+	cmp	r2, #4
+	bcs	.Lmemcpy_fwords
+	cmp	r2, #0
+	beq	.Lmemcpy_exit		@ length was a whole number of words
+.Lmemcpy_funaligned:
+.Lmemcpy_fbytes:	@ Forward byte loop; entered with r2 != 0.
+	ldrb	r3, [r1]
+	add	r1, r1, #1
+	strb	r3, [r0]
+	add	r0, r0, #1
+	sub	r2, r2, #1
+	bne	.Lmemcpy_fbytes		@ relies on the flags left by the sub above
+.Lmemcpy_exit:
+	pop	{r0, r4}		@ r0 = original dst again, r4 restored
+	bx	lr
+
+.Lmemcpy_backwards:
+	add	r0, r0, r2		@ move both pointers one past the end
+	add	r1, r1, r2
+.Lmemcpy_bbytes:	@ Backward byte loop: step down first, then copy.
+	sub	r0, r0, #1
+	sub	r1, r1, #1
+	ldrb	r3, [r1]
+	strb	r3, [r0]
+	sub	r2, r2, #1
+	bne	.Lmemcpy_bbytes		@ relies on the flags left by the sub above
+	b	.Lmemcpy_exit
+#endif |