/*
 * Copyright (C) 2004-2007 Atmel Corporation
 *
 * This file is subject to the terms and conditions of the GNU Lesser General
 * Public License.  See the file "COPYING.LIB" in the main directory of this
 * archive for more details.
 */

/*
 * Register aliases used by memcpy below.
 *
 * Don't use r12 as dst since we must return it unmodified: memcpy copies
 * r12 into r9 on entry and advances r9 instead.
 */
/* Working destination pointer; post-incremented by every store */
#define dst r9
/* Source pointer; post-incremented (or bumped by 16/32) as data is read */
#define src r11
/* Byte count on entry; afterwards a running, deliberately biased counter */
#define len r10

       .text
       .global memcpy
       .type   memcpy, @function
/*
 * memcpy: copy len bytes from src to the buffer whose address is in r12.
 *
 * In:    r12 = destination, r11 (src) = source, r10 (len) = byte count
 * Out:   r12 = destination, never written by this routine
 * Uses:  r8 as scratch, r9 (dst) as the moving destination pointer;
 *        src and len are consumed. r0-r7 and lr are pushed on entry to
 *        the >= 32 byte path and popped again on exit.
 *
 * Strategy:
 *   - len < 32:  plain byte loop.
 *   - otherwise: byte-copy until src is 32-byte aligned; then, if dst is
 *     word-aligned, move 32 bytes per iteration with ldm/stm, an optional
 *     16-byte block, and 0..15 tail bytes; if dst is not word-aligned,
 *     move one word per iteration and then 0..3 tail bytes.
 *
 * NOTE(review): the length tests use the signed conditions ge/lt, so a len
 * with the top bit set takes the byte path and returns at once without
 * copying. Presumably such sizes cannot occur on this target -- confirm.
 */
memcpy:
       pref    src[0]
       /* Work on a copy of the destination so r12 stays intact */
       mov     dst, r12

       /* If we have less than 32 bytes, don't do anything fancy */
       cp.w    len, 32
       brge    .Lmore_than_31

       /*
        * Byte loop. len is pre-decremented, so len == 0 returns right away
        * and the loop keeps going while (len - 1) >= 0 after each byte.
        */
       sub     len, 1
       retlt   r12
1:     ld.ub   r8, src++
       st.b    dst++, r8
       sub     len, 1
       brge    1b
       retal   r12

.Lmore_than_31:
       /* r0-r7 serve as the copy buffer below; lr is saved for the return */
       pushm   r0-r7, lr

       /* Check alignment */
       /* r8 = src & 31: non-zero means src is not 32-byte aligned */
       mov     r8, src
       andl    r8, 31, COH
       brne    .Lunaligned_src
       /* r8 = dst & 3: non-zero means dst is not word-aligned */
       mov     r8, dst
       andl    r8, 3, COH
       brne    .Lunaligned_dst

.Laligned_copy:
       /*
        * Here src is 32-byte aligned, dst is word-aligned and len >= 1.
        * Bias len by -32 so that "len >= 0" means a full block remains.
        */
       sub     len, 32
       brlt    .Lless_than_32

1:     /* Copy 32 bytes at a time */
       ldm     src, r0-r7
       /* sub with a negative immediate: src += 32 */
       sub     src, -32
       stm     dst, r0-r7
       /* dst += 32 */
       sub     dst, -32
       sub     len, 32
       brge    1b

.Lless_than_32:
       /* Copy 16 more bytes if possible */
       /* len = (bytes remaining) - 32 here; adding 16 changes the bias to -16 */
       sub     len, -16
       brlt    .Lless_than_16
       ldm     src, r0-r3
       sub     src, -16
       /* Account for the 16 bytes just loaded; len returns to -16..-1 */
       sub     len, 16
       stm     dst, r0-r3
       sub     dst, -16

.Lless_than_16:
       /* Do the remaining as byte copies */
       /*
        * len = (bytes remaining) - 16, i.e. -16..-1. After the negation
        * len = 16 - remaining (1..16), and the computed jump skips that
        * many 4-byte slots so that exactly "remaining" of the 15 copy
        * pairs below execute (len == 16 lands on the popm).
        * NOTE(review): assumes each ld.ub/st.b pair assembles to 4 bytes
        * and that pc reads as the address of the add itself -- confirm
        * against the AVR32 encoding before touching this sequence.
        */
       neg     len
       add     pc, pc, len << 2
       .rept   15
       ld.ub   r0, src++
       st.b    dst++, r0
       .endr

       /* Restore r0-r7; popping the saved lr into pc returns to the caller */
       popm    r0-r7, pc

.Lunaligned_src:
       /* Make src cacheline-aligned. r8 = (src & 31) */
       /* r8 = 32 - (src & 31): bytes up to the next 32-byte boundary (1..31) */
       rsub    r8, r8, 32
       /* len >= 32 on entry, so at least one byte is left after this */
       sub     len, r8
1:     ld.ub   r0, src++
       st.b    dst++, r0
       sub     r8, 1
       brne    1b

       /* If dst is word-aligned, we're ready to go */
       pref    src[0]
       /* tst sets the flags from dst & 3 without modifying dst */
       mov     r8, 3
       tst     dst, r8
       breq    .Laligned_copy

.Lunaligned_dst:
       /* src is aligned, but dst is not. Expect bad performance */
       /*
        * NOTE(review): st.w below writes to a destination that is not
        * word-aligned; this presumably relies on the core handling
        * unaligned word stores -- confirm for the target CPU.
        * len is biased by -4 so that "len >= 0" means a full word remains.
        */
       sub     len, 4
       brlt    2f
1:     ld.w    r0, src++
       st.w    dst++, r0
       sub     len, 4
       brge    1b

       /*
        * len = (bytes remaining) - 4, i.e. -4..-1. Same computed-jump
        * scheme as the 15-entry table above: skip (4 - remaining) of the
        * 4-byte slots so that 0..3 byte copies execute.
        */
2:     neg     len
       add     pc, pc, len << 2
       .rept   3
       ld.ub   r0, src++
       st.b    dst++, r0
       .endr

       /* Restore r0-r7 and return through the saved lr */
       popm    r0-r7, pc
       .size   memcpy, . - memcpy

/*
 * libc_hidden_def is a macro defined outside this file; presumably it
 * emits the library-internal (hidden) alias for memcpy -- confirm.
 */
libc_hidden_def(memcpy)