/* * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com) * * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. */ #include #include #ifdef __LITTLE_ENDIAN__ # define SHIFT_1(RX,RY,IMM) asl RX, RY, IMM ; << # define SHIFT_2(RX,RY,IMM) lsr RX, RY, IMM ; >> # define MERGE_1(RX,RY,IMM) asl RX, RY, IMM # define MERGE_2(RX,RY,IMM) # define EXTRACT_1(RX,RY,IMM) and RX, RY, 0xFFFF # define EXTRACT_2(RX,RY,IMM) lsr RX, RY, IMM #else # define SHIFT_1(RX,RY,IMM) lsr RX, RY, IMM ; >> # define SHIFT_2(RX,RY,IMM) asl RX, RY, IMM ; << # define MERGE_1(RX,RY,IMM) asl RX, RY, IMM ; << # define MERGE_2(RX,RY,IMM) asl RX, RY, IMM ; << # define EXTRACT_1(RX,RY,IMM) lsr RX, RY, IMM # define EXTRACT_2(RX,RY,IMM) lsr RX, RY, 0x08 #endif #ifdef __LL64__ # define PREFETCH_READ(RX) prefetch [RX, 56] # define PREFETCH_WRITE(RX) prefetchw [RX, 64] # define LOADX(DST,RX) ldd.ab DST, [RX, 8] # define STOREX(SRC,RX) std.ab SRC, [RX, 8] # define ZOLSHFT 5 # define ZOLAND 0x1F #else # define PREFETCH_READ(RX) prefetch [RX, 28] # define PREFETCH_WRITE(RX) prefetchw [RX, 32] # define LOADX(DST,RX) ld.ab DST, [RX, 4] # define STOREX(SRC,RX) st.ab SRC, [RX, 4] # define ZOLSHFT 4 # define ZOLAND 0xF #endif ENTRY(memcpy) prefetch [r1] ; Prefetch the read location prefetchw [r0] ; Prefetch the write location mov.f 0, r2 ;;; if size is zero jz.d [blink] mov r3, r0 ; don't clobber ret val ;;; if size <= 8 cmp r2, 8 bls.d @.Lsmallchunk mov.f lp_count, r2 and.f r4, r0, 0x03 rsub lp_count, r4, 4 lpnz @.Laligndestination ;; LOOP BEGIN ldb.ab r5, [r1,1] sub r2, r2, 1 stb.ab r5, [r3,1] .Laligndestination: ;;; Check the alignment of the source and.f r4, r1, 0x03 bnz.d @.Lsourceunaligned ;;; CASE 0: Both source and destination are 32bit aligned ;;; Convert len to Dwords, unfold x4 lsr.f lp_count, r2, ZOLSHFT lpnz @.Lcopy32_64bytes ;; LOOP START LOADX (r6, r1) PREFETCH_READ (r1) PREFETCH_WRITE (r3) LOADX (r8, r1) LOADX (r10, r1) LOADX (r4, r1) STOREX (r6, r3) STOREX (r8, r3) STOREX (r10, r3) STOREX (r4, r3) .Lcopy32_64bytes: and.f lp_count, r2, ZOLAND ;Last remaining 31 bytes .Lsmallchunk: lpnz @.Lcopyremainingbytes ;; LOOP START ldb.ab r5, [r1,1] stb.ab r5, [r3,1] .Lcopyremainingbytes: j [blink] ;;; END CASE 0 .Lsourceunaligned: cmp r4, 2 beq.d @.LunalignedOffby2 sub r2, r2, 1 bhi.d @.LunalignedOffby3 ldb.ab r5, [r1, 1] ;;; CASE 1: The source is unaligned, off by 1 ;; Hence I need to read 1 byte for a 16bit alignment ;; and 2bytes to reach 32bit alignment ldh.ab r6, [r1, 2] sub r2, r2, 2 ;; Convert to words, unfold x2 lsr.f lp_count, r2, 3 MERGE_1 (r6, r6, 8) MERGE_2 (r5, r5, 24) or r5, r5, r6 ;; Both src and dst are aligned lpnz @.Lcopy8bytes_1 ;; LOOP START ld.ab r6, [r1, 4] prefetch [r1, 28] ;Prefetch the next read location ld.ab r8, [r1,4] prefetchw [r3, 32] ;Prefetch the next write location SHIFT_1 (r7, r6, 24) or r7, r7, r5 SHIFT_2 (r5, r6, 8) SHIFT_1 (r9, r8, 24) or r9, r9, r5 SHIFT_2 (r5, r8, 8) st.ab r7, [r3, 4] st.ab r9, [r3, 4] .Lcopy8bytes_1: ;; Write back the remaining 16bits EXTRACT_1 (r6, r5, 16) sth.ab r6, [r3, 2] ;; Write back the remaining 8bits EXTRACT_2 (r5, r5, 16) stb.ab r5, [r3, 1] and.f lp_count, r2, 0x07 ;Last 8bytes lpnz @.Lcopybytewise_1 ;; LOOP START ldb.ab r6, [r1,1] stb.ab r6, [r3,1] .Lcopybytewise_1: j [blink] .LunalignedOffby2: ;;; CASE 2: The source is unaligned, off by 2 ldh.ab r5, [r1, 2] sub r2, r2, 1 ;; Both src and dst are aligned ;; Convert to words, unfold x2 lsr.f lp_count, r2, 3 #ifdef __BIG_ENDIAN__ asl.nz r5, r5, 16 #endif lpnz @.Lcopy8bytes_2 ;; LOOP START ld.ab r6, [r1, 4] prefetch [r1, 28] ;Prefetch the next read location ld.ab r8, [r1,4] prefetchw [r3, 32] ;Prefetch the next write location SHIFT_1 (r7, r6, 16) or r7, r7, r5 SHIFT_2 (r5, r6, 16) SHIFT_1 (r9, r8, 16) or r9, r9, r5 SHIFT_2 (r5, r8, 16) st.ab r7, [r3, 4] st.ab r9, [r3, 4] .Lcopy8bytes_2: #ifdef __BIG_ENDIAN__ lsr.nz r5, r5, 16 #endif sth.ab r5, [r3, 2] and.f lp_count, r2, 0x07 ;Last 8bytes lpnz @.Lcopybytewise_2 ;; LOOP START ldb.ab r6, [r1,1] stb.ab r6, [r3,1] .Lcopybytewise_2: j [blink] .LunalignedOffby3: ;;; CASE 3: The source is unaligned, off by 3 ;;; Hence, I need to read 1byte for achieve the 32bit alignment ;; Both src and dst are aligned ;; Convert to words, unfold x2 lsr.f lp_count, r2, 3 #ifdef __BIG_ENDIAN__ asl.ne r5, r5, 24 #endif lpnz @.Lcopy8bytes_3 ;; LOOP START ld.ab r6, [r1, 4] prefetch [r1, 28] ;Prefetch the next read location ld.ab r8, [r1,4] prefetchw [r3, 32] ;Prefetch the next write location SHIFT_1 (r7, r6, 8) or r7, r7, r5 SHIFT_2 (r5, r6, 24) SHIFT_1 (r9, r8, 8) or r9, r9, r5 SHIFT_2 (r5, r8, 24) st.ab r7, [r3, 4] st.ab r9, [r3, 4] .Lcopy8bytes_3: #ifdef __BIG_ENDIAN__ lsr.nz r5, r5, 24 #endif stb.ab r5, [r3, 1] and.f lp_count, r2, 0x07 ;Last 8bytes lpnz @.Lcopybytewise_3 ;; LOOP START ldb.ab r6, [r1,1] stb.ab r6, [r3,1] .Lcopybytewise_3: j [blink] END(memcpy) libc_hidden_def(memcpy)