From 124ec188720b6bdea85ade49e7ea195161b12fce Mon Sep 17 00:00:00 2001
From: Mike Frysinger
Date: Sat, 5 Jan 2008 10:05:27 +0000
Subject: Chris Zankel writes:

The following patches add support for the Xtensa processor architecture
to uClibc.  They are based on a recent SVN checkout (12/05/2007).  The
first patch (attached to this post) adds Xtensa support to various
shared configuration and make files.  The following patches then
include the Xtensa specific files and directories.

I welcome any feedback and would appreciate it if you could include the
patches into the mainline tree.  I am certainly committed to maintain
the port.  Bob Wilson was kind enough to review the patches.

Some notes about the architecture: Xtensa is a configurable and
extensible processor architecture developed by Tensilica.  For more
information, please visit: www.linux-xtensa.org.
---
 libc/string/xtensa/memcpy.S | 297 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 297 insertions(+)
 create mode 100644 libc/string/xtensa/memcpy.S

diff --git a/libc/string/xtensa/memcpy.S b/libc/string/xtensa/memcpy.S
new file mode 100644
index 000000000..19f3a6818
--- /dev/null
+++ b/libc/string/xtensa/memcpy.S
@@ -0,0 +1,297 @@
+/* Optimized memcpy for Xtensa.
+   Copyright (C) 2001, 2007 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+   Boston, MA 02110-1301, USA.  */
+
+#include "../../sysdeps/linux/xtensa/sysdep.h"
+#include <bits/xtensa-config.h>
+
+        .macro  src_b   r, w0, w1
+#ifdef __XTENSA_EB__
+        src     \r, \w0, \w1
+#else
+        src     \r, \w1, \w0
+#endif
+        .endm
+
+        .macro  ssa8    r
+#ifdef __XTENSA_EB__
+        ssa8b   \r
+#else
+        ssa8l   \r
+#endif
+        .endm
+
+/* If the Xtensa Unaligned Load Exception option is not used, this
+   code can run a few cycles faster by relying on the low address bits
+   being ignored.  However, if the code is then run with an Xtensa ISS
+   client that checks for unaligned accesses, it will produce a lot of
+   warning messages.  Set this flag to disable the use of unaligned
+   accesses and keep the ISS happy.  */
+
+#define UNALIGNED_ADDRESSES_CHECKED 1
+
+/* Do not use .literal_position in the ENTRY macro.  */
+#undef LITERAL_POSITION
+#define LITERAL_POSITION
+
+
+/* void *memcpy (void *dst, const void *src, size_t len)
+
+   The algorithm is as follows:
+
+   If the destination is unaligned, align it by conditionally
+   copying 1- and/or 2-byte pieces.
+
+   If the source is aligned, copy 16 bytes with a loop, and then finish up
+   with 8, 4, 2, and 1-byte copies conditional on the length.
+
+   Else (if source is unaligned), do the same, but use SRC to align the
+   source data.
+
+   This code tries to use fall-through branches for the common
+   case of aligned source and destination and multiple of 4 (or 8) length.  */
+
+
+/* Byte by byte copy.  */
+
+        .text
+        .align  4
+        .literal_position
+__memcpy_aux:
+
+        /* Skip a byte to get 1 mod 4 alignment for LOOPNEZ
+           (0 mod 4 alignment for LBEG).  */
+        .byte   0
+
+.Lbytecopy:
+#if XCHAL_HAVE_LOOPS
+        loopnez a4, 2f
+#else
+        beqz    a4, 2f
+        add     a7, a3, a4      // a7 = end address for source
+#endif
+1:      l8ui    a6, a3, 0
+        addi    a3, a3, 1
+        s8i     a6, a5, 0
+        addi    a5, a5, 1
+#if !XCHAL_HAVE_LOOPS
+        blt     a3, a7, 1b
+#endif
+2:      retw
+
+
+/* Destination is unaligned.  */
+
+        .align  4
+.Ldst1mod2: // dst is only byte aligned
+
+        /* Do short copies byte-by-byte.  */
+        _bltui  a4, 7, .Lbytecopy
+
+        /* Copy 1 byte.  */
+        l8ui    a6, a3, 0
+        addi    a3, a3, 1
+        addi    a4, a4, -1
+        s8i     a6, a5, 0
+        addi    a5, a5, 1
+
+        /* Return to main algorithm if dst is now aligned.  */
+        _bbci.l a5, 1, .Ldstaligned
+
+.Ldst2mod4: // dst has 16-bit alignment
+
+        /* Do short copies byte-by-byte.  */
+        _bltui  a4, 6, .Lbytecopy
+
+        /* Copy 2 bytes.  */
+        l8ui    a6, a3, 0
+        l8ui    a7, a3, 1
+        addi    a3, a3, 2
+        addi    a4, a4, -2
+        s8i     a6, a5, 0
+        s8i     a7, a5, 1
+        addi    a5, a5, 2
+
+        /* dst is now aligned; return to main algorithm.  */
+        j       .Ldstaligned
+
+
+ENTRY (memcpy)
+        /* a2 = dst, a3 = src, a4 = len */
+
+        mov     a5, a2          // copy dst so that a2 is return value
+        _bbsi.l a2, 0, .Ldst1mod2
+        _bbsi.l a2, 1, .Ldst2mod4
+.Ldstaligned:
+
+        /* Get number of loop iterations with 16B per iteration.  */
+        srli    a7, a4, 4
+
+        /* Check if source is aligned.  */
+        movi    a8, 3
+        _bany   a3, a8, .Lsrcunaligned
+
+        /* Destination and source are word-aligned, use word copy.  */
+#if XCHAL_HAVE_LOOPS
+        loopnez a7, 2f
+#else
+        beqz    a7, 2f
+        slli    a8, a7, 4
+        add     a8, a8, a3      // a8 = end of last 16B source chunk
+#endif
+1:      l32i    a6, a3, 0
+        l32i    a7, a3, 4
+        s32i    a6, a5, 0
+        l32i    a6, a3, 8
+        s32i    a7, a5, 4
+        l32i    a7, a3, 12
+        s32i    a6, a5, 8
+        addi    a3, a3, 16
+        s32i    a7, a5, 12
+        addi    a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+        blt     a3, a8, 1b
+#endif
+
+        /* Copy any leftover pieces smaller than 16B.  */
+2:      bbci.l  a4, 3, 3f
+
+        /* Copy 8 bytes.  */
+        l32i    a6, a3, 0
+        l32i    a7, a3, 4
+        addi    a3, a3, 8
+        s32i    a6, a5, 0
+        s32i    a7, a5, 4
+        addi    a5, a5, 8
+
+3:      bbsi.l  a4, 2, 4f
+        bbsi.l  a4, 1, 5f
+        bbsi.l  a4, 0, 6f
+        retw
+
+        /* Copy 4 bytes.  */
+4:      l32i    a6, a3, 0
+        addi    a3, a3, 4
+        s32i    a6, a5, 0
+        addi    a5, a5, 4
+        bbsi.l  a4, 1, 5f
+        bbsi.l  a4, 0, 6f
+        retw
+
+        /* Copy 2 bytes.  */
+5:      l16ui   a6, a3, 0
+        addi    a3, a3, 2
+        s16i    a6, a5, 0
+        addi    a5, a5, 2
+        bbsi.l  a4, 0, 6f
+        retw
+
+        /* Copy 1 byte.  */
+6:      l8ui    a6, a3, 0
+        s8i     a6, a5, 0
+
+.Ldone:
+        retw
+
+
+/* Destination is aligned; source is unaligned.  */
+
+        .align  4
+.Lsrcunaligned:
+        /* Avoid loading anything for zero-length copies.  */
+        _beqz   a4, .Ldone
+
+        /* Copy 16 bytes per iteration for word-aligned dst and
+           unaligned src.  */
+        ssa8    a3              // set shift amount from byte offset
+#if UNALIGNED_ADDRESSES_CHECKED
+        and     a11, a3, a8     // save unalignment offset for below
+        sub     a3, a3, a11     // align a3
+#endif
+        l32i    a6, a3, 0       // load first word
+#if XCHAL_HAVE_LOOPS
+        loopnez a7, 2f
+#else
+        beqz    a7, 2f
+        slli    a10, a7, 4
+        add     a10, a10, a3    // a10 = end of last 16B source chunk
+#endif
+1:      l32i    a7, a3, 4
+        l32i    a8, a3, 8
+        src_b   a6, a6, a7
+        s32i    a6, a5, 0
+        l32i    a9, a3, 12
+        src_b   a7, a7, a8
+        s32i    a7, a5, 4
+        l32i    a6, a3, 16
+        src_b   a8, a8, a9
+        s32i    a8, a5, 8
+        addi    a3, a3, 16
+        src_b   a9, a9, a6
+        s32i    a9, a5, 12
+        addi    a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+        blt     a3, a10, 1b
+#endif
+
+2:      bbci.l  a4, 3, 3f
+
+        /* Copy 8 bytes.  */
+        l32i    a7, a3, 4
+        l32i    a8, a3, 8
+        src_b   a6, a6, a7
+        s32i    a6, a5, 0
+        addi    a3, a3, 8
+        src_b   a7, a7, a8
+        s32i    a7, a5, 4
+        addi    a5, a5, 8
+        mov     a6, a8
+
+3:      bbci.l  a4, 2, 4f
+
+        /* Copy 4 bytes.  */
+        l32i    a7, a3, 4
+        addi    a3, a3, 4
+        src_b   a6, a6, a7
+        s32i    a6, a5, 0
+        addi    a5, a5, 4
+        mov     a6, a7
+4:
+#if UNALIGNED_ADDRESSES_CHECKED
+        add     a3, a3, a11     // readjust a3 with correct misalignment
+#endif
+        bbsi.l  a4, 1, 5f
+        bbsi.l  a4, 0, 6f
+        retw
+
+        /* Copy 2 bytes.  */
+5:      l8ui    a6, a3, 0
+        l8ui    a7, a3, 1
+        addi    a3, a3, 2
+        s8i     a6, a5, 0
+        s8i     a7, a5, 1
+        addi    a5, a5, 2
+        bbsi.l  a4, 0, 6f
+        retw
+
+        /* Copy 1 byte.  */
+6:      l8ui    a6, a3, 0
+        s8i     a6, a5, 0
+        retw
+
+libc_hidden_def (memcpy)
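
Note for readers who do not follow Xtensa assembly: the strategy spelled out in
the file's header comment maps onto straightforward C.  The sketch below is a
minimal model only, not part of the patch: the function and variable names are
invented, it covers just the aligned-source path (.Ldstaligned), and it omits
the short-copy thresholds the assembly checks before falling back to
.Lbytecopy.  The library memcpy calls stand in for the l32i/s32i word pairs.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Illustrative model of the aligned-source path.  */
    static void *memcpy_aligned_model (void *dst, const void *src, size_t n)
    {
      unsigned char *d = dst;
      const unsigned char *s = src;
      size_t i;

      /* Align the destination with 1- and/or 2-byte copies
         (.Ldst1mod2 / .Ldst2mod4 in the assembly).  */
      if (((uintptr_t) d & 1) && n >= 1)
        { *d++ = *s++; n -= 1; }
      if (((uintptr_t) d & 2) && n >= 2)
        { *d++ = *s++; *d++ = *s++; n -= 2; }

      /* Main loop: 16 bytes (four words) per iteration, assuming the
         source is word-aligned by now as well.  */
      for (i = n >> 4; i != 0; i--)
        { memcpy (d, s, 16); d += 16; s += 16; }

      /* Tails of 8, 4, 2, and 1 bytes, selected by bits 3..0 of n.  */
      if (n & 8) { memcpy (d, s, 8); d += 8; s += 8; }
      if (n & 4) { memcpy (d, s, 4); d += 4; s += 4; }
      if (n & 2) { memcpy (d, s, 2); d += 2; s += 2; }
      if (n & 1) { *d = *s; }

      return dst;
    }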
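
The .Lsrcunaligned path is the interesting part: the source pointer is rounded
down to a word boundary, the shift amount is loaded from the discarded low
address bits (ssa8l, or ssa8b on big-endian), and each destination word is then
assembled from two adjacent aligned source words with the SRC funnel-shift
instruction.  Below is a minimal little-endian C sketch of that trick with
invented names; treat the word-pointer casts as pseudocode for the aligned
l32i loads rather than strictly portable C.

    #include <stddef.h>
    #include <stdint.h>

    /* Copy nwords 32-bit words to an aligned destination from a possibly
       misaligned source, using only aligned word loads.  Like the assembly,
       it may touch a few bytes outside the buffer, but only within the
       aligned words that already contain it.  */
    static void
    copy_words_misaligned_src (uint32_t *d, const unsigned char *s, size_t nwords)
    {
      unsigned off = (uintptr_t) s & 3;                   /* byte misalignment */
      const uint32_t *ws = (const uint32_t *) (s - off);  /* rounded down      */
      unsigned shift = 8 * off;
      uint32_t w0, w1;

      if (off == 0)                     /* source aligned: plain word copy */
        {
          while (nwords--)
            *d++ = *ws++;
          return;
        }

      w0 = *ws++;                       /* prime with the first source word */
      while (nwords--)
        {
          w1 = *ws++;
          /* The SRC step: high bytes of the earlier word combined with the
             low bytes of the next word form one destination word.  */
          *d++ = (w0 >> shift) | (w1 << (32 - shift));
          w0 = w1;
        }
    }

Each output word therefore costs one aligned load plus a shift/OR (a single SRC
on Xtensa) instead of four byte loads and stores, which is why the routine
prefers this over a byte loop when only the source is misaligned.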