From 124ec188720b6bdea85ade49e7ea195161b12fce Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Sat, 5 Jan 2008 10:05:27 +0000 Subject: Chris Zankel writes: The following patches add support for the Xtensa processor architecture to uClibc. They are based on a recent SVN checkout (12/05/2007). The first patch (attached to this post) adds Xtensa support to various shared configuration and make files. The following patches then include the Xtensa specific files and directories. I welcome any feedback and would appreciate it if you could include the patches into the mainline tree. I am certainly committed to maintain the port. Bob Wilson was kind enough to review the patches. Some notes about the architecture: Xtensa is a configurable and extensible processor architecture developed by Tensilica. For more information, please visit: www.linux-xtensa.org. --- libc/string/xtensa/Makefile | 13 ++ libc/string/xtensa/memcpy.S | 297 ++++++++++++++++++++++++++++++++++++++++ libc/string/xtensa/memset.S | 165 +++++++++++++++++++++++ libc/string/xtensa/strcmp.S | 313 +++++++++++++++++++++++++++++++++++++++++++ libc/string/xtensa/strcpy.S | 150 +++++++++++++++++++++ libc/string/xtensa/strlen.S | 104 ++++++++++++++ libc/string/xtensa/strncpy.S | 241 +++++++++++++++++++++++++++++++++ 7 files changed, 1283 insertions(+) create mode 100644 libc/string/xtensa/Makefile create mode 100644 libc/string/xtensa/memcpy.S create mode 100644 libc/string/xtensa/memset.S create mode 100644 libc/string/xtensa/strcmp.S create mode 100644 libc/string/xtensa/strcpy.S create mode 100644 libc/string/xtensa/strlen.S create mode 100644 libc/string/xtensa/strncpy.S (limited to 'libc/string/xtensa') diff --git a/libc/string/xtensa/Makefile b/libc/string/xtensa/Makefile new file mode 100644 index 000000000..0a95346fd --- /dev/null +++ b/libc/string/xtensa/Makefile @@ -0,0 +1,13 @@ +# Makefile for uClibc +# +# Copyright (C) 2000-2005 Erik Andersen +# +# Licensed under the LGPL v2.1, see the file COPYING.LIB 
in this tarball.
+#
+
+top_srcdir:=../../../
+top_builddir:=../../../
+all: objs
+include $(top_builddir)Rules.mak
+include ../Makefile.in
+include $(top_srcdir)Makerules
diff --git a/libc/string/xtensa/memcpy.S b/libc/string/xtensa/memcpy.S
new file mode 100644
index 000000000..19f3a6818
--- /dev/null
+++ b/libc/string/xtensa/memcpy.S
@@ -0,0 +1,297 @@
+/* Optimized memcpy for Xtensa.
+   Copyright (C) 2001, 2007 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+   Boston, MA 02110-1301, USA. */
+
+#include "../../sysdeps/linux/xtensa/sysdep.h"
+#include <bits/xtensa-config.h>
+
+	.macro	src_b	r, w0, w1
+#ifdef __XTENSA_EB__
+	src	\r, \w0, \w1
+#else
+	src	\r, \w1, \w0
+#endif
+	.endm
+
+	.macro	ssa8	r
+#ifdef __XTENSA_EB__
+	ssa8b	\r
+#else
+	ssa8l	\r
+#endif
+	.endm
+
+/* If the Xtensa Unaligned Load Exception option is not used, this
+   code can run a few cycles faster by relying on the low address bits
+   being ignored. However, if the code is then run with an Xtensa ISS
+   client that checks for unaligned accesses, it will produce a lot of
+   warning messages. Set this flag to disable the use of unaligned
+   accesses and keep the ISS happy. */
+
+#define UNALIGNED_ADDRESSES_CHECKED 1
+
+/* Do not use .literal_position in the ENTRY macro. */
+#undef LITERAL_POSITION
+#define LITERAL_POSITION
+
+
+/* void *memcpy (void *dst, const void *src, size_t len)
+
+   The algorithm is as follows:
+
+   If the destination is unaligned, align it by conditionally
+   copying 1- and/or 2-byte pieces.
+
+   If the source is aligned, copy 16 bytes with a loop, and then finish up
+   with 8, 4, 2, and 1-byte copies conditional on the length.
+
+   Else (if source is unaligned), do the same, but use SRC to align the
+   source data.
+
+   This code tries to use fall-through branches for the common
+   case of aligned source and destination and multiple of 4 (or 8) length. */
+
+
+/* Byte by byte copy. */
+
+	.text
+	.align	4
+	.literal_position
+__memcpy_aux:
+
+	/* Skip a byte to get 1 mod 4 alignment for LOOPNEZ
+	   (0 mod 4 alignment for LBEG). */
+	.byte	0
+
+.Lbytecopy:
+#if XCHAL_HAVE_LOOPS
+	loopnez	a4, 2f
+#else
+	beqz	a4, 2f
+	add	a7, a3, a4		// a7 = end address for source
+#endif
+1:	l8ui	a6, a3, 0
+	addi	a3, a3, 1
+	s8i	a6, a5, 0
+	addi	a5, a5, 1
+#if !XCHAL_HAVE_LOOPS
+	bne	a3, a7, 1b		// exact end test; signed blt fails across 0x80000000
+#endif
+2:	retw
+
+
+/* Destination is unaligned. */
+
+	.align	4
+.Ldst1mod2:	// dst is only byte aligned
+
+	/* Do short copies byte-by-byte. */
+	_bltui	a4, 7, .Lbytecopy
+
+	/* Copy 1 byte. */
+	l8ui	a6, a3, 0
+	addi	a3, a3, 1
+	addi	a4, a4, -1
+	s8i	a6, a5, 0
+	addi	a5, a5, 1
+
+	/* Return to main algorithm if dst is now aligned. */
+	_bbci.l	a5, 1, .Ldstaligned
+
+.Ldst2mod4:	// dst has 16-bit alignment
+
+	/* Do short copies byte-by-byte. */
+	_bltui	a4, 6, .Lbytecopy
+
+	/* Copy 2 bytes. */
+	l8ui	a6, a3, 0
+	l8ui	a7, a3, 1
+	addi	a3, a3, 2
+	addi	a4, a4, -2
+	s8i	a6, a5, 0
+	s8i	a7, a5, 1
+	addi	a5, a5, 2
+
+	/* dst is now aligned; return to main algorithm. */
+	j	.Ldstaligned
+
+
+ENTRY (memcpy)
+	/* a2 = dst, a3 = src, a4 = len */
+
+	mov	a5, a2			// copy dst so that a2 is return value
+	_bbsi.l	a2, 0, .Ldst1mod2
+	_bbsi.l	a2, 1, .Ldst2mod4
+.Ldstaligned:
+
+	/* Get number of loop iterations with 16B per iteration. */
+	srli	a7, a4, 4
+
+	/* Check if source is aligned. */
+	movi	a8, 3
+	_bany	a3, a8, .Lsrcunaligned
+
+	/* Destination and source are word-aligned, use word copy. */
+#if XCHAL_HAVE_LOOPS
+	loopnez	a7, 2f
+#else
+	beqz	a7, 2f
+	slli	a8, a7, 4
+	add	a8, a8, a3		// a8 = end of last 16B source chunk
+#endif
+1:	l32i	a6, a3, 0
+	l32i	a7, a3, 4
+	s32i	a6, a5, 0
+	l32i	a6, a3, 8
+	s32i	a7, a5, 4
+	l32i	a7, a3, 12
+	s32i	a6, a5, 8
+	addi	a3, a3, 16
+	s32i	a7, a5, 12
+	addi	a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+	bne	a3, a8, 1b		// a3 steps by 16 and lands exactly on a8
+#endif
+
+	/* Copy any leftover pieces smaller than 16B. */
+2:	bbci.l	a4, 3, 3f
+
+	/* Copy 8 bytes. */
+	l32i	a6, a3, 0
+	l32i	a7, a3, 4
+	addi	a3, a3, 8
+	s32i	a6, a5, 0
+	s32i	a7, a5, 4
+	addi	a5, a5, 8
+
+3:	bbsi.l	a4, 2, 4f
+	bbsi.l	a4, 1, 5f
+	bbsi.l	a4, 0, 6f
+	retw
+
+	/* Copy 4 bytes. */
+4:	l32i	a6, a3, 0
+	addi	a3, a3, 4
+	s32i	a6, a5, 0
+	addi	a5, a5, 4
+	bbsi.l	a4, 1, 5f
+	bbsi.l	a4, 0, 6f
+	retw
+
+	/* Copy 2 bytes. */
+5:	l16ui	a6, a3, 0
+	addi	a3, a3, 2
+	s16i	a6, a5, 0
+	addi	a5, a5, 2
+	bbsi.l	a4, 0, 6f
+	retw
+
+	/* Copy 1 byte. */
+6:	l8ui	a6, a3, 0
+	s8i	a6, a5, 0
+
+.Ldone:
+	retw
+
+
+/* Destination is aligned; source is unaligned. */
+
+	.align	4
+.Lsrcunaligned:
+	/* Avoid loading anything for zero-length copies. */
+	_beqz	a4, .Ldone
+
+	/* Copy 16 bytes per iteration for word-aligned dst and
+	   unaligned src. */
+	ssa8	a3			// set shift amount from byte offset
+#if UNALIGNED_ADDRESSES_CHECKED
+	and	a11, a3, a8		// save unalignment offset for below
+	sub	a3, a3, a11		// align a3
+#endif
+	l32i	a6, a3, 0		// load first word
+#if XCHAL_HAVE_LOOPS
+	loopnez	a7, 2f
+#else
+	beqz	a7, 2f
+	slli	a10, a7, 4
+	add	a10, a10, a3		// a10 = end of last 16B source chunk
+#endif
+1:	l32i	a7, a3, 4
+	l32i	a8, a3, 8
+	src_b	a6, a6, a7
+	s32i	a6, a5, 0
+	l32i	a9, a3, 12
+	src_b	a7, a7, a8
+	s32i	a7, a5, 4
+	l32i	a6, a3, 16
+	src_b	a8, a8, a9
+	s32i	a8, a5, 8
+	addi	a3, a3, 16
+	src_b	a9, a9, a6
+	s32i	a9, a5, 12
+	addi	a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+	bne	a3, a10, 1b		// a3 steps by 16 and lands exactly on a10
+#endif
+
+2:	bbci.l	a4, 3, 3f
+
+	/* Copy 8 bytes. */
+	l32i	a7, a3, 4
+	l32i	a8, a3, 8
+	src_b	a6, a6, a7
+	s32i	a6, a5, 0
+	addi	a3, a3, 8
+	src_b	a7, a7, a8
+	s32i	a7, a5, 4
+	addi	a5, a5, 8
+	mov	a6, a8
+
+3:	bbci.l	a4, 2, 4f
+
+	/* Copy 4 bytes. */
+	l32i	a7, a3, 4
+	addi	a3, a3, 4
+	src_b	a6, a6, a7
+	s32i	a6, a5, 0
+	addi	a5, a5, 4
+	mov	a6, a7
+4:
+#if UNALIGNED_ADDRESSES_CHECKED
+	add	a3, a3, a11		// readjust a3 with correct misalignment
+#endif
+	bbsi.l	a4, 1, 5f
+	bbsi.l	a4, 0, 6f
+	retw
+
+	/* Copy 2 bytes. */
+5:	l8ui	a6, a3, 0
+	l8ui	a7, a3, 1
+	addi	a3, a3, 2
+	s8i	a6, a5, 0
+	s8i	a7, a5, 1
+	addi	a5, a5, 2
+	bbsi.l	a4, 0, 6f
+	retw
+
+	/* Copy 1 byte. */
+6:	l8ui	a6, a3, 0
+	s8i	a6, a5, 0
+	retw
+
+libc_hidden_def (memcpy)
diff --git a/libc/string/xtensa/memset.S b/libc/string/xtensa/memset.S
new file mode 100644
index 000000000..c0928825d
--- /dev/null
+++ b/libc/string/xtensa/memset.S
@@ -0,0 +1,165 @@
+/* Optimized memset for Xtensa.
+   Copyright (C) 2001, 2007 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+   Boston, MA 02110-1301, USA. */
+
+#include "../../sysdeps/linux/xtensa/sysdep.h"
+#include <bits/xtensa-config.h>
+
+/* Do not use .literal_position in the ENTRY macro. */
+#undef LITERAL_POSITION
+#define LITERAL_POSITION
+
+/* void *memset (void *dst, int c, size_t length)
+
+   The algorithm is as follows:
+
+   Create a word with c in all byte positions.
+
+   If the destination is aligned, set 16B chunks with a loop, and then
+   finish up with 8B, 4B, 2B, and 1B stores conditional on the length.
+
+   If the destination is unaligned, align it by conditionally
+   setting 1B and/or 2B and then go to aligned case.
+
+   This code tries to use fall-through branches for the common
+   case of an aligned destination (except for the branches to
+   the alignment labels). */
+
+
+/* Byte-by-byte set. */
+
+	.text
+	.align	4
+	.literal_position
+__memset_aux:
+
+	/* Skip a byte to get 1 mod 4 alignment for LOOPNEZ
+	   (0 mod 4 alignment for LBEG). */
+	.byte	0
+
+.Lbyteset:
+#if XCHAL_HAVE_LOOPS
+	loopnez	a4, 2f
+#else
+	beqz	a4, 2f
+	add	a6, a5, a4		// a6 = ending address
+#endif
+1:	s8i	a3, a5, 0
+	addi	a5, a5, 1
+#if !XCHAL_HAVE_LOOPS
+	bne	a5, a6, 1b		// exact end test; signed blt fails across 0x80000000
+#endif
+2:	retw
+
+
+/* Destination is unaligned. */
+
+	.align	4
+
+.Ldst1mod2:	// dst is only byte aligned
+
+	/* Do short sizes byte-by-byte. */
+	bltui	a4, 8, .Lbyteset
+
+	/* Set 1 byte. */
+	s8i	a3, a5, 0
+	addi	a5, a5, 1
+	addi	a4, a4, -1
+
+	/* Now retest if dst is aligned. */
+	_bbci.l	a5, 1, .Ldstaligned
+
+.Ldst2mod4:	// dst has 16-bit alignment
+
+	/* Do short sizes byte-by-byte. */
+	bltui	a4, 8, .Lbyteset
+
+	/* Set 2 bytes. */
+	s16i	a3, a5, 0
+	addi	a5, a5, 2
+	addi	a4, a4, -2
+
+	/* dst is now aligned; return to main algorithm */
+	j	.Ldstaligned
+
+
+ENTRY (memset)
+	/* a2 = dst, a3 = c, a4 = length */
+
+	/* Duplicate character into all bytes of word. */
+	extui	a3, a3, 0, 8
+	slli	a7, a3, 8
+	or	a3, a3, a7
+	slli	a7, a3, 16
+	or	a3, a3, a7
+
+	mov	a5, a2			// copy dst so that a2 is return value
+
+	/* Check if dst is unaligned. */
+	_bbsi.l	a2, 0, .Ldst1mod2
+	_bbsi.l	a2, 1, .Ldst2mod4
+.Ldstaligned:
+
+	/* Get number of loop iterations with 16B per iteration. */
+	srli	a7, a4, 4
+
+	/* Destination is word-aligned. */
+#if XCHAL_HAVE_LOOPS
+	loopnez	a7, 2f
+#else
+	beqz	a7, 2f
+	slli	a6, a7, 4
+	add	a6, a6, a5		// a6 = end of last 16B chunk
+#endif
+	/* Set 16 bytes per iteration. */
+1:	s32i	a3, a5, 0
+	s32i	a3, a5, 4
+	s32i	a3, a5, 8
+	s32i	a3, a5, 12
+	addi	a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+	bne	a5, a6, 1b		// a5 steps by 16 and lands exactly on a6
+#endif
+
+	/* Set any leftover pieces smaller than 16B. */
+2:	bbci.l	a4, 3, 3f
+
+	/* Set 8 bytes. */
+	s32i	a3, a5, 0
+	s32i	a3, a5, 4
+	addi	a5, a5, 8
+
+3:	bbci.l	a4, 2, 4f
+
+	/* Set 4 bytes. */
+	s32i	a3, a5, 0
+	addi	a5, a5, 4
+
+4:	bbci.l	a4, 1, 5f
+
+	/* Set 2 bytes. */
+	s16i	a3, a5, 0
+	addi	a5, a5, 2
+
+5:	bbci.l	a4, 0, 6f
+
+	/* Set 1 byte. */
+	s8i	a3, a5, 0
+6:	retw
+
+libc_hidden_def (memset)
diff --git a/libc/string/xtensa/strcmp.S b/libc/string/xtensa/strcmp.S
new file mode 100644
index 000000000..90c418d12
--- /dev/null
+++ b/libc/string/xtensa/strcmp.S
@@ -0,0 +1,313 @@
+/* Optimized strcmp for Xtensa.
+   Copyright (C) 2001, 2007 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+   Boston, MA 02110-1301, USA. */
+
+#include "../../sysdeps/linux/xtensa/sysdep.h"
+#include <bits/xtensa-config.h>
+
+#ifdef __XTENSA_EB__
+#define MASK0 0xff000000
+#define MASK1 0x00ff0000
+#define MASK2 0x0000ff00
+#define MASK3 0x000000ff
+#else
+#define MASK0 0x000000ff
+#define MASK1 0x0000ff00
+#define MASK2 0x00ff0000
+#define MASK3 0xff000000
+#endif
+
+#define MASK4 0x40404040	/* bit 6 of every byte; doubles as the "endless" loop count */
+
+	.literal .Lmask0, MASK0
+	.literal .Lmask1, MASK1
+	.literal .Lmask2, MASK2
+	.literal .Lmask3, MASK3
+	.literal .Lmask4, MASK4
+
+	.text
+ENTRY (strcmp)
+	/* a2 = s1, a3 = s2 */
+
+	l8ui	a8, a2, 0		// byte 0 from s1
+	l8ui	a9, a3, 0		// byte 0 from s2
+	movi	a10, 3			// mask
+	bne	a8, a9, .Lretdiff
+
+	or	a11, a2, a3
+	bnone	a11, a10, .Laligned
+
+	xor	a11, a2, a3		// compare low two bits of s1 and s2
+	bany	a11, a10, .Lunaligned	// if they have different alignment
+
+	/* s1/s2 are not word-aligned. */
+	addi	a2, a2, 1		// advance s1
+	beqz	a8, .Leq		// bytes equal, if zero, strings are equal
+	addi	a3, a3, 1		// advance s2
+	bnone	a2, a10, .Laligned	// if s1/s2 now aligned
+	l8ui	a8, a2, 0		// byte 1 from s1
+	l8ui	a9, a3, 0		// byte 1 from s2
+	addi	a2, a2, 1		// advance s1
+	bne	a8, a9, .Lretdiff	// if different, return difference
+	beqz	a8, .Leq		// bytes equal, if zero, strings are equal
+	addi	a3, a3, 1		// advance s2
+	bnone	a2, a10, .Laligned	// if s1/s2 now aligned
+	l8ui	a8, a2, 0		// byte 2 from s1
+	l8ui	a9, a3, 0		// byte 2 from s2
+	addi	a2, a2, 1		// advance s1
+	bne	a8, a9, .Lretdiff	// if different, return difference
+	beqz	a8, .Leq		// bytes equal, if zero, strings are equal
+	addi	a3, a3, 1		// advance s2
+	j	.Laligned
+
+/* s1 and s2 have different alignment.
+
+   If the zero-overhead loop option is available, use an (almost)
+   infinite zero-overhead loop with conditional exits so we only pay
+   for taken branches when exiting the loop.
+
+   Note: It is important for this unaligned case to come before the
+   code for aligned strings, because otherwise some of the branches
+   above cannot reach and have to be transformed to branches around
+   jumps. The unaligned code is smaller and the branches can reach
+   over it. */
+
+	.align	4
+	/* (2 mod 4) alignment for loop instruction */
+.Lunaligned:
+#if XCHAL_HAVE_LOOPS
+	_movi.n	a8, 0			// set up for the maximum loop count
+	loop	a8, .Lretdiff		// loop forever (almost anyway)
+#endif
+.Lnextbyte:
+	l8ui	a8, a2, 0
+	l8ui	a9, a3, 0
+	addi	a2, a2, 1
+	bne	a8, a9, .Lretdiff
+	addi	a3, a3, 1
+#if XCHAL_HAVE_LOOPS
+	beqz	a8, .Lretdiff
+#else
+	bnez	a8, .Lnextbyte
+#endif
+.Lretdiff:
+	sub	a2, a8, a9
+	retw
+
+/* s1 is word-aligned; s2 is word-aligned.
+
+   If the zero-overhead loop option is available, use an (almost)
+   infinite zero-overhead loop with conditional exits so we only pay
+   for taken branches when exiting the loop. */
+
+/* New algorithm, relying on the fact that all normal ASCII is between
+   32 and 127.
+
+   Rather than check all bytes for zero:
+   Take one word (4 bytes). Call it w1.
+   Shift w1 left by one into w1'.
+   Or w1 and w1'. For all normal ASCII bit 6 will be 1; for zero it won't.
+   Check that all 4 bit 6's (one for each byte) are one:
+     If they are, we are definitely not done.
+     If they are not, we are probably done, but need to check for zero. */
+
+	.align	4
+#if XCHAL_HAVE_LOOPS
+.Laligned:
+	.begin	no-transform
+	l32r	a4, .Lmask0		// mask for byte 0
+	l32r	a7, .Lmask4
+	/* Loop forever. (a7 = MASK4 is more than the maximum number of
+	   iterations on either endianness; MASK0 is only 255 on LE.) */
+	loop	a7, .Laligned_done
+
+	/* First unrolled loop body. */
+	l32i	a8, a2, 0		// get word from s1
+	l32i	a9, a3, 0		// get word from s2
+	slli	a5, a8, 1
+	bne	a8, a9, .Lwne2
+	or	a9, a8, a5
+	bnall	a9, a7, .Lprobeq
+
+	/* Second unrolled loop body. */
+	l32i	a8, a2, 4		// get word from s1+4
+	l32i	a9, a3, 4		// get word from s2+4
+	slli	a5, a8, 1
+	bne	a8, a9, .Lwne2
+	or	a9, a8, a5
+	bnall	a9, a7, .Lprobeq2
+
+	addi	a2, a2, 8		// advance s1 pointer
+	addi	a3, a3, 8		// advance s2 pointer
+.Laligned_done:
+	or	a1, a1, a1		// nop
+
+.Lprobeq2:
+	/* Adjust pointers to account for the loop unrolling. */
+	addi	a2, a2, 4
+	addi	a3, a3, 4
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+.Laligned:
+	movi	a4, MASK0		// mask for byte 0
+	movi	a7, MASK4
+	j	.Lfirstword
+.Lnextword:
+	addi	a2, a2, 4		// advance s1 pointer
+	addi	a3, a3, 4		// advance s2 pointer
+.Lfirstword:
+	l32i	a8, a2, 0		// get word from s1
+	l32i	a9, a3, 0		// get word from s2
+	slli	a5, a8, 1
+	bne	a8, a9, .Lwne2
+	or	a9, a8, a5
+	ball	a9, a7, .Lnextword
+#endif /* !XCHAL_HAVE_LOOPS */
+
+	/* align (0 mod 4) */
+.Lprobeq:
+	/* Words are probably equal, but check for sure.
+	   If not, loop over the rest of string using normal algorithm. */
+
+	bnone	a8, a4, .Leq		// if byte 0 is zero
+	l32r	a5, .Lmask1		// mask for byte 1
+	l32r	a6, .Lmask2		// mask for byte 2
+	bnone	a8, a5, .Leq		// if byte 1 is zero
+	l32r	a10, .Lmask3		// mask for byte 3 (a7 keeps MASK4 as loop count)
+	bnone	a8, a6, .Leq		// if byte 2 is zero
+	bnone	a8, a10, .Leq		// if byte 3 is zero
+	addi.n	a2, a2, 4		// advance s1 pointer
+	addi.n	a3, a3, 4		// advance s2 pointer
+#if XCHAL_HAVE_LOOPS
+
+	/* align (1 mod 4) */
+	loop	a7, .Leq		// loop forever (a7 = MASK4 is bigger than max iters)
+	.end	no-transform
+
+	l32i	a8, a2, 0		// get word from s1
+	l32i	a9, a3, 0		// get word from s2
+	addi	a2, a2, 4		// advance s1 pointer
+	bne	a8, a9, .Lwne
+	bnone	a8, a4, .Leq		// if byte 0 is zero
+	bnone	a8, a5, .Leq		// if byte 1 is zero
+	bnone	a8, a6, .Leq		// if byte 2 is zero
+	bnone	a8, a10, .Leq		// if byte 3 is zero
+	addi	a3, a3, 4		// advance s2 pointer
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+	j	.Lfirstword2
+.Lnextword2:
+	addi	a3, a3, 4		// advance s2 pointer
+.Lfirstword2:
+	l32i	a8, a2, 0		// get word from s1
+	l32i	a9, a3, 0		// get word from s2
+	addi	a2, a2, 4		// advance s1 pointer
+	bne	a8, a9, .Lwne
+	bnone	a8, a4, .Leq		// if byte 0 is zero
+	bnone	a8, a5, .Leq		// if byte 1 is zero
+	bnone	a8, a6, .Leq		// if byte 2 is zero
+	bany	a8, a10, .Lnextword2	// loop if byte 3 is nonzero
+#endif /* !XCHAL_HAVE_LOOPS */
+
+	/* Words are equal; some byte is zero. */
+.Leq:	movi	a2, 0			// return equal
+	retw
+
+.Lwne2:	/* Words are not equal. On big-endian processors, if none of the
+	   bytes are zero, the return value can be determined by a simple
+	   comparison. */
+#ifdef __XTENSA_EB__
+	or	a10, a8, a5
+	bnall	a10, a7, .Lsomezero
+	bgeu	a8, a9, .Lposreturn
+	movi	a2, -1
+	retw
+.Lposreturn:
+	movi	a2, 1
+	retw
+.Lsomezero:	// There is probably some zero byte.
+#endif /* __XTENSA_EB__ */
+.Lwne:	/* Words are not equal. */
+	xor	a2, a8, a9		// get word with nonzero in byte that differs
+	bany	a2, a4, .Ldiff0		// if byte 0 differs
+	movi	a5, MASK1		// mask for byte 1
+	bnone	a8, a4, .Leq		// if byte 0 is zero
+	bany	a2, a5, .Ldiff1		// if byte 1 differs
+	movi	a6, MASK2		// mask for byte 2
+	bnone	a8, a5, .Leq		// if byte 1 is zero
+	bany	a2, a6, .Ldiff2		// if byte 2 differs
+	bnone	a8, a6, .Leq		// if byte 2 is zero
+#ifdef __XTENSA_EB__
+.Ldiff3:
+.Ldiff2:
+.Ldiff1:
+	/* Byte 0 is equal (at least) and there is a difference before a zero
+	   byte. Just subtract words to get the return value.
+	   The high order equal bytes cancel, leaving room for the sign. */
+	sub	a2, a8, a9
+	retw
+
+.Ldiff0:
+	/* Need to make room for the sign, so can't subtract whole words. */
+	extui	a10, a8, 24, 8
+	extui	a11, a9, 24, 8
+	sub	a2, a10, a11
+	retw
+
+#else /* !__XTENSA_EB__ */
+	/* Little-endian is a little more difficult because can't subtract
+	   whole words. */
+.Ldiff3:
+	/* Bytes 0-2 are equal; byte 3 is different.
+	   For little-endian need to have a sign bit for the difference. */
+	extui	a10, a8, 24, 8
+	extui	a11, a9, 24, 8
+	sub	a2, a10, a11
+	retw
+
+.Ldiff0:
+	/* Byte 0 is different. */
+	extui	a10, a8, 0, 8
+	extui	a11, a9, 0, 8
+	sub	a2, a10, a11
+	retw
+
+.Ldiff1:
+	/* Byte 0 is equal; byte 1 is different. */
+	extui	a10, a8, 8, 8
+	extui	a11, a9, 8, 8
+	sub	a2, a10, a11
+	retw
+
+.Ldiff2:
+	/* Bytes 0-1 are equal; byte 2 is different. */
+	extui	a10, a8, 16, 8
+	extui	a11, a9, 16, 8
+	sub	a2, a10, a11
+	retw
+
+#endif /* !__XTENSA_EB__ */
+
+libc_hidden_def (strcmp)
+
+#ifndef __UCLIBC_HAS_LOCALE__
+strong_alias (strcmp, strcoll)
+libc_hidden_def (strcoll)
+#endif
diff --git a/libc/string/xtensa/strcpy.S b/libc/string/xtensa/strcpy.S
new file mode 100644
index 000000000..108070384
--- /dev/null
+++ b/libc/string/xtensa/strcpy.S
@@ -0,0 +1,150 @@
+/* Optimized strcpy for Xtensa.
+   Copyright (C) 2001, 2007 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+   Boston, MA 02110-1301, USA. */
+
+#include "../../sysdeps/linux/xtensa/sysdep.h"
+#include <bits/xtensa-config.h>
+
+#ifdef __XTENSA_EB__
+#define MASK0 0xff000000
+#define MASK1 0x00ff0000
+#define MASK2 0x0000ff00
+#define MASK3 0x000000ff
+#else
+#define MASK0 0x000000ff
+#define MASK1 0x0000ff00
+#define MASK2 0x00ff0000
+#define MASK3 0xff000000
+#endif
+
+	.text
+ENTRY (strcpy)
+	/* a2 = dst, a3 = src */
+
+	mov	a10, a2			// a10 = working dst; a2 stays as the return value
+	movi	a4, MASK0
+	movi	a5, MASK1
+	movi	a6, MASK2
+	movi	a7, MASK3
+	bbsi.l	a3, 0, .Lsrc1mod2
+	bbsi.l	a3, 1, .Lsrc2mod4
+.Lsrcaligned:
+
+	/* Check if the destination is aligned. */
+	movi	a8, 3
+	bnone	a10, a8, .Laligned
+
+	j	.Ldstunaligned
+
+.Lsrc1mod2:	// src address is odd
+	l8ui	a8, a3, 0		// get byte 0
+	addi	a3, a3, 1		// advance src pointer
+	s8i	a8, a10, 0		// store byte 0
+	beqz	a8, 1f			// if byte 0 is zero
+	addi	a10, a10, 1		// advance dst pointer
+	bbci.l	a3, 1, .Lsrcaligned	// if src is now word-aligned
+
+.Lsrc2mod4:	// src address is 2 mod 4
+	l8ui	a8, a3, 0		// get byte 0
+	/* 1-cycle interlock */
+	s8i	a8, a10, 0		// store byte 0
+	beqz	a8, 1f			// if byte 0 is zero
+	l8ui	a8, a3, 1		// get byte 1
+	addi	a3, a3, 2		// advance src pointer
+	s8i	a8, a10, 1		// store byte 1
+	addi	a10, a10, 2		// advance dst pointer
+	bnez	a8, .Lsrcaligned
+1:	retw
+
+
+/* dst is word-aligned; src is word-aligned. */
+
+	.align	4
+#if XCHAL_HAVE_LOOPS
+	/* (2 mod 4) alignment for loop instruction */
+.Laligned:
+	_movi.n	a8, 0			// set up for the maximum loop count
+	loop	a8, .Lz3		// loop forever (almost anyway)
+	l32i	a8, a3, 0		// get word from src
+	addi	a3, a3, 4		// advance src pointer
+	bnone	a8, a4, .Lz0		// if byte 0 is zero
+	bnone	a8, a5, .Lz1		// if byte 1 is zero
+	bnone	a8, a6, .Lz2		// if byte 2 is zero
+	s32i	a8, a10, 0		// store word to dst
+	bnone	a8, a7, .Lz3		// if byte 3 is zero
+	addi	a10, a10, 4		// advance dst pointer
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+1:	addi	a10, a10, 4		// advance dst pointer
+.Laligned:
+	l32i	a8, a3, 0		// get word from src
+	addi	a3, a3, 4		// advance src pointer
+	bnone	a8, a4, .Lz0		// if byte 0 is zero
+	bnone	a8, a5, .Lz1		// if byte 1 is zero
+	bnone	a8, a6, .Lz2		// if byte 2 is zero
+	s32i	a8, a10, 0		// store word to dst
+	bany	a8, a7, 1b		// loop if byte 3 is nonzero
+#endif /* !XCHAL_HAVE_LOOPS */
+
+.Lz3:	/* Byte 3 is zero. */
+	retw
+
+.Lz0:	/* Byte 0 is zero. */
+#ifdef __XTENSA_EB__
+	movi	a8, 0
+#endif
+	s8i	a8, a10, 0
+	retw
+
+.Lz1:	/* Byte 1 is zero. */
+#ifdef __XTENSA_EB__
+	extui	a8, a8, 16, 16
+#endif
+	s16i	a8, a10, 0
+	retw
+
+.Lz2:	/* Byte 2 is zero. */
+#ifdef __XTENSA_EB__
+	extui	a8, a8, 16, 16
+#endif
+	s16i	a8, a10, 0
+	movi	a8, 0
+	s8i	a8, a10, 2
+	retw
+
+	.align	4
+	/* (2 mod 4) alignment for loop instruction */
+.Ldstunaligned:
+
+#if XCHAL_HAVE_LOOPS
+	_movi.n	a8, 0			// set up for the maximum loop count
+	loop	a8, 2f			// loop forever (almost anyway)
+#endif
+1:	l8ui	a8, a3, 0
+	addi	a3, a3, 1
+	s8i	a8, a10, 0
+	addi	a10, a10, 1
+#if XCHAL_HAVE_LOOPS
+	beqz	a8, 2f
+#else
+	bnez	a8, 1b
+#endif
+2:	retw
+
+libc_hidden_def (strcpy)
diff --git a/libc/string/xtensa/strlen.S b/libc/string/xtensa/strlen.S
new file mode 100644
index 000000000..dd72c16fa
--- /dev/null
+++ b/libc/string/xtensa/strlen.S
@@ -0,0 +1,104 @@
+/* Optimized strlen for Xtensa.
+   Copyright (C) 2001, 2007 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+   Boston, MA 02110-1301, USA.
*/
+
+#include "../../sysdeps/linux/xtensa/sysdep.h"
+#include <bits/xtensa-config.h>
+
+#ifdef __XTENSA_EB__
+#define MASK0 0xff000000
+#define MASK1 0x00ff0000
+#define MASK2 0x0000ff00
+#define MASK3 0x000000ff
+#else
+#define MASK0 0x000000ff
+#define MASK1 0x0000ff00
+#define MASK2 0x00ff0000
+#define MASK3 0xff000000
+#endif
+
+	.text
+ENTRY (strlen)
+	/* a2 = s */
+
+	addi	a3, a2, -4		// because we overincrement at the end
+	movi	a4, MASK0
+	movi	a5, MASK1
+	movi	a6, MASK2
+	movi	a7, MASK3
+	bbsi.l	a2, 0, .L1mod2
+	bbsi.l	a2, 1, .L2mod4
+	j	.Laligned
+
+.L1mod2:	// address is odd
+	l8ui	a8, a3, 4		// get byte 0
+	addi	a3, a3, 1		// advance string pointer
+	beqz	a8, .Lz3		// if byte 0 is zero
+	bbci.l	a3, 1, .Laligned	// if string pointer is now word-aligned
+
+.L2mod4:	// address is 2 mod 4
+	addi	a3, a3, 2		// advance ptr for aligned access
+	l32i	a8, a3, 0		// get word with first two bytes of string
+	bnone	a8, a6, .Lz2		// if byte 2 (of word, not string) is zero
+	bany	a8, a7, .Laligned	// if byte 3 (of word, not string) is nonzero
+
+	/* Byte 3 is zero. */
+	addi	a3, a3, 3		// point to zero byte
+	sub	a2, a3, a2		// subtract to get length
+	retw
+
+
+/* String is word-aligned. */
+
+	.align	4
+	/* (2 mod 4) alignment for loop instruction */
+.Laligned:
+#if XCHAL_HAVE_LOOPS
+	_movi.n	a8, 0			// set up for the maximum loop count
+	loop	a8, .Lz3		// loop forever (almost anyway)
+#endif
+1:	l32i	a8, a3, 4		// get next word of string
+	addi	a3, a3, 4		// advance string pointer
+	bnone	a8, a4, .Lz0		// if byte 0 is zero
+	bnone	a8, a5, .Lz1		// if byte 1 is zero
+	bnone	a8, a6, .Lz2		// if byte 2 is zero
+#if XCHAL_HAVE_LOOPS
+	bnone	a8, a7, .Lz3		// if byte 3 is zero
+#else
+	bany	a8, a7, 1b		// repeat if byte 3 is non-zero
+#endif
+
+.Lz3:	/* Byte 3 is zero. */
+	addi	a3, a3, 3		// point to zero byte
+	/* Fall through.... */
+
+.Lz0:	/* Byte 0 is zero. */
+	sub	a2, a3, a2		// subtract to get length
+	retw
+
+.Lz1:	/* Byte 1 is zero. */
+	addi	a3, a3, 1		// point to zero byte
+	sub	a2, a3, a2		// subtract to get length
+	retw
+
+.Lz2:	/* Byte 2 is zero. */
+	addi	a3, a3, 2		// point to zero byte
+	sub	a2, a3, a2		// subtract to get length
+	retw
+
+libc_hidden_def (strlen)
diff --git a/libc/string/xtensa/strncpy.S b/libc/string/xtensa/strncpy.S
new file mode 100644
index 000000000..7ba2ef77d
--- /dev/null
+++ b/libc/string/xtensa/strncpy.S
@@ -0,0 +1,241 @@
+/* Optimized strncpy for Xtensa.
+   Copyright (C) 2001, 2007 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+   Boston, MA 02110-1301, USA. */
+
+#include "../../sysdeps/linux/xtensa/sysdep.h"
+#include <bits/xtensa-config.h>
+
+#ifdef __XTENSA_EB__
+#define MASK0 0xff000000
+#define MASK1 0x00ff0000
+#define MASK2 0x0000ff00
+#define MASK3 0x000000ff
+#else
+#define MASK0 0x000000ff
+#define MASK1 0x0000ff00
+#define MASK2 0x00ff0000
+#define MASK3 0xff000000
+#endif
+
+/* Do not use .literal_position in the ENTRY macro.
*/
+#undef LITERAL_POSITION
+#define LITERAL_POSITION
+
+	.text
+	.align	4
+	.literal_position
+__strncpy_aux:
+
+.Lsrc1mod2:	// src address is odd
+	l8ui	a8, a3, 0		// get byte 0
+	addi	a3, a3, 1		// advance src pointer
+	s8i	a8, a10, 0		// store byte 0
+	addi	a4, a4, -1		// decrement n
+	beqz	a4, .Lret		// if n is zero
+	addi	a10, a10, 1		// advance dst pointer
+	beqz	a8, .Lfill		// if byte 0 is zero
+	bbci.l	a3, 1, .Lsrcaligned	// if src is now word-aligned
+
+.Lsrc2mod4:	// src address is 2 mod 4
+	l8ui	a8, a3, 0		// get byte 0
+	addi	a4, a4, -1		// decrement n
+	s8i	a8, a10, 0		// store byte 0
+	beqz	a4, .Lret		// if n is zero
+	addi	a10, a10, 1		// advance dst pointer
+	beqz	a8, .Lfill		// if byte 0 is zero
+	l8ui	a8, a3, 1		// get byte 1
+	addi	a3, a3, 2		// advance src pointer
+	s8i	a8, a10, 0		// store byte 1 (dst was already advanced)
+	addi	a4, a4, -1		// decrement n
+	beqz	a4, .Lret		// if n is zero
+	addi	a10, a10, 1		// advance dst pointer
+	bnez	a8, .Lsrcaligned
+	j	.Lfill
+
+.Lret:
+	retw
+
+
+ENTRY (strncpy)
+	/* a2 = dst, a3 = src, a4 = n */
+
+	mov	a10, a2			// a10 = working dst; a2 stays as the return value
+	beqz	a4, .Lret		// if n is zero
+
+	movi	a11, MASK0		// a4 holds n, so the byte-0 mask lives in a11
+	movi	a5, MASK1
+	movi	a6, MASK2
+	movi	a7, MASK3
+	bbsi.l	a3, 0, .Lsrc1mod2
+	bbsi.l	a3, 1, .Lsrc2mod4
+.Lsrcaligned:
+
+	/* Check if the destination is aligned. */
+	movi	a8, 3
+	bnone	a10, a8, .Laligned
+
+	j	.Ldstunaligned
+
+
+/* Fill the dst with zeros -- n is at least 1. */
+
+.Lfill:
+	movi	a9, 0
+	bbsi.l	a10, 0, .Lfill1mod2
+	bbsi.l	a10, 1, .Lfill2mod4
+.Lfillaligned:
+	blti	a4, 4, .Lfillcleanup
+
+	/* Loop filling complete words with zero. */
+#if XCHAL_HAVE_LOOPS
+
+	srai	a8, a4, 2
+	loop	a8, 1f
+	s32i	a9, a10, 0
+	addi	a10, a10, 4
+
+1:	slli	a8, a8, 2
+	sub	a4, a4, a8
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+1:	s32i	a9, a10, 0
+	addi	a10, a10, 4
+	addi	a4, a4, -4
+	bgei	a4, 4, 1b
+
+#endif /* !XCHAL_HAVE_LOOPS */
+
+	beqz	a4, 2f
+
+.Lfillcleanup:
+	/* Fill leftover (1 to 3) bytes with zero. */
+	s8i	a9, a10, 0		// store a zero byte
+	addi	a4, a4, -1		// decrement n
+	addi	a10, a10, 1
+	bnez	a4, .Lfillcleanup
+
+2:	retw
+
+.Lfill1mod2:	// dst address is odd
+	s8i	a9, a10, 0		// store byte 0
+	addi	a4, a4, -1		// decrement n
+	beqz	a4, 2b			// if n is zero
+	addi	a10, a10, 1		// advance dst pointer
+	bbci.l	a10, 1, .Lfillaligned	// if dst is now word-aligned
+
+.Lfill2mod4:	// dst address is 2 mod 4
+	s8i	a9, a10, 0		// store byte 0
+	addi	a4, a4, -1		// decrement n
+	beqz	a4, 2b			// if n is zero
+	s8i	a9, a10, 1		// store byte 1
+	addi	a4, a4, -1		// decrement n
+	beqz	a4, 2b			// if n is zero
+	addi	a10, a10, 2		// advance dst pointer
+	j	.Lfillaligned
+
+
+/* dst is word-aligned; src is word-aligned; n is at least 1. */
+
+	.align	4
+	/* (2 mod 4) alignment for loop instruction */
+.Laligned:
+#if XCHAL_HAVE_LOOPS
+	_movi.n	a8, 0			// set up for the maximum loop count
+	loop	a8, 1f			// loop forever (almost anyway)
+	blti	a4, 5, .Ldstunaligned	// n is near limit; do one at a time
+	l32i	a8, a3, 0		// get word from src
+	addi	a3, a3, 4		// advance src pointer
+	bnone	a8, a11, .Lz0		// if byte 0 is zero
+	bnone	a8, a5, .Lz1		// if byte 1 is zero
+	bnone	a8, a6, .Lz2		// if byte 2 is zero
+	s32i	a8, a10, 0		// store word to dst
+	addi	a4, a4, -4		// decrement n
+	addi	a10, a10, 4		// advance dst pointer
+	bnone	a8, a7, .Lfill		// if byte 3 is zero
+1:
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+1:	blti	a4, 5, .Ldstunaligned	// n is near limit; do one at a time
+	l32i	a8, a3, 0		// get word from src
+	addi	a3, a3, 4		// advance src pointer
+	bnone	a8, a11, .Lz0		// if byte 0 is zero
+	bnone	a8, a5, .Lz1		// if byte 1 is zero
+	bnone	a8, a6, .Lz2		// if byte 2 is zero
+	s32i	a8, a10, 0		// store word to dst
+	addi	a4, a4, -4		// decrement n
+	addi	a10, a10, 4		// advance dst pointer
+	bany	a8, a7, 1b		// loop while byte 3 is nonzero (no zeroes in word)
+#endif /* !XCHAL_HAVE_LOOPS */
+
+	j	.Lfill
+
+.Lz0:	/* Byte 0 is zero. */
+#ifdef __XTENSA_EB__
+	movi	a8, 0
+#endif
+	s8i	a8, a10, 0
+	addi	a4, a4, -1		// decrement n
+	addi	a10, a10, 1		// advance dst pointer
+	j	.Lfill
+
+.Lz1:	/* Byte 1 is zero. */
+#ifdef __XTENSA_EB__
+	extui	a8, a8, 16, 16
+#endif
+	s16i	a8, a10, 0
+	addi	a4, a4, -2		// decrement n
+	addi	a10, a10, 2		// advance dst pointer
+	j	.Lfill
+
+.Lz2:	/* Byte 2 is zero. */
+#ifdef __XTENSA_EB__
+	extui	a8, a8, 16, 16
+#endif
+	s16i	a8, a10, 0
+	movi	a8, 0
+	s8i	a8, a10, 2
+	addi	a4, a4, -3		// decrement n
+	addi	a10, a10, 3		// advance dst pointer
+	j	.Lfill
+
+	.align	4
+	/* (2 mod 4) alignment for loop instruction */
+.Ldstunaligned:
+
+#if XCHAL_HAVE_LOOPS
+	_movi.n	a8, 0			// set up for the maximum loop count
+	loop	a8, 2f			// loop forever (almost anyway)
+#endif
+1:	l8ui	a8, a3, 0
+	addi	a3, a3, 1
+	s8i	a8, a10, 0
+	addi	a4, a4, -1
+	beqz	a4, 3f
+	addi	a10, a10, 1
+#if XCHAL_HAVE_LOOPS
+	beqz	a8, 2f
+#else
+	bnez	a8, 1b
+#endif
+2:	j	.Lfill
+
+3:	retw
+
+libc_hidden_def (strncpy)
-- cgit v1.2.3