diff options
Diffstat (limited to 'libc/string/xtensa/strcmp.S')
-rw-r--r-- | libc/string/xtensa/strcmp.S | 313 |
1 files changed, 313 insertions, 0 deletions
diff --git a/libc/string/xtensa/strcmp.S b/libc/string/xtensa/strcmp.S new file mode 100644 index 000000000..90c418d12 --- /dev/null +++ b/libc/string/xtensa/strcmp.S @@ -0,0 +1,313 @@ +/* Optimized strcmp for Xtensa. + Copyright (C) 2001, 2007 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 51 Franklin Street - Fifth Floor, + Boston, MA 02110-1301, USA. */ + +#include "../../sysdeps/linux/xtensa/sysdep.h" +#include <bits/xtensa-config.h> + +#ifdef __XTENSA_EB__ +#define MASK0 0xff000000 +#define MASK1 0x00ff0000 +#define MASK2 0x0000ff00 +#define MASK3 0x000000ff +#else +#define MASK0 0x000000ff +#define MASK1 0x0000ff00 +#define MASK2 0x00ff0000 +#define MASK3 0xff000000 +#endif + +#define MASK4 0x40404040 + + .literal .Lmask0, MASK0 + .literal .Lmask1, MASK1 + .literal .Lmask2, MASK2 + .literal .Lmask3, MASK3 + .literal .Lmask4, MASK4 + + .text +ENTRY (strcmp) + /* a2 = s1, a3 = s2 */ + + l8ui a8, a2, 0 // byte 0 from s1 + l8ui a9, a3, 0 // byte 0 from s2 + movi a10, 3 // mask + bne a8, a9, .Lretdiff + + or a11, a2, a3 + bnone a11, a10, .Laligned + + xor a11, a2, a3 // compare low two bits of s1 and s2 + bany a11, a10, .Lunaligned // if they have different alignment + + /* s1/s2 are not word-aligned. */ + addi a2, a2, 1 // advance s1 + beqz a8, .Leq // bytes equal, if zero, strings are equal + addi a3, a3, 1 // advance s2 + bnone a2, a10, .Laligned // if s1/s2 now aligned + l8ui a8, a2, 0 // byte 1 from s1 + l8ui a9, a3, 0 // byte 1 from s2 + addi a2, a2, 1 // advance s1 + bne a8, a9, .Lretdiff // if different, return difference + beqz a8, .Leq // bytes equal, if zero, strings are equal + addi a3, a3, 1 // advance s2 + bnone a2, a10, .Laligned // if s1/s2 now aligned + l8ui a8, a2, 0 // byte 2 from s1 + l8ui a9, a3, 0 // byte 2 from s2 + addi a2, a2, 1 // advance s1 + bne a8, a9, .Lretdiff // if different, return difference + beqz a8, .Leq // bytes equal, if zero, strings are equal + addi a3, a3, 1 // advance s2 + j .Laligned + +/* s1 and s2 have different alignment. + + If the zero-overhead loop option is available, use an (almost) + infinite zero-overhead loop with conditional exits so we only pay + for taken branches when exiting the loop. + + Note: It is important for this unaligned case to come before the + code for aligned strings, because otherwise some of the branches + above cannot reach and have to be transformed to branches around + jumps. The unaligned code is smaller and the branches can reach + over it. */ + + .align 4 + /* (2 mod 4) alignment for loop instruction */ +.Lunaligned: +#if XCHAL_HAVE_LOOPS + _movi.n a8, 0 // set up for the maximum loop count + loop a8, .Lretdiff // loop forever (almost anyway) +#endif +.Lnextbyte: + l8ui a8, a2, 0 + l8ui a9, a3, 0 + addi a2, a2, 1 + bne a8, a9, .Lretdiff + addi a3, a3, 1 +#if XCHAL_HAVE_LOOPS + beqz a8, .Lretdiff +#else + bnez a8, .Lnextbyte +#endif +.Lretdiff: + sub a2, a8, a9 + retw + +/* s1 is word-aligned; s2 is word-aligned. + + If the zero-overhead loop option is available, use an (almost) + infinite zero-overhead loop with conditional exits so we only pay + for taken branches when exiting the loop. */ + +/* New algorithm, relying on the fact that all normal ASCII is between + 32 and 127. + + Rather than check all bytes for zero: + Take one word (4 bytes). Call it w1. + Shift w1 left by one into w1'. + Or w1 and w1'. For all normal ASCII bit 6 will be 1; for zero it won't. + Check that all 4 bit 6's (one for each byte) are one: + If they are, we are definitely not done. + If they are not, we are probably done, but need to check for zero. */ + + .align 4 +#if XCHAL_HAVE_LOOPS +.Laligned: + .begin no-transform + l32r a4, .Lmask0 // mask for byte 0 + l32r a7, .Lmask4 + /* Loop forever. (a4 is more than than the maximum number + of iterations) */ + loop a4, .Laligned_done + + /* First unrolled loop body. */ + l32i a8, a2, 0 // get word from s1 + l32i a9, a3, 0 // get word from s2 + slli a5, a8, 1 + bne a8, a9, .Lwne2 + or a9, a8, a5 + bnall a9, a7, .Lprobeq + + /* Second unrolled loop body. */ + l32i a8, a2, 4 // get word from s1+4 + l32i a9, a3, 4 // get word from s2+4 + slli a5, a8, 1 + bne a8, a9, .Lwne2 + or a9, a8, a5 + bnall a9, a7, .Lprobeq2 + + addi a2, a2, 8 // advance s1 pointer + addi a3, a3, 8 // advance s2 pointer +.Laligned_done: + or a1, a1, a1 // nop + +.Lprobeq2: + /* Adjust pointers to account for the loop unrolling. */ + addi a2, a2, 4 + addi a3, a3, 4 + +#else /* !XCHAL_HAVE_LOOPS */ + +.Laligned: + movi a4, MASK0 // mask for byte 0 + movi a7, MASK4 + j .Lfirstword +.Lnextword: + addi a2, a2, 4 // advance s1 pointer + addi a3, a3, 4 // advance s2 pointer +.Lfirstword: + l32i a8, a2, 0 // get word from s1 + l32i a9, a3, 0 // get word from s2 + slli a5, a8, 1 + bne a8, a9, .Lwne2 + or a9, a8, a5 + ball a9, a7, .Lnextword +#endif /* !XCHAL_HAVE_LOOPS */ + + /* align (0 mod 4) */ +.Lprobeq: + /* Words are probably equal, but check for sure. + If not, loop over the rest of string using normal algorithm. */ + + bnone a8, a4, .Leq // if byte 0 is zero + l32r a5, .Lmask1 // mask for byte 1 + l32r a6, .Lmask2 // mask for byte 2 + bnone a8, a5, .Leq // if byte 1 is zero + l32r a7, .Lmask3 // mask for byte 3 + bnone a8, a6, .Leq // if byte 2 is zero + bnone a8, a7, .Leq // if byte 3 is zero + addi.n a2, a2, 4 // advance s1 pointer + addi.n a3, a3, 4 // advance s2 pointer +#if XCHAL_HAVE_LOOPS + + /* align (1 mod 4) */ + loop a4, .Leq // loop forever (a4 is bigger than max iters) + .end no-transform + + l32i a8, a2, 0 // get word from s1 + l32i a9, a3, 0 // get word from s2 + addi a2, a2, 4 // advance s1 pointer + bne a8, a9, .Lwne + bnone a8, a4, .Leq // if byte 0 is zero + bnone a8, a5, .Leq // if byte 1 is zero + bnone a8, a6, .Leq // if byte 2 is zero + bnone a8, a7, .Leq // if byte 3 is zero + addi a3, a3, 4 // advance s2 pointer + +#else /* !XCHAL_HAVE_LOOPS */ + + j .Lfirstword2 +.Lnextword2: + addi a3, a3, 4 // advance s2 pointer +.Lfirstword2: + l32i a8, a2, 0 // get word from s1 + l32i a9, a3, 0 // get word from s2 + addi a2, a2, 4 // advance s1 pointer + bne a8, a9, .Lwne + bnone a8, a4, .Leq // if byte 0 is zero + bnone a8, a5, .Leq // if byte 1 is zero + bnone a8, a6, .Leq // if byte 2 is zero + bany a8, a7, .Lnextword2 // if byte 3 is zero +#endif /* !XCHAL_HAVE_LOOPS */ + + /* Words are equal; some byte is zero. */ +.Leq: movi a2, 0 // return equal + retw + +.Lwne2: /* Words are not equal. On big-endian processors, if none of the + bytes are zero, the return value can be determined by a simple + comparison. */ +#ifdef __XTENSA_EB__ + or a10, a8, a5 + bnall a10, a7, .Lsomezero + bgeu a8, a9, .Lposreturn + movi a2, -1 + retw +.Lposreturn: + movi a2, 1 + retw +.Lsomezero: // There is probably some zero byte. +#endif /* __XTENSA_EB__ */ +.Lwne: /* Words are not equal. */ + xor a2, a8, a9 // get word with nonzero in byte that differs + bany a2, a4, .Ldiff0 // if byte 0 differs + movi a5, MASK1 // mask for byte 1 + bnone a8, a4, .Leq // if byte 0 is zero + bany a2, a5, .Ldiff1 // if byte 1 differs + movi a6, MASK2 // mask for byte 2 + bnone a8, a5, .Leq // if byte 1 is zero + bany a2, a6, .Ldiff2 // if byte 2 differs + bnone a8, a6, .Leq // if byte 2 is zero +#ifdef __XTENSA_EB__ +.Ldiff3: +.Ldiff2: +.Ldiff1: + /* Byte 0 is equal (at least) and there is a difference before a zero + byte. Just subtract words to get the return value. + The high order equal bytes cancel, leaving room for the sign. */ + sub a2, a8, a9 + retw + +.Ldiff0: + /* Need to make room for the sign, so can't subtract whole words. */ + extui a10, a8, 24, 8 + extui a11, a9, 24, 8 + sub a2, a10, a11 + retw + +#else /* !__XTENSA_EB__ */ + /* Little-endian is a little more difficult because can't subtract + whole words. */ +.Ldiff3: + /* Bytes 0-2 are equal; byte 3 is different. + For little-endian need to have a sign bit for the difference. */ + extui a10, a8, 24, 8 + extui a11, a9, 24, 8 + sub a2, a10, a11 + retw + +.Ldiff0: + /* Byte 0 is different. */ + extui a10, a8, 0, 8 + extui a11, a9, 0, 8 + sub a2, a10, a11 + retw + +.Ldiff1: + /* Byte 0 is equal; byte 1 is different. */ + extui a10, a8, 8, 8 + extui a11, a9, 8, 8 + sub a2, a10, a11 + retw + +.Ldiff2: + /* Bytes 0-1 are equal; byte 2 is different. */ + extui a10, a8, 16, 8 + extui a11, a9, 16, 8 + sub a2, a10, a11 + retw + +#endif /* !__XTENSA_EB */ + +libc_hidden_def (strcmp) + +#ifndef __UCLIBC_HAS_LOCALE__ +strong_alias (strcmp, strcoll) +libc_hidden_def (strcoll) +#endif |