/* Optimized memset for Xtensa.
   Copyright (C) 2001, 2007 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
   Boston, MA 02110-1301, USA.  */

#include <sysdep.h>
#include <bits/xtensa-config.h>

/* Do not use .literal_position in the ENTRY macro.  */
#undef LITERAL_POSITION
#define LITERAL_POSITION

/* void *memset (void *dst, int c, size_t length)

   The algorithm is as follows:

   Create a word with c in all byte positions.

   If the destination is aligned, set 16B chunks with a loop, and then
   finish up with 8B, 4B, 2B, and 1B stores conditional on the length.

   If the destination is unaligned, align it by conditionally
   setting 1B and/or 2B and then go to the aligned case.  Unaligned
   requests with fewer than 8 bytes left are instead set one byte at
   a time.

   This code tries to use fall-through branches for the common
   case of an aligned destination (except for the branches to
   the alignment labels).  */


/* Byte-by-byte set.  */

	.text
	.align	4
	.literal_position
__memset_aux:

	/* Skip a byte to get 1 mod 4 alignment for LOOPNEZ
	   (0 mod 4 alignment for LBEG).  */
	.byte	0

	/* .Lbyteset: store the fill byte a4 times starting at a5, then
	   return to memset's caller.

	   On entry:
	     a3 = fill value (only the low byte is stored here)
	     a4 = number of bytes to set (may be zero)
	     a5 = address of the first byte to set
	   a2 is not touched, so it still holds the value memset returns.
	   Without zero-overhead loops a6 is used as the end address.  */
.Lbyteset:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, 2f		/* skip the loop body when a4 == 0 */
#else
	beqz	a4, 2f		/* nothing to do for a zero length */
	add	a6, a5, a4	/* a6 = ending address */
#endif
1:	s8i	a3, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	/* a5 advances by exactly one and a6 = start + count, so a5 is
	   guaranteed to hit a6.  Test for equality rather than signed
	   less-than: addresses at or above 0x80000000 are negative when
	   treated as signed, so a signed test would leave the loop early
	   for a range that crosses that boundary (or whose end address
	   wraps around to zero).  */
	bne	a5, a6, 1b
#endif
2:	retw


/* Destination is unaligned.  */

	.align	4

/* Alignment fix-up for memset, entered by branch from the alignment
   tests in memset rather than by a call.

   Register state on entry to either label:
     a3 = fill byte replicated into every byte of the word
     a4 = number of bytes still to set
     a5 = current destination address
   Both paths finish by jumping to .Ldstaligned with a5 word-aligned
   and a4 reduced by the bytes already stored, or by handing the whole
   remaining request to .Lbyteset when fewer than 8 bytes are left.  */

.Ldst1mod2: /* dst is only byte aligned (bit 0 of the address is set) */

	/* Do short sizes byte-by-byte.  .Lbyteset returns directly, so
	   control does not come back here.  */
	bltui	a4, 8, .Lbyteset

	/* Set 1 byte; this makes the address even.  */
	s8i	a3, a5, 0
	addi	a5, a5, 1
	addi	a4, a4, -1

	/* Now retest if dst is aligned: with bit 0 clear, bit 1 clear
	   means the address is a multiple of 4.  Otherwise fall through
	   to the 2-byte fix-up below.  (The leading underscore asks the
	   assembler to emit the instruction as written.)  */
	_bbci.l	a5, 1, .Ldstaligned

.Ldst2mod4: /* dst has 16-bit alignment (bit 0 clear, bit 1 set) */

	/* Do short sizes byte-by-byte.  This test is repeated because
	   the path falling through from above has already consumed one
	   byte of the length.  */
	bltui	a4, 8, .Lbyteset

	/* Set 2 bytes with one halfword store; the address is even.  */
	s16i	a3, a5, 0
	addi	a5, a5, 2
	addi	a4, a4, -2

	/* dst is now aligned; return to main algorithm */
	j	.Ldstaligned


ENTRY (memset)
	/* void *memset (void *dst, int c, size_t length)

	   On entry: a2 = dst, a3 = c, a4 = length.
	   Returns:  a2 = dst (never modified below).

	   Working registers:
	     a3 = c replicated into all four bytes of a word
	     a4 = bytes remaining (only its low four bits matter once
		  the 16-byte loop has run)
	     a5 = current destination address
	     a6 = loop end address (only without zero-overhead loops)
	     a7 = scratch / number of 16-byte iterations  */

	/* Duplicate character into all bytes of word.  */
	extui	a3, a3, 0, 8	/* keep only the low 8 bits of c */
	slli	a7, a3, 8
	or	a3, a3, a7	/* a3 = 0x0000cccc */
	slli	a7, a3, 16
	or	a3, a3, a7	/* a3 = 0xcccccccc */

	mov	a5, a2		/* copy dst so that a2 is return value */

	/* Check if dst is unaligned.  The fix-up code stores 1 and/or 2
	   bytes and jumps back to .Ldstaligned, or finishes the whole
	   request itself for short lengths.  */
	_bbsi.l	a2, 0, .Ldst1mod2
	_bbsi.l	a2, 1, .Ldst2mod4
.Ldstaligned:

	/* Get number of loop iterations with 16B per iteration.  */
	srli	a7, a4, 4

	/* Destination is word-aligned.  */
#if XCHAL_HAVE_LOOPS
	loopnez	a7, 2f
#else
	beqz	a7, 2f
	slli	a6, a7, 4
	add	a6, a6, a5	/* a6 = end of last 16B chunk */
#endif
	/* Set 16 bytes per iteration.  */
1:	s32i	a3, a5, 0
	s32i	a3, a5, 4
	s32i	a3, a5, 8
	s32i	a3, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	/* a5 advances by exactly 16 and a6 = start + 16 * iterations, so
	   a5 is guaranteed to hit a6.  Test for equality rather than
	   signed less-than: addresses at or above 0x80000000 are negative
	   when treated as signed, so a signed test would leave the loop
	   early for a range that crosses that boundary (or whose end
	   address wraps around to zero).  */
	bne	a5, a6, 1b
#endif

	/* Set any leftover pieces smaller than 16B.  Bits 3..0 of the
	   length select an 8-, 4-, 2- and 1-byte store in turn; storing
	   the larger pieces first keeps each store naturally aligned.  */
2:	bbci.l	a4, 3, 3f

	/* Set 8 bytes.  */
	s32i	a3, a5, 0
	s32i	a3, a5, 4
	addi	a5, a5, 8

3:	bbci.l	a4, 2, 4f

	/* Set 4 bytes.  */
	s32i	a3, a5, 0
	addi	a5, a5, 4

4:	bbci.l	a4, 1, 5f

	/* Set 2 bytes.  */
	s16i	a3, a5, 0
	addi	a5, a5, 2

5:	bbci.l	a4, 0, 6f

	/* Set 1 byte.  */
	s8i	a3, a5, 0
6:	retw

libc_hidden_def (memset)