/*
 * Copyright (C) 2004 Joakim Tjernlund
 * Copyright (C) 2000-2005 Erik Andersen <andersen@uclibc.org>
 *
 * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
 */

/* These are carefully optimized mem*() functions for PPC written in C.
 * Don't muck around with these function without checking the generated
 * assembler code.
 * It is possible to optimize these significantly more by using specific
 * data cache instructions(mainly dcbz). However that requires knownledge
 * about the CPU's cache line size.
 *
 * BUG ALERT!
 * The cache instructions on MPC8xx CPU's are buggy(they don't update
 * the DAR register when causing a DTLB Miss/Error) and cannot be
 * used on 8xx CPU's without a kernel patch to work around this
 * problem.
 */

#include <string.h>


static __inline__ int expand_byte_word(int c){
	/* this does:
	   c = c << 8 | c;
	   c = c << 16 | c ;
	*/
	__asm__("rlwimi	%0,%0,8,16,23\n"
	    "\trlwimi	%0,%0,16,0,15\n"
	    : "=r" (c) : "0" (c));
	return c;
}

void *memset(void *to, int c, size_t n)
{
	unsigned long rem, chunks;
	unsigned char *tmp_to;

	chunks = n / 8;
	tmp_to = to - 4;
	c = expand_byte_word(c);
	if (!chunks)
		goto lessthan8;
	rem = (unsigned long )tmp_to % 4;
	if (rem)
		goto align;
 copy_chunks:
	do {
		*(unsigned long *)(tmp_to+4) = c;
		tmp_to += 4;
		*(unsigned long *)(tmp_to+4) = c;
		tmp_to += 4;
	} while (--chunks);
 lessthan8:
	n = n % 8;
	if (n >= 4) {
		*(unsigned long *)(tmp_to+4) = c;
		tmp_to += 4;
		n = n-4;
	}
	if (!n ) return to;
	tmp_to += 3;
	do {
		*++tmp_to = c;
	} while (--n);

	return to;
 align:
	rem = 4 - rem;
	n = n-rem;
	do {
		*(tmp_to+4) = c;
		++tmp_to;
	} while (--rem);
	chunks = n / 8;
	if (chunks)
		goto copy_chunks;
	goto lessthan8;
}
libc_hidden_def(memset)