patch-2.1.44 linux/arch/sparc64/lib/VISmemset.S

Next file: linux/arch/sparc64/lib/blockops.S
Previous file: linux/arch/sparc64/lib/VIScopy.S
Back to the patch index
Back to the overall index

diff -u --recursive --new-file v2.1.43/linux/arch/sparc64/lib/VISmemset.S linux/arch/sparc64/lib/VISmemset.S
@@ -0,0 +1,228 @@
+/* $Id: VISmemset.S,v 1.4 1997/07/02 19:00:39 jj Exp $
+ * VISmemset.S: High speed memset operations utilizing the UltraSparc
+ *        Visual Instruction Set.
+ *
+ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 1996, 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ */
+
+#include "VIS.h"
+
+/* SET_BLOCKS(base, offset, source): store 32 bytes of the replicated
+ * fill pattern `source` into [base - offset - 0x18, base - offset + 0x08):
+ * four 8-byte stx with 64-bit registers, otherwise eight 4-byte stw
+ * covering the same span.  Expanded back-to-back in the computed-jump
+ * tail of memset below, so the per-8-bytes instruction count (one stx
+ * vs. two stw) is what the jmpl offset scaling there relies on.
+ */
+#ifdef REGS_64BIT
+#define SET_BLOCKS(base, offset, source)	\
+	stx	source, [base - offset - 0x18];	\
+	stx	source, [base - offset - 0x10];	\
+	stx	source, [base - offset - 0x08];	\
+	stx	source, [base - offset - 0x00];
+#else
+#define SET_BLOCKS(base, offset, source)	\
+	stw	source, [base - offset - 0x18];	\
+	stw	source, [base - offset - 0x14];	\
+	stw	source, [base - offset - 0x10];	\
+	stw	source, [base - offset - 0x0c];	\
+	stw	source, [base - offset - 0x08];	\
+	stw	source, [base - offset - 0x04];	\
+	stw	source, [base - offset - 0x00];	\
+	stw	source, [base - offset + 0x04];
+#endif
+
+#ifndef __KERNEL__
+/* So that the brz,a,pt in memset doesn't have to get through PLT, here we go... */
+#include "VISbzero.S"
+#endif
+
+/* RETL supplies the value left in %o0 in the delay slot of the final
+ * retl.  Userland returns the original dst pointer saved in %g3 at
+ * entry; the kernel build clears %o0 instead.
+ * NOTE(review): the kernel variant thus does not return dst the way
+ * ISO memset does -- presumably no kernel caller uses the return
+ * value; confirm before relying on it.
+ */
+#ifdef __KERNEL__
+#define RETL	clr %o0
+#else
+#define RETL	mov %g3, %o0
+#endif
+
+	/* Well, memset is a lot easier to get right than bcopy... */
+	.text
+	.align		32
+#ifdef __KERNEL__
+	.globl		__memset
+__memset:
+#endif
+	.globl		memset
+	/* memset(%o0 = dst, %o1 = fill byte, %o2 = count).
+	 * Strategy: align dst, replicate the fill byte into a full
+	 * register, do the bulk with 64-byte block stores through
+	 * ASI_BLK_P using the FP registers, then finish the tail with
+	 * ordinary integer stores.
+	 */
+memset:
+#ifndef __KERNEL__
+	/* Userland: a zero fill byte is handed to the private bzero
+	 * entry point (from VISbzero.S included above, so the branch
+	 * need not go through the PLT).
+	 */
+	brz,a,pt	%o1, bzero_private
+	 mov		%o2, %o1
+#ifndef REGS_64BIT
+	/* 32-bit mode: make sure the count is zero-extended. */
+	srl		%o2, 0, %o2
+#endif
+	/* Save dst so RETL can return it from %g3. */
+	mov		%o0, %g3
+#endif
+	/* Counts of at most 7 bytes take the trivial byte loop at 17. */
+	cmp		%o2, 7
+	bleu,pn		%xcc, 17f
+	 andcc		%o0, 3, %g5
+	be,pt		%xcc, 4f
+	 and		%o1, 0xff, %o1
+	/* Store 1-3 bytes so dst becomes 4-byte aligned
+	 * (%g5 = dst & 3: 3 -> 1 byte, 2 -> 2 bytes, 1 -> 3 bytes).
+	 */
+	cmp		%g5, 3
+	be,pn		%xcc, 2f
+	 stb		%o1, [%o0 + 0x00]
+	cmp		%g5, 2
+	be,pt		%xcc, 2f
+	 stb		%o1, [%o0 + 0x01]
+	stb		%o1, [%o0 + 0x02]
+	/* %g5 - 4 is minus the number of bytes just stored: advance
+	 * dst past them and shrink the count accordingly.
+	 */
+2:	sub		%g5, 4, %g5
+	sub		%o0, %g5, %o0
+	add		%o2, %g5, %o2
+	/* Replicate the byte into a 32-bit pattern in %o1. */
+4:	sllx		%o1, 8, %g1
+	andcc		%o0, 4, %g0
+	or		%o1, %g1, %o1
+	sllx		%o1, 16, %g1
+	or		%o1, %g1, %o1
+	be,pt		%xcc, 2f
+#ifdef REGS_64BIT
+	 sllx		%o1, 32, %g1
+#else
+	 cmp		%o2, 128
+#endif
+	/* dst & 4 was set: one word store reaches 8-byte alignment. */
+	stw		%o1, [%o0]
+	sub		%o2, 4, %o2
+	add		%o0, 4, %o0
+2:
+#ifdef REGS_64BIT
+	/* Complete the 64-bit pattern; below 128 bytes the block
+	 * store setup is not worth it, go to the tail code at 9.
+	 */
+	cmp		%o2, 128
+	or		%o1, %g1, %o1
+#endif
+	blu,pn		%xcc, 9f
+	 andcc		%o0, 0x38, %g5
+	/* Align dst to 64 bytes: %o5 = 64 - (dst & 0x38) bytes still
+	 * to go, stored as 8-, 16- and 32-byte pieces below.  Already
+	 * aligned dsts short-circuit to 6.
+	 */
+	be,pn		%icc, 6f
+	 mov		64, %o5
+	andcc		%o0, 8, %g0
+	be,pn		%icc, 1f
+	 sub		%o5, %g5, %o5
+#ifdef REGS_64BIT
+	stx		%o1, [%o0]
+#else
+	stw		%o1, [%o0]
+	stw		%o1, [%o0 + 4]
+#endif
+	add		%o0, 8, %o0
+1:	andcc		%o5, 16, %g0
+	be,pn		%icc, 1f
+	 sub		%o2, %o5, %o2
+#ifdef REGS_64BIT
+	stx		%o1, [%o0]
+	stx		%o1, [%o0 + 8]
+#else
+	stw		%o1, [%o0]
+	stw		%o1, [%o0 + 4]
+	stw		%o1, [%o0 + 8]
+	stw		%o1, [%o0 + 12]
+#endif
+	add		%o0, 16, %o0
+1:	andcc		%o5, 32, %g0
+	be,pn		%icc, 7f
+	 andncc		%o2, 0x3f, %o3
+#ifdef REGS_64BIT
+	stx		%o1, [%o0]
+	stx		%o1, [%o0 + 8]
+	stx		%o1, [%o0 + 16]
+	stx		%o1, [%o0 + 24]
+#else
+	stw		%o1, [%o0]
+	stw		%o1, [%o0 + 4]
+	stw		%o1, [%o0 + 8]
+	stw		%o1, [%o0 + 12]
+	stw		%o1, [%o0 + 16]
+	stw		%o1, [%o0 + 20]
+	stw		%o1, [%o0 + 24]
+	stw		%o1, [%o0 + 28]
+#endif
+	add		%o0, 32, %o0
+	/* %o3 = remaining count & ~0x3f = bytes to do with block
+	 * stores; if zero, go straight to the tail at 9.  In the
+	 * kernel the delay slot enables the FPU on the way.
+	 */
+7:	be,pn		%xcc, 9f
+#ifdef __KERNEL__
+	 wr		%g0, FPRS_FEF, %fprs
+#endif
+	/* Reload the just-stored pattern from memory into %f0,
+	 * avoiding an integer-to-FP register transfer.
+	 */
+	ldd		[%o0 - 8], %f0
+18:	wr		%g0, ASI_BLK_P, %asi
+	membar		#StoreStore | #LoadStore
+	/* Spread the pattern over %f0-%f14 (one 64-byte block store's
+	 * worth of FP registers).  %g5 = leading block bytes
+	 * (0/64/128/192) before the unrolled loop, %o2 = final
+	 * sub-64-byte remainder, %o3 = bytes for the 256-byte loop.
+	 */
+	andcc		%o3, 0xc0, %g5
+	and		%o2, 0x3f, %o2
+	fmovd		%f0, %f2
+	fmovd		%f0, %f4
+	andn		%o3, 0xff, %o3
+	fmovd		%f0, %f6
+	cmp		%g5, 64
+	fmovd		%f0, %f8
+	fmovd		%f0, %f10
+	fmovd		%f0, %f12
+	brz,pn		%g5, 10f
+	 fmovd		%f0, %f14
+	/* Emit one, two or three leading 64-byte block stores. */
+	be,pn		%icc, 2f
+	 stda		%f0, [%o0 + 0x00] %asi
+	cmp		%g5, 128
+	be,pn		%icc, 2f
+	 stda		%f0, [%o0 + 0x40] %asi
+	stda		%f0, [%o0 + 0x80] %asi
+2:	brz,pn		%o3, 12f
+	 add		%o0, %g5, %o0
+	/* Main loop: 256 bytes (four 64-byte block stores) per pass. */
+10:	stda		%f0, [%o0 + 0x00] %asi
+	stda		%f0, [%o0 + 0x40] %asi
+	stda		%f0, [%o0 + 0x80] %asi
+	stda		%f0, [%o0 + 0xc0] %asi
+11:	subcc		%o3, 256, %o3
+	bne,pt		%xcc, 10b
+	 add		%o0, 256, %o0
+12:
+#ifdef __KERNEL__
+	/* Kernel: done with the FPU. */
+	wr		%g0, 0, %fprs
+#endif
+	membar		#Sync
+	/* Tail: fewer than 64 bytes left.  %g5 = %o2 & 0x78 bytes are
+	 * written via a computed jump into the SET_BLOCKS run below;
+	 * the entry point is picked so exactly %g5 bytes get stored,
+	 * addressed backwards from %o0 + %g5.  The offset scaling
+	 * matches the instruction bytes per 8 data bytes (64-bit: one
+	 * 4-byte stx, hence %g5 / 2; 32-bit: two 4-byte stw, hence
+	 * %g5 unscaled).
+	 */
+9:	andcc		%o2, 0x78, %g5
+	be,pn		%xcc, 13f
+	 andcc		%o2, 7, %o2
+14:	rd		%pc, %o4
+#ifdef REGS_64BIT
+	srl		%g5, 1, %o3
+	sub		%o4, %o3, %o4
+#else
+	sub		%o4, %g5, %o4
+#endif
+	jmpl		%o4 + (13f - 14b), %g0
+	 add		%o0, %g5, %o0
+	/* NOTE(review): this "12:" reuses a numeric local label that
+	 * also appears above; the jmpl computes its target from %pc,
+	 * so this definition appears to be only a marker.
+	 */
+12:	SET_BLOCKS(%o0, 0x68, %o1)
+	SET_BLOCKS(%o0, 0x48, %o1)
+	SET_BLOCKS(%o0, 0x28, %o1)
+	SET_BLOCKS(%o0, 0x08, %o1)
+	/* Last 0-7 bytes: word, then halfword, then byte, as the low
+	 * bits of the count dictate.
+	 */
+13:	be,pn		%xcc, 8f
+	 andcc		%o2, 4, %g0
+	be,pn		%xcc, 1f
+	 andcc		%o2, 2, %g0
+	stw		%o1, [%o0]
+	add		%o0, 4, %o0
+1:	be,pn		%xcc, 1f
+	 andcc		%o2, 1, %g0
+	sth		%o1, [%o0]
+	add		%o0, 2, %o0
+1:	bne,a,pn	%xcc, 8f
+	 stb		%o1, [%o0]
+8:	retl
+	 RETL
+	/* Short case: at most 7 bytes, plain byte loop (a count of
+	 * zero exits straight through 0).
+	 */
+17:	brz,pn		%o2, 0f
+8:	 add		%o0, 1, %o0
+	subcc		%o2, 1, %o2
+	bne,pt		%xcc, 8b
+	 stb		%o1, [%o0 - 1]
+0:	retl
+	 RETL
+	/* dst was already 64-byte aligned.  Store the pattern once so
+	 * the ldd in the delay slot below can fetch it into %f0, then
+	 * join the block-store setup at 18 (or bail to the tail at 9
+	 * when no full 64-byte block remains).
+	 */
+6:
+#ifdef REGS_64BIT
+	stx		%o1, [%o0]
+#else
+	stw		%o1, [%o0]
+	stw		%o1, [%o0 + 4]
+#endif
+	andncc		%o2, 0x3f, %o3
+	be,pn		%xcc, 9b
+#ifdef __KERNEL__
+	 wr		%g0, FPRS_FEF, %fprs
+#else
+	 nop
+#endif
+	ba,pt		%xcc, 18b
+	 ldd		[%o0], %f0

FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen, slshen@lbl.gov