patch-2.4.4 linux/arch/ia64/lib/copy_user.S


diff -u --recursive --new-file v2.4.3/linux/arch/ia64/lib/copy_user.S linux/arch/ia64/lib/copy_user.S
@@ -12,41 +12,25 @@
  *
  * Inputs:
  *	in0	address of source buffer
- * 	in1	address of destination buffer
+ *	in1	address of destination buffer
  *	in2	number of bytes to copy
  *
- * Outputs: 
- * 	ret0	0 in case of success. The number of bytes NOT copied in
- * 		case of error.
+ * Outputs:
+ *	ret0	0 in case of success. The number of bytes NOT copied in
+ *		case of error.
  *
  * Copyright (C) 2000 Hewlett-Packard Co
  * Copyright (C) 2000 Stephane Eranian <eranian@hpl.hp.com>
  *
  * Fixme:
 *	- handle the case where we have more than 16 bytes and the alignments
- * 	  are different.
+ *	  are different.
  *	- more benchmarking
- * 	- fix extraneous stop bit introduced by the EX() macro.
+ *	- fix extraneous stop bit introduced by the EX() macro.
  */
 
 #include <asm/asmmacro.h>
 
-// The label comes first because our store instruction contains a comma
-// and confuses the preprocessor otherwise
-//
-#undef DEBUG
-#ifdef DEBUG
-#define EX(y,x...)				\
-99:	x
-#else
-#define EX(y,x...)				\
-	.section __ex_table,"a";		\
-	data4 @gprel(99f);			\
-	data4 y-99f;				\
-	.previous;				\
-99:	x
-#endif
-
 //
 // Tuneable parameters
 //
@@ -85,13 +69,10 @@
 #define enddst		r29
 #define endsrc		r30
 #define saved_pfs	r31
- 	.text
- 	.psr	abi64
- 	.psr	lsb
 
 GLOBAL_ENTRY(__copy_user)
-	UNW(.prologue)
-	UNW(.save ar.pfs, saved_pfs)
+	.prologue
+	.save ar.pfs, saved_pfs
 	alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
 
 	.rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
@@ -102,16 +83,16 @@
 
 	;;			// RAW of cfm when len=0
 	cmp.eq p8,p0=r0,len	// check for zero length
-	UNW(.save ar.lc, saved_lc)
+	.save ar.lc, saved_lc
 	mov saved_lc=ar.lc	// preserve ar.lc (slow)
 (p8)	br.ret.spnt.few rp	// empty memcpy()
 	;;
 	add enddst=dst,len	// first byte after end of destination
 	add endsrc=src,len	// first byte after end of source
-	UNW(.save pr, saved_pr)
+	.save pr, saved_pr
 	mov saved_pr=pr		// preserve predicates
 
-	UNW(.body)
+	.body
 
 	mov dst1=dst		// copy because of rotation
 	mov ar.ec=PIPE_DEPTH
@@ -119,7 +100,7 @@
 
 	mov src1=src		// copy because of rotation
 	mov ar.lc=len2		// initialize lc for small count
-	cmp.lt p10,p7=COPY_BREAK,len	// if len > COPY_BREAK then long copy 
+	cmp.lt p10,p7=COPY_BREAK,len	// if len > COPY_BREAK then long copy
 
 	xor tmp=src,dst		// same alignment test prepare
 (p10)	br.cond.dptk.few long_copy_user
@@ -128,9 +109,8 @@
 	// Now we do the byte by byte loop with software pipeline
 	//
 	// p7 is necessarily false by now
-1:				
+1:
 	EX(failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
-
 	EX(failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
 	br.ctop.dptk.few 1b
 	;;
@@ -148,10 +128,10 @@
 	and src2=0x7,src1				// src offset
 	and dst2=0x7,dst1				// dst offset
 	;;
-	// The basic idea is that we copy byte-by-byte at the head so 
-	// that we can reach 8-byte alignment for both src1 and dst1. 
-	// Then copy the body using software pipelined 8-byte copy, 
-	// shifting the two back-to-back words right and left, then copy 
+	// The basic idea is that we copy byte-by-byte at the head so
+	// that we can reach 8-byte alignment for both src1 and dst1.
+	// Then copy the body using software pipelined 8-byte copy,
+	// shifting the two back-to-back words right and left, then copy
 	// the tail by copying byte-by-byte.
 	//
 	// Fault handling. If the byte-by-byte at the head fails on the
@@ -162,18 +142,18 @@
 	// handled simply by failure_in_pipe1.
 	//
 	// The case p14 means the source has more bytes in
-	// the first word (by the shifted part), whereas the p15 needs to 
-	// copy some bytes from the 2nd word of the source that has the 
+	// the first word (by the shifted part), whereas the p15 needs to
+	// copy some bytes from the 2nd word of the source that has the
 	// tail of the 1st word of the destination.
 	//
 
 	//
-	// Optimization. If dst1 is 8-byte aligned (not rare), we don't need 
-	// to copy the head to dst1, to start the 8-byte copy software pipeline. 
+	// Optimization. If dst1 is 8-byte aligned (not rare), we don't need
+	// to copy the head to dst1, to start the 8-byte copy software pipeline.
 	// We know src1 is not 8-byte aligned in this case.
 	//
 	cmp.eq p14,p15=r0,dst2
-(p15)	br.cond.spnt.few 1f				
+(p15)	br.cond.spnt.few 1f
 	;;
 	sub t1=8,src2
 	mov t2=src2
@@ -182,10 +162,10 @@
 	sub len1=len,t1					// set len1
 	;;
 	sub lshift=64,rshift
-	;; 
+	;;
 	br.cond.spnt.few word_copy_user
-	;; 
-1:			
+	;;
+1:
 	cmp.leu	p14,p15=src2,dst2
 	sub t1=dst2,src2
 	;;
@@ -196,30 +176,29 @@
 	;;
 	// For the case p14, we don't need to copy the shifted part to
 	// the 1st word of destination.
-	sub t2=8,t1	
+	sub t2=8,t1
 (p14)	sub word1=word1,t1
 	;;
 	sub len1=len,word1				// resulting len
 (p15)	shl rshift=t1,3					// in bits
 (p14)	shl rshift=t2,3
-	;; 
+	;;
 (p14)	sub len1=len1,t1
 	adds cnt=-1,word1
-	;; 
+	;;
 	sub lshift=64,rshift
 	mov ar.ec=PIPE_DEPTH
 	mov pr.rot=1<<16	// p16=true all others are false
 	mov ar.lc=cnt
-	;; 
-2:	
+	;;
+2:
 	EX(failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
-	;; 
 	EX(failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
 	br.ctop.dptk.few 2b
 	;;
-	clrrrb	
-	;; 
-word_copy_user:		
+	clrrrb
+	;;
+word_copy_user:
 	cmp.gtu p9,p0=16,len1
 (p9)	br.cond.spnt.few 4f		// if (16 > len1) skip 8-byte copy
 	;;
@@ -227,7 +206,7 @@
 	;;
 	adds cnt=-1,cnt
 	;;
-	.pred.rel "mutex", p14, p15	
+	.pred.rel "mutex", p14, p15
 (p14)	sub src1=src1,t2
 (p15)	sub src1=src1,t1
 	//
@@ -237,23 +216,23 @@
 	mov ar.lc=cnt
 	mov ar.ec=PIPE_DEPTH
 	mov pr.rot=1<<16	// p16=true all others are false
-	;; 
+	;;
 3:
 	//
-	// The pipeline consists of 3 stages:
+	// The pipeline consists of 3 stages:
 	// 1 (p16):	Load a word from src1
 	// 2 (EPI_1):	Shift right pair, saving to tmp
 	// 3 (EPI):	Store tmp to dst1
 	//
-	// To make it simple, use at least 2 (p16) loops to set up val1[n] 
+	// To make it simple, use at least 2 (p16) loops to set up val1[n]
 	// because we need 2 back-to-back val1[] to get tmp.
 	// Note that this implies EPI_2 must be p18 or greater.
-	// 
+	//
 
 #define EPI_1		p[PIPE_DEPTH-2]
 #define SWITCH(pred, shift)	cmp.eq pred,p0=shift,rshift
 #define CASE(pred, shift)	\
-	(pred)	br.cond.spnt.few copy_user_bit##shift	
+	(pred)	br.cond.spnt.few copy_user_bit##shift
 #define BODY(rshift)							\
 copy_user_bit##rshift:							\
 1:									\
@@ -267,11 +246,11 @@
 	//
 	// Since the instruction 'shrp' requires a fixed immediate count
 	// specifying the bits to shift, we need to provide 7 cases
-	// below. 
+	// below.
 	//
 	SWITCH(p6, 8)
 	SWITCH(p7, 16)
-	SWITCH(p8, 24)	
+	SWITCH(p8, 24)
 	SWITCH(p9, 32)
 	SWITCH(p10, 40)
 	SWITCH(p11, 48)
@@ -289,40 +268,40 @@
 	BODY(16)
 	BODY(24)
 	BODY(32)
-	BODY(40)		
+	BODY(40)
 	BODY(48)
 	BODY(56)
-	;; 
-.diff_align_do_tail:	
-	.pred.rel "mutex", p14, p15		
+	;;
+.diff_align_do_tail:
+	.pred.rel "mutex", p14, p15
 (p14)	sub src1=src1,t1
-(p14)	adds dst1=-8,dst1			
+(p14)	adds dst1=-8,dst1
 (p15)	sub dst1=dst1,t1
-	;; 
-4:	
+	;;
+4:
 	// Tail correction.
 	//
 	// The problem with this pipelined loop is that the last word is not
-	// loaded and thus part of the last word written is not correct. 
+	// loaded and thus part of the last word written is not correct.
 	// To fix that, we simply copy the tail byte by byte.
-	
+
 	sub len1=endsrc,src1,1
 	clrrrb
-	;; 
+	;;
 	mov ar.ec=PIPE_DEPTH
 	mov pr.rot=1<<16	// p16=true all others are false
 	mov ar.lc=len1
 	;;
-5:		
+5:
 	EX(failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
-	
 	EX(failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
 	br.ctop.dptk.few 5b
 	;;
+	mov ar.lc=saved_lc
 	mov pr=saved_pr,0xffffffffffff0000
 	mov ar.pfs=saved_pfs
 	br.ret.dptk.few rp
-	
+
 	//
 	// Beginning of long memcpy (i.e. > 16 bytes)
 	//
@@ -353,6 +332,7 @@
 	// we have never executed the ld1, therefore st1 is not executed.
 	//
 	EX(failure_in1,(p8) ld4 val2[0]=[src1],4)	// 4-byte aligned
+	;;
 	EX(failure_out,(p6) st1 [dst1]=val1[0],1)
 	tbit.nz p9,p0=src1,3
 	;;
@@ -369,12 +349,12 @@
 	shr.u cnt=len1,4		// number of 128-bit (2x64bit) words
 	;;
 	EX(failure_out, (p9) st8 [dst1]=val2[1],8)
-	tbit.nz p6,p0=len1,3	
+	tbit.nz p6,p0=len1,3
 	cmp.eq p7,p0=r0,cnt
 	adds tmp=-1,cnt			// br.ctop is repeat/until
 (p7)	br.cond.dpnt.few .dotail	// we have less than 16 bytes left
 	;;
-	adds src2=8,src1	
+	adds src2=8,src1
 	adds dst2=8,dst1
 	mov ar.lc=tmp
 	;;
@@ -395,12 +375,12 @@
 	// No matter where we come from (loop or test) the src1 pointer
 	// is 16 byte aligned AND we have less than 16 bytes to copy.
 	//
-.dotail:			
+.dotail:
 	EX(failure_in1,(p6) ld8 val1[0]=[src1],8)	// at least 8 bytes
 	tbit.nz p7,p0=len1,2
 	;;
 	EX(failure_in1,(p7) ld4 val1[1]=[src1],4)	// at least 4 bytes
- 	tbit.nz p8,p0=len1,1
+	tbit.nz p8,p0=len1,1
 	;;
 	EX(failure_in1,(p8) ld2 val2[0]=[src1],2)	// at least 2 bytes
 	tbit.nz p9,p0=len1,0
@@ -430,7 +410,7 @@
 	//
 	//	  In the same loop iteration, the dst1 pointer does not directly
 	//	  reflect where the faulty load was.
-	//	  
+	//
 	//	- pipeline effect
 	//	  When you get a fault on load, you may have valid data from
 	//	  previous loads in transit, not yet stored. Such data must be
@@ -442,7 +422,7 @@
 	//	- we don't disrupt the pipeline, i.e. data in transit in
 	//	  the software pipeline will eventually be moved to memory.
 	//	  We simply replace the load with a simple mov and keep the
-	//	  pipeline going. We can't really do this inline because 
+	//	  pipeline going. We can't really do this inline because
 	//	  p16 is always reset to 1 when lc > 0.
 	//
 failure_in_pipe1:
@@ -459,7 +439,7 @@
 
 	//
 	// This is the case where the byte by byte copy fails on the load
-	// when we copy the head. We need to finish the pipeline and copy 
+	// when we copy the head. We need to finish the pipeline and copy
 	// zeros for the rest of the destination. Since this happens
 	// at the top we still need to fill the body and tail.
 failure_in_pipe2:
@@ -471,7 +451,7 @@
 	;;
 	sub len=enddst,dst1,1		// precompute len
 	br.cond.dptk.few failure_in1bis
-	;; 
+	;;
 
 	//
 	// Here we handle the head & tail part when we check for alignment.
@@ -482,7 +462,7 @@
 	//
 	// However some simplifications are possible given the way
 	// things work.
-	// 
+	//
 	// 1) HEAD
 	// Theory of operation:
 	//
@@ -506,23 +486,23 @@
 	//
 	// Key point:
 	//	- if you fail on 1, 2, 4 then you have never executed any smaller
-	//	  size loads, e.g. failing ld4 means no ld1 nor ld2 executed 
+	//	  size loads, e.g. failing ld4 means no ld1 nor ld2 executed
 	//	  before.
 	//
 	// This allows us to simplify the cleanup code, because basically you
 	// only have to worry about "pending" stores in the case of a failing
-	// ld8(). Given the way the code is written today, this means we only 
+	// ld8(). Given the way the code is written today, this means we only
 	// worry about st2, st4. There we can use the information encapsulated
 	// into the predicates.
-	// 
+	//
 	// Other key point:
-	// 	- if you fail on the ld8 in the head, it means you went straight
+	//	- if you fail on the ld8 in the head, it means you went straight
 	//	  to it, i.e. 8-byte alignment within a nonexistent page.
 	// Again this comes from the fact that if you crossed just for the ld8 then
 	// you are 8-byte aligned but also 16-byte aligned, therefore you would
 	// either go for the 16byte copy loop OR the ld8 in the tail part.
 	// The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
-	// because it would mean you had 15 bytes to copy in which case you 
+	// because it would mean you had 15 bytes to copy in which case you
 	// would have defaulted to the byte by byte copy.
 	//
 	//
@@ -533,18 +513,18 @@
 	// Key point:
 	// This means that we either:
 	//		- are right on a page boundary
-	//	OR 
-	//		- are at more than 16 bytes from a page boundary with 
+	//	OR
+	//		- are at more than 16 bytes from a page boundary with
 	//		  at most 15 bytes to copy: no chance of crossing.
 	//
 	// This allows us to assume that if we fail on a load we can't possibly have
-	// executed any of the previous (tail) ones, so we don't need to do 
-	// any stores. For instance, if we fail on ld2, this means we had 
+	// executed any of the previous (tail) ones, so we don't need to do
+	// any stores. For instance, if we fail on ld2, this means we had
 	// 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
 	//
-	// This means that we are in a situation similar to a fault in the 
-	// head part. That's nice! 
-	// 
+	// This means that we are in a situation similar to a fault in the
+	// head part. That's nice!
+	//
 failure_in1:
 //	sub ret0=enddst,dst1	// number of bytes to zero, i.e. not copied
 //	sub len=enddst,dst1,1
@@ -563,7 +543,7 @@
 	;;
 5:
 	st1 [dst1]=r0,1
-	br.cloop.dptk.few 5b	
+	br.cloop.dptk.few 5b
 	;;
 skip_loop:
 	mov pr=saved_pr,0xffffffffffff0000
@@ -574,7 +554,7 @@
 	//
 	// Here we simply restart the loop but instead
 	// of doing loads we fill the pipeline with zeroes
-	// We can't simply store r0 because we may have valid 
+	// We can't simply store r0 because we may have valid
 	// data in transit in the pipeline.
 	// ar.lc and ar.ec are setup correctly at this point
 	//
@@ -593,7 +573,7 @@
 	;;
 	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
 	sub len=enddst,dst1,1		// precompute len
-(p6)	br.cond.dptk.few failure_in1bis	
+(p6)	br.cond.dptk.few failure_in1bis
 	;;
 	mov pr=saved_pr,0xffffffffffff0000
 	mov ar.lc=saved_lc
@@ -610,13 +590,13 @@
 	;;
 	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
 	sub len=enddst,dst1,1		// precompute len
-(p6)	br.cond.dptk.few failure_in1bis	
+(p6)	br.cond.dptk.few failure_in1bis
 	;;
 	mov pr=saved_pr,0xffffffffffff0000
 	mov ar.lc=saved_lc
 	mov ar.pfs=saved_pfs
 	br.ret.dptk.few rp
-	
+
 	//
 	// handling of failures on stores: that's the easy part
 	//

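The EX() macro deleted above (its definition presumably now comes with <asm/asmmacro.h>, which the file already includes) is what makes each guarded load and store in __copy_user recoverable: for every access it emits an __ex_table entry pairing the location of the instruction with the location of its failure_* fixup code, encoded as gp-relative data4 values. Below is a minimal C sketch of the lookup side of that mechanism; the names and the absolute-address encoding are illustrative simplifications, not the kernel's actual layout.

	/*
	 * One entry per guarded access: where the fault may happen and
	 * where to resume.  The real IA-64 table packed each entry as two
	 * 32-bit offsets (data4 @gprel(99f); data4 y-99f); plain
	 * addresses keep the sketch simple.
	 */
	struct ex_entry {
		unsigned long insn;	/* address of the access that may fault */
		unsigned long fixup;	/* recovery code, e.g. failure_in1 */
	};

	/* hypothetical section bounds, as a linker script would provide them */
	extern const struct ex_entry ex_table_start[], ex_table_end[];

	/*
	 * Called from the page-fault path: if the faulting IP has an entry,
	 * execution resumes at the fixup instead of killing the task.
	 */
	unsigned long find_fixup(unsigned long fault_ip)
	{
		const struct ex_entry *e;

		for (e = ex_table_start; e < ex_table_end; e++)
			if (e->insn == fault_ip)
				return e->fixup;
		return 0;	/* no entry: the fault is genuine */
	}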
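The 1:, 2: and 5: byte loops rely on IA-64 software pipelining: br.ctop rotates the val1[] registers each iteration, the load runs under p16 while the store (EPI) runs PIPE_DEPTH-1 stages later, and after the last load the loop keeps turning to drain the bytes still in flight. A rough C model of that schedule follows; the PIPE_DEPTH value here is illustrative, not the one the .S file defines.

	#include <stddef.h>

	#define PIPE_DEPTH 4	/* illustrative depth for the model */

	/*
	 * Model of the ld1/st1 pipelined loop: the store trails the load
	 * by PIPE_DEPTH-1 iterations, so the loop runs n + PIPE_DEPTH - 1
	 * times in total (prologue + kernel + epilogue).
	 */
	void pipelined_byte_copy(char *dst, const char *src, size_t n)
	{
		char val[PIPE_DEPTH];	/* stands in for the rotating val1[] */
		size_t i;

		for (i = 0; i < n + PIPE_DEPTH - 1; i++) {
			if (i >= PIPE_DEPTH - 1)	/* EPI stage: store */
				*dst++ = val[(i + 1) % PIPE_DEPTH];
			if (i < n)			/* p16 stage: load */
				val[i % PIPE_DEPTH] = *src++;
		}
	}

This drain property is also why the failure_in_pipe* handlers can replace a faulting load with a plain mov and let the loop run to completion: bytes already loaded still reach memory.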
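In the different-alignment body, each output word is stitched together from two back-to-back source words with shrp. Because shrp takes its shift count as an immediate, the SWITCH()/CASE()/BODY() macros expand into seven copies of the loop, one per possible byte offset. The merge itself reduces to the sketch below, assuming little-endian layout and 0 < rshift < 64 (which the SWITCH() cases guarantee); shrp64 and merge_copy are illustrative names.

	#include <stddef.h>
	#include <stdint.h>

	/*
	 * What one 'shrp tmp=hi,lo,rshift' computes: concatenate hi:lo into
	 * a 128-bit value and keep the 64 bits starting rshift bits up.
	 * With lo the lower-addressed source word and rshift equal to
	 * 8 * (alignment offset in bytes), the result is one aligned
	 * destination word.
	 */
	static inline uint64_t shrp64(uint64_t hi, uint64_t lo, unsigned rshift)
	{
		return (lo >> rshift) | (hi << (64 - rshift));
	}

	/* body sketch: src must provide nwords + 1 readable words */
	void merge_copy(uint64_t *dst, const uint64_t *src, size_t nwords,
			unsigned rshift)
	{
		size_t i;

		for (i = 0; i < nwords; i++)
			dst[i] = shrp64(src[i + 1], src[i], rshift);
	}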
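The long-copy head uses tbit on the low bits of src1 to peel off 1-, 2-, 4- and 8-byte chunks until the pointer is 16-byte aligned, and .dotail mirrors it: with fewer than 16 bytes left, the bits of len1 select exactly which of the ld8/ld4/ld2/ld1 moves run. Here is a sketch of both halves, relying (as the real path does) on src and dst sharing the same alignment mod 8 and on the count exceeding COPY_BREAK; the helper names are illustrative.

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	/*
	 * Head: peel ascending power-of-two chunks until src is 16-byte
	 * aligned.  Safe only because len > 16 is guaranteed before this
	 * path is taken (at most 15 bytes get peeled).
	 */
	size_t align_head(char **dst, const char **src, size_t len)
	{
		unsigned chunk;

		for (chunk = 1; chunk <= 8; chunk <<= 1) {
			if ((uintptr_t)*src & chunk) {
				memcpy(*dst, *src, chunk);
				*src += chunk;
				*dst += chunk;
				len -= chunk;
			}
		}
		return len;	/* remainder for the 16-byte-aligned body */
	}

	/*
	 * Tail: len < 16 here, so its bits say exactly which descending
	 * chunks remain, matching the tbit tests on len1 at .dotail.
	 */
	void copy_tail(char *dst, const char *src, size_t len)
	{
		unsigned chunk;

		for (chunk = 8; chunk >= 1; chunk >>= 1) {
			if (len & chunk) {
				memcpy(dst, src, chunk);
				src += chunk;
				dst += chunk;
			}
		}
	}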
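Taken together, the failure_* handlers implement the contract stated in the header comment: ret0 is 0 on success and otherwise the number of bytes NOT copied, and when the source faults mid-copy the unwritten remainder of the destination is zero-filled (the st1 [dst1]=r0 loops). The following user-space model captures that observable behavior one byte at a time; would_fault() is a purely hypothetical stand-in for the MMU, and the real routine moves up to 16 bytes per iteration and drains the software pipeline before it starts zeroing.

	#include <stddef.h>
	#include <string.h>

	/* hypothetical probe: nonzero if touching addr would fault */
	extern int would_fault(const void *addr);

	size_t copy_user_model(char *dst, const char *src, size_t len)
	{
		size_t i;

		for (i = 0; i < len; i++) {
			if (would_fault(src + i)) {
				/* source fault: zero the tail, report shortfall */
				memset(dst + i, 0, len - i);
				return len - i;
			}
			if (would_fault(dst + i))
				return len - i;	/* destination fault */
			dst[i] = src[i];
		}
		return 0;	/* success */
	}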