From: Vitaly Mayatskikh <vmayatsk@redhat.com>
Date: Sat, 21 Mar 2009 12:04:06 +0100
Subject: [x86_64] copy_user_c can zero more data than needed
Message-id: 87zlffxhll.wl%vmayatsk@redhat.com
O-Subject: [RHEL-5.4.0 PATCH] bz490938: [x86_64]: copy_user_c can zero more data than needed
Bugzilla: 490938
RH-Acked-by: Rik van Riel <riel@redhat.com>
RH-Acked-by: Pete Zaitcev <zaitcev@redhat.com>

https://bugzilla.redhat.com/show_bug.cgi?id=490938

Description:
============
There is a regression in the patch that fixed another regression (bug 456682):
the 8-byte error recovery loop can zero more bytes than needed, which can
potentially lead to data corruption.

The copy_user routines for the x86-64 architecture in RHEL-4 and -5 contain
further known errors, which were fixed upstream (and in the MRG kernel) more
than half a year ago. There are no known bugs in the upstream version, so I
did a full backport of it instead of fixing only that particular bug.

Upstream status:
================
Commits:
ad2fc2cd925300b8127cf682f5a1c7511ae9dd27
27cb0a75ba252ea7294d67232c4bbbac3f2b2656
afd962a9e8708c571c5c0c4a6d098f931742c229
1129585a08baf58582c0da91e572cb29e3179acf

Test status:
============
Tested with the test case:
https://bugzilla.redhat.com/attachment.cgi?id=336146

Test case results for 135.el5:
https://bugzilla.redhat.com/attachment.cgi?id=336051

Test case results for the patched kernel:
https://bugzilla.redhat.com/attachment.cgi?id=336145

(A small stand-alone illustration of this kind of check follows the patch
below; it is not part of the patch.)

diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S
index 43223ef..082ea73 100644
--- a/arch/x86_64/lib/copy_user.S
+++ b/arch/x86_64/lib/copy_user.S
@@ -1,63 +1,109 @@
-/* Copyright 2002 Andi Kleen, SuSE Labs.
+/*
+ * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
+ * Copyright 2002 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License v2.
- *
- * Functions to copy from and to user space.
- */
+ *
+ * Functions to copy from and to user space.
+ */
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
 #define FIX_ALIGNMENT 1
- #include <asm/current.h>
- #include <asm/asm-offsets.h>
- #include <asm/thread_info.h>
- #include <asm/cpufeature.h>
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
-/* Standard copy_to_user with segment limit checking */
-ENTRY(copy_to_user)
- CFI_STARTPROC
- GET_THREAD_INFO(%rax)
- movq %rdi,%rcx
- addq %rdx,%rcx
- jc bad_to_user
- cmpq threadinfo_addr_limit(%rax),%rcx
- jae bad_to_user
-2:
+ .macro ALTERNATIVE_JUMP feature,orig,alt
 0:
 .byte 0xe9 /* 32bit jump */
- .long .Lcug-1f
+ .long \orig-1f /* by default jump to orig */
 1:
- CFI_ENDPROC
-ENDPROC(copy_to_user)
-
 .section .altinstr_replacement,"ax"
-3: .byte 0xe9 /* replacement jmp with 32 bit immediate */
- .long copy_user_generic_c-1b /* offset */
+2: .byte 0xe9 /* near jump with 32bit immediate */
+ .long \alt-1b /* offset */ /* or alternatively to alt */
 .previous
 .section .altinstructions,"a"
 .align 8
+ .quad 0b
 .quad 2b
- .quad 3b
- .byte X86_FEATURE_REP_GOOD
+ .byte \feature /* when feature is set */
 .byte 5
 .byte 5
 .previous
+ .endm
-/* Standard copy_from_user with segment limit checking */
+ .macro ALIGN_DESTINATION
+#ifdef FIX_ALIGNMENT
+ /* check for bad alignment of destination */
+ movl %edi,%ecx
+ andl $7,%ecx
+ jz 102f /* already aligned */
+ subl $8,%ecx
+ negl %ecx
+ subl %ecx,%edx
+100: movb (%rsi),%al
+101: movb %al,(%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz 100b
+102:
+ .section .fixup,"ax"
+103: addl %ecx,%edx /* ecx is zerorest also */
+ jmp copy_user_handle_tail
+ .previous
+
+ .section __ex_table,"a"
+ .align 8
+ .quad 100b,103b
+ .quad 101b,103b
+ .previous
+#endif
+ .endm
+/* Standard copy_to_user with segment limit checking */
+ENTRY(copy_to_user)
+ CFI_STARTPROC
+ GET_THREAD_INFO(%rax)
+ movq %rdi,%rcx
+ addq %rdx,%rcx
+ jc bad_to_user
+ cmpq threadinfo_addr_limit(%rax),%rcx
+ jae bad_to_user
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ CFI_ENDPROC
+
+/* Standard copy_from_user with segment limit checking */
 ENTRY(copy_from_user)
 CFI_STARTPROC
 GET_THREAD_INFO(%rax)
 movq %rsi,%rcx
 addq %rdx,%rcx
- jc bad_from_user
+ jc bad_from_user
 cmpq threadinfo_addr_limit(%rax),%rcx
- jae bad_from_user
- /* FALL THROUGH to copy_user_generic */
+ jae bad_from_user
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
 CFI_ENDPROC
 ENDPROC(copy_from_user)
-
+
+ENTRY(copy_user_generic)
+ CFI_STARTPROC
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ CFI_ENDPROC
+ENDPROC(copy_user_generic)
+
+ENTRY(__copy_from_user_inatomic)
+ CFI_STARTPROC
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ CFI_ENDPROC
+ENDPROC(__copy_from_user_inatomic)
+
 .section .fixup,"ax"
 /* must zero dest */
+ENTRY(bad_from_user)
 bad_from_user:
 CFI_STARTPROC
 movl %edx,%ecx
@@ -65,295 +111,158 @@ bad_from_user:
 rep
 stosb
 bad_to_user:
- movl %edx,%eax
+ movl %edx,%eax
 ret
 CFI_ENDPROC
-END(bad_from_user)
+ENDPROC(bad_from_user)
 .previous
-
-
+
 /*
- * copy_user_generic - memory copy with exception handling.
- *
- * Input:
+ * copy_user_generic_unrolled - memory copy with exception handling.
+ * This version is for CPUs like P4 that don't have efficient micro
+ * code for rep movsq
+ *
+ * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
- * Output:
- * eax uncopied bytes or 0 if successful.
+ * Output:
+ * eax uncopied bytes or 0 if successfull.
 */
-ENTRY(copy_user_generic)
+ENTRY(copy_user_generic_unrolled)
 CFI_STARTPROC
- .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */
- .byte 0x66,0x90
-1:
- .section .altinstr_replacement,"ax"
-2: .byte 0xe9 /* near jump with 32bit immediate */
- .long copy_user_generic_c-1b /* offset */
- .previous
- .section .altinstructions,"a"
- .align 8
- .quad copy_user_generic
- .quad 2b
- .byte X86_FEATURE_REP_GOOD
- .byte 5
- .byte 5
- .previous
-.Lcug:
- pushq %rbx
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rbx, 0
- xorl %eax,%eax /*zero for the exception handler */
-
-#ifdef FIX_ALIGNMENT
- /* check for bad alignment of destination */
- movl %edi,%ecx
- andl $7,%ecx
- jnz .Lbad_alignment
-.Lafter_bad_alignment:
-#endif
-
- movq %rdx,%rcx
-
- movl $64,%ebx
- shrq $6,%rdx
- decq %rdx
- js .Lhandle_tail
-
- .p2align 4
-.Lloop:
-.Ls1: movq (%rsi),%r11
-.Ls2: movq 1*8(%rsi),%r8
-.Ls3: movq 2*8(%rsi),%r9
-.Ls4: movq 3*8(%rsi),%r10
-.Ld1: movq %r11,(%rdi)
-.Ld2: movq %r8,1*8(%rdi)
-.Ld3: movq %r9,2*8(%rdi)
-.Ld4: movq %r10,3*8(%rdi)
-
-.Ls5: movq 4*8(%rsi),%r11
-.Ls6: movq 5*8(%rsi),%r8
-.Ls7: movq 6*8(%rsi),%r9
-.Ls8: movq 7*8(%rsi),%r10
-.Ld5: movq %r11,4*8(%rdi)
-.Ld6: movq %r8,5*8(%rdi)
-.Ld7: movq %r9,6*8(%rdi)
-.Ld8: movq %r10,7*8(%rdi)
-
- decq %rdx
-
+ cmpl $8,%edx
+ jb 20f /* less then 8 bytes, go to byte copy loop */
+ ALIGN_DESTINATION
+ movl %edx,%ecx
+ andl $63,%edx
+ shrl $6,%ecx
+ jz 17f
+1: movq (%rsi),%r8
+2: movq 1*8(%rsi),%r9
+3: movq 2*8(%rsi),%r10
+4: movq 3*8(%rsi),%r11
+5: movq %r8,(%rdi)
+6: movq %r9,1*8(%rdi)
+7: movq %r10,2*8(%rdi)
+8: movq %r11,3*8(%rdi)
+9: movq 4*8(%rsi),%r8
+10: movq 5*8(%rsi),%r9
+11: movq 6*8(%rsi),%r10
+12: movq 7*8(%rsi),%r11
+13: movq %r8,4*8(%rdi)
+14: movq %r9,5*8(%rdi)
+15: movq %r10,6*8(%rdi)
+16: movq %r11,7*8(%rdi)
 leaq 64(%rsi),%rsi
 leaq 64(%rdi),%rdi
-
- jns .Lloop
-
- .p2align 4
-.Lhandle_tail:
- movl %ecx,%edx
- andl $63,%ecx
- shrl $3,%ecx
- jz .Lhandle_7
- movl $8,%ebx
- .p2align 4
-.Lloop_8:
-.Ls9: movq (%rsi),%r8
-.Ld9: movq %r8,(%rdi)
 decl %ecx
- leaq 8(%rdi),%rdi
+ jnz 1b
+17: movl %edx,%ecx
+ andl $7,%edx
+ shrl $3,%ecx
+ jz 20f
+18: movq (%rsi),%r8
+19: movq %r8,(%rdi)
 leaq 8(%rsi),%rsi
- jnz .Lloop_8
-
-.Lhandle_7:
+ leaq 8(%rdi),%rdi
+ decl %ecx
+ jnz 18b
+20: andl %edx,%edx
+ jz 23f
 movl %edx,%ecx
- andl $7,%ecx
- jz .Lende
- .p2align 4
-.Lloop_1:
-.Ls10: movb (%rsi),%bl
-.Ld10: movb %bl,(%rdi)
- incq %rdi
+21: movb (%rsi),%al
+22: movb %al,(%rdi)
 incq %rsi
+ incq %rdi
 decl %ecx
- jnz .Lloop_1
-
- CFI_REMEMBER_STATE
-.Lende:
- popq %rbx
- CFI_ADJUST_CFA_OFFSET -8
- CFI_RESTORE rbx
+ jnz 21b
+23: xor %eax,%eax
 ret
- CFI_RESTORE_STATE
-#ifdef FIX_ALIGNMENT
- /* align destination */
- .p2align 4
-.Lbad_alignment:
- movl $8,%r9d
- subl %ecx,%r9d
- movl %r9d,%ecx
- cmpq %r9,%rdx
- jz .Lhandle_7
- js .Lhandle_7
-.Lalign_1:
-.Ls11: movb (%rsi),%bl
-.Ld11: movb %bl,(%rdi)
- incq %rsi
- incq %rdi
- decl %ecx
- jnz .Lalign_1
- subq %r9,%rdx
- jmp .Lafter_bad_alignment
-#endif
+ .section .fixup,"ax"
+30: shll $6,%ecx
+ addl %ecx,%edx
+ jmp 60f
+40: lea (%rdx,%rcx,8),%rdx
+ jmp 60f
+50: movl %ecx,%edx
+60: jmp copy_user_handle_tail /* ecx is zerorest also */
+ .previous
- /* table sorted by exception address */
 .section __ex_table,"a"
 .align 8
- .quad .Ls1,.Ls1e
- .quad .Ls2,.Ls2e
- .quad .Ls3,.Ls3e
- .quad .Ls4,.Ls4e
- .quad .Ld1,.Ld1e
- .quad .Ld2,.Ld2e
- .quad .Ld3,.Ld3e
- .quad .Ld4,.Ld4e
- .quad .Ls5,.Ls5e
- .quad .Ls6,.Ls6e
- .quad .Ls7,.Ls7e
- .quad .Ls8,.Ls8e
- .quad .Ld5,.Ld5e
- .quad .Ld6,.Ld6e
- .quad .Ld7,.Ld7e
- .quad .Ld8,.Ld8e
- .quad .Ls9,.Le_quad
- .quad .Ld9,.Le_quad
- .quad .Ls10,.Le_byte
- .quad .Ld10,.Le_byte
-#ifdef FIX_ALIGNMENT
- .quad .Ls11,.Lzero_rest
- .quad .Ld11,.Lzero_rest
-#endif
- .quad .Le5,.Le_zero
+ .quad 1b,30b
+ .quad 2b,30b
+ .quad 3b,30b
+ .quad 4b,30b
+ .quad 5b,30b
+ .quad 6b,30b
+ .quad 7b,30b
+ .quad 8b,30b
+ .quad 9b,30b
+ .quad 10b,30b
+ .quad 11b,30b
+ .quad 12b,30b
+ .quad 13b,30b
+ .quad 14b,30b
+ .quad 15b,30b
+ .quad 16b,30b
+ .quad 18b,40b
+ .quad 19b,40b
+ .quad 21b,50b
+ .quad 22b,50b
 .previous
-
- /* Don't forget to store registers, which were loaded before fault.
- Otherwise we will have up to 24 bytes of garbage and possible
- security leak */
-.Ls8e: addl $8,%eax
- movq %r9,6*8(%rdi)
-.Ls7e: addl $8,%eax
- movq %r8,5*8(%rdi)
-.Ls6e: addl $8,%eax
- movq %r11,4*8(%rdi)
-.Ls5e: addl $32,%eax
- jmp .Ls1e
-
-.Ls4e: addl $8,%eax
- movq %r9,2*8(%rdi)
-.Ls3e: addl $8,%eax
- movq %r8,1*8(%rdi)
-.Ls2e: addl $8,%eax
- movq %r11,(%rdi)
-.Ls1e: addq %rax,%rdi
- shlq $6,%rdx
- addq %rbx,%rdx
- subq %rax,%rdx
- andl $63,%ecx
- addq %rcx,%rdx
- jmp .Lzero_rest
-
- /* compute 64-offset for main loop. 8 bytes accuracy with error on the
- pessimistic side. this is gross. it would be better to fix the
- interface. */
- /* eax: zero, ebx: 64 */
-.Ld1e: addl $8,%eax
-.Ld2e: addl $8,%eax
-.Ld3e: addl $8,%eax
-.Ld4e: addl $8,%eax
-.Ld5e: addl $8,%eax
-.Ld6e: addl $8,%eax
-.Ld7e: addl $8,%eax
-.Ld8e: addl $8,%eax
- addq %rbx,%rdi /* +64 */
- subq %rax,%rdi /* correct destination with computed offset */
-
- shlq $6,%rdx /* loop counter * 64 (stride length) */
- addq %rax,%rdx /* add offset to loopcnt */
- andl $63,%ecx /* remaining bytes */
- addq %rcx,%rdx /* add them */
- jmp .Lzero_rest
-
- /* exception on quad word loop in tail handling */
- /* ecx: loopcnt/8, %edx: length, rdi: correct */
-.Le_quad:
- shll $3,%ecx
- andl $7,%edx
- addl %ecx,%edx
- /* edx: bytes to zero, rdi: dest, eax:zero */
-.Lzero_rest:
- movq %rdx,%rcx
-.Le_byte:
- xorl %eax,%eax
-.Le5: rep
- stosb
- /* when there is another exception while zeroing the rest just return */
-.Le_zero:
- movq %rdx,%rax
- jmp .Lende
 CFI_ENDPROC
-ENDPROC(copy_user_generic)
-
-
- /* Some CPUs run faster using the string copy instructions.
- This is also a lot simpler. Use them when possible.
- Patch in jmps to this code instead of copying it fully
- to avoid unwanted aliasing in the exception tables. */
+ENDPROC(copy_user_generic_unrolled)
- /* rdi destination
- * rsi source
- * rdx count
- *
- * Output:
- * eax uncopied bytes or 0 if successfull.
- *
- * Only 4GB of copy is supported. This shouldn't be a problem
- * because the kernel normally only writes from/to page sized chunks
- * even if user space passed a longer buffer.
- * And more would be dangerous because both Intel and AMD have
- * errata with rep movsq > 4GB. If someone feels the need to fix
- * this please consider this.
- */
-copy_user_generic_c:
+/* Some CPUs run faster using the string copy instructions.
+ * This is also a lot simpler. Use them when possible.
+ *
+ * Only 4GB of copy is supported. This shouldn't be a problem
+ * because the kernel normally only writes from/to page sized chunks
+ * even if user space passed a longer buffer.
+ * And more would be dangerous because both Intel and AMD have
+ * errata with rep movsq > 4GB. If someone feels the need to fix
+ * this please consider this.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(copy_user_generic_string)
 CFI_STARTPROC
- xorq %rax,%rax
+ andl %edx,%edx
+ jz 4f
+ cmpl $8,%edx
+ jb 2f /* less than 8 bytes, go to byte copy loop */
+ ALIGN_DESTINATION
 movl %edx,%ecx
 shrl $3,%ecx
 andl $7,%edx
-.Lc1: rep
- movsq
- movl %edx,%ecx
-.Lc2: rep
+1: rep
+ movsq
+2: movl %edx,%ecx
+3: rep
 movsb
+4: xorl %eax,%eax
 ret
-.Lc1e: movq %rcx,%rsi
-.Lc3: rep
- stosq
-.Lc2ec: movl %edx,%ecx
-.Lc4: rep
- stosb
-.Lc3e: leaq (%rdx,%rsi,8),%rax
- ret
- /* %rsi contains source address - clear it */
-.Lc2e: xorq %rsi,%rsi
- jmp .Lc2ec
- CFI_ENDPROC
-END(copy_user_generic_c)
+ .section .fixup,"ax"
+11: lea (%rdx,%rcx,8),%rcx
+12: movl %ecx,%edx /* ecx is zerorest also */
+ jmp copy_user_handle_tail
+ .previous
 .section __ex_table,"a"
 .align 8
- .quad .Lc1,.Lc1e
- .quad .Lc2,.Lc2e
- .quad .Lc3,.Lc3e
- .quad .Lc4,.Lc3e
+ .quad 1b,11b
+ .quad 3b,12b
 .previous
+ CFI_ENDPROC
+ENDPROC(copy_user_generic_string)
diff --git a/arch/x86_64/lib/usercopy.c b/arch/x86_64/lib/usercopy.c
index 893d43f..18cda7e 100644
--- a/arch/x86_64/lib/usercopy.c
+++ b/arch/x86_64/lib/usercopy.c
@@ -164,3 +164,26 @@ unsigned long copy_in_user(void __user *to, const void __user *from, unsigned le
 }
 EXPORT_SYMBOL(copy_in_user);
+/*
+ * Try to copy last bytes and clear the rest if needed.
+ * Since protection fault in copy_from/to_user is not a normal situation,
+ * it is not necessary to optimize tail handling.
+ */
+unsigned long
+copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest)
+{
+    char c;
+    unsigned zero_len;
+
+    for (; len; --len) {
+        if (__get_user_nocheck(c, from++, sizeof(char)))
+            break;
+        if (__put_user_nocheck(c, to++, sizeof(char)))
+            break;
+    }
+
+    for (c = 0, zero_len = len; zerorest && zero_len; --zero_len)
+        if (__put_user_nocheck(c, to++, sizeof(char)))
+            break;
+    return len;
+}
diff --git a/include/asm-x86_64/uaccess.h b/include/asm-x86_64/uaccess.h
index 1e1fa00..299847f 100644
--- a/include/asm-x86_64/uaccess.h
+++ b/include/asm-x86_64/uaccess.h
@@ -355,4 +355,7 @@ unsigned long __clear_user(void __user *mem, unsigned long len);
 #define __copy_to_user_inatomic __copy_to_user
 #define __copy_from_user_inatomic __copy_from_user
+unsigned long
+copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest);
+
 #endif /* __X86_64_UACCESS_H */
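
Illustration (not part of the patch):
=====================================
The over-zeroing described above is the kind of problem that can be probed
from user space without kernel instrumentation, because the destination of
copy_from_user during a buffered write(2) is a page-cache page: zeroing past
the copied range shows up as clobbered file data next to a short write. The
sketch below is NOT the attached bugzilla test case; the file path, sizes
and offsets are arbitrary illustration values, and the exact return value of
the faulting write may vary by kernel. It writes from a source buffer whose
second page is unmapped, so the in-kernel copy faults partway through, and
then checks that file bytes beyond the requested range kept their original
pattern.

/*
 * Minimal user-space sketch, not the attached test case.
 * Path, sizes and offsets are arbitrary assumptions.
 */
#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	ssize_t n;
	char *src, *img;
	int fd, bad = 0;

	/* Source buffer: first page mapped, second page unmapped, so the
	 * kernel's user-copy faults partway through the requested range. */
	src = mmap(NULL, 2 * pg, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (src == MAP_FAILED) { perror("mmap"); return 1; }
	memset(src, 'A', pg);
	if (munmap(src + pg, pg)) { perror("munmap"); return 1; }

	/* Destination file filled with a known pattern. */
	fd = open("/tmp/copy_user_tail_test", O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd < 0) { perror("open"); return 1; }
	img = malloc(2 * pg);
	memset(img, 'B', 2 * pg);
	if (write(fd, img, 2 * pg) != 2 * pg) { perror("write"); return 1; }

	/* Ask for 64 bytes starting 16 bytes before the hole: only 16 are
	 * readable, so the write should come back short (or fail). */
	n = pwrite(fd, src + pg - 16, 64, 0);
	printf("pwrite returned %zd\n", n);

	/* File bytes beyond the requested 64-byte range must be untouched;
	 * an over-eager in-kernel recovery loop would show up here as
	 * unexpected zero bytes. */
	if (pread(fd, img, 2 * pg, 0) != 2 * pg) { perror("pread"); return 1; }
	for (long i = 64; i < 2 * pg; i++)
		if (img[i] != 'B') {
			printf("byte %ld clobbered: %#x\n", i,
			       (unsigned char)img[i]);
			bad = 1;
			break;
		}
	puts(bad ? "FAIL: data past the write range was modified"
		 : "OK: no corruption past the write range");
	free(img);
	close(fd);
	return bad;
}

On a correct kernel this should report a short (or failed) pwrite and "OK: no
corruption past the write range"; zero bytes past the requested range would
point at an over-eager recovery path of the kind this backport removes.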