/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

/*
 * ISO C memset - set a memory region to a byte value. This function uses
 * fast string instructions to get better performance than the original
 * function. The code is simpler and shorter than the original function
 * as well.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 */
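/*
 * Rough C equivalent of the fast-string path below (an illustrative
 * sketch only; memset_sketch is a hypothetical name, not kernel code):
 *
 *	void *memset_sketch(void *dest, int c, size_t n)
 *	{
 *		unsigned char *p = dest;
 *		unsigned long v = (unsigned char)c * 0x0101010101010101UL;
 *		size_t i;
 *
 *		for (i = 0; i + 8 <= n; i += 8)		// rep stosq
 *			__builtin_memcpy(p + i, &v, 8);
 *		for (; i < n; i++)			// rep stosb
 *			p[i] = (unsigned char)c;
 *		return dest;
 *	}
 */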
SYM_FUNC_START_WEAK(memset)
SYM_FUNC_START(__memset)
	/*
	 * Some CPUs support the enhanced REP MOVSB/STOSB (ERMS) feature, and
	 * it is recommended to use it when available. If ERMS is not
	 * supported, use the fast string instructions below; if those are
	 * not available either, fall back to the original memset function.
	 */
	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memset_erms", X86_FEATURE_ERMS

	movq %rdi,%r9
	movq %rdx,%rcx
	andl $7,%edx
	shrq $3,%rcx
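	/* now: rcx = qwords to store (count / 8), edx = trailing bytes (count & 7) */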
	/* expand byte value */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	imulq %rsi,%rax
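	/*
	 * Worked example of the expansion above (illustrative): for the byte
	 * 0x5A, 0x5A * 0x0101010101010101 = 0x5A5A5A5A5A5A5A5A, i.e. the
	 * multiply broadcasts the byte into all eight byte lanes of rax.
	 */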
	rep stosq
	movl %edx,%ecx
	rep stosb
	movq %r9,%rax
	RET
SYM_FUNC_END(__memset)
SYM_FUNC_END_ALIAS(memset)
EXPORT_SYMBOL(memset)
EXPORT_SYMBOL(__memset)

/*
 * ISO C memset - set a memory region to a byte value. This function uses
 * enhanced rep stosb to override the fast string function.
 * The code is simpler and shorter than the fast string function as well.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 */
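/*
 * Illustrative C equivalent of the ERMS path (a sketch, not kernel code):
 * with enhanced rep stosb the whole count goes through one byte-granular
 * string store, with no qword splitting:
 *
 *	for (size_t i = 0; i < n; i++)
 *		p[i] = (unsigned char)c;
 */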
SYM_FUNC_START_LOCAL(memset_erms)
	movq %rdi,%r9
	movb %sil,%al
	movq %rdx,%rcx
	rep stosb
	movq %r9,%rax
	RET
SYM_FUNC_END(memset_erms)

SYM_FUNC_START_LOCAL(memset_orig)
	movq %rdi,%r10

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq %rcx,%rax

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d
	jnz .Lbad_alignment
.Lafter_bad_alignment:

	movq %rdx,%rcx
	shrq $6,%rcx
	jz .Lhandle_tail
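
	/*
	 * Sketch of the unrolled loop below (illustrative C, with q and v
	 * standing in for rdi and rax): rcx holds count / 64 and each
	 * iteration issues eight qword stores:
	 *
	 *	for (size_t i = 0; i < n >> 6; i++, q += 8) {
	 *		q[0] = v; q[1] = v; q[2] = v; q[3] = v;
	 *		q[4] = v; q[5] = v; q[6] = v; q[7] = v;
	 *	}
	 */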

	.p2align 4
.Lloop_64:
	decq %rcx
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi
	jnz .Lloop_64

	/*
	 * Handle the tail in loops; loops should be faster than
	 * hard-to-predict jump tables. The tail is at most 63 bytes: first
	 * store the remaining full qwords (count mod 64, rounded down to a
	 * multiple of 8 by the $63&(~7) mask), then the last 0-7 bytes.
	 */
	.p2align 4
.Lhandle_tail:
	movl %edx,%ecx
	andl $63&(~7),%ecx
	jz .Lhandle_7
	shrl $3,%ecx
	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
	andl $7,%edx
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %edx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax
	RET

.Lbad_alignment:
	cmpq $7,%rdx
	jbe .Lhandle_7
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8
	addq %r8,%rdi
	subq %r8,%rdx
	jmp .Lafter_bad_alignment
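	/*
	 * Worked example of the fixup above (illustrative): for a destination
	 * address ending in 0x05 (misalignment r9 = 5) and count > 7, the
	 * unaligned qword store fills the first 8 bytes, rdi then advances by
	 * r8 = 8 - 5 = 3 to the next 8-byte boundary, and count shrinks by 3;
	 * the overlapping bytes are simply written again by the aligned loop.
	 */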
.Lfinal:
SYM_FUNC_END(memset_orig)