// barretenberg: asm_macros.hpp
#pragma once
// clang-format off

/**
 * Clear the carry and overflow flags by xor-ing a register with itself.
 *
 * @param empty_reg string literal naming a scratch register (e.g. "%%r8"); the register is zeroed as a side effect.
 *
 * The expansion is a string-literal fragment meant to be pasted into an inline-assembly template.
 **/
#define CLEAR_FLAGS(empty_reg) \
        "xorq " empty_reg ", " empty_reg " \n\t"

/**
 * Load four consecutive 64-bit limbs from memory into four registers.
 *
 * @param a    string operand naming a register that holds the source address
 * @param lolo destination for the quadword at offset 0
 * @param lohi destination for the quadword at offset 8
 * @param hilo destination for the quadword at offset 16
 * @param hihi destination for the quadword at offset 24
 *
 * All arguments are string literals; the expansion is a run of adjacent string literals that the
 * compiler concatenates into one inline-assembly fragment. Flags are not modified (movq only).
 **/
#define LOAD_FIELD_ELEMENT(a, lolo, lohi, hilo, hihi) \
        "movq 0(" a "), " lolo " \n\t" \
        "movq 8(" a "), " lohi " \n\t" \
        "movq 16(" a "), " hilo " \n\t" \
        "movq 24(" a "), " hihi " \n\t"

/**
 * Store four registers to four consecutive 64-bit limbs in memory.
 *
 * @param r    string operand naming a register that holds the destination address
 * @param lolo source for the quadword at offset 0
 * @param lohi source for the quadword at offset 8
 * @param hilo source for the quadword at offset 16
 * @param hihi source for the quadword at offset 24
 *
 * All arguments are string literals. Flags are not modified (movq only).
 **/
#define STORE_FIELD_ELEMENT(r, lolo, lohi, hilo, hihi) \
        "movq " lolo ", 0(" r ") \n\t" \
        "movq " lohi ", 8(" r ") \n\t" \
        "movq " hilo ", 16(" r ") \n\t" \
        "movq " hihi ", 24(" r ") \n\t"

#if !defined(__ADX__) || defined(DISABLE_ADX)
/**
 * Add the 4-limb value at address `b` into the accumulator held in r12..r15 (r12 is the lowest limb).
 *
 * @param b string operand naming a register that holds the address of the addend
 *
 * Plain add/adc carry chain: the incoming carry flag is ignored and the carry out of the top limb
 * is left in CF. Modifies r12-r15 and the flags.
 **/
#define ADD(b) \
        "addq 0(" b "), %%r12 \n\t" \
        "adcq 8(" b "), %%r13 \n\t" \
        "adcq 16(" b "), %%r14 \n\t" \
        "adcq 24(" b "), %%r15 \n\t"

/**
 * Subtract the 4-limb value at address `b` from the accumulator held in r12..r15 (r12 is the lowest limb).
 *
 * @param b string operand naming a register that holds the address of the subtrahend
 *
 * sub/sbb borrow chain: the borrow out of the top limb is left in CF. No modular correction is
 * applied here. Modifies r12-r15 and the flags.
 **/
#define SUB(b) \
        "subq 0(" b "), %%r12 \n\t" \
        "sbbq 8(" b "), %%r13 \n\t" \
        "sbbq 16(" b "), %%r14 \n\t" \
        "sbbq 24(" b "), %%r15 \n\t"

/**
 * Add the 4-limb value at address `b` into r12..r15, then apply one conditional correction.
 *
 * @param b                     string operand naming a register that holds the address of the addend
 * @param modulus_0..modulus_3  string operands that are ADDED limb-wise to the sum
 *
 * Steps: (1) r12..r15 += [b]; (2) the sum is copied to r8..r11; (3) the modulus_* operands are added
 * to r12..r15; (4) if step 3 produced no carry out of the top limb (CF = 0), the saved sum is restored.
 * The carry out of step 1 is not consumed (the addq in step 3 starts a fresh chain).
 *
 * NOTE(review): because the operands are added, this only acts as "subtract the modulus if sum >= modulus"
 * when callers pass the limbs of (2^256 - modulus); confirm at the call sites.
 *
 * Modifies r8-r15 and the flags.
 **/
#define ADD_REDUCE(b, modulus_0, modulus_1, modulus_2, modulus_3) \
        /* r += [b] */ \
        "addq 0(" b "), %%r12 \n\t" \
        "adcq 8(" b "), %%r13 \n\t" \
        "adcq 16(" b "), %%r14 \n\t" \
        "adcq 24(" b "), %%r15 \n\t" \
        /* keep a copy of the unreduced sum */ \
        "movq %%r12, %%r8 \n\t" \
        "movq %%r13, %%r9 \n\t" \
        "movq %%r14, %%r10 \n\t" \
        "movq %%r15, %%r11 \n\t" \
        /* r' = r + modulus_* operands */ \
        "addq " modulus_0 ", %%r12 \n\t" \
        "adcq " modulus_1 ", %%r13 \n\t" \
        "adcq " modulus_2 ", %%r14 \n\t" \
        "adcq " modulus_3 ", %%r15 \n\t" \
        /* no carry out => restore the saved sum */ \
        "cmovncq %%r8, %%r12 \n\t" \
        "cmovncq %%r9, %%r13 \n\t" \
        "cmovncq %%r10, %%r14 \n\t" \
        "cmovncq %%r11, %%r15 \n\t"

/**
 * Conditionally subtract the modulus from the 4-limb value held in r12..r15.
 *
 * @param neg_modulus_0..neg_modulus_3 string operands holding the limbs of the negated modulus
 *
 * The value is copied to r8..r11, the negated modulus is added to r12..r15, and the copy is restored
 * when that addition does not carry out of the top limb (CF = 0), i.e. when the value was already
 * smaller than the modulus. Modifies r8-r15 and the flags.
 **/
#define REDUCE_FIELD_ELEMENT(neg_modulus_0, neg_modulus_1, neg_modulus_2, neg_modulus_3) \
        /* Duplicate `r` */ \
        "movq %%r12, %%r8 \n\t" \
        "movq %%r13, %%r9 \n\t" \
        "movq %%r14, %%r10 \n\t" \
        "movq %%r15, %%r11 \n\t" \
        "addq " neg_modulus_0 ", %%r12 \n\t" /* r'[0] -= modulus.data[0] */ \
        "adcq " neg_modulus_1 ", %%r13 \n\t" /* r'[1] -= modulus.data[1] */ \
        "adcq " neg_modulus_2 ", %%r14 \n\t" /* r'[2] -= modulus.data[2] */ \
        "adcq " neg_modulus_3 ", %%r15 \n\t" /* r'[3] -= modulus.data[3] */ \
        \
        /* if r does not need to be reduced, the addition does not carry out (CF = 0) */ \
        /* set r' = r when the carry flag is clear */ \
        "cmovncq %%r8, %%r12 \n\t" \
        "cmovncq %%r9, %%r13 \n\t" \
        "cmovncq %%r10, %%r14 \n\t" \
        "cmovncq %%r11, %%r15 \n\t"

103#define SQR(a) \
104 "movq 0(" a "), %%rdx \n\t" /* load a[0] into %rdx */ \
105 \
106 "xorq %%r8, %%r8 \n\t" /* clear flags */ \
107 /* compute a[0] *a[1], a[0]*a[2], a[0]*a[3], a[1]*a[2], a[1]*a[3], a[2]*a[3] */ \
108 "mulxq 8(" a "), %%r9, %%r10 \n\t" /* (r[1], r[2]) <- a[0] * a[1] */ \
109 "mulxq 16(" a "), %%r8, %%r15 \n\t" /* (t[1], t[2]) <- a[0] * a[2] */ \
110 "mulxq 24(" a "), %%r11, %%r12 \n\t" /* (r[3], r[4]) <- a[0] * a[3] */ \
111 \
112 \
113 /* accumulate products into result registers */ \
114 "addq %%r8, %%r10 \n\t" /* r[2] += t[1] */ \
115 "adcq %%r15, %%r11 \n\t" /* r[3] += t[2] */ \
116 "movq 8(" a "), %%rdx \n\t" /* load a[1] into %r%dx */ \
117 "mulxq 16(" a "), %%r8, %%r15 \n\t" /* (t[5], t[6]) <- a[1] * a[2] */ \
118 "mulxq 24(" a "), %%rdi, %%rcx \n\t" /* (t[3], t[4]) <- a[1] * a[3] */ \
119 "movq 24(" a "), %%rdx \n\t" /* load a[3] into %%rdx */ \
120 "mulxq 16(" a "), %%r13, %%r14 \n\t" /* (r[5], r[6]) <- a[3] * a[2] */ \
121 "adcq %%rdi, %%r12 \n\t" /* r[4] += t[3] */ \
122 "adcq %%rcx, %%r13 \n\t" /* r[5] += t[4] + flag_c */ \
123 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
124 "addq %%r8, %%r11 \n\t" /* r[3] += t[5] */ \
125 "adcq %%r15, %%r12 \n\t" /* r[4] += t[6] */ \
126 "adcq $0, %%r13 \n\t" /* r[5] += flag_c */ \
127 \
128 /* double result registers */ \
129 "addq %%r9, %%r9 \n\t" /* r[1] = 2r[1] */ \
130 "adcq %%r10, %%r10 \n\t" /* r[2] = 2r[2] */ \
131 "adcq %%r11, %%r11 \n\t" /* r[3] = 2r[3] */ \
132 "adcq %%r12, %%r12 \n\t" /* r[4] = 2r[4] */ \
133 "adcq %%r13, %%r13 \n\t" /* r[5] = 2r[5] */ \
134 "adcq %%r14, %%r14 \n\t" /* r[6] = 2r[6] */ \
135 \
136 /* compute a[3]*a[3], a[2]*a[2], a[1]*a[1], a[0]*a[0] */ \
137 "movq 0(" a "), %%rdx \n\t" /* load a[0] into %rdx */ \
138 "mulxq %%rdx, %%r8, %%rcx \n\t" /* (r[0], t[4]) <- a[0] * a[0] */ \
139 "movq 16(" a "), %%rdx \n\t" /* load a[2] into %rdx */ \
140 "mulxq %%rdx, %%rdx, %%rdi \n\t" /* (t[7], t[8]) <- a[2] * a[2] */ \
141 /* add squares into result registers */ \
142 "addq %%rdx, %%r12 \n\t" /* r[4] += t[7] */ \
143 "adcq %%rdi, %%r13 \n\t" /* r[5] += t[8] */ \
144 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
145 "addq %%rcx, %%r9 \n\t" /* r[1] += t[4] */ \
146 "movq 24(" a "), %%rdx \n\t" /* r[2] += flag_c */ \
147 "mulxq %%rdx, %%rcx, %%r15 \n\t" /* (t[5], r[7]) <- a[3] * a[3] */ \
148 "movq 8(" a "), %%rdx \n\t" /* load a[1] into %rdx */ \
149 "mulxq %%rdx, %%rdi, %%rdx \n\t" /* (t[3], t[6]) <- a[1] * a[1] */ \
150 "adcq %%rdi, %%r10 \n\t" /* r[2] += t[3] */ \
151 "adcq %%rdx, %%r11 \n\t" /* r[3] += t[6] */ \
152 "adcq $0, %%r12 \n\t" /* r[4] += flag_c */ \
153 "addq %%rcx, %%r14 \n\t" /* r[6] += t[5] */ \
154 "adcq $0, %%r15 \n\t" /* r[7] += flag_c */ \
155 \
156 /* perform modular reduction: r[0] */ \
157 "movq %%r8, %%rdx \n\t" /* move r8 into %rdx */ \
158 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* (%rdx, _) <- k = r[9] * r_inv */ \
159 "mulxq %[modulus_0], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[0] * k) */ \
160 "addq %%rdi, %%r8 \n\t" /* r[0] += t[0] (%r8 now free) */ \
161 "adcq %%rcx, %%r9 \n\t" /* r[1] += t[1] + flag_c */ \
162 "mulxq %[modulus_1], %%rdi, %%rcx \n\t" /* (t[2], t[3]) <- (modulus[1] * k) */ \
163 "adcq %%rcx, %%r10 \n\t" /* r[2] += t[3] + flag_c */ \
164 "adcq $0, %%r11 \n\t" /* r[4] += flag_c */ \
165 /* Partial fix "adcq $0, %%r12 \n\t"*/ /* r[4] += flag_c */ \
166 "addq %%rdi, %%r9 \n\t" /* r[1] += t[2] */ \
167 "mulxq %[modulus_2], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[3] * k) */ \
168 "mulxq %[modulus_3], %%r8, %%rdx \n\t" /* (t[2], t[3]) <- (modulus[2] * k) */ \
169 "adcq %%rdi, %%r10 \n\t" /* r[2] += t[0] + flag_c */ \
170 "adcq %%rcx, %%r11 \n\t" /* r[3] += t[1] + flag_c */ \
171 "adcq %%rdx, %%r12 \n\t" /* r[4] += t[3] + flag_c */ \
172 "adcq $0, %%r13 \n\t" /* r[5] += flag_c */ \
173 "addq %%r8, %%r11 \n\t" /* r[3] += t[2] + flag_c */ \
174 "adcq $0, %%r12 \n\t" /* r[4] += flag_c */ \
175 \
176 /* perform modular reduction: r[1] */ \
177 "movq %%r9, %%rdx \n\t" /* move r9 into %rdx */ \
178 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* (%rdx, _) <- k = r[9] * r_inv */ \
179 "mulxq %[modulus_0], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[0] * k) */ \
180 "addq %%rdi, %%r9 \n\t" /* r[1] += t[0] (%r8 now free) */ \
181 "adcq %%rcx, %%r10 \n\t" /* r[2] += t[1] + flag_c */ \
182 "mulxq %[modulus_1], %%rdi, %%rcx \n\t" /* (t[2], t[3]) <- (modulus[1] * k) */ \
183 "adcq %%rcx, %%r11 \n\t" /* r[3] += t[3] + flag_c */ \
184 "adcq $0, %%r12 \n\t" /* r[4] += flag_c */ \
185 "addq %%rdi, %%r10 \n\t" /* r[2] += t[2] */ \
186 "mulxq %[modulus_2], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[3] * k) */ \
187 "mulxq %[modulus_3], %%r8, %%r9 \n\t" /* (t[2], t[3]) <- (modulus[2] * k) */ \
188 "adcq %%rdi, %%r11 \n\t" /* r[3] += t[0] + flag_c */ \
189 "adcq %%rcx, %%r12 \n\t" /* r[4] += t[1] + flag_c */ \
190 "adcq %%r9, %%r13 \n\t" /* r[5] += t[3] + flag_c */ \
191 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
192 "addq %%r8, %%r12 \n\t" /* r[4] += t[2] + flag_c */ \
193 "adcq $0, %%r13 \n\t" /* r[5] += flag_c */ \
194 \
195 /* perform modular reduction: r[2] */ \
196 "movq %%r10, %%rdx \n\t" /* move r10 into %rdx */ \
197 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* (%rdx, _) <- k = r[10] * r_inv */ \
198 "mulxq %[modulus_0], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[0] * k) */ \
199 "addq %%rdi, %%r10 \n\t" /* r[2] += t[0] (%r8 now free) */ \
200 "adcq %%rcx, %%r11 \n\t" /* r[3] += t[1] + flag_c */ \
201 "mulxq %[modulus_1], %%rdi, %%rcx \n\t" /* (t[2], t[3]) <- (modulus[1] * k) */ \
202 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus[3] * k) */ \
203 "mulxq %[modulus_3], %%r10, %%rdx \n\t" /* (t[2], t[3]) <- (modulus[2] * k) */ \
204 "adcq %%rcx, %%r12 \n\t" /* r[4] += t[3] + flag_c */ \
205 "adcq %%r9, %%r13 \n\t" /* r[5] += t[1] + flag_c */ \
206 "adcq %%rdx, %%r14 \n\t" /* r[6] += t[3] + flag_c */ \
207 "adcq $0, %%r15 \n\t" /* r[7] += flag_c */ \
208 "addq %%rdi, %%r11 \n\t" /* r[3] += t[2] */ \
209 "adcq %%r8, %%r12 \n\t" /* r[4] += t[0] + flag_c */ \
210 "adcq %%r10, %%r13 \n\t" /* r[5] += t[2] + flag_c */ \
211 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
212 \
213 /* perform modular reduction: r[3] */ \
214 "movq %%r11, %%rdx \n\t" /* move r11 into %rdx */ \
215 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* (%rdx, _) <- k = r[10] * r_inv */ \
216 "mulxq %[modulus_0], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[0] * k) */ \
217 "mulxq %[modulus_1], %%r8, %%r9 \n\t" /* (t[2], t[3]) <- (modulus[1] * k) */ \
218 "addq %%rdi, %%r11 \n\t" /* r[3] += t[0] (%r11 now free) */ \
219 "adcq %%r8, %%r12 \n\t" /* r[4] += t[2] */ \
220 "adcq %%r9, %%r13 \n\t" /* r[5] += t[3] + flag_c */ \
221 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus[3] * k) */ \
222 "mulxq %[modulus_3], %%r10, %%r11 \n\t" /* (t[2], t[3]) <- (modulus[2] * k) */ \
223 "adcq %%r9, %%r14 \n\t" /* r[6] += t[1] + flag_c */ \
224 "adcq %%r11, %%r15 \n\t" /* r[7] += t[3] + flag_c */ \
225 "addq %%rcx, %%r12 \n\t" /* r[4] += t[1] + flag_c */ \
226 "adcq %%r8, %%r13 \n\t" /* r[5] += t[0] + flag_c */ \
227 "adcq %%r10, %%r14 \n\t" /* r[6] += t[2] + flag_c */ \
228 "adcq $0, %%r15 \n\t" /* r[7] += flag_c */
229
230
235#define MUL(a1, a2, a3, a4, b) \
236 "movq " a1 ", %%rdx \n\t" /* load a[0] into %rdx */ \
237 "xorq %%r8, %%r8 \n\t" /* clear r10 register, we use this when we need 0 */ \
238 /* front-load mul ops, can parallelize 4 of these but latency is 4 cycles */ \
239 "mulxq 8(" b "), %%r8, %%r9 \n\t" /* (t[0], t[1]) <- a[0] * b[1] */ \
240 "mulxq 24(" b "), %%rdi, %%r12 \n\t" /* (t[2], r[4]) <- a[0] * b[3] (overwrite a[0]) */ \
241 "mulxq 0(" b "), %%r13, %%r14 \n\t" /* (r[0], r[1]) <- a[0] * b[0] */ \
242 "mulxq 16(" b "), %%r15, %%r10 \n\t" /* (r[2] , r[3]) <- a[0] * b[2] */ \
243 /* zero flags */ \
244 \
245 /* start computing modular reduction */ \
246 "movq %%r13, %%rdx \n\t" /* move r[0] into %rdx */ \
247 "mulxq %[r_inv], %%rdx, %%r11 \n\t" /* (%rdx, _) <- k = r[1] * r_inv */ \
248 \
249 /* start first addition chain */ \
250 "addq %%r8, %%r14 \n\t" /* r[1] += t[0] */ \
251 "adcq %%r9, %%r15 \n\t" /* r[2] += t[1] + flag_c */ \
252 "adcq %%rdi, %%r10 \n\t" /* r[3] += t[2] + flag_c */ \
253 "adcq $0, %%r12 \n\t" /* r[4] += flag_c */ \
254 \
255 /* reduce by r[0] * k */ \
256 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus.data[0] * k) */ \
257 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (t[0], t[1]) <- (modulus.data[1] * k) */ \
258 "addq %%r8, %%r13 \n\t" /* r[0] += t[0] (%r13 now free) */ \
259 "adcq %%rdi, %%r14 \n\t" /* r[1] += t[0] */ \
260 "adcq %%r11, %%r15 \n\t" /* r[2] += t[1] + flag_c */ \
261 "adcq $0, %%r10 \n\t" /* r[3] += flag_c */ \
262 "adcq $0, %%r12 \n\t" /* r[4] += flag_c */ \
263 "addq %%r9, %%r14 \n\t" /* r[1] += t[1] + flag_c */ \
264 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus.data[2] * k) */ \
265 "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (t[2], t[3]) <- (modulus.data[3] * k) */ \
266 "adcq %%r8, %%r15 \n\t" /* r[2] += t[0] + flag_c */ \
267 "adcq %%rdi, %%r10 \n\t" /* r[3] += t[2] + flag_c */ \
268 "adcq %%r11, %%r12 \n\t" /* r[4] += t[3] + flag_c */ \
269 "addq %%r9, %%r10 \n\t" /* r[3] += t[1] + flag_c */ \
270 "adcq $0, %%r12 \n\t" /* r[4] += flag_i */ \
271 \
272 /* modulus = 254 bits, so max(t[3]) = 62 bits */ \
273 /* b also 254 bits, so (a[0] * b[3]) = 62 bits */ \
274 /* i.e. carry flag here is always 0 if b is in mont form, no need to update r[5] */ \
275 /* (which is very convenient because we're out of registers!) */ \
276 /* N.B. the value of r[4] now has a max of 63 bits and can accept another 62 bit value before overflowing */ \
277 \
278 /* a[1] * b */ \
279 "movq " a2 ", %%rdx \n\t" /* load a[1] into %rdx */ \
280 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (a[1] * b[0]) */ \
281 "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (t[4], t[5]) <- (a[1] * b[1]) */ \
282 "addq %%r8, %%r14 \n\t" /* r[1] += t[0] + flag_c */ \
283 "adcq %%rdi, %%r15 \n\t" /* r[2] += t[0] + flag_c */ \
284 "adcq %%r11, %%r10 \n\t" /* r[3] += t[1] + flag_c */ \
285 "adcq $0, %%r12 \n\t" /* r[4] += flag_c */ \
286 "addq %%r9, %%r15 \n\t" /* r[2] += t[1] + flag_c */ \
287 \
288 "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (t[2], t[3]) <- (a[1] * b[2]) */ \
289 "mulxq 24(" b "), %%rdi, %%r13 \n\t" /* (t[6], r[5]) <- (a[1] * b[3]) */ \
290 "adcq %%r8, %%r10 \n\t" /* r[3] += t[0] + flag_c */ \
291 "adcq %%rdi, %%r12 \n\t" /* r[4] += t[2] + flag_c */ \
292 "adcq $0, %%r13 \n\t" /* r[5] += flag_c */ \
293 "addq %%r9, %%r12 \n\t" /* r[4] += t[1] + flag_c */ \
294 "adcq $0, %%r13 \n\t" /* r[5] += flag_c */ \
295 \
296 /* reduce by r[1] * k */ \
297 "movq %%r14, %%rdx \n\t" /* move r[1] into %rdx */ \
298 "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* (%rdx, _) <- k = r[1] * r_inv */ \
299 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus.data[0] * k) */ \
300 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (t[0], t[1]) <- (modulus.data[1] * k) */ \
301 "addq %%r8, %%r14 \n\t" /* r[1] += t[0] (%r14 now free) */ \
302 "adcq %%rdi, %%r15 \n\t" /* r[2] += t[0] + flag_c */ \
303 "adcq %%r11, %%r10 \n\t" /* r[3] += t[1] + flag_c */ \
304 "adcq $0, %%r12 \n\t" /* r[4] += flag_c */ \
305 "adcq $0, %%r13 \n\t" /* r[5] += flag_c */ \
306 "addq %%r9, %%r15 \n\t" /* r[2] += t[1] + flag_c */ \
307 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus.data[2] * k) */ \
308 "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (t[2], t[3]) <- (modulus.data[3] * k) */ \
309 "adcq %%r8, %%r10 \n\t" /* r[3] += t[0] + flag_c */ \
310 "adcq %%r9, %%r12 \n\t" /* r[4] += t[2] + flag_c */ \
311 "adcq %%r11, %%r13 \n\t" /* r[5] += t[3] + flag_c */ \
312 "addq %%rdi, %%r12 \n\t" /* r[4] += t[1] + flag_c */ \
313 "adcq $0, %%r13 \n\t" /* r[5] += flag_c */ \
314 \
315 /* a[2] * b */ \
316 "movq " a3 ", %%rdx \n\t" /* load a[2] into %rdx */ \
317 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (a[2] * b[0]) */ \
318 "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (t[0], t[1]) <- (a[2] * b[1]) */ \
319 "addq %%r8, %%r15 \n\t" /* r[2] += t[0] + flag_c */ \
320 "adcq %%r9, %%r10 \n\t" /* r[3] += t[1] + flag_c */ \
321 "adcq %%r11, %%r12 \n\t" /* r[4] += t[1] + flag_c */ \
322 "adcq $0, %%r13 \n\t" /* r[5] += flag_c */ \
323 "addq %%rdi, %%r10 \n\t" /* r[3] += t[0] + flag_c */ \
324 "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (a[2] * b[2]) */ \
325 "mulxq 24(" b "), %%rdi, %%r14 \n\t" /* (t[2], r[6]) <- (a[2] * b[3]) */ \
326 "adcq %%r8, %%r12 \n\t" /* r[4] += t[0] + flag_c */ \
327 "adcq %%r9, %%r13 \n\t" /* r[5] += t[2] + flag_c */ \
328 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
329 "addq %%rdi, %%r13 \n\t" /* r[5] += t[1] + flag_c */ \
330 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
331 \
332 /* reduce by r[2] * k */ \
333 "movq %%r15, %%rdx \n\t" /* move r[2] into %rdx */ \
334 "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* (%rdx, _) <- k = r[1] * r_inv */ \
335 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus.data[0] * k) */ \
336 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (t[0], t[1]) <- (modulus.data[1] * k) */ \
337 "addq %%r8, %%r15 \n\t" /* r[2] += t[0] (%r15 now free) */ \
338 "adcq %%r9, %%r10 \n\t" /* r[3] += t[0] + flag_c */ \
339 "adcq %%r11, %%r12 \n\t" /* r[4] += t[1] + flag_c */ \
340 "adcq $0, %%r13 \n\t" /* r[5] += flag_c */ \
341 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
342 "addq %%rdi, %%r10 \n\t" /* r[3] += t[1] + flag_c */ \
343 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus.data[2] * k) */ \
344 "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (t[2], t[3]) <- (modulus.data[3] * k) */ \
345 "adcq %%r8, %%r12 \n\t" /* r[4] += t[0] + flag_c */ \
346 "adcq %%r9, %%r13 \n\t" /* r[5] += t[2] + flag_c */ \
347 "adcq %%r11, %%r14 \n\t" /* r[6] += t[3] + flag_c */ \
348 "addq %%rdi, %%r13 \n\t" /* r[5] += t[1] + flag_c */ \
349 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
350 \
351 /* a[3] * b */ \
352 "movq " a4 ", %%rdx \n\t" /* load a[3] into %rdx */ \
353 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (a[3] * b[0]) */ \
354 "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (t[4], t[5]) <- (a[3] * b[1]) */ \
355 "addq %%r8, %%r10 \n\t" /* r[3] += t[0] + flag_c */ \
356 "adcq %%r9, %%r12 \n\t" /* r[4] += t[2] + flag_c */ \
357 "adcq %%r11, %%r13 \n\t" /* r[5] += t[3] + flag_c */ \
358 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
359 "addq %%rdi, %%r12 \n\t" /* r[4] += t[1] + flag_c */ \
360 \
361 "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (t[2], t[3]) <- (a[3] * b[2]) */ \
362 "mulxq 24(" b "), %%rdi, %%r15 \n\t" /* (t[6], r[7]) <- (a[3] * b[3]) */ \
363 "adcq %%r8, %%r13 \n\t" /* r[5] += t[4] + flag_c */ \
364 "adcq %%r9, %%r14 \n\t" /* r[6] += t[6] + flag_c */ \
365 "adcq $0, %%r15 \n\t" /* r[7] += + flag_c */ \
366 "addq %%rdi, %%r14 \n\t" /* r[6] += t[5] + flag_c */ \
367 "adcq $0, %%r15 \n\t" /* r[7] += flag_c */ \
368 \
369 /* reduce by r[3] * k */ \
370 "movq %%r10, %%rdx \n\t" /* move r_inv into %rdx */ \
371 "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* (%rdx, _) <- k = r[1] * r_inv */ \
372 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus.data[0] * k) */ \
373 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (t[2], t[3]) <- (modulus.data[1] * k) */ \
374 "addq %%r8, %%r10 \n\t" /* r[3] += t[0] (%rsi now free) */ \
375 "adcq %%r9, %%r12 \n\t" /* r[4] += t[2] + flag_c */ \
376 "adcq %%r11, %%r13 \n\t" /* r[5] += t[3] + flag_c */ \
377 "adcq $0, %%r14 \n\t" /* r[6] += flag_c */ \
378 "adcq $0, %%r15 \n\t" /* r[7] += flag_c */ \
379 "addq %%rdi, %%r12 \n\t" /* r[4] += t[1] + flag_c */ \
380 \
381 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (t[4], t[5]) <- (modulus.data[2] * k) */ \
382 "mulxq %[modulus_3], %%rdi, %%rdx \n\t" /* (t[6], t[7]) <- (modulus.data[3] * k) */ \
383 "adcq %%r8, %%r13 \n\t" /* r[5] += t[4] + flag_c */ \
384 "adcq %%r9, %%r14 \n\t" /* r[6] += t[6] + flag_c */ \
385 "adcq %%rdx, %%r15 \n\t" /* r[7] += t[7] + flag_c */ \
386 "addq %%rdi, %%r14 \n\t" /* r[6] += t[5] + flag_c */ \
387 "adcq $0, %%r15 \n\t" /* r[7] += flag_c */
388
389
/**
 * Compute the low 256 bits of the product of two 4-limb values and store them to memory.
 *
 * @param a string operand naming a register that holds the address of the first factor
 * @param b string operand naming a register that holds the address of the second factor
 * @param r string operand naming a register that holds the address the four result limbs are stored to
 *
 * Only the partial products that land in limbs 0..3 are computed; r[0]..r[3] are accumulated in
 * r13, r14, r15, rax and stored by the last four instructions, after every load from a and b.
 * Carries out of limb 3 belong to bit 256 and above and are deliberately dropped.
 *
 * Comment notation: lo(x)/hi(x) = low/high 64 bits of a 128-bit product, c = carry flag.
 * Writes rax, rdx, rdi, rsi, r8, r9, r10, r12-r15 and the flags.
 **/
#define MUL_256(a, b, r) \
        "movq 0(" a "), %%rdx \n\t" /* load a[0] into %rdx */ \
        \
        /* front-load mul ops, can parallelize 4 of these but latency is 4 cycles */ \
        "mulxq 8(" b "), %%r8, %%r9 \n\t" /* (r8, r9) <- a[0] * b[1] */ \
        "mulxq 24(" b "), %%rdi, %%r12 \n\t" /* (rdi, r12) <- a[0] * b[3]; r12 (high half) is unused */ \
        "mulxq 0(" b "), %%r13, %%r14 \n\t" /* (r[0], r[1]) <- a[0] * b[0] */ \
        "mulxq 16(" b "), %%r15, %%rax \n\t" /* (r[2], r[3]) <- a[0] * b[2] */ \
        /* zero flags */ \
        "xorq %%r10, %%r10 \n\t" /* clear r10 register, we use this when we need 0 */ \
        \
        /* start first addition chain */ \
        "addq %%r8, %%r14 \n\t" /* r[1] += lo(a[0]*b[1]) */ \
        "adcq %%r9, %%r15 \n\t" /* r[2] += hi(a[0]*b[1]) + c */ \
        "adcq %%r10, %%rax \n\t" /* r[3] += c */ \
        "addq %%rdi, %%rax \n\t" /* r[3] += lo(a[0]*b[3]); carry out dropped */ \
        \
        /* a[1] * b */ \
        "movq 8(" a "), %%rdx \n\t" /* load a[1] into %rdx */ \
        "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (r8, r9) <- a[1] * b[0] */ \
        "mulxq 8(" b "), %%rdi, %%rsi \n\t" /* (rdi, rsi) <- a[1] * b[1] */ \
        "addq %%r8, %%r14 \n\t" /* r[1] += lo(a[1]*b[0]) */ \
        "adcq %%r9, %%r15 \n\t" /* r[2] += hi(a[1]*b[0]) + c */ \
        "adcq %%rsi, %%rax \n\t" /* r[3] += hi(a[1]*b[1]) + c */ \
        "addq %%rdi, %%r15 \n\t" /* r[2] += lo(a[1]*b[1]) */ \
        \
        "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (r8, r9) <- a[1] * b[2]; r9 is unused */ \
        "adcq %%r8, %%rax \n\t" /* r[3] += lo(a[1]*b[2]) + c (carry out of r[2]) */ \
        \
        /* a[2] * b */ \
        "movq 16(" a "), %%rdx \n\t" /* load a[2] into %rdx */ \
        "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (r8, r9) <- a[2] * b[0] */ \
        "mulxq 8(" b "), %%rdi, %%rsi \n\t" /* (rdi, rsi) <- a[2] * b[1]; rsi is unused */ \
        "addq %%r8, %%r15 \n\t" /* r[2] += lo(a[2]*b[0]) */ \
        "adcq %%r9, %%rax \n\t" /* r[3] += hi(a[2]*b[0]) + c */ \
        "addq %%rdi, %%rax \n\t" /* r[3] += lo(a[2]*b[1]); carry out dropped */ \
        \
        /* a[3] * b */ \
        "movq 24(" a "), %%rdx \n\t" /* load a[3] into %rdx */ \
        "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (r8, r9) <- a[3] * b[0]; r9 is unused */ \
        /* The flag left by the previous addq is the carry OUT of r[3] (bit 256); it must not be */ \
        /* folded back into r[3], so this is a plain addq rather than adcq. */ \
        "addq %%r8, %%rax \n\t" /* r[3] += lo(a[3]*b[0]) */ \
        "movq %%r13, 0(" r ") \n\t" \
        "movq %%r14, 8(" r ") \n\t" \
        "movq %%r15, 16(" r ") \n\t" \
        "movq %%rax, 24(" r ") \n\t"

#else // 6047895us
/**
 * Add the 4-limb value at address `b` into the accumulator held in r12..r15 (ADX version).
 *
 * @param b string operand naming a register that holds the address of the addend
 *
 * Uses an adcx chain, which reads and writes only the carry flag. Unlike add/adc, the FIRST
 * instruction also consumes the incoming CF, so CF must be clear on entry for a plain addition
 * (e.g. after an xor that clears the flags). The carry out of the top limb is left in CF.
 * Modifies r12-r15 and CF.
 **/
#define ADD(b) \
        "adcxq 0(" b "), %%r12 \n\t" \
        "adcxq 8(" b "), %%r13 \n\t" \
        "adcxq 16(" b "), %%r14 \n\t" \
        "adcxq 24(" b "), %%r15 \n\t"

/**
 * Subtract the 4-limb value at address `b` from the accumulator held in r12..r15 (r12 is the lowest limb).
 *
 * @param b string operand naming a register that holds the address of the subtrahend
 *
 * sub/sbb borrow chain (there is no ADX form of subtraction): the borrow out of the top limb is
 * left in CF. No modular correction is applied here. Modifies r12-r15 and the flags.
 **/
#define SUB(b) \
        "subq 0(" b "), %%r12 \n\t" \
        "sbbq 8(" b "), %%r13 \n\t" \
        "sbbq 16(" b "), %%r14 \n\t" \
        "sbbq 24(" b "), %%r15 \n\t"

/**
 * Add the 4-limb value at address `b` into r12..r15, then apply one conditional correction (ADX version).
 *
 * @param b                     string operand naming a register that holds the address of the addend
 * @param modulus_0..modulus_3  string operands that are ADDED limb-wise to the sum
 *
 * Two independent carry chains are interleaved limb by limb:
 *   - adcx (carry flag):    r[i] += b[i]; the sum limb is then copied to r8..r11
 *   - adox (overflow flag): r[i] += modulus_i
 * If the adox chain does not overflow out of the top limb (OF = 0), cmovno restores the saved sum.
 * Both chains consume their incoming flag in the first instruction, so CF and OF must be clear on entry.
 *
 * NOTE(review): because the operands are added, this only acts as "subtract the modulus if sum >= modulus"
 * when callers pass the limbs of (2^256 - modulus); confirm at the call sites.
 *
 * Modifies r8-r15, CF and OF.
 **/
#define ADD_REDUCE(b, modulus_0, modulus_1, modulus_2, modulus_3) \
        "adcxq 0(" b "), %%r12 \n\t" /* r[0] += b[0] + CF */ \
        "movq %%r12, %%r8 \n\t" /* save unreduced r[0] */ \
        "adoxq " modulus_0 ", %%r12 \n\t" /* r'[0] = r[0] + modulus_0 + OF */ \
        "adcxq 8(" b "), %%r13 \n\t" /* r[1] += b[1] + CF */ \
        "movq %%r13, %%r9 \n\t" /* save unreduced r[1] */ \
        "adoxq " modulus_1 ", %%r13 \n\t" /* r'[1] = r[1] + modulus_1 + OF */ \
        "adcxq 16(" b "), %%r14 \n\t" /* r[2] += b[2] + CF */ \
        "movq %%r14, %%r10 \n\t" /* save unreduced r[2] */ \
        "adoxq " modulus_2 ", %%r14 \n\t" /* r'[2] = r[2] + modulus_2 + OF */ \
        "adcxq 24(" b "), %%r15 \n\t" /* r[3] += b[3] + CF */ \
        "movq %%r15, %%r11 \n\t" /* save unreduced r[3] */ \
        "adoxq " modulus_3 ", %%r15 \n\t" /* r'[3] = r[3] + modulus_3 + OF */ \
        /* OF = 0 => restore the saved sum */ \
        "cmovnoq %%r8, %%r12 \n\t" \
        "cmovnoq %%r9, %%r13 \n\t" \
        "cmovnoq %%r10, %%r14 \n\t" \
        "cmovnoq %%r11, %%r15 \n\t"

/**
 * Conditionally subtract the modulus from the 4-limb value held in r12..r15 (ADX version).
 *
 * @param neg_modulus_0..neg_modulus_3 string operands holding the limbs of the negated modulus
 *
 * The value is copied to r8..r11, the negated modulus is added to r12..r15 with an adox chain, and
 * the copy is restored when that chain does not overflow out of the top limb (OF = 0), i.e. when
 * the value was already smaller than the modulus.
 * The first adox consumes the incoming overflow flag, so OF must be clear on entry.
 * Modifies r8-r15 and OF.
 **/
#define REDUCE_FIELD_ELEMENT(neg_modulus_0, neg_modulus_1, neg_modulus_2, neg_modulus_3) \
        /* Duplicate `r` */ \
        "movq %%r12, %%r8 \n\t" \
        "movq %%r13, %%r9 \n\t" \
        "movq %%r14, %%r10 \n\t" \
        "movq %%r15, %%r11 \n\t" \
        /* Add the negative representation of 'modulus' into `r`. We do this instead */ \
        /* of subtracting, because we can use `adoxq`. */ \
        /* This opcode only has a dependence on the overflow */ \
        /* flag (sub/sbb changes both carry and overflow flags). */ \
        /* We can process an `adcxq` and `adoxq` opcode simultaneously. */ \
        "adoxq " neg_modulus_0 ", %%r12 \n\t" /* r'[0] -= modulus.data[0] */ \
        "adoxq " neg_modulus_1 ", %%r13 \n\t" /* r'[1] -= modulus.data[1] */ \
        "adoxq " neg_modulus_2 ", %%r14 \n\t" /* r'[2] -= modulus.data[2] */ \
        "adoxq " neg_modulus_3 ", %%r15 \n\t" /* r'[3] -= modulus.data[3] */ \
        \
        /* if r does not need to be reduced, the adox chain does not overflow (OF = 0) */ \
        /* set r' = r when the overflow flag is clear */ \
        "cmovnoq %%r8, %%r12 \n\t" \
        "cmovnoq %%r9, %%r13 \n\t" \
        "cmovnoq %%r10, %%r14 \n\t" \
        "cmovnoq %%r11, %%r15 \n\t"

519#define SQR(a) \
520 "movq 0(" a "), %%rdx \n\t" /* load a[0] into %rdx */ \
521 \
522 "xorq %%r8, %%r8 \n\t" /* clear flags */ \
523 /* compute a[0] *a[1], a[0]*a[2], a[0]*a[3], a[1]*a[2], a[1]*a[3], a[2]*a[3] */ \
524 "mulxq 8(" a "), %%r9, %%r10 \n\t" /* (r[1], r[2]) <- a[0] * a[1] */ \
525 "mulxq 16(" a "), %%r8, %%r15 \n\t" /* (t[1], t[2]) <- a[0] * a[2] */ \
526 "mulxq 24(" a "), %%r11, %%r12 \n\t" /* (r[3], r[4]) <- a[0] * a[3] */ \
527 \
528 \
529 /* accumulate products into result registers */ \
530 "adoxq %%r8, %%r10 \n\t" /* r[2] += t[1] */ \
531 "adcxq %%r15, %%r11 \n\t" /* r[3] += t[2] */ \
532 "movq 8(" a "), %%rdx \n\t" /* load a[1] into %r%dx */ \
533 "mulxq 16(" a "), %%r8, %%r15 \n\t" /* (t[5], t[6]) <- a[1] * a[2] */ \
534 "mulxq 24(" a "), %%rdi, %%rcx \n\t" /* (t[3], t[4]) <- a[1] * a[3] */ \
535 "movq 24(" a "), %%rdx \n\t" /* load a[3] into %%rdx */ \
536 "mulxq 16(" a "), %%r13, %%r14 \n\t" /* (r[5], r[6]) <- a[3] * a[2] */ \
537 "adoxq %%r8, %%r11 \n\t" /* r[3] += t[5] */ \
538 "adcxq %%rdi, %%r12 \n\t" /* r[4] += t[3] */ \
539 "adoxq %%r15, %%r12 \n\t" /* r[4] += t[6] */ \
540 "adcxq %%rcx, %%r13 \n\t" /* r[5] += t[4] + flag_o */ \
541 "adoxq %[zero_reference], %%r13 \n\t" /* r[5] += flag_o */ \
542 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += flag_c */ \
543 "adoxq %[zero_reference], %%r14 \n\t" /* r[6] += flag_o */ \
544 \
545 /* double result registers */ \
546 "adoxq %%r9, %%r9 \n\t" /* r[1] = 2r[1] */ \
547 "adcxq %%r12, %%r12 \n\t" /* r[4] = 2r[4] */ \
548 "adoxq %%r10, %%r10 \n\t" /* r[2] = 2r[2] */ \
549 "adcxq %%r13, %%r13 \n\t" /* r[5] = 2r[5] */ \
550 "adoxq %%r11, %%r11 \n\t" /* r[3] = 2r[3] */ \
551 "adcxq %%r14, %%r14 \n\t" /* r[6] = 2r[6] */ \
552 \
553 /* compute a[3]*a[3], a[2]*a[2], a[1]*a[1], a[0]*a[0] */ \
554 "movq 0(" a "), %%rdx \n\t" /* load a[0] into %rdx */ \
555 "mulxq %%rdx, %%r8, %%rcx \n\t" /* (r[0], t[4]) <- a[0] * a[0] */ \
556 "movq 16(" a "), %%rdx \n\t" /* load a[2] into %rdx */ \
557 "mulxq %%rdx, %%rdx, %%rdi \n\t" /* (t[7], t[8]) <- a[2] * a[2] */ \
558 /* add squares into result registers */ \
559 "adcxq %%rcx, %%r9 \n\t" /* r[1] += t[4] */ \
560 "adoxq %%rdx, %%r12 \n\t" /* r[4] += t[7] */ \
561 "adoxq %%rdi, %%r13 \n\t" /* r[5] += t[8] */ \
562 "movq 24(" a "), %%rdx \n\t" /* load a[3] into %rdx */ \
563 "mulxq %%rdx, %%rcx, %%r15 \n\t" /* (t[5], r[7]) <- a[3] * a[3] */ \
564 "movq 8(" a "), %%rdx \n\t" /* load a[1] into %rdx */ \
565 "mulxq %%rdx, %%rdi, %%rdx \n\t" /* (t[3], t[6]) <- a[1] * a[1] */ \
566 "adcxq %%rdi, %%r10 \n\t" /* r[2] += t[3] */ \
567 "adcxq %%rdx, %%r11 \n\t" /* r[3] += t[6] */ \
568 "adoxq %%rcx, %%r14 \n\t" /* r[6] += t[5] */ \
569 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_o */ \
570 \
571 /* perform modular reduction: r[0] */ \
572 "movq %%r8, %%rdx \n\t" /* move r8 into %rdx */ \
573 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* (%rdx, _) <- k = r[9] * r_inv */ \
574 "mulxq %[modulus_0], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[0] * k) */ \
575 "adoxq %%rdi, %%r8 \n\t" /* r[0] += t[0] (%r8 now free) */ \
576 "mulxq %[modulus_3], %%r8, %%rdi \n\t" /* (t[2], t[3]) <- (modulus[2] * k) */ \
577 "adcxq %%rdi, %%r12 \n\t" /* r[4] += t[3] + flag_c */ \
578 "adoxq %%rcx, %%r9 \n\t" /* r[1] += t[1] + flag_o */ \
579 "adcxq %[zero_reference], %%r13 \n\t" /* r[5] += flag_c */ \
580 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += flag_c */ \
581 "mulxq %[modulus_1], %%rdi, %%rcx \n\t" /* (t[2], t[3]) <- (modulus[1] * k) */ \
582 "adcxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_c */ \
583 "adoxq %%rcx, %%r10 \n\t" /* r[2] += t[3] + flag_o */ \
584 "adcxq %%rdi, %%r9 \n\t" /* r[1] += t[2] */ \
585 "adoxq %%r8, %%r11 \n\t" /* r[3] += t[2] + flag_o */ \
586 "mulxq %[modulus_2], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[3] * k) */ \
587 "adcxq %%rdi, %%r10 \n\t" /* r[2] += t[0] + flag_c */ \
588 "adcxq %%rcx, %%r11 \n\t" /* r[3] += t[1] + flag_c */ \
589 \
590 /* perform modular reduction: r[1] */ \
591 "movq %%r9, %%rdx \n\t" /* move r9 into %rdx */ \
592 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* (%rdx, _) <- k = r[9] * r_inv */ \
593 "mulxq %[modulus_2], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[3] * k) */ \
594 "adoxq %%rcx, %%r12 \n\t" /* r[4] += t[1] + flag_c */ \
595 "mulxq %[modulus_3], %%r8, %%rcx \n\t" /* (t[2], t[3]) <- (modulus[2] * k) */ \
596 "adcxq %%r8, %%r12 \n\t" /* r[4] += t[2] + flag_o */ \
597 "adoxq %%rcx, %%r13 \n\t" /* r[5] += t[3] + flag_o */ \
598 "adcxq %[zero_reference], %%r13 \n\t" /* r[5] += flag_c */ \
599 "adoxq %[zero_reference], %%r14 \n\t" /* r[6] += flag_o */ \
600 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += flag_c */ \
601 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_o */ \
602 "adcxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_c */ \
603 "mulxq %[modulus_0], %%r8, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[0] * k) */ \
604 "adcxq %%r8, %%r9 \n\t" /* r[1] += t[0] (%r9 now free) */ \
605 "adoxq %%rcx, %%r10 \n\t" /* r[2] += t[1] + flag_c */ \
606 "mulxq %[modulus_1], %%r8, %%rcx \n\t" /* (t[2], t[3]) <- (modulus[1] * k) */ \
607 "adcxq %%r8, %%r10 \n\t" /* r[2] += t[2] */ \
608 "adoxq %%rcx, %%r11 \n\t" /* r[3] += t[3] + flag_o */ \
609 "adcxq %%rdi, %%r11 \n\t" /* r[3] += t[0] + flag_c */ \
610 \
611 /* perform modular reduction: r[2] */ \
612 "movq %%r10, %%rdx \n\t" /* move r10 into %rdx */ \
613 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* (%rdx, _) <- k = r[10] * r_inv */ \
614 "mulxq %[modulus_1], %%rdi, %%rcx \n\t" /* (t[2], t[3]) <- (modulus[1] * k) */ \
615 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus[3] * k) */ \
616 "adoxq %%rcx, %%r12 \n\t" /* r[4] += t[3] + flag_o */ \
617 "adcxq %%r8, %%r12 \n\t" /* r[4] += t[0] + flag_o */ \
618 "adoxq %%r9, %%r13 \n\t" /* r[5] += t[1] + flag_o */ \
619 "mulxq %[modulus_3], %%r8, %%r9 \n\t" /* (t[2], t[3]) <- (modulus[2] * k) */ \
620 "adcxq %%r8, %%r13 \n\t" /* r[5] += t[2] + flag_c */ \
621 "adoxq %%r9, %%r14 \n\t" /* r[6] += t[3] + flag_c */ \
622 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += flag_o */ \
623 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_o */ \
624 "adcxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_c */ \
625 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus[0] * k) */ \
626 "adcxq %%r8, %%r10 \n\t" /* r[2] += t[0] (%r10 now free) */ \
627 "adoxq %%r9, %%r11 \n\t" /* r[3] += t[1] + flag_c */ \
628 "adcxq %%rdi, %%r11 \n\t" /* r[3] += t[2] */ \
629 "adoxq %[zero_reference], %%r12 \n\t" /* r[4] += flag_o */ \
630 "adoxq %[zero_reference], %%r13 \n\t" /* r[5] += flag_o */ \
631 \
632 /* perform modular reduction: r[3] */ \
633 "movq %%r11, %%rdx \n\t" /* move r11 into %rdx */ \
634 "mulxq %[r_inv], %%rdx, %%rdi \n\t" /* (%rdx, _) <- k = r[10] * r_inv */ \
635 "mulxq %[modulus_0], %%rdi, %%rcx \n\t" /* (t[0], t[1]) <- (modulus[0] * k) */ \
636 "mulxq %[modulus_1], %%r8, %%r9 \n\t" /* (t[2], t[3]) <- (modulus[1] * k) */ \
637 "adoxq %%rdi, %%r11 \n\t" /* r[3] += t[0] (%r11 now free) */ \
638 "adcxq %%r8, %%r12 \n\t" /* r[4] += t[2] */ \
639 "adoxq %%rcx, %%r12 \n\t" /* r[4] += t[1] + flag_o */ \
640 "adcxq %%r9, %%r13 \n\t" /* r[5] += t[3] + flag_c */ \
641 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (t[0], t[1]) <- (modulus[3] * k) */ \
642 "mulxq %[modulus_3], %%r10, %%r11 \n\t" /* (t[2], t[3]) <- (modulus[2] * k) */ \
643 "adoxq %%r8, %%r13 \n\t" /* r[5] += t[0] + flag_o */ \
644 "adcxq %%r10, %%r14 \n\t" /* r[6] += t[2] + flag_c */ \
645 "adoxq %%r9, %%r14 \n\t" /* r[6] += t[1] + flag_o */ \
646 "adcxq %%r11, %%r15 \n\t" /* r[7] += t[3] + flag_c */ \
647 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_o */
648
/*
 * MUL(a1, a2, a3, a4, b): inline-asm fragment (AT&T syntax) that interleaves a 4x4-limb
 * schoolbook multiplication a * b with four word-by-word reduction rounds.
 * NOTE(review): this looks like Montgomery reduction with %[r_inv] = -modulus^-1 mod 2^64;
 * the operand values are supplied by the enclosing asm statement - confirm with the caller.
 *
 * a1..a4  operand strings for the four 64-bit limbs a[0]..a[3]; each is loaded into %rdx via movq.
 * b       operand string naming a base register; limbs b[0]..b[3] are read at offsets 0, 8, 16, 24.
 * The enclosing asm statement must provide the named operands %[r_inv], %[modulus_0] .. %[modulus_3]
 * and %[zero_reference]. %[zero_reference] is only used to fold a pending carry flag into a limb,
 * so it presumably holds 0 - TODO confirm.
 *
 * Result: accumulator limbs r[4]..r[7] are left in %r12, %r13, %r14, %r15.
 * No conditional subtraction of the modulus is performed inside this macro.
 * Overwrites: %rdx, %rdi, %r8-%r15 and the flags.
 *
 * Notation used in the per-line comments:
 *   lo(x) / hi(x)  low / high 64 bits of a 128-bit product (mulxq src, lo, hi)
 *   flag_c         CF, the carry chain driven by adcxq
 *   flag_o         OF, the carry chain driven by adoxq
 *   modulus[i]     the %[modulus_i] operand
 *   r[0]=%r13, r[1]=%r14, r[2]=%r15, r[3]=%r10, r[4]=%r12; %r13/%r14/%r15 are later recycled
 *   as r[5]/r[6]/r[7].
 */
653#define MUL(a1, a2, a3, a4, b) \
654 "movq " a1 ", %%rdx \n\t" /* %rdx <- a[0] (implicit mulx multiplicand) */ \
655 "xorq %%r8, %%r8 \n\t" /* clears CF and OF for the adcx/adox chains; %r8 itself is overwritten by a mulx below */ \
656 /* front-load the four a[0] * b[i] products; mulx does not modify the flags */ \
657 "mulxq 0(" b "), %%r13, %%r14 \n\t" /* (r[0], r[1]) <- (lo, hi) of a[0] * b[0] */ \
658 "mulxq 8(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[0] * b[1] */ \
659 "mulxq 16(" b "), %%r15, %%r10 \n\t" /* (r[2], r[3]) <- (lo, hi) of a[0] * b[2] */ \
660 "mulxq 24(" b "), %%rdi, %%r12 \n\t" /* (%rdi, r[4]) <- (lo, hi) of a[0] * b[3] */ \
661 /* flags are still zero here: cleared by the xorq above, untouched by mulx and movq */ \
662 \
663 /* start computing modular reduction */ \
664 "movq %%r13, %%rdx \n\t" /* %rdx <- r[0] */ \
665 "mulxq %[r_inv], %%rdx, %%r11 \n\t" /* %rdx <- k = lo(r[0] * r_inv); the high half in %r11 is discarded */ \
666 \
667 /* start first addition chain */ \
668 "adcxq %%r8, %%r14 \n\t" /* r[1] += lo(a[0] * b[1]) (starts the flag_c chain) */ \
669 "adoxq %%rdi, %%r10 \n\t" /* r[3] += lo(a[0] * b[3]) (starts the flag_o chain) */ \
670 "adcxq %%r9, %%r15 \n\t" /* r[2] += hi(a[0] * b[1]) + flag_c */ \
671 \
672 /* reduction round 0: accumulate k * modulus, k derived from r[0] */ \
673 "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[3] */ \
674 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[0] */ \
675 "adcxq %%rdi, %%r10 \n\t" /* r[3] += lo(k * modulus[3]) + flag_c */ \
676 "adoxq %%r11, %%r12 \n\t" /* r[4] += hi(k * modulus[3]) + flag_o */ \
677 "adcxq %[zero_reference], %%r12 \n\t" /* r[4] += flag_c */ \
678 "adoxq %%r8, %%r13 \n\t" /* r[0] += lo(k * modulus[0]) + flag_o (%r13 is recycled as r[5] below) */ \
679 "adcxq %%r9, %%r14 \n\t" /* r[1] += hi(k * modulus[0]) + flag_c */ \
680 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[1] */ \
681 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[2] */ \
682 "adoxq %%rdi, %%r14 \n\t" /* r[1] += lo(k * modulus[1]) + flag_o */ \
683 "adcxq %%r11, %%r15 \n\t" /* r[2] += hi(k * modulus[1]) + flag_c */ \
684 "adoxq %%r8, %%r15 \n\t" /* r[2] += lo(k * modulus[2]) + flag_o */ \
685 "adcxq %%r9, %%r10 \n\t" /* r[3] += hi(k * modulus[2]) + flag_c */ \
686 \
687 /* modulus = 254 bits, so max(t[3]) = 62 bits */ \
688 /* b also 254 bits, so (a[0] * b[3]) = 62 bits */ \
689 /* i.e. carry flag here is always 0 if b is in mont form, no need to update r[5] */ \
690 /* (which is very convenient because we're out of registers!) */ \
691 /* N.B. the value of r[4] now has a max of 63 bits and can accept another 62 bit value before overflowing */ \
692 \
693 /* a[1] * b */ \
694 "movq " a2 ", %%rdx \n\t" /* %rdx <- a[1] */ \
695 "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[1] * b[2] */ \
696 "mulxq 24(" b "), %%rdi, %%r13 \n\t" /* (%rdi, r[5]) <- (lo, hi) of a[1] * b[3]; r[5] lives in %r13 */ \
697 "adoxq %%r8, %%r10 \n\t" /* r[3] += lo(a[1] * b[2]) + flag_o */ \
698 "adcxq %%rdi, %%r12 \n\t" /* r[4] += lo(a[1] * b[3]) + flag_c */ \
699 "adoxq %%r9, %%r12 \n\t" /* r[4] += hi(a[1] * b[2]) + flag_o */ \
700 "adcxq %[zero_reference], %%r13 \n\t" /* r[5] += flag_c */ \
701 "adoxq %[zero_reference], %%r13 \n\t" /* r[5] += flag_o */ \
702 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[1] * b[0] */ \
703 "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of a[1] * b[1] */ \
704 "adcxq %%r8, %%r14 \n\t" /* r[1] += lo(a[1] * b[0]) + flag_c */ \
705 "adoxq %%r9, %%r15 \n\t" /* r[2] += hi(a[1] * b[0]) + flag_o */ \
706 "adcxq %%rdi, %%r15 \n\t" /* r[2] += lo(a[1] * b[1]) + flag_c */ \
707 "adoxq %%r11, %%r10 \n\t" /* r[3] += hi(a[1] * b[1]) + flag_o */ \
708 \
709 /* reduction round 1: accumulate k * modulus, k derived from r[1] */ \
710 "movq %%r14, %%rdx \n\t" /* %rdx <- r[1] */ \
711 "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* %rdx <- k = lo(r[1] * r_inv); the high half in %r8 is discarded */ \
712 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[2] */ \
713 "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[3] */ \
714 "adcxq %%r8, %%r10 \n\t" /* r[3] += lo(k * modulus[2]) + flag_c */ \
715 "adoxq %%r9, %%r12 \n\t" /* r[4] += hi(k * modulus[2]) + flag_o */ \
716 "adcxq %%rdi, %%r12 \n\t" /* r[4] += lo(k * modulus[3]) + flag_c */ \
717 "adoxq %%r11, %%r13 \n\t" /* r[5] += hi(k * modulus[3]) + flag_o */ \
718 "adcxq %[zero_reference], %%r13 \n\t" /* r[5] += flag_c */ \
719 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[0] */ \
720 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[1] */ \
721 "adoxq %%r8, %%r14 \n\t" /* r[1] += lo(k * modulus[0]) + flag_o (%r14 is recycled as r[6] below) */ \
722 "adcxq %%rdi, %%r15 \n\t" /* r[2] += lo(k * modulus[1]) + flag_c */ \
723 "adoxq %%r9, %%r15 \n\t" /* r[2] += hi(k * modulus[0]) + flag_o */ \
724 "adcxq %%r11, %%r10 \n\t" /* r[3] += hi(k * modulus[1]) + flag_c */ \
725 \
726 /* a[2] * b */ \
727 "movq " a3 ", %%rdx \n\t" /* %rdx <- a[2] */ \
728 "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of a[2] * b[1] */ \
729 "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[2] * b[2] */ \
730 "adoxq %%rdi, %%r10 \n\t" /* r[3] += lo(a[2] * b[1]) + flag_o */ \
731 "adcxq %%r11, %%r12 \n\t" /* r[4] += hi(a[2] * b[1]) + flag_c */ \
732 "adoxq %%r8, %%r12 \n\t" /* r[4] += lo(a[2] * b[2]) + flag_o */ \
733 "adcxq %%r9, %%r13 \n\t" /* r[5] += hi(a[2] * b[2]) + flag_c */ \
734 "mulxq 24(" b "), %%rdi, %%r14 \n\t" /* (%rdi, r[6]) <- (lo, hi) of a[2] * b[3]; r[6] lives in %r14 */ \
735 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[2] * b[0] */ \
736 "adoxq %%rdi, %%r13 \n\t" /* r[5] += lo(a[2] * b[3]) + flag_o */ \
737 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += flag_c */ \
738 "adoxq %[zero_reference], %%r14 \n\t" /* r[6] += flag_o */ \
739 "adcxq %%r8, %%r15 \n\t" /* r[2] += lo(a[2] * b[0]) + flag_c */ \
740 "adoxq %%r9, %%r10 \n\t" /* r[3] += hi(a[2] * b[0]) + flag_o */ \
741 \
742 /* reduction round 2: accumulate k * modulus, k derived from r[2] */ \
743 "movq %%r15, %%rdx \n\t" /* %rdx <- r[2] */ \
744 "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* %rdx <- k = lo(r[2] * r_inv); the high half in %r8 is discarded */ \
745 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[1] */ \
746 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[2] */ \
747 "adcxq %%rdi, %%r10 \n\t" /* r[3] += lo(k * modulus[1]) + flag_c */ \
748 "adoxq %%r11, %%r12 \n\t" /* r[4] += hi(k * modulus[1]) + flag_o */ \
749 "adcxq %%r8, %%r12 \n\t" /* r[4] += lo(k * modulus[2]) + flag_c */ \
750 "adoxq %%r9, %%r13 \n\t" /* r[5] += hi(k * modulus[2]) + flag_o */ \
751 "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[3] */ \
752 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[0] */ \
753 "adcxq %%rdi, %%r13 \n\t" /* r[5] += lo(k * modulus[3]) + flag_c */ \
754 "adoxq %%r11, %%r14 \n\t" /* r[6] += hi(k * modulus[3]) + flag_o */ \
755 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += flag_c */ \
756 "adoxq %%r8, %%r15 \n\t" /* r[2] += lo(k * modulus[0]) + flag_o (%r15 is recycled as r[7] below) */ \
757 "adcxq %%r9, %%r10 \n\t" /* r[3] += hi(k * modulus[0]) + flag_c */ \
758 \
759 /* a[3] * b */ \
760 "movq " a4 ", %%rdx \n\t" /* %rdx <- a[3] */ \
761 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[3] * b[0] */ \
762 "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of a[3] * b[1] */ \
763 "adoxq %%r8, %%r10 \n\t" /* r[3] += lo(a[3] * b[0]) + flag_o */ \
764 "adcxq %%r9, %%r12 \n\t" /* r[4] += hi(a[3] * b[0]) + flag_c */ \
765 "adoxq %%rdi, %%r12 \n\t" /* r[4] += lo(a[3] * b[1]) + flag_o */ \
766 "adcxq %%r11, %%r13 \n\t" /* r[5] += hi(a[3] * b[1]) + flag_c */ \
767 \
768 "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[3] * b[2] */ \
769 "mulxq 24(" b "), %%rdi, %%r15 \n\t" /* (%rdi, r[7]) <- (lo, hi) of a[3] * b[3]; r[7] lives in %r15 */ \
770 "adoxq %%r8, %%r13 \n\t" /* r[5] += lo(a[3] * b[2]) + flag_o */ \
771 "adcxq %%r9, %%r14 \n\t" /* r[6] += hi(a[3] * b[2]) + flag_c */ \
772 "adoxq %%rdi, %%r14 \n\t" /* r[6] += lo(a[3] * b[3]) + flag_o */ \
773 "adcxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_c */ \
774 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_o */ \
775 \
776 /* reduction round 3: accumulate k * modulus, k derived from r[3] */ \
777 "movq %%r10, %%rdx \n\t" /* %rdx <- r[3] */ \
778 "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* %rdx <- k = lo(r[3] * r_inv); the high half in %r8 is discarded */ \
779 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[0] */ \
780 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[1] */ \
781 "adoxq %%r8, %%r10 \n\t" /* r[3] += lo(k * modulus[0]) + flag_o (%r10 is not read again) */ \
782 "adcxq %%r9, %%r12 \n\t" /* r[4] += hi(k * modulus[0]) + flag_c */ \
783 "adoxq %%rdi, %%r12 \n\t" /* r[4] += lo(k * modulus[1]) + flag_o */ \
784 "adcxq %%r11, %%r13 \n\t" /* r[5] += hi(k * modulus[1]) + flag_c */ \
785 \
786 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[2] */ \
787 "mulxq %[modulus_3], %%rdi, %%rdx \n\t" /* (%rdi, %rdx) <- (lo, hi) of k * modulus[3]; overwrites k after its last use */ \
788 "adoxq %%r8, %%r13 \n\t" /* r[5] += lo(k * modulus[2]) + flag_o */ \
789 "adcxq %%r9, %%r14 \n\t" /* r[6] += hi(k * modulus[2]) + flag_c */ \
790 "adoxq %%rdi, %%r14 \n\t" /* r[6] += lo(k * modulus[3]) + flag_o */ \
791 "adcxq %%rdx, %%r15 \n\t" /* r[7] += hi(k * modulus[3]) + flag_c */ \
792 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_o */
793
/*
 * MUL_FOO(a1, a2, a3, a4, b): inline-asm fragment (AT&T syntax) whose instruction sequence is
 * the same as that of the MUL macro in this file: a 4x4-limb schoolbook multiplication a * b
 * interleaved with four word-by-word reduction rounds.
 * NOTE(review): this looks like Montgomery reduction with %[r_inv] = -modulus^-1 mod 2^64;
 * the operand values are supplied by the enclosing asm statement - confirm with the caller.
 *
 * a1..a4  operand strings for the four 64-bit limbs a[0]..a[3]; each is loaded into %rdx via movq.
 * b       operand string naming a base register; limbs b[0]..b[3] are read at offsets 0, 8, 16, 24.
 * The enclosing asm statement must provide the named operands %[r_inv], %[modulus_0] .. %[modulus_3]
 * and %[zero_reference]. %[zero_reference] is only used to fold a pending carry flag into a limb,
 * so it presumably holds 0 - TODO confirm.
 *
 * Result: accumulator limbs r[4]..r[7] are left in %r12, %r13, %r14, %r15.
 * No conditional subtraction of the modulus is performed inside this macro.
 * Overwrites: %rdx, %rdi, %r8-%r15 and the flags.
 *
 * Notation used in the per-line comments:
 *   lo(x) / hi(x)  low / high 64 bits of a 128-bit product (mulxq src, lo, hi)
 *   flag_c         CF, the carry chain driven by adcxq
 *   flag_o         OF, the carry chain driven by adoxq
 *   modulus[i]     the %[modulus_i] operand
 *   r[0]=%r13, r[1]=%r14, r[2]=%r15, r[3]=%r10, r[4]=%r12; %r13/%r14/%r15 are later recycled
 *   as r[5]/r[6]/r[7].
 */
798#define MUL_FOO(a1, a2, a3, a4, b) \
799 "movq " a1 ", %%rdx \n\t" /* %rdx <- a[0] (implicit mulx multiplicand) */ \
800 "xorq %%r8, %%r8 \n\t" /* clears CF and OF for the adcx/adox chains; %r8 itself is overwritten by a mulx below */ \
801 /* front-load the four a[0] * b[i] products; mulx does not modify the flags */ \
802 "mulxq 0(" b "), %%r13, %%r14 \n\t" /* (r[0], r[1]) <- (lo, hi) of a[0] * b[0] */ \
803 "mulxq 8(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[0] * b[1] */ \
804 "mulxq 16(" b "), %%r15, %%r10 \n\t" /* (r[2], r[3]) <- (lo, hi) of a[0] * b[2] */ \
805 "mulxq 24(" b "), %%rdi, %%r12 \n\t" /* (%rdi, r[4]) <- (lo, hi) of a[0] * b[3] */ \
806 /* flags are still zero here: cleared by the xorq above, untouched by mulx and movq */ \
807 \
808 /* start computing modular reduction */ \
809 "movq %%r13, %%rdx \n\t" /* %rdx <- r[0] */ \
810 "mulxq %[r_inv], %%rdx, %%r11 \n\t" /* %rdx <- k = lo(r[0] * r_inv); the high half in %r11 is discarded */ \
811 \
812 /* start first addition chain */ \
813 "adcxq %%r8, %%r14 \n\t" /* r[1] += lo(a[0] * b[1]) (starts the flag_c chain) */ \
814 "adoxq %%rdi, %%r10 \n\t" /* r[3] += lo(a[0] * b[3]) (starts the flag_o chain) */ \
815 "adcxq %%r9, %%r15 \n\t" /* r[2] += hi(a[0] * b[1]) + flag_c */ \
816 \
817 /* reduction round 0: accumulate k * modulus, k derived from r[0] */ \
818 "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[3] */ \
819 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[0] */ \
820 "adcxq %%rdi, %%r10 \n\t" /* r[3] += lo(k * modulus[3]) + flag_c */ \
821 "adoxq %%r11, %%r12 \n\t" /* r[4] += hi(k * modulus[3]) + flag_o */ \
822 "adcxq %[zero_reference], %%r12 \n\t" /* r[4] += flag_c */ \
823 "adoxq %%r8, %%r13 \n\t" /* r[0] += lo(k * modulus[0]) + flag_o (%r13 is recycled as r[5] below) */ \
824 "adcxq %%r9, %%r14 \n\t" /* r[1] += hi(k * modulus[0]) + flag_c */ \
825 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[1] */ \
826 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[2] */ \
827 "adoxq %%rdi, %%r14 \n\t" /* r[1] += lo(k * modulus[1]) + flag_o */ \
828 "adcxq %%r11, %%r15 \n\t" /* r[2] += hi(k * modulus[1]) + flag_c */ \
829 "adoxq %%r8, %%r15 \n\t" /* r[2] += lo(k * modulus[2]) + flag_o */ \
830 "adcxq %%r9, %%r10 \n\t" /* r[3] += hi(k * modulus[2]) + flag_c */ \
831 \
832 /* modulus = 254 bits, so max(t[3]) = 62 bits */ \
833 /* b also 254 bits, so (a[0] * b[3]) = 62 bits */ \
834 /* i.e. carry flag here is always 0 if b is in mont form, no need to update r[5] */ \
835 /* (which is very convenient because we're out of registers!) */ \
836 /* N.B. the value of r[4] now has a max of 63 bits and can accept another 62 bit value before overflowing */ \
837 \
838 /* a[1] * b */ \
839 "movq " a2 ", %%rdx \n\t" /* %rdx <- a[1] */ \
840 "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[1] * b[2] */ \
841 "mulxq 24(" b "), %%rdi, %%r13 \n\t" /* (%rdi, r[5]) <- (lo, hi) of a[1] * b[3]; r[5] lives in %r13 */ \
842 "adoxq %%r8, %%r10 \n\t" /* r[3] += lo(a[1] * b[2]) + flag_o */ \
843 "adcxq %%rdi, %%r12 \n\t" /* r[4] += lo(a[1] * b[3]) + flag_c */ \
844 "adoxq %%r9, %%r12 \n\t" /* r[4] += hi(a[1] * b[2]) + flag_o */ \
845 "adcxq %[zero_reference], %%r13 \n\t" /* r[5] += flag_c */ \
846 "adoxq %[zero_reference], %%r13 \n\t" /* r[5] += flag_o */ \
847 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[1] * b[0] */ \
848 "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of a[1] * b[1] */ \
849 "adcxq %%r8, %%r14 \n\t" /* r[1] += lo(a[1] * b[0]) + flag_c */ \
850 "adoxq %%r9, %%r15 \n\t" /* r[2] += hi(a[1] * b[0]) + flag_o */ \
851 "adcxq %%rdi, %%r15 \n\t" /* r[2] += lo(a[1] * b[1]) + flag_c */ \
852 "adoxq %%r11, %%r10 \n\t" /* r[3] += hi(a[1] * b[1]) + flag_o */ \
853 \
854 /* reduction round 1: accumulate k * modulus, k derived from r[1] */ \
855 "movq %%r14, %%rdx \n\t" /* %rdx <- r[1] */ \
856 "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* %rdx <- k = lo(r[1] * r_inv); the high half in %r8 is discarded */ \
857 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[2] */ \
858 "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[3] */ \
859 "adcxq %%r8, %%r10 \n\t" /* r[3] += lo(k * modulus[2]) + flag_c */ \
860 "adoxq %%r9, %%r12 \n\t" /* r[4] += hi(k * modulus[2]) + flag_o */ \
861 "adcxq %%rdi, %%r12 \n\t" /* r[4] += lo(k * modulus[3]) + flag_c */ \
862 "adoxq %%r11, %%r13 \n\t" /* r[5] += hi(k * modulus[3]) + flag_o */ \
863 "adcxq %[zero_reference], %%r13 \n\t" /* r[5] += flag_c */ \
864 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[0] */ \
865 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[1] */ \
866 "adoxq %%r8, %%r14 \n\t" /* r[1] += lo(k * modulus[0]) + flag_o (%r14 is recycled as r[6] below) */ \
867 "adcxq %%rdi, %%r15 \n\t" /* r[2] += lo(k * modulus[1]) + flag_c */ \
868 "adoxq %%r9, %%r15 \n\t" /* r[2] += hi(k * modulus[0]) + flag_o */ \
869 "adcxq %%r11, %%r10 \n\t" /* r[3] += hi(k * modulus[1]) + flag_c */ \
870 \
871 /* a[2] * b */ \
872 "movq " a3 ", %%rdx \n\t" /* %rdx <- a[2] */ \
873 "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of a[2] * b[1] */ \
874 "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[2] * b[2] */ \
875 "adoxq %%rdi, %%r10 \n\t" /* r[3] += lo(a[2] * b[1]) + flag_o */ \
876 "adcxq %%r11, %%r12 \n\t" /* r[4] += hi(a[2] * b[1]) + flag_c */ \
877 "adoxq %%r8, %%r12 \n\t" /* r[4] += lo(a[2] * b[2]) + flag_o */ \
878 "adcxq %%r9, %%r13 \n\t" /* r[5] += hi(a[2] * b[2]) + flag_c */ \
879 "mulxq 24(" b "), %%rdi, %%r14 \n\t" /* (%rdi, r[6]) <- (lo, hi) of a[2] * b[3]; r[6] lives in %r14 */ \
880 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[2] * b[0] */ \
881 "adoxq %%rdi, %%r13 \n\t" /* r[5] += lo(a[2] * b[3]) + flag_o */ \
882 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += flag_c */ \
883 "adoxq %[zero_reference], %%r14 \n\t" /* r[6] += flag_o */ \
884 "adcxq %%r8, %%r15 \n\t" /* r[2] += lo(a[2] * b[0]) + flag_c */ \
885 "adoxq %%r9, %%r10 \n\t" /* r[3] += hi(a[2] * b[0]) + flag_o */ \
886 \
887 /* reduction round 2: accumulate k * modulus, k derived from r[2] */ \
888 "movq %%r15, %%rdx \n\t" /* %rdx <- r[2] */ \
889 "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* %rdx <- k = lo(r[2] * r_inv); the high half in %r8 is discarded */ \
890 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[1] */ \
891 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[2] */ \
892 "adcxq %%rdi, %%r10 \n\t" /* r[3] += lo(k * modulus[1]) + flag_c */ \
893 "adoxq %%r11, %%r12 \n\t" /* r[4] += hi(k * modulus[1]) + flag_o */ \
894 "adcxq %%r8, %%r12 \n\t" /* r[4] += lo(k * modulus[2]) + flag_c */ \
895 "adoxq %%r9, %%r13 \n\t" /* r[5] += hi(k * modulus[2]) + flag_o */ \
896 "mulxq %[modulus_3], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[3] */ \
897 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[0] */ \
898 "adcxq %%rdi, %%r13 \n\t" /* r[5] += lo(k * modulus[3]) + flag_c */ \
899 "adoxq %%r11, %%r14 \n\t" /* r[6] += hi(k * modulus[3]) + flag_o */ \
900 "adcxq %[zero_reference], %%r14 \n\t" /* r[6] += flag_c */ \
901 "adoxq %%r8, %%r15 \n\t" /* r[2] += lo(k * modulus[0]) + flag_o (%r15 is recycled as r[7] below) */ \
902 "adcxq %%r9, %%r10 \n\t" /* r[3] += hi(k * modulus[0]) + flag_c */ \
903 \
904 /* a[3] * b */ \
905 "movq " a4 ", %%rdx \n\t" /* %rdx <- a[3] */ \
906 "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[3] * b[0] */ \
907 "mulxq 8(" b "), %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of a[3] * b[1] */ \
908 "adoxq %%r8, %%r10 \n\t" /* r[3] += lo(a[3] * b[0]) + flag_o */ \
909 "adcxq %%r9, %%r12 \n\t" /* r[4] += hi(a[3] * b[0]) + flag_c */ \
910 "adoxq %%rdi, %%r12 \n\t" /* r[4] += lo(a[3] * b[1]) + flag_o */ \
911 "adcxq %%r11, %%r13 \n\t" /* r[5] += hi(a[3] * b[1]) + flag_c */ \
912 \
913 "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[3] * b[2] */ \
914 "mulxq 24(" b "), %%rdi, %%r15 \n\t" /* (%rdi, r[7]) <- (lo, hi) of a[3] * b[3]; r[7] lives in %r15 */ \
915 "adoxq %%r8, %%r13 \n\t" /* r[5] += lo(a[3] * b[2]) + flag_o */ \
916 "adcxq %%r9, %%r14 \n\t" /* r[6] += hi(a[3] * b[2]) + flag_c */ \
917 "adoxq %%rdi, %%r14 \n\t" /* r[6] += lo(a[3] * b[3]) + flag_o */ \
918 "adcxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_c */ \
919 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_o */ \
920 \
921 /* reduction round 3: accumulate k * modulus, k derived from r[3] */ \
922 "movq %%r10, %%rdx \n\t" /* %rdx <- r[3] */ \
923 "mulxq %[r_inv], %%rdx, %%r8 \n\t" /* %rdx <- k = lo(r[3] * r_inv); the high half in %r8 is discarded */ \
924 "mulxq %[modulus_0], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[0] */ \
925 "mulxq %[modulus_1], %%rdi, %%r11 \n\t" /* (%rdi, %r11) <- (lo, hi) of k * modulus[1] */ \
926 "adoxq %%r8, %%r10 \n\t" /* r[3] += lo(k * modulus[0]) + flag_o (%r10 is not read again) */ \
927 "adcxq %%r9, %%r12 \n\t" /* r[4] += hi(k * modulus[0]) + flag_c */ \
928 "adoxq %%rdi, %%r12 \n\t" /* r[4] += lo(k * modulus[1]) + flag_o */ \
929 "adcxq %%r11, %%r13 \n\t" /* r[5] += hi(k * modulus[1]) + flag_c */ \
930 \
931 "mulxq %[modulus_2], %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of k * modulus[2] */ \
932 "mulxq %[modulus_3], %%rdi, %%rdx \n\t" /* (%rdi, %rdx) <- (lo, hi) of k * modulus[3]; overwrites k after its last use */ \
933 "adoxq %%r8, %%r13 \n\t" /* r[5] += lo(k * modulus[2]) + flag_o */ \
934 "adcxq %%r9, %%r14 \n\t" /* r[6] += hi(k * modulus[2]) + flag_c */ \
935 "adoxq %%rdi, %%r14 \n\t" /* r[6] += lo(k * modulus[3]) + flag_o */ \
936 "adcxq %%rdx, %%r15 \n\t" /* r[7] += hi(k * modulus[3]) + flag_c */ \
937 "adoxq %[zero_reference], %%r15 \n\t" /* r[7] += flag_o */
938
/*
 * MUL_256(a, b, r): inline-asm fragment (AT&T syntax, BMI2 mulx + ADX adcx/adox) that computes
 * the low 256 bits of a * b, i.e. (a * b) mod 2^256, and stores the four result limbs to r.
 *
 * a, b  operand strings naming base registers; limbs are read at offsets 0, 8, 16, 24.
 * r     operand string naming a base register; result limbs are written at offsets 0, 8, 16, 24.
 * None of a, b, r may name a register that this macro overwrites.
 *
 * Overwrites: %rax, %rdx, %rdi, %rsi, %r8, %r9, %r10, %r12, %r13, %r14, %r15, the flags, and the
 * 32 bytes at r.
 *
 * Accumulators: r[0]=%r13, r[1]=%r14, r[2]=%r15, r[3]=%rax.
 * lo(x) / hi(x) are the low / high 64 bits of a 128-bit product; flag_c is CF (adcxq chain),
 * flag_o is OF (adoxq chain). Partial products that only contribute at or above bit 256 are
 * either not computed or computed and ignored.
 *
 * Each limb pass ends both carry chains on r[3]; the carries out of r[3] belong to bit 256 and
 * must be dropped. The flags are therefore re-cleared between passes (by re-zeroing %r10), so a
 * stale carry is never added into a lower limb by the next pass.
 */
#define MUL_256(a, b, r) \
    "movq 0(" a "), %%rdx \n\t" /* %rdx <- a[0] (implicit mulx multiplicand) */ \
    \
    /* front-load the a[0] * b[i] products; mulx does not modify the flags */ \
    "mulxq 8(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[0] * b[1] */ \
    "mulxq 24(" b "), %%rdi, %%r12 \n\t" /* (%rdi, %r12) <- (lo, hi) of a[0] * b[3]; the high half is never used */ \
    "mulxq 0(" b "), %%r13, %%r14 \n\t" /* (r[0], r[1]) <- (lo, hi) of a[0] * b[0] */ \
    "mulxq 16(" b "), %%r15, %%rax \n\t" /* (r[2], r[3]) <- (lo, hi) of a[0] * b[2] */ \
    "xorq %%r10, %%r10 \n\t" /* %r10 <- 0; also clears CF and OF */ \
    \
    /* a[0] pass: fold the remaining a[0] partial products in */ \
    "adcxq %%r8, %%r14 \n\t" /* r[1] += lo(a[0] * b[1]) */ \
    "adoxq %%rdi, %%rax \n\t" /* r[3] += lo(a[0] * b[3]) */ \
    "adcxq %%r9, %%r15 \n\t" /* r[2] += hi(a[0] * b[1]) + flag_c */ \
    "adcxq %%r10, %%rax \n\t" /* r[3] += flag_c */ \
    "xorq %%r10, %%r10 \n\t" /* drop the carries out of r[3]: clear CF and OF */ \
    \
    /* a[1] * b */ \
    "movq 8(" a "), %%rdx \n\t" /* %rdx <- a[1] */ \
    "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[1] * b[0] */ \
    "mulxq 8(" b "), %%rdi, %%rsi \n\t" /* (%rdi, %rsi) <- (lo, hi) of a[1] * b[1] */ \
    "adcxq %%r8, %%r14 \n\t" /* r[1] += lo(a[1] * b[0]) + flag_c */ \
    "adoxq %%r9, %%r15 \n\t" /* r[2] += hi(a[1] * b[0]) + flag_o */ \
    "adcxq %%rdi, %%r15 \n\t" /* r[2] += lo(a[1] * b[1]) + flag_c */ \
    "adoxq %%rsi, %%rax \n\t" /* r[3] += hi(a[1] * b[1]) + flag_o */ \
    \
    "mulxq 16(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[1] * b[2]; the high half is never used */ \
    "adcxq %%r8, %%rax \n\t" /* r[3] += lo(a[1] * b[2]) + flag_c */ \
    "xorq %%r10, %%r10 \n\t" /* drop the carries out of r[3]: clear CF and OF */ \
    \
    /* a[2] * b */ \
    "movq 16(" a "), %%rdx \n\t" /* %rdx <- a[2] */ \
    "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[2] * b[0] */ \
    "mulxq 8(" b "), %%rdi, %%rsi \n\t" /* (%rdi, %rsi) <- (lo, hi) of a[2] * b[1]; the high half is never used */ \
    "adcxq %%r8, %%r15 \n\t" /* r[2] += lo(a[2] * b[0]) + flag_c */ \
    "adoxq %%r9, %%rax \n\t" /* r[3] += hi(a[2] * b[0]) + flag_o */ \
    "adcxq %%rdi, %%rax \n\t" /* r[3] += lo(a[2] * b[1]) + flag_c */ \
    "xorq %%r10, %%r10 \n\t" /* drop the carries out of r[3]: clear CF and OF */ \
    \
    /* a[3] * b */ \
    "movq 24(" a "), %%rdx \n\t" /* %rdx <- a[3] */ \
    "mulxq 0(" b "), %%r8, %%r9 \n\t" /* (%r8, %r9) <- (lo, hi) of a[3] * b[0]; the high half is never used */ \
    "adcxq %%r8, %%rax \n\t" /* r[3] += lo(a[3] * b[0]) + flag_c (flag_c is 0 here) */ \
    \
    /* store the four result limbs */ \
    "movq %%r13, 0(" r ") \n\t" \
    "movq %%r14, 8(" r ") \n\t" \
    "movq %%r15, 16(" r ") \n\t" \
    "movq %%rax, 24(" r ") \n\t"
990#endif