barretenberg
Loading...
Searching...
No Matches
field_impl_generic.hpp
1#pragma once
2
3#include <array>
4#include <cstdint>
5
6#include "./field_impl.hpp"
7namespace barretenberg {
8
9// NOLINTBEGIN(readability-implicit-bool-conversion)
10template <class T> constexpr std::pair<uint64_t, uint64_t> field<T>::mul_wide(uint64_t a, uint64_t b) noexcept
11{
12#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
13 const uint128_t res = (static_cast<uint128_t>(a) * static_cast<uint128_t>(b));
14 return { static_cast<uint64_t>(res), static_cast<uint64_t>(res >> 64) };
15#else
16 const uint64_t product = a * b;
17 return { product & 0xffffffffULL, product >> 32 };
18#endif
19}
20
21template <class T>
22constexpr uint64_t field<T>::mac(
23 const uint64_t a, const uint64_t b, const uint64_t c, const uint64_t carry_in, uint64_t& carry_out) noexcept
24{
25#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
26 const uint128_t res = static_cast<uint128_t>(a) + (static_cast<uint128_t>(b) * static_cast<uint128_t>(c)) +
27 static_cast<uint128_t>(carry_in);
28 carry_out = static_cast<uint64_t>(res >> 64);
29 return static_cast<uint64_t>(res);
30#else
31 const uint64_t product = b * c + a + carry_in;
32 carry_out = product >> 32;
33 return product & 0xffffffffULL;
34#endif
35}
36
37template <class T>
38constexpr void field<T>::mac(const uint64_t a,
39 const uint64_t b,
40 const uint64_t c,
41 const uint64_t carry_in,
42 uint64_t& out,
43 uint64_t& carry_out) noexcept
44{
45#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
46 const uint128_t res = static_cast<uint128_t>(a) + (static_cast<uint128_t>(b) * static_cast<uint128_t>(c)) +
47 static_cast<uint128_t>(carry_in);
48 out = static_cast<uint64_t>(res);
49 carry_out = static_cast<uint64_t>(res >> 64);
50#else
51 const uint64_t product = b * c + a + carry_in;
52 carry_out = product >> 32;
53 out = product & 0xffffffffULL;
54#endif
55}
56
57template <class T>
58constexpr uint64_t field<T>::mac_mini(const uint64_t a,
59 const uint64_t b,
60 const uint64_t c,
61 uint64_t& carry_out) noexcept
62{
63#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
64 const uint128_t res = static_cast<uint128_t>(a) + (static_cast<uint128_t>(b) * static_cast<uint128_t>(c));
65 carry_out = static_cast<uint64_t>(res >> 64);
66 return static_cast<uint64_t>(res);
67#else
68 const uint64_t product = b * c + a;
69 carry_out = product >> 32;
70 return product & 0xffffffffULL;
71#endif
72}
73
74template <class T>
75constexpr void field<T>::mac_mini(
76 const uint64_t a, const uint64_t b, const uint64_t c, uint64_t& out, uint64_t& carry_out) noexcept
77{
78#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
79 const uint128_t res = static_cast<uint128_t>(a) + (static_cast<uint128_t>(b) * static_cast<uint128_t>(c));
80 out = static_cast<uint64_t>(res);
81 carry_out = static_cast<uint64_t>(res >> 64);
82#else
83 const uint64_t result = b * c + a;
84 carry_out = result >> 32;
85 out = result & 0xffffffffULL;
86#endif
87}
88
89template <class T>
90constexpr uint64_t field<T>::mac_discard_lo(const uint64_t a, const uint64_t b, const uint64_t c) noexcept
91{
92#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
93 const uint128_t res = static_cast<uint128_t>(a) + (static_cast<uint128_t>(b) * static_cast<uint128_t>(c));
94 return static_cast<uint64_t>(res >> 64);
95#else
96 return (b * c + a) >> 32;
97#endif
98}
99
100template <class T>
101constexpr uint64_t field<T>::addc(const uint64_t a,
102 const uint64_t b,
103 const uint64_t carry_in,
104 uint64_t& carry_out) noexcept
105{
106#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
107 uint128_t res = static_cast<uint128_t>(a) + static_cast<uint128_t>(b) + static_cast<uint128_t>(carry_in);
108 carry_out = static_cast<uint64_t>(res >> 64);
109 return static_cast<uint64_t>(res);
110#else
111 uint64_t r = a + b;
112 const uint64_t carry_temp = r < a;
113 r += carry_in;
114 carry_out = carry_temp + (r < carry_in);
115 return r;
116#endif
117}
118
119template <class T>
120constexpr uint64_t field<T>::sbb(const uint64_t a,
121 const uint64_t b,
122 const uint64_t borrow_in,
123 uint64_t& borrow_out) noexcept
124{
125#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
126 uint128_t res = static_cast<uint128_t>(a) - (static_cast<uint128_t>(b) + static_cast<uint128_t>(borrow_in >> 63));
127 borrow_out = static_cast<uint64_t>(res >> 64);
128 return static_cast<uint64_t>(res);
129#else
130 uint64_t t_1 = a - (borrow_in >> 63ULL);
131 uint64_t borrow_temp_1 = t_1 > a;
132 uint64_t t_2 = t_1 - b;
133 uint64_t borrow_temp_2 = t_2 > t_1;
134 borrow_out = 0ULL - (borrow_temp_1 | borrow_temp_2);
135 return t_2;
136#endif
137}
138
139template <class T>
140constexpr uint64_t field<T>::square_accumulate(const uint64_t a,
141 const uint64_t b,
142 const uint64_t c,
143 const uint64_t carry_in_lo,
144 const uint64_t carry_in_hi,
145 uint64_t& carry_lo,
146 uint64_t& carry_hi) noexcept
147{
148#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
149 const uint128_t product = static_cast<uint128_t>(b) * static_cast<uint128_t>(c);
150 const auto r0 = static_cast<uint64_t>(product);
151 const auto r1 = static_cast<uint64_t>(product >> 64);
152 uint64_t out = r0 + r0;
153 carry_lo = (out < r0);
154 out += a;
155 carry_lo += (out < a);
156 out += carry_in_lo;
157 carry_lo += (out < carry_in_lo);
158 carry_lo += r1;
159 carry_hi = (carry_lo < r1);
160 carry_lo += r1;
161 carry_hi += (carry_lo < r1);
162 carry_lo += carry_in_hi;
163 carry_hi += (carry_lo < carry_in_hi);
164 return out;
165#else
166 const auto product = b * c;
167 const auto t0 = product + a + carry_in_lo;
168 const auto t1 = product + t0;
169 carry_hi = t1 < product;
170 const auto t2 = t1 + (carry_in_hi << 32);
171 carry_hi += t2 < t1;
172 carry_lo = t2 >> 32;
173 return t2 & 0xffffffffULL;
174#endif
175}
176
177template <class T> constexpr field<T> field<T>::reduce() const noexcept
178{
179 if constexpr (modulus.data[3] >= 0x4000000000000000ULL) {
180 uint256_t val{ data[0], data[1], data[2], data[3] };
181 if (val >= modulus) {
182 val -= modulus;
183 }
184 return { val.data[0], val.data[1], val.data[2], val.data[3] };
185 }
186 uint64_t t0 = data[0] + not_modulus.data[0];
187 uint64_t c = t0 < data[0];
188 auto t1 = addc(data[1], not_modulus.data[1], c, c);
189 auto t2 = addc(data[2], not_modulus.data[2], c, c);
190 auto t3 = addc(data[3], not_modulus.data[3], c, c);
191 const uint64_t selection_mask = 0ULL - c; // 0xffff... if we have overflowed.
192 const uint64_t selection_mask_inverse = ~selection_mask;
193 // if we overflow, we want to swap
194 return {
195 (data[0] & selection_mask_inverse) | (t0 & selection_mask),
196 (data[1] & selection_mask_inverse) | (t1 & selection_mask),
197 (data[2] & selection_mask_inverse) | (t2 & selection_mask),
198 (data[3] & selection_mask_inverse) | (t3 & selection_mask),
199 };
200}
201
202template <class T> constexpr field<T> field<T>::add(const field& other) const noexcept
203{
204 if constexpr (modulus.data[3] >= 0x4000000000000000ULL) {
205 uint64_t r0 = data[0] + other.data[0];
206 uint64_t c = r0 < data[0];
207 auto r1 = addc(data[1], other.data[1], c, c);
208 auto r2 = addc(data[2], other.data[2], c, c);
209 auto r3 = addc(data[3], other.data[3], c, c);
210 if (c) {
211 uint64_t b = 0;
212 r0 = sbb(r0, modulus.data[0], b, b);
213 r1 = sbb(r1, modulus.data[1], b, b);
214 r2 = sbb(r2, modulus.data[2], b, b);
215 r3 = sbb(r3, modulus.data[3], b, b);
216 // Since both values are in [0, 2**256), the result is in [0, 2**257-2]. Subtracting one p might not be
217 // enough. We need to ensure that we've underflown the 0 and that might require subtracting an additional p
218 if (!b) {
219 b = 0;
220 r0 = sbb(r0, modulus.data[0], b, b);
221 r1 = sbb(r1, modulus.data[1], b, b);
222 r2 = sbb(r2, modulus.data[2], b, b);
223 r3 = sbb(r3, modulus.data[3], b, b);
224 }
225 }
226 return { r0, r1, r2, r3 };
227 } else {
228 uint64_t r0 = data[0] + other.data[0];
229 uint64_t c = r0 < data[0];
230 auto r1 = addc(data[1], other.data[1], c, c);
231 auto r2 = addc(data[2], other.data[2], c, c);
232 uint64_t r3 = data[3] + other.data[3] + c;
233
234 uint64_t t0 = r0 + twice_not_modulus.data[0];
235 c = t0 < twice_not_modulus.data[0];
236 uint64_t t1 = addc(r1, twice_not_modulus.data[1], c, c);
237 uint64_t t2 = addc(r2, twice_not_modulus.data[2], c, c);
238 uint64_t t3 = addc(r3, twice_not_modulus.data[3], c, c);
239 const uint64_t selection_mask = 0ULL - c;
240 const uint64_t selection_mask_inverse = ~selection_mask;
241
242 return {
243 (r0 & selection_mask_inverse) | (t0 & selection_mask),
244 (r1 & selection_mask_inverse) | (t1 & selection_mask),
245 (r2 & selection_mask_inverse) | (t2 & selection_mask),
246 (r3 & selection_mask_inverse) | (t3 & selection_mask),
247 };
248 }
249}
250
251template <class T> constexpr field<T> field<T>::subtract(const field& other) const noexcept
252{
253 uint64_t borrow = 0;
254 uint64_t r0 = sbb(data[0], other.data[0], borrow, borrow);
255 uint64_t r1 = sbb(data[1], other.data[1], borrow, borrow);
256 uint64_t r2 = sbb(data[2], other.data[2], borrow, borrow);
257 uint64_t r3 = sbb(data[3], other.data[3], borrow, borrow);
258
259 r0 += (modulus.data[0] & borrow);
260 uint64_t carry = r0 < (modulus.data[0] & borrow);
261 r1 = addc(r1, modulus.data[1] & borrow, carry, carry);
262 r2 = addc(r2, modulus.data[2] & borrow, carry, carry);
263 r3 = addc(r3, (modulus.data[3] & borrow), carry, carry);
264 // The value being subtracted is in [0, 2**256), if we subtract 0 - 2*255 and then add p, the value will stay
265 // negative. If we are adding p, we need to check that we've overflown 2**256. If not, we should add p again
266 if (!carry) {
267 r0 += (modulus.data[0] & borrow);
268 uint64_t carry = r0 < (modulus.data[0] & borrow);
269 r1 = addc(r1, modulus.data[1] & borrow, carry, carry);
270 r2 = addc(r2, modulus.data[2] & borrow, carry, carry);
271 r3 = addc(r3, (modulus.data[3] & borrow), carry, carry);
272 }
273 return { r0, r1, r2, r3 };
274}
275
276template <class T> constexpr field<T> field<T>::subtract_coarse(const field& other) const noexcept
277{
278 if constexpr (modulus.data[3] >= 0x4000000000000000ULL) {
279 return subtract(other);
280 }
281 uint64_t borrow = 0;
282 uint64_t r0 = sbb(data[0], other.data[0], borrow, borrow);
283 uint64_t r1 = sbb(data[1], other.data[1], borrow, borrow);
284 uint64_t r2 = sbb(data[2], other.data[2], borrow, borrow);
285 uint64_t r3 = sbb(data[3], other.data[3], borrow, borrow);
286
287 r0 += (twice_modulus.data[0] & borrow);
288 uint64_t carry = r0 < (twice_modulus.data[0] & borrow);
289 r1 = addc(r1, twice_modulus.data[1] & borrow, carry, carry);
290 r2 = addc(r2, twice_modulus.data[2] & borrow, carry, carry);
291 r3 += (twice_modulus.data[3] & borrow) + carry;
292
293 return { r0, r1, r2, r3 };
294}
295template <class T> constexpr field<T> field<T>::montgomery_mul_big(const field& other) const noexcept
296{
297#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
298 uint64_t c = 0;
299 uint64_t t0 = 0;
300 uint64_t t1 = 0;
301 uint64_t t2 = 0;
302 uint64_t t3 = 0;
303 uint64_t t4 = 0;
304 uint64_t t5 = 0;
305 uint64_t k = 0;
306 for (const auto& element : data) {
307 c = 0;
308 mac(t0, element, other.data[0], c, t0, c);
309 mac(t1, element, other.data[1], c, t1, c);
310 mac(t2, element, other.data[2], c, t2, c);
311 mac(t3, element, other.data[3], c, t3, c);
312 t4 = addc(t4, c, 0, t5);
313
314 c = 0;
315 k = t0 * T::r_inv;
316 c = mac_discard_lo(t0, k, modulus.data[0]);
317 mac(t1, k, modulus.data[1], c, t0, c);
318 mac(t2, k, modulus.data[2], c, t1, c);
319 mac(t3, k, modulus.data[3], c, t2, c);
320 t3 = addc(c, t4, 0, c);
321 t4 = t5 + c;
322 }
323 uint64_t borrow = 0;
324 uint64_t r0 = sbb(t0, modulus.data[0], borrow, borrow);
325 uint64_t r1 = sbb(t1, modulus.data[1], borrow, borrow);
326 uint64_t r2 = sbb(t2, modulus.data[2], borrow, borrow);
327 uint64_t r3 = sbb(t3, modulus.data[3], borrow, borrow);
328 borrow = borrow ^ (0ULL - t4);
329 r0 += (modulus.data[0] & borrow);
330 uint64_t carry = r0 < (modulus.data[0] & borrow);
331 r1 = addc(r1, modulus.data[1] & borrow, carry, carry);
332 r2 = addc(r2, modulus.data[2] & borrow, carry, carry);
333 r3 += (modulus.data[3] & borrow) + carry;
334 return { r0, r1, r2, r3 };
335#else
336 uint64_t c = 0;
337 uint64_t t0 = 0;
338 uint64_t t1 = 0;
339 uint64_t t2 = 0;
340 uint64_t t3 = 0;
341 uint64_t t4 = 0;
342 uint64_t t5 = 0;
343 uint64_t t6 = 0;
344 uint64_t t7 = 0;
345 uint64_t t8 = 0;
346 uint64_t t9 = 0;
347 uint64_t k = 0;
348
349 constexpr uint64_t wasm_modulus[8]{
350 modulus.data[0] & 0xffffffffULL, modulus.data[0] >> 32ULL, modulus.data[1] & 0xffffffffULL,
351 modulus.data[1] >> 32ULL, modulus.data[2] & 0xffffffffULL, modulus.data[2] >> 32ULL,
352 modulus.data[3] & 0xffffffffULL, modulus.data[3] >> 32ULL,
353 };
354 constexpr uint64_t wasm_rinv = T::r_inv & 0xffffffffULL;
355
356 const uint64_t left[8]{
357 data[0] & 0xffffffffULL, data[0] >> 32, data[1] & 0xffffffffULL, data[1] >> 32,
358 data[2] & 0xffffffffULL, data[2] >> 32, data[3] & 0xffffffffULL, data[3] >> 32,
359 };
360 const uint64_t right[8]{
361 other.data[0] & 0xffffffffULL, other.data[0] >> 32, other.data[1] & 0xffffffffULL, other.data[1] >> 32,
362 other.data[2] & 0xffffffffULL, other.data[2] >> 32, other.data[3] & 0xffffffffULL, other.data[3] >> 32,
363 };
364
365 for (size_t i = 0; i < 8; ++i) {
366 c = 0;
367 mac(t0, left[i], right[0], c, t0, c);
368 mac(t1, left[i], right[1], c, t1, c);
369 mac(t2, left[i], right[2], c, t2, c);
370 mac(t3, left[i], right[3], c, t3, c);
371 mac(t4, left[i], right[4], c, t4, c);
372 mac(t5, left[i], right[5], c, t5, c);
373 mac(t6, left[i], right[6], c, t6, c);
374 mac(t7, left[i], right[7], c, t7, c);
375 uint64_t end_mul = t8 + c;
376 t8 = end_mul & 0xffffffffU;
377 t9 = end_mul >> 32;
378
379 c = 0;
380 k = (t0 * wasm_rinv) & 0xffffffffU;
381 c = mac_discard_lo(t0, k, wasm_modulus[0]);
382 mac(t1, k, wasm_modulus[1], c, t0, c);
383 mac(t2, k, wasm_modulus[2], c, t1, c);
384 mac(t3, k, wasm_modulus[3], c, t2, c);
385 mac(t4, k, wasm_modulus[4], c, t3, c);
386 mac(t5, k, wasm_modulus[5], c, t4, c);
387 mac(t6, k, wasm_modulus[6], c, t5, c);
388 mac(t7, k, wasm_modulus[7], c, t6, c);
389 uint64_t end_reduce = c + t8;
390 t7 = end_reduce & 0xffffffffU;
391 c = end_reduce >> 32;
392 t8 = t9 + c;
393 }
394 uint64_t v0 = t0 + (t1 << 32);
395 uint64_t v1 = t2 + (t3 << 32);
396 uint64_t v2 = t4 + (t5 << 32);
397 uint64_t v3 = t6 + (t7 << 32);
398 uint64_t v4 = t8;
399 uint64_t borrow = 0;
400 uint64_t r0 = sbb(v0, modulus.data[0], borrow, borrow);
401 uint64_t r1 = sbb(v1, modulus.data[1], borrow, borrow);
402 uint64_t r2 = sbb(v2, modulus.data[2], borrow, borrow);
403 uint64_t r3 = sbb(v3, modulus.data[3], borrow, borrow);
404 borrow = borrow ^ (0ULL - v4);
405 r0 += (modulus.data[0] & borrow);
406 uint64_t carry = r0 < (modulus.data[0] & borrow);
407 r1 = addc(r1, modulus.data[1] & borrow, carry, carry);
408 r2 = addc(r2, modulus.data[2] & borrow, carry, carry);
409 r3 += (modulus.data[3] & borrow) + carry;
410 return { r0, r1, r2, r3 };
411#endif
412}
413
414template <class T> constexpr field<T> field<T>::montgomery_mul(const field& other) const noexcept
415{
416 if constexpr (modulus.data[3] >= 0x4000000000000000ULL) {
417 return montgomery_mul_big(other);
418 }
419#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
420 auto [t0, c] = mul_wide(data[0], other.data[0]);
421 uint64_t k = t0 * T::r_inv;
422 uint64_t a = mac_discard_lo(t0, k, modulus.data[0]);
423
424 uint64_t t1 = mac_mini(a, data[0], other.data[1], a);
425 mac(t1, k, modulus.data[1], c, t0, c);
426 uint64_t t2 = mac_mini(a, data[0], other.data[2], a);
427 mac(t2, k, modulus.data[2], c, t1, c);
428 uint64_t t3 = mac_mini(a, data[0], other.data[3], a);
429 mac(t3, k, modulus.data[3], c, t2, c);
430 t3 = c + a;
431
432 mac_mini(t0, data[1], other.data[0], t0, a);
433 k = t0 * T::r_inv;
434 c = mac_discard_lo(t0, k, modulus.data[0]);
435 mac(t1, data[1], other.data[1], a, t1, a);
436 mac(t1, k, modulus.data[1], c, t0, c);
437 mac(t2, data[1], other.data[2], a, t2, a);
438 mac(t2, k, modulus.data[2], c, t1, c);
439 mac(t3, data[1], other.data[3], a, t3, a);
440 mac(t3, k, modulus.data[3], c, t2, c);
441 t3 = c + a;
442
443 mac_mini(t0, data[2], other.data[0], t0, a);
444 k = t0 * T::r_inv;
445 c = mac_discard_lo(t0, k, modulus.data[0]);
446 mac(t1, data[2], other.data[1], a, t1, a);
447 mac(t1, k, modulus.data[1], c, t0, c);
448 mac(t2, data[2], other.data[2], a, t2, a);
449 mac(t2, k, modulus.data[2], c, t1, c);
450 mac(t3, data[2], other.data[3], a, t3, a);
451 mac(t3, k, modulus.data[3], c, t2, c);
452 t3 = c + a;
453
454 mac_mini(t0, data[3], other.data[0], t0, a);
455 k = t0 * T::r_inv;
456 c = mac_discard_lo(t0, k, modulus.data[0]);
457 mac(t1, data[3], other.data[1], a, t1, a);
458 mac(t1, k, modulus.data[1], c, t0, c);
459 mac(t2, data[3], other.data[2], a, t2, a);
460 mac(t2, k, modulus.data[2], c, t1, c);
461 mac(t3, data[3], other.data[3], a, t3, a);
462 mac(t3, k, modulus.data[3], c, t2, c);
463 t3 = c + a;
464 return { t0, t1, t2, t3 };
465#else
466 constexpr uint64_t wasm_modulus[8]{
467 modulus.data[0] & 0xffffffffULL, modulus.data[0] >> 32ULL, modulus.data[1] & 0xffffffffULL,
468 modulus.data[1] >> 32ULL, modulus.data[2] & 0xffffffffULL, modulus.data[2] >> 32ULL,
469 modulus.data[3] & 0xffffffffULL, modulus.data[3] >> 32ULL,
470 };
471 constexpr uint64_t wasm_rinv = T::r_inv & 0xffffffffULL;
472
473 const uint64_t left[8]{
474 data[0] & 0xffffffffULL, data[0] >> 32, data[1] & 0xffffffffULL, data[1] >> 32,
475 data[2] & 0xffffffffULL, data[2] >> 32, data[3] & 0xffffffffULL, data[3] >> 32,
476 };
477 const uint64_t right[8]{
478 other.data[0] & 0xffffffffULL, other.data[0] >> 32, other.data[1] & 0xffffffffULL, other.data[1] >> 32,
479 other.data[2] & 0xffffffffULL, other.data[2] >> 32, other.data[3] & 0xffffffffULL, other.data[3] >> 32,
480 };
481
482 auto [t0, c] = mul_wide(left[0], right[0]);
483 uint64_t k = (t0 * wasm_rinv) & 0xffffffffULL;
484 uint64_t a = mac_discard_lo(t0, k, wasm_modulus[0]);
485
486 uint64_t t1 = mac_mini(a, left[0], right[1], a);
487 mac(t1, k, wasm_modulus[1], c, t0, c);
488 uint64_t t2 = mac_mini(a, left[0], right[2], a);
489 mac(t2, k, wasm_modulus[2], c, t1, c);
490 uint64_t t3 = mac_mini(a, left[0], right[3], a);
491 mac(t3, k, wasm_modulus[3], c, t2, c);
492 uint64_t t4 = mac_mini(a, left[0], right[4], a);
493 mac(t4, k, wasm_modulus[4], c, t3, c);
494 uint64_t t5 = mac_mini(a, left[0], right[5], a);
495 mac(t5, k, wasm_modulus[5], c, t4, c);
496 uint64_t t6 = mac_mini(a, left[0], right[6], a);
497 mac(t6, k, wasm_modulus[6], c, t5, c);
498 uint64_t t7 = mac_mini(a, left[0], right[7], a);
499 mac(t7, k, wasm_modulus[7], c, t6, c);
500 t7 = c + a;
501
502 for (size_t i = 1; i < 8; ++i) {
503 mac_mini(t0, left[i], right[0], t0, a);
504 k = (t0 * wasm_rinv) & 0xffffffffULL;
505 c = mac_discard_lo(t0, k, wasm_modulus[0]);
506 mac(t1, left[i], right[1], a, t1, a);
507 mac(t1, k, wasm_modulus[1], c, t0, c);
508 mac(t2, left[i], right[2], a, t2, a);
509 mac(t2, k, wasm_modulus[2], c, t1, c);
510 mac(t3, left[i], right[3], a, t3, a);
511 mac(t3, k, wasm_modulus[3], c, t2, c);
512 mac(t4, left[i], right[4], a, t4, a);
513 mac(t4, k, wasm_modulus[4], c, t3, c);
514 mac(t5, left[i], right[5], a, t5, a);
515 mac(t5, k, wasm_modulus[5], c, t4, c);
516 mac(t6, left[i], right[6], a, t6, a);
517 mac(t6, k, wasm_modulus[6], c, t5, c);
518 mac(t7, left[i], right[7], a, t7, a);
519 mac(t7, k, wasm_modulus[7], c, t6, c);
520 t7 = c + a;
521 }
522
523 // mac_mini(t0, left[2], right[0], t0, a);
524 // k = (t0 * wasm_rinv) & 0xffffffffULL;
525 // c = mac_discard_lo(t0, k, wasm_modulus[0]);
526 // mac(t1, left[2], right[1], a, t1, a);
527 // mac(t1, k, wasm_modulus[1], c, t0, c);
528 // mac(t2, left[2], right[2], a, t2, a);
529 // mac(t2, k, wasm_modulus[2], c, t1, c);
530 // mac(t3, left[2], right[3], a, t3, a);
531 // mac(t3, k, wasm_modulus[3], c, t2, c);
532 // mac(t4, left[2], right[4], a, t4, a);
533 // mac(t4, k, wasm_modulus[4], c, t3, c);
534 // mac(t5, left[2], right[5], a, t5, a);
535 // mac(t5, k, wasm_modulus[5], c, t4, c);
536 // mac(t6, left[2], right[6], a, t6, a);
537 // mac(t6, k, wasm_modulus[6], c, t5, c);
538 // mac(t7, left[2], right[7], a, t7, a);
539 // mac(t7, k, wasm_modulus[7], c, t6, c);
540 // t7 = c + a;
541
542 // mac_mini(t0, left[3], right[0], t0, a);
543 // k = (t0 * wasm_rinv) & 0xffffffffULL;
544 // c = mac_discard_lo(t0, k, wasm_modulus[0]);
545 // mac(t1, left[3], right[1], a, t1, a);
546 // mac(t1, k, wasm_modulus[1], c, t0, c);
547 // mac(t2, left[3], right[2], a, t2, a);
548 // mac(t2, k, wasm_modulus[2], c, t1, c);
549 // mac(t3, left[3], right[3], a, t3, a);
550 // mac(t3, k, wasm_modulus[3], c, t2, c);
551 // mac(t4, left[3], right[4], a, t4, a);
552 // mac(t4, k, wasm_modulus[4], c, t3, c);
553 // mac(t5, left[3], right[5], a, t5, a);
554 // mac(t5, k, wasm_modulus[5], c, t4, c);
555 // mac(t6, left[3], right[6], a, t6, a);
556 // mac(t6, k, wasm_modulus[6], c, t5, c);
557 // mac(t7, left[3], right[7], a, t7, a);
558 // mac(t7, k, wasm_modulus[7], c, t6, c);
559 // t7 = c + a;
560
561 // mac_mini(t0, left[4], right[0], t0, a);
562 // k = (t0 * wasm_rinv) & 0xffffffffULL;
563 // c = mac_discard_lo(t0, k, wasm_modulus[0]);
564 // mac(t1, left[4], right[1], a, t1, a);
565 // mac(t1, k, wasm_modulus[1], c, t0, c);
566 // mac(t2, left[4], right[2], a, t2, a);
567 // mac(t2, k, wasm_modulus[2], c, t1, c);
568 // mac(t3, left[4], right[3], a, t3, a);
569 // mac(t3, k, wasm_modulus[3], c, t2, c);
570 // mac(t4, left[4], right[4], a, t4, a);
571 // mac(t4, k, wasm_modulus[4], c, t3, c);
572 // mac(t5, left[4], right[5], a, t5, a);
573 // mac(t5, k, wasm_modulus[5], c, t4, c);
574 // mac(t6, left[4], right[6], a, t6, a);
575 // mac(t6, k, wasm_modulus[6], c, t5, c);
576 // mac(t7, left[4], right[7], a, t7, a);
577 // mac(t7, k, wasm_modulus[7], c, t6, c);
578 // t7 = c + a;
579
580 // mac_mini(t0, left[5], right[0], t0, a);
581 // k = (t0 * wasm_rinv) & 0xffffffffULL;
582 // c = mac_discard_lo(t0, k, wasm_modulus[0]);
583 // mac(t1, left[5], right[1], a, t1, a);
584 // mac(t1, k, wasm_modulus[1], c, t0, c);
585 // mac(t2, left[5], right[2], a, t2, a);
586 // mac(t2, k, wasm_modulus[2], c, t1, c);
587 // mac(t3, left[5], right[3], a, t3, a);
588 // mac(t3, k, wasm_modulus[3], c, t2, c);
589 // mac(t4, left[5], right[4], a, t4, a);
590 // mac(t4, k, wasm_modulus[4], c, t3, c);
591 // mac(t5, left[5], right[5], a, t5, a);
592 // mac(t5, k, wasm_modulus[5], c, t4, c);
593 // mac(t6, left[5], right[6], a, t6, a);
594 // mac(t6, k, wasm_modulus[6], c, t5, c);
595 // mac(t7, left[5], right[7], a, t7, a);
596 // mac(t7, k, wasm_modulus[7], c, t6, c);
597 // t7 = c + a;
598
599 // mac_mini(t0, left[6], right[0], t0, a);
600 // k = (t0 * wasm_rinv) & 0xffffffffULL;
601 // c = mac_discard_lo(t0, k, wasm_modulus[0]);
602 // mac(t1, left[6], right[1], a, t1, a);
603 // mac(t1, k, wasm_modulus[1], c, t0, c);
604 // mac(t2, left[6], right[2], a, t2, a);
605 // mac(t2, k, wasm_modulus[2], c, t1, c);
606 // mac(t3, left[6], right[3], a, t3, a);
607 // mac(t3, k, wasm_modulus[3], c, t2, c);
608 // mac(t4, left[6], right[4], a, t4, a);
609 // mac(t4, k, wasm_modulus[4], c, t3, c);
610 // mac(t5, left[6], right[5], a, t5, a);
611 // mac(t5, k, wasm_modulus[5], c, t4, c);
612 // mac(t6, left[6], right[6], a, t6, a);
613 // mac(t6, k, wasm_modulus[6], c, t5, c);
614 // mac(t7, left[6], right[7], a, t7, a);
615 // mac(t7, k, wasm_modulus[7], c, t6, c);
616 // t7 = c + a;
617
618 // mac_mini(t0, left[7], right[0], t0, a);
619 // k = (t0 * wasm_rinv) & 0xffffffffULL;
620 // c = mac_discard_lo(t0, k, wasm_modulus[0]);
621 // mac(t1, left[7], right[1], a, t1, a);
622 // mac(t1, k, wasm_modulus[1], c, t0, c);
623 // mac(t2, left[7], right[2], a, t2, a);
624 // mac(t2, k, wasm_modulus[2], c, t1, c);
625 // mac(t3, left[7], right[3], a, t3, a);
626 // mac(t3, k, wasm_modulus[3], c, t2, c);
627 // mac(t4, left[7], right[4], a, t4, a);
628 // mac(t4, k, wasm_modulus[4], c, t3, c);
629 // mac(t5, left[7], right[5], a, t5, a);
630 // mac(t5, k, wasm_modulus[5], c, t4, c);
631 // mac(t6, left[7], right[6], a, t6, a);
632 // mac(t6, k, wasm_modulus[6], c, t5, c);
633 // mac(t7, left[7], right[7], a, t7, a);
634 // mac(t7, k, wasm_modulus[7], c, t6, c);
635 // t7 = c + a;
636
637 return { t0 + (t1 << 32), t2 + (t3 << 32), t4 + (t5 << 32), t6 + (t7 << 32) };
638#endif
639}
640
641template <class T> constexpr field<T> field<T>::montgomery_square() const noexcept
642{
643 if constexpr (modulus.data[3] >= 0x4000000000000000ULL) {
644 return montgomery_mul_big(*this);
645 }
646#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
647 uint64_t carry_hi = 0;
648
649 auto [t0, carry_lo] = mul_wide(data[0], data[0]);
650 uint64_t t1 = square_accumulate(0, data[1], data[0], carry_lo, carry_hi, carry_lo, carry_hi);
651 uint64_t t2 = square_accumulate(0, data[2], data[0], carry_lo, carry_hi, carry_lo, carry_hi);
652 uint64_t t3 = square_accumulate(0, data[3], data[0], carry_lo, carry_hi, carry_lo, carry_hi);
653
654 uint64_t round_carry = carry_lo;
655 uint64_t k = t0 * T::r_inv;
656 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
657 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
658 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
659 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
660 t3 = carry_lo + round_carry;
661
662 t1 = mac_mini(t1, data[1], data[1], carry_lo);
663 carry_hi = 0;
664 t2 = square_accumulate(t2, data[2], data[1], carry_lo, carry_hi, carry_lo, carry_hi);
665 t3 = square_accumulate(t3, data[3], data[1], carry_lo, carry_hi, carry_lo, carry_hi);
666 round_carry = carry_lo;
667 k = t0 * T::r_inv;
668 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
669 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
670 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
671 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
672 t3 = carry_lo + round_carry;
673
674 t2 = mac_mini(t2, data[2], data[2], carry_lo);
675 carry_hi = 0;
676 t3 = square_accumulate(t3, data[3], data[2], carry_lo, carry_hi, carry_lo, carry_hi);
677 round_carry = carry_lo;
678 k = t0 * T::r_inv;
679 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
680 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
681 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
682 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
683 t3 = carry_lo + round_carry;
684
685 t3 = mac_mini(t3, data[3], data[3], carry_lo);
686 k = t0 * T::r_inv;
687 round_carry = carry_lo;
688 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
689 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
690 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
691 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
692 t3 = carry_lo + round_carry;
693 return { t0, t1, t2, t3 };
694#else
695 // We use ‘montgomery_mul' instead of 'square_accumulate'. The number of additions and comparisons in
696 // 'square_accumulate' makes it slower in this particular case.
697 return montgomery_mul(*this);
698#endif
699}
700
701template <class T> constexpr struct field<T>::wide_array field<T>::mul_512(const field& other) const noexcept {
702#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
703 uint64_t carry_2 = 0;
704 auto [r0, carry] = mul_wide(data[0], other.data[0]);
705 uint64_t r1 = mac_mini(carry, data[0], other.data[1], carry);
706 uint64_t r2 = mac_mini(carry, data[0], other.data[2], carry);
707 uint64_t r3 = mac_mini(carry, data[0], other.data[3], carry_2);
708
709 r1 = mac_mini(r1, data[1], other.data[0], carry);
710 r2 = mac(r2, data[1], other.data[1], carry, carry);
711 r3 = mac(r3, data[1], other.data[2], carry, carry);
712 uint64_t r4 = mac(carry_2, data[1], other.data[3], carry, carry_2);
713
714 r2 = mac_mini(r2, data[2], other.data[0], carry);
715 r3 = mac(r3, data[2], other.data[1], carry, carry);
716 r4 = mac(r4, data[2], other.data[2], carry, carry);
717 uint64_t r5 = mac(carry_2, data[2], other.data[3], carry, carry_2);
718
719 r3 = mac_mini(r3, data[3], other.data[0], carry);
720 r4 = mac(r4, data[3], other.data[1], carry, carry);
721 r5 = mac(r5, data[3], other.data[2], carry, carry);
722 uint64_t r6 = mac(carry_2, data[3], other.data[3], carry, carry_2);
723
724 return { r0, r1, r2, r3, r4, r5, r6, carry_2 };
725#else
726 const uint64_t left[8]{
727 data[0] & 0xffffffffULL, data[0] >> 32, data[1] & 0xffffffffULL, data[1] >> 32,
728 data[2] & 0xffffffffULL, data[2] >> 32, data[3] & 0xffffffffULL, data[3] >> 32,
729 };
730
731 const uint64_t right[8]{
732 other.data[0] & 0xffffffffULL, other.data[0] >> 32, other.data[1] & 0xffffffffULL, other.data[1] >> 32,
733 other.data[2] & 0xffffffffULL, other.data[2] >> 32, other.data[3] & 0xffffffffULL, other.data[3] >> 32,
734 };
735
736 uint64_t carry_2 = 0;
737 auto [r0, carry] = mul_wide(left[0], right[0]);
738 uint64_t r1 = mac_mini(carry, left[0], right[1], carry);
739 uint64_t r2 = mac_mini(carry, left[0], right[2], carry);
740 uint64_t r3 = mac_mini(carry, left[0], right[3], carry);
741 uint64_t r4 = mac_mini(carry, left[0], right[4], carry);
742 uint64_t r5 = mac_mini(carry, left[0], right[5], carry);
743 uint64_t r6 = mac_mini(carry, left[0], right[6], carry);
744 uint64_t r7 = mac_mini(carry, left[0], right[7], carry_2);
745
746 r1 = mac_mini(r1, left[1], right[0], carry);
747 r2 = mac(r2, left[1], right[1], carry, carry);
748 r3 = mac(r3, left[1], right[2], carry, carry);
749 r4 = mac(r4, left[1], right[3], carry, carry);
750 r5 = mac(r5, left[1], right[4], carry, carry);
751 r6 = mac(r6, left[1], right[5], carry, carry);
752 r7 = mac(r7, left[1], right[6], carry, carry);
753 uint64_t r8 = mac(carry_2, left[1], right[7], carry, carry_2);
754
755 r2 = mac_mini(r2, left[2], right[0], carry);
756 r3 = mac(r3, left[2], right[1], carry, carry);
757 r4 = mac(r4, left[2], right[2], carry, carry);
758 r5 = mac(r5, left[2], right[3], carry, carry);
759 r6 = mac(r6, left[2], right[4], carry, carry);
760 r7 = mac(r7, left[2], right[5], carry, carry);
761 r8 = mac(r8, left[2], right[6], carry, carry);
762 uint64_t r9 = mac(carry_2, left[2], right[7], carry, carry_2);
763
764 r3 = mac_mini(r3, left[3], right[0], carry);
765 r4 = mac(r4, left[3], right[1], carry, carry);
766 r5 = mac(r5, left[3], right[2], carry, carry);
767 r6 = mac(r6, left[3], right[3], carry, carry);
768 r7 = mac(r7, left[3], right[4], carry, carry);
769 r8 = mac(r8, left[3], right[5], carry, carry);
770 r9 = mac(r9, left[3], right[6], carry, carry);
771 uint64_t r10 = mac(carry_2, left[3], right[7], carry, carry_2);
772
773 r4 = mac_mini(r4, left[4], right[0], carry);
774 r5 = mac(r5, left[4], right[1], carry, carry);
775 r6 = mac(r6, left[4], right[2], carry, carry);
776 r7 = mac(r7, left[4], right[3], carry, carry);
777 r8 = mac(r8, left[4], right[4], carry, carry);
778 r9 = mac(r9, left[4], right[5], carry, carry);
779 r10 = mac(r10, left[4], right[6], carry, carry);
780 uint64_t r11 = mac(carry_2, left[4], right[7], carry, carry_2);
781
782 r5 = mac_mini(r5, left[5], right[0], carry);
783 r6 = mac(r6, left[5], right[1], carry, carry);
784 r7 = mac(r7, left[5], right[2], carry, carry);
785 r8 = mac(r8, left[5], right[3], carry, carry);
786 r9 = mac(r9, left[5], right[4], carry, carry);
787 r10 = mac(r10, left[5], right[5], carry, carry);
788 r11 = mac(r11, left[5], right[6], carry, carry);
789 uint64_t r12 = mac(carry_2, left[5], right[7], carry, carry_2);
790
791 r6 = mac_mini(r6, left[6], right[0], carry);
792 r7 = mac(r7, left[6], right[1], carry, carry);
793 r8 = mac(r8, left[6], right[2], carry, carry);
794 r9 = mac(r9, left[6], right[3], carry, carry);
795 r10 = mac(r10, left[6], right[4], carry, carry);
796 r11 = mac(r11, left[6], right[5], carry, carry);
797 r12 = mac(r12, left[6], right[6], carry, carry);
798 uint64_t r13 = mac(carry_2, left[6], right[7], carry, carry_2);
799
800 r7 = mac_mini(r7, left[7], right[0], carry);
801 r8 = mac(r8, left[7], right[1], carry, carry);
802 r9 = mac(r9, left[7], right[2], carry, carry);
803 r10 = mac(r10, left[7], right[3], carry, carry);
804 r11 = mac(r11, left[7], right[4], carry, carry);
805 r12 = mac(r12, left[7], right[5], carry, carry);
806 r13 = mac(r13, left[7], right[6], carry, carry);
807 uint64_t r14 = mac(carry_2, left[7], right[7], carry, carry_2);
808
809 return {
810 r0 + (r1 << 32), r2 + (r3 << 32), r4 + (r5 << 32), r6 + (r7 << 32),
811 r8 + (r9 << 32), r10 + (r11 << 32), r12 + (r13 << 32), r14 + (carry_2 << 32),
812 };
813#endif
814}
815
816// NOLINTEND(readability-implicit-bool-conversion)
817} // namespace barretenberg
Definition: uint256.hpp:25
constexpr_utils defines some helper methods that perform some stl-equivalent operations but in a cons...
Definition: constexpr_utils.hpp:16