6#include "./field_impl.hpp"
10template <
class T>
constexpr std::pair<uint64_t, uint64_t> field<T>::mul_wide(uint64_t a, uint64_t b)
noexcept
12#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
13 const uint128_t res = (
static_cast<uint128_t
>(a) *
static_cast<uint128_t
>(b));
14 return {
static_cast<uint64_t
>(res),
static_cast<uint64_t
>(res >> 64) };
16 const uint64_t product = a * b;
17 return { product & 0xffffffffULL, product >> 32 };
22constexpr uint64_t field<T>::mac(
23 const uint64_t a,
const uint64_t b,
const uint64_t c,
const uint64_t carry_in, uint64_t& carry_out)
noexcept
25#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
26 const uint128_t res =
static_cast<uint128_t
>(a) + (
static_cast<uint128_t
>(b) *
static_cast<uint128_t
>(c)) +
27 static_cast<uint128_t
>(carry_in);
28 carry_out =
static_cast<uint64_t
>(res >> 64);
29 return static_cast<uint64_t
>(res);
31 const uint64_t product = b * c + a + carry_in;
32 carry_out = product >> 32;
33 return product & 0xffffffffULL;
38constexpr void field<T>::mac(
const uint64_t a,
41 const uint64_t carry_in,
43 uint64_t& carry_out)
noexcept
45#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
46 const uint128_t res =
static_cast<uint128_t
>(a) + (
static_cast<uint128_t
>(b) *
static_cast<uint128_t
>(c)) +
47 static_cast<uint128_t
>(carry_in);
48 out =
static_cast<uint64_t
>(res);
49 carry_out =
static_cast<uint64_t
>(res >> 64);
51 const uint64_t product = b * c + a + carry_in;
52 carry_out = product >> 32;
53 out = product & 0xffffffffULL;
58constexpr uint64_t field<T>::mac_mini(
const uint64_t a,
61 uint64_t& carry_out)
noexcept
63#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
64 const uint128_t res =
static_cast<uint128_t
>(a) + (
static_cast<uint128_t
>(b) *
static_cast<uint128_t
>(c));
65 carry_out =
static_cast<uint64_t
>(res >> 64);
66 return static_cast<uint64_t
>(res);
68 const uint64_t product = b * c + a;
69 carry_out = product >> 32;
70 return product & 0xffffffffULL;
75constexpr void field<T>::mac_mini(
76 const uint64_t a,
const uint64_t b,
const uint64_t c, uint64_t& out, uint64_t& carry_out)
noexcept
78#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
79 const uint128_t res =
static_cast<uint128_t
>(a) + (
static_cast<uint128_t
>(b) *
static_cast<uint128_t
>(c));
80 out =
static_cast<uint64_t
>(res);
81 carry_out =
static_cast<uint64_t
>(res >> 64);
83 const uint64_t result = b * c + a;
84 carry_out = result >> 32;
85 out = result & 0xffffffffULL;
90constexpr uint64_t field<T>::mac_discard_lo(
const uint64_t a,
const uint64_t b,
const uint64_t c)
noexcept
92#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
93 const uint128_t res =
static_cast<uint128_t
>(a) + (
static_cast<uint128_t
>(b) *
static_cast<uint128_t
>(c));
94 return static_cast<uint64_t
>(res >> 64);
96 return (b * c + a) >> 32;
101constexpr uint64_t field<T>::addc(
const uint64_t a,
103 const uint64_t carry_in,
104 uint64_t& carry_out)
noexcept
106#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
107 uint128_t res =
static_cast<uint128_t
>(a) +
static_cast<uint128_t
>(b) +
static_cast<uint128_t
>(carry_in);
108 carry_out =
static_cast<uint64_t
>(res >> 64);
109 return static_cast<uint64_t
>(res);
112 const uint64_t carry_temp = r < a;
114 carry_out = carry_temp + (r < carry_in);
120constexpr uint64_t field<T>::sbb(
const uint64_t a,
122 const uint64_t borrow_in,
123 uint64_t& borrow_out)
noexcept
125#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
126 uint128_t res =
static_cast<uint128_t
>(a) - (
static_cast<uint128_t
>(b) +
static_cast<uint128_t
>(borrow_in >> 63));
127 borrow_out =
static_cast<uint64_t
>(res >> 64);
128 return static_cast<uint64_t
>(res);
130 uint64_t t_1 = a - (borrow_in >> 63ULL);
131 uint64_t borrow_temp_1 = t_1 > a;
132 uint64_t t_2 = t_1 - b;
133 uint64_t borrow_temp_2 = t_2 > t_1;
134 borrow_out = 0ULL - (borrow_temp_1 | borrow_temp_2);
140constexpr uint64_t field<T>::square_accumulate(
const uint64_t a,
143 const uint64_t carry_in_lo,
144 const uint64_t carry_in_hi,
146 uint64_t& carry_hi)
noexcept
148#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
149 const uint128_t product =
static_cast<uint128_t
>(b) *
static_cast<uint128_t
>(c);
150 const auto r0 =
static_cast<uint64_t
>(product);
151 const auto r1 =
static_cast<uint64_t
>(product >> 64);
152 uint64_t out = r0 + r0;
153 carry_lo = (out < r0);
155 carry_lo += (out < a);
157 carry_lo += (out < carry_in_lo);
159 carry_hi = (carry_lo < r1);
161 carry_hi += (carry_lo < r1);
162 carry_lo += carry_in_hi;
163 carry_hi += (carry_lo < carry_in_hi);
166 const auto product = b * c;
167 const auto t0 = product + a + carry_in_lo;
168 const auto t1 = product + t0;
169 carry_hi = t1 < product;
170 const auto t2 = t1 + (carry_in_hi << 32);
173 return t2 & 0xffffffffULL;
177template <
class T>
constexpr field<T> field<T>::reduce() const noexcept
179 if constexpr (modulus.data[3] >= 0x4000000000000000ULL) {
180 uint256_t val{ data[0], data[1], data[2], data[3] };
181 if (val >= modulus) {
184 return { val.data[0], val.data[1], val.data[2], val.data[3] };
186 uint64_t t0 = data[0] + not_modulus.data[0];
187 uint64_t c = t0 < data[0];
188 auto t1 = addc(data[1], not_modulus.data[1], c, c);
189 auto t2 = addc(data[2], not_modulus.data[2], c, c);
190 auto t3 = addc(data[3], not_modulus.data[3], c, c);
191 const uint64_t selection_mask = 0ULL - c;
192 const uint64_t selection_mask_inverse = ~selection_mask;
195 (data[0] & selection_mask_inverse) | (t0 & selection_mask),
196 (data[1] & selection_mask_inverse) | (t1 & selection_mask),
197 (data[2] & selection_mask_inverse) | (t2 & selection_mask),
198 (data[3] & selection_mask_inverse) | (t3 & selection_mask),
// Adds `other` to this element limb by limb and returns the result as a new field element.
// NOTE(review): this text is an incomplete listing. The embedded listing numbers skip 203, 210-211, 216-219,
// 224-225, 227, 233, 241-242 and everything after 246, so the braces, the declaration of `b`, whatever guards
// the two sbb passes below, and the `return {` that opens the final initializer list are not visible here.
// Restore them from the original file before compiling.
202template <
class T>
constexpr field<T> field<T>::add(
const field& other)
const noexcept
// Compile-time branch for moduli whose top limb is >= 0x4000000000000000.
204 if constexpr (modulus.data[3] >= 0x4000000000000000ULL) {
// 256-bit addition; `c` threads the carry through addc and ends as the carry out of the top limb.
205 uint64_t r0 = data[0] + other.data[0];
206 uint64_t c = r0 < data[0];
207 auto r1 = addc(data[1], other.data[1], c, c);
208 auto r2 = addc(data[2], other.data[2], c, c);
209 auto r3 = addc(data[3], other.data[3], c, c);
// First subtraction of the modulus, with the borrow mask `b` threaded through sbb.
// The condition under which this pass runs is on a line missing from this listing.
212 r0 = sbb(r0, modulus.data[0], b, b);
213 r1 = sbb(r1, modulus.data[1], b, b);
214 r2 = sbb(r2, modulus.data[2], b, b);
215 r3 = sbb(r3, modulus.data[3], b, b);
// Second subtraction of the modulus; its guard (and any reset of `b`) is likewise not visible.
220 r0 = sbb(r0, modulus.data[0], b, b);
221 r1 = sbb(r1, modulus.data[1], b, b);
222 r2 = sbb(r2, modulus.data[2], b, b);
223 r3 = sbb(r3, modulus.data[3], b, b);
226 return { r0, r1, r2, r3 };
// Other branch: the top limb is added without tracking a carry out of it.
228 uint64_t r0 = data[0] + other.data[0];
229 uint64_t c = r0 < data[0];
230 auto r1 = addc(data[1], other.data[1], c, c);
231 auto r2 = addc(data[2], other.data[2], c, c);
232 uint64_t r3 = data[3] + other.data[3] + c;
// Add twice_not_modulus; the carry out of the top limb decides which value is returned.
// NOTE(review): presumably twice_not_modulus is 2^256 - 2 * modulus; confirm against its definition.
234 uint64_t t0 = r0 + twice_not_modulus.data[0];
235 c = t0 < twice_not_modulus.data[0];
236 uint64_t t1 = addc(r1, twice_not_modulus.data[1], c, c);
237 uint64_t t2 = addc(r2, twice_not_modulus.data[2], c, c);
238 uint64_t t3 = addc(r3, twice_not_modulus.data[3], c, c);
// All ones if the second addition carried out, zero otherwise.
239 const uint64_t selection_mask = 0ULL - c;
240 const uint64_t selection_mask_inverse = ~selection_mask;
// Branch-free select per limb: r_i when the mask is zero, t_i when it is all ones.
243 (r0 & selection_mask_inverse) | (t0 & selection_mask),
244 (r1 & selection_mask_inverse) | (t1 & selection_mask),
245 (r2 & selection_mask_inverse) | (t2 & selection_mask),
246 (r3 & selection_mask_inverse) | (t3 & selection_mask),
// Subtracts `other` from this element limb by limb, then adds the modulus back under the borrow mask.
// NOTE(review): this text is an incomplete listing. The embedded listing numbers skip 252-253, 258, 264-266,
// 272 and 274, so the braces, the declaration of `borrow`, and whatever separates the two add-back passes are
// not visible here. Restore them from the original file before compiling.
251template <
class T>
constexpr field<T> field<T>::subtract(
const field& other)
const noexcept
// 256-bit subtraction; sbb leaves `borrow` as 0 (no underflow) or all ones (underflow).
254 uint64_t r0 = sbb(data[0], other.data[0], borrow, borrow);
255 uint64_t r1 = sbb(data[1], other.data[1], borrow, borrow);
256 uint64_t r2 = sbb(data[2], other.data[2], borrow, borrow);
257 uint64_t r3 = sbb(data[3], other.data[3], borrow, borrow);
// Add (modulus & borrow): this adds the modulus only when the subtraction underflowed.
259 r0 += (modulus.data[0] & borrow);
260 uint64_t carry = r0 < (modulus.data[0] & borrow);
261 r1 = addc(r1, modulus.data[1] & borrow, carry, carry);
262 r2 = addc(r2, modulus.data[2] & borrow, carry, carry);
263 r3 = addc(r3, (modulus.data[3] & borrow), carry, carry);
// Second add-back pass. `carry` is declared again here, so in the original this presumably sits in a nested
// scope; the condition guarding it is on lines missing from this listing.
267 r0 += (modulus.data[0] & borrow);
268 uint64_t carry = r0 < (modulus.data[0] & borrow);
269 r1 = addc(r1, modulus.data[1] & borrow, carry, carry);
270 r2 = addc(r2, modulus.data[2] & borrow, carry, carry);
271 r3 = addc(r3, (modulus.data[3] & borrow), carry, carry);
273 return { r0, r1, r2, r3 };
276template <
class T>
constexpr field<T> field<T>::subtract_coarse(
const field& other)
const noexcept
278 if constexpr (modulus.data[3] >= 0x4000000000000000ULL) {
279 return subtract(other);
282 uint64_t r0 = sbb(data[0], other.data[0], borrow, borrow);
283 uint64_t r1 = sbb(data[1], other.data[1], borrow, borrow);
284 uint64_t r2 = sbb(data[2], other.data[2], borrow, borrow);
285 uint64_t r3 = sbb(data[3], other.data[3], borrow, borrow);
287 r0 += (twice_modulus.data[0] & borrow);
288 uint64_t carry = r0 < (twice_modulus.data[0] & borrow);
289 r1 = addc(r1, twice_modulus.data[1] & borrow, carry, carry);
290 r2 = addc(r2, twice_modulus.data[2] & borrow, carry, carry);
291 r3 += (twice_modulus.data[3] & borrow) + carry;
293 return { r0, r1, r2, r3 };
// Montgomery multiplication of this element by `other`, variant for large moduli: montgomery_mul and
// montgomery_square in this file forward here when modulus.data[3] >= 0x4000000000000000ULL.
// NOTE(review): this text is an incomplete listing. The embedded listing numbers jump (297 -> 306, 312 -> 316,
// 320 -> 324, 334 -> 349, 376 -> 380, 391 -> 394, ...), so the function braces, the #else/#endif of the #if
// below, the loop closers, and the declarations/initialisation of c, k, t0..t8, borrow and v4 are not visible.
// Restore them from the original file before compiling.
295template <
class T>
constexpr field<T> field<T>::montgomery_mul_big(
const field& other)
const noexcept
// Path for compilers with a native 128-bit integer, excluding wasm: works on the four 64-bit limbs.
297#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
// One round per limb of this element.
306 for (
const auto& element : data) {
// Accumulate element * other into t0..t3, carrying through c.
308 mac(t0, element, other.data[0], c, t0, c);
309 mac(t1, element, other.data[1], c, t1, c);
310 mac(t2, element, other.data[2], c, t2, c);
311 mac(t3, element, other.data[3], c, t3, c);
// Fold the final carry into t4; overflow of that addition lands in t5.
312 t4 = addc(t4, c, 0, t5);
// Reduction step: add k * modulus and shift the accumulator down one limb (results are written to t0..t3).
// NOTE(review): the line computing k is not visible here; presumably k = t0 * T::r_inv as in montgomery_mul.
316 c = mac_discard_lo(t0, k, modulus.data[0]);
317 mac(t1, k, modulus.data[1], c, t0, c);
318 mac(t2, k, modulus.data[2], c, t1, c);
319 mac(t3, k, modulus.data[3], c, t2, c);
320 t3 = addc(c, t4, 0, c);
// Final correction: subtract the modulus, then add it back under a mask.
324 uint64_t r0 = sbb(t0, modulus.data[0], borrow, borrow);
325 uint64_t r1 = sbb(t1, modulus.data[1], borrow, borrow);
326 uint64_t r2 = sbb(t2, modulus.data[2], borrow, borrow);
327 uint64_t r3 = sbb(t3, modulus.data[3], borrow, borrow);
// Combine the subtraction's borrow mask with the overflow limb t4 to form the add-back mask.
328 borrow = borrow ^ (0ULL - t4);
329 r0 += (modulus.data[0] & borrow);
330 uint64_t carry = r0 < (modulus.data[0] & borrow);
331 r1 = addc(r1, modulus.data[1] & borrow, carry, carry);
332 r2 = addc(r2, modulus.data[2] & borrow, carry, carry);
333 r3 += (modulus.data[3] & borrow) + carry;
334 return { r0, r1, r2, r3 };
// Fallback path (the #else itself is not visible in this listing): the modulus and both operands are split
// into eight 32-bit halves held in uint64_t slots.
349 constexpr uint64_t wasm_modulus[8]{
350 modulus.data[0] & 0xffffffffULL, modulus.data[0] >> 32ULL, modulus.data[1] & 0xffffffffULL,
351 modulus.data[1] >> 32ULL, modulus.data[2] & 0xffffffffULL, modulus.data[2] >> 32ULL,
352 modulus.data[3] & 0xffffffffULL, modulus.data[3] >> 32ULL,
// Only the low 32 bits of T::r_inv are used on this path.
354 constexpr uint64_t wasm_rinv = T::r_inv & 0xffffffffULL;
356 const uint64_t left[8]{
357 data[0] & 0xffffffffULL, data[0] >> 32, data[1] & 0xffffffffULL, data[1] >> 32,
358 data[2] & 0xffffffffULL, data[2] >> 32, data[3] & 0xffffffffULL, data[3] >> 32,
360 const uint64_t right[8]{
361 other.data[0] & 0xffffffffULL, other.data[0] >> 32, other.data[1] & 0xffffffffULL, other.data[1] >> 32,
362 other.data[2] & 0xffffffffULL, other.data[2] >> 32, other.data[3] & 0xffffffffULL, other.data[3] >> 32,
// One round per 32-bit half of this element.
365 for (
size_t i = 0; i < 8; ++i) {
// Accumulate left[i] * right into t0..t7, carrying through c.
367 mac(t0, left[i], right[0], c, t0, c);
368 mac(t1, left[i], right[1], c, t1, c);
369 mac(t2, left[i], right[2], c, t2, c);
370 mac(t3, left[i], right[3], c, t3, c);
371 mac(t4, left[i], right[4], c, t4, c);
372 mac(t5, left[i], right[5], c, t5, c);
373 mac(t6, left[i], right[6], c, t6, c);
374 mac(t7, left[i], right[7], c, t7, c);
// Fold the final carry into t8, keeping only its low 32 bits (what happens to the upper bits is on lines
// missing from this listing).
375 uint64_t end_mul = t8 + c;
376 t8 = end_mul & 0xffffffffU;
// Reduction step on 32-bit halves: k is the low 32 bits of t0 * wasm_rinv.
380 k = (t0 * wasm_rinv) & 0xffffffffU;
381 c = mac_discard_lo(t0, k, wasm_modulus[0]);
382 mac(t1, k, wasm_modulus[1], c, t0, c);
383 mac(t2, k, wasm_modulus[2], c, t1, c);
384 mac(t3, k, wasm_modulus[3], c, t2, c);
385 mac(t4, k, wasm_modulus[4], c, t3, c);
386 mac(t5, k, wasm_modulus[5], c, t4, c);
387 mac(t6, k, wasm_modulus[6], c, t5, c);
388 mac(t7, k, wasm_modulus[7], c, t6, c);
389 uint64_t end_reduce = c + t8;
390 t7 = end_reduce & 0xffffffffU;
391 c = end_reduce >> 32;
// Reassemble 64-bit limbs from pairs of 32-bit halves.
394 uint64_t v0 = t0 + (t1 << 32);
395 uint64_t v1 = t2 + (t3 << 32);
396 uint64_t v2 = t4 + (t5 << 32);
397 uint64_t v3 = t6 + (t7 << 32);
// Final correction, same shape as on the 128-bit path; v4 is defined on a line missing from this listing.
400 uint64_t r0 = sbb(v0, modulus.data[0], borrow, borrow);
401 uint64_t r1 = sbb(v1, modulus.data[1], borrow, borrow);
402 uint64_t r2 = sbb(v2, modulus.data[2], borrow, borrow);
403 uint64_t r3 = sbb(v3, modulus.data[3], borrow, borrow);
404 borrow = borrow ^ (0ULL - v4);
405 r0 += (modulus.data[0] & borrow);
406 uint64_t carry = r0 < (modulus.data[0] & borrow);
407 r1 = addc(r1, modulus.data[1] & borrow, carry, carry);
408 r2 = addc(r2, modulus.data[2] & borrow, carry, carry);
409 r3 += (modulus.data[3] & borrow) + carry;
410 return { r0, r1, r2, r3 };
// Montgomery multiplication of this element by `other`.
// Forwards to montgomery_mul_big when modulus.data[3] >= 0x4000000000000000ULL. Otherwise each round
// interleaves two carry chains: `a` carries the product accumulation (one limb of this element times `other`),
// and `c` carries the reduction (k times the modulus), which also shifts the accumulator down one limb.
// NOTE(review): this text is an incomplete listing. The embedded listing numbers jump (417 -> 419, 422 -> 424,
// 429 -> 432, 432 -> 434, ..., 519 -> 637), so the braces, the #else/#endif, the per-round lines between the
// mac_mini and mac_discard_lo calls, the end-of-round handling, and listing lines 520-636 are not visible.
// Restore them from the original file before compiling.
414template <
class T>
constexpr field<T> field<T>::montgomery_mul(
const field& other)
const noexcept
416 if constexpr (modulus.data[3] >= 0x4000000000000000ULL) {
417 return montgomery_mul_big(other);
// Path for compilers with a native 128-bit integer, excluding wasm: four 64-bit limbs.
419#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
// Round 0: data[0] * other, reduced with k = t0 * T::r_inv.
420 auto [t0, c] = mul_wide(data[0], other.data[0]);
421 uint64_t k = t0 * T::r_inv;
422 uint64_t a = mac_discard_lo(t0, k, modulus.data[0]);
424 uint64_t t1 = mac_mini(a, data[0], other.data[1], a);
425 mac(t1, k, modulus.data[1], c, t0, c);
426 uint64_t t2 = mac_mini(a, data[0], other.data[2], a);
427 mac(t2, k, modulus.data[2], c, t1, c);
428 uint64_t t3 = mac_mini(a, data[0], other.data[3], a);
429 mac(t3, k, modulus.data[3], c, t2, c);
// Round 1: data[1] * other. NOTE(review): the line between 432 and 434 is missing; presumably it recomputes k.
432 mac_mini(t0, data[1], other.data[0], t0, a);
434 c = mac_discard_lo(t0, k, modulus.data[0]);
435 mac(t1, data[1], other.data[1], a, t1, a);
436 mac(t1, k, modulus.data[1], c, t0, c);
437 mac(t2, data[1], other.data[2], a, t2, a);
438 mac(t2, k, modulus.data[2], c, t1, c);
439 mac(t3, data[1], other.data[3], a, t3, a);
440 mac(t3, k, modulus.data[3], c, t2, c);
// Round 2: data[2] * other.
443 mac_mini(t0, data[2], other.data[0], t0, a);
445 c = mac_discard_lo(t0, k, modulus.data[0]);
446 mac(t1, data[2], other.data[1], a, t1, a);
447 mac(t1, k, modulus.data[1], c, t0, c);
448 mac(t2, data[2], other.data[2], a, t2, a);
449 mac(t2, k, modulus.data[2], c, t1, c);
450 mac(t3, data[2], other.data[3], a, t3, a);
451 mac(t3, k, modulus.data[3], c, t2, c);
// Round 3: data[3] * other.
454 mac_mini(t0, data[3], other.data[0], t0, a);
456 c = mac_discard_lo(t0, k, modulus.data[0]);
457 mac(t1, data[3], other.data[1], a, t1, a);
458 mac(t1, k, modulus.data[1], c, t0, c);
459 mac(t2, data[3], other.data[2], a, t2, a);
460 mac(t2, k, modulus.data[2], c, t1, c);
461 mac(t3, data[3], other.data[3], a, t3, a);
462 mac(t3, k, modulus.data[3], c, t2, c);
// No final conditional subtraction is visible on this path; the accumulator limbs are returned as they are.
464 return { t0, t1, t2, t3 };
// Fallback path (the #else itself is not visible in this listing): same algorithm on eight 32-bit halves.
466 constexpr uint64_t wasm_modulus[8]{
467 modulus.data[0] & 0xffffffffULL, modulus.data[0] >> 32ULL, modulus.data[1] & 0xffffffffULL,
468 modulus.data[1] >> 32ULL, modulus.data[2] & 0xffffffffULL, modulus.data[2] >> 32ULL,
469 modulus.data[3] & 0xffffffffULL, modulus.data[3] >> 32ULL,
// Only the low 32 bits of T::r_inv are used on this path.
471 constexpr uint64_t wasm_rinv = T::r_inv & 0xffffffffULL;
473 const uint64_t left[8]{
474 data[0] & 0xffffffffULL, data[0] >> 32, data[1] & 0xffffffffULL, data[1] >> 32,
475 data[2] & 0xffffffffULL, data[2] >> 32, data[3] & 0xffffffffULL, data[3] >> 32,
477 const uint64_t right[8]{
478 other.data[0] & 0xffffffffULL, other.data[0] >> 32, other.data[1] & 0xffffffffULL, other.data[1] >> 32,
479 other.data[2] & 0xffffffffULL, other.data[2] >> 32, other.data[3] & 0xffffffffULL, other.data[3] >> 32,
// Round 0 on the halves: left[0] * right, reduced with k = low 32 bits of t0 * wasm_rinv.
482 auto [t0, c] = mul_wide(left[0], right[0]);
483 uint64_t k = (t0 * wasm_rinv) & 0xffffffffULL;
484 uint64_t a = mac_discard_lo(t0, k, wasm_modulus[0]);
486 uint64_t t1 = mac_mini(a, left[0], right[1], a);
487 mac(t1, k, wasm_modulus[1], c, t0, c);
488 uint64_t t2 = mac_mini(a, left[0], right[2], a);
489 mac(t2, k, wasm_modulus[2], c, t1, c);
490 uint64_t t3 = mac_mini(a, left[0], right[3], a);
491 mac(t3, k, wasm_modulus[3], c, t2, c);
492 uint64_t t4 = mac_mini(a, left[0], right[4], a);
493 mac(t4, k, wasm_modulus[4], c, t3, c);
494 uint64_t t5 = mac_mini(a, left[0], right[5], a);
495 mac(t5, k, wasm_modulus[5], c, t4, c);
496 uint64_t t6 = mac_mini(a, left[0], right[6], a);
497 mac(t6, k, wasm_modulus[6], c, t5, c);
498 uint64_t t7 = mac_mini(a, left[0], right[7], a);
499 mac(t7, k, wasm_modulus[7], c, t6, c);
// Rounds 1..7: same interleaving, with k recomputed from the updated t0 each round.
502 for (
size_t i = 1; i < 8; ++i) {
503 mac_mini(t0, left[i], right[0], t0, a);
504 k = (t0 * wasm_rinv) & 0xffffffffULL;
505 c = mac_discard_lo(t0, k, wasm_modulus[0]);
506 mac(t1, left[i], right[1], a, t1, a);
507 mac(t1, k, wasm_modulus[1], c, t0, c);
508 mac(t2, left[i], right[2], a, t2, a);
509 mac(t2, k, wasm_modulus[2], c, t1, c);
510 mac(t3, left[i], right[3], a, t3, a);
511 mac(t3, k, wasm_modulus[3], c, t2, c);
512 mac(t4, left[i], right[4], a, t4, a);
513 mac(t4, k, wasm_modulus[4], c, t3, c);
514 mac(t5, left[i], right[5], a, t5, a);
515 mac(t5, k, wasm_modulus[5], c, t4, c);
516 mac(t6, left[i], right[6], a, t6, a);
517 mac(t6, k, wasm_modulus[6], c, t5, c);
518 mac(t7, left[i], right[7], a, t7, a);
519 mac(t7, k, wasm_modulus[7], c, t6, c);
// Reassemble four 64-bit limbs from the eight 32-bit halves.
637 return { t0 + (t1 << 32), t2 + (t3 << 32), t4 + (t5 << 32), t6 + (t7 << 32) };
// Montgomery squaring of this element.
// Forwards to montgomery_mul_big(*this) when modulus.data[3] >= 0x4000000000000000ULL. On the 128-bit path the
// off-diagonal products data[i] * data[j] are added through square_accumulate and the diagonal products
// data[i] * data[i] through mac_mini, with a reduction step after each row. The last visible statement forwards
// to montgomery_mul(*this), presumably the #else fallback.
// NOTE(review): this text is an incomplete listing. The embedded listing numbers jump (644 -> 646, 647 -> 649,
// 660 -> 662, 662 -> 664, 666 -> 668, ..., 693 -> 697), so the braces, the #else/#endif and several per-round
// statements (for instance whatever precedes each mac_discard_lo after round 0) are not visible. Restore them
// from the original file before compiling.
641template <
class T>
constexpr field<T> field<T>::montgomery_square() const noexcept
643 if constexpr (modulus.data[3] >= 0x4000000000000000ULL) {
644 return montgomery_mul_big(*
this);
// Path for compilers with a native 128-bit integer, excluding wasm.
646#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
647 uint64_t carry_hi = 0;
// Row 0: data[0]^2, then the cross products with data[1..3], each through square_accumulate.
649 auto [t0, carry_lo] = mul_wide(data[0], data[0]);
650 uint64_t t1 = square_accumulate(0, data[1], data[0], carry_lo, carry_hi, carry_lo, carry_hi);
651 uint64_t t2 = square_accumulate(0, data[2], data[0], carry_lo, carry_hi, carry_lo, carry_hi);
652 uint64_t t3 = square_accumulate(0, data[3], data[0], carry_lo, carry_hi, carry_lo, carry_hi);
// Save the row's outgoing carry, then reduce: add k * modulus and shift the accumulator down one limb.
654 uint64_t round_carry = carry_lo;
655 uint64_t k = t0 * T::r_inv;
656 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
657 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
658 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
659 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
660 t3 = carry_lo + round_carry;
// Row 1: diagonal term data[1]^2, then the cross products with data[2..3].
662 t1 = mac_mini(t1, data[1], data[1], carry_lo);
664 t2 = square_accumulate(t2, data[2], data[1], carry_lo, carry_hi, carry_lo, carry_hi);
665 t3 = square_accumulate(t3, data[3], data[1], carry_lo, carry_hi, carry_lo, carry_hi);
666 round_carry = carry_lo;
// Reduction for row 1. NOTE(review): listing line 667 is missing; presumably it recomputes k.
668 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
669 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
670 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
671 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
672 t3 = carry_lo + round_carry;
// Row 2: diagonal term data[2]^2, then the cross product with data[3].
674 t2 = mac_mini(t2, data[2], data[2], carry_lo);
676 t3 = square_accumulate(t3, data[3], data[2], carry_lo, carry_hi, carry_lo, carry_hi);
677 round_carry = carry_lo;
// Reduction for row 2.
679 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
680 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
681 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
682 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
683 t3 = carry_lo + round_carry;
// Row 3: only the diagonal term data[3]^2 remains.
685 t3 = mac_mini(t3, data[3], data[3], carry_lo);
687 round_carry = carry_lo;
// Reduction for row 3.
688 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
689 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
690 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
691 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
692 t3 = carry_lo + round_carry;
693 return { t0, t1, t2, t3 };
// Remaining path: reuse the general multiplication.
697 return montgomery_mul(*
this);
// Full-width product of this element and `other`, without any modular reduction, returned as a wide_array.
// The 128-bit path builds eight 64-bit limbs (r0..r6 and the final carry) row by row, one row per limb of this
// element. The other path does the same over 32-bit halves (r0..r14 and the final carry) and packs pairs of
// halves into 64-bit limbs at the end.
// NOTE(review): this text is an incomplete listing. The embedded listing numbers jump (707 -> 709, 724 -> 726,
// 728 -> 731, 807 -> 810, and nothing after 811), so the #else/#endif, the closers of the array initializers,
// the `return {` that opens the final list and the closing braces are not visible. Restore them from the
// original file before compiling.
701template <
class T>
constexpr struct field<T>::wide_array field<T>::mul_512(const field& other) const noexcept {
// Path for compilers with a native 128-bit integer, excluding wasm.
702#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
703 uint64_t carry_2 = 0;
// Row 0: data[0] times each limb of other; the carry out of the last column is kept in carry_2.
704 auto [r0, carry] = mul_wide(data[0], other.data[0]);
705 uint64_t r1 = mac_mini(carry, data[0], other.data[1], carry);
706 uint64_t r2 = mac_mini(carry, data[0], other.data[2], carry);
707 uint64_t r3 = mac_mini(carry, data[0], other.data[3], carry_2);
// Row 1: data[1] times other, added one limb higher; the last column takes the previous row's carry_2 as its
// addend and leaves the new top carry in carry_2.
709 r1 = mac_mini(r1, data[1], other.data[0], carry);
710 r2 = mac(r2, data[1], other.data[1], carry, carry);
711 r3 = mac(r3, data[1], other.data[2], carry, carry);
712 uint64_t r4 = mac(carry_2, data[1], other.data[3], carry, carry_2);
// Row 2: data[2] times other.
714 r2 = mac_mini(r2, data[2], other.data[0], carry);
715 r3 = mac(r3, data[2], other.data[1], carry, carry);
716 r4 = mac(r4, data[2], other.data[2], carry, carry);
717 uint64_t r5 = mac(carry_2, data[2], other.data[3], carry, carry_2);
// Row 3: data[3] times other.
719 r3 = mac_mini(r3, data[3], other.data[0], carry);
720 r4 = mac(r4, data[3], other.data[1], carry, carry);
721 r5 = mac(r5, data[3], other.data[2], carry, carry);
722 uint64_t r6 = mac(carry_2, data[3], other.data[3], carry, carry_2);
// The final carry_2 is the most significant limb.
724 return { r0, r1, r2, r3, r4, r5, r6, carry_2 };
// Other path (the #else itself is not visible in this listing): split both operands into eight 32-bit halves.
726 const uint64_t left[8]{
727 data[0] & 0xffffffffULL, data[0] >> 32, data[1] & 0xffffffffULL, data[1] >> 32,
728 data[2] & 0xffffffffULL, data[2] >> 32, data[3] & 0xffffffffULL, data[3] >> 32,
731 const uint64_t right[8]{
732 other.data[0] & 0xffffffffULL, other.data[0] >> 32, other.data[1] & 0xffffffffULL, other.data[1] >> 32,
733 other.data[2] & 0xffffffffULL, other.data[2] >> 32, other.data[3] & 0xffffffffULL, other.data[3] >> 32,
736 uint64_t carry_2 = 0;
// Row 0: left[0] times each half of right; the carry out of the last column is kept in carry_2.
737 auto [r0, carry] = mul_wide(left[0], right[0]);
738 uint64_t r1 = mac_mini(carry, left[0], right[1], carry);
739 uint64_t r2 = mac_mini(carry, left[0], right[2], carry);
740 uint64_t r3 = mac_mini(carry, left[0], right[3], carry);
741 uint64_t r4 = mac_mini(carry, left[0], right[4], carry);
742 uint64_t r5 = mac_mini(carry, left[0], right[5], carry);
743 uint64_t r6 = mac_mini(carry, left[0], right[6], carry);
744 uint64_t r7 = mac_mini(carry, left[0], right[7], carry_2);
// Rows 1..7 follow the same pattern as on the 128-bit path: row i is added i halves higher, and its last
// column consumes and replaces carry_2.
746 r1 = mac_mini(r1, left[1], right[0], carry);
747 r2 = mac(r2, left[1], right[1], carry, carry);
748 r3 = mac(r3, left[1], right[2], carry, carry);
749 r4 = mac(r4, left[1], right[3], carry, carry);
750 r5 = mac(r5, left[1], right[4], carry, carry);
751 r6 = mac(r6, left[1], right[5], carry, carry);
752 r7 = mac(r7, left[1], right[6], carry, carry);
753 uint64_t r8 = mac(carry_2, left[1], right[7], carry, carry_2);
// Row 2.
755 r2 = mac_mini(r2, left[2], right[0], carry);
756 r3 = mac(r3, left[2], right[1], carry, carry);
757 r4 = mac(r4, left[2], right[2], carry, carry);
758 r5 = mac(r5, left[2], right[3], carry, carry);
759 r6 = mac(r6, left[2], right[4], carry, carry);
760 r7 = mac(r7, left[2], right[5], carry, carry);
761 r8 = mac(r8, left[2], right[6], carry, carry);
762 uint64_t r9 = mac(carry_2, left[2], right[7], carry, carry_2);
// Row 3.
764 r3 = mac_mini(r3, left[3], right[0], carry);
765 r4 = mac(r4, left[3], right[1], carry, carry);
766 r5 = mac(r5, left[3], right[2], carry, carry);
767 r6 = mac(r6, left[3], right[3], carry, carry);
768 r7 = mac(r7, left[3], right[4], carry, carry);
769 r8 = mac(r8, left[3], right[5], carry, carry);
770 r9 = mac(r9, left[3], right[6], carry, carry);
771 uint64_t r10 = mac(carry_2, left[3], right[7], carry, carry_2);
// Row 4.
773 r4 = mac_mini(r4, left[4], right[0], carry);
774 r5 = mac(r5, left[4], right[1], carry, carry);
775 r6 = mac(r6, left[4], right[2], carry, carry);
776 r7 = mac(r7, left[4], right[3], carry, carry);
777 r8 = mac(r8, left[4], right[4], carry, carry);
778 r9 = mac(r9, left[4], right[5], carry, carry);
779 r10 = mac(r10, left[4], right[6], carry, carry);
780 uint64_t r11 = mac(carry_2, left[4], right[7], carry, carry_2);
// Row 5.
782 r5 = mac_mini(r5, left[5], right[0], carry);
783 r6 = mac(r6, left[5], right[1], carry, carry);
784 r7 = mac(r7, left[5], right[2], carry, carry);
785 r8 = mac(r8, left[5], right[3], carry, carry);
786 r9 = mac(r9, left[5], right[4], carry, carry);
787 r10 = mac(r10, left[5], right[5], carry, carry);
788 r11 = mac(r11, left[5], right[6], carry, carry);
789 uint64_t r12 = mac(carry_2, left[5], right[7], carry, carry_2);
// Row 6.
791 r6 = mac_mini(r6, left[6], right[0], carry);
792 r7 = mac(r7, left[6], right[1], carry, carry);
793 r8 = mac(r8, left[6], right[2], carry, carry);
794 r9 = mac(r9, left[6], right[3], carry, carry);
795 r10 = mac(r10, left[6], right[4], carry, carry);
796 r11 = mac(r11, left[6], right[5], carry, carry);
797 r12 = mac(r12, left[6], right[6], carry, carry);
798 uint64_t r13 = mac(carry_2, left[6], right[7], carry, carry_2);
// Row 7.
800 r7 = mac_mini(r7, left[7], right[0], carry);
801 r8 = mac(r8, left[7], right[1], carry, carry);
802 r9 = mac(r9, left[7], right[2], carry, carry);
803 r10 = mac(r10, left[7], right[3], carry, carry);
804 r11 = mac(r11, left[7], right[4], carry, carry);
805 r12 = mac(r12, left[7], right[5], carry, carry);
806 r13 = mac(r13, left[7], right[6], carry, carry);
807 uint64_t r14 = mac(carry_2, left[7], right[7], carry, carry_2);
// Pack pairs of 32-bit halves into the eight 64-bit output limbs; the final carry_2 is the topmost half.
810 r0 + (r1 << 32), r2 + (r3 << 32), r4 + (r5 << 32), r6 + (r7 << 32),
811 r8 + (r9 << 32), r10 + (r11 << 32), r12 + (r13 << 32), r14 + (carry_2 << 32),
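// Cross-reference notes carried over from the generated documentation page (not part of the program):
// Definition: uint256.hpp:25
// constexpr_utils defines some helper methods that perform some STL-equivalent operations but in a cons...
// (description truncated in the source page)
// Definition: constexpr_utils.hpp:16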