diff options
3 files changed, 2049 insertions, 0 deletions
diff --git a/meta/recipes-connectivity/openssl/openssl/CVE-2025-27587-1.patch b/meta/recipes-connectivity/openssl/openssl/CVE-2025-27587-1.patch new file mode 100644 index 0000000000..eb3fc52dca --- /dev/null +++ b/meta/recipes-connectivity/openssl/openssl/CVE-2025-27587-1.patch | |||
| @@ -0,0 +1,1918 @@ | |||
| 1 | From 14ac0f0e4e1f36793d09b41ffd5e482575289ab2 Mon Sep 17 00:00:00 2001 | ||
| 2 | From: Danny Tsen <dtsen@us.ibm.com> | ||
| 3 | Date: Tue, 11 Feb 2025 13:48:01 -0500 | ||
| 4 | Subject: [PATCH] Fix Minerva timing side-channel signal for P-384 curve on PPC | ||
| 5 | |||
| 6 | 1. bn_ppc.c: Used bn_mul_mont_int() instead of bn_mul_mont_300_fixed_n6() | ||
| 7 | for Montgomery multiplication. | ||
| 8 | 2. ecp_nistp384-ppc64.pl: | ||
| 9 | - Re-wrote p384_felem_mul and p384_felem_square for easier maintenance with | ||
| 10 | minimum perl wrapper. | ||
| 11 | - Implemented p384_felem_reduce, p384_felem_mul_reduce and p384_felem_square_reduce. | ||
| 12 | - Implemented p384_felem_diff64, felem_diff_128_64 and felem_diff128 in assembly. | ||
| 13 | 3. ecp_nistp384.c: | ||
| 14 | - Added wrapper function for p384_felem_mul_reduce and p384_felem_square_reduce. | ||
| 15 | |||
| 16 | Signed-off-by: Danny Tsen <dtsen@us.ibm.com> | ||
| 17 | |||
| 18 | Reviewed-by: Dmitry Belyavskiy <beldmit@gmail.com> | ||
| 19 | Reviewed-by: Tomas Mraz <tomas@openssl.org> | ||
| 20 | (Merged from https://github.com/openssl/openssl/pull/26709) | ||
| 21 | |||
| 22 | (cherry picked from commit 85cabd94958303859b1551364a609d4ff40b67a5) | ||
| 23 | |||
| 24 | CVE: CVE-2025-27587 | ||
| 25 | Upstream-Status: Backport [https://github.com/openssl/openssl/commit/14ac0f0e4e1f36793d09b41ffd5e482575289ab2] | ||
| 26 | Signed-off-by: Peter Marko <peter.marko@siemens.com> | ||
| 27 | --- | ||
| 28 | crypto/bn/bn_ppc.c | 3 + | ||
| 29 | crypto/ec/asm/ecp_nistp384-ppc64.pl | 1724 +++++++++++++++++++++++---- | ||
| 30 | crypto/ec/ecp_nistp384.c | 28 +- | ||
| 31 | 3 files changed, 1504 insertions(+), 251 deletions(-) | ||
| 32 | |||
| 33 | diff --git a/crypto/bn/bn_ppc.c b/crypto/bn/bn_ppc.c | ||
| 34 | index 1e9421bee2..29293bad55 100644 | ||
| 35 | --- a/crypto/bn/bn_ppc.c | ||
| 36 | +++ b/crypto/bn/bn_ppc.c | ||
| 37 | @@ -41,12 +41,15 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, | ||
| 38 | */ | ||
| 39 | |||
| 40 | #if defined(_ARCH_PPC64) && !defined(__ILP32__) | ||
| 41 | + /* Minerva side-channel fix: take the fixed-n6 paths only if USE_FIXED_N6 is defined */ | ||
| 42 | +# if defined(USE_FIXED_N6) | ||
| 43 | if (num == 6) { | ||
| 44 | if (OPENSSL_ppccap_P & PPC_MADD300) | ||
| 45 | return bn_mul_mont_300_fixed_n6(rp, ap, bp, np, n0, num); | ||
| 46 | else | ||
| 47 | return bn_mul_mont_fixed_n6(rp, ap, bp, np, n0, num); | ||
| 48 | } | ||
| 49 | +# endif | ||
| 50 | #endif | ||
| 51 | |||
| 52 | return bn_mul_mont_int(rp, ap, bp, np, n0, num); | ||
| 53 | diff --git a/crypto/ec/asm/ecp_nistp384-ppc64.pl b/crypto/ec/asm/ecp_nistp384-ppc64.pl | ||
| 54 | index 28f4168e52..b663bddfc6 100755 | ||
| 55 | --- a/crypto/ec/asm/ecp_nistp384-ppc64.pl | ||
| 56 | +++ b/crypto/ec/asm/ecp_nistp384-ppc64.pl | ||
| 57 | @@ -7,13 +7,15 @@ | ||
| 58 | # https://www.openssl.org/source/license.html | ||
| 59 | # | ||
| 60 | # ==================================================================== | ||
| 61 | -# Written by Rohan McLure <rmclure@linux.ibm.com> for the OpenSSL | ||
| 62 | -# project. | ||
| 63 | +# Written by Danny Tsen <dtsen@us.ibm.com> for the OpenSSL project. | ||
| 64 | +# | ||
| 65 | +# Copyright 2025- IBM Corp. | ||
| 66 | # ==================================================================== | ||
| 67 | # | ||
| 68 | -# p384 lower-level primitives for PPC64 using vector instructions. | ||
| 69 | +# p384 lower-level primitives for PPC64. | ||
| 70 | # | ||
| 71 | |||
| 72 | + | ||
| 73 | use strict; | ||
| 74 | use warnings; | ||
| 75 | |||
| 76 | @@ -21,7 +23,7 @@ my $flavour = shift; | ||
| 77 | my $output = ""; | ||
| 78 | while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} | ||
| 79 | if (!$output) { | ||
| 80 | - $output = "-"; | ||
| 81 | + $output = "-"; | ||
| 82 | } | ||
| 83 | |||
| 84 | my ($xlate, $dir); | ||
| 85 | @@ -35,271 +37,1495 @@ open OUT,"| \"$^X\" $xlate $flavour $output"; | ||
| 86 | |||
| 87 | my $code = ""; | ||
| 88 | |||
| 89 | -my ($sp, $outp, $savelr, $savesp) = ("r1", "r3", "r10", "r12"); | ||
| 90 | - | ||
| 91 | -my $vzero = "v32"; | ||
| 92 | - | ||
| 93 | -sub startproc($) | ||
| 94 | -{ | ||
| 95 | - my ($name) = @_; | ||
| 96 | - | ||
| 97 | - $code.=<<___; | ||
| 98 | - .globl ${name} | ||
| 99 | - .align 5 | ||
| 100 | -${name}: | ||
| 101 | - | ||
| 102 | -___ | ||
| 103 | -} | ||
| 104 | - | ||
| 105 | -sub endproc($) | ||
| 106 | -{ | ||
| 107 | - my ($name) = @_; | ||
| 108 | - | ||
| 109 | - $code.=<<___; | ||
| 110 | - blr | ||
| 111 | - .size ${name},.-${name} | ||
| 112 | - | ||
| 113 | -___ | ||
| 114 | -} | ||
| 115 | - | ||
| 116 | -sub load_vrs($$) | ||
| 117 | -{ | ||
| 118 | - my ($pointer, $reg_list) = @_; | ||
| 119 | - | ||
| 120 | - for (my $i = 0; $i <= 6; $i++) { | ||
| 121 | - my $offset = $i * 8; | ||
| 122 | - $code.=<<___; | ||
| 123 | - lxsd $reg_list->[$i],$offset($pointer) | ||
| 124 | -___ | ||
| 125 | - } | ||
| 126 | - | ||
| 127 | - $code.=<<___; | ||
| 128 | - | ||
| 129 | -___ | ||
| 130 | -} | ||
| 131 | - | ||
| 132 | -sub store_vrs($$) | ||
| 133 | -{ | ||
| 134 | - my ($pointer, $reg_list) = @_; | ||
| 135 | - | ||
| 136 | - for (my $i = 0; $i <= 12; $i++) { | ||
| 137 | - my $offset = $i * 16; | ||
| 138 | - $code.=<<___; | ||
| 139 | - stxv $reg_list->[$i],$offset($pointer) | ||
| 140 | -___ | ||
| 141 | - } | ||
| 142 | - | ||
| 143 | - $code.=<<___; | ||
| 144 | - | ||
| 145 | -___ | ||
| 146 | -} | ||
| 147 | - | ||
| 148 | $code.=<<___; | ||
| 149 | -.machine "any" | ||
| 150 | +.machine "any" | ||
| 151 | .text | ||
| 152 | |||
| 153 | -___ | ||
| 154 | +.globl p384_felem_mul | ||
| 155 | +.type p384_felem_mul,\@function | ||
| 156 | +.align 4 | ||
| 157 | +p384_felem_mul: | ||
| 158 | |||
| 159 | -{ | ||
| 160 | - # mul/square common | ||
| 161 | - my ($t1, $t2, $t3, $t4) = ("v33", "v34", "v42", "v43"); | ||
| 162 | - my ($zero, $one) = ("r8", "r9"); | ||
| 163 | - my $out = "v51"; | ||
| 164 | + stdu 1, -176(1) | ||
| 165 | + mflr 0 | ||
| 166 | + std 14, 56(1) | ||
| 167 | + std 15, 64(1) | ||
| 168 | + std 16, 72(1) | ||
| 169 | + std 17, 80(1) | ||
| 170 | + std 18, 88(1) | ||
| 171 | + std 19, 96(1) | ||
| 172 | + std 20, 104(1) | ||
| 173 | + std 21, 112(1) | ||
| 174 | + std 22, 120(1) | ||
| 175 | |||
| 176 | - { | ||
| 177 | - # | ||
| 178 | - # p384_felem_mul | ||
| 179 | - # | ||
| 180 | + bl _p384_felem_mul_core | ||
| 181 | |||
| 182 | - my ($in1p, $in2p) = ("r4", "r5"); | ||
| 183 | - my @in1 = map("v$_",(44..50)); | ||
| 184 | - my @in2 = map("v$_",(35..41)); | ||
| 185 | + mtlr 0 | ||
| 186 | + ld 14, 56(1) | ||
| 187 | + ld 15, 64(1) | ||
| 188 | + ld 16, 72(1) | ||
| 189 | + ld 17, 80(1) | ||
| 190 | + ld 18, 88(1) | ||
| 191 | + ld 19, 96(1) | ||
| 192 | + ld 20, 104(1) | ||
| 193 | + ld 21, 112(1) | ||
| 194 | + ld 22, 120(1) | ||
| 195 | + addi 1, 1, 176 | ||
| 196 | + blr | ||
| 197 | +.size p384_felem_mul,.-p384_felem_mul | ||
| 198 | |||
| 199 | - startproc("p384_felem_mul"); | ||
| 200 | +.globl p384_felem_square | ||
| 201 | +.type p384_felem_square,\@function | ||
| 202 | +.align 4 | ||
| 203 | +p384_felem_square: | ||
| 204 | |||
| 205 | - $code.=<<___; | ||
| 206 | - vspltisw $vzero,0 | ||
| 207 | + stdu 1, -176(1) | ||
| 208 | + mflr 0 | ||
| 209 | + std 14, 56(1) | ||
| 210 | + std 15, 64(1) | ||
| 211 | + std 16, 72(1) | ||
| 212 | + std 17, 80(1) | ||
| 213 | |||
| 214 | -___ | ||
| 215 | + bl _p384_felem_square_core | ||
| 216 | |||
| 217 | - load_vrs($in1p, \@in1); | ||
| 218 | - load_vrs($in2p, \@in2); | ||
| 219 | - | ||
| 220 | - $code.=<<___; | ||
| 221 | - vmsumudm $out,$in1[0],$in2[0],$vzero | ||
| 222 | - stxv $out,0($outp) | ||
| 223 | - | ||
| 224 | - xxpermdi $t1,$in1[0],$in1[1],0b00 | ||
| 225 | - xxpermdi $t2,$in2[1],$in2[0],0b00 | ||
| 226 | - vmsumudm $out,$t1,$t2,$vzero | ||
| 227 | - stxv $out,16($outp) | ||
| 228 | - | ||
| 229 | - xxpermdi $t2,$in2[2],$in2[1],0b00 | ||
| 230 | - vmsumudm $out,$t1,$t2,$vzero | ||
| 231 | - vmsumudm $out,$in1[2],$in2[0],$out | ||
| 232 | - stxv $out,32($outp) | ||
| 233 | - | ||
| 234 | - xxpermdi $t2,$in2[1],$in2[0],0b00 | ||
| 235 | - xxpermdi $t3,$in1[2],$in1[3],0b00 | ||
| 236 | - xxpermdi $t4,$in2[3],$in2[2],0b00 | ||
| 237 | - vmsumudm $out,$t1,$t4,$vzero | ||
| 238 | - vmsumudm $out,$t3,$t2,$out | ||
| 239 | - stxv $out,48($outp) | ||
| 240 | - | ||
| 241 | - xxpermdi $t2,$in2[4],$in2[3],0b00 | ||
| 242 | - xxpermdi $t4,$in2[2],$in2[1],0b00 | ||
| 243 | - vmsumudm $out,$t1,$t2,$vzero | ||
| 244 | - vmsumudm $out,$t3,$t4,$out | ||
| 245 | - vmsumudm $out,$in1[4],$in2[0],$out | ||
| 246 | - stxv $out,64($outp) | ||
| 247 | - | ||
| 248 | - xxpermdi $t2,$in2[5],$in2[4],0b00 | ||
| 249 | - xxpermdi $t4,$in2[3],$in2[2],0b00 | ||
| 250 | - vmsumudm $out,$t1,$t2,$vzero | ||
| 251 | - vmsumudm $out,$t3,$t4,$out | ||
| 252 | - xxpermdi $t4,$in2[1],$in2[0],0b00 | ||
| 253 | - xxpermdi $t1,$in1[4],$in1[5],0b00 | ||
| 254 | - vmsumudm $out,$t1,$t4,$out | ||
| 255 | - stxv $out,80($outp) | ||
| 256 | - | ||
| 257 | - xxpermdi $t1,$in1[0],$in1[1],0b00 | ||
| 258 | - xxpermdi $t2,$in2[6],$in2[5],0b00 | ||
| 259 | - xxpermdi $t4,$in2[4],$in2[3],0b00 | ||
| 260 | - vmsumudm $out,$t1,$t2,$vzero | ||
| 261 | - vmsumudm $out,$t3,$t4,$out | ||
| 262 | - xxpermdi $t2,$in2[2],$in2[1],0b00 | ||
| 263 | - xxpermdi $t1,$in1[4],$in1[5],0b00 | ||
| 264 | - vmsumudm $out,$t1,$t2,$out | ||
| 265 | - vmsumudm $out,$in1[6],$in2[0],$out | ||
| 266 | - stxv $out,96($outp) | ||
| 267 | - | ||
| 268 | - xxpermdi $t1,$in1[1],$in1[2],0b00 | ||
| 269 | - xxpermdi $t2,$in2[6],$in2[5],0b00 | ||
| 270 | - xxpermdi $t3,$in1[3],$in1[4],0b00 | ||
| 271 | - vmsumudm $out,$t1,$t2,$vzero | ||
| 272 | - vmsumudm $out,$t3,$t4,$out | ||
| 273 | - xxpermdi $t3,$in2[2],$in2[1],0b00 | ||
| 274 | - xxpermdi $t1,$in1[5],$in1[6],0b00 | ||
| 275 | - vmsumudm $out,$t1,$t3,$out | ||
| 276 | - stxv $out,112($outp) | ||
| 277 | - | ||
| 278 | - xxpermdi $t1,$in1[2],$in1[3],0b00 | ||
| 279 | - xxpermdi $t3,$in1[4],$in1[5],0b00 | ||
| 280 | - vmsumudm $out,$t1,$t2,$vzero | ||
| 281 | - vmsumudm $out,$t3,$t4,$out | ||
| 282 | - vmsumudm $out,$in1[6],$in2[2],$out | ||
| 283 | - stxv $out,128($outp) | ||
| 284 | - | ||
| 285 | - xxpermdi $t1,$in1[3],$in1[4],0b00 | ||
| 286 | - vmsumudm $out,$t1,$t2,$vzero | ||
| 287 | - xxpermdi $t1,$in1[5],$in1[6],0b00 | ||
| 288 | - vmsumudm $out,$t1,$t4,$out | ||
| 289 | - stxv $out,144($outp) | ||
| 290 | - | ||
| 291 | - vmsumudm $out,$t3,$t2,$vzero | ||
| 292 | - vmsumudm $out,$in1[6],$in2[4],$out | ||
| 293 | - stxv $out,160($outp) | ||
| 294 | - | ||
| 295 | - vmsumudm $out,$t1,$t2,$vzero | ||
| 296 | - stxv $out,176($outp) | ||
| 297 | - | ||
| 298 | - vmsumudm $out,$in1[6],$in2[6],$vzero | ||
| 299 | - stxv $out,192($outp) | ||
| 300 | -___ | ||
| 301 | + mtlr 0 | ||
| 302 | + ld 14, 56(1) | ||
| 303 | + ld 15, 64(1) | ||
| 304 | + ld 16, 72(1) | ||
| 305 | + ld 17, 80(1) | ||
| 306 | + addi 1, 1, 176 | ||
| 307 | + blr | ||
| 308 | +.size p384_felem_square,.-p384_felem_square | ||
| 309 | |||
| 310 | - endproc("p384_felem_mul"); | ||
| 311 | - } | ||
| 312 | +# | ||
| 313 | +# Felem mul core function - | ||
| 314 | +# r3, r4 and r5 need to be pre-loaded. | ||
| 315 | +# | ||
| 316 | +.type _p384_felem_mul_core,\@function | ||
| 317 | +.align 4 | ||
| 318 | +_p384_felem_mul_core: | ||
| 319 | |||
| 320 | - { | ||
| 321 | - # | ||
| 322 | - # p384_felem_square | ||
| 323 | - # | ||
| 324 | + ld 6,0(4) | ||
| 325 | + ld 14,0(5) | ||
| 326 | + ld 7,8(4) | ||
| 327 | + ld 15,8(5) | ||
| 328 | + ld 8,16(4) | ||
| 329 | + ld 16,16(5) | ||
| 330 | + ld 9,24(4) | ||
| 331 | + ld 17,24(5) | ||
| 332 | + ld 10,32(4) | ||
| 333 | + ld 18,32(5) | ||
| 334 | + ld 11,40(4) | ||
| 335 | + ld 19,40(5) | ||
| 336 | + ld 12,48(4) | ||
| 337 | + ld 20,48(5) | ||
| 338 | |||
| 339 | - my ($inp) = ("r4"); | ||
| 340 | - my @in = map("v$_",(44..50)); | ||
| 341 | - my @inx2 = map("v$_",(35..41)); | ||
| 342 | + # out0 | ||
| 343 | + mulld 21, 14, 6 | ||
| 344 | + mulhdu 22, 14, 6 | ||
| 345 | + std 21, 0(3) | ||
| 346 | + std 22, 8(3) | ||
| 347 | |||
| 348 | - startproc("p384_felem_square"); | ||
| 349 | + vxor 0, 0, 0 | ||
| 350 | |||
| 351 | - $code.=<<___; | ||
| 352 | - vspltisw $vzero,0 | ||
| 353 | + # out1 | ||
| 354 | + mtvsrdd 32+13, 14, 6 | ||
| 355 | + mtvsrdd 32+14, 7, 15 | ||
| 356 | + vmsumudm 1, 13, 14, 0 | ||
| 357 | |||
| 358 | -___ | ||
| 359 | + # out2 | ||
| 360 | + mtvsrdd 32+15, 15, 6 | ||
| 361 | + mtvsrdd 32+16, 7, 16 | ||
| 362 | + mtvsrdd 32+17, 0, 8 | ||
| 363 | + mtvsrdd 32+18, 0, 14 | ||
| 364 | + vmsumudm 19, 15, 16, 0 | ||
| 365 | + vmsumudm 2, 17, 18, 19 | ||
| 366 | |||
| 367 | - load_vrs($inp, \@in); | ||
| 368 | + # out3 | ||
| 369 | + mtvsrdd 32+13, 16, 6 | ||
| 370 | + mtvsrdd 32+14, 7, 17 | ||
| 371 | + mtvsrdd 32+15, 14, 8 | ||
| 372 | + mtvsrdd 32+16, 9, 15 | ||
| 373 | + vmsumudm 19, 13, 14, 0 | ||
| 374 | + vmsumudm 3, 15, 16, 19 | ||
| 375 | |||
| 376 | - $code.=<<___; | ||
| 377 | - li $zero,0 | ||
| 378 | - li $one,1 | ||
| 379 | - mtvsrdd $t1,$one,$zero | ||
| 380 | -___ | ||
| 381 | + # out4 | ||
| 382 | + mtvsrdd 32+13, 17, 6 | ||
| 383 | + mtvsrdd 32+14, 7, 18 | ||
| 384 | + mtvsrdd 32+15, 15, 8 | ||
| 385 | + mtvsrdd 32+16, 9, 16 | ||
| 386 | + mtvsrdd 32+17, 0, 10 | ||
| 387 | + mtvsrdd 32+18, 0, 14 | ||
| 388 | + vmsumudm 19, 13, 14, 0 | ||
| 389 | + vmsumudm 4, 15, 16, 19 | ||
| 390 | + vmsumudm 4, 17, 18, 4 | ||
| 391 | |||
| 392 | - for (my $i = 0; $i <= 6; $i++) { | ||
| 393 | - $code.=<<___; | ||
| 394 | - vsld $inx2[$i],$in[$i],$t1 | ||
| 395 | -___ | ||
| 396 | - } | ||
| 397 | - | ||
| 398 | - $code.=<<___; | ||
| 399 | - vmsumudm $out,$in[0],$in[0],$vzero | ||
| 400 | - stxv $out,0($outp) | ||
| 401 | - | ||
| 402 | - vmsumudm $out,$in[0],$inx2[1],$vzero | ||
| 403 | - stxv $out,16($outp) | ||
| 404 | - | ||
| 405 | - vmsumudm $out,$in[0],$inx2[2],$vzero | ||
| 406 | - vmsumudm $out,$in[1],$in[1],$out | ||
| 407 | - stxv $out,32($outp) | ||
| 408 | - | ||
| 409 | - xxpermdi $t1,$in[0],$in[1],0b00 | ||
| 410 | - xxpermdi $t2,$inx2[3],$inx2[2],0b00 | ||
| 411 | - vmsumudm $out,$t1,$t2,$vzero | ||
| 412 | - stxv $out,48($outp) | ||
| 413 | - | ||
| 414 | - xxpermdi $t4,$inx2[4],$inx2[3],0b00 | ||
| 415 | - vmsumudm $out,$t1,$t4,$vzero | ||
| 416 | - vmsumudm $out,$in[2],$in[2],$out | ||
| 417 | - stxv $out,64($outp) | ||
| 418 | - | ||
| 419 | - xxpermdi $t2,$inx2[5],$inx2[4],0b00 | ||
| 420 | - vmsumudm $out,$t1,$t2,$vzero | ||
| 421 | - vmsumudm $out,$in[2],$inx2[3],$out | ||
| 422 | - stxv $out,80($outp) | ||
| 423 | - | ||
| 424 | - xxpermdi $t2,$inx2[6],$inx2[5],0b00 | ||
| 425 | - vmsumudm $out,$t1,$t2,$vzero | ||
| 426 | - vmsumudm $out,$in[2],$inx2[4],$out | ||
| 427 | - vmsumudm $out,$in[3],$in[3],$out | ||
| 428 | - stxv $out,96($outp) | ||
| 429 | - | ||
| 430 | - xxpermdi $t3,$in[1],$in[2],0b00 | ||
| 431 | - vmsumudm $out,$t3,$t2,$vzero | ||
| 432 | - vmsumudm $out,$in[3],$inx2[4],$out | ||
| 433 | - stxv $out,112($outp) | ||
| 434 | - | ||
| 435 | - xxpermdi $t1,$in[2],$in[3],0b00 | ||
| 436 | - vmsumudm $out,$t1,$t2,$vzero | ||
| 437 | - vmsumudm $out,$in[4],$in[4],$out | ||
| 438 | - stxv $out,128($outp) | ||
| 439 | - | ||
| 440 | - xxpermdi $t1,$in[3],$in[4],0b00 | ||
| 441 | - vmsumudm $out,$t1,$t2,$vzero | ||
| 442 | - stxv $out,144($outp) | ||
| 443 | - | ||
| 444 | - vmsumudm $out,$in[4],$inx2[6],$vzero | ||
| 445 | - vmsumudm $out,$in[5],$in[5],$out | ||
| 446 | - stxv $out,160($outp) | ||
| 447 | - | ||
| 448 | - vmsumudm $out,$in[5],$inx2[6],$vzero | ||
| 449 | - stxv $out,176($outp) | ||
| 450 | - | ||
| 451 | - vmsumudm $out,$in[6],$in[6],$vzero | ||
| 452 | - stxv $out,192($outp) | ||
| 453 | -___ | ||
| 454 | + # out5 | ||
| 455 | + mtvsrdd 32+13, 18, 6 | ||
| 456 | + mtvsrdd 32+14, 7, 19 | ||
| 457 | + mtvsrdd 32+15, 16, 8 | ||
| 458 | + mtvsrdd 32+16, 9, 17 | ||
| 459 | + mtvsrdd 32+17, 14, 10 | ||
| 460 | + mtvsrdd 32+18, 11, 15 | ||
| 461 | + vmsumudm 19, 13, 14, 0 | ||
| 462 | + vmsumudm 5, 15, 16, 19 | ||
| 463 | + vmsumudm 5, 17, 18, 5 | ||
| 464 | + | ||
| 465 | + stxv 32+1, 16(3) | ||
| 466 | + stxv 32+2, 32(3) | ||
| 467 | + stxv 32+3, 48(3) | ||
| 468 | + stxv 32+4, 64(3) | ||
| 469 | + stxv 32+5, 80(3) | ||
| 470 | + | ||
| 471 | + # out6 | ||
| 472 | + mtvsrdd 32+13, 19, 6 | ||
| 473 | + mtvsrdd 32+14, 7, 20 | ||
| 474 | + mtvsrdd 32+15, 17, 8 | ||
| 475 | + mtvsrdd 32+16, 9, 18 | ||
| 476 | + mtvsrdd 32+17, 15, 10 | ||
| 477 | + mtvsrdd 32+18, 11, 16 | ||
| 478 | + vmsumudm 19, 13, 14, 0 | ||
| 479 | + vmsumudm 6, 15, 16, 19 | ||
| 480 | + mtvsrdd 32+13, 0, 12 | ||
| 481 | + mtvsrdd 32+14, 0, 14 | ||
| 482 | + vmsumudm 19, 17, 18, 6 | ||
| 483 | + vmsumudm 6, 13, 14, 19 | ||
| 484 | + | ||
| 485 | + # out7 | ||
| 486 | + mtvsrdd 32+13, 19, 7 | ||
| 487 | + mtvsrdd 32+14, 8, 20 | ||
| 488 | + mtvsrdd 32+15, 17, 9 | ||
| 489 | + mtvsrdd 32+16, 10, 18 | ||
| 490 | + mtvsrdd 32+17, 15, 11 | ||
| 491 | + mtvsrdd 32+18, 12, 16 | ||
| 492 | + vmsumudm 19, 13, 14, 0 | ||
| 493 | + vmsumudm 7, 15, 16, 19 | ||
| 494 | + vmsumudm 7, 17, 18, 7 | ||
| 495 | + | ||
| 496 | + # out8 | ||
| 497 | + mtvsrdd 32+13, 19, 8 | ||
| 498 | + mtvsrdd 32+14, 9, 20 | ||
| 499 | + mtvsrdd 32+15, 17, 10 | ||
| 500 | + mtvsrdd 32+16, 11, 18 | ||
| 501 | + mtvsrdd 32+17, 0, 12 | ||
| 502 | + mtvsrdd 32+18, 0, 16 | ||
| 503 | + vmsumudm 19, 13, 14, 0 | ||
| 504 | + vmsumudm 8, 15, 16, 19 | ||
| 505 | + vmsumudm 8, 17, 18, 8 | ||
| 506 | + | ||
| 507 | + # out9 | ||
| 508 | + mtvsrdd 32+13, 19, 9 | ||
| 509 | + mtvsrdd 32+14, 10, 20 | ||
| 510 | + mtvsrdd 32+15, 17, 11 | ||
| 511 | + mtvsrdd 32+16, 12, 18 | ||
| 512 | + vmsumudm 19, 13, 14, 0 | ||
| 513 | + vmsumudm 9, 15, 16, 19 | ||
| 514 | + | ||
| 515 | + # out10 | ||
| 516 | + mtvsrdd 32+13, 19, 10 | ||
| 517 | + mtvsrdd 32+14, 11, 20 | ||
| 518 | + mtvsrdd 32+15, 0, 12 | ||
| 519 | + mtvsrdd 32+16, 0, 18 | ||
| 520 | + vmsumudm 19, 13, 14, 0 | ||
| 521 | + vmsumudm 10, 15, 16, 19 | ||
| 522 | + | ||
| 523 | + # out11 | ||
| 524 | + mtvsrdd 32+17, 19, 11 | ||
| 525 | + mtvsrdd 32+18, 12, 20 | ||
| 526 | + vmsumudm 11, 17, 18, 0 | ||
| 527 | + | ||
| 528 | + stxv 32+6, 96(3) | ||
| 529 | + stxv 32+7, 112(3) | ||
| 530 | + stxv 32+8, 128(3) | ||
| 531 | + stxv 32+9, 144(3) | ||
| 532 | + stxv 32+10, 160(3) | ||
| 533 | + stxv 32+11, 176(3) | ||
| 534 | + | ||
| 535 | + # out12 | ||
| 536 | + mulld 21, 20, 12 | ||
| 537 | + mulhdu 22, 20, 12 # out12 | ||
| 538 | + | ||
| 539 | + std 21, 192(3) | ||
| 540 | + std 22, 200(3) | ||
| 541 | + | ||
| 542 | + blr | ||
| 543 | +.size _p384_felem_mul_core,.-_p384_felem_mul_core | ||
| 544 | + | ||
| 545 | +# | ||
| 546 | +# Felem square core function - | ||
| 547 | +# r3 and r4 need to be pre-loaded. | ||
| 548 | +# | ||
| 549 | +.type _p384_felem_square_core,\@function | ||
| 550 | +.align 4 | ||
| 551 | +_p384_felem_square_core: | ||
| 552 | + | ||
| 553 | + ld 6, 0(4) | ||
| 554 | + ld 7, 8(4) | ||
| 555 | + ld 8, 16(4) | ||
| 556 | + ld 9, 24(4) | ||
| 557 | + ld 10, 32(4) | ||
| 558 | + ld 11, 40(4) | ||
| 559 | + ld 12, 48(4) | ||
| 560 | + | ||
| 561 | + vxor 0, 0, 0 | ||
| 562 | + | ||
| 563 | + # out0 | ||
| 564 | + mulld 14, 6, 6 | ||
| 565 | + mulhdu 15, 6, 6 | ||
| 566 | + std 14, 0(3) | ||
| 567 | + std 15, 8(3) | ||
| 568 | + | ||
| 569 | + # out1 | ||
| 570 | + add 14, 6, 6 | ||
| 571 | + mtvsrdd 32+13, 0, 14 | ||
| 572 | + mtvsrdd 32+14, 0, 7 | ||
| 573 | + vmsumudm 1, 13, 14, 0 | ||
| 574 | + | ||
| 575 | + # out2 | ||
| 576 | + mtvsrdd 32+15, 7, 14 | ||
| 577 | + mtvsrdd 32+16, 7, 8 | ||
| 578 | + vmsumudm 2, 15, 16, 0 | ||
| 579 | + | ||
| 580 | + # out3 | ||
| 581 | + add 15, 7, 7 | ||
| 582 | + mtvsrdd 32+13, 8, 14 | ||
| 583 | + mtvsrdd 32+14, 15, 9 | ||
| 584 | + vmsumudm 3, 13, 14, 0 | ||
| 585 | + | ||
| 586 | + # out4 | ||
| 587 | + mtvsrdd 32+13, 9, 14 | ||
| 588 | + mtvsrdd 32+14, 15, 10 | ||
| 589 | + mtvsrdd 32+15, 0, 8 | ||
| 590 | + vmsumudm 4, 13, 14, 0 | ||
| 591 | + vmsumudm 4, 15, 15, 4 | ||
| 592 | + | ||
| 593 | + # out5 | ||
| 594 | + mtvsrdd 32+13, 10, 14 | ||
| 595 | + mtvsrdd 32+14, 15, 11 | ||
| 596 | + add 16, 8, 8 | ||
| 597 | + mtvsrdd 32+15, 0, 16 | ||
| 598 | + mtvsrdd 32+16, 0, 9 | ||
| 599 | + vmsumudm 5, 13, 14, 0 | ||
| 600 | + vmsumudm 5, 15, 16, 5 | ||
| 601 | + | ||
| 602 | + stxv 32+1, 16(3) | ||
| 603 | + stxv 32+2, 32(3) | ||
| 604 | + stxv 32+3, 48(3) | ||
| 605 | + stxv 32+4, 64(3) | ||
| 606 | + | ||
| 607 | + # out6 | ||
| 608 | + mtvsrdd 32+13, 11, 14 | ||
| 609 | + mtvsrdd 32+14, 15, 12 | ||
| 610 | + mtvsrdd 32+15, 9, 16 | ||
| 611 | + mtvsrdd 32+16, 9, 10 | ||
| 612 | + stxv 32+5, 80(3) | ||
| 613 | + vmsumudm 19, 13, 14, 0 | ||
| 614 | + vmsumudm 6, 15, 16, 19 | ||
| 615 | + | ||
| 616 | + # out7 | ||
| 617 | + add 17, 9, 9 | ||
| 618 | + mtvsrdd 32+13, 11, 15 | ||
| 619 | + mtvsrdd 32+14, 16, 12 | ||
| 620 | + mtvsrdd 32+15, 0, 17 | ||
| 621 | + mtvsrdd 32+16, 0, 10 | ||
| 622 | + vmsumudm 19, 13, 14, 0 | ||
| 623 | + vmsumudm 7, 15, 16, 19 | ||
| 624 | + | ||
| 625 | + # out8 | ||
| 626 | + mtvsrdd 32+13, 11, 16 | ||
| 627 | + mtvsrdd 32+14, 17, 12 | ||
| 628 | + mtvsrdd 32+15, 0, 10 | ||
| 629 | + vmsumudm 19, 13, 14, 0 | ||
| 630 | + vmsumudm 8, 15, 15, 19 | ||
| 631 | + | ||
| 632 | + # out9 | ||
| 633 | + add 14, 10, 10 | ||
| 634 | + mtvsrdd 32+13, 11, 17 | ||
| 635 | + mtvsrdd 32+14, 14, 12 | ||
| 636 | + vmsumudm 9, 13, 14, 0 | ||
| 637 | + | ||
| 638 | + # out10 | ||
| 639 | + mtvsrdd 32+13, 11, 14 | ||
| 640 | + mtvsrdd 32+14, 11, 12 | ||
| 641 | + vmsumudm 10, 13, 14, 0 | ||
| 642 | + | ||
| 643 | + stxv 32+6, 96(3) | ||
| 644 | + stxv 32+7, 112(3) | ||
| 645 | + | ||
| 646 | + # out11 | ||
| 647 | + #add 14, 11, 11 | ||
| 648 | + #mtvsrdd 32+13, 0, 14 | ||
| 649 | + #mtvsrdd 32+14, 0, 12 | ||
| 650 | + #vmsumudm 11, 13, 14, 0 | ||
| 651 | + | ||
| 652 | + mulld 6, 12, 11 | ||
| 653 | + mulhdu 7, 12, 11 | ||
| 654 | + addc 8, 6, 6 | ||
| 655 | + adde 9, 7, 7 | ||
| 656 | + | ||
| 657 | + stxv 32+8, 128(3) | ||
| 658 | + stxv 32+9, 144(3) | ||
| 659 | + stxv 32+10, 160(3) | ||
| 660 | + #stxv 32+11, 176(3) | ||
| 661 | + | ||
| 662 | + # out12 | ||
| 663 | + mulld 14, 12, 12 | ||
| 664 | + mulhdu 15, 12, 12 | ||
| 665 | + | ||
| 666 | + std 8, 176(3) | ||
| 667 | + std 9, 184(3) | ||
| 668 | + std 14, 192(3) | ||
| 669 | + std 15, 200(3) | ||
| 670 | + | ||
| 671 | + blr | ||
| 672 | +.size _p384_felem_square_core,.-_p384_felem_square_core | ||
| 673 | + | ||
| 674 | +# | ||
| 675 | +# widefelem (128 bits) * 8 | ||
| 676 | +# | ||
| 677 | +.macro F128_X_8 _off1 _off2 | ||
| 678 | + ld 9,\\_off1(3) | ||
| 679 | + ld 8,\\_off2(3) | ||
| 680 | + srdi 10,9,61 | ||
| 681 | + rldimi 10,8,3,0 | ||
| 682 | + sldi 9,9,3 | ||
| 683 | + std 9,\\_off1(3) | ||
| 684 | + std 10,\\_off2(3) | ||
| 685 | +.endm | ||
| 686 | + | ||
| 687 | +.globl p384_felem128_mul_by_8 | ||
| 688 | +.type p384_felem128_mul_by_8, \@function | ||
| 689 | +.align 4 | ||
| 690 | +p384_felem128_mul_by_8: | ||
| 691 | + | ||
| 692 | + F128_X_8 0, 8 | ||
| 693 | + | ||
| 694 | + F128_X_8 16, 24 | ||
| 695 | + | ||
| 696 | + F128_X_8 32, 40 | ||
| 697 | + | ||
| 698 | + F128_X_8 48, 56 | ||
| 699 | + | ||
| 700 | + F128_X_8 64, 72 | ||
| 701 | + | ||
| 702 | + F128_X_8 80, 88 | ||
| 703 | + | ||
| 704 | + F128_X_8 96, 104 | ||
| 705 | + | ||
| 706 | + F128_X_8 112, 120 | ||
| 707 | + | ||
| 708 | + F128_X_8 128, 136 | ||
| 709 | + | ||
| 710 | + F128_X_8 144, 152 | ||
| 711 | + | ||
| 712 | + F128_X_8 160, 168 | ||
| 713 | + | ||
| 714 | + F128_X_8 176, 184 | ||
| 715 | + | ||
| 716 | + F128_X_8 192, 200 | ||
| 717 | + | ||
| 718 | + blr | ||
| 719 | +.size p384_felem128_mul_by_8,.-p384_felem128_mul_by_8 | ||
| 720 | + | ||
| 721 | +# | ||
| 722 | +# widefelem (128 bits) * 2 | ||
| 723 | +# | ||
| 724 | +.macro F128_X_2 _off1 _off2 | ||
| 725 | + ld 9,\\_off1(3) | ||
| 726 | + ld 8,\\_off2(3) | ||
| 727 | + srdi 10,9,63 | ||
| 728 | + rldimi 10,8,1,0 | ||
| 729 | + sldi 9,9,1 | ||
| 730 | + std 9,\\_off1(3) | ||
| 731 | + std 10,\\_off2(3) | ||
| 732 | +.endm | ||
| 733 | + | ||
| 734 | +.globl p384_felem128_mul_by_2 | ||
| 735 | +.type p384_felem128_mul_by_2, \@function | ||
| 736 | +.align 4 | ||
| 737 | +p384_felem128_mul_by_2: | ||
| 738 | + | ||
| 739 | + F128_X_2 0, 8 | ||
| 740 | + | ||
| 741 | + F128_X_2 16, 24 | ||
| 742 | + | ||
| 743 | + F128_X_2 32, 40 | ||
| 744 | + | ||
| 745 | + F128_X_2 48, 56 | ||
| 746 | + | ||
| 747 | + F128_X_2 64, 72 | ||
| 748 | + | ||
| 749 | + F128_X_2 80, 88 | ||
| 750 | + | ||
| 751 | + F128_X_2 96, 104 | ||
| 752 | + | ||
| 753 | + F128_X_2 112, 120 | ||
| 754 | + | ||
| 755 | + F128_X_2 128, 136 | ||
| 756 | + | ||
| 757 | + F128_X_2 144, 152 | ||
| 758 | + | ||
| 759 | + F128_X_2 160, 168 | ||
| 760 | + | ||
| 761 | + F128_X_2 176, 184 | ||
| 762 | + | ||
| 763 | + F128_X_2 192, 200 | ||
| 764 | + | ||
| 765 | + blr | ||
| 766 | +.size p384_felem128_mul_by_2,.-p384_felem128_mul_by_2 | ||
| 767 | + | ||
| 768 | +.globl p384_felem_diff128 | ||
| 769 | +.type p384_felem_diff128, \@function | ||
| 770 | +.align 4 | ||
| 771 | +p384_felem_diff128: | ||
| 772 | + | ||
| 773 | + addis 5, 2, .LConst_two127\@toc\@ha | ||
| 774 | + addi 5, 5, .LConst_two127\@toc\@l | ||
| 775 | + | ||
| 776 | + ld 10, 0(3) | ||
| 777 | + ld 8, 8(3) | ||
| 778 | + li 9, 0 | ||
| 779 | + addc 10, 10, 9 | ||
| 780 | + li 7, -1 | ||
| 781 | + rldicr 7, 7, 0, 0 # two127 | ||
| 782 | + adde 8, 8, 7 | ||
| 783 | + ld 11, 0(4) | ||
| 784 | + ld 12, 8(4) | ||
| 785 | + subfc 11, 11, 10 | ||
| 786 | + subfe 12, 12, 8 | ||
| 787 | + std 11, 0(3) # out0 | ||
| 788 | + std 12, 8(3) | ||
| 789 | + | ||
| 790 | + # two127m71 = (r10, r9) | ||
| 791 | + ld 8, 16(3) | ||
| 792 | + ld 7, 24(3) | ||
| 793 | + ld 10, 24(5) # two127m71 | ||
| 794 | + addc 8, 8, 9 | ||
| 795 | + adde 7, 7, 10 | ||
| 796 | + ld 11, 16(4) | ||
| 797 | + ld 12, 24(4) | ||
| 798 | + subfc 11, 11, 8 | ||
| 799 | + subfe 12, 12, 7 | ||
| 800 | + std 11, 16(3) # out1 | ||
| 801 | + std 12, 24(3) | ||
| 802 | + | ||
| 803 | + ld 8, 32(3) | ||
| 804 | + ld 7, 40(3) | ||
| 805 | + addc 8, 8, 9 | ||
| 806 | + adde 7, 7, 10 | ||
| 807 | + ld 11, 32(4) | ||
| 808 | + ld 12, 40(4) | ||
| 809 | + subfc 11, 11, 8 | ||
| 810 | + subfe 12, 12, 7 | ||
| 811 | + std 11, 32(3) # out2 | ||
| 812 | + std 12, 40(3) | ||
| 813 | + | ||
| 814 | + ld 8, 48(3) | ||
| 815 | + ld 7, 56(3) | ||
| 816 | + addc 8, 8, 9 | ||
| 817 | + adde 7, 7, 10 | ||
| 818 | + ld 11, 48(4) | ||
| 819 | + ld 12, 56(4) | ||
| 820 | + subfc 11, 11, 8 | ||
| 821 | + subfe 12, 12, 7 | ||
| 822 | + std 11, 48(3) # out3 | ||
| 823 | + std 12, 56(3) | ||
| 824 | + | ||
| 825 | + ld 8, 64(3) | ||
| 826 | + ld 7, 72(3) | ||
| 827 | + addc 8, 8, 9 | ||
| 828 | + adde 7, 7, 10 | ||
| 829 | + ld 11, 64(4) | ||
| 830 | + ld 12, 72(4) | ||
| 831 | + subfc 11, 11, 8 | ||
| 832 | + subfe 12, 12, 7 | ||
| 833 | + std 11, 64(3) # out4 | ||
| 834 | + std 12, 72(3) | ||
| 835 | + | ||
| 836 | + ld 8, 80(3) | ||
| 837 | + ld 7, 88(3) | ||
| 838 | + addc 8, 8, 9 | ||
| 839 | + adde 7, 7, 10 | ||
| 840 | + ld 11, 80(4) | ||
| 841 | + ld 12, 88(4) | ||
| 842 | + subfc 11, 11, 8 | ||
| 843 | + subfe 12, 12, 7 | ||
| 844 | + std 11, 80(3) # out5 | ||
| 845 | + std 12, 88(3) | ||
| 846 | + | ||
| 847 | + ld 8, 96(3) | ||
| 848 | + ld 7, 104(3) | ||
| 849 | + ld 6, 40(5) # two127p111m79m71 | ||
| 850 | + addc 8, 8, 9 | ||
| 851 | + adde 7, 7, 6 | ||
| 852 | + ld 11, 96(4) | ||
| 853 | + ld 12, 104(4) | ||
| 854 | + subfc 11, 11, 8 | ||
| 855 | + subfe 12, 12, 7 | ||
| 856 | + std 11, 96(3) # out6 | ||
| 857 | + std 12, 104(3) | ||
| 858 | + | ||
| 859 | + ld 8, 112(3) | ||
| 860 | + ld 7, 120(3) | ||
| 861 | + ld 6, 56(5) # two127m119m71 | ||
| 862 | + addc 8, 8, 9 | ||
| 863 | + adde 7, 7, 6 | ||
| 864 | + ld 11, 112(4) | ||
| 865 | + ld 12, 120(4) | ||
| 866 | + subfc 11, 11, 8 | ||
| 867 | + subfe 12, 12, 7 | ||
| 868 | + std 11, 112(3) # out7 | ||
| 869 | + std 12, 120(3) | ||
| 870 | + | ||
| 871 | + ld 8, 128(3) | ||
| 872 | + ld 7, 136(3) | ||
| 873 | + ld 6, 72(5) # two127m95m71 | ||
| 874 | + addc 8, 8, 9 | ||
| 875 | + adde 7, 7, 6 | ||
| 876 | + ld 11, 128(4) | ||
| 877 | + ld 12, 136(4) | ||
| 878 | + subfc 11, 11, 8 | ||
| 879 | + subfe 12, 12, 7 | ||
| 880 | + std 11, 128(3) # out8 | ||
| 881 | + std 12, 136(3) | ||
| 882 | + | ||
| 883 | + ld 8, 144(3) | ||
| 884 | + ld 7, 152(3) | ||
| 885 | + addc 8, 8, 9 | ||
| 886 | + adde 7, 7, 10 | ||
| 887 | + ld 11, 144(4) | ||
| 888 | + ld 12, 152(4) | ||
| 889 | + subfc 11, 11, 8 | ||
| 890 | + subfe 12, 12, 7 | ||
| 891 | + std 11, 144(3) # out9 | ||
| 892 | + std 12, 152(3) | ||
| 893 | + | ||
| 894 | + ld 8, 160(3) | ||
| 895 | + ld 7, 168(3) | ||
| 896 | + addc 8, 8, 9 | ||
| 897 | + adde 7, 7, 10 | ||
| 898 | + ld 11, 160(4) | ||
| 899 | + ld 12, 168(4) | ||
| 900 | + subfc 11, 11, 8 | ||
| 901 | + subfe 12, 12, 7 | ||
| 902 | + std 11, 160(3) # out10 | ||
| 903 | + std 12, 168(3) | ||
| 904 | + | ||
| 905 | + ld 8, 176(3) | ||
| 906 | + ld 7, 184(3) | ||
| 907 | + addc 8, 8, 9 | ||
| 908 | + adde 7, 7, 10 | ||
| 909 | + ld 11, 176(4) | ||
| 910 | + ld 12, 184(4) | ||
| 911 | + subfc 11, 11, 8 | ||
| 912 | + subfe 12, 12, 7 | ||
| 913 | + std 11, 176(3) # out11 | ||
| 914 | + std 12, 184(3) | ||
| 915 | + | ||
| 916 | + ld 8, 192(3) | ||
| 917 | + ld 7, 200(3) | ||
| 918 | + addc 8, 8, 9 | ||
| 919 | + adde 7, 7, 10 | ||
| 920 | + ld 11, 192(4) | ||
| 921 | + ld 12, 200(4) | ||
| 922 | + subfc 11, 11, 8 | ||
| 923 | + subfe 12, 12, 7 | ||
| 924 | + std 11, 192(3) # out12 | ||
| 925 | + std 12, 200(3) | ||
| 926 | + | ||
| 927 | + blr | ||
| 928 | +.size p384_felem_diff128,.-p384_felem_diff128 | ||
| 929 | + | ||
| 930 | +.data | ||
| 931 | +.align 4 | ||
| 932 | +.LConst_two127: | ||
| 933 | +#two127 | ||
| 934 | +.long 0x00000000, 0x00000000, 0x00000000, 0x80000000 | ||
| 935 | +#two127m71 | ||
| 936 | +.long 0x00000000, 0x00000000, 0xffffff80, 0x7fffffff | ||
| 937 | +#two127p111m79m71 | ||
| 938 | +.long 0x00000000, 0x00000000, 0xffff7f80, 0x80007fff | ||
| 939 | +#two127m119m71 | ||
| 940 | +.long 0x00000000, 0x00000000, 0xffffff80, 0x7f7fffff | ||
| 941 | +#two127m95m71 | ||
| 942 | +.long 0x00000000, 0x00000000, 0x7fffff80, 0x7fffffff | ||
| 943 | + | ||
| 944 | +.text | ||
| 945 | + | ||
| 946 | +.globl p384_felem_diff_128_64 | ||
| 947 | +.type p384_felem_diff_128_64, \@function | ||
| 948 | +.align 4 | ||
| 949 | +p384_felem_diff_128_64: | ||
| 950 | + addis 5, 2, .LConst_128_two64\@toc\@ha | ||
| 951 | + addi 5, 5, .LConst_128_two64\@toc\@l | ||
| 952 | + | ||
| 953 | + ld 9, 0(3) | ||
| 954 | + ld 10, 8(3) | ||
| 955 | + ld 8, 48(5) # two64p48m16 | ||
| 956 | + li 7, 0 | ||
| 957 | + addc 9, 9, 8 | ||
| 958 | + li 6, 1 | ||
| 959 | + adde 10, 10, 6 | ||
| 960 | + ld 11, 0(4) | ||
| 961 | + subfc 8, 11, 9 | ||
| 962 | + subfe 12, 7, 10 | ||
| 963 | + std 8, 0(3) # out0 | ||
| 964 | + std 12, 8(3) | ||
| 965 | + | ||
| 966 | + ld 9, 16(3) | ||
| 967 | + ld 10, 24(3) | ||
| 968 | + ld 8, 0(5) # two64m56m8 | ||
| 969 | + addc 9, 9, 8 | ||
| 970 | + addze 10, 10 | ||
| 971 | + ld 11, 8(4) | ||
| 972 | + subfc 11, 11, 9 | ||
| 973 | + subfe 12, 7, 10 | ||
| 974 | + std 11, 16(3) # out1 | ||
| 975 | + std 12, 24(3) | ||
| 976 | + | ||
| 977 | + ld 9, 32(3) | ||
| 978 | + ld 10, 40(3) | ||
| 979 | + ld 8, 16(5) # two64m32m8 | ||
| 980 | + addc 9, 9, 8 | ||
| 981 | + addze 10, 10 | ||
| 982 | + ld 11, 16(4) | ||
| 983 | + subfc 11, 11, 9 | ||
| 984 | + subfe 12, 7, 10 | ||
| 985 | + std 11, 32(3) # out2 | ||
| 986 | + std 12, 40(3) | ||
| 987 | + | ||
| 988 | + ld 10, 48(3) | ||
| 989 | + ld 8, 56(3) | ||
| 990 | + #ld 9, 32(5) # two64m8 | ||
| 991 | + li 9, -256 # two64m8 | ||
| 992 | + addc 10, 10, 9 | ||
| 993 | + addze 8, 8 | ||
| 994 | + ld 11, 24(4) | ||
| 995 | + subfc 11, 11, 10 | ||
| 996 | + subfe 12, 7, 8 | ||
| 997 | + std 11, 48(3) # out3 | ||
| 998 | + std 12, 56(3) | ||
| 999 | + | ||
| 1000 | + ld 10, 64(3) | ||
| 1001 | + ld 8, 72(3) | ||
| 1002 | + addc 10, 10, 9 | ||
| 1003 | + addze 8, 8 | ||
| 1004 | + ld 11, 32(4) | ||
| 1005 | + subfc 11, 11, 10 | ||
| 1006 | + subfe 12, 7, 8 | ||
| 1007 | + std 11, 64(3) # out4 | ||
| 1008 | + std 12, 72(3) | ||
| 1009 | + | ||
| 1010 | + ld 10, 80(3) | ||
| 1011 | + ld 8, 88(3) | ||
| 1012 | + addc 10, 10, 9 | ||
| 1013 | + addze 8, 8 | ||
| 1014 | + ld 11, 40(4) | ||
| 1015 | + subfc 11, 11, 10 | ||
| 1016 | + subfe 12, 7, 8 | ||
| 1017 | + std 11, 80(3) # out5 | ||
| 1018 | + std 12, 88(3) | ||
| 1019 | + | ||
| 1020 | + ld 10, 96(3) | ||
| 1021 | + ld 8, 104(3) | ||
| 1022 | + addc 10, 10, 9 | ||
| 1023 | + addze 9, 8 | ||
| 1024 | + ld 11, 48(4) | ||
| 1025 | + subfc 11, 11, 10 | ||
| 1026 | + subfe 12, 7, 9 | ||
| 1027 | + std 11, 96(3) # out6 | ||
| 1028 | + std 12, 104(3) | ||
| 1029 | + | ||
| 1030 | + blr | ||
| 1031 | +.size p384_felem_diff_128_64,.-p384_felem_diff_128_64 | ||
| 1032 | + | ||
| 1033 | +.data | ||
| 1034 | +.align 4 | ||
| 1035 | +.LConst_128_two64: | ||
| 1036 | +#two64m56m8 | ||
| 1037 | +.long 0xffffff00, 0xfeffffff, 0x00000000, 0x00000000 | ||
| 1038 | +#two64m32m8 | ||
| 1039 | +.long 0xffffff00, 0xfffffffe, 0x00000000, 0x00000000 | ||
| 1040 | +#two64m8 | ||
| 1041 | +.long 0xffffff00, 0xffffffff, 0x00000000, 0x00000000 | ||
| 1042 | +#two64p48m16 | ||
| 1043 | +.long 0xffff0000, 0x0000ffff, 0x00000001, 0x00000000 | ||
| 1044 | + | ||
| 1045 | +.LConst_two60: | ||
| 1046 | +#two60m52m4 | ||
| 1047 | +.long 0xfffffff0, 0x0fefffff, 0x0, 0x0 | ||
| 1048 | +#two60p44m12 | ||
| 1049 | +.long 0xfffff000, 0x10000fff, 0x0, 0x0 | ||
| 1050 | +#two60m28m4 | ||
| 1051 | +.long 0xeffffff0, 0x0fffffff, 0x0, 0x0 | ||
| 1052 | +#two60m4 | ||
| 1053 | +.long 0xfffffff0, 0x0fffffff, 0x0, 0x0 | ||
| 1054 | + | ||
| 1055 | +.text | ||
| 1056 | +# | ||
| 1057 | +# static void felem_diff64(felem out, const felem in) | ||
| 1058 | +# | ||
| 1059 | +.globl p384_felem_diff64 | ||
| 1060 | +.type p384_felem_diff64, \@function | ||
| 1061 | +.align 4 | ||
| 1062 | +p384_felem_diff64: | ||
| 1063 | + addis 5, 2, .LConst_two60\@toc\@ha | ||
| 1064 | + addi 5, 5, .LConst_two60\@toc\@l | ||
| 1065 | + | ||
| 1066 | + ld 9, 0(3) | ||
| 1067 | + ld 8, 16(5) # two60p44m12 | ||
| 1068 | + li 7, 0 | ||
| 1069 | + add 9, 9, 8 | ||
| 1070 | + ld 11, 0(4) | ||
| 1071 | + subf 8, 11, 9 | ||
| 1072 | + std 8, 0(3) # out0 | ||
| 1073 | + | ||
| 1074 | + ld 9, 8(3) | ||
| 1075 | + ld 8, 0(5) # two60m52m4 | ||
| 1076 | + add 9, 9, 8 | ||
| 1077 | + ld 11, 8(4) | ||
| 1078 | + subf 11, 11, 9 | ||
| 1079 | + std 11, 8(3) # out1 | ||
| 1080 | + | ||
| 1081 | + ld 9, 16(3) | ||
| 1082 | + ld 8, 32(5) # two60m28m4 | ||
| 1083 | + add 9, 9, 8 | ||
| 1084 | + ld 11, 16(4) | ||
| 1085 | + subf 11, 11, 9 | ||
| 1086 | + std 11, 16(3) # out2 | ||
| 1087 | + | ||
| 1088 | + ld 10, 24(3) | ||
| 1089 | + ld 9, 48(5) # two60m4 | ||
| 1090 | + add 10, 10, 9 | ||
| 1091 | + ld 12, 24(4) | ||
| 1092 | + subf 12, 12, 10 | ||
| 1093 | + std 12, 24(3) # out3 | ||
| 1094 | + | ||
| 1095 | + ld 10, 32(3) | ||
| 1096 | + add 10, 10, 9 | ||
| 1097 | + ld 11, 32(4) | ||
| 1098 | + subf 11, 11, 10 | ||
| 1099 | + std 11, 32(3) # out4 | ||
| 1100 | + | ||
| 1101 | + ld 10, 40(3) | ||
| 1102 | + add 10, 10, 9 | ||
| 1103 | + ld 12, 40(4) | ||
| 1104 | + subf 12, 12, 10 | ||
| 1105 | + std 12, 40(3) # out5 | ||
| 1106 | |||
| 1107 | - endproc("p384_felem_square"); | ||
| 1108 | - } | ||
| 1109 | -} | ||
| 1110 | + ld 10, 48(3) | ||
| 1111 | + add 10, 10, 9 | ||
| 1112 | + ld 11, 48(4) | ||
| 1113 | + subf 11, 11, 10 | ||
| 1114 | + std 11, 48(3) # out6 | ||
| 1115 | + | ||
| 1116 | + blr | ||
| 1117 | +.size p384_felem_diff64,.-p384_felem_diff64 | ||
| 1118 | + | ||
| 1119 | +.text | ||
| 1120 | +# | ||
| 1121 | +# Shift the 128-bit value in_h:in_l right by <nbits> into o_h:o_l (o_l must not alias in_h) | ||
| 1122 | +# | ||
| 1123 | +.macro SHR o_h o_l in_h in_l nbits | ||
| 1124 | + srdi \\o_l, \\in_l, \\nbits # shift lower right <nbits> | ||
| 1125 | + rldimi \\o_l, \\in_h, 64-\\nbits, 0 # insert <64-nbits> from hi | ||
| 1126 | + srdi \\o_h, \\in_h, \\nbits # shift higher right <nbits> | ||
| 1127 | +.endm | ||
| 1128 | + | ||
| 1129 | +# | ||
| 1130 | +# static void felem_reduce(felem out, const widefelem in) | ||
| 1131 | +# | ||
| 1132 | +.global p384_felem_reduce | ||
| 1133 | +.type p384_felem_reduce,\@function | ||
| 1134 | +.align 4 | ||
| 1135 | +p384_felem_reduce: | ||
| 1136 | + | ||
| 1137 | + stdu 1, -208(1) | ||
| 1138 | + mflr 0 | ||
| 1139 | + std 14, 56(1) | ||
| 1140 | + std 15, 64(1) | ||
| 1141 | + std 16, 72(1) | ||
| 1142 | + std 17, 80(1) | ||
| 1143 | + std 18, 88(1) | ||
| 1144 | + std 19, 96(1) | ||
| 1145 | + std 20, 104(1) | ||
| 1146 | + std 21, 112(1) | ||
| 1147 | + std 22, 120(1) | ||
| 1148 | + std 23, 128(1) | ||
| 1149 | + std 24, 136(1) | ||
| 1150 | + std 25, 144(1) | ||
| 1151 | + std 26, 152(1) | ||
| 1152 | + std 27, 160(1) | ||
| 1153 | + std 28, 168(1) | ||
| 1154 | + std 29, 176(1) | ||
| 1155 | + std 30, 184(1) | ||
| 1156 | + std 31, 192(1) | ||
| 1157 | + | ||
| 1158 | + bl _p384_felem_reduce_core | ||
| 1159 | + | ||
| 1160 | + mtlr 0 | ||
| 1161 | + ld 14, 56(1) | ||
| 1162 | + ld 15, 64(1) | ||
| 1163 | + ld 16, 72(1) | ||
| 1164 | + ld 17, 80(1) | ||
| 1165 | + ld 18, 88(1) | ||
| 1166 | + ld 19, 96(1) | ||
| 1167 | + ld 20, 104(1) | ||
| 1168 | + ld 21, 112(1) | ||
| 1169 | + ld 22, 120(1) | ||
| 1170 | + ld 23, 128(1) | ||
| 1171 | + ld 24, 136(1) | ||
| 1172 | + ld 25, 144(1) | ||
| 1173 | + ld 26, 152(1) | ||
| 1174 | + ld 27, 160(1) | ||
| 1175 | + ld 28, 168(1) | ||
| 1176 | + ld 29, 176(1) | ||
| 1177 | + ld 30, 184(1) | ||
| 1178 | + ld 31, 192(1) | ||
| 1179 | + addi 1, 1, 208 | ||
| 1180 | + blr | ||
| 1181 | +.size p384_felem_reduce,.-p384_felem_reduce | ||
| 1182 | + | ||
| 1183 | +# | ||
| 1184 | +# Felem reduction core function - | ||
| 1185 | +# r3 (out) and r4 (in) need to be pre-loaded; clobbers r14-r31 (caller saves). | ||
| 1186 | +# | ||
| 1187 | +.type _p384_felem_reduce_core,\@function | ||
| 1188 | +.align 4 | ||
| 1189 | +_p384_felem_reduce_core: | ||
| 1190 | + addis 12, 2, .LConst\@toc\@ha | ||
| 1191 | + addi 12, 12, .LConst\@toc\@l | ||
| 1192 | + | ||
| 1193 | + # load constant p | ||
| 1194 | + ld 11, 8(12) # hi - two124m68 | ||
| 1195 | + | ||
| 1196 | + # acc[6] = in[6] + two124m68; | ||
| 1197 | + ld 26, 96(4) # in[6].l | ||
| 1198 | + ld 27, 96+8(4) # in[6].h | ||
| 1199 | + add 27, 27, 11 | ||
| 1200 | + | ||
| 1201 | + # acc[5] = in[5] + two124m68; | ||
| 1202 | + ld 24, 80(4) # in[5].l | ||
| 1203 | + ld 25, 80+8(4) # in[5].h | ||
| 1204 | + add 25, 25, 11 | ||
| 1205 | + | ||
| 1206 | + # acc[4] = in[4] + two124m68; | ||
| 1207 | + ld 22, 64(4) # in[4].l | ||
| 1208 | + ld 23, 64+8(4) # in[4].h | ||
| 1209 | + add 23, 23, 11 | ||
| 1210 | + | ||
| 1211 | + # acc[3] = in[3] + two124m68; | ||
| 1212 | + ld 20, 48(4) # in[3].l | ||
| 1213 | + ld 21, 48+8(4) # in[3].h | ||
| 1214 | + add 21, 21, 11 | ||
| 1215 | + | ||
| 1216 | + ld 11, 48+8(12) # hi - two124m92m68 | ||
| 1217 | + | ||
| 1218 | + # acc[2] = in[2] + two124m92m68; | ||
| 1219 | + ld 18, 32(4) # in[2].l | ||
| 1220 | + ld 19, 32+8(4) # in[2].h | ||
| 1221 | + add 19, 19, 11 | ||
| 1222 | + | ||
| 1223 | + ld 11, 16+8(12) # high - two124m116m68 | ||
| 1224 | + | ||
| 1225 | + # acc[1] = in[1] + two124m116m68; | ||
| 1226 | + ld 16, 16(4) # in[1].l | ||
| 1227 | + ld 17, 16+8(4) # in[1].h | ||
| 1228 | + add 17, 17, 11 | ||
| 1229 | + | ||
| 1230 | + ld 11, 32+8(12) # high - two124p108m76 | ||
| 1231 | + | ||
| 1232 | + # acc[0] = in[0] + two124p108m76; | ||
| 1233 | + ld 14, 0(4) # in[0].l | ||
| 1234 | + ld 15, 0+8(4) # in[0].h | ||
| 1235 | + add 15, 15, 11 | ||
| 1236 | + | ||
| 1237 | + # compute mask | ||
| 1238 | + li 7, -1 | ||
| 1239 | + | ||
| 1240 | + # Eliminate in[12] | ||
| 1241 | + | ||
| 1242 | + # acc[8] += in[12] >> 32; | ||
| 1243 | + ld 5, 192(4) # in[12].l | ||
| 1244 | + ld 6, 192+8(4) # in[12].h | ||
| 1245 | + SHR 9, 10, 6, 5, 32 | ||
| 1246 | + ld 30, 128(4) # in[8].l | ||
| 1247 | + ld 31, 136(4) # in[8].h | ||
| 1248 | + addc 30, 30, 10 | ||
| 1249 | + adde 31, 31, 9 | ||
| 1250 | + | ||
| 1251 | + # acc[7] += (in[12] & 0xffffffff) << 24; | ||
| 1252 | + srdi 11, 7, 32 # 0xffffffff | ||
| 1253 | + and 11, 11, 5 | ||
| 1254 | + sldi 11, 11, 24 # << 24 | ||
| 1255 | + ld 28, 112(4) # in[7].l | ||
| 1256 | + ld 29, 120(4) # in[7].h | ||
| 1257 | + addc 28, 28, 11 | ||
| 1258 | + addze 29, 29 | ||
| 1259 | + | ||
| 1260 | + # acc[7] += in[12] >> 8; | ||
| 1261 | + SHR 9, 10, 6, 5, 8 | ||
| 1262 | + addc 28, 28, 10 | ||
| 1263 | + adde 29, 29, 9 | ||
| 1264 | + | ||
| 1265 | + # acc[6] += (in[12] & 0xff) << 48; | ||
| 1266 | + andi. 11, 5, 0xff | ||
| 1267 | + sldi 11, 11, 48 | ||
| 1268 | + addc 26, 26, 11 | ||
| 1269 | + addze 27, 27 | ||
| 1270 | + | ||
| 1271 | + # acc[6] -= in[12] >> 16; | ||
| 1272 | + SHR 9, 10, 6, 5, 16 | ||
| 1273 | + subfc 26, 10, 26 | ||
| 1274 | + subfe 27, 9, 27 | ||
| 1275 | + | ||
| 1276 | + # acc[5] -= (in[12] & 0xffff) << 40; | ||
| 1277 | + srdi 11, 7, 48 # 0xffff | ||
| 1278 | + and 11, 11, 5 | ||
| 1279 | + sldi 11, 11, 40 # << 40 | ||
| 1280 | + li 9, 0 | ||
| 1281 | + subfc 24, 11, 24 | ||
| 1282 | + subfe 25, 9, 25 | ||
| 1283 | + | ||
| 1284 | + # acc[6] += in[12] >> 48; | ||
| 1285 | + SHR 9, 10, 6, 5, 48 | ||
| 1286 | + addc 26, 26, 10 | ||
| 1287 | + adde 27, 27, 9 | ||
| 1288 | + | ||
| 1289 | + # acc[5] += (in[12] & 0xffffffffffff) << 8; | ||
| 1290 | + srdi 11, 7, 16 # 0xffffffffffff | ||
| 1291 | + and 11, 11, 5 | ||
| 1292 | + sldi 11, 11, 8 # << 8 | ||
| 1293 | + addc 24, 24, 11 | ||
| 1294 | + addze 25, 25 | ||
| 1295 | + | ||
| 1296 | + # Eliminate in[11] | ||
| 1297 | + | ||
| 1298 | + # acc[7] += in[11] >> 32; | ||
| 1299 | + ld 5, 176(4) # in[11].l | ||
| 1300 | + ld 6, 176+8(4) # in[11].h | ||
| 1301 | + SHR 9, 10, 6, 5, 32 | ||
| 1302 | + addc 28, 28, 10 | ||
| 1303 | + adde 29, 29, 9 | ||
| 1304 | + | ||
| 1305 | + # acc[6] += (in[11] & 0xffffffff) << 24; | ||
| 1306 | + srdi 11, 7, 32 # 0xffffffff | ||
| 1307 | + and 11, 11, 5 | ||
| 1308 | + sldi 11, 11, 24 # << 24 | ||
| 1309 | + addc 26, 26, 11 | ||
| 1310 | + addze 27, 27 | ||
| 1311 | + | ||
| 1312 | + # acc[6] += in[11] >> 8; | ||
| 1313 | + SHR 9, 10, 6, 5, 8 | ||
| 1314 | + addc 26, 26, 10 | ||
| 1315 | + adde 27, 27, 9 | ||
| 1316 | + | ||
| 1317 | + # acc[5] += (in[11] & 0xff) << 48; | ||
| 1318 | + andi. 11, 5, 0xff | ||
| 1319 | + sldi 11, 11, 48 | ||
| 1320 | + addc 24, 24, 11 | ||
| 1321 | + addze 25, 25 | ||
| 1322 | + | ||
| 1323 | + # acc[5] -= in[11] >> 16; | ||
| 1324 | + SHR 9, 10, 6, 5, 16 | ||
| 1325 | + subfc 24, 10, 24 | ||
| 1326 | + subfe 25, 9, 25 | ||
| 1327 | + | ||
| 1328 | + # acc[4] -= (in[11] & 0xffff) << 40; | ||
| 1329 | + srdi 11, 7, 48 # 0xffff | ||
| 1330 | + and 11, 11, 5 | ||
| 1331 | + sldi 11, 11, 40 # << 40 | ||
| 1332 | + li 9, 0 | ||
| 1333 | + subfc 22, 11, 22 | ||
| 1334 | + subfe 23, 9, 23 | ||
| 1335 | + | ||
| 1336 | + # acc[5] += in[11] >> 48; | ||
| 1337 | + SHR 9, 10, 6, 5, 48 | ||
| 1338 | + addc 24, 24, 10 | ||
| 1339 | + adde 25, 25, 9 | ||
| 1340 | + | ||
| 1341 | + # acc[4] += (in[11] & 0xffffffffffff) << 8; | ||
| 1342 | + srdi 11, 7, 16 # 0xffffffffffff | ||
| 1343 | + and 11, 11, 5 | ||
| 1344 | + sldi 11, 11, 8 # << 8 | ||
| 1345 | + addc 22, 22, 11 | ||
| 1346 | + addze 23, 23 | ||
| 1347 | + | ||
| 1348 | + # Eliminate in[10] | ||
| 1349 | + | ||
| 1350 | + # acc[6] += in[10] >> 32; | ||
| 1351 | + ld 5, 160(4) # in[10].l | ||
| 1352 | + ld 6, 160+8(4) # in[10].h | ||
| 1353 | + SHR 9, 10, 6, 5, 32 | ||
| 1354 | + addc 26, 26, 10 | ||
| 1355 | + adde 27, 27, 9 | ||
| 1356 | + | ||
| 1357 | + # acc[5] += (in[10] & 0xffffffff) << 24; | ||
| 1358 | + srdi 11, 7, 32 # 0xffffffff | ||
| 1359 | + and 11, 11, 5 | ||
| 1360 | + sldi 11, 11, 24 # << 24 | ||
| 1361 | + addc 24, 24, 11 | ||
| 1362 | + addze 25, 25 | ||
| 1363 | + | ||
| 1364 | + # acc[5] += in[10] >> 8; | ||
| 1365 | + SHR 9, 10, 6, 5, 8 | ||
| 1366 | + addc 24, 24, 10 | ||
| 1367 | + adde 25, 25, 9 | ||
| 1368 | + | ||
| 1369 | + # acc[4] += (in[10] & 0xff) << 48; | ||
| 1370 | + andi. 11, 5, 0xff | ||
| 1371 | + sldi 11, 11, 48 | ||
| 1372 | + addc 22, 22, 11 | ||
| 1373 | + addze 23, 23 | ||
| 1374 | + | ||
| 1375 | + # acc[4] -= in[10] >> 16; | ||
| 1376 | + SHR 9, 10, 6, 5, 16 | ||
| 1377 | + subfc 22, 10, 22 | ||
| 1378 | + subfe 23, 9, 23 | ||
| 1379 | + | ||
| 1380 | + # acc[3] -= (in[10] & 0xffff) << 40; | ||
| 1381 | + srdi 11, 7, 48 # 0xffff | ||
| 1382 | + and 11, 11, 5 | ||
| 1383 | + sldi 11, 11, 40 # << 40 | ||
| 1384 | + li 9, 0 | ||
| 1385 | + subfc 20, 11, 20 | ||
| 1386 | + subfe 21, 9, 21 | ||
| 1387 | + | ||
| 1388 | + # acc[4] += in[10] >> 48; | ||
| 1389 | + SHR 9, 10, 6, 5, 48 | ||
| 1390 | + addc 22, 22, 10 | ||
| 1391 | + adde 23, 23, 9 | ||
| 1392 | + | ||
| 1393 | + # acc[3] += (in[10] & 0xffffffffffff) << 8; | ||
| 1394 | + srdi 11, 7, 16 # 0xffffffffffff | ||
| 1395 | + and 11, 11, 5 | ||
| 1396 | + sldi 11, 11, 8 # << 8 | ||
| 1397 | + addc 20, 20, 11 | ||
| 1398 | + addze 21, 21 | ||
| 1399 | + | ||
| 1400 | + # Eliminate in[9] | ||
| 1401 | + | ||
| 1402 | + # acc[5] += in[9] >> 32; | ||
| 1403 | + ld 5, 144(4) # in[9].l | ||
| 1404 | + ld 6, 144+8(4) # in[9].h | ||
| 1405 | + SHR 9, 10, 6, 5, 32 | ||
| 1406 | + addc 24, 24, 10 | ||
| 1407 | + adde 25, 25, 9 | ||
| 1408 | + | ||
| 1409 | + # acc[4] += (in[9] & 0xffffffff) << 24; | ||
| 1410 | + srdi 11, 7, 32 # 0xffffffff | ||
| 1411 | + and 11, 11, 5 | ||
| 1412 | + sldi 11, 11, 24 # << 24 | ||
| 1413 | + addc 22, 22, 11 | ||
| 1414 | + addze 23, 23 | ||
| 1415 | + | ||
| 1416 | + # acc[4] += in[9] >> 8; | ||
| 1417 | + SHR 9, 10, 6, 5, 8 | ||
| 1418 | + addc 22, 22, 10 | ||
| 1419 | + adde 23, 23, 9 | ||
| 1420 | + | ||
| 1421 | + # acc[3] += (in[9] & 0xff) << 48; | ||
| 1422 | + andi. 11, 5, 0xff | ||
| 1423 | + sldi 11, 11, 48 | ||
| 1424 | + addc 20, 20, 11 | ||
| 1425 | + addze 21, 21 | ||
| 1426 | + | ||
| 1427 | + # acc[3] -= in[9] >> 16; | ||
| 1428 | + SHR 9, 10, 6, 5, 16 | ||
| 1429 | + subfc 20, 10, 20 | ||
| 1430 | + subfe 21, 9, 21 | ||
| 1431 | + | ||
| 1432 | + # acc[2] -= (in[9] & 0xffff) << 40; | ||
| 1433 | + srdi 11, 7, 48 # 0xffff | ||
| 1434 | + and 11, 11, 5 | ||
| 1435 | + sldi 11, 11, 40 # << 40 | ||
| 1436 | + li 9, 0 | ||
| 1437 | + subfc 18, 11, 18 | ||
| 1438 | + subfe 19, 9, 19 | ||
| 1439 | + | ||
| 1440 | + # acc[3] += in[9] >> 48; | ||
| 1441 | + SHR 9, 10, 6, 5, 48 | ||
| 1442 | + addc 20, 20, 10 | ||
| 1443 | + adde 21, 21, 9 | ||
| 1444 | + | ||
| 1445 | + # acc[2] += (in[9] & 0xffffffffffff) << 8; | ||
| 1446 | + srdi 11, 7, 16 # 0xffffffffffff | ||
| 1447 | + and 11, 11, 5 | ||
| 1448 | + sldi 11, 11, 8 # << 8 | ||
| 1449 | + addc 18, 18, 11 | ||
| 1450 | + addze 19, 19 | ||
| 1451 | + | ||
| 1452 | + # Eliminate acc[8] | ||
| 1453 | + | ||
| 1454 | + # acc[4] += acc[8] >> 32; | ||
| 1455 | + mr 5, 30 # acc[8].l | ||
| 1456 | + mr 6, 31 # acc[8].h | ||
| 1457 | + SHR 9, 10, 6, 5, 32 | ||
| 1458 | + addc 22, 22, 10 | ||
| 1459 | + adde 23, 23, 9 | ||
| 1460 | + | ||
| 1461 | + # acc[3] += (acc[8] & 0xffffffff) << 24; | ||
| 1462 | + srdi 11, 7, 32 # 0xffffffff | ||
| 1463 | + and 11, 11, 5 | ||
| 1464 | + sldi 11, 11, 24 # << 24 | ||
| 1465 | + addc 20, 20, 11 | ||
| 1466 | + addze 21, 21 | ||
| 1467 | + | ||
| 1468 | + # acc[3] += acc[8] >> 8; | ||
| 1469 | + SHR 9, 10, 6, 5, 8 | ||
| 1470 | + addc 20, 20, 10 | ||
| 1471 | + adde 21, 21, 9 | ||
| 1472 | + | ||
| 1473 | + # acc[2] += (acc[8] & 0xff) << 48; | ||
| 1474 | + andi. 11, 5, 0xff | ||
| 1475 | + sldi 11, 11, 48 | ||
| 1476 | + addc 18, 18, 11 | ||
| 1477 | + addze 19, 19 | ||
| 1478 | + | ||
| 1479 | + # acc[2] -= acc[8] >> 16; | ||
| 1480 | + SHR 9, 10, 6, 5, 16 | ||
| 1481 | + subfc 18, 10, 18 | ||
| 1482 | + subfe 19, 9, 19 | ||
| 1483 | + | ||
| 1484 | + # acc[1] -= (acc[8] & 0xffff) << 40; | ||
| 1485 | + srdi 11, 7, 48 # 0xffff | ||
| 1486 | + and 11, 11, 5 | ||
| 1487 | + sldi 11, 11, 40 # << 40 | ||
| 1488 | + li 9, 0 | ||
| 1489 | + subfc 16, 11, 16 | ||
| 1490 | + subfe 17, 9, 17 | ||
| 1491 | + | ||
| 1492 | + #acc[2] += acc[8] >> 48; | ||
| 1493 | + SHR 9, 10, 6, 5, 48 | ||
| 1494 | + addc 18, 18, 10 | ||
| 1495 | + adde 19, 19, 9 | ||
| 1496 | + | ||
| 1497 | + # acc[1] += (acc[8] & 0xffffffffffff) << 8; | ||
| 1498 | + srdi 11, 7, 16 # 0xffffffffffff | ||
| 1499 | + and 11, 11, 5 | ||
| 1500 | + sldi 11, 11, 8 # << 8 | ||
| 1501 | + addc 16, 16, 11 | ||
| 1502 | + addze 17, 17 | ||
| 1503 | + | ||
| 1504 | + # Eliminate acc[7] | ||
| 1505 | + | ||
| 1506 | + # acc[3] += acc[7] >> 32; | ||
| 1507 | + mr 5, 28 # acc[7].l | ||
| 1508 | + mr 6, 29 # acc[7].h | ||
| 1509 | + SHR 9, 10, 6, 5, 32 | ||
| 1510 | + addc 20, 20, 10 | ||
| 1511 | + adde 21, 21, 9 | ||
| 1512 | + | ||
| 1513 | + # acc[2] += (acc[7] & 0xffffffff) << 24; | ||
| 1514 | + srdi 11, 7, 32 # 0xffffffff | ||
| 1515 | + and 11, 11, 5 | ||
| 1516 | + sldi 11, 11, 24 # << 24 | ||
| 1517 | + addc 18, 18, 11 | ||
| 1518 | + addze 19, 19 | ||
| 1519 | + | ||
| 1520 | + # acc[2] += acc[7] >> 8; | ||
| 1521 | + SHR 9, 10, 6, 5, 8 | ||
| 1522 | + addc 18, 18, 10 | ||
| 1523 | + adde 19, 19, 9 | ||
| 1524 | + | ||
| 1525 | + # acc[1] += (acc[7] & 0xff) << 48; | ||
| 1526 | + andi. 11, 5, 0xff | ||
| 1527 | + sldi 11, 11, 48 | ||
| 1528 | + addc 16, 16, 11 | ||
| 1529 | + addze 17, 17 | ||
| 1530 | + | ||
| 1531 | + # acc[1] -= acc[7] >> 16; | ||
| 1532 | + SHR 9, 10, 6, 5, 16 | ||
| 1533 | + subfc 16, 10, 16 | ||
| 1534 | + subfe 17, 9, 17 | ||
| 1535 | + | ||
| 1536 | + # acc[0] -= (acc[7] & 0xffff) << 40; | ||
| 1537 | + srdi 11, 7, 48 # 0xffff | ||
| 1538 | + and 11, 11, 5 | ||
| 1539 | + sldi 11, 11, 40 # << 40 | ||
| 1540 | + li 9, 0 | ||
| 1541 | + subfc 14, 11, 14 | ||
| 1542 | + subfe 15, 9, 15 | ||
| 1543 | + | ||
| 1544 | + # acc[1] += acc[7] >> 48; | ||
| 1545 | + SHR 9, 10, 6, 5, 48 | ||
| 1546 | + addc 16, 16, 10 | ||
| 1547 | + adde 17, 17, 9 | ||
| 1548 | + | ||
| 1549 | + # acc[0] += (acc[7] & 0xffffffffffff) << 8; | ||
| 1550 | + srdi 11, 7, 16 # 0xffffffffffff | ||
| 1551 | + and 11, 11, 5 | ||
| 1552 | + sldi 11, 11, 8 # << 8 | ||
| 1553 | + addc 14, 14, 11 | ||
| 1554 | + addze 15, 15 | ||
| 1555 | + | ||
| 1556 | + # | ||
| 1557 | + # Carry 4 -> 5 -> 6 | ||
| 1558 | + # | ||
| 1559 | + # acc[5] += acc[4] >> 56; | ||
| 1560 | + # acc[4] &= 0x00ffffffffffffff; | ||
| 1561 | + SHR 9, 10, 23, 22, 56 | ||
| 1562 | + addc 24, 24, 10 | ||
| 1563 | + adde 25, 25, 9 | ||
| 1564 | + srdi 11, 7, 8 # 0x00ffffffffffffff | ||
| 1565 | + and 22, 22, 11 | ||
| 1566 | + li 23, 0 | ||
| 1567 | + | ||
| 1568 | + # acc[6] += acc[5] >> 56; | ||
| 1569 | + # acc[5] &= 0x00ffffffffffffff; | ||
| 1570 | + SHR 9, 10, 25, 24, 56 | ||
| 1571 | + addc 26, 26, 10 | ||
| 1572 | + adde 27, 27, 9 | ||
| 1573 | + and 24, 24, 11 | ||
| 1574 | + li 25, 0 | ||
| 1575 | + | ||
| 1576 | + # [3]: Eliminate high bits of acc[6] | ||
| 1577 | + # temp = acc[6] >> 48; | ||
| 1578 | + # acc[6] &= 0x0000ffffffffffff; | ||
| 1579 | + SHR 31, 30, 27, 26, 48 # temp = acc[6] >> 48 | ||
| 1580 | + srdi 11, 7, 16 # 0x0000ffffffffffff | ||
| 1581 | + and 26, 26, 11 | ||
| 1582 | + li 27, 0 | ||
| 1583 | + | ||
| 1584 | + # temp < 2^80 | ||
| 1585 | + # acc[3] += temp >> 40; | ||
| 1586 | + SHR 9, 10, 31, 30, 40 | ||
| 1587 | + addc 20, 20, 10 | ||
| 1588 | + adde 21, 21, 9 | ||
| 1589 | + | ||
| 1590 | + # acc[2] += (temp & 0xffffffffff) << 16; | ||
| 1591 | + srdi 11, 7, 24 # 0xffffffffff | ||
| 1592 | + and 10, 30, 11 | ||
| 1593 | + sldi 10, 10, 16 | ||
| 1594 | + addc 18, 18, 10 | ||
| 1595 | + addze 19, 19 | ||
| 1596 | + | ||
| 1597 | + # acc[2] += temp >> 16; | ||
| 1598 | + SHR 9, 10, 31, 30, 16 | ||
| 1599 | + addc 18, 18, 10 | ||
| 1600 | + adde 19, 19, 9 | ||
| 1601 | + | ||
| 1602 | + # acc[1] += (temp & 0xffff) << 40; | ||
| 1603 | + srdi 11, 7, 48 # 0xffff | ||
| 1604 | + and 10, 30, 11 | ||
| 1605 | + sldi 10, 10, 40 | ||
| 1606 | + addc 16, 16, 10 | ||
| 1607 | + addze 17, 17 | ||
| 1608 | + | ||
| 1609 | + # acc[1] -= temp >> 24; | ||
| 1610 | + SHR 9, 10, 31, 30, 24 | ||
| 1611 | + subfc 16, 10, 16 | ||
| 1612 | + subfe 17, 9, 17 | ||
| 1613 | + | ||
| 1614 | + # acc[0] -= (temp & 0xffffff) << 32; | ||
| 1615 | + srdi 11, 7, 40 # 0xffffff | ||
| 1616 | + and 10, 30, 11 | ||
| 1617 | + sldi 10, 10, 32 | ||
| 1618 | + li 9, 0 | ||
| 1619 | + subfc 14, 10, 14 | ||
| 1620 | + subfe 15, 9, 15 | ||
| 1621 | + | ||
| 1622 | + # acc[0] += temp; | ||
| 1623 | + addc 14, 14, 30 | ||
| 1624 | + adde 15, 15, 31 | ||
| 1625 | + | ||
| 1626 | + # Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 | ||
| 1627 | + # | ||
| 1628 | + # acc[1] += acc[0] >> 56; /* acc[1] < acc_old[1] + 2^72 */ | ||
| 1629 | + SHR 9, 10, 15, 14, 56 | ||
| 1630 | + addc 16, 16, 10 | ||
| 1631 | + adde 17, 17, 9 | ||
| 1632 | + | ||
| 1633 | + # acc[0] &= 0x00ffffffffffffff; | ||
| 1634 | + srdi 11, 7, 8 # 0x00ffffffffffffff | ||
| 1635 | + and 14, 14, 11 | ||
| 1636 | + li 15, 0 | ||
| 1637 | + | ||
| 1638 | + # acc[2] += acc[1] >> 56; /* acc[2] < acc_old[2] + 2^72 + 2^16 */ | ||
| 1639 | + SHR 9, 10, 17, 16, 56 | ||
| 1640 | + addc 18, 18, 10 | ||
| 1641 | + adde 19, 19, 9 | ||
| 1642 | + | ||
| 1643 | + # acc[1] &= 0x00ffffffffffffff; | ||
| 1644 | + and 16, 16, 11 | ||
| 1645 | + li 17, 0 | ||
| 1646 | + | ||
| 1647 | + # acc[3] += acc[2] >> 56; /* acc[3] < acc_old[3] + 2^72 + 2^16 */ | ||
| 1648 | + SHR 9, 10, 19, 18, 56 | ||
| 1649 | + addc 20, 20, 10 | ||
| 1650 | + adde 21, 21, 9 | ||
| 1651 | + | ||
| 1652 | + # acc[2] &= 0x00ffffffffffffff; | ||
| 1653 | + and 18, 18, 11 | ||
| 1654 | + li 19, 0 | ||
| 1655 | + | ||
| 1656 | + # acc[4] += acc[3] >> 56; | ||
| 1657 | + SHR 9, 10, 21, 20, 56 | ||
| 1658 | + addc 22, 22, 10 | ||
| 1659 | + adde 23, 23, 9 | ||
| 1660 | + | ||
| 1661 | + # acc[3] &= 0x00ffffffffffffff; | ||
| 1662 | + and 20, 20, 11 | ||
| 1663 | + li 21, 0 | ||
| 1664 | + | ||
| 1665 | + # acc[5] += acc[4] >> 56; | ||
| 1666 | + SHR 9, 10, 23, 22, 56 | ||
| 1667 | + addc 24, 24, 10 | ||
| 1668 | + adde 25, 25, 9 | ||
| 1669 | + | ||
| 1670 | + # acc[4] &= 0x00ffffffffffffff; | ||
| 1671 | + and 22, 22, 11 | ||
| 1672 | + | ||
| 1673 | + # acc[6] += acc[5] >> 56; | ||
| 1674 | + SHR 9, 10, 25, 24, 56 | ||
| 1675 | + addc 26, 26, 10 | ||
| 1676 | + adde 27, 27, 9 | ||
| 1677 | + | ||
| 1678 | + # acc[5] &= 0x00ffffffffffffff; | ||
| 1679 | + and 24, 24, 11 | ||
| 1680 | + | ||
| 1681 | + std 14, 0(3) | ||
| 1682 | + std 16, 8(3) | ||
| 1683 | + std 18, 16(3) | ||
| 1684 | + std 20, 24(3) | ||
| 1685 | + std 22, 32(3) | ||
| 1686 | + std 24, 40(3) | ||
| 1687 | + std 26, 48(3) | ||
| 1688 | + blr | ||
| 1689 | +.size _p384_felem_reduce_core,.-_p384_felem_reduce_core | ||
| 1690 | + | ||
| 1691 | +.data | ||
| 1692 | +.align 4 | ||
| 1693 | +.LConst: | ||
| 1694 | +# two124m68: | ||
| 1695 | +.long 0x0, 0x0, 0xfffffff0, 0xfffffff | ||
| 1696 | +# two124m116m68: | ||
| 1697 | +.long 0x0, 0x0, 0xfffffff0, 0xfefffff | ||
| 1698 | +#two124p108m76: | ||
| 1699 | +.long 0x0, 0x0, 0xfffff000, 0x10000fff | ||
| 1700 | +#two124m92m68: | ||
| 1701 | +.long 0x0, 0x0, 0xeffffff0, 0xfffffff | ||
| 1702 | + | ||
| 1703 | +.text | ||
| 1704 | + | ||
| 1705 | +# | ||
| 1706 | +# void p384_felem_square_reduce(felem out, const felem in) | ||
| 1707 | +# | ||
| 1708 | +.global p384_felem_square_reduce | ||
| 1709 | +.type p384_felem_square_reduce,\@function | ||
| 1710 | +.align 4 | ||
| 1711 | +p384_felem_square_reduce: | ||
| 1712 | + stdu 1, -512(1) | ||
| 1713 | + mflr 0 | ||
| 1714 | + std 14, 56(1) | ||
| 1715 | + std 15, 64(1) | ||
| 1716 | + std 16, 72(1) | ||
| 1717 | + std 17, 80(1) | ||
| 1718 | + std 18, 88(1) | ||
| 1719 | + std 19, 96(1) | ||
| 1720 | + std 20, 104(1) | ||
| 1721 | + std 21, 112(1) | ||
| 1722 | + std 22, 120(1) | ||
| 1723 | + std 23, 128(1) | ||
| 1724 | + std 24, 136(1) | ||
| 1725 | + std 25, 144(1) | ||
| 1726 | + std 26, 152(1) | ||
| 1727 | + std 27, 160(1) | ||
| 1728 | + std 28, 168(1) | ||
| 1729 | + std 29, 176(1) | ||
| 1730 | + std 30, 184(1) | ||
| 1731 | + std 31, 192(1) | ||
| 1732 | + | ||
| 1733 | + std 3, 496(1) | ||
| 1734 | + addi 3, 1, 208 | ||
| 1735 | + bl _p384_felem_square_core | ||
| 1736 | + | ||
| 1737 | + mr 4, 3 | ||
| 1738 | + ld 3, 496(1) | ||
| 1739 | + bl _p384_felem_reduce_core | ||
| 1740 | + | ||
| 1741 | + ld 14, 56(1) | ||
| 1742 | + ld 15, 64(1) | ||
| 1743 | + ld 16, 72(1) | ||
| 1744 | + ld 17, 80(1) | ||
| 1745 | + ld 18, 88(1) | ||
| 1746 | + ld 19, 96(1) | ||
| 1747 | + ld 20, 104(1) | ||
| 1748 | + ld 21, 112(1) | ||
| 1749 | + ld 22, 120(1) | ||
| 1750 | + ld 23, 128(1) | ||
| 1751 | + ld 24, 136(1) | ||
| 1752 | + ld 25, 144(1) | ||
| 1753 | + ld 26, 152(1) | ||
| 1754 | + ld 27, 160(1) | ||
| 1755 | + ld 28, 168(1) | ||
| 1756 | + ld 29, 176(1) | ||
| 1757 | + ld 30, 184(1) | ||
| 1758 | + ld 31, 192(1) | ||
| 1759 | + addi 1, 1, 512 | ||
| 1760 | + mtlr 0 | ||
| 1761 | + blr | ||
| 1762 | +.size p384_felem_square_reduce,.-p384_felem_square_reduce | ||
| 1763 | + | ||
| 1764 | +# | ||
| 1765 | +# void p384_felem_mul_reduce(felem out, const felem in1, const felem in2) | ||
| 1766 | +# | ||
| 1767 | +.global p384_felem_mul_reduce | ||
| 1768 | +.type p384_felem_mul_reduce,\@function | ||
| 1769 | +.align 5 | ||
| 1770 | +p384_felem_mul_reduce: | ||
| 1771 | + stdu 1, -512(1) | ||
| 1772 | + mflr 0 | ||
| 1773 | + std 14, 56(1) | ||
| 1774 | + std 15, 64(1) | ||
| 1775 | + std 16, 72(1) | ||
| 1776 | + std 17, 80(1) | ||
| 1777 | + std 18, 88(1) | ||
| 1778 | + std 19, 96(1) | ||
| 1779 | + std 20, 104(1) | ||
| 1780 | + std 21, 112(1) | ||
| 1781 | + std 22, 120(1) | ||
| 1782 | + std 23, 128(1) | ||
| 1783 | + std 24, 136(1) | ||
| 1784 | + std 25, 144(1) | ||
| 1785 | + std 26, 152(1) | ||
| 1786 | + std 27, 160(1) | ||
| 1787 | + std 28, 168(1) | ||
| 1788 | + std 29, 176(1) | ||
| 1789 | + std 30, 184(1) | ||
| 1790 | + std 31, 192(1) | ||
| 1791 | + | ||
| 1792 | + std 3, 496(1) | ||
| 1793 | + addi 3, 1, 208 | ||
| 1794 | + bl _p384_felem_mul_core | ||
| 1795 | + | ||
| 1796 | + mr 4, 3 | ||
| 1797 | + ld 3, 496(1) | ||
| 1798 | + bl _p384_felem_reduce_core | ||
| 1799 | + | ||
| 1800 | + ld 14, 56(1) | ||
| 1801 | + ld 15, 64(1) | ||
| 1802 | + ld 16, 72(1) | ||
| 1803 | + ld 17, 80(1) | ||
| 1804 | + ld 18, 88(1) | ||
| 1805 | + ld 19, 96(1) | ||
| 1806 | + ld 20, 104(1) | ||
| 1807 | + ld 21, 112(1) | ||
| 1808 | + ld 22, 120(1) | ||
| 1809 | + ld 23, 128(1) | ||
| 1810 | + ld 24, 136(1) | ||
| 1811 | + ld 25, 144(1) | ||
| 1812 | + ld 26, 152(1) | ||
| 1813 | + ld 27, 160(1) | ||
| 1814 | + ld 28, 168(1) | ||
| 1815 | + ld 29, 176(1) | ||
| 1816 | + ld 30, 184(1) | ||
| 1817 | + ld 31, 192(1) | ||
| 1818 | + addi 1, 1, 512 | ||
| 1819 | + mtlr 0 | ||
| 1820 | + blr | ||
| 1821 | +.size p384_felem_mul_reduce,.-p384_felem_mul_reduce | ||
| 1822 | +___ | ||
| 1823 | |||
| 1824 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 1825 | print $code; | ||
| 1826 | diff --git a/crypto/ec/ecp_nistp384.c b/crypto/ec/ecp_nistp384.c | ||
| 1827 | index 3fd7a40020..e0b5786bc1 100644 | ||
| 1828 | --- a/crypto/ec/ecp_nistp384.c | ||
| 1829 | +++ b/crypto/ec/ecp_nistp384.c | ||
| 1830 | @@ -252,6 +252,16 @@ static void felem_neg(felem out, const felem in) | ||
| 1831 | out[6] = two60m4 - in[6]; | ||
| 1832 | } | ||
| 1833 | |||
| 1834 | +#if defined(ECP_NISTP384_ASM) | ||
| 1835 | +void p384_felem_diff64(felem out, const felem in); | ||
| 1836 | +void p384_felem_diff128(widefelem out, const widefelem in); | ||
| 1837 | +void p384_felem_diff_128_64(widefelem out, const felem in); | ||
| 1838 | + | ||
| 1839 | +# define felem_diff64 p384_felem_diff64 | ||
| 1840 | +# define felem_diff128 p384_felem_diff128 | ||
| 1841 | +# define felem_diff_128_64 p384_felem_diff_128_64 | ||
| 1842 | + | ||
| 1843 | +#else | ||
| 1844 | /*- | ||
| 1845 | * felem_diff64 subtracts |in| from |out| | ||
| 1846 | * On entry: | ||
| 1847 | @@ -369,6 +379,7 @@ static void felem_diff128(widefelem out, const widefelem in) | ||
| 1848 | for (i = 0; i < 2*NLIMBS-1; i++) | ||
| 1849 | out[i] -= in[i]; | ||
| 1850 | } | ||
| 1851 | +#endif /* ECP_NISTP384_ASM */ | ||
| 1852 | |||
| 1853 | static void felem_square_ref(widefelem out, const felem in) | ||
| 1854 | { | ||
| 1855 | @@ -503,7 +514,7 @@ static void felem_mul_ref(widefelem out, const felem in1, const felem in2) | ||
| 1856 | * [3]: Y = 2^48 (acc[6] >> 48) | ||
| 1857 | * (Where a | b | c | d = (2^56)^3 a + (2^56)^2 b + (2^56) c + d) | ||
| 1858 | */ | ||
| 1859 | -static void felem_reduce(felem out, const widefelem in) | ||
| 1860 | +static void felem_reduce_ref(felem out, const widefelem in) | ||
| 1861 | { | ||
| 1862 | /* | ||
| 1863 | * In order to prevent underflow, we add a multiple of p before subtracting. | ||
| 1864 | @@ -682,8 +693,11 @@ static void (*felem_square_p)(widefelem out, const felem in) = | ||
| 1865 | static void (*felem_mul_p)(widefelem out, const felem in1, const felem in2) = | ||
| 1866 | felem_mul_wrapper; | ||
| 1867 | |||
| 1868 | +static void (*felem_reduce_p)(felem out, const widefelem in) = felem_reduce_ref; | ||
| 1869 | + | ||
| 1870 | void p384_felem_square(widefelem out, const felem in); | ||
| 1871 | void p384_felem_mul(widefelem out, const felem in1, const felem in2); | ||
| 1872 | +void p384_felem_reduce(felem out, const widefelem in); | ||
| 1873 | |||
| 1874 | # if defined(_ARCH_PPC64) | ||
| 1875 | # include "crypto/ppc_arch.h" | ||
| 1876 | @@ -695,6 +709,7 @@ static void felem_select(void) | ||
| 1877 | if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) { | ||
| 1878 | felem_square_p = p384_felem_square; | ||
| 1879 | felem_mul_p = p384_felem_mul; | ||
| 1880 | + felem_reduce_p = p384_felem_reduce; | ||
| 1881 | |||
| 1882 | return; | ||
| 1883 | } | ||
| 1884 | @@ -703,6 +718,7 @@ static void felem_select(void) | ||
| 1885 | /* Default */ | ||
| 1886 | felem_square_p = felem_square_ref; | ||
| 1887 | felem_mul_p = felem_mul_ref; | ||
| 1888 | + felem_reduce_p = p384_felem_reduce; | ||
| 1889 | } | ||
| 1890 | |||
| 1891 | static void felem_square_wrapper(widefelem out, const felem in) | ||
| 1892 | @@ -719,10 +735,17 @@ static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2) | ||
| 1893 | |||
| 1894 | # define felem_square felem_square_p | ||
| 1895 | # define felem_mul felem_mul_p | ||
| 1896 | +# define felem_reduce felem_reduce_p | ||
| 1897 | + | ||
| 1898 | +void p384_felem_square_reduce(felem out, const felem in); | ||
| 1899 | +void p384_felem_mul_reduce(felem out, const felem in1, const felem in2); | ||
| 1900 | + | ||
| 1901 | +# define felem_square_reduce p384_felem_square_reduce | ||
| 1902 | +# define felem_mul_reduce p384_felem_mul_reduce | ||
| 1903 | #else | ||
| 1904 | # define felem_square felem_square_ref | ||
| 1905 | # define felem_mul felem_mul_ref | ||
| 1906 | -#endif | ||
| 1907 | +# define felem_reduce felem_reduce_ref | ||
| 1908 | |||
| 1909 | static ossl_inline void felem_square_reduce(felem out, const felem in) | ||
| 1910 | { | ||
| 1911 | @@ -739,6 +762,7 @@ static ossl_inline void felem_mul_reduce(felem out, const felem in1, const felem | ||
| 1912 | felem_mul(tmp, in1, in2); | ||
| 1913 | felem_reduce(out, tmp); | ||
| 1914 | } | ||
| 1915 | +#endif | ||
| 1916 | |||
| 1917 | /*- | ||
| 1918 | * felem_inv calculates |out| = |in|^{-1} | ||
diff --git a/meta/recipes-connectivity/openssl/openssl/CVE-2025-27587-2.patch b/meta/recipes-connectivity/openssl/openssl/CVE-2025-27587-2.patch new file mode 100644 index 0000000000..0659a9d6d9 --- /dev/null +++ b/meta/recipes-connectivity/openssl/openssl/CVE-2025-27587-2.patch | |||
| @@ -0,0 +1,129 @@ | |||
| 1 | From 6b1646e472c9e8c08bb14066ba2a7c3eed45f84a Mon Sep 17 00:00:00 2001 | ||
| 2 | From: "A. Wilcox" <AWilcox@Wilcox-Tech.com> | ||
| 3 | Date: Thu, 17 Apr 2025 08:51:53 -0500 | ||
| 4 | Subject: [PATCH] Fix P-384 curve on lower-than-P9 PPC64 targets | ||
| 5 | |||
| 6 | The change adding an asm implementation of p384_felem_reduce incorrectly | ||
| 7 | uses the accelerated version on both targets that support the intrinsics | ||
| 8 | *and* targets that don't, instead of falling back to the generics on older | ||
| 9 | targets. This results in crashes when trying to use P-384 on < Power9. | ||
| 10 | |||
| 11 | Signed-off-by: Anna Wilcox <AWilcox@Wilcox-Tech.com> | ||
| 12 | Closes: #27350 | ||
| 13 | Fixes: 85cabd94 ("Fix Minerva timing side-channel signal for P-384 curve on PPC") | ||
| 14 | |||
| 15 | Reviewed-by: Dmitry Belyavskiy <beldmit@gmail.com> | ||
| 16 | Reviewed-by: Tomas Mraz <tomas@openssl.org> | ||
| 17 | (Merged from https://github.com/openssl/openssl/pull/27429) | ||
| 18 | |||
| 19 | (cherry picked from commit 29864f2b0f1046177e8048a5b17440893d3f9425) | ||
| 20 | |||
| 21 | CVE: CVE-2025-27587 | ||
| 22 | Upstream-Status: Backport [https://github.com/openssl/openssl/commit/6b1646e472c9e8c08bb14066ba2a7c3eed45f84a] | ||
| 23 | Signed-off-by: Peter Marko <peter.marko@siemens.com> | ||
| 24 | --- | ||
| 25 | crypto/ec/ecp_nistp384.c | 54 ++++++++++++++++++++++++---------------- | ||
| 26 | 1 file changed, 33 insertions(+), 21 deletions(-) | ||
| 27 | |||
| 28 | diff --git a/crypto/ec/ecp_nistp384.c b/crypto/ec/ecp_nistp384.c | ||
| 29 | index e0b5786bc1..439b4d03a3 100644 | ||
| 30 | --- a/crypto/ec/ecp_nistp384.c | ||
| 31 | +++ b/crypto/ec/ecp_nistp384.c | ||
| 32 | @@ -684,6 +684,22 @@ static void felem_reduce_ref(felem out, const widefelem in) | ||
| 33 | out[i] = acc[i]; | ||
| 34 | } | ||
| 35 | |||
| 36 | +static ossl_inline void felem_square_reduce_ref(felem out, const felem in) | ||
| 37 | +{ | ||
| 38 | + widefelem tmp; | ||
| 39 | + | ||
| 40 | + felem_square_ref(tmp, in); | ||
| 41 | + felem_reduce_ref(out, tmp); | ||
| 42 | +} | ||
| 43 | + | ||
| 44 | +static ossl_inline void felem_mul_reduce_ref(felem out, const felem in1, const felem in2) | ||
| 45 | +{ | ||
| 46 | + widefelem tmp; | ||
| 47 | + | ||
| 48 | + felem_mul_ref(tmp, in1, in2); | ||
| 49 | + felem_reduce_ref(out, tmp); | ||
| 50 | +} | ||
| 51 | + | ||
| 52 | #if defined(ECP_NISTP384_ASM) | ||
| 53 | static void felem_square_wrapper(widefelem out, const felem in); | ||
| 54 | static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2); | ||
| 55 | @@ -695,10 +711,18 @@ static void (*felem_mul_p)(widefelem out, const felem in1, const felem in2) = | ||
| 56 | |||
| 57 | static void (*felem_reduce_p)(felem out, const widefelem in) = felem_reduce_ref; | ||
| 58 | |||
| 59 | +static void (*felem_square_reduce_p)(felem out, const felem in) = | ||
| 60 | + felem_square_reduce_ref; | ||
| 61 | +static void (*felem_mul_reduce_p)(felem out, const felem in1, const felem in2) = | ||
| 62 | + felem_mul_reduce_ref; | ||
| 63 | + | ||
| 64 | void p384_felem_square(widefelem out, const felem in); | ||
| 65 | void p384_felem_mul(widefelem out, const felem in1, const felem in2); | ||
| 66 | void p384_felem_reduce(felem out, const widefelem in); | ||
| 67 | |||
| 68 | +void p384_felem_square_reduce(felem out, const felem in); | ||
| 69 | +void p384_felem_mul_reduce(felem out, const felem in1, const felem in2); | ||
| 70 | + | ||
| 71 | # if defined(_ARCH_PPC64) | ||
| 72 | # include "crypto/ppc_arch.h" | ||
| 73 | # endif | ||
| 74 | @@ -710,6 +734,8 @@ static void felem_select(void) | ||
| 75 | felem_square_p = p384_felem_square; | ||
| 76 | felem_mul_p = p384_felem_mul; | ||
| 77 | felem_reduce_p = p384_felem_reduce; | ||
| 78 | + felem_square_reduce_p = p384_felem_square_reduce; | ||
| 79 | + felem_mul_reduce_p = p384_felem_mul_reduce; | ||
| 80 | |||
| 81 | return; | ||
| 82 | } | ||
| 83 | @@ -718,7 +744,9 @@ static void felem_select(void) | ||
| 84 | /* Default */ | ||
| 85 | felem_square_p = felem_square_ref; | ||
| 86 | felem_mul_p = felem_mul_ref; | ||
| 87 | - felem_reduce_p = p384_felem_reduce; | ||
| 88 | + felem_reduce_p = felem_reduce_ref; | ||
| 89 | + felem_square_reduce_p = felem_square_reduce_ref; | ||
| 90 | + felem_mul_reduce_p = felem_mul_reduce_ref; | ||
| 91 | } | ||
| 92 | |||
| 93 | static void felem_square_wrapper(widefelem out, const felem in) | ||
| 94 | @@ -737,31 +765,15 @@ static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2) | ||
| 95 | # define felem_mul felem_mul_p | ||
| 96 | # define felem_reduce felem_reduce_p | ||
| 97 | |||
| 98 | -void p384_felem_square_reduce(felem out, const felem in); | ||
| 99 | -void p384_felem_mul_reduce(felem out, const felem in1, const felem in2); | ||
| 100 | - | ||
| 101 | -# define felem_square_reduce p384_felem_square_reduce | ||
| 102 | -# define felem_mul_reduce p384_felem_mul_reduce | ||
| 103 | +# define felem_square_reduce felem_square_reduce_p | ||
| 104 | +# define felem_mul_reduce felem_mul_reduce_p | ||
| 105 | #else | ||
| 106 | # define felem_square felem_square_ref | ||
| 107 | # define felem_mul felem_mul_ref | ||
| 108 | # define felem_reduce felem_reduce_ref | ||
| 109 | |||
| 110 | -static ossl_inline void felem_square_reduce(felem out, const felem in) | ||
| 111 | -{ | ||
| 112 | - widefelem tmp; | ||
| 113 | - | ||
| 114 | - felem_square(tmp, in); | ||
| 115 | - felem_reduce(out, tmp); | ||
| 116 | -} | ||
| 117 | - | ||
| 118 | -static ossl_inline void felem_mul_reduce(felem out, const felem in1, const felem in2) | ||
| 119 | -{ | ||
| 120 | - widefelem tmp; | ||
| 121 | - | ||
| 122 | - felem_mul(tmp, in1, in2); | ||
| 123 | - felem_reduce(out, tmp); | ||
| 124 | -} | ||
| 125 | +# define felem_square_reduce felem_square_reduce_ref | ||
| 126 | +# define felem_mul_reduce felem_mul_reduce_ref | ||
| 127 | #endif | ||
| 128 | |||
| 129 | /*- | ||
diff --git a/meta/recipes-connectivity/openssl/openssl_3.2.4.bb b/meta/recipes-connectivity/openssl/openssl_3.2.4.bb index d6bf32d989..fd98b32007 100644 --- a/meta/recipes-connectivity/openssl/openssl_3.2.4.bb +++ b/meta/recipes-connectivity/openssl/openssl_3.2.4.bb | |||
| @@ -13,6 +13,8 @@ SRC_URI = "https://github.com/openssl/openssl/releases/download/openssl-${PV}/op | |||
| 13 | file://0001-Configure-do-not-tweak-mips-cflags.patch \ | 13 | file://0001-Configure-do-not-tweak-mips-cflags.patch \ |
| 14 | file://0001-Added-handshake-history-reporting-when-test-fails.patch \ | 14 | file://0001-Added-handshake-history-reporting-when-test-fails.patch \ |
| 15 | file://CVE-2024-41996.patch \ | 15 | file://CVE-2024-41996.patch \ |
| 16 | file://CVE-2025-27587-1.patch \ | ||
| 17 | file://CVE-2025-27587-2.patch \ | ||
| 16 | " | 18 | " |
| 17 | 19 | ||
| 18 | SRC_URI:append:class-nativesdk = " \ | 20 | SRC_URI:append:class-nativesdk = " \ |
