/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * x86_64/AVX2 assembler optimized version of Serpent
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on AVX assembler implementation of Serpent by:
 *  Copyright © 2012 Johannes Goetzfried
 *      <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx2.S"

.file "serpent-avx2-asm_64.S"

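/*
 * vpshufb mask that reverses the byte order of each 128-bit lane; passed to
 * load_ctr_16way below to convert between the little-endian counter and the
 * big-endian on-the-wire block representation in CTR mode.
 */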
.section        .rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

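/*
 * Constants for the XTS tweak computation in load_xts_16way: multiplying a
 * tweak by x (α) in GF(2^128) is a 1-bit left shift plus a conditional XOR
 * with the reduction polynomial 0x87.  This mask supplies the reduction
 * byte and the bit carried between the two 64-bit halves.
 */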
.section        .rodata.cst16.xts_gf128mul_and_shl1_mask_0, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask_0:
        .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

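/*
 * Likewise for multiplying by x² (α²) in a single step: the shift is by two
 * bits, so the reduction constant becomes 0x87 << 1 = 0x10e and the
 * inter-qword carry doubles accordingly.
 */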
.section        .rodata.cst16.xts_gf128mul_and_shl1_mask_1, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask_1:
        .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0

.text

#define CTX %rdi

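/*
 * RNOT is kept filled with all-ones (see the vpcmpeqd at the top of the
 * block functions) so that "vpxor RNOT, x, x" implements bitwise NOT;
 * tp is a scratch register for the S-box macros.
 */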
#define RNOT %ymm0
#define tp  %ymm1

#define RA1 %ymm2
#define RA2 %ymm3
#define RB1 %ymm4
#define RB2 %ymm5
#define RC1 %ymm6
#define RC2 %ymm7
#define RD1 %ymm8
#define RD2 %ymm9
#define RE1 %ymm10
#define RE2 %ymm11

#define RK0 %ymm12
#define RK1 %ymm13
#define RK2 %ymm14
#define RK3 %ymm15

#define RK0x %xmm12
#define RK1x %xmm13
#define RK2x %xmm14
#define RK3x %xmm15

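/*
 * Bitsliced Serpent S-boxes.  Each macro pair computes a 4x4-bit S-box on
 * four registers of packed 32-bit state words using only AND/OR/XOR/NOT,
 * i.e. on all 32 bit positions of eight blocks in parallel per ymm
 * register.  The S-boxes are split into _1/_2 halves so that other work
 * (e.g. the round-key loads in the SP macro below) can be scheduled
 * between them.
 */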
#define S0_1(x0, x1, x2, x3, x4)      \
        vpor            x0,   x3, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   x3, x4; \
        vpxor           RNOT, x4, x4; \
        vpxor           x1,   tp, x3; \
        vpand           x0,   x1, x1; \
        vpxor           x4,   x1, x1; \
        vpxor           x0,   x2, x2;
#define S0_2(x0, x1, x2, x3, x4)      \
        vpxor           x3,   x0, x0; \
        vpor            x0,   x4, x4; \
        vpxor           x2,   x0, x0; \
        vpand           x1,   x2, x2; \
        vpxor           x2,   x3, x3; \
        vpxor           RNOT, x1, x1; \
        vpxor           x4,   x2, x2; \
        vpxor           x2,   x1, x1;

#define S1_1(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x1, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           RNOT, x3, x3; \
        vpand           tp,   x1, x4; \
        vpor            tp,   x0, x0; \
        vpxor           x2,   x3, x3; \
        vpxor           x3,   x0, x0; \
        vpxor           x3,   tp, x1;
#define S1_2(x0, x1, x2, x3, x4)      \
        vpxor           x4,   x3, x3; \
        vpor            x4,   x1, x1; \
        vpxor           x2,   x4, x4; \
        vpand           x0,   x2, x2; \
        vpxor           x1,   x2, x2; \
        vpor            x0,   x1, x1; \
        vpxor           RNOT, x0, x0; \
        vpxor           x2,   x0, x0; \
        vpxor           x1,   x4, x4;

#define S2_1(x0, x1, x2, x3, x4)      \
        vpxor           RNOT, x3, x3; \
        vpxor           x0,   x1, x1; \
        vpand           x2,   x0, tp; \
        vpxor           x3,   tp, tp; \
        vpor            x0,   x3, x3; \
        vpxor           x1,   x2, x2; \
        vpxor           x1,   x3, x3; \
        vpand           tp,   x1, x1;
#define S2_2(x0, x1, x2, x3, x4)      \
        vpxor           x2,   tp, tp; \
        vpand           x3,   x2, x2; \
        vpor            x1,   x3, x3; \
        vpxor           RNOT, tp, tp; \
        vpxor           tp,   x3, x3; \
        vpxor           tp,   x0, x4; \
        vpxor           x2,   tp, x0; \
        vpor            x2,   x1, x1;

#define S3_1(x0, x1, x2, x3, x4)      \
        vpxor           x3,   x1, tp; \
        vpor            x0,   x3, x3; \
        vpand           x0,   x1, x4; \
        vpxor           x2,   x0, x0; \
        vpxor           tp,   x2, x2; \
        vpand           x3,   tp, x1; \
        vpxor           x3,   x2, x2; \
        vpor            x4,   x0, x0; \
        vpxor           x3,   x4, x4;
#define S3_2(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x1, x1; \
        vpand           x3,   x0, x0; \
        vpand           x4,   x3, x3; \
        vpxor           x2,   x3, x3; \
        vpor            x1,   x4, x4; \
        vpand           x1,   x2, x2; \
        vpxor           x3,   x4, x4; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   x3, x3;

#define S4_1(x0, x1, x2, x3, x4)      \
        vpand           x0,   x3, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   tp, tp; \
        vpor            x3,   x2, x2; \
        vpxor           x1,   x0, x0; \
        vpxor           tp,   x3, x4; \
        vpor            x0,   x2, x2; \
        vpxor           x1,   x2, x2;
#define S4_2(x0, x1, x2, x3, x4)      \
        vpand           x0,   x1, x1; \
        vpxor           x4,   x1, x1; \
        vpand           x2,   x4, x4; \
        vpxor           tp,   x2, x2; \
        vpxor           x0,   x4, x4; \
        vpor            x1,   tp, x3; \
        vpxor           RNOT, x1, x1; \
        vpxor           x0,   x3, x3;

#define S5_1(x0, x1, x2, x3, x4)      \
        vpor            x0,   x1, tp; \
        vpxor           tp,   x2, x2; \
        vpxor           RNOT, x3, x3; \
        vpxor           x0,   x1, x4; \
        vpxor           x2,   x0, x0; \
        vpand           x4,   tp, x1; \
        vpor            x3,   x4, x4; \
        vpxor           x0,   x4, x4;
#define S5_2(x0, x1, x2, x3, x4)      \
        vpand           x3,   x0, x0; \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x3, x3; \
        vpxor           x1,   x0, x0; \
        vpand           x4,   x2, x2; \
        vpxor           x2,   x1, x1; \
        vpand           x0,   x2, x2; \
        vpxor           x2,   x3, x3;

#define S6_1(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x3, x3; \
        vpxor           x2,   x1, tp; \
        vpxor           x0,   x2, x2; \
        vpand           x3,   x0, x0; \
        vpor            x3,   tp, tp; \
        vpxor           RNOT, x1, x4; \
        vpxor           tp,   x0, x0; \
        vpxor           x2,   tp, x1;
#define S6_2(x0, x1, x2, x3, x4)      \
        vpxor           x4,   x3, x3; \
        vpxor           x0,   x4, x4; \
        vpand           x0,   x2, x2; \
        vpxor           x1,   x4, x4; \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x3, x3; \
        vpxor           x0,   x3, x3; \
        vpxor           x2,   x1, x1;

#define S7_1(x0, x1, x2, x3, x4)      \
        vpxor           RNOT, x1, tp; \
        vpxor           RNOT, x0, x0; \
        vpand           x2,   tp, x1; \
        vpxor           x3,   x1, x1; \
        vpor            tp,   x3, x3; \
        vpxor           x2,   tp, x4; \
        vpxor           x3,   x2, x2; \
        vpxor           x0,   x3, x3; \
        vpor            x1,   x0, x0;
#define S7_2(x0, x1, x2, x3, x4)      \
        vpand           x0,   x2, x2; \
        vpxor           x4,   x0, x0; \
        vpxor           x3,   x4, x4; \
        vpand           x0,   x3, x3; \
        vpxor           x1,   x4, x4; \
        vpxor           x4,   x2, x2; \
        vpxor           x1,   x3, x3; \
        vpor            x0,   x4, x4; \
        vpxor           x1,   x4, x4;

#define SI0_1(x0, x1, x2, x3, x4)     \
        vpxor           x0,   x1, x1; \
        vpor            x1,   x3, tp; \
        vpxor           x1,   x3, x4; \
        vpxor           RNOT, x0, x0; \
        vpxor           tp,   x2, x2; \
        vpxor           x0,   tp, x3; \
        vpand           x1,   x0, x0; \
        vpxor           x2,   x0, x0;
#define SI0_2(x0, x1, x2, x3, x4)     \
        vpand           x3,   x2, x2; \
        vpxor           x4,   x3, x3; \
        vpxor           x3,   x2, x2; \
        vpxor           x3,   x1, x1; \
        vpand           x0,   x3, x3; \
        vpxor           x0,   x1, x1; \
        vpxor           x2,   x0, x0; \
        vpxor           x3,   x4, x4;

#define SI1_1(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x0, tp; \
        vpxor           RNOT, x2, x2; \
        vpor            x1,   x0, x4; \
        vpxor           x3,   x4, x4; \
        vpand           x1,   x3, x3; \
        vpxor           x2,   x1, x1; \
        vpand           x4,   x2, x2;
#define SI1_2(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x4, x4; \
        vpor            x3,   x1, x1; \
        vpxor           tp,   x3, x3; \
        vpxor           tp,   x2, x2; \
        vpor            x4,   tp, x0; \
        vpxor           x4,   x2, x2; \
        vpxor           x0,   x1, x1; \
        vpxor           x1,   x4, x4;

#define SI2_1(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x2, x2; \
        vpxor           RNOT, x3, tp; \
        vpor            x2,   tp, tp; \
        vpxor           x3,   x2, x2; \
        vpxor           x0,   x3, x4; \
        vpxor           x1,   tp, x3; \
        vpor            x2,   x1, x1; \
        vpxor           x0,   x2, x2;
#define SI2_2(x0, x1, x2, x3, x4)     \
        vpxor           x4,   x1, x1; \
        vpor            x3,   x4, x4; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   x4, x4; \
        vpand           x1,   x2, x2; \
        vpxor           x3,   x2, x2; \
        vpxor           x4,   x3, x3; \
        vpxor           x0,   x4, x4;

#define SI3_1(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x2, x2; \
        vpand           x2,   x1, tp; \
        vpxor           x0,   tp, tp; \
        vpor            x1,   x0, x0; \
        vpxor           x3,   x1, x4; \
        vpxor           x3,   x0, x0; \
        vpor            tp,   x3, x3; \
        vpxor           x2,   tp, x1;
#define SI3_2(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x0, x0; \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x3, x3; \
        vpxor           x0,   x1, x1; \
        vpand           x2,   x0, x0; \
        vpxor           x3,   x4, x4; \
        vpxor           x0,   x3, x3; \
        vpxor           x1,   x0, x0;

#define SI4_1(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x0, tp; \
        vpxor           x2,   tp, tp; \
        vpor            x3,   x2, x2; \
        vpxor           RNOT, x0, x4; \
        vpxor           tp,   x1, x1; \
        vpxor           x2,   tp, x0; \
        vpand           x4,   x2, x2;
#define SI4_2(x0, x1, x2, x3, x4)     \
        vpxor           x0,   x2, x2; \
        vpor            x4,   x0, x0; \
        vpxor           x3,   x0, x0; \
        vpand           x2,   x3, x3; \
        vpxor           x3,   x4, x4; \
        vpxor           x1,   x3, x3; \
        vpand           x0,   x1, x1; \
        vpxor           x1,   x4, x4; \
        vpxor           x3,   x0, x0;

#define SI5_1(x0, x1, x2, x3, x4)     \
        vpor            x2,   x1, tp; \
        vpxor           x1,   x2, x2; \
        vpxor           x3,   tp, tp; \
        vpand           x1,   x3, x3; \
        vpxor           x3,   x2, x2; \
        vpor            x0,   x3, x3; \
        vpxor           RNOT, x0, x0; \
        vpxor           x2,   x3, x3; \
        vpor            x0,   x2, x2;
#define SI5_2(x0, x1, x2, x3, x4)     \
        vpxor           tp,   x1, x4; \
        vpxor           x4,   x2, x2; \
        vpand           x0,   x4, x4; \
        vpxor           tp,   x0, x0; \
        vpxor           x3,   tp, x1; \
        vpand           x2,   x0, x0; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   x0, x0; \
        vpxor           x4,   x2, x2; \
        vpxor           x3,   x4, x4;

#define SI6_1(x0, x1, x2, x3, x4)     \
        vpxor           x2,   x0, x0; \
        vpand           x3,   x0, tp; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   tp, tp; \
        vpxor           x1,   x3, x3; \
        vpor            x0,   x2, x2; \
        vpxor           x3,   x2, x2; \
        vpand           tp,   x3, x3;
#define SI6_2(x0, x1, x2, x3, x4)     \
        vpxor           RNOT, tp, tp; \
        vpxor           x1,   x3, x3; \
        vpand           x2,   x1, x1; \
        vpxor           tp,   x0, x4; \
        vpxor           x4,   x3, x3; \
        vpxor           x2,   x4, x4; \
        vpxor           x1,   tp, x0; \
        vpxor           x0,   x2, x2;

#define SI7_1(x0, x1, x2, x3, x4)     \
        vpand           x0,   x3, tp; \
        vpxor           x2,   x0, x0; \
        vpor            x3,   x2, x2; \
        vpxor           x1,   x3, x4; \
        vpxor           RNOT, x0, x0; \
        vpor            tp,   x1, x1; \
        vpxor           x0,   x4, x4; \
        vpand           x2,   x0, x0; \
        vpxor           x1,   x0, x0;
#define SI7_2(x0, x1, x2, x3, x4)     \
        vpand           x2,   x1, x1; \
        vpxor           x2,   tp, x3; \
        vpxor           x3,   x4, x4; \
        vpand           x3,   x2, x2; \
        vpor            x0,   x3, x3; \
        vpxor           x4,   x1, x1; \
        vpxor           x4,   x3, x3; \
        vpand           x0,   x4, x4; \
        vpxor           x2,   x4, x4;

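/*
 * Broadcast 32-bit word j of the round-i subkey from the expanded key in
 * ctx to all eight dword lanes of t.
 */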
#define get_key(i,j,t) \
        vpbroadcastd (4*(i)+(j))*4(CTX), t;

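/* XOR the four round-i subkey words into both eight-block streams. */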
#define K2(x0, x1, x2, x3, x4, i) \
        get_key(i, 0, RK0); \
        get_key(i, 1, RK1); \
        get_key(i, 2, RK2); \
        get_key(i, 3, RK3); \
        vpxor RK0,      x0 ## 1, x0 ## 1; \
        vpxor RK1,      x1 ## 1, x1 ## 1; \
        vpxor RK2,      x2 ## 1, x2 ## 1; \
        vpxor RK3,      x3 ## 1, x3 ## 1; \
                vpxor RK0,      x0 ## 2, x0 ## 2; \
                vpxor RK1,      x1 ## 2, x1 ## 2; \
                vpxor RK2,      x2 ## 2, x2 ## 2; \
                vpxor RK3,      x3 ## 2, x3 ## 2;

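/*
 * Serpent linear transformation (rotates by 13, 3, 1, 7, 5 and 22 plus the
 * cross-XORs) fused with the XOR of the round-i subkeys, duplicated for the
 * two eight-block streams.  The get_key calls are spread through the
 * sequence to hide the broadcast-load latency.  Rotates are emulated with
 * shift/shift/or, since AVX2 has no 32-bit vector rotate.
 */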
#define LK2(x0, x1, x2, x3, x4, i) \
        vpslld $13,             x0 ## 1, x4 ## 1;          \
        vpsrld $(32 - 13),      x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x0 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x2 ## 1, x4 ## 1;          \
        vpsrld $(32 - 3),       x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x2 ## 1, x1 ## 1, x1 ## 1; \
                vpslld $13,             x0 ## 2, x4 ## 2;          \
                vpsrld $(32 - 13),      x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x0 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x2 ## 2, x4 ## 2;          \
                vpsrld $(32 - 3),       x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x2 ## 2, x1 ## 2, x1 ## 2; \
        vpslld $1,              x1 ## 1, x4 ## 1;          \
        vpsrld $(32 - 1),       x1 ## 1, x1 ## 1;          \
        vpor                    x4 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x0 ## 1, x4 ## 1;          \
        vpxor                   x2 ## 1, x3 ## 1, x3 ## 1; \
        vpxor                   x4 ## 1, x3 ## 1, x3 ## 1; \
        get_key(i, 1, RK1); \
                vpslld $1,              x1 ## 2, x4 ## 2;          \
                vpsrld $(32 - 1),       x1 ## 2, x1 ## 2;          \
                vpor                    x4 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x0 ## 2, x4 ## 2;          \
                vpxor                   x2 ## 2, x3 ## 2, x3 ## 2; \
                vpxor                   x4 ## 2, x3 ## 2, x3 ## 2; \
                get_key(i, 3, RK3); \
        vpslld $7,              x3 ## 1, x4 ## 1;          \
        vpsrld $(32 - 7),       x3 ## 1, x3 ## 1;          \
        vpor                    x4 ## 1, x3 ## 1, x3 ## 1; \
        vpslld $7,              x1 ## 1, x4 ## 1;          \
        vpxor                   x1 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x3 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x3 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x4 ## 1, x2 ## 1, x2 ## 1; \
        get_key(i, 0, RK0); \
                vpslld $7,              x3 ## 2, x4 ## 2;          \
                vpsrld $(32 - 7),       x3 ## 2, x3 ## 2;          \
                vpor                    x4 ## 2, x3 ## 2, x3 ## 2; \
                vpslld $7,              x1 ## 2, x4 ## 2;          \
                vpxor                   x1 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x3 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x3 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x4 ## 2, x2 ## 2, x2 ## 2; \
                get_key(i, 2, RK2); \
        vpxor                   RK1, x1 ## 1, x1 ## 1;     \
        vpxor                   RK3, x3 ## 1, x3 ## 1;     \
        vpslld $5,              x0 ## 1, x4 ## 1;          \
        vpsrld $(32 - 5),       x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpslld $22,             x2 ## 1, x4 ## 1;          \
        vpsrld $(32 - 22),      x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   RK0, x0 ## 1, x0 ## 1;     \
        vpxor                   RK2, x2 ## 1, x2 ## 1;     \
                vpxor                   RK1, x1 ## 2, x1 ## 2;     \
                vpxor                   RK3, x3 ## 2, x3 ## 2;     \
                vpslld $5,              x0 ## 2, x4 ## 2;          \
                vpsrld $(32 - 5),       x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpslld $22,             x2 ## 2, x4 ## 2;          \
                vpsrld $(32 - 22),      x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   RK0, x0 ## 2, x0 ## 2;     \
                vpxor                   RK2, x2 ## 2, x2 ## 2;

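/*
 * Inverse of LK2 for decryption: undo the round-i key XOR, then apply the
 * inverse linear transformation (the same rotates in the opposite direction
 * and reverse order).
 */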
#define KL2(x0, x1, x2, x3, x4, i) \
        vpxor                   RK0, x0 ## 1, x0 ## 1;     \
        vpxor                   RK2, x2 ## 1, x2 ## 1;     \
        vpsrld $5,              x0 ## 1, x4 ## 1;          \
        vpslld $(32 - 5),       x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   RK3, x3 ## 1, x3 ## 1;     \
        vpxor                   RK1, x1 ## 1, x1 ## 1;     \
        vpsrld $22,             x2 ## 1, x4 ## 1;          \
        vpslld $(32 - 22),      x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x3 ## 1, x2 ## 1, x2 ## 1; \
                vpxor                   RK0, x0 ## 2, x0 ## 2;     \
                vpxor                   RK2, x2 ## 2, x2 ## 2;     \
                vpsrld $5,              x0 ## 2, x4 ## 2;          \
                vpslld $(32 - 5),       x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   RK3, x3 ## 2, x3 ## 2;     \
                vpxor                   RK1, x1 ## 2, x1 ## 2;     \
                vpsrld $22,             x2 ## 2, x4 ## 2;          \
                vpslld $(32 - 22),      x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x3 ## 2, x2 ## 2, x2 ## 2; \
        vpxor                   x3 ## 1, x0 ## 1, x0 ## 1; \
        vpslld $7,              x1 ## 1, x4 ## 1;          \
        vpxor                   x1 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x4 ## 1, x2 ## 1, x2 ## 1; \
        vpsrld $1,              x1 ## 1, x4 ## 1;          \
        vpslld $(32 - 1),       x1 ## 1, x1 ## 1;          \
        vpor                    x4 ## 1, x1 ## 1, x1 ## 1; \
                vpxor                   x3 ## 2, x0 ## 2, x0 ## 2; \
                vpslld $7,              x1 ## 2, x4 ## 2;          \
                vpxor                   x1 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x4 ## 2, x2 ## 2, x2 ## 2; \
                vpsrld $1,              x1 ## 2, x4 ## 2;          \
                vpslld $(32 - 1),       x1 ## 2, x1 ## 2;          \
                vpor                    x4 ## 2, x1 ## 2, x1 ## 2; \
        vpsrld $7,              x3 ## 1, x4 ## 1;          \
        vpslld $(32 - 7),       x3 ## 1, x3 ## 1;          \
        vpor                    x4 ## 1, x3 ## 1, x3 ## 1; \
        vpxor                   x0 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x0 ## 1, x4 ## 1;          \
        vpxor                   x4 ## 1, x3 ## 1, x3 ## 1; \
                vpsrld $7,              x3 ## 2, x4 ## 2;          \
                vpslld $(32 - 7),       x3 ## 2, x3 ## 2;          \
                vpor                    x4 ## 2, x3 ## 2, x3 ## 2; \
                vpxor                   x0 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x0 ## 2, x4 ## 2;          \
                vpxor                   x4 ## 2, x3 ## 2, x3 ## 2; \
        vpsrld $13,             x0 ## 1, x4 ## 1;          \
        vpslld $(32 - 13),      x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x2 ## 1, x1 ## 1, x1 ## 1; \
        vpxor                   x2 ## 1, x3 ## 1, x3 ## 1; \
        vpsrld $3,              x2 ## 1, x4 ## 1;          \
        vpslld $(32 - 3),       x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
                vpsrld $13,             x0 ## 2, x4 ## 2;          \
                vpslld $(32 - 13),      x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x2 ## 2, x1 ## 2, x1 ## 2; \
                vpxor                   x2 ## 2, x3 ## 2, x3 ## 2; \
                vpsrld $3,              x2 ## 2, x4 ## 2;          \
                vpslld $(32 - 3),       x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2;

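/*
 * S applies an S-box to both streams; SP additionally interleaves the
 * broadcast loads of the subkeys consumed by the following KL2 between the
 * S-box halves.
 */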
#define S(SBOX, x0, x1, x2, x3, x4) \
        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
        get_key(i, 0, RK0); \
        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        get_key(i, 2, RK2); \
        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        get_key(i, 3, RK3); \
        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
        get_key(i, 1, RK1); \
        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

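/*
 * Transpose a 4x4 matrix of 32-bit words within each 128-bit lane,
 * converting between the per-block in-memory layout and the word-sliced
 * layout the round functions operate on.
 */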
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
        vpunpckldq              x1, x0, t0; \
        vpunpckhdq              x1, x0, t2; \
        vpunpckldq              x3, x2, t1; \
        vpunpckhdq              x3, x2, x3; \
        \
        vpunpcklqdq             t1, t0, x0; \
        vpunpckhqdq             t1, t0, x1; \
        vpunpcklqdq             x3, t2, x2; \
        vpunpckhqdq             x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

.align 8
__serpent_enc_blk16:
        /* input:
         *      %rdi: ctx, CTX
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
         * output:
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
         */

        vpcmpeqd RNOT, RNOT, RNOT;

        read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

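        /*
         * No data is moved between rounds: the output ordering that each
         * S-box imposes on the state is instead absorbed by renaming the
         * register arguments from one round to the next.
         */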
                                                 K2(RA, RB, RC, RD, RE, 0);
        S(S0, RA, RB, RC, RD, RE);              LK2(RC, RB, RD, RA, RE, 1);
        S(S1, RC, RB, RD, RA, RE);              LK2(RE, RD, RA, RC, RB, 2);
        S(S2, RE, RD, RA, RC, RB);              LK2(RB, RD, RE, RC, RA, 3);
        S(S3, RB, RD, RE, RC, RA);              LK2(RC, RA, RD, RB, RE, 4);
        S(S4, RC, RA, RD, RB, RE);              LK2(RA, RD, RB, RE, RC, 5);
        S(S5, RA, RD, RB, RE, RC);              LK2(RC, RA, RD, RE, RB, 6);
        S(S6, RC, RA, RD, RE, RB);              LK2(RD, RB, RA, RE, RC, 7);
        S(S7, RD, RB, RA, RE, RC);              LK2(RC, RA, RE, RD, RB, 8);
        S(S0, RC, RA, RE, RD, RB);              LK2(RE, RA, RD, RC, RB, 9);
        S(S1, RE, RA, RD, RC, RB);              LK2(RB, RD, RC, RE, RA, 10);
        S(S2, RB, RD, RC, RE, RA);              LK2(RA, RD, RB, RE, RC, 11);
        S(S3, RA, RD, RB, RE, RC);              LK2(RE, RC, RD, RA, RB, 12);
        S(S4, RE, RC, RD, RA, RB);              LK2(RC, RD, RA, RB, RE, 13);
        S(S5, RC, RD, RA, RB, RE);              LK2(RE, RC, RD, RB, RA, 14);
        S(S6, RE, RC, RD, RB, RA);              LK2(RD, RA, RC, RB, RE, 15);
        S(S7, RD, RA, RC, RB, RE);              LK2(RE, RC, RB, RD, RA, 16);
        S(S0, RE, RC, RB, RD, RA);              LK2(RB, RC, RD, RE, RA, 17);
        S(S1, RB, RC, RD, RE, RA);              LK2(RA, RD, RE, RB, RC, 18);
        S(S2, RA, RD, RE, RB, RC);              LK2(RC, RD, RA, RB, RE, 19);
        S(S3, RC, RD, RA, RB, RE);              LK2(RB, RE, RD, RC, RA, 20);
        S(S4, RB, RE, RD, RC, RA);              LK2(RE, RD, RC, RA, RB, 21);
        S(S5, RE, RD, RC, RA, RB);              LK2(RB, RE, RD, RA, RC, 22);
        S(S6, RB, RE, RD, RA, RC);              LK2(RD, RC, RE, RA, RB, 23);
        S(S7, RD, RC, RE, RA, RB);              LK2(RB, RE, RA, RD, RC, 24);
        S(S0, RB, RE, RA, RD, RC);              LK2(RA, RE, RD, RB, RC, 25);
        S(S1, RA, RE, RD, RB, RC);              LK2(RC, RD, RB, RA, RE, 26);
        S(S2, RC, RD, RB, RA, RE);              LK2(RE, RD, RC, RA, RB, 27);
        S(S3, RE, RD, RC, RA, RB);              LK2(RA, RB, RD, RE, RC, 28);
        S(S4, RA, RB, RD, RE, RC);              LK2(RB, RD, RE, RC, RA, 29);
        S(S5, RB, RD, RE, RC, RA);              LK2(RA, RB, RD, RC, RE, 30);
        S(S6, RA, RB, RD, RC, RE);              LK2(RD, RE, RB, RC, RA, 31);
        S(S7, RD, RE, RB, RC, RA);               K2(RA, RB, RC, RD, RE, 32);

        write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

        ret;
ENDPROC(__serpent_enc_blk16)

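/*
 * Decryption: the inverse S-boxes SI7..SI0 are applied in reverse round
 * order, with KL2 undoing the key mixing and linear transformation of each
 * round.
 */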
.align 8
__serpent_dec_blk16:
        /* input:
         *      %rdi: ctx, CTX
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
         * output:
         *      RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
         */

        vpcmpeqd RNOT, RNOT, RNOT;

        read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

                                                 K2(RA, RB, RC, RD, RE, 32);
        SP(SI7, RA, RB, RC, RD, RE, 31);        KL2(RB, RD, RA, RE, RC, 31);
        SP(SI6, RB, RD, RA, RE, RC, 30);        KL2(RA, RC, RE, RB, RD, 30);
        SP(SI5, RA, RC, RE, RB, RD, 29);        KL2(RC, RD, RA, RE, RB, 29);
        SP(SI4, RC, RD, RA, RE, RB, 28);        KL2(RC, RA, RB, RE, RD, 28);
        SP(SI3, RC, RA, RB, RE, RD, 27);        KL2(RB, RC, RD, RE, RA, 27);
        SP(SI2, RB, RC, RD, RE, RA, 26);        KL2(RC, RA, RE, RD, RB, 26);
        SP(SI1, RC, RA, RE, RD, RB, 25);        KL2(RB, RA, RE, RD, RC, 25);
        SP(SI0, RB, RA, RE, RD, RC, 24);        KL2(RE, RC, RA, RB, RD, 24);
        SP(SI7, RE, RC, RA, RB, RD, 23);        KL2(RC, RB, RE, RD, RA, 23);
        SP(SI6, RC, RB, RE, RD, RA, 22);        KL2(RE, RA, RD, RC, RB, 22);
        SP(SI5, RE, RA, RD, RC, RB, 21);        KL2(RA, RB, RE, RD, RC, 21);
        SP(SI4, RA, RB, RE, RD, RC, 20);        KL2(RA, RE, RC, RD, RB, 20);
        SP(SI3, RA, RE, RC, RD, RB, 19);        KL2(RC, RA, RB, RD, RE, 19);
        SP(SI2, RC, RA, RB, RD, RE, 18);        KL2(RA, RE, RD, RB, RC, 18);
        SP(SI1, RA, RE, RD, RB, RC, 17);        KL2(RC, RE, RD, RB, RA, 17);
        SP(SI0, RC, RE, RD, RB, RA, 16);        KL2(RD, RA, RE, RC, RB, 16);
        SP(SI7, RD, RA, RE, RC, RB, 15);        KL2(RA, RC, RD, RB, RE, 15);
        SP(SI6, RA, RC, RD, RB, RE, 14);        KL2(RD, RE, RB, RA, RC, 14);
        SP(SI5, RD, RE, RB, RA, RC, 13);        KL2(RE, RC, RD, RB, RA, 13);
        SP(SI4, RE, RC, RD, RB, RA, 12);        KL2(RE, RD, RA, RB, RC, 12);
        SP(SI3, RE, RD, RA, RB, RC, 11);        KL2(RA, RE, RC, RB, RD, 11);
        SP(SI2, RA, RE, RC, RB, RD, 10);        KL2(RE, RD, RB, RC, RA, 10);
        SP(SI1, RE, RD, RB, RC, RA, 9);         KL2(RA, RD, RB, RC, RE, 9);
        SP(SI0, RA, RD, RB, RC, RE, 8);         KL2(RB, RE, RD, RA, RC, 8);
        SP(SI7, RB, RE, RD, RA, RC, 7);         KL2(RE, RA, RB, RC, RD, 7);
        SP(SI6, RE, RA, RB, RC, RD, 6);         KL2(RB, RD, RC, RE, RA, 6);
        SP(SI5, RB, RD, RC, RE, RA, 5);         KL2(RD, RA, RB, RC, RE, 5);
        SP(SI4, RD, RA, RB, RC, RE, 4);         KL2(RD, RB, RE, RC, RA, 4);
        SP(SI3, RD, RB, RE, RC, RA, 3);         KL2(RE, RD, RA, RC, RB, 3);
        SP(SI2, RE, RD, RA, RC, RB, 2);         KL2(RD, RB, RC, RA, RE, 2);
        SP(SI1, RD, RB, RC, RA, RE, 1);         KL2(RE, RB, RC, RA, RD, 1);
        S(SI0, RE, RB, RC, RA, RD);              K2(RC, RD, RB, RE, RA, 0);

        write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
        write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

        ret;
ENDPROC(__serpent_dec_blk16)

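/*
 * The ABI wrappers below all follow the same pattern: vzeroupper avoids
 * AVX-to-legacy-SSE transition penalties around the call, and
 * FRAME_BEGIN/FRAME_END set up a frame pointer for the call into the
 * internal block function so the stack remains unwindable.
 */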
ENTRY(serpent_ecb_enc_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_enc_blk16;

        store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_ecb_enc_16way)

ENTRY(serpent_ecb_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_dec_blk16;

        store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_ecb_dec_16way)

ENTRY(serpent_cbc_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_dec_blk16;

        store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
                        RK0);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_cbc_dec_16way)

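/*
 * CTR mode: load_ctr_16way (glue_helper-asm-avx2.S) materializes the 16
 * consecutive counter blocks, byte-swapped via .Lbswap128_mask; the
 * encrypted counters are then XORed with src into dst by store_ctr_16way.
 */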
ENTRY(serpent_ctr_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (little endian, 128-bit)
         */
        FRAME_BEGIN

        vzeroupper;

        load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
                       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
                       tp);

        call __serpent_enc_blk16;

        store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_ctr_16way)

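/*
 * For XTS, load_xts_16way (see glue_helper-asm-avx2.S) derives the 16
 * consecutive tweaks from the input tweak in %rcx by repeated GF(2^128)
 * multiplication with α using the masks defined above, XORs them into the
 * source blocks, and parks them in dst for the final XOR performed by
 * store_xts_16way.
 */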
ENTRY(serpent_xts_enc_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
         */
        FRAME_BEGIN

        vzeroupper;

        load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
                       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
                       .Lxts_gf128mul_and_shl1_mask_0,
                       .Lxts_gf128mul_and_shl1_mask_1);

        call __serpent_enc_blk16;

        store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_xts_enc_16way)

ENTRY(serpent_xts_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
         */
        FRAME_BEGIN

        vzeroupper;

        load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
                       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
                       .Lxts_gf128mul_and_shl1_mask_0,
                       .Lxts_gf128mul_and_shl1_mask_1);

        call __serpent_dec_blk16;

        store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_xts_dec_16way)