/*
 * x86_64/AVX/AES-NI assembler implementation of Camellia (16-way parallel)
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi

/**********************************************************************
  16-way camellia
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
        vpand x, mask4bit, tmp0; \
        vpandn x, mask4bit, x; \
        vpsrld $4, x, x; \
        \
        vpshufb tmp0, lo_t, tmp0; \
        vpshufb x, hi_t, x; \
        vpxor tmp0, x, x;
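
/*
 * Added sketch (not part of the original source): filter_8bit() applies an
 * 8-bit affine transform as two 4-bit table lookups via vpshufb.  Per byte
 * lane, in C-like pseudo-code:
 *
 *      out = lo_t[x & 0x0f] ^ hi_t[(x >> 4) & 0x0f];
 *
 * mask4bit is expected to hold 0x0f in every byte (.L0f0f0f0f).
 */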

/*
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer storing CD state
 *   key: index for key material
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
                  t7, mem_cd, key) \
        /* \
         * S-function with AES subbytes \
         */ \
        vmovdqa .Linv_shift_row, t4; \
        vbroadcastss .L0f0f0f0f, t7; \
        vmovdqa .Lpre_tf_lo_s1, t0; \
        vmovdqa .Lpre_tf_hi_s1, t1; \
        \
        /* AES inverse shift rows */ \
        vpshufb t4, x0, x0; \
        vpshufb t4, x7, x7; \
        vpshufb t4, x1, x1; \
        vpshufb t4, x4, x4; \
        vpshufb t4, x2, x2; \
        vpshufb t4, x5, x5; \
        vpshufb t4, x3, x3; \
        vpshufb t4, x6, x6; \
        \
        /* prefilter sboxes 1, 2 and 3 */ \
        vmovdqa .Lpre_tf_lo_s4, t2; \
        vmovdqa .Lpre_tf_hi_s4, t3; \
        filter_8bit(x0, t0, t1, t7, t6); \
        filter_8bit(x7, t0, t1, t7, t6); \
        filter_8bit(x1, t0, t1, t7, t6); \
        filter_8bit(x4, t0, t1, t7, t6); \
        filter_8bit(x2, t0, t1, t7, t6); \
        filter_8bit(x5, t0, t1, t7, t6); \
        \
        /* prefilter sbox 4 */ \
        vpxor t4, t4, t4; \
        filter_8bit(x3, t2, t3, t7, t6); \
        filter_8bit(x6, t2, t3, t7, t6); \
        \
        /* AES subbytes + AES shift rows */ \
        vmovdqa .Lpost_tf_lo_s1, t0; \
        vmovdqa .Lpost_tf_hi_s1, t1; \
        vaesenclast t4, x0, x0; \
        vaesenclast t4, x7, x7; \
        vaesenclast t4, x1, x1; \
        vaesenclast t4, x4, x4; \
        vaesenclast t4, x2, x2; \
        vaesenclast t4, x5, x5; \
        vaesenclast t4, x3, x3; \
        vaesenclast t4, x6, x6; \
        \
        /* postfilter sboxes 1 and 4 */ \
        vmovdqa .Lpost_tf_lo_s3, t2; \
        vmovdqa .Lpost_tf_hi_s3, t3; \
        filter_8bit(x0, t0, t1, t7, t6); \
        filter_8bit(x7, t0, t1, t7, t6); \
        filter_8bit(x3, t0, t1, t7, t6); \
        filter_8bit(x6, t0, t1, t7, t6); \
        \
        /* postfilter sbox 3 */ \
        vmovdqa .Lpost_tf_lo_s2, t4; \
        vmovdqa .Lpost_tf_hi_s2, t5; \
        filter_8bit(x2, t2, t3, t7, t6); \
        filter_8bit(x5, t2, t3, t7, t6); \
        \
        vpxor t6, t6, t6; \
        vmovq key, t0; \
        \
        /* postfilter sbox 2 */ \
        filter_8bit(x1, t4, t5, t7, t2); \
        filter_8bit(x4, t4, t5, t7, t2); \
        \
        vpsrldq $5, t0, t5; \
        vpsrldq $1, t0, t1; \
        vpsrldq $2, t0, t2; \
        vpsrldq $3, t0, t3; \
        vpsrldq $4, t0, t4; \
        vpshufb t6, t0, t0; \
        vpshufb t6, t1, t1; \
        vpshufb t6, t2, t2; \
        vpshufb t6, t3, t3; \
        vpshufb t6, t4, t4; \
        vpsrldq $2, t5, t7; \
        vpshufb t6, t7, t7; \
        \
        /* \
         * P-function \
         */ \
        vpxor x5, x0, x0; \
        vpxor x6, x1, x1; \
        vpxor x7, x2, x2; \
        vpxor x4, x3, x3; \
        \
        vpxor x2, x4, x4; \
        vpxor x3, x5, x5; \
        vpxor x0, x6, x6; \
        vpxor x1, x7, x7; \
        \
        vpxor x7, x0, x0; \
        vpxor x4, x1, x1; \
        vpxor x5, x2, x2; \
        vpxor x6, x3, x3; \
        \
        vpxor x3, x4, x4; \
        vpxor x0, x5, x5; \
        vpxor x1, x6, x6; \
        vpxor x2, x7, x7; /* note: high and low parts swapped */ \
        \
        /* \
         * Add key material and result to CD (x becomes new CD) \
         */ \
        \
        vpxor t3, x4, x4; \
        vpxor 0 * 16(mem_cd), x4, x4; \
        \
        vpxor t2, x5, x5; \
        vpxor 1 * 16(mem_cd), x5, x5; \
        \
        vpsrldq $1, t5, t3; \
        vpshufb t6, t5, t5; \
        vpshufb t6, t3, t6; \
        \
        vpxor t1, x6, x6; \
        vpxor 2 * 16(mem_cd), x6, x6; \
        \
        vpxor t0, x7, x7; \
        vpxor 3 * 16(mem_cd), x7, x7; \
        \
        vpxor t7, x0, x0; \
        vpxor 4 * 16(mem_cd), x0, x0; \
        \
        vpxor t6, x1, x1; \
        vpxor 5 * 16(mem_cd), x1, x1; \
        \
        vpxor t5, x2, x2; \
        vpxor 6 * 16(mem_cd), x2, x2; \
        \
        vpxor t4, x3, x3; \
        vpxor 7 * 16(mem_cd), x3, x3;
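
/*
 * Added commentary (not from the original source): the Camellia s-boxes share
 * the GF(2^8) inversion at the core of the AES s-box, so roundsm16() computes
 * them with vaesenclast using an all-zero round key (leaving only ShiftRows,
 * SubBytes and a no-op key addition), bracketed by the affine pre-/post-filter
 * tables above.  The .Linv_shift_row shuffle applied first cancels the
 * ShiftRows step performed by vaesenclast.
 */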

.align 8
roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
        roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
                  %rcx, (%r9));
        ret;
ENDPROC(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
        roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
                  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
                  %rax, (%r9));
        ret;
ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
        leaq (key_table + (i) * 8)(CTX), %r9; \
        call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
        \
        vmovdqu x4, 0 * 16(mem_cd); \
        vmovdqu x5, 1 * 16(mem_cd); \
        vmovdqu x6, 2 * 16(mem_cd); \
        vmovdqu x7, 3 * 16(mem_cd); \
        vmovdqu x0, 4 * 16(mem_cd); \
        vmovdqu x1, 5 * 16(mem_cd); \
        vmovdqu x2, 6 * 16(mem_cd); \
        vmovdqu x3, 7 * 16(mem_cd); \
        \
        leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
        call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
        \
        store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
        /* Store new AB state */ \
        vmovdqu x0, 0 * 16(mem_ab); \
        vmovdqu x1, 1 * 16(mem_ab); \
        vmovdqu x2, 2 * 16(mem_ab); \
        vmovdqu x3, 3 * 16(mem_ab); \
        vmovdqu x4, 4 * 16(mem_ab); \
        vmovdqu x5, 5 * 16(mem_ab); \
        vmovdqu x6, 6 * 16(mem_ab); \
        vmovdqu x7, 7 * 16(mem_ab);

#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, i) \
        two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
        two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
        two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, i) \
        two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
        two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
        two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);

/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
        vpcmpgtb v0, zero, t0; \
        vpaddb v0, v0, v0; \
        vpabsb t0, t0; \
        \
        vpcmpgtb v1, zero, t1; \
        vpaddb v1, v1, v1; \
        vpabsb t1, t1; \
        \
        vpcmpgtb v2, zero, t2; \
        vpaddb v2, v2, v2; \
        vpabsb t2, t2; \
        \
        vpor t0, v1, v1; \
        \
        vpcmpgtb v3, zero, t0; \
        vpaddb v3, v3, v3; \
        vpabsb t0, t0; \
        \
        vpor t1, v2, v2; \
        vpor t2, v3, v3; \
        vpor t0, v0, v0;
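
/*
 * Added sketch (not from the original source): with v0..v3 holding the four
 * byte slices of sixteen 32-bit words, the sequence above is equivalent to
 * rotating every word left by one bit:
 *
 *      carry = v >> 31;
 *      v = (v << 1) | carry;
 *
 * Each vpcmpgtb/vpabsb pair extracts the per-byte MSB as 0x01, vpaddb doubles
 * each byte (a shift left by one), and the vpor's feed every carry into the
 * next byte slice, wrapping around from the last slice to the first.
 */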

/*
 * IN:
 *  l, r: registers pointing at the two byte-sliced state halves in memory
 *  l0..l7: byte-sliced left half, also held in registers
 *  kll, klr, krl, krr: FL/FL^-1 subkey words
 * OUT:
 *  l0..l7 and the memory at l, r: state after the FL/FL^-1 layer
 */
#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
              tt1, tt2, tt3, kll, klr, krl, krr) \
        /* \
         * t0 = kll; \
         * t0 &= ll; \
         * lr ^= rol32(t0, 1); \
         */ \
        vpxor tt0, tt0, tt0; \
        vmovd kll, t0; \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpand l0, t0, t0; \
        vpand l1, t1, t1; \
        vpand l2, t2, t2; \
        vpand l3, t3, t3; \
        \
        rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
        \
        vpxor l4, t0, l4; \
        vmovdqu l4, 4 * 16(l); \
        vpxor l5, t1, l5; \
        vmovdqu l5, 5 * 16(l); \
        vpxor l6, t2, l6; \
        vmovdqu l6, 6 * 16(l); \
        vpxor l7, t3, l7; \
        vmovdqu l7, 7 * 16(l); \
        \
        /* \
         * t2 = krr; \
         * t2 |= rr; \
         * rl ^= t2; \
         */ \
        \
        vmovd krr, t0; \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpor 4 * 16(r), t0, t0; \
        vpor 5 * 16(r), t1, t1; \
        vpor 6 * 16(r), t2, t2; \
        vpor 7 * 16(r), t3, t3; \
        \
        vpxor 0 * 16(r), t0, t0; \
        vpxor 1 * 16(r), t1, t1; \
        vpxor 2 * 16(r), t2, t2; \
        vpxor 3 * 16(r), t3, t3; \
        vmovdqu t0, 0 * 16(r); \
        vmovdqu t1, 1 * 16(r); \
        vmovdqu t2, 2 * 16(r); \
        vmovdqu t3, 3 * 16(r); \
        \
        /* \
         * t2 = krl; \
         * t2 &= rl; \
         * rr ^= rol32(t2, 1); \
         */ \
        vmovd krl, t0; \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpand 0 * 16(r), t0, t0; \
        vpand 1 * 16(r), t1, t1; \
        vpand 2 * 16(r), t2, t2; \
        vpand 3 * 16(r), t3, t3; \
        \
        rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
        \
        vpxor 4 * 16(r), t0, t0; \
        vpxor 5 * 16(r), t1, t1; \
        vpxor 6 * 16(r), t2, t2; \
        vpxor 7 * 16(r), t3, t3; \
        vmovdqu t0, 4 * 16(r); \
        vmovdqu t1, 5 * 16(r); \
        vmovdqu t2, 6 * 16(r); \
        vmovdqu t3, 7 * 16(r); \
        \
        /* \
         * t0 = klr; \
         * t0 |= lr; \
         * ll ^= t0; \
         */ \
        \
        vmovd klr, t0; \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpor l4, t0, t0; \
        vpor l5, t1, t1; \
        vpor l6, t2, t2; \
        vpor l7, t3, t3; \
        \
        vpxor l0, t0, l0; \
        vmovdqu l0, 0 * 16(l); \
        vpxor l1, t1, l1; \
        vmovdqu l1, 1 * 16(l); \
        vpxor l2, t2, l2; \
        vmovdqu l2, 2 * 16(l); \
        vpxor l3, t3, l3; \
        vmovdqu l3, 3 * 16(l);

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
        vpunpckhdq x1, x0, t2; \
        vpunpckldq x1, x0, x0; \
        \
        vpunpckldq x3, x2, t1; \
        vpunpckhdq x3, x2, x2; \
        \
        vpunpckhqdq t1, x0, x1; \
        vpunpcklqdq t1, x0, x0; \
        \
        vpunpckhqdq x2, t2, x3; \
        vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
                         b3, c3, d3, st0, st1) \
        vmovdqu d2, st0; \
        vmovdqu d3, st1; \
        transpose_4x4(a0, a1, a2, a3, d2, d3); \
        transpose_4x4(b0, b1, b2, b3, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu a0, st0; \
        vmovdqu a1, st1; \
        transpose_4x4(c0, c1, c2, c3, a0, a1); \
        transpose_4x4(d0, d1, d2, d3, a0, a1); \
        \
        vmovdqu .Lshufb_16x16b, a0; \
        vmovdqu st1, a1; \
        vpshufb a0, a2, a2; \
        vpshufb a0, a3, a3; \
        vpshufb a0, b0, b0; \
        vpshufb a0, b1, b1; \
        vpshufb a0, b2, b2; \
        vpshufb a0, b3, b3; \
        vpshufb a0, a1, a1; \
        vpshufb a0, c0, c0; \
        vpshufb a0, c1, c1; \
        vpshufb a0, c2, c2; \
        vpshufb a0, c3, c3; \
        vpshufb a0, d0, d0; \
        vpshufb a0, d1, d1; \
        vpshufb a0, d2, d2; \
        vpshufb a0, d3, d3; \
        vmovdqu d3, st1; \
        vmovdqu st0, d3; \
        vpshufb a0, d3, a0; \
        vmovdqu d2, st0; \
        \
        transpose_4x4(a0, b0, c0, d0, d2, d3); \
        transpose_4x4(a1, b1, c1, d1, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu b0, st0; \
        vmovdqu b1, st1; \
        transpose_4x4(a2, b2, c2, d2, b0, b1); \
        transpose_4x4(a3, b3, c3, d3, b0, b1); \
        vmovdqu st0, b0; \
        vmovdqu st1, b1; \
        /* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                     y6, y7, rio, key) \
        vmovq key, x0; \
        vpshufb .Lpack_bswap, x0, x0; \
        \
        vpxor 0 * 16(rio), x0, y7; \
        vpxor 1 * 16(rio), x0, y6; \
        vpxor 2 * 16(rio), x0, y5; \
        vpxor 3 * 16(rio), x0, y4; \
        vpxor 4 * 16(rio), x0, y3; \
        vpxor 5 * 16(rio), x0, y2; \
        vpxor 6 * 16(rio), x0, y1; \
        vpxor 7 * 16(rio), x0, y0; \
        vpxor 8 * 16(rio), x0, x7; \
        vpxor 9 * 16(rio), x0, x6; \
        vpxor 10 * 16(rio), x0, x5; \
        vpxor 11 * 16(rio), x0, x4; \
        vpxor 12 * 16(rio), x0, x3; \
        vpxor 13 * 16(rio), x0, x2; \
        vpxor 14 * 16(rio), x0, x1; \
        vpxor 15 * 16(rio), x0, x0;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd) \
        byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
                         y5, y6, y7, (mem_ab), (mem_cd)); \
        \
        vmovdqu x0, 0 * 16(mem_ab); \
        vmovdqu x1, 1 * 16(mem_ab); \
        vmovdqu x2, 2 * 16(mem_ab); \
        vmovdqu x3, 3 * 16(mem_ab); \
        vmovdqu x4, 4 * 16(mem_ab); \
        vmovdqu x5, 5 * 16(mem_ab); \
        vmovdqu x6, 6 * 16(mem_ab); \
        vmovdqu x7, 7 * 16(mem_ab); \
        vmovdqu y0, 0 * 16(mem_cd); \
        vmovdqu y1, 1 * 16(mem_cd); \
        vmovdqu y2, 2 * 16(mem_cd); \
        vmovdqu y3, 3 * 16(mem_cd); \
        vmovdqu y4, 4 * 16(mem_cd); \
        vmovdqu y5, 5 * 16(mem_cd); \
        vmovdqu y6, 6 * 16(mem_cd); \
        vmovdqu y7, 7 * 16(mem_cd);

/* de-byteslice, apply post-whitening and store blocks */
#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
                    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
        byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
                         y7, x3, x7, stack_tmp0, stack_tmp1); \
        \
        vmovdqu x0, stack_tmp0; \
        \
        vmovq key, x0; \
        vpshufb .Lpack_bswap, x0, x0; \
        \
        vpxor x0, y7, y7; \
        vpxor x0, y6, y6; \
        vpxor x0, y5, y5; \
        vpxor x0, y4, y4; \
        vpxor x0, y3, y3; \
        vpxor x0, y2, y2; \
        vpxor x0, y1, y1; \
        vpxor x0, y0, y0; \
        vpxor x0, x7, x7; \
        vpxor x0, x6, x6; \
        vpxor x0, x5, x5; \
        vpxor x0, x4, x4; \
        vpxor x0, x3, x3; \
        vpxor x0, x2, x2; \
        vpxor x0, x1, x1; \
        vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                     y6, y7, rio) \
        vmovdqu x0, 0 * 16(rio); \
        vmovdqu x1, 1 * 16(rio); \
        vmovdqu x2, 2 * 16(rio); \
        vmovdqu x3, 3 * 16(rio); \
        vmovdqu x4, 4 * 16(rio); \
        vmovdqu x5, 5 * 16(rio); \
        vmovdqu x6, 6 * 16(rio); \
        vmovdqu x7, 7 * 16(rio); \
        vmovdqu y0, 8 * 16(rio); \
        vmovdqu y1, 9 * 16(rio); \
        vmovdqu y2, 10 * 16(rio); \
        vmovdqu y3, 11 * 16(rio); \
        vmovdqu y4, 12 * 16(rio); \
        vmovdqu y5, 13 * 16(rio); \
        vmovdqu y6, 14 * 16(rio); \
        vmovdqu y7, 15 * 16(rio);

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section        .rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
        0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);

.Lpack_bswap:
        .long 0x00010203
        .long 0x04050607
        .long 0x80808080
        .long 0x80808080

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode IV generation */
.Lxts_gf128mul_and_shl1_mask:
        .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

/*
 * pre-SubByte transform: input filter for s-boxes 1, 2 and 3
 * (affine map from the Camellia field representation into the AES field,
 * so the shared GF(2^8) inversion can be done with vaesenclast)
 */
.Lpre_tf_lo_s1:
        .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
        .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
        .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
        .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform: input filter for s-box 4
 * (as above, with the input rotate of s4(x) = s1(x <<< 1) folded in)
 */
.Lpre_tf_lo_s4:
        .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
        .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
        .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
        .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform: output filter for s-boxes 1 and 4
 * (affine map from the AES field back to the Camellia representation)
 */
.Lpost_tf_lo_s1:
        .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
        .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
        .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
        .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform: output filter for s-box 2
 * (s2(x) = s1(x) <<< 1 is folded into this filter)
 */
.Lpost_tf_lo_s2:
        .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
        .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
        .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
        .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform: output filter for s-box 3
 * (s3(x) = s1(x) >>> 1 is folded into this filter)
 */
.Lpost_tf_lo_s3:
        .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
        .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
        .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
        .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
        .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
        .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* 4-bit mask */
.section        .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
        .long 0x0f0f0f0f

.text

.align 8
__camellia_enc_blk16:
        /* input:
         *      %rdi: ctx, CTX
         *      %rax: temporary storage, 256 bytes
         *      %xmm0..%xmm15: 16 plaintext blocks
         * output:
         *      %xmm0..%xmm15: 16 encrypted blocks, output order swapped
         *      (see the write_output() calls in the callers)
         */
        FRAME_BEGIN

        leaq 8 * 16(%rax), %rcx;

        inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                      %xmm15, %rax, %rcx);

        enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %rcx, 0);

        fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
              %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
              %xmm15,
              ((key_table + (8) * 8) + 0)(CTX),
              ((key_table + (8) * 8) + 4)(CTX),
              ((key_table + (8) * 8) + 8)(CTX),
              ((key_table + (8) * 8) + 12)(CTX));

        enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %rcx, 8);

        fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
              %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
              %xmm15,
              ((key_table + (16) * 8) + 0)(CTX),
              ((key_table + (16) * 8) + 4)(CTX),
              ((key_table + (16) * 8) + 8)(CTX),
              ((key_table + (16) * 8) + 12)(CTX));

        enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %rcx, 16);

        movl $24, %r8d;
        cmpl $16, key_length(CTX);
        jne .Lenc_max32;

.Lenc_done:
        /* load CD for output */
        vmovdqu 0 * 16(%rcx), %xmm8;
        vmovdqu 1 * 16(%rcx), %xmm9;
        vmovdqu 2 * 16(%rcx), %xmm10;
        vmovdqu 3 * 16(%rcx), %xmm11;
        vmovdqu 4 * 16(%rcx), %xmm12;
        vmovdqu 5 * 16(%rcx), %xmm13;
        vmovdqu 6 * 16(%rcx), %xmm14;
        vmovdqu 7 * 16(%rcx), %xmm15;

        outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));

        FRAME_END
        ret;

.align 8
.Lenc_max32:
        movl $32, %r8d;

        fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
              %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
              %xmm15,
              ((key_table + (24) * 8) + 0)(CTX),
              ((key_table + (24) * 8) + 4)(CTX),
              ((key_table + (24) * 8) + 8)(CTX),
              ((key_table + (24) * 8) + 12)(CTX));

        enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %rcx, 24);

        jmp .Lenc_done;
ENDPROC(__camellia_enc_blk16)

.align 8
__camellia_dec_blk16:
        /* input:
         *      %rdi: ctx, CTX
         *      %rax: temporary storage, 256 bytes
         *      %r8d: 24 for 16 byte key, 32 for larger
         *      %xmm0..%xmm15: 16 encrypted blocks
         * output:
         *      %xmm0..%xmm15: 16 plaintext blocks, output order swapped
         *      (see the write_output() calls in the callers)
         */
        FRAME_BEGIN

        leaq 8 * 16(%rax), %rcx;

        inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                      %xmm15, %rax, %rcx);

        cmpl $32, %r8d;
        je .Ldec_max32;

.Ldec_max24:
        dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %rcx, 16);

        fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
              %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
              %xmm15,
              ((key_table + (16) * 8) + 8)(CTX),
              ((key_table + (16) * 8) + 12)(CTX),
              ((key_table + (16) * 8) + 0)(CTX),
              ((key_table + (16) * 8) + 4)(CTX));

        dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %rcx, 8);

        fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
              %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
              %xmm15,
              ((key_table + (8) * 8) + 8)(CTX),
              ((key_table + (8) * 8) + 12)(CTX),
              ((key_table + (8) * 8) + 0)(CTX),
              ((key_table + (8) * 8) + 4)(CTX));

        dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %rcx, 0);

        /* load CD for output */
        vmovdqu 0 * 16(%rcx), %xmm8;
        vmovdqu 1 * 16(%rcx), %xmm9;
        vmovdqu 2 * 16(%rcx), %xmm10;
        vmovdqu 3 * 16(%rcx), %xmm11;
        vmovdqu 4 * 16(%rcx), %xmm12;
        vmovdqu 5 * 16(%rcx), %xmm13;
        vmovdqu 6 * 16(%rcx), %xmm14;
        vmovdqu 7 * 16(%rcx), %xmm15;

        outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));

        FRAME_END
        ret;

.align 8
.Ldec_max32:
        dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %rcx, 24);

        fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
              %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
              %xmm15,
              ((key_table + (24) * 8) + 8)(CTX),
              ((key_table + (24) * 8) + 12)(CTX),
              ((key_table + (24) * 8) + 0)(CTX),
              ((key_table + (24) * 8) + 4)(CTX));

        jmp .Ldec_max24;
ENDPROC(__camellia_dec_blk16)
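
/*
 * Added note (not part of the original source): the ENTRY() functions below
 * are intended to be called from C glue code.  Their register usage matches
 * prototypes along the lines of (names and ctx type assumed):
 *
 *      asmlinkage void camellia_ecb_enc_16way(void *ctx, u8 *dst, const u8 *src);
 *      asmlinkage void camellia_ecb_dec_16way(void *ctx, u8 *dst, const u8 *src);
 *      asmlinkage void camellia_cbc_dec_16way(void *ctx, u8 *dst, const u8 *src);
 *      asmlinkage void camellia_ctr_16way(void *ctx, u8 *dst, const u8 *src,
 *                                         le128 *iv);
 *      asmlinkage void camellia_xts_enc_16way(void *ctx, u8 *dst, const u8 *src,
 *                                             le128 *iv);
 *      asmlinkage void camellia_xts_dec_16way(void *ctx, u8 *dst, const u8 *src,
 *                                             le128 *iv);
 *
 * with %rdi/%rsi/%rdx/%rcx carrying ctx/dst/src/iv per the x86_64 SysV ABI,
 * and dst/src covering 16 consecutive 16-byte blocks.
 */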

ENTRY(camellia_ecb_enc_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         */
        FRAME_BEGIN

        inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rdx, (key_table)(CTX));

        /* now dst can be used as temporary buffer (even in src == dst case) */
        movq    %rsi, %rax;

        call __camellia_enc_blk16;

        write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
                     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
                     %xmm8, %rsi);

        FRAME_END
        ret;
ENDPROC(camellia_ecb_enc_16way)

ENTRY(camellia_ecb_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         */
        FRAME_BEGIN

        cmpl $16, key_length(CTX);
        movl $32, %r8d;
        movl $24, %eax;
        cmovel %eax, %r8d; /* max */

        inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

        /* now dst can be used as temporary buffer (even in src == dst case) */
        movq    %rsi, %rax;

        call __camellia_dec_blk16;

        write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
                     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
                     %xmm8, %rsi);

        FRAME_END
        ret;
ENDPROC(camellia_ecb_dec_16way)

ENTRY(camellia_cbc_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         */
        FRAME_BEGIN

        cmpl $16, key_length(CTX);
        movl $32, %r8d;
        movl $24, %eax;
        cmovel %eax, %r8d; /* max */

        inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

        /*
         * dst might still be in-use (in case dst == src), so use stack for
         * temporary storage.
         */
        subq $(16 * 16), %rsp;
        movq %rsp, %rax;

        call __camellia_dec_blk16;

        addq $(16 * 16), %rsp;

        vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
        vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
        vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
        vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
        vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
        vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
        vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
        vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
        vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
        vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
        vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
        vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
        vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
        vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
        vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
        write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
                     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
                     %xmm8, %rsi);

        FRAME_END
        ret;
ENDPROC(camellia_cbc_dec_16way)

#define inc_le128(x, minus_one, tmp) \
        vpcmpeqq minus_one, x, tmp; \
        vpsubq minus_one, x, x; \
        vpslldq $8, tmp, tmp; \
        vpsubq tmp, x, x;
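
/*
 * Added sketch (not from the original source): inc_le128() increments a
 * 128-bit little-endian counter held in an xmm register, i.e. roughly:
 *
 *      x.lo += 1;
 *      if (x.lo == 0)
 *              x.hi += 1;      (carry out of the low 64 bits)
 *
 * vpcmpeqq detects an all-ones low quadword before the increment, and the
 * byte-shifted compare result is subtracted from the high quadword to
 * propagate that carry.
 */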

ENTRY(camellia_ctr_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (little endian, 128bit)
         */
        FRAME_BEGIN

        subq $(16 * 16), %rsp;
        movq %rsp, %rax;

        vmovdqa .Lbswap128_mask, %xmm14;

        /* load IV and byteswap */
        vmovdqu (%rcx), %xmm0;
        vpshufb %xmm14, %xmm0, %xmm15;
        vmovdqu %xmm15, 15 * 16(%rax);

        vpcmpeqd %xmm15, %xmm15, %xmm15;
        vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */

        /* construct IVs */
        inc_le128(%xmm0, %xmm15, %xmm13);
        vpshufb %xmm14, %xmm0, %xmm13;
        vmovdqu %xmm13, 14 * 16(%rax);
        inc_le128(%xmm0, %xmm15, %xmm13);
        vpshufb %xmm14, %xmm0, %xmm13;
        vmovdqu %xmm13, 13 * 16(%rax);
        inc_le128(%xmm0, %xmm15, %xmm13);
        vpshufb %xmm14, %xmm0, %xmm12;
        inc_le128(%xmm0, %xmm15, %xmm13);
        vpshufb %xmm14, %xmm0, %xmm11;
        inc_le128(%xmm0, %xmm15, %xmm13);
        vpshufb %xmm14, %xmm0, %xmm10;
        inc_le128(%xmm0, %xmm15, %xmm13);
        vpshufb %xmm14, %xmm0, %xmm9;
        inc_le128(%xmm0, %xmm15, %xmm13);
        vpshufb %xmm14, %xmm0, %xmm8;
        inc_le128(%xmm0, %xmm15, %xmm13);
        vpshufb %xmm14, %xmm0, %xmm7;
        inc_le128(%xmm0, %xmm15, %xmm13);
        vpshufb %xmm14, %xmm0, %xmm6;
        inc_le128(%xmm0, %xmm15, %xmm13);
        vpshufb %xmm14, %xmm0, %xmm5;
        inc_le128(%xmm0, %xmm15, %xmm13);
        vpshufb %xmm14, %xmm0, %xmm4;
        inc_le128(%xmm0, %xmm15, %xmm13);
        vpshufb %xmm14, %xmm0, %xmm3;
        inc_le128(%xmm0, %xmm15, %xmm13);
        vpshufb %xmm14, %xmm0, %xmm2;
        inc_le128(%xmm0, %xmm15, %xmm13);
        vpshufb %xmm14, %xmm0, %xmm1;
        inc_le128(%xmm0, %xmm15, %xmm13);
        vmovdqa %xmm0, %xmm13;
        vpshufb %xmm14, %xmm0, %xmm0;
        inc_le128(%xmm13, %xmm15, %xmm14);
        vmovdqu %xmm13, (%rcx);

        /* inpack16_pre: */
        vmovq (key_table)(CTX), %xmm15;
        vpshufb .Lpack_bswap, %xmm15, %xmm15;
        vpxor %xmm0, %xmm15, %xmm0;
        vpxor %xmm1, %xmm15, %xmm1;
        vpxor %xmm2, %xmm15, %xmm2;
        vpxor %xmm3, %xmm15, %xmm3;
        vpxor %xmm4, %xmm15, %xmm4;
        vpxor %xmm5, %xmm15, %xmm5;
        vpxor %xmm6, %xmm15, %xmm6;
        vpxor %xmm7, %xmm15, %xmm7;
        vpxor %xmm8, %xmm15, %xmm8;
        vpxor %xmm9, %xmm15, %xmm9;
        vpxor %xmm10, %xmm15, %xmm10;
        vpxor %xmm11, %xmm15, %xmm11;
        vpxor %xmm12, %xmm15, %xmm12;
        vpxor 13 * 16(%rax), %xmm15, %xmm13;
        vpxor 14 * 16(%rax), %xmm15, %xmm14;
        vpxor 15 * 16(%rax), %xmm15, %xmm15;

        call __camellia_enc_blk16;

        addq $(16 * 16), %rsp;

        vpxor 0 * 16(%rdx), %xmm7, %xmm7;
        vpxor 1 * 16(%rdx), %xmm6, %xmm6;
        vpxor 2 * 16(%rdx), %xmm5, %xmm5;
        vpxor 3 * 16(%rdx), %xmm4, %xmm4;
        vpxor 4 * 16(%rdx), %xmm3, %xmm3;
        vpxor 5 * 16(%rdx), %xmm2, %xmm2;
        vpxor 6 * 16(%rdx), %xmm1, %xmm1;
        vpxor 7 * 16(%rdx), %xmm0, %xmm0;
        vpxor 8 * 16(%rdx), %xmm15, %xmm15;
        vpxor 9 * 16(%rdx), %xmm14, %xmm14;
        vpxor 10 * 16(%rdx), %xmm13, %xmm13;
        vpxor 11 * 16(%rdx), %xmm12, %xmm12;
        vpxor 12 * 16(%rdx), %xmm11, %xmm11;
        vpxor 13 * 16(%rdx), %xmm10, %xmm10;
        vpxor 14 * 16(%rdx), %xmm9, %xmm9;
        vpxor 15 * 16(%rdx), %xmm8, %xmm8;
        write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
                     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
                     %xmm8, %rsi);

        FRAME_END
        ret;
ENDPROC(camellia_ctr_16way)

#define gf128mul_x_ble(iv, mask, tmp) \
        vpsrad $31, iv, tmp; \
        vpaddq iv, iv, iv; \
        vpshufd $0x13, tmp, tmp; \
        vpand mask, tmp, tmp; \
        vpxor tmp, iv, iv;
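
/*
 * Added sketch (not from the original source): gf128mul_x_ble() multiplies the
 * XTS tweak by x in GF(2^128), using the little-endian block convention:
 *
 *      carry = (iv.hi >> 63) ? 0x87 : 0;
 *      iv.hi = (iv.hi << 1) | (iv.lo >> 63);
 *      iv.lo = (iv.lo << 1) ^ carry;
 *
 * vpsrad/vpshufd build per-lane sign masks, and .Lxts_gf128mul_and_shl1_mask
 * supplies both the 0x87 reduction constant for the low quadword and the
 * bit-63 carry into the high quadword.
 */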

.align 8
camellia_xts_crypt_16way:
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (XTS tweak)
         *      %r8: index for input whitening key
         *      %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
         */
        FRAME_BEGIN

        subq $(16 * 16), %rsp;
        movq %rsp, %rax;

        vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;

        /* load IV */
        vmovdqu (%rcx), %xmm0;
        vpxor 0 * 16(%rdx), %xmm0, %xmm15;
        vmovdqu %xmm15, 15 * 16(%rax);
        vmovdqu %xmm0, 0 * 16(%rsi);

        /* construct IVs */
        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
        vpxor 1 * 16(%rdx), %xmm0, %xmm15;
        vmovdqu %xmm15, 14 * 16(%rax);
        vmovdqu %xmm0, 1 * 16(%rsi);

        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
        vpxor 2 * 16(%rdx), %xmm0, %xmm13;
        vmovdqu %xmm0, 2 * 16(%rsi);

        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
        vpxor 3 * 16(%rdx), %xmm0, %xmm12;
        vmovdqu %xmm0, 3 * 16(%rsi);

        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
        vpxor 4 * 16(%rdx), %xmm0, %xmm11;
        vmovdqu %xmm0, 4 * 16(%rsi);

        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
        vpxor 5 * 16(%rdx), %xmm0, %xmm10;
        vmovdqu %xmm0, 5 * 16(%rsi);

        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
        vpxor 6 * 16(%rdx), %xmm0, %xmm9;
        vmovdqu %xmm0, 6 * 16(%rsi);

        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
        vpxor 7 * 16(%rdx), %xmm0, %xmm8;
        vmovdqu %xmm0, 7 * 16(%rsi);

        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
        vpxor 8 * 16(%rdx), %xmm0, %xmm7;
        vmovdqu %xmm0, 8 * 16(%rsi);

        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
        vpxor 9 * 16(%rdx), %xmm0, %xmm6;
        vmovdqu %xmm0, 9 * 16(%rsi);

        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
        vpxor 10 * 16(%rdx), %xmm0, %xmm5;
        vmovdqu %xmm0, 10 * 16(%rsi);

        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
        vpxor 11 * 16(%rdx), %xmm0, %xmm4;
        vmovdqu %xmm0, 11 * 16(%rsi);

        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
        vpxor 12 * 16(%rdx), %xmm0, %xmm3;
        vmovdqu %xmm0, 12 * 16(%rsi);

        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
        vpxor 13 * 16(%rdx), %xmm0, %xmm2;
        vmovdqu %xmm0, 13 * 16(%rsi);

        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
        vpxor 14 * 16(%rdx), %xmm0, %xmm1;
        vmovdqu %xmm0, 14 * 16(%rsi);

        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
        vpxor 15 * 16(%rdx), %xmm0, %xmm15;
        vmovdqu %xmm15, 0 * 16(%rax);
        vmovdqu %xmm0, 15 * 16(%rsi);

        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
        vmovdqu %xmm0, (%rcx);

        /* inpack16_pre: */
        vmovq (key_table)(CTX, %r8, 8), %xmm15;
        vpshufb .Lpack_bswap, %xmm15, %xmm15;
        vpxor 0 * 16(%rax), %xmm15, %xmm0;
        vpxor %xmm1, %xmm15, %xmm1;
        vpxor %xmm2, %xmm15, %xmm2;
        vpxor %xmm3, %xmm15, %xmm3;
        vpxor %xmm4, %xmm15, %xmm4;
        vpxor %xmm5, %xmm15, %xmm5;
        vpxor %xmm6, %xmm15, %xmm6;
        vpxor %xmm7, %xmm15, %xmm7;
        vpxor %xmm8, %xmm15, %xmm8;
        vpxor %xmm9, %xmm15, %xmm9;
        vpxor %xmm10, %xmm15, %xmm10;
        vpxor %xmm11, %xmm15, %xmm11;
        vpxor %xmm12, %xmm15, %xmm12;
        vpxor %xmm13, %xmm15, %xmm13;
        vpxor 14 * 16(%rax), %xmm15, %xmm14;
        vpxor 15 * 16(%rax), %xmm15, %xmm15;

        CALL_NOSPEC %r9;

        addq $(16 * 16), %rsp;

        vpxor 0 * 16(%rsi), %xmm7, %xmm7;
        vpxor 1 * 16(%rsi), %xmm6, %xmm6;
        vpxor 2 * 16(%rsi), %xmm5, %xmm5;
        vpxor 3 * 16(%rsi), %xmm4, %xmm4;
        vpxor 4 * 16(%rsi), %xmm3, %xmm3;
        vpxor 5 * 16(%rsi), %xmm2, %xmm2;
        vpxor 6 * 16(%rsi), %xmm1, %xmm1;
        vpxor 7 * 16(%rsi), %xmm0, %xmm0;
        vpxor 8 * 16(%rsi), %xmm15, %xmm15;
        vpxor 9 * 16(%rsi), %xmm14, %xmm14;
        vpxor 10 * 16(%rsi), %xmm13, %xmm13;
        vpxor 11 * 16(%rsi), %xmm12, %xmm12;
        vpxor 12 * 16(%rsi), %xmm11, %xmm11;
        vpxor 13 * 16(%rsi), %xmm10, %xmm10;
        vpxor 14 * 16(%rsi), %xmm9, %xmm9;
        vpxor 15 * 16(%rsi), %xmm8, %xmm8;
        write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
                     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
                     %xmm8, %rsi);

        FRAME_END
        ret;
ENDPROC(camellia_xts_crypt_16way)

ENTRY(camellia_xts_enc_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (XTS tweak)
         */
        xorl %r8d, %r8d; /* input whitening key, 0 for enc */

        leaq __camellia_enc_blk16, %r9;

        jmp camellia_xts_crypt_16way;
ENDPROC(camellia_xts_enc_16way)

ENTRY(camellia_xts_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (XTS tweak)
         */
        cmpl $16, key_length(CTX);
        movl $32, %r8d;
        movl $24, %eax;
        cmovel %eax, %r8d; /* input whitening key, last for dec */

        leaq __camellia_dec_blk16, %r9;

        jmp camellia_xts_crypt_16way;
ENDPROC(camellia_xts_dec_16way)