/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.data
.align 16

ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
CTRINC:	.octa 0x00000003000000020000000100000000

.text

ENTRY(chacha20_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: 1 data block output, o
	# %rdx: 1 data block input, i

	# This function encrypts one ChaCha20 block by loading the state matrix
	# in four SSE registers. It performs matrix operations on four words in
	# parallel, but requires shuffling to rearrange the words after each
	# round. 8/16-bit word rotation is done with the slightly better
	# performing SSSE3 byte shuffling, 7/12-bit word rotation uses
	# traditional shift+OR. A plain-C sketch of the double round is given
	# in the comment block after this function.

	# x0..3 = s0..3
	movdqa	0x00(%rdi),%xmm0
	movdqa	0x10(%rdi),%xmm1
	movdqa	0x20(%rdi),%xmm2
	movdqa	0x30(%rdi),%xmm3
	movdqa	%xmm0,%xmm8
	movdqa	%xmm1,%xmm9
	movdqa	%xmm2,%xmm10
	movdqa	%xmm3,%xmm11

	movdqa	ROT8(%rip),%xmm4
	movdqa	ROT16(%rip),%xmm5

	mov	$10,%ecx

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm6
	pslld	$12,%xmm6
	psrld	$20,%xmm1
	por	%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm7
	pslld	$7,%xmm7
	psrld	$25,%xmm1
	por	%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd	$0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd	$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd	$0x93,%xmm3,%xmm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm6
	pslld	$12,%xmm6
	psrld	$20,%xmm1
	por	%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm7
	pslld	$7,%xmm7
	psrld	$25,%xmm1
	por	%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd	$0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd	$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd	$0x39,%xmm3,%xmm3

	dec	%ecx
	jnz	.Ldoubleround

	# o0 = i0 ^ (x0 + s0)
	movdqu	0x00(%rdx),%xmm4
	paddd	%xmm8,%xmm0
	pxor	%xmm4,%xmm0
	movdqu	%xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	movdqu	0x10(%rdx),%xmm5
	paddd	%xmm9,%xmm1
	pxor	%xmm5,%xmm1
	movdqu	%xmm1,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	movdqu	0x20(%rdx),%xmm6
	paddd	%xmm10,%xmm2
	pxor	%xmm6,%xmm2
	movdqu	%xmm2,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	movdqu	0x30(%rdx),%xmm7
	paddd	%xmm11,%xmm3
	pxor	%xmm7,%xmm3
	movdqu	%xmm3,0x30(%rsi)

	ret
ENDPROC(chacha20_block_xor_ssse3)
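
# Reference only (not assembled): a plain-C sketch of the double round that
# .Ldoubleround above computes, per RFC 7539. The first four instruction
# groups in the loop perform the column quarter rounds on all four columns at
# once; the pshufd shuffles then line the rows up so the same instruction
# groups perform the diagonal quarter rounds. The helper names rotl32() and
# chacha20_quarterround() are ad hoc for this comment, not kernel APIs.
#
#	static inline uint32_t rotl32(uint32_t v, int n)
#	{
#		return (v << n) | (v >> (32 - n));
#	}
#
#	static void chacha20_quarterround(uint32_t *a, uint32_t *b,
#					  uint32_t *c, uint32_t *d)
#	{
#		*a += *b; *d = rotl32(*d ^ *a, 16);
#		*c += *d; *b = rotl32(*b ^ *c, 12);
#		*a += *b; *d = rotl32(*d ^ *a, 8);
#		*c += *d; *b = rotl32(*b ^ *c, 7);
#	}
#
#	/* one double round over the 16-word state x[] */
#	for (i = 0; i < 4; i++)		/* column round */
#		chacha20_quarterround(&x[i], &x[i + 4], &x[i + 8], &x[i + 12]);
#	chacha20_quarterround(&x[0], &x[5], &x[10], &x[15]);	/* diagonal round */
#	chacha20_quarterround(&x[1], &x[6], &x[11], &x[12]);
#	chacha20_quarterround(&x[2], &x[7], &x[8],  &x[13]);
#	chacha20_quarterround(&x[3], &x[4], &x[9],  &x[14]);
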
ENTRY(chacha20_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: 4 data blocks output, o
	# %rdx: 4 data blocks input, i

	# This function encrypts four consecutive ChaCha20 blocks by loading
	# the state matrix in SSE registers four times. As we need some scratch
	# registers, we save the first four registers on the stack. The
	# algorithm performs each operation on the corresponding word of each
	# state matrix, hence requires no word shuffling. For the final XORing
	# step we transpose the matrix by interleaving 32- and then 64-bit
	# words, which allows us to do XOR in SSE registers; a plain-C sketch
	# of this transpose is given in the comment block after this function.
	# 8/16-bit word rotation is done with the slightly better performing
	# SSSE3 byte shuffling, 7/12-bit word rotation uses traditional
	# shift+OR.

	mov	%rsp,%r11
	sub	$0x80,%rsp
	and	$~63,%rsp

	# x0..15[0-3] = s0..3[0..3]
	movq	0x00(%rdi),%xmm1
	pshufd	$0x00,%xmm1,%xmm0
	pshufd	$0x55,%xmm1,%xmm1
	movq	0x08(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	movq	0x10(%rdi),%xmm5
	pshufd	$0x00,%xmm5,%xmm4
	pshufd	$0x55,%xmm5,%xmm5
	movq	0x18(%rdi),%xmm7
	pshufd	$0x00,%xmm7,%xmm6
	pshufd	$0x55,%xmm7,%xmm7
	movq	0x20(%rdi),%xmm9
	pshufd	$0x00,%xmm9,%xmm8
	pshufd	$0x55,%xmm9,%xmm9
	movq	0x28(%rdi),%xmm11
	pshufd	$0x00,%xmm11,%xmm10
	pshufd	$0x55,%xmm11,%xmm11
	movq	0x30(%rdi),%xmm13
	pshufd	$0x00,%xmm13,%xmm12
	pshufd	$0x55,%xmm13,%xmm13
	movq	0x38(%rdi),%xmm15
	pshufd	$0x00,%xmm15,%xmm14
	pshufd	$0x55,%xmm15,%xmm15
	# x0..3 on stack
	movdqa	%xmm0,0x00(%rsp)
	movdqa	%xmm1,0x10(%rsp)
	movdqa	%xmm2,0x20(%rsp)
	movdqa	%xmm3,0x30(%rsp)

	movdqa	CTRINC(%rip),%xmm1
	movdqa	ROT8(%rip),%xmm2
	movdqa	ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd	%xmm1,%xmm12

	mov	$10,%ecx

.Ldoubleround4:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa	0x00(%rsp),%xmm0
	paddd	%xmm4,%xmm0
	movdqa	%xmm0,0x00(%rsp)
	pxor	%xmm0,%xmm12
	pshufb	%xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa	0x10(%rsp),%xmm0
	paddd	%xmm5,%xmm0
	movdqa	%xmm0,0x10(%rsp)
	pxor	%xmm0,%xmm13
	pshufb	%xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa	0x20(%rsp),%xmm0
	paddd	%xmm6,%xmm0
	movdqa	%xmm0,0x20(%rsp)
	pxor	%xmm0,%xmm14
	pshufb	%xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa	0x30(%rsp),%xmm0
	paddd	%xmm7,%xmm0
	movdqa	%xmm0,0x30(%rsp)
	pxor	%xmm0,%xmm15
	pshufb	%xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd	%xmm12,%xmm8
	pxor	%xmm8,%xmm4
	movdqa	%xmm4,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm4
	por	%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd	%xmm13,%xmm9
	pxor	%xmm9,%xmm5
	movdqa	%xmm5,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm5
	por	%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd	%xmm14,%xmm10
	pxor	%xmm10,%xmm6
	movdqa	%xmm6,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm6
	por	%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd	%xmm15,%xmm11
	pxor	%xmm11,%xmm7
	movdqa	%xmm7,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm7
	por	%xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa	0x00(%rsp),%xmm0
	paddd	%xmm4,%xmm0
	movdqa	%xmm0,0x00(%rsp)
	pxor	%xmm0,%xmm12
	pshufb	%xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa	0x10(%rsp),%xmm0
	paddd	%xmm5,%xmm0
	movdqa	%xmm0,0x10(%rsp)
	pxor	%xmm0,%xmm13
	pshufb	%xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa	0x20(%rsp),%xmm0
	paddd	%xmm6,%xmm0
	movdqa	%xmm0,0x20(%rsp)
	pxor	%xmm0,%xmm14
	pshufb	%xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa	0x30(%rsp),%xmm0
	paddd	%xmm7,%xmm0
	movdqa	%xmm0,0x30(%rsp)
	pxor	%xmm0,%xmm15
	pshufb	%xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd	%xmm12,%xmm8
	pxor	%xmm8,%xmm4
	movdqa	%xmm4,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm4
	por	%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd	%xmm13,%xmm9
	pxor	%xmm9,%xmm5
	movdqa	%xmm5,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm5
	por	%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd	%xmm14,%xmm10
	pxor	%xmm10,%xmm6
	movdqa	%xmm6,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm6
	por	%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd	%xmm15,%xmm11
	pxor	%xmm11,%xmm7
	movdqa	%xmm7,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm7
	por	%xmm0,%xmm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa	0x00(%rsp),%xmm0
	paddd	%xmm5,%xmm0
	movdqa	%xmm0,0x00(%rsp)
	pxor	%xmm0,%xmm15
	pshufb	%xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa	0x10(%rsp),%xmm0
	paddd	%xmm6,%xmm0
	movdqa	%xmm0,0x10(%rsp)
	pxor	%xmm0,%xmm12
	pshufb	%xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa	0x20(%rsp),%xmm0
	paddd	%xmm7,%xmm0
	movdqa	%xmm0,0x20(%rsp)
	pxor	%xmm0,%xmm13
	pshufb	%xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa	0x30(%rsp),%xmm0
	paddd	%xmm4,%xmm0
	movdqa	%xmm0,0x30(%rsp)
	pxor	%xmm0,%xmm14
	pshufb	%xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd	%xmm15,%xmm10
	pxor	%xmm10,%xmm5
	movdqa	%xmm5,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm5
	por	%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd	%xmm12,%xmm11
	pxor	%xmm11,%xmm6
	movdqa	%xmm6,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm6
	por	%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd	%xmm13,%xmm8
	pxor	%xmm8,%xmm7
	movdqa	%xmm7,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm7
	por	%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd	%xmm14,%xmm9
	pxor	%xmm9,%xmm4
	movdqa	%xmm4,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm4
	por	%xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa	0x00(%rsp),%xmm0
	paddd	%xmm5,%xmm0
	movdqa	%xmm0,0x00(%rsp)
	pxor	%xmm0,%xmm15
	pshufb	%xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa	0x10(%rsp),%xmm0
	paddd	%xmm6,%xmm0
	movdqa	%xmm0,0x10(%rsp)
	pxor	%xmm0,%xmm12
	pshufb	%xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa	0x20(%rsp),%xmm0
	paddd	%xmm7,%xmm0
	movdqa	%xmm0,0x20(%rsp)
	pxor	%xmm0,%xmm13
	pshufb	%xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa	0x30(%rsp),%xmm0
	paddd	%xmm4,%xmm0
	movdqa	%xmm0,0x30(%rsp)
	pxor	%xmm0,%xmm14
	pshufb	%xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd	%xmm15,%xmm10
	pxor	%xmm10,%xmm5
	movdqa	%xmm5,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm5
	por	%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd	%xmm12,%xmm11
	pxor	%xmm11,%xmm6
	movdqa	%xmm6,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm6
	por	%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd	%xmm13,%xmm8
	pxor	%xmm8,%xmm7
	movdqa	%xmm7,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm7
	por	%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd	%xmm14,%xmm9
	pxor	%xmm9,%xmm4
	movdqa	%xmm4,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm4
	por	%xmm0,%xmm4

	dec	%ecx
	jnz	.Ldoubleround4

	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq	0x00(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	0x00(%rsp),%xmm2
	movdqa	%xmm2,0x00(%rsp)
	paddd	0x10(%rsp),%xmm3
	movdqa	%xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq	0x08(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	0x20(%rsp),%xmm2
	movdqa	%xmm2,0x20(%rsp)
	paddd	0x30(%rsp),%xmm3
	movdqa	%xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq	0x10(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq	0x18(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm6
	paddd	%xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq	0x20(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm8
	paddd	%xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq	0x28(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm10
	paddd	%xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq	0x30(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm12
	paddd	%xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq	0x38(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm14
	paddd	%xmm3,%xmm15

	# x12 += counter values 0-3
	paddd	%xmm1,%xmm12

	# interleave 32-bit words in state n, n+1
	movdqa	0x00(%rsp),%xmm0
	movdqa	0x10(%rsp),%xmm1
	movdqa	%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa	%xmm2,0x00(%rsp)
	movdqa	%xmm0,0x10(%rsp)
	movdqa	0x20(%rsp),%xmm0
	movdqa	0x30(%rsp),%xmm1
	movdqa	%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa	%xmm2,0x20(%rsp)
	movdqa	%xmm0,0x30(%rsp)
	movdqa	%xmm4,%xmm0
	punpckldq	%xmm5,%xmm4
	punpckhdq	%xmm5,%xmm0
	movdqa	%xmm0,%xmm5
	movdqa	%xmm6,%xmm0
	punpckldq	%xmm7,%xmm6
	punpckhdq	%xmm7,%xmm0
	movdqa	%xmm0,%xmm7
	movdqa	%xmm8,%xmm0
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm9,%xmm0
	movdqa	%xmm0,%xmm9
	movdqa	%xmm10,%xmm0
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm11,%xmm0
	movdqa	%xmm0,%xmm11
	movdqa	%xmm12,%xmm0
	punpckldq	%xmm13,%xmm12
	punpckhdq	%xmm13,%xmm0
	movdqa	%xmm0,%xmm13
	movdqa	%xmm14,%xmm0
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm15,%xmm0
	movdqa	%xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa	0x00(%rsp),%xmm0
	movdqa	0x20(%rsp),%xmm1
	movdqa	%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa	%xmm2,0x00(%rsp)
	movdqa	%xmm0,0x20(%rsp)
	movdqa	0x10(%rsp),%xmm0
	movdqa	0x30(%rsp),%xmm1
	movdqa	%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa	%xmm2,0x10(%rsp)
	movdqa	%xmm0,0x30(%rsp)
	movdqa	%xmm4,%xmm0
	punpcklqdq	%xmm6,%xmm4
	punpckhqdq	%xmm6,%xmm0
	movdqa	%xmm0,%xmm6
	movdqa	%xmm5,%xmm0
	punpcklqdq	%xmm7,%xmm5
	punpckhqdq	%xmm7,%xmm0
	movdqa	%xmm0,%xmm7
	movdqa	%xmm8,%xmm0
	punpcklqdq	%xmm10,%xmm8
	punpckhqdq	%xmm10,%xmm0
	movdqa	%xmm0,%xmm10
	movdqa	%xmm9,%xmm0
	punpcklqdq	%xmm11,%xmm9
	punpckhqdq	%xmm11,%xmm0
	movdqa	%xmm0,%xmm11
	movdqa	%xmm12,%xmm0
	punpcklqdq	%xmm14,%xmm12
	punpckhqdq	%xmm14,%xmm0
	movdqa	%xmm0,%xmm14
	movdqa	%xmm13,%xmm0
	punpcklqdq	%xmm15,%xmm13
	punpckhqdq	%xmm15,%xmm0
	movdqa	%xmm0,%xmm15

	# xor with corresponding input, write to output
	movdqa	0x00(%rsp),%xmm0
	movdqu	0x00(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x00(%rsi)
	movdqa	0x10(%rsp),%xmm0
	movdqu	0x80(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x80(%rsi)
	movdqa	0x20(%rsp),%xmm0
	movdqu	0x40(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x40(%rsi)
	movdqa	0x30(%rsp),%xmm0
	movdqu	0xc0(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0xc0(%rsi)
	movdqu	0x10(%rdx),%xmm1
	pxor	%xmm1,%xmm4
	movdqu	%xmm4,0x10(%rsi)
	movdqu	0x90(%rdx),%xmm1
	pxor	%xmm1,%xmm5
	movdqu	%xmm5,0x90(%rsi)
	movdqu	0x50(%rdx),%xmm1
	pxor	%xmm1,%xmm6
	movdqu	%xmm6,0x50(%rsi)
	movdqu	0xd0(%rdx),%xmm1
	pxor	%xmm1,%xmm7
	movdqu	%xmm7,0xd0(%rsi)
	movdqu	0x20(%rdx),%xmm1
	pxor	%xmm1,%xmm8
	movdqu	%xmm8,0x20(%rsi)
	movdqu	0xa0(%rdx),%xmm1
	pxor	%xmm1,%xmm9
	movdqu	%xmm9,0xa0(%rsi)
	movdqu	0x60(%rdx),%xmm1
	pxor	%xmm1,%xmm10
	movdqu	%xmm10,0x60(%rsi)
	movdqu	0xe0(%rdx),%xmm1
	pxor	%xmm1,%xmm11
	movdqu	%xmm11,0xe0(%rsi)
	movdqu	0x30(%rdx),%xmm1
	pxor	%xmm1,%xmm12
	movdqu	%xmm12,0x30(%rsi)
	movdqu	0xb0(%rdx),%xmm1
	pxor	%xmm1,%xmm13
	movdqu	%xmm13,0xb0(%rsi)
	movdqu	0x70(%rdx),%xmm1
	pxor	%xmm1,%xmm14
	movdqu	%xmm14,0x70(%rsi)
	movdqu	0xf0(%rdx),%xmm1
	pxor	%xmm1,%xmm15
	movdqu	%xmm15,0xf0(%rsi)

	mov	%r11,%rsp
	ret
ENDPROC(chacha20_4block_xor_ssse3)
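
# Reference only (not assembled): the 4x4 word transpose used above for the
# final XOR step, written with SSE2 intrinsics. punpckldq/punpckhdq map to
# _mm_unpacklo_epi32/_mm_unpackhi_epi32 and punpcklqdq/punpckhqdq to
# _mm_unpacklo_epi64/_mm_unpackhi_epi64. In the comments a..d name the words
# held in r0..r3 and the digits name the block lanes 0..3: on entry each
# register holds one word for all four blocks, on return each register holds
# four consecutive words of one block. The in-place assembly above pairs
# slots n and n+2 in the 64-bit pass instead, so its rows come out in block
# order 0, 2, 1, 3, matching the 0x00/0x80/0x40/0xc0 load/store offsets.
# The function name transpose4() is ad hoc for this comment.
#
#	#include <emmintrin.h>
#
#	static void transpose4(__m128i *r0, __m128i *r1, __m128i *r2, __m128i *r3)
#	{
#		__m128i t0 = _mm_unpacklo_epi32(*r0, *r1);	/* a0 b0 a1 b1 */
#		__m128i t1 = _mm_unpackhi_epi32(*r0, *r1);	/* a2 b2 a3 b3 */
#		__m128i t2 = _mm_unpacklo_epi32(*r2, *r3);	/* c0 d0 c1 d1 */
#		__m128i t3 = _mm_unpackhi_epi32(*r2, *r3);	/* c2 d2 c3 d3 */
#
#		*r0 = _mm_unpacklo_epi64(t0, t2);		/* a0 b0 c0 d0 */
#		*r1 = _mm_unpackhi_epi64(t0, t2);		/* a1 b1 c1 d1 */
#		*r2 = _mm_unpacklo_epi64(t1, t3);		/* a2 b2 c2 d2 */
#		*r3 = _mm_unpackhi_epi64(t1, t3);		/* a3 b3 c3 d3 */
#	}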