/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.data
.align 16

ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
CTRINC:	.octa 0x00000003000000020000000100000000

.text

ENTRY(chacha20_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: 1 data block output, o
	# %rdx: 1 data block input, i

	# This function encrypts one ChaCha20 block by loading the state matrix
	# into four SSE registers. It performs matrix operations on four words
	# in parallel, but requires shuffling to rearrange the words after each
	# round. 8/16-bit word rotation is done with the slightly better
	# performing SSSE3 byte shuffling; 7/12-bit word rotation uses
	# traditional shift+OR.

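	# Reference only, not assembled: a minimal C sketch of the
	# quarter-round implemented below, per RFC 7539 (names ROTL32 and
	# quarterround are illustrative). The 16- and 8-bit rotations map to
	# pshufb with ROT16/ROT8, the 12- and 7-bit rotations to the
	# pslld/psrld/por sequences.
	#
	#	#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))
	#
	#	static void quarterround(u32 x[16], int a, int b, int c, int d)
	#	{
	#		x[a] += x[b]; x[d] = ROTL32(x[d] ^ x[a], 16);
	#		x[c] += x[d]; x[b] = ROTL32(x[b] ^ x[c], 12);
	#		x[a] += x[b]; x[d] = ROTL32(x[d] ^ x[a],  8);
	#		x[c] += x[d]; x[b] = ROTL32(x[b] ^ x[c],  7);
	#	}
	#
	# Here each xmm register holds one whole row of the 4x4 state, so a
	# single SSE instruction applies the same step to all four columns.
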
	# x0..3 = s0..3
	movdqa		0x00(%rdi),%xmm0
	movdqa		0x10(%rdi),%xmm1
	movdqa		0x20(%rdi),%xmm2
	movdqa		0x30(%rdi),%xmm3
	movdqa		%xmm0,%xmm8
	movdqa		%xmm1,%xmm9
	movdqa		%xmm2,%xmm10
	movdqa		%xmm3,%xmm11

	movdqa		ROT8(%rip),%xmm4
	movdqa		ROT16(%rip),%xmm5

	mov	$10,%ecx
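	# ten iterations of the double round give the 20 ChaCha20 rounds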

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm3,%xmm3

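	# The shuffles above rotate rows 1-3 so that each register column now
	# holds one diagonal of the original state; the same column
	# quarter-round sequence below therefore performs the diagonal round,
	# and the second set of shuffles after it undoes the rotation.
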
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm3,%xmm3

	dec		%ecx
	jnz		.Ldoubleround

	# o0 = i0 ^ (x0 + s0)
	movdqu		0x00(%rdx),%xmm4
	paddd		%xmm8,%xmm0
	pxor		%xmm4,%xmm0
	movdqu		%xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	movdqu		0x10(%rdx),%xmm5
	paddd		%xmm9,%xmm1
	pxor		%xmm5,%xmm1
	movdqu		%xmm1,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	movdqu		0x20(%rdx),%xmm6
	paddd		%xmm10,%xmm2
	pxor		%xmm6,%xmm2
	movdqu		%xmm2,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	movdqu		0x30(%rdx),%xmm7
	paddd		%xmm11,%xmm3
	pxor		%xmm7,%xmm3
	movdqu		%xmm3,0x30(%rsi)

	ret
ENDPROC(chacha20_block_xor_ssse3)

ENTRY(chacha20_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: 4 data blocks output, o
	# %rdx: 4 data blocks input, i

	# This function encrypts four consecutive ChaCha20 blocks by loading
	# the state matrix into SSE registers four times. As we need some
	# scratch registers, we keep the first four rows of the state on the
	# stack. The algorithm performs each operation on the corresponding
	# word of each state matrix, hence requires no word shuffling. For the
	# final XORing step we transpose the matrix by interleaving 32-bit and
	# then 64-bit words, which allows us to do XOR in SSE registers.
	# 8/16-bit word rotation is done with the slightly better performing
	# SSSE3 byte shuffling; 7/12-bit word rotation uses traditional
	# shift+OR.

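	# Reference only, not assembled: in C terms (array names illustrative),
	# the state is kept "transposed" across registers, with xmm4..xmm15 and
	# the four stack slots each holding the same state word of all four
	# blocks:
	#
	#	u32 x[16][4];		/* x[n][i] = word n of block i */
	#
	#	for (n = 0; n < 16; n++)
	#		for (i = 0; i < 4; i++)
	#			x[n][i] = s[n];
	#	for (i = 0; i < 4; i++)
	#		x[12][i] += i;	/* per-block counter, see CTRINC */
	#
	# so every paddd/pxor/rotation below advances all four blocks at once.
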
	mov		%rsp,%r11
	sub		$0x80,%rsp
	and		$~63,%rsp

	# x0..15[0-3] = s0..3[0..3]
	movq		0x00(%rdi),%xmm1
	pshufd		$0x00,%xmm1,%xmm0
	pshufd		$0x55,%xmm1,%xmm1
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	movq		0x10(%rdi),%xmm5
	pshufd		$0x00,%xmm5,%xmm4
	pshufd		$0x55,%xmm5,%xmm5
	movq		0x18(%rdi),%xmm7
	pshufd		$0x00,%xmm7,%xmm6
	pshufd		$0x55,%xmm7,%xmm7
	movq		0x20(%rdi),%xmm9
	pshufd		$0x00,%xmm9,%xmm8
	pshufd		$0x55,%xmm9,%xmm9
	movq		0x28(%rdi),%xmm11
	pshufd		$0x00,%xmm11,%xmm10
	pshufd		$0x55,%xmm11,%xmm11
	movq		0x30(%rdi),%xmm13
	pshufd		$0x00,%xmm13,%xmm12
	pshufd		$0x55,%xmm13,%xmm13
	movq		0x38(%rdi),%xmm15
	pshufd		$0x00,%xmm15,%xmm14
	pshufd		$0x55,%xmm15,%xmm15
	# x0..3 on stack
	movdqa		%xmm0,0x00(%rsp)
	movdqa		%xmm1,0x10(%rsp)
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm3,0x30(%rsp)

	movdqa		CTRINC(%rip),%xmm1
	movdqa		ROT8(%rip),%xmm2
	movdqa		ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

	mov		$10,%ecx

.Ldoubleround4:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4

	dec		%ecx
	jnz		.Ldoubleround4

	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq		0x00(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x00(%rsp),%xmm2
	movdqa		%xmm2,0x00(%rsp)
	paddd		0x10(%rsp),%xmm3
	movdqa		%xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x20(%rsp),%xmm2
	movdqa		%xmm2,0x20(%rsp)
	paddd		0x30(%rsp),%xmm3
	movdqa		%xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq		0x10(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm4
	paddd		%xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq		0x18(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm6
	paddd		%xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq		0x20(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm8
	paddd		%xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq		0x28(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm10
	paddd		%xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq		0x30(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm12
	paddd		%xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq		0x38(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm14
	paddd		%xmm3,%xmm15

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

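	# Reference only, not assembled: the two interleave passes below
	# perform a 4x4 transpose of 32-bit words for each group of four rows,
	# turning "word n of blocks 0-3" rows back into contiguous per-block
	# rows; in C terms (names illustrative):
	#
	#	u32 t[4][4];
	#	for (i = 0; i < 4; i++)
	#		for (j = 0; j < 4; j++)
	#			t[j][i] = x[i][j];
	#
	# so each xmm register / stack slot ends up holding 16 contiguous
	# bytes of one output block, ready to be XORed with the input.
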
	# interleave 32-bit words in state n, n+1
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x10(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x10(%rsp)
	movdqa		0x20(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpckldq	%xmm5,%xmm4
	punpckhdq	%xmm5,%xmm0
	movdqa		%xmm0,%xmm5
	movdqa		%xmm6,%xmm0
	punpckldq	%xmm7,%xmm6
	punpckhdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm9,%xmm0
	movdqa		%xmm0,%xmm9
	movdqa		%xmm10,%xmm0
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpckldq	%xmm13,%xmm12
	punpckhdq	%xmm13,%xmm0
	movdqa		%xmm0,%xmm13
	movdqa		%xmm14,%xmm0
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x20(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x20(%rsp)
	movdqa		0x10(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x10(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpcklqdq	%xmm6,%xmm4
	punpckhqdq	%xmm6,%xmm0
	movdqa		%xmm0,%xmm6
	movdqa		%xmm5,%xmm0
	punpcklqdq	%xmm7,%xmm5
	punpckhqdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpcklqdq	%xmm10,%xmm8
	punpckhqdq	%xmm10,%xmm0
	movdqa		%xmm0,%xmm10
	movdqa		%xmm9,%xmm0
	punpcklqdq	%xmm11,%xmm9
	punpckhqdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpcklqdq	%xmm14,%xmm12
	punpckhqdq	%xmm14,%xmm0
	movdqa		%xmm0,%xmm14
	movdqa		%xmm13,%xmm0
	punpcklqdq	%xmm15,%xmm13
	punpckhqdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

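	# After the transpose, each 16-byte row belongs to one output block:
	# 0x00/0x20/0x10/0x30(%rsp) hold words 0-3 of blocks 0/1/2/3,
	# xmm4/xmm6/xmm5/xmm7 words 4-7, xmm8/xmm10/xmm9/xmm11 words 8-11 and
	# xmm12/xmm14/xmm13/xmm15 words 12-15, hence the interleaved
	# 0x00/0x40/0x80/0xc0... input/output offsets used below.
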
	# xor with corresponding input, write to output
	movdqa		0x00(%rsp),%xmm0
	movdqu		0x00(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x00(%rsi)
	movdqa		0x10(%rsp),%xmm0
	movdqu		0x80(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x80(%rsi)
	movdqa		0x20(%rsp),%xmm0
	movdqu		0x40(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x40(%rsi)
	movdqa		0x30(%rsp),%xmm0
	movdqu		0xc0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xc0(%rsi)
	movdqu		0x10(%rdx),%xmm1
	pxor		%xmm1,%xmm4
	movdqu		%xmm4,0x10(%rsi)
	movdqu		0x90(%rdx),%xmm1
	pxor		%xmm1,%xmm5
	movdqu		%xmm5,0x90(%rsi)
	movdqu		0x50(%rdx),%xmm1
	pxor		%xmm1,%xmm6
	movdqu		%xmm6,0x50(%rsi)
	movdqu		0xd0(%rdx),%xmm1
	pxor		%xmm1,%xmm7
	movdqu		%xmm7,0xd0(%rsi)
	movdqu		0x20(%rdx),%xmm1
	pxor		%xmm1,%xmm8
	movdqu		%xmm8,0x20(%rsi)
	movdqu		0xa0(%rdx),%xmm1
	pxor		%xmm1,%xmm9
	movdqu		%xmm9,0xa0(%rsi)
	movdqu		0x60(%rdx),%xmm1
	pxor		%xmm1,%xmm10
	movdqu		%xmm10,0x60(%rsi)
	movdqu		0xe0(%rdx),%xmm1
	pxor		%xmm1,%xmm11
	movdqu		%xmm11,0xe0(%rsi)
	movdqu		0x30(%rdx),%xmm1
	pxor		%xmm1,%xmm12
	movdqu		%xmm12,0x30(%rsi)
	movdqu		0xb0(%rdx),%xmm1
	pxor		%xmm1,%xmm13
	movdqu		%xmm13,0xb0(%rsi)
	movdqu		0x70(%rdx),%xmm1
	pxor		%xmm1,%xmm14
	movdqu		%xmm14,0x70(%rsi)
	movdqu		0xf0(%rdx),%xmm1
	pxor		%xmm1,%xmm15
	movdqu		%xmm15,0xf0(%rsi)

	mov		%r11,%rsp
	ret
ENDPROC(chacha20_4block_xor_ssse3)