1/*
2 * VMware vSockets Driver
3 *
4 * Copyright (C) 2009-2013 VMware, Inc. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the Free
8 * Software Foundation version 2 and no later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13 * more details.
14 */
15
16#include <linux/types.h>
17#include <linux/socket.h>
18#include <linux/stddef.h>
19#include <net/sock.h>
20
21#include "vmci_transport_notify.h"
22
/* Shorthand for a member of the queue-state notify data that lives in the
 * VMCI transport attached to @vsk.  Expands to an lvalue, so it can be both
 * read and assigned.
 */
#define PKT_FIELD(vsk, field) (vmci_trans(vsk)->notify.pkt_q_state.field)
25
26static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk)
27{
28	bool retval;
29	u64 notify_limit;
30
31	if (!PKT_FIELD(vsk, peer_waiting_write))
32		return false;
33
34	/* When the sender blocks, we take that as a sign that the sender is
35	 * faster than the receiver. To reduce the transmit rate of the sender,
36	 * we delay the sending of the read notification by decreasing the
37	 * write_notify_window. The notification is delayed until the number of
38	 * bytes used in the queue drops below the write_notify_window.
39	 */
40
41	if (!PKT_FIELD(vsk, peer_waiting_write_detected)) {
42		PKT_FIELD(vsk, peer_waiting_write_detected) = true;
43		if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) {
44			PKT_FIELD(vsk, write_notify_window) =
45			    PKT_FIELD(vsk, write_notify_min_window);
46		} else {
47			PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE;
48			if (PKT_FIELD(vsk, write_notify_window) <
49			    PKT_FIELD(vsk, write_notify_min_window))
50				PKT_FIELD(vsk, write_notify_window) =
51				    PKT_FIELD(vsk, write_notify_min_window);
52
53		}
54	}
55	notify_limit = vmci_trans(vsk)->consume_size -
56		PKT_FIELD(vsk, write_notify_window);
57
58	/* The notify_limit is used to delay notifications in the case where
59	 * flow control is enabled. Below the test is expressed in terms of
60	 * free space in the queue: if free_space > ConsumeSize -
61	 * write_notify_window then notify An alternate way of expressing this
62	 * is to rewrite the expression to use the data ready in the receive
63	 * queue: if write_notify_window > bufferReady then notify as
64	 * free_space == ConsumeSize - bufferReady.
65	 */
66
67	retval = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) >
68		notify_limit;
69
70	if (retval) {
71		/* Once we notify the peer, we reset the detected flag so the
72		 * next wait will again cause a decrease in the window size.
73		 */
74
75		PKT_FIELD(vsk, peer_waiting_write_detected) = false;
76	}
77	return retval;
78}
79
80static void
81vmci_transport_handle_read(struct sock *sk,
82			   struct vmci_transport_packet *pkt,
83			   bool bottom_half,
84			   struct sockaddr_vm *dst, struct sockaddr_vm *src)
85{
86	sk->sk_write_space(sk);
87}
88
89static void
90vmci_transport_handle_wrote(struct sock *sk,
91			    struct vmci_transport_packet *pkt,
92			    bool bottom_half,
93			    struct sockaddr_vm *dst, struct sockaddr_vm *src)
94{
95	sk->sk_data_ready(sk);
96}
97
98static void vsock_block_update_write_window(struct sock *sk)
99{
100	struct vsock_sock *vsk = vsock_sk(sk);
101
102	if (PKT_FIELD(vsk, write_notify_window) < vmci_trans(vsk)->consume_size)
103		PKT_FIELD(vsk, write_notify_window) =
104		    min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE,
105			vmci_trans(vsk)->consume_size);
106}
107
108static int vmci_transport_send_read_notification(struct sock *sk)
109{
110	struct vsock_sock *vsk;
111	bool sent_read;
112	unsigned int retries;
113	int err;
114
115	vsk = vsock_sk(sk);
116	sent_read = false;
117	retries = 0;
118	err = 0;
119
120	if (vmci_transport_notify_waiting_write(vsk)) {
121		/* Notify the peer that we have read, retrying the send on
122		 * failure up to our maximum value.  XXX For now we just log
123		 * the failure, but later we should schedule a work item to
124		 * handle the resend until it succeeds.  That would require
125		 * keeping track of work items in the vsk and cleaning them up
126		 * upon socket close.
127		 */
128		while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
129		       !sent_read &&
130		       retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
131			err = vmci_transport_send_read(sk);
132			if (err >= 0)
133				sent_read = true;
134
135			retries++;
136		}
137
138		if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS && !sent_read)
139			pr_err("%p unable to send read notification to peer\n",
140			       sk);
141		else
142			PKT_FIELD(vsk, peer_waiting_write) = false;
143
144	}
145	return err;
146}
147
148static void vmci_transport_notify_pkt_socket_init(struct sock *sk)
149{
150	struct vsock_sock *vsk = vsock_sk(sk);
151
152	PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
153	PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
154	PKT_FIELD(vsk, peer_waiting_write) = false;
155	PKT_FIELD(vsk, peer_waiting_write_detected) = false;
156}
157
158static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk)
159{
160	PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
161	PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
162	PKT_FIELD(vsk, peer_waiting_write) = false;
163	PKT_FIELD(vsk, peer_waiting_write_detected) = false;
164}
165
166static int
167vmci_transport_notify_pkt_poll_in(struct sock *sk,
168				  size_t target, bool *data_ready_now)
169{
170	struct vsock_sock *vsk = vsock_sk(sk);
171
172	if (vsock_stream_has_data(vsk)) {
173		*data_ready_now = true;
174	} else {
175		/* We can't read right now because there is nothing in the
176		 * queue. Ask for notifications when there is something to
177		 * read.
178		 */
179		if (sk->sk_state == SS_CONNECTED)
180			vsock_block_update_write_window(sk);
181		*data_ready_now = false;
182	}
183
184	return 0;
185}
186
/* Poll helper for writability.
 *
 * Stores in @space_avail_now whether vsock_stream_has_space() reports a
 * positive amount of free space in the produce queue.  @target is not used.
 * Always returns 0.
 *
 * The out-parameter is written on every path.  Previously a negative result
 * from vsock_stream_has_space() left *space_avail_now untouched, so the
 * outcome depended on whatever value the caller had stored there; such a
 * result is now reported as "no space available".
 */
static int
vmci_transport_notify_pkt_poll_out(struct sock *sk,
				   size_t target, bool *space_avail_now)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	/* Zero means the socket is connected but currently can't send; a
	 * negative value is treated the same way.
	 */
	*space_avail_now = vsock_stream_has_space(vsk) > 0;

	return 0;
}
207
208static int
209vmci_transport_notify_pkt_recv_init(
210				struct sock *sk,
211				size_t target,
212				struct vmci_transport_recv_notify_data *data)
213{
214	struct vsock_sock *vsk = vsock_sk(sk);
215
216	data->consume_head = 0;
217	data->produce_tail = 0;
218	data->notify_on_block = false;
219
220	if (PKT_FIELD(vsk, write_notify_min_window) < target + 1) {
221		PKT_FIELD(vsk, write_notify_min_window) = target + 1;
222		if (PKT_FIELD(vsk, write_notify_window) <
223		    PKT_FIELD(vsk, write_notify_min_window)) {
224			/* If the current window is smaller than the new
225			 * minimal window size, we need to reevaluate whether
226			 * we need to notify the sender. If the number of ready
227			 * bytes are smaller than the new window, we need to
228			 * send a notification to the sender before we block.
229			 */
230
231			PKT_FIELD(vsk, write_notify_window) =
232			    PKT_FIELD(vsk, write_notify_min_window);
233			data->notify_on_block = true;
234		}
235	}
236
237	return 0;
238}
239
240static int
241vmci_transport_notify_pkt_recv_pre_block(
242				struct sock *sk,
243				size_t target,
244				struct vmci_transport_recv_notify_data *data)
245{
246	int err = 0;
247
248	vsock_block_update_write_window(sk);
249
250	if (data->notify_on_block) {
251		err = vmci_transport_send_read_notification(sk);
252		if (err < 0)
253			return err;
254		data->notify_on_block = false;
255	}
256
257	return err;
258}
259
260static int
261vmci_transport_notify_pkt_recv_post_dequeue(
262				struct sock *sk,
263				size_t target,
264				ssize_t copied,
265				bool data_read,
266				struct vmci_transport_recv_notify_data *data)
267{
268	struct vsock_sock *vsk;
269	int err;
270	bool was_full = false;
271	u64 free_space;
272
273	vsk = vsock_sk(sk);
274	err = 0;
275
276	if (data_read) {
277		smp_mb();
278
279		free_space =
280			vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair);
281		was_full = free_space == copied;
282
283		if (was_full)
284			PKT_FIELD(vsk, peer_waiting_write) = true;
285
286		err = vmci_transport_send_read_notification(sk);
287		if (err < 0)
288			return err;
289
290		/* See the comment in
291		 * vmci_transport_notify_pkt_send_post_enqueue().
292		 */
293		sk->sk_data_ready(sk);
294	}
295
296	return err;
297}
298
299static int
300vmci_transport_notify_pkt_send_init(
301				struct sock *sk,
302				struct vmci_transport_send_notify_data *data)
303{
304	data->consume_head = 0;
305	data->produce_tail = 0;
306
307	return 0;
308}
309
310static int
311vmci_transport_notify_pkt_send_post_enqueue(
312				struct sock *sk,
313				ssize_t written,
314				struct vmci_transport_send_notify_data *data)
315{
316	int err = 0;
317	struct vsock_sock *vsk;
318	bool sent_wrote = false;
319	bool was_empty;
320	int retries = 0;
321
322	vsk = vsock_sk(sk);
323
324	smp_mb();
325
326	was_empty =
327		vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) == written;
328	if (was_empty) {
329		while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
330		       !sent_wrote &&
331		       retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
332			err = vmci_transport_send_wrote(sk);
333			if (err >= 0)
334				sent_wrote = true;
335
336			retries++;
337		}
338	}
339
340	if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS && !sent_wrote) {
341		pr_err("%p unable to send wrote notification to peer\n",
342		       sk);
343		return err;
344	}
345
346	return err;
347}
348
349static void
350vmci_transport_notify_pkt_handle_pkt(
351				struct sock *sk,
352				struct vmci_transport_packet *pkt,
353				bool bottom_half,
354				struct sockaddr_vm *dst,
355				struct sockaddr_vm *src, bool *pkt_processed)
356{
357	bool processed = false;
358
359	switch (pkt->type) {
360	case VMCI_TRANSPORT_PACKET_TYPE_WROTE:
361		vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src);
362		processed = true;
363		break;
364	case VMCI_TRANSPORT_PACKET_TYPE_READ:
365		vmci_transport_handle_read(sk, pkt, bottom_half, dst, src);
366		processed = true;
367		break;
368	}
369
370	if (pkt_processed)
371		*pkt_processed = processed;
372}
373
374static void vmci_transport_notify_pkt_process_request(struct sock *sk)
375{
376	struct vsock_sock *vsk = vsock_sk(sk);
377
378	PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
379	if (vmci_trans(vsk)->consume_size <
380		PKT_FIELD(vsk, write_notify_min_window))
381		PKT_FIELD(vsk, write_notify_min_window) =
382			vmci_trans(vsk)->consume_size;
383}
384
385static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
386{
387	struct vsock_sock *vsk = vsock_sk(sk);
388
389	PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
390	if (vmci_trans(vsk)->consume_size <
391		PKT_FIELD(vsk, write_notify_min_window))
392		PKT_FIELD(vsk, write_notify_min_window) =
393			vmci_trans(vsk)->consume_size;
394}
395
/* Receive hook called before dequeuing.  The queue-state protocol has
 * nothing to do here; no argument is used.  Always returns 0.
 */
static int vmci_transport_notify_pkt_recv_pre_dequeue(
				struct sock *sk,
				size_t target,
				struct vmci_transport_recv_notify_data *data)
{
	/* NOP for QState. */
	return 0;
}
404
/* Send hook called before blocking.  The queue-state protocol has nothing
 * to do here; no argument is used.  Always returns 0.
 */
static int vmci_transport_notify_pkt_send_pre_block(
				struct sock *sk,
				struct vmci_transport_send_notify_data *data)
{
	/* NOP for QState. */
	return 0;
}
412
/* Send hook called before enqueuing.  The queue-state protocol has nothing
 * to do here; no argument is used.  Always returns 0.
 */
static int vmci_transport_notify_pkt_send_pre_enqueue(
				struct sock *sk,
				struct vmci_transport_send_notify_data *data)
{
	/* NOP for QState. */
	return 0;
}
420
/* Socket always on control packet based operations.
 *
 * Notify operations table for the queue-state protocol implemented in this
 * file.  The initializers are positional, so their order has to match the
 * member order of struct vmci_transport_notify_ops exactly; keep the two in
 * sync when either changes.
 */
struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops = {
	vmci_transport_notify_pkt_socket_init,
	vmci_transport_notify_pkt_socket_destruct,
	vmci_transport_notify_pkt_poll_in,
	vmci_transport_notify_pkt_poll_out,
	vmci_transport_notify_pkt_handle_pkt,
	vmci_transport_notify_pkt_recv_init,
	vmci_transport_notify_pkt_recv_pre_block,
	vmci_transport_notify_pkt_recv_pre_dequeue,
	vmci_transport_notify_pkt_recv_post_dequeue,
	vmci_transport_notify_pkt_send_init,
	vmci_transport_notify_pkt_send_pre_block,
	vmci_transport_notify_pkt_send_pre_enqueue,
	vmci_transport_notify_pkt_send_post_enqueue,
	vmci_transport_notify_pkt_process_request,
	vmci_transport_notify_pkt_process_negotiate,
};
439