1/*
2 * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
3 * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses.  You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
10 *
11 *     Redistribution and use in source and binary forms, with or
12 *     without modification, are permitted provided that the following
13 *     conditions are met:
14 *
15 *      - Redistributions of source code must retain the above
16 *        copyright notice, this list of conditions and the following
17 *        disclaimer.
18 *
19 *      - Redistributions in binary form must reproduce the above
20 *        copyright notice, this list of conditions and the following
21 *        disclaimer in the documentation and/or other materials
22 *        provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
32 */
33
34#include <linux/sched.h>
35#include <linux/spinlock.h>
36#include <linux/idr.h>
37#include <linux/pci.h>
38#include <linux/io.h>
39#include <linux/delay.h>
40#include <linux/netdevice.h>
41#include <linux/vmalloc.h>
42#include <linux/bitmap.h>
43#include <linux/slab.h>
44#include <linux/module.h>
45
46#include "ipath_kernel.h"
47#include "ipath_verbs.h"
48
49static void ipath_update_pio_bufs(struct ipath_devdata *);
50
51const char *ipath_get_unit_name(int unit)
52{
53	static char iname[16];
54	snprintf(iname, sizeof iname, "infinipath%u", unit);
55	return iname;
56}
57
58#define DRIVER_LOAD_MSG "QLogic " IPATH_DRV_NAME " loaded: "
59#define PFX IPATH_DRV_NAME ": "
60
61/*
62 * The size has to be longer than this string, so we can append
63 * board/chip information to it in the init code.
64 */
65const char ib_ipath_version[] = IPATH_IDSTR "\n";
66
67static struct idr unit_table;
68DEFINE_SPINLOCK(ipath_devs_lock);
69LIST_HEAD(ipath_dev_list);
70
71wait_queue_head_t ipath_state_wait;
72
73unsigned ipath_debug = __IPATH_INFO;
74
75module_param_named(debug, ipath_debug, uint, S_IWUSR | S_IRUGO);
76MODULE_PARM_DESC(debug, "mask for debug prints");
77EXPORT_SYMBOL_GPL(ipath_debug);
78
79unsigned ipath_mtu4096 = 1; /* max 4KB IB mtu by default, if supported */
80module_param_named(mtu4096, ipath_mtu4096, uint, S_IRUGO);
81MODULE_PARM_DESC(mtu4096, "enable MTU of 4096 bytes, if supported");
82
83static unsigned ipath_hol_timeout_ms = 13000;
84module_param_named(hol_timeout_ms, ipath_hol_timeout_ms, uint, S_IRUGO);
85MODULE_PARM_DESC(hol_timeout_ms,
86	"duration of user app suspension after link failure");
87
88unsigned ipath_linkrecovery = 1;
89module_param_named(linkrecovery, ipath_linkrecovery, uint, S_IWUSR | S_IRUGO);
90MODULE_PARM_DESC(linkrecovery, "enable workaround for link recovery issue");
91
92MODULE_LICENSE("GPL");
93MODULE_AUTHOR("QLogic <support@qlogic.com>");
94MODULE_DESCRIPTION("QLogic InfiniPath driver");
95
96/*
97 * Table to translate the LINKTRAININGSTATE portion of
98 * IBCStatus to a human-readable form.
99 */
100const char *ipath_ibcstatus_str[] = {
101	"Disabled",
102	"LinkUp",
103	"PollActive",
104	"PollQuiet",
105	"SleepDelay",
106	"SleepQuiet",
107	"LState6",		/* unused */
108	"LState7",		/* unused */
109	"CfgDebounce",
110	"CfgRcvfCfg",
111	"CfgWaitRmt",
112	"CfgIdle",
113	"RecovRetrain",
114	"CfgTxRevLane",		/* unused before IBA7220 */
115	"RecovWaitRmt",
116	"RecovIdle",
117	/* below were added for IBA7220 */
118	"CfgEnhanced",
119	"CfgTest",
120	"CfgWaitRmtTest",
121	"CfgWaitCfgEnhanced",
122	"SendTS_T",
123	"SendTstIdles",
124	"RcvTS_T",
125	"SendTst_TS1s",
126	"LTState18", "LTState19", "LTState1A", "LTState1B",
127	"LTState1C", "LTState1D", "LTState1E", "LTState1F"
128};
129
130static void ipath_remove_one(struct pci_dev *);
131static int ipath_init_one(struct pci_dev *, const struct pci_device_id *);
132
133/* Only needed for registration, nothing else needs this info */
134#define PCI_VENDOR_ID_PATHSCALE 0x1fc1
135#define PCI_DEVICE_ID_INFINIPATH_HT 0xd
136
137/* Number of seconds before our card status check...  */
138#define STATUS_TIMEOUT 60
139
140static const struct pci_device_id ipath_pci_tbl[] = {
141	{ PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_INFINIPATH_HT) },
142	{ 0, }
143};
144
145MODULE_DEVICE_TABLE(pci, ipath_pci_tbl);
146
147static struct pci_driver ipath_driver = {
148	.name = IPATH_DRV_NAME,
149	.probe = ipath_init_one,
150	.remove = ipath_remove_one,
151	.id_table = ipath_pci_tbl,
152	.driver = {
153		.groups = ipath_driver_attr_groups,
154	},
155};
156
157static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev,
158			     u32 *bar0, u32 *bar1)
159{
160	int ret;
161
162	ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_0, bar0);
163	if (ret)
164		ipath_dev_err(dd, "failed to read bar0 before enable: "
165			      "error %d\n", -ret);
166
167	ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, bar1);
168	if (ret)
169		ipath_dev_err(dd, "failed to read bar1 before enable: "
170			      "error %d\n", -ret);
171
172	ipath_dbg("Read bar0 %x bar1 %x\n", *bar0, *bar1);
173}
174
175static void ipath_free_devdata(struct pci_dev *pdev,
176			       struct ipath_devdata *dd)
177{
178	unsigned long flags;
179
180	pci_set_drvdata(pdev, NULL);
181
182	if (dd->ipath_unit != -1) {
183		spin_lock_irqsave(&ipath_devs_lock, flags);
184		idr_remove(&unit_table, dd->ipath_unit);
185		list_del(&dd->ipath_list);
186		spin_unlock_irqrestore(&ipath_devs_lock, flags);
187	}
188	vfree(dd);
189}
190
191static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev)
192{
193	unsigned long flags;
194	struct ipath_devdata *dd;
195	int ret;
196
197	dd = vzalloc(sizeof(*dd));
198	if (!dd) {
199		dd = ERR_PTR(-ENOMEM);
200		goto bail;
201	}
202	dd->ipath_unit = -1;
203
204	idr_preload(GFP_KERNEL);
205	spin_lock_irqsave(&ipath_devs_lock, flags);
206
207	ret = idr_alloc(&unit_table, dd, 0, 0, GFP_NOWAIT);
208	if (ret < 0) {
209		printk(KERN_ERR IPATH_DRV_NAME
210		       ": Could not allocate unit ID: error %d\n", -ret);
211		ipath_free_devdata(pdev, dd);
212		dd = ERR_PTR(ret);
213		goto bail_unlock;
214	}
215	dd->ipath_unit = ret;
216
217	dd->pcidev = pdev;
218	pci_set_drvdata(pdev, dd);
219
220	list_add(&dd->ipath_list, &ipath_dev_list);
221
222bail_unlock:
223	spin_unlock_irqrestore(&ipath_devs_lock, flags);
224	idr_preload_end();
225bail:
226	return dd;
227}
228
229static inline struct ipath_devdata *__ipath_lookup(int unit)
230{
231	return idr_find(&unit_table, unit);
232}
233
234struct ipath_devdata *ipath_lookup(int unit)
235{
236	struct ipath_devdata *dd;
237	unsigned long flags;
238
239	spin_lock_irqsave(&ipath_devs_lock, flags);
240	dd = __ipath_lookup(unit);
241	spin_unlock_irqrestore(&ipath_devs_lock, flags);
242
243	return dd;
244}
245
246int ipath_count_units(int *npresentp, int *nupp, int *maxportsp)
247{
248	int nunits, npresent, nup;
249	struct ipath_devdata *dd;
250	unsigned long flags;
251	int maxports;
252
253	nunits = npresent = nup = maxports = 0;
254
255	spin_lock_irqsave(&ipath_devs_lock, flags);
256
257	list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
258		nunits++;
259		if ((dd->ipath_flags & IPATH_PRESENT) && dd->ipath_kregbase)
260			npresent++;
261		if (dd->ipath_lid &&
262		    !(dd->ipath_flags & (IPATH_DISABLED | IPATH_LINKDOWN
263					 | IPATH_LINKUNK)))
264			nup++;
265		if (dd->ipath_cfgports > maxports)
266			maxports = dd->ipath_cfgports;
267	}
268
269	spin_unlock_irqrestore(&ipath_devs_lock, flags);
270
271	if (npresentp)
272		*npresentp = npresent;
273	if (nupp)
274		*nupp = nup;
275	if (maxportsp)
276		*maxportsp = maxports;
277
278	return nunits;
279}
280
281/*
282 * These next two routines are placeholders in case we don't have per-arch
283 * code for controlling write combining.  If explicit control of write
284 * combining is not available, performance will probably be awful.
285 */
286
287int __attribute__((weak)) ipath_enable_wc(struct ipath_devdata *dd)
288{
289	return -EOPNOTSUPP;
290}
291
292void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd)
293{
294}
295
296/*
297 * Perform a PIO buffer bandwidth write test, to verify proper system
298 * configuration.  Even when all the setup calls work, occasionally
299 * BIOS or other issues can prevent write combining from working, or
300 * can cause other bandwidth problems to the chip.
301 *
302 * This test simply writes the same buffer over and over again, and
303 * measures close to the peak bandwidth to the chip (not testing
304 * data bandwidth to the wire).   On chips that use an address-based
305 * trigger to send packets to the wire, this is easy.  On chips that
306 * use a count to trigger, we want to make sure that the packet doesn't
307 * go out on the wire, or trigger flow control checks.
308 */
309static void ipath_verify_pioperf(struct ipath_devdata *dd)
310{
311	u32 pbnum, cnt, lcnt;
312	u32 __iomem *piobuf;
313	u32 *addr;
314	u64 msecs, emsecs;
315
316	piobuf = ipath_getpiobuf(dd, 0, &pbnum);
317	if (!piobuf) {
318		dev_info(&dd->pcidev->dev,
319			"No PIObufs for checking perf, skipping\n");
320		return;
321	}
322
323	/*
324	 * Enough to give us a reasonable test, less than piobuf size, and
325	 * likely multiple of store buffer length.
326	 */
327	cnt = 1024;
328
329	addr = vmalloc(cnt);
330	if (!addr) {
331		dev_info(&dd->pcidev->dev,
332			"Couldn't get memory for checking PIO perf,"
333			" skipping\n");
334		goto done;
335	}
336
337	preempt_disable();  /* we want reasonably accurate elapsed time */
338	msecs = 1 + jiffies_to_msecs(jiffies);
339	for (lcnt = 0; lcnt < 10000U; lcnt++) {
340		/* wait until we cross msec boundary */
341		if (jiffies_to_msecs(jiffies) >= msecs)
342			break;
343		udelay(1);
344	}
345
346	ipath_disable_armlaunch(dd);
347
348	/*
349	 * length 0, no dwords actually sent, and mark as VL15
350	 * on chips where that may matter (due to IB flowcontrol)
351	 */
352	if ((dd->ipath_flags & IPATH_HAS_PBC_CNT))
353		writeq(1UL << 63, piobuf);
354	else
355		writeq(0, piobuf);
356	ipath_flush_wc();
357
358	/*
359	 * this is only roughly accurate, since even with preempt we
360	 * still take interrupts that could take a while.   Running for
361	 * >= 5 msec seems to get us "close enough" to accurate values
362	 */
363	msecs = jiffies_to_msecs(jiffies);
364	for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) {
365		__iowrite32_copy(piobuf + 64, addr, cnt >> 2);
366		emsecs = jiffies_to_msecs(jiffies) - msecs;
367	}
368
369	/* 1 GiB/sec, slightly over IB SDR line rate */
370	if (lcnt < (emsecs * 1024U))
371		ipath_dev_err(dd,
372			"Performance problem: bandwidth to PIO buffers is "
373			"only %u MiB/sec\n",
374			lcnt / (u32) emsecs);
375	else
376		ipath_dbg("PIO buffer bandwidth %u MiB/sec is OK\n",
377			lcnt / (u32) emsecs);
378
379	preempt_enable();
380
381	vfree(addr);
382
383done:
384	/* disarm piobuf, so it's available again */
385	ipath_disarm_piobufs(dd, pbnum, 1);
386	ipath_enable_armlaunch(dd);
387}
388
389static void cleanup_device(struct ipath_devdata *dd);
390
391static int ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
392{
393	int ret, len, j;
394	struct ipath_devdata *dd;
395	unsigned long long addr;
396	u32 bar0 = 0, bar1 = 0;
397
398	dd = ipath_alloc_devdata(pdev);
399	if (IS_ERR(dd)) {
400		ret = PTR_ERR(dd);
401		printk(KERN_ERR IPATH_DRV_NAME
402		       ": Could not allocate devdata: error %d\n", -ret);
403		goto bail;
404	}
405
406	ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit);
407
408	ret = pci_enable_device(pdev);
409	if (ret) {
410		/* This can happen iff:
411		 *
412		 * We did a chip reset, and then failed to reprogram the
413		 * BAR, or the chip reset due to an internal error.  We then
414		 * unloaded the driver and reloaded it.
415		 *
416		 * Both reset cases set the BAR back to initial state.  For
417		 * the latter case, the AER sticky error bit at offset 0x718
418		 * should be set, but the Linux kernel doesn't yet know
419		 * about that, it appears.  If the original BAR was retained
420		 * in the kernel data structures, this may be OK.
421		 */
422		ipath_dev_err(dd, "enable unit %d failed: error %d\n",
423			      dd->ipath_unit, -ret);
424		goto bail_devdata;
425	}
426	addr = pci_resource_start(pdev, 0);
427	len = pci_resource_len(pdev, 0);
428	ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %d, vend %x/%x "
429		   "driver_data %lx\n", addr, len, pdev->irq, ent->vendor,
430		   ent->device, ent->driver_data);
431
432	read_bars(dd, pdev, &bar0, &bar1);
433
434	if (!bar1 && !(bar0 & ~0xf)) {
435		if (addr) {
436			dev_info(&pdev->dev, "BAR is 0 (probable RESET), "
437				 "rewriting as %llx\n", addr);
438			ret = pci_write_config_dword(
439				pdev, PCI_BASE_ADDRESS_0, addr);
440			if (ret) {
441				ipath_dev_err(dd, "rewrite of BAR0 "
442					      "failed: err %d\n", -ret);
443				goto bail_disable;
444			}
445			ret = pci_write_config_dword(
446				pdev, PCI_BASE_ADDRESS_1, addr >> 32);
447			if (ret) {
448				ipath_dev_err(dd, "rewrite of BAR1 "
449					      "failed: err %d\n", -ret);
450				goto bail_disable;
451			}
452		} else {
453			ipath_dev_err(dd, "BAR is 0 (probable RESET), "
454				      "not usable until reboot\n");
455			ret = -ENODEV;
456			goto bail_disable;
457		}
458	}
459
460	ret = pci_request_regions(pdev, IPATH_DRV_NAME);
461	if (ret) {
462		dev_info(&pdev->dev, "pci_request_regions unit %u fails: "
463			 "err %d\n", dd->ipath_unit, -ret);
464		goto bail_disable;
465	}
466
467	ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
468	if (ret) {
469		/*
470		 * if the 64 bit setup fails, try 32 bit.  Some systems
471		 * do not setup 64 bit maps on systems with 2GB or less
472		 * memory installed.
473		 */
474		ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
475		if (ret) {
476			dev_info(&pdev->dev,
477				"Unable to set DMA mask for unit %u: %d\n",
478				dd->ipath_unit, ret);
479			goto bail_regions;
480		}
481		else {
482			ipath_dbg("No 64bit DMA mask, used 32 bit mask\n");
483			ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
484			if (ret)
485				dev_info(&pdev->dev,
486					"Unable to set DMA consistent mask "
487					"for unit %u: %d\n",
488					dd->ipath_unit, ret);
489
490		}
491	}
492	else {
493		ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
494		if (ret)
495			dev_info(&pdev->dev,
496				"Unable to set DMA consistent mask "
497				"for unit %u: %d\n",
498				dd->ipath_unit, ret);
499	}
500
501	pci_set_master(pdev);
502
503	/*
504	 * Save BARs to rewrite after device reset.  Save all 64 bits of
505	 * BAR, just in case.
506	 */
507	dd->ipath_pcibar0 = addr;
508	dd->ipath_pcibar1 = addr >> 32;
509	dd->ipath_deviceid = ent->device;	/* save for later use */
510	dd->ipath_vendorid = ent->vendor;
511
512	/* setup the chip-specific functions, as early as possible. */
513	switch (ent->device) {
514	case PCI_DEVICE_ID_INFINIPATH_HT:
515		ipath_init_iba6110_funcs(dd);
516		break;
517
518	default:
519		ipath_dev_err(dd, "Found unknown QLogic deviceid 0x%x, "
520			      "failing\n", ent->device);
521		return -ENODEV;
522	}
523
524	for (j = 0; j < 6; j++) {
525		if (!pdev->resource[j].start)
526			continue;
527		ipath_cdbg(VERBOSE, "BAR %d %pR, len %llx\n",
528			   j, &pdev->resource[j],
529			   (unsigned long long)pci_resource_len(pdev, j));
530	}
531
532	if (!addr) {
533		ipath_dev_err(dd, "No valid address in BAR 0!\n");
534		ret = -ENODEV;
535		goto bail_regions;
536	}
537
538	dd->ipath_pcirev = pdev->revision;
539
540#if defined(__powerpc__)
541	/* There isn't a generic way to specify writethrough mappings */
542	dd->ipath_kregbase = __ioremap(addr, len,
543		(_PAGE_NO_CACHE|_PAGE_WRITETHRU));
544#else
545	dd->ipath_kregbase = ioremap_nocache(addr, len);
546#endif
547
548	if (!dd->ipath_kregbase) {
549		ipath_dbg("Unable to map io addr %llx to kvirt, failing\n",
550			  addr);
551		ret = -ENOMEM;
552		goto bail_iounmap;
553	}
554	dd->ipath_kregend = (u64 __iomem *)
555		((void __iomem *)dd->ipath_kregbase + len);
556	dd->ipath_physaddr = addr;	/* used for io_remap, etc. */
557	/* for user mmap */
558	ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p\n",
559		   addr, dd->ipath_kregbase);
560
561	if (dd->ipath_f_bus(dd, pdev))
562		ipath_dev_err(dd, "Failed to setup config space; "
563			      "continuing anyway\n");
564
565	/*
566	 * set up our interrupt handler; IRQF_SHARED probably not needed,
567	 * since MSI interrupts shouldn't be shared but won't  hurt for now.
568	 * check 0 irq after we return from chip-specific bus setup, since
569	 * that can affect this due to setup
570	 */
571	if (!dd->ipath_irq)
572		ipath_dev_err(dd, "irq is 0, BIOS error?  Interrupts won't "
573			      "work\n");
574	else {
575		ret = request_irq(dd->ipath_irq, ipath_intr, IRQF_SHARED,
576				  IPATH_DRV_NAME, dd);
577		if (ret) {
578			ipath_dev_err(dd, "Couldn't setup irq handler, "
579				      "irq=%d: %d\n", dd->ipath_irq, ret);
580			goto bail_iounmap;
581		}
582	}
583
584	ret = ipath_init_chip(dd, 0);	/* do the chip-specific init */
585	if (ret)
586		goto bail_irqsetup;
587
588	ret = ipath_enable_wc(dd);
589
590	if (ret) {
591		ipath_dev_err(dd, "Write combining not enabled "
592			      "(err %d): performance may be poor\n",
593			      -ret);
594		ret = 0;
595	}
596
597	ipath_verify_pioperf(dd);
598
599	ipath_device_create_group(&pdev->dev, dd);
600	ipathfs_add_device(dd);
601	ipath_user_add(dd);
602	ipath_diag_add(dd);
603	ipath_register_ib_device(dd);
604
605	goto bail;
606
607bail_irqsetup:
608	cleanup_device(dd);
609
610	if (dd->ipath_irq)
611		dd->ipath_f_free_irq(dd);
612
613	if (dd->ipath_f_cleanup)
614		dd->ipath_f_cleanup(dd);
615
616bail_iounmap:
617	iounmap((volatile void __iomem *) dd->ipath_kregbase);
618
619bail_regions:
620	pci_release_regions(pdev);
621
622bail_disable:
623	pci_disable_device(pdev);
624
625bail_devdata:
626	ipath_free_devdata(pdev, dd);
627
628bail:
629	return ret;
630}
631
632static void cleanup_device(struct ipath_devdata *dd)
633{
634	int port;
635	struct ipath_portdata **tmp;
636	unsigned long flags;
637
638	if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) {
639		/* can't do anything more with chip; needs re-init */
640		*dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT;
641		if (dd->ipath_kregbase) {
642			/*
643			 * if we haven't already cleaned up before these are
644			 * to ensure any register reads/writes "fail" until
645			 * re-init
646			 */
647			dd->ipath_kregbase = NULL;
648			dd->ipath_uregbase = 0;
649			dd->ipath_sregbase = 0;
650			dd->ipath_cregbase = 0;
651			dd->ipath_kregsize = 0;
652		}
653		ipath_disable_wc(dd);
654	}
655
656	if (dd->ipath_spectriggerhit)
657		dev_info(&dd->pcidev->dev, "%lu special trigger hits\n",
658			 dd->ipath_spectriggerhit);
659
660	if (dd->ipath_pioavailregs_dma) {
661		dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
662				  (void *) dd->ipath_pioavailregs_dma,
663				  dd->ipath_pioavailregs_phys);
664		dd->ipath_pioavailregs_dma = NULL;
665	}
666	if (dd->ipath_dummy_hdrq) {
667		dma_free_coherent(&dd->pcidev->dev,
668			dd->ipath_pd[0]->port_rcvhdrq_size,
669			dd->ipath_dummy_hdrq, dd->ipath_dummy_hdrq_phys);
670		dd->ipath_dummy_hdrq = NULL;
671	}
672
673	if (dd->ipath_pageshadow) {
674		struct page **tmpp = dd->ipath_pageshadow;
675		dma_addr_t *tmpd = dd->ipath_physshadow;
676		int i, cnt = 0;
677
678		ipath_cdbg(VERBOSE, "Unlocking any expTID pages still "
679			   "locked\n");
680		for (port = 0; port < dd->ipath_cfgports; port++) {
681			int port_tidbase = port * dd->ipath_rcvtidcnt;
682			int maxtid = port_tidbase + dd->ipath_rcvtidcnt;
683			for (i = port_tidbase; i < maxtid; i++) {
684				if (!tmpp[i])
685					continue;
686				pci_unmap_page(dd->pcidev, tmpd[i],
687					PAGE_SIZE, PCI_DMA_FROMDEVICE);
688				ipath_release_user_pages(&tmpp[i], 1);
689				tmpp[i] = NULL;
690				cnt++;
691			}
692		}
693		if (cnt) {
694			ipath_stats.sps_pageunlocks += cnt;
695			ipath_cdbg(VERBOSE, "There were still %u expTID "
696				   "entries locked\n", cnt);
697		}
698		if (ipath_stats.sps_pagelocks ||
699		    ipath_stats.sps_pageunlocks)
700			ipath_cdbg(VERBOSE, "%llu pages locked, %llu "
701				   "unlocked via ipath_m{un}lock\n",
702				   (unsigned long long)
703				   ipath_stats.sps_pagelocks,
704				   (unsigned long long)
705				   ipath_stats.sps_pageunlocks);
706
707		ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n",
708			   dd->ipath_pageshadow);
709		tmpp = dd->ipath_pageshadow;
710		dd->ipath_pageshadow = NULL;
711		vfree(tmpp);
712
713		dd->ipath_egrtidbase = NULL;
714	}
715
716	/*
717	 * free any resources still in use (usually just kernel ports)
718	 * at unload; we do for portcnt, because that's what we allocate.
719	 * We acquire lock to be really paranoid that ipath_pd isn't being
720	 * accessed from some interrupt-related code (that should not happen,
721	 * but best to be sure).
722	 */
723	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
724	tmp = dd->ipath_pd;
725	dd->ipath_pd = NULL;
726	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
727	for (port = 0; port < dd->ipath_portcnt; port++) {
728		struct ipath_portdata *pd = tmp[port];
729		tmp[port] = NULL; /* debugging paranoia */
730		ipath_free_pddata(dd, pd);
731	}
732	kfree(tmp);
733}
734
735static void ipath_remove_one(struct pci_dev *pdev)
736{
737	struct ipath_devdata *dd = pci_get_drvdata(pdev);
738
739	ipath_cdbg(VERBOSE, "removing, pdev=%p, dd=%p\n", pdev, dd);
740
741	/*
742	 * disable the IB link early, to be sure no new packets arrive, which
743	 * complicates the shutdown process
744	 */
745	ipath_shutdown_device(dd);
746
747	flush_workqueue(ib_wq);
748
749	if (dd->verbs_dev)
750		ipath_unregister_ib_device(dd->verbs_dev);
751
752	ipath_diag_remove(dd);
753	ipath_user_remove(dd);
754	ipathfs_remove_device(dd);
755	ipath_device_remove_group(&pdev->dev, dd);
756
757	ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, "
758		   "unit %u\n", dd, (u32) dd->ipath_unit);
759
760	cleanup_device(dd);
761
762	/*
763	 * turn off rcv, send, and interrupts for all ports, all drivers
764	 * should also hard reset the chip here?
765	 * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs
766	 * for all versions of the driver, if they were allocated
767	 */
768	if (dd->ipath_irq) {
769		ipath_cdbg(VERBOSE, "unit %u free irq %d\n",
770			   dd->ipath_unit, dd->ipath_irq);
771		dd->ipath_f_free_irq(dd);
772	} else
773		ipath_dbg("irq is 0, not doing free_irq "
774			  "for unit %u\n", dd->ipath_unit);
775	/*
776	 * we check for NULL here, because it's outside
777	 * the kregbase check, and we need to call it
778	 * after the free_irq.	Thus it's possible that
779	 * the function pointers were never initialized.
780	 */
781	if (dd->ipath_f_cleanup)
782		/* clean up chip-specific stuff */
783		dd->ipath_f_cleanup(dd);
784
785	ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", dd->ipath_kregbase);
786	iounmap((volatile void __iomem *) dd->ipath_kregbase);
787	pci_release_regions(pdev);
788	ipath_cdbg(VERBOSE, "calling pci_disable_device\n");
789	pci_disable_device(pdev);
790
791	ipath_free_devdata(pdev, dd);
792}
793
794/* general driver use */
795DEFINE_MUTEX(ipath_mutex);
796
797static DEFINE_SPINLOCK(ipath_pioavail_lock);
798
799/**
800 * ipath_disarm_piobufs - cancel a range of PIO buffers
801 * @dd: the infinipath device
802 * @first: the first PIO buffer to cancel
803 * @cnt: the number of PIO buffers to cancel
804 *
805 * cancel a range of PIO buffers, used when they might be armed, but
806 * not triggered.  Used at init to ensure buffer state, and also user
807 * process close, in case it died while writing to a PIO buffer
808 * Also after errors.
809 */
810void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first,
811			  unsigned cnt)
812{
813	unsigned i, last = first + cnt;
814	unsigned long flags;
815
816	ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first);
817	for (i = first; i < last; i++) {
818		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
819		/*
820		 * The disarm-related bits are write-only, so it
821		 * is ok to OR them in with our copy of sendctrl
822		 * while we hold the lock.
823		 */
824		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
825			dd->ipath_sendctrl | INFINIPATH_S_DISARM |
826			(i << INFINIPATH_S_DISARMPIOBUF_SHIFT));
827		/* can't disarm bufs back-to-back per iba7220 spec */
828		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
829		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
830	}
831	/* on some older chips, update may not happen after cancel */
832	ipath_force_pio_avail_update(dd);
833}
834
835/**
836 * ipath_wait_linkstate - wait for an IB link state change to occur
837 * @dd: the infinipath device
838 * @state: the state to wait for
839 * @msecs: the number of milliseconds to wait
840 *
841 * wait up to msecs milliseconds for IB link state change to occur for
842 * now, take the easy polling route.  Currently used only by
843 * ipath_set_linkstate.  Returns 0 if state reached, otherwise
844 * -ETIMEDOUT state can have multiple states set, for any of several
845 * transitions.
846 */
847int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, int msecs)
848{
849	dd->ipath_state_wanted = state;
850	wait_event_interruptible_timeout(ipath_state_wait,
851					 (dd->ipath_flags & state),
852					 msecs_to_jiffies(msecs));
853	dd->ipath_state_wanted = 0;
854
855	if (!(dd->ipath_flags & state)) {
856		u64 val;
857		ipath_cdbg(VERBOSE, "Didn't reach linkstate %s within %u"
858			   " ms\n",
859			   /* test INIT ahead of DOWN, both can be set */
860			   (state & IPATH_LINKINIT) ? "INIT" :
861			   ((state & IPATH_LINKDOWN) ? "DOWN" :
862			    ((state & IPATH_LINKARMED) ? "ARM" : "ACTIVE")),
863			   msecs);
864		val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
865		ipath_cdbg(VERBOSE, "ibcc=%llx ibcstatus=%llx (%s)\n",
866			   (unsigned long long) ipath_read_kreg64(
867				   dd, dd->ipath_kregs->kr_ibcctrl),
868			   (unsigned long long) val,
869			   ipath_ibcstatus_str[val & dd->ibcs_lts_mask]);
870	}
871	return (dd->ipath_flags & state) ? 0 : -ETIMEDOUT;
872}
873
874static void decode_sdma_errs(struct ipath_devdata *dd, ipath_err_t err,
875	char *buf, size_t blen)
876{
877	static const struct {
878		ipath_err_t err;
879		const char *msg;
880	} errs[] = {
881		{ INFINIPATH_E_SDMAGENMISMATCH, "SDmaGenMismatch" },
882		{ INFINIPATH_E_SDMAOUTOFBOUND, "SDmaOutOfBound" },
883		{ INFINIPATH_E_SDMATAILOUTOFBOUND, "SDmaTailOutOfBound" },
884		{ INFINIPATH_E_SDMABASE, "SDmaBase" },
885		{ INFINIPATH_E_SDMA1STDESC, "SDma1stDesc" },
886		{ INFINIPATH_E_SDMARPYTAG, "SDmaRpyTag" },
887		{ INFINIPATH_E_SDMADWEN, "SDmaDwEn" },
888		{ INFINIPATH_E_SDMAMISSINGDW, "SDmaMissingDw" },
889		{ INFINIPATH_E_SDMAUNEXPDATA, "SDmaUnexpData" },
890		{ INFINIPATH_E_SDMADESCADDRMISALIGN, "SDmaDescAddrMisalign" },
891		{ INFINIPATH_E_SENDBUFMISUSE, "SendBufMisuse" },
892		{ INFINIPATH_E_SDMADISABLED, "SDmaDisabled" },
893	};
894	int i;
895	int expected;
896	size_t bidx = 0;
897
898	for (i = 0; i < ARRAY_SIZE(errs); i++) {
899		expected = (errs[i].err != INFINIPATH_E_SDMADISABLED) ? 0 :
900			test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
901		if ((err & errs[i].err) && !expected)
902			bidx += snprintf(buf + bidx, blen - bidx,
903					 "%s ", errs[i].msg);
904	}
905}
906
907/*
908 * Decode the error status into strings, deciding whether to always
909 * print * it or not depending on "normal packet errors" vs everything
910 * else.   Return 1 if "real" errors, otherwise 0 if only packet
911 * errors, so caller can decide what to print with the string.
912 */
913int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen,
914	ipath_err_t err)
915{
916	int iserr = 1;
917	*buf = '\0';
918	if (err & INFINIPATH_E_PKTERRS) {
919		if (!(err & ~INFINIPATH_E_PKTERRS))
920			iserr = 0; // if only packet errors.
921		if (ipath_debug & __IPATH_ERRPKTDBG) {
922			if (err & INFINIPATH_E_REBP)
923				strlcat(buf, "EBP ", blen);
924			if (err & INFINIPATH_E_RVCRC)
925				strlcat(buf, "VCRC ", blen);
926			if (err & INFINIPATH_E_RICRC) {
927				strlcat(buf, "CRC ", blen);
928				// clear for check below, so only once
929				err &= INFINIPATH_E_RICRC;
930			}
931			if (err & INFINIPATH_E_RSHORTPKTLEN)
932				strlcat(buf, "rshortpktlen ", blen);
933			if (err & INFINIPATH_E_SDROPPEDDATAPKT)
934				strlcat(buf, "sdroppeddatapkt ", blen);
935			if (err & INFINIPATH_E_SPKTLEN)
936				strlcat(buf, "spktlen ", blen);
937		}
938		if ((err & INFINIPATH_E_RICRC) &&
939			!(err&(INFINIPATH_E_RVCRC|INFINIPATH_E_REBP)))
940			strlcat(buf, "CRC ", blen);
941		if (!iserr)
942			goto done;
943	}
944	if (err & INFINIPATH_E_RHDRLEN)
945		strlcat(buf, "rhdrlen ", blen);
946	if (err & INFINIPATH_E_RBADTID)
947		strlcat(buf, "rbadtid ", blen);
948	if (err & INFINIPATH_E_RBADVERSION)
949		strlcat(buf, "rbadversion ", blen);
950	if (err & INFINIPATH_E_RHDR)
951		strlcat(buf, "rhdr ", blen);
952	if (err & INFINIPATH_E_SENDSPECIALTRIGGER)
953		strlcat(buf, "sendspecialtrigger ", blen);
954	if (err & INFINIPATH_E_RLONGPKTLEN)
955		strlcat(buf, "rlongpktlen ", blen);
956	if (err & INFINIPATH_E_RMAXPKTLEN)
957		strlcat(buf, "rmaxpktlen ", blen);
958	if (err & INFINIPATH_E_RMINPKTLEN)
959		strlcat(buf, "rminpktlen ", blen);
960	if (err & INFINIPATH_E_SMINPKTLEN)
961		strlcat(buf, "sminpktlen ", blen);
962	if (err & INFINIPATH_E_RFORMATERR)
963		strlcat(buf, "rformaterr ", blen);
964	if (err & INFINIPATH_E_RUNSUPVL)
965		strlcat(buf, "runsupvl ", blen);
966	if (err & INFINIPATH_E_RUNEXPCHAR)
967		strlcat(buf, "runexpchar ", blen);
968	if (err & INFINIPATH_E_RIBFLOW)
969		strlcat(buf, "ribflow ", blen);
970	if (err & INFINIPATH_E_SUNDERRUN)
971		strlcat(buf, "sunderrun ", blen);
972	if (err & INFINIPATH_E_SPIOARMLAUNCH)
973		strlcat(buf, "spioarmlaunch ", blen);
974	if (err & INFINIPATH_E_SUNEXPERRPKTNUM)
975		strlcat(buf, "sunexperrpktnum ", blen);
976	if (err & INFINIPATH_E_SDROPPEDSMPPKT)
977		strlcat(buf, "sdroppedsmppkt ", blen);
978	if (err & INFINIPATH_E_SMAXPKTLEN)
979		strlcat(buf, "smaxpktlen ", blen);
980	if (err & INFINIPATH_E_SUNSUPVL)
981		strlcat(buf, "sunsupVL ", blen);
982	if (err & INFINIPATH_E_INVALIDADDR)
983		strlcat(buf, "invalidaddr ", blen);
984	if (err & INFINIPATH_E_RRCVEGRFULL)
985		strlcat(buf, "rcvegrfull ", blen);
986	if (err & INFINIPATH_E_RRCVHDRFULL)
987		strlcat(buf, "rcvhdrfull ", blen);
988	if (err & INFINIPATH_E_IBSTATUSCHANGED)
989		strlcat(buf, "ibcstatuschg ", blen);
990	if (err & INFINIPATH_E_RIBLOSTLINK)
991		strlcat(buf, "riblostlink ", blen);
992	if (err & INFINIPATH_E_HARDWARE)
993		strlcat(buf, "hardware ", blen);
994	if (err & INFINIPATH_E_RESET)
995		strlcat(buf, "reset ", blen);
996	if (err & INFINIPATH_E_SDMAERRS)
997		decode_sdma_errs(dd, err, buf, blen);
998	if (err & INFINIPATH_E_INVALIDEEPCMD)
999		strlcat(buf, "invalideepromcmd ", blen);
1000done:
1001	return iserr;
1002}
1003
1004/**
1005 * get_rhf_errstring - decode RHF errors
1006 * @err: the err number
1007 * @msg: the output buffer
1008 * @len: the length of the output buffer
1009 *
1010 * only used one place now, may want more later
1011 */
1012static void get_rhf_errstring(u32 err, char *msg, size_t len)
1013{
1014	/* if no errors, and so don't need to check what's first */
1015	*msg = '\0';
1016
1017	if (err & INFINIPATH_RHF_H_ICRCERR)
1018		strlcat(msg, "icrcerr ", len);
1019	if (err & INFINIPATH_RHF_H_VCRCERR)
1020		strlcat(msg, "vcrcerr ", len);
1021	if (err & INFINIPATH_RHF_H_PARITYERR)
1022		strlcat(msg, "parityerr ", len);
1023	if (err & INFINIPATH_RHF_H_LENERR)
1024		strlcat(msg, "lenerr ", len);
1025	if (err & INFINIPATH_RHF_H_MTUERR)
1026		strlcat(msg, "mtuerr ", len);
1027	if (err & INFINIPATH_RHF_H_IHDRERR)
1028		/* infinipath hdr checksum error */
1029		strlcat(msg, "ipathhdrerr ", len);
1030	if (err & INFINIPATH_RHF_H_TIDERR)
1031		strlcat(msg, "tiderr ", len);
1032	if (err & INFINIPATH_RHF_H_MKERR)
1033		/* bad port, offset, etc. */
1034		strlcat(msg, "invalid ipathhdr ", len);
1035	if (err & INFINIPATH_RHF_H_IBERR)
1036		strlcat(msg, "iberr ", len);
1037	if (err & INFINIPATH_RHF_L_SWA)
1038		strlcat(msg, "swA ", len);
1039	if (err & INFINIPATH_RHF_L_SWB)
1040		strlcat(msg, "swB ", len);
1041}
1042
1043/**
1044 * ipath_get_egrbuf - get an eager buffer
1045 * @dd: the infinipath device
1046 * @bufnum: the eager buffer to get
1047 *
1048 * must only be called if ipath_pd[port] is known to be allocated
1049 */
1050static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum)
1051{
1052	return dd->ipath_port0_skbinfo ?
1053		(void *) dd->ipath_port0_skbinfo[bufnum].skb->data : NULL;
1054}
1055
1056/**
1057 * ipath_alloc_skb - allocate an skb and buffer with possible constraints
1058 * @dd: the infinipath device
1059 * @gfp_mask: the sk_buff SFP mask
1060 */
1061struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd,
1062				gfp_t gfp_mask)
1063{
1064	struct sk_buff *skb;
1065	u32 len;
1066
1067	/*
1068	 * Only fully supported way to handle this is to allocate lots
1069	 * extra, align as needed, and then do skb_reserve().  That wastes
1070	 * a lot of memory...  I'll have to hack this into infinipath_copy
1071	 * also.
1072	 */
1073
1074	/*
1075	 * We need 2 extra bytes for ipath_ether data sent in the
1076	 * key header.  In order to keep everything dword aligned,
1077	 * we'll reserve 4 bytes.
1078	 */
1079	len = dd->ipath_ibmaxlen + 4;
1080
1081	if (dd->ipath_flags & IPATH_4BYTE_TID) {
1082		/* We need a 2KB multiple alignment, and there is no way
1083		 * to do it except to allocate extra and then skb_reserve
1084		 * enough to bring it up to the right alignment.
1085		 */
1086		len += 2047;
1087	}
1088
1089	skb = __dev_alloc_skb(len, gfp_mask);
1090	if (!skb) {
1091		ipath_dev_err(dd, "Failed to allocate skbuff, length %u\n",
1092			      len);
1093		goto bail;
1094	}
1095
1096	skb_reserve(skb, 4);
1097
1098	if (dd->ipath_flags & IPATH_4BYTE_TID) {
1099		u32 una = (unsigned long)skb->data & 2047;
1100		if (una)
1101			skb_reserve(skb, 2048 - una);
1102	}
1103
1104bail:
1105	return skb;
1106}
1107
1108static void ipath_rcv_hdrerr(struct ipath_devdata *dd,
1109			     u32 eflags,
1110			     u32 l,
1111			     u32 etail,
1112			     __le32 *rhf_addr,
1113			     struct ipath_message_header *hdr)
1114{
1115	char emsg[128];
1116
1117	get_rhf_errstring(eflags, emsg, sizeof emsg);
1118	ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u "
1119		   "tlen=%x opcode=%x egridx=%x: %s\n",
1120		   eflags, l,
1121		   ipath_hdrget_rcv_type(rhf_addr),
1122		   ipath_hdrget_length_in_bytes(rhf_addr),
1123		   be32_to_cpu(hdr->bth[0]) >> 24,
1124		   etail, emsg);
1125
1126	/* Count local link integrity errors. */
1127	if (eflags & (INFINIPATH_RHF_H_ICRCERR | INFINIPATH_RHF_H_VCRCERR)) {
1128		u8 n = (dd->ipath_ibcctrl >>
1129			INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
1130			INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
1131
1132		if (++dd->ipath_lli_counter > n) {
1133			dd->ipath_lli_counter = 0;
1134			dd->ipath_lli_errors++;
1135		}
1136	}
1137}
1138
1139/*
1140 * ipath_kreceive - receive a packet
1141 * @pd: the infinipath port
1142 *
1143 * called from interrupt handler for errors or receive interrupt
1144 */
1145void ipath_kreceive(struct ipath_portdata *pd)
1146{
1147	struct ipath_devdata *dd = pd->port_dd;
1148	__le32 *rhf_addr;
1149	void *ebuf;
1150	const u32 rsize = dd->ipath_rcvhdrentsize;	/* words */
1151	const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize;	/* words */
1152	u32 etail = -1, l, hdrqtail;
1153	struct ipath_message_header *hdr;
1154	u32 eflags, i, etype, tlen, pkttot = 0, updegr = 0, reloop = 0;
1155	static u64 totcalls;	/* stats, may eventually remove */
1156	int last;
1157
1158	l = pd->port_head;
1159	rhf_addr = (__le32 *) pd->port_rcvhdrq + l + dd->ipath_rhf_offset;
1160	if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
1161		u32 seq = ipath_hdrget_seq(rhf_addr);
1162
1163		if (seq != pd->port_seq_cnt)
1164			goto bail;
1165		hdrqtail = 0;
1166	} else {
1167		hdrqtail = ipath_get_rcvhdrtail(pd);
1168		if (l == hdrqtail)
1169			goto bail;
1170		smp_rmb();
1171	}
1172
1173reloop:
1174	for (last = 0, i = 1; !last; i += !last) {
1175		hdr = dd->ipath_f_get_msgheader(dd, rhf_addr);
1176		eflags = ipath_hdrget_err_flags(rhf_addr);
1177		etype = ipath_hdrget_rcv_type(rhf_addr);
1178		/* total length */
1179		tlen = ipath_hdrget_length_in_bytes(rhf_addr);
1180		ebuf = NULL;
1181		if ((dd->ipath_flags & IPATH_NODMA_RTAIL) ?
1182		    ipath_hdrget_use_egr_buf(rhf_addr) :
1183		    (etype != RCVHQ_RCV_TYPE_EXPECTED)) {
1184			/*
1185			 * It turns out that the chip uses an eager buffer
1186			 * for all non-expected packets, whether it "needs"
1187			 * one or not.  So always get the index, but don't
1188			 * set ebuf (so we try to copy data) unless the
1189			 * length requires it.
1190			 */
1191			etail = ipath_hdrget_index(rhf_addr);
1192			updegr = 1;
1193			if (tlen > sizeof(*hdr) ||
1194			    etype == RCVHQ_RCV_TYPE_NON_KD)
1195				ebuf = ipath_get_egrbuf(dd, etail);
1196		}
1197
1198		/*
1199		 * both tiderr and ipathhdrerr are set for all plain IB
1200		 * packets; only ipathhdrerr should be set.
1201		 */
1202
1203		if (etype != RCVHQ_RCV_TYPE_NON_KD &&
1204		    etype != RCVHQ_RCV_TYPE_ERROR &&
1205		    ipath_hdrget_ipath_ver(hdr->iph.ver_port_tid_offset) !=
1206		    IPS_PROTO_VERSION)
1207			ipath_cdbg(PKT, "Bad InfiniPath protocol version "
1208				   "%x\n", etype);
1209
1210		if (unlikely(eflags))
1211			ipath_rcv_hdrerr(dd, eflags, l, etail, rhf_addr, hdr);
1212		else if (etype == RCVHQ_RCV_TYPE_NON_KD) {
1213			ipath_ib_rcv(dd->verbs_dev, (u32 *)hdr, ebuf, tlen);
1214			if (dd->ipath_lli_counter)
1215				dd->ipath_lli_counter--;
1216		} else if (etype == RCVHQ_RCV_TYPE_EAGER) {
1217			u8 opcode = be32_to_cpu(hdr->bth[0]) >> 24;
1218			u32 qp = be32_to_cpu(hdr->bth[1]) & 0xffffff;
1219			ipath_cdbg(PKT, "typ %x, opcode %x (eager, "
1220				   "qp=%x), len %x; ignored\n",
1221				   etype, opcode, qp, tlen);
1222		}
1223		else if (etype == RCVHQ_RCV_TYPE_EXPECTED)
1224			ipath_dbg("Bug: Expected TID, opcode %x; ignored\n",
1225				  be32_to_cpu(hdr->bth[0]) >> 24);
1226		else {
1227			/*
1228			 * error packet, type of error unknown.
1229			 * Probably type 3, but we don't know, so don't
1230			 * even try to print the opcode, etc.
1231			 * Usually caused by a "bad packet", that has no
1232			 * BTH, when the LRH says it should.
1233			 */
1234			ipath_cdbg(ERRPKT, "Error Pkt, but no eflags! egrbuf"
1235				  " %x, len %x hdrq+%x rhf: %Lx\n",
1236				  etail, tlen, l, (unsigned long long)
1237				  le64_to_cpu(*(__le64 *) rhf_addr));
1238			if (ipath_debug & __IPATH_ERRPKTDBG) {
1239				u32 j, *d, dw = rsize-2;
1240				if (rsize > (tlen>>2))
1241					dw = tlen>>2;
1242				d = (u32 *)hdr;
1243				printk(KERN_DEBUG "EPkt rcvhdr(%x dw):\n",
1244					dw);
1245				for (j = 0; j < dw; j++)
1246					printk(KERN_DEBUG "%8x%s", d[j],
1247						(j%8) == 7 ? "\n" : " ");
1248				printk(KERN_DEBUG ".\n");
1249			}
1250		}
1251		l += rsize;
1252		if (l >= maxcnt)
1253			l = 0;
1254		rhf_addr = (__le32 *) pd->port_rcvhdrq +
1255			l + dd->ipath_rhf_offset;
1256		if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
1257			u32 seq = ipath_hdrget_seq(rhf_addr);
1258
1259			if (++pd->port_seq_cnt > 13)
1260				pd->port_seq_cnt = 1;
1261			if (seq != pd->port_seq_cnt)
1262				last = 1;
1263		} else if (l == hdrqtail)
1264			last = 1;
1265		/*
1266		 * update head regs on last packet, and every 16 packets.
1267		 * Reduce bus traffic, while still trying to prevent
1268		 * rcvhdrq overflows, for when the queue is nearly full
1269		 */
1270		if (last || !(i & 0xf)) {
1271			u64 lval = l;
1272
1273			/* request IBA6120 and 7220 interrupt only on last */
1274			if (last)
1275				lval |= dd->ipath_rhdrhead_intr_off;
1276			ipath_write_ureg(dd, ur_rcvhdrhead, lval,
1277				pd->port_port);
1278			if (updegr) {
1279				ipath_write_ureg(dd, ur_rcvegrindexhead,
1280						 etail, pd->port_port);
1281				updegr = 0;
1282			}
1283		}
1284	}
1285
1286	if (!dd->ipath_rhdrhead_intr_off && !reloop &&
1287	    !(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
1288		/* IBA6110 workaround; we can have a race clearing chip
1289		 * interrupt with another interrupt about to be delivered,
1290		 * and can clear it before it is delivered on the GPIO
1291		 * workaround.  By doing the extra check here for the
1292		 * in-memory tail register updating while we were doing
1293		 * earlier packets, we "almost" guarantee we have covered
1294		 * that case.
1295		 */
1296		u32 hqtail = ipath_get_rcvhdrtail(pd);
1297		if (hqtail != hdrqtail) {
1298			hdrqtail = hqtail;
1299			reloop = 1; /* loop 1 extra time at most */
1300			goto reloop;
1301		}
1302	}
1303
1304	pkttot += i;
1305
1306	pd->port_head = l;
1307
1308	if (pkttot > ipath_stats.sps_maxpkts_call)
1309		ipath_stats.sps_maxpkts_call = pkttot;
1310	ipath_stats.sps_port0pkts += pkttot;
1311	ipath_stats.sps_avgpkts_call =
1312		ipath_stats.sps_port0pkts / ++totcalls;
1313
1314bail:;
1315}
1316
1317/**
1318 * ipath_update_pio_bufs - update shadow copy of the PIO availability map
1319 * @dd: the infinipath device
1320 *
1321 * called whenever our local copy indicates we have run out of send buffers
1322 * NOTE: This can be called from interrupt context by some code
1323 * and from non-interrupt context by ipath_getpiobuf().
1324 */
1325
1326static void ipath_update_pio_bufs(struct ipath_devdata *dd)
1327{
1328	unsigned long flags;
1329	int i;
1330	const unsigned piobregs = (unsigned)dd->ipath_pioavregs;
1331
1332	/* If the generation (check) bits have changed, then we update the
1333	 * busy bit for the corresponding PIO buffer.  This algorithm will
1334	 * modify positions to the value they already have in some cases
1335	 * (i.e., no change), but it's faster than changing only the bits
1336	 * that have changed.
1337	 *
1338	 * We would like to do this atomicly, to avoid spinlocks in the
1339	 * critical send path, but that's not really possible, given the
1340	 * type of changes, and that this routine could be called on
1341	 * multiple cpu's simultaneously, so we lock in this routine only,
1342	 * to avoid conflicting updates; all we change is the shadow, and
1343	 * it's a single 64 bit memory location, so by definition the update
1344	 * is atomic in terms of what other cpu's can see in testing the
1345	 * bits.  The spin_lock overhead isn't too bad, since it only
1346	 * happens when all buffers are in use, so only cpu overhead, not
1347	 * latency or bandwidth is affected.
1348	 */
1349	if (!dd->ipath_pioavailregs_dma) {
1350		ipath_dbg("Update shadow pioavail, but regs_dma NULL!\n");
1351		return;
1352	}
1353	if (ipath_debug & __IPATH_VERBDBG) {
1354		/* only if packet debug and verbose */
1355		volatile __le64 *dma = dd->ipath_pioavailregs_dma;
1356		unsigned long *shadow = dd->ipath_pioavailshadow;
1357
1358		ipath_cdbg(PKT, "Refill avail, dma0=%llx shad0=%lx, "
1359			   "d1=%llx s1=%lx, d2=%llx s2=%lx, d3=%llx "
1360			   "s3=%lx\n",
1361			   (unsigned long long) le64_to_cpu(dma[0]),
1362			   shadow[0],
1363			   (unsigned long long) le64_to_cpu(dma[1]),
1364			   shadow[1],
1365			   (unsigned long long) le64_to_cpu(dma[2]),
1366			   shadow[2],
1367			   (unsigned long long) le64_to_cpu(dma[3]),
1368			   shadow[3]);
1369		if (piobregs > 4)
1370			ipath_cdbg(
1371				PKT, "2nd group, dma4=%llx shad4=%lx, "
1372				"d5=%llx s5=%lx, d6=%llx s6=%lx, "
1373				"d7=%llx s7=%lx\n",
1374				(unsigned long long) le64_to_cpu(dma[4]),
1375				shadow[4],
1376				(unsigned long long) le64_to_cpu(dma[5]),
1377				shadow[5],
1378				(unsigned long long) le64_to_cpu(dma[6]),
1379				shadow[6],
1380				(unsigned long long) le64_to_cpu(dma[7]),
1381				shadow[7]);
1382	}
1383	spin_lock_irqsave(&ipath_pioavail_lock, flags);
1384	for (i = 0; i < piobregs; i++) {
1385		u64 pchbusy, pchg, piov, pnew;
1386		/*
1387		 * Chip Errata: bug 6641; even and odd qwords>3 are swapped
1388		 */
1389		if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
1390			piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i ^ 1]);
1391		else
1392			piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i]);
1393		pchg = dd->ipath_pioavailkernel[i] &
1394			~(dd->ipath_pioavailshadow[i] ^ piov);
1395		pchbusy = pchg << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT;
1396		if (pchg && (pchbusy & dd->ipath_pioavailshadow[i])) {
1397			pnew = dd->ipath_pioavailshadow[i] & ~pchbusy;
1398			pnew |= piov & pchbusy;
1399			dd->ipath_pioavailshadow[i] = pnew;
1400		}
1401	}
1402	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
1403}
1404
1405/*
1406 * used to force update of pioavailshadow if we can't get a pio buffer.
1407 * Needed primarily due to exitting freeze mode after recovering
1408 * from errors.  Done lazily, because it's safer (known to not
1409 * be writing pio buffers).
1410 */
1411static void ipath_reset_availshadow(struct ipath_devdata *dd)
1412{
1413	int i, im;
1414	unsigned long flags;
1415
1416	spin_lock_irqsave(&ipath_pioavail_lock, flags);
1417	for (i = 0; i < dd->ipath_pioavregs; i++) {
1418		u64 val, oldval;
1419		/* deal with 6110 chip bug on high register #s */
1420		im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
1421			i ^ 1 : i;
1422		val = le64_to_cpu(dd->ipath_pioavailregs_dma[im]);
1423		/*
1424		 * busy out the buffers not in the kernel avail list,
1425		 * without changing the generation bits.
1426		 */
1427		oldval = dd->ipath_pioavailshadow[i];
1428		dd->ipath_pioavailshadow[i] = val |
1429			((~dd->ipath_pioavailkernel[i] <<
1430			INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT) &
1431			0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */
1432		if (oldval != dd->ipath_pioavailshadow[i])
1433			ipath_dbg("shadow[%d] was %Lx, now %lx\n",
1434				i, (unsigned long long) oldval,
1435				dd->ipath_pioavailshadow[i]);
1436	}
1437	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
1438}
1439
1440/**
1441 * ipath_setrcvhdrsize - set the receive header size
1442 * @dd: the infinipath device
1443 * @rhdrsize: the receive header size
1444 *
1445 * called from user init code, and also layered driver init
1446 */
1447int ipath_setrcvhdrsize(struct ipath_devdata *dd, unsigned rhdrsize)
1448{
1449	int ret = 0;
1450
1451	if (dd->ipath_flags & IPATH_RCVHDRSZ_SET) {
1452		if (dd->ipath_rcvhdrsize != rhdrsize) {
1453			dev_info(&dd->pcidev->dev,
1454				 "Error: can't set protocol header "
1455				 "size %u, already %u\n",
1456				 rhdrsize, dd->ipath_rcvhdrsize);
1457			ret = -EAGAIN;
1458		} else
1459			ipath_cdbg(VERBOSE, "Reuse same protocol header "
1460				   "size %u\n", dd->ipath_rcvhdrsize);
1461	} else if (rhdrsize > (dd->ipath_rcvhdrentsize -
1462			       (sizeof(u64) / sizeof(u32)))) {
1463		ipath_dbg("Error: can't set protocol header size %u "
1464			  "(> max %u)\n", rhdrsize,
1465			  dd->ipath_rcvhdrentsize -
1466			  (u32) (sizeof(u64) / sizeof(u32)));
1467		ret = -EOVERFLOW;
1468	} else {
1469		dd->ipath_flags |= IPATH_RCVHDRSZ_SET;
1470		dd->ipath_rcvhdrsize = rhdrsize;
1471		ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
1472				 dd->ipath_rcvhdrsize);
1473		ipath_cdbg(VERBOSE, "Set protocol header size to %u\n",
1474			   dd->ipath_rcvhdrsize);
1475	}
1476	return ret;
1477}
1478
1479/*
1480 * debugging code and stats updates if no pio buffers available.
1481 */
1482static noinline void no_pio_bufs(struct ipath_devdata *dd)
1483{
1484	unsigned long *shadow = dd->ipath_pioavailshadow;
1485	__le64 *dma = (__le64 *)dd->ipath_pioavailregs_dma;
1486
1487	dd->ipath_upd_pio_shadow = 1;
1488
1489	/*
1490	 * not atomic, but if we lose a stat count in a while, that's OK
1491	 */
1492	ipath_stats.sps_nopiobufs++;
1493	if (!(++dd->ipath_consec_nopiobuf % 100000)) {
1494		ipath_force_pio_avail_update(dd); /* at start */
1495		ipath_dbg("%u tries no piobufavail ts%lx; dmacopy: "
1496			"%llx %llx %llx %llx\n"
1497			"ipath  shadow:  %lx %lx %lx %lx\n",
1498			dd->ipath_consec_nopiobuf,
1499			(unsigned long)get_cycles(),
1500			(unsigned long long) le64_to_cpu(dma[0]),
1501			(unsigned long long) le64_to_cpu(dma[1]),
1502			(unsigned long long) le64_to_cpu(dma[2]),
1503			(unsigned long long) le64_to_cpu(dma[3]),
1504			shadow[0], shadow[1], shadow[2], shadow[3]);
1505		/*
1506		 * 4 buffers per byte, 4 registers above, cover rest
1507		 * below
1508		 */
1509		if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) >
1510		    (sizeof(shadow[0]) * 4 * 4))
1511			ipath_dbg("2nd group: dmacopy: "
1512				  "%llx %llx %llx %llx\n"
1513				  "ipath  shadow:  %lx %lx %lx %lx\n",
1514				  (unsigned long long)le64_to_cpu(dma[4]),
1515				  (unsigned long long)le64_to_cpu(dma[5]),
1516				  (unsigned long long)le64_to_cpu(dma[6]),
1517				  (unsigned long long)le64_to_cpu(dma[7]),
1518				  shadow[4], shadow[5], shadow[6], shadow[7]);
1519
1520		/* at end, so update likely happened */
1521		ipath_reset_availshadow(dd);
1522	}
1523}
1524
1525/*
1526 * common code for normal driver pio buffer allocation, and reserved
1527 * allocation.
1528 *
1529 * do appropriate marking as busy, etc.
1530 * returns buffer number if one found (>=0), negative number is error.
1531 */
1532static u32 __iomem *ipath_getpiobuf_range(struct ipath_devdata *dd,
1533	u32 *pbufnum, u32 first, u32 last, u32 firsti)
1534{
1535	int i, j, updated = 0;
1536	unsigned piobcnt;
1537	unsigned long flags;
1538	unsigned long *shadow = dd->ipath_pioavailshadow;
1539	u32 __iomem *buf;
1540
1541	piobcnt = last - first;
1542	if (dd->ipath_upd_pio_shadow) {
1543		/*
1544		 * Minor optimization.  If we had no buffers on last call,
1545		 * start out by doing the update; continue and do scan even
1546		 * if no buffers were updated, to be paranoid
1547		 */
1548		ipath_update_pio_bufs(dd);
1549		updated++;
1550		i = first;
1551	} else
1552		i = firsti;
1553rescan:
1554	/*
1555	 * while test_and_set_bit() is atomic, we do that and then the
1556	 * change_bit(), and the pair is not.  See if this is the cause
1557	 * of the remaining armlaunch errors.
1558	 */
1559	spin_lock_irqsave(&ipath_pioavail_lock, flags);
1560	for (j = 0; j < piobcnt; j++, i++) {
1561		if (i >= last)
1562			i = first;
1563		if (__test_and_set_bit((2 * i) + 1, shadow))
1564			continue;
1565		/* flip generation bit */
1566		__change_bit(2 * i, shadow);
1567		break;
1568	}
1569	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
1570
1571	if (j == piobcnt) {
1572		if (!updated) {
1573			/*
1574			 * first time through; shadow exhausted, but may be
1575			 * buffers available, try an update and then rescan.
1576			 */
1577			ipath_update_pio_bufs(dd);
1578			updated++;
1579			i = first;
1580			goto rescan;
1581		} else if (updated == 1 && piobcnt <=
1582			((dd->ipath_sendctrl
1583			>> INFINIPATH_S_UPDTHRESH_SHIFT) &
1584			INFINIPATH_S_UPDTHRESH_MASK)) {
1585			/*
1586			 * for chips supporting and using the update
1587			 * threshold we need to force an update of the
1588			 * in-memory copy if the count is less than the
1589			 * thershold, then check one more time.
1590			 */
1591			ipath_force_pio_avail_update(dd);
1592			ipath_update_pio_bufs(dd);
1593			updated++;
1594			i = first;
1595			goto rescan;
1596		}
1597
1598		no_pio_bufs(dd);
1599		buf = NULL;
1600	} else {
1601		if (i < dd->ipath_piobcnt2k)
1602			buf = (u32 __iomem *) (dd->ipath_pio2kbase +
1603					       i * dd->ipath_palign);
1604		else
1605			buf = (u32 __iomem *)
1606				(dd->ipath_pio4kbase +
1607				 (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign);
1608		if (pbufnum)
1609			*pbufnum = i;
1610	}
1611
1612	return buf;
1613}
1614
1615/**
1616 * ipath_getpiobuf - find an available pio buffer
1617 * @dd: the infinipath device
1618 * @plen: the size of the PIO buffer needed in 32-bit words
1619 * @pbufnum: the buffer number is placed here
1620 */
1621u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 plen, u32 *pbufnum)
1622{
1623	u32 __iomem *buf;
1624	u32 pnum, nbufs;
1625	u32 first, lasti;
1626
1627	if (plen + 1 >= IPATH_SMALLBUF_DWORDS) {
1628		first = dd->ipath_piobcnt2k;
1629		lasti = dd->ipath_lastpioindexl;
1630	} else {
1631		first = 0;
1632		lasti = dd->ipath_lastpioindex;
1633	}
1634	nbufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
1635	buf = ipath_getpiobuf_range(dd, &pnum, first, nbufs, lasti);
1636
1637	if (buf) {
1638		/*
1639		 * Set next starting place.  It's just an optimization,
1640		 * it doesn't matter who wins on this, so no locking
1641		 */
1642		if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
1643			dd->ipath_lastpioindexl = pnum + 1;
1644		else
1645			dd->ipath_lastpioindex = pnum + 1;
1646		if (dd->ipath_upd_pio_shadow)
1647			dd->ipath_upd_pio_shadow = 0;
1648		if (dd->ipath_consec_nopiobuf)
1649			dd->ipath_consec_nopiobuf = 0;
1650		ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n",
1651			   pnum, (pnum < dd->ipath_piobcnt2k) ? 2 : 4, buf);
1652		if (pbufnum)
1653			*pbufnum = pnum;
1654
1655	}
1656	return buf;
1657}
1658
1659/**
1660 * ipath_chg_pioavailkernel - change which send buffers are available for kernel
1661 * @dd: the infinipath device
1662 * @start: the starting send buffer number
1663 * @len: the number of send buffers
1664 * @avail: true if the buffers are available for kernel use, false otherwise
1665 */
1666void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start,
1667			      unsigned len, int avail)
1668{
1669	unsigned long flags;
1670	unsigned end, cnt = 0;
1671
1672	/* There are two bits per send buffer (busy and generation) */
1673	start *= 2;
1674	end = start + len * 2;
1675
1676	spin_lock_irqsave(&ipath_pioavail_lock, flags);
1677	/* Set or clear the busy bit in the shadow. */
1678	while (start < end) {
1679		if (avail) {
1680			unsigned long dma;
1681			int i, im;
1682			/*
1683			 * the BUSY bit will never be set, because we disarm
1684			 * the user buffers before we hand them back to the
1685			 * kernel.  We do have to make sure the generation
1686			 * bit is set correctly in shadow, since it could
1687			 * have changed many times while allocated to user.
1688			 * We can't use the bitmap functions on the full
1689			 * dma array because it is always little-endian, so
1690			 * we have to flip to host-order first.
1691			 * BITS_PER_LONG is slightly wrong, since it's
1692			 * always 64 bits per register in chip...
1693			 * We only work on 64 bit kernels, so that's OK.
1694			 */
1695			/* deal with 6110 chip bug on high register #s */
1696			i = start / BITS_PER_LONG;
1697			im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
1698				i ^ 1 : i;
1699			__clear_bit(INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT
1700				+ start, dd->ipath_pioavailshadow);
1701			dma = (unsigned long) le64_to_cpu(
1702				dd->ipath_pioavailregs_dma[im]);
1703			if (test_bit((INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
1704				+ start) % BITS_PER_LONG, &dma))
1705				__set_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
1706					+ start, dd->ipath_pioavailshadow);
1707			else
1708				__clear_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
1709					+ start, dd->ipath_pioavailshadow);
1710			__set_bit(start, dd->ipath_pioavailkernel);
1711		} else {
1712			__set_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT,
1713				dd->ipath_pioavailshadow);
1714			__clear_bit(start, dd->ipath_pioavailkernel);
1715		}
1716		start += 2;
1717	}
1718
1719	if (dd->ipath_pioupd_thresh) {
1720		end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
1721		cnt = bitmap_weight(dd->ipath_pioavailkernel, end);
1722	}
1723	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
1724
1725	/*
1726	 * When moving buffers from kernel to user, if number assigned to
1727	 * the user is less than the pio update threshold, and threshold
1728	 * is supported (cnt was computed > 0), drop the update threshold
1729	 * so we update at least once per allocated number of buffers.
1730	 * In any case, if the kernel buffers are less than the threshold,
1731	 * drop the threshold.  We don't bother increasing it, having once
1732	 * decreased it, since it would typically just cycle back and forth.
1733	 * If we don't decrease below buffers in use, we can wait a long
1734	 * time for an update, until some other context uses PIO buffers.
1735	 */
1736	if (!avail && len < cnt)
1737		cnt = len;
1738	if (cnt < dd->ipath_pioupd_thresh) {
1739		dd->ipath_pioupd_thresh = cnt;
1740		ipath_dbg("Decreased pio update threshold to %u\n",
1741			dd->ipath_pioupd_thresh);
1742		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
1743		dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK
1744			<< INFINIPATH_S_UPDTHRESH_SHIFT);
1745		dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
1746			<< INFINIPATH_S_UPDTHRESH_SHIFT;
1747		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
1748			dd->ipath_sendctrl);
1749		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
1750	}
1751}
1752
1753/**
1754 * ipath_create_rcvhdrq - create a receive header queue
1755 * @dd: the infinipath device
1756 * @pd: the port data
1757 *
1758 * this must be contiguous memory (from an i/o perspective), and must be
1759 * DMA'able (which means for some systems, it will go through an IOMMU,
1760 * or be forced into a low address range).
1761 */
1762int ipath_create_rcvhdrq(struct ipath_devdata *dd,
1763			 struct ipath_portdata *pd)
1764{
1765	int ret = 0;
1766
1767	if (!pd->port_rcvhdrq) {
1768		dma_addr_t phys_hdrqtail;
1769		gfp_t gfp_flags = GFP_USER | __GFP_COMP;
1770		int amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
1771				sizeof(u32), PAGE_SIZE);
1772
1773		pd->port_rcvhdrq = dma_alloc_coherent(
1774			&dd->pcidev->dev, amt, &pd->port_rcvhdrq_phys,
1775			gfp_flags);
1776
1777		if (!pd->port_rcvhdrq) {
1778			ipath_dev_err(dd, "attempt to allocate %d bytes "
1779				      "for port %u rcvhdrq failed\n",
1780				      amt, pd->port_port);
1781			ret = -ENOMEM;
1782			goto bail;
1783		}
1784
1785		if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
1786			pd->port_rcvhdrtail_kvaddr = dma_alloc_coherent(
1787				&dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
1788				GFP_KERNEL);
1789			if (!pd->port_rcvhdrtail_kvaddr) {
1790				ipath_dev_err(dd, "attempt to allocate 1 page "
1791					"for port %u rcvhdrqtailaddr "
1792					"failed\n", pd->port_port);
1793				ret = -ENOMEM;
1794				dma_free_coherent(&dd->pcidev->dev, amt,
1795					pd->port_rcvhdrq,
1796					pd->port_rcvhdrq_phys);
1797				pd->port_rcvhdrq = NULL;
1798				goto bail;
1799			}
1800			pd->port_rcvhdrqtailaddr_phys = phys_hdrqtail;
1801			ipath_cdbg(VERBOSE, "port %d hdrtailaddr, %llx "
1802				   "physical\n", pd->port_port,
1803				   (unsigned long long) phys_hdrqtail);
1804		}
1805
1806		pd->port_rcvhdrq_size = amt;
1807
1808		ipath_cdbg(VERBOSE, "%d pages at %p (phys %lx) size=%lu "
1809			   "for port %u rcvhdr Q\n",
1810			   amt >> PAGE_SHIFT, pd->port_rcvhdrq,
1811			   (unsigned long) pd->port_rcvhdrq_phys,
1812			   (unsigned long) pd->port_rcvhdrq_size,
1813			   pd->port_port);
1814	}
1815	else
1816		ipath_cdbg(VERBOSE, "reuse port %d rcvhdrq @%p %llx phys; "
1817			   "hdrtailaddr@%p %llx physical\n",
1818			   pd->port_port, pd->port_rcvhdrq,
1819			   (unsigned long long) pd->port_rcvhdrq_phys,
1820			   pd->port_rcvhdrtail_kvaddr, (unsigned long long)
1821			   pd->port_rcvhdrqtailaddr_phys);
1822
1823	/* clear for security and sanity on each use */
1824	memset(pd->port_rcvhdrq, 0, pd->port_rcvhdrq_size);
1825	if (pd->port_rcvhdrtail_kvaddr)
1826		memset(pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE);
1827
1828	/*
1829	 * tell chip each time we init it, even if we are re-using previous
1830	 * memory (we zero the register at process close)
1831	 */
1832	ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr,
1833			      pd->port_port, pd->port_rcvhdrqtailaddr_phys);
1834	ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
1835			      pd->port_port, pd->port_rcvhdrq_phys);
1836
1837bail:
1838	return ret;
1839}
1840
1841
1842/*
1843 * Flush all sends that might be in the ready to send state, as well as any
1844 * that are in the process of being sent.   Used whenever we need to be
1845 * sure the send side is idle.  Cleans up all buffer state by canceling
1846 * all pio buffers, and issuing an abort, which cleans up anything in the
1847 * launch fifo.  The cancel is superfluous on some chip versions, but
1848 * it's safer to always do it.
1849 * PIOAvail bits are updated by the chip as if normal send had happened.
1850 */
1851void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl)
1852{
1853	unsigned long flags;
1854
1855	if (dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) {
1856		ipath_cdbg(VERBOSE, "Ignore while in autonegotiation\n");
1857		goto bail;
1858	}
1859	/*
1860	 * If we have SDMA, and it's not disabled, we have to kick off the
1861	 * abort state machine, provided we aren't already aborting.
1862	 * If we are in the process of aborting SDMA (!DISABLED, but ABORTING),
1863	 * we skip the rest of this routine. It is already "in progress"
1864	 */
1865	if (dd->ipath_flags & IPATH_HAS_SEND_DMA) {
1866		int skip_cancel;
1867		unsigned long *statp = &dd->ipath_sdma_status;
1868
1869		spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
1870		skip_cancel =
1871			test_and_set_bit(IPATH_SDMA_ABORTING, statp)
1872			&& !test_bit(IPATH_SDMA_DISABLED, statp);
1873		spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
1874		if (skip_cancel)
1875			goto bail;
1876	}
1877
1878	ipath_dbg("Cancelling all in-progress send buffers\n");
1879
1880	/* skip armlaunch errs for a while */
1881	dd->ipath_lastcancel = jiffies + HZ / 2;
1882
1883	/*
1884	 * The abort bit is auto-clearing.  We also don't want pioavail
1885	 * update happening during this, and we don't want any other
1886	 * sends going out, so turn those off for the duration.  We read
1887	 * the scratch register to be sure that cancels and the abort
1888	 * have taken effect in the chip.  Otherwise two parts are same
1889	 * as ipath_force_pio_avail_update()
1890	 */
1891	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
1892	dd->ipath_sendctrl &= ~(INFINIPATH_S_PIOBUFAVAILUPD
1893		| INFINIPATH_S_PIOENABLE);
1894	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
1895		dd->ipath_sendctrl | INFINIPATH_S_ABORT);
1896	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
1897	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
1898
1899	/* disarm all send buffers */
1900	ipath_disarm_piobufs(dd, 0,
1901		dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
1902
1903	if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
1904		set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status);
1905
1906	if (restore_sendctrl) {
1907		/* else done by caller later if needed */
1908		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
1909		dd->ipath_sendctrl |= INFINIPATH_S_PIOBUFAVAILUPD |
1910			INFINIPATH_S_PIOENABLE;
1911		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
1912			dd->ipath_sendctrl);
1913		/* and again, be sure all have hit the chip */
1914		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
1915		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
1916	}
1917
1918	if ((dd->ipath_flags & IPATH_HAS_SEND_DMA) &&
1919	    !test_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status) &&
1920	    test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)) {
1921		spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
1922		/* only wait so long for intr */
1923		dd->ipath_sdma_abort_intr_timeout = jiffies + HZ;
1924		dd->ipath_sdma_reset_wait = 200;
1925		if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
1926			tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
1927		spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
1928	}
1929bail:;
1930}
1931
1932/*
1933 * Force an update of in-memory copy of the pioavail registers, when
1934 * needed for any of a variety of reasons.  We read the scratch register
1935 * to make it highly likely that the update will have happened by the
1936 * time we return.  If already off (as in cancel_sends above), this
1937 * routine is a nop, on the assumption that the caller will "do the
1938 * right thing".
1939 */
1940void ipath_force_pio_avail_update(struct ipath_devdata *dd)
1941{
1942	unsigned long flags;
1943
1944	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
1945	if (dd->ipath_sendctrl & INFINIPATH_S_PIOBUFAVAILUPD) {
1946		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
1947			dd->ipath_sendctrl & ~INFINIPATH_S_PIOBUFAVAILUPD);
1948		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
1949		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
1950			dd->ipath_sendctrl);
1951		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
1952	}
1953	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
1954}
1955
1956static void ipath_set_ib_lstate(struct ipath_devdata *dd, int linkcmd,
1957				int linitcmd)
1958{
1959	u64 mod_wd;
1960	static const char *what[4] = {
1961		[0] = "NOP",
1962		[INFINIPATH_IBCC_LINKCMD_DOWN] = "DOWN",
1963		[INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED",
1964		[INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE"
1965	};
1966
1967	if (linitcmd == INFINIPATH_IBCC_LINKINITCMD_DISABLE) {
1968		/*
1969		 * If we are told to disable, note that so link-recovery
1970		 * code does not attempt to bring us back up.
1971		 */
1972		preempt_disable();
1973		dd->ipath_flags |= IPATH_IB_LINK_DISABLED;
1974		preempt_enable();
1975	} else if (linitcmd) {
1976		/*
1977		 * Any other linkinitcmd will lead to LINKDOWN and then
1978		 * to INIT (if all is well), so clear flag to let
1979		 * link-recovery code attempt to bring us back up.
1980		 */
1981		preempt_disable();
1982		dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED;
1983		preempt_enable();
1984	}
1985
1986	mod_wd = (linkcmd << dd->ibcc_lc_shift) |
1987		(linitcmd << INFINIPATH_IBCC_LINKINITCMD_SHIFT);
1988	ipath_cdbg(VERBOSE,
1989		"Moving unit %u to %s (initcmd=0x%x), current ltstate is %s\n",
1990		dd->ipath_unit, what[linkcmd], linitcmd,
1991		ipath_ibcstatus_str[ipath_ib_linktrstate(dd,
1992			ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus))]);
1993
1994	ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
1995			 dd->ipath_ibcctrl | mod_wd);
1996	/* read from chip so write is flushed */
1997	(void) ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
1998}
1999
2000int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate)
2001{
2002	u32 lstate;
2003	int ret;
2004
2005	switch (newstate) {
2006	case IPATH_IB_LINKDOWN_ONLY:
2007		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, 0);
2008		/* don't wait */
2009		ret = 0;
2010		goto bail;
2011
2012	case IPATH_IB_LINKDOWN:
2013		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
2014					INFINIPATH_IBCC_LINKINITCMD_POLL);
2015		/* don't wait */
2016		ret = 0;
2017		goto bail;
2018
2019	case IPATH_IB_LINKDOWN_SLEEP:
2020		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
2021					INFINIPATH_IBCC_LINKINITCMD_SLEEP);
2022		/* don't wait */
2023		ret = 0;
2024		goto bail;
2025
2026	case IPATH_IB_LINKDOWN_DISABLE:
2027		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
2028					INFINIPATH_IBCC_LINKINITCMD_DISABLE);
2029		/* don't wait */
2030		ret = 0;
2031		goto bail;
2032
2033	case IPATH_IB_LINKARM:
2034		if (dd->ipath_flags & IPATH_LINKARMED) {
2035			ret = 0;
2036			goto bail;
2037		}
2038		if (!(dd->ipath_flags &
2039		      (IPATH_LINKINIT | IPATH_LINKACTIVE))) {
2040			ret = -EINVAL;
2041			goto bail;
2042		}
2043		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ARMED, 0);
2044
2045		/*
2046		 * Since the port can transition to ACTIVE by receiving
2047		 * a non VL 15 packet, wait for either state.
2048		 */
2049		lstate = IPATH_LINKARMED | IPATH_LINKACTIVE;
2050		break;
2051
2052	case IPATH_IB_LINKACTIVE:
2053		if (dd->ipath_flags & IPATH_LINKACTIVE) {
2054			ret = 0;
2055			goto bail;
2056		}
2057		if (!(dd->ipath_flags & IPATH_LINKARMED)) {
2058			ret = -EINVAL;
2059			goto bail;
2060		}
2061		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ACTIVE, 0);
2062		lstate = IPATH_LINKACTIVE;
2063		break;
2064
2065	case IPATH_IB_LINK_LOOPBACK:
2066		dev_info(&dd->pcidev->dev, "Enabling IB local loopback\n");
2067		dd->ipath_ibcctrl |= INFINIPATH_IBCC_LOOPBACK;
2068		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
2069				 dd->ipath_ibcctrl);
2070
2071		/* turn heartbeat off, as it causes loopback to fail */
2072		dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
2073				       IPATH_IB_HRTBT_OFF);
2074		/* don't wait */
2075		ret = 0;
2076		goto bail;
2077
2078	case IPATH_IB_LINK_EXTERNAL:
2079		dev_info(&dd->pcidev->dev,
2080			"Disabling IB local loopback (normal)\n");
2081		dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
2082				       IPATH_IB_HRTBT_ON);
2083		dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LOOPBACK;
2084		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
2085				 dd->ipath_ibcctrl);
2086		/* don't wait */
2087		ret = 0;
2088		goto bail;
2089
2090	/*
2091	 * Heartbeat can be explicitly enabled by the user via
2092	 * "hrtbt_enable" "file", and if disabled, trying to enable here
2093	 * will have no effect.  Implicit changes (heartbeat off when
2094	 * loopback on, and vice versa) are included to ease testing.
2095	 */
2096	case IPATH_IB_LINK_HRTBT:
2097		ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
2098			IPATH_IB_HRTBT_ON);
2099		goto bail;
2100
2101	case IPATH_IB_LINK_NO_HRTBT:
2102		ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
2103			IPATH_IB_HRTBT_OFF);
2104		goto bail;
2105
2106	default:
2107		ipath_dbg("Invalid linkstate 0x%x requested\n", newstate);
2108		ret = -EINVAL;
2109		goto bail;
2110	}
2111	ret = ipath_wait_linkstate(dd, lstate, 2000);
2112
2113bail:
2114	return ret;
2115}
2116
2117/**
2118 * ipath_set_mtu - set the MTU
2119 * @dd: the infinipath device
2120 * @arg: the new MTU
2121 *
2122 * we can handle "any" incoming size, the issue here is whether we
2123 * need to restrict our outgoing size.   For now, we don't do any
2124 * sanity checking on this, and we don't deal with what happens to
2125 * programs that are already running when the size changes.
2126 * NOTE: changing the MTU will usually cause the IBC to go back to
2127 * link INIT state...
2128 */
2129int ipath_set_mtu(struct ipath_devdata *dd, u16 arg)
2130{
2131	u32 piosize;
2132	int changed = 0;
2133	int ret;
2134
2135	/*
2136	 * mtu is IB data payload max.  It's the largest power of 2 less
2137	 * than piosize (or even larger, since it only really controls the
2138	 * largest we can receive; we can send the max of the mtu and
2139	 * piosize).  We check that it's one of the valid IB sizes.
2140	 */
2141	if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 &&
2142	    (arg != 4096 || !ipath_mtu4096)) {
2143		ipath_dbg("Trying to set invalid mtu %u, failing\n", arg);
2144		ret = -EINVAL;
2145		goto bail;
2146	}
2147	if (dd->ipath_ibmtu == arg) {
2148		ret = 0;        /* same as current */
2149		goto bail;
2150	}
2151
2152	piosize = dd->ipath_ibmaxlen;
2153	dd->ipath_ibmtu = arg;
2154
2155	if (arg >= (piosize - IPATH_PIO_MAXIBHDR)) {
2156		/* Only if it's not the initial value (or reset to it) */
2157		if (piosize != dd->ipath_init_ibmaxlen) {
2158			if (arg > piosize && arg <= dd->ipath_init_ibmaxlen)
2159				piosize = dd->ipath_init_ibmaxlen;
2160			dd->ipath_ibmaxlen = piosize;
2161			changed = 1;
2162		}
2163	} else if ((arg + IPATH_PIO_MAXIBHDR) != dd->ipath_ibmaxlen) {
2164		piosize = arg + IPATH_PIO_MAXIBHDR;
2165		ipath_cdbg(VERBOSE, "ibmaxlen was 0x%x, setting to 0x%x "
2166			   "(mtu 0x%x)\n", dd->ipath_ibmaxlen, piosize,
2167			   arg);
2168		dd->ipath_ibmaxlen = piosize;
2169		changed = 1;
2170	}
2171
2172	if (changed) {
2173		u64 ibc = dd->ipath_ibcctrl, ibdw;
2174		/*
2175		 * update our housekeeping variables, and set IBC max
2176		 * size, same as init code; max IBC is max we allow in
2177		 * buffer, less the qword pbc, plus 1 for ICRC, in dwords
2178		 */
2179		dd->ipath_ibmaxlen = piosize - 2 * sizeof(u32);
2180		ibdw = (dd->ipath_ibmaxlen >> 2) + 1;
2181		ibc &= ~(INFINIPATH_IBCC_MAXPKTLEN_MASK <<
2182			 dd->ibcc_mpl_shift);
2183		ibc |= ibdw << dd->ibcc_mpl_shift;
2184		dd->ipath_ibcctrl = ibc;
2185		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
2186				 dd->ipath_ibcctrl);
2187		dd->ipath_f_tidtemplate(dd);
2188	}
2189
2190	ret = 0;
2191
2192bail:
2193	return ret;
2194}
2195
2196int ipath_set_lid(struct ipath_devdata *dd, u32 lid, u8 lmc)
2197{
2198	dd->ipath_lid = lid;
2199	dd->ipath_lmc = lmc;
2200
2201	dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LIDLMC, lid |
2202		(~((1U << lmc) - 1)) << 16);
2203
2204	dev_info(&dd->pcidev->dev, "We got a lid: 0x%x\n", lid);
2205
2206	return 0;
2207}
2208
2209
2210/**
2211 * ipath_write_kreg_port - write a device's per-port 64-bit kernel register
2212 * @dd: the infinipath device
2213 * @regno: the register number to write
2214 * @port: the port containing the register
2215 * @value: the value to write
2216 *
2217 * Registers that vary with the chip implementation constants (port)
2218 * use this routine.
2219 */
2220void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno,
2221			  unsigned port, u64 value)
2222{
2223	u16 where;
2224
2225	if (port < dd->ipath_portcnt &&
2226	    (regno == dd->ipath_kregs->kr_rcvhdraddr ||
2227	     regno == dd->ipath_kregs->kr_rcvhdrtailaddr))
2228		where = regno + port;
2229	else
2230		where = -1;
2231
2232	ipath_write_kreg(dd, where, value);
2233}
2234
2235/*
2236 * Following deal with the "obviously simple" task of overriding the state
2237 * of the LEDS, which normally indicate link physical and logical status.
2238 * The complications arise in dealing with different hardware mappings
2239 * and the board-dependent routine being called from interrupts.
2240 * and then there's the requirement to _flash_ them.
2241 */
2242#define LED_OVER_FREQ_SHIFT 8
2243#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT)
2244/* Below is "non-zero" to force override, but both actual LEDs are off */
2245#define LED_OVER_BOTH_OFF (8)
2246
2247static void ipath_run_led_override(unsigned long opaque)
2248{
2249	struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
2250	int timeoff;
2251	int pidx;
2252	u64 lstate, ltstate, val;
2253
2254	if (!(dd->ipath_flags & IPATH_INITTED))
2255		return;
2256
2257	pidx = dd->ipath_led_override_phase++ & 1;
2258	dd->ipath_led_override = dd->ipath_led_override_vals[pidx];
2259	timeoff = dd->ipath_led_override_timeoff;
2260
2261	/*
2262	 * below potentially restores the LED values per current status,
2263	 * should also possibly setup the traffic-blink register,
2264	 * but leave that to per-chip functions.
2265	 */
2266	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
2267	ltstate = ipath_ib_linktrstate(dd, val);
2268	lstate = ipath_ib_linkstate(dd, val);
2269
2270	dd->ipath_f_setextled(dd, lstate, ltstate);
2271	mod_timer(&dd->ipath_led_override_timer, jiffies + timeoff);
2272}
2273
2274void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val)
2275{
2276	int timeoff, freq;
2277
2278	if (!(dd->ipath_flags & IPATH_INITTED))
2279		return;
2280
2281	/* First check if we are blinking. If not, use 1HZ polling */
2282	timeoff = HZ;
2283	freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;
2284
2285	if (freq) {
2286		/* For blink, set each phase from one nybble of val */
2287		dd->ipath_led_override_vals[0] = val & 0xF;
2288		dd->ipath_led_override_vals[1] = (val >> 4) & 0xF;
2289		timeoff = (HZ << 4)/freq;
2290	} else {
2291		/* Non-blink set both phases the same. */
2292		dd->ipath_led_override_vals[0] = val & 0xF;
2293		dd->ipath_led_override_vals[1] = val & 0xF;
2294	}
2295	dd->ipath_led_override_timeoff = timeoff;
2296
2297	/*
2298	 * If the timer has not already been started, do so. Use a "quick"
2299	 * timeout so the function will be called soon, to look at our request.
2300	 */
2301	if (atomic_inc_return(&dd->ipath_led_override_timer_active) == 1) {
2302		/* Need to start timer */
2303		init_timer(&dd->ipath_led_override_timer);
2304		dd->ipath_led_override_timer.function =
2305						 ipath_run_led_override;
2306		dd->ipath_led_override_timer.data = (unsigned long) dd;
2307		dd->ipath_led_override_timer.expires = jiffies + 1;
2308		add_timer(&dd->ipath_led_override_timer);
2309	} else
2310		atomic_dec(&dd->ipath_led_override_timer_active);
2311}
2312
2313/**
2314 * ipath_shutdown_device - shut down a device
2315 * @dd: the infinipath device
2316 *
2317 * This is called to make the device quiet when we are about to
2318 * unload the driver, and also when the device is administratively
2319 * disabled.   It does not free any data structures.
2320 * Everything it does has to be setup again by ipath_init_chip(dd,1)
2321 */
2322void ipath_shutdown_device(struct ipath_devdata *dd)
2323{
2324	unsigned long flags;
2325
2326	ipath_dbg("Shutting down the device\n");
2327
2328	ipath_hol_up(dd); /* make sure user processes aren't suspended */
2329
2330	dd->ipath_flags |= IPATH_LINKUNK;
2331	dd->ipath_flags &= ~(IPATH_INITTED | IPATH_LINKDOWN |
2332			     IPATH_LINKINIT | IPATH_LINKARMED |
2333			     IPATH_LINKACTIVE);
2334	*dd->ipath_statusp &= ~(IPATH_STATUS_IB_CONF |
2335				IPATH_STATUS_IB_READY);
2336
2337	/* mask interrupts, but not errors */
2338	ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
2339
2340	dd->ipath_rcvctrl = 0;
2341	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
2342			 dd->ipath_rcvctrl);
2343
2344	if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
2345		teardown_sdma(dd);
2346
2347	/*
2348	 * gracefully stop all sends allowing any in progress to trickle out
2349	 * first.
2350	 */
2351	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
2352	dd->ipath_sendctrl = 0;
2353	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
2354	/* flush it */
2355	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
2356	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
2357
2358	/*
2359	 * enough for anything that's going to trickle out to have actually
2360	 * done so.
2361	 */
2362	udelay(5);
2363
2364	dd->ipath_f_setextled(dd, 0, 0); /* make sure LEDs are off */
2365
2366	ipath_set_ib_lstate(dd, 0, INFINIPATH_IBCC_LINKINITCMD_DISABLE);
2367	ipath_cancel_sends(dd, 0);
2368
2369	/*
2370	 * we are shutting down, so tell components that care.  We don't do
2371	 * this on just a link state change, much like ethernet, a cable
2372	 * unplug, etc. doesn't change driver state
2373	 */
2374	signal_ib_event(dd, IB_EVENT_PORT_ERR);
2375
2376	/* disable IBC */
2377	dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
2378	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
2379			 dd->ipath_control | INFINIPATH_C_FREEZEMODE);
2380
2381	/*
2382	 * clear SerdesEnable and turn the leds off; do this here because
2383	 * we are unloading, so don't count on interrupts to move along
2384	 * Turn the LEDs off explicitly for the same reason.
2385	 */
2386	dd->ipath_f_quiet_serdes(dd);
2387
2388	/* stop all the timers that might still be running */
2389	del_timer_sync(&dd->ipath_hol_timer);
2390	if (dd->ipath_stats_timer_active) {
2391		del_timer_sync(&dd->ipath_stats_timer);
2392		dd->ipath_stats_timer_active = 0;
2393	}
2394	if (dd->ipath_intrchk_timer.data) {
2395		del_timer_sync(&dd->ipath_intrchk_timer);
2396		dd->ipath_intrchk_timer.data = 0;
2397	}
2398	if (atomic_read(&dd->ipath_led_override_timer_active)) {
2399		del_timer_sync(&dd->ipath_led_override_timer);
2400		atomic_set(&dd->ipath_led_override_timer_active, 0);
2401	}
2402
2403	/*
2404	 * clear all interrupts and errors, so that the next time the driver
2405	 * is loaded or device is enabled, we know that whatever is set
2406	 * happened while we were unloaded
2407	 */
2408	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
2409			 ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED);
2410	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
2411	ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
2412
2413	ipath_cdbg(VERBOSE, "Flush time and errors to EEPROM\n");
2414	ipath_update_eeprom_log(dd);
2415}
2416
2417/**
2418 * ipath_free_pddata - free a port's allocated data
2419 * @dd: the infinipath device
2420 * @pd: the portdata structure
2421 *
2422 * free up any allocated data for a port
2423 * This should not touch anything that would affect a simultaneous
2424 * re-allocation of port data, because it is called after ipath_mutex
2425 * is released (and can be called from reinit as well).
2426 * It should never change any chip state, or global driver state.
2427 * (The only exception to global state is freeing the port0 port0_skbs.)
2428 */
2429void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd)
2430{
2431	if (!pd)
2432		return;
2433
2434	if (pd->port_rcvhdrq) {
2435		ipath_cdbg(VERBOSE, "free closed port %d rcvhdrq @ %p "
2436			   "(size=%lu)\n", pd->port_port, pd->port_rcvhdrq,
2437			   (unsigned long) pd->port_rcvhdrq_size);
2438		dma_free_coherent(&dd->pcidev->dev, pd->port_rcvhdrq_size,
2439				  pd->port_rcvhdrq, pd->port_rcvhdrq_phys);
2440		pd->port_rcvhdrq = NULL;
2441		if (pd->port_rcvhdrtail_kvaddr) {
2442			dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
2443					 pd->port_rcvhdrtail_kvaddr,
2444					 pd->port_rcvhdrqtailaddr_phys);
2445			pd->port_rcvhdrtail_kvaddr = NULL;
2446		}
2447	}
2448	if (pd->port_port && pd->port_rcvegrbuf) {
2449		unsigned e;
2450
2451		for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
2452			void *base = pd->port_rcvegrbuf[e];
2453			size_t size = pd->port_rcvegrbuf_size;
2454
2455			ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), "
2456				   "chunk %u/%u\n", base,
2457				   (unsigned long) size,
2458				   e, pd->port_rcvegrbuf_chunks);
2459			dma_free_coherent(&dd->pcidev->dev, size,
2460				base, pd->port_rcvegrbuf_phys[e]);
2461		}
2462		kfree(pd->port_rcvegrbuf);
2463		pd->port_rcvegrbuf = NULL;
2464		kfree(pd->port_rcvegrbuf_phys);
2465		pd->port_rcvegrbuf_phys = NULL;
2466		pd->port_rcvegrbuf_chunks = 0;
2467	} else if (pd->port_port == 0 && dd->ipath_port0_skbinfo) {
2468		unsigned e;
2469		struct ipath_skbinfo *skbinfo = dd->ipath_port0_skbinfo;
2470
2471		dd->ipath_port0_skbinfo = NULL;
2472		ipath_cdbg(VERBOSE, "free closed port %d "
2473			   "ipath_port0_skbinfo @ %p\n", pd->port_port,
2474			   skbinfo);
2475		for (e = 0; e < dd->ipath_p0_rcvegrcnt; e++)
2476			if (skbinfo[e].skb) {
2477				pci_unmap_single(dd->pcidev, skbinfo[e].phys,
2478						 dd->ipath_ibmaxlen,
2479						 PCI_DMA_FROMDEVICE);
2480				dev_kfree_skb(skbinfo[e].skb);
2481			}
2482		vfree(skbinfo);
2483	}
2484	kfree(pd->port_tid_pg_list);
2485	vfree(pd->subport_uregbase);
2486	vfree(pd->subport_rcvegrbuf);
2487	vfree(pd->subport_rcvhdr_base);
2488	kfree(pd);
2489}
2490
2491static int __init infinipath_init(void)
2492{
2493	int ret;
2494
2495	if (ipath_debug & __IPATH_DBG)
2496		printk(KERN_INFO DRIVER_LOAD_MSG "%s", ib_ipath_version);
2497
2498	/*
2499	 * These must be called before the driver is registered with
2500	 * the PCI subsystem.
2501	 */
2502	idr_init(&unit_table);
2503
2504	ret = pci_register_driver(&ipath_driver);
2505	if (ret < 0) {
2506		printk(KERN_ERR IPATH_DRV_NAME
2507		       ": Unable to register driver: error %d\n", -ret);
2508		goto bail_unit;
2509	}
2510
2511	ret = ipath_init_ipathfs();
2512	if (ret < 0) {
2513		printk(KERN_ERR IPATH_DRV_NAME ": Unable to create "
2514		       "ipathfs: error %d\n", -ret);
2515		goto bail_pci;
2516	}
2517
2518	goto bail;
2519
2520bail_pci:
2521	pci_unregister_driver(&ipath_driver);
2522
2523bail_unit:
2524	idr_destroy(&unit_table);
2525
2526bail:
2527	return ret;
2528}
2529
2530static void __exit infinipath_cleanup(void)
2531{
2532	ipath_exit_ipathfs();
2533
2534	ipath_cdbg(VERBOSE, "Unregistering pci driver\n");
2535	pci_unregister_driver(&ipath_driver);
2536
2537	idr_destroy(&unit_table);
2538}
2539
2540/**
2541 * ipath_reset_device - reset the chip if possible
2542 * @unit: the device to reset
2543 *
2544 * Whether or not reset is successful, we attempt to re-initialize the chip
2545 * (that is, much like a driver unload/reload).  We clear the INITTED flag
2546 * so that the various entry points will fail until we reinitialize.  For
2547 * now, we only allow this if no user ports are open that use chip resources
2548 */
2549int ipath_reset_device(int unit)
2550{
2551	int ret, i;
2552	struct ipath_devdata *dd = ipath_lookup(unit);
2553	unsigned long flags;
2554
2555	if (!dd) {
2556		ret = -ENODEV;
2557		goto bail;
2558	}
2559
2560	if (atomic_read(&dd->ipath_led_override_timer_active)) {
2561		/* Need to stop LED timer, _then_ shut off LEDs */
2562		del_timer_sync(&dd->ipath_led_override_timer);
2563		atomic_set(&dd->ipath_led_override_timer_active, 0);
2564	}
2565
2566	/* Shut off LEDs after we are sure timer is not running */
2567	dd->ipath_led_override = LED_OVER_BOTH_OFF;
2568	dd->ipath_f_setextled(dd, 0, 0);
2569
2570	dev_info(&dd->pcidev->dev, "Reset on unit %u requested\n", unit);
2571
2572	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) {
2573		dev_info(&dd->pcidev->dev, "Invalid unit number %u or "
2574			 "not initialized or not present\n", unit);
2575		ret = -ENXIO;
2576		goto bail;
2577	}
2578
2579	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
2580	if (dd->ipath_pd)
2581		for (i = 1; i < dd->ipath_cfgports; i++) {
2582			if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
2583				continue;
2584			spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
2585			ipath_dbg("unit %u port %d is in use "
2586				  "(PID %u cmd %s), can't reset\n",
2587				  unit, i,
2588				  pid_nr(dd->ipath_pd[i]->port_pid),
2589				  dd->ipath_pd[i]->port_comm);
2590			ret = -EBUSY;
2591			goto bail;
2592		}
2593	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
2594
2595	if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
2596		teardown_sdma(dd);
2597
2598	dd->ipath_flags &= ~IPATH_INITTED;
2599	ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
2600	ret = dd->ipath_f_reset(dd);
2601	if (ret == 1) {
2602		ipath_dbg("Reinitializing unit %u after reset attempt\n",
2603			  unit);
2604		ret = ipath_init_chip(dd, 1);
2605	} else
2606		ret = -EAGAIN;
2607	if (ret)
2608		ipath_dev_err(dd, "Reinitialize unit %u after "
2609			      "reset failed with %d\n", unit, ret);
2610	else
2611		dev_info(&dd->pcidev->dev, "Reinitialized unit %u after "
2612			 "resetting\n", unit);
2613
2614bail:
2615	return ret;
2616}
2617
2618/*
2619 * send a signal to all the processes that have the driver open
2620 * through the normal interfaces (i.e., everything other than diags
2621 * interface).  Returns number of signalled processes.
2622 */
2623static int ipath_signal_procs(struct ipath_devdata *dd, int sig)
2624{
2625	int i, sub, any = 0;
2626	struct pid *pid;
2627	unsigned long flags;
2628
2629	if (!dd->ipath_pd)
2630		return 0;
2631
2632	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
2633	for (i = 1; i < dd->ipath_cfgports; i++) {
2634		if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
2635			continue;
2636		pid = dd->ipath_pd[i]->port_pid;
2637		if (!pid)
2638			continue;
2639
2640		dev_info(&dd->pcidev->dev, "context %d in use "
2641			  "(PID %u), sending signal %d\n",
2642			  i, pid_nr(pid), sig);
2643		kill_pid(pid, sig, 1);
2644		any++;
2645		for (sub = 0; sub < INFINIPATH_MAX_SUBPORT; sub++) {
2646			pid = dd->ipath_pd[i]->port_subpid[sub];
2647			if (!pid)
2648				continue;
2649			dev_info(&dd->pcidev->dev, "sub-context "
2650				"%d:%d in use (PID %u), sending "
2651				"signal %d\n", i, sub, pid_nr(pid), sig);
2652			kill_pid(pid, sig, 1);
2653			any++;
2654		}
2655	}
2656	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
2657	return any;
2658}
2659
2660static void ipath_hol_signal_down(struct ipath_devdata *dd)
2661{
2662	if (ipath_signal_procs(dd, SIGSTOP))
2663		ipath_dbg("Stopped some processes\n");
2664	ipath_cancel_sends(dd, 1);
2665}
2666
2667
2668static void ipath_hol_signal_up(struct ipath_devdata *dd)
2669{
2670	if (ipath_signal_procs(dd, SIGCONT))
2671		ipath_dbg("Continued some processes\n");
2672}
2673
2674/*
2675 * link is down, stop any users processes, and flush pending sends
2676 * to prevent HoL blocking, then start the HoL timer that
2677 * periodically continues, then stop procs, so they can detect
2678 * link down if they want, and do something about it.
2679 * Timer may already be running, so use mod_timer, not add_timer.
2680 */
2681void ipath_hol_down(struct ipath_devdata *dd)
2682{
2683	dd->ipath_hol_state = IPATH_HOL_DOWN;
2684	ipath_hol_signal_down(dd);
2685	dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
2686	dd->ipath_hol_timer.expires = jiffies +
2687		msecs_to_jiffies(ipath_hol_timeout_ms);
2688	mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires);
2689}
2690
2691/*
2692 * link is up, continue any user processes, and ensure timer
2693 * is a nop, if running.  Let timer keep running, if set; it
2694 * will nop when it sees the link is up
2695 */
2696void ipath_hol_up(struct ipath_devdata *dd)
2697{
2698	ipath_hol_signal_up(dd);
2699	dd->ipath_hol_state = IPATH_HOL_UP;
2700}
2701
2702/*
2703 * toggle the running/not running state of user proceses
2704 * to prevent HoL blocking on chip resources, but still allow
2705 * user processes to do link down special case handling.
2706 * Should only be called via the timer
2707 */
2708void ipath_hol_event(unsigned long opaque)
2709{
2710	struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
2711
2712	if (dd->ipath_hol_next == IPATH_HOL_DOWNSTOP
2713		&& dd->ipath_hol_state != IPATH_HOL_UP) {
2714		dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
2715		ipath_dbg("Stopping processes\n");
2716		ipath_hol_signal_down(dd);
2717	} else { /* may do "extra" if also in ipath_hol_up() */
2718		dd->ipath_hol_next = IPATH_HOL_DOWNSTOP;
2719		ipath_dbg("Continuing processes\n");
2720		ipath_hol_signal_up(dd);
2721	}
2722	if (dd->ipath_hol_state == IPATH_HOL_UP)
2723		ipath_dbg("link's up, don't resched timer\n");
2724	else {
2725		dd->ipath_hol_timer.expires = jiffies +
2726			msecs_to_jiffies(ipath_hol_timeout_ms);
2727		mod_timer(&dd->ipath_hol_timer,
2728			dd->ipath_hol_timer.expires);
2729	}
2730}
2731
2732int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv)
2733{
2734	u64 val;
2735
2736	if (new_pol_inv > INFINIPATH_XGXS_RX_POL_MASK)
2737		return -1;
2738	if (dd->ipath_rx_pol_inv != new_pol_inv) {
2739		dd->ipath_rx_pol_inv = new_pol_inv;
2740		val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
2741		val &= ~(INFINIPATH_XGXS_RX_POL_MASK <<
2742			 INFINIPATH_XGXS_RX_POL_SHIFT);
2743		val |= ((u64)dd->ipath_rx_pol_inv) <<
2744			INFINIPATH_XGXS_RX_POL_SHIFT;
2745		ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
2746	}
2747	return 0;
2748}
2749
2750/*
2751 * Disable and enable the armlaunch error.  Used for PIO bandwidth testing on
2752 * the 7220, which is count-based, rather than trigger-based.  Safe for the
2753 * driver check, since it's at init.   Not completely safe when used for
2754 * user-mode checking, since some error checking can be lost, but not
2755 * particularly risky, and only has problematic side-effects in the face of
2756 * very buggy user code.  There is no reference counting, but that's also
2757 * fine, given the intended use.
2758 */
2759void ipath_enable_armlaunch(struct ipath_devdata *dd)
2760{
2761	dd->ipath_lasterror &= ~INFINIPATH_E_SPIOARMLAUNCH;
2762	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
2763		INFINIPATH_E_SPIOARMLAUNCH);
2764	dd->ipath_errormask |= INFINIPATH_E_SPIOARMLAUNCH;
2765	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
2766		dd->ipath_errormask);
2767}
2768
2769void ipath_disable_armlaunch(struct ipath_devdata *dd)
2770{
2771	/* so don't re-enable if already set */
2772	dd->ipath_maskederrs &= ~INFINIPATH_E_SPIOARMLAUNCH;
2773	dd->ipath_errormask &= ~INFINIPATH_E_SPIOARMLAUNCH;
2774	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
2775		dd->ipath_errormask);
2776}
2777
2778module_init(infinipath_init);
2779module_exit(infinipath_cleanup);
2780