/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2016 6WIND S.A.
 * Copyright 2016 Mellanox Technologies, Ltd
 */

#include <rte_eal_memconfig.h>
#include <rte_mempool.h>
#include <rte_malloc.h>
#include <rte_rwlock.h>
#include <rte_bus_pci.h>

#include <mlx5_common_mp.h>
#include <mlx5_common_mr.h>

#include "mlx5.h"
#include "mlx5_mr.h"
#include "mlx5_rxtx.h"

struct mr_find_contig_memsegs_data {
	uintptr_t addr;
	uintptr_t start;
	uintptr_t end;
	const struct rte_memseg_list *msl;
};

struct mr_update_mp_data {
	struct rte_eth_dev *dev;
	struct mlx5_mr_ctrl *mr_ctrl;
	int ret;
};

/**
 * Callback for memory free event. Iterate over the freed memsegs and check
 * whether each one belongs to an existing MR. If so, clear the corresponding
 * bit in the MR's bitmap; as a result, the MR becomes fragmented. If the MR
 * becomes empty, it will be freed later by mlx5_mr_garbage_collect(). Even if
 * this callback is called from a secondary process, the garbage collector will
 * run in the primary process, as a secondary process can't call
 * mlx5_mr_create().
 *
 * The global cache must be rebuilt if there is any change, and the event has
 * to be propagated to dataplane threads so that they flush their local caches.
 *
 * @param sh
 *   Pointer to the Ethernet device shared context.
 * @param addr
 *   Address of freed memory.
 * @param len
 *   Size of freed memory.
 */
static void
mlx5_mr_mem_event_free_cb(struct mlx5_dev_ctx_shared *sh,
			  const void *addr, size_t len)
{
	const struct rte_memseg_list *msl;
	struct mlx5_mr *mr;
	int ms_n;
	int i;
	int rebuild = 0;

	DEBUG("device %s free callback: addr=%p, len=%zu",
	      sh->ibdev_name, addr, len);
	msl = rte_mem_virt2memseg_list(addr);
	/* addr and len must be page-aligned. */
	MLX5_ASSERT((uintptr_t)addr ==
		    RTE_ALIGN((uintptr_t)addr, msl->page_sz));
	MLX5_ASSERT(len == RTE_ALIGN(len, msl->page_sz));
	ms_n = len / msl->page_sz;
	rte_rwlock_write_lock(&sh->share_cache.rwlock);
	/* Clear bits of freed memsegs from MR. */
	for (i = 0; i < ms_n; ++i) {
		const struct rte_memseg *ms;
		struct mr_cache_entry entry;
		uintptr_t start;
		int ms_idx;
		uint32_t pos;

		/* Find MR having this memseg. */
		start = (uintptr_t)addr + i * msl->page_sz;
		mr = mlx5_mr_lookup_list(&sh->share_cache, &entry, start);
		if (mr == NULL)
			continue;
		MLX5_ASSERT(mr->msl); /* Can't be external memory. */
		ms = rte_mem_virt2memseg((void *)start, msl);
		MLX5_ASSERT(ms != NULL);
		MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
		ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
		pos = ms_idx - mr->ms_base_idx;
		MLX5_ASSERT(rte_bitmap_get(mr->ms_bmp, pos));
		MLX5_ASSERT(pos < mr->ms_bmp_n);
		DEBUG("device %s MR(%p): clear bitmap[%u] for addr %p",
		      sh->ibdev_name, (void *)mr, pos, (void *)start);
		rte_bitmap_clear(mr->ms_bmp, pos);
		if (--mr->ms_n == 0) {
			LIST_REMOVE(mr, mr);
			LIST_INSERT_HEAD(&sh->share_cache.mr_free_list, mr, mr);
			DEBUG("device %s remove MR(%p) from list",
			      sh->ibdev_name, (void *)mr);
		}
		/*
		 * The MR is fragmented or will be freed; the global cache
		 * must be rebuilt.
		 */
		rebuild = 1;
	}
	if (rebuild) {
		mlx5_mr_rebuild_cache(&sh->share_cache);
		/*
		 * Flush local caches by propagating the invalidation across
		 * cores. rte_smp_wmb() is enough to synchronize this event: if
		 * another core sees one of the freed memsegs, it means the
		 * memseg has been reallocated by the allocator, which can only
		 * happen after this free call returns. Therefore, the store
		 * incrementing the generation number below is guaranteed to be
		 * seen by the other core before it sees the newly allocated
		 * memory.
		 */
		++sh->share_cache.dev_gen;
		DEBUG("broadcasting local cache flush, gen=%d",
		      sh->share_cache.dev_gen);
		rte_smp_wmb();
	}
	rte_rwlock_write_unlock(&sh->share_cache.rwlock);
}
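
/*
 * Illustrative sketch (not part of the driver): how a dataplane thread reacts
 * to the generation bump performed above.  Before trusting its per-queue
 * cache, the fast path compares the shared generation counter with its local
 * copy and flushes the local cache on mismatch, so the next lookup falls back
 * to the bottom-half.  The names used below (dev_gen_ptr, cur_gen,
 * mlx5_mr_flush_local_cache()) come from mlx5_common_mr and are assumptions
 * of this sketch rather than code defined in this file.
 *
 *	if (unlikely(*mr_ctrl->dev_gen_ptr != mr_ctrl->cur_gen))
 *		mlx5_mr_flush_local_cache(mr_ctrl);
 */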

/**
 * Callback for memory event. It must be called from the primary process (see
 * the assertion below).
 *
 * @param event_type
 *   Memory event type.
 * @param addr
 *   Address of memory.
 * @param len
 *   Size of memory.
 */
void
mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
		     size_t len, void *arg __rte_unused)
{
	struct mlx5_dev_ctx_shared *sh;
	struct mlx5_dev_list *dev_list = &mlx5_shared_data->mem_event_cb_list;

	/* Must be called from the primary process. */
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	switch (event_type) {
	case RTE_MEM_EVENT_FREE:
		rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
		/* Iterate all the existing mlx5 devices. */
		LIST_FOREACH(sh, dev_list, mem_event_cb)
			mlx5_mr_mem_event_free_cb(sh, addr, len);
		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
		break;
	case RTE_MEM_EVENT_ALLOC:
	default:
		break;
	}
}
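
/*
 * Illustrative sketch (not part of this file): the callback above is hooked
 * into the EAL memory subsystem once, in the primary process, e.g. during
 * shared data initialization.  The callback name string is an assumption of
 * this sketch.
 *
 *	rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
 *					mlx5_mr_mem_event_cb, NULL);
 */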

/**
 * Bottom-half of LKey search on Rx.
 *
 * @param rxq
 *   Pointer to Rx queue structure.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
 */
uint32_t
mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr)
{
	struct mlx5_rxq_ctrl *rxq_ctrl =
		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
	struct mlx5_mr_ctrl *mr_ctrl = &rxq->mr_ctrl;
	struct mlx5_priv *priv = rxq_ctrl->priv;

	return mlx5_mr_addr2mr_bh(priv->sh->pd, &priv->mp_id,
				  &priv->sh->share_cache, mr_ctrl, addr,
				  priv->config.mr_ext_memseg_en);
}
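
/*
 * Usage sketch (illustrative): the Rx burst path resolves LKeys through inline
 * helpers that consult the per-queue caches first and fall back to this
 * bottom-half only on a miss, e.g.:
 *
 *	uint32_t lkey = mlx5_rx_addr2mr_bh(rxq, (uintptr_t)buf->buf_addr);
 *
 * A result of UINT32_MAX means no MR covers the address and the buffer cannot
 * be posted to the hardware.
 */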

/**
 * Bottom-half of LKey search on Tx.
 *
 * @param txq
 *   Pointer to Tx queue structure.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
 */
static uint32_t
mlx5_tx_addr2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr)
{
	struct mlx5_txq_ctrl *txq_ctrl =
		container_of(txq, struct mlx5_txq_ctrl, txq);
	struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
	struct mlx5_priv *priv = txq_ctrl->priv;

	return mlx5_mr_addr2mr_bh(priv->sh->pd, &priv->mp_id,
				  &priv->sh->share_cache, mr_ctrl, addr,
				  priv->config.mr_ext_memseg_en);
}

/**
 * Bottom-half of LKey search on Tx. If the address is not found in the memseg
 * lists, register the mbuf's mempool as externally allocated memory.
 *
 * @param txq
 *   Pointer to Tx queue structure.
 * @param mb
 *   Pointer to mbuf.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
 */
uint32_t
mlx5_tx_mb2mr_bh(struct mlx5_txq_data *txq, struct rte_mbuf *mb)
{
	uintptr_t addr = (uintptr_t)mb->buf_addr;
	uint32_t lkey;

	lkey = mlx5_tx_addr2mr_bh(txq, addr);
	if (lkey == UINT32_MAX && rte_errno == ENXIO) {
		/* Mempool may have externally allocated memory. */
		return mlx5_tx_update_ext_mp(txq, addr, mlx5_mb2mp(mb));
	}
	return lkey;
}
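
/*
 * Usage sketch (illustrative): the Tx burst path calls this bottom-half
 * through its inline wrapper when the per-queue caches miss, e.g.:
 *
 *	uint32_t lkey = mlx5_tx_mb2mr_bh(txq, mb);
 *
 * UINT32_MAX indicates that neither the memseg lists nor the mbuf's mempool
 * could provide a registration for the buffer address.
 */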

/**
 * Called during rte_mempool_mem_iter() by mlx5_mr_update_ext_mp().
 *
 * The externally allocated chunk is registered and an MR is created for it.
 * The MR object is added to the global list. If the memseg list of an MR
 * object (mr->msl) is null, the MR object can be regarded as externally
 * allocated memory.
 *
 * Once external memory is registered, it should stay static. If the memory is
 * freed and the virtual address range gets different physical memory mapped
 * into it again, the wrong translation entry may cause a crash on the device.
 * The PMD can't track free events of external memory for now.
 */
static void
mlx5_mr_update_ext_mp_cb(struct rte_mempool *mp, void *opaque,
			 struct rte_mempool_memhdr *memhdr,
			 unsigned mem_idx __rte_unused)
{
	struct mr_update_mp_data *data = opaque;
	struct rte_eth_dev *dev = data->dev;
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_dev_ctx_shared *sh = priv->sh;
	struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl;
	struct mlx5_mr *mr = NULL;
	uintptr_t addr = (uintptr_t)memhdr->addr;
	size_t len = memhdr->len;
	struct mr_cache_entry entry;
	uint32_t lkey;

	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/* If the chunk is already registered, return immediately. */
	rte_rwlock_read_lock(&sh->share_cache.rwlock);
	lkey = mlx5_mr_lookup_cache(&sh->share_cache, &entry, addr);
	rte_rwlock_read_unlock(&sh->share_cache.rwlock);
	if (lkey != UINT32_MAX)
		return;
	DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool (%s)",
		dev->data->port_id, mem_idx, mp->name);
	mr = mlx5_create_mr_ext(sh->pd, addr, len, mp->socket_id,
				sh->share_cache.reg_mr_cb);
	if (!mr) {
		DRV_LOG(WARNING,
			"port %u unable to allocate a new MR of"
			" mempool (%s).",
			dev->data->port_id, mp->name);
		data->ret = -1;
		return;
	}
	rte_rwlock_write_lock(&sh->share_cache.rwlock);
	LIST_INSERT_HEAD(&sh->share_cache.mr_list, mr, mr);
	/* Insert to the global cache table. */
	mlx5_mr_insert_cache(&sh->share_cache, mr);
	rte_rwlock_write_unlock(&sh->share_cache.rwlock);
	/* Insert to the local cache table. */
	mlx5_mr_addr2mr_bh(sh->pd, &priv->mp_id, &sh->share_cache,
			   mr_ctrl, addr, priv->config.mr_ext_memseg_en);
}

/**
 * Find the first ethdev that matches the PCI device.
 * Multiple ethdevs per PCI device exist only with representors.
 * In that case, it is enough to get any one of the ports, as they all share
 * the same ibv context.
 *
 * @param pdev
 *   Pointer to the PCI device.
 *
 * @return
 *   Pointer to the ethdev if found, NULL otherwise.
 */
static struct rte_eth_dev *
pci_dev_to_eth_dev(struct rte_pci_device *pdev)
{
	uint16_t port_id;

	port_id = rte_eth_find_next_of(0, &pdev->device);
	if (port_id == RTE_MAX_ETHPORTS)
		return NULL;
	return &rte_eth_devices[port_id];
}
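
/*
 * Illustrative sketch: with representors, several ports may belong to the
 * same PCI device.  Iterating all of them would look as below, but picking a
 * single port is sufficient here because they all share the same ibv context.
 *
 *	uint16_t port_id;
 *
 *	RTE_ETH_FOREACH_DEV_OF(port_id, &pdev->device)
 *		nb_ports++;
 *
 * where nb_ports (hypothetical) would count the ports sharing one context.
 */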

/**
 * DPDK callback to DMA map external memory to a PCI device.
 *
 * @param pdev
 *   Pointer to the PCI device.
 * @param addr
 *   Starting virtual address of memory to be mapped.
 * @param iova
 *   Starting IOVA address of memory to be mapped.
 * @param len
 *   Length of memory segment being mapped.
 *
 * @return
 *   0 on success, negative value on error.
 */
int
mlx5_dma_map(struct rte_pci_device *pdev, void *addr,
	     uint64_t iova __rte_unused, size_t len)
{
	struct rte_eth_dev *dev;
	struct mlx5_mr *mr;
	struct mlx5_priv *priv;
	struct mlx5_dev_ctx_shared *sh;

	dev = pci_dev_to_eth_dev(pdev);
	if (!dev) {
		DRV_LOG(WARNING, "unable to find matching ethdev "
				 "to PCI device %p", (void *)pdev);
		rte_errno = ENODEV;
		return -1;
	}
	priv = dev->data->dev_private;
	sh = priv->sh;
	mr = mlx5_create_mr_ext(sh->pd, (uintptr_t)addr, len, SOCKET_ID_ANY,
				sh->share_cache.reg_mr_cb);
	if (!mr) {
		DRV_LOG(WARNING,
			"port %u unable to dma map", dev->data->port_id);
		rte_errno = EINVAL;
		return -1;
	}
	rte_rwlock_write_lock(&sh->share_cache.rwlock);
	LIST_INSERT_HEAD(&sh->share_cache.mr_list, mr, mr);
	/* Insert to the global cache table. */
	mlx5_mr_insert_cache(&sh->share_cache, mr);
	rte_rwlock_write_unlock(&sh->share_cache.rwlock);
	return 0;
}
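
/*
 * Illustrative sketch (application side, under stated assumptions): this
 * callback is reached through the generic rte_dev_dma_map() path once the
 * external memory has been registered with the EAL, e.g.:
 *
 *	if (rte_extmem_register(addr, len, NULL, 0, pg_sz) < 0)
 *		return -rte_errno;
 *	if (rte_dev_dma_map(&pdev->device, addr, iova, len) < 0)
 *		return -rte_errno;
 *
 * addr, len, iova and pg_sz stand for the application's external memory
 * parameters.
 */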

/**
 * DPDK callback to DMA unmap external memory from a PCI device.
 *
 * @param pdev
 *   Pointer to the PCI device.
 * @param addr
 *   Starting virtual address of memory to be unmapped.
 * @param iova
 *   Starting IOVA address of memory to be unmapped.
 * @param len
 *   Length of memory segment being unmapped.
 *
 * @return
 *   0 on success, negative value on error.
 */
int
mlx5_dma_unmap(struct rte_pci_device *pdev, void *addr,
	       uint64_t iova __rte_unused, size_t len __rte_unused)
{
	struct rte_eth_dev *dev;
	struct mlx5_priv *priv;
	struct mlx5_dev_ctx_shared *sh;
	struct mlx5_mr *mr;
	struct mr_cache_entry entry;

	dev = pci_dev_to_eth_dev(pdev);
	if (!dev) {
		DRV_LOG(WARNING, "unable to find matching ethdev "
				 "to PCI device %p", (void *)pdev);
		rte_errno = ENODEV;
		return -1;
	}
	priv = dev->data->dev_private;
	sh = priv->sh;
	rte_rwlock_read_lock(&sh->share_cache.rwlock);
	mr = mlx5_mr_lookup_list(&sh->share_cache, &entry, (uintptr_t)addr);
	if (!mr) {
		rte_rwlock_read_unlock(&sh->share_cache.rwlock);
		DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered "
				 "to PCI device %p", (uintptr_t)addr,
				 (void *)pdev);
		rte_errno = EINVAL;
		return -1;
	}
	LIST_REMOVE(mr, mr);
	LIST_INSERT_HEAD(&sh->share_cache.mr_free_list, mr, mr);
	DEBUG("port %u remove MR(%p) from list", dev->data->port_id,
	      (void *)mr);
	mlx5_mr_rebuild_cache(&sh->share_cache);
	/*
	 * Flush local caches by propagating the invalidation across cores.
	 * rte_smp_wmb() is enough to synchronize this event: if another core
	 * sees the unmapped address range again, it means the range has been
	 * remapped by the allocator, which can only happen after this call
	 * returns. Therefore, the store incrementing the generation number
	 * below is guaranteed to be seen by the other core before it sees the
	 * newly mapped memory.
	 */
	++sh->share_cache.dev_gen;
	DEBUG("broadcasting local cache flush, gen=%d",
	      sh->share_cache.dev_gen);
	rte_smp_wmb();
	rte_rwlock_read_unlock(&sh->share_cache.rwlock);
	return 0;
}
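
/*
 * Illustrative counterpart of the mapping sketch above: application-side
 * teardown goes through rte_dev_dma_unmap() and then drops the EAL
 * registration of the external memory.
 *
 *	rte_dev_dma_unmap(&pdev->device, addr, iova, len);
 *	rte_extmem_unregister(addr, len);
 */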

/**
 * Register MRs for all memory chunks of a Mempool that has externally
 * allocated memory and fill in the local cache.
 *
 * @param dev
 *   Pointer to Ethernet device.
 * @param mr_ctrl
 *   Pointer to per-queue MR control structure.
 * @param mp
 *   Pointer to registering Mempool.
 *
 * @return
 *   0 on success, -1 on failure.
 */
static int
mlx5_mr_update_ext_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl,
		      struct rte_mempool *mp)
{
	struct mr_update_mp_data data = {
		.dev = dev,
		.mr_ctrl = mr_ctrl,
		.ret = 0,
	};

	rte_mempool_mem_iter(mp, mlx5_mr_update_ext_mp_cb, &data);
	return data.ret;
}

/**
 * Register MRs for all memory chunks of a Mempool that has externally
 * allocated memory and search the LKey of the address to return.
 *
 * @param txq
 *   Pointer to Tx queue structure.
 * @param addr
 *   Search key.
 * @param mp
 *   Pointer to registering Mempool where addr belongs.
 *
 * @return
 *   LKey for address on success, UINT32_MAX on failure.
 */
uint32_t
mlx5_tx_update_ext_mp(struct mlx5_txq_data *txq, uintptr_t addr,
		      struct rte_mempool *mp)
{
	struct mlx5_txq_ctrl *txq_ctrl =
		container_of(txq, struct mlx5_txq_ctrl, txq);
	struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
	struct mlx5_priv *priv = txq_ctrl->priv;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
		DRV_LOG(WARNING,
			"port %u using address (%p) from unregistered mempool"
			" having externally allocated memory"
			" in secondary process, please create mempool"
			" prior to rte_eth_dev_start()",
			PORT_ID(priv), (void *)addr);
		return UINT32_MAX;
	}
	mlx5_mr_update_ext_mp(ETH_DEV(priv), mr_ctrl, mp);
	return mlx5_tx_addr2mr_bh(txq, addr);
}

/* Called during rte_mempool_mem_iter() by mlx5_mr_update_mp(). */
static void
mlx5_mr_update_mp_cb(struct rte_mempool *mp __rte_unused, void *opaque,
		     struct rte_mempool_memhdr *memhdr,
		     unsigned mem_idx __rte_unused)
{
	struct mr_update_mp_data *data = opaque;
	struct rte_eth_dev *dev = data->dev;
	struct mlx5_priv *priv = dev->data->dev_private;

	uint32_t lkey;

	/* Stop the iteration if a previous chunk already failed. */
	if (data->ret < 0)
		return;
	/* Register address of the chunk and update local caches. */
	lkey = mlx5_mr_addr2mr_bh(priv->sh->pd, &priv->mp_id,
				  &priv->sh->share_cache, data->mr_ctrl,
				  (uintptr_t)memhdr->addr,
				  priv->config.mr_ext_memseg_en);
	if (lkey == UINT32_MAX)
		data->ret = -1;
}

/**
 * Register all the memory chunks of a Mempool.
 *
 * @param dev
 *   Pointer to Ethernet device.
 * @param mr_ctrl
 *   Pointer to per-queue MR control structure.
 * @param mp
 *   Pointer to registering Mempool.
 *
 * @return
 *   0 on success, -1 on failure.
 */
int
mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl,
		  struct rte_mempool *mp)
{
	struct mr_update_mp_data data = {
		.dev = dev,
		.mr_ctrl = mr_ctrl,
		.ret = 0,
	};

	rte_mempool_mem_iter(mp, mlx5_mr_update_mp_cb, &data);
	if (data.ret < 0 && rte_errno == ENXIO) {
		/* Mempool may have externally allocated memory. */
		return mlx5_mr_update_ext_mp(dev, mr_ctrl, mp);
	}
	return data.ret;
}
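
/*
 * Usage sketch (illustrative): queue setup typically pre-populates the
 * per-queue MR cache with the mempool(s) serving that queue, so that the
 * first bursts do not hit the bottom-half, e.g.:
 *
 *	if (mlx5_mr_update_mp(dev, &rxq->mr_ctrl, mp) < 0)
 *		return -1;
 *
 * rxq and mp here stand for the Rx queue data and its mempool and are
 * assumptions of this sketch.
 */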