Branch data Line data Source code
1 : : /* SPDX-License-Identifier: BSD-3-Clause
2 : : * Copyright(c) 2010-2014 Intel Corporation.
3 : : * Copyright(c) 2013 6WIND S.A.
4 : : */
5 : :
6 : : #include <errno.h>
7 : : #include <fcntl.h>
8 : : #include <stdbool.h>
9 : : #include <stdlib.h>
10 : : #include <stdio.h>
11 : : #include <stdint.h>
12 : : #include <inttypes.h>
13 : : #include <string.h>
14 : : #include <sys/mman.h>
15 : : #include <sys/stat.h>
16 : : #include <sys/file.h>
17 : : #include <sys/resource.h>
18 : : #include <sys/personality.h>
19 : : #include <unistd.h>
20 : : #include <limits.h>
21 : : #include <signal.h>
22 : : #include <setjmp.h>
23 : : #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
24 : : #include <numa.h>
25 : : #include <numaif.h>
26 : : #endif
27 : :
28 : : #include <rte_errno.h>
29 : : #include <rte_log.h>
30 : : #include <rte_memory.h>
31 : : #include <rte_eal.h>
32 : : #include <rte_lcore.h>
33 : : #include <rte_common.h>
34 : :
35 : : #include <eal_export.h>
36 : : #include "eal_private.h"
37 : : #include "eal_memalloc.h"
38 : : #include "eal_memcfg.h"
39 : : #include "eal_internal_cfg.h"
40 : : #include "eal_filesystem.h"
41 : : #include "eal_hugepages.h"
42 : : #include "eal_options.h"
43 : :
44 : : #define PFN_MASK_SIZE 8
45 : :
46 : : /**
47 : : * @file
48 : : * Huge page mapping under linux
49 : : *
50 : : * To reserve a big contiguous amount of memory, we use the hugepage
51 : : * feature of linux. For that, we need to have hugetlbfs mounted. This
52 : : * code will create many files in this directory (one per page) and
53 : : * map them in virtual memory. For each page, we will retrieve its
54 : : * physical address and remap it in order to have a virtual contiguous
55 : : * zone as well as a physical contiguous zone.
56 : : */
57 : :
58 : : static int phys_addrs_available = -1;
59 : :
60 : : #define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
61 : :
62 : 181 : uint64_t eal_get_baseaddr(void)
63 : : {
64 : : /*
65 : : * Linux kernel uses a really high address as starting address for
66 : : * serving mmaps calls. If there exists addressing limitations and IOVA
67 : : * mode is VA, this starting address is likely too high for those
68 : : * devices. However, it is possible to use a lower address in the
69 : : * process virtual address space as with 64 bits there is a lot of
70 : : * available space.
71 : : *
72 : : * Current known limitations are 39 or 40 bits. Setting the starting
73 : : * address at 4GB implies there are 508GB or 1020GB for mapping the
74 : : * available hugepages. This is likely enough for most systems, although
75 : : * a device with addressing limitations should call
76 : : * rte_mem_check_dma_mask for ensuring all memory is within supported
77 : : * range.
78 : : */
79 : : #if defined(RTE_ARCH_LOONGARCH)
80 : : return 0x7000000000ULL;
81 : : #else
82 : 181 : return 0x100000000ULL;
83 : : #endif
84 : : }
85 : :
86 : : /*
87 : : * Get physical address of any mapped virtual address in the current process.
88 : : */
89 : : RTE_EXPORT_SYMBOL(rte_mem_virt2phy)
90 : : phys_addr_t
91 : 4000 : rte_mem_virt2phy(const void *virtaddr)
92 : : {
93 : : int fd, retval;
94 : : uint64_t page, physaddr;
95 : : unsigned long virt_pfn;
96 : : int page_size;
97 : : off_t offset;
98 : :
99 [ + - ]: 4000 : if (phys_addrs_available == 0)
100 : : return RTE_BAD_IOVA;
101 : :
102 : : /* standard page size */
103 : 4000 : page_size = getpagesize();
104 : :
105 : : fd = open("/proc/self/pagemap", O_RDONLY);
106 [ - + ]: 4000 : if (fd < 0) {
107 : 0 : EAL_LOG(INFO, "%s(): cannot open /proc/self/pagemap: %s",
108 : : __func__, strerror(errno));
109 : 0 : return RTE_BAD_IOVA;
110 : : }
111 : :
112 : 4000 : virt_pfn = (unsigned long)virtaddr / page_size;
113 : 4000 : offset = sizeof(uint64_t) * virt_pfn;
114 [ - + ]: 4000 : if (lseek(fd, offset, SEEK_SET) == (off_t) -1) {
115 : 0 : EAL_LOG(INFO, "%s(): seek error in /proc/self/pagemap: %s",
116 : : __func__, strerror(errno));
117 : 0 : close(fd);
118 : 0 : return RTE_BAD_IOVA;
119 : : }
120 : :
121 : 4000 : retval = read(fd, &page, PFN_MASK_SIZE);
122 : 4000 : close(fd);
123 [ - + ]: 4000 : if (retval < 0) {
124 : 0 : EAL_LOG(INFO, "%s(): cannot read /proc/self/pagemap: %s",
125 : : __func__, strerror(errno));
126 : 0 : return RTE_BAD_IOVA;
127 [ - + ]: 4000 : } else if (retval != PFN_MASK_SIZE) {
128 : 0 : EAL_LOG(INFO, "%s(): read %d bytes from /proc/self/pagemap "
129 : : "but expected %d:",
130 : : __func__, retval, PFN_MASK_SIZE);
131 : 0 : return RTE_BAD_IOVA;
132 : : }
133 : :
134 : : /*
135 : : * the pfn (page frame number) are bits 0-54 (see
136 : : * pagemap.txt in linux Documentation)
137 : : */
138 [ + - ]: 4000 : if ((page & 0x7fffffffffffffULL) == 0)
139 : : return RTE_BAD_IOVA;
140 : :
141 : 4000 : physaddr = ((page & 0x7fffffffffffffULL) * page_size)
142 : 4000 : + ((unsigned long)virtaddr % page_size);
143 : :
144 : 4000 : return physaddr;
145 : : }
146 : :
147 : : RTE_EXPORT_SYMBOL(rte_mem_virt2iova)
148 : : rte_iova_t
149 : 1866 : rte_mem_virt2iova(const void *virtaddr)
150 : : {
151 [ - + ]: 1866 : if (rte_eal_iova_mode() == RTE_IOVA_VA)
152 : 0 : return (uintptr_t)virtaddr;
153 : 1866 : return rte_mem_virt2phy(virtaddr);
154 : : }
155 : :
156 : : /*
157 : : * For each hugepage in hugepg_tbl, fill the physaddr value. We find
158 : : * it by browsing the /proc/self/pagemap special file.
159 : : */
160 : : static int
161 : 2 : find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
162 : : {
163 : : unsigned int i;
164 : : phys_addr_t addr;
165 : :
166 [ + + ]: 2048 : for (i = 0; i < hpi->num_pages[0]; i++) {
167 : 2046 : addr = rte_mem_virt2phy(hugepg_tbl[i].orig_va);
168 [ + - ]: 2046 : if (addr == RTE_BAD_PHYS_ADDR)
169 : : return -1;
170 : 2046 : hugepg_tbl[i].physaddr = addr;
171 : : }
172 : : return 0;
173 : : }
174 : :
175 : : /*
176 : : * For each hugepage in hugepg_tbl, fill the physaddr value sequentially.
177 : : */
178 : : static int
179 : : set_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
180 : : {
181 : : unsigned int i;
182 : : static phys_addr_t addr;
183 : :
184 [ # # ]: 0 : for (i = 0; i < hpi->num_pages[0]; i++) {
185 : 0 : hugepg_tbl[i].physaddr = addr;
186 : 0 : addr += hugepg_tbl[i].size;
187 : : }
188 : : return 0;
189 : : }
190 : :
191 : : /*
192 : : * Check whether address-space layout randomization is enabled in
193 : : * the kernel. This is important for multi-process as it can prevent
194 : : * two processes mapping data to the same virtual address
195 : : * Returns:
196 : : * 0 - address space randomization disabled
197 : : * 1/2 - address space randomization enabled
198 : : * negative error code on error
199 : : */
200 : : static int
201 : 1 : aslr_enabled(void)
202 : : {
203 : : char c;
204 : :
205 : : /*
206 : : * Check whether the current process is executed with the command line
207 : : * "setarch ... --addr-no-randomize ..." or "setarch ... -R ..."
208 : : * This complements the sysfs check to ensure comprehensive ASLR status detection.
209 : : * This check is necessary to support the functionality of the "setarch" command,
210 : : * which can disable ASLR by setting the ADDR_NO_RANDOMIZE personality flag.
211 : : */
212 [ + - ]: 1 : if ((personality(0xffffffff) & ADDR_NO_RANDOMIZE) == ADDR_NO_RANDOMIZE)
213 : : return 0;
214 : :
215 : : int retval, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY);
216 [ - + ]: 1 : if (fd < 0)
217 : 0 : return -errno;
218 : 1 : retval = read(fd, &c, 1);
219 : 1 : close(fd);
220 [ - + ]: 1 : if (retval < 0)
221 : 0 : return -errno;
222 [ + - ]: 1 : if (retval == 0)
223 : : return -EIO;
224 [ - + ]: 1 : switch (c) {
225 : : case '0' : return 0;
226 : : case '1' : return 1;
227 : : case '2' : return 2;
228 : : default: return -EINVAL;
229 : : }
230 : : }
231 : :
232 : : static sigjmp_buf huge_jmpenv;
233 : :
/* SIGBUS handler: non-locally return to the context saved in huge_jmpenv,
 * making the guarded sigsetjmp() call report a nonzero value.
 */
static void huge_sigbus_handler(int signo __rte_unused)
{
	siglongjmp(huge_jmpenv, 1);
}
238 : :
/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
 * non-static local variable in the stack frame calling sigsetjmp might be
 * clobbered by a call to longjmp.
 */
/* Returns 0 when the context is first saved, nonzero when re-entered via
 * siglongjmp() from huge_sigbus_handler().
 */
static int huge_wrap_sigsetjmp(void)
{
	return sigsetjmp(huge_jmpenv, 1);
}
247 : :
248 : : #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
/* Callback for numa library. */
/* NOTE(review): presumably overrides libnuma's default error handler so NUMA
 * failures are logged instead of terminating the process — confirm against
 * libnuma documentation.
 */
void numa_error(char *where)
{
	EAL_LOG(ERR, "%s failed: %s", where, strerror(errno));
}
254 : : #endif
255 : :
/*
 * Mmap all hugepages of hugepage table: it first open a file in
 * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
 * virtual address is stored in hugepg_tbl[i].orig_va, else it is stored
 * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
 * map contiguous physical blocks in contiguous virtual blocks.
 *
 * Returns the number of pages successfully mapped (the loop index at exit);
 * a value smaller than hpi->num_pages[0] indicates early failure.
 * essential_memory is only consulted when NUMA-aware hugepage support is
 * compiled in; it tracks per-node memory still required — TODO confirm exact
 * units/ownership against the caller.
 */
static unsigned
map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
		uint64_t *essential_memory __rte_unused)
{
	int fd;
	unsigned i;
	void *virtaddr;
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	int node_id = -1;
	int essential_prev = 0;
	int oldpolicy;
	struct bitmask *oldmask = NULL;
	bool have_numa = true;
	unsigned long maxnode = 0;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* Check if kernel supports NUMA. */
	if (numa_available() != 0) {
		EAL_LOG(DEBUG, "NUMA is not supported.");
		have_numa = false;
	}

	if (have_numa) {
		/* Save the current memory policy so it can be restored on exit. */
		EAL_LOG(DEBUG, "Trying to obtain current memory policy.");
		oldmask = numa_allocate_nodemask();
		if (get_mempolicy(&oldpolicy, oldmask->maskp,
				oldmask->size + 1, 0, 0) < 0) {
			EAL_LOG(ERR,
				"Failed to get current mempolicy: %s. "
				"Assuming MPOL_DEFAULT.", strerror(errno));
			oldpolicy = MPOL_DEFAULT;
		}
		/* maxnode = highest node index with requested memory, plus one */
		for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
			if (internal_conf->numa_mem[i])
				maxnode = i + 1;
	}
#endif

	for (i = 0; i < hpi->num_pages[0]; i++) {
		struct hugepage_file *hf = &hugepg_tbl[i];
		uint64_t hugepage_sz = hpi->hugepage_sz;

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
		if (maxnode) {
			unsigned int j;

			/* find the first node that still needs memory */
			for (j = 0; j < maxnode; j++)
				if (essential_memory[j])
					break;

			if (j == maxnode) {
				/* all nodes satisfied: round-robin over requested nodes */
				node_id = (node_id + 1) % maxnode;
				while (!internal_conf->numa_mem[node_id]) {
					node_id++;
					node_id %= maxnode;
				}
				essential_prev = 0;
			} else {
				node_id = j;
				/* remember the old amount so it can be restored on SIGBUS */
				essential_prev = essential_memory[j];

				if (essential_memory[j] < hugepage_sz)
					essential_memory[j] = 0;
				else
					essential_memory[j] -= hugepage_sz;
			}

			EAL_LOG(DEBUG,
				"Setting policy MPOL_PREFERRED for socket %d",
				node_id);
			numa_set_preferred(node_id);
		}
#endif

		hf->file_id = i;
		hf->size = hugepage_sz;
		if (eal_get_hugefile_path(hf->filepath, sizeof(hf->filepath), hpi->hugedir,
				hf->file_id) == NULL) {
			EAL_LOG(DEBUG, "%s(): huge file path '%s' truncated",
				__func__, hf->filepath);
			goto out;
		}

		hf->filepath[sizeof(hf->filepath) - 1] = '\0';

		/* try to create hugepage file */
		fd = open(hf->filepath, O_CREAT | O_RDWR, 0600);
		if (fd < 0) {
			EAL_LOG(DEBUG, "%s(): open failed: %s", __func__,
				strerror(errno));
			goto out;
		}

		/* map the segment, and populate page tables,
		 * the kernel fills this segment with zeros. we don't care where
		 * this gets mapped - we already have contiguous memory areas
		 * ready for us to map into.
		 */
		virtaddr = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_POPULATE, fd, 0);
		if (virtaddr == MAP_FAILED) {
			EAL_LOG(DEBUG, "%s(): mmap failed: %s", __func__,
				strerror(errno));
			close(fd);
			goto out;
		}

		hf->orig_va = virtaddr;

		/* In linux, hugetlb limitations, like cgroup, are
		 * enforced at fault time instead of mmap(), even
		 * with the option of MAP_POPULATE. Kernel will send
		 * a SIGBUS signal. To avoid to be killed, save stack
		 * environment here, if SIGBUS happens, we can jump
		 * back here.
		 */
		if (huge_wrap_sigsetjmp()) {
			/* we land here via siglongjmp from the SIGBUS handler:
			 * undo this page's mapping, file and accounting
			 */
			EAL_LOG(DEBUG, "SIGBUS: Cannot mmap more "
				"hugepages of size %u MB",
				(unsigned int)(hugepage_sz / 0x100000));
			munmap(virtaddr, hugepage_sz);
			close(fd);
			unlink(hugepg_tbl[i].filepath);
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
			if (maxnode)
				essential_memory[node_id] =
					essential_prev;
#endif
			goto out;
		}
		/* touch the page to force the fault (and possible SIGBUS) now */
		*(int *)virtaddr = 0;

		/* set shared lock on the file. */
		if (flock(fd, LOCK_SH) < 0) {
			EAL_LOG(DEBUG, "%s(): Locking file failed:%s ",
				__func__, strerror(errno));
			close(fd);
			goto out;
		}

		close(fd);
	}

out:
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	/* restore the memory policy saved at function entry */
	if (maxnode) {
		EAL_LOG(DEBUG,
			"Restoring previous memory policy: %d", oldpolicy);
		if (oldpolicy == MPOL_DEFAULT) {
			numa_set_localalloc();
		} else if (set_mempolicy(oldpolicy, oldmask->maskp,
				oldmask->size + 1) < 0) {
			EAL_LOG(ERR, "Failed to restore mempolicy: %s",
				strerror(errno));
			numa_set_localalloc();
		}
	}
	if (oldmask != NULL)
		numa_free_cpumask(oldmask);
#endif
	return i;
}
426 : :
/*
 * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge
 * page.
 *
 * Returns 0 on success (or when numa_maps is unavailable, in which case all
 * pages are assumed to be on socket 0), -1 on parse failure or when fewer
 * mapped pages were matched than hpi->num_pages[0].
 */
static int
find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
{
	int socket_id;
	char *end, *nodestr;
	unsigned i, hp_count = 0;
	uint64_t virt_addr;
	char buf[BUFSIZ];
	char *hugedir_str;
	FILE *f;
	int ret;

	f = fopen("/proc/self/numa_maps", "r");
	if (f == NULL) {
		EAL_LOG(NOTICE, "NUMA support not available"
			" consider that all memory is in socket_id 0");
		return 0;
	}

	/* "<hugedir>/<prefix>" — used to recognize our own hugepage mappings */
	ret = asprintf(&hugedir_str, "%s/%s",
			hpi->hugedir, eal_get_hugefile_prefix());
	if (ret < 0) {
		EAL_LOG(ERR, "%s(): failed to store hugepage path", __func__);
		hugedir_str = NULL;
		goto error;
	}

	/* default to failure; set to 0 only after a full successful parse */
	ret = -1;

	/* parse numa map */
	while (fgets(buf, sizeof(buf), f) != NULL) {

		/* ignore non huge page */
		if (strstr(buf, " huge ") == NULL &&
				strstr(buf, hugedir_str) == NULL)
			continue;

		/* get zone addr */
		virt_addr = strtoull(buf, &end, 16);
		if (virt_addr == 0 || end == buf) {
			EAL_LOG(ERR, "%s(): error in numa_maps parsing", __func__);
			goto error;
		}

		/* get node id (socket id): the " N<node>=<count>" token */
		nodestr = strstr(buf, " N");
		if (nodestr == NULL) {
			EAL_LOG(ERR, "%s(): error in numa_maps parsing", __func__);
			goto error;
		}
		nodestr += 2;
		end = strstr(nodestr, "=");
		if (end == NULL) {
			EAL_LOG(ERR, "%s(): error in numa_maps parsing", __func__);
			goto error;
		}
		/* terminate the node number in place so strtoul sees only digits */
		end[0] = '\0';
		end = NULL;

		socket_id = strtoul(nodestr, &end, 0);
		if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) {
			EAL_LOG(ERR, "%s(): error in numa_maps parsing", __func__);
			goto error;
		}

		/* if we find this page in our mappings, set socket_id */
		for (i = 0; i < hpi->num_pages[0]; i++) {
			void *va = (void *)(unsigned long)virt_addr;
			if (hugepg_tbl[i].orig_va == va) {
				hugepg_tbl[i].socket_id = socket_id;
				hp_count++;
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
				EAL_LOG(DEBUG,
					"Hugepage %s is on socket %d",
					hugepg_tbl[i].filepath, socket_id);
#endif
			}
		}
	}

	/* every mapped page must have been located in numa_maps */
	if (hp_count < hpi->num_pages[0])
		goto error;

	ret = 0;
error:
	free(hugedir_str);
	fclose(f);
	return ret;
}
520 : :
521 : : static int
522 : 10359 : cmp_physaddr(const void *a, const void *b)
523 : : {
524 : : #ifndef RTE_ARCH_PPC_64
525 : : const struct hugepage_file *p1 = a;
526 : : const struct hugepage_file *p2 = b;
527 : : #else
528 : : /* PowerPC needs memory sorted in reverse order from x86 */
529 : : const struct hugepage_file *p1 = b;
530 : : const struct hugepage_file *p2 = a;
531 : : #endif
532 [ + + ]: 10359 : if (p1->physaddr < p2->physaddr)
533 : : return -1;
534 [ - + ]: 9178 : else if (p1->physaddr > p2->physaddr)
535 : : return 1;
536 : : else
537 : 0 : return 0;
538 : : }
539 : :
540 : : /*
541 : : * Uses mmap to create a shared memory area for storage of data
542 : : * Used in this file to store the hugepage file map on disk
543 : : */
544 : : static void *
545 : 2 : create_shared_memory(const char *filename, const size_t mem_size)
546 : : {
547 : : void *retval;
548 : : int fd;
549 : : const struct internal_config *internal_conf =
550 : 2 : eal_get_internal_configuration();
551 : :
552 : : /* if no shared files mode is used, create anonymous memory instead */
553 [ - + ]: 2 : if (internal_conf->no_shconf) {
554 : 0 : retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
555 : : MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
556 [ # # ]: 0 : if (retval == MAP_FAILED)
557 : : return NULL;
558 : 0 : return retval;
559 : : }
560 : :
561 : : fd = open(filename, O_CREAT | O_RDWR, 0600);
562 [ + - ]: 2 : if (fd < 0)
563 : : return NULL;
564 [ - + ]: 2 : if (ftruncate(fd, mem_size) < 0) {
565 : 0 : close(fd);
566 : 0 : return NULL;
567 : : }
568 : 2 : retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
569 : 2 : close(fd);
570 [ - + ]: 2 : if (retval == MAP_FAILED)
571 : 0 : return NULL;
572 : : return retval;
573 : : }
574 : :
575 : : /*
576 : : * this copies *active* hugepages from one hugepage table to another.
577 : : * destination is typically the shared memory.
578 : : */
579 : : static int
580 : 2 : copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size,
581 : : const struct hugepage_file * src, int src_size)
582 : : {
583 : : int src_pos, dst_pos = 0;
584 : :
585 [ + + ]: 2048 : for (src_pos = 0; src_pos < src_size; src_pos++) {
586 [ + + ]: 2046 : if (src[src_pos].orig_va != NULL) {
587 : : /* error on overflow attempt */
588 [ + - ]: 18 : if (dst_pos == dest_size)
589 : : return -1;
590 : 18 : memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage_file));
591 : 18 : dst_pos++;
592 : : }
593 : : }
594 : : return 0;
595 : : }
596 : :
597 : : static int
598 : 0 : unlink_hugepage_files(struct hugepage_file *hugepg_tbl,
599 : : unsigned num_hp_info)
600 : : {
601 : : unsigned socket, size;
602 : : int page, nrpages = 0;
603 : : const struct internal_config *internal_conf =
604 : 0 : eal_get_internal_configuration();
605 : :
606 : : /* get total number of hugepages */
607 [ # # ]: 0 : for (size = 0; size < num_hp_info; size++)
608 [ # # ]: 0 : for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
609 : 0 : nrpages +=
610 : 0 : internal_conf->hugepage_info[size].num_pages[socket];
611 : :
612 [ # # ]: 0 : for (page = 0; page < nrpages; page++) {
613 : 0 : struct hugepage_file *hp = &hugepg_tbl[page];
614 : :
615 [ # # # # ]: 0 : if (hp->orig_va != NULL && unlink(hp->filepath)) {
616 : 0 : EAL_LOG(WARNING, "%s(): Removing %s failed: %s",
617 : : __func__, hp->filepath, strerror(errno));
618 : : }
619 : : }
620 : 0 : return 0;
621 : : }
622 : :
/*
 * unmaps hugepages that are not going to be used. since we originally allocate
 * ALL hugepages (not just those we need), additional unmapping needs to be done.
 *
 * For each (page size, socket) pair, the first num_pages[socket] matching
 * entries are kept; every further match is munmap'ed, its table entry's
 * orig_va cleared, and its backing file unlinked.
 * Returns 0 on success, -1 if a backing file cannot be removed.
 */
static int
unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl,
		struct hugepage_info *hpi,
		unsigned num_hp_info)
{
	unsigned socket, size;
	int page, nrpages = 0;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* get total number of hugepages */
	for (size = 0; size < num_hp_info; size++)
		for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
			nrpages += internal_conf->hugepage_info[size].num_pages[socket];

	for (size = 0; size < num_hp_info; size++) {
		for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
			unsigned pages_found = 0;

			/* traverse until we have unmapped all the unused pages */
			for (page = 0; page < nrpages; page++) {
				struct hugepage_file *hp = &hugepg_tbl[page];

				/* find a page that matches the criteria */
				if ((hp->size == hpi[size].hugepage_sz) &&
						(hp->socket_id == (int) socket)) {

					/* if we skipped enough pages, unmap the rest */
					if (pages_found == hpi[size].num_pages[socket]) {
						uint64_t unmap_len;

						unmap_len = hp->size;

						/* get start addr and len of the remaining segment */
						munmap(hp->orig_va,
							(size_t)unmap_len);

						/* mark the entry inactive for later passes */
						hp->orig_va = NULL;
						if (unlink(hp->filepath) == -1) {
							EAL_LOG(ERR, "%s(): Removing %s failed: %s",
								__func__, hp->filepath, strerror(errno));
							return -1;
						}
					} else {
						/* lock the page and skip */
						pages_found++;
					}

				} /* match page */
			} /* foreach page */
		} /* foreach socket */
	} /* foreach pagesize */

	return 0;
}
682 : :
/* Remap the hugepages in hugepages[seg_start..seg_end) into a memseg list's
 * preallocated VA space, registering each page as a memseg.
 *
 * Finds a memseg list matching the segment's page size and socket with enough
 * contiguous free slots (leaving a one-slot hole between segments, since
 * separate segments are not IOVA-contiguous), maps each page's file at the
 * list's address with MAP_FIXED, releases the original mapping, and fills in
 * the rte_memseg fields. The open fd is handed to eal_memalloc_set_seg_fd()
 * and intentionally not closed here on success.
 *
 * Returns the number of pages actually mapped (which may be fewer than
 * requested if the list had less free space), or -1 on failure.
 */
static int
remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	struct rte_fbarray *arr;
	int cur_page, seg_len;
	unsigned int msl_idx;
	int ms_idx;
	uint64_t page_sz;
	size_t memseg_len;
	int socket_id;
#ifndef RTE_ARCH_64
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
#endif
	page_sz = hugepages[seg_start].size;
	socket_id = hugepages[seg_start].socket_id;
	seg_len = seg_end - seg_start;

	EAL_LOG(DEBUG, "Attempting to map %" PRIu64 "M on socket %i",
		(seg_len * page_sz) >> 20ULL, socket_id);

	/* find free space in memseg lists */
	for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
		int free_len;
		bool empty;
		msl = &mcfg->memsegs[msl_idx];
		arr = &msl->memseg_arr;

		if (msl->page_sz != page_sz)
			continue;
		if (msl->socket_id != socket_id)
			continue;

		/* leave space for a hole if array is not empty */
		empty = arr->count == 0;
		/* find start of the biggest contiguous block and its size */
		ms_idx = rte_fbarray_find_biggest_free(arr, 0);
		if (ms_idx < 0)
			continue;
		/* hole is 1 segment long, so at least two segments long. */
		free_len = rte_fbarray_find_contig_free(arr, ms_idx);
		if (free_len < 2)
			continue;
		/* leave some space between memsegs, they are not IOVA
		 * contiguous, so they shouldn't be VA contiguous either.
		 */
		if (!empty) {
			ms_idx++;
			free_len--;
		}

		/* we might not get all of the space we wanted */
		free_len = RTE_MIN(seg_len, free_len);
		seg_end = seg_start + free_len;
		seg_len = seg_end - seg_start;
		break;
	}
	if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
		EAL_LOG(ERR, "Could not find space for memseg. Please increase RTE_MAX_MEMSEG_PER_LIST "
			"RTE_MAX_MEMSEG_PER_TYPE and/or RTE_MAX_MEM_MB_PER_TYPE in configuration.");
		return -1;
	}

#ifdef RTE_ARCH_PPC_64
	/* for PPC64 we go through the list backwards */
	for (cur_page = seg_end - 1; cur_page >= seg_start;
			cur_page--, ms_idx++) {
#else
	for (cur_page = seg_start; cur_page < seg_end; cur_page++, ms_idx++) {
#endif
		struct hugepage_file *hfile = &hugepages[cur_page];
		struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
		void *addr;
		int fd;

		fd = open(hfile->filepath, O_RDWR);
		if (fd < 0) {
			EAL_LOG(ERR, "Could not open '%s': %s",
				hfile->filepath, strerror(errno));
			return -1;
		}
		/* set shared lock on the file. */
		if (flock(fd, LOCK_SH) < 0) {
			EAL_LOG(DEBUG, "Could not lock '%s': %s",
				hfile->filepath, strerror(errno));
			close(fd);
			return -1;
		}
		memseg_len = (size_t)page_sz;
		addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len);

		/* we know this address is already mmapped by memseg list, so
		 * using MAP_FIXED here is safe
		 */
		addr = mmap(addr, page_sz, PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0);
		if (addr == MAP_FAILED) {
			EAL_LOG(ERR, "Couldn't remap '%s': %s",
				hfile->filepath, strerror(errno));
			close(fd);
			return -1;
		}

		/* we have a new address, so unmap previous one */
#ifndef RTE_ARCH_64
		/* in 32-bit legacy mode, we have already unmapped the page */
		if (!internal_conf->legacy_mem)
			munmap(hfile->orig_va, page_sz);
#else
		munmap(hfile->orig_va, page_sz);
#endif

		hfile->orig_va = NULL;
		hfile->final_va = addr;

		/* rewrite physical addresses in IOVA as VA mode */
		if (rte_eal_iova_mode() == RTE_IOVA_VA)
			hfile->physaddr = (uintptr_t)addr;

		/* set up memseg data */
		ms->addr = addr;
		ms->hugepage_sz = page_sz;
		ms->len = memseg_len;
		ms->iova = hfile->physaddr;
		ms->socket_id = hfile->socket_id;
		ms->nchannel = rte_memory_get_nchannel();
		ms->nrank = rte_memory_get_nrank();

		rte_fbarray_set_used(arr, ms_idx);

		/* store segment fd internally */
		if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
			EAL_LOG(ERR, "Could not store segment fd: %s",
				rte_strerror(rte_errno));
	}
	EAL_LOG(DEBUG, "Allocated %" PRIu64 "M on socket %i",
		(seg_len * page_sz) >> 20, socket_id);
	return seg_len;
}
824 : :
825 : : static uint64_t
826 : : get_mem_amount(uint64_t page_sz, uint64_t max_mem)
827 : : {
828 : : uint64_t area_sz, max_pages;
829 : :
830 : : /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */
831 : : max_pages = RTE_MAX_MEMSEG_PER_LIST;
832 : : max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem);
833 : :
834 : : area_sz = RTE_MIN(page_sz * max_pages, max_mem);
835 : :
836 : : /* make sure the list isn't smaller than the page size */
837 : : area_sz = RTE_MAX(area_sz, page_sz);
838 : :
839 : : return RTE_ALIGN(area_sz, page_sz);
840 : : }
841 : :
842 : : static int
843 : : memseg_list_free(struct rte_memseg_list *msl)
844 : : {
845 : : if (rte_fbarray_destroy(&msl->memseg_arr)) {
846 : : EAL_LOG(ERR, "Cannot destroy memseg list");
847 : : return -1;
848 : : }
849 : : memset(msl, 0, sizeof(*msl));
850 : : return 0;
851 : : }
852 : :
853 : : /*
854 : : * Our VA space is not preallocated yet, so preallocate it here. We need to know
855 : : * how many segments there are in order to map all pages into one address space,
856 : : * and leave appropriate holes between segments so that rte_malloc does not
857 : : * concatenate them into one big segment.
858 : : *
859 : : * we also need to unmap original pages to free up address space.
860 : : */
861 : : static int __rte_unused
862 : : prealloc_segments(struct hugepage_file *hugepages, int n_pages)
863 : : {
864 : : struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
865 : : int cur_page, seg_start_page, end_seg, new_memseg;
866 : : unsigned int hpi_idx, socket, i;
867 : : int n_contig_segs, n_segs;
868 : : int msl_idx;
869 : : const struct internal_config *internal_conf =
870 : : eal_get_internal_configuration();
871 : :
872 : : /* before we preallocate segments, we need to free up our VA space.
873 : : * we're not removing files, and we already have information about
874 : : * PA-contiguousness, so it is safe to unmap everything.
875 : : */
876 : : for (cur_page = 0; cur_page < n_pages; cur_page++) {
877 : : struct hugepage_file *hpi = &hugepages[cur_page];
878 : : munmap(hpi->orig_va, hpi->size);
879 : : hpi->orig_va = NULL;
880 : : }
881 : :
882 : : /* we cannot know how many page sizes and sockets we have discovered, so
883 : : * loop over all of them
884 : : */
885 : : for (hpi_idx = 0; hpi_idx < internal_conf->num_hugepage_sizes;
886 : : hpi_idx++) {
887 : : uint64_t page_sz =
888 : : internal_conf->hugepage_info[hpi_idx].hugepage_sz;
889 : :
890 : : for (i = 0; i < rte_socket_count(); i++) {
891 : : struct rte_memseg_list *msl;
892 : :
893 : : socket = rte_socket_id_by_idx(i);
894 : : n_contig_segs = 0;
895 : : n_segs = 0;
896 : : seg_start_page = -1;
897 : :
898 : : for (cur_page = 0; cur_page < n_pages; cur_page++) {
899 : : struct hugepage_file *prev, *cur;
900 : : int prev_seg_start_page = -1;
901 : :
902 : : cur = &hugepages[cur_page];
903 : : prev = cur_page == 0 ? NULL :
904 : : &hugepages[cur_page - 1];
905 : :
906 : : new_memseg = 0;
907 : : end_seg = 0;
908 : :
909 : : if (cur->size == 0)
910 : : end_seg = 1;
911 : : else if (cur->socket_id != (int) socket)
912 : : end_seg = 1;
913 : : else if (cur->size != page_sz)
914 : : end_seg = 1;
915 : : else if (cur_page == 0)
916 : : new_memseg = 1;
917 : : #ifdef RTE_ARCH_PPC_64
918 : : /* On PPC64 architecture, the mmap always start
919 : : * from higher address to lower address. Here,
920 : : * physical addresses are in descending order.
921 : : */
922 : : else if ((prev->physaddr - cur->physaddr) !=
923 : : cur->size)
924 : : new_memseg = 1;
925 : : #else
926 : : else if ((cur->physaddr - prev->physaddr) !=
927 : : cur->size)
928 : : new_memseg = 1;
929 : : #endif
930 : : if (new_memseg) {
931 : : /* if we're already inside a segment,
932 : : * new segment means end of current one
933 : : */
934 : : if (seg_start_page != -1) {
935 : : end_seg = 1;
936 : : prev_seg_start_page =
937 : : seg_start_page;
938 : : }
939 : : seg_start_page = cur_page;
940 : : }
941 : :
942 : : if (end_seg) {
943 : : if (prev_seg_start_page != -1) {
944 : : /* we've found a new segment */
945 : : n_contig_segs++;
946 : : n_segs += cur_page -
947 : : prev_seg_start_page;
948 : : } else if (seg_start_page != -1) {
949 : : /* we didn't find new segment,
950 : : * but did end current one
951 : : */
952 : : n_contig_segs++;
953 : : n_segs += cur_page -
954 : : seg_start_page;
955 : : seg_start_page = -1;
956 : : continue;
957 : : } else {
958 : : /* we're skipping this page */
959 : : continue;
960 : : }
961 : : }
962 : : /* segment continues */
963 : : }
964 : : /* check if we missed last segment */
965 : : if (seg_start_page != -1) {
966 : : n_contig_segs++;
967 : : n_segs += cur_page - seg_start_page;
968 : : }
969 : :
970 : : /* if no segments were found, do not preallocate */
971 : : if (n_segs == 0)
972 : : continue;
973 : :
974 : : /* we now have total number of pages that we will
975 : : * allocate for this segment list. add separator pages
976 : : * to the total count, and preallocate VA space.
977 : : */
978 : : n_segs += n_contig_segs - 1;
979 : :
980 : : /* now, preallocate VA space for these segments */
981 : :
982 : : /* first, find suitable memseg list for this */
983 : : for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;
984 : : msl_idx++) {
985 : : msl = &mcfg->memsegs[msl_idx];
986 : :
987 : : if (msl->base_va != NULL)
988 : : continue;
989 : : break;
990 : : }
991 : : if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
992 : : EAL_LOG(ERR, "Not enough space in memseg lists, please increase RTE_MAX_MEMSEG_LISTS");
993 : : return -1;
994 : : }
995 : :
996 : : /* now, allocate fbarray itself */
997 : : if (eal_memseg_list_init(msl, page_sz, n_segs,
998 : : socket, msl_idx, true) < 0)
999 : : return -1;
1000 : :
1001 : : /* finally, allocate VA space */
1002 : : if (eal_memseg_list_alloc(msl, 0) < 0) {
1003 : : EAL_LOG(ERR, "Cannot preallocate 0x%"PRIx64"kB hugepages",
1004 : : page_sz >> 10);
1005 : : return -1;
1006 : : }
1007 : : }
1008 : : }
1009 : : return 0;
1010 : : }
1011 : :
1012 : : /*
1013 : : * We cannot reallocate memseg lists on the fly because PPC64 stores pages
1014 : : * backwards, therefore we have to process the entire memseg first before
1015 : : * remapping it into memseg list VA space.
1016 : : */
1017 : : static int
1018 : 2 : remap_needed_hugepages(struct hugepage_file *hugepages, int n_pages)
1019 : : {
1020 : : int cur_page, seg_start_page, new_memseg, ret;
1021 : :
1022 : : seg_start_page = 0;
1023 [ + - ]: 20 : for (cur_page = 0; cur_page < n_pages; cur_page++) {
1024 : : struct hugepage_file *prev, *cur;
1025 : :
1026 : : new_memseg = 0;
1027 : :
1028 : 20 : cur = &hugepages[cur_page];
1029 [ + + ]: 20 : prev = cur_page == 0 ? NULL : &hugepages[cur_page - 1];
1030 : :
1031 : : /* if size is zero, no more pages left */
1032 [ + + ]: 20 : if (cur->size == 0)
1033 : : break;
1034 : :
1035 [ + + ]: 18 : if (cur_page == 0)
1036 : : new_memseg = 1;
1037 [ + - ]: 16 : else if (cur->socket_id != prev->socket_id)
1038 : : new_memseg = 1;
1039 [ + - ]: 16 : else if (cur->size != prev->size)
1040 : : new_memseg = 1;
1041 : : #ifdef RTE_ARCH_PPC_64
1042 : : /* On PPC64 architecture, the mmap always start from higher
1043 : : * address to lower address. Here, physical addresses are in
1044 : : * descending order.
1045 : : */
1046 : : else if ((prev->physaddr - cur->physaddr) != cur->size)
1047 : : new_memseg = 1;
1048 : : #else
1049 [ - + ]: 16 : else if ((cur->physaddr - prev->physaddr) != cur->size)
1050 : : new_memseg = 1;
1051 : : #endif
1052 : :
1053 : : if (new_memseg) {
1054 : : /* if this isn't the first time, remap segment */
1055 [ - + ]: 2 : if (cur_page != 0) {
1056 : : int n_remapped = 0;
1057 : 0 : int n_needed = cur_page - seg_start_page;
1058 [ # # ]: 0 : while (n_remapped < n_needed) {
1059 : 0 : ret = remap_segment(hugepages, seg_start_page,
1060 : : cur_page);
1061 [ # # ]: 0 : if (ret < 0)
1062 : : return -1;
1063 : 0 : n_remapped += ret;
1064 : 0 : seg_start_page += ret;
1065 : : }
1066 : : }
1067 : : /* remember where we started */
1068 : : seg_start_page = cur_page;
1069 : : }
1070 : : /* continuation of previous memseg */
1071 : : }
1072 : : /* we were stopped, but we didn't remap the last segment, do it now */
1073 [ + - ]: 2 : if (cur_page != 0) {
1074 : : int n_remapped = 0;
1075 : 2 : int n_needed = cur_page - seg_start_page;
1076 [ + + ]: 4 : while (n_remapped < n_needed) {
1077 : 2 : ret = remap_segment(hugepages, seg_start_page,
1078 : : cur_page);
1079 [ + - ]: 2 : if (ret < 0)
1080 : : return -1;
1081 : 2 : n_remapped += ret;
1082 : 2 : seg_start_page += ret;
1083 : : }
1084 : : }
1085 : : return 0;
1086 : : }
1087 : :
1088 : : static inline size_t
1089 : 0 : eal_get_hugepage_mem_size(void)
1090 : : {
1091 : : uint64_t size = 0;
1092 : : unsigned i, j;
1093 : : struct internal_config *internal_conf =
1094 : 0 : eal_get_internal_configuration();
1095 : :
1096 [ # # ]: 0 : for (i = 0; i < internal_conf->num_hugepage_sizes; i++) {
1097 : : struct hugepage_info *hpi = &internal_conf->hugepage_info[i];
1098 [ # # ]: 0 : if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) {
1099 [ # # ]: 0 : for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
1100 : 0 : size += hpi->hugepage_sz * hpi->num_pages[j];
1101 : : }
1102 : : }
1103 : : }
1104 : :
1105 : 0 : return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
1106 : : }
1107 : :
1108 : : static struct sigaction huge_action_old;
1109 : : static int huge_need_recover;
1110 : :
1111 : : static void
1112 : 2 : huge_register_sigbus(void)
1113 : : {
1114 : : sigset_t mask;
1115 : : struct sigaction action;
1116 : :
1117 : 2 : sigemptyset(&mask);
1118 : 2 : sigaddset(&mask, SIGBUS);
1119 : 2 : action.sa_flags = 0;
1120 : 2 : action.sa_mask = mask;
1121 : 2 : action.sa_handler = huge_sigbus_handler;
1122 : :
1123 : 2 : huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
1124 : 2 : }
1125 : :
1126 : : static void
1127 : : huge_recover_sigbus(void)
1128 : : {
1129 [ + - ]: 2 : if (huge_need_recover) {
1130 : 2 : sigaction(SIGBUS, &huge_action_old, NULL);
1131 : 2 : huge_need_recover = 0;
1132 : : }
1133 : : }
1134 : :
/*
 * Prepare physical memory mapping: fill configuration structure with
 * these infos, return 0 on success.
 *  1. map N huge pages in separate files in hugetlbfs
 *  2. find associated physical addr
 *  3. find associated NUMA socket ID
 *  4. sort all huge pages by physical address
 *  5. remap these N huge pages in the correct order
 *  6. unmap the first mapping
 *  7. fill memsegs in configuration with contiguous zones
 *
 * Also handles the --no-huge case entirely (anonymous/memfd mapping) and
 * returns early from that branch. On any failure after the sigbus handler
 * is registered, control jumps to the common "fail:" cleanup label.
 */
static int
eal_legacy_hugepage_init(void)
{
	struct rte_mem_config *mcfg;
	struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
	struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	uint64_t memory[RTE_MAX_NUMA_NODES];

	unsigned hp_offset;
	int i, j;
	int nr_hugefiles, nr_hugepages = 0;
	void *addr;

	memset(used_hp, 0, sizeof(used_hp));

	/* get pointer to global configuration */
	mcfg = rte_eal_get_configuration()->mem_config;

	/* hugetlbfs can be disabled */
	if (internal_conf->no_hugetlbfs) {
		void *prealloc_addr;
		size_t mem_sz;
		struct rte_memseg_list *msl;
		int n_segs, fd, flags;
		int memfd;
		uint64_t page_sz;

		/* nohuge mode is legacy mode */
		internal_conf->legacy_mem = 1;

		/* nohuge mode is single-file segments mode */
		internal_conf->single_file_segments = 1;

		/* create a memseg list */
		msl = &mcfg->memsegs[0];

		mem_sz = internal_conf->memory;
		page_sz = RTE_PGSIZE_4K;
		n_segs = mem_sz / page_sz;

		if (eal_memseg_list_init_named(
				msl, "nohugemem", page_sz, n_segs, 0, true)) {
			return -1;
		}

		/* set up parameters for anonymous mmap */
		fd = -1;
		flags = MAP_PRIVATE | MAP_ANONYMOUS;

		/* create a memfd and store it in the segment fd table */
		memfd = memfd_create("nohuge", 0);
		if (memfd < 0) {
			/* memfd is optional here; plain anonymous memory works too */
			EAL_LOG(DEBUG, "Cannot create memfd: %s",
					strerror(errno));
			EAL_LOG(DEBUG, "Falling back to anonymous map");
		} else {
			/* we got an fd - now resize it */
			if (ftruncate(memfd, internal_conf->memory) < 0) {
				EAL_LOG(ERR, "Cannot resize memfd: %s",
						strerror(errno));
				EAL_LOG(ERR, "Falling back to anonymous map");
				close(memfd);
			} else {
				/* creating memfd-backed file was successful.
				 * we want changes to memfd to be visible to
				 * other processes (such as vhost backend), so
				 * map it as shared memory.
				 */
				EAL_LOG(DEBUG, "Using memfd for anonymous memory");
				fd = memfd;
				flags = MAP_SHARED;
			}
		}
		/* preallocate address space for the memory, so that it can be
		 * fit into the DMA mask.
		 */
		if (eal_memseg_list_alloc(msl, 0)) {
			EAL_LOG(ERR, "Cannot preallocate VA space for hugepage memory");
			return -1;
		}

		/* map on top of the reserved VA; MAP_FIXED keeps the address */
		prealloc_addr = msl->base_va;
		addr = mmap(prealloc_addr, mem_sz, PROT_READ | PROT_WRITE,
				flags | MAP_FIXED, fd, 0);
		if (addr == MAP_FAILED || addr != prealloc_addr) {
			EAL_LOG(ERR, "%s: mmap() failed: %s", __func__,
					strerror(errno));
			munmap(prealloc_addr, mem_sz);
			return -1;
		}

		/* we're in single-file segments mode, so only the segment list
		 * fd needs to be set up.
		 */
		if (fd != -1) {
			if (eal_memalloc_set_seg_list_fd(0, fd) < 0) {
				EAL_LOG(ERR, "Cannot set up segment list fd");
				/* not a serious error, proceed */
			}
		}

		eal_memseg_list_populate(msl, addr, n_segs);

		if (mcfg->dma_maskbits &&
		    rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
			EAL_LOG(ERR,
				"%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.",
				__func__);
			if (rte_eal_iova_mode() == RTE_IOVA_VA &&
			    rte_eal_using_phys_addrs())
				EAL_LOG(ERR,
					"%s(): Please try initializing EAL with --iova-mode=pa parameter.",
					__func__);
			goto fail;
		}
		return 0;
	}

	/* calculate total number of hugepages available. at this point we haven't
	 * yet started sorting them so they all are on socket 0 */
	for (i = 0; i < (int) internal_conf->num_hugepage_sizes; i++) {
		/* meanwhile, also initialize used_hp hugepage sizes in used_hp */
		used_hp[i].hugepage_sz = internal_conf->hugepage_info[i].hugepage_sz;

		nr_hugepages += internal_conf->hugepage_info[i].num_pages[0];
	}

	/*
	 * allocate a memory area for hugepage table.
	 * this isn't shared memory yet. due to the fact that we need some
	 * processing done on these pages, shared memory will be created
	 * at a later stage.
	 */
	tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file));
	if (tmp_hp == NULL)
		goto fail;

	memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file));

	hp_offset = 0; /* where we start the current page size entries */

	/* SIGBUS can fire while touching hugepages below; catch it until
	 * huge_recover_sigbus() is called
	 */
	huge_register_sigbus();

	/* make a copy of numa_mem, needed for balanced allocation. */
	for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
		memory[i] = internal_conf->numa_mem[i];

	/* map all hugepages and sort them */
	for (i = 0; i < (int)internal_conf->num_hugepage_sizes; i++) {
		unsigned pages_old, pages_new;
		struct hugepage_info *hpi;

		/*
		 * we don't yet mark hugepages as used at this stage, so
		 * we just map all hugepages available to the system
		 * all hugepages are still located on socket 0
		 */
		hpi = &internal_conf->hugepage_info[i];

		if (hpi->num_pages[0] == 0)
			continue;

		/* map all hugepages available */
		pages_old = hpi->num_pages[0];
		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory);
		if (pages_new < pages_old) {
			/* partial allocation is not fatal; shrink the counts */
			EAL_LOG(DEBUG,
				"%d not %d hugepages of size %u MB allocated",
				pages_new, pages_old,
				(unsigned)(hpi->hugepage_sz / 0x100000));

			int pages = pages_old - pages_new;

			nr_hugepages -= pages;
			hpi->num_pages[0] = pages_new;
			if (pages_new == 0)
				continue;
		}

		if (rte_eal_using_phys_addrs() &&
				rte_eal_iova_mode() != RTE_IOVA_VA) {
			/* find physical addresses for each hugepage */
			if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
				EAL_LOG(DEBUG, "Failed to find phys addr "
					"for %u MB pages",
					(unsigned int)(hpi->hugepage_sz / 0x100000));
				goto fail;
			}
		} else {
			/* set physical addresses for each hugepage */
			if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
				EAL_LOG(DEBUG, "Failed to set phys addr "
					"for %u MB pages",
					(unsigned int)(hpi->hugepage_sz / 0x100000));
				goto fail;
			}
		}

		if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
			EAL_LOG(DEBUG, "Failed to find NUMA socket for %u MB pages",
					(unsigned)(hpi->hugepage_sz / 0x100000));
			goto fail;
		}

		/* sort this size's pages by physical address (step 4 above) */
		qsort(&tmp_hp[hp_offset], hpi->num_pages[0],
		      sizeof(struct hugepage_file), cmp_physaddr);

		/* we have processed a num of hugepages of this size, so inc offset */
		hp_offset += hpi->num_pages[0];
	}

	huge_recover_sigbus();

	if (internal_conf->memory == 0 && internal_conf->force_numa == 0)
		internal_conf->memory = eal_get_hugepage_mem_size();

	nr_hugefiles = nr_hugepages;


	/* clean out the numbers of pages */
	for (i = 0; i < (int) internal_conf->num_hugepage_sizes; i++)
		for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
			internal_conf->hugepage_info[i].num_pages[j] = 0;

	/* get hugepages for each socket */
	for (i = 0; i < nr_hugefiles; i++) {
		int socket = tmp_hp[i].socket_id;

		/* find a hugepage info with right size and increment num_pages */
		const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES,
				(int)internal_conf->num_hugepage_sizes);
		for (j = 0; j < nb_hpsizes; j++) {
			if (tmp_hp[i].size ==
					internal_conf->hugepage_info[j].hugepage_sz) {
				internal_conf->hugepage_info[j].num_pages[socket]++;
			}
		}
	}

	/* make a copy of numa_mem, needed for number of pages calculation */
	for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
		memory[i] = internal_conf->numa_mem[i];

	/* calculate final number of pages */
	nr_hugepages = eal_dynmem_calc_num_pages_per_socket(memory,
			internal_conf->hugepage_info, used_hp,
			internal_conf->num_hugepage_sizes);

	/* error if not enough memory available */
	if (nr_hugepages < 0)
		goto fail;

	/* reporting in! */
	for (i = 0; i < (int) internal_conf->num_hugepage_sizes; i++) {
		for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
			if (used_hp[i].num_pages[j] > 0) {
				EAL_LOG(DEBUG,
					"Requesting %u pages of size %uMB"
					" from socket %i",
					used_hp[i].num_pages[j],
					(unsigned)
					(used_hp[i].hugepage_sz / 0x100000),
					j);
			}
		}
	}

	/* create shared memory */
	hugepage = create_shared_memory(eal_hugepage_data_path(),
			nr_hugefiles * sizeof(struct hugepage_file));

	if (hugepage == NULL) {
		EAL_LOG(ERR, "Failed to create shared memory!");
		goto fail;
	}
	memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file));

	/*
	 * unmap pages that we won't need (looks at used_hp).
	 * also, sets final_va to NULL on pages that were unmapped.
	 */
	if (unmap_unneeded_hugepages(tmp_hp, used_hp,
			internal_conf->num_hugepage_sizes) < 0) {
		EAL_LOG(ERR, "Unmapping and locking hugepages failed!");
		goto fail;
	}

	/*
	 * copy stuff from malloc'd hugepage* to the actual shared memory.
	 * this procedure only copies those hugepages that have orig_va
	 * not NULL. has overflow protection.
	 */
	if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles,
			tmp_hp, nr_hugefiles) < 0) {
		EAL_LOG(ERR, "Copying tables to shared memory failed!");
		goto fail;
	}

#ifndef RTE_ARCH_64
	/* for legacy 32-bit mode, we did not preallocate VA space, so do it */
	if (internal_conf->legacy_mem &&
			prealloc_segments(hugepage, nr_hugefiles)) {
		EAL_LOG(ERR, "Could not preallocate VA space for hugepages");
		goto fail;
	}
#endif

	/* remap all pages we do need into memseg list VA space, so that those
	 * pages become first-class citizens in DPDK memory subsystem
	 */
	if (remap_needed_hugepages(hugepage, nr_hugefiles)) {
		EAL_LOG(ERR, "Couldn't remap hugepage files into memseg lists");
		goto fail;
	}

	/* free the hugepage backing files */
	if (internal_conf->hugepage_file.unlink_before_mapping &&
		unlink_hugepage_files(tmp_hp, internal_conf->num_hugepage_sizes) < 0) {
		EAL_LOG(ERR, "Unlinking hugepage files failed!");
		goto fail;
	}

	/* free the temporary hugepage table */
	free(tmp_hp);
	tmp_hp = NULL;

	munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
	hugepage = NULL;

	/* we're not going to allocate more pages, so release VA space for
	 * unused memseg lists
	 */
	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];
		size_t mem_sz;

		/* skip inactive lists */
		if (msl->base_va == NULL)
			continue;
		/* skip lists where there is at least one page allocated */
		if (msl->memseg_arr.count > 0)
			continue;
		/* this is an unused list, deallocate it */
		mem_sz = msl->len;
		munmap(msl->base_va, mem_sz);
		msl->base_va = NULL;
		msl->len = 0;
		msl->heap = 0;

		/* destroy backing fbarray */
		rte_fbarray_destroy(&msl->memseg_arr);
	}

	if (mcfg->dma_maskbits &&
	    rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
		EAL_LOG(ERR,
			"%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.",
			__func__);
		goto fail;
	}

	return 0;

	/* common error exit: drop the SIGBUS handler (no-op if already
	 * recovered) and release the temporary and shared tables
	 */
fail:
	huge_recover_sigbus();
	free(tmp_hp);
	if (hugepage != NULL)
		munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));

	return -1;
}
1520 : :
/*
 * uses fstat to report the size of a file on disk;
 * returns 0 if the descriptor cannot be stat'ed
 */
static off_t
getFileSize(int fd)
{
	struct stat st;

	return fstat(fd, &st) == 0 ? st.st_size : 0;
}
1532 : :
/*
 * This creates the memory mappings in the secondary process to match that of
 * the server process. It goes through each memory segment in the DPDK runtime
 * configuration and finds the hugepages which form that segment, mapping them
 * in order to form a contiguous block in the virtual memory space
 *
 * Returns 0 on success, -1 on failure; on failure, every mapping and
 * descriptor acquired so far is unwound via the goto labels at the bottom.
 */
static int
eal_legacy_hugepage_attach(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct hugepage_file *hp = NULL;
	unsigned int num_hp = 0;
	unsigned int i = 0;
	unsigned int cur_seg;
	off_t size = 0;
	int fd, fd_hugepage = -1;

	/* ASLR can move our preallocated VA regions; warn, but proceed */
	if (aslr_enabled() > 0) {
		EAL_LOG(WARNING, "WARNING: Address Space Layout Randomization "
				"(ASLR) is enabled in the kernel.");
		EAL_LOG(WARNING, "   This may cause issues with mapping memory "
				"into secondary processes");
	}

	/* open the hugepage metadata file written by the primary process */
	fd_hugepage = open(eal_hugepage_data_path(), O_RDONLY);
	if (fd_hugepage < 0) {
		EAL_LOG(ERR, "Could not open %s",
				eal_hugepage_data_path());
		goto error;
	}

	size = getFileSize(fd_hugepage);
	hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0);
	if (hp == MAP_FAILED) {
		EAL_LOG(ERR, "Could not mmap %s",
				eal_hugepage_data_path());
		goto error;
	}

	num_hp = size / sizeof(struct hugepage_file);
	EAL_LOG(DEBUG, "Analysing %u files", num_hp);

	/* map all segments into memory to make sure we get the addrs. the
	 * segments themselves are already in memseg list (which is shared and
	 * has its VA space already preallocated), so we just need to map
	 * everything into correct addresses.
	 */
	for (i = 0; i < num_hp; i++) {
		struct hugepage_file *hf = &hp[i];
		size_t map_sz = hf->size;
		void *map_addr = hf->final_va;
		int msl_idx, ms_idx;
		struct rte_memseg_list *msl;
		struct rte_memseg *ms;

		/* if size is zero, no more pages left */
		if (map_sz == 0)
			break;

		fd = open(hf->filepath, O_RDWR);
		if (fd < 0) {
			EAL_LOG(ERR, "Could not open %s: %s",
				hf->filepath, strerror(errno));
			goto error;
		}

		/* MAP_FIXED: must land at the primary's final_va exactly */
		map_addr = mmap(map_addr, map_sz, PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_FIXED, fd, 0);
		if (map_addr == MAP_FAILED) {
			EAL_LOG(ERR, "Could not map %s: %s",
				hf->filepath, strerror(errno));
			goto fd_error;
		}

		/* set shared lock on the file. */
		if (flock(fd, LOCK_SH) < 0) {
			EAL_LOG(DEBUG, "%s(): Locking file failed: %s",
				__func__, strerror(errno));
			goto mmap_error;
		}

		/* find segment data */
		msl = rte_mem_virt2memseg_list(map_addr);
		if (msl == NULL) {
			EAL_LOG(DEBUG, "%s(): Cannot find memseg list",
				__func__);
			goto mmap_error;
		}
		ms = rte_mem_virt2memseg(map_addr, msl);
		if (ms == NULL) {
			EAL_LOG(DEBUG, "%s(): Cannot find memseg",
				__func__);
			goto mmap_error;
		}

		msl_idx = msl - mcfg->memsegs;
		ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
		if (ms_idx < 0) {
			EAL_LOG(DEBUG, "%s(): Cannot find memseg idx",
				__func__);
			goto mmap_error;
		}

		/* store segment fd internally; failure is logged but not fatal */
		if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
			EAL_LOG(ERR, "Could not store segment fd: %s",
				rte_strerror(rte_errno));
	}
	/* unmap the hugepage config file, since we are done using it */
	munmap(hp, size);
	close(fd_hugepage);
	return 0;

	/* unwind in reverse acquisition order: current page's mapping,
	 * then its fd, then all previously-mapped pages [0, i)
	 */
mmap_error:
	munmap(hp[i].final_va, hp[i].size);
fd_error:
	close(fd);
error:
	/* unwind mmap's done so far */
	for (cur_seg = 0; cur_seg < i; cur_seg++)
		munmap(hp[cur_seg].final_va, hp[cur_seg].size);

	if (hp != NULL && hp != MAP_FAILED)
		munmap(hp, size);
	if (fd_hugepage >= 0)
		close(fd_hugepage);
	return -1;
}
1661 : :
1662 : : static int
1663 : 26 : eal_hugepage_attach(void)
1664 : : {
1665 [ + + ]: 26 : if (eal_memalloc_sync_with_primary()) {
1666 : 1 : EAL_LOG(ERR, "Could not map memory from primary process");
1667 [ + - ]: 1 : if (aslr_enabled() > 0)
1668 : 1 : EAL_LOG(ERR, "It is recommended to disable ASLR in the kernel and retry running both primary and secondary processes");
1669 : 1 : return -1;
1670 : : }
1671 : : return 0;
1672 : : }
1673 : :
1674 : : int
1675 : 180 : rte_eal_hugepage_init(void)
1676 : : {
1677 : : const struct internal_config *internal_conf =
1678 : 180 : eal_get_internal_configuration();
1679 : :
1680 : 180 : return internal_conf->legacy_mem ?
1681 [ + + ]: 180 : eal_legacy_hugepage_init() :
1682 : 57 : eal_dynmem_hugepage_init();
1683 : : }
1684 : :
1685 : : int
1686 : 26 : rte_eal_hugepage_attach(void)
1687 : : {
1688 : : const struct internal_config *internal_conf =
1689 : 26 : eal_get_internal_configuration();
1690 : :
1691 : 26 : return internal_conf->legacy_mem ?
1692 [ - + ]: 26 : eal_legacy_hugepage_attach() :
1693 : 26 : eal_hugepage_attach();
1694 : : }
1695 : :
1696 : : RTE_EXPORT_SYMBOL(rte_eal_using_phys_addrs)
1697 : : int
1698 : 211 : rte_eal_using_phys_addrs(void)
1699 : : {
1700 [ + + ]: 211 : if (phys_addrs_available == -1) {
1701 : 209 : uint64_t tmp = 0;
1702 : :
1703 [ + + + - ]: 297 : if (rte_eal_has_hugepages() != 0 &&
1704 : 88 : rte_mem_virt2phy(&tmp) != RTE_BAD_PHYS_ADDR)
1705 : 88 : phys_addrs_available = 1;
1706 : : else
1707 : 121 : phys_addrs_available = 0;
1708 : : }
1709 : 211 : return phys_addrs_available;
1710 : : }
1711 : :
1712 : : static int __rte_unused
1713 : : memseg_primary_init_32(void)
1714 : : {
1715 : : struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1716 : : int active_sockets, hpi_idx, msl_idx = 0;
1717 : : unsigned int socket_id, i;
1718 : : struct rte_memseg_list *msl;
1719 : : uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem;
1720 : : uint64_t max_mem;
1721 : : struct internal_config *internal_conf =
1722 : : eal_get_internal_configuration();
1723 : :
1724 : : /* no-huge does not need this at all */
1725 : : if (internal_conf->no_hugetlbfs)
1726 : : return 0;
1727 : :
1728 : : /* this is a giant hack, but desperate times call for desperate
1729 : : * measures. in legacy 32-bit mode, we cannot preallocate VA space,
1730 : : * because having upwards of 2 gigabytes of VA space already mapped will
1731 : : * interfere with our ability to map and sort hugepages.
1732 : : *
1733 : : * therefore, in legacy 32-bit mode, we will be initializing memseg
1734 : : * lists much later - in eal_memory.c, right after we unmap all the
1735 : : * unneeded pages. this will not affect secondary processes, as those
1736 : : * should be able to mmap the space without (too many) problems.
1737 : : */
1738 : : if (internal_conf->legacy_mem)
1739 : : return 0;
1740 : :
1741 : : /* 32-bit mode is a very special case. we cannot know in advance where
1742 : : * the user will want to allocate their memory, so we have to do some
1743 : : * heuristics.
1744 : : */
1745 : : active_sockets = 0;
1746 : : total_requested_mem = 0;
1747 : : if (internal_conf->force_numa)
1748 : : for (i = 0; i < rte_socket_count(); i++) {
1749 : : uint64_t mem;
1750 : :
1751 : : socket_id = rte_socket_id_by_idx(i);
1752 : : mem = internal_conf->numa_mem[socket_id];
1753 : :
1754 : : if (mem == 0)
1755 : : continue;
1756 : :
1757 : : active_sockets++;
1758 : : total_requested_mem += mem;
1759 : : }
1760 : : else
1761 : : total_requested_mem = internal_conf->memory;
1762 : :
1763 : : max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
1764 : : if (total_requested_mem > max_mem) {
1765 : : EAL_LOG(ERR, "Invalid parameters: 32-bit process can at most use %uM of memory",
1766 : : (unsigned int)(max_mem >> 20));
1767 : : return -1;
1768 : : }
1769 : : total_extra_mem = max_mem - total_requested_mem;
1770 : : extra_mem_per_socket = active_sockets == 0 ? total_extra_mem :
1771 : : total_extra_mem / active_sockets;
1772 : :
1773 : : /* the allocation logic is a little bit convoluted, but here's how it
1774 : : * works, in a nutshell:
1775 : : * - if user hasn't specified on which sockets to allocate memory via
1776 : : * --socket-mem, we allocate all of our memory on main core socket.
1777 : : * - if user has specified sockets to allocate memory on, there may be
1778 : : * some "unused" memory left (e.g. if user has specified --socket-mem
1779 : : * such that not all memory adds up to 2 gigabytes), so add it to all
1780 : : * sockets that are in use equally.
1781 : : *
1782 : : * page sizes are sorted by size in descending order, so we can safely
1783 : : * assume that we dispense with bigger page sizes first.
1784 : : */
1785 : :
1786 : : /* create memseg lists */
1787 : : for (i = 0; i < rte_socket_count(); i++) {
1788 : : int hp_sizes = (int) internal_conf->num_hugepage_sizes;
1789 : : uint64_t max_socket_mem, cur_socket_mem;
1790 : : unsigned int main_lcore_socket;
1791 : : struct rte_config *cfg = rte_eal_get_configuration();
1792 : : bool skip;
1793 : : int ret;
1794 : :
1795 : : ret = rte_socket_id_by_idx(i);
1796 : : if (ret == -1) {
1797 : : EAL_LOG(ERR, "Cannot get socket ID for socket index %u", i);
1798 : : return -1;
1799 : : }
1800 : : socket_id = (unsigned int)ret;
1801 : :
1802 : : #ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
1803 : : /* we can still sort pages by socket in legacy mode */
1804 : : if (!internal_conf->legacy_mem && socket_id > 0)
1805 : : break;
1806 : : #endif
1807 : :
1808 : : /* if we didn't specifically request memory on this socket */
1809 : : skip = active_sockets != 0 &&
1810 : : internal_conf->numa_mem[socket_id] == 0;
1811 : : /* ...or if we didn't specifically request memory on *any*
1812 : : * socket, and this is not main lcore
1813 : : */
1814 : : main_lcore_socket = rte_lcore_to_socket_id(cfg->main_lcore);
1815 : : skip |= active_sockets == 0 && socket_id != main_lcore_socket;
1816 : :
1817 : : if (skip) {
1818 : : EAL_LOG(DEBUG, "Will not preallocate memory on socket %u",
1819 : : socket_id);
1820 : : continue;
1821 : : }
1822 : :
1823 : : /* max amount of memory on this socket */
1824 : : max_socket_mem = (active_sockets != 0 ?
1825 : : internal_conf->numa_mem[socket_id] :
1826 : : internal_conf->memory) +
1827 : : extra_mem_per_socket;
1828 : : cur_socket_mem = 0;
1829 : :
1830 : : for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) {
1831 : : uint64_t max_pagesz_mem, cur_pagesz_mem = 0;
1832 : : uint64_t hugepage_sz;
1833 : : struct hugepage_info *hpi;
1834 : : int type_msl_idx, max_segs, total_segs = 0;
1835 : :
1836 : : hpi = &internal_conf->hugepage_info[hpi_idx];
1837 : : hugepage_sz = hpi->hugepage_sz;
1838 : :
1839 : : /* check if pages are actually available */
1840 : : if (hpi->num_pages[socket_id] == 0)
1841 : : continue;
1842 : :
1843 : : max_segs = RTE_MAX_MEMSEG_PER_TYPE;
1844 : : max_pagesz_mem = max_socket_mem - cur_socket_mem;
1845 : :
1846 : : /* make it multiple of page size */
1847 : : max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem,
1848 : : hugepage_sz);
1849 : :
1850 : : EAL_LOG(DEBUG, "Attempting to preallocate "
1851 : : "%" PRIu64 "M on socket %i",
1852 : : max_pagesz_mem >> 20, socket_id);
1853 : :
1854 : : type_msl_idx = 0;
1855 : : while (cur_pagesz_mem < max_pagesz_mem &&
1856 : : total_segs < max_segs) {
1857 : : uint64_t cur_mem;
1858 : : unsigned int n_segs;
1859 : :
1860 : : if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
1861 : : EAL_LOG(ERR,
1862 : : "No more space in memseg lists, please increase RTE_MAX_MEMSEG_LISTS");
1863 : : return -1;
1864 : : }
1865 : :
1866 : : msl = &mcfg->memsegs[msl_idx];
1867 : :
1868 : : cur_mem = get_mem_amount(hugepage_sz,
1869 : : max_pagesz_mem);
1870 : : n_segs = cur_mem / hugepage_sz;
1871 : :
1872 : : if (eal_memseg_list_init(msl, hugepage_sz,
1873 : : n_segs, socket_id, type_msl_idx,
1874 : : true)) {
1875 : : /* failing to allocate a memseg list is
1876 : : * a serious error.
1877 : : */
1878 : : EAL_LOG(ERR, "Cannot allocate memseg list");
1879 : : return -1;
1880 : : }
1881 : :
1882 : : if (eal_memseg_list_alloc(msl, 0)) {
1883 : : /* if we couldn't allocate VA space, we
1884 : : * can try with smaller page sizes.
1885 : : */
1886 : : EAL_LOG(ERR, "Cannot allocate VA space for memseg list, retrying with different page size");
1887 : : /* deallocate memseg list */
1888 : : if (memseg_list_free(msl))
1889 : : return -1;
1890 : : break;
1891 : : }
1892 : :
1893 : : total_segs += msl->memseg_arr.len;
1894 : : cur_pagesz_mem = total_segs * hugepage_sz;
1895 : : type_msl_idx++;
1896 : : msl_idx++;
1897 : : }
1898 : : cur_socket_mem += cur_pagesz_mem;
1899 : : }
1900 : : if (cur_socket_mem == 0) {
1901 : : EAL_LOG(ERR, "Cannot allocate VA space on socket %u",
1902 : : socket_id);
1903 : : return -1;
1904 : : }
1905 : : }
1906 : :
1907 : : return 0;
1908 : : }
1909 : :
1910 : : static int __rte_unused
1911 : : memseg_primary_init(void)
1912 : : {
1913 : 180 : return eal_dynmem_memseg_lists_init();
1914 : : }
1915 : :
1916 : : static int
1917 : 26 : memseg_secondary_init(void)
1918 : : {
1919 : 26 : struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1920 : : int msl_idx = 0;
1921 : : struct rte_memseg_list *msl;
1922 : :
1923 [ + + ]: 3354 : for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
1924 : :
1925 : 3328 : msl = &mcfg->memsegs[msl_idx];
1926 : :
1927 : : /* skip empty and external memseg lists */
1928 [ + + - + ]: 3328 : if (msl->memseg_arr.len == 0 || msl->external)
1929 : 3120 : continue;
1930 : :
1931 [ - + ]: 208 : if (rte_fbarray_attach(&msl->memseg_arr)) {
1932 : 0 : EAL_LOG(ERR, "Cannot attach to primary process memseg lists");
1933 : 0 : return -1;
1934 : : }
1935 : :
1936 : : /* preallocate VA space */
1937 [ - + ]: 208 : if (eal_memseg_list_alloc(msl, 0)) {
1938 : 0 : EAL_LOG(ERR, "Cannot preallocate VA space for hugepage memory");
1939 : 0 : return -1;
1940 : : }
1941 : : }
1942 : :
1943 : : return 0;
1944 : : }
1945 : :
1946 : : int
1947 : 206 : rte_eal_memseg_init(void)
1948 : : {
1949 : : /* increase rlimit to maximum */
1950 : : struct rlimit lim;
1951 : :
1952 : : #ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
1953 : : const struct internal_config *internal_conf =
1954 : : eal_get_internal_configuration();
1955 : : #endif
1956 [ + - ]: 206 : if (getrlimit(RLIMIT_NOFILE, &lim) == 0) {
1957 : : /* set limit to maximum */
1958 : 206 : lim.rlim_cur = lim.rlim_max;
1959 : :
1960 [ - + ]: 206 : if (setrlimit(RLIMIT_NOFILE, &lim) < 0) {
1961 : 0 : EAL_LOG(DEBUG, "Setting maximum number of open files failed: %s",
1962 : : strerror(errno));
1963 : : } else {
1964 : 206 : EAL_LOG(DEBUG, "Setting maximum number of open files to %"
1965 : : PRIu64,
1966 : : (uint64_t)lim.rlim_cur);
1967 : : }
1968 : : } else {
1969 : 0 : EAL_LOG(ERR, "Cannot get current resource limits");
1970 : : }
1971 : : #ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
1972 : : if (!internal_conf->legacy_mem && rte_socket_count() > 1) {
1973 : : EAL_LOG(WARNING, "DPDK is running on a NUMA system, but is compiled without NUMA support.");
1974 : : EAL_LOG(WARNING, "This will have adverse consequences for performance and usability.");
1975 : : EAL_LOG(WARNING, "Please use --legacy-mem option, or recompile with NUMA support.");
1976 : : }
1977 : : #endif
1978 : :
1979 : 206 : return rte_eal_process_type() == RTE_PROC_PRIMARY ?
1980 : : #ifndef RTE_ARCH_64
1981 : : memseg_primary_init_32() :
1982 : : #else
1983 [ + + ]: 206 : memseg_primary_init() :
1984 : : #endif
1985 : 26 : memseg_secondary_init();
1986 : : }
|