Branch data Line data Source code
1 : : /* SPDX-License-Identifier: BSD-3-Clause
2 : : * Copyright(c) 2010-2014 Intel Corporation.
3 : : * Copyright(c) 2013 6WIND S.A.
4 : : */
5 : :
6 : : #include <errno.h>
7 : : #include <fcntl.h>
8 : : #include <stdbool.h>
9 : : #include <stdlib.h>
10 : : #include <stdio.h>
11 : : #include <stdint.h>
12 : : #include <inttypes.h>
13 : : #include <string.h>
14 : : #include <sys/mman.h>
15 : : #include <sys/stat.h>
16 : : #include <sys/file.h>
17 : : #include <sys/resource.h>
18 : : #include <sys/personality.h>
19 : : #include <unistd.h>
20 : : #include <limits.h>
21 : : #include <signal.h>
22 : : #include <setjmp.h>
23 : : #ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
24 : : #define MEMFD_SUPPORTED
25 : : #endif
26 : : #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
27 : : #include <numa.h>
28 : : #include <numaif.h>
29 : : #endif
30 : :
31 : : #include <rte_errno.h>
32 : : #include <rte_log.h>
33 : : #include <rte_memory.h>
34 : : #include <rte_eal.h>
35 : : #include <rte_lcore.h>
36 : : #include <rte_common.h>
37 : :
38 : : #include <eal_export.h>
39 : : #include "eal_private.h"
40 : : #include "eal_memalloc.h"
41 : : #include "eal_memcfg.h"
42 : : #include "eal_internal_cfg.h"
43 : : #include "eal_filesystem.h"
44 : : #include "eal_hugepages.h"
45 : : #include "eal_options.h"
46 : :
47 : : #define PFN_MASK_SIZE 8
48 : :
49 : : /**
50 : : * @file
51 : : * Huge page mapping under linux
52 : : *
53 : : * To reserve a large contiguous amount of memory, we use the hugepage
54 : : * feature of Linux. For that, we need to have hugetlbfs mounted. This
55 : : * code will create many files in that directory (one per page) and
56 : : * map them into virtual memory. For each page, we will retrieve its
57 : : * physical address and remap it in order to obtain a zone that is
58 : : * contiguous both virtually and physically.
59 : : */
60 : :
61 : : static int phys_addrs_available = -1;
62 : :
63 : : #define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
64 : :
65 : 157 : uint64_t eal_get_baseaddr(void)
66 : : {
67 : : /*
68 : : * The Linux kernel uses a very high address as the starting address for
69 : : * serving mmap calls. If addressing limitations exist and the IOVA
70 : : * mode is VA, this starting address is likely too high for those
71 : : * devices. However, it is possible to use a lower address in the
72 : : * process virtual address space, as with 64 bits there is a lot of
73 : : * available space.
74 : : *
75 : : * Currently known limitations are 39 or 40 bits. Setting the starting
76 : : * address at 4GB leaves 508GB or 1020GB for mapping the
77 : : * available hugepages. This is likely enough for most systems, although
78 : : * a device with addressing limitations should call
79 : : * rte_mem_check_dma_mask to ensure all memory is within the supported
80 : : * range.
81 : : */
82 : : #if defined(RTE_ARCH_LOONGARCH)
83 : : return 0x7000000000ULL;
84 : : #else
85 : 157 : return 0x100000000ULL;
86 : : #endif
87 : : }
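
As an illustration, a minimal standalone sketch (hypothetical, not part of eal_memory.c) of the arithmetic behind the 508GB/1020GB figures in the comment above, assuming 39-bit and 40-bit addressing limits:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t base = 0x100000000ULL;	/* the 4GB base address returned above */

	/* room left for hugepage mappings under a 39-bit or 40-bit IOVA limit */
	printf("39-bit limit: %" PRIu64 " GB\n", ((1ULL << 39) - base) >> 30);	/* 508 */
	printf("40-bit limit: %" PRIu64 " GB\n", ((1ULL << 40) - base) >> 30);	/* 1020 */
	return 0;
}
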
88 : :
89 : : /*
90 : : * Get physical address of any mapped virtual address in the current process.
91 : : */
92 : : RTE_EXPORT_SYMBOL(rte_mem_virt2phy)
93 : : phys_addr_t
94 : 5368 : rte_mem_virt2phy(const void *virtaddr)
95 : : {
96 : : int fd, retval;
97 : : uint64_t page, physaddr;
98 : : unsigned long virt_pfn;
99 : : int page_size;
100 : : off_t offset;
101 : :
102 [ + - ]: 5368 : if (phys_addrs_available == 0)
103 : : return RTE_BAD_IOVA;
104 : :
105 : : /* standard page size */
106 : 5368 : page_size = getpagesize();
107 : :
108 : : fd = open("/proc/self/pagemap", O_RDONLY);
109 [ - + ]: 5368 : if (fd < 0) {
110 : 0 : EAL_LOG(INFO, "%s(): cannot open /proc/self/pagemap: %s",
111 : : __func__, strerror(errno));
112 : 0 : return RTE_BAD_IOVA;
113 : : }
114 : :
115 : 5368 : virt_pfn = (unsigned long)virtaddr / page_size;
116 : 5368 : offset = sizeof(uint64_t) * virt_pfn;
117 [ - + ]: 5368 : if (lseek(fd, offset, SEEK_SET) == (off_t) -1) {
118 : 0 : EAL_LOG(INFO, "%s(): seek error in /proc/self/pagemap: %s",
119 : : __func__, strerror(errno));
120 : 0 : close(fd);
121 : 0 : return RTE_BAD_IOVA;
122 : : }
123 : :
124 : 5368 : retval = read(fd, &page, PFN_MASK_SIZE);
125 : 5368 : close(fd);
126 [ - + ]: 5368 : if (retval < 0) {
127 : 0 : EAL_LOG(INFO, "%s(): cannot read /proc/self/pagemap: %s",
128 : : __func__, strerror(errno));
129 : 0 : return RTE_BAD_IOVA;
130 [ - + ]: 5368 : } else if (retval != PFN_MASK_SIZE) {
131 : 0 : EAL_LOG(INFO, "%s(): read %d bytes from /proc/self/pagemap "
132 : : "but expected %d:",
133 : : __func__, retval, PFN_MASK_SIZE);
134 : 0 : return RTE_BAD_IOVA;
135 : : }
136 : :
137 : : /*
138 : : * the pfn (page frame number) is stored in bits 0-54 (see
139 : : * pagemap.txt in the Linux Documentation)
140 : : */
141 [ + - ]: 5368 : if ((page & 0x7fffffffffffffULL) == 0)
142 : : return RTE_BAD_IOVA;
143 : :
144 : 5368 : physaddr = ((page & 0x7fffffffffffffULL) * page_size)
145 : 5368 : + ((unsigned long)virtaddr % page_size);
146 : :
147 : 5368 : return physaddr;
148 : : }
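
As an illustration, a minimal hypothetical sketch (not part of eal_memory.c; dump_pagemap_entry is an invented name) of decoding a single /proc/self/pagemap entry the same way rte_mem_virt2phy does, assuming the layout from pagemap.txt: bits 0-54 hold the PFN and bit 63 marks the page as present:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/* Read the pagemap entry describing 'virt' and print its PFN, if present. */
static int dump_pagemap_entry(const void *virt)
{
	uint64_t entry;
	long page_size = sysconf(_SC_PAGESIZE);
	off_t offset = (off_t)((uintptr_t)virt / page_size) * sizeof(uint64_t);
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0)
		return -1;
	if (pread(fd, &entry, sizeof(entry), offset) != (ssize_t)sizeof(entry)) {
		close(fd);
		return -1;
	}
	close(fd);

	if (entry & (1ULL << 63))	/* bit 63: page present in RAM */
		printf("pfn: 0x%llx\n",
			(unsigned long long)(entry & 0x7fffffffffffffULL));
	else
		printf("page not present\n");
	return 0;
}

int main(void)
{
	int x = 42;	/* any mapped address will do */

	return dump_pagemap_entry(&x);
}
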
149 : :
150 : : RTE_EXPORT_SYMBOL(rte_mem_virt2iova)
151 : : rte_iova_t
152 : 3236 : rte_mem_virt2iova(const void *virtaddr)
153 : : {
154 [ - + ]: 3236 : if (rte_eal_iova_mode() == RTE_IOVA_VA)
155 : 0 : return (uintptr_t)virtaddr;
156 : 3236 : return rte_mem_virt2phy(virtaddr);
157 : : }
158 : :
159 : : /*
160 : : * For each hugepage in hugepg_tbl, fill the physaddr value. We find
161 : : * it by browsing the /proc/self/pagemap special file.
162 : : */
163 : : static int
164 : 2 : find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
165 : : {
166 : : unsigned int i;
167 : : phys_addr_t addr;
168 : :
169 [ + + ]: 2048 : for (i = 0; i < hpi->num_pages[0]; i++) {
170 : 2046 : addr = rte_mem_virt2phy(hugepg_tbl[i].orig_va);
171 [ + - ]: 2046 : if (addr == RTE_BAD_PHYS_ADDR)
172 : : return -1;
173 : 2046 : hugepg_tbl[i].physaddr = addr;
174 : : }
175 : : return 0;
176 : : }
177 : :
178 : : /*
179 : : * For each hugepage in hugepg_tbl, fill the physaddr value sequentially.
180 : : */
181 : : static int
182 : : set_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
183 : : {
184 : : unsigned int i;
185 : : static phys_addr_t addr;
186 : :
187 [ # # ]: 0 : for (i = 0; i < hpi->num_pages[0]; i++) {
188 : 0 : hugepg_tbl[i].physaddr = addr;
189 : 0 : addr += hugepg_tbl[i].size;
190 : : }
191 : : return 0;
192 : : }
193 : :
194 : : /*
195 : : * Check whether address-space layout randomization is enabled in
196 : : * the kernel. This is important for multi-process setups, as ASLR can
197 : : * prevent two processes from mapping data at the same virtual address.
198 : : * Returns:
199 : : * 0 - address space randomization disabled
200 : : * 1/2 - address space randomization enabled
201 : : * negative error code on error
202 : : */
203 : : static int
204 : 2 : aslr_enabled(void)
205 : : {
206 : : char c;
207 : :
208 : : /*
209 : : * Check whether the current process was started with the command line
210 : : * "setarch ... --addr-no-randomize ..." or "setarch ... -R ...".
211 : : * This complements the /proc/sys/kernel/randomize_va_space check below
212 : : * and is necessary to support the "setarch" command,
213 : : * which can disable ASLR by setting the ADDR_NO_RANDOMIZE personality flag.
214 : : */
215 [ + - ]: 2 : if ((personality(0xffffffff) & ADDR_NO_RANDOMIZE) == ADDR_NO_RANDOMIZE)
216 : : return 0;
217 : :
218 : : int retval, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY);
219 [ - + ]: 2 : if (fd < 0)
220 : 0 : return -errno;
221 : 2 : retval = read(fd, &c, 1);
222 : 2 : close(fd);
223 [ - + ]: 2 : if (retval < 0)
224 : 0 : return -errno;
225 [ + - ]: 2 : if (retval == 0)
226 : : return -EIO;
227 [ - + ]: 2 : switch (c) {
228 : : case '0' : return 0;
229 : : case '1' : return 1;
230 : : case '2' : return 2;
231 : : default: return -EINVAL;
232 : : }
233 : : }
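
As an illustration, a minimal hypothetical launcher sketch (not part of eal_memory.c) showing how ASLR can be disabled for a child process through the same ADDR_NO_RANDOMIZE personality flag that the check above detects, which is essentially what "setarch --addr-no-randomize" does:

#include <stdio.h>
#include <sys/personality.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s <program> [args...]\n", argv[0]);
		return 1;
	}

	/* add ADDR_NO_RANDOMIZE to the current persona, then exec the target */
	if (personality(personality(0xffffffff) | ADDR_NO_RANDOMIZE) < 0) {
		perror("personality");
		return 1;
	}

	execvp(argv[1], &argv[1]);
	perror("execvp");	/* only reached if exec failed */
	return 1;
}
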
234 : :
235 : : static sigjmp_buf huge_jmpenv;
236 : :
237 : 0 : static void huge_sigbus_handler(int signo __rte_unused)
238 : : {
239 : 0 : siglongjmp(huge_jmpenv, 1);
240 : : }
241 : :
242 : : /* Put sigsetjmp into a wrapper function to avoid compiler errors. Any non-volatile,
243 : : * non-static local variable in the stack frame calling sigsetjmp might be
244 : : * clobbered by a call to longjmp.
245 : : */
246 : 2046 : static int huge_wrap_sigsetjmp(void)
247 : : {
248 : 2046 : return sigsetjmp(huge_jmpenv, 1);
249 : : }
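
As an illustration, a minimal standalone sketch (hypothetical, not part of eal_memory.c; try_touch is an invented name) of the SIGBUS-guard pattern used by this file: install a handler, save the stack environment with sigsetjmp, then touch a freshly mapped page; if the kernel raises SIGBUS, siglongjmp returns control to the saved point instead of the process being killed:

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>

static sigjmp_buf jmpenv;

static void sigbus_handler(int signo)
{
	(void)signo;
	siglongjmp(jmpenv, 1);
}

/* Returns 0 if *addr could be written, -1 if the write raised SIGBUS. */
static int try_touch(volatile int *addr)
{
	struct sigaction action, old;
	/* volatile: otherwise the value may be clobbered across sigsetjmp/siglongjmp */
	volatile int ret = 0;

	action.sa_flags = 0;
	sigemptyset(&action.sa_mask);
	action.sa_handler = sigbus_handler;
	sigaction(SIGBUS, &action, &old);

	if (sigsetjmp(jmpenv, 1) == 0)
		*addr = 0;	/* the first touch of a hugepage may fault */
	else
		ret = -1;	/* we arrived here via siglongjmp from the handler */

	sigaction(SIGBUS, &old, NULL);	/* restore the previous handler */
	return ret;
}

int main(void)
{
	int x;

	printf("touch result: %d\n", try_touch(&x));
	return 0;
}
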
250 : :
251 : : #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
252 : : /* Callback for numa library. */
253 : : void numa_error(char *where)
254 : : {
255 : 0 : EAL_LOG(ERR, "%s failed: %s", where, strerror(errno));
256 : 0 : }
257 : : #endif
258 : :
259 : : /*
260 : : * Mmap all hugepages of the hugepage table: it first opens a file in
261 : : * hugetlbfs, then mmap()s hugepage_sz bytes of data in it. The virtual
262 : : * address of each mapping is stored in hugepg_tbl[i].orig_va; the pages
263 : : * are later remapped into hugepg_tbl[i].final_va, where contiguous
264 : : * physical blocks are mapped into contiguous virtual blocks.
265 : : */
266 : : static unsigned
267 : 2 : map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
268 : : uint64_t *essential_memory __rte_unused)
269 : : {
270 : : int fd;
271 : : unsigned i;
272 : : void *virtaddr;
273 : : #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
274 : : int node_id = -1;
275 : : int essential_prev = 0;
276 : : int oldpolicy;
277 : : struct bitmask *oldmask = NULL;
278 : : bool have_numa = true;
279 : : unsigned long maxnode = 0;
280 : : const struct internal_config *internal_conf =
281 : 2 : eal_get_internal_configuration();
282 : :
283 : : /* Check if kernel supports NUMA. */
284 [ + - ]: 2 : if (numa_available() != 0) {
285 : 0 : EAL_LOG(DEBUG, "NUMA is not supported.");
286 : : have_numa = false;
287 : : }
288 : :
289 : : if (have_numa) {
290 : 2 : EAL_LOG(DEBUG, "Trying to obtain current memory policy.");
291 : 2 : oldmask = numa_allocate_nodemask();
292 [ - + ]: 2 : if (get_mempolicy(&oldpolicy, oldmask->maskp,
293 : 2 : oldmask->size + 1, 0, 0) < 0) {
294 : 0 : EAL_LOG(ERR,
295 : : "Failed to get current mempolicy: %s. "
296 : : "Assuming MPOL_DEFAULT.", strerror(errno));
297 : 0 : oldpolicy = MPOL_DEFAULT;
298 : : }
299 [ + + ]: 66 : for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
300 [ - + ]: 64 : if (internal_conf->socket_mem[i])
301 : 0 : maxnode = i + 1;
302 : : }
303 : : #endif
304 : :
305 [ + + ]: 2048 : for (i = 0; i < hpi->num_pages[0]; i++) {
306 : 2046 : struct hugepage_file *hf = &hugepg_tbl[i];
307 : 2046 : uint64_t hugepage_sz = hpi->hugepage_sz;
308 : :
309 : : #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
310 [ - + ]: 2046 : if (maxnode) {
311 : : unsigned int j;
312 : :
313 [ # # ]: 0 : for (j = 0; j < maxnode; j++)
314 [ # # ]: 0 : if (essential_memory[j])
315 : : break;
316 : :
317 [ # # ]: 0 : if (j == maxnode) {
318 : 0 : node_id = (node_id + 1) % maxnode;
319 [ # # ]: 0 : while (!internal_conf->socket_mem[node_id]) {
320 : 0 : node_id++;
321 : 0 : node_id %= maxnode;
322 : : }
323 : : essential_prev = 0;
324 : : } else {
325 : 0 : node_id = j;
326 : 0 : essential_prev = essential_memory[j];
327 : :
328 [ # # ]: 0 : if (essential_memory[j] < hugepage_sz)
329 : 0 : essential_memory[j] = 0;
330 : : else
331 : 0 : essential_memory[j] -= hugepage_sz;
332 : : }
333 : :
334 : 0 : EAL_LOG(DEBUG,
335 : : "Setting policy MPOL_PREFERRED for socket %d",
336 : : node_id);
337 : 0 : numa_set_preferred(node_id);
338 : : }
339 : : #endif
340 : :
341 : 2046 : hf->file_id = i;
342 : 2046 : hf->size = hugepage_sz;
343 : 2046 : eal_get_hugefile_path(hf->filepath, sizeof(hf->filepath),
344 : 2046 : hpi->hugedir, hf->file_id);
345 : 2046 : hf->filepath[sizeof(hf->filepath) - 1] = '\0';
346 : :
347 : : /* try to create hugepage file */
348 : : fd = open(hf->filepath, O_CREAT | O_RDWR, 0600);
349 [ - + ]: 2046 : if (fd < 0) {
350 : 0 : EAL_LOG(DEBUG, "%s(): open failed: %s", __func__,
351 : : strerror(errno));
352 : 0 : goto out;
353 : : }
354 : :
355 : : /* map the segment, and populate page tables,
356 : : * the kernel fills this segment with zeros. we don't care where
357 : : * this gets mapped - we already have contiguous memory areas
358 : : * ready for us to map into.
359 : : */
360 : 2046 : virtaddr = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE,
361 : : MAP_SHARED | MAP_POPULATE, fd, 0);
362 [ - + ]: 2046 : if (virtaddr == MAP_FAILED) {
363 : 0 : EAL_LOG(DEBUG, "%s(): mmap failed: %s", __func__,
364 : : strerror(errno));
365 : 0 : close(fd);
366 : 0 : goto out;
367 : : }
368 : :
369 : 2046 : hf->orig_va = virtaddr;
370 : :
371 : : /* In Linux, hugetlb limitations, such as cgroups, are
372 : : * enforced at fault time instead of at mmap() time, even
373 : : * with the MAP_POPULATE option. The kernel will send
374 : : * a SIGBUS signal. To avoid being killed, save the stack
375 : : * environment here; if SIGBUS happens, we can jump
376 : : * back to it.
377 : : */
378 [ - + ]: 2046 : if (huge_wrap_sigsetjmp()) {
379 : 0 : EAL_LOG(DEBUG, "SIGBUS: Cannot mmap more "
380 : : "hugepages of size %u MB",
381 : : (unsigned int)(hugepage_sz / 0x100000));
382 : 0 : munmap(virtaddr, hugepage_sz);
383 : 0 : close(fd);
384 : 0 : unlink(hugepg_tbl[i].filepath);
385 : : #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
386 [ # # ]: 0 : if (maxnode)
387 : 0 : essential_memory[node_id] =
388 : : essential_prev;
389 : : #endif
390 : 0 : goto out;
391 : : }
392 : 2046 : *(int *)virtaddr = 0;
393 : :
394 : : /* set shared lock on the file. */
395 [ - + ]: 2046 : if (flock(fd, LOCK_SH) < 0) {
396 : 0 : EAL_LOG(DEBUG, "%s(): Locking file failed:%s ",
397 : : __func__, strerror(errno));
398 : 0 : close(fd);
399 : 0 : goto out;
400 : : }
401 : :
402 : 2046 : close(fd);
403 : : }
404 : :
405 : 2 : out:
406 : : #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
407 [ - + ]: 2 : if (maxnode) {
408 : 0 : EAL_LOG(DEBUG,
409 : : "Restoring previous memory policy: %d", oldpolicy);
410 [ # # ]: 0 : if (oldpolicy == MPOL_DEFAULT) {
411 : 0 : numa_set_localalloc();
412 [ # # ]: 0 : } else if (set_mempolicy(oldpolicy, oldmask->maskp,
413 : 0 : oldmask->size + 1) < 0) {
414 : 0 : EAL_LOG(ERR, "Failed to restore mempolicy: %s",
415 : : strerror(errno));
416 : 0 : numa_set_localalloc();
417 : : }
418 : : }
419 [ + - ]: 2 : if (oldmask != NULL)
420 : : numa_free_cpumask(oldmask);
421 : : #endif
422 : 2 : return i;
423 : : }
424 : :
425 : : /*
426 : : * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge
427 : : * page.
428 : : */
429 : : static int
430 : 2 : find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
431 : : {
432 : : int socket_id;
433 : : char *end, *nodestr;
434 : : unsigned i, hp_count = 0;
435 : : uint64_t virt_addr;
436 : : char buf[BUFSIZ];
437 : : char hugedir_str[PATH_MAX];
438 : : FILE *f;
439 : :
440 : 2 : f = fopen("/proc/self/numa_maps", "r");
441 [ - + ]: 2 : if (f == NULL) {
442 : 0 : EAL_LOG(NOTICE, "NUMA support not available"
443 : : " consider that all memory is in socket_id 0");
444 : 0 : return 0;
445 : : }
446 : :
447 : 2 : snprintf(hugedir_str, sizeof(hugedir_str),
448 : 2 : "%s/%s", hpi->hugedir, eal_get_hugefile_prefix());
449 : :
450 : : /* parse numa map */
451 [ + + ]: 2388 : while (fgets(buf, sizeof(buf), f) != NULL) {
452 : :
453 : : /* ignore non huge page */
454 [ + + ]: 2386 : if (strstr(buf, " huge ") == NULL &&
455 [ + - ]: 340 : strstr(buf, hugedir_str) == NULL)
456 : 340 : continue;
457 : :
458 : : /* get zone addr */
459 : 2046 : virt_addr = strtoull(buf, &end, 16);
460 [ + - - + ]: 2046 : if (virt_addr == 0 || end == buf) {
461 : 0 : EAL_LOG(ERR, "%s(): error in numa_maps parsing", __func__);
462 : 0 : goto error;
463 : : }
464 : :
465 : : /* get node id (socket id) */
466 : 2046 : nodestr = strstr(buf, " N");
467 [ - + ]: 2046 : if (nodestr == NULL) {
468 : 0 : EAL_LOG(ERR, "%s(): error in numa_maps parsing", __func__);
469 : 0 : goto error;
470 : : }
471 : 2046 : nodestr += 2;
472 : 2046 : end = strstr(nodestr, "=");
473 [ - + ]: 2046 : if (end == NULL) {
474 : 0 : EAL_LOG(ERR, "%s(): error in numa_maps parsing", __func__);
475 : 0 : goto error;
476 : : }
477 : 2046 : end[0] = '\0';
478 : 2046 : end = NULL;
479 : :
480 : 2046 : socket_id = strtoul(nodestr, &end, 0);
481 [ + - + - : 2046 : if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) {
- + ]
482 : 0 : EAL_LOG(ERR, "%s(): error in numa_maps parsing", __func__);
483 : 0 : goto error;
484 : : }
485 : :
486 : : /* if we find this page in our mappings, set socket_id */
487 [ + + ]: 2095104 : for (i = 0; i < hpi->num_pages[0]; i++) {
488 : 2093058 : void *va = (void *)(unsigned long)virt_addr;
489 [ + + ]: 2093058 : if (hugepg_tbl[i].orig_va == va) {
490 : 2046 : hugepg_tbl[i].socket_id = socket_id;
491 : 2046 : hp_count++;
492 : : #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
493 : 2046 : EAL_LOG(DEBUG,
494 : : "Hugepage %s is on socket %d",
495 : : hugepg_tbl[i].filepath, socket_id);
496 : : #endif
497 : : }
498 : : }
499 : : }
500 : :
501 [ - + ]: 2 : if (hp_count < hpi->num_pages[0])
502 : 0 : goto error;
503 : :
504 : 2 : fclose(f);
505 : 2 : return 0;
506 : :
507 : 0 : error:
508 : 0 : fclose(f);
509 : 0 : return -1;
510 : : }
511 : :
512 : : static int
513 : 10387 : cmp_physaddr(const void *a, const void *b)
514 : : {
515 : : #ifndef RTE_ARCH_PPC_64
516 : : const struct hugepage_file *p1 = a;
517 : : const struct hugepage_file *p2 = b;
518 : : #else
519 : : /* PowerPC needs memory sorted in reverse order from x86 */
520 : : const struct hugepage_file *p1 = b;
521 : : const struct hugepage_file *p2 = a;
522 : : #endif
523 [ + + ]: 10387 : if (p1->physaddr < p2->physaddr)
524 : : return -1;
525 [ - + ]: 9172 : else if (p1->physaddr > p2->physaddr)
526 : : return 1;
527 : : else
528 : 0 : return 0;
529 : : }
530 : :
531 : : /*
532 : : * Uses mmap to create a shared memory area for storage of data.
533 : : * Used in this file to store the hugepage file map on disk.
534 : : */
535 : : static void *
536 : 2 : create_shared_memory(const char *filename, const size_t mem_size)
537 : : {
538 : : void *retval;
539 : : int fd;
540 : : const struct internal_config *internal_conf =
541 : 2 : eal_get_internal_configuration();
542 : :
543 : : /* if no shared files mode is used, create anonymous memory instead */
544 [ - + ]: 2 : if (internal_conf->no_shconf) {
545 : 0 : retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
546 : : MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
547 [ # # ]: 0 : if (retval == MAP_FAILED)
548 : : return NULL;
549 : 0 : return retval;
550 : : }
551 : :
552 : : fd = open(filename, O_CREAT | O_RDWR, 0600);
553 [ + - ]: 2 : if (fd < 0)
554 : : return NULL;
555 [ - + ]: 2 : if (ftruncate(fd, mem_size) < 0) {
556 : 0 : close(fd);
557 : 0 : return NULL;
558 : : }
559 : 2 : retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
560 : 2 : close(fd);
561 [ - + ]: 2 : if (retval == MAP_FAILED)
562 : 0 : return NULL;
563 : : return retval;
564 : : }
565 : :
566 : : /*
567 : : * this copies *active* hugepages from one hugepage table to another.
568 : : * destination is typically the shared memory.
569 : : */
570 : : static int
571 : 2 : copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size,
572 : : const struct hugepage_file * src, int src_size)
573 : : {
574 : : int src_pos, dst_pos = 0;
575 : :
576 [ + + ]: 2048 : for (src_pos = 0; src_pos < src_size; src_pos++) {
577 [ + + ]: 2046 : if (src[src_pos].orig_va != NULL) {
578 : : /* error on overflow attempt */
579 [ + - ]: 18 : if (dst_pos == dest_size)
580 : : return -1;
581 : 18 : memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage_file));
582 : 18 : dst_pos++;
583 : : }
584 : : }
585 : : return 0;
586 : : }
587 : :
588 : : static int
589 : 0 : unlink_hugepage_files(struct hugepage_file *hugepg_tbl,
590 : : unsigned num_hp_info)
591 : : {
592 : : unsigned socket, size;
593 : : int page, nrpages = 0;
594 : : const struct internal_config *internal_conf =
595 : 0 : eal_get_internal_configuration();
596 : :
597 : : /* get total number of hugepages */
598 [ # # ]: 0 : for (size = 0; size < num_hp_info; size++)
599 [ # # ]: 0 : for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
600 : 0 : nrpages +=
601 : 0 : internal_conf->hugepage_info[size].num_pages[socket];
602 : :
603 [ # # ]: 0 : for (page = 0; page < nrpages; page++) {
604 : 0 : struct hugepage_file *hp = &hugepg_tbl[page];
605 : :
606 [ # # # # ]: 0 : if (hp->orig_va != NULL && unlink(hp->filepath)) {
607 : 0 : EAL_LOG(WARNING, "%s(): Removing %s failed: %s",
608 : : __func__, hp->filepath, strerror(errno));
609 : : }
610 : : }
611 : 0 : return 0;
612 : : }
613 : :
614 : : /*
615 : : * unmaps hugepages that are not going to be used. since we originally allocate
616 : : * ALL hugepages (not just those we need), additional unmapping needs to be done.
617 : : */
618 : : static int
619 : 2 : unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl,
620 : : struct hugepage_info *hpi,
621 : : unsigned num_hp_info)
622 : : {
623 : : unsigned socket, size;
624 : : int page, nrpages = 0;
625 : : const struct internal_config *internal_conf =
626 : 2 : eal_get_internal_configuration();
627 : :
628 : : /* get total number of hugepages */
629 [ + + ]: 4 : for (size = 0; size < num_hp_info; size++)
630 [ + + ]: 66 : for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
631 : 64 : nrpages += internal_conf->hugepage_info[size].num_pages[socket];
632 : :
633 [ + + ]: 4 : for (size = 0; size < num_hp_info; size++) {
634 [ + + ]: 66 : for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
635 : : unsigned pages_found = 0;
636 : :
637 : : /* traverse until we have unmapped all the unused pages */
638 [ + + ]: 65536 : for (page = 0; page < nrpages; page++) {
639 : 65472 : struct hugepage_file *hp = &hugepg_tbl[page];
640 : :
641 : : /* find a page that matches the criteria */
642 [ + - ]: 65472 : if ((hp->size == hpi[size].hugepage_sz) &&
643 [ + + ]: 65472 : (hp->socket_id == (int) socket)) {
644 : :
645 : : /* if we skipped enough pages, unmap the rest */
646 [ + + ]: 2046 : if (pages_found == hpi[size].num_pages[socket]) {
647 : : uint64_t unmap_len;
648 : :
649 : : unmap_len = hp->size;
650 : :
651 : : /* get start addr and len of the remaining segment */
652 : 2028 : munmap(hp->orig_va,
653 : : (size_t)unmap_len);
654 : :
655 : 2028 : hp->orig_va = NULL;
656 [ - + ]: 2028 : if (unlink(hp->filepath) == -1) {
657 : 0 : EAL_LOG(ERR, "%s(): Removing %s failed: %s",
658 : : __func__, hp->filepath, strerror(errno));
659 : 0 : return -1;
660 : : }
661 : : } else {
662 : : /* lock the page and skip */
663 : 18 : pages_found++;
664 : : }
665 : :
666 : : } /* match page */
667 : : } /* foreach page */
668 : : } /* foreach socket */
669 : : } /* foreach pagesize */
670 : :
671 : : return 0;
672 : : }
673 : :
674 : : static int
675 : 2 : remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)
676 : : {
677 : 2 : struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
678 : : struct rte_memseg_list *msl;
679 : : struct rte_fbarray *arr;
680 : : int cur_page, seg_len;
681 : : unsigned int msl_idx;
682 : : int ms_idx;
683 : : uint64_t page_sz;
684 : : size_t memseg_len;
685 : : int socket_id;
686 : : #ifndef RTE_ARCH_64
687 : : const struct internal_config *internal_conf =
688 : : eal_get_internal_configuration();
689 : : #endif
690 : 2 : page_sz = hugepages[seg_start].size;
691 : 2 : socket_id = hugepages[seg_start].socket_id;
692 : 2 : seg_len = seg_end - seg_start;
693 : :
694 : 2 : EAL_LOG(DEBUG, "Attempting to map %" PRIu64 "M on socket %i",
695 : : (seg_len * page_sz) >> 20ULL, socket_id);
696 : :
697 : : /* find free space in memseg lists */
698 [ + - ]: 2 : for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
699 : : int free_len;
700 : : bool empty;
701 : 2 : msl = &mcfg->memsegs[msl_idx];
702 : 2 : arr = &msl->memseg_arr;
703 : :
704 [ - + ]: 2 : if (msl->page_sz != page_sz)
705 : 0 : continue;
706 [ - + ]: 2 : if (msl->socket_id != socket_id)
707 : 0 : continue;
708 : :
709 : : /* leave space for a hole if array is not empty */
710 : 2 : empty = arr->count == 0;
711 : : /* find start of the biggest contiguous block and its size */
712 : 2 : ms_idx = rte_fbarray_find_biggest_free(arr, 0);
713 [ - + ]: 2 : if (ms_idx < 0)
714 : 0 : continue;
715 : : /* the hole is 1 segment long, so we need at least 2 free segments. */
716 : 2 : free_len = rte_fbarray_find_contig_free(arr, ms_idx);
717 [ - + ]: 2 : if (free_len < 2)
718 : 0 : continue;
719 : : /* leave some space between memsegs; they are not IOVA
720 : : * contiguous, so they shouldn't be VA contiguous either.
721 : : */
722 [ - + ]: 2 : if (!empty) {
723 : 0 : ms_idx++;
724 : 0 : free_len--;
725 : : }
726 : :
727 : : /* we might not get all of the space we wanted */
728 : 2 : free_len = RTE_MIN(seg_len, free_len);
729 : 2 : seg_end = seg_start + free_len;
730 : : seg_len = seg_end - seg_start;
731 : 2 : break;
732 : : }
733 [ - + ]: 2 : if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
734 : 0 : EAL_LOG(ERR, "Could not find space for memseg. Please increase RTE_MAX_MEMSEG_PER_LIST "
735 : : "RTE_MAX_MEMSEG_PER_TYPE and/or RTE_MAX_MEM_MB_PER_TYPE in configuration.");
736 : 0 : return -1;
737 : : }
738 : :
739 : : #ifdef RTE_ARCH_PPC_64
740 : : /* for PPC64 we go through the list backwards */
741 : : for (cur_page = seg_end - 1; cur_page >= seg_start;
742 : : cur_page--, ms_idx++) {
743 : : #else
744 [ + + ]: 20 : for (cur_page = seg_start; cur_page < seg_end; cur_page++, ms_idx++) {
745 : : #endif
746 : 18 : struct hugepage_file *hfile = &hugepages[cur_page];
747 : 18 : struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
748 : : void *addr;
749 : : int fd;
750 : :
751 : 18 : fd = open(hfile->filepath, O_RDWR);
752 [ - + ]: 18 : if (fd < 0) {
753 : 0 : EAL_LOG(ERR, "Could not open '%s': %s",
754 : : hfile->filepath, strerror(errno));
755 : 0 : return -1;
756 : : }
757 : : /* set shared lock on the file. */
758 [ - + ]: 18 : if (flock(fd, LOCK_SH) < 0) {
759 : 0 : EAL_LOG(DEBUG, "Could not lock '%s': %s",
760 : : hfile->filepath, strerror(errno));
761 : 0 : close(fd);
762 : 0 : return -1;
763 : : }
764 : : memseg_len = (size_t)page_sz;
765 : 18 : addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len);
766 : :
767 : : /* we know this address is already mmapped by memseg list, so
768 : : * using MAP_FIXED here is safe
769 : : */
770 : 18 : addr = mmap(addr, page_sz, PROT_READ | PROT_WRITE,
771 : : MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0);
772 [ - + ]: 18 : if (addr == MAP_FAILED) {
773 : 0 : EAL_LOG(ERR, "Couldn't remap '%s': %s",
774 : : hfile->filepath, strerror(errno));
775 : 0 : close(fd);
776 : 0 : return -1;
777 : : }
778 : :
779 : : /* we have a new address, so unmap previous one */
780 : : #ifndef RTE_ARCH_64
781 : : /* in 32-bit legacy mode, we have already unmapped the page */
782 : : if (!internal_conf->legacy_mem)
783 : : munmap(hfile->orig_va, page_sz);
784 : : #else
785 : 18 : munmap(hfile->orig_va, page_sz);
786 : : #endif
787 : :
788 : 18 : hfile->orig_va = NULL;
789 : 18 : hfile->final_va = addr;
790 : :
791 : : /* rewrite physical addresses in IOVA as VA mode */
792 [ - + ]: 18 : if (rte_eal_iova_mode() == RTE_IOVA_VA)
793 : 0 : hfile->physaddr = (uintptr_t)addr;
794 : :
795 : : /* set up memseg data */
796 : 18 : ms->addr = addr;
797 : 18 : ms->hugepage_sz = page_sz;
798 : 18 : ms->len = memseg_len;
799 : 18 : ms->iova = hfile->physaddr;
800 : 18 : ms->socket_id = hfile->socket_id;
801 : 18 : ms->nchannel = rte_memory_get_nchannel();
802 : 18 : ms->nrank = rte_memory_get_nrank();
803 : :
804 : 18 : rte_fbarray_set_used(arr, ms_idx);
805 : :
806 : : /* store segment fd internally */
807 [ - + ]: 18 : if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
808 : 0 : EAL_LOG(ERR, "Could not store segment fd: %s",
809 : : rte_strerror(rte_errno));
810 : : }
811 : 2 : EAL_LOG(DEBUG, "Allocated %" PRIu64 "M on socket %i",
812 : : (seg_len * page_sz) >> 20, socket_id);
813 : 2 : return seg_len;
814 : : }
815 : :
816 : : static uint64_t
817 : : get_mem_amount(uint64_t page_sz, uint64_t max_mem)
818 : : {
819 : : uint64_t area_sz, max_pages;
820 : :
821 : : /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */
822 : : max_pages = RTE_MAX_MEMSEG_PER_LIST;
823 : : max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem);
824 : :
825 : : area_sz = RTE_MIN(page_sz * max_pages, max_mem);
826 : :
827 : : /* make sure the list isn't smaller than the page size */
828 : : area_sz = RTE_MAX(area_sz, page_sz);
829 : :
830 : : return RTE_ALIGN(area_sz, page_sz);
831 : : }
832 : :
833 : : static int
834 : : memseg_list_free(struct rte_memseg_list *msl)
835 : : {
836 : : if (rte_fbarray_destroy(&msl->memseg_arr)) {
837 : : EAL_LOG(ERR, "Cannot destroy memseg list");
838 : : return -1;
839 : : }
840 : : memset(msl, 0, sizeof(*msl));
841 : : return 0;
842 : : }
843 : :
844 : : /*
845 : : * Our VA space is not preallocated yet, so preallocate it here. We need to know
846 : : * how many segments there are in order to map all pages into one address space,
847 : : * and leave appropriate holes between segments so that rte_malloc does not
848 : : * concatenate them into one big segment.
849 : : *
850 : : * we also need to unmap original pages to free up address space.
851 : : */
852 : : static int __rte_unused
853 : : prealloc_segments(struct hugepage_file *hugepages, int n_pages)
854 : : {
855 : : struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
856 : : int cur_page, seg_start_page, end_seg, new_memseg;
857 : : unsigned int hpi_idx, socket, i;
858 : : int n_contig_segs, n_segs;
859 : : int msl_idx;
860 : : const struct internal_config *internal_conf =
861 : : eal_get_internal_configuration();
862 : :
863 : : /* before we preallocate segments, we need to free up our VA space.
864 : : * we're not removing files, and we already have information about
865 : : * PA-contiguousness, so it is safe to unmap everything.
866 : : */
867 : : for (cur_page = 0; cur_page < n_pages; cur_page++) {
868 : : struct hugepage_file *hpi = &hugepages[cur_page];
869 : : munmap(hpi->orig_va, hpi->size);
870 : : hpi->orig_va = NULL;
871 : : }
872 : :
873 : : /* we cannot know how many page sizes and sockets we have discovered, so
874 : : * loop over all of them
875 : : */
876 : : for (hpi_idx = 0; hpi_idx < internal_conf->num_hugepage_sizes;
877 : : hpi_idx++) {
878 : : uint64_t page_sz =
879 : : internal_conf->hugepage_info[hpi_idx].hugepage_sz;
880 : :
881 : : for (i = 0; i < rte_socket_count(); i++) {
882 : : struct rte_memseg_list *msl;
883 : :
884 : : socket = rte_socket_id_by_idx(i);
885 : : n_contig_segs = 0;
886 : : n_segs = 0;
887 : : seg_start_page = -1;
888 : :
889 : : for (cur_page = 0; cur_page < n_pages; cur_page++) {
890 : : struct hugepage_file *prev, *cur;
891 : : int prev_seg_start_page = -1;
892 : :
893 : : cur = &hugepages[cur_page];
894 : : prev = cur_page == 0 ? NULL :
895 : : &hugepages[cur_page - 1];
896 : :
897 : : new_memseg = 0;
898 : : end_seg = 0;
899 : :
900 : : if (cur->size == 0)
901 : : end_seg = 1;
902 : : else if (cur->socket_id != (int) socket)
903 : : end_seg = 1;
904 : : else if (cur->size != page_sz)
905 : : end_seg = 1;
906 : : else if (cur_page == 0)
907 : : new_memseg = 1;
908 : : #ifdef RTE_ARCH_PPC_64
909 : : /* On PPC64 architecture, the mmap always start
910 : : * from higher address to lower address. Here,
911 : : * physical addresses are in descending order.
912 : : */
913 : : else if ((prev->physaddr - cur->physaddr) !=
914 : : cur->size)
915 : : new_memseg = 1;
916 : : #else
917 : : else if ((cur->physaddr - prev->physaddr) !=
918 : : cur->size)
919 : : new_memseg = 1;
920 : : #endif
921 : : if (new_memseg) {
922 : : /* if we're already inside a segment,
923 : : * new segment means end of current one
924 : : */
925 : : if (seg_start_page != -1) {
926 : : end_seg = 1;
927 : : prev_seg_start_page =
928 : : seg_start_page;
929 : : }
930 : : seg_start_page = cur_page;
931 : : }
932 : :
933 : : if (end_seg) {
934 : : if (prev_seg_start_page != -1) {
935 : : /* we've found a new segment */
936 : : n_contig_segs++;
937 : : n_segs += cur_page -
938 : : prev_seg_start_page;
939 : : } else if (seg_start_page != -1) {
940 : : /* we didn't find new segment,
941 : : * but did end current one
942 : : */
943 : : n_contig_segs++;
944 : : n_segs += cur_page -
945 : : seg_start_page;
946 : : seg_start_page = -1;
947 : : continue;
948 : : } else {
949 : : /* we're skipping this page */
950 : : continue;
951 : : }
952 : : }
953 : : /* segment continues */
954 : : }
955 : : /* check if we missed last segment */
956 : : if (seg_start_page != -1) {
957 : : n_contig_segs++;
958 : : n_segs += cur_page - seg_start_page;
959 : : }
960 : :
961 : : /* if no segments were found, do not preallocate */
962 : : if (n_segs == 0)
963 : : continue;
964 : :
965 : : /* we now have total number of pages that we will
966 : : * allocate for this segment list. add separator pages
967 : : * to the total count, and preallocate VA space.
968 : : */
969 : : n_segs += n_contig_segs - 1;
970 : :
971 : : /* now, preallocate VA space for these segments */
972 : :
973 : : /* first, find suitable memseg list for this */
974 : : for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;
975 : : msl_idx++) {
976 : : msl = &mcfg->memsegs[msl_idx];
977 : :
978 : : if (msl->base_va != NULL)
979 : : continue;
980 : : break;
981 : : }
982 : : if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
983 : : EAL_LOG(ERR, "Not enough space in memseg lists, please increase RTE_MAX_MEMSEG_LISTS");
984 : : return -1;
985 : : }
986 : :
987 : : /* now, allocate fbarray itself */
988 : : if (eal_memseg_list_init(msl, page_sz, n_segs,
989 : : socket, msl_idx, true) < 0)
990 : : return -1;
991 : :
992 : : /* finally, allocate VA space */
993 : : if (eal_memseg_list_alloc(msl, 0) < 0) {
994 : : EAL_LOG(ERR, "Cannot preallocate 0x%"PRIx64"kB hugepages",
995 : : page_sz >> 10);
996 : : return -1;
997 : : }
998 : : }
999 : : }
1000 : : return 0;
1001 : : }
1002 : :
1003 : : /*
1004 : : * We cannot reallocate memseg lists on the fly because PPC64 stores pages
1005 : : * backwards; therefore, we have to process the entire memseg first before
1006 : : * remapping it into memseg list VA space.
1007 : : */
1008 : : static int
1009 : 2 : remap_needed_hugepages(struct hugepage_file *hugepages, int n_pages)
1010 : : {
1011 : : int cur_page, seg_start_page, new_memseg, ret;
1012 : :
1013 : : seg_start_page = 0;
1014 [ + - ]: 20 : for (cur_page = 0; cur_page < n_pages; cur_page++) {
1015 : : struct hugepage_file *prev, *cur;
1016 : :
1017 : : new_memseg = 0;
1018 : :
1019 : 20 : cur = &hugepages[cur_page];
1020 [ + + ]: 20 : prev = cur_page == 0 ? NULL : &hugepages[cur_page - 1];
1021 : :
1022 : : /* if size is zero, no more pages left */
1023 [ + + ]: 20 : if (cur->size == 0)
1024 : : break;
1025 : :
1026 [ + + ]: 18 : if (cur_page == 0)
1027 : : new_memseg = 1;
1028 [ + - ]: 16 : else if (cur->socket_id != prev->socket_id)
1029 : : new_memseg = 1;
1030 [ + - ]: 16 : else if (cur->size != prev->size)
1031 : : new_memseg = 1;
1032 : : #ifdef RTE_ARCH_PPC_64
1033 : : /* On PPC64 architecture, the mmap always start from higher
1034 : : * address to lower address. Here, physical addresses are in
1035 : : * descending order.
1036 : : */
1037 : : else if ((prev->physaddr - cur->physaddr) != cur->size)
1038 : : new_memseg = 1;
1039 : : #else
1040 [ - + ]: 16 : else if ((cur->physaddr - prev->physaddr) != cur->size)
1041 : : new_memseg = 1;
1042 : : #endif
1043 : :
1044 : : if (new_memseg) {
1045 : : /* if this isn't the first time, remap segment */
1046 [ - + ]: 2 : if (cur_page != 0) {
1047 : : int n_remapped = 0;
1048 : 0 : int n_needed = cur_page - seg_start_page;
1049 [ # # ]: 0 : while (n_remapped < n_needed) {
1050 : 0 : ret = remap_segment(hugepages, seg_start_page,
1051 : : cur_page);
1052 [ # # ]: 0 : if (ret < 0)
1053 : : return -1;
1054 : 0 : n_remapped += ret;
1055 : 0 : seg_start_page += ret;
1056 : : }
1057 : : }
1058 : : /* remember where we started */
1059 : : seg_start_page = cur_page;
1060 : : }
1061 : : /* continuation of previous memseg */
1062 : : }
1063 : : /* we were stopped, but we didn't remap the last segment, do it now */
1064 [ + - ]: 2 : if (cur_page != 0) {
1065 : : int n_remapped = 0;
1066 : 2 : int n_needed = cur_page - seg_start_page;
1067 [ + + ]: 4 : while (n_remapped < n_needed) {
1068 : 2 : ret = remap_segment(hugepages, seg_start_page,
1069 : : cur_page);
1070 [ + - ]: 2 : if (ret < 0)
1071 : : return -1;
1072 : 2 : n_remapped += ret;
1073 : 2 : seg_start_page += ret;
1074 : : }
1075 : : }
1076 : : return 0;
1077 : : }
1078 : :
1079 : : static inline size_t
1080 : 0 : eal_get_hugepage_mem_size(void)
1081 : : {
1082 : : uint64_t size = 0;
1083 : : unsigned i, j;
1084 : : struct internal_config *internal_conf =
1085 : 0 : eal_get_internal_configuration();
1086 : :
1087 [ # # ]: 0 : for (i = 0; i < internal_conf->num_hugepage_sizes; i++) {
1088 : : struct hugepage_info *hpi = &internal_conf->hugepage_info[i];
1089 [ # # ]: 0 : if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) {
1090 [ # # ]: 0 : for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
1091 : 0 : size += hpi->hugepage_sz * hpi->num_pages[j];
1092 : : }
1093 : : }
1094 : : }
1095 : :
1096 : 0 : return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
1097 : : }
1098 : :
1099 : : static struct sigaction huge_action_old;
1100 : : static int huge_need_recover;
1101 : :
1102 : : static void
1103 : 2 : huge_register_sigbus(void)
1104 : : {
1105 : : sigset_t mask;
1106 : : struct sigaction action;
1107 : :
1108 : 2 : sigemptyset(&mask);
1109 : 2 : sigaddset(&mask, SIGBUS);
1110 : 2 : action.sa_flags = 0;
1111 : 2 : action.sa_mask = mask;
1112 : 2 : action.sa_handler = huge_sigbus_handler;
1113 : :
1114 : 2 : huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
1115 : 2 : }
1116 : :
1117 : : static void
1118 : : huge_recover_sigbus(void)
1119 : : {
1120 [ + - ]: 2 : if (huge_need_recover) {
1121 : 2 : sigaction(SIGBUS, &huge_action_old, NULL);
1122 : 2 : huge_need_recover = 0;
1123 : : }
1124 : : }
1125 : :
1126 : : /*
1127 : : * Prepare physical memory mapping: fill the configuration structure with
1128 : : * this information; return 0 on success.
1129 : : * 1. map N huge pages in separate files in hugetlbfs
1130 : : * 2. find associated physical addr
1131 : : * 3. find associated NUMA socket ID
1132 : : * 4. sort all huge pages by physical address
1133 : : * 5. remap these N huge pages in the correct order
1134 : : * 6. unmap the first mapping
1135 : : * 7. fill memsegs in configuration with contiguous zones
1136 : : */
1137 : : static int
1138 : 101 : eal_legacy_hugepage_init(void)
1139 : : {
1140 : : struct rte_mem_config *mcfg;
1141 : : struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
1142 : : struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
1143 : : struct internal_config *internal_conf =
1144 : 101 : eal_get_internal_configuration();
1145 : :
1146 : : uint64_t memory[RTE_MAX_NUMA_NODES];
1147 : :
1148 : : unsigned hp_offset;
1149 : : int i, j;
1150 : : int nr_hugefiles, nr_hugepages = 0;
1151 : : void *addr;
1152 : :
1153 : : memset(used_hp, 0, sizeof(used_hp));
1154 : :
1155 : : /* get pointer to global configuration */
1156 : 101 : mcfg = rte_eal_get_configuration()->mem_config;
1157 : :
1158 : : /* hugetlbfs can be disabled */
1159 [ + + ]: 101 : if (internal_conf->no_hugetlbfs) {
1160 : : void *prealloc_addr;
1161 : : size_t mem_sz;
1162 : : struct rte_memseg_list *msl;
1163 : : int n_segs, fd, flags;
1164 : : #ifdef MEMFD_SUPPORTED
1165 : : int memfd;
1166 : : #endif
1167 : : uint64_t page_sz;
1168 : :
1169 : : /* nohuge mode is legacy mode */
1170 : 99 : internal_conf->legacy_mem = 1;
1171 : :
1172 : : /* nohuge mode is single-file segments mode */
1173 : 99 : internal_conf->single_file_segments = 1;
1174 : :
1175 : : /* create a memseg list */
1176 : 99 : msl = &mcfg->memsegs[0];
1177 : :
1178 : 99 : mem_sz = internal_conf->memory;
1179 : : page_sz = RTE_PGSIZE_4K;
1180 : 99 : n_segs = mem_sz / page_sz;
1181 : :
1182 [ + - ]: 99 : if (eal_memseg_list_init_named(
1183 : : msl, "nohugemem", page_sz, n_segs, 0, true)) {
1184 : : return -1;
1185 : : }
1186 : :
1187 : : /* set up parameters for anonymous mmap */
1188 : : fd = -1;
1189 : : flags = MAP_PRIVATE | MAP_ANONYMOUS;
1190 : :
1191 : : #ifdef MEMFD_SUPPORTED
1192 : : /* create a memfd and store it in the segment fd table */
1193 : 99 : memfd = memfd_create("nohuge", 0);
1194 [ - + ]: 99 : if (memfd < 0) {
1195 : 0 : EAL_LOG(DEBUG, "Cannot create memfd: %s",
1196 : : strerror(errno));
1197 : 0 : EAL_LOG(DEBUG, "Falling back to anonymous map");
1198 : : } else {
1199 : : /* we got an fd - now resize it */
1200 [ - + ]: 99 : if (ftruncate(memfd, internal_conf->memory) < 0) {
1201 : 0 : EAL_LOG(ERR, "Cannot resize memfd: %s",
1202 : : strerror(errno));
1203 : 0 : EAL_LOG(ERR, "Falling back to anonymous map");
1204 : 0 : close(memfd);
1205 : : } else {
1206 : : /* creating memfd-backed file was successful.
1207 : : * we want changes to memfd to be visible to
1208 : : * other processes (such as vhost backend), so
1209 : : * map it as shared memory.
1210 : : */
1211 : 99 : EAL_LOG(DEBUG, "Using memfd for anonymous memory");
1212 : : fd = memfd;
1213 : : flags = MAP_SHARED;
1214 : : }
1215 : : }
1216 : : #endif
1217 : : /* preallocate address space for the memory, so that it can be
1218 : : * fit into the DMA mask.
1219 : : */
1220 [ - + ]: 99 : if (eal_memseg_list_alloc(msl, 0)) {
1221 : 0 : EAL_LOG(ERR, "Cannot preallocate VA space for hugepage memory");
1222 : 0 : return -1;
1223 : : }
1224 : :
1225 : 99 : prealloc_addr = msl->base_va;
1226 : 99 : addr = mmap(prealloc_addr, mem_sz, PROT_READ | PROT_WRITE,
1227 : : flags | MAP_FIXED, fd, 0);
1228 [ - + ]: 99 : if (addr == MAP_FAILED || addr != prealloc_addr) {
1229 : 0 : EAL_LOG(ERR, "%s: mmap() failed: %s", __func__,
1230 : : strerror(errno));
1231 : 0 : munmap(prealloc_addr, mem_sz);
1232 : 0 : return -1;
1233 : : }
1234 : :
1235 : : /* we're in single-file segments mode, so only the segment list
1236 : : * fd needs to be set up.
1237 : : */
1238 [ + - ]: 99 : if (fd != -1) {
1239 [ - + ]: 99 : if (eal_memalloc_set_seg_list_fd(0, fd) < 0) {
1240 : 0 : EAL_LOG(ERR, "Cannot set up segment list fd");
1241 : : /* not a serious error, proceed */
1242 : : }
1243 : : }
1244 : :
1245 : 99 : eal_memseg_list_populate(msl, addr, n_segs);
1246 : :
1247 [ - + - - ]: 99 : if (mcfg->dma_maskbits &&
1248 : 0 : rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
1249 : 0 : EAL_LOG(ERR,
1250 : : "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.",
1251 : : __func__);
1252 [ # # # # ]: 0 : if (rte_eal_iova_mode() == RTE_IOVA_VA &&
1253 : 0 : rte_eal_using_phys_addrs())
1254 : 0 : EAL_LOG(ERR,
1255 : : "%s(): Please try initializing EAL with --iova-mode=pa parameter.",
1256 : : __func__);
1257 : 0 : goto fail;
1258 : : }
1259 : 99 : return 0;
1260 : : }
1261 : :
1262 : : /* calculate total number of hugepages available. at this point we haven't
1263 : : * yet started sorting them so they all are on socket 0 */
1264 [ + + ]: 4 : for (i = 0; i < (int) internal_conf->num_hugepage_sizes; i++) {
1265 : : /* meanwhile, also initialize used_hp hugepage sizes in used_hp */
1266 : 2 : used_hp[i].hugepage_sz = internal_conf->hugepage_info[i].hugepage_sz;
1267 : :
1268 : 2 : nr_hugepages += internal_conf->hugepage_info[i].num_pages[0];
1269 : : }
1270 : :
1271 : : /*
1272 : : * allocate a memory area for hugepage table.
1273 : : * this isn't shared memory yet. due to the fact that we need some
1274 : : * processing done on these pages, shared memory will be created
1275 : : * at a later stage.
1276 : : */
1277 : 2 : tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file));
1278 [ - + ]: 2 : if (tmp_hp == NULL)
1279 : 0 : goto fail;
1280 : :
1281 : : memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file));
1282 : :
1283 : : hp_offset = 0; /* where we start the current page size entries */
1284 : :
1285 : 2 : huge_register_sigbus();
1286 : :
1287 : : /* make a copy of socket_mem, needed for balanced allocation. */
1288 [ + + ]: 66 : for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
1289 : 64 : memory[i] = internal_conf->socket_mem[i];
1290 : :
1291 : : /* map all hugepages and sort them */
1292 [ + + ]: 4 : for (i = 0; i < (int)internal_conf->num_hugepage_sizes; i++) {
1293 : : unsigned pages_old, pages_new;
1294 : : struct hugepage_info *hpi;
1295 : :
1296 : : /*
1297 : : * we don't yet mark hugepages as used at this stage, so
1298 : : * we just map all hugepages available to the system
1299 : : * all hugepages are still located on socket 0
1300 : : */
1301 : 2 : hpi = &internal_conf->hugepage_info[i];
1302 : :
1303 [ - + ]: 2 : if (hpi->num_pages[0] == 0)
1304 : 0 : continue;
1305 : :
1306 : : /* map all hugepages available */
1307 : : pages_old = hpi->num_pages[0];
1308 : 2 : pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory);
1309 [ - + ]: 2 : if (pages_new < pages_old) {
1310 : 0 : EAL_LOG(DEBUG,
1311 : : "%d not %d hugepages of size %u MB allocated",
1312 : : pages_new, pages_old,
1313 : : (unsigned)(hpi->hugepage_sz / 0x100000));
1314 : :
1315 : 0 : int pages = pages_old - pages_new;
1316 : :
1317 : 0 : nr_hugepages -= pages;
1318 : 0 : hpi->num_pages[0] = pages_new;
1319 [ # # ]: 0 : if (pages_new == 0)
1320 : 0 : continue;
1321 : : }
1322 : :
1323 [ + - - + ]: 4 : if (rte_eal_using_phys_addrs() &&
1324 : 2 : rte_eal_iova_mode() != RTE_IOVA_VA) {
1325 : : /* find physical addresses for each hugepage */
1326 [ - + ]: 2 : if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
1327 : 0 : EAL_LOG(DEBUG, "Failed to find phys addr "
1328 : : "for %u MB pages",
1329 : : (unsigned int)(hpi->hugepage_sz / 0x100000));
1330 : 0 : goto fail;
1331 : : }
1332 : : } else {
1333 : : /* set physical addresses for each hugepage */
1334 : : if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
1335 : : EAL_LOG(DEBUG, "Failed to set phys addr "
1336 : : "for %u MB pages",
1337 : : (unsigned int)(hpi->hugepage_sz / 0x100000));
1338 : : goto fail;
1339 : : }
1340 : : }
1341 : :
1342 [ - + ]: 2 : if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
1343 : 0 : EAL_LOG(DEBUG, "Failed to find NUMA socket for %u MB pages",
1344 : : (unsigned)(hpi->hugepage_sz / 0x100000));
1345 : 0 : goto fail;
1346 : : }
1347 : :
1348 : 2 : qsort(&tmp_hp[hp_offset], hpi->num_pages[0],
1349 : : sizeof(struct hugepage_file), cmp_physaddr);
1350 : :
1351 : : /* we have processed a num of hugepages of this size, so inc offset */
1352 : 2 : hp_offset += hpi->num_pages[0];
1353 : : }
1354 : :
1355 : : huge_recover_sigbus();
1356 : :
1357 [ - + - - ]: 2 : if (internal_conf->memory == 0 && internal_conf->force_sockets == 0)
1358 : 0 : internal_conf->memory = eal_get_hugepage_mem_size();
1359 : :
1360 : : nr_hugefiles = nr_hugepages;
1361 : :
1362 : :
1363 : : /* clean out the numbers of pages */
1364 [ + + ]: 4 : for (i = 0; i < (int) internal_conf->num_hugepage_sizes; i++)
1365 [ + + ]: 66 : for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
1366 : 64 : internal_conf->hugepage_info[i].num_pages[j] = 0;
1367 : :
1368 : : /* get hugepages for each socket */
1369 [ + + ]: 2048 : for (i = 0; i < nr_hugefiles; i++) {
1370 : 2046 : int socket = tmp_hp[i].socket_id;
1371 : :
1372 : : /* find a hugepage info with right size and increment num_pages */
1373 : 2046 : const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES,
1374 : : (int)internal_conf->num_hugepage_sizes);
1375 [ + + ]: 4092 : for (j = 0; j < nb_hpsizes; j++) {
1376 : 2046 : if (tmp_hp[i].size ==
1377 [ + - ]: 2046 : internal_conf->hugepage_info[j].hugepage_sz) {
1378 : 2046 : internal_conf->hugepage_info[j].num_pages[socket]++;
1379 : : }
1380 : : }
1381 : : }
1382 : :
1383 : : /* make a copy of socket_mem, needed for number of pages calculation */
1384 [ + + ]: 66 : for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
1385 : 64 : memory[i] = internal_conf->socket_mem[i];
1386 : :
1387 : : /* calculate final number of pages */
1388 : 2 : nr_hugepages = eal_dynmem_calc_num_pages_per_socket(memory,
1389 : 2 : internal_conf->hugepage_info, used_hp,
1390 : : internal_conf->num_hugepage_sizes);
1391 : :
1392 : : /* error if not enough memory available */
1393 [ - + ]: 2 : if (nr_hugepages < 0)
1394 : 0 : goto fail;
1395 : :
1396 : : /* reporting in! */
1397 [ + + ]: 4 : for (i = 0; i < (int) internal_conf->num_hugepage_sizes; i++) {
1398 [ + + ]: 66 : for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
1399 [ + + ]: 64 : if (used_hp[i].num_pages[j] > 0) {
1400 : 2 : EAL_LOG(DEBUG,
1401 : : "Requesting %u pages of size %uMB"
1402 : : " from socket %i",
1403 : : used_hp[i].num_pages[j],
1404 : : (unsigned)
1405 : : (used_hp[i].hugepage_sz / 0x100000),
1406 : : j);
1407 : : }
1408 : : }
1409 : : }
1410 : :
1411 : : /* create shared memory */
1412 : 2 : hugepage = create_shared_memory(eal_hugepage_data_path(),
1413 : : nr_hugefiles * sizeof(struct hugepage_file));
1414 : :
1415 [ - + ]: 2 : if (hugepage == NULL) {
1416 : 0 : EAL_LOG(ERR, "Failed to create shared memory!");
1417 : 0 : goto fail;
1418 : : }
1419 : : memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file));
1420 : :
1421 : : /*
1422 : : * unmap pages that we won't need (looks at used_hp).
1423 : : * also, sets final_va to NULL on pages that were unmapped.
1424 : : */
1425 [ - + ]: 2 : if (unmap_unneeded_hugepages(tmp_hp, used_hp,
1426 : : internal_conf->num_hugepage_sizes) < 0) {
1427 : 0 : EAL_LOG(ERR, "Unmapping and locking hugepages failed!");
1428 : 0 : goto fail;
1429 : : }
1430 : :
1431 : : /*
1432 : : * copy stuff from malloc'd hugepage* to the actual shared memory.
1433 : : * this procedure only copies those hugepages that have orig_va
1434 : : * not NULL. has overflow protection.
1435 : : */
1436 [ - + ]: 2 : if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles,
1437 : : tmp_hp, nr_hugefiles) < 0) {
1438 : 0 : EAL_LOG(ERR, "Copying tables to shared memory failed!");
1439 : 0 : goto fail;
1440 : : }
1441 : :
1442 : : #ifndef RTE_ARCH_64
1443 : : /* for legacy 32-bit mode, we did not preallocate VA space, so do it */
1444 : : if (internal_conf->legacy_mem &&
1445 : : prealloc_segments(hugepage, nr_hugefiles)) {
1446 : : EAL_LOG(ERR, "Could not preallocate VA space for hugepages");
1447 : : goto fail;
1448 : : }
1449 : : #endif
1450 : :
1451 : : /* remap all pages we do need into memseg list VA space, so that those
1452 : : * pages become first-class citizens in DPDK memory subsystem
1453 : : */
1454 [ - + ]: 2 : if (remap_needed_hugepages(hugepage, nr_hugefiles)) {
1455 : 0 : EAL_LOG(ERR, "Couldn't remap hugepage files into memseg lists");
1456 : 0 : goto fail;
1457 : : }
1458 : :
1459 : : /* free the hugepage backing files */
1460 [ - + - - ]: 2 : if (internal_conf->hugepage_file.unlink_before_mapping &&
1461 : 0 : unlink_hugepage_files(tmp_hp, internal_conf->num_hugepage_sizes) < 0) {
1462 : 0 : EAL_LOG(ERR, "Unlinking hugepage files failed!");
1463 : 0 : goto fail;
1464 : : }
1465 : :
1466 : : /* free the temporary hugepage table */
1467 : 2 : free(tmp_hp);
1468 : : tmp_hp = NULL;
1469 : :
1470 : 2 : munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
1471 : : hugepage = NULL;
1472 : :
1473 : : /* we're not going to allocate more pages, so release VA space for
1474 : : * unused memseg lists
1475 : : */
1476 [ + + ]: 258 : for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
1477 : : struct rte_memseg_list *msl = &mcfg->memsegs[i];
1478 : : size_t mem_sz;
1479 : :
1480 : : /* skip inactive lists */
1481 [ + + ]: 256 : if (msl->base_va == NULL)
1482 : 240 : continue;
1483 : : /* skip lists where there is at least one page allocated */
1484 [ + + ]: 16 : if (msl->memseg_arr.count > 0)
1485 : 2 : continue;
1486 : : /* this is an unused list, deallocate it */
1487 : 14 : mem_sz = msl->len;
1488 : 14 : munmap(msl->base_va, mem_sz);
1489 : 14 : msl->base_va = NULL;
1490 : 14 : msl->len = 0;
1491 : 14 : msl->heap = 0;
1492 : :
1493 : : /* destroy backing fbarray */
1494 : 14 : rte_fbarray_destroy(&msl->memseg_arr);
1495 : : }
1496 : :
1497 [ - + - - ]: 2 : if (mcfg->dma_maskbits &&
1498 : 0 : rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
1499 : 0 : EAL_LOG(ERR,
1500 : : "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.",
1501 : : __func__);
1502 : 0 : goto fail;
1503 : : }
1504 : :
1505 : : return 0;
1506 : :
1507 [ # # ]: 0 : fail:
1508 : : huge_recover_sigbus();
1509 : 0 : free(tmp_hp);
1510 [ # # ]: 0 : if (hugepage != NULL)
1511 : 0 : munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
1512 : :
1513 : : return -1;
1514 : : }
1515 : :
1516 : : /*
1517 : : * uses fstat to report the size of a file on disk
1518 : : */
1519 : : static off_t
1520 : : getFileSize(int fd)
1521 : : {
1522 : : struct stat st;
1523 [ # # ]: 0 : if (fstat(fd, &st) < 0)
1524 : : return 0;
1525 : 0 : return st.st_size;
1526 : : }
1527 : :
1528 : : /*
1529 : : * This creates the memory mappings in the secondary process to match those of
1530 : : * the primary process. It goes through each memory segment in the DPDK runtime
1531 : : * configuration and finds the hugepages which form that segment, mapping them
1532 : : * in order to form a contiguous block in the virtual memory space
1533 : : */
1534 : : static int
1535 : 1 : eal_legacy_hugepage_attach(void)
1536 : : {
1537 : 1 : struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1538 : : struct hugepage_file *hp = NULL;
1539 : : unsigned int num_hp = 0;
1540 : : unsigned int i = 0;
1541 : : unsigned int cur_seg;
1542 : : off_t size = 0;
1543 : : int fd, fd_hugepage = -1;
1544 : :
1545 [ + - ]: 1 : if (aslr_enabled() > 0) {
1546 : 1 : EAL_LOG(WARNING, "WARNING: Address Space Layout Randomization "
1547 : : "(ASLR) is enabled in the kernel.");
1548 : 1 : EAL_LOG(WARNING, " This may cause issues with mapping memory "
1549 : : "into secondary processes");
1550 : : }
1551 : :
1552 : 1 : fd_hugepage = open(eal_hugepage_data_path(), O_RDONLY);
1553 [ + - ]: 1 : if (fd_hugepage < 0) {
1554 : 1 : EAL_LOG(ERR, "Could not open %s",
1555 : : eal_hugepage_data_path());
1556 : 1 : goto error;
1557 : : }
1558 : :
1559 : : size = getFileSize(fd_hugepage);
1560 : 0 : hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0);
1561 [ # # ]: 0 : if (hp == MAP_FAILED) {
1562 : 0 : EAL_LOG(ERR, "Could not mmap %s",
1563 : : eal_hugepage_data_path());
1564 : 0 : goto error;
1565 : : }
1566 : :
1567 : 0 : num_hp = size / sizeof(struct hugepage_file);
1568 : 0 : EAL_LOG(DEBUG, "Analysing %u files", num_hp);
1569 : :
1570 : : /* map all segments into memory to make sure we get the addrs. the
1571 : : * segments themselves are already in memseg list (which is shared and
1572 : : * has its VA space already preallocated), so we just need to map
1573 : : * everything into correct addresses.
1574 : : */
1575 [ # # ]: 0 : for (i = 0; i < num_hp; i++) {
1576 : 0 : struct hugepage_file *hf = &hp[i];
1577 : 0 : size_t map_sz = hf->size;
1578 : 0 : void *map_addr = hf->final_va;
1579 : : int msl_idx, ms_idx;
1580 : : struct rte_memseg_list *msl;
1581 : : struct rte_memseg *ms;
1582 : :
1583 : : /* if size is zero, no more pages left */
1584 [ # # ]: 0 : if (map_sz == 0)
1585 : : break;
1586 : :
1587 : 0 : fd = open(hf->filepath, O_RDWR);
1588 [ # # ]: 0 : if (fd < 0) {
1589 : 0 : EAL_LOG(ERR, "Could not open %s: %s",
1590 : : hf->filepath, strerror(errno));
1591 : 0 : goto error;
1592 : : }
1593 : :
1594 : 0 : map_addr = mmap(map_addr, map_sz, PROT_READ | PROT_WRITE,
1595 : : MAP_SHARED | MAP_FIXED, fd, 0);
1596 [ # # ]: 0 : if (map_addr == MAP_FAILED) {
1597 : 0 : EAL_LOG(ERR, "Could not map %s: %s",
1598 : : hf->filepath, strerror(errno));
1599 : 0 : goto fd_error;
1600 : : }
1601 : :
1602 : : /* set shared lock on the file. */
1603 [ # # ]: 0 : if (flock(fd, LOCK_SH) < 0) {
1604 : 0 : EAL_LOG(DEBUG, "%s(): Locking file failed: %s",
1605 : : __func__, strerror(errno));
1606 : 0 : goto mmap_error;
1607 : : }
1608 : :
1609 : : /* find segment data */
1610 : 0 : msl = rte_mem_virt2memseg_list(map_addr);
1611 [ # # ]: 0 : if (msl == NULL) {
1612 : 0 : EAL_LOG(DEBUG, "%s(): Cannot find memseg list",
1613 : : __func__);
1614 : 0 : goto mmap_error;
1615 : : }
1616 : 0 : ms = rte_mem_virt2memseg(map_addr, msl);
1617 [ # # ]: 0 : if (ms == NULL) {
1618 : 0 : EAL_LOG(DEBUG, "%s(): Cannot find memseg",
1619 : : __func__);
1620 : 0 : goto mmap_error;
1621 : : }
1622 : :
1623 : 0 : msl_idx = msl - mcfg->memsegs;
1624 : 0 : ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
1625 [ # # ]: 0 : if (ms_idx < 0) {
1626 : 0 : EAL_LOG(DEBUG, "%s(): Cannot find memseg idx",
1627 : : __func__);
1628 : 0 : goto mmap_error;
1629 : : }
1630 : :
1631 : : /* store segment fd internally */
1632 [ # # ]: 0 : if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
1633 : 0 : EAL_LOG(ERR, "Could not store segment fd: %s",
1634 : : rte_strerror(rte_errno));
1635 : : }
1636 : : /* unmap the hugepage config file, since we are done using it */
1637 : 0 : munmap(hp, size);
1638 : 0 : close(fd_hugepage);
1639 : 0 : return 0;
1640 : :
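 : : /*
 : :  * Error handling cascades: mmap_error unmaps the page of the
 : :  * current iteration, fd_error closes its fd, and error then unwinds
 : :  * every mapping created by earlier iterations before releasing the
 : :  * shared data file.
 : :  */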
1641 : 0 : mmap_error:
1642 : 0 : munmap(hp[i].final_va, hp[i].size);
1643 : 0 : fd_error:
1644 : 0 : close(fd);
1645 : 1 : error:
1646 : : /* unwind mmap's done so far */
1647 [ - + ]: 1 : for (cur_seg = 0; cur_seg < i; cur_seg++)
1648 : 0 : munmap(hp[cur_seg].final_va, hp[cur_seg].size);
1649 : :
1650 [ - + ]: 1 : if (hp != NULL && hp != MAP_FAILED)
1651 : 0 : munmap(hp, size);
1652 [ - + ]: 1 : if (fd_hugepage >= 0)
1653 : 0 : close(fd_hugepage);
1654 : : return -1;
1655 : : }
1656 : :
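 : : /*
 : :  * Non-legacy attach: replay the primary's allocations through the
 : :  * memalloc sync mechanism rather than re-mapping per-page files.
 : :  */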
1657 : : static int
1658 : 26 : eal_hugepage_attach(void)
1659 : : {
1660 [ + + ]: 26 : if (eal_memalloc_sync_with_primary()) {
1661 : 1 : EAL_LOG(ERR, "Could not map memory from primary process");
1662 [ + - ]: 1 : if (aslr_enabled() > 0)
1663 : 1 : EAL_LOG(ERR, "It is recommended to disable ASLR in the kernel and retry running both primary and secondary processes");
1664 : 1 : return -1;
1665 : : }
1666 : : return 0;
1667 : : }
1668 : :
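 : : /*
 : :  * Legacy mode reserves and maps all requested hugepages up front;
 : :  * the default (dynamic) mode only creates the memseg infrastructure
 : :  * and allocates pages on demand.
 : :  */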
1669 : : int
1670 : 156 : rte_eal_hugepage_init(void)
1671 : : {
1672 : : const struct internal_config *internal_conf =
1673 : 156 : eal_get_internal_configuration();
1674 : :
1675 : 156 : return internal_conf->legacy_mem ?
1676 [ + + ]: 156 : eal_legacy_hugepage_init() :
1677 : 55 : eal_dynmem_hugepage_init();
1678 : : }
1679 : :
1680 : : int
1681 : 27 : rte_eal_hugepage_attach(void)
1682 : : {
1683 : : const struct internal_config *internal_conf =
1684 : 27 : eal_get_internal_configuration();
1685 : :
1686 : 27 : return internal_conf->legacy_mem ?
1687 [ + + ]: 27 : eal_legacy_hugepage_attach() :
1688 : 26 : eal_hugepage_attach();
1689 : : }
1690 : :
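 : : /*
 : :  * Probe once whether physical addresses are obtainable (hugepages in
 : :  * use and address translation working for an ordinary stack variable)
 : :  * and cache the result in phys_addrs_available.
 : :  */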
1691 : : RTE_EXPORT_SYMBOL(rte_eal_using_phys_addrs)
1692 : : int
1693 : 187 : rte_eal_using_phys_addrs(void)
1694 : : {
1695 [ + + ]: 187 : if (phys_addrs_available == -1) {
1696 : 185 : uint64_t tmp = 0;
1697 : :
1698 [ + + + - ]: 271 : if (rte_eal_has_hugepages() != 0 &&
1699 : 86 : rte_mem_virt2phy(&tmp) != RTE_BAD_PHYS_ADDR)
1700 : 86 : phys_addrs_available = 1;
1701 : : else
1702 : 99 : phys_addrs_available = 0;
1703 : : }
1704 : 187 : return phys_addrs_available;
1705 : : }
1706 : :
1707 : : static int __rte_unused
1708 : : memseg_primary_init_32(void)
1709 : : {
1710 : : struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1711 : : int active_sockets, hpi_idx, msl_idx = 0;
1712 : : unsigned int socket_id, i;
1713 : : struct rte_memseg_list *msl;
1714 : : uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem;
1715 : : uint64_t max_mem;
1716 : : struct internal_config *internal_conf =
1717 : : eal_get_internal_configuration();
1718 : :
1719 : : /* no-huge does not need this at all */
1720 : : if (internal_conf->no_hugetlbfs)
1721 : : return 0;
1722 : :
1723 : : /* this is a giant hack, but desperate times call for desperate
1724 : : * measures. in legacy 32-bit mode, we cannot preallocate VA space,
1725 : : * because having upwards of 2 gigabytes of VA space already mapped will
1726 : : * interfere with our ability to map and sort hugepages.
1727 : : *
1728 : : * therefore, in legacy 32-bit mode, we will be initializing memseg
1729 : : * lists much later - in eal_memory.c, right after we unmap all the
1730 : : * unneeded pages. this will not affect secondary processes, as those
1731 : : * should be able to mmap the space without (too many) problems.
1732 : : */
1733 : : if (internal_conf->legacy_mem)
1734 : : return 0;
1735 : :
1736 : : /* 32-bit mode is a very special case. we cannot know in advance where
1737 : : * the user will want to allocate their memory, so we have to do some
1738 : : * heuristics.
1739 : : */
1740 : : active_sockets = 0;
1741 : : total_requested_mem = 0;
1742 : : if (internal_conf->force_sockets)
1743 : : for (i = 0; i < rte_socket_count(); i++) {
1744 : : uint64_t mem;
1745 : :
1746 : : socket_id = rte_socket_id_by_idx(i);
1747 : : mem = internal_conf->socket_mem[socket_id];
1748 : :
1749 : : if (mem == 0)
1750 : : continue;
1751 : :
1752 : : active_sockets++;
1753 : : total_requested_mem += mem;
1754 : : }
1755 : : else
1756 : : total_requested_mem = internal_conf->memory;
1757 : :
1758 : : max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
1759 : : if (total_requested_mem > max_mem) {
1760 : : EAL_LOG(ERR, "Invalid parameters: 32-bit process can at most use %uM of memory",
1761 : : (unsigned int)(max_mem >> 20));
1762 : : return -1;
1763 : : }
1764 : : total_extra_mem = max_mem - total_requested_mem;
1765 : : extra_mem_per_socket = active_sockets == 0 ? total_extra_mem :
1766 : : total_extra_mem / active_sockets;
1767 : :
1768 : : /* the allocation logic is a little bit convoluted, but here's how it
1769 : : * works, in a nutshell:
1770 : : * - if user hasn't specified on which sockets to allocate memory via
1771 : : * --socket-mem, we allocate all of our memory on main core socket.
1772 : : * - if user has specified sockets to allocate memory on, there may be
1773 : : * some "unused" memory left (e.g. if user has specified --socket-mem
1774 : : * such that not all memory adds up to 2 gigabytes), so add it to all
1775 : : * sockets that are in use equally.
1776 : : *
1777 : : * page sizes are sorted by size in descending order, so we can safely
1778 : : * assume that we dispense with bigger page sizes first.
1779 : : */
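 : :
 : : /* Worked example (assuming the 2 GB cap mentioned above, i.e.
 : :  * RTE_MAX_MEM_MB == 2048): with "--socket-mem 512" on socket 0 only,
 : :  * total_requested_mem is 512M and total_extra_mem is 1536M; since a
 : :  * single socket is active, all of that slack is added to socket 0's
 : :  * budget (max_socket_mem = 2048M) and no other socket is
 : :  * preallocated.
 : :  */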
1780 : :
1781 : : /* create memseg lists */
1782 : : for (i = 0; i < rte_socket_count(); i++) {
1783 : : int hp_sizes = (int) internal_conf->num_hugepage_sizes;
1784 : : uint64_t max_socket_mem, cur_socket_mem;
1785 : : unsigned int main_lcore_socket;
1786 : : struct rte_config *cfg = rte_eal_get_configuration();
1787 : : bool skip;
1788 : :
1789 : : socket_id = rte_socket_id_by_idx(i);
1790 : :
1791 : : #ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
1792 : : /* we can still sort pages by socket in legacy mode */
1793 : : if (!internal_conf->legacy_mem && socket_id > 0)
1794 : : break;
1795 : : #endif
1796 : :
1797 : : /* if we didn't specifically request memory on this socket */
1798 : : skip = active_sockets != 0 &&
1799 : : internal_conf->socket_mem[socket_id] == 0;
1800 : : /* ...or if we didn't specifically request memory on *any*
1801 : : * socket, and this is not main lcore
1802 : : */
1803 : : main_lcore_socket = rte_lcore_to_socket_id(cfg->main_lcore);
1804 : : skip |= active_sockets == 0 && socket_id != main_lcore_socket;
1805 : :
1806 : : if (skip) {
1807 : : EAL_LOG(DEBUG, "Will not preallocate memory on socket %u",
1808 : : socket_id);
1809 : : continue;
1810 : : }
1811 : :
1812 : : /* max amount of memory on this socket */
1813 : : max_socket_mem = (active_sockets != 0 ?
1814 : : internal_conf->socket_mem[socket_id] :
1815 : : internal_conf->memory) +
1816 : : extra_mem_per_socket;
1817 : : cur_socket_mem = 0;
1818 : :
1819 : : for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) {
1820 : : uint64_t max_pagesz_mem, cur_pagesz_mem = 0;
1821 : : uint64_t hugepage_sz;
1822 : : struct hugepage_info *hpi;
1823 : : int type_msl_idx, max_segs, total_segs = 0;
1824 : :
1825 : : hpi = &internal_conf->hugepage_info[hpi_idx];
1826 : : hugepage_sz = hpi->hugepage_sz;
1827 : :
1828 : : /* check if pages are actually available */
1829 : : if (hpi->num_pages[socket_id] == 0)
1830 : : continue;
1831 : :
1832 : : max_segs = RTE_MAX_MEMSEG_PER_TYPE;
1833 : : max_pagesz_mem = max_socket_mem - cur_socket_mem;
1834 : :
1835 : : /* make it multiple of page size */
1836 : : max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem,
1837 : : hugepage_sz);
1838 : :
1839 : : EAL_LOG(DEBUG, "Attempting to preallocate "
1840 : : "%" PRIu64 "M on socket %i",
1841 : : max_pagesz_mem >> 20, socket_id);
1842 : :
1843 : : type_msl_idx = 0;
1844 : : while (cur_pagesz_mem < max_pagesz_mem &&
1845 : : total_segs < max_segs) {
1846 : : uint64_t cur_mem;
1847 : : unsigned int n_segs;
1848 : :
1849 : : if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
1850 : : EAL_LOG(ERR,
1851 : : "No more space in memseg lists, please increase RTE_MAX_MEMSEG_LISTS");
1852 : : return -1;
1853 : : }
1854 : :
1855 : : msl = &mcfg->memsegs[msl_idx];
1856 : :
1857 : : cur_mem = get_mem_amount(hugepage_sz,
1858 : : max_pagesz_mem);
1859 : : n_segs = cur_mem / hugepage_sz;
1860 : :
1861 : : if (eal_memseg_list_init(msl, hugepage_sz,
1862 : : n_segs, socket_id, type_msl_idx,
1863 : : true)) {
1864 : : /* failing to allocate a memseg list is
1865 : : * a serious error.
1866 : : */
1867 : : EAL_LOG(ERR, "Cannot allocate memseg list");
1868 : : return -1;
1869 : : }
1870 : :
1871 : : if (eal_memseg_list_alloc(msl, 0)) {
1872 : : /* if we couldn't allocate VA space, we
1873 : : * can try with smaller page sizes.
1874 : : */
1875 : : EAL_LOG(ERR, "Cannot allocate VA space for memseg list, retrying with different page size");
1876 : : /* deallocate memseg list */
1877 : : if (memseg_list_free(msl))
1878 : : return -1;
1879 : : break;
1880 : : }
1881 : :
1882 : : total_segs += msl->memseg_arr.len;
1883 : : cur_pagesz_mem = total_segs * hugepage_sz;
1884 : : type_msl_idx++;
1885 : : msl_idx++;
1886 : : }
1887 : : cur_socket_mem += cur_pagesz_mem;
1888 : : }
1889 : : if (cur_socket_mem == 0) {
1890 : : EAL_LOG(ERR, "Cannot allocate VA space on socket %u",
1891 : : socket_id);
1892 : : return -1;
1893 : : }
1894 : : }
1895 : :
1896 : : return 0;
1897 : : }
1898 : :
1899 : : static int __rte_unused
1900 : : memseg_primary_init(void)
1901 : : {
1902 : 156 : return eal_dynmem_memseg_lists_init();
1903 : : }
1904 : :
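 : : /*
 : :  * Secondary processes only attach to the shared fbarray metadata and
 : :  * reserve matching VA space here; the hugepages themselves are
 : :  * mapped later, when rte_eal_hugepage_attach() runs.
 : :  */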
1905 : : static int
1906 : 27 : memseg_secondary_init(void)
1907 : : {
1908 : 27 : struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1909 : : int msl_idx = 0;
1910 : : struct rte_memseg_list *msl;
1911 : :
1912 [ + + ]: 3483 : for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
1913 : :
1914 : 3456 : msl = &mcfg->memsegs[msl_idx];
1915 : :
1916 : : /* skip empty and external memseg lists */
1917 [ + + - + ]: 3456 : if (msl->memseg_arr.len == 0 || msl->external)
1918 : 3247 : continue;
1919 : :
1920 [ - + ]: 209 : if (rte_fbarray_attach(&msl->memseg_arr)) {
1921 : 0 : EAL_LOG(ERR, "Cannot attach to primary process memseg lists");
1922 : 0 : return -1;
1923 : : }
1924 : :
1925 : : /* preallocate VA space */
1926 [ - + ]: 209 : if (eal_memseg_list_alloc(msl, 0)) {
1927 : 0 : EAL_LOG(ERR, "Cannot preallocate VA space for hugepage memory");
1928 : 0 : return -1;
1929 : : }
1930 : : }
1931 : :
1932 : : return 0;
1933 : : }
1934 : :
1935 : : int
1936 : 183 : rte_eal_memseg_init(void)
1937 : : {
1938 : : /* increase rlimit to maximum: per-page files and stored segment fds can consume many descriptors */
1939 : : struct rlimit lim;
1940 : :
1941 : : #ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
1942 : : const struct internal_config *internal_conf =
1943 : : eal_get_internal_configuration();
1944 : : #endif
1945 [ + - ]: 183 : if (getrlimit(RLIMIT_NOFILE, &lim) == 0) {
1946 : : /* set limit to maximum */
1947 : 183 : lim.rlim_cur = lim.rlim_max;
1948 : :
1949 [ - + ]: 183 : if (setrlimit(RLIMIT_NOFILE, &lim) < 0) {
1950 : 0 : EAL_LOG(DEBUG, "Setting maximum number of open files failed: %s",
1951 : : strerror(errno));
1952 : : } else {
1953 : 183 : EAL_LOG(DEBUG, "Setting maximum number of open files to %"
1954 : : PRIu64,
1955 : : (uint64_t)lim.rlim_cur);
1956 : : }
1957 : : } else {
1958 : 0 : EAL_LOG(ERR, "Cannot get current resource limits");
1959 : : }
1960 : : #ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
1961 : : if (!internal_conf->legacy_mem && rte_socket_count() > 1) {
1962 : : EAL_LOG(WARNING, "DPDK is running on a NUMA system, but is compiled without NUMA support.");
1963 : : EAL_LOG(WARNING, "This will have adverse consequences for performance and usability.");
1964 : : EAL_LOG(WARNING, "Please use --"OPT_LEGACY_MEM" option, or recompile with NUMA support.");
1965 : : }
1966 : : #endif
1967 : :
1968 : 183 : return rte_eal_process_type() == RTE_PROC_PRIMARY ?
1969 : : #ifndef RTE_ARCH_64
1970 : : memseg_primary_init_32() :
1971 : : #else
1972 [ + + ]: 183 : memseg_primary_init() :
1973 : : #endif
1974 : 27 : memseg_secondary_init();
1975 : : }
|