Branch data Line data Source code
1 : : /* SPDX-License-Identifier: BSD-3-Clause
2 : : * Copyright(c) 2010-2014 Intel Corporation.
3 : : * Copyright(c) 2013 6WIND S.A.
4 : : */
5 : :
6 : : #include <errno.h>
7 : : #include <fcntl.h>
8 : : #include <stdbool.h>
9 : : #include <stdlib.h>
10 : : #include <stdio.h>
11 : : #include <stdint.h>
12 : : #include <inttypes.h>
13 : : #include <string.h>
14 : : #include <sys/mman.h>
15 : : #include <sys/stat.h>
16 : : #include <sys/file.h>
17 : : #include <sys/resource.h>
18 : : #include <sys/personality.h>
19 : : #include <unistd.h>
20 : : #include <limits.h>
21 : : #include <signal.h>
22 : : #include <setjmp.h>
23 : : #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
24 : : #include <numa.h>
25 : : #include <numaif.h>
26 : : #endif
27 : :
28 : : #include <rte_errno.h>
29 : : #include <rte_log.h>
30 : : #include <rte_memory.h>
31 : : #include <rte_eal.h>
32 : : #include <rte_lcore.h>
33 : : #include <rte_common.h>
34 : :
35 : : #include <eal_export.h>
36 : : #include "eal_private.h"
37 : : #include "eal_memalloc.h"
38 : : #include "eal_memcfg.h"
39 : : #include "eal_internal_cfg.h"
40 : : #include "eal_filesystem.h"
41 : : #include "eal_hugepages.h"
42 : : #include "eal_options.h"
43 : :
44 : : #define PFN_MASK_SIZE 8
45 : :
46 : : /**
47 : : * @file
48 : : * Huge page mapping under Linux
49 : : *
50 : : * To reserve a big contiguous amount of memory, we use the hugepage
51 : : * feature of Linux. For that, we need to have hugetlbfs mounted. This
52 : : * code will create many files in that filesystem (one per page) and
53 : : * map them into virtual memory. For each page, we will retrieve its
54 : : * physical address and remap it in order to have a virtually contiguous
55 : : * zone as well as a physically contiguous zone.
56 : : */
57 : :
58 : : static int phys_addrs_available = -1;
59 : :
60 : : #define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
61 : :
62 : 158 : uint64_t eal_get_baseaddr(void)
63 : : {
64 : : /*
65 : : * The Linux kernel uses a really high address as the starting address
66 : : * for serving mmap() calls. If addressing limitations exist and the
67 : : * IOVA mode is VA, this starting address is likely too high for such
68 : : * devices. However, it is possible to use a lower address in the
69 : : * process virtual address space, as with 64 bits there is a lot of
70 : : * available space.
71 : : *
72 : : * Currently known limitations are 39 or 40 bits. Setting the starting
73 : : * address at 4GB implies there are 508GB or 1020GB for mapping the
74 : : * available hugepages. This is likely enough for most systems, although
75 : : * a device with addressing limitations should call
76 : : * rte_mem_check_dma_mask to ensure all memory is within the supported
77 : : * range.
78 : : */
79 : : #if defined(RTE_ARCH_LOONGARCH)
80 : : return 0x7000000000ULL;
81 : : #else
82 : 158 : return 0x100000000ULL;
83 : : #endif
84 : : }
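 : :
 : : /*
 : :  * Minimal usage sketch (not part of eal_memory.c): a device limited to
 : :  * 39-bit IOVAs - an assumed width, chosen only for illustration - could
 : :  * verify after EAL initialization that every mapped IOVA is reachable,
 : :  * as the comment above suggests. Relies on rte_memory.h, included above.
 : :  */
 : : static int
 : : check_39bit_iova_limit(void)
 : : {
 : : 	/* rte_mem_check_dma_mask() returns 0 when all mapped IOVAs fit
 : : 	 * within the given mask width, and a negative value otherwise.
 : : 	 */
 : : 	if (rte_mem_check_dma_mask(39) != 0)
 : : 		return -1; /* some memory lies above the device limit */
 : : 	return 0;
 : : }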
85 : :
86 : : /*
87 : : * Get physical address of any mapped virtual address in the current process.
88 : : */
89 : : RTE_EXPORT_SYMBOL(rte_mem_virt2phy)
90 : : phys_addr_t
91 : 5336 : rte_mem_virt2phy(const void *virtaddr)
92 : : {
93 : : int fd, retval;
94 : : uint64_t page, physaddr;
95 : : unsigned long virt_pfn;
96 : : int page_size;
97 : : off_t offset;
98 : :
99 [ + - ]: 5336 : if (phys_addrs_available == 0)
100 : : return RTE_BAD_IOVA;
101 : :
102 : : /* standard page size */
103 : 5336 : page_size = getpagesize();
104 : :
105 : : fd = open("/proc/self/pagemap", O_RDONLY);
106 [ - + ]: 5336 : if (fd < 0) {
107 : 0 : EAL_LOG(INFO, "%s(): cannot open /proc/self/pagemap: %s",
108 : : __func__, strerror(errno));
109 : 0 : return RTE_BAD_IOVA;
110 : : }
111 : :
112 : 5336 : virt_pfn = (unsigned long)virtaddr / page_size;
113 : 5336 : offset = sizeof(uint64_t) * virt_pfn;
114 [ - + ]: 5336 : if (lseek(fd, offset, SEEK_SET) == (off_t) -1) {
115 : 0 : EAL_LOG(INFO, "%s(): seek error in /proc/self/pagemap: %s",
116 : : __func__, strerror(errno));
117 : 0 : close(fd);
118 : 0 : return RTE_BAD_IOVA;
119 : : }
120 : :
121 : 5336 : retval = read(fd, &page, PFN_MASK_SIZE);
122 : 5336 : close(fd);
123 [ - + ]: 5336 : if (retval < 0) {
124 : 0 : EAL_LOG(INFO, "%s(): cannot read /proc/self/pagemap: %s",
125 : : __func__, strerror(errno));
126 : 0 : return RTE_BAD_IOVA;
127 [ - + ]: 5336 : } else if (retval != PFN_MASK_SIZE) {
128 : 0 : EAL_LOG(INFO, "%s(): read %d bytes from /proc/self/pagemap "
129 : : "but expected %d:",
130 : : __func__, retval, PFN_MASK_SIZE);
131 : 0 : return RTE_BAD_IOVA;
132 : : }
133 : :
134 : : /*
135 : : * the pfn (page frame number) is stored in bits 0-54 (see
136 : : * pagemap.txt in the Linux Documentation)
137 : : */
138 [ + - ]: 5336 : if ((page & 0x7fffffffffffffULL) == 0)
139 : : return RTE_BAD_IOVA;
140 : :
141 : 5336 : physaddr = ((page & 0x7fffffffffffffULL) * page_size)
142 : 5336 : + ((unsigned long)virtaddr % page_size);
143 : :
144 : 5336 : return physaddr;
145 : : }
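 : :
 : : /*
 : :  * Minimal usage sketch (not part of this file): translate the address of
 : :  * a hugepage-backed buffer and handle the failure value. 'buf' is a
 : :  * hypothetical pointer supplied by the caller.
 : :  */
 : : static int
 : : log_buffer_physaddr(const void *buf)
 : : {
 : : 	phys_addr_t pa = rte_mem_virt2phy(buf);
 : :
 : : 	if (pa == RTE_BAD_IOVA)
 : : 		return -1; /* e.g. no access to /proc/self/pagemap */
 : :
 : : 	EAL_LOG(DEBUG, "%p -> physical address 0x%" PRIx64, buf, (uint64_t)pa);
 : : 	return 0;
 : : }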
146 : :
147 : : RTE_EXPORT_SYMBOL(rte_mem_virt2iova)
148 : : rte_iova_t
149 : 3204 : rte_mem_virt2iova(const void *virtaddr)
150 : : {
151 [ - + ]: 3204 : if (rte_eal_iova_mode() == RTE_IOVA_VA)
152 : 0 : return (uintptr_t)virtaddr;
153 : 3204 : return rte_mem_virt2phy(virtaddr);
154 : : }
155 : :
156 : : /*
157 : : * For each hugepage in hugepg_tbl, fill the physaddr value. We find
158 : : * it by browsing the /proc/self/pagemap special file.
159 : : */
160 : : static int
161 : 2 : find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
162 : : {
163 : : unsigned int i;
164 : : phys_addr_t addr;
165 : :
166 [ + + ]: 2048 : for (i = 0; i < hpi->num_pages[0]; i++) {
167 : 2046 : addr = rte_mem_virt2phy(hugepg_tbl[i].orig_va);
168 [ + - ]: 2046 : if (addr == RTE_BAD_PHYS_ADDR)
169 : : return -1;
170 : 2046 : hugepg_tbl[i].physaddr = addr;
171 : : }
172 : : return 0;
173 : : }
174 : :
175 : : /*
176 : : * For each hugepage in hugepg_tbl, fill the physaddr value sequentially.
177 : : */
178 : : static int
179 : : set_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
180 : : {
181 : : unsigned int i;
182 : : static phys_addr_t addr;
183 : :
184 [ # # ]: 0 : for (i = 0; i < hpi->num_pages[0]; i++) {
185 : 0 : hugepg_tbl[i].physaddr = addr;
186 : 0 : addr += hugepg_tbl[i].size;
187 : : }
188 : : return 0;
189 : : }
190 : :
191 : : /*
192 : : * Check whether address-space layout randomization is enabled in
193 : : * the kernel. This is important for multi-process, as it can prevent
194 : : * two processes from mapping data to the same virtual address.
195 : : * Returns:
196 : : * 0 - address space randomization disabled
197 : : * 1/2 - address space randomization enabled
198 : : * negative error code on error
199 : : */
200 : : static int
201 : 2 : aslr_enabled(void)
202 : : {
203 : : char c;
204 : :
205 : : /*
206 : : * Check whether the current process was started with the command line
207 : : * "setarch ... --addr-no-randomize ..." or "setarch ... -R ...".
208 : : * This complements the /proc/sys/kernel/randomize_va_space check below:
209 : : * it is needed because "setarch" disables ASLR for this process only,
210 : : * by setting the ADDR_NO_RANDOMIZE personality flag.
211 : : */
212 [ + - ]: 2 : if ((personality(0xffffffff) & ADDR_NO_RANDOMIZE) == ADDR_NO_RANDOMIZE)
213 : : return 0;
214 : :
215 : : int retval, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY);
216 [ - + ]: 2 : if (fd < 0)
217 : 0 : return -errno;
218 : 2 : retval = read(fd, &c, 1);
219 : 2 : close(fd);
220 [ - + ]: 2 : if (retval < 0)
221 : 0 : return -errno;
222 [ + - ]: 2 : if (retval == 0)
223 : : return -EIO;
224 [ - + ]: 2 : switch (c) {
225 : : case '0' : return 0;
226 : : case '1' : return 1;
227 : : case '2' : return 2;
228 : : default: return -EINVAL;
229 : : }
230 : : }
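 : :
 : : /*
 : :  * For reference (a sketch, not part of this file): ASLR can be disabled
 : :  * for a single run with setarch, which sets the ADDR_NO_RANDOMIZE
 : :  * personality flag checked above:
 : :  *
 : :  *   setarch $(uname -m) --addr-no-randomize <dpdk-application> ...
 : :  *
 : :  * or system-wide (as root) through the sysctl file read above:
 : :  *
 : :  *   echo 0 > /proc/sys/kernel/randomize_va_space
 : :  */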
231 : :
232 : : static sigjmp_buf huge_jmpenv;
233 : :
234 : 0 : static void huge_sigbus_handler(int signo __rte_unused)
235 : : {
236 : 0 : siglongjmp(huge_jmpenv, 1);
237 : : }
238 : :
239 : : /* Put sigsetjmp into a wrapper function to avoid a compiler warning. Any non-volatile,
240 : : * non-static local variable in the stack frame calling sigsetjmp might be
241 : : * clobbered by a call to longjmp.
242 : : */
243 : 2046 : static int huge_wrap_sigsetjmp(void)
244 : : {
245 : 2046 : return sigsetjmp(huge_jmpenv, 1);
246 : : }
247 : :
248 : : #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
249 : : /* Callback for numa library. */
250 : : void numa_error(char *where)
251 : : {
252 : 0 : EAL_LOG(ERR, "%s failed: %s", where, strerror(errno));
253 : 0 : }
254 : : #endif
255 : :
256 : : /*
257 : : * Mmap all hugepages in the hugepage table: for each page, open a file
258 : : * in hugetlbfs and mmap() hugepage_sz bytes of it. The resulting virtual
259 : : * address is stored in hugepg_tbl[i].orig_va. These are only the initial
260 : : * mappings: the pages are remapped later (see remap_segment()) so that
261 : : * physically contiguous blocks also become virtually contiguous.
262 : : */
263 : : static unsigned
264 : 2 : map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
265 : : uint64_t *essential_memory __rte_unused)
266 : : {
267 : : int fd;
268 : : unsigned i;
269 : : void *virtaddr;
270 : : #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
271 : : int node_id = -1;
272 : : int essential_prev = 0;
273 : : int oldpolicy;
274 : : struct bitmask *oldmask = NULL;
275 : : bool have_numa = true;
276 : : unsigned long maxnode = 0;
277 : : const struct internal_config *internal_conf =
278 : 2 : eal_get_internal_configuration();
279 : :
280 : : /* Check if kernel supports NUMA. */
281 [ + - ]: 2 : if (numa_available() != 0) {
282 : 0 : EAL_LOG(DEBUG, "NUMA is not supported.");
283 : : have_numa = false;
284 : : }
285 : :
286 : : if (have_numa) {
287 : 2 : EAL_LOG(DEBUG, "Trying to obtain current memory policy.");
288 : 2 : oldmask = numa_allocate_nodemask();
289 [ - + ]: 2 : if (get_mempolicy(&oldpolicy, oldmask->maskp,
290 : 2 : oldmask->size + 1, 0, 0) < 0) {
291 : 0 : EAL_LOG(ERR,
292 : : "Failed to get current mempolicy: %s. "
293 : : "Assuming MPOL_DEFAULT.", strerror(errno));
294 : 0 : oldpolicy = MPOL_DEFAULT;
295 : : }
296 [ + + ]: 66 : for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
297 [ - + ]: 64 : if (internal_conf->numa_mem[i])
298 : 0 : maxnode = i + 1;
299 : : }
300 : : #endif
301 : :
302 [ + + ]: 2048 : for (i = 0; i < hpi->num_pages[0]; i++) {
303 : 2046 : struct hugepage_file *hf = &hugepg_tbl[i];
304 : 2046 : uint64_t hugepage_sz = hpi->hugepage_sz;
305 : :
306 : : #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
307 [ - + ]: 2046 : if (maxnode) {
308 : : unsigned int j;
309 : :
310 [ # # ]: 0 : for (j = 0; j < maxnode; j++)
311 [ # # ]: 0 : if (essential_memory[j])
312 : : break;
313 : :
314 [ # # ]: 0 : if (j == maxnode) {
315 : 0 : node_id = (node_id + 1) % maxnode;
316 [ # # ]: 0 : while (!internal_conf->numa_mem[node_id]) {
317 : 0 : node_id++;
318 : 0 : node_id %= maxnode;
319 : : }
320 : : essential_prev = 0;
321 : : } else {
322 : 0 : node_id = j;
323 : 0 : essential_prev = essential_memory[j];
324 : :
325 [ # # ]: 0 : if (essential_memory[j] < hugepage_sz)
326 : 0 : essential_memory[j] = 0;
327 : : else
328 : 0 : essential_memory[j] -= hugepage_sz;
329 : : }
330 : :
331 : 0 : EAL_LOG(DEBUG,
332 : : "Setting policy MPOL_PREFERRED for socket %d",
333 : : node_id);
334 : 0 : numa_set_preferred(node_id);
335 : : }
336 : : #endif
337 : :
338 : 2046 : hf->file_id = i;
339 : 2046 : hf->size = hugepage_sz;
340 : 2046 : eal_get_hugefile_path(hf->filepath, sizeof(hf->filepath),
341 : 2046 : hpi->hugedir, hf->file_id);
342 : 2046 : hf->filepath[sizeof(hf->filepath) - 1] = '\0';
343 : :
344 : : /* try to create hugepage file */
345 : : fd = open(hf->filepath, O_CREAT | O_RDWR, 0600);
346 [ - + ]: 2046 : if (fd < 0) {
347 : 0 : EAL_LOG(DEBUG, "%s(): open failed: %s", __func__,
348 : : strerror(errno));
349 : 0 : goto out;
350 : : }
351 : :
352 : : /* map the segment and populate the page tables;
353 : : * the kernel fills this segment with zeros. we don't care where
354 : : * this gets mapped - we already have contiguous memory areas
355 : : * ready for us to map into.
356 : : */
357 : 2046 : virtaddr = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE,
358 : : MAP_SHARED | MAP_POPULATE, fd, 0);
359 [ - + ]: 2046 : if (virtaddr == MAP_FAILED) {
360 : 0 : EAL_LOG(DEBUG, "%s(): mmap failed: %s", __func__,
361 : : strerror(errno));
362 : 0 : close(fd);
363 : 0 : goto out;
364 : : }
365 : :
366 : 2046 : hf->orig_va = virtaddr;
367 : :
368 : : /* In Linux, hugetlb limitations, such as cgroups, are
369 : : * enforced at fault time instead of at mmap() time, even
370 : : * with MAP_POPULATE; the kernel then sends a SIGBUS
371 : : * signal. To avoid being killed, save the stack
372 : : * environment here so that, if SIGBUS happens, we can
373 : : * jump back to this point.
374 : : */
375 [ - + ]: 2046 : if (huge_wrap_sigsetjmp()) {
376 : 0 : EAL_LOG(DEBUG, "SIGBUS: Cannot mmap more "
377 : : "hugepages of size %u MB",
378 : : (unsigned int)(hugepage_sz / 0x100000));
379 : 0 : munmap(virtaddr, hugepage_sz);
380 : 0 : close(fd);
381 : 0 : unlink(hugepg_tbl[i].filepath);
382 : : #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
383 [ # # ]: 0 : if (maxnode)
384 : 0 : essential_memory[node_id] =
385 : : essential_prev;
386 : : #endif
387 : 0 : goto out;
388 : : }
389 : 2046 : *(int *)virtaddr = 0;
390 : :
391 : : /* set shared lock on the file. */
392 [ - + ]: 2046 : if (flock(fd, LOCK_SH) < 0) {
393 : 0 : EAL_LOG(DEBUG, "%s(): Locking file failed:%s ",
394 : : __func__, strerror(errno));
395 : 0 : close(fd);
396 : 0 : goto out;
397 : : }
398 : :
399 : 2046 : close(fd);
400 : : }
401 : :
402 : 2 : out:
403 : : #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
404 [ - + ]: 2 : if (maxnode) {
405 : 0 : EAL_LOG(DEBUG,
406 : : "Restoring previous memory policy: %d", oldpolicy);
407 [ # # ]: 0 : if (oldpolicy == MPOL_DEFAULT) {
408 : 0 : numa_set_localalloc();
409 [ # # ]: 0 : } else if (set_mempolicy(oldpolicy, oldmask->maskp,
410 : 0 : oldmask->size + 1) < 0) {
411 : 0 : EAL_LOG(ERR, "Failed to restore mempolicy: %s",
412 : : strerror(errno));
413 : 0 : numa_set_localalloc();
414 : : }
415 : : }
416 [ + - ]: 2 : if (oldmask != NULL)
417 : : numa_free_cpumask(oldmask);
418 : : #endif
419 : 2 : return i;
420 : : }
421 : :
422 : : /*
423 : : * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge
424 : : * page.
425 : : */
426 : : static int
427 : 2 : find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
428 : : {
429 : : int socket_id;
430 : : char *end, *nodestr;
431 : : unsigned i, hp_count = 0;
432 : : uint64_t virt_addr;
433 : : char buf[BUFSIZ];
434 : : char hugedir_str[PATH_MAX];
435 : : FILE *f;
436 : :
437 : 2 : f = fopen("/proc/self/numa_maps", "r");
438 [ - + ]: 2 : if (f == NULL) {
439 : 0 : EAL_LOG(NOTICE, "NUMA support not available"
440 : : " consider that all memory is in socket_id 0");
441 : 0 : return 0;
442 : : }
443 : :
444 : 2 : snprintf(hugedir_str, sizeof(hugedir_str),
445 : 2 : "%s/%s", hpi->hugedir, eal_get_hugefile_prefix());
446 : :
447 : : /* parse numa map */
448 [ + + ]: 2388 : while (fgets(buf, sizeof(buf), f) != NULL) {
449 : :
450 : : /* ignore non huge page */
451 [ + + ]: 2386 : if (strstr(buf, " huge ") == NULL &&
452 [ + - ]: 340 : strstr(buf, hugedir_str) == NULL)
453 : 340 : continue;
454 : :
455 : : /* get zone addr */
456 : 2046 : virt_addr = strtoull(buf, &end, 16);
457 [ + - - + ]: 2046 : if (virt_addr == 0 || end == buf) {
458 : 0 : EAL_LOG(ERR, "%s(): error in numa_maps parsing", __func__);
459 : 0 : goto error;
460 : : }
461 : :
462 : : /* get node id (socket id) */
463 : 2046 : nodestr = strstr(buf, " N");
464 [ - + ]: 2046 : if (nodestr == NULL) {
465 : 0 : EAL_LOG(ERR, "%s(): error in numa_maps parsing", __func__);
466 : 0 : goto error;
467 : : }
468 : 2046 : nodestr += 2;
469 : 2046 : end = strstr(nodestr, "=");
470 [ - + ]: 2046 : if (end == NULL) {
471 : 0 : EAL_LOG(ERR, "%s(): error in numa_maps parsing", __func__);
472 : 0 : goto error;
473 : : }
474 : 2046 : end[0] = '\0';
475 : 2046 : end = NULL;
476 : :
477 : 2046 : socket_id = strtoul(nodestr, &end, 0);
478 [ + - + - : 2046 : if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) {
- + ]
479 : 0 : EAL_LOG(ERR, "%s(): error in numa_maps parsing", __func__);
480 : 0 : goto error;
481 : : }
482 : :
483 : : /* if we find this page in our mappings, set socket_id */
484 [ + + ]: 2095104 : for (i = 0; i < hpi->num_pages[0]; i++) {
485 : 2093058 : void *va = (void *)(unsigned long)virt_addr;
486 [ + + ]: 2093058 : if (hugepg_tbl[i].orig_va == va) {
487 : 2046 : hugepg_tbl[i].socket_id = socket_id;
488 : 2046 : hp_count++;
489 : : #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
490 : 2046 : EAL_LOG(DEBUG,
491 : : "Hugepage %s is on socket %d",
492 : : hugepg_tbl[i].filepath, socket_id);
493 : : #endif
494 : : }
495 : : }
496 : : }
497 : :
498 [ - + ]: 2 : if (hp_count < hpi->num_pages[0])
499 : 0 : goto error;
500 : :
501 : 2 : fclose(f);
502 : 2 : return 0;
503 : :
504 : 0 : error:
505 : 0 : fclose(f);
506 : 0 : return -1;
507 : : }
508 : :
509 : : static int
510 : 10401 : cmp_physaddr(const void *a, const void *b)
511 : : {
512 : : #ifndef RTE_ARCH_PPC_64
513 : : const struct hugepage_file *p1 = a;
514 : : const struct hugepage_file *p2 = b;
515 : : #else
516 : : /* PowerPC needs memory sorted in reverse order from x86 */
517 : : const struct hugepage_file *p1 = b;
518 : : const struct hugepage_file *p2 = a;
519 : : #endif
520 [ + + ]: 10401 : if (p1->physaddr < p2->physaddr)
521 : : return -1;
522 [ - + ]: 9179 : else if (p1->physaddr > p2->physaddr)
523 : : return 1;
524 : : else
525 : 0 : return 0;
526 : : }
527 : :
528 : : /*
529 : : * Uses mmap to create a shared memory area for storage of data.
530 : : * Used in this file to store the hugepage file map on disk.
531 : : */
532 : : static void *
533 : 2 : create_shared_memory(const char *filename, const size_t mem_size)
534 : : {
535 : : void *retval;
536 : : int fd;
537 : : const struct internal_config *internal_conf =
538 : 2 : eal_get_internal_configuration();
539 : :
540 : : /* if no shared files mode is used, create anonymous memory instead */
541 [ - + ]: 2 : if (internal_conf->no_shconf) {
542 : 0 : retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
543 : : MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
544 [ # # ]: 0 : if (retval == MAP_FAILED)
545 : : return NULL;
546 : 0 : return retval;
547 : : }
548 : :
549 : : fd = open(filename, O_CREAT | O_RDWR, 0600);
550 [ + - ]: 2 : if (fd < 0)
551 : : return NULL;
552 [ - + ]: 2 : if (ftruncate(fd, mem_size) < 0) {
553 : 0 : close(fd);
554 : 0 : return NULL;
555 : : }
556 : 2 : retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
557 : 2 : close(fd);
558 [ - + ]: 2 : if (retval == MAP_FAILED)
559 : 0 : return NULL;
560 : : return retval;
561 : : }
562 : :
563 : : /*
564 : : * this copies *active* hugepages from one hugepage table to another.
565 : : * destination is typically the shared memory.
566 : : */
567 : : static int
568 : 2 : copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size,
569 : : const struct hugepage_file * src, int src_size)
570 : : {
571 : : int src_pos, dst_pos = 0;
572 : :
573 [ + + ]: 2048 : for (src_pos = 0; src_pos < src_size; src_pos++) {
574 [ + + ]: 2046 : if (src[src_pos].orig_va != NULL) {
575 : : /* error on overflow attempt */
576 [ + - ]: 18 : if (dst_pos == dest_size)
577 : : return -1;
578 : 18 : memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage_file));
579 : 18 : dst_pos++;
580 : : }
581 : : }
582 : : return 0;
583 : : }
584 : :
585 : : static int
586 : 0 : unlink_hugepage_files(struct hugepage_file *hugepg_tbl,
587 : : unsigned num_hp_info)
588 : : {
589 : : unsigned socket, size;
590 : : int page, nrpages = 0;
591 : : const struct internal_config *internal_conf =
592 : 0 : eal_get_internal_configuration();
593 : :
594 : : /* get total number of hugepages */
595 [ # # ]: 0 : for (size = 0; size < num_hp_info; size++)
596 [ # # ]: 0 : for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
597 : 0 : nrpages +=
598 : 0 : internal_conf->hugepage_info[size].num_pages[socket];
599 : :
600 [ # # ]: 0 : for (page = 0; page < nrpages; page++) {
601 : 0 : struct hugepage_file *hp = &hugepg_tbl[page];
602 : :
603 [ # # # # ]: 0 : if (hp->orig_va != NULL && unlink(hp->filepath)) {
604 : 0 : EAL_LOG(WARNING, "%s(): Removing %s failed: %s",
605 : : __func__, hp->filepath, strerror(errno));
606 : : }
607 : : }
608 : 0 : return 0;
609 : : }
610 : :
611 : : /*
612 : : * unmaps hugepages that are not going to be used. since we originally allocate
613 : : * ALL hugepages (not just those we need), additional unmapping needs to be done.
614 : : */
615 : : static int
616 : 2 : unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl,
617 : : struct hugepage_info *hpi,
618 : : unsigned num_hp_info)
619 : : {
620 : : unsigned socket, size;
621 : : int page, nrpages = 0;
622 : : const struct internal_config *internal_conf =
623 : 2 : eal_get_internal_configuration();
624 : :
625 : : /* get total number of hugepages */
626 [ + + ]: 4 : for (size = 0; size < num_hp_info; size++)
627 [ + + ]: 66 : for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
628 : 64 : nrpages += internal_conf->hugepage_info[size].num_pages[socket];
629 : :
630 [ + + ]: 4 : for (size = 0; size < num_hp_info; size++) {
631 [ + + ]: 66 : for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
632 : : unsigned pages_found = 0;
633 : :
634 : : /* traverse until we have unmapped all the unused pages */
635 [ + + ]: 65536 : for (page = 0; page < nrpages; page++) {
636 : 65472 : struct hugepage_file *hp = &hugepg_tbl[page];
637 : :
638 : : /* find a page that matches the criteria */
639 [ + - ]: 65472 : if ((hp->size == hpi[size].hugepage_sz) &&
640 [ + + ]: 65472 : (hp->socket_id == (int) socket)) {
641 : :
642 : : /* if we skipped enough pages, unmap the rest */
643 [ + + ]: 2046 : if (pages_found == hpi[size].num_pages[socket]) {
644 : : uint64_t unmap_len;
645 : :
646 : : unmap_len = hp->size;
647 : :
648 : : /* get start addr and len of the remaining segment */
649 : 2028 : munmap(hp->orig_va,
650 : : (size_t)unmap_len);
651 : :
652 : 2028 : hp->orig_va = NULL;
653 [ - + ]: 2028 : if (unlink(hp->filepath) == -1) {
654 : 0 : EAL_LOG(ERR, "%s(): Removing %s failed: %s",
655 : : __func__, hp->filepath, strerror(errno));
656 : 0 : return -1;
657 : : }
658 : : } else {
659 : : /* lock the page and skip */
660 : 18 : pages_found++;
661 : : }
662 : :
663 : : } /* match page */
664 : : } /* foreach page */
665 : : } /* foreach socket */
666 : : } /* foreach pagesize */
667 : :
668 : : return 0;
669 : : }
670 : :
671 : : static int
672 : 2 : remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)
673 : : {
674 : 2 : struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
675 : : struct rte_memseg_list *msl;
676 : : struct rte_fbarray *arr;
677 : : int cur_page, seg_len;
678 : : unsigned int msl_idx;
679 : : int ms_idx;
680 : : uint64_t page_sz;
681 : : size_t memseg_len;
682 : : int socket_id;
683 : : #ifndef RTE_ARCH_64
684 : : const struct internal_config *internal_conf =
685 : : eal_get_internal_configuration();
686 : : #endif
687 : 2 : page_sz = hugepages[seg_start].size;
688 : 2 : socket_id = hugepages[seg_start].socket_id;
689 : 2 : seg_len = seg_end - seg_start;
690 : :
691 : 2 : EAL_LOG(DEBUG, "Attempting to map %" PRIu64 "M on socket %i",
692 : : (seg_len * page_sz) >> 20ULL, socket_id);
693 : :
694 : : /* find free space in memseg lists */
695 [ + - ]: 2 : for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
696 : : int free_len;
697 : : bool empty;
698 : 2 : msl = &mcfg->memsegs[msl_idx];
699 : 2 : arr = &msl->memseg_arr;
700 : :
701 [ - + ]: 2 : if (msl->page_sz != page_sz)
702 : 0 : continue;
703 [ - + ]: 2 : if (msl->socket_id != socket_id)
704 : 0 : continue;
705 : :
706 : : /* leave space for a hole if array is not empty */
707 : 2 : empty = arr->count == 0;
708 : : /* find start of the biggest contiguous block and its size */
709 : 2 : ms_idx = rte_fbarray_find_biggest_free(arr, 0);
710 [ - + ]: 2 : if (ms_idx < 0)
711 : 0 : continue;
712 : : /* the hole takes one segment, so we need at least two free segments */
713 : 2 : free_len = rte_fbarray_find_contig_free(arr, ms_idx);
714 [ - + ]: 2 : if (free_len < 2)
715 : 0 : continue;
716 : : /* leave some space between memsegs, they are not IOVA
717 : : * contiguous, so they shouldn't be VA contiguous either.
718 : : */
719 [ - + ]: 2 : if (!empty) {
720 : 0 : ms_idx++;
721 : 0 : free_len--;
722 : : }
723 : :
724 : : /* we might not get all of the space we wanted */
725 : 2 : free_len = RTE_MIN(seg_len, free_len);
726 : 2 : seg_end = seg_start + free_len;
727 : : seg_len = seg_end - seg_start;
728 : 2 : break;
729 : : }
730 [ - + ]: 2 : if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
731 : 0 : EAL_LOG(ERR, "Could not find space for memseg. Please increase RTE_MAX_MEMSEG_PER_LIST "
732 : : "RTE_MAX_MEMSEG_PER_TYPE and/or RTE_MAX_MEM_MB_PER_TYPE in configuration.");
733 : 0 : return -1;
734 : : }
735 : :
736 : : #ifdef RTE_ARCH_PPC_64
737 : : /* for PPC64 we go through the list backwards */
738 : : for (cur_page = seg_end - 1; cur_page >= seg_start;
739 : : cur_page--, ms_idx++) {
740 : : #else
741 [ + + ]: 20 : for (cur_page = seg_start; cur_page < seg_end; cur_page++, ms_idx++) {
742 : : #endif
743 : 18 : struct hugepage_file *hfile = &hugepages[cur_page];
744 : 18 : struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
745 : : void *addr;
746 : : int fd;
747 : :
748 : 18 : fd = open(hfile->filepath, O_RDWR);
749 [ - + ]: 18 : if (fd < 0) {
750 : 0 : EAL_LOG(ERR, "Could not open '%s': %s",
751 : : hfile->filepath, strerror(errno));
752 : 0 : return -1;
753 : : }
754 : : /* set shared lock on the file. */
755 [ - + ]: 18 : if (flock(fd, LOCK_SH) < 0) {
756 : 0 : EAL_LOG(DEBUG, "Could not lock '%s': %s",
757 : : hfile->filepath, strerror(errno));
758 : 0 : close(fd);
759 : 0 : return -1;
760 : : }
761 : : memseg_len = (size_t)page_sz;
762 : 18 : addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len);
763 : :
764 : : /* we know this address is already mmapped by memseg list, so
765 : : * using MAP_FIXED here is safe
766 : : */
767 : 18 : addr = mmap(addr, page_sz, PROT_READ | PROT_WRITE,
768 : : MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0);
769 [ - + ]: 18 : if (addr == MAP_FAILED) {
770 : 0 : EAL_LOG(ERR, "Couldn't remap '%s': %s",
771 : : hfile->filepath, strerror(errno));
772 : 0 : close(fd);
773 : 0 : return -1;
774 : : }
775 : :
776 : : /* we have a new address, so unmap previous one */
777 : : #ifndef RTE_ARCH_64
778 : : /* in 32-bit legacy mode, we have already unmapped the page */
779 : : if (!internal_conf->legacy_mem)
780 : : munmap(hfile->orig_va, page_sz);
781 : : #else
782 : 18 : munmap(hfile->orig_va, page_sz);
783 : : #endif
784 : :
785 : 18 : hfile->orig_va = NULL;
786 : 18 : hfile->final_va = addr;
787 : :
788 : : /* rewrite physical addresses when in IOVA-as-VA mode */
789 [ - + ]: 18 : if (rte_eal_iova_mode() == RTE_IOVA_VA)
790 : 0 : hfile->physaddr = (uintptr_t)addr;
791 : :
792 : : /* set up memseg data */
793 : 18 : ms->addr = addr;
794 : 18 : ms->hugepage_sz = page_sz;
795 : 18 : ms->len = memseg_len;
796 : 18 : ms->iova = hfile->physaddr;
797 : 18 : ms->socket_id = hfile->socket_id;
798 : 18 : ms->nchannel = rte_memory_get_nchannel();
799 : 18 : ms->nrank = rte_memory_get_nrank();
800 : :
801 : 18 : rte_fbarray_set_used(arr, ms_idx);
802 : :
803 : : /* store segment fd internally */
804 [ - + ]: 18 : if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
805 : 0 : EAL_LOG(ERR, "Could not store segment fd: %s",
806 : : rte_strerror(rte_errno));
807 : : }
808 : 2 : EAL_LOG(DEBUG, "Allocated %" PRIu64 "M on socket %i",
809 : : (seg_len * page_sz) >> 20, socket_id);
810 : 2 : return seg_len;
811 : : }
812 : :
813 : : static uint64_t
814 : : get_mem_amount(uint64_t page_sz, uint64_t max_mem)
815 : : {
816 : : uint64_t area_sz, max_pages;
817 : :
818 : : /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */
819 : : max_pages = RTE_MAX_MEMSEG_PER_LIST;
820 : : max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem);
821 : :
822 : : area_sz = RTE_MIN(page_sz * max_pages, max_mem);
823 : :
824 : : /* make sure the list isn't smaller than the page size */
825 : : area_sz = RTE_MAX(area_sz, page_sz);
826 : :
827 : : return RTE_ALIGN(area_sz, page_sz);
828 : : }
829 : :
830 : : static int
831 : : memseg_list_free(struct rte_memseg_list *msl)
832 : : {
833 : : if (rte_fbarray_destroy(&msl->memseg_arr)) {
834 : : EAL_LOG(ERR, "Cannot destroy memseg list");
835 : : return -1;
836 : : }
837 : : memset(msl, 0, sizeof(*msl));
838 : : return 0;
839 : : }
840 : :
841 : : /*
842 : : * Our VA space is not preallocated yet, so preallocate it here. We need to know
843 : : * how many segments there are in order to map all pages into one address space,
844 : : * and leave appropriate holes between segments so that rte_malloc does not
845 : : * concatenate them into one big segment.
846 : : *
847 : : * We also need to unmap the original pages to free up address space.
848 : : */
849 : : static int __rte_unused
850 : : prealloc_segments(struct hugepage_file *hugepages, int n_pages)
851 : : {
852 : : struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
853 : : int cur_page, seg_start_page, end_seg, new_memseg;
854 : : unsigned int hpi_idx, socket, i;
855 : : int n_contig_segs, n_segs;
856 : : int msl_idx;
857 : : const struct internal_config *internal_conf =
858 : : eal_get_internal_configuration();
859 : :
860 : : /* before we preallocate segments, we need to free up our VA space.
861 : : * we're not removing files, and we already have information about
862 : : * PA-contiguousness, so it is safe to unmap everything.
863 : : */
864 : : for (cur_page = 0; cur_page < n_pages; cur_page++) {
865 : : struct hugepage_file *hpi = &hugepages[cur_page];
866 : : munmap(hpi->orig_va, hpi->size);
867 : : hpi->orig_va = NULL;
868 : : }
869 : :
870 : : /* we cannot know how many page sizes and sockets we have discovered, so
871 : : * loop over all of them
872 : : */
873 : : for (hpi_idx = 0; hpi_idx < internal_conf->num_hugepage_sizes;
874 : : hpi_idx++) {
875 : : uint64_t page_sz =
876 : : internal_conf->hugepage_info[hpi_idx].hugepage_sz;
877 : :
878 : : for (i = 0; i < rte_socket_count(); i++) {
879 : : struct rte_memseg_list *msl;
880 : :
881 : : socket = rte_socket_id_by_idx(i);
882 : : n_contig_segs = 0;
883 : : n_segs = 0;
884 : : seg_start_page = -1;
885 : :
886 : : for (cur_page = 0; cur_page < n_pages; cur_page++) {
887 : : struct hugepage_file *prev, *cur;
888 : : int prev_seg_start_page = -1;
889 : :
890 : : cur = &hugepages[cur_page];
891 : : prev = cur_page == 0 ? NULL :
892 : : &hugepages[cur_page - 1];
893 : :
894 : : new_memseg = 0;
895 : : end_seg = 0;
896 : :
897 : : if (cur->size == 0)
898 : : end_seg = 1;
899 : : else if (cur->socket_id != (int) socket)
900 : : end_seg = 1;
901 : : else if (cur->size != page_sz)
902 : : end_seg = 1;
903 : : else if (cur_page == 0)
904 : : new_memseg = 1;
905 : : #ifdef RTE_ARCH_PPC_64
906 : : /* On PPC64 architecture, the mmap always start
907 : : * from higher address to lower address. Here,
908 : : * physical addresses are in descending order.
909 : : */
910 : : else if ((prev->physaddr - cur->physaddr) !=
911 : : cur->size)
912 : : new_memseg = 1;
913 : : #else
914 : : else if ((cur->physaddr - prev->physaddr) !=
915 : : cur->size)
916 : : new_memseg = 1;
917 : : #endif
918 : : if (new_memseg) {
919 : : /* if we're already inside a segment,
920 : : * new segment means end of current one
921 : : */
922 : : if (seg_start_page != -1) {
923 : : end_seg = 1;
924 : : prev_seg_start_page =
925 : : seg_start_page;
926 : : }
927 : : seg_start_page = cur_page;
928 : : }
929 : :
930 : : if (end_seg) {
931 : : if (prev_seg_start_page != -1) {
932 : : /* we've found a new segment */
933 : : n_contig_segs++;
934 : : n_segs += cur_page -
935 : : prev_seg_start_page;
936 : : } else if (seg_start_page != -1) {
937 : : /* we didn't find new segment,
938 : : * but did end current one
939 : : */
940 : : n_contig_segs++;
941 : : n_segs += cur_page -
942 : : seg_start_page;
943 : : seg_start_page = -1;
944 : : continue;
945 : : } else {
946 : : /* we're skipping this page */
947 : : continue;
948 : : }
949 : : }
950 : : /* segment continues */
951 : : }
952 : : /* check if we missed last segment */
953 : : if (seg_start_page != -1) {
954 : : n_contig_segs++;
955 : : n_segs += cur_page - seg_start_page;
956 : : }
957 : :
958 : : /* if no segments were found, do not preallocate */
959 : : if (n_segs == 0)
960 : : continue;
961 : :
962 : : /* we now have total number of pages that we will
963 : : * allocate for this segment list. add separator pages
964 : : * to the total count, and preallocate VA space.
965 : : */
966 : : n_segs += n_contig_segs - 1;
967 : :
968 : : /* now, preallocate VA space for these segments */
969 : :
970 : : /* first, find suitable memseg list for this */
971 : : for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;
972 : : msl_idx++) {
973 : : msl = &mcfg->memsegs[msl_idx];
974 : :
975 : : if (msl->base_va != NULL)
976 : : continue;
977 : : break;
978 : : }
979 : : if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
980 : : EAL_LOG(ERR, "Not enough space in memseg lists, please increase RTE_MAX_MEMSEG_LISTS");
981 : : return -1;
982 : : }
983 : :
984 : : /* now, allocate fbarray itself */
985 : : if (eal_memseg_list_init(msl, page_sz, n_segs,
986 : : socket, msl_idx, true) < 0)
987 : : return -1;
988 : :
989 : : /* finally, allocate VA space */
990 : : if (eal_memseg_list_alloc(msl, 0) < 0) {
991 : : EAL_LOG(ERR, "Cannot preallocate 0x%"PRIx64"kB hugepages",
992 : : page_sz >> 10);
993 : : return -1;
994 : : }
995 : : }
996 : : }
997 : : return 0;
998 : : }
999 : :
1000 : : /*
1001 : : * We cannot reallocate memseg lists on the fly because PPC64 stores pages
1002 : : * backwards; therefore we have to process the entire memseg before
1003 : : * remapping it into the memseg list VA space.
1004 : : */
1005 : : static int
1006 : 2 : remap_needed_hugepages(struct hugepage_file *hugepages, int n_pages)
1007 : : {
1008 : : int cur_page, seg_start_page, new_memseg, ret;
1009 : :
1010 : : seg_start_page = 0;
1011 [ + - ]: 20 : for (cur_page = 0; cur_page < n_pages; cur_page++) {
1012 : : struct hugepage_file *prev, *cur;
1013 : :
1014 : : new_memseg = 0;
1015 : :
1016 : 20 : cur = &hugepages[cur_page];
1017 [ + + ]: 20 : prev = cur_page == 0 ? NULL : &hugepages[cur_page - 1];
1018 : :
1019 : : /* if size is zero, no more pages left */
1020 [ + + ]: 20 : if (cur->size == 0)
1021 : : break;
1022 : :
1023 [ + + ]: 18 : if (cur_page == 0)
1024 : : new_memseg = 1;
1025 [ + - ]: 16 : else if (cur->socket_id != prev->socket_id)
1026 : : new_memseg = 1;
1027 [ + - ]: 16 : else if (cur->size != prev->size)
1028 : : new_memseg = 1;
1029 : : #ifdef RTE_ARCH_PPC_64
1030 : : /* On the PPC64 architecture, mmap always maps from higher
1031 : : * addresses to lower addresses. Here, physical addresses are in
1032 : : * descending order.
1033 : : */
1034 : : else if ((prev->physaddr - cur->physaddr) != cur->size)
1035 : : new_memseg = 1;
1036 : : #else
1037 [ - + ]: 16 : else if ((cur->physaddr - prev->physaddr) != cur->size)
1038 : : new_memseg = 1;
1039 : : #endif
1040 : :
1041 : : if (new_memseg) {
1042 : : /* if this isn't the first time, remap segment */
1043 [ - + ]: 2 : if (cur_page != 0) {
1044 : : int n_remapped = 0;
1045 : 0 : int n_needed = cur_page - seg_start_page;
1046 [ # # ]: 0 : while (n_remapped < n_needed) {
1047 : 0 : ret = remap_segment(hugepages, seg_start_page,
1048 : : cur_page);
1049 [ # # ]: 0 : if (ret < 0)
1050 : : return -1;
1051 : 0 : n_remapped += ret;
1052 : 0 : seg_start_page += ret;
1053 : : }
1054 : : }
1055 : : /* remember where we started */
1056 : : seg_start_page = cur_page;
1057 : : }
1058 : : /* continuation of previous memseg */
1059 : : }
1060 : : /* we were stopped, but we didn't remap the last segment, do it now */
1061 [ + - ]: 2 : if (cur_page != 0) {
1062 : : int n_remapped = 0;
1063 : 2 : int n_needed = cur_page - seg_start_page;
1064 [ + + ]: 4 : while (n_remapped < n_needed) {
1065 : 2 : ret = remap_segment(hugepages, seg_start_page,
1066 : : cur_page);
1067 [ + - ]: 2 : if (ret < 0)
1068 : : return -1;
1069 : 2 : n_remapped += ret;
1070 : 2 : seg_start_page += ret;
1071 : : }
1072 : : }
1073 : : return 0;
1074 : : }
1075 : :
1076 : : static inline size_t
1077 : 0 : eal_get_hugepage_mem_size(void)
1078 : : {
1079 : : uint64_t size = 0;
1080 : : unsigned i, j;
1081 : : struct internal_config *internal_conf =
1082 : 0 : eal_get_internal_configuration();
1083 : :
1084 [ # # ]: 0 : for (i = 0; i < internal_conf->num_hugepage_sizes; i++) {
1085 : : struct hugepage_info *hpi = &internal_conf->hugepage_info[i];
1086 [ # # ]: 0 : if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) {
1087 [ # # ]: 0 : for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
1088 : 0 : size += hpi->hugepage_sz * hpi->num_pages[j];
1089 : : }
1090 : : }
1091 : : }
1092 : :
1093 : 0 : return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
1094 : : }
1095 : :
1096 : : static struct sigaction huge_action_old;
1097 : : static int huge_need_recover;
1098 : :
1099 : : static void
1100 : 2 : huge_register_sigbus(void)
1101 : : {
1102 : : sigset_t mask;
1103 : : struct sigaction action;
1104 : :
1105 : 2 : sigemptyset(&mask);
1106 : 2 : sigaddset(&mask, SIGBUS);
1107 : 2 : action.sa_flags = 0;
1108 : 2 : action.sa_mask = mask;
1109 : 2 : action.sa_handler = huge_sigbus_handler;
1110 : :
1111 : 2 : huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
1112 : 2 : }
1113 : :
1114 : : static void
1115 : : huge_recover_sigbus(void)
1116 : : {
1117 [ + - ]: 2 : if (huge_need_recover) {
1118 : 2 : sigaction(SIGBUS, &huge_action_old, NULL);
1119 : 2 : huge_need_recover = 0;
1120 : : }
1121 : : }
1122 : :
1123 : : /*
1124 : : * Prepare the physical memory mapping: fill the configuration structure
1125 : : * with this information and return 0 on success.
1126 : : * 1. map N huge pages in separate files in hugetlbfs
1127 : : * 2. find associated physical addr
1128 : : * 3. find associated NUMA socket ID
1129 : : * 4. sort all huge pages by physical address
1130 : : * 5. remap these N huge pages in the correct order
1131 : : * 6. unmap the first mapping
1132 : : * 7. fill memsegs in configuration with contiguous zones
1133 : : */
1134 : : static int
1135 : 102 : eal_legacy_hugepage_init(void)
1136 : : {
1137 : : struct rte_mem_config *mcfg;
1138 : : struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
1139 : : struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
1140 : : struct internal_config *internal_conf =
1141 : 102 : eal_get_internal_configuration();
1142 : :
1143 : : uint64_t memory[RTE_MAX_NUMA_NODES];
1144 : :
1145 : : unsigned hp_offset;
1146 : : int i, j;
1147 : : int nr_hugefiles, nr_hugepages = 0;
1148 : : void *addr;
1149 : :
1150 : : memset(used_hp, 0, sizeof(used_hp));
1151 : :
1152 : : /* get pointer to global configuration */
1153 : 102 : mcfg = rte_eal_get_configuration()->mem_config;
1154 : :
1155 : : /* hugetlbfs can be disabled */
1156 [ + + ]: 102 : if (internal_conf->no_hugetlbfs) {
1157 : : void *prealloc_addr;
1158 : : size_t mem_sz;
1159 : : struct rte_memseg_list *msl;
1160 : : int n_segs, fd, flags;
1161 : : int memfd;
1162 : : uint64_t page_sz;
1163 : :
1164 : : /* nohuge mode is legacy mode */
1165 : 100 : internal_conf->legacy_mem = 1;
1166 : :
1167 : : /* nohuge mode is single-file segments mode */
1168 : 100 : internal_conf->single_file_segments = 1;
1169 : :
1170 : : /* create a memseg list */
1171 : 100 : msl = &mcfg->memsegs[0];
1172 : :
1173 : 100 : mem_sz = internal_conf->memory;
1174 : : page_sz = RTE_PGSIZE_4K;
1175 : 100 : n_segs = mem_sz / page_sz;
1176 : :
1177 [ + - ]: 100 : if (eal_memseg_list_init_named(
1178 : : msl, "nohugemem", page_sz, n_segs, 0, true)) {
1179 : : return -1;
1180 : : }
1181 : :
1182 : : /* set up parameters for anonymous mmap */
1183 : : fd = -1;
1184 : : flags = MAP_PRIVATE | MAP_ANONYMOUS;
1185 : :
1186 : : /* create a memfd and store it in the segment fd table */
1187 : 100 : memfd = memfd_create("nohuge", 0);
1188 [ - + ]: 100 : if (memfd < 0) {
1189 : 0 : EAL_LOG(DEBUG, "Cannot create memfd: %s",
1190 : : strerror(errno));
1191 : 0 : EAL_LOG(DEBUG, "Falling back to anonymous map");
1192 : : } else {
1193 : : /* we got an fd - now resize it */
1194 [ - + ]: 100 : if (ftruncate(memfd, internal_conf->memory) < 0) {
1195 : 0 : EAL_LOG(ERR, "Cannot resize memfd: %s",
1196 : : strerror(errno));
1197 : 0 : EAL_LOG(ERR, "Falling back to anonymous map");
1198 : 0 : close(memfd);
1199 : : } else {
1200 : : /* creating memfd-backed file was successful.
1201 : : * we want changes to memfd to be visible to
1202 : : * other processes (such as vhost backend), so
1203 : : * map it as shared memory.
1204 : : */
1205 : 100 : EAL_LOG(DEBUG, "Using memfd for anonymous memory");
1206 : : fd = memfd;
1207 : : flags = MAP_SHARED;
1208 : : }
1209 : : }
1210 : : /* preallocate address space for the memory, so that it can
1211 : : * fit into the DMA mask.
1212 : : */
1213 [ - + ]: 100 : if (eal_memseg_list_alloc(msl, 0)) {
1214 : 0 : EAL_LOG(ERR, "Cannot preallocate VA space for hugepage memory");
1215 : 0 : return -1;
1216 : : }
1217 : :
1218 : 100 : prealloc_addr = msl->base_va;
1219 : 100 : addr = mmap(prealloc_addr, mem_sz, PROT_READ | PROT_WRITE,
1220 : : flags | MAP_FIXED, fd, 0);
1221 [ - + ]: 100 : if (addr == MAP_FAILED || addr != prealloc_addr) {
1222 : 0 : EAL_LOG(ERR, "%s: mmap() failed: %s", __func__,
1223 : : strerror(errno));
1224 : 0 : munmap(prealloc_addr, mem_sz);
1225 : 0 : return -1;
1226 : : }
1227 : :
1228 : : /* we're in single-file segments mode, so only the segment list
1229 : : * fd needs to be set up.
1230 : : */
1231 [ + - ]: 100 : if (fd != -1) {
1232 [ - + ]: 100 : if (eal_memalloc_set_seg_list_fd(0, fd) < 0) {
1233 : 0 : EAL_LOG(ERR, "Cannot set up segment list fd");
1234 : : /* not a serious error, proceed */
1235 : : }
1236 : : }
1237 : :
1238 : 100 : eal_memseg_list_populate(msl, addr, n_segs);
1239 : :
1240 [ - + - - ]: 100 : if (mcfg->dma_maskbits &&
1241 : 0 : rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
1242 : 0 : EAL_LOG(ERR,
1243 : : "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.",
1244 : : __func__);
1245 [ # # # # ]: 0 : if (rte_eal_iova_mode() == RTE_IOVA_VA &&
1246 : 0 : rte_eal_using_phys_addrs())
1247 : 0 : EAL_LOG(ERR,
1248 : : "%s(): Please try initializing EAL with --iova-mode=pa parameter.",
1249 : : __func__);
1250 : 0 : goto fail;
1251 : : }
1252 : 100 : return 0;
1253 : : }
1254 : :
1255 : : /* calculate total number of hugepages available. at this point we haven't
1256 : : * yet started sorting them, so they are all counted on socket 0 */
1257 [ + + ]: 4 : for (i = 0; i < (int) internal_conf->num_hugepage_sizes; i++) {
1258 : : /* meanwhile, also initialize used_hp hugepage sizes in used_hp */
1259 : 2 : used_hp[i].hugepage_sz = internal_conf->hugepage_info[i].hugepage_sz;
1260 : :
1261 : 2 : nr_hugepages += internal_conf->hugepage_info[i].num_pages[0];
1262 : : }
1263 : :
1264 : : /*
1265 : : * allocate a memory area for hugepage table.
1266 : : * this isn't shared memory yet. because we still need to do some
1267 : : * processing on these pages, shared memory will be created
1268 : : * at a later stage.
1269 : : */
1270 : 2 : tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file));
1271 [ - + ]: 2 : if (tmp_hp == NULL)
1272 : 0 : goto fail;
1273 : :
1274 : : memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file));
1275 : :
1276 : : hp_offset = 0; /* where we start the current page size entries */
1277 : :
1278 : 2 : huge_register_sigbus();
1279 : :
1280 : : /* make a copy of numa_mem, needed for balanced allocation. */
1281 [ + + ]: 66 : for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
1282 : 64 : memory[i] = internal_conf->numa_mem[i];
1283 : :
1284 : : /* map all hugepages and sort them */
1285 [ + + ]: 4 : for (i = 0; i < (int)internal_conf->num_hugepage_sizes; i++) {
1286 : : unsigned pages_old, pages_new;
1287 : : struct hugepage_info *hpi;
1288 : :
1289 : : /*
1290 : : * we don't yet mark hugepages as used at this stage, so
1291 : : * we just map all hugepages available to the system;
1292 : : * all hugepages are still counted as being on socket 0
1293 : : */
1294 : 2 : hpi = &internal_conf->hugepage_info[i];
1295 : :
1296 [ - + ]: 2 : if (hpi->num_pages[0] == 0)
1297 : 0 : continue;
1298 : :
1299 : : /* map all hugepages available */
1300 : : pages_old = hpi->num_pages[0];
1301 : 2 : pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory);
1302 [ - + ]: 2 : if (pages_new < pages_old) {
1303 : 0 : EAL_LOG(DEBUG,
1304 : : "%d not %d hugepages of size %u MB allocated",
1305 : : pages_new, pages_old,
1306 : : (unsigned)(hpi->hugepage_sz / 0x100000));
1307 : :
1308 : 0 : int pages = pages_old - pages_new;
1309 : :
1310 : 0 : nr_hugepages -= pages;
1311 : 0 : hpi->num_pages[0] = pages_new;
1312 [ # # ]: 0 : if (pages_new == 0)
1313 : 0 : continue;
1314 : : }
1315 : :
1316 [ + - - + ]: 4 : if (rte_eal_using_phys_addrs() &&
1317 : 2 : rte_eal_iova_mode() != RTE_IOVA_VA) {
1318 : : /* find physical addresses for each hugepage */
1319 [ - + ]: 2 : if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
1320 : 0 : EAL_LOG(DEBUG, "Failed to find phys addr "
1321 : : "for %u MB pages",
1322 : : (unsigned int)(hpi->hugepage_sz / 0x100000));
1323 : 0 : goto fail;
1324 : : }
1325 : : } else {
1326 : : /* set physical addresses for each hugepage */
1327 : : if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
1328 : : EAL_LOG(DEBUG, "Failed to set phys addr "
1329 : : "for %u MB pages",
1330 : : (unsigned int)(hpi->hugepage_sz / 0x100000));
1331 : : goto fail;
1332 : : }
1333 : : }
1334 : :
1335 [ - + ]: 2 : if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
1336 : 0 : EAL_LOG(DEBUG, "Failed to find NUMA socket for %u MB pages",
1337 : : (unsigned)(hpi->hugepage_sz / 0x100000));
1338 : 0 : goto fail;
1339 : : }
1340 : :
1341 : 2 : qsort(&tmp_hp[hp_offset], hpi->num_pages[0],
1342 : : sizeof(struct hugepage_file), cmp_physaddr);
1343 : :
1344 : : /* we have processed a num of hugepages of this size, so inc offset */
1345 : 2 : hp_offset += hpi->num_pages[0];
1346 : : }
1347 : :
1348 : : huge_recover_sigbus();
1349 : :
1350 [ - + - - ]: 2 : if (internal_conf->memory == 0 && internal_conf->force_numa == 0)
1351 : 0 : internal_conf->memory = eal_get_hugepage_mem_size();
1352 : :
1353 : : nr_hugefiles = nr_hugepages;
1354 : :
1355 : :
1356 : : /* clean out the numbers of pages */
1357 [ + + ]: 4 : for (i = 0; i < (int) internal_conf->num_hugepage_sizes; i++)
1358 [ + + ]: 66 : for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
1359 : 64 : internal_conf->hugepage_info[i].num_pages[j] = 0;
1360 : :
1361 : : /* get hugepages for each socket */
1362 [ + + ]: 2048 : for (i = 0; i < nr_hugefiles; i++) {
1363 : 2046 : int socket = tmp_hp[i].socket_id;
1364 : :
1365 : : /* find a hugepage info with right size and increment num_pages */
1366 : 2046 : const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES,
1367 : : (int)internal_conf->num_hugepage_sizes);
1368 [ + + ]: 4092 : for (j = 0; j < nb_hpsizes; j++) {
1369 : 2046 : if (tmp_hp[i].size ==
1370 [ + - ]: 2046 : internal_conf->hugepage_info[j].hugepage_sz) {
1371 : 2046 : internal_conf->hugepage_info[j].num_pages[socket]++;
1372 : : }
1373 : : }
1374 : : }
1375 : :
1376 : : /* make a copy of numa_mem, needed for number of pages calculation */
1377 [ + + ]: 66 : for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
1378 : 64 : memory[i] = internal_conf->numa_mem[i];
1379 : :
1380 : : /* calculate final number of pages */
1381 : 2 : nr_hugepages = eal_dynmem_calc_num_pages_per_socket(memory,
1382 : 2 : internal_conf->hugepage_info, used_hp,
1383 : : internal_conf->num_hugepage_sizes);
1384 : :
1385 : : /* error if not enough memory available */
1386 [ - + ]: 2 : if (nr_hugepages < 0)
1387 : 0 : goto fail;
1388 : :
1389 : : /* reporting in! */
1390 [ + + ]: 4 : for (i = 0; i < (int) internal_conf->num_hugepage_sizes; i++) {
1391 [ + + ]: 66 : for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
1392 [ + + ]: 64 : if (used_hp[i].num_pages[j] > 0) {
1393 : 2 : EAL_LOG(DEBUG,
1394 : : "Requesting %u pages of size %uMB"
1395 : : " from socket %i",
1396 : : used_hp[i].num_pages[j],
1397 : : (unsigned)
1398 : : (used_hp[i].hugepage_sz / 0x100000),
1399 : : j);
1400 : : }
1401 : : }
1402 : : }
1403 : :
1404 : : /* create shared memory */
1405 : 2 : hugepage = create_shared_memory(eal_hugepage_data_path(),
1406 : : nr_hugefiles * sizeof(struct hugepage_file));
1407 : :
1408 [ - + ]: 2 : if (hugepage == NULL) {
1409 : 0 : EAL_LOG(ERR, "Failed to create shared memory!");
1410 : 0 : goto fail;
1411 : : }
1412 : : memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file));
1413 : :
1414 : : /*
1415 : : * unmap pages that we won't need (looks at used_hp).
1416 : : * also, sets orig_va to NULL on pages that were unmapped.
1417 : : */
1418 [ - + ]: 2 : if (unmap_unneeded_hugepages(tmp_hp, used_hp,
1419 : : internal_conf->num_hugepage_sizes) < 0) {
1420 : 0 : EAL_LOG(ERR, "Unmapping and locking hugepages failed!");
1421 : 0 : goto fail;
1422 : : }
1423 : :
1424 : : /*
1425 : : * copy stuff from malloc'd hugepage* to the actual shared memory.
1426 : : * this procedure only copies those hugepages that have orig_va
1427 : : * not NULL. has overflow protection.
1428 : : */
1429 [ - + ]: 2 : if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles,
1430 : : tmp_hp, nr_hugefiles) < 0) {
1431 : 0 : EAL_LOG(ERR, "Copying tables to shared memory failed!");
1432 : 0 : goto fail;
1433 : : }
1434 : :
1435 : : #ifndef RTE_ARCH_64
1436 : : /* for legacy 32-bit mode, we did not preallocate VA space, so do it */
1437 : : if (internal_conf->legacy_mem &&
1438 : : prealloc_segments(hugepage, nr_hugefiles)) {
1439 : : EAL_LOG(ERR, "Could not preallocate VA space for hugepages");
1440 : : goto fail;
1441 : : }
1442 : : #endif
1443 : :
1444 : : /* remap all pages we do need into memseg list VA space, so that those
1445 : : * pages become first-class citizens in DPDK memory subsystem
1446 : : */
1447 [ - + ]: 2 : if (remap_needed_hugepages(hugepage, nr_hugefiles)) {
1448 : 0 : EAL_LOG(ERR, "Couldn't remap hugepage files into memseg lists");
1449 : 0 : goto fail;
1450 : : }
1451 : :
1452 : : /* free the hugepage backing files */
1453 [ - + - - ]: 2 : if (internal_conf->hugepage_file.unlink_before_mapping &&
1454 : 0 : unlink_hugepage_files(tmp_hp, internal_conf->num_hugepage_sizes) < 0) {
1455 : 0 : EAL_LOG(ERR, "Unlinking hugepage files failed!");
1456 : 0 : goto fail;
1457 : : }
1458 : :
1459 : : /* free the temporary hugepage table */
1460 : 2 : free(tmp_hp);
1461 : : tmp_hp = NULL;
1462 : :
1463 : 2 : munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
1464 : : hugepage = NULL;
1465 : :
1466 : : /* we're not going to allocate more pages, so release VA space for
1467 : : * unused memseg lists
1468 : : */
1469 [ + + ]: 258 : for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
1470 : : struct rte_memseg_list *msl = &mcfg->memsegs[i];
1471 : : size_t mem_sz;
1472 : :
1473 : : /* skip inactive lists */
1474 [ + + ]: 256 : if (msl->base_va == NULL)
1475 : 240 : continue;
1476 : : /* skip lists where there is at least one page allocated */
1477 [ + + ]: 16 : if (msl->memseg_arr.count > 0)
1478 : 2 : continue;
1479 : : /* this is an unused list, deallocate it */
1480 : 14 : mem_sz = msl->len;
1481 : 14 : munmap(msl->base_va, mem_sz);
1482 : 14 : msl->base_va = NULL;
1483 : 14 : msl->len = 0;
1484 : 14 : msl->heap = 0;
1485 : :
1486 : : /* destroy backing fbarray */
1487 : 14 : rte_fbarray_destroy(&msl->memseg_arr);
1488 : : }
1489 : :
1490 [ - + - - ]: 2 : if (mcfg->dma_maskbits &&
1491 : 0 : rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
1492 : 0 : EAL_LOG(ERR,
1493 : : "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.",
1494 : : __func__);
1495 : 0 : goto fail;
1496 : : }
1497 : :
1498 : : return 0;
1499 : :
1500 [ # # ]: 0 : fail:
1501 : : huge_recover_sigbus();
1502 : 0 : free(tmp_hp);
1503 [ # # ]: 0 : if (hugepage != NULL)
1504 : 0 : munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
1505 : :
1506 : : return -1;
1507 : : }
1508 : :
1509 : : /*
1510 : : * uses fstat to report the size of a file on disk
1511 : : */
1512 : : static off_t
1513 : : getFileSize(int fd)
1514 : : {
1515 : : struct stat st;
1516 [ # # ]: 0 : if (fstat(fd, &st) < 0)
1517 : : return 0;
1518 : 0 : return st.st_size;
1519 : : }
1520 : :
1521 : : /*
1522 : : * This creates the memory mappings in the secondary process to match those of
1523 : : * the primary process. It goes through each memory segment in the DPDK runtime
1524 : : * configuration and finds the hugepages which form that segment, mapping them
1525 : : * in order to form a contiguous block in the virtual memory space.
1526 : : */
1527 : : static int
1528 : 1 : eal_legacy_hugepage_attach(void)
1529 : : {
1530 : 1 : struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1531 : : struct hugepage_file *hp = NULL;
1532 : : unsigned int num_hp = 0;
1533 : : unsigned int i = 0;
1534 : : unsigned int cur_seg;
1535 : : off_t size = 0;
1536 : : int fd, fd_hugepage = -1;
1537 : :
1538 [ + - ]: 1 : if (aslr_enabled() > 0) {
1539 : 1 : EAL_LOG(WARNING, "WARNING: Address Space Layout Randomization "
1540 : : "(ASLR) is enabled in the kernel.");
1541 : 1 : EAL_LOG(WARNING, " This may cause issues with mapping memory "
1542 : : "into secondary processes");
1543 : : }
1544 : :
1545 : 1 : fd_hugepage = open(eal_hugepage_data_path(), O_RDONLY);
1546 [ + - ]: 1 : if (fd_hugepage < 0) {
1547 : 1 : EAL_LOG(ERR, "Could not open %s",
1548 : : eal_hugepage_data_path());
1549 : 1 : goto error;
1550 : : }
1551 : :
1552 : : size = getFileSize(fd_hugepage);
1553 : 0 : hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0);
1554 [ # # ]: 0 : if (hp == MAP_FAILED) {
1555 : 0 : EAL_LOG(ERR, "Could not mmap %s",
1556 : : eal_hugepage_data_path());
1557 : 0 : goto error;
1558 : : }
1559 : :
1560 : 0 : num_hp = size / sizeof(struct hugepage_file);
1561 : 0 : EAL_LOG(DEBUG, "Analysing %u files", num_hp);
1562 : :
1563 : : /* map all segments into memory to make sure we get the addrs. the
1564 : : * segments themselves are already in memseg list (which is shared and
1565 : : * has its VA space already preallocated), so we just need to map
1566 : : * everything into correct addresses.
1567 : : */
1568 [ # # ]: 0 : for (i = 0; i < num_hp; i++) {
1569 : 0 : struct hugepage_file *hf = &hp[i];
1570 : 0 : size_t map_sz = hf->size;
1571 : 0 : void *map_addr = hf->final_va;
1572 : : int msl_idx, ms_idx;
1573 : : struct rte_memseg_list *msl;
1574 : : struct rte_memseg *ms;
1575 : :
1576 : : /* if size is zero, no more pages left */
1577 [ # # ]: 0 : if (map_sz == 0)
1578 : : break;
1579 : :
1580 : 0 : fd = open(hf->filepath, O_RDWR);
1581 [ # # ]: 0 : if (fd < 0) {
1582 : 0 : EAL_LOG(ERR, "Could not open %s: %s",
1583 : : hf->filepath, strerror(errno));
1584 : 0 : goto error;
1585 : : }
1586 : :
1587 : 0 : map_addr = mmap(map_addr, map_sz, PROT_READ | PROT_WRITE,
1588 : : MAP_SHARED | MAP_FIXED, fd, 0);
1589 [ # # ]: 0 : if (map_addr == MAP_FAILED) {
1590 : 0 : EAL_LOG(ERR, "Could not map %s: %s",
1591 : : hf->filepath, strerror(errno));
1592 : 0 : goto fd_error;
1593 : : }
1594 : :
1595 : : /* set shared lock on the file. */
1596 [ # # ]: 0 : if (flock(fd, LOCK_SH) < 0) {
1597 : 0 : EAL_LOG(DEBUG, "%s(): Locking file failed: %s",
1598 : : __func__, strerror(errno));
1599 : 0 : goto mmap_error;
1600 : : }
1601 : :
1602 : : /* find segment data */
1603 : 0 : msl = rte_mem_virt2memseg_list(map_addr);
1604 [ # # ]: 0 : if (msl == NULL) {
1605 : 0 : EAL_LOG(DEBUG, "%s(): Cannot find memseg list",
1606 : : __func__);
1607 : 0 : goto mmap_error;
1608 : : }
1609 : 0 : ms = rte_mem_virt2memseg(map_addr, msl);
1610 [ # # ]: 0 : if (ms == NULL) {
1611 : 0 : EAL_LOG(DEBUG, "%s(): Cannot find memseg",
1612 : : __func__);
1613 : 0 : goto mmap_error;
1614 : : }
1615 : :
1616 : 0 : msl_idx = msl - mcfg->memsegs;
1617 : 0 : ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
1618 [ # # ]: 0 : if (ms_idx < 0) {
1619 : 0 : EAL_LOG(DEBUG, "%s(): Cannot find memseg idx",
1620 : : __func__);
1621 : 0 : goto mmap_error;
1622 : : }
1623 : :
1624 : : /* store segment fd internally */
1625 [ # # ]: 0 : if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
1626 : 0 : EAL_LOG(ERR, "Could not store segment fd: %s",
1627 : : rte_strerror(rte_errno));
1628 : : }
1629 : : /* unmap the hugepage config file, since we are done using it */
1630 : 0 : munmap(hp, size);
1631 : 0 : close(fd_hugepage);
1632 : 0 : return 0;
1633 : :
1634 : 0 : mmap_error:
1635 : 0 : munmap(hp[i].final_va, hp[i].size);
1636 : 0 : fd_error:
1637 : 0 : close(fd);
1638 : 1 : error:
1639 : : /* unwind mmap's done so far */
1640 [ - + ]: 1 : for (cur_seg = 0; cur_seg < i; cur_seg++)
1641 : 0 : munmap(hp[cur_seg].final_va, hp[cur_seg].size);
1642 : :
1643 [ - + ]: 1 : if (hp != NULL && hp != MAP_FAILED)
1644 : 0 : munmap(hp, size);
1645 [ - + ]: 1 : if (fd_hugepage >= 0)
1646 : 0 : close(fd_hugepage);
1647 : : return -1;
1648 : : }
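The per-page loop above reduces to a small pattern: reopen each hugepage file recorded by the primary, map it with MAP_FIXED at the virtual address the primary stored (so shared pointers stay valid in the secondary), and take a shared flock to mark the page as in use. Below is a minimal, hedged sketch of that pattern only; remap_one_page, page_path and want_va are illustrative names, not DPDK API.

#include <fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <unistd.h>

/* Sketch: map one hugepage file at the exact address recorded by the primary.
 * Returns the open fd on success (kept open so the lock persists), -1 on error.
 */
static int remap_one_page(const char *page_path, void *want_va, size_t sz)
{
	void *va;
	int fd = open(page_path, O_RDWR);

	if (fd < 0)
		return -1;
	/* MAP_FIXED forces the mapping to land at the primary's address */
	va = mmap(want_va, sz, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_FIXED, fd, 0);
	if (va == MAP_FAILED) {
		close(fd);
		return -1;
	}
	/* shared lock marks the page as in use by this process */
	if (flock(fd, LOCK_SH) < 0) {
		munmap(va, sz);
		close(fd);
		return -1;
	}
	return fd;
}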
1649 : :
1650 : : static int
1651 : 26 : eal_hugepage_attach(void)
1652 : : {
1653 [ + + ]: 26 : if (eal_memalloc_sync_with_primary()) {
1654 : 1 : EAL_LOG(ERR, "Could not map memory from primary process");
1655 [ + - ]: 1 : if (aslr_enabled() > 0)
1656 : 1 : EAL_LOG(ERR, "It is recommended to disable ASLR in the kernel and retry running both primary and secondary processes");
1657 : 1 : return -1;
1658 : : }
1659 : : return 0;
1660 : : }
1661 : :
1662 : : int
1663 : 157 : rte_eal_hugepage_init(void)
1664 : : {
1665 : : const struct internal_config *internal_conf =
1666 : 157 : eal_get_internal_configuration();
1667 : :
1668 : 157 : return internal_conf->legacy_mem ?
1669 [ + + ]: 157 : eal_legacy_hugepage_init() :
1670 : 55 : eal_dynmem_hugepage_init();
1671 : : }
1672 : :
1673 : : int
1674 : 27 : rte_eal_hugepage_attach(void)
1675 : : {
1676 : : const struct internal_config *internal_conf =
1677 : 27 : eal_get_internal_configuration();
1678 : :
1679 : 27 : return internal_conf->legacy_mem ?
1680 [ + + ]: 27 : eal_legacy_hugepage_attach() :
1681 : 26 : eal_hugepage_attach();
1682 : : }
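Neither wrapper above is called directly by applications; the common EAL memory initialization picks one based on the process role. A hedged sketch of that decision follows (the real call site lives in the common EAL code, not in this file):

#include <rte_eal.h>
#include "eal_hugepages.h"

/* Sketch: the primary process creates and maps hugepage memory, while
 * secondary processes attach to what the primary already set up.
 */
static int memory_init_sketch(void)
{
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return rte_eal_hugepage_init();
	return rte_eal_hugepage_attach();
}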
1683 : :
1684 : : RTE_EXPORT_SYMBOL(rte_eal_using_phys_addrs)
1685 : : int
1686 : 188 : rte_eal_using_phys_addrs(void)
1687 : : {
1688 [ + + ]: 188 : if (phys_addrs_available == -1) {
1689 : 186 : uint64_t tmp = 0;
1690 : :
1691 [ + + + - ]: 272 : if (rte_eal_has_hugepages() != 0 &&
1692 : 86 : rte_mem_virt2phy(&tmp) != RTE_BAD_PHYS_ADDR)
1693 : 86 : phys_addrs_available = 1;
1694 : : else
1695 : 100 : phys_addrs_available = 0;
1696 : : }
1697 : 188 : return phys_addrs_available;
1698 : : }
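As a usage illustration (not part of this file), a caller can consult rte_eal_using_phys_addrs() when choosing an IOVA mode: if physical addresses cannot be resolved (for example, /proc/self/pagemap is not readable in an unprivileged process), IOVA-as-VA is the only workable choice. A hedged sketch:

#include <rte_eal.h>
#include <rte_memory.h>

/* Sketch: prefer physical-address IOVAs only when the EAL can actually
 * resolve them, otherwise fall back to IOVA-as-VA.
 */
static enum rte_iova_mode pick_iova_mode_sketch(void)
{
	return rte_eal_using_phys_addrs() ? RTE_IOVA_PA : RTE_IOVA_VA;
}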
1699 : :
1700 : : static int __rte_unused
1701 : : memseg_primary_init_32(void)
1702 : : {
1703 : : struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1704 : : int active_sockets, hpi_idx, msl_idx = 0;
1705 : : unsigned int socket_id, i;
1706 : : struct rte_memseg_list *msl;
1707 : : uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem;
1708 : : uint64_t max_mem;
1709 : : struct internal_config *internal_conf =
1710 : : eal_get_internal_configuration();
1711 : :
1712 : : /* no-huge does not need this at all */
1713 : : if (internal_conf->no_hugetlbfs)
1714 : : return 0;
1715 : :
1716 : : /* this is a giant hack, but desperate times call for desperate
1717 : : * measures. in legacy 32-bit mode, we cannot preallocate VA space,
1718 : : * because having upwards of 2 gigabytes of VA space already mapped will
1719 : : * interfere with our ability to map and sort hugepages.
1720 : : *
1721 : : * therefore, in legacy 32-bit mode, we will be initializing memseg
1722 : : * lists much later - in eal_memory.c, right after we unmap all the
1723 : : * unneeded pages. this will not affect secondary processes, as those
1724 : : * should be able to mmap the space without (too many) problems.
1725 : : */
1726 : : if (internal_conf->legacy_mem)
1727 : : return 0;
1728 : :
1729 : :         /* 32-bit mode is a very special case. we cannot know in advance where
1730 : :          * the user will want to allocate their memory, so we have to rely on
1731 : :          * heuristics.
1732 : :          */
1733 : : active_sockets = 0;
1734 : : total_requested_mem = 0;
1735 : : if (internal_conf->force_numa)
1736 : : for (i = 0; i < rte_socket_count(); i++) {
1737 : : uint64_t mem;
1738 : :
1739 : : socket_id = rte_socket_id_by_idx(i);
1740 : : mem = internal_conf->numa_mem[socket_id];
1741 : :
1742 : : if (mem == 0)
1743 : : continue;
1744 : :
1745 : : active_sockets++;
1746 : : total_requested_mem += mem;
1747 : : }
1748 : : else
1749 : : total_requested_mem = internal_conf->memory;
1750 : :
1751 : : max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
1752 : : if (total_requested_mem > max_mem) {
1753 : : EAL_LOG(ERR, "Invalid parameters: 32-bit process can at most use %uM of memory",
1754 : : (unsigned int)(max_mem >> 20));
1755 : : return -1;
1756 : : }
1757 : : total_extra_mem = max_mem - total_requested_mem;
1758 : : extra_mem_per_socket = active_sockets == 0 ? total_extra_mem :
1759 : : total_extra_mem / active_sockets;
1760 : :
1761 : :         /* the allocation logic is somewhat convoluted; in a nutshell:
1762 : :          * - if the user hasn't specified which sockets to allocate memory on
1763 : :          *   via --socket-mem, we allocate all memory on the main lcore's socket.
1764 : :          * - if the user has specified sockets to allocate memory on, there may
1765 : :          *   be some "unused" memory left (e.g. the --socket-mem amounts may not
1766 : :          *   add up to the maximum usable memory), so spread the remainder
1767 : :          *   equally across all sockets that are in use.
1768 : :          *
1769 : :          * page sizes are sorted in descending order, so we can safely assume
1770 : :          * that we allocate from bigger page sizes first; a worked example
1771 : :          * follows this function.
1772 : :          */
1773 : :
1774 : : /* create memseg lists */
1775 : : for (i = 0; i < rte_socket_count(); i++) {
1776 : : int hp_sizes = (int) internal_conf->num_hugepage_sizes;
1777 : : uint64_t max_socket_mem, cur_socket_mem;
1778 : : unsigned int main_lcore_socket;
1779 : : struct rte_config *cfg = rte_eal_get_configuration();
1780 : : bool skip;
1781 : : int ret;
1782 : :
1783 : : ret = rte_socket_id_by_idx(i);
1784 : : if (ret == -1) {
1785 : : EAL_LOG(ERR, "Cannot get socket ID for socket index %u", i);
1786 : : return -1;
1787 : : }
1788 : : socket_id = (unsigned int)ret;
1789 : :
1790 : : #ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
1791 : : /* we can still sort pages by socket in legacy mode */
1792 : : if (!internal_conf->legacy_mem && socket_id > 0)
1793 : : break;
1794 : : #endif
1795 : :
1796 : : /* if we didn't specifically request memory on this socket */
1797 : : skip = active_sockets != 0 &&
1798 : : internal_conf->numa_mem[socket_id] == 0;
1799 : :                 /* ...or if we didn't specifically request memory on *any*
1800 : :                  * socket, and this is not the main lcore's socket
1801 : :                  */
1802 : : main_lcore_socket = rte_lcore_to_socket_id(cfg->main_lcore);
1803 : : skip |= active_sockets == 0 && socket_id != main_lcore_socket;
1804 : :
1805 : : if (skip) {
1806 : : EAL_LOG(DEBUG, "Will not preallocate memory on socket %u",
1807 : : socket_id);
1808 : : continue;
1809 : : }
1810 : :
1811 : : /* max amount of memory on this socket */
1812 : : max_socket_mem = (active_sockets != 0 ?
1813 : : internal_conf->numa_mem[socket_id] :
1814 : : internal_conf->memory) +
1815 : : extra_mem_per_socket;
1816 : : cur_socket_mem = 0;
1817 : :
1818 : : for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) {
1819 : : uint64_t max_pagesz_mem, cur_pagesz_mem = 0;
1820 : : uint64_t hugepage_sz;
1821 : : struct hugepage_info *hpi;
1822 : : int type_msl_idx, max_segs, total_segs = 0;
1823 : :
1824 : : hpi = &internal_conf->hugepage_info[hpi_idx];
1825 : : hugepage_sz = hpi->hugepage_sz;
1826 : :
1827 : : /* check if pages are actually available */
1828 : : if (hpi->num_pages[socket_id] == 0)
1829 : : continue;
1830 : :
1831 : : max_segs = RTE_MAX_MEMSEG_PER_TYPE;
1832 : : max_pagesz_mem = max_socket_mem - cur_socket_mem;
1833 : :
1834 : :                         /* make it a multiple of the page size */
1835 : : max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem,
1836 : : hugepage_sz);
1837 : :
1838 : : EAL_LOG(DEBUG, "Attempting to preallocate "
1839 : : "%" PRIu64 "M on socket %i",
1840 : : max_pagesz_mem >> 20, socket_id);
1841 : :
1842 : : type_msl_idx = 0;
1843 : : while (cur_pagesz_mem < max_pagesz_mem &&
1844 : : total_segs < max_segs) {
1845 : : uint64_t cur_mem;
1846 : : unsigned int n_segs;
1847 : :
1848 : : if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
1849 : : EAL_LOG(ERR,
1850 : : "No more space in memseg lists, please increase RTE_MAX_MEMSEG_LISTS");
1851 : : return -1;
1852 : : }
1853 : :
1854 : : msl = &mcfg->memsegs[msl_idx];
1855 : :
1856 : : cur_mem = get_mem_amount(hugepage_sz,
1857 : : max_pagesz_mem);
1858 : : n_segs = cur_mem / hugepage_sz;
1859 : :
1860 : : if (eal_memseg_list_init(msl, hugepage_sz,
1861 : : n_segs, socket_id, type_msl_idx,
1862 : : true)) {
1863 : : /* failing to allocate a memseg list is
1864 : : * a serious error.
1865 : : */
1866 : : EAL_LOG(ERR, "Cannot allocate memseg list");
1867 : : return -1;
1868 : : }
1869 : :
1870 : : if (eal_memseg_list_alloc(msl, 0)) {
1871 : : /* if we couldn't allocate VA space, we
1872 : : * can try with smaller page sizes.
1873 : : */
1874 : : EAL_LOG(ERR, "Cannot allocate VA space for memseg list, retrying with different page size");
1875 : : /* deallocate memseg list */
1876 : : if (memseg_list_free(msl))
1877 : : return -1;
1878 : : break;
1879 : : }
1880 : :
1881 : : total_segs += msl->memseg_arr.len;
1882 : : cur_pagesz_mem = total_segs * hugepage_sz;
1883 : : type_msl_idx++;
1884 : : msl_idx++;
1885 : : }
1886 : : cur_socket_mem += cur_pagesz_mem;
1887 : : }
1888 : : if (cur_socket_mem == 0) {
1889 : : EAL_LOG(ERR, "Cannot allocate VA space on socket %u",
1890 : : socket_id);
1891 : : return -1;
1892 : : }
1893 : : }
1894 : :
1895 : : return 0;
1896 : : }
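To make the 32-bit sizing heuristic above concrete, here is a worked example; the RTE_MAX_MEM_MB value of 2048 is an assumption typical of 32-bit builds (the actual value is set at build time).

/*
 * Worked example, assuming RTE_MAX_MEM_MB == 2048: with --socket-mem=512,256
 * there are two active sockets and total_requested_mem = 768M, so
 * total_extra_mem = 2048M - 768M = 1280M and extra_mem_per_socket = 640M.
 * Socket 0 may then preallocate up to 512M + 640M = 1152M of VA space and
 * socket 1 up to 256M + 640M = 896M, spread across the available hugepage
 * sizes, largest first, within the per-type segment limits.
 */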
1897 : :
1898 : : static int __rte_unused
1899 : : memseg_primary_init(void)
1900 : : {
1901 : 157 : return eal_dynmem_memseg_lists_init();
1902 : : }
1903 : :
1904 : : static int
1905 : 27 : memseg_secondary_init(void)
1906 : : {
1907 : 27 : struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1908 : : int msl_idx = 0;
1909 : : struct rte_memseg_list *msl;
1910 : :
1911 [ + + ]: 3483 : for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
1912 : :
1913 : 3456 : msl = &mcfg->memsegs[msl_idx];
1914 : :
1915 : : /* skip empty and external memseg lists */
1916 [ + + - + ]: 3456 : if (msl->memseg_arr.len == 0 || msl->external)
1917 : 3247 : continue;
1918 : :
1919 [ - + ]: 209 : if (rte_fbarray_attach(&msl->memseg_arr)) {
1920 : 0 : EAL_LOG(ERR, "Cannot attach to primary process memseg lists");
1921 : 0 : return -1;
1922 : : }
1923 : :
1924 : : /* preallocate VA space */
1925 [ - + ]: 209 : if (eal_memseg_list_alloc(msl, 0)) {
1926 : 0 : EAL_LOG(ERR, "Cannot preallocate VA space for hugepage memory");
1927 : 0 : return -1;
1928 : : }
1929 : : }
1930 : :
1931 : : return 0;
1932 : : }
1933 : :
1934 : : int
1935 : 184 : rte_eal_memseg_init(void)
1936 : : {
1937 : : /* increase rlimit to maximum */
1938 : : struct rlimit lim;
1939 : :
1940 : : #ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
1941 : : const struct internal_config *internal_conf =
1942 : : eal_get_internal_configuration();
1943 : : #endif
1944 [ + - ]: 184 : if (getrlimit(RLIMIT_NOFILE, &lim) == 0) {
1945 : : /* set limit to maximum */
1946 : 184 : lim.rlim_cur = lim.rlim_max;
1947 : :
1948 [ - + ]: 184 : if (setrlimit(RLIMIT_NOFILE, &lim) < 0) {
1949 : 0 : EAL_LOG(DEBUG, "Setting maximum number of open files failed: %s",
1950 : : strerror(errno));
1951 : : } else {
1952 : 184 : EAL_LOG(DEBUG, "Setting maximum number of open files to %"
1953 : : PRIu64,
1954 : : (uint64_t)lim.rlim_cur);
1955 : : }
1956 : : } else {
1957 : 0 : EAL_LOG(ERR, "Cannot get current resource limits");
1958 : : }
1959 : : #ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
1960 : : if (!internal_conf->legacy_mem && rte_socket_count() > 1) {
1961 : : EAL_LOG(WARNING, "DPDK is running on a NUMA system, but is compiled without NUMA support.");
1962 : : EAL_LOG(WARNING, "This will have adverse consequences for performance and usability.");
1963 : : EAL_LOG(WARNING, "Please use --"OPT_LEGACY_MEM" option, or recompile with NUMA support.");
1964 : : }
1965 : : #endif
1966 : :
1967 : 184 : return rte_eal_process_type() == RTE_PROC_PRIMARY ?
1968 : : #ifndef RTE_ARCH_64
1969 : : memseg_primary_init_32() :
1970 : : #else
1971 [ + + ]: 184 : memseg_primary_init() :
1972 : : #endif
1973 : 27 : memseg_secondary_init();
1974 : : }
|