http://tomoyo.sourceforge.jp/cgi-bin/lxr/source/mm/page_alloc.c#L1435/* 1432 * This is the 'heart' of the zoned buddy allocator. 1433 */ 1434 struct page * 1435 __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1436 struct zonelist *zonelist, nodemask_t *nodemask) 1437 { 1438 const gfp_t wait = gfp_mask & __GFP_WAIT; 1439 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1440 struct zoneref *z; 1441 struct zone *zone; 1442 struct page *page; 1443 struct reclaim_state reclaim_state; 1444 struct task_struct *p = current; 1445 int do_retry; 1446 int alloc_flags; 1447 unsigned long did_some_progress; 1448 unsigned long pages_reclaimed = 0; 1449 1450 might_sleep_if(wait); 1451 1452 if (should_fail_alloc_page(gfp_mask, order)) 1453 return NULL; 1454 1455 restart: 1456 z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ 1457 1458 if (unlikely(!z->zone)) { 1459 /* 1460 * Happens if we have an empty zonelist as a result of 1461 * GFP_THISNODE being used on a memoryless node 1462 */ 1463 return NULL; 1464 } 1465 1466 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 1467 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1468 if (page) 1469 goto got_pg; 1470 1471 /* 1472 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1473 * __GFP_NOWARN set) should not cause reclaim since the subsystem 1474 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 1475 * using a larger set of nodes after it has established that the 1476 * allowed per node queues are empty and that nodes are 1477 * over allocated. 1478 */ 1479 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1480 goto nopage; 1481 1482 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 1483 wakeup_kswapd(zone, order); 1484 1485 /* 1486 * OK, we're below the kswapd watermark and have kicked background 1487 * reclaim. Now things get more complex, so set up alloc_flags according 1488 * to how we want to proceed. 1489 * 1490 * The caller may dip into page reserves a bit more if the caller 1491 * cannot run direct reclaim, or if the caller has realtime scheduling 1492 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 1493 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 1494 */ 1495 alloc_flags = ALLOC_WMARK_MIN; 1496 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) 1497 alloc_flags |= ALLOC_HARDER; 1498 if (gfp_mask & __GFP_HIGH) 1499 alloc_flags |= ALLOC_HIGH; 1500 if (wait) 1501 alloc_flags |= ALLOC_CPUSET; 1502 1503 /* 1504 * Go through the zonelist again. Let __GFP_HIGH and allocations 1505 * coming from realtime tasks go deeper into reserves. 1506 * 1507 * This is the last chance, in general, before the goto nopage. 1508 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 1509 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1510 */ 1511 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 1512 high_zoneidx, alloc_flags); 1513 if (page) 1514 goto got_pg; 1515 1516 /* This allocation should allow future memory freeing. */ 1517 1518 rebalance: 1519 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1520 && !in_interrupt()) { 1521 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1522 nofail_alloc: 1523 /* go through the zonelist yet again, ignoring mins */ 1524 page = get_page_from_freelist(gfp_mask, nodemask, order, 1525 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); 1526 if (page) 1527 goto got_pg; 1528 if (gfp_mask & __GFP_NOFAIL) { 1529 congestion_wait(WRITE, HZ/50); 1530 goto nofail_alloc; 1531 } 1532 } 1533 goto nopage; 1534 } 1535 1536 /* Atomic allocations - we can't balance anything */ 1537 if (!wait) 1538 goto nopage; 1539 1540 cond_resched(); 1541 1542 /* We now go into synchronous reclaim */ 1543 cpuset_memory_pressure_bump(); 1544 p->flags |= PF_MEMALLOC; 1545 reclaim_state.reclaimed_slab = 0; 1546 p->reclaim_state = &reclaim_state; 1547 1548 did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); 1549 1550 p->reclaim_state = NULL; 1551 p->flags &= ~PF_MEMALLOC; 1552 1553 cond_resched(); 1554 1555 if (order != 0) 1556 drain_all_pages(); 1557 1558 if (likely(did_some_progress)) { 1559 page = get_page_from_freelist(gfp_mask, nodemask, order, 1560 zonelist, high_zoneidx, alloc_flags); 1561 if (page) 1562 goto got_pg; 1563 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 1564 if (!try_set_zone_oom(zonelist, gfp_mask)) { 1565 schedule_timeout_uninterruptible(1); 1566 goto restart; 1567 } 1568 1569 /* 1570 * Go through the zonelist yet one more time, keep 1571 * very high watermark here, this is only to catch 1572 * a parallel oom killing, we must fail if we're still 1573 * under heavy pressure. 1574 */ 1575 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 1576 order, zonelist, high_zoneidx, 1577 ALLOC_WMARK_HIGH|ALLOC_CPUSET); 1578 if (page) { 1579 clear_zonelist_oom(zonelist, gfp_mask); 1580 goto got_pg; 1581 } 1582 1583 /* The OOM killer will not help higher order allocs so fail */ 1584 if (order > PAGE_ALLOC_COSTLY_ORDER) { 1585 clear_zonelist_oom(zonelist, gfp_mask); 1586 goto nopage; 1587 } 1588 1589 out_of_memory(zonelist, gfp_mask, order); 1590 clear_zonelist_oom(zonelist, gfp_mask); 1591 goto restart; 1592 } 1593 1594 /* 1595 * Don't let big-order allocations loop unless the caller explicitly 1596 * requests that. Wait for some write requests to complete then retry. 1597 * 1598 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 1599 * means __GFP_NOFAIL, but that may not be true in other 1600 * implementations. 1601 * 1602 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 1603 * specified, then we retry until we no longer reclaim any pages 1604 * (above), or we've reclaimed an order of pages at least as 1605 * large as the allocation's order. In both cases, if the 1606 * allocation still fails, we stop retrying. 1607 */ 1608 pages_reclaimed += did_some_progress; 1609 do_retry = 0; 1610 if (!(gfp_mask & __GFP_NORETRY)) { 1611 if (order <= PAGE_ALLOC_COSTLY_ORDER) { 1612 do_retry = 1; 1613 } else { 1614 if (gfp_mask & __GFP_REPEAT && 1615 pages_reclaimed < (1 << order)) 1616 do_retry = 1; 1617 } 1618 if (gfp_mask & __GFP_NOFAIL) 1619 do_retry = 1; 1620 } 1621 if (do_retry) { 1622 congestion_wait(WRITE, HZ/50); 1623 goto rebalance; 1624 } 1625 1626 nopage: 1627 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 1628 printk(KERN_WARNING "%s: page allocation failure." 1629 " order:%d, mode:0x%x\n", 1630 p->comm, order, gfp_mask); 1631 dump_stack(); 1632 show_mem(); 1633 } 1634 got_pg: 1635 return page; 1636 } 1637 EXPORT_SYMBOL(__alloc_pages_internal); 1638 1355 /* 1356 * get_page_from_freelist goes through the zonelist trying to allocate 1357 * a page. 1358 */ 1359 static struct page * 1360 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1361 struct zonelist *zonelist, int high_zoneidx, int alloc_flags) 1362 { 1363 struct zoneref *z; 1364 struct page *page = NULL; 1365 int classzone_idx; 1366 struct zone *zone, *preferred_zone; 1367 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1368 int zlc_active = 0; /* set if using zonelist_cache */ 1369 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1370 1371 (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, 1372 &preferred_zone); 1373 if (!preferred_zone) 1374 return NULL; 1375 1376 classzone_idx = zone_idx(preferred_zone); 1377 1378 zonelist_scan: 1379 /* 1380 * Scan zonelist, looking for a zone with enough free. 1381 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1382 */ 1383 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1384 high_zoneidx, nodemask) { 1385 if (NUMA_BUILD && zlc_active && 1386 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1387 continue; 1388 if ((alloc_flags & ALLOC_CPUSET) && 1389 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1390 goto try_next_zone; 1391 1392 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1393 unsigned long mark; 1394 if (alloc_flags & ALLOC_WMARK_MIN) 1395 mark = zone->pages_min; 1396 else if (alloc_flags & ALLOC_WMARK_LOW) 1397 mark = zone->pages_low; 1398 else 1399 mark = zone->pages_high; 1400 if (!zone_watermark_ok(zone, order, mark, 1401 classzone_idx, alloc_flags)) { 1402 if (!zone_reclaim_mode || 1403 !zone_reclaim(zone, gfp_mask, order)) 1404 goto this_zone_full; 1405 } 1406 } 1407 1408 page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); 1409 if (page) 1410 break; 1411 this_zone_full: 1412 if (NUMA_BUILD) 1413 zlc_mark_zone_full(zonelist, z); 1414 try_next_zone: 1415 if (NUMA_BUILD && !did_zlc_setup) { 1416 /* we do zlc_setup after the first zone is tried */ 1417 allowednodes = zlc_setup(zonelist, alloc_flags); 1418 zlc_active = 1; 1419 did_zlc_setup = 1; 1420 } 1421 } 1422 1423 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1424 /* Disable zlc cache for second zonelist scan */ 1425 zlc_active = 0; 1426 goto zonelist_scan; 1427 } 1428 return page; 1429 } 1430 |
