/*
1432  * This is the 'heart' of the zoned buddy allocator.
      *
      * __alloc_pages_internal() tries to obtain a page from @zonelist through a
      * series of increasingly aggressive calls to get_page_from_freelist().
      *
      * @gfp_mask: allocation flags. The bits tested directly in this function
      *            are __GFP_WAIT, __GFP_HIGH, __GFP_NOMEMALLOC, __GFP_NOFAIL,
      *            __GFP_NORETRY, __GFP_REPEAT, __GFP_FS, __GFP_NOWARN and the
      *            GFP_THISNODE combination.
      * @order:    requested order; forwarded to every helper, compared against
      *            PAGE_ALLOC_COSTLY_ORDER, and used as (1 << order) when
      *            deciding whether enough pages have been reclaimed.
      * @zonelist: zones to allocate from; an empty list yields NULL.
      * @nodemask: forwarded unchanged to get_page_from_freelist().
      *
      * Sequence, as implemented below:
      *   1. should_fail_alloc_page() may reject the request outright.
      *   2. First attempt with ALLOC_WMARK_LOW|ALLOC_CPUSET.
      *   3. GFP_THISNODE requests (on NUMA builds) give up here; everyone
      *      else wakes kswapd and makes a second attempt with
      *      ALLOC_WMARK_MIN plus ALLOC_HARDER/ALLOC_HIGH/ALLOC_CPUSET as
      *      applicable.
      *   4. PF_MEMALLOC / TIF_MEMDIE tasks outside interrupt context get an
      *      ALLOC_NO_WATERMARKS attempt (repeated forever for __GFP_NOFAIL)
      *      and never enter direct reclaim.
      *   5. Callers without __GFP_WAIT fail at this point.
      *   6. Direct reclaim through try_to_free_pages(), then a retry; if
      *      reclaim made no progress the OOM path may run and restart the
      *      whole sequence.
      *   7. Depending on @order and the retry flags, loop back to step 4.
      *
      * Returns the page obtained from get_page_from_freelist(), or NULL. On
      * failure a rate-limited warning with a stack dump and show_mem() output
      * is printed unless __GFP_NOWARN is set.
1433  */
1434 struct page *
1435 __alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
1436                         struct zonelist *zonelist, nodemask_t *nodemask)
1437 {
             /*
              * Non-zero when __GFP_WAIT is set in gfp_mask; gates
              * might_sleep_if(), the ALLOC_HARDER/ALLOC_CPUSET choice and
              * entry into direct reclaim.
              */
1438         const gfp_t wait = gfp_mask & __GFP_WAIT;
             /* Zone index derived from gfp_mask; passed to every zonelist walk. */
1439         enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1440         struct zoneref *z;
1441         struct zone *zone;
1442         struct page *page;
             /* Published through p->reclaim_state while try_to_free_pages() runs. */
1443         struct reclaim_state reclaim_state;
1444         struct task_struct *p = current;
1445         int do_retry;
1446         int alloc_flags;
             /* Return value of the most recent try_to_free_pages() call. */
1447         unsigned long did_some_progress;
             /* Running sum of did_some_progress; zeroed only here. */
1448         unsigned long pages_reclaimed = 0;
1449 
1450         might_sleep_if(wait);
1451 
             /* Let should_fail_alloc_page() veto the request before any work. */
1452         if (should_fail_alloc_page(gfp_mask, order))
1453                 return NULL;
1454 
             /*
              * Re-entry point for the OOM handling further down: reached again
              * both when try_set_zone_oom() returns zero and after
              * out_of_memory() has been called.
              */
1455 restart:
1456         z = zonelist->_zonerefs;  /* the list of zones suitable for gfp_mask */
1457 
1458         if (unlikely(!z->zone)) {
1459                 /*
1460                  * Happens if we have an empty zonelist as a result of
1461                  * GFP_THISNODE being used on a memoryless node
1462                  */
1463                 return NULL;
1464         }
1465 
             /*
              * First attempt: ALLOC_WMARK_LOW with ALLOC_CPUSET, and with
              * __GFP_HARDWALL OR-ed into the mask for this call.
              */
1466         page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1467                         zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
1468         if (page)
1469                 goto got_pg;
1470 
1471         /*
1472          * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
1473          * __GFP_NOWARN set) should not cause reclaim since the subsystem
1474          * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
1475          * using a larger set of nodes after it has established that the
1476          * allowed per node queues are empty and that nodes are
1477          * over allocated.
1478          */
1479         if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1480                 goto nopage;
1481 
             /* Wake kswapd for every zone the zonelist iterator yields. */
1482         for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1483                 wakeup_kswapd(zone, order);
1484 
1485         /*
1486          * OK, we're below the kswapd watermark and have kicked background
1487          * reclaim. Now things get more complex, so set up alloc_flags according
1488          * to how we want to proceed.
1489          *
1490          * The caller may dip into page reserves a bit more if the caller
1491          * cannot run direct reclaim, or if the caller has realtime scheduling
1492          * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
1493          * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1494          */
1495         alloc_flags = ALLOC_WMARK_MIN;
1496         if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
1497                 alloc_flags |= ALLOC_HARDER;
1498         if (gfp_mask & __GFP_HIGH)
1499                 alloc_flags |= ALLOC_HIGH;
1500         if (wait)
1501                 alloc_flags |= ALLOC_CPUSET;
1502 
1503         /*
1504          * Go through the zonelist again. Let __GFP_HIGH and allocations
1505          * coming from realtime tasks go deeper into reserves.
1506          *
1507          * This is the last chance, in general, before the goto nopage.
1508          * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1509          * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1510          */
1511         page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1512                                                 high_zoneidx, alloc_flags);
1513         if (page)
1514                 goto got_pg;
1515 
1516         /* This allocation should allow future memory freeing. */
1517 
             /*
              * Head of the reclaim-and-retry cycle; the do_retry logic at the
              * bottom of the function jumps back here.
              *
              * A task that already has PF_MEMALLOC set, or whose thread has
              * TIF_MEMDIE, and that is not in interrupt context never proceeds
              * to direct reclaim: it gets a watermark-free attempt (skipped
              * when __GFP_NOMEMALLOC is set) and otherwise goes to nopage.
              */
1518 rebalance:
1519         if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
1520                         && !in_interrupt()) {
1521                 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
1522 nofail_alloc:
1523                         /* go through the zonelist yet again, ignoring mins */
1524                         page = get_page_from_freelist(gfp_mask, nodemask, order,
1525                                 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
1526                         if (page)
1527                                 goto got_pg;
                             /*
                              * __GFP_NOFAIL: call congestion_wait() and repeat
                              * the watermark-free attempt with no upper bound.
                              */
1528                         if (gfp_mask & __GFP_NOFAIL) {
1529                                 congestion_wait(WRITE, HZ/50);
1530                                 goto nofail_alloc;
1531                         }
1532                 }
1533                 goto nopage;
1534         }
1535 
1536         /* Atomic allocations - we can't balance anything */
1537         if (!wait)
1538                 goto nopage;
1539 
1540         cond_resched();
1541 
1542         /* We now go into synchronous reclaim */
1543         cpuset_memory_pressure_bump();
             /*
              * PF_MEMALLOC and p->reclaim_state are set only for the duration
              * of try_to_free_pages() and undone immediately afterwards. While
              * the flag is set, an allocation by this task (outside interrupt
              * context) takes the PF_MEMALLOC branch at 'rebalance' rather than
              * arriving here again.
              */
1544         p->flags |= PF_MEMALLOC;
1545         reclaim_state.reclaimed_slab = 0;
1546         p->reclaim_state = &reclaim_state;
1547 
1548         did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
1549 
1550         p->reclaim_state = NULL;
1551         p->flags &= ~PF_MEMALLOC;
1552 
1553         cond_resched();
1554 
             /*
              * NOTE(review): drain_all_pages() runs only for order > 0 requests,
              * presumably so pages held in per-cpu lists can be returned before
              * the retry -- confirm against drain_all_pages().
              */
1555         if (order != 0)
1556                 drain_all_pages();
1557 
             /*
              * Reclaim reported progress: retry with the same alloc_flags as the
              * second attempt. A miss falls through to the retry decision.
              */
1558         if (likely(did_some_progress)) {
1559                 page = get_page_from_freelist(gfp_mask, nodemask, order,
1560                                         zonelist, high_zoneidx, alloc_flags);
1561                 if (page)
1562                         goto got_pg;
             /*
              * No progress: the OOM path is taken only for callers that set
              * __GFP_FS and did not set __GFP_NORETRY.
              */
1563         } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
                     /*
                      * try_set_zone_oom() returned zero. NOTE(review): presumably
                      * an OOM kill is already in flight for one of these zones --
                      * confirm. Sleep one tick and start over from 'restart'.
                      */
1564                 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1565                         schedule_timeout_uninterruptible(1);
1566                         goto restart;
1567                 }
1568 
1569                 /*
1570                  * Go through the zonelist yet one more time, keep
1571                  * very high watermark here, this is only to catch
1572                  * a parallel oom killing, we must fail if we're still
1573                  * under heavy pressure.
1574                  */
1575                 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1576                         order, zonelist, high_zoneidx,
1577                         ALLOC_WMARK_HIGH|ALLOC_CPUSET);
                     /*
                      * Every exit from this branch past this point calls
                      * clear_zonelist_oom() to pair with the successful
                      * try_set_zone_oom() above.
                      */
1578                 if (page) {
1579                         clear_zonelist_oom(zonelist, gfp_mask);
1580                         goto got_pg;
1581                 }
1582 
1583                 /* The OOM killer will not help higher order allocs so fail */
1584                 if (order > PAGE_ALLOC_COSTLY_ORDER) {
1585                         clear_zonelist_oom(zonelist, gfp_mask);
1586                         goto nopage;
1587                 }
1588 
1589                 out_of_memory(zonelist, gfp_mask, order);
1590                 clear_zonelist_oom(zonelist, gfp_mask);
1591                 goto restart;
1592         }
1593 
1594         /*
1595          * Don't let big-order allocations loop unless the caller explicitly
1596          * requests that.  Wait for some write requests to complete then retry.
1597          *
1598          * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1599          * means __GFP_NOFAIL, but that may not be true in other
1600          * implementations.
1601          *
1602          * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1603          * specified, then we retry until we no longer reclaim any pages
1604          * (above), or we've reclaimed an order of pages at least as
1605          * large as the allocation's order. In both cases, if the
1606          * allocation still fails, we stop retrying.
1607          */
             /*
              * pages_reclaimed is not reset by 'goto restart' or 'goto
              * rebalance', so it accumulates over the whole call.
              * __GFP_NORETRY suppresses every retry, including __GFP_NOFAIL,
              * because the NOFAIL test sits inside the !__GFP_NORETRY branch.
              */
1608         pages_reclaimed += did_some_progress;
1609         do_retry = 0;
1610         if (!(gfp_mask & __GFP_NORETRY)) {
1611                 if (order <= PAGE_ALLOC_COSTLY_ORDER) {
1612                         do_retry = 1;
1613                 } else {
1614                         if (gfp_mask & __GFP_REPEAT &&
1615                                 pages_reclaimed < (1 << order))
1616                                         do_retry = 1;
1617                 }
1618                 if (gfp_mask & __GFP_NOFAIL)
1619                         do_retry = 1;
1620         }
1621         if (do_retry) {
1622                 congestion_wait(WRITE, HZ/50);
1623                 goto rebalance;
1624         }
1625 
             /*
              * Failure exit. Every path that arrives here has page == NULL, so
              * falling through into got_pg returns NULL. The warning is skipped
              * for __GFP_NOWARN callers and when printk_ratelimit() returns 0.
              */
1626 nopage:
1627         if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
1628                 printk(KERN_WARNING "%s: page allocation failure."
1629                         " order:%d, mode:0x%x\n",
1630                         p->comm, order, gfp_mask);
1631                 dump_stack();
1632                 show_mem();
1633         }
             /* Common return: the allocated page on success, NULL via nopage. */
1634 got_pg:
1635         return page;
1636 }
1637 EXPORT_SYMBOL(__alloc_pages_internal);
1638




1355 /*
1356  * get_page_from_freelist goes through the zonelist trying to allocate
1357  * a page.
1358  */
1359 static struct page *
1360 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1361                 struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
1362 {
1363         struct zoneref *z;
1364         struct page *page = NULL;
1365         int classzone_idx;
1366         struct zone *zone, *preferred_zone;
1367         nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1368         int zlc_active = 0;             /* set if using zonelist_cache */
1369         int did_zlc_setup = 0;          /* just call zlc_setup() one time */
1370 
1371         (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
1372                                                         &preferred_zone);
1373         if (!preferred_zone)
1374                 return NULL;
1375 
1376         classzone_idx = zone_idx(preferred_zone);
1377 
1378 zonelist_scan:
1379         /*
1380          * Scan zonelist, looking for a zone with enough free.
1381          * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1382          */
1383         for_each_zone_zonelist_nodemask(zone, z, zonelist,
1384                                                 high_zoneidx, nodemask) {
1385                 if (NUMA_BUILD && zlc_active &&
1386                         !zlc_zone_worth_trying(zonelist, z, allowednodes))
1387                                 continue;
1388                 if ((alloc_flags & ALLOC_CPUSET) &&
1389                         !cpuset_zone_allowed_softwall(zone, gfp_mask))
1390                                 goto try_next_zone;
1391 
1392                 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1393                         unsigned long mark;
1394                         if (alloc_flags & ALLOC_WMARK_MIN)
1395                                 mark = zone->pages_min;
1396                         else if (alloc_flags & ALLOC_WMARK_LOW)
1397                                 mark = zone->pages_low;
1398                         else
1399                                 mark = zone->pages_high;
1400                         if (!zone_watermark_ok(zone, order, mark,
1401                                     classzone_idx, alloc_flags)) {
1402                                 if (!zone_reclaim_mode ||
1403                                     !zone_reclaim(zone, gfp_mask, order))
1404                                         goto this_zone_full;
1405                         }
1406                 }
1407 
1408                 page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask);
1409                 if (page)
1410                         break;
1411 this_zone_full:
1412                 if (NUMA_BUILD)
1413                         zlc_mark_zone_full(zonelist, z);
1414 try_next_zone:
1415                 if (NUMA_BUILD && !did_zlc_setup) {
1416                         /* we do zlc_setup after the first zone is tried */
1417                         allowednodes = zlc_setup(zonelist, alloc_flags);
1418                         zlc_active = 1;
1419                         did_zlc_setup = 1;
1420                 }
1421         }
1422 
1423         if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1424                 /* Disable zlc cache for second zonelist scan */
1425                 zlc_active = 0;
1426                 goto zonelist_scan;
1427         }
1428         return page;
1429 }
1430

[linuxkernelnewbies] Linux/mm/page_alloc.c

Reply via email to