--- /usr/src/linux-2.4.19pre8.orig/mm/page_alloc.c Mon May 6 19:28:16 2002 +++ /usr/src/linux-2.4.19pre8.mel/mm/page_alloc.c Wed May 22 04:05:43 2002 @@ -25,11 +25,23 @@ int nr_swap_pages; int nr_active_pages; int nr_inactive_pages; + +/* The two LRU list. These of primary interest to the page replacement + * algorithm. Pages that are referenced often will remain in the active_list. + * Pages are moved to the inactive_list by refill_inactive called by kswapd. + * Once in the inactive list, the page is a candidate to be swapped out + */ struct list_head inactive_list; struct list_head active_list; + pg_data_t *pgdat_list; -/* Used to look up the address of the struct zone encoded in page->zone */ +/* zone_table is the replacement for page->zone . It's a simple way of + * quickly looking up what zone a page belongs so. During init, the upper + * most 8 bits of page->flags will be used to store what zone we are in. + * See free_area_init_core . The macro page_zone returns the zone a page + * belongs to. See linux/include/mm.h + */ zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES]; EXPORT_SYMBOL(zone_table); @@ -81,13 +93,65 @@ * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * - * TODO: give references to descriptions of buddy system allocators, - * describe precisely the silly trick buddy allocators use to avoid - * storing an extra bit, utilizing entry point information. + * --wli + * + * There is a brief explanation of how a buddy algorithm works at + * http://www.memorymanagement.org/articles/alloc.html . A better idea + * is to read the explanation from a book like UNIX Internals by + * Uresh Vahalia + * + * The principle objective of the buddy algorithm is to keep track of blocks + * of contiguous pages that are free or allocated. At the beginning there + * is many free blocks of MAX_ORDER available. These are split up as necessary + * for lower order allocations. + * + * A free_area_t exists for each order in a zone (zone->free_area). The struct + * stores a list free page blocks (area->free_list) of that order that can + * be allocated and a bitmap (area->map) the describes if the buddies + * are allocated or not. + * + * If no page blocks are free of a particular order, the higher order will be + * examined. If there is one, it's removed from the free_list for the higher + * order allocation and split into two blocks of the lower order. One is + * allocated and the other is placed on the free_list. + * + * What is important to note is how the bitmap is used in free_area_t.map . + * Each bit in the map represents two adjacent buddies of a this order. + * At order(0), one bit represents two pages. At order(1), it represents + * four pages, two buddies of two pages each and so on. If the bit is 0, + * both blocks are full or both are free. If it is one, exactly one is full. + * + * Each time the page is used or freed, this bit is toggled. + * + * This is most important in __free_pages_ok() as it tells us if we can merge + * adjacent blocks. As we are freeing a block, we know this buddy is free, + * we flip this bit and check it's new value. + * + * If it is now 0, we know that both are free because here we know the buddy + * we have is free. If it's 1, we know the buddy is allocated because the bit + * says exactly one is free and it can't be the block we are just freeing * - * -- wli + * --mel + * + * One way to describe the bit toggling behavior is saying that each page + * has a "virtual bit", the bit present in the bitmap is the xor of the + * "virtual bits" of the two buddies, and the buddy's bit is recovered by + * xor'ing with the bit of the page we're examining, which is known from + * the entry point. + * + * --wli + * */ +/** + * + * __free_pages_ok - Returns pages to the buddy allocator + * @page: The first page of the block to be freed + * @order: 2^order number of pages are freed + * + * This function returns the pages allocated by __alloc_pages and tries to + * merge buddies if possible. Do not call directly, use free_pages() + **/ static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order)); static void __free_pages_ok (struct page *page, unsigned int order) { @@ -112,36 +176,64 @@ BUG(); if (PageLocked(page)) BUG(); + + /* FIXME: lru_cache_del above handles the case where a page is on the + * LRU. This check is useless + */ if (PageLRU(page)) BUG(); if (PageActive(page)) BUG(); page->flags &= ~((1<flags & PF_FREE_PAGES) goto local_freelist; + back_local_freelist: zone = page_zone(page); + /* Multiple uses for mask which defies a common name + * -mask == number of pages that will be freed + * page_idx & ~mask == Is page aligned to an order size? + * Also used later to calculate the address of a buddy + */ mask = (~0UL) << order; + + /* + * zone_mem_map = first page in the current zone + * page_idx = page offset within zone_mem_map + * index = bit offset within map representing the buddies + */ + base = zone->zone_mem_map; page_idx = page - base; if (page_idx & ~mask) BUG(); - index = page_idx >> (1 + order); + index = page_idx >> (1 + order); area = zone->free_area + order; spin_lock_irqsave(&zone->lock, flags); zone->free_pages -= mask; + /* Irregardless of order, this is 0 when MAX_ORDER is reached */ while (mask + (1 << (MAX_ORDER-1))) { struct page *buddy1, *buddy2; + /* QUERY: This could only be true if order was originally set + * to be a value greater than MAX_ORDER, should the + * sanity check not be made at the beginning of the + * function and then removed from here? + */ if (area >= zone->free_area + MAX_ORDER) BUG(); + + /* See above for explanation */ if (!__test_and_change_bit(index, area->map)) /* * the buddy page is still allocated. @@ -159,7 +251,10 @@ if (BAD_RANGE(zone,buddy2)) BUG(); + /* buddy2 has already been freed */ memlist_del(&buddy1->list); + + /* Prepare to try and merge the higher order buddies */ mask <<= 1; area++; index >>= 1; @@ -171,11 +266,23 @@ return; local_freelist: + /* If the process has already freed pages for itself, don't give it + * more */ if (current->nr_local_pages) goto back_local_freelist; + + /* + * An interrupt doesn't have a current process to store pages on. + * + * QUERY: is this not a dead check, an interrupt could only get here if + * alloc_pages took the slow path through balance_classzones. If an + * interrupt got there, aren't we already dead? + */ if (in_interrupt()) goto back_local_freelist; + /* Add the page onto the local list, update the page information + * and return */ list_add(&page->list, ¤t->local_pages); page->index = order; current->nr_local_pages++; @@ -184,19 +291,49 @@ #define MARK_USED(index, order, area) \ __change_bit((index) >> (1+(order)), (area)->map) +/** + * + * expand - Break up higher order pages until the right size block is available + * @zone: The zone to free pages from + * @page: The first page of the first order to split + * @index: The page address index inside zone_mem_map + * @low: The order block of pages required + * @high: The order of the block of pages that are about to be split + * @area: The array of free areas for this zone + * + * This function will break up higher orders of pages necessary and update the + * bitmaps as it goes along. If it splits, the lower half will be put onto + * the free list and the high half will be either allocated or split + * further. This function is called from rmqueue() and not directly + * + * Note that index here is a page number offset within this zone. In other + * parts of the code, index means a bit offset within map. + **/ static inline struct page * expand (zone_t *zone, struct page *page, unsigned long index, int low, int high, free_area_t * area) { unsigned long size = 1 << high; + /* + * If it turned out there was a free block at the right order to begin + * with, no splitting will take place + */ while (high > low) { if (BAD_RANGE(zone,page)) BUG(); + + /* Prepare to move to next area and sizes */ area--; high--; size >>= 1; + + /* Add the page to the free list for the "lower" area. The + * high half will be split more if necessary + */ memlist_add_head(&(page)->list, &(area)->free_list); MARK_USED(index, high, area); + + /* Move to next page index and address */ index += size; page += size; } @@ -205,6 +342,17 @@ return page; } +/** + * + * rmqueue - Allocate page blocks of 2^order size via the buddy algorithm + * @zone: The zone to allocate from + * @order: The 2^order sized block to allocate + * + * This function is responsible for finding out what order of pages we + * have to go to to satisfy the request. For example if there is no + * page blocks free to satisfy order=0 (1 page), then see if there is + * a free block of order=1 that can be spilt into two order=0 pages + **/ static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order)); static struct page * rmqueue(zone_t *zone, unsigned int order) { @@ -216,24 +364,39 @@ spin_lock_irqsave(&zone->lock, flags); do { + /* Get the first block of pages free in this area */ head = &area->free_list; curr = memlist_next(head); + /* If there is a free block available, split it up until + * we get the order we want and allocate it + */ if (curr != head) { unsigned int index; page = memlist_entry(curr, struct page, list); if (BAD_RANGE(zone,page)) BUG(); + + /* No longer free so remove it from the list */ memlist_del(curr); + + /* index is the bit within area->map for this pair + * of buddies + */ index = page - zone->zone_mem_map; if (curr_order != MAX_ORDER-1) MARK_USED(index, curr_order, area); + zone->free_pages -= 1UL << order; + /* expand splits blocks of pages till we get a block + * of the order needed + */ page = expand(zone, page, index, order, curr_order, area); spin_unlock_irqrestore(&zone->lock, flags); + /* Mark the page as used and do some checks */ set_page_count(page, 1); if (BAD_RANGE(zone,page)) BUG(); @@ -241,10 +404,16 @@ BUG(); if (PageActive(page)) BUG(); + return page; } + + /* There isn't pages ready at this order so examine a block of + * higher orders + */ curr_order++; area++; + } while (curr_order < MAX_ORDER); spin_unlock_irqrestore(&zone->lock, flags); @@ -252,13 +421,40 @@ } #ifndef CONFIG_DISCONTIGMEM +/** + * + * _alloc_pages - Find zone to allocate from and call __alloc_pages + * @gfp_mask - Flags that determine the behaviour of the allocator + * @order - 2^order number of pages will be allocated in a block + * + * This is called by alloc_pages. It's only task is to identify the + * preferred set of zones to allocate from. + **/ struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order) { + /* + * Clear the high bits to see if the allocation is from ZONE_DMA, + * ZONE_NORMAL or ZONE_HIGHMEM + */ return __alloc_pages(gfp_mask, order, contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK)); } #endif +/** + * + * balance_classzone - Free page frames from a zone in a synchronous fashion + * @classzone: Zone to free from + * @gfp_mask: Flags which determine the behaviour of the allocator + * @order: It's a block of 2^order pages the caller is looking for + * @freed: Returns the number of total number of pages freed + * + * In a nutshell, this function does some of the work of kswapd in a synchronous + * fashion when there simply is too little memory to be waiting around. The + * caller will use this when it needs the memory and is willing to block on + * waiting for it. + * + **/ static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *)); static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) { @@ -271,6 +467,10 @@ BUG(); current->allocation_order = order; + + /* These flags are set so that __free_pages_ok knows to return the + * pages directly to the process + */ current->flags |= PF_MEMALLOC | PF_FREE_PAGES; __freed = try_to_free_pages(classzone, gfp_mask, order); @@ -278,6 +478,7 @@ current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); if (current->nr_local_pages) { + /* If pages were freed */ struct list_head * entry, * local_pages; struct page * tmp; int nr_pages; @@ -285,12 +486,19 @@ local_pages = ¤t->local_pages; if (likely(__freed)) { - /* pick from the last inserted so we're lifo */ + /* pick from the last inserted so we're LIFO */ entry = local_pages->next; do { tmp = list_entry(entry, struct page, list); if (tmp->index == order && memclass(page_zone(tmp), classzone)) { + /* This is a block of pages that is of + * the correct size so remember it + */ list_del(entry); + + /* QUERY: if order > 0, wouldn't the + * nr_local_pages drop by more than 1? + */ current->nr_local_pages--; set_page_count(tmp, 1); page = tmp; @@ -318,7 +526,10 @@ } nr_pages = current->nr_local_pages; - /* free in reverse order so that the global order will be lifo */ + /* free in reverse order so that the global order will be lifo + * The pages freed here are ones not of the order we are + * interested in for the moment + */ while ((entry = local_pages->prev) != local_pages) { list_del(entry); tmp = list_entry(entry, struct page, list); @@ -333,8 +544,27 @@ return page; } -/* +/** + * + * __alloc_pages - Allocate pages in a block of size 2^order + * @gfp_mask: Flags for this allocation that determine behaviour of allocator + * @order: 2^order number of pages to allocate + * @zonelist: A list of zones to allocate from starting with the preferred one + * * This is the 'heart' of the zoned buddy allocator: + * There is a few paths the this will take to try and allocate the pages. + * is takes depends on what pages are available and what flags on gfp_mask + * are set. For instance, if the allocation is for an interrupt handler, + * __alloc_pages won't do anything that would block. Each block or attempt + * made gets progressively slower as the function executes. + * + * zonelist is the set of zones which determines the order of fallback if + * one zone is full. An allocation may be for ZONE_HIGHMEM, but if there + * is none available, ZONE_NORMAL may be used or possibly ZONE_DMA. see + * build_zonelist() . + * + * This function should not be called directly. Use either alloc_pages() or + * __get_free_pages() */ struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist) { @@ -343,11 +573,21 @@ struct page * page; int freed; + /* zone is the preferred allocation zone. zone++ is a fallback one + * classzone is the first zone of the list. It's a "special" + * zone which keeps track of whether the whole needs to be + * balanced + * class_idx is which zone index within this pg_data_t zone is + */ zone = zonelist->zones; classzone = *zone; if (classzone == NULL) return NULL; min = 1UL << order; + + /* Cycle through the zones and their fallbacks. Allocate from the zone + * if the allocation can be made without the low watermark been hit + */ for (;;) { zone_t *z = *(zone++); if (!z) @@ -361,11 +601,21 @@ } } + /* The watermarks.pages_low has been reached. Mark the zone set + * as needing balancing and wake up kswapd which will start work + * freeing pages in this classzone + */ + classzone->need_balance = 1; mb(); if (waitqueue_active(&kswapd_wait)) wake_up_interruptible(&kswapd_wait); + /* Start again moving through the zones. This time it will allow the + * zone to reach watermarks.min number of free pages. It is hoped that + * kswapd will bring the number of pages above the watermarks again + * later + */ zone = zonelist->zones; min = 1UL << order; for (;;) { @@ -375,9 +625,15 @@ break; local_min = z->pages_min; + + /* If the caller can't wait, allow the zone to be pushed into + * a tighter memory position */ if (!(gfp_mask & __GFP_WAIT)) local_min >>= 2; + min += local_min; + + /* If we are safe to allocate this, allocate the page */ if (z->free_pages > min) { page = rmqueue(z, order); if (page) @@ -388,6 +644,13 @@ /* here we're in the low on memory slow path */ rebalance: + /* + * PF_MEMALLOC if set if the calling process wants to be treated as a + * memory allocator, kswapd for example. This process is high priority + * and should be served if at all possible. in_interrupt() means we + * can't sleep no matter what. This next block will allocate the + * memory no matter what watermark is hit if possible + */ if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) { zone = zonelist->zones; for (;;) { @@ -406,10 +669,14 @@ if (!(gfp_mask & __GFP_WAIT)) return NULL; + /* Do the work of kswapd in a synchronous fashion */ page = balance_classzone(classzone, gfp_mask, order, &freed); if (page) return page; + /* pages were freed by balance_classzone but not of the + * proper type. Cycle through in case a higher order was freed + */ zone = zonelist->zones; min = 1UL << order; for (;;) { @@ -436,8 +703,15 @@ goto rebalance; } -/* - * Common helper functions. +/** + * + * __get_free_pages - Get a 2^order block of free pages + * @gfp_mask: Flags which determine the allocator behaviour + * @order: A block sized 2^order will be allocated + * + * This is the highest level function available for allocating a block of + * pages to the caller. Ultimately __alloc_pages() is called to use the + * buddy algorithm to retrieve a block of pages */ unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order) { @@ -446,9 +720,15 @@ page = alloc_pages(gfp_mask, order); if (!page) return 0; + return (unsigned long) page_address(page); } +/** + * + * get_zerod_page - Allocates one page from the buddy allocator and zeros it + * @gfp_mask: Flags which determine the allocator behaviour + */ unsigned long get_zeroed_page(unsigned int gfp_mask) { struct page * page; @@ -462,20 +742,43 @@ return 0; } +/** + * + * __free_pages - Sanity check before asking the buddy allocator to take pages + * @page: The first page of the block to free + * @order: Indicates the block size. size = 2^order + */ void __free_pages(struct page *page, unsigned int order) { + /* QUERY: __free_pages_ok() does a load of sanity checks at the + * beginning of the function, would it not make more sense + * to lump them all together and have one function call? + */ if (!PageReserved(page) && put_page_testzero(page)) __free_pages_ok(page, order); } +/** + * + * free_pages - Free pages allocated by the buddy allocator + * addr: The address of the pages to free + * order: The block size to free + * + * This is the highest level function available for freeing pages allocated + * by the buddy allocator + */ void free_pages(unsigned long addr, unsigned int order) { if (addr != 0) __free_pages(virt_to_page(addr), order); } -/* - * Total amount of free (allocatable) RAM: +/** + * + * nr_free_pages - Returns number of free pages in all zones + * + * This function walks through all zones and sums the free page frames in each + * of them. */ unsigned int nr_free_pages (void) { @@ -492,8 +795,12 @@ return sum; } -/* - * Amount of free RAM allocatable as buffer memory: +/** + * + * nr_free_buffer_pages - Amount of free RAM allocatable as buffer memory + * + * This steps through all the zones that are suitable for normal use and + * returns back the totals of "size-pages_high". */ unsigned int nr_free_buffer_pages (void) { @@ -519,6 +826,10 @@ } #if CONFIG_HIGHMEM +/** + * + * nr_free_highpages - Returns the number of free page frames in high memory + */ unsigned int nr_free_highpages (void) { pg_data_t *pgdat = pgdat_list; @@ -532,6 +843,9 @@ } #endif +/* This macro will yield the total amount of RAM in kB + * addressed by x number of pages. + */ #define K(x) ((x) << (PAGE_SHIFT-10)) /* @@ -549,6 +863,8 @@ K(nr_free_pages()), K(nr_free_highpages())); + /* Step through all zones in all pgdats and print out the pertinent + * information about them */ while (tmpdat) { zone_t *zone; for (zone = tmpdat->node_zones; @@ -569,6 +885,10 @@ nr_inactive_pages, nr_free_pages()); + /* This steps through all the zones a second time and checks how + * many blocks of each 2^order block of pages. This helps determine + * how fragmented memory is + */ for (type = 0; type < MAX_NR_ZONES; type++) { struct list_head *head, *curr; zone_t *zone = pgdat->node_zones + type; @@ -606,7 +926,10 @@ } /* - * Builds allocation fallback zone lists. + * Builds allocation fallback zone lists. This determines what order zones + * should be used to take pages from if an allocation fails. For example, + * an allocation for HIGHMEM will fall to NORMAL if pages are not available + * and in turn fall to DMA. */ static inline void build_zonelists(pg_data_t *pgdat) {